You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; can include dashes ('-'); and can be up to 35 characters long.

zsymv_U.S 14 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Symbolic names for the registers used by the kernel below. */
  /* Integer registers: arguments, pointers and counters. */
  40. #define M $4 /* problem size: bounds the pack loops and the column loop */
  41. #define A $7 /* base of the next matrix column to process; stepped by LDA */
  42. #define LDA $8 /* column stride; shifted left by ZBASE_SHIFT in the prologue */
  43. #define X $9 /* x vector; repointed at BUFFER when x has to be packed */
  44. #define INCX $10 /* x stride; shifted left by ZBASE_SHIFT in the prologue */
  45. #define Y $11 /* caller's y vector */
  46. #define INCY $5 /* y stride; loaded from 0($sp) on entry, then shifted by ZBASE_SHIFT */
  47. #define BUFFER $6 /* scratch area; loaded from 8($sp) on entry */
  48. #define XX $12 /* walking pointer over x */
  49. #define YY $13 /* walking pointer over y */
  50. #define I $14 /* inner loop counter */
  51. #define IS $15 /* index of the current column; also holds 2 * SIZE for the stride tests */
  52. #define AO1 $16 /* walking pointer into the first column of the current pair */
  53. #define AO2 $17 /* walking pointer into the second column of the current pair */
  54. #define Y1 $18 /* y actually updated: Y itself or its packed copy inside BUFFER */
  55. #define TEMP $19 /* scratch (addresses, masks, compare results) */
  /* Floating-point registers. */
  56. #define ALPHA_R $f13 /* used as the real part of the complex scalar that multiplies x and xsum */
  57. #define ALPHA_I $f14 /* used as the imaginary part of that scalar */
  /* a1..a4 are loaded through AO1 and a5..a8 through AO2 (re, im, re, im of two rows); */
  /* all eight double as temporaries in the pack / unpack copy loops. */
  58. #define a1 $f0
  59. #define a2 $f1
  60. #define a3 $f2
  61. #define a4 $f3
  62. #define a5 $f4
  63. #define a6 $f5
  64. #define a7 $f6
  65. #define a8 $f7
  /* alpha1/alpha2 = re/im of (ALPHA_R, ALPHA_I) * x[IS]; alpha3/alpha4 = same for x[IS + 1]. */
  66. #define alpha1 $f8
  67. #define alpha2 $f9
  68. #define alpha3 $f10
  69. #define alpha4 $f11
  /* x1/x2 and x3/x4 = re/im of two consecutive x elements (reused as temporaries at the diagonal). */
  70. #define x1 $f12
  71. #define x2 $f15
  72. #define x3 $f16
  73. #define x4 $f17
  /* xsum1/xsum2 accumulate x * (column at AO1), xsum3/xsum4 accumulate x * (column at AO2). */
  74. #define xsum1 $f18
  75. #define xsum2 $f19
  76. #define xsum3 $f20
  77. #define xsum4 $f21
  /* ysum1/ysum2 and ysum3/ysum4 = re/im of two consecutive y elements being updated. */
  78. #define ysum1 $f22
  79. #define ysum2 $f23
  80. #define ysum3 $f24
  81. #define ysum4 $f25
  /* ADD1 is applied to the x.im * a.im product (real accumulator) and ADD2 to the */
  /* x.re * a.im product (imaginary accumulator) when building xsum. Swapping them under */
  /* HEMV flips the sign given to a.im in those products -- presumably this selects the */
  /* conjugated matrix element for the Hermitian variant; MADD / NMSUB are macros from */
  /* common.h (read here as d = a + b * c and d = a - b * c) -- confirm there. */
  82. #ifndef HEMV
  83. #define ADD1 NMSUB
  84. #define ADD2 MADD
  85. #else
  86. #define ADD1 MADD
  87. #define ADD2 NMSUB
  88. #endif
  /* Kernel entry point; PROLOGUE / EPILOGUE (from common.h, not shown here) wrap the routine. */
  /* Going by the file name (zsymv_U.S) and the HEMV switch this is presumably the */
  /* upper-triangle complex SYMV/HEMV kernel -- confirm against the build files. */
  /* What the code below does: */
  /* 1. packs x into BUFFER unless INCX already equals 2 * SIZE; */
  /* 2. packs y into a 256-byte-aligned part of BUFFER unless INCY equals 2 * SIZE; */
  /* 3. walks the matrix two columns at a time (.L11) and then one leftover column when */
  /* M is odd (.L20): the rows above the diagonal update y and feed the xsum */
  /* accumulators, then the diagonal block and alpha * xsum are added to y[IS..]; */
  /* 4. scatters the packed y back to Y with stride INCY (.L900) when it was packed. */
  /* MADD / NMSUB / MUL / ADD / MOV / MTC / LD / ST are macros from common.h; the comments */
  /* read MADD d,a,b,c as d = a + b * c and NMSUB d,a,b,c as d = a - b * c -- confirm. */
  /* NOTE(review): the instruction after every branch is written as a branch delay slot, */
  /* i.e. it is expected to execute whether or not the branch is taken (the stack restore */
  /* after `j $31` relies on it). Presumably PROLOGUE selects .set noreorder -- confirm. */
  89. PROLOGUE
  90. LDARG INCY, 0($sp) /* INCY and BUFFER are read from the caller's stack */
  91. LDARG BUFFER, 8($sp)
  92. #ifdef __64BIT__
  93. daddiu $sp, $sp, -64 /* frame for $16-$19 and $f24/$f25 */
  94. #else
  95. daddiu $sp, $sp, -80 /* larger frame: $f20-$f23 are saved as well */
  96. #endif
  /* Spill $16-$19 (AO1, AO2, Y1, TEMP); the stride scaling is interleaved with the stores. */
  97. SDARG $16, 0($sp)
  98. dsll LDA, LDA, ZBASE_SHIFT /* ZBASE_SHIFT is presumably log2 of the complex element size */
  99. SDARG $17, 8($sp)
  100. dsll INCX, INCX, ZBASE_SHIFT
  101. SDARG $18, 16($sp)
  102. dsll INCY, INCY, ZBASE_SHIFT
  103. SDARG $19, 24($sp)
  104. nop
  105. sdc1 $f24, 32($sp) /* ysum3 */
  106. sdc1 $f25, 40($sp) /* ysum4 */
  107. #ifndef __64BIT__
  108. sdc1 $f20, 48($sp) /* xsum3 */
  109. sdc1 $f21, 56($sp) /* xsum4 */
  110. sdc1 $f22, 64($sp) /* ysum1 */
  111. sdc1 $f23, 72($sp) /* ysum2 */
  112. #endif
  113. blez M, .L999 /* nothing to do for M <= 0 */
  114. li IS, 2 * SIZE /* delay slot: IS = size of one packed (re, im) element */
  115. beq IS, INCX, .L05 /* x already has stride 2 * SIZE: skip packing */
  116. move Y1, Y /* delay slot: by default the kernel updates Y in place */
  /* Pack x into BUFFER, four complex elements per iteration. */
  117. dsra I, M, 2 /* I = M / 4 */
  118. move XX, X
  119. blez I, .L02
  120. move X, BUFFER /* delay slot: from here on X names the packed copy */
  121. .align 3
  122. .L01:
  123. LD a1, 0 * SIZE(XX)
  124. LD a2, 1 * SIZE(XX)
  125. daddu XX, XX, INCX
  126. LD a3, 0 * SIZE(XX)
  127. LD a4, 1 * SIZE(XX)
  128. daddu XX, XX, INCX
  129. LD a5, 0 * SIZE(XX)
  130. LD a6, 1 * SIZE(XX)
  131. daddu XX, XX, INCX
  132. LD a7, 0 * SIZE(XX)
  133. LD a8, 1 * SIZE(XX)
  134. daddu XX, XX, INCX
  135. ST a1, 0 * SIZE(BUFFER)
  136. ST a2, 1 * SIZE(BUFFER)
  137. ST a3, 2 * SIZE(BUFFER)
  138. ST a4, 3 * SIZE(BUFFER)
  139. ST a5, 4 * SIZE(BUFFER)
  140. ST a6, 5 * SIZE(BUFFER)
  141. ST a7, 6 * SIZE(BUFFER)
  142. ST a8, 7 * SIZE(BUFFER)
  143. daddiu I, I, -1
  144. bgtz I, .L01
  145. daddiu BUFFER, BUFFER, 8 * SIZE /* delay slot: advance the write pointer */
  146. .align 3
  /* Pack the remaining M % 4 elements of x one at a time. */
  147. .L02:
  148. andi I, M, 3
  149. blez I, .L05
  150. NOP
  151. .align 3
  152. .L03:
  153. LD a1, 0 * SIZE(XX)
  154. LD a2, 1 * SIZE(XX)
  155. daddu XX, XX, INCX
  156. ST a1, 0 * SIZE(BUFFER)
  157. ST a2, 1 * SIZE(BUFFER)
  158. daddiu I, I, -1
  159. bgtz I, .L03
  160. daddiu BUFFER, BUFFER, 2 * SIZE /* delay slot */
  161. .align 3
  /* Pack y unless it already has stride 2 * SIZE. BUFFER now points past the packed x */
  /* (or is untouched if x was not packed) and is rounded up to a multiple of 256 first. */
  162. .L05:
  163. beq IS, INCY, .L10 /* IS still holds 2 * SIZE here */
  164. daddiu BUFFER, BUFFER, 255 /* delay slot: first half of the round-up (harmless if the branch is taken) */
  165. li TEMP, -256
  166. and BUFFER, BUFFER, TEMP /* BUFFER = (BUFFER + 255) & -256 */
  167. dsra I, M, 2 /* I = M / 4 */
  168. move Y1, BUFFER /* the kernel will update this packed copy of y */
  169. blez I, .L07
  170. move YY, Y /* delay slot: read pointer over the caller's y */
  171. .align 3
  172. .L06:
  173. LD a1, 0 * SIZE(YY)
  174. LD a2, 1 * SIZE(YY)
  175. daddu YY, YY, INCY
  176. LD a3, 0 * SIZE(YY)
  177. LD a4, 1 * SIZE(YY)
  178. daddu YY, YY, INCY
  179. LD a5, 0 * SIZE(YY)
  180. LD a6, 1 * SIZE(YY)
  181. daddu YY, YY, INCY
  182. LD a7, 0 * SIZE(YY)
  183. LD a8, 1 * SIZE(YY)
  184. daddu YY, YY, INCY
  185. ST a1, 0 * SIZE(BUFFER)
  186. ST a2, 1 * SIZE(BUFFER)
  187. ST a3, 2 * SIZE(BUFFER)
  188. ST a4, 3 * SIZE(BUFFER)
  189. ST a5, 4 * SIZE(BUFFER)
  190. ST a6, 5 * SIZE(BUFFER)
  191. ST a7, 6 * SIZE(BUFFER)
  192. ST a8, 7 * SIZE(BUFFER)
  193. daddiu I, I, -1
  194. bgtz I, .L06
  195. daddiu BUFFER, BUFFER, 8 * SIZE /* delay slot */
  196. .align 3
  /* Pack the remaining M % 4 elements of y one at a time. */
  197. .L07:
  198. andi I, M, 3
  199. blez I, .L10
  200. NOP
  201. .align 3
  202. .L08:
  203. LD a1, 0 * SIZE(YY)
  204. LD a2, 1 * SIZE(YY)
  205. daddu YY, YY, INCY
  206. ST a1, 0 * SIZE(BUFFER)
  207. ST a2, 1 * SIZE(BUFFER)
  208. daddiu I, I, -1
  209. bgtz I, .L08
  210. daddiu BUFFER, BUFFER, 2 * SIZE /* delay slot */
  211. .align 3
  /* Start of the column loop: with fewer than two columns go straight to the single-column tail. */
  212. .L10:
  213. slti TEMP, M, 2 /* TEMP = (M < 2) */
  214. nop
  215. bgtz TEMP, .L20
  216. li IS, 0 /* delay slot: column index starts at 0 */
  217. .align 3
  /* Process columns IS and IS + 1 together. */
  218. .L11:
  219. dsll TEMP, IS, ZBASE_SHIFT
  220. daddu TEMP, X, TEMP /* TEMP = &x[IS] */
  221. LD x1, 0 * SIZE(TEMP) /* x[IS] (re, im) */
  222. LD x2, 1 * SIZE(TEMP)
  223. LD x3, 2 * SIZE(TEMP) /* x[IS + 1] (re, im) */
  224. LD x4, 3 * SIZE(TEMP)
  225. MTC $0, xsum1 /* clear the four accumulators from integer register $0 */
  226. MTC $0, xsum2
  227. MTC $0, xsum3
  228. MTC $0, xsum4
  /* (alpha1, alpha2) = (ALPHA_R, ALPHA_I) * x[IS] and (alpha3, alpha4) = same * x[IS + 1], */
  /* as complex products; pointer setup is interleaved with the arithmetic. */
  229. MUL alpha1, ALPHA_R, x1
  230. move AO1, A /* first column of the pair */
  231. MUL alpha2, ALPHA_I, x1
  232. dsra I, IS, 1 /* I = IS / 2: two-row steps above the diagonal block */
  233. MUL alpha3, ALPHA_R, x3
  234. daddu AO2, A, LDA /* second column of the pair */
  235. MUL alpha4, ALPHA_I, x3
  236. daddu A, AO2, LDA /* A now names the column after this pair */
  237. NMSUB alpha1, alpha1, ALPHA_I, x2
  238. move XX, X /* rows restart at the top of x ... */
  239. MADD alpha2, alpha2, ALPHA_R, x2
  240. move YY, Y1 /* ... and at the top of the working y */
  241. NMSUB alpha3, alpha3, ALPHA_I, x4
  242. MADD alpha4, alpha4, ALPHA_R, x4
  243. blez I, .L15 /* no rows above the diagonal block (IS == 0) */
  244. daddiu I, I, -1 /* delay slot: the last two-row step is peeled into .L13 */
  /* Preload the operands of the first two-row step (x3 is loaded inside the step itself). */
  245. LD x1, 0 * SIZE(XX)
  246. LD x2, 1 * SIZE(XX)
  247. LD x4, 3 * SIZE(XX)
  248. LD a1, 0 * SIZE(AO1)
  249. LD a2, 1 * SIZE(AO1)
  250. LD a3, 2 * SIZE(AO1)
  251. LD a4, 3 * SIZE(AO1)
  252. LD a5, 0 * SIZE(AO2)
  253. LD a6, 1 * SIZE(AO2)
  254. LD a7, 2 * SIZE(AO2)
  255. LD a8, 3 * SIZE(AO2)
  256. LD ysum1, 0 * SIZE(YY)
  257. blez I, .L13 /* only one step: go straight to the peeled copy */
  258. LD ysum2, 1 * SIZE(YY) /* delay slot */
  259. .align 3
  /* Steady-state step over rows r, r + 1 (a1/a2, a3/a4 from column IS; a5/a6, a7/a8 from */
  /* column IS + 1): */
  /* y[r] += alpha(IS) * A[r][IS] + alpha(IS+1) * A[r][IS+1] (ysum1/ysum2) */
  /* y[r+1] += alpha(IS) * A[r+1][IS] + alpha(IS+1) * A[r+1][IS+1] (ysum3/ysum4) */
  /* xsum1/2 += x[r] * A[r][IS] + x[r+1] * A[r+1][IS] */
  /* xsum3/4 += x[r] * A[r][IS+1] + x[r+1] * A[r+1][IS+1] */
  /* with ADD1 / ADD2 applied to the terms that involve the imaginary parts a2/a4/a6/a8. */
  /* Loads for the next step are issued as soon as a register's last use has passed, so */
  /* the instruction order matters. */
  260. .L12:
  261. MADD ysum1, ysum1, alpha1, a1
  262. LD ysum3, 2 * SIZE(YY)
  263. MADD ysum2, ysum2, alpha2, a1
  264. LD ysum4, 3 * SIZE(YY)
  265. MADD xsum1, xsum1, x1, a1
  266. LD a8, 3 * SIZE(AO2)
  267. MADD xsum2, xsum2, x2, a1
  268. LD a1, 4 * SIZE(AO1) /* a1 is done for this step: fetch the next one (AO1 not advanced yet) */
  269. MADD ysum3, ysum3, alpha1, a3
  270. LD x3, 2 * SIZE(XX)
  271. MADD ysum4, ysum4, alpha2, a3
  272. daddiu I, I, -1
  273. MADD xsum3, xsum3, x1, a5
  274. MADD xsum4, xsum4, x2, a5
  275. NMSUB ysum1, ysum1, alpha2, a2
  276. MADD ysum2, ysum2, alpha1, a2
  277. ADD1 xsum1, xsum1, x2, a2
  278. daddiu AO2, AO2, 4 * SIZE /* AO2 offsets below refer to the next step */
  279. ADD2 xsum2, xsum2, x1, a2
  280. LD a2, 5 * SIZE(AO1)
  281. NMSUB ysum3, ysum3, alpha2, a4
  282. MADD ysum4, ysum4, alpha1, a4
  283. ADD1 xsum3, xsum3, x2, a6
  284. LD x2, 5 * SIZE(XX)
  285. ADD2 xsum4, xsum4, x1, a6
  286. LD x1, 4 * SIZE(XX)
  287. MADD ysum1, ysum1, alpha3, a5
  288. MADD ysum2, ysum2, alpha4, a5
  289. MADD xsum1, xsum1, x3, a3
  290. LD a5, 0 * SIZE(AO2)
  291. MADD xsum2, xsum2, x4, a3
  292. LD a3, 6 * SIZE(AO1)
  293. MADD ysum3, ysum3, alpha3, a7
  294. MADD ysum4, ysum4, alpha4, a7
  295. MADD xsum3, xsum3, x3, a7
  296. daddiu AO1, AO1, 4 * SIZE /* AO1 offsets below refer to the next step */
  297. MADD xsum4, xsum4, x4, a7
  298. LD a7, 2 * SIZE(AO2)
  299. NMSUB ysum1, ysum1, alpha4, a6
  300. daddiu XX, XX, 4 * SIZE
  301. MADD ysum2, ysum2, alpha3, a6
  302. LD a6, 1 * SIZE(AO2)
  303. ADD1 xsum1, xsum1, x4, a4
  304. daddiu YY, YY, 4 * SIZE /* stores below use negative offsets for this step's y */
  305. ADD2 xsum2, xsum2, x3, a4
  306. LD a4, 3 * SIZE(AO1)
  307. NMSUB ysum3, ysum3, alpha4, a8
  308. ST ysum1,-4 * SIZE(YY)
  309. MADD ysum4, ysum4, alpha3, a8
  310. ST ysum2,-3 * SIZE(YY)
  311. LD ysum1, 0 * SIZE(YY) /* next step's y[r] */
  312. LD ysum2, 1 * SIZE(YY)
  313. ADD1 xsum3, xsum3, x4, a8
  314. LD x4, 3 * SIZE(XX)
  315. ADD2 xsum4, xsum4, x3, a8
  316. ST ysum3,-2 * SIZE(YY)
  317. bgtz I, .L12
  318. ST ysum4,-1 * SIZE(YY) /* delay slot */
  319. .align 3
  /* Peeled last two-row step: same arithmetic as .L12 but without the loads for a following step. */
  320. .L13:
  321. MADD ysum1, ysum1, alpha1, a1
  322. LD ysum3, 2 * SIZE(YY)
  323. MADD ysum2, ysum2, alpha2, a1
  324. LD ysum4, 3 * SIZE(YY)
  325. MADD xsum1, xsum1, x1, a1
  326. LD a8, 3 * SIZE(AO2)
  327. MADD xsum2, xsum2, x2, a1
  328. LD x3, 2 * SIZE(XX)
  329. MADD ysum3, ysum3, alpha1, a3
  330. MADD ysum4, ysum4, alpha2, a3
  331. MADD xsum3, xsum3, x1, a5
  332. MADD xsum4, xsum4, x2, a5
  333. NMSUB ysum1, ysum1, alpha2, a2
  334. MADD ysum2, ysum2, alpha1, a2
  335. ADD1 xsum1, xsum1, x2, a2
  336. ADD2 xsum2, xsum2, x1, a2
  337. NMSUB ysum3, ysum3, alpha2, a4
  338. MADD ysum4, ysum4, alpha1, a4
  339. ADD1 xsum3, xsum3, x2, a6
  340. ADD2 xsum4, xsum4, x1, a6
  341. MADD ysum1, ysum1, alpha3, a5
  342. MADD ysum2, ysum2, alpha4, a5
  343. MADD xsum1, xsum1, x3, a3
  344. MADD xsum2, xsum2, x4, a3
  345. MADD ysum3, ysum3, alpha3, a7
  346. MADD ysum4, ysum4, alpha4, a7
  347. MADD xsum3, xsum3, x3, a7
  348. MADD xsum4, xsum4, x4, a7
  349. NMSUB ysum1, ysum1, alpha4, a6
  350. MADD ysum2, ysum2, alpha3, a6
  351. ADD1 xsum1, xsum1, x4, a4
  352. ADD2 xsum2, xsum2, x3, a4
  353. NMSUB ysum3, ysum3, alpha4, a8
  354. daddiu XX, XX, 4 * SIZE
  355. MADD ysum4, ysum4, alpha3, a8
  356. daddiu YY, YY, 4 * SIZE
  357. ADD1 xsum3, xsum3, x4, a8
  358. daddiu AO1, AO1, 4 * SIZE /* AO1 / AO2 end up at row IS of their columns */
  359. ADD2 xsum4, xsum4, x3, a8
  360. daddiu AO2, AO2, 4 * SIZE
  361. ST ysum1, -4 * SIZE(YY)
  362. ST ysum2, -3 * SIZE(YY)
  363. ST ysum3, -2 * SIZE(YY)
  364. ST ysum4, -1 * SIZE(YY)
  365. .align 3
  /* Diagonal 2x2 block: finish y[IS] and y[IS + 1]. */
  366. .L15:
  367. dsll TEMP, IS, ZBASE_SHIFT
  368. daddu TEMP, Y1, TEMP /* TEMP = &y[IS] in the working y */
  369. LD ysum1, 0 * SIZE(TEMP)
  370. LD ysum2, 1 * SIZE(TEMP)
  371. LD ysum3, 2 * SIZE(TEMP)
  372. LD ysum4, 3 * SIZE(TEMP)
  373. LD a1, 0 * SIZE(AO1) /* A[IS][IS] */
  374. LD a2, 1 * SIZE(AO1)
  375. LD a3, 2 * SIZE(AO1) /* A[IS+1][IS]: a3/a4 are loaded but not used in this block */
  376. LD a4, 3 * SIZE(AO1)
  377. LD a5, 0 * SIZE(AO2) /* A[IS][IS+1] */
  378. LD a6, 1 * SIZE(AO2)
  379. LD a7, 2 * SIZE(AO2) /* A[IS+1][IS+1] */
  380. LD a8, 3 * SIZE(AO2)
  /* xsum = (ALPHA_R, ALPHA_I) * xsum as a complex product; x1..x4 keep the unscaled values. */
  381. MOV x1, xsum1
  382. MOV x2, xsum2
  383. MOV x3, xsum3
  384. MOV x4, xsum4
  385. MUL xsum1, ALPHA_R, xsum1
  386. MUL xsum2, ALPHA_R, xsum2
  387. MUL xsum3, ALPHA_R, xsum3
  388. MUL xsum4, ALPHA_R, xsum4
  389. NMSUB xsum1, xsum1, ALPHA_I, x2
  390. MADD xsum2, xsum2, ALPHA_I, x1
  391. NMSUB xsum3, xsum3, ALPHA_I, x4
  392. MADD xsum4, xsum4, ALPHA_I, x3
  /* Add the diagonal-block products; a5/a6 serve both y[IS] (paired with alpha3/alpha4) */
  /* and y[IS + 1] (paired with alpha1/alpha2), so no element below the diagonal is read. */
  393. MADD xsum1, xsum1, alpha1, a1
  394. MADD xsum2, xsum2, alpha2, a1
  395. MADD xsum3, xsum3, alpha1, a5
  396. MADD xsum4, xsum4, alpha2, a5
  397. #ifndef HEMV
  398. ADD1 xsum1, xsum1, alpha2, a2 /* imaginary part of A[IS][IS]: compiled out under HEMV */
  399. ADD2 xsum2, xsum2, alpha1, a2
  400. #endif
  401. ADD1 xsum3, xsum3, alpha2, a6
  402. ADD2 xsum4, xsum4, alpha1, a6
  403. MADD xsum1, xsum1, alpha3, a5
  404. MADD xsum2, xsum2, alpha4, a5
  405. MADD xsum3, xsum3, alpha3, a7
  406. MADD xsum4, xsum4, alpha4, a7
  407. NMSUB xsum1, xsum1, alpha4, a6
  408. MADD xsum2, xsum2, alpha3, a6
  409. #ifndef HEMV
  410. ADD1 xsum3, xsum3, alpha4, a8 /* imaginary part of A[IS+1][IS+1]: compiled out under HEMV */
  411. ADD2 xsum4, xsum4, alpha3, a8
  412. #endif
  413. ADD ysum1, ysum1, xsum1
  414. ADD ysum2, ysum2, xsum2
  415. ADD ysum3, ysum3, xsum3
  416. ADD ysum4, ysum4, xsum4
  417. ST ysum1, 0 * SIZE(TEMP)
  418. ST ysum2, 1 * SIZE(TEMP)
  419. ST ysum3, 2 * SIZE(TEMP)
  420. ST ysum4, 3 * SIZE(TEMP)
  421. daddiu TEMP, IS, 4
  422. slt TEMP, M, TEMP /* TEMP = (M < IS + 4) */
  423. beqz TEMP, .L11 /* another full column pair fits: loop */
  424. daddiu IS, IS, 2 /* delay slot: advance the column index */
  425. .align 3
  /* Tail: when M is odd one column (index IS) is left; handle it one row at a time. */
  426. .L20:
  427. andi TEMP, M, 1
  428. nop
  429. blez TEMP, .L900 /* M even: no leftover column */
  430. nop
  431. dsll TEMP, IS, ZBASE_SHIFT
  432. daddu TEMP, X, TEMP /* TEMP = &x[IS] */
  433. LD x1, 0 * SIZE(TEMP)
  434. LD x2, 1 * SIZE(TEMP)
  435. MTC $0, xsum1
  436. MTC $0, xsum2
  437. MUL alpha1, ALPHA_R, x1 /* (alpha1, alpha2) = (ALPHA_R, ALPHA_I) * x[IS] */
  438. move AO1, A
  439. MUL alpha2, ALPHA_I, x1
  440. move I, IS /* IS rows lie above the diagonal element */
  441. daddu A, AO1, LDA
  442. NMSUB alpha1, alpha1, ALPHA_I, x2
  443. move XX, X
  444. MADD alpha2, alpha2, ALPHA_R, x2
  445. move YY, Y1
  446. blez I, .L25 /* IS == 0: only the diagonal element exists */
  447. daddiu I, I, -1 /* delay slot: the last row is peeled into .L23 */
  448. LD x1, 0 * SIZE(XX)
  449. LD x2, 1 * SIZE(XX)
  450. LD a1, 0 * SIZE(AO1)
  451. LD a2, 1 * SIZE(AO1)
  452. LD ysum1, 0 * SIZE(YY)
  453. blez I, .L23
  454. LD ysum2, 1 * SIZE(YY) /* delay slot */
  455. .align 3
  /* One row per iteration: y[r] += alpha(IS) * A[r][IS]; xsum += x[r] * A[r][IS]. */
  456. .L22:
  457. MADD ysum1, ysum1, alpha1, a1
  458. daddiu XX, XX, 2 * SIZE
  459. MADD ysum2, ysum2, alpha2, a1
  460. daddiu YY, YY, 2 * SIZE
  461. MADD xsum1, xsum1, x1, a1
  462. daddiu AO1, AO1, 2 * SIZE
  463. MADD xsum2, xsum2, x2, a1
  464. daddiu I, I, -1
  465. NMSUB ysum1, ysum1, alpha2, a2
  466. MADD ysum2, ysum2, alpha1, a2
  467. ADD1 xsum1, xsum1, x2, a2
  468. LD x2, 1 * SIZE(XX) /* operands for the next row */
  469. ADD2 xsum2, xsum2, x1, a2
  470. LD x1, 0 * SIZE(XX)
  471. LD a1, 0 * SIZE(AO1)
  472. LD a2, 1 * SIZE(AO1)
  473. ST ysum1, -2 * SIZE(YY)
  474. LD ysum1, 0 * SIZE(YY)
  475. ST ysum2, -1 * SIZE(YY)
  476. bgtz I, .L22
  477. LD ysum2, 1 * SIZE(YY) /* delay slot */
  478. .align 3
  /* Peeled last row of the single-column tail. */
  479. .L23:
  480. MADD ysum1, ysum1, alpha1, a1
  481. MADD ysum2, ysum2, alpha2, a1
  482. MADD xsum1, xsum1, x1, a1
  483. MADD xsum2, xsum2, x2, a1
  484. NMSUB ysum1, ysum1, alpha2, a2
  485. daddiu XX, XX, 2 * SIZE
  486. MADD ysum2, ysum2, alpha1, a2
  487. daddiu YY, YY, 2 * SIZE
  488. ADD1 xsum1, xsum1, x2, a2
  489. daddiu AO1, AO1, 2 * SIZE /* AO1 ends up at the diagonal element */
  490. ADD2 xsum2, xsum2, x1, a2
  491. nop
  492. ST ysum1, -2 * SIZE(YY)
  493. ST ysum2, -1 * SIZE(YY)
  494. .align 3
  /* Diagonal element of the leftover column: y[IS] += alpha * xsum + alpha(IS) * A[IS][IS]. */
  495. .L25:
  496. dsll TEMP, IS, ZBASE_SHIFT
  497. daddu TEMP, Y1, TEMP /* TEMP = &y[IS] in the working y */
  498. LD ysum1, 0 * SIZE(TEMP)
  499. LD ysum2, 1 * SIZE(TEMP)
  500. LD a1, 0 * SIZE(AO1)
  501. LD a2, 1 * SIZE(AO1)
  502. MOV x1, xsum1 /* keep the unscaled sums for the complex product below */
  503. MOV x2, xsum2
  504. MUL xsum1, ALPHA_R, xsum1
  505. MUL xsum2, ALPHA_R, xsum2
  506. NMSUB xsum1, xsum1, ALPHA_I, x2
  507. MADD xsum2, xsum2, ALPHA_I, x1
  508. MADD xsum1, xsum1, alpha1, a1
  509. MADD xsum2, xsum2, alpha2, a1
  510. #ifndef HEMV
  511. NMSUB xsum1, xsum1, alpha2, a2 /* imaginary part of the diagonal: compiled out under HEMV */
  512. MADD xsum2, xsum2, alpha1, a2
  513. #endif
  514. ADD ysum1, ysum1, xsum1
  515. ADD ysum2, ysum2, xsum2
  516. ST ysum1, 0 * SIZE(TEMP)
  517. ST ysum2, 1 * SIZE(TEMP)
  518. .align 3
  /* If y was packed (INCY != 2 * SIZE), copy the result from Y1 back to Y with stride INCY. */
  519. .L900:
  520. li IS, 2 * SIZE
  521. beq INCY, IS, .L999 /* y was updated in place: nothing to copy */
  522. NOP
  523. dsra I, M, 2 /* I = M / 4 */
  524. blez I, .L905
  525. NOP
  526. .align 3
  527. .L902:
  528. LD a1, 0 * SIZE(Y1)
  529. LD a2, 1 * SIZE(Y1)
  530. LD a3, 2 * SIZE(Y1)
  531. LD a4, 3 * SIZE(Y1)
  532. LD a5, 4 * SIZE(Y1)
  533. LD a6, 5 * SIZE(Y1)
  534. LD a7, 6 * SIZE(Y1)
  535. LD a8, 7 * SIZE(Y1)
  536. ST a1, 0 * SIZE(Y)
  537. ST a2, 1 * SIZE(Y)
  538. daddu Y, Y, INCY
  539. ST a3, 0 * SIZE(Y)
  540. ST a4, 1 * SIZE(Y)
  541. daddu Y, Y, INCY
  542. ST a5, 0 * SIZE(Y)
  543. ST a6, 1 * SIZE(Y)
  544. daddu Y, Y, INCY
  545. ST a7, 0 * SIZE(Y)
  546. ST a8, 1 * SIZE(Y)
  547. daddu Y, Y, INCY
  548. daddiu I, I, -1
  549. bgtz I, .L902
  550. daddiu Y1, Y1, 8 * SIZE /* delay slot */
  551. .align 3
  /* Copy back the remaining M % 4 elements one at a time. */
  552. .L905:
  553. andi I, M, 3
  554. blez I, .L999
  555. NOP
  556. .align 3
  557. .L906:
  558. LD a1, 0 * SIZE(Y1)
  559. LD a2, 1 * SIZE(Y1)
  560. daddiu Y1, Y1, 2 * SIZE
  561. ST a1, 0 * SIZE(Y)
  562. ST a2, 1 * SIZE(Y)
  563. daddiu I, I, -1
  564. bgtz I, .L906
  565. daddu Y, Y, INCY /* delay slot */
  566. .align 3
  /* Common exit: reload every register spilled in the prologue and return. */
  567. .L999:
  568. LDARG $16, 0($sp)
  569. LDARG $17, 8($sp)
  570. LDARG $18, 16($sp)
  571. LDARG $19, 24($sp)
  572. ldc1 $f24, 32($sp)
  573. ldc1 $f25, 40($sp)
  574. #ifndef __64BIT__
  575. ldc1 $f20, 48($sp)
  576. ldc1 $f21, 56($sp)
  577. ldc1 $f22, 64($sp)
  578. ldc1 $f23, 72($sp)
  579. #endif
  580. j $31 /* return; the frame is released in the delay slot below */
  581. #ifdef __64BIT__
  582. daddiu $sp, $sp, 64
  583. #else
  584. daddiu $sp, $sp, 80
  585. #endif
  586. EPILOGUE