You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zgemv_n.S 18 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* Build configuration.  common.h is not shown here; it is expected to   */
  /* supply PROLOGUE/EPILOGUE/PROFCODE, LD/ST, MUL/ADD/SUB, SIZE and        */
  /* ZBASE_SHIFT, none of which are defined in this file.                   */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Bytes reserved on the stack by the prologue: eight 8-byte FP save slots. */
  40. #define STACKSIZE 64
  /* Look-ahead distance (in units of SIZE) used by the loads into $31/$f31. */
  41. #define PREFETCHSIZE 32
  /* Integer register aliases.  M, N and A are used without ever being      */
  /* loaded, so they arrive in registers; LDA, X, INCX, Y, INCY and BUFFER  */
  /* are loaded from the caller's stack area in the prologue.               */
  42. #define M $16
  43. #define N $17
  44. #define A $21
  45. #define LDA $18
  46. #define X $19
  47. #define INCX $20
  48. #define Y $22
  49. #define INCY $23
  50. #define BUFFER $24
  /* I = inner (row) loop counter, J = outer (column) loop counter. */
  51. #define I $25
  52. #define J $27
  /* Working pointers: Y1 walks the y vector, A1/A2 walk two columns of A. */
  53. #define Y1 $4
  54. #define A1 $5
  55. #define A2 $6
  /* alpha_r/alpha_i are read without being loaded (register arguments). */
  56. #define alpha_r $f19
  57. #define alpha_i $f20
  /* alpha1..alpha4 first receive two x entries (two values each) and are   */
  /* then overwritten with those entries multiplied by (alpha_r, alpha_i).  */
  58. #define alpha1 $f0
  59. #define alpha2 $f1
  60. #define alpha3 $f10
  61. #define alpha4 $f11
  /* y0..y7: values loaded from / stored back to the y vector. */
  62. #define y0 $f12
  63. #define y1 $f13
  64. #define y2 $f14
  65. #define y3 $f15
  66. #define y4 $f16
  67. #define y5 $f17
  68. #define y6 $f18
  69. #define y7 $f21
  /* a0..a7: values loaded from the matrix columns (A1/A2). */
  70. #define a0 $f22
  71. #define a1 $f23
  72. #define a2 $f24
  73. #define a3 $f25
  74. #define a4 $f26
  75. #define a5 $f27
  76. #define a6 $f28
  77. #define a7 $f29
  /* t0..t3: product temporaries.  They live in $f2..$f5, which the         */
  /* prologue saves and the epilogue restores.                              */
  78. #define t0 $f2
  79. #define t1 $f3
  80. #define t2 $f4
  81. #define t3 $f5
  /* ADD1..ADD4 select add or subtract for the four partial products of a   */
  /* complex multiply-accumulate, one variant per CONJ/XCONJ combination.   */
  /* In the loops ADD1/ADD2 fold in products with alpha1/alpha3 and         */
  /* ADD3/ADD4 fold in products with alpha2/alpha4.                         */
  82. #if !defined(CONJ) && !defined(XCONJ)
  83. #define ADD1 ADD
  84. #define ADD2 ADD
  85. #define ADD3 SUB
  86. #define ADD4 ADD
  87. #elif defined(CONJ) && !defined(XCONJ)
  88. #define ADD1 ADD
  89. #define ADD2 SUB
  90. #define ADD3 ADD
  91. #define ADD4 ADD
  92. #elif !defined(CONJ) && defined(XCONJ)
  93. #define ADD1 ADD
  94. #define ADD2 ADD
  95. #define ADD3 ADD
  96. #define ADD4 SUB
  97. #else
  98. #define ADD1 ADD
  99. #define ADD2 SUB
  100. #define ADD3 SUB
  101. #define ADD4 SUB
  102. #endif
  /* Function entry: open a STACKSIZE-byte frame, fetch the arguments that  */
  /* were passed on the stack, and save $f2..$f9 so they can be restored    */
  /* at $L999.                                                              */
  103. PROLOGUE
  104. lda $sp, -STACKSIZE($sp)
  /* Stack arguments sit just above the new frame, hence the + STACKSIZE. */
  105. ldq LDA, 0 + STACKSIZE($sp)
  106. ldq X, 8 + STACKSIZE($sp)
  107. ldq INCX, 16 + STACKSIZE($sp)
  108. ldq Y, 24 + STACKSIZE($sp)
  109. ldq INCY, 32 + STACKSIZE($sp)
  110. ldq BUFFER, 40 + STACKSIZE($sp)
  /* Save $f2..$f9.  $f2..$f5 are used below as t0..t3.                     */
  /* NOTE(review): $f6..$f9 are not referenced anywhere else in the visible */
  /* code; the saves look redundant unless a macro from common.h uses them. */
  111. stt $f2, 0($sp)
  112. stt $f3, 8($sp)
  113. stt $f4, 16($sp)
  114. stt $f5, 24($sp)
  115. stt $f6, 32($sp)
  116. stt $f7, 40($sp)
  117. stt $f8, 48($sp)
  118. stt $f9, 56($sp)
  119. PROFCODE
  /* Argument checks and stride setup.  Returns immediately (via $L999)     */
  /* when M <= 0 or N <= 0.  INCX, INCY and LDA are shifted left by         */
  /* ZBASE_SHIFT (presumably converting element counts to byte strides;     */
  /* ZBASE_SHIFT comes from common.h).                                      */
  120. cmple M, 0, $0
  121. sll INCX, ZBASE_SHIFT, INCX
  122. cmple N, 0, $1
  123. sll INCY, ZBASE_SHIFT, INCY
  124. or $0, $1, $0
  125. bne $0, $L999
  /* If the scaled INCY equals 2 * SIZE, y is updated in place: skip to $L10. */
  126. cmpeq INCY, 2 * SIZE, $0
  127. sll LDA, ZBASE_SHIFT,LDA
  128. bne $0, $L10
  /* Otherwise swap the roles of Y and BUFFER: from here on Y points at the */
  /* work buffer and the BUFFER register holds the caller's original Y.     */
  /* Y1 is left pointing at the start of the work buffer for the zero fill. */
  129. mov BUFFER, Y1
  130. mov Y, BUFFER
  131. mov Y1, Y
  /* Zero-fill the work buffer, 8 values per pass for M >> 2 passes. */
  132. sra M, 2, I
  133. ble I, $L05
  134. .align 4
  135. $L02:
  /* $f31 is the always-zero FP register on Alpha, so these store zeros. */
  136. ST $f31, 0 * SIZE(Y1)
  137. ST $f31, 1 * SIZE(Y1)
  138. ST $f31, 2 * SIZE(Y1)
  139. ST $f31, 3 * SIZE(Y1)
  140. ST $f31, 4 * SIZE(Y1)
  141. ST $f31, 5 * SIZE(Y1)
  142. ST $f31, 6 * SIZE(Y1)
  143. ST $f31, 7 * SIZE(Y1)
  144. lda Y1, 8 * SIZE(Y1)
  145. lda I, -1(I)
  146. bgt I, $L02
  147. .align 4
  148. $L05:
  /* Remaining M & 3 entries: two zero stores per pass. */
  149. and M, 3, I
  150. ble I, $L10
  151. .align 4
  152. $L06:
  153. ST $f31, 0 * SIZE(Y1)
  154. ST $f31, 1 * SIZE(Y1)
  155. addq Y1, 2 * SIZE, Y1
  156. lda I, -1(I)
  157. bgt I, $L06
  158. .align 4
  /* Outer loop over pairs of columns: J = N >> 1 passes; an odd last       */
  /* column is handled at $L20.                                             */
  159. $L10:
  160. sra N, 1, J
  161. ble J, $L20
  162. .align 4
  163. $L11:
  /* Load two consecutive x entries (two values each), stepping X by INCX. */
  164. LD alpha1, 0 * SIZE(X)
  165. LD alpha2, 1 * SIZE(X)
  166. addq X, INCX, X
  167. LD alpha3, 0 * SIZE(X)
  168. LD alpha4, 1 * SIZE(X)
  169. addq X, INCX, X
  /* Form the four partial products of (alpha_r, alpha_i) with each x entry. */
  /* Interleaved with that: A1 = current column, A2 = A1 + LDA, and A is    */
  /* advanced by two columns for the next pass; Y1 restarts at Y.           */
  170. MUL alpha_r, alpha1, y0
  171. MUL alpha_r, alpha2, y1
  172. MUL alpha_r, alpha3, y2
  173. MUL alpha_r, alpha4, y3
  174. MUL alpha_i, alpha2, t0
  175. mov A, A1
  176. MUL alpha_i, alpha1, t1
  177. addq A, LDA, A2
  178. MUL alpha_i, alpha4, t2
  179. addq A2, LDA, A
  180. MUL alpha_i, alpha3, t3
  181. mov Y, Y1
  /* Combine the partial products; alpha1..alpha4 now hold the scaled x     */
  /* entries.  XCONJ flips the signs of the cross terms.                    */
  182. #ifndef XCONJ
  183. SUB y0, t0, alpha1
  184. ADD y1, t1, alpha2
  185. SUB y2, t2, alpha3
  186. ADD y3, t3, alpha4
  187. #else
  188. ADD y0, t0, alpha1
  189. SUB y1, t1, alpha2
  190. ADD y2, t2, alpha3
  191. SUB y3, t3, alpha4
  192. #endif
  /* Load into $31 discards the result; presumably a prefetch hint for X. */
  193. ldl $31, 4 * SIZE(X)
  /* I = number of 4-row blocks; with none, go straight to the remainders. */
  194. sra M, 2, I
  195. ble I, $L15
  /* Pipeline fill for the two-column loop.  Loads offsets 0..3 of both     */
  /* columns and y0..y7, completes y0..y3 for the first 4-row block, and    */
  /* reloads a0..a7 from offsets 4..7 while starting the alpha1 products    */
  /* that $L12/$L13 fold into y4..y7.                                       */
  196. LD a0, 0 * SIZE(A1)
  197. LD a1, 1 * SIZE(A1)
  198. LD a2, 2 * SIZE(A1)
  199. LD a3, 3 * SIZE(A1)
  200. LD a4, 0 * SIZE(A2)
  201. LD a5, 1 * SIZE(A2)
  202. LD a6, 2 * SIZE(A2)
  203. LD a7, 3 * SIZE(A2)
  204. MUL alpha1, a0, t0
  205. LD y0, 0 * SIZE(Y1)
  206. MUL alpha1, a1, t1
  207. LD y1, 1 * SIZE(Y1)
  208. MUL alpha1, a2, t2
  209. LD y2, 2 * SIZE(Y1)
  210. MUL alpha1, a3, t3
  211. LD y3, 3 * SIZE(Y1)
  /* Column A1 times alpha1 into y0..y3; start column A2 times alpha3. */
  212. ADD1 y0, t0, y0
  213. unop
  214. MUL alpha3, a4, t0
  215. LD y4, 4 * SIZE(Y1)
  216. ADD2 y1, t1, y1
  217. unop
  218. MUL alpha3, a5, t1
  219. LD y5, 5 * SIZE(Y1)
  220. ADD1 y2, t2, y2
  221. unop
  222. MUL alpha3, a6, t2
  223. LD y6, 6 * SIZE(Y1)
  224. ADD2 y3, t3, y3
  225. unop
  226. MUL alpha3, a7, t3
  227. LD y7, 7 * SIZE(Y1)
  /* Cross terms with alpha2 (operands swapped pairwise: a1/a0, a3/a2);     */
  /* each a register is reloaded from offsets 4..7 once it has been used.   */
  228. ADD1 y0, t0, y0
  229. unop
  230. MUL alpha2, a1, t0
  231. LD a1, 5 * SIZE(A1)
  232. ADD2 y1, t1, y1
  233. unop
  234. MUL alpha2, a0, t1
  235. LD a0, 4 * SIZE(A1)
  236. ADD1 y2, t2, y2
  237. unop
  238. MUL alpha2, a3, t2
  239. LD a3, 7 * SIZE(A1)
  240. ADD2 y3, t3, y3
  241. unop
  242. MUL alpha2, a2, t3
  243. LD a2, 6 * SIZE(A1)
  /* Cross terms with alpha4 for column A2, reloading a4..a7 likewise. */
  244. ADD3 y0, t0, y0
  245. unop
  246. MUL alpha4, a5, t0
  247. LD a5, 5 * SIZE(A2)
  248. ADD4 y1, t1, y1
  249. unop
  250. MUL alpha4, a4, t1
  251. LD a4, 4 * SIZE(A2)
  252. ADD3 y2, t2, y2
  253. unop
  254. MUL alpha4, a7, t2
  255. LD a7, 7 * SIZE(A2)
  256. ADD4 y3, t3, y3
  257. unop
  258. MUL alpha4, a6, t3
  259. LD a6, 6 * SIZE(A2)
  /* y0..y3 are complete after these adds; t0..t3 are primed for y4..y7. */
  260. ADD3 y0, t0, y0
  261. MUL alpha1, a0, t0
  262. ADD4 y1, t1, y1
  263. MUL alpha1, a1, t1
  264. ADD3 y2, t2, y2
  265. unop
  266. MUL alpha1, a2, t2
  267. unop
  268. ADD4 y3, t3, y3
  /* One block is now in flight; if it was the only one, drain at $L13. */
  269. lda I, -1(I)
  270. MUL alpha1, a3, t3
  271. ble I, $L13
  272. .align 4
  /* Steady-state loop for two columns, one 4-row block per pass.           */
  /* Each pass stores the finished y0..y3, completes and stores y4..y7,     */
  /* computes y0..y3 of the next block, and advances Y1, A1 and A2 by       */
  /* 8 * SIZE.  Loads into $31/$f31 discard their result (presumably        */
  /* prefetch hints PREFETCHSIZE elements ahead).                           */
  273. $L12:
  274. ADD1 y4, t0, y4
  275. ST y0, 0 * SIZE(Y1)
  276. MUL alpha3, a4, t0
  277. ldl $31, (PREFETCHSIZE + 0) * SIZE(A1)
  278. ADD2 y5, t1, y5
  279. ST y1, 1 * SIZE(Y1)
  280. MUL alpha3, a5, t1
  281. lda I, -1(I)
  282. ADD1 y6, t2, y6
  283. ST y2, 2 * SIZE(Y1)
  284. MUL alpha3, a6, t2
  285. unop
  286. ADD2 y7, t3, y7
  287. ST y3, 3 * SIZE(Y1)
  288. MUL alpha3, a7, t3
  289. unop
  /* Second half of the current block: cross terms; a0..a7 are reloaded     */
  /* from offsets 8..11 (first half of the next block) as they are used.    */
  290. ADD1 y4, t0, y4
  291. unop
  292. MUL alpha2, a1, t0
  293. LD a1, 9 * SIZE(A1)
  294. ADD2 y5, t1, y5
  295. unop
  296. MUL alpha2, a0, t1
  297. LD a0, 8 * SIZE(A1)
  298. ADD1 y6, t2, y6
  299. unop
  300. MUL alpha2, a3, t2
  301. LD a3, 11 * SIZE(A1)
  302. ADD2 y7, t3, y7
  303. unop
  304. MUL alpha2, a2, t3
  305. LD a2, 10 * SIZE(A1)
  306. ADD3 y4, t0, y4
  307. lds $f31, (PREFETCHSIZE + 0) * SIZE(Y1)
  308. MUL alpha4, a5, t0
  309. LD a5, 9 * SIZE(A2)
  310. ADD4 y5, t1, y5
  311. unop
  312. MUL alpha4, a4, t1
  313. LD a4, 8 * SIZE(A2)
  314. ADD3 y6, t2, y6
  315. unop
  316. MUL alpha4, a7, t2
  317. LD a7, 11 * SIZE(A2)
  318. ADD4 y7, t3, y7
  319. unop
  320. MUL alpha4, a6, t3
  321. LD a6, 10 * SIZE(A2)
  /* Finish y4..y7 and begin the next block: load its y0..y3 (offsets       */
  /* 8..11, Y1 not yet advanced) and start the alpha1 products.             */
  322. ADD3 y4, t0, y4
  323. unop
  324. MUL alpha1, a0, t0
  325. LD y0, 8 * SIZE(Y1)
  326. ADD4 y5, t1, y5
  327. unop
  328. MUL alpha1, a1, t1
  329. LD y1, 9 * SIZE(Y1)
  330. ADD3 y6, t2, y6
  331. unop
  332. MUL alpha1, a2, t2
  333. LD y2, 10 * SIZE(Y1)
  334. ADD4 y7, t3, y7
  335. unop
  336. MUL alpha1, a3, t3
  337. LD y3, 11 * SIZE(Y1)
  /* Store the finished y4..y7, then advance Y1 to the next block. */
  338. ADD1 y0, t0, y0
  339. ST y4, 4 * SIZE(Y1)
  340. MUL alpha3, a4, t0
  341. ldl $31, (PREFETCHSIZE + 0) * SIZE(A2)
  342. ADD2 y1, t1, y1
  343. ST y5, 5 * SIZE(Y1)
  344. MUL alpha3, a5, t1
  345. unop
  346. ADD1 y2, t2, y2
  347. ST y6, 6 * SIZE(Y1)
  348. MUL alpha3, a6, t2
  349. unop
  350. ADD2 y3, t3, y3
  351. ST y7, 7 * SIZE(Y1)
  352. MUL alpha3, a7, t3
  353. lda Y1, 8 * SIZE(Y1)
  /* Cross terms for the new y0..y3; a0..a7 reloaded from offsets 12..15    */
  /* (A1/A2 are only advanced further down).                                */
  354. ADD1 y0, t0, y0
  355. unop
  356. MUL alpha2, a1, t0
  357. LD a1, 13 * SIZE(A1)
  358. ADD2 y1, t1, y1
  359. unop
  360. MUL alpha2, a0, t1
  361. LD a0, 12 * SIZE(A1)
  362. ADD1 y2, t2, y2
  363. unop
  364. MUL alpha2, a3, t2
  365. LD a3, 15 * SIZE(A1)
  366. ADD2 y3, t3, y3
  367. unop
  368. MUL alpha2, a2, t3
  369. LD a2, 14 * SIZE(A1)
  370. ADD3 y0, t0, y0
  371. unop
  372. MUL alpha4, a5, t0
  373. LD a5, 13 * SIZE(A2)
  374. ADD4 y1, t1, y1
  375. unop
  376. MUL alpha4, a4, t1
  377. LD a4, 12 * SIZE(A2)
  378. ADD3 y2, t2, y2
  379. unop
  380. MUL alpha4, a7, t2
  381. LD a7, 15 * SIZE(A2)
  382. ADD4 y3, t3, y3
  383. unop
  384. MUL alpha4, a6, t3
  385. LD a6, 14 * SIZE(A2)
  /* Y1 already points at the new block, so y4..y7 are at offsets 4..7.     */
  /* A1 and A2 are advanced here, after all their loads for this pass.      */
  386. ADD3 y0, t0, y0
  387. unop
  388. MUL alpha1, a0, t0
  389. LD y4, 4 * SIZE(Y1)
  390. ADD4 y1, t1, y1
  391. lda A2, 8 * SIZE(A2)
  392. MUL alpha1, a1, t1
  393. LD y5, 5 * SIZE(Y1)
  394. ADD3 y2, t2, y2
  395. lda A1, 8 * SIZE(A1)
  396. MUL alpha1, a2, t2
  397. LD y6, 6 * SIZE(Y1)
  398. ADD4 y3, t3, y3
  399. MUL alpha1, a3, t3
  400. LD y7, 7 * SIZE(Y1)
  401. bgt I, $L12
  402. .align 4
  /* Pipeline drain for the two-column loop: store the finished y0..y3,     */
  /* complete y4..y7 of the last 4-row block from the a0..a7 already in     */
  /* registers (no further loads from A), store them, and advance           */
  /* A1, A2 and Y1 by 8 * SIZE.                                             */
  403. $L13:
  404. ADD1 y4, t0, y4
  405. ST y0, 0 * SIZE(Y1)
  406. MUL alpha3, a4, t0
  407. unop
  408. ADD2 y5, t1, y5
  409. ST y1, 1 * SIZE(Y1)
  410. MUL alpha3, a5, t1
  411. unop
  412. ADD1 y6, t2, y6
  413. ST y2, 2 * SIZE(Y1)
  414. MUL alpha3, a6, t2
  415. unop
  416. ADD2 y7, t3, y7
  417. ST y3, 3 * SIZE(Y1)
  418. MUL alpha3, a7, t3
  419. unop
  420. ADD1 y4, t0, y4
  421. MUL alpha2, a1, t0
  422. ADD2 y5, t1, y5
  423. MUL alpha2, a0, t1
  424. ADD1 y6, t2, y6
  425. MUL alpha2, a3, t2
  426. ADD2 y7, t3, y7
  427. MUL alpha2, a2, t3
  428. ADD3 y4, t0, y4
  429. MUL alpha4, a5, t0
  430. ADD4 y5, t1, y5
  431. MUL alpha4, a4, t1
  432. ADD3 y6, t2, y6
  433. MUL alpha4, a7, t2
  434. ADD4 y7, t3, y7
  435. MUL alpha4, a6, t3
  436. ADD3 y4, t0, y4
  437. ADD4 y5, t1, y5
  438. ADD3 y6, t2, y6
  439. ADD4 y7, t3, y7
  440. ST y4, 4 * SIZE(Y1)
  441. lda A1, 8 * SIZE(A1)
  442. ST y5, 5 * SIZE(Y1)
  443. lda A2, 8 * SIZE(A2)
  444. ST y6, 6 * SIZE(Y1)
  445. unop
  446. ST y7, 7 * SIZE(Y1)
  447. lda Y1, 8 * SIZE(Y1)
  448. .align 4
  /* Two-row remainder for the column pair (taken when bit 1 of M is set):  */
  /* updates y0..y3 from four values of each column, without pipelining,    */
  /* then advances A1, A2 and Y1 by 4 * SIZE.                               */
  449. $L15:
  450. and M, 2, I
  451. ble I, $L17
  452. LD a0, 0 * SIZE(A1)
  453. LD a1, 1 * SIZE(A1)
  454. LD a2, 2 * SIZE(A1)
  455. LD a3, 3 * SIZE(A1)
  456. LD a4, 0 * SIZE(A2)
  457. LD a5, 1 * SIZE(A2)
  458. LD a6, 2 * SIZE(A2)
  459. LD a7, 3 * SIZE(A2)
  460. MUL alpha1, a0, t0
  461. LD y0, 0 * SIZE(Y1)
  462. MUL alpha1, a1, t1
  463. LD y1, 1 * SIZE(Y1)
  464. MUL alpha1, a2, t2
  465. LD y2, 2 * SIZE(Y1)
  466. MUL alpha1, a3, t3
  467. LD y3, 3 * SIZE(Y1)
  /* Same order as the main loop: alpha1, alpha3, then the cross terms      */
  /* with alpha2 and alpha4.                                                */
  468. ADD1 y0, t0, y0
  469. MUL alpha3, a4, t0
  470. ADD2 y1, t1, y1
  471. MUL alpha3, a5, t1
  472. ADD1 y2, t2, y2
  473. MUL alpha3, a6, t2
  474. ADD2 y3, t3, y3
  475. MUL alpha3, a7, t3
  476. ADD1 y0, t0, y0
  477. MUL alpha2, a1, t0
  478. ADD2 y1, t1, y1
  479. MUL alpha2, a0, t1
  480. ADD1 y2, t2, y2
  481. MUL alpha2, a3, t2
  482. ADD2 y3, t3, y3
  483. MUL alpha2, a2, t3
  484. ADD3 y0, t0, y0
  485. MUL alpha4, a5, t0
  486. ADD4 y1, t1, y1
  487. MUL alpha4, a4, t1
  488. ADD3 y2, t2, y2
  489. MUL alpha4, a7, t2
  490. ADD4 y3, t3, y3
  491. MUL alpha4, a6, t3
  492. ADD3 y0, t0, y0
  493. ADD4 y1, t1, y1
  494. ADD3 y2, t2, y2
  495. ADD4 y3, t3, y3
  496. ST y0, 0 * SIZE(Y1)
  497. lda A1, 4 * SIZE(A1)
  498. ST y1, 1 * SIZE(Y1)
  499. lda A2, 4 * SIZE(A2)
  500. ST y2, 2 * SIZE(Y1)
  501. unop
  502. ST y3, 3 * SIZE(Y1)
  503. lda Y1, 4 * SIZE(Y1)
  504. .align 4
  /* One-row remainder for the column pair: skipped when the low bit of M   */
  /* is clear.  Updates y0/y1 from two values of each column.               */
  505. $L17:
  506. blbc M, $L18
  507. LD a0, 0 * SIZE(A1)
  508. LD a1, 1 * SIZE(A1)
  509. LD a2, 0 * SIZE(A2)
  510. LD a3, 1 * SIZE(A2)
  511. LD y0, 0 * SIZE(Y1)
  512. LD y1, 1 * SIZE(Y1)
  513. MUL alpha1, a0, t0
  514. MUL alpha1, a1, t1
  515. ADD1 y0, t0, y0
  516. MUL alpha3, a2, t0
  517. ADD2 y1, t1, y1
  518. MUL alpha3, a3, t1
  519. ADD1 y0, t0, y0
  520. MUL alpha2, a1, t0
  521. ADD2 y1, t1, y1
  522. MUL alpha2, a0, t1
  523. ADD3 y0, t0, y0
  524. MUL alpha4, a3, t0
  525. ADD4 y1, t1, y1
  526. MUL alpha4, a2, t1
  527. ADD3 y0, t0, y0
  528. ADD4 y1, t1, y1
  529. ST y0, 0 * SIZE(Y1)
  530. ST y1, 1 * SIZE(Y1)
  531. .align 4
  /* End of one column pair: count down J and repeat from $L11. */
  532. $L18:
  533. lda J, -1(J)
  534. bgt J, $L11
  535. .align 4
  /* Odd last column: skipped (to $L990) when the low bit of N is clear.    */
  /* Loads one x entry, multiplies it by (alpha_r, alpha_i) into            */
  /* alpha1/alpha2, and points A1 at the column and Y1 at Y.                */
  536. $L20:
  537. blbc N, $L990
  538. LD alpha1, 0 * SIZE(X)
  539. LD alpha2, 1 * SIZE(X)
  540. MUL alpha_r, alpha1, y0
  541. MUL alpha_r, alpha2, y1
  542. MUL alpha_i, alpha2, t0
  543. mov A, A1
  544. MUL alpha_i, alpha1, t1
  545. mov Y, Y1
  /* XCONJ flips the signs of the cross terms, as in the two-column path. */
  546. #ifndef XCONJ
  547. SUB y0, t0, alpha1
  548. ADD y1, t1, alpha2
  549. #else
  550. ADD y0, t0, alpha1
  551. SUB y1, t1, alpha2
  552. #endif
  /* I = number of 4-row blocks; with none, go straight to the remainders. */
  553. sra M, 2, I
  554. ble I, $L25
  /* Pipeline fill for the single-column loop: loads offsets 0..7 of the    */
  /* column and y0..y7, completes y0..y3 of the first 4-row block, and      */
  /* starts the alpha1 products for y4..y7.                                 */
  555. LD a0, 0 * SIZE(A1)
  556. LD a1, 1 * SIZE(A1)
  557. LD a2, 2 * SIZE(A1)
  558. LD a3, 3 * SIZE(A1)
  559. LD y0, 0 * SIZE(Y1)
  560. LD y1, 1 * SIZE(Y1)
  561. LD y2, 2 * SIZE(Y1)
  562. LD y3, 3 * SIZE(Y1)
  563. MUL alpha1, a0, t0
  564. LD a4, 4 * SIZE(A1)
  565. MUL alpha1, a1, t1
  566. LD a5, 5 * SIZE(A1)
  567. MUL alpha1, a2, t2
  568. LD a6, 6 * SIZE(A1)
  569. MUL alpha1, a3, t3
  570. LD a7, 7 * SIZE(A1)
  /* NOTE(review): a0..a3 are reloaded from offsets 8..11 before the block  */
  /* count is checked.  When only one 4-row block exists the code falls to  */
  /* $L23, which never reads a0..a3, so these loads reach past the rows     */
  /* actually processed.  Confirm the caller's allocation makes that safe.  */
  571. ADD1 y0, t0, y0
  572. unop
  573. MUL alpha2, a1, t0
  574. LD a1, 9 * SIZE(A1)
  575. ADD2 y1, t1, y1
  576. unop
  577. MUL alpha2, a0, t1
  578. LD a0, 8 * SIZE(A1)
  579. ADD1 y2, t2, y2
  580. unop
  581. MUL alpha2, a3, t2
  582. LD a3, 11 * SIZE(A1)
  583. ADD2 y3, t3, y3
  584. unop
  585. MUL alpha2, a2, t3
  586. LD a2, 10 * SIZE(A1)
  /* y0..y3 complete; load y4..y7 and prime t0..t3 with alpha1 * a4..a7. */
  587. ADD3 y0, t0, y0
  588. unop
  589. LD y4, 4 * SIZE(Y1)
  590. MUL alpha1, a4, t0
  591. ADD4 y1, t1, y1
  592. unop
  593. LD y5, 5 * SIZE(Y1)
  594. MUL alpha1, a5, t1
  595. ADD3 y2, t2, y2
  596. LD y6, 6 * SIZE(Y1)
  597. MUL alpha1, a6, t2
  598. lda I, -1(I)
  599. ADD4 y3, t3, y3
  600. LD y7, 7 * SIZE(Y1)
  601. MUL alpha1, a7, t3
  /* If that was the only 4-row block, drain at $L23. */
  602. ble I, $L23
  603. .align 4
  /* Steady-state loop for the single column, one 4-row block per pass:     */
  /* stores the finished y0..y3, completes and stores y4..y7, computes      */
  /* y0..y3 of the next block and primes its y4..y7, advancing A1 and Y1    */
  /* by 8 * SIZE.  Loads into $31 discard their result (presumably          */
  /* prefetch hints).                                                       */
  604. $L22:
  605. ADD1 y4, t0, y4
  606. ST y0, 0 * SIZE(Y1)
  607. MUL alpha2, a5, t0
  608. LD a5, 13 * SIZE(A1)
  609. ADD2 y5, t1, y5
  610. ST y1, 1 * SIZE(Y1)
  611. MUL alpha2, a4, t1
  612. LD a4, 12 * SIZE(A1)
  613. ADD1 y6, t2, y6
  614. ST y2, 2 * SIZE(Y1)
  615. MUL alpha2, a7, t2
  616. LD a7, 15 * SIZE(A1)
  617. ADD2 y7, t3, y7
  618. ST y3, 3 * SIZE(Y1)
  619. MUL alpha2, a6, t3
  620. LD a6, 14 * SIZE(A1)
  /* Finish y4..y7; load the next block's y0..y3 (offsets 8..11). */
  621. ADD3 y4, t0, y4
  622. LD y0, 8 * SIZE(Y1)
  623. MUL alpha1, a0, t0
  624. ldl $31, (PREFETCHSIZE + 0) * SIZE(A1)
  625. ADD4 y5, t1, y5
  626. LD y1, 9 * SIZE(Y1)
  627. MUL alpha1, a1, t1
  628. lda I, -1(I)
  629. ADD3 y6, t2, y6
  630. LD y2, 10 * SIZE(Y1)
  631. MUL alpha1, a2, t2
  632. unop
  633. ADD4 y7, t3, y7
  634. LD y3, 11 * SIZE(Y1)
  635. MUL alpha1, a3, t3
  636. unop
  /* NOTE(review): a0..a3 are reloaded from offsets 16..19, one block       */
  /* further ahead than the block being started; on the final pass these    */
  /* values are never used (see $L23).  Confirm reading there is safe.      */
  637. ADD1 y0, t0, y0
  638. ST y4, 4 * SIZE(Y1)
  639. MUL alpha2, a1, t0
  640. LD a1, 17 * SIZE(A1)
  641. ADD2 y1, t1, y1
  642. ST y5, 5 * SIZE(Y1)
  643. MUL alpha2, a0, t1
  644. LD a0, 16 * SIZE(A1)
  645. ADD1 y2, t2, y2
  646. ST y6, 6 * SIZE(Y1)
  647. MUL alpha2, a3, t2
  648. LD a3, 19 * SIZE(A1)
  649. ADD2 y3, t3, y3
  650. ST y7, 7 * SIZE(Y1)
  651. MUL alpha2, a2, t3
  652. LD a2, 18 * SIZE(A1)
  /* Load the next block's y4..y7.  y4..y6 use offsets 12..14 because Y1    */
  /* has not moved yet; y7 is loaded after Y1 advances, hence offset 7.     */
  653. ADD3 y0, t0, y0
  654. LD y4, 12 * SIZE(Y1)
  655. MUL alpha1, a4, t0
  656. ldl $31, (PREFETCHSIZE + 0) * SIZE(Y1)
  657. ADD4 y1, t1, y1
  658. LD y5, 13 * SIZE(Y1)
  659. MUL alpha1, a5, t1
  660. lda A1, 8 * SIZE(A1)
  661. ADD3 y2, t2, y2
  662. LD y6, 14 * SIZE(Y1)
  663. MUL alpha1, a6, t2
  664. lda Y1, 8 * SIZE(Y1)
  665. ADD4 y3, t3, y3
  666. LD y7, 7 * SIZE(Y1)
  667. MUL alpha1, a7, t3
  668. bgt I, $L22
  669. .align 4
  /* Pipeline drain for the single column: store the finished y0..y3,       */
  /* complete y4..y7 using only a4..a7 (a0..a3 are not read here), store    */
  /* them, and advance A1 and Y1 by 8 * SIZE.                               */
  670. $L23:
  671. ADD1 y4, t0, y4
  672. ST y0, 0 * SIZE(Y1)
  673. MUL alpha2, a5, t0
  674. unop
  675. ADD2 y5, t1, y5
  676. ST y1, 1 * SIZE(Y1)
  677. MUL alpha2, a4, t1
  678. unop
  679. ADD1 y6, t2, y6
  680. ST y2, 2 * SIZE(Y1)
  681. MUL alpha2, a7, t2
  682. unop
  683. ADD2 y7, t3, y7
  684. ST y3, 3 * SIZE(Y1)
  685. MUL alpha2, a6, t3
  686. unop
  687. ADD3 y4, t0, y4
  688. ADD4 y5, t1, y5
  689. ADD3 y6, t2, y6
  690. ADD4 y7, t3, y7
  691. ST y4, 4 * SIZE(Y1)
  692. unop
  693. ST y5, 5 * SIZE(Y1)
  694. unop
  695. ST y6, 6 * SIZE(Y1)
  696. lda A1, 8 * SIZE(A1)
  697. ST y7, 7 * SIZE(Y1)
  698. lda Y1, 8 * SIZE(Y1)
  699. .align 4
  /* Two-row remainder for the single column (taken when bit 1 of M is      */
  /* set): updates y0..y3 from four values of A1, then advances A1 and Y1   */
  /* by 4 * SIZE.                                                           */
  700. $L25:
  701. and M, 2, I
  702. ble I, $L27
  703. LD a0, 0 * SIZE(A1)
  704. LD a1, 1 * SIZE(A1)
  705. LD a2, 2 * SIZE(A1)
  706. LD a3, 3 * SIZE(A1)
  707. MUL alpha1, a0, t0
  708. LD y0, 0 * SIZE(Y1)
  709. MUL alpha1, a1, t1
  710. LD y1, 1 * SIZE(Y1)
  711. MUL alpha1, a2, t2
  712. LD y2, 2 * SIZE(Y1)
  713. MUL alpha1, a3, t3
  714. LD y3, 3 * SIZE(Y1)
  /* alpha1 products first, then the alpha2 cross terms (a1/a0, a3/a2). */
  715. ADD1 y0, t0, y0
  716. MUL alpha2, a1, t0
  717. ADD2 y1, t1, y1
  718. MUL alpha2, a0, t1
  719. ADD1 y2, t2, y2
  720. MUL alpha2, a3, t2
  721. ADD2 y3, t3, y3
  722. MUL alpha2, a2, t3
  723. ADD3 y0, t0, y0
  724. ADD4 y1, t1, y1
  725. ADD3 y2, t2, y2
  726. ADD4 y3, t3, y3
  727. ST y0, 0 * SIZE(Y1)
  728. ST y1, 1 * SIZE(Y1)
  729. ST y2, 2 * SIZE(Y1)
  730. lda A1, 4 * SIZE(A1)
  731. ST y3, 3 * SIZE(Y1)
  732. lda Y1, 4 * SIZE(Y1)
  733. .align 4
  /* One-row remainder for the single column: skipped when the low bit of   */
  /* M is clear.  Updates y0/y1 from two values of A1.                      */
  734. $L27:
  735. blbc M, $L990
  736. LD a0, 0 * SIZE(A1)
  737. LD a1, 1 * SIZE(A1)
  738. MUL alpha1, a0, t0
  739. LD y0, 0 * SIZE(Y1)
  740. MUL alpha1, a1, t1
  741. LD y1, 1 * SIZE(Y1)
  742. ADD1 y0, t0, y0
  743. MUL alpha2, a1, t0
  744. ADD2 y1, t1, y1
  745. MUL alpha2, a0, t1
  746. ADD3 y0, t0, y0
  747. ADD4 y1, t1, y1
  748. ST y0, 0 * SIZE(Y1)
  749. ST y1, 1 * SIZE(Y1)
  750. .align 4
  /* Write-back for the strided-y case.  When the scaled INCY equals        */
  /* 2 * SIZE the result was accumulated in place and there is nothing to   */
  /* do.  Otherwise the entry code swapped the pointers: the BUFFER         */
  /* register holds the caller's y (stride INCY) and Y holds the            */
  /* contiguous work buffer.  Each caller entry is read via BUFFER, added   */
  /* to the matching work-buffer entry, and stored back via Y1 (which       */
  /* starts at the same address and steps by INCY).                         */
  751. $L990:
  752. cmpeq INCY, 2 * SIZE, $0
  753. bne $0, $L999
  754. mov BUFFER, Y1
  755. sra M, 2, I
  756. ble I, $L995
  757. .align 4
  /* Four entries (eight values) per pass, M >> 2 passes. */
  758. $L992:
  759. LD a0, 0 * SIZE(BUFFER)
  760. LD a1, 1 * SIZE(BUFFER)
  761. addq BUFFER, INCY, BUFFER
  762. LD a2, 0 * SIZE(BUFFER)
  763. LD a3, 1 * SIZE(BUFFER)
  764. addq BUFFER, INCY, BUFFER
  765. LD y0, 0 * SIZE(Y)
  766. LD y1, 1 * SIZE(Y)
  767. LD y2, 2 * SIZE(Y)
  768. LD y3, 3 * SIZE(Y)
  769. LD a4, 0 * SIZE(BUFFER)
  770. LD a5, 1 * SIZE(BUFFER)
  771. addq BUFFER, INCY, BUFFER
  772. LD a6, 0 * SIZE(BUFFER)
  773. LD a7, 1 * SIZE(BUFFER)
  774. addq BUFFER, INCY, BUFFER
  775. LD y4, 4 * SIZE(Y)
  776. LD y5, 5 * SIZE(Y)
  777. LD y6, 6 * SIZE(Y)
  778. LD y7, 7 * SIZE(Y)
  779. ADD a0, y0, a0
  780. ADD a1, y1, a1
  781. ADD a2, y2, a2
  782. ADD a3, y3, a3
  783. ST a0, 0 * SIZE(Y1)
  784. ADD a4, y4, a4
  785. ST a1, 1 * SIZE(Y1)
  786. ADD a5, y5, a5
  787. addq Y1, INCY, Y1
  788. ST a2, 0 * SIZE(Y1)
  789. ADD a6, y6, a6
  790. ST a3, 1 * SIZE(Y1)
  791. ADD a7, y7, a7
  792. addq Y1, INCY, Y1
  793. ST a4, 0 * SIZE(Y1)
  794. ST a5, 1 * SIZE(Y1)
  795. addq Y1, INCY, Y1
  796. ST a6, 0 * SIZE(Y1)
  797. ST a7, 1 * SIZE(Y1)
  798. addq Y1, INCY, Y1
  799. lda I, -1(I)
  800. lda Y, 8 * SIZE(Y)
  801. bgt I, $L992
  802. .align 4
  /* Remaining M & 3 entries, one per pass. */
  803. $L995:
  804. and M, 3, I
  805. ble I, $L999
  806. .align 4
  807. $L996:
  808. LD a0, 0 * SIZE(BUFFER)
  809. LD a1, 1 * SIZE(BUFFER)
  810. addq BUFFER, INCY, BUFFER
  811. LD y0, 0 * SIZE(Y)
  812. LD y1, 1 * SIZE(Y)
  813. lda Y, 2 * SIZE(Y)
  814. ADD a0, y0, a0
  815. ADD a1, y1, a1
  816. ST a0, 0 * SIZE(Y1)
  817. ST a1, 1 * SIZE(Y1)
  818. addq Y1, INCY, Y1
  819. lda I, -1(I)
  820. bgt I, $L996
  821. .align 4
  /* Common exit: restore $f2..$f9 from the slots written in the prologue,  */
  /* release the STACKSIZE-byte frame and return.  Also reached directly    */
  /* from the M <= 0 / N <= 0 early-out.                                    */
  822. $L999:
  823. ldt $f2, 0($sp)
  824. ldt $f3, 8($sp)
  825. ldt $f4, 16($sp)
  826. ldt $f5, 24($sp)
  827. ldt $f6, 32($sp)
  828. ldt $f7, 40($sp)
  829. ldt $f8, 48($sp)
  830. ldt $f9, 56($sp)
  831. lda $sp, STACKSIZE($sp)
  832. ret
  833. EPILOGUE