You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zgemv_t.S 15 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* Build-time constants and register aliases used by the routine below.  */
  /* ASSEMBLER is defined before including common.h -- presumably so the   */
  /* header exposes only its assembly-safe parts; confirm in common.h.     */
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #define STACKSIZE 64 /* frame size in bytes: eight 8-byte slots holding the saved $f2-$f9 */
  41. #define PREFETCHSIZE 32 /* look-ahead distance, in units of SIZE, of the `ldl $31, ...` loads in the loops */
  /* Integer registers. M, N and A are used as they arrive; LDA, X, INCX,  */
  /* Y, INCY and BUFFER are loaded from the caller's stack on entry.       */
  42. #define M $16 /* inner-loop length: element pairs consumed per column */
  43. #define N $17 /* number of columns processed by the outer loop */
  44. #define A $21 /* pointer to the next column(s) of the matrix; advanced by LDA per column */
  45. #define LDA $18 /* column stride of A; shifted left by ZBASE_SHIFT on entry */
  46. #define X $19 /* source vector; redirected to BUFFER when the vector is packed */
  47. #define INCX $20 /* stride of X; shifted left by ZBASE_SHIFT on entry */
  48. #define Y $22 /* result vector, read side */
  49. #define INCY $23 /* stride of Y; shifted left by ZBASE_SHIFT on entry */
  50. #define BUFFER $24 /* scratch area that receives a contiguous copy of X when INCX != 2 * SIZE */
  51. #define I $25 /* inner loop counter */
  52. #define J $27 /* outer (column-pair) loop counter */
  53. #define X1 $3 /* walking pointer over X */
  54. #define Y1 $4 /* pack destination during setup, afterwards the write pointer into Y */
  55. #define A1 $5 /* walking pointer over the first column of the current pair */
  56. #define A2 $6 /* walking pointer over the second column of the current pair */
  /* Floating-point registers. alpha_r/alpha_i are used as they arrive and */
  /* are never loaded by this routine.                                      */
  57. #define alpha_r $f19 /* first component of the scale factor (presumably the real part) */
  58. #define alpha_i $f20 /* second component of the scale factor (presumably the imaginary part) */
  /* s0/s1: sums for element 0/1 of the first column's result;             */
  /* s2/s3: same for the second column (or extra partial sums at $L20).    */
  59. #define s0 $f0
  60. #define s1 $f1
  61. #define s2 $f10
  62. #define s3 $f11
  /* t0-t3: most recent products, accumulated into s0-s3 one step later;   */
  /* also reused for the alpha products when Y is updated.                 */
  63. #define t0 $f12
  64. #define t1 $f13
  65. #define t2 $f14
  66. #define t3 $f15
  /* x0-x3: elements loaded from X (two consecutive pairs).                */
  67. #define x0 $f16
  68. #define x1 $f17
  69. #define x2 $f18
  70. #define x3 $f21
  /* a0-a15: elements loaded from A. a0-a7 are also used as temporaries    */
  /* for the X packing copy and for the Y read-modify-write.               */
  71. #define a0 $f22
  72. #define a1 $f23
  73. #define a2 $f24
  74. #define a3 $f25
  75. #define a4 $f26
  76. #define a5 $f27
  77. #define a6 $f28
  78. #define a7 $f29
  /* a8-a15 live in $f2-$f9, the registers the prologue saves to the stack */
  /* and the epilogue restores.                                            */
  79. #define a8 $f2
  80. #define a9 $f3
  81. #define a10 $f4
  82. #define a11 $f5
  83. #define a12 $f6
  84. #define a13 $f7
  85. #define a14 $f8
  86. #define a15 $f9
  87. /* Accumulate operators for the four partial products, chosen by CONJ/XCONJ. */
  88. /* ADD1 (applied to x0*a0 style products) is ADD in every configuration. */
  89. #define ADD1 ADD
  90. /* ADD2 (x0*a1 style products) becomes SUB only when CONJ is defined. */
  91. #if defined(CONJ)
  92. #define ADD2 SUB
  93. #else
  94. #define ADD2 ADD
  95. #endif
  96. /* ADD3 (x1*a1 style products) is SUB when CONJ and XCONJ are both set or both unset. */
  97. #if (defined(CONJ) && defined(XCONJ)) || (!defined(CONJ) && !defined(XCONJ))
  98. #define ADD3 SUB
  99. #else
  100. #define ADD3 ADD
  101. #endif
  102. /* ADD4 (x1*a0 style products) becomes SUB only when XCONJ is defined. */
  103. #if defined(XCONJ)
  104. #define ADD4 SUB
  105. #else
  106. #define ADD4 ADD
  107. #endif
  /*
   * Kernel body. What the code below does, in order:
   *   1. Allocates a STACKSIZE-byte frame, loads LDA, X, INCX, Y, INCY and
   *      BUFFER from the caller's stack, and saves $f2-$f9.
   *   2. Returns immediately when M <= 0 or N <= 0.
   *   3. If INCX (after scaling) differs from 2 * SIZE, copies M element
   *      pairs of X into BUFFER and uses BUFFER as X from then on.
   *   4. For every pair of columns of A ($L11), accumulates the products of
   *      X with both columns into s0..s3, then updates two pairs of Y:
   *        y[0] += alpha_r * s_even - alpha_i * s_odd
   *        y[1] += alpha_r * s_odd  + alpha_i * s_even
   *   5. When N is odd, handles the last single column the same way ($L20).
   *
   * Data is handled as pairs of SIZE-sized elements (offsets 0 and 1);
   * presumably these are the real and imaginary parts of complex numbers.
   * PROLOGUE, PROFCODE, EPILOGUE, LD, ST, MUL, ADD, SUB, SIZE and
   * ZBASE_SHIFT are not defined in this file (presumably from common.h).
   *
   * Scheduling convention used by every compute loop: a MUL writes t0..t3
   * and the ADDn that folds that product into s0..s3 is issued one step
   * later, so each loop starts by folding in the products still pending
   * from the previous step. Pairing: x0*a0 -> ADD1, x0*a1 -> ADD2,
   * x1*a1 -> ADD3, x1*a0 -> ADD4 (x0/x1 and a0/a1 being element 0/1 of a pair).
   */
  108. PROLOGUE
  109. lda $sp, -STACKSIZE($sp) /* allocate the frame */
  /* Stack-passed arguments sit just above the new frame. */
  110. ldq LDA, 0 + STACKSIZE($sp)
  111. ldq X, 8 + STACKSIZE($sp)
  112. ldq INCX, 16 + STACKSIZE($sp)
  113. ldq Y, 24 + STACKSIZE($sp)
  114. ldq INCY, 32 + STACKSIZE($sp)
  115. ldq BUFFER, 40 + STACKSIZE($sp)
  /* Save $f2-$f9: they back the a8-a15 aliases and are restored at $L999. */
  116. stt $f2, 0($sp)
  117. stt $f3, 8($sp)
  118. stt $f4, 16($sp)
  119. stt $f5, 24($sp)
  120. stt $f6, 32($sp)
  121. stt $f7, 40($sp)
  122. stt $f8, 48($sp)
  123. stt $f9, 56($sp)
  124. PROFCODE
  /* Early exit when M <= 0 or N <= 0; strides are scaled by ZBASE_SHIFT meanwhile. */
  125. cmple M, 0, $0
  126. sll INCX, ZBASE_SHIFT, INCX
  127. cmple N, 0, $1
  128. sll INCY, ZBASE_SHIFT, INCY
  129. or $0, $1, $0
  130. bne $0, $L999
  /* If the scaled INCX equals 2 * SIZE, X is used in place; otherwise pack it. */
  131. cmpeq INCX, 2 * SIZE, $0
  132. mov X, X1 /* X1 = original X, the source of the packing copy */
  133. sll LDA, ZBASE_SHIFT,LDA
  134. bne $0, $L10
  135. sra M, 2, I /* I = M / 4: four pairs are copied per $L02 iteration */
  136. mov BUFFER, Y1 /* Y1 = copy destination */
  137. mov BUFFER, X /* from here on X refers to the packed copy */
  138. ble I, $L05
  139. .align 4
  /* Pack loop: copy four pairs from strided X1 to contiguous Y1. */
  140. $L02:
  141. ldl $31, (PREFETCHSIZE + 0) * SIZE(X1) /* load into $31, result unused; presumably a prefetch */
  142. lda I, -1(I)
  143. LD a0, 0 * SIZE(X1)
  144. LD a1, 1 * SIZE(X1)
  145. addq X1, INCX, X1
  146. LD a2, 0 * SIZE(X1)
  147. LD a3, 1 * SIZE(X1)
  148. addq X1, INCX, X1
  149. ST a0, 0 * SIZE(Y1)
  150. ST a1, 1 * SIZE(Y1)
  151. ST a2, 2 * SIZE(Y1)
  152. ST a3, 3 * SIZE(Y1)
  153. LD a4, 0 * SIZE(X1)
  154. LD a5, 1 * SIZE(X1)
  155. addq X1, INCX, X1
  156. LD a6, 0 * SIZE(X1)
  157. LD a7, 1 * SIZE(X1)
  158. addq X1, INCX, X1
  159. ST a4, 4 * SIZE(Y1)
  160. ST a5, 5 * SIZE(Y1)
  161. ST a6, 6 * SIZE(Y1)
  162. ST a7, 7 * SIZE(Y1)
  163. lda Y1, 8 * SIZE(Y1)
  164. bgt I, $L02
  165. .align 4
  /* Pack the remaining M mod 4 pairs, one per iteration. */
  166. $L05:
  167. and M, 3, I
  168. ble I, $L10
  169. .align 4
  170. $L06:
  171. LD a0, 0 * SIZE(X1)
  172. LD a1, 1 * SIZE(X1)
  173. addq X1, INCX, X1
  174. ST a0, 0 * SIZE(Y1)
  175. ST a1, 1 * SIZE(Y1)
  176. lda Y1, 2 * SIZE(Y1)
  177. lda I, -1(I)
  178. bgt I, $L06
  179. .align 4
  /* Outer-loop setup: Y1 becomes the write pointer into Y, the product   */
  /* temporaries start at zero, and J = N / 2 column pairs are processed. */
  180. $L10:
  181. mov Y, Y1
  182. fclr t0
  183. unop
  184. fclr t1
  185. sra N, 1, J
  186. fclr t2
  187. fclr t3
  188. ble J, $L20
  189. .align 4
  /* One iteration per pair of columns: A1 = A, A2 = A + LDA, A += 2 * LDA. */
  190. $L11:
  191. mov A, A1
  192. fclr s0
  193. addq A, LDA, A2
  194. fclr s1
  195. addq A2, LDA, A
  196. unop
  197. mov X, X1 /* restart at the beginning of X for every column pair */
  198. lds $f31, 3 * SIZE(Y) /* load into $f31, result unused; presumably a prefetch of Y */
  199. sra M, 2, I /* I = M / 4 unrolled steps */
  200. fclr s2
  201. fclr s3
  202. ble I, $L15
  /* Preload the first four pairs of both columns and the first X values. */
  203. LD a0, 0 * SIZE(A1)
  204. LD a1, 1 * SIZE(A1)
  205. LD a2, 0 * SIZE(A2)
  206. LD a3, 1 * SIZE(A2)
  207. LD a4, 2 * SIZE(A1)
  208. LD a5, 3 * SIZE(A1)
  209. LD a6, 2 * SIZE(A2)
  210. LD a7, 3 * SIZE(A2)
  211. LD a8, 4 * SIZE(A1)
  212. LD a9, 5 * SIZE(A1)
  213. LD a10, 4 * SIZE(A2)
  214. LD a11, 5 * SIZE(A2)
  215. LD a12, 6 * SIZE(A1)
  216. LD a13, 7 * SIZE(A1)
  217. LD a14, 6 * SIZE(A2)
  218. LD a15, 7 * SIZE(A2)
  219. LD x0, 0 * SIZE(X1)
  220. LD x1, 1 * SIZE(X1)
  221. LD x2, 2 * SIZE(X1)
  222. lda I, -1(I)
  223. ble I, $L13 /* only one unrolled step: go straight to the drain code */
  224. .align 4
  /* Main loop, two columns, four pairs per iteration. Each A/X register  */
  /* is reloaded for the next iteration right after its last use.         */
  225. $L12:
  226. ADD3 s0, t0, s0 /* fold in the products pending from the previous step */
  227. unop
  228. MUL x0, a0, t0
  229. LD x3, 3 * SIZE(X1)
  230. ADD4 s1, t1, s1
  231. ldl $31, (PREFETCHSIZE + 0) * SIZE(A1)
  232. MUL x0, a1, t1
  233. unop
  234. ADD3 s2, t2, s2
  235. unop
  236. MUL x0, a2, t2
  237. unop
  238. ADD4 s3, t3, s3
  239. unop
  240. MUL x0, a3, t3
  241. LD x0, 4 * SIZE(X1)
  242. ADD1 s0, t0, s0
  243. unop
  244. MUL x1, a1, t0
  245. LD a1, 9 * SIZE(A1)
  246. ADD2 s1, t1, s1
  247. unop
  248. MUL x1, a0, t1
  249. LD a0, 8 * SIZE(A1)
  250. ADD1 s2, t2, s2
  251. unop
  252. MUL x1, a3, t2
  253. LD a3, 9 * SIZE(A2)
  254. ADD2 s3, t3, s3
  255. unop
  256. MUL x1, a2, t3
  257. LD a2, 8 * SIZE(A2)
  258. ADD3 s0, t0, s0
  259. unop
  260. MUL x2, a4, t0
  261. LD x1, 5 * SIZE(X1)
  262. ADD4 s1, t1, s1
  263. MUL x2, a5, t1
  264. ADD3 s2, t2, s2
  265. MUL x2, a6, t2
  266. ADD4 s3, t3, s3
  267. unop
  268. MUL x2, a7, t3
  269. LD x2, 6 * SIZE(X1)
  270. ADD1 s0, t0, s0
  271. unop
  272. MUL x3, a5, t0
  273. LD a5, 11 * SIZE(A1)
  274. ADD2 s1, t1, s1
  275. unop
  276. MUL x3, a4, t1
  277. LD a4, 10 * SIZE(A1)
  278. ADD1 s2, t2, s2
  279. unop
  280. MUL x3, a7, t2
  281. LD a7, 11 * SIZE(A2)
  282. ADD2 s3, t3, s3
  283. unop
  284. MUL x3, a6, t3
  285. LD a6, 10 * SIZE(A2)
  286. ADD3 s0, t0, s0
  287. unop
  288. MUL x0, a8, t0
  289. LD x3, 7 * SIZE(X1)
  290. ADD4 s1, t1, s1
  291. ldl $31, (PREFETCHSIZE + 0) * SIZE(A2)
  292. MUL x0, a9, t1
  293. unop
  294. ADD3 s2, t2, s2
  295. lda I, -1(I)
  296. MUL x0, a10, t2
  297. unop
  298. ADD4 s3, t3, s3
  299. unop
  300. MUL x0, a11, t3
  301. LD x0, 8 * SIZE(X1)
  302. ADD1 s0, t0, s0
  303. unop
  304. MUL x1, a9, t0
  305. LD a9, 13 * SIZE(A1)
  306. ADD2 s1, t1, s1
  307. unop
  308. MUL x1, a8, t1
  309. LD a8, 12 * SIZE(A1)
  310. ADD1 s2, t2, s2
  311. lda A1, 8 * SIZE(A1) /* A1 advances here; later A1 offsets are relative to the new value */
  312. MUL x1, a11, t2
  313. LD a11, 13 * SIZE(A2)
  314. ADD2 s3, t3, s3
  315. unop
  316. MUL x1, a10, t3
  317. LD a10, 12 * SIZE(A2)
  318. ADD3 s0, t0, s0
  319. unop
  320. MUL x2, a12, t0
  321. LD x1, 9 * SIZE(X1)
  322. ADD4 s1, t1, s1
  323. ldl $31, (PREFETCHSIZE + 0) * SIZE(X1)
  324. MUL x2, a13, t1
  325. lda A2, 8 * SIZE(A2) /* A2 advances here; later A2 offsets are relative to the new value */
  326. ADD3 s2, t2, s2
  327. unop
  328. MUL x2, a14, t2
  329. unop
  330. ADD4 s3, t3, s3
  331. unop
  332. MUL x2, a15, t3
  333. LD x2, 10 * SIZE(X1)
  334. ADD1 s0, t0, s0
  335. unop
  336. MUL x3, a13, t0
  337. LD a13, 7 * SIZE(A1)
  338. ADD2 s1, t1, s1
  339. lda X1, 8 * SIZE(X1) /* X1 advances after x0-x2 for the next step were loaded */
  340. MUL x3, a12, t1
  341. LD a12, 6 * SIZE(A1)
  342. ADD1 s2, t2, s2
  343. unop
  344. MUL x3, a15, t2
  345. LD a15, 7 * SIZE(A2)
  346. ADD2 s3, t3, s3
  347. MUL x3, a14, t3
  348. LD a14, 6 * SIZE(A2)
  349. bgt I, $L12
  350. .align 4
  /* Drain: the last unrolled step, using the A values already in         */
  /* registers; only the remaining X values of this step are loaded.      */
  351. $L13:
  352. ADD3 s0, t0, s0
  353. unop
  354. MUL x0, a0, t0
  355. LD x3, 3 * SIZE(X1)
  356. ADD4 s1, t1, s1
  357. MUL x0, a1, t1
  358. ADD3 s2, t2, s2
  359. MUL x0, a2, t2
  360. ADD4 s3, t3, s3
  361. unop
  362. MUL x0, a3, t3
  363. LD x0, 4 * SIZE(X1)
  364. ADD1 s0, t0, s0
  365. MUL x1, a1, t0
  366. ADD2 s1, t1, s1
  367. MUL x1, a0, t1
  368. ADD1 s2, t2, s2
  369. unop
  370. MUL x1, a3, t2
  371. unop
  372. ADD2 s3, t3, s3
  373. lda A1, 8 * SIZE(A1) /* leave A1 past this step for the remainder code at $L15 */
  374. MUL x1, a2, t3
  375. LD x1, 5 * SIZE(X1)
  376. ADD3 s0, t0, s0
  377. MUL x2, a4, t0
  378. ADD4 s1, t1, s1
  379. MUL x2, a5, t1
  380. ADD3 s2, t2, s2
  381. unop
  382. MUL x2, a6, t2
  383. unop
  384. ADD4 s3, t3, s3
  385. lda A2, 8 * SIZE(A2) /* likewise for A2 */
  386. MUL x2, a7, t3
  387. LD x2, 6 * SIZE(X1)
  388. ADD1 s0, t0, s0
  389. MUL x3, a5, t0
  390. ADD2 s1, t1, s1
  391. MUL x3, a4, t1
  392. ADD1 s2, t2, s2
  393. unop
  394. MUL x3, a7, t2
  395. lda X1, 8 * SIZE(X1) /* X1 now points past this step */
  396. ADD2 s3, t3, s3
  397. unop
  398. MUL x3, a6, t3
  399. LD x3, -1 * SIZE(X1) /* offset 7 of the step, addressed from the advanced X1 */
  400. ADD3 s0, t0, s0
  401. MUL x0, a8, t0
  402. ADD4 s1, t1, s1
  403. MUL x0, a9, t1
  404. ADD3 s2, t2, s2
  405. MUL x0, a10, t2
  406. ADD4 s3, t3, s3
  407. MUL x0, a11, t3
  408. ADD1 s0, t0, s0
  409. MUL x1, a9, t0
  410. ADD2 s1, t1, s1
  411. MUL x1, a8, t1
  412. ADD1 s2, t2, s2
  413. MUL x1, a11, t2
  414. ADD2 s3, t3, s3
  415. MUL x1, a10, t3
  416. ADD3 s0, t0, s0
  417. MUL x2, a12, t0
  418. ADD4 s1, t1, s1
  419. MUL x2, a13, t1
  420. ADD3 s2, t2, s2
  421. MUL x2, a14, t2
  422. ADD4 s3, t3, s3
  423. MUL x2, a15, t3
  424. ADD1 s0, t0, s0
  425. MUL x3, a13, t0
  426. ADD2 s1, t1, s1
  427. MUL x3, a12, t1
  428. ADD1 s2, t2, s2
  429. MUL x3, a15, t2
  430. ADD2 s3, t3, s3
  431. MUL x3, a14, t3 /* the last four products stay pending in t0-t3 */
  432. .align 4
  /* Remainder for two columns: the M mod 4 leftover pairs, one per step. */
  433. $L15:
  434. and M, 3, I
  435. ble I, $L18
  436. LD a0, 0 * SIZE(A1)
  437. LD a1, 1 * SIZE(A1)
  438. LD a2, 0 * SIZE(A2)
  439. LD a3, 1 * SIZE(A2)
  440. LD x0, 0 * SIZE(X1)
  441. lda I, -1(I)
  442. ble I, $L17 /* a single leftover pair: skip the loop, use the drain code */
  443. .align 4
  444. $L16:
  445. ADD3 s0, t0, s0
  446. lda I, -1(I)
  447. MUL x0, a0, t0
  448. LD x1, 1 * SIZE(X1)
  449. ADD4 s1, t1, s1
  450. MUL x0, a1, t1
  451. ADD3 s2, t2, s2
  452. MUL x0, a2, t2
  453. ADD4 s3, t3, s3
  454. unop
  455. MUL x0, a3, t3
  456. LD x0, 2 * SIZE(X1)
  457. ADD1 s0, t0, s0
  458. lda A2, 2 * SIZE(A2) /* A2 advances before its next pair is loaded at offsets 0/1 */
  459. MUL x1, a1, t0
  460. LD a1, 3 * SIZE(A1)
  461. ADD2 s1, t1, s1
  462. lda X1, 2 * SIZE(X1)
  463. MUL x1, a0, t1
  464. LD a0, 2 * SIZE(A1)
  465. ADD1 s2, t2, s2
  466. lda A1, 2 * SIZE(A1) /* A1 advances after its next pair was loaded at offsets 2/3 */
  467. MUL x1, a3, t2
  468. LD a3, 1 * SIZE(A2)
  469. ADD2 s3, t3, s3
  470. MUL x1, a2, t3
  471. LD a2, 0 * SIZE(A2)
  472. bgt I, $L16
  473. .align 4
  /* Drain of the remainder loop: last leftover pair, no further A loads. */
  474. $L17:
  475. ADD3 s0, t0, s0
  476. unop
  477. MUL x0, a0, t0
  478. LD x1, 1 * SIZE(X1)
  479. ADD4 s1, t1, s1
  480. unop
  481. MUL x0, a1, t1
  482. unop
  483. ADD3 s2, t2, s2
  484. MUL x0, a2, t2
  485. ADD4 s3, t3, s3
  486. MUL x0, a3, t3
  487. ADD1 s0, t0, s0
  488. MUL x1, a1, t0
  489. ADD2 s1, t1, s1
  490. MUL x1, a0, t1
  491. ADD1 s2, t2, s2
  492. MUL x1, a3, t2
  493. ADD2 s3, t3, s3
  494. MUL x1, a2, t3
  495. .align 4
  /* Update two pairs of Y. Y is the read pointer, Y1 the write pointer;  */
  /* both advance by INCY per pair. a0-a3 are reused as Y temporaries.    */
  496. $L18:
  497. LD a0, 0 * SIZE(Y)
  498. unop
  499. LD a1, 1 * SIZE(Y)
  500. addq Y, INCY, Y
  501. LD a2, 0 * SIZE(Y)
  502. unop
  503. LD a3, 1 * SIZE(Y)
  504. addq Y, INCY, Y
  505. ADD3 s0, t0, s0 /* fold in the last pending products */
  506. ADD4 s1, t1, s1
  507. ADD3 s2, t2, s2
  508. ADD4 s3, t3, s3
  509. MUL alpha_r, s0, t0
  510. MUL alpha_r, s1, t1
  511. MUL alpha_r, s2, t2
  512. MUL alpha_r, s3, t3
  513. ADD a0, t0, a0 /* y[0] += alpha_r * s0 */
  514. MUL alpha_i, s1, t0
  515. ADD a1, t1, a1 /* y[1] += alpha_r * s1 */
  516. MUL alpha_i, s0, t1
  517. ADD a2, t2, a2
  518. MUL alpha_i, s3, t2
  519. ADD a3, t3, a3
  520. MUL alpha_i, s2, t3
  521. SUB a0, t0, a0 /* y[0] -= alpha_i * s1 */
  522. ADD a1, t1, a1 /* y[1] += alpha_i * s0 */
  523. SUB a2, t2, a2
  524. ADD a3, t3, a3
  525. ST a0, 0 * SIZE(Y1)
  526. fclr t0 /* t0-t3 are cleared again for the next column pair */
  527. ST a1, 1 * SIZE(Y1)
  528. addq Y1, INCY, Y1
  529. ST a2, 0 * SIZE(Y1)
  530. fclr t1
  531. ST a3, 1 * SIZE(Y1)
  532. addq Y1, INCY, Y1
  533. fclr t2
  534. lda J, -1(J)
  535. fclr t3
  536. bgt J, $L11
  537. .align 4
  /* Last single column, only when the low bit of N is set. Only t0/t1    */
  /* carry products here; the sums are spread over s0-s3 and merged at $L28. */
  538. $L20:
  539. blbc N, $L999 /* low bit of N clear: no column left */
  540. mov A, A1
  541. fclr s0
  542. fclr s1
  543. mov X, X1
  544. sra M, 2, I
  545. fclr s2
  546. fclr s3
  547. ble I, $L25
  548. LD a0, 0 * SIZE(A1)
  549. LD a1, 1 * SIZE(A1)
  550. LD a4, 2 * SIZE(A1)
  551. LD a5, 3 * SIZE(A1)
  552. LD a8, 4 * SIZE(A1)
  553. LD a9, 5 * SIZE(A1)
  554. LD a12, 6 * SIZE(A1)
  555. LD a13, 7 * SIZE(A1)
  556. LD x0, 0 * SIZE(X1)
  557. LD x1, 1 * SIZE(X1)
  558. LD x2, 2 * SIZE(X1)
  559. lda I, -1(I)
  560. ble I, $L23
  561. .align 4
  /* Main loop, one column, four pairs per iteration. */
  562. $L22:
  563. ADD3 s0, t0, s0
  564. ldl $31, (PREFETCHSIZE + 0) * SIZE(A1)
  565. MUL x0, a0, t0
  566. LD x3, 3 * SIZE(X1)
  567. ADD4 s1, t1, s1
  568. unop
  569. MUL x0, a1, t1
  570. LD x0, 4 * SIZE(X1)
  571. ADD1 s2, t0, s2
  572. lda I, -1(I)
  573. MUL x1, a1, t0
  574. LD a1, 9 * SIZE(A1)
  575. ADD2 s3, t1, s3
  576. unop
  577. MUL x1, a0, t1
  578. LD a0, 8 * SIZE(A1)
  579. ADD3 s0, t0, s0
  580. unop
  581. MUL x2, a4, t0
  582. LD x1, 5 * SIZE(X1)
  583. ADD4 s1, t1, s1
  584. unop
  585. MUL x2, a5, t1
  586. LD x2, 6 * SIZE(X1)
  587. ADD1 s2, t0, s2
  588. unop
  589. MUL x3, a5, t0
  590. LD a5, 11 * SIZE(A1)
  591. ADD2 s3, t1, s3
  592. unop
  593. MUL x3, a4, t1
  594. LD a4, 10 * SIZE(A1)
  595. ADD3 s0, t0, s0
  596. unop
  597. MUL x0, a8, t0
  598. LD x3, 7 * SIZE(X1)
  599. ADD4 s1, t1, s1
  600. unop
  601. MUL x0, a9, t1
  602. LD x0, 8 * SIZE(X1)
  603. ADD1 s2, t0, s2
  604. unop
  605. MUL x1, a9, t0
  606. LD a9, 13 * SIZE(A1)
  607. ADD2 s3, t1, s3
  608. unop
  609. MUL x1, a8, t1
  610. LD a8, 12 * SIZE(A1)
  611. ADD3 s0, t0, s0
  612. unop
  613. MUL x2, a12, t0
  614. LD x1, 9 * SIZE(X1)
  615. ADD4 s1, t1, s1
  616. lda A1, 8 * SIZE(A1) /* A1 advances here; later A1 offsets are relative to the new value */
  617. MUL x2, a13, t1
  618. LD x2, 10 * SIZE(X1)
  619. ADD1 s2, t0, s2
  620. lda X1, 8 * SIZE(X1)
  621. MUL x3, a13, t0
  622. LD a13, 7 * SIZE(A1)
  623. ADD2 s3, t1, s3
  624. MUL x3, a12, t1
  625. LD a12, 6 * SIZE(A1)
  626. bgt I, $L22
  627. .align 4
  /* Drain of the one-column main loop: last unrolled step, no A loads. */
  628. $L23:
  629. ADD3 s0, t0, s0
  630. unop
  631. MUL x0, a0, t0
  632. LD x3, 3 * SIZE(X1)
  633. ADD4 s1, t1, s1
  634. unop
  635. MUL x0, a1, t1
  636. LD x0, 4 * SIZE(X1)
  637. ADD1 s2, t0, s2
  638. unop
  639. MUL x1, a1, t0
  640. lda A1, 8 * SIZE(A1)
  641. ADD2 s3, t1, s3
  642. unop
  643. MUL x1, a0, t1
  644. LD x1, 5 * SIZE(X1)
  645. ADD3 s0, t0, s0
  646. unop
  647. MUL x2, a4, t0
  648. unop
  649. ADD4 s1, t1, s1
  650. unop
  651. MUL x2, a5, t1
  652. LD x2, 6 * SIZE(X1)
  653. ADD1 s2, t0, s2
  654. unop
  655. MUL x3, a5, t0
  656. lda X1, 8 * SIZE(X1)
  657. ADD2 s3, t1, s3
  658. unop
  659. MUL x3, a4, t1
  660. LD x3, -1 * SIZE(X1) /* offset 7 of the step, addressed from the advanced X1 */
  661. ADD3 s0, t0, s0
  662. MUL x0, a8, t0
  663. ADD4 s1, t1, s1
  664. MUL x0, a9, t1
  665. ADD1 s2, t0, s2
  666. MUL x1, a9, t0
  667. ADD2 s3, t1, s3
  668. MUL x1, a8, t1
  669. ADD3 s0, t0, s0
  670. MUL x2, a12, t0
  671. ADD4 s1, t1, s1
  672. MUL x2, a13, t1
  673. ADD1 s2, t0, s2
  674. MUL x3, a13, t0
  675. ADD2 s3, t1, s3
  676. MUL x3, a12, t1
  677. .align 4
  /* Remainder for one column: the M mod 4 leftover pairs; sums go to s0/s1 only. */
  678. $L25:
  679. and M, 3, I
  680. ble I, $L28
  681. LD a0, 0 * SIZE(A1)
  682. LD a1, 1 * SIZE(A1)
  683. LD x0, 0 * SIZE(X1)
  684. lda I, -1(I)
  685. ble I, $L27
  686. .align 4
  687. $L26:
  688. ADD3 s0, t0, s0
  689. lda A1, 2 * SIZE(A1) /* A1 advances first; a0/a1 are reloaded below at offsets 0/1 */
  690. MUL x0, a0, t0
  691. LD x1, 1 * SIZE(X1)
  692. ADD4 s1, t1, s1
  693. lda I, -1(I)
  694. MUL x0, a1, t1
  695. LD x0, 2 * SIZE(X1)
  696. ADD1 s0, t0, s0
  697. lda X1, 2 * SIZE(X1)
  698. MUL x1, a1, t0
  699. LD a1, 1 * SIZE(A1)
  700. ADD2 s1, t1, s1
  701. MUL x1, a0, t1
  702. LD a0, 0 * SIZE(A1)
  703. bgt I, $L26
  704. .align 4
  /* Drain of the one-column remainder loop. */
  705. $L27:
  706. ADD3 s0, t0, s0
  707. unop
  708. MUL x0, a0, t0
  709. LD x1, 1 * SIZE(X1)
  710. ADD4 s1, t1, s1
  711. unop
  712. MUL x0, a1, t1
  713. unop
  714. ADD1 s0, t0, s0
  715. MUL x1, a1, t0
  716. ADD2 s1, t1, s1
  717. MUL x1, a0, t1
  718. .align 4
  /* Update the last pair of Y from the single-column sums. */
  719. $L28:
  720. LD a0, 0 * SIZE(Y)
  721. LD a1, 1 * SIZE(Y)
  722. ADD3 s0, t0, s0 /* fold in the last pending products */
  723. ADD4 s1, t1, s1
  724. ADD3 s2, t2, s2 /* t2/t3 are not written on this path; they were cleared at $L10 or $L18 */
  725. ADD4 s3, t3, s3
  726. ADD s0, s2, s0 /* merge the split partial sums */
  727. ADD s1, s3, s1
  728. MUL alpha_r, s0, t0
  729. MUL alpha_r, s1, t1
  730. ADD a0, t0, a0
  731. MUL alpha_i, s1, t0
  732. ADD a1, t1, a1
  733. MUL alpha_i, s0, t1
  734. SUB a0, t0, a0 /* y[0] += alpha_r * s0 - alpha_i * s1 */
  735. ADD a1, t1, a1 /* y[1] += alpha_r * s1 + alpha_i * s0 */
  736. ST a0, 0 * SIZE(Y1)
  737. ST a1, 1 * SIZE(Y1)
  738. .align 4
  /* Common exit: restore $f2-$f9, release the frame and return. */
  739. $L999:
  740. ldt $f2, 0($sp)
  741. ldt $f3, 8($sp)
  742. ldt $f4, 16($sp)
  743. ldt $f5, 24($sp)
  744. ldt $f6, 32($sp)
  745. ldt $f7, 40($sp)
  746. ldt $f8, 48($sp)
  747. ldt $f9, 56($sp)
  748. lda $sp, STACKSIZE($sp)
  749. ret
  750. EPILOGUE