You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

zger.S 29 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /*
   * Build configuration, argument-register mapping, working-register
   * aliases and prefetch distances for the kernel that follows.
   */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Tuning parameters: cparam.h unless DOUBLE is defined, zparam.h   */
  /* otherwise.  Skipped altogether when NEEDPARAM is defined.        */
  40. #ifndef NEEDPARAM
  41. #ifndef DOUBLE
  42. #include "cparam.h"
  43. #else
  44. #include "zparam.h"
  45. #endif
  46. #endif
  /* Argument registers on Linux / FreeBSD.  Each name is the GPR the */
  /* kernel uses for that argument; the prologue loads some of them   */
  /* from the caller's stack frame.                                   */
  47. #if defined(linux) || defined(__FreeBSD__)
  48. #ifndef __64BIT__
  49. #define M r3
  50. #define N r4
  51. #define X r6
  52. #define INCX r7
  53. #define Y r8
  54. #define INCY r9
  55. #define A r10
  56. #define LDA r5
  57. #else
  58. #define M r3
  59. #define N r4
  60. #define X r8
  61. #define INCX r9
  62. #define Y r10
  63. #define INCY r5
  64. #define A r6
  65. #define LDA r7
  66. #endif
  67. #endif
  /* Argument registers on AIX / Darwin.  The 32-bit DOUBLE build has */
  /* its own mapping; every other combination shares the second one.  */
  68. #if defined(_AIX) || defined(__APPLE__)
  69. #if !defined(__64BIT__) && defined(DOUBLE)
  70. #define M r3
  71. #define N r4
  72. #define X r10
  73. #define INCX r5
  74. #define Y r6
  75. #define INCY r7
  76. #define A r8
  77. #define LDA r9
  78. #else
  79. #define M r3
  80. #define N r4
  81. #define X r8
  82. #define INCX r9
  83. #define Y r10
  84. #define INCY r5
  85. #define A r6
  86. #define LDA r7
  87. #endif
  88. #endif
  /* Integer working registers.  J counts column pairs in the kernel; */
  /* I is not referenced in the visible part of the kernel.           */
  89. #define I r11
  90. #define J r12
  /* Column cursors into A.  Only AO1 and AO2 are referenced in the   */
  /* visible part of the kernel.                                      */
  91. #define AO1 r14
  92. #define AO2 r15
  93. #define AO3 r16
  94. #define AO4 r17
  95. #define AO5 r18
  96. #define AO6 r19
  97. #define AO7 r20
  98. #define AO8 r21
  /* X1: running cursor over x.  PREA / PREC: prefetch offsets set in */
  /* the prologue.  XX: start of the x data the kernel reads (either  */
  /* X itself or BUFFER).  BUFFER: scratch area for a packed copy of x. */
  99. #define X1 r22
  100. #define PREA r23
  101. #define PREC r24
  102. #define XX r25
  103. #define BUFFER r26
  /* y01..y08 hold values loaded through X1, i.e. elements of x       */
  /* despite the "y" in the name: real parts in the odd-numbered      */
  /* names, imaginary parts in the even-numbered ones.                */
  104. #define y01 f0
  105. #define y02 f1
  106. #define y03 f2
  107. #define y04 f3
  108. #define y05 f4
  109. #define y06 f5
  110. #define y07 f6
  111. #define y08 f7
  /* alpha1 / alpha2: alpha multiplied by the current element of y    */
  /* and by the following one (real and imaginary parts).             */
  112. #define alpha1_r f8
  113. #define alpha1_i f9
  114. #define alpha2_r f10
  115. #define alpha2_i f11
  /* a1..a16: values loaded from and stored back to A.  a1..a8 are    */
  /* also used as temporaries while packing x and while scaling y.    */
  116. #define a1 f12
  117. #define a2 f13
  118. #define a3 f14
  119. #define a4 f15
  120. #define a5 f16
  121. #define a6 f17
  122. #define a7 f18
  123. #define a8 f19
  124. #define a9 f20
  125. #define a10 f21
  126. #define a11 f22
  127. #define a12 f23
  128. #define a13 f24
  129. #define a14 f25
  130. #define a15 f26
  131. #define a16 f27
  /* The scalar alpha, copied out of f1 / f2 by the prologue.         */
  132. #define alpha_r f30
  133. #define alpha_i f31
  /* FMA1 / FMA2 are the fused operations used to finish the real and */
  /* imaginary parts of alpha * y[j].  Defining CONJ swaps the two    */
  /* (presumably to form alpha * conj(y[j]) instead -- confirm        */
  /* against the FMADD / FNMSUB macro definitions).                   */
  134. #ifndef CONJ
  135. #define FMA1 FNMSUB
  136. #define FMA2 FMADD
  137. #else
  138. #define FMA1 FMADD
  139. #define FMA2 FNMSUB
  140. #endif
  /* Per-CPU prefetch distances; the prologue multiplies them by SIZE. */
  141. #if defined(PPC440) || defined(PPC440FP2)
  142. #define PREFETCHSIZE_A 24
  143. #define PREFETCHSIZE_C 16
  144. #endif
  145. #ifdef PPC970
  146. #define PREFETCHSIZE_A 16
  147. #define PREFETCHSIZE_C 16
  148. #endif
  149. #ifdef POWER4
  150. #define PREFETCHSIZE_A 16
  151. #define PREFETCHSIZE_C 16
  152. #endif
  153. #ifdef POWER5
  154. #define PREFETCHSIZE_A 16
  155. #define PREFETCHSIZE_C 16
  156. #endif
  /* Fallback for every other target: the prologue uses both names    */
  /* unconditionally, so they must always be defined.  The distance   */
  /* only affects prefetch hints, never the computed result.          */
  #ifndef PREFETCHSIZE_A
  #define PREFETCHSIZE_A 16
  #endif
  #ifndef PREFETCHSIZE_C
  #define PREFETCHSIZE_C 16
  #endif
  /*
   * Kernel entry: build the stack frame, spill the registers the
   * kernel uses as working storage, fetch the stack-passed arguments,
   * scale the strides and leave early when there is nothing to do.
   * The matching #endif for this NEEDPARAM guard is outside the
   * visible portion of the file.
   */
  157. #ifndef NEEDPARAM
  /* Frame size in bytes.  The frame holds f14-f31 (18 x 8 = 144      */
  /* bytes) followed by r14-r27 (14 words or 14 doublewords).         */
  158. #ifndef __64BIT__
  159. #define STACKSIZE 224
  160. #else
  161. #define STACKSIZE 280
  162. #endif
  /* PROLOGUE and PROFCODE are macros defined elsewhere (presumably   */
  /* the entry label and an optional profiling hook -- confirm).      */
  163. PROLOGUE
  164. PROFCODE
  /* Allocate the frame and spill f14-f31 at offsets 0..136.          */
  165. addi SP, SP, -STACKSIZE
  166. stfd f14, 0(SP)
  167. stfd f15, 8(SP)
  168. stfd f16, 16(SP)
  169. stfd f17, 24(SP)
  170. stfd f18, 32(SP)
  171. stfd f19, 40(SP)
  172. stfd f20, 48(SP)
  173. stfd f21, 56(SP)
  174. stfd f22, 64(SP)
  175. stfd f23, 72(SP)
  176. stfd f24, 80(SP)
  177. stfd f25, 88(SP)
  178. stfd f26, 96(SP)
  179. stfd f27, 104(SP)
  180. stfd f28, 112(SP)
  181. stfd f29, 120(SP)
  182. stfd f30, 128(SP)
  183. stfd f31, 136(SP)
  /* Spill r14-r27 from offset 144 upwards, as doublewords on 64-bit  */
  /* builds and as words on 32-bit builds.                            */
  184. #ifdef __64BIT__
  185. std r14, 144(SP)
  186. std r15, 152(SP)
  187. std r16, 160(SP)
  188. std r17, 168(SP)
  189. std r18, 176(SP)
  190. std r19, 184(SP)
  191. std r20, 192(SP)
  192. std r21, 200(SP)
  193. std r22, 208(SP)
  194. std r23, 216(SP)
  195. std r24, 224(SP)
  196. std r25, 232(SP)
  197. std r26, 240(SP)
  198. std r27, 248(SP)
  199. #else
  200. stw r14, 144(SP)
  201. stw r15, 148(SP)
  202. stw r16, 152(SP)
  203. stw r17, 156(SP)
  204. stw r18, 160(SP)
  205. stw r19, 164(SP)
  206. stw r20, 168(SP)
  207. stw r21, 172(SP)
  208. stw r22, 176(SP)
  209. stw r23, 180(SP)
  210. stw r24, 184(SP)
  211. stw r25, 188(SP)
  212. stw r26, 192(SP)
  213. stw r27, 196(SP)
  214. #endif
  /* Fetch the arguments that arrive in memory.  They are read at     */
  /* FRAMESLOT(n) + STACKSIZE from the new SP, i.e. above this        */
  /* function's own frame; FRAMESLOT is defined elsewhere.  Which     */
  /* arguments these are depends on OS, word size and precision.      */
  215. #if defined(linux) || defined(__FreeBSD__)
  216. #ifndef __64BIT__
  217. lwz LDA, FRAMESLOT(0) + STACKSIZE(SP)
  218. lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP)
  219. #else
  220. ld INCY, FRAMESLOT(0) + STACKSIZE(SP)
  221. ld A, FRAMESLOT(1) + STACKSIZE(SP)
  222. ld LDA, FRAMESLOT(2) + STACKSIZE(SP)
  223. ld BUFFER, FRAMESLOT(3) + STACKSIZE(SP)
  224. #endif
  225. #endif
  226. #if defined(_AIX) || defined(__APPLE__)
  227. #ifndef __64BIT__
  228. #ifdef DOUBLE
  229. lwz INCX, FRAMESLOT(0) + STACKSIZE(SP)
  230. lwz Y, FRAMESLOT(1) + STACKSIZE(SP)
  231. lwz INCY, FRAMESLOT(2) + STACKSIZE(SP)
  232. lwz A, FRAMESLOT(3) + STACKSIZE(SP)
  233. lwz LDA, FRAMESLOT(4) + STACKSIZE(SP)
  234. lwz BUFFER, FRAMESLOT(5) + STACKSIZE(SP)
  235. #else
  236. lwz INCY, FRAMESLOT(0) + STACKSIZE(SP)
  237. lwz A, FRAMESLOT(1) + STACKSIZE(SP)
  238. lwz LDA, FRAMESLOT(2) + STACKSIZE(SP)
  239. lwz BUFFER, FRAMESLOT(3) + STACKSIZE(SP)
  240. #endif
  241. #else
  242. ld INCY, FRAMESLOT(0) + STACKSIZE(SP)
  243. ld A, FRAMESLOT(1) + STACKSIZE(SP)
  244. ld LDA, FRAMESLOT(2) + STACKSIZE(SP)
  245. ld BUFFER, FRAMESLOT(3) + STACKSIZE(SP)
  246. #endif
  247. #endif
  /* alpha arrives in f1 (real) and f2 (imaginary).  Copy it to       */
  /* f30 / f31 first: f1 and f2 are reused below as y02 and y03.      */
  248. fmr alpha_r, f1
  249. fmr alpha_i, f2
  /* Shift LDA, INCX and INCY left by ZBASE_SHIFT (defined elsewhere) */
  /* so they can be added directly to the A, X and Y pointers.        */
  250. slwi LDA, LDA, ZBASE_SHIFT
  251. slwi INCX, INCX, ZBASE_SHIFT
  252. slwi INCY, INCY, ZBASE_SHIFT
  /* Prefetch offsets used by the DCBT hints in the main loops.       */
  253. li PREA, PREFETCHSIZE_A * SIZE
  254. li PREC, PREFETCHSIZE_C * SIZE
  /* Nothing to do when M <= 0 or N <= 0.  LL(999) lies outside the   */
  /* visible portion (presumably the common exit path -- confirm).    */
  255. cmpwi cr0, M, 0
  256. ble- LL(999)
  257. cmpwi cr0, N, 0
  258. ble- LL(999)
  /*
   * Choose the x data the kernel will read.  If the scaled INCX
   * equals 2 * SIZE (the distance between consecutive elements as
   * stored by this code: a real part and an imaginary part), x is
   * already contiguous and is used in place.  Otherwise the M
   * elements are gathered into BUFFER and XX points there.
   */
  259. mr XX, X
  260. cmpi cr0, 0, INCX, 2 * SIZE
  261. beq LL(10)
  262. mr XX, BUFFER
  263. mr X1, BUFFER
  /* CTR = M / 4: the main copy loop moves four elements per pass.    */
  264. srawi. r0, M, 2
  265. mtspr CTR, r0
  266. ble LL(05)
  267. .align 4
  268. LL(01):
  /* Gather four strided (real, imaginary) pairs ...                  */
  269. LFD a1, 0 * SIZE(X)
  270. LFD a2, 1 * SIZE(X)
  271. add X, X, INCX
  272. LFD a3, 0 * SIZE(X)
  273. LFD a4, 1 * SIZE(X)
  274. add X, X, INCX
  275. LFD a5, 0 * SIZE(X)
  276. LFD a6, 1 * SIZE(X)
  277. add X, X, INCX
  278. LFD a7, 0 * SIZE(X)
  279. LFD a8, 1 * SIZE(X)
  280. add X, X, INCX
  /* ... and store them back to back in the buffer.                   */
  281. STFD a1, 0 * SIZE(X1)
  282. STFD a2, 1 * SIZE(X1)
  283. STFD a3, 2 * SIZE(X1)
  284. STFD a4, 3 * SIZE(X1)
  285. STFD a5, 4 * SIZE(X1)
  286. STFD a6, 5 * SIZE(X1)
  287. STFD a7, 6 * SIZE(X1)
  288. STFD a8, 7 * SIZE(X1)
  289. addi X1, X1, 8 * SIZE
  290. bdnz+ LL(01)
  291. .align 4
  292. LL(05):
  /* Leftover elements after the four-at-a-time loop: M mod 4.  The   */
  /* mask has to match the unroll factor above; a wider mask would    */
  /* copy elements twice and read past the end of x.                  */
  293. andi. r0, M, 3
  294. mtspr CTR, r0
  295. ble LL(10)
  296. .align 4
  297. LL(06):
  /* Copy one element per pass.                                       */
  298. LFD a1, 0 * SIZE(X)
  299. LFD a2, 1 * SIZE(X)
  300. STFD a1, 0 * SIZE(X1)
  301. STFD a2, 1 * SIZE(X1)
  302. add X, X, INCX
  303. addi X1, X1, 2 * SIZE
  304. bdnz+ LL(06)
  /*
   * Column-pair loop head.  J = N / 2 pairs of columns are processed;
   * with no pair to process control goes to LL(20), the odd-column
   * path.
   */
  305. .align 4
  306. LL(10):
  307. srawi. J, N, 1
  308. ble LL(20)
  309. .align 4
  310. LL(11):
  /* Load two consecutive elements of y, stepping Y by INCY each time. */
  311. LFD alpha1_r, 0 * SIZE(Y)
  312. LFD alpha1_i, 1 * SIZE(Y)
  313. add Y, Y, INCY
  314. LFD alpha2_r, 0 * SIZE(Y)
  315. LFD alpha2_i, 1 * SIZE(Y)
  316. add Y, Y, INCY
  /* Replace each loaded y element by its product with alpha.  The    */
  /* FMULs form alpha_r*y_r and alpha_i*y_r; FMA1 / FMA2 then fold in */
  /* the alpha_i*y_i and alpha_r*y_i terms, with the sign chosen by   */
  /* the CONJ setting.  a1..a4 are only temporaries here.             */
  317. FMUL a1, alpha_r, alpha1_r
  318. FMUL a2, alpha_i, alpha1_r
  319. FMUL a3, alpha_r, alpha2_r
  320. FMUL a4, alpha_i, alpha2_r
  321. FMA1 alpha1_r, alpha_i, alpha1_i, a1
  322. FMA2 alpha1_i, alpha_r, alpha1_i, a2
  323. FMA1 alpha2_r, alpha_i, alpha2_i, a3
  324. FMA2 alpha2_i, alpha_r, alpha2_i, a4
  /* AO1 / AO2 address the two columns of this pair; A moves on by    */
  /* two columns for the next pair.                                   */
  325. mr AO1, A
  326. add AO2, A, LDA
  327. add A, AO2, LDA
  /* Restart the x cursor for every column pair.                      */
  328. mr X1, XX
  /* CTR = M / 8 blocks of eight rows.  With no full block, go to the */
  /* row-remainder code at LL(15).  (mtspr leaves CR0 untouched, so   */
  /* the ble still tests the srawi. result.)                          */
  329. srawi. r0, M, 3
  330. mtspr CTR, r0
  331. ble LL(15)
  /* Preload the first four rows of both columns and of x.            */
  332. LFD a1, 0 * SIZE(AO1)
  333. LFD a2, 1 * SIZE(AO1)
  334. LFD a3, 2 * SIZE(AO1)
  335. LFD a4, 3 * SIZE(AO1)
  336. LFD a5, 4 * SIZE(AO1)
  337. LFD a6, 5 * SIZE(AO1)
  338. LFD a7, 6 * SIZE(AO1)
  339. LFD a8, 7 * SIZE(AO1)
  340. LFD y01, 0 * SIZE(X1)
  341. LFD y02, 1 * SIZE(X1)
  342. LFD y03, 2 * SIZE(X1)
  343. LFD y04, 3 * SIZE(X1)
  344. LFD y05, 4 * SIZE(X1)
  345. LFD y06, 5 * SIZE(X1)
  346. LFD y07, 6 * SIZE(X1)
  347. LFD y08, 7 * SIZE(X1)
  348. LFD a9, 0 * SIZE(AO2)
  349. LFD a10, 1 * SIZE(AO2)
  350. LFD a11, 2 * SIZE(AO2)
  351. LFD a12, 3 * SIZE(AO2)
  352. LFD a13, 4 * SIZE(AO2)
  353. LFD a14, 5 * SIZE(AO2)
  354. LFD a15, 6 * SIZE(AO2)
  355. LFD a16, 7 * SIZE(AO2)
  /* bdz decrements CTR first: when only one block is left, skip the  */
  /* steady-state loop and go straight to the drain code at LL(13).   */
  356. bdz LL(13)
  /*
   * Steady-state loop for a column pair: eight rows per pass, done as
   * two halves of four rows.  For every row the update is
   *     a_re += alphaN_r * x_re - alphaN_i * x_im
   *     a_im += alphaN_r * x_im + alphaN_i * x_re
   * with alpha1 applied to the AO1 column and alpha2 to the AO2
   * column.  Loads for the next half are interleaved with the stores
   * of the current one, so on entry a1..a16 and y01..y08 already hold
   * the first four rows.
   */
  357. .align 4
  358. LL(12):
  /* ---- rows 0..3: alphaN_r * x terms for both columns ---- */
  359. FMADD a1, alpha1_r, y01, a1
  360. FMADD a2, alpha1_r, y02, a2
  361. FMADD a3, alpha1_r, y03, a3
  362. FMADD a4, alpha1_r, y04, a4
  363. FMADD a5, alpha1_r, y05, a5
  364. FMADD a6, alpha1_r, y06, a6
  365. FMADD a7, alpha1_r, y07, a7
  366. FMADD a8, alpha1_r, y08, a8
  367. FMADD a9, alpha2_r, y01, a9
  368. FMADD a10, alpha2_r, y02, a10
  369. FMADD a11, alpha2_r, y03, a11
  370. FMADD a12, alpha2_r, y04, a12
  371. FMADD a13, alpha2_r, y05, a13
  372. FMADD a14, alpha2_r, y06, a14
  373. FMADD a15, alpha2_r, y07, a15
  374. FMADD a16, alpha2_r, y08, a16
  /* Column 1, rows 0..1: alpha1_i terms, store, reload rows 4..5.    */
  375. FNMSUB a1, alpha1_i, y02, a1
  376. FMADD a2, alpha1_i, y01, a2
  377. FNMSUB a3, alpha1_i, y04, a3
  378. FMADD a4, alpha1_i, y03, a4
  379. STFD a1, 0 * SIZE(AO1)
  380. STFD a2, 1 * SIZE(AO1)
  381. STFD a3, 2 * SIZE(AO1)
  382. STFD a4, 3 * SIZE(AO1)
  383. LFD a1, 8 * SIZE(AO1)
  384. LFD a2, 9 * SIZE(AO1)
  385. LFD a3, 10 * SIZE(AO1)
  386. LFD a4, 11 * SIZE(AO1)
  /* Column 1, rows 2..3: alpha1_i terms, store, reload rows 6..7.    */
  387. FNMSUB a5, alpha1_i, y06, a5
  388. FMADD a6, alpha1_i, y05, a6
  389. FNMSUB a7, alpha1_i, y08, a7
  390. FMADD a8, alpha1_i, y07, a8
  391. STFD a5, 4 * SIZE(AO1)
  392. STFD a6, 5 * SIZE(AO1)
  393. STFD a7, 6 * SIZE(AO1)
  394. STFD a8, 7 * SIZE(AO1)
  395. LFD a5, 12 * SIZE(AO1)
  396. LFD a6, 13 * SIZE(AO1)
  397. LFD a7, 14 * SIZE(AO1)
  398. LFD a8, 15 * SIZE(AO1)
  /* Column 2, rows 0..1.  y01..y04 are dead after these four         */
  /* instructions, so they are reloaded with x rows 4..5.             */
  399. FNMSUB a9, alpha2_i, y02, a9
  400. FMADD a10, alpha2_i, y01, a10
  401. FNMSUB a11, alpha2_i, y04, a11
  402. FMADD a12, alpha2_i, y03, a12
  403. LFD y01, 8 * SIZE(X1)
  404. LFD y02, 9 * SIZE(X1)
  405. LFD y03, 10 * SIZE(X1)
  406. LFD y04, 11 * SIZE(X1)
  407. STFD a9, 0 * SIZE(AO2)
  408. STFD a10, 1 * SIZE(AO2)
  409. STFD a11, 2 * SIZE(AO2)
  410. STFD a12, 3 * SIZE(AO2)
  411. LFD a9, 8 * SIZE(AO2)
  412. LFD a10, 9 * SIZE(AO2)
  413. LFD a11, 10 * SIZE(AO2)
  414. LFD a12, 11 * SIZE(AO2)
  /* Column 2, rows 2..3; then reload y05..y08 with x rows 6..7.      */
  415. FNMSUB a13, alpha2_i, y06, a13
  416. FMADD a14, alpha2_i, y05, a14
  417. FNMSUB a15, alpha2_i, y08, a15
  418. FMADD a16, alpha2_i, y07, a16
  419. LFD y05, 12 * SIZE(X1)
  420. LFD y06, 13 * SIZE(X1)
  421. LFD y07, 14 * SIZE(X1)
  422. LFD y08, 15 * SIZE(X1)
  423. STFD a13, 4 * SIZE(AO2)
  424. STFD a14, 5 * SIZE(AO2)
  425. STFD a15, 6 * SIZE(AO2)
  426. STFD a16, 7 * SIZE(AO2)
  427. LFD a13, 12 * SIZE(AO2)
  428. LFD a14, 13 * SIZE(AO2)
  429. LFD a15, 14 * SIZE(AO2)
  430. LFD a16, 15 * SIZE(AO2)
  /* ---- rows 4..7: same sequence, one half-block further on ---- */
  431. FMADD a1, alpha1_r, y01, a1
  432. FMADD a2, alpha1_r, y02, a2
  433. FMADD a3, alpha1_r, y03, a3
  434. FMADD a4, alpha1_r, y04, a4
  435. FMADD a5, alpha1_r, y05, a5
  436. FMADD a6, alpha1_r, y06, a6
  437. FMADD a7, alpha1_r, y07, a7
  438. FMADD a8, alpha1_r, y08, a8
  439. FMADD a9, alpha2_r, y01, a9
  440. FMADD a10, alpha2_r, y02, a10
  441. FMADD a11, alpha2_r, y03, a11
  442. FMADD a12, alpha2_r, y04, a12
  443. FMADD a13, alpha2_r, y05, a13
  444. FMADD a14, alpha2_r, y06, a14
  445. FMADD a15, alpha2_r, y07, a15
  446. FMADD a16, alpha2_r, y08, a16
  447. FNMSUB a1, alpha1_i, y02, a1
  448. FMADD a2, alpha1_i, y01, a2
  449. FNMSUB a3, alpha1_i, y04, a3
  450. FMADD a4, alpha1_i, y03, a4
  451. STFD a1, 8 * SIZE(AO1)
  452. STFD a2, 9 * SIZE(AO1)
  453. STFD a3, 10 * SIZE(AO1)
  454. STFD a4, 11 * SIZE(AO1)
  /* Loads at offsets 16..23 belong to the first half of the next     */
  /* block; the pointers are only advanced at the bottom of the loop. */
  455. LFD a1, 16 * SIZE(AO1)
  456. LFD a2, 17 * SIZE(AO1)
  457. LFD a3, 18 * SIZE(AO1)
  458. LFD a4, 19 * SIZE(AO1)
  459. FNMSUB a5, alpha1_i, y06, a5
  460. FMADD a6, alpha1_i, y05, a6
  461. FNMSUB a7, alpha1_i, y08, a7
  462. FMADD a8, alpha1_i, y07, a8
  463. STFD a5, 12 * SIZE(AO1)
  464. STFD a6, 13 * SIZE(AO1)
  465. STFD a7, 14 * SIZE(AO1)
  466. STFD a8, 15 * SIZE(AO1)
  467. LFD a5, 20 * SIZE(AO1)
  468. LFD a6, 21 * SIZE(AO1)
  469. LFD a7, 22 * SIZE(AO1)
  470. LFD a8, 23 * SIZE(AO1)
  471. FNMSUB a9, alpha2_i, y02, a9
  472. FMADD a10, alpha2_i, y01, a10
  473. FNMSUB a11, alpha2_i, y04, a11
  474. FMADD a12, alpha2_i, y03, a12
  475. LFD y01, 16 * SIZE(X1)
  476. LFD y02, 17 * SIZE(X1)
  477. LFD y03, 18 * SIZE(X1)
  478. LFD y04, 19 * SIZE(X1)
  479. STFD a9, 8 * SIZE(AO2)
  480. STFD a10, 9 * SIZE(AO2)
  481. STFD a11, 10 * SIZE(AO2)
  482. STFD a12, 11 * SIZE(AO2)
  483. LFD a9, 16 * SIZE(AO2)
  484. LFD a10, 17 * SIZE(AO2)
  485. LFD a11, 18 * SIZE(AO2)
  486. LFD a12, 19 * SIZE(AO2)
  487. FNMSUB a13, alpha2_i, y06, a13
  488. FMADD a14, alpha2_i, y05, a14
  489. FNMSUB a15, alpha2_i, y08, a15
  490. FMADD a16, alpha2_i, y07, a16
  491. LFD y05, 20 * SIZE(X1)
  492. LFD y06, 21 * SIZE(X1)
  493. LFD y07, 22 * SIZE(X1)
  494. LFD y08, 23 * SIZE(X1)
  495. STFD a13, 12 * SIZE(AO2)
  496. STFD a14, 13 * SIZE(AO2)
  497. STFD a15, 14 * SIZE(AO2)
  498. STFD a16, 15 * SIZE(AO2)
  499. LFD a13, 20 * SIZE(AO2)
  500. LFD a14, 21 * SIZE(AO2)
  501. LFD a15, 22 * SIZE(AO2)
  502. LFD a16, 23 * SIZE(AO2)
  /* Advance all three cursors by one block (eight rows).             */
  503. addi AO1, AO1, 16 * SIZE
  504. addi AO2, AO2, 16 * SIZE
  505. addi X1, X1, 16 * SIZE
  /* Prefetch hints (DCBT is a macro defined elsewhere) for both      */
  /* columns and for the x stream.  The x hint uses X1, the cursor    */
  /* this loop reads through, and PREC, the offset loaded in the      */
  /* prologue.                                                        */
  506. DCBT(AO1, PREA)
  507. DCBT(AO2, PREA)
  508. DCBT(X1, PREC)
  509. bdnz+ LL(12)
  /*
   * Drain code for a column pair: processes the last block of eight
   * rows.  It is the loop body of LL(12) without the loads for a
   * following block, so nothing is read beyond offset 15 * SIZE.
   * On entry a1..a16 and y01..y08 hold rows 0..3 of the block.
   */
  510. .align 4
  511. LL(13):
  /* ---- rows 0..3 ---- */
  512. FMADD a1, alpha1_r, y01, a1
  513. FMADD a2, alpha1_r, y02, a2
  514. FMADD a3, alpha1_r, y03, a3
  515. FMADD a4, alpha1_r, y04, a4
  516. FMADD a5, alpha1_r, y05, a5
  517. FMADD a6, alpha1_r, y06, a6
  518. FMADD a7, alpha1_r, y07, a7
  519. FMADD a8, alpha1_r, y08, a8
  520. FMADD a9, alpha2_r, y01, a9
  521. FMADD a10, alpha2_r, y02, a10
  522. FMADD a11, alpha2_r, y03, a11
  523. FMADD a12, alpha2_r, y04, a12
  524. FMADD a13, alpha2_r, y05, a13
  525. FMADD a14, alpha2_r, y06, a14
  526. FMADD a15, alpha2_r, y07, a15
  527. FMADD a16, alpha2_r, y08, a16
  528. FNMSUB a1, alpha1_i, y02, a1
  529. FMADD a2, alpha1_i, y01, a2
  530. FNMSUB a3, alpha1_i, y04, a3
  531. FMADD a4, alpha1_i, y03, a4
  532. STFD a1, 0 * SIZE(AO1)
  533. STFD a2, 1 * SIZE(AO1)
  534. STFD a3, 2 * SIZE(AO1)
  535. STFD a4, 3 * SIZE(AO1)
  /* Reload a1..a8 with rows 4..7 of the first column.                */
  536. LFD a1, 8 * SIZE(AO1)
  537. LFD a2, 9 * SIZE(AO1)
  538. LFD a3, 10 * SIZE(AO1)
  539. LFD a4, 11 * SIZE(AO1)
  540. FNMSUB a5, alpha1_i, y06, a5
  541. FMADD a6, alpha1_i, y05, a6
  542. FNMSUB a7, alpha1_i, y08, a7
  543. FMADD a8, alpha1_i, y07, a8
  544. STFD a5, 4 * SIZE(AO1)
  545. STFD a6, 5 * SIZE(AO1)
  546. STFD a7, 6 * SIZE(AO1)
  547. STFD a8, 7 * SIZE(AO1)
  548. LFD a5, 12 * SIZE(AO1)
  549. LFD a6, 13 * SIZE(AO1)
  550. LFD a7, 14 * SIZE(AO1)
  551. LFD a8, 15 * SIZE(AO1)
  /* Second column, rows 0..3; x rows 4..7 are loaded as soon as the  */
  /* old y registers are no longer needed.                            */
  552. FNMSUB a9, alpha2_i, y02, a9
  553. FMADD a10, alpha2_i, y01, a10
  554. FNMSUB a11, alpha2_i, y04, a11
  555. FMADD a12, alpha2_i, y03, a12
  556. LFD y01, 8 * SIZE(X1)
  557. LFD y02, 9 * SIZE(X1)
  558. LFD y03, 10 * SIZE(X1)
  559. LFD y04, 11 * SIZE(X1)
  560. STFD a9, 0 * SIZE(AO2)
  561. STFD a10, 1 * SIZE(AO2)
  562. STFD a11, 2 * SIZE(AO2)
  563. STFD a12, 3 * SIZE(AO2)
  564. LFD a9, 8 * SIZE(AO2)
  565. LFD a10, 9 * SIZE(AO2)
  566. LFD a11, 10 * SIZE(AO2)
  567. LFD a12, 11 * SIZE(AO2)
  568. FNMSUB a13, alpha2_i, y06, a13
  569. FMADD a14, alpha2_i, y05, a14
  570. FNMSUB a15, alpha2_i, y08, a15
  571. FMADD a16, alpha2_i, y07, a16
  572. LFD y05, 12 * SIZE(X1)
  573. LFD y06, 13 * SIZE(X1)
  574. LFD y07, 14 * SIZE(X1)
  575. LFD y08, 15 * SIZE(X1)
  576. STFD a13, 4 * SIZE(AO2)
  577. STFD a14, 5 * SIZE(AO2)
  578. STFD a15, 6 * SIZE(AO2)
  579. STFD a16, 7 * SIZE(AO2)
  580. LFD a13, 12 * SIZE(AO2)
  581. LFD a14, 13 * SIZE(AO2)
  582. LFD a15, 14 * SIZE(AO2)
  583. LFD a16, 15 * SIZE(AO2)
  /* ---- rows 4..7: compute and store only, no further loads ---- */
  584. FMADD a1, alpha1_r, y01, a1
  585. FMADD a2, alpha1_r, y02, a2
  586. FMADD a3, alpha1_r, y03, a3
  587. FMADD a4, alpha1_r, y04, a4
  588. FMADD a5, alpha1_r, y05, a5
  589. FMADD a6, alpha1_r, y06, a6
  590. FMADD a7, alpha1_r, y07, a7
  591. FMADD a8, alpha1_r, y08, a8
  592. FMADD a9, alpha2_r, y01, a9
  593. FMADD a10, alpha2_r, y02, a10
  594. FMADD a11, alpha2_r, y03, a11
  595. FMADD a12, alpha2_r, y04, a12
  596. FMADD a13, alpha2_r, y05, a13
  597. FMADD a14, alpha2_r, y06, a14
  598. FMADD a15, alpha2_r, y07, a15
  599. FMADD a16, alpha2_r, y08, a16
  600. FNMSUB a1, alpha1_i, y02, a1
  601. FMADD a2, alpha1_i, y01, a2
  602. FNMSUB a3, alpha1_i, y04, a3
  603. FMADD a4, alpha1_i, y03, a4
  604. STFD a1, 8 * SIZE(AO1)
  605. STFD a2, 9 * SIZE(AO1)
  606. STFD a3, 10 * SIZE(AO1)
  607. STFD a4, 11 * SIZE(AO1)
  608. FNMSUB a5, alpha1_i, y06, a5
  609. FMADD a6, alpha1_i, y05, a6
  610. FNMSUB a7, alpha1_i, y08, a7
  611. FMADD a8, alpha1_i, y07, a8
  612. STFD a5, 12 * SIZE(AO1)
  613. STFD a6, 13 * SIZE(AO1)
  614. STFD a7, 14 * SIZE(AO1)
  615. STFD a8, 15 * SIZE(AO1)
  616. FNMSUB a9, alpha2_i, y02, a9
  617. FMADD a10, alpha2_i, y01, a10
  618. FNMSUB a11, alpha2_i, y04, a11
  619. FMADD a12, alpha2_i, y03, a12
  620. STFD a9, 8 * SIZE(AO2)
  621. STFD a10, 9 * SIZE(AO2)
  622. STFD a11, 10 * SIZE(AO2)
  623. STFD a12, 11 * SIZE(AO2)
  624. FNMSUB a13, alpha2_i, y06, a13
  625. FMADD a14, alpha2_i, y05, a14
  626. FNMSUB a15, alpha2_i, y08, a15
  627. FMADD a16, alpha2_i, y07, a16
  628. STFD a13, 12 * SIZE(AO2)
  629. STFD a14, 13 * SIZE(AO2)
  630. STFD a15, 14 * SIZE(AO2)
  631. STFD a16, 15 * SIZE(AO2)
  /* Step past the finished block so the remainder code at LL(15)     */
  /* starts on the first unprocessed row.                             */
  632. addi AO1, AO1, 16 * SIZE
  633. addi AO2, AO2, 16 * SIZE
  634. addi X1, X1, 16 * SIZE
  /*
   * Row remainder for a column pair: the M mod 8 rows left after the
   * eight-row blocks are handled as an optional group of four, then
   * two, then one, selected by testing bits 2, 1 and 0 of M.  LL(19)
   * closes the column-pair loop.
   */
  635. .align 4
  636. LL(15):
  /* No leftover rows at all: go straight to the loop close.          */
  637. andi. r0, M, 7
  638. ble LL(19)
  /* ---- four rows, taken when bit 2 of M is set ---- */
  639. andi. r0, M, 4
  640. ble LL(17)
  641. LFD a1, 0 * SIZE(AO1)
  642. LFD a2, 1 * SIZE(AO1)
  643. LFD a3, 2 * SIZE(AO1)
  644. LFD a4, 3 * SIZE(AO1)
  645. LFD a5, 4 * SIZE(AO1)
  646. LFD a6, 5 * SIZE(AO1)
  647. LFD a7, 6 * SIZE(AO1)
  648. LFD a8, 7 * SIZE(AO1)
  649. LFD y01, 0 * SIZE(X1)
  650. LFD y02, 1 * SIZE(X1)
  651. LFD y03, 2 * SIZE(X1)
  652. LFD y04, 3 * SIZE(X1)
  653. LFD y05, 4 * SIZE(X1)
  654. LFD y06, 5 * SIZE(X1)
  655. LFD y07, 6 * SIZE(X1)
  656. LFD y08, 7 * SIZE(X1)
  657. LFD a9, 0 * SIZE(AO2)
  658. LFD a10, 1 * SIZE(AO2)
  659. LFD a11, 2 * SIZE(AO2)
  660. LFD a12, 3 * SIZE(AO2)
  661. LFD a13, 4 * SIZE(AO2)
  662. LFD a14, 5 * SIZE(AO2)
  663. LFD a15, 6 * SIZE(AO2)
  664. LFD a16, 7 * SIZE(AO2)
  /* alphaN_r * x terms ...                                           */
  665. FMADD a1, alpha1_r, y01, a1
  666. FMADD a2, alpha1_r, y02, a2
  667. FMADD a3, alpha1_r, y03, a3
  668. FMADD a4, alpha1_r, y04, a4
  669. FMADD a5, alpha1_r, y05, a5
  670. FMADD a6, alpha1_r, y06, a6
  671. FMADD a7, alpha1_r, y07, a7
  672. FMADD a8, alpha1_r, y08, a8
  673. FMADD a9, alpha2_r, y01, a9
  674. FMADD a10, alpha2_r, y02, a10
  675. FMADD a11, alpha2_r, y03, a11
  676. FMADD a12, alpha2_r, y04, a12
  677. FMADD a13, alpha2_r, y05, a13
  678. FMADD a14, alpha2_r, y06, a14
  679. FMADD a15, alpha2_r, y07, a15
  680. FMADD a16, alpha2_r, y08, a16
  /* ... then the alphaN_i * x terms: subtracted for the real parts,  */
  /* added for the imaginary parts.                                   */
  681. FNMSUB a1, alpha1_i, y02, a1
  682. FMADD a2, alpha1_i, y01, a2
  683. FNMSUB a3, alpha1_i, y04, a3
  684. FMADD a4, alpha1_i, y03, a4
  685. FNMSUB a5, alpha1_i, y06, a5
  686. FMADD a6, alpha1_i, y05, a6
  687. FNMSUB a7, alpha1_i, y08, a7
  688. FMADD a8, alpha1_i, y07, a8
  689. FNMSUB a9, alpha2_i, y02, a9
  690. FMADD a10, alpha2_i, y01, a10
  691. FNMSUB a11, alpha2_i, y04, a11
  692. FMADD a12, alpha2_i, y03, a12
  693. FNMSUB a13, alpha2_i, y06, a13
  694. FMADD a14, alpha2_i, y05, a14
  695. FNMSUB a15, alpha2_i, y08, a15
  696. FMADD a16, alpha2_i, y07, a16
  697. STFD a1, 0 * SIZE(AO1)
  698. STFD a2, 1 * SIZE(AO1)
  699. STFD a3, 2 * SIZE(AO1)
  700. STFD a4, 3 * SIZE(AO1)
  701. STFD a5, 4 * SIZE(AO1)
  702. STFD a6, 5 * SIZE(AO1)
  703. STFD a7, 6 * SIZE(AO1)
  704. STFD a8, 7 * SIZE(AO1)
  705. STFD a9, 0 * SIZE(AO2)
  706. STFD a10, 1 * SIZE(AO2)
  707. STFD a11, 2 * SIZE(AO2)
  708. STFD a12, 3 * SIZE(AO2)
  709. STFD a13, 4 * SIZE(AO2)
  710. STFD a14, 5 * SIZE(AO2)
  711. STFD a15, 6 * SIZE(AO2)
  712. STFD a16, 7 * SIZE(AO2)
  713. addi AO1, AO1, 8 * SIZE
  714. addi AO2, AO2, 8 * SIZE
  715. addi X1, X1, 8 * SIZE
  716. .align 4
  717. LL(17):
  /* ---- two rows, taken when bit 1 of M is set.  Here a1..a4 hold   */
  /* the first column and a5..a8 the second. ----                     */
  718. andi. r0, M, 2
  719. ble LL(18)
  720. LFD a1, 0 * SIZE(AO1)
  721. LFD a2, 1 * SIZE(AO1)
  722. LFD a3, 2 * SIZE(AO1)
  723. LFD a4, 3 * SIZE(AO1)
  724. LFD y01, 0 * SIZE(X1)
  725. LFD y02, 1 * SIZE(X1)
  726. LFD y03, 2 * SIZE(X1)
  727. LFD y04, 3 * SIZE(X1)
  728. LFD a5, 0 * SIZE(AO2)
  729. LFD a6, 1 * SIZE(AO2)
  730. LFD a7, 2 * SIZE(AO2)
  731. LFD a8, 3 * SIZE(AO2)
  732. FMADD a1, alpha1_r, y01, a1
  733. FMADD a2, alpha1_r, y02, a2
  734. FMADD a3, alpha1_r, y03, a3
  735. FMADD a4, alpha1_r, y04, a4
  736. FMADD a5, alpha2_r, y01, a5
  737. FMADD a6, alpha2_r, y02, a6
  738. FMADD a7, alpha2_r, y03, a7
  739. FMADD a8, alpha2_r, y04, a8
  740. FNMSUB a1, alpha1_i, y02, a1
  741. FMADD a2, alpha1_i, y01, a2
  742. FNMSUB a3, alpha1_i, y04, a3
  743. FMADD a4, alpha1_i, y03, a4
  744. FNMSUB a5, alpha2_i, y02, a5
  745. FMADD a6, alpha2_i, y01, a6
  746. FNMSUB a7, alpha2_i, y04, a7
  747. FMADD a8, alpha2_i, y03, a8
  748. STFD a1, 0 * SIZE(AO1)
  749. STFD a2, 1 * SIZE(AO1)
  750. STFD a3, 2 * SIZE(AO1)
  751. STFD a4, 3 * SIZE(AO1)
  752. STFD a5, 0 * SIZE(AO2)
  753. STFD a6, 1 * SIZE(AO2)
  754. STFD a7, 2 * SIZE(AO2)
  755. STFD a8, 3 * SIZE(AO2)
  756. addi AO1, AO1, 4 * SIZE
  757. addi AO2, AO2, 4 * SIZE
  758. addi X1, X1, 4 * SIZE
  759. .align 4
  760. LL(18):
  /* ---- one row, taken when bit 0 of M is set.  a1 / a2 hold the    */
  /* first column, a3 / a4 the second.  The cursors are not advanced  */
  /* afterwards because nothing follows within this column pair. ---- */
  761. andi. r0, M, 1
  762. ble LL(19)
  763. LFD a1, 0 * SIZE(AO1)
  764. LFD a2, 1 * SIZE(AO1)
  765. LFD a3, 0 * SIZE(AO2)
  766. LFD a4, 1 * SIZE(AO2)
  767. LFD y01, 0 * SIZE(X1)
  768. LFD y02, 1 * SIZE(X1)
  769. FMADD a1, alpha1_r, y01, a1
  770. FMADD a2, alpha1_r, y02, a2
  771. FMADD a3, alpha2_r, y01, a3
  772. FMADD a4, alpha2_r, y02, a4
  773. FNMSUB a1, alpha1_i, y02, a1
  774. FMADD a2, alpha1_i, y01, a2
  775. FNMSUB a3, alpha2_i, y02, a3
  776. FMADD a4, alpha2_i, y01, a4
  777. STFD a1, 0 * SIZE(AO1)
  778. STFD a2, 1 * SIZE(AO1)
  779. STFD a3, 0 * SIZE(AO2)
  780. STFD a4, 1 * SIZE(AO2)
  781. .align 4
  782. LL(19):
  /* One column pair done; repeat while pairs remain.                 */
  783. addi J, J, -1
  784. cmpi cr0, 0, J, 0
  785. bgt LL(11)
  /*
   * Odd-column path: when N is odd one column is left after the
   * pairs.  Only alpha1, AO1 and a1..a8 are set up here.  LL(999),
   * LL(23) and LL(25) lie partly or wholly outside the visible
   * portion of the file.
   */
  786. .align 4
  787. LL(20):
  /* N even: no column left, leave through LL(999).                   */
  788. andi. J, N, 1
  789. ble LL(999)
  /* alpha1 = alpha times the last element of y, formed exactly as in */
  /* the column-pair path (sign of the cross terms set by CONJ).      */
  790. LFD alpha1_r, 0 * SIZE(Y)
  791. LFD alpha1_i, 1 * SIZE(Y)
  792. FMUL a1, alpha_r, alpha1_r
  793. FMUL a2, alpha_i, alpha1_r
  794. FMA1 alpha1_r, alpha_i, alpha1_i, a1
  795. FMA2 alpha1_i, alpha_r, alpha1_i, a2
  /* AO1 addresses the remaining column; the x cursor restarts.       */
  796. mr AO1, A
  797. mr X1, XX
  /* CTR = M / 8 blocks of eight rows; with none, go to LL(25).       */
  798. srawi. r0, M, 3
  799. mtspr CTR, r0
  800. ble LL(25)
  /* Preload the first four rows of the column and of x.              */
  801. LFD a1, 0 * SIZE(AO1)
  802. LFD a2, 1 * SIZE(AO1)
  803. LFD a3, 2 * SIZE(AO1)
  804. LFD a4, 3 * SIZE(AO1)
  805. LFD a5, 4 * SIZE(AO1)
  806. LFD a6, 5 * SIZE(AO1)
  807. LFD a7, 6 * SIZE(AO1)
  808. LFD a8, 7 * SIZE(AO1)
  809. LFD y01, 0 * SIZE(X1)
  810. LFD y02, 1 * SIZE(X1)
  811. LFD y03, 2 * SIZE(X1)
  812. LFD y04, 3 * SIZE(X1)
  813. LFD y05, 4 * SIZE(X1)
  814. LFD y06, 5 * SIZE(X1)
  815. LFD y07, 6 * SIZE(X1)
  816. LFD y08, 7 * SIZE(X1)
  /* bdz decrements CTR first: with a single block, skip the          */
  /* steady-state loop and go to LL(23).                              */
  817. bdz LL(23)
  /*
   * Steady-state loop for the single remaining column: eight rows per
   * pass, done as two halves of four rows.  For every row
   *     a_re += alpha1_r * x_re - alpha1_i * x_im
   *     a_im += alpha1_r * x_im + alpha1_i * x_re
   * Only a1..a8 and alpha1 take part.  a9..a16 and alpha2 are never
   * loaded or stored on this path in the visible code, so no
   * second-column arithmetic is issued here.
   */
  818. .align 4
  819. LL(22):
  /* ---- rows 0..3 ---- */
  820. FMADD a1, alpha1_r, y01, a1
  821. FMADD a2, alpha1_r, y02, a2
  822. FMADD a3, alpha1_r, y03, a3
  823. FMADD a4, alpha1_r, y04, a4
  824. FMADD a5, alpha1_r, y05, a5
  825. FMADD a6, alpha1_r, y06, a6
  826. FMADD a7, alpha1_r, y07, a7
  827. FMADD a8, alpha1_r, y08, a8
  828. FNMSUB a1, alpha1_i, y02, a1
  829. FMADD a2, alpha1_i, y01, a2
  830. FNMSUB a3, alpha1_i, y04, a3
  831. FMADD a4, alpha1_i, y03, a4
  832. STFD a1, 0 * SIZE(AO1)
  833. STFD a2, 1 * SIZE(AO1)
  834. STFD a3, 2 * SIZE(AO1)
  835. STFD a4, 3 * SIZE(AO1)
  836. LFD a1, 8 * SIZE(AO1)
  837. LFD a2, 9 * SIZE(AO1)
  838. LFD a3, 10 * SIZE(AO1)
  839. LFD a4, 11 * SIZE(AO1)
  840. FNMSUB a5, alpha1_i, y06, a5
  841. FMADD a6, alpha1_i, y05, a6
  842. FNMSUB a7, alpha1_i, y08, a7
  843. FMADD a8, alpha1_i, y07, a8
  844. STFD a5, 4 * SIZE(AO1)
  845. STFD a6, 5 * SIZE(AO1)
  846. STFD a7, 6 * SIZE(AO1)
  847. STFD a8, 7 * SIZE(AO1)
  848. LFD a5, 12 * SIZE(AO1)
  849. LFD a6, 13 * SIZE(AO1)
  850. LFD a7, 14 * SIZE(AO1)
  851. LFD a8, 15 * SIZE(AO1)
  /* All of y01..y08 have been consumed; reload with x rows 4..7.     */
  852. LFD y01, 8 * SIZE(X1)
  853. LFD y02, 9 * SIZE(X1)
  854. LFD y03, 10 * SIZE(X1)
  855. LFD y04, 11 * SIZE(X1)
  856. LFD y05, 12 * SIZE(X1)
  857. LFD y06, 13 * SIZE(X1)
  858. LFD y07, 14 * SIZE(X1)
  859. LFD y08, 15 * SIZE(X1)
  /* ---- rows 4..7 ---- */
  860. FMADD a1, alpha1_r, y01, a1
  861. FMADD a2, alpha1_r, y02, a2
  862. FMADD a3, alpha1_r, y03, a3
  863. FMADD a4, alpha1_r, y04, a4
  864. FMADD a5, alpha1_r, y05, a5
  865. FMADD a6, alpha1_r, y06, a6
  866. FMADD a7, alpha1_r, y07, a7
  867. FMADD a8, alpha1_r, y08, a8
  876. FNMSUB a1, alpha1_i, y02, a1
  877. FMADD a2, alpha1_i, y01, a2
  878. FNMSUB a3, alpha1_i, y04, a3
  879. FMADD a4, alpha1_i, y03, a4
  880. STFD a1, 8 * SIZE(AO1)
  881. STFD a2, 9 * SIZE(AO1)
  882. STFD a3, 10 * SIZE(AO1)
  883. STFD a4, 11 * SIZE(AO1)
  /* Loads at offsets 16..23 belong to the first half of the next     */
  /* block; the pointers are only advanced at the bottom of the loop. */
  884. LFD a1, 16 * SIZE(AO1)
  885. LFD a2, 17 * SIZE(AO1)
  886. LFD a3, 18 * SIZE(AO1)
  887. LFD a4, 19 * SIZE(AO1)
  888. FNMSUB a5, alpha1_i, y06, a5
  889. FMADD a6, alpha1_i, y05, a6
  890. FNMSUB a7, alpha1_i, y08, a7
  891. FMADD a8, alpha1_i, y07, a8
  892. STFD a5, 12 * SIZE(AO1)
  893. STFD a6, 13 * SIZE(AO1)
  894. STFD a7, 14 * SIZE(AO1)
  895. STFD a8, 15 * SIZE(AO1)
  896. LFD a5, 20 * SIZE(AO1)
  897. LFD a6, 21 * SIZE(AO1)
  898. LFD a7, 22 * SIZE(AO1)
  899. LFD a8, 23 * SIZE(AO1)
  900. LFD y01, 16 * SIZE(X1)
  901. LFD y02, 17 * SIZE(X1)
  902. LFD y03, 18 * SIZE(X1)
  903. LFD y04, 19 * SIZE(X1)
  904. LFD y05, 20 * SIZE(X1)
  905. LFD y06, 21 * SIZE(X1)
  906. LFD y07, 22 * SIZE(X1)
  907. LFD y08, 23 * SIZE(X1)
  /* Advance both cursors by one block (eight rows).                  */
  908. addi AO1, AO1, 16 * SIZE
  909. addi X1, X1, 16 * SIZE
  /* Prefetch hints (DCBT is a macro defined elsewhere) for the       */
  /* column and for the x stream.  The x hint uses X1, the cursor     */
  /* this loop reads through, and PREC, the offset loaded in the      */
  /* prologue.                                                        */
  910. DCBT(AO1, PREA)
  911. DCBT(X1, PREC)
  912. bdnz+ LL(22)
  /*
   * LL(23): straight-line (non-looping) update of the 16 SIZE-sized slots
   * AO1[0..15] using X1[0..15], alpha1_r and alpha1_i; results are stored
   * back to AO1 and both pointers are then advanced by 16 slots.
   *
   * a1-a8 and y01-y08 are consumed before anything is loaded here, so they
   * must already hold AO1[0..7] and X1[0..7] on entry.  On the fall-through
   * path from the loop above they do: the loop tail loads offsets 16..23
   * and then bumps AO1/X1 by 16 * SIZE.
   *
   * NOTE(review): FMADD/FNMSUB are macros defined outside this chunk;
   * presumably FMADD d,a,b,c is d = a*b + c and FNMSUB d,a,b,c is
   * d = c - a*b, which would make each (even, odd) slot pair one complex
   * multiply-accumulate A += alpha1 * x -- confirm against the macro
   * definitions.
   */
  913. .align 4
  914. LL(23):
  /* alpha1_r terms for slots 0..7 (same-index a/y registers) */
  915. FMADD a1, alpha1_r, y01, a1
  916. FMADD a2, alpha1_r, y02, a2
  917. FMADD a3, alpha1_r, y03, a3
  918. FMADD a4, alpha1_r, y04, a4
  919. FMADD a5, alpha1_r, y05, a5
  920. FMADD a6, alpha1_r, y06, a6
  921. FMADD a7, alpha1_r, y07, a7
  922. FMADD a8, alpha1_r, y08, a8
  /* alpha1_i cross terms for slots 0..3: odd-numbered a registers take */
  /* the neighbouring even y via FNMSUB, even-numbered ones the odd y via FMADD */
  923. FNMSUB a1, alpha1_i, y02, a1
  924. FMADD a2, alpha1_i, y01, a2
  925. FNMSUB a3, alpha1_i, y04, a3
  926. FMADD a4, alpha1_i, y03, a4
  /* write back slots 0..3, then reuse a1-a4 for slots 8..11 */
  927. STFD a1, 0 * SIZE(AO1)
  928. STFD a2, 1 * SIZE(AO1)
  929. STFD a3, 2 * SIZE(AO1)
  930. STFD a4, 3 * SIZE(AO1)
  931. LFD a1, 8 * SIZE(AO1)
  932. LFD a2, 9 * SIZE(AO1)
  933. LFD a3, 10 * SIZE(AO1)
  934. LFD a4, 11 * SIZE(AO1)
  /* alpha1_i cross terms for slots 4..7 */
  935. FNMSUB a5, alpha1_i, y06, a5
  936. FMADD a6, alpha1_i, y05, a6
  937. FNMSUB a7, alpha1_i, y08, a7
  938. FMADD a8, alpha1_i, y07, a8
  /* write back slots 4..7, then reuse a5-a8 for slots 12..15 */
  939. STFD a5, 4 * SIZE(AO1)
  940. STFD a6, 5 * SIZE(AO1)
  941. STFD a7, 6 * SIZE(AO1)
  942. STFD a8, 7 * SIZE(AO1)
  943. LFD a5, 12 * SIZE(AO1)
  944. LFD a6, 13 * SIZE(AO1)
  945. LFD a7, 14 * SIZE(AO1)
  946. LFD a8, 15 * SIZE(AO1)
  /* reload y01-y08 with X1 slots 8..15 for the second half */
  947. LFD y01, 8 * SIZE(X1)
  948. LFD y02, 9 * SIZE(X1)
  949. LFD y03, 10 * SIZE(X1)
  950. LFD y04, 11 * SIZE(X1)
  951. LFD y05, 12 * SIZE(X1)
  952. LFD y06, 13 * SIZE(X1)
  953. LFD y07, 14 * SIZE(X1)
  954. LFD y08, 15 * SIZE(X1)
  /* second half: same sequence applied to slots 8..15 */
  955. FMADD a1, alpha1_r, y01, a1
  956. FMADD a2, alpha1_r, y02, a2
  957. FMADD a3, alpha1_r, y03, a3
  958. FMADD a4, alpha1_r, y04, a4
  959. FMADD a5, alpha1_r, y05, a5
  960. FMADD a6, alpha1_r, y06, a6
  961. FMADD a7, alpha1_r, y07, a7
  962. FMADD a8, alpha1_r, y08, a8
  963. FNMSUB a1, alpha1_i, y02, a1
  964. FMADD a2, alpha1_i, y01, a2
  965. FNMSUB a3, alpha1_i, y04, a3
  966. FMADD a4, alpha1_i, y03, a4
  /* write back slots 8..11; nothing beyond offset 15 is preloaded here */
  967. STFD a1, 8 * SIZE(AO1)
  968. STFD a2, 9 * SIZE(AO1)
  969. STFD a3, 10 * SIZE(AO1)
  970. STFD a4, 11 * SIZE(AO1)
  971. FNMSUB a5, alpha1_i, y06, a5
  972. FMADD a6, alpha1_i, y05, a6
  973. FNMSUB a7, alpha1_i, y08, a7
  974. FMADD a8, alpha1_i, y07, a8
  /* write back slots 12..15 */
  975. STFD a5, 12 * SIZE(AO1)
  976. STFD a6, 13 * SIZE(AO1)
  977. STFD a7, 14 * SIZE(AO1)
  978. STFD a8, 15 * SIZE(AO1)
  /* step both pointers past the 16 slots just processed */
  979. addi AO1, AO1, 16 * SIZE
  980. addi X1, X1, 16 * SIZE
  /*
   * LL(25): remainder handling, driven by the low three bits of M.
   *   - (M & 7) == 0: nothing left, branch to the exit path LL(999).
   *   - (M & 4) != 0: update the 8 slots AO1[0..7] from X1[0..7] here and
   *     advance both pointers by 8 slots; otherwise skip to LL(27).
   * Control then falls through to LL(27) for the (M & 2) case.
   *
   * andi. masks with a zero-extended immediate, so its result is never
   * negative; the following ble is therefore taken exactly when the masked
   * value is zero.
   *
   * NOTE(review): 8 slots presumably correspond to 4 complex (re, im)
   * elements -- confirm against how M and SIZE are defined.
   */
  981. .align 4
  982. LL(25):
  983. andi. r0, M, 7
  984. ble LL(999)
  985. andi. r0, M, 4
  986. ble LL(27)
  /* load AO1[0..7] into a1-a8 */
  987. LFD a1, 0 * SIZE(AO1)
  988. LFD a2, 1 * SIZE(AO1)
  989. LFD a3, 2 * SIZE(AO1)
  990. LFD a4, 3 * SIZE(AO1)
  991. LFD a5, 4 * SIZE(AO1)
  992. LFD a6, 5 * SIZE(AO1)
  993. LFD a7, 6 * SIZE(AO1)
  994. LFD a8, 7 * SIZE(AO1)
  /* load X1[0..7] into y01-y08 */
  995. LFD y01, 0 * SIZE(X1)
  996. LFD y02, 1 * SIZE(X1)
  997. LFD y03, 2 * SIZE(X1)
  998. LFD y04, 3 * SIZE(X1)
  999. LFD y05, 4 * SIZE(X1)
  1000. LFD y06, 5 * SIZE(X1)
  1001. LFD y07, 6 * SIZE(X1)
  1002. LFD y08, 7 * SIZE(X1)
  /* alpha1_r terms, same-index a/y registers */
  1003. FMADD a1, alpha1_r, y01, a1
  1004. FMADD a2, alpha1_r, y02, a2
  1005. FMADD a3, alpha1_r, y03, a3
  1006. FMADD a4, alpha1_r, y04, a4
  1007. FMADD a5, alpha1_r, y05, a5
  1008. FMADD a6, alpha1_r, y06, a6
  1009. FMADD a7, alpha1_r, y07, a7
  1010. FMADD a8, alpha1_r, y08, a8
  /* alpha1_i cross terms: each a register pairs with its neighbour's y */
  1011. FNMSUB a1, alpha1_i, y02, a1
  1012. FMADD a2, alpha1_i, y01, a2
  1013. FNMSUB a3, alpha1_i, y04, a3
  1014. FMADD a4, alpha1_i, y03, a4
  1015. FNMSUB a5, alpha1_i, y06, a5
  1016. FMADD a6, alpha1_i, y05, a6
  1017. FNMSUB a7, alpha1_i, y08, a7
  1018. FMADD a8, alpha1_i, y07, a8
  /* write the 8 updated slots back to AO1 */
  1019. STFD a1, 0 * SIZE(AO1)
  1020. STFD a2, 1 * SIZE(AO1)
  1021. STFD a3, 2 * SIZE(AO1)
  1022. STFD a4, 3 * SIZE(AO1)
  1023. STFD a5, 4 * SIZE(AO1)
  1024. STFD a6, 5 * SIZE(AO1)
  1025. STFD a7, 6 * SIZE(AO1)
  1026. STFD a8, 7 * SIZE(AO1)
  /* advance past the 8 slots so LL(27) starts at offset 0 again */
  1027. addi AO1, AO1, 8 * SIZE
  1028. addi X1, X1, 8 * SIZE
  /*
   * LL(27): remainder step for bit 1 of M.  When (M & 2) is zero, skip to
   * LL(28); otherwise update the 4 slots AO1[0..3] from X1[0..3] with
   * alpha1_r / alpha1_i, store them back and advance both pointers by
   * 4 slots.  Falls through to LL(28).
   *
   * NOTE(review): 4 slots presumably correspond to 2 complex elements --
   * confirm.
   */
  1029. .align 4
  1030. LL(27):
  1031. andi. r0, M, 2
  1032. ble LL(28)
  1033. LFD a1, 0 * SIZE(AO1)
  1034. LFD a2, 1 * SIZE(AO1)
  1035. LFD a3, 2 * SIZE(AO1)
  1036. LFD a4, 3 * SIZE(AO1)
  1037. LFD y01, 0 * SIZE(X1)
  1038. LFD y02, 1 * SIZE(X1)
  1039. LFD y03, 2 * SIZE(X1)
  1040. LFD y04, 3 * SIZE(X1)
  /* alpha1_r terms */
  1041. FMADD a1, alpha1_r, y01, a1
  1042. FMADD a2, alpha1_r, y02, a2
  1043. FMADD a3, alpha1_r, y03, a3
  1044. FMADD a4, alpha1_r, y04, a4
  /* alpha1_i cross terms */
  1045. FNMSUB a1, alpha1_i, y02, a1
  1046. FMADD a2, alpha1_i, y01, a2
  1047. FNMSUB a3, alpha1_i, y04, a3
  1048. FMADD a4, alpha1_i, y03, a4
  1049. STFD a1, 0 * SIZE(AO1)
  1050. STFD a2, 1 * SIZE(AO1)
  1051. STFD a3, 2 * SIZE(AO1)
  1052. STFD a4, 3 * SIZE(AO1)
  /* advance past the 4 slots so LL(28) starts at offset 0 again */
  1053. addi AO1, AO1, 4 * SIZE
  1054. addi X1, X1, 4 * SIZE
  /*
   * LL(28): final remainder step for bit 0 of M.  When (M & 1) is zero,
   * branch to LL(999); otherwise update the last 2 slots AO1[0..1] from
   * X1[0..1] and store them back.  AO1 and X1 are not advanced afterwards;
   * control falls straight through to the exit path.
   *
   * NOTE(review): the 2 slots presumably form one complex (re, im)
   * element -- confirm.
   */
  1055. .align 4
  1056. LL(28):
  1057. andi. r0, M, 1
  1058. ble LL(999)
  1059. LFD a1, 0 * SIZE(AO1)
  1060. LFD a2, 1 * SIZE(AO1)
  1061. LFD y01, 0 * SIZE(X1)
  1062. LFD y02, 1 * SIZE(X1)
  /* a1 gets alpha1_r*y01 (FMADD) and the alpha1_i*y02 term (FNMSUB) */
  1063. FMADD a1, alpha1_r, y01, a1
  /* a2 gets alpha1_r*y02 and alpha1_i*y01 (both FMADD) */
  1064. FMADD a2, alpha1_r, y02, a2
  1065. FNMSUB a1, alpha1_i, y02, a1
  1066. FMADD a2, alpha1_i, y01, a2
  1067. STFD a1, 0 * SIZE(AO1)
  1068. STFD a2, 1 * SIZE(AO1)
  /*
   * LL(999): common exit path.  Sets r3 to 0, reloads f14-f31 and r14-r27
   * from the stack frame at SP, releases the frame by adding STACKSIZE to
   * SP and returns with blr.
   *
   * Frame layout as read here:
   *   SP +   0 .. 136 : f14-f31, 8 bytes each
   *   SP + 144 ..     : r14-r27, 8 bytes each (ld) when __64BIT__ is
   *                     defined, 4 bytes each (lwz) otherwise
   *
   * NOTE(review): these loads presumably mirror the saves done in the
   * function prologue, which is outside this chunk -- keep the two in
   * sync.  r3 = 0 is presumably the function's integer return value.
   */
  1069. .align 4
  1070. LL(999):
  1071. li r3, 0
  /* reload floating-point registers f14-f31 */
  1072. lfd f14, 0(SP)
  1073. lfd f15, 8(SP)
  1074. lfd f16, 16(SP)
  1075. lfd f17, 24(SP)
  1076. lfd f18, 32(SP)
  1077. lfd f19, 40(SP)
  1078. lfd f20, 48(SP)
  1079. lfd f21, 56(SP)
  1080. lfd f22, 64(SP)
  1081. lfd f23, 72(SP)
  1082. lfd f24, 80(SP)
  1083. lfd f25, 88(SP)
  1084. lfd f26, 96(SP)
  1085. lfd f27, 104(SP)
  1086. lfd f28, 112(SP)
  1087. lfd f29, 120(SP)
  1088. lfd f30, 128(SP)
  1089. lfd f31, 136(SP)
  /* reload general-purpose registers r14-r27; slot width depends on word size */
  1090. #ifdef __64BIT__
  1091. ld r14, 144(SP)
  1092. ld r15, 152(SP)
  1093. ld r16, 160(SP)
  1094. ld r17, 168(SP)
  1095. ld r18, 176(SP)
  1096. ld r19, 184(SP)
  1097. ld r20, 192(SP)
  1098. ld r21, 200(SP)
  1099. ld r22, 208(SP)
  1100. ld r23, 216(SP)
  1101. ld r24, 224(SP)
  1102. ld r25, 232(SP)
  1103. ld r26, 240(SP)
  1104. ld r27, 248(SP)
  1105. #else
  1106. lwz r14, 144(SP)
  1107. lwz r15, 148(SP)
  1108. lwz r16, 152(SP)
  1109. lwz r17, 156(SP)
  1110. lwz r18, 160(SP)
  1111. lwz r19, 164(SP)
  1112. lwz r20, 168(SP)
  1113. lwz r21, 172(SP)
  1114. lwz r22, 176(SP)
  1115. lwz r23, 180(SP)
  1116. lwz r24, 184(SP)
  1117. lwz r25, 188(SP)
  1118. lwz r26, 192(SP)
  1119. lwz r27, 196(SP)
  1120. #endif
  /* pop the frame and return to the caller */
  1121. addi SP, SP, STACKSIZE
  1122. blr
  1123. EPILOGUE
  1124. #endif