You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

ztrsm_kernel_ppc440_RT.S 38 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
7177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999200020012002200320042005200620072008200920102011201220132014201520162017201820192020202120222023202420252026202720282029203020312032203320342035203620372038203920402041204220432044204520462047204820492050205120522053205420552056205720582059206020612062206320642065206620672068206920702071207220732074207520762077207820792080208120822083208420852086208720882089209020912092209320942095209620972098209921002101210221032104210521062107210821092110211121122113211421152116211721182119212021212122212321242125212621272128212921302131213221332134213521362137213821392140214121422143214421452146214721482149215021512152215321542155215621572158215921602161216221632164216521662167216821692170217121722173217421752176217721782179218021812182218321842185218621872188218921902191219221932194219521962197219821992200220122022203220422052206220722082209
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* LOAD: integer load mnemonic chosen by word size. It expands to   */
  /* lwz (load word) when __64BIT__ is not defined and to ld (load    */
  /* doubleword) when it is.                                          */
  40. #ifndef __64BIT__
  41. #define LOAD lwz
  42. #else
  43. #define LOAD ld
  44. #endif
  /* Stack frame layout, selected by word size.                        */
  /*   STACKSIZE - bytes the prologue subtracts from SP.               */
  /*   FZERO     - scratch slot; the prologue stores a zero word here  */
  /*               (stw r0 with r0 = 0) and the kernel reloads it with */
  /*               lfs to clear the floating-point accumulators.       */
  /*   ALPHA_R / ALPHA_I - two further SP-relative slots; presumably   */
  /*               for the real and imaginary parts of alpha.          */
  /*               TODO confirm: their use is not visible in the       */
  /*               reviewed part of the file.                          */
  /* The prologue saves f14-f31 at 0..136(SP) and r21-r31 from 144(SP) */
  /* upward (up to 224(SP) with std, up to 184(SP) with stw); all      */
  /* three slots defined here lie above that save area.                */
  45. #ifdef __64BIT__
  46. #define STACKSIZE 320
  47. #define ALPHA_R 296(SP)
  48. #define ALPHA_I 304(SP)
  49. #define FZERO 312(SP)
  50. #else
  51. #define STACKSIZE 256
  52. #define ALPHA_R 224(SP)
  53. #define ALPHA_I 232(SP)
  54. #define FZERO 240(SP)
  55. #endif
  /* Incoming argument registers.                                      */
  /* M, N and K are always r3, r4 and r5; the kernel returns early if  */
  /* any of them is <= 0.                                              */
  /* The registers for A, B, C, LDC and OFFSET depend on the target OS */
  /* and word size. Where an argument is not passed in a register, the */
  /* prologue loads it from the caller's frame at                      */
  /* FRAMESLOT(n) + STACKSIZE(SP) into the register named here:        */
  /*   linux/FreeBSD 32-bit : A,B,C,LDC,OFFSET = r6..r10, no loads.    */
  /*   linux/FreeBSD 64-bit : A,B,C = r8..r10; LDC (r6) and OFFSET     */
  /*                          (r7) are loaded from slots 0 and 1.      */
  /*   AIX/Apple 32-bit DOUBLE : A = r10; B, C, LDC and OFFSET         */
  /*                          (r6..r9) are loaded from slots 0..3.     */
  /*   AIX/Apple otherwise  : A,B,C = r8..r10; LDC (r6) and OFFSET     */
  /*                          (r7) are loaded from slots 0 and 1.      */
  /* NOTE(review): the differing layouts presumably come from the      */
  /* floating-point scalar arguments consuming parameter slots -       */
  /* confirm against the relevant ABI documents.                       */
  /* NOTE(review): no mapping for A/B/C/LDC/OFFSET is defined here     */
  /* when none of linux, __FreeBSD__, _AIX or __APPLE__ is defined.    */
  56. #define M r3
  57. #define N r4
  58. #define K r5
  59. #if defined(linux) || defined(__FreeBSD__)
  60. #ifndef __64BIT__
  61. #define A r6
  62. #define B r7
  63. #define C r8
  64. #define LDC r9
  65. #define OFFSET r10
  66. #else
  67. #define A r8
  68. #define B r9
  69. #define C r10
  70. #define LDC r6
  71. #define OFFSET r7
  72. #endif
  73. #endif
  74. #if defined(_AIX) || defined(__APPLE__)
  75. #if !defined(__64BIT__) && defined(DOUBLE)
  76. #define A r10
  77. #define B r6
  78. #define C r7
  79. #define LDC r8
  80. #define OFFSET r9
  81. #else
  82. #define A r8
  83. #define B r9
  84. #define C r10
  85. #define LDC r6
  86. #define OFFSET r7
  87. #endif
  88. #endif
  /* Working-register aliases. All of them fall inside the ranges the  */
  /* prologue saves on the stack (r21-r31 and f14-f31).                */
  /*   AORIG   - copy of A kept for the LN / RT variants; AO is        */
  /*             recomputed from it for each tile.                     */
  /*   TEMP    - scratch for offset and count arithmetic.              */
  /*   KK      - running offset counter derived from OFFSET.           */
  /*   I, J    - loop counters derived from M and N.                   */
  /*   AO, BO  - running pointers used by the loads in the inner loops.*/
  /*   CO1,CO2 - output pointers into C (CO2 = C + LDC).               */
  /*   A1-A6   - f16-f21, values loaded through AO in the .L12 loop.   */
  /*   B1-B10  - f22-f31, values loaded through BO in the .L12 loop.   */
  /* The N & 1 code path (.L31 - .L47) addresses f16-f27 by their raw  */
  /* names, so it overlaps these aliases with different roles.         */
  89. #define AORIG r21
  90. #define TEMP r22
  91. #define KK r23
  92. #define I r24
  93. #define J r25
  94. #define AO r26
  95. #define BO r27
  96. #define CO1 r28
  97. #define CO2 r29
  98. #define A1 f16
  99. #define A2 f17
  100. #define A3 f18
  101. #define A4 f19
  102. #define A5 f20
  103. #define A6 f21
  104. #define B1 f22
  105. #define B2 f23
  106. #define B3 f24
  107. #define B4 f25
  108. #define B5 f26
  109. #define B6 f27
  110. #define B7 f28
  111. #define B8 f29
  112. #define B9 f30
  113. #define B10 f31
  114. PROLOGUE
  115. PROFCODE
  116. addi SP, SP, -STACKSIZE
  117. li r0, 0
  118. stfd f14, 0(SP)
  119. stfd f15, 8(SP)
  120. stfd f16, 16(SP)
  121. stfd f17, 24(SP)
  122. stfd f18, 32(SP)
  123. stfd f19, 40(SP)
  124. stfd f20, 48(SP)
  125. stfd f21, 56(SP)
  126. stfd f22, 64(SP)
  127. stfd f23, 72(SP)
  128. stfd f24, 80(SP)
  129. stfd f25, 88(SP)
  130. stfd f26, 96(SP)
  131. stfd f27, 104(SP)
  132. stfd f28, 112(SP)
  133. stfd f29, 120(SP)
  134. stfd f30, 128(SP)
  135. stfd f31, 136(SP)
  136. #ifdef __64BIT__
  137. std r31, 144(SP)
  138. std r30, 152(SP)
  139. std r29, 160(SP)
  140. std r28, 168(SP)
  141. std r27, 176(SP)
  142. std r26, 184(SP)
  143. std r25, 192(SP)
  144. std r24, 200(SP)
  145. std r23, 208(SP)
  146. std r22, 216(SP)
  147. std r21, 224(SP)
  148. #else
  149. stw r31, 144(SP)
  150. stw r30, 148(SP)
  151. stw r29, 152(SP)
  152. stw r28, 156(SP)
  153. stw r27, 160(SP)
  154. stw r26, 164(SP)
  155. stw r25, 168(SP)
  156. stw r24, 172(SP)
  157. stw r23, 176(SP)
  158. stw r22, 180(SP)
  159. stw r21, 184(SP)
  160. #endif
  161. stw r0, FZERO
  162. #if defined(linux) || defined(__FreeBSD__)
  163. #ifdef __64BIT__
  164. ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
  165. #endif
  166. #endif
  167. #if defined(_AIX) || defined(__APPLE__)
  168. #ifdef __64BIT__
  169. ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
  170. #else
  171. #ifdef DOUBLE
  172. lwz B, FRAMESLOT(0) + STACKSIZE(SP)
  173. lwz C, FRAMESLOT(1) + STACKSIZE(SP)
  174. lwz LDC, FRAMESLOT(2) + STACKSIZE(SP)
  175. #else
  176. lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
  177. #endif
  178. #endif
  179. #endif
  180. #if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
  181. ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
  182. #endif
  183. #if defined(_AIX) || defined(__APPLE__)
  184. #ifdef __64BIT__
  185. ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
  186. #else
  187. #ifdef DOUBLE
  188. lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP)
  189. #else
  190. lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
  191. #endif
  192. #endif
  193. #endif
  194. slwi LDC, LDC, ZBASE_SHIFT
  195. #ifdef LN
  196. mullw r0, M, K
  197. slwi r0, r0, ZBASE_SHIFT
  198. add A, A, r0
  199. slwi r0, M, ZBASE_SHIFT
  200. add C, C, r0
  201. #endif
  202. #ifdef RN
  203. neg KK, OFFSET
  204. #endif
  205. #ifdef RT
  206. mullw r0, N, K
  207. slwi r0, r0, ZBASE_SHIFT
  208. add B, B, r0
  209. mullw r0, N, LDC
  210. add C, C, r0
  211. sub KK, N, OFFSET
  212. #endif
  213. cmpwi cr0, M, 0
  214. ble .L999
  215. cmpwi cr0, N, 0
  216. ble .L999
  217. cmpwi cr0, K, 0
  218. ble .L999
  219. andi. J, N, 1
  220. ble .L30
  221. #ifdef RT
  222. slwi r0, K, 0 + ZBASE_SHIFT
  223. sub B, B, r0
  224. sub C, C, LDC
  225. #endif
  226. mr CO1, C
  227. #ifdef LN
  228. add KK, M, OFFSET
  229. #endif
  230. #ifdef LT
  231. mr KK, OFFSET
  232. #endif
  233. srawi. I, M, 1
  234. #if defined(LN) || defined(RT)
  235. mr AORIG, A
  236. #else
  237. mr AO, A
  238. #endif
  239. #ifndef RT
  240. add C, C, LDC
  241. #endif
  242. ble .L40
  243. .align 4
  244. .L31:
  245. #if defined(LT) || defined(RN)
  246. LFD f20, 0 * SIZE(AO)
  247. LFD f21, 1 * SIZE(AO)
  248. LFD f22, 2 * SIZE(AO)
  249. LFD f23, 3 * SIZE(AO)
  250. LFD f24, 4 * SIZE(AO)
  251. LFD f25, 5 * SIZE(AO)
  252. LFD f26, 6 * SIZE(AO)
  253. LFD f27, 7 * SIZE(AO)
  254. LFD f16, 0 * SIZE(B)
  255. LFD f17, 1 * SIZE(B)
  256. LFD f18, 2 * SIZE(B)
  257. LFD f19, 3 * SIZE(B)
  258. lfs f0, FZERO
  259. fmr f1, f0
  260. fmr f2, f0
  261. fmr f3, f0
  262. fmr f4, f0
  263. fmr f5, f0
  264. fmr f6, f0
  265. fmr f7, f0
  266. srawi. r0, KK, 2
  267. mr BO, B
  268. mtspr CTR, r0
  269. #else
  270. #ifdef LN
  271. slwi r0, K, 1 + ZBASE_SHIFT
  272. sub AORIG, AORIG, r0
  273. #endif
  274. slwi r0, KK, 1 + ZBASE_SHIFT
  275. slwi TEMP, KK, 0 + ZBASE_SHIFT
  276. add AO, AORIG, r0
  277. add BO, B, TEMP
  278. sub TEMP, K, KK
  279. LFD f20, 0 * SIZE(AO)
  280. LFD f21, 1 * SIZE(AO)
  281. LFD f22, 2 * SIZE(AO)
  282. LFD f23, 3 * SIZE(AO)
  283. LFD f24, 4 * SIZE(AO)
  284. LFD f25, 5 * SIZE(AO)
  285. LFD f26, 6 * SIZE(AO)
  286. LFD f27, 7 * SIZE(AO)
  287. LFD f16, 0 * SIZE(BO)
  288. LFD f17, 1 * SIZE(BO)
  289. LFD f18, 2 * SIZE(BO)
  290. LFD f19, 3 * SIZE(BO)
  291. lfs f0, FZERO
  292. fmr f1, f0
  293. fmr f2, f0
  294. fmr f3, f0
  295. fmr f4, f0
  296. fmr f5, f0
  297. fmr f6, f0
  298. fmr f7, f0
  299. srawi. r0, TEMP, 2
  300. mtspr CTR, r0
  301. #endif
  302. ble .L35
  303. .align 4
  304. .L32:
  305. fmadd f0, f16, f20, f0
  306. LFD f19, 3 * SIZE(BO)
  307. fmadd f1, f16, f21, f1
  308. nop
  309. fmadd f2, f16, f22, f2
  310. nop
  311. fmadd f3, f16, f23, f3
  312. LFD f16, 4 * SIZE(BO)
  313. fmadd f4, f17, f20, f4
  314. LFD f20, 8 * SIZE(AO)
  315. fmadd f5, f17, f21, f5
  316. LFD f21, 9 * SIZE(AO)
  317. fmadd f6, f17, f22, f6
  318. LFD f22, 10 * SIZE(AO)
  319. fmadd f7, f17, f23, f7
  320. LFD f23, 11 * SIZE(AO)
  321. fmadd f0, f18, f24, f0
  322. LFD f17, 5 * SIZE(BO)
  323. fmadd f1, f18, f25, f1
  324. nop
  325. fmadd f2, f18, f26, f2
  326. nop
  327. fmadd f3, f18, f27, f3
  328. LFD f18, 6 * SIZE(BO)
  329. fmadd f4, f19, f24, f4
  330. LFD f24, 12 * SIZE(AO)
  331. fmadd f5, f19, f25, f5
  332. LFD f25, 13 * SIZE(AO)
  333. fmadd f6, f19, f26, f6
  334. LFD f26, 14 * SIZE(AO)
  335. fmadd f7, f19, f27, f7
  336. LFD f27, 15 * SIZE(AO)
  337. fmadd f0, f16, f20, f0
  338. LFD f19, 7 * SIZE(BO)
  339. fmadd f1, f16, f21, f1
  340. nop
  341. fmadd f2, f16, f22, f2
  342. nop
  343. fmadd f3, f16, f23, f3
  344. LFDU f16, 8 * SIZE(BO)
  345. fmadd f4, f17, f20, f4
  346. LFDU f20, 16 * SIZE(AO)
  347. fmadd f5, f17, f21, f5
  348. LFD f21, 1 * SIZE(AO)
  349. fmadd f6, f17, f22, f6
  350. LFD f22, 2 * SIZE(AO)
  351. fmadd f7, f17, f23, f7
  352. LFD f23, 3 * SIZE(AO)
  353. fmadd f0, f18, f24, f0
  354. LFD f17, 1 * SIZE(BO)
  355. fmadd f1, f18, f25, f1
  356. nop
  357. fmadd f2, f18, f26, f2
  358. nop
  359. fmadd f3, f18, f27, f3
  360. LFD f18, 2 * SIZE(BO)
  361. fmadd f4, f19, f24, f4
  362. LFD f24, 4 * SIZE(AO)
  363. fmadd f5, f19, f25, f5
  364. LFD f25, 5 * SIZE(AO)
  365. fmadd f6, f19, f26, f6
  366. LFD f26, 6 * SIZE(AO)
  367. fmadd f7, f19, f27, f7
  368. LFD f27, 7 * SIZE(AO)
  369. bdnz .L32
  370. .align 4
  371. .L35:
  372. #if defined(LT) || defined(RN)
  373. andi. r0, KK, 3
  374. #else
  375. andi. r0, TEMP, 3
  376. #endif
  377. mtspr CTR, r0
  378. ble .L37
  379. .align 4
  380. .L36:
  381. fmadd f0, f16, f20, f0
  382. LFD f17, 1 * SIZE(BO)
  383. fmadd f1, f16, f21, f1
  384. nop
  385. fmadd f2, f16, f22, f2
  386. nop
  387. fmadd f3, f16, f23, f3
  388. LFDU f16, 2 * SIZE(BO)
  389. fmadd f4, f17, f20, f4
  390. LFDU f20, 4 * SIZE(AO)
  391. fmadd f5, f17, f21, f5
  392. LFD f21, 1 * SIZE(AO)
  393. fmadd f6, f17, f22, f6
  394. LFD f22, 2 * SIZE(AO)
  395. fmadd f7, f17, f23, f7
  396. LFD f23, 3 * SIZE(AO)
  397. bdnz .L36
  398. .align 4
  399. .L37:
  400. #ifndef CONJ
  401. FSUB f0, f0, f5
  402. FADD f1, f1, f4
  403. FSUB f2, f2, f7
  404. FADD f3, f3, f6
  405. #else
  406. FADD f0, f0, f5
  407. FSUB f1, f4, f1
  408. FADD f2, f2, f7
  409. FSUB f3, f6, f3
  410. #endif
  411. #if defined(LN) || defined(RT)
  412. #ifdef LN
  413. subi r0, KK, 2
  414. #else
  415. subi r0, KK, 1
  416. #endif
  417. slwi TEMP, r0, 1 + ZBASE_SHIFT
  418. slwi r0, r0, 0 + ZBASE_SHIFT
  419. add AO, AORIG, TEMP
  420. add BO, B, r0
  421. #endif
  422. #if defined(LN) || defined(LT)
  423. LFD f16, 0 * SIZE(BO)
  424. LFD f17, 1 * SIZE(BO)
  425. LFD f18, 2 * SIZE(BO)
  426. LFD f19, 3 * SIZE(BO)
  427. FSUB f0, f16, f0
  428. FSUB f1, f17, f1
  429. FSUB f2, f18, f2
  430. FSUB f3, f19, f3
  431. #else
  432. LFD f16, 0 * SIZE(AO)
  433. LFD f17, 1 * SIZE(AO)
  434. LFD f18, 2 * SIZE(AO)
  435. LFD f19, 3 * SIZE(AO)
  436. #ifndef CONJ
  437. FSUB f0, f16, f0
  438. FSUB f1, f17, f1
  439. FSUB f2, f18, f2
  440. FSUB f3, f19, f3
  441. #else
  442. FSUB f0, f16, f0
  443. FADD f1, f17, f1
  444. FSUB f2, f18, f2
  445. FADD f3, f19, f3
  446. #endif
  447. #endif
  448. #ifdef LN
  449. LFD f16, 6 * SIZE(AO)
  450. LFD f17, 7 * SIZE(AO)
  451. LFD f18, 4 * SIZE(AO)
  452. LFD f19, 5 * SIZE(AO)
  453. LFD f20, 0 * SIZE(AO)
  454. LFD f21, 1 * SIZE(AO)
  455. FMUL f6, f17, f3
  456. FMUL f7, f17, f2
  457. #ifndef CONJ
  458. FMSUB f2, f16, f2, f6
  459. FMADD f3, f16, f3, f7
  460. FMADD f0, f19, f3, f0
  461. FNMSUB f1, f19, f2, f1
  462. FNMSUB f0, f18, f2, f0
  463. FNMSUB f1, f18, f3, f1
  464. FMUL f4, f21, f1
  465. FMUL f5, f21, f0
  466. FMSUB f0, f20, f0, f4
  467. FMADD f1, f20, f1, f5
  468. #else
  469. FMADD f2, f16, f2, f6
  470. FMSUB f3, f16, f3, f7
  471. FMSUB f0, f19, f3, f0
  472. FNMADD f1, f19, f2, f1
  473. FNMADD f0, f18, f2, f0
  474. FNMADD f1, f18, f3, f1
  475. FMUL f4, f21, f1
  476. FMUL f5, f21, f0
  477. FMADD f0, f20, f0, f4
  478. FMSUB f1, f20, f1, f5
  479. #endif
  480. #endif
  481. #ifdef LT
  482. LFD f16, 0 * SIZE(AO)
  483. LFD f17, 1 * SIZE(AO)
  484. LFD f18, 2 * SIZE(AO)
  485. LFD f19, 3 * SIZE(AO)
  486. LFD f20, 6 * SIZE(AO)
  487. LFD f21, 7 * SIZE(AO)
  488. FMUL f4, f17, f1
  489. FMUL f5, f17, f0
  490. #ifndef CONJ
  491. FMSUB f0, f16, f0, f4
  492. FMADD f1, f16, f1, f5
  493. FMADD f2, f19, f1, f2
  494. FNMSUB f3, f19, f0, f3
  495. FNMSUB f2, f18, f0, f2
  496. FNMSUB f3, f18, f1, f3
  497. FMUL f4, f21, f3
  498. FMUL f5, f21, f2
  499. FMSUB f2, f20, f2, f4
  500. FMADD f3, f20, f3, f5
  501. #else
  502. FMADD f0, f16, f0, f4
  503. FMSUB f1, f16, f1, f5
  504. FMSUB f2, f19, f1, f2
  505. FNMADD f3, f19, f0, f3
  506. FNMADD f2, f18, f0, f2
  507. FNMADD f3, f18, f1, f3
  508. FMUL f4, f21, f3
  509. FMUL f5, f21, f2
  510. FMADD f2, f20, f2, f4
  511. FMSUB f3, f20, f3, f5
  512. #endif
  513. #endif
  514. #ifdef RN
  515. LFD f16, 0 * SIZE(BO)
  516. LFD f17, 1 * SIZE(BO)
  517. FMUL f4, f17, f1
  518. FMUL f5, f17, f0
  519. FMUL f6, f17, f3
  520. FMUL f7, f17, f2
  521. #ifndef CONJ
  522. FMSUB f0, f16, f0, f4
  523. FMADD f1, f16, f1, f5
  524. FMSUB f2, f16, f2, f6
  525. FMADD f3, f16, f3, f7
  526. #else
  527. FMADD f0, f16, f0, f4
  528. FMSUB f1, f16, f1, f5
  529. FMADD f2, f16, f2, f6
  530. FMSUB f3, f16, f3, f7
  531. #endif
  532. #endif
  533. #ifdef RT
  534. LFD f20, 0 * SIZE(BO)
  535. LFD f21, 1 * SIZE(BO)
  536. FMUL f4, f21, f1
  537. FMUL f5, f21, f0
  538. FMUL f6, f21, f3
  539. FMUL f7, f21, f2
  540. #ifndef CONJ
  541. FMSUB f0, f20, f0, f4
  542. FMADD f1, f20, f1, f5
  543. FMSUB f2, f20, f2, f6
  544. FMADD f3, f20, f3, f7
  545. #else
  546. FMADD f0, f20, f0, f4
  547. FMSUB f1, f20, f1, f5
  548. FMADD f2, f20, f2, f6
  549. FMSUB f3, f20, f3, f7
  550. #endif
  551. #endif
  552. #ifdef LN
  553. subi CO1, CO1, 4 * SIZE
  554. #endif
  555. #if defined(LN) || defined(LT)
  556. STFD f0, 0 * SIZE(BO)
  557. STFD f1, 1 * SIZE(BO)
  558. STFD f2, 2 * SIZE(BO)
  559. STFD f3, 3 * SIZE(BO)
  560. #else
  561. STFD f0, 0 * SIZE(AO)
  562. STFD f1, 1 * SIZE(AO)
  563. STFD f2, 2 * SIZE(AO)
  564. STFD f3, 3 * SIZE(AO)
  565. #endif
  566. STFD f0, 0 * SIZE(CO1)
  567. STFD f1, 1 * SIZE(CO1)
  568. STFD f2, 2 * SIZE(CO1)
  569. STFD f3, 3 * SIZE(CO1)
  570. #ifndef LN
  571. addi CO1, CO1, 4 * SIZE
  572. #endif
  573. #ifdef RT
  574. slwi r0, K, 1 + ZBASE_SHIFT
  575. add AORIG, AORIG, r0
  576. #endif
  577. #if defined(LT) || defined(RN)
  578. sub TEMP, K, KK
  579. slwi r0, TEMP, 1 + ZBASE_SHIFT
  580. slwi TEMP, TEMP, 0 + ZBASE_SHIFT
  581. add AO, AO, r0
  582. add BO, BO, TEMP
  583. #endif
  584. #ifdef LT
  585. addi KK, KK, 2
  586. #endif
  587. #ifdef LN
  588. subi KK, KK, 2
  589. #endif
  590. addic. I, I, -1
  591. bgt .L31
  592. .align 4
  593. .L40:
  594. andi. I, M, 1
  595. ble .L49
  596. #if defined(LT) || defined(RN)
  597. LFD f16, 0 * SIZE(AO)
  598. LFD f17, 1 * SIZE(AO)
  599. LFD f18, 2 * SIZE(AO)
  600. LFD f19, 3 * SIZE(AO)
  601. LFD f20, 0 * SIZE(B)
  602. LFD f21, 1 * SIZE(B)
  603. LFD f22, 2 * SIZE(B)
  604. LFD f23, 3 * SIZE(B)
  605. lfs f0, FZERO
  606. fmr f1, f0
  607. fmr f2, f0
  608. fmr f3, f0
  609. fmr f4, f0
  610. fmr f5, f0
  611. fmr f6, f0
  612. fmr f7, f0
  613. srawi. r0, KK, 2
  614. mr BO, B
  615. mtspr CTR, r0
  616. #else
  617. #ifdef LN
  618. slwi r0, K, 0 + ZBASE_SHIFT
  619. sub AORIG, AORIG, r0
  620. #endif
  621. slwi r0, KK, 0 + ZBASE_SHIFT
  622. add AO, AORIG, r0
  623. add BO, B, r0
  624. sub TEMP, K, KK
  625. LFD f16, 0 * SIZE(AO)
  626. LFD f17, 1 * SIZE(AO)
  627. LFD f18, 2 * SIZE(AO)
  628. LFD f19, 3 * SIZE(AO)
  629. LFD f20, 0 * SIZE(BO)
  630. LFD f21, 1 * SIZE(BO)
  631. LFD f22, 2 * SIZE(BO)
  632. LFD f23, 3 * SIZE(BO)
  633. lfs f0, FZERO
  634. fmr f1, f0
  635. fmr f2, f0
  636. fmr f3, f0
  637. fmr f4, f0
  638. fmr f5, f0
  639. fmr f6, f0
  640. fmr f7, f0
  641. srawi. r0, TEMP, 2
  642. mtspr CTR, r0
  643. #endif
  644. ble .L45
  645. .align 4
  646. .L42:
  647. fmadd f0, f16, f20, f0
  648. LFD f23, 3 * SIZE(BO)
  649. fmadd f3, f16, f21, f3
  650. LFD f16, 4 * SIZE(AO)
  651. fmadd f2, f17, f20, f2
  652. LFD f20, 4 * SIZE(BO)
  653. fmadd f1, f17, f21, f1
  654. LFD f17, 5 * SIZE(AO)
  655. fmadd f4, f18, f22, f4
  656. LFD f21, 5 * SIZE(BO)
  657. fmadd f7, f18, f23, f7
  658. LFD f18, 6 * SIZE(AO)
  659. fmadd f6, f19, f22, f6
  660. LFD f22, 6 * SIZE(BO)
  661. fmadd f5, f19, f23, f5
  662. LFD f19, 7 * SIZE(AO)
  663. fmadd f0, f16, f20, f0
  664. LFD f23, 7 * SIZE(BO)
  665. fmadd f3, f16, f21, f3
  666. LFDU f16, 8 * SIZE(AO)
  667. fmadd f2, f17, f20, f2
  668. LFDU f20, 8 * SIZE(BO)
  669. fmadd f1, f17, f21, f1
  670. LFD f17, 1 * SIZE(AO)
  671. fmadd f4, f18, f22, f4
  672. LFD f21, 1 * SIZE(BO)
  673. fmadd f7, f18, f23, f7
  674. LFD f18, 2 * SIZE(AO)
  675. fmadd f6, f19, f22, f6
  676. LFD f22, 2 * SIZE(BO)
  677. fmadd f5, f19, f23, f5
  678. LFD f19, 3 * SIZE(AO)
  679. bdnz .L42
  680. .align 4
  681. .L45:
  682. fadd f0, f0, f4
  683. fadd f1, f1, f5
  684. fadd f2, f2, f6
  685. fadd f3, f3, f7
  686. #if defined(LT) || defined(RN)
  687. andi. r0, KK, 3
  688. #else
  689. andi. r0, TEMP, 3
  690. #endif
  691. mtspr CTR,r0
  692. ble .L47
  693. .align 4
  694. .L46:
  695. fmadd f0, f16, f20, f0
  696. LFD f21, 1 * SIZE(BO)
  697. fmadd f3, f16, f21, f3
  698. LFDU f16, 2 * SIZE(AO)
  699. fmadd f2, f17, f20, f2
  700. LFDU f20, 2 * SIZE(BO)
  701. fmadd f1, f17, f21, f1
  702. LFD f17, 1 * SIZE(AO)
  703. bdnz .L46
  704. .align 4
  705. .L47:
  706. #ifndef CONJ
  707. FSUB f0, f0, f1
  708. FADD f1, f2, f3
  709. #else
  710. FADD f0, f0, f1
  711. FSUB f1, f3, f2
  712. #endif
  713. #if defined(LN) || defined(RT)
  714. subi r0, KK, 1
  715. slwi r0, r0, 0 + ZBASE_SHIFT
  716. add AO, AORIG, r0
  717. add BO, B, r0
  718. #endif
  719. #if defined(LN) || defined(LT)
  720. LFD f16, 0 * SIZE(BO)
  721. LFD f17, 1 * SIZE(BO)
  722. FSUB f0, f16, f0
  723. FSUB f1, f17, f1
  724. #else
  725. LFD f16, 0 * SIZE(AO)
  726. LFD f17, 1 * SIZE(AO)
  727. #ifndef CONJ
  728. FSUB f0, f16, f0
  729. FSUB f1, f17, f1
  730. #else
  731. FSUB f0, f16, f0
  732. FADD f1, f17, f1
  733. #endif
  734. #endif
  735. #ifdef LN
  736. LFD f20, 0 * SIZE(AO)
  737. LFD f21, 1 * SIZE(AO)
  738. FMUL f4, f21, f1
  739. FMUL f5, f21, f0
  740. #ifndef CONJ
  741. FMSUB f0, f20, f0, f4
  742. FMADD f1, f20, f1, f5
  743. #else
  744. FMADD f0, f20, f0, f4
  745. FMSUB f1, f20, f1, f5
  746. #endif
  747. #endif
  748. #ifdef LT
  749. LFD f16, 0 * SIZE(AO)
  750. LFD f17, 1 * SIZE(AO)
  751. FMUL f4, f17, f1
  752. FMUL f5, f17, f0
  753. #ifndef CONJ
  754. FMSUB f0, f16, f0, f4
  755. FMADD f1, f16, f1, f5
  756. #else
  757. FMADD f0, f16, f0, f4
  758. FMSUB f1, f16, f1, f5
  759. #endif
  760. #endif
  761. #ifdef RN
  762. LFD f16, 0 * SIZE(BO)
  763. LFD f17, 1 * SIZE(BO)
  764. FMUL f4, f17, f1
  765. FMUL f5, f17, f0
  766. #ifndef CONJ
  767. FMSUB f0, f16, f0, f4
  768. FMADD f1, f16, f1, f5
  769. #else
  770. FMADD f0, f16, f0, f4
  771. FMSUB f1, f16, f1, f5
  772. #endif
  773. #endif
  774. #ifdef RT
  775. LFD f20, 0 * SIZE(BO)
  776. LFD f21, 1 * SIZE(BO)
  777. FMUL f4, f21, f1
  778. FMUL f5, f21, f0
  779. #ifndef CONJ
  780. FMSUB f0, f20, f0, f4
  781. FMADD f1, f20, f1, f5
  782. #else
  783. FMADD f0, f20, f0, f4
  784. FMSUB f1, f20, f1, f5
  785. #endif
  786. #endif
  787. #ifdef LN
  788. subi CO1, CO1, 2 * SIZE
  789. #endif
  790. #if defined(LN) || defined(LT)
  791. STFD f0, 0 * SIZE(BO)
  792. STFD f1, 1 * SIZE(BO)
  793. #else
  794. STFD f0, 0 * SIZE(AO)
  795. STFD f1, 1 * SIZE(AO)
  796. #endif
  797. STFD f0, 0 * SIZE(CO1)
  798. STFD f1, 1 * SIZE(CO1)
  799. #ifndef LN
  800. addi CO1, CO1, 2 * SIZE
  801. #endif
  802. #ifdef RT
  803. slwi r0, K, 0 + ZBASE_SHIFT
  804. add AORIG, AORIG, r0
  805. #endif
  806. #if defined(LT) || defined(RN)
  807. sub TEMP, K, KK
  808. slwi TEMP, TEMP, 0 + ZBASE_SHIFT
  809. add AO, AO, TEMP
  810. add BO, BO, TEMP
  811. #endif
  812. #ifdef LT
  813. addi KK, KK, 1
  814. #endif
  815. #ifdef LN
  816. subi KK, KK, 1
  817. #endif
  818. .align 4
  819. .L49:
  820. #ifdef LN
  821. slwi r0, K, 0 + ZBASE_SHIFT
  822. add B, B, r0
  823. #endif
  824. #if defined(LT) || defined(RN)
  825. mr B, BO
  826. #endif
  827. #ifdef RN
  828. addi KK, KK, 1
  829. #endif
  830. #ifdef RT
  831. subi KK, KK, 1
  832. #endif
  833. .align 4
  834. .L30:
  835. srawi. J, N, 1
  836. ble .L999
  837. .align 4
  838. .L10:
  839. #ifdef RT
  840. slwi r0, K, 1 + ZBASE_SHIFT
  841. sub B, B, r0
  842. slwi r0, LDC, 1
  843. sub C, C, r0
  844. #endif
  845. mr CO1, C
  846. add CO2, C, LDC
  847. #ifdef LN
  848. add KK, M, OFFSET
  849. #endif
  850. #ifdef LT
  851. mr KK, OFFSET
  852. #endif
  853. srawi. I, M, 1
  854. #if defined(LN) || defined(RT)
  855. mr AORIG, A
  856. #else
  857. mr AO, A
  858. #endif
  859. #ifndef RT
  860. add C, CO2, LDC
  861. #endif
  862. ble .L20
  863. .align 4
  864. .L11:
  865. #if defined(LT) || defined(RN)
  866. LFD A1, 0 * SIZE(AO)
  867. LFD A2, 1 * SIZE(AO)
  868. LFD A4, 4 * SIZE(AO)
  869. LFD A5, 8 * SIZE(AO)
  870. LFD B1, 0 * SIZE(B)
  871. LFD B2, 1 * SIZE(B)
  872. LFD B3, 2 * SIZE(B)
  873. LFD B4, 3 * SIZE(B)
  874. LFD B5, 4 * SIZE(B)
  875. LFD B6, 8 * SIZE(B)
  876. LFD B7, 12 * SIZE(B)
  877. lfs f0, FZERO
  878. fmr f1, f0
  879. fmr f2, f0
  880. fmr f3, f0
  881. fmr f4, f0
  882. fmr f5, f0
  883. fmr f6, f0
  884. fmr f7, f0
  885. fmr f8, f0
  886. fmr f9, f0
  887. fmr f10, f0
  888. fmr f11, f0
  889. fmr f12, f0
  890. fmr f13, f0
  891. fmr f14, f0
  892. fmr f15, f0
  893. srawi. r0, KK, 2
  894. mtspr CTR, r0
  895. mr BO, B
  896. #else
  897. #ifdef LN
  898. slwi r0, K, 1 + ZBASE_SHIFT
  899. sub AORIG, AORIG, r0
  900. #endif
  901. slwi TEMP, KK, 1 + ZBASE_SHIFT
  902. add AO, AORIG, TEMP
  903. add BO, B, TEMP
  904. sub TEMP, K, KK
  905. LFD A1, 0 * SIZE(AO)
  906. LFD A2, 1 * SIZE(AO)
  907. LFD A4, 4 * SIZE(AO)
  908. LFD A5, 8 * SIZE(AO)
  909. LFD B1, 0 * SIZE(BO)
  910. LFD B2, 1 * SIZE(BO)
  911. LFD B3, 2 * SIZE(BO)
  912. LFD B4, 3 * SIZE(BO)
  913. LFD B5, 4 * SIZE(BO)
  914. LFD B6, 8 * SIZE(BO)
  915. LFD B7, 12 * SIZE(BO)
  916. lfs f0, FZERO
  917. fmr f1, f0
  918. fmr f2, f0
  919. fmr f3, f0
  920. fmr f4, f0
  921. fmr f5, f0
  922. fmr f6, f0
  923. fmr f7, f0
  924. fmr f8, f0
  925. fmr f9, f0
  926. fmr f10, f0
  927. fmr f11, f0
  928. fmr f12, f0
  929. fmr f13, f0
  930. fmr f14, f0
  931. fmr f15, f0
  932. srawi. r0, TEMP, 2
  933. mtspr CTR, r0
  934. #endif
  935. ble .L15
  936. .align 4
  /*
   * .L12: main accumulation loop, counted by CTR (bdnz), four steps per pass.
   * Every step multiplies four consecutive values from AO with four
   * consecutive values from BO and adds the 16 products into f0-f15:
   *   f0/f4/f8/f12   += AO[4s+0] * BO[4s+0..3]
   *   f1/f5/f9/f13   += AO[4s+1] * BO[4s+0..3]
   *   f2/f6/f10/f14  += AO[4s+2] * BO[4s+0..3]
   *   f3/f7/f11/f15  += AO[4s+3] * BO[4s+0..3]
   * AO and BO each advance 16 * SIZE per pass.  Loads for later steps and for
   * the next pass are interleaved with the FMADDs, so on entry A1, A2, A4, A5
   * and B1-B7 must already be loaded (offsets 0, 1, 4, 8 of AO and
   * 0-4, 8, 12 of BO), and the loop leaves them loaded the same way on exit.
   * NOTE(review): the nops look like issue-slot padding for the target core;
   * confirm before reordering or removing anything here.
   */
  937. .L12:
  /* step 1: AO[0..3] x BO[0..3]; A1 is reused for AO[3] once AO[0] is consumed */
  938. FMADD f0, A1, B1, f0
  939. LFD A3, 2 * SIZE(AO)
  940. FMADD f4, A1, B2, f4
  941. LFD A6, 12 * SIZE(AO)
  942. FMADD f8, A1, B3, f8
  943. nop
  944. FMADD f12, A1, B4, f12
  945. nop
  946. FMADD f1, A2, B1, f1
  947. LFD A1, 3 * SIZE(AO)
  948. FMADD f5, A2, B2, f5
  949. nop
  950. FMADD f9, A2, B3, f9
  951. nop
  952. FMADD f13, A2, B4, f13
  953. nop
  954. FMADD f2, A3, B1, f2
  955. nop
  956. FMADD f6, A3, B2, f6
  957. LFD B8, 5 * SIZE(BO)
  958. FMADD f10, A3, B3, f10
  959. LFD B9, 6 * SIZE(BO)
  960. FMADD f14, A3, B4, f14
  961. LFD B10, 7 * SIZE(BO)
  962. FMADD f3, A1, B1, f3
  963. LFD A2, 5 * SIZE(AO)
  964. FMADD f7, A1, B2, f7
  965. LFD B1, 16 * SIZE(BO)
  966. FMADD f11, A1, B3, f11
  967. nop
  968. FMADD f15, A1, B4, f15
  969. nop
  /* step 2: AO[4..7] x BO[4..7] (B5, B8, B9, B10); A1 is preloaded with AO[16] for the next pass */
  970. FMADD f0, A4, B5, f0
  971. LFD A3, 6 * SIZE(AO)
  972. FMADD f4, A4, B8, f4
  973. LFD A1, 16 * SIZE(AO)
  974. FMADD f8, A4, B9, f8
  975. nop
  976. FMADD f12, A4, B10, f12
  977. nop
  978. FMADD f1, A2, B5, f1
  979. LFD A4, 7 * SIZE(AO)
  980. FMADD f5, A2, B8, f5
  981. nop
  982. FMADD f9, A2, B9, f9
  983. nop
  984. FMADD f13, A2, B10, f13
  985. nop
  986. FMADD f2, A3, B5, f2
  987. nop
  988. FMADD f6, A3, B8, f6
  989. LFD B2, 9 * SIZE(BO)
  990. FMADD f10, A3, B9, f10
  991. LFD B3, 10 * SIZE(BO)
  992. FMADD f14, A3, B10, f14
  993. LFD B4, 11 * SIZE(BO)
  994. FMADD f3, A4, B5, f3
  995. LFD A2, 9 * SIZE(AO)
  996. FMADD f7, A4, B8, f7
  997. LFD B5, 20 * SIZE(BO)
  998. FMADD f11, A4, B9, f11
  999. nop
  1000. FMADD f15, A4, B10, f15
  1001. nop
  /* step 3: AO[8..11] x BO[8..11] (B6, B2, B3, B4); A4 is preloaded with AO[20] for the next pass */
  1002. FMADD f0, A5, B6, f0
  1003. LFD A3, 10 * SIZE(AO)
  1004. FMADD f4, A5, B2, f4
  1005. LFD A4, 20 * SIZE(AO)
  1006. FMADD f8, A5, B3, f8
  1007. nop
  1008. FMADD f12, A5, B4, f12
  1009. nop
  1010. FMADD f1, A2, B6, f1
  1011. LFD A5, 11 * SIZE(AO)
  1012. FMADD f5, A2, B2, f5
  1013. nop
  1014. FMADD f9, A2, B3, f9
  1015. nop
  1016. FMADD f13, A2, B4, f13
  1017. nop
  1018. FMADD f2, A3, B6, f2
  1019. nop
  1020. FMADD f6, A3, B2, f6
  1021. LFD B8, 13 * SIZE(BO)
  1022. FMADD f10, A3, B3, f10
  1023. LFD B9, 14 * SIZE(BO)
  1024. FMADD f14, A3, B4, f14
  1025. LFD B10,15 * SIZE(BO)
  1026. FMADD f3, A5, B6, f3
  1027. LFD A2, 13 * SIZE(AO)
  1028. FMADD f7, A5, B2, f7
  1029. LFD B6, 24 * SIZE(BO)
  1030. FMADD f11, A5, B3, f11
  1031. nop
  1032. FMADD f15, A5, B4, f15
  1033. nop
  /*
   * step 4: AO[12..15] x BO[12..15] (B7, B8, B9, B10).  AO is bumped in the
   * middle of the step, so the later "1 * SIZE(AO)" load already addresses
   * the next pass; BO is bumped only after its last look-ahead load
   * (28 * SIZE).
   */
  1034. FMADD f0, A6, B7, f0
  1035. LFD A3, 14 * SIZE(AO)
  1036. FMADD f4, A6, B8, f4
  1037. LFD A5, 24 * SIZE(AO)
  1038. FMADD f8, A6, B9, f8
  1039. nop
  1040. FMADD f12, A6, B10, f12
  1041. nop
  1042. FMADD f1, A2, B7, f1
  1043. LFD A6, 15 * SIZE(AO)
  1044. FMADD f5, A2, B8, f5
  1045. nop
  1046. FMADD f9, A2, B9, f9
  1047. nop
  1048. FMADD f13, A2, B10, f13
  1049. nop
  1050. FMADD f2, A3, B7, f2
  1051. addi AO, AO, 16 * SIZE
  1052. FMADD f6, A3, B8, f6
  1053. LFD B2, 17 * SIZE(BO)
  1054. FMADD f10, A3, B9, f10
  1055. LFD B3, 18 * SIZE(BO)
  1056. FMADD f14, A3, B10, f14
  1057. LFD B4, 19 * SIZE(BO)
  1058. FMADD f3, A6, B7, f3
  1059. LFD A2, 1 * SIZE(AO)
  1060. FMADD f7, A6, B8, f7
  1061. LFD B7, 28 * SIZE(BO)
  1062. FMADD f11, A6, B9, f11
  1063. addi BO, BO, 16 * SIZE
  1064. FMADD f15, A6, B10, f15
  1065. bdnz .L12
  1066. .align 4
  /*
   * .L15: set up the remainder loop.  The count is the low two bits of the
   * same quantity whose ">> 2" drove the unrolled loop: KK for LT/RN, TEMP
   * for the other variants.  andi. can only yield 0..3, so the ble below is
   * taken exactly when no steps are left.
   */
  1067. .L15:
  1068. #if defined(LT) || defined(RN)
  1069. andi. r0, KK, 3
  1070. #else
  1071. andi. r0, TEMP, 3
  1072. #endif
  1073. mtspr CTR, r0
  1074. ble .LKERNEL_MainFinish
  1075. .align 4
  /*
   * .L16: remainder loop, one step per iteration (CTR-counted).
   * Same accumulator layout as the unrolled loop: the four values at
   * AO[0..3] (A1, A2, A3, A4) times the four values at BO[0..3] (B1-B4) are
   * added into f0-f15.  The LFDU loads advance AO and BO by 4 * SIZE each and
   * A1, A2, B1-B4 are reloaded for the following iteration.
   */
  1076. .L16:
  1077. FMADD f0, A1, B1, f0
  1078. LFD A3, 2 * SIZE(AO)
  1079. FMADD f4, A1, B2, f4
  1080. FMADD f8, A1, B3, f8
  1081. FMADD f12, A1, B4, f12
  1082. LFD A4, 3 * SIZE(AO)
  1083. FMADD f1, A2, B1, f1
  1084. FMADD f5, A2, B2, f5
  1085. FMADD f9, A2, B3, f9
  1086. FMADD f13, A2, B4, f13
  1087. LFDU A1, 4 * SIZE(AO)
  1088. FMADD f2, A3, B1, f2
  1089. FMADD f6, A3, B2, f6
  1090. FMADD f10, A3, B3, f10
  1091. FMADD f14, A3, B4, f14
  1092. LFD A2, 1 * SIZE(AO)
  1093. FMADD f3, A4, B1, f3
  1094. LFDU B1, 4 * SIZE(BO)
  1095. FMADD f7, A4, B2, f7
  1096. LFD B2, 1 * SIZE(BO)
  1097. FMADD f11, A4, B3, f11
  1098. LFD B3, 2 * SIZE(BO)
  1099. FMADD f15, A4, B4, f15
  1100. LFD B4, 3 * SIZE(BO)
  1101. bdnz .L16
  1102. .align 4
  /*
   * .LKERNEL_MainFinish: finish one tile after the accumulation loops.
   *   1. fold the 16 partial sums f0-f15 into eight values (f0-f3, f8-f11);
   *   2. for LN/RT, re-point AO/BO inside the packed buffers;
   *   3. subtract the folded sums from eight values loaded from BO (LN/LT)
   *      or AO (other variants);
   *   4. run the variant-specific (LN / LT / RN / RT) update sequence;
   *   5. store the eight results back to BO or AO and to CO1/CO2;
   *   6. update CO1/CO2, AORIG, AO/BO and KK, then loop to .L11 while I > 0.
   * NOTE(review): the arithmetic reads as complex multiply / triangular-solve
   * steps on (real, imag) register pairs; that interpretation, and the
   * meaning of the macros (KK, AORIG, ZBASE_SHIFT, ...), comes from outside
   * this chunk and should be confirmed there.  The remarks below assume
   * FMADD/FMSUB/FNMADD/FNMSUB expand to the like-named PowerPC instructions.
   */
  1103. .LKERNEL_MainFinish:
  /* step 1: pairwise fold; CONJ flips which term is added and which subtracted */
  1104. #ifndef CONJ
  1105. FSUB f0, f0, f5
  1106. FADD f1, f1, f4
  1107. FSUB f2, f2, f7
  1108. FADD f3, f3, f6
  1109. FSUB f8, f8, f13
  1110. FADD f9, f9, f12
  1111. FSUB f10, f10, f15
  1112. FADD f11, f11, f14
  1113. #else
  1114. FADD f0, f0, f5
  1115. FSUB f1, f4, f1
  1116. FADD f2, f2, f7
  1117. FSUB f3, f6, f3
  1118. FADD f8, f8, f13
  1119. FSUB f9, f12, f9
  1120. FADD f10, f10, f15
  1121. FSUB f11, f14, f11
  1122. #endif
  /* step 2: AO = AORIG + off, BO = B + off, with off = (KK - 2) << (1 + ZBASE_SHIFT) */
  1123. #if defined(LN) || defined(RT)
  1124. subi r0, KK, 2
  1125. slwi r0, r0, 1 + ZBASE_SHIFT
  1126. add AO, AORIG, r0
  1127. add BO, B, r0
  1128. #endif
  /*
   * step 3: result = loaded value - folded sum.  The BO layout interleaves
   * the two register groups (f0,f1,f8,f9,f2,f3,f10,f11) while the AO layout
   * is sequential (f0-f3, f8-f11); the stores in step 5 mirror this.
   */
  1129. #if defined(LN) || defined(LT)
  1130. LFD f16, 0 * SIZE(BO)
  1131. LFD f17, 1 * SIZE(BO)
  1132. LFD f18, 2 * SIZE(BO)
  1133. LFD f19, 3 * SIZE(BO)
  1134. LFD f20, 4 * SIZE(BO)
  1135. LFD f21, 5 * SIZE(BO)
  1136. LFD f22, 6 * SIZE(BO)
  1137. LFD f23, 7 * SIZE(BO)
  1138. FSUB f0, f16, f0
  1139. FSUB f1, f17, f1
  1140. FSUB f8, f18, f8
  1141. FSUB f9, f19, f9
  1142. FSUB f2, f20, f2
  1143. FSUB f3, f21, f3
  1144. FSUB f10, f22, f10
  1145. FSUB f11, f23, f11
  1146. #else
  1147. LFD f16, 0 * SIZE(AO)
  1148. LFD f17, 1 * SIZE(AO)
  1149. LFD f18, 2 * SIZE(AO)
  1150. LFD f19, 3 * SIZE(AO)
  1151. LFD f20, 4 * SIZE(AO)
  1152. LFD f21, 5 * SIZE(AO)
  1153. LFD f22, 6 * SIZE(AO)
  1154. LFD f23, 7 * SIZE(AO)
  1155. #ifndef CONJ
  1156. FSUB f0, f16, f0
  1157. FSUB f1, f17, f1
  1158. FSUB f2, f18, f2
  1159. FSUB f3, f19, f3
  1160. FSUB f8, f20, f8
  1161. FSUB f9, f21, f9
  1162. FSUB f10, f22, f10
  1163. FSUB f11, f23, f11
  1164. #else
  /* with CONJ the odd registers were folded with the opposite sign above, hence FADD here */
  1165. FSUB f0, f16, f0
  1166. FADD f1, f17, f1
  1167. FSUB f2, f18, f2
  1168. FADD f3, f19, f3
  1169. FSUB f8, f20, f8
  1170. FADD f9, f21, f9
  1171. FSUB f10, f22, f10
  1172. FADD f11, f23, f11
  1173. #endif
  1174. #endif
  /*
   * step 4, LN: coefficients (f16,f17) = AO[6..7], (f18,f19) = AO[4..5],
   * (f20,f21) = AO[0..1].  Order: scale pairs (f2,f3) and (f10,f11) by
   * (f16,f17); use (f18,f19) to update pairs (f0,f1) and (f8,f9) from them;
   * scale (f0,f1) and (f8,f9) by (f20,f21).
   * NOTE(review): the code multiplies by the first/last coefficient pair
   * instead of dividing -- presumably the packed data stores inverted
   * diagonal entries; verify against the packing routine.
   */
  1175. #ifdef LN
  1176. LFD f16, 6 * SIZE(AO)
  1177. LFD f17, 7 * SIZE(AO)
  1178. LFD f18, 4 * SIZE(AO)
  1179. LFD f19, 5 * SIZE(AO)
  1180. LFD f20, 0 * SIZE(AO)
  1181. LFD f21, 1 * SIZE(AO)
  1182. FMUL f6, f17, f3
  1183. FMUL f7, f17, f2
  1184. FMUL f14, f17, f11
  1185. FMUL f15, f17, f10
  1186. #ifndef CONJ
  1187. FMSUB f2, f16, f2, f6
  1188. FMADD f3, f16, f3, f7
  1189. FMSUB f10, f16, f10, f14
  1190. FMADD f11, f16, f11, f15
  1191. FMADD f0, f19, f3, f0
  1192. FNMSUB f1, f19, f2, f1
  1193. FMADD f8, f19, f11, f8
  1194. FNMSUB f9, f19, f10, f9
  1195. FNMSUB f0, f18, f2, f0
  1196. FNMSUB f1, f18, f3, f1
  1197. FNMSUB f8, f18, f10, f8
  1198. FNMSUB f9, f18, f11, f9
  1199. FMUL f4, f21, f1
  1200. FMUL f5, f21, f0
  1201. FMUL f12, f21, f9
  1202. FMUL f13, f21, f8
  1203. FMSUB f0, f20, f0, f4
  1204. FMADD f1, f20, f1, f5
  1205. FMSUB f8, f20, f8, f12
  1206. FMADD f9, f20, f9, f13
  1207. #else
  1208. FMADD f2, f16, f2, f6
  1209. FMSUB f3, f16, f3, f7
  1210. FMADD f10, f16, f10, f14
  1211. FMSUB f11, f16, f11, f15
  1212. FMSUB f0, f19, f3, f0
  1213. FNMADD f1, f19, f2, f1
  1214. FMSUB f8, f19, f11, f8
  1215. FNMADD f9, f19, f10, f9
  1216. FNMADD f0, f18, f2, f0
  1217. FNMADD f1, f18, f3, f1
  1218. FNMADD f8, f18, f10, f8
  1219. FNMADD f9, f18, f11, f9
  1220. FMUL f4, f21, f1
  1221. FMUL f5, f21, f0
  1222. FMUL f12, f21, f9
  1223. FMUL f13, f21, f8
  1224. FMADD f0, f20, f0, f4
  1225. FMSUB f1, f20, f1, f5
  1226. FMADD f8, f20, f8, f12
  1227. FMSUB f9, f20, f9, f13
  1228. #endif
  1229. #endif
  /*
   * step 4, LT: mirror image of LN.  Coefficients (f16,f17) = AO[0..1],
   * (f18,f19) = AO[2..3], (f20,f21) = AO[6..7]; pairs (f0,f1)/(f8,f9) are
   * scaled first, then used to update (f2,f3)/(f10,f11), which are scaled last.
   */
  1230. #ifdef LT
  1231. LFD f16, 0 * SIZE(AO)
  1232. LFD f17, 1 * SIZE(AO)
  1233. LFD f18, 2 * SIZE(AO)
  1234. LFD f19, 3 * SIZE(AO)
  1235. LFD f20, 6 * SIZE(AO)
  1236. LFD f21, 7 * SIZE(AO)
  1237. FMUL f4, f17, f1
  1238. FMUL f5, f17, f0
  1239. FMUL f12, f17, f9
  1240. FMUL f13, f17, f8
  1241. #ifndef CONJ
  1242. FMSUB f0, f16, f0, f4
  1243. FMADD f1, f16, f1, f5
  1244. FMSUB f8, f16, f8, f12
  1245. FMADD f9, f16, f9, f13
  1246. FMADD f2, f19, f1, f2
  1247. FNMSUB f3, f19, f0, f3
  1248. FMADD f10, f19, f9, f10
  1249. FNMSUB f11, f19, f8, f11
  1250. FNMSUB f2, f18, f0, f2
  1251. FNMSUB f3, f18, f1, f3
  1252. FNMSUB f10, f18, f8, f10
  1253. FNMSUB f11, f18, f9, f11
  1254. FMUL f4, f21, f3
  1255. FMUL f5, f21, f2
  1256. FMUL f12, f21, f11
  1257. FMUL f13, f21, f10
  1258. FMSUB f2, f20, f2, f4
  1259. FMADD f3, f20, f3, f5
  1260. FMSUB f10, f20, f10, f12
  1261. FMADD f11, f20, f11, f13
  1262. #else
  1263. FMADD f0, f16, f0, f4
  1264. FMSUB f1, f16, f1, f5
  1265. FMADD f8, f16, f8, f12
  1266. FMSUB f9, f16, f9, f13
  1267. FMSUB f2, f19, f1, f2
  1268. FNMADD f3, f19, f0, f3
  1269. FMSUB f10, f19, f9, f10
  1270. FNMADD f11, f19, f8, f11
  1271. FNMADD f2, f18, f0, f2
  1272. FNMADD f3, f18, f1, f3
  1273. FNMADD f10, f18, f8, f10
  1274. FNMADD f11, f18, f9, f11
  1275. FMUL f4, f21, f3
  1276. FMUL f5, f21, f2
  1277. FMUL f12, f21, f11
  1278. FMUL f13, f21, f10
  1279. FMADD f2, f20, f2, f4
  1280. FMSUB f3, f20, f3, f5
  1281. FMADD f10, f20, f10, f12
  1282. FMSUB f11, f20, f11, f13
  1283. #endif
  1284. #endif
  /*
   * step 4, RN: coefficients come from BO ((f16,f17) = BO[0..1],
   * (f18,f19) = BO[2..3], (f20,f21) = BO[6..7]).  Pairs (f0,f1)/(f2,f3) are
   * scaled first, then used to update (f8,f9)/(f10,f11), which are scaled last.
   */
  1285. #ifdef RN
  1286. LFD f16, 0 * SIZE(BO)
  1287. LFD f17, 1 * SIZE(BO)
  1288. LFD f18, 2 * SIZE(BO)
  1289. LFD f19, 3 * SIZE(BO)
  1290. LFD f20, 6 * SIZE(BO)
  1291. LFD f21, 7 * SIZE(BO)
  1292. FMUL f4, f17, f1
  1293. FMUL f5, f17, f0
  1294. FMUL f6, f17, f3
  1295. FMUL f7, f17, f2
  1296. #ifndef CONJ
  1297. FMSUB f0, f16, f0, f4
  1298. FMADD f1, f16, f1, f5
  1299. FMSUB f2, f16, f2, f6
  1300. FMADD f3, f16, f3, f7
  1301. FMADD f8, f19, f1, f8
  1302. FNMSUB f9, f19, f0, f9
  1303. FMADD f10, f19, f3, f10
  1304. FNMSUB f11, f19, f2, f11
  1305. FNMSUB f8, f18, f0, f8
  1306. FNMSUB f9, f18, f1, f9
  1307. FNMSUB f10, f18, f2, f10
  1308. FNMSUB f11, f18, f3, f11
  1309. FMUL f4, f21, f9
  1310. FMUL f5, f21, f8
  1311. FMUL f6, f21, f11
  1312. FMUL f7, f21, f10
  1313. FMSUB f8, f20, f8, f4
  1314. FMADD f9, f20, f9, f5
  1315. FMSUB f10, f20, f10, f6
  1316. FMADD f11, f20, f11, f7
  1317. #else
  1318. FMADD f0, f16, f0, f4
  1319. FMSUB f1, f16, f1, f5
  1320. FMADD f2, f16, f2, f6
  1321. FMSUB f3, f16, f3, f7
  1322. FMSUB f8, f19, f1, f8
  1323. FNMADD f9, f19, f0, f9
  1324. FMSUB f10, f19, f3, f10
  1325. FNMADD f11, f19, f2, f11
  1326. FNMADD f8, f18, f0, f8
  1327. FNMADD f9, f18, f1, f9
  1328. FNMADD f10, f18, f2, f10
  1329. FNMADD f11, f18, f3, f11
  1330. FMUL f4, f21, f9
  1331. FMUL f5, f21, f8
  1332. FMUL f6, f21, f11
  1333. FMUL f7, f21, f10
  1334. FMADD f8, f20, f8, f4
  1335. FMSUB f9, f20, f9, f5
  1336. FMADD f10, f20, f10, f6
  1337. FMSUB f11, f20, f11, f7
  1338. #endif
  1339. #endif
  /*
   * step 4, RT: mirror image of RN.  Coefficients (f16,f17) = BO[6..7],
   * (f18,f19) = BO[4..5], (f20,f21) = BO[0..1]; pairs (f8,f9)/(f10,f11) are
   * scaled first, then used to update (f0,f1)/(f2,f3), which are scaled last.
   */
  1340. #ifdef RT
  1341. LFD f16, 6 * SIZE(BO)
  1342. LFD f17, 7 * SIZE(BO)
  1343. LFD f18, 4 * SIZE(BO)
  1344. LFD f19, 5 * SIZE(BO)
  1345. LFD f20, 0 * SIZE(BO)
  1346. LFD f21, 1 * SIZE(BO)
  1347. FMUL f12, f17, f9
  1348. FMUL f13, f17, f8
  1349. FMUL f14, f17, f11
  1350. FMUL f15, f17, f10
  1351. #ifndef CONJ
  1352. FMSUB f8, f16, f8, f12
  1353. FMADD f9, f16, f9, f13
  1354. FMSUB f10, f16, f10, f14
  1355. FMADD f11, f16, f11, f15
  1356. FMADD f0, f19, f9, f0
  1357. FNMSUB f1, f19, f8, f1
  1358. FMADD f2, f19, f11, f2
  1359. FNMSUB f3, f19, f10, f3
  1360. FNMSUB f0, f18, f8, f0
  1361. FNMSUB f1, f18, f9, f1
  1362. FNMSUB f2, f18, f10, f2
  1363. FNMSUB f3, f18, f11, f3
  1364. FMUL f4, f21, f1
  1365. FMUL f5, f21, f0
  1366. FMUL f6, f21, f3
  1367. FMUL f7, f21, f2
  1368. FMSUB f0, f20, f0, f4
  1369. FMADD f1, f20, f1, f5
  1370. FMSUB f2, f20, f2, f6
  1371. FMADD f3, f20, f3, f7
  1372. #else
  1373. FMADD f8, f16, f8, f12
  1374. FMSUB f9, f16, f9, f13
  1375. FMADD f10, f16, f10, f14
  1376. FMSUB f11, f16, f11, f15
  1377. FMSUB f0, f19, f9, f0
  1378. FNMADD f1, f19, f8, f1
  1379. FMSUB f2, f19, f11, f2
  1380. FNMADD f3, f19, f10, f3
  1381. FNMADD f0, f18, f8, f0
  1382. FNMADD f1, f18, f9, f1
  1383. FNMADD f2, f18, f10, f2
  1384. FNMADD f3, f18, f11, f3
  1385. FMUL f4, f21, f1
  1386. FMUL f5, f21, f0
  1387. FMUL f6, f21, f3
  1388. FMUL f7, f21, f2
  1389. FMADD f0, f20, f0, f4
  1390. FMSUB f1, f20, f1, f5
  1391. FMADD f2, f20, f2, f6
  1392. FMSUB f3, f20, f3, f7
  1393. #endif
  1394. #endif
  /* step 5: LN moves CO1/CO2 back by 4 * SIZE before storing; the other variants advance after storing */
  1395. #ifdef LN
  1396. subi CO1, CO1, 4 * SIZE
  1397. subi CO2, CO2, 4 * SIZE
  1398. #endif
  /* write the results back into the buffer they were loaded from in step 3 (same layouts) */
  1399. #if defined(LN) || defined(LT)
  1400. STFD f0, 0 * SIZE(BO)
  1401. STFD f1, 1 * SIZE(BO)
  1402. STFD f8, 2 * SIZE(BO)
  1403. STFD f9, 3 * SIZE(BO)
  1404. STFD f2, 4 * SIZE(BO)
  1405. STFD f3, 5 * SIZE(BO)
  1406. STFD f10, 6 * SIZE(BO)
  1407. STFD f11, 7 * SIZE(BO)
  1408. #else
  1409. STFD f0, 0 * SIZE(AO)
  1410. STFD f1, 1 * SIZE(AO)
  1411. STFD f2, 2 * SIZE(AO)
  1412. STFD f3, 3 * SIZE(AO)
  1413. STFD f8, 4 * SIZE(AO)
  1414. STFD f9, 5 * SIZE(AO)
  1415. STFD f10, 6 * SIZE(AO)
  1416. STFD f11, 7 * SIZE(AO)
  1417. #endif
  /* f0-f3 go to CO1, f8-f11 go to CO2 */
  1418. STFD f0, 0 * SIZE(CO1)
  1419. STFD f1, 1 * SIZE(CO1)
  1420. STFD f2, 2 * SIZE(CO1)
  1421. STFD f3, 3 * SIZE(CO1)
  1422. STFD f8, 0 * SIZE(CO2)
  1423. STFD f9, 1 * SIZE(CO2)
  1424. STFD f10, 2 * SIZE(CO2)
  1425. STFD f11, 3 * SIZE(CO2)
  1426. #ifndef LN
  1427. addi CO1, CO1, 4 * SIZE
  1428. addi CO2, CO2, 4 * SIZE
  1429. #endif
  /* step 6: per-variant pointer and KK bookkeeping for the next tile */
  1430. #ifdef RT
  1431. slwi r0, K, 1 + ZBASE_SHIFT
  1432. add AORIG, AORIG, r0
  1433. #endif
  /* LT/RN: skip the (K - KK) steps that were not consumed by the loops above */
  1434. #if defined(LT) || defined(RN)
  1435. sub TEMP, K, KK
  1436. slwi TEMP, TEMP, 1 + ZBASE_SHIFT
  1437. add AO, AO, TEMP
  1438. add BO, BO, TEMP
  1439. #endif
  1440. #ifdef LT
  1441. addi KK, KK, 2
  1442. #endif
  1443. #ifdef LN
  1444. subi KK, KK, 2
  1445. #endif
  /* next tile while the decremented I is still positive (.L11 lies outside this chunk) */
  1446. addic. I, I, -1
  1447. bgt .L11
  1448. .align 4
  /*
   * .L20: tail for an odd M.  "andi. I, M, 1" leaves 0 or 1, so the ble skips
   * to .L29 when M is even.  Otherwise the narrower tile is set up: f16-f19
   * are preloaded from AO[0..3], f20-f27 from the first eight values of the
   * B-side buffer, accumulators f0-f7 are zeroed from FZERO, and CTR gets
   * the step count divided by four.
   */
  1449. .L20:
  1450. andi. I, M, 1
  1451. ble .L29
  /* LT/RN: AO is used as it stands, BO restarts at B, count = KK */
  1452. #if defined(LT) || defined(RN)
  1453. LFD f16, 0 * SIZE(AO)
  1454. LFD f17, 1 * SIZE(AO)
  1455. LFD f18, 2 * SIZE(AO)
  1456. LFD f19, 3 * SIZE(AO)
  1457. LFD f20, 0 * SIZE(B)
  1458. LFD f21, 1 * SIZE(B)
  1459. LFD f22, 2 * SIZE(B)
  1460. LFD f23, 3 * SIZE(B)
  1461. LFD f24, 4 * SIZE(B)
  1462. LFD f25, 5 * SIZE(B)
  1463. LFD f26, 6 * SIZE(B)
  1464. LFD f27, 7 * SIZE(B)
  1465. lfs f0, FZERO
  1466. fmr f1, f0
  1467. fmr f2, f0
  1468. fmr f3, f0
  1469. fmr f4, f0
  1470. fmr f5, f0
  1471. fmr f6, f0
  1472. fmr f7, f0
  1473. srawi. r0, KK, 2
  1474. mr BO, B
  1475. mtspr CTR, r0
  1476. #else
  /*
   * LN/RT: for LN, AORIG first steps back by K << ZBASE_SHIFT.  Then
   * AO = AORIG + (KK << ZBASE_SHIFT), BO = B + (KK << (1 + ZBASE_SHIFT)),
   * and the count is TEMP = K - KK.  The A-side shift is one bit smaller
   * than the B-side shift because each step consumes 2 values from AO
   * and 4 from BO.
   */
  1477. #ifdef LN
  1478. slwi r0, K, 0 + ZBASE_SHIFT
  1479. sub AORIG, AORIG, r0
  1480. #endif
  1481. slwi r0, KK, 0 + ZBASE_SHIFT
  1482. slwi TEMP, KK, 1 + ZBASE_SHIFT
  1483. add AO, AORIG, r0
  1484. add BO, B, TEMP
  1485. sub TEMP, K, KK
  1486. LFD f16, 0 * SIZE(AO)
  1487. LFD f17, 1 * SIZE(AO)
  1488. LFD f18, 2 * SIZE(AO)
  1489. LFD f19, 3 * SIZE(AO)
  1490. LFD f20, 0 * SIZE(BO)
  1491. LFD f21, 1 * SIZE(BO)
  1492. LFD f22, 2 * SIZE(BO)
  1493. LFD f23, 3 * SIZE(BO)
  1494. LFD f24, 4 * SIZE(BO)
  1495. LFD f25, 5 * SIZE(BO)
  1496. LFD f26, 6 * SIZE(BO)
  1497. LFD f27, 7 * SIZE(BO)
  1498. lfs f0, FZERO
  1499. fmr f1, f0
  1500. fmr f2, f0
  1501. fmr f3, f0
  1502. fmr f4, f0
  1503. fmr f5, f0
  1504. fmr f6, f0
  1505. fmr f7, f0
  1506. srawi. r0, TEMP, 2
  1507. mtspr CTR, r0
  1508. #endif
  /* CR0 still reflects the srawi. result: skip the unrolled loop when count >> 2 is not positive */
  1509. ble .L25
  1510. .align 4
  /*
   * .L22: unrolled accumulation loop for the odd-M tile, four steps per pass
   * (CTR-counted).  Each step uses two values from AO and four from BO:
   *   f0-f3 += AO[2s+0] * BO[4s+0..3]
   *   f4-f7 += AO[2s+1] * BO[4s+0..3]
   * Register sets alternate between steps (f16/f17 with f20-f23, then
   * f18/f19 with f24-f27) so the next operands can be loaded while the
   * current ones are in use.  Per pass AO advances 8 * SIZE and BO advances
   * 16 * SIZE (the two LFDU loads).
   * NOTE(review): the first load re-reads AO[3] into f19, which the setup
   * code has already loaded on the first pass; the nops look like scheduling
   * padding -- confirm before changing either.
   */
  1511. .L22:
  1512. fmadd f0, f16, f20, f0
  1513. LFD f19, 3 * SIZE(AO)
  1514. fmadd f1, f16, f21, f1
  1515. nop
  1516. fmadd f2, f16, f22, f2
  1517. nop
  1518. fmadd f3, f16, f23, f3
  1519. LFD f16, 4 * SIZE(AO)
  1520. fmadd f4, f17, f20, f4
  1521. LFD f20, 8 * SIZE(BO)
  1522. fmadd f5, f17, f21, f5
  1523. LFD f21, 9 * SIZE(BO)
  1524. fmadd f6, f17, f22, f6
  1525. LFD f22, 10 * SIZE(BO)
  1526. fmadd f7, f17, f23, f7
  1527. LFD f23, 11 * SIZE(BO)
  1528. fmadd f0, f18, f24, f0
  1529. LFD f17, 5 * SIZE(AO)
  1530. fmadd f1, f18, f25, f1
  1531. nop
  1532. fmadd f2, f18, f26, f2
  1533. nop
  1534. fmadd f3, f18, f27, f3
  1535. LFD f18, 6 * SIZE(AO)
  1536. fmadd f4, f19, f24, f4
  1537. LFD f24, 12 * SIZE(BO)
  1538. fmadd f5, f19, f25, f5
  1539. LFD f25, 13 * SIZE(BO)
  1540. fmadd f6, f19, f26, f6
  1541. LFD f26, 14 * SIZE(BO)
  1542. fmadd f7, f19, f27, f7
  1543. LFD f27, 15 * SIZE(BO)
  1544. fmadd f0, f16, f20, f0
  1545. LFD f19, 7 * SIZE(AO)
  1546. fmadd f1, f16, f21, f1
  1547. nop
  1548. fmadd f2, f16, f22, f2
  1549. nop
  1550. fmadd f3, f16, f23, f3
  1551. LFDU f16, 8 * SIZE(AO)
  1552. fmadd f4, f17, f20, f4
  1553. LFDU f20, 16 * SIZE(BO)
  1554. fmadd f5, f17, f21, f5
  1555. LFD f21, 1 * SIZE(BO)
  1556. fmadd f6, f17, f22, f6
  1557. LFD f22, 2 * SIZE(BO)
  1558. fmadd f7, f17, f23, f7
  1559. LFD f23, 3 * SIZE(BO)
  1560. fmadd f0, f18, f24, f0
  1561. LFD f17, 1 * SIZE(AO)
  1562. fmadd f1, f18, f25, f1
  1563. nop
  1564. fmadd f2, f18, f26, f2
  1565. nop
  1566. fmadd f3, f18, f27, f3
  1567. LFD f18, 2 * SIZE(AO)
  1568. fmadd f4, f19, f24, f4
  1569. LFD f24, 4 * SIZE(BO)
  1570. fmadd f5, f19, f25, f5
  1571. LFD f25, 5 * SIZE(BO)
  1572. fmadd f6, f19, f26, f6
  1573. LFD f26, 6 * SIZE(BO)
  1574. fmadd f7, f19, f27, f7
  1575. LFD f27, 7 * SIZE(BO)
  1576. bdnz .L22
  1577. .align 4
  /*
   * .L25: remainder setup for the odd-M tile.  CTR receives the low two bits
   * of KK (LT/RN) or TEMP (other variants); since andi. can only produce
   * 0..3, the ble skips to .L27 exactly when nothing is left.
   */
  1578. .L25:
  1579. #if defined(LT) || defined(RN)
  1580. andi. r0, KK, 3
  1581. #else
  1582. andi. r0, TEMP, 3
  1583. #endif
  1584. mtspr CTR, r0
  1585. ble .L27
  1586. .align 4
  /*
   * .L26: remainder loop for the odd-M tile, one step per iteration
   * (CTR-counted).  f0-f3 += AO[0] * BO[0..3] and f4-f7 += AO[1] * BO[0..3];
   * the LFDU loads advance AO by 2 * SIZE and BO by 4 * SIZE and refill
   * f16 / f20-f23 for the next iteration.
   */
  1587. .L26:
  1588. fmadd f0, f16, f20, f0
  1589. LFD f17, 1 * SIZE(AO)
  1590. fmadd f1, f16, f21, f1
  1591. nop
  1592. fmadd f2, f16, f22, f2
  1593. nop
  1594. fmadd f3, f16, f23, f3
  1595. LFDU f16, 2 * SIZE(AO)
  1596. fmadd f4, f17, f20, f4
  1597. LFDU f20, 4 * SIZE(BO)
  1598. fmadd f5, f17, f21, f5
  1599. LFD f21, 1 * SIZE(BO)
  1600. fmadd f6, f17, f22, f6
  1601. LFD f22, 2 * SIZE(BO)
  1602. fmadd f7, f17, f23, f7
  1603. LFD f23, 3 * SIZE(BO)
  1604. bdnz .L26
  1605. .align 4
  /*
   * .L27: finish the odd-M tile.  Same sequence as the wider tile: fold the
   * eight partial sums f0-f7 into f0-f3, re-point AO/BO for LN/RT, subtract
   * from four loaded values, run the variant-specific update, store to the
   * packed buffer and to CO1/CO2, then do the pointer/KK bookkeeping.
   * NOTE(review): the (real, imag) pair interpretation and the meaning of the
   * macros used here come from outside this chunk -- confirm there.
   */
  1606. .L27:
  /*
   * Here f1 accumulated AO[0] * BO[1] and f4 accumulated AO[1] * BO[0] (see
   * the loops above), so the CONJ fold distinguishes LN/LT from RN/RT by the
   * operand order of the odd-register subtraction.
   */
  1607. #ifndef CONJ
  1608. FSUB f0, f0, f5
  1609. FADD f1, f1, f4
  1610. FSUB f2, f2, f7
  1611. FADD f3, f3, f6
  1612. #else
  1613. #if defined(LN) || defined(LT)
  1614. FADD f0, f0, f5
  1615. FSUB f1, f1, f4
  1616. FADD f2, f2, f7
  1617. FSUB f3, f3, f6
  1618. #else
  1619. FADD f0, f0, f5
  1620. FSUB f1, f4, f1
  1621. FADD f2, f2, f7
  1622. FSUB f3, f6, f3
  1623. #endif
  1624. #endif
  /*
   * LN/RT: step back from KK by 1 (LN) or 2 (RT), then
   * AO = AORIG + (n << ZBASE_SHIFT) and BO = B + (n << (1 + ZBASE_SHIFT)).
   */
  1625. #if defined(LN) || defined(RT)
  1626. #ifdef LN
  1627. subi r0, KK, 1
  1628. #else
  1629. subi r0, KK, 2
  1630. #endif
  1631. slwi TEMP, r0, 0 + ZBASE_SHIFT
  1632. slwi r0, r0, 1 + ZBASE_SHIFT
  1633. add AO, AORIG, TEMP
  1634. add BO, B, r0
  1635. #endif
  /* result = loaded value - folded sum; source is BO for LN/LT, AO otherwise */
  1636. #if defined(LN) || defined(LT)
  1637. LFD f16, 0 * SIZE(BO)
  1638. LFD f17, 1 * SIZE(BO)
  1639. LFD f18, 2 * SIZE(BO)
  1640. LFD f19, 3 * SIZE(BO)
  1641. FSUB f0, f16, f0
  1642. FSUB f1, f17, f1
  1643. FSUB f2, f18, f2
  1644. FSUB f3, f19, f3
  1645. #else
  1646. LFD f16, 0 * SIZE(AO)
  1647. LFD f17, 1 * SIZE(AO)
  1648. LFD f20, 2 * SIZE(AO)
  1649. LFD f21, 3 * SIZE(AO)
  1650. FSUB f0, f16, f0
  1651. FSUB f1, f17, f1
  1652. FSUB f2, f20, f2
  1653. FSUB f3, f21, f3
  1654. #endif
  /*
   * LN: scale both pairs (f0,f1) and (f2,f3) by the single coefficient pair
   * (f20,f21) = AO[0..1].
   * NOTE(review): multiplication rather than division -- presumably the
   * packed diagonal is stored inverted; verify against the packing routine.
   */
  1655. #ifdef LN
  1656. LFD f20, 0 * SIZE(AO)
  1657. LFD f21, 1 * SIZE(AO)
  1658. FMUL f4, f21, f1
  1659. FMUL f5, f21, f0
  1660. FMUL f12, f21, f3
  1661. FMUL f13, f21, f2
  1662. #ifndef CONJ
  1663. FMSUB f0, f20, f0, f4
  1664. FMADD f1, f20, f1, f5
  1665. FMSUB f2, f20, f2, f12
  1666. FMADD f3, f20, f3, f13
  1667. #else
  1668. FMADD f0, f20, f0, f4
  1669. FMSUB f1, f20, f1, f5
  1670. FMADD f2, f20, f2, f12
  1671. FMSUB f3, f20, f3, f13
  1672. #endif
  1673. #endif
  /* LT: same scaling as LN, with the coefficient pair held in (f16,f17) = AO[0..1] */
  1674. #ifdef LT
  1675. LFD f16, 0 * SIZE(AO)
  1676. LFD f17, 1 * SIZE(AO)
  1677. FMUL f4, f17, f1
  1678. FMUL f5, f17, f0
  1679. FMUL f12, f17, f3
  1680. FMUL f13, f17, f2
  1681. #ifndef CONJ
  1682. FMSUB f0, f16, f0, f4
  1683. FMADD f1, f16, f1, f5
  1684. FMSUB f2, f16, f2, f12
  1685. FMADD f3, f16, f3, f13
  1686. #else
  1687. FMADD f0, f16, f0, f4
  1688. FMSUB f1, f16, f1, f5
  1689. FMADD f2, f16, f2, f12
  1690. FMSUB f3, f16, f3, f13
  1691. #endif
  1692. #endif
  /*
   * RN: coefficients (f16,f17) = BO[0..1], (f18,f19) = BO[2..3],
   * (f20,f21) = BO[6..7].  Scale (f0,f1), use it to update (f2,f3), then
   * scale (f2,f3).
   */
  1693. #ifdef RN
  1694. LFD f16, 0 * SIZE(BO)
  1695. LFD f17, 1 * SIZE(BO)
  1696. LFD f18, 2 * SIZE(BO)
  1697. LFD f19, 3 * SIZE(BO)
  1698. LFD f20, 6 * SIZE(BO)
  1699. LFD f21, 7 * SIZE(BO)
  1700. FMUL f4, f17, f1
  1701. FMUL f5, f17, f0
  1702. #ifndef CONJ
  1703. FMSUB f0, f16, f0, f4
  1704. FMADD f1, f16, f1, f5
  1705. FMADD f2, f19, f1, f2
  1706. FNMSUB f3, f19, f0, f3
  1707. FNMSUB f2, f18, f0, f2
  1708. FNMSUB f3, f18, f1, f3
  1709. FMUL f4, f21, f3
  1710. FMUL f5, f21, f2
  1711. FMSUB f2, f20, f2, f4
  1712. FMADD f3, f20, f3, f5
  1713. #else
  1714. FMADD f0, f16, f0, f4
  1715. FMSUB f1, f16, f1, f5
  1716. FMSUB f2, f19, f1, f2
  1717. FNMADD f3, f19, f0, f3
  1718. FNMADD f2, f18, f0, f2
  1719. FNMADD f3, f18, f1, f3
  1720. FMUL f4, f21, f3
  1721. FMUL f5, f21, f2
  1722. FMADD f2, f20, f2, f4
  1723. FMSUB f3, f20, f3, f5
  1724. #endif
  1725. #endif
  /*
   * RT: mirror image of RN.  Coefficients (f16,f17) = BO[6..7],
   * (f18,f19) = BO[4..5], (f20,f21) = BO[0..1]; (f2,f3) is scaled first,
   * then used to update (f0,f1), which is scaled last.
   */
  1726. #ifdef RT
  1727. LFD f16, 6 * SIZE(BO)
  1728. LFD f17, 7 * SIZE(BO)
  1729. LFD f18, 4 * SIZE(BO)
  1730. LFD f19, 5 * SIZE(BO)
  1731. LFD f20, 0 * SIZE(BO)
  1732. LFD f21, 1 * SIZE(BO)
  1733. FMUL f12, f17, f3
  1734. FMUL f13, f17, f2
  1735. #ifndef CONJ
  1736. FMSUB f2, f16, f2, f12
  1737. FMADD f3, f16, f3, f13
  1738. FMADD f0, f19, f3, f0
  1739. FNMSUB f1, f19, f2, f1
  1740. FNMSUB f0, f18, f2, f0
  1741. FNMSUB f1, f18, f3, f1
  1742. FMUL f4, f21, f1
  1743. FMUL f5, f21, f0
  1744. FMSUB f0, f20, f0, f4
  1745. FMADD f1, f20, f1, f5
  1746. #else
  1747. FMADD f2, f16, f2, f12
  1748. FMSUB f3, f16, f3, f13
  1749. FMSUB f0, f19, f3, f0
  1750. FNMADD f1, f19, f2, f1
  1751. FNMADD f0, f18, f2, f0
  1752. FNMADD f1, f18, f3, f1
  1753. FMUL f4, f21, f1
  1754. FMUL f5, f21, f0
  1755. FMADD f0, f20, f0, f4
  1756. FMSUB f1, f20, f1, f5
  1757. #endif
  1758. #endif
  /* LN moves CO1/CO2 back by 2 * SIZE before storing; the other variants advance after storing */
  1759. #ifdef LN
  1760. subi CO1, CO1, 2 * SIZE
  1761. subi CO2, CO2, 2 * SIZE
  1762. #endif
  /* write the results back into the buffer the right-hand values were loaded from */
  1763. #if defined(LN) || defined(LT)
  1764. STFD f0, 0 * SIZE(BO)
  1765. STFD f1, 1 * SIZE(BO)
  1766. STFD f2, 2 * SIZE(BO)
  1767. STFD f3, 3 * SIZE(BO)
  1768. #else
  1769. STFD f0, 0 * SIZE(AO)
  1770. STFD f1, 1 * SIZE(AO)
  1771. STFD f2, 2 * SIZE(AO)
  1772. STFD f3, 3 * SIZE(AO)
  1773. #endif
  /* (f0,f1) goes to CO1, (f2,f3) goes to CO2 */
  1774. STFD f0, 0 * SIZE(CO1)
  1775. STFD f1, 1 * SIZE(CO1)
  1776. STFD f2, 0 * SIZE(CO2)
  1777. STFD f3, 1 * SIZE(CO2)
  1778. #ifndef LN
  1779. addi CO1, CO1, 2 * SIZE
  1780. addi CO2, CO2, 2 * SIZE
  1781. #endif
  1782. #ifdef RT
  1783. slwi r0, K, 0 + ZBASE_SHIFT
  1784. add AORIG, AORIG, r0
  1785. #endif
  /* LT/RN: skip the remaining (K - KK) steps; the AO stride is half the BO stride for this tile */
  1786. #if defined(LT) || defined(RN)
  1787. sub TEMP, K, KK
  1788. slwi r0, TEMP, 0 + ZBASE_SHIFT
  1789. slwi TEMP, TEMP, 1 + ZBASE_SHIFT
  1790. add AO, AO, r0
  1791. add BO, BO, TEMP
  1792. #endif
  1793. #ifdef LT
  1794. addi KK, KK, 1
  1795. #endif
  1796. #ifdef LN
  1797. subi KK, KK, 1
  1798. #endif
  1799. .align 4
  /*
   * .L29: end of one J iteration.  Update B and KK according to the variant,
   * decrement J, and loop to .L10 (outside this chunk) while J is positive.
   */
  1800. .L29:
  /* LN: B += K << (1 + ZBASE_SHIFT) */
  1801. #ifdef LN
  1802. slwi r0, K, 1 + ZBASE_SHIFT
  1803. add B, B, r0
  1804. #endif
  /* LT/RN: BO has already been advanced over this J iteration's data, so B takes its value */
  1805. #if defined(LT) || defined(RN)
  1806. mr B, BO
  1807. #endif
  1808. #ifdef RN
  1809. addi KK, KK, 2
  1810. #endif
  1811. #ifdef RT
  1812. subi KK, KK, 2
  1813. #endif
  1814. addic. J, J, -1
  1815. bgt .L10
  1816. .align 4
  /*
   * .L999: common exit.  Sets r3 to 0 (the integer return register in the
   * PowerPC ABIs), reloads f14-f31 from SP+0..136 and r21-r31 from SP+144
   * upward (8-byte slots under __64BIT__, 4-byte slots otherwise), releases
   * STACKSIZE bytes of stack and returns.
   * NOTE(review): the matching saves are presumably in the prologue, which is
   * outside this chunk -- keep the offsets in sync with it.
   */
  1817. .L999:
  /* addi with RA = 0 uses the literal value 0, so this is r3 = 0 */
  1818. addi r3, 0, 0
  1819. lfd f14, 0(SP)
  1820. lfd f15, 8(SP)
  1821. lfd f16, 16(SP)
  1822. lfd f17, 24(SP)
  1823. lfd f18, 32(SP)
  1824. lfd f19, 40(SP)
  1825. lfd f20, 48(SP)
  1826. lfd f21, 56(SP)
  1827. lfd f22, 64(SP)
  1828. lfd f23, 72(SP)
  1829. lfd f24, 80(SP)
  1830. lfd f25, 88(SP)
  1831. lfd f26, 96(SP)
  1832. lfd f27, 104(SP)
  1833. lfd f28, 112(SP)
  1834. lfd f29, 120(SP)
  1835. lfd f30, 128(SP)
  1836. lfd f31, 136(SP)
  1837. #ifdef __64BIT__
  1838. ld r31, 144(SP)
  1839. ld r30, 152(SP)
  1840. ld r29, 160(SP)
  1841. ld r28, 168(SP)
  1842. ld r27, 176(SP)
  1843. ld r26, 184(SP)
  1844. ld r25, 192(SP)
  1845. ld r24, 200(SP)
  1846. ld r23, 208(SP)
  1847. ld r22, 216(SP)
  1848. ld r21, 224(SP)
  1849. #else
  1850. lwz r31, 144(SP)
  1851. lwz r30, 148(SP)
  1852. lwz r29, 152(SP)
  1853. lwz r28, 156(SP)
  1854. lwz r27, 160(SP)
  1855. lwz r26, 164(SP)
  1856. lwz r25, 168(SP)
  1857. lwz r24, 172(SP)
  1858. lwz r23, 176(SP)
  1859. lwz r22, 180(SP)
  1860. lwz r21, 184(SP)
  1861. #endif
  1862. addi SP, SP, STACKSIZE
  1863. blr
  1864. EPILOGUE