You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; can include dashes ('-'); and can be up to 35 characters long.

ztrsm_kernel_ppc440_LT.S 38 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* LOAD selects the integer load mnemonic by build width: lwz when */
  /* __64BIT__ is not defined, ld when it is. The portion of the kernel */
  /* visible here spells lwz/ld out directly and does not use LOAD. */
  40. #ifndef __64BIT__
  41. #define LOAD lwz
  42. #else
  43. #define LOAD ld
  44. #endif
  /* Frame layout. STACKSIZE is the amount the prologue subtracts from SP. */
  /* The prologue stores f14-f31 at offsets 0..136 and r21-r31 starting at */
  /* offset 144 (up to 224 in the 64-bit build, up to 184 in the 32-bit */
  /* build); the three SP-relative slots below sit above those save areas. */
  /* FZERO is written with an integer zero in the prologue and read back */
  /* with lfs to clear the floating-point accumulators. ALPHA_R / ALPHA_I */
  /* are not referenced in the visible portion of the kernel. */
  45. #ifdef __64BIT__
  46. #define STACKSIZE 320
  47. #define ALPHA_R 296(SP)
  48. #define ALPHA_I 304(SP)
  49. #define FZERO 312(SP)
  50. #else
  51. #define STACKSIZE 256
  52. #define ALPHA_R 224(SP)
  53. #define ALPHA_I 232(SP)
  54. #define FZERO 240(SP)
  55. #endif
  /* Incoming argument registers. M, N and K are each compared against */
  /* zero right after the prologue, and the kernel exits early if any of */
  /* them is <= 0 (presumably the problem dimensions -- confirm against */
  /* the caller). */
  56. #define M r3
  57. #define N r4
  58. #define K r5
  /* Linux: the 32-bit build maps A, B, C, LDC, OFFSET to r6..r10. In the */
  /* 64-bit build A, B, C are r8..r10, and r6/r7 are only the destination */
  /* registers for LDC and OFFSET, which the prologue loads from */
  /* FRAMESLOT(0) and FRAMESLOT(1) above the new frame. */
  59. #ifdef linux
  60. #ifndef __64BIT__
  61. #define A r6
  62. #define B r7
  63. #define C r8
  64. #define LDC r9
  65. #define OFFSET r10
  66. #else
  67. #define A r8
  68. #define B r9
  69. #define C r10
  70. #define LDC r6
  71. #define OFFSET r7
  72. #endif
  73. #endif
  /* AIX / Apple: in the 32-bit DOUBLE build only A is used from a */
  /* register (r10); the prologue loads B, C, LDC and OFFSET from */
  /* FRAMESLOT(0)..FRAMESLOT(3). In every other AIX/Apple build A, B, C */
  /* are r8..r10 and LDC / OFFSET are loaded from FRAMESLOT(0) and */
  /* FRAMESLOT(1). */
  74. #if defined(_AIX) || defined(__APPLE__)
  75. #if !defined(__64BIT__) && defined(DOUBLE)
  76. #define A r10
  77. #define B r6
  78. #define C r7
  79. #define LDC r8
  80. #define OFFSET r9
  81. #else
  82. #define A r8
  83. #define B r9
  84. #define C r10
  85. #define LDC r6
  86. #define OFFSET r7
  87. #endif
  88. #endif
  /* Integer working registers. r21-r31 are all saved by the prologue */
  /* before any of these aliases is written. */
  /* AORIG : copy of A taken per column pair in the LN / RT builds */
  /* TEMP : scratch (byte offsets, K - KK) */
  /* KK : running offset initialised from OFFSET; in the LT / RN builds */
  /* it also supplies the inner-loop trip count */
  /* I : row-block counter (M >> 1 for the 2-wide loop, then M & 1) */
  /* J : column-pair counter (N >> 1) */
  /* AO/BO : current read pointers into the A and B data */
  /* CO1 : first output column (C); CO2 : second column (C + LDC) */
  89. #define AORIG r21
  90. #define TEMP r22
  91. #define KK r23
  92. #define I r24
  93. #define J r25
  94. #define AO r26
  95. #define BO r27
  96. #define CO1 r28
  97. #define CO2 r29
  /* Floating-point operand registers for the multiply-accumulate loops: */
  /* A1-A6 are loaded through AO, B1-B10 through BO (or B). f0-f15 are the */
  /* accumulators. The solve phase after the loops reuses f16-f23 by raw */
  /* register number, so these aliases are only meaningful inside the */
  /* accumulation loops. */
  98. #define A1 f16
  99. #define A2 f17
  100. #define A3 f18
  101. #define A4 f19
  102. #define A5 f20
  103. #define A6 f21
  104. #define B1 f22
  105. #define B2 f23
  106. #define B3 f24
  107. #define B4 f25
  108. #define B5 f26
  109. #define B6 f27
  110. #define B7 f28
  111. #define B8 f29
  112. #define B9 f30
  113. #define B10 f31
  114. PROLOGUE
  115. PROFCODE
  116. addi SP, SP, -STACKSIZE
  117. li r0, 0
  118. stfd f14, 0(SP)
  119. stfd f15, 8(SP)
  120. stfd f16, 16(SP)
  121. stfd f17, 24(SP)
  122. stfd f18, 32(SP)
  123. stfd f19, 40(SP)
  124. stfd f20, 48(SP)
  125. stfd f21, 56(SP)
  126. stfd f22, 64(SP)
  127. stfd f23, 72(SP)
  128. stfd f24, 80(SP)
  129. stfd f25, 88(SP)
  130. stfd f26, 96(SP)
  131. stfd f27, 104(SP)
  132. stfd f28, 112(SP)
  133. stfd f29, 120(SP)
  134. stfd f30, 128(SP)
  135. stfd f31, 136(SP)
  136. #ifdef __64BIT__
  137. std r31, 144(SP)
  138. std r30, 152(SP)
  139. std r29, 160(SP)
  140. std r28, 168(SP)
  141. std r27, 176(SP)
  142. std r26, 184(SP)
  143. std r25, 192(SP)
  144. std r24, 200(SP)
  145. std r23, 208(SP)
  146. std r22, 216(SP)
  147. std r21, 224(SP)
  148. #else
  149. stw r31, 144(SP)
  150. stw r30, 148(SP)
  151. stw r29, 152(SP)
  152. stw r28, 156(SP)
  153. stw r27, 160(SP)
  154. stw r26, 164(SP)
  155. stw r25, 168(SP)
  156. stw r24, 172(SP)
  157. stw r23, 176(SP)
  158. stw r22, 180(SP)
  159. stw r21, 184(SP)
  160. #endif
  161. stw r0, FZERO
  162. #ifdef linux
  163. #ifdef __64BIT__
  164. ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
  165. #endif
  166. #endif
  167. #if defined(_AIX) || defined(__APPLE__)
  168. #ifdef __64BIT__
  169. ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
  170. #else
  171. #ifdef DOUBLE
  172. lwz B, FRAMESLOT(0) + STACKSIZE(SP)
  173. lwz C, FRAMESLOT(1) + STACKSIZE(SP)
  174. lwz LDC, FRAMESLOT(2) + STACKSIZE(SP)
  175. #else
  176. lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
  177. #endif
  178. #endif
  179. #endif
  180. #if defined(linux) && defined(__64BIT__)
  181. ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
  182. #endif
  183. #if defined(_AIX) || defined(__APPLE__)
  184. #ifdef __64BIT__
  185. ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
  186. #else
  187. #ifdef DOUBLE
  188. lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP)
  189. #else
  190. lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
  191. #endif
  192. #endif
  193. #endif
  194. slwi LDC, LDC, ZBASE_SHIFT
  195. #ifdef LN
  196. mullw r0, M, K
  197. slwi r0, r0, ZBASE_SHIFT
  198. add A, A, r0
  199. slwi r0, M, ZBASE_SHIFT
  200. add C, C, r0
  201. #endif
  202. #ifdef RN
  203. neg KK, OFFSET
  204. #endif
  205. #ifdef RT
  206. mullw r0, N, K
  207. slwi r0, r0, ZBASE_SHIFT
  208. add B, B, r0
  209. mullw r0, N, LDC
  210. add C, C, r0
  211. sub KK, N, OFFSET
  212. #endif
  213. cmpwi cr0, M, 0
  214. ble .L999
  215. cmpwi cr0, N, 0
  216. ble .L999
  217. cmpwi cr0, K, 0
  218. ble .L999
  219. srawi. J, N, 1
  220. ble .L30
  221. .align 4
  222. .L10:
  223. #ifdef RT
  224. slwi r0, K, 1 + ZBASE_SHIFT
  225. sub B, B, r0
  226. slwi r0, LDC, 1
  227. sub C, C, r0
  228. #endif
  229. mr CO1, C
  230. add CO2, C, LDC
  231. #ifdef LN
  232. add KK, M, OFFSET
  233. #endif
  234. #ifdef LT
  235. mr KK, OFFSET
  236. #endif
  237. srawi. I, M, 1
  238. #if defined(LN) || defined(RT)
  239. mr AORIG, A
  240. #else
  241. mr AO, A
  242. #endif
  243. #ifndef RT
  244. add C, CO2, LDC
  245. #endif
  246. ble .L20
  247. .align 4
  248. .L11:
  249. #if defined(LT) || defined(RN)
  250. LFD A1, 0 * SIZE(AO)
  251. LFD A2, 1 * SIZE(AO)
  252. LFD A4, 4 * SIZE(AO)
  253. LFD A5, 8 * SIZE(AO)
  254. LFD B1, 0 * SIZE(B)
  255. LFD B2, 1 * SIZE(B)
  256. LFD B3, 2 * SIZE(B)
  257. LFD B4, 3 * SIZE(B)
  258. LFD B5, 4 * SIZE(B)
  259. LFD B6, 8 * SIZE(B)
  260. LFD B7, 12 * SIZE(B)
  261. lfs f0, FZERO
  262. fmr f1, f0
  263. fmr f2, f0
  264. fmr f3, f0
  265. fmr f4, f0
  266. fmr f5, f0
  267. fmr f6, f0
  268. fmr f7, f0
  269. fmr f8, f0
  270. fmr f9, f0
  271. fmr f10, f0
  272. fmr f11, f0
  273. fmr f12, f0
  274. fmr f13, f0
  275. fmr f14, f0
  276. fmr f15, f0
  277. srawi. r0, KK, 2
  278. mtspr CTR, r0
  279. mr BO, B
  280. #else
  281. #ifdef LN
  282. slwi r0, K, 1 + ZBASE_SHIFT
  283. sub AORIG, AORIG, r0
  284. #endif
  285. slwi TEMP, KK, 1 + ZBASE_SHIFT
  286. add AO, AORIG, TEMP
  287. add BO, B, TEMP
  288. sub TEMP, K, KK
  289. LFD A1, 0 * SIZE(AO)
  290. LFD A2, 1 * SIZE(AO)
  291. LFD A4, 4 * SIZE(AO)
  292. LFD A5, 8 * SIZE(AO)
  293. LFD B1, 0 * SIZE(BO)
  294. LFD B2, 1 * SIZE(BO)
  295. LFD B3, 2 * SIZE(BO)
  296. LFD B4, 3 * SIZE(BO)
  297. LFD B5, 4 * SIZE(BO)
  298. LFD B6, 8 * SIZE(BO)
  299. LFD B7, 12 * SIZE(BO)
  300. lfs f0, FZERO
  301. fmr f1, f0
  302. fmr f2, f0
  303. fmr f3, f0
  304. fmr f4, f0
  305. fmr f5, f0
  306. fmr f6, f0
  307. fmr f7, f0
  308. fmr f8, f0
  309. fmr f9, f0
  310. fmr f10, f0
  311. fmr f11, f0
  312. fmr f12, f0
  313. fmr f13, f0
  314. fmr f14, f0
  315. fmr f15, f0
  316. srawi. r0, TEMP, 2
  317. mtspr CTR, r0
  318. #endif
  319. ble .L15
  320. .align 4
  321. .L12:
  322. FMADD f0, A1, B1, f0
  323. LFD A3, 2 * SIZE(AO)
  324. FMADD f4, A1, B2, f4
  325. LFD A6, 12 * SIZE(AO)
  326. FMADD f8, A1, B3, f8
  327. nop
  328. FMADD f12, A1, B4, f12
  329. nop
  330. FMADD f1, A2, B1, f1
  331. LFD A1, 3 * SIZE(AO)
  332. FMADD f5, A2, B2, f5
  333. nop
  334. FMADD f9, A2, B3, f9
  335. nop
  336. FMADD f13, A2, B4, f13
  337. nop
  338. FMADD f2, A3, B1, f2
  339. nop
  340. FMADD f6, A3, B2, f6
  341. LFD B8, 5 * SIZE(BO)
  342. FMADD f10, A3, B3, f10
  343. LFD B9, 6 * SIZE(BO)
  344. FMADD f14, A3, B4, f14
  345. LFD B10, 7 * SIZE(BO)
  346. FMADD f3, A1, B1, f3
  347. LFD A2, 5 * SIZE(AO)
  348. FMADD f7, A1, B2, f7
  349. LFD B1, 16 * SIZE(BO)
  350. FMADD f11, A1, B3, f11
  351. nop
  352. FMADD f15, A1, B4, f15
  353. nop
  354. FMADD f0, A4, B5, f0
  355. LFD A3, 6 * SIZE(AO)
  356. FMADD f4, A4, B8, f4
  357. LFD A1, 16 * SIZE(AO)
  358. FMADD f8, A4, B9, f8
  359. nop
  360. FMADD f12, A4, B10, f12
  361. nop
  362. FMADD f1, A2, B5, f1
  363. LFD A4, 7 * SIZE(AO)
  364. FMADD f5, A2, B8, f5
  365. nop
  366. FMADD f9, A2, B9, f9
  367. nop
  368. FMADD f13, A2, B10, f13
  369. nop
  370. FMADD f2, A3, B5, f2
  371. nop
  372. FMADD f6, A3, B8, f6
  373. LFD B2, 9 * SIZE(BO)
  374. FMADD f10, A3, B9, f10
  375. LFD B3, 10 * SIZE(BO)
  376. FMADD f14, A3, B10, f14
  377. LFD B4, 11 * SIZE(BO)
  378. FMADD f3, A4, B5, f3
  379. LFD A2, 9 * SIZE(AO)
  380. FMADD f7, A4, B8, f7
  381. LFD B5, 20 * SIZE(BO)
  382. FMADD f11, A4, B9, f11
  383. nop
  384. FMADD f15, A4, B10, f15
  385. nop
  386. FMADD f0, A5, B6, f0
  387. LFD A3, 10 * SIZE(AO)
  388. FMADD f4, A5, B2, f4
  389. LFD A4, 20 * SIZE(AO)
  390. FMADD f8, A5, B3, f8
  391. nop
  392. FMADD f12, A5, B4, f12
  393. nop
  394. FMADD f1, A2, B6, f1
  395. LFD A5, 11 * SIZE(AO)
  396. FMADD f5, A2, B2, f5
  397. nop
  398. FMADD f9, A2, B3, f9
  399. nop
  400. FMADD f13, A2, B4, f13
  401. nop
  402. FMADD f2, A3, B6, f2
  403. nop
  404. FMADD f6, A3, B2, f6
  405. LFD B8, 13 * SIZE(BO)
  406. FMADD f10, A3, B3, f10
  407. LFD B9, 14 * SIZE(BO)
  408. FMADD f14, A3, B4, f14
  409. LFD B10,15 * SIZE(BO)
  410. FMADD f3, A5, B6, f3
  411. LFD A2, 13 * SIZE(AO)
  412. FMADD f7, A5, B2, f7
  413. LFD B6, 24 * SIZE(BO)
  414. FMADD f11, A5, B3, f11
  415. nop
  416. FMADD f15, A5, B4, f15
  417. nop
  418. FMADD f0, A6, B7, f0
  419. LFD A3, 14 * SIZE(AO)
  420. FMADD f4, A6, B8, f4
  421. LFD A5, 24 * SIZE(AO)
  422. FMADD f8, A6, B9, f8
  423. nop
  424. FMADD f12, A6, B10, f12
  425. nop
  426. FMADD f1, A2, B7, f1
  427. LFD A6, 15 * SIZE(AO)
  428. FMADD f5, A2, B8, f5
  429. nop
  430. FMADD f9, A2, B9, f9
  431. nop
  432. FMADD f13, A2, B10, f13
  433. nop
  434. FMADD f2, A3, B7, f2
  435. addi AO, AO, 16 * SIZE
  436. FMADD f6, A3, B8, f6
  437. LFD B2, 17 * SIZE(BO)
  438. FMADD f10, A3, B9, f10
  439. LFD B3, 18 * SIZE(BO)
  440. FMADD f14, A3, B10, f14
  441. LFD B4, 19 * SIZE(BO)
  442. FMADD f3, A6, B7, f3
  443. LFD A2, 1 * SIZE(AO)
  444. FMADD f7, A6, B8, f7
  445. LFD B7, 28 * SIZE(BO)
  446. FMADD f11, A6, B9, f11
  447. addi BO, BO, 16 * SIZE
  448. FMADD f15, A6, B10, f15
  449. bdnz .L12
  450. .align 4
  451. .L15:
  452. #if defined(LT) || defined(RN)
  453. andi. r0, KK, 3
  454. #else
  455. andi. r0, TEMP, 3
  456. #endif
  457. mtspr CTR, r0
  458. ble .LKERNEL_MainFinish
  459. .align 4
  460. .L16:
  461. FMADD f0, A1, B1, f0
  462. LFD A3, 2 * SIZE(AO)
  463. FMADD f4, A1, B2, f4
  464. FMADD f8, A1, B3, f8
  465. FMADD f12, A1, B4, f12
  466. LFD A4, 3 * SIZE(AO)
  467. FMADD f1, A2, B1, f1
  468. FMADD f5, A2, B2, f5
  469. FMADD f9, A2, B3, f9
  470. FMADD f13, A2, B4, f13
  471. LFDU A1, 4 * SIZE(AO)
  472. FMADD f2, A3, B1, f2
  473. FMADD f6, A3, B2, f6
  474. FMADD f10, A3, B3, f10
  475. FMADD f14, A3, B4, f14
  476. LFD A2, 1 * SIZE(AO)
  477. FMADD f3, A4, B1, f3
  478. LFDU B1, 4 * SIZE(BO)
  479. FMADD f7, A4, B2, f7
  480. LFD B2, 1 * SIZE(BO)
  481. FMADD f11, A4, B3, f11
  482. LFD B3, 2 * SIZE(BO)
  483. FMADD f15, A4, B4, f15
  484. LFD B4, 3 * SIZE(BO)
  485. bdnz .L16
  486. .align 4
  487. .LKERNEL_MainFinish:
  488. #ifndef CONJ
  489. FSUB f0, f0, f5
  490. FADD f1, f1, f4
  491. FSUB f2, f2, f7
  492. FADD f3, f3, f6
  493. FSUB f8, f8, f13
  494. FADD f9, f9, f12
  495. FSUB f10, f10, f15
  496. FADD f11, f11, f14
  497. #else
  498. FADD f0, f0, f5
  499. FSUB f1, f4, f1
  500. FADD f2, f2, f7
  501. FSUB f3, f6, f3
  502. FADD f8, f8, f13
  503. FSUB f9, f12, f9
  504. FADD f10, f10, f15
  505. FSUB f11, f14, f11
  506. #endif
  507. #if defined(LN) || defined(RT)
  508. subi r0, KK, 2
  509. slwi r0, r0, 1 + ZBASE_SHIFT
  510. add AO, AORIG, r0
  511. add BO, B, r0
  512. #endif
  513. #if defined(LN) || defined(LT)
  514. LFD f16, 0 * SIZE(BO)
  515. LFD f17, 1 * SIZE(BO)
  516. LFD f18, 2 * SIZE(BO)
  517. LFD f19, 3 * SIZE(BO)
  518. LFD f20, 4 * SIZE(BO)
  519. LFD f21, 5 * SIZE(BO)
  520. LFD f22, 6 * SIZE(BO)
  521. LFD f23, 7 * SIZE(BO)
  522. FSUB f0, f16, f0
  523. FSUB f1, f17, f1
  524. FSUB f8, f18, f8
  525. FSUB f9, f19, f9
  526. FSUB f2, f20, f2
  527. FSUB f3, f21, f3
  528. FSUB f10, f22, f10
  529. FSUB f11, f23, f11
  530. #else
  531. LFD f16, 0 * SIZE(AO)
  532. LFD f17, 1 * SIZE(AO)
  533. LFD f18, 2 * SIZE(AO)
  534. LFD f19, 3 * SIZE(AO)
  535. LFD f20, 4 * SIZE(AO)
  536. LFD f21, 5 * SIZE(AO)
  537. LFD f22, 6 * SIZE(AO)
  538. LFD f23, 7 * SIZE(AO)
  539. #ifndef CONJ
  540. FSUB f0, f16, f0
  541. FSUB f1, f17, f1
  542. FSUB f2, f18, f2
  543. FSUB f3, f19, f3
  544. FSUB f8, f20, f8
  545. FSUB f9, f21, f9
  546. FSUB f10, f22, f10
  547. FSUB f11, f23, f11
  548. #else
  549. FSUB f0, f16, f0
  550. FADD f1, f17, f1
  551. FSUB f2, f18, f2
  552. FADD f3, f19, f3
  553. FSUB f8, f20, f8
  554. FADD f9, f21, f9
  555. FSUB f10, f22, f10
  556. FADD f11, f23, f11
  557. #endif
  558. #endif
  559. #ifdef LN
  560. LFD f16, 6 * SIZE(AO)
  561. LFD f17, 7 * SIZE(AO)
  562. LFD f18, 4 * SIZE(AO)
  563. LFD f19, 5 * SIZE(AO)
  564. LFD f20, 0 * SIZE(AO)
  565. LFD f21, 1 * SIZE(AO)
  566. FMUL f6, f17, f3
  567. FMUL f7, f17, f2
  568. FMUL f14, f17, f11
  569. FMUL f15, f17, f10
  570. #ifndef CONJ
  571. FMSUB f2, f16, f2, f6
  572. FMADD f3, f16, f3, f7
  573. FMSUB f10, f16, f10, f14
  574. FMADD f11, f16, f11, f15
  575. FMADD f0, f19, f3, f0
  576. FNMSUB f1, f19, f2, f1
  577. FMADD f8, f19, f11, f8
  578. FNMSUB f9, f19, f10, f9
  579. FNMSUB f0, f18, f2, f0
  580. FNMSUB f1, f18, f3, f1
  581. FNMSUB f8, f18, f10, f8
  582. FNMSUB f9, f18, f11, f9
  583. FMUL f4, f21, f1
  584. FMUL f5, f21, f0
  585. FMUL f12, f21, f9
  586. FMUL f13, f21, f8
  587. FMSUB f0, f20, f0, f4
  588. FMADD f1, f20, f1, f5
  589. FMSUB f8, f20, f8, f12
  590. FMADD f9, f20, f9, f13
  591. #else
  592. FMADD f2, f16, f2, f6
  593. FMSUB f3, f16, f3, f7
  594. FMADD f10, f16, f10, f14
  595. FMSUB f11, f16, f11, f15
  596. FMSUB f0, f19, f3, f0
  597. FNMADD f1, f19, f2, f1
  598. FMSUB f8, f19, f11, f8
  599. FNMADD f9, f19, f10, f9
  600. FNMADD f0, f18, f2, f0
  601. FNMADD f1, f18, f3, f1
  602. FNMADD f8, f18, f10, f8
  603. FNMADD f9, f18, f11, f9
  604. FMUL f4, f21, f1
  605. FMUL f5, f21, f0
  606. FMUL f12, f21, f9
  607. FMUL f13, f21, f8
  608. FMADD f0, f20, f0, f4
  609. FMSUB f1, f20, f1, f5
  610. FMADD f8, f20, f8, f12
  611. FMSUB f9, f20, f9, f13
  612. #endif
  613. #endif
  614. #ifdef LT
  615. LFD f16, 0 * SIZE(AO)
  616. LFD f17, 1 * SIZE(AO)
  617. LFD f18, 2 * SIZE(AO)
  618. LFD f19, 3 * SIZE(AO)
  619. LFD f20, 6 * SIZE(AO)
  620. LFD f21, 7 * SIZE(AO)
  621. FMUL f4, f17, f1
  622. FMUL f5, f17, f0
  623. FMUL f12, f17, f9
  624. FMUL f13, f17, f8
  625. #ifndef CONJ
  626. FMSUB f0, f16, f0, f4
  627. FMADD f1, f16, f1, f5
  628. FMSUB f8, f16, f8, f12
  629. FMADD f9, f16, f9, f13
  630. FMADD f2, f19, f1, f2
  631. FNMSUB f3, f19, f0, f3
  632. FMADD f10, f19, f9, f10
  633. FNMSUB f11, f19, f8, f11
  634. FNMSUB f2, f18, f0, f2
  635. FNMSUB f3, f18, f1, f3
  636. FNMSUB f10, f18, f8, f10
  637. FNMSUB f11, f18, f9, f11
  638. FMUL f4, f21, f3
  639. FMUL f5, f21, f2
  640. FMUL f12, f21, f11
  641. FMUL f13, f21, f10
  642. FMSUB f2, f20, f2, f4
  643. FMADD f3, f20, f3, f5
  644. FMSUB f10, f20, f10, f12
  645. FMADD f11, f20, f11, f13
  646. #else
  647. FMADD f0, f16, f0, f4
  648. FMSUB f1, f16, f1, f5
  649. FMADD f8, f16, f8, f12
  650. FMSUB f9, f16, f9, f13
  651. FMSUB f2, f19, f1, f2
  652. FNMADD f3, f19, f0, f3
  653. FMSUB f10, f19, f9, f10
  654. FNMADD f11, f19, f8, f11
  655. FNMADD f2, f18, f0, f2
  656. FNMADD f3, f18, f1, f3
  657. FNMADD f10, f18, f8, f10
  658. FNMADD f11, f18, f9, f11
  659. FMUL f4, f21, f3
  660. FMUL f5, f21, f2
  661. FMUL f12, f21, f11
  662. FMUL f13, f21, f10
  663. FMADD f2, f20, f2, f4
  664. FMSUB f3, f20, f3, f5
  665. FMADD f10, f20, f10, f12
  666. FMSUB f11, f20, f11, f13
  667. #endif
  668. #endif
  669. #ifdef RN
  670. LFD f16, 0 * SIZE(BO)
  671. LFD f17, 1 * SIZE(BO)
  672. LFD f18, 2 * SIZE(BO)
  673. LFD f19, 3 * SIZE(BO)
  674. LFD f20, 6 * SIZE(BO)
  675. LFD f21, 7 * SIZE(BO)
  676. FMUL f4, f17, f1
  677. FMUL f5, f17, f0
  678. FMUL f6, f17, f3
  679. FMUL f7, f17, f2
  680. #ifndef CONJ
  681. FMSUB f0, f16, f0, f4
  682. FMADD f1, f16, f1, f5
  683. FMSUB f2, f16, f2, f6
  684. FMADD f3, f16, f3, f7
  685. FMADD f8, f19, f1, f8
  686. FNMSUB f9, f19, f0, f9
  687. FMADD f10, f19, f3, f10
  688. FNMSUB f11, f19, f2, f11
  689. FNMSUB f8, f18, f0, f8
  690. FNMSUB f9, f18, f1, f9
  691. FNMSUB f10, f18, f2, f10
  692. FNMSUB f11, f18, f3, f11
  693. FMUL f4, f21, f9
  694. FMUL f5, f21, f8
  695. FMUL f6, f21, f11
  696. FMUL f7, f21, f10
  697. FMSUB f8, f20, f8, f4
  698. FMADD f9, f20, f9, f5
  699. FMSUB f10, f20, f10, f6
  700. FMADD f11, f20, f11, f7
  701. #else
  702. FMADD f0, f16, f0, f4
  703. FMSUB f1, f16, f1, f5
  704. FMADD f2, f16, f2, f6
  705. FMSUB f3, f16, f3, f7
  706. FMSUB f8, f19, f1, f8
  707. FNMADD f9, f19, f0, f9
  708. FMSUB f10, f19, f3, f10
  709. FNMADD f11, f19, f2, f11
  710. FNMADD f8, f18, f0, f8
  711. FNMADD f9, f18, f1, f9
  712. FNMADD f10, f18, f2, f10
  713. FNMADD f11, f18, f3, f11
  714. FMUL f4, f21, f9
  715. FMUL f5, f21, f8
  716. FMUL f6, f21, f11
  717. FMUL f7, f21, f10
  718. FMADD f8, f20, f8, f4
  719. FMSUB f9, f20, f9, f5
  720. FMADD f10, f20, f10, f6
  721. FMSUB f11, f20, f11, f7
  722. #endif
  723. #endif
  724. #ifdef RT
  725. LFD f16, 6 * SIZE(BO)
  726. LFD f17, 7 * SIZE(BO)
  727. LFD f18, 4 * SIZE(BO)
  728. LFD f19, 5 * SIZE(BO)
  729. LFD f20, 0 * SIZE(BO)
  730. LFD f21, 1 * SIZE(BO)
  731. FMUL f12, f17, f9
  732. FMUL f13, f17, f8
  733. FMUL f14, f17, f11
  734. FMUL f15, f17, f10
  735. #ifndef CONJ
  736. FMSUB f8, f16, f8, f12
  737. FMADD f9, f16, f9, f13
  738. FMSUB f10, f16, f10, f14
  739. FMADD f11, f16, f11, f15
  740. FMADD f0, f19, f9, f0
  741. FNMSUB f1, f19, f8, f1
  742. FMADD f2, f19, f11, f2
  743. FNMSUB f3, f19, f10, f3
  744. FNMSUB f0, f18, f8, f0
  745. FNMSUB f1, f18, f9, f1
  746. FNMSUB f2, f18, f10, f2
  747. FNMSUB f3, f18, f11, f3
  748. FMUL f4, f21, f1
  749. FMUL f5, f21, f0
  750. FMUL f6, f21, f3
  751. FMUL f7, f21, f2
  752. FMSUB f0, f20, f0, f4
  753. FMADD f1, f20, f1, f5
  754. FMSUB f2, f20, f2, f6
  755. FMADD f3, f20, f3, f7
  756. #else
  757. FMADD f8, f16, f8, f12
  758. FMSUB f9, f16, f9, f13
  759. FMADD f10, f16, f10, f14
  760. FMSUB f11, f16, f11, f15
  761. FMSUB f0, f19, f9, f0
  762. FNMADD f1, f19, f8, f1
  763. FMSUB f2, f19, f11, f2
  764. FNMADD f3, f19, f10, f3
  765. FNMADD f0, f18, f8, f0
  766. FNMADD f1, f18, f9, f1
  767. FNMADD f2, f18, f10, f2
  768. FNMADD f3, f18, f11, f3
  769. FMUL f4, f21, f1
  770. FMUL f5, f21, f0
  771. FMUL f6, f21, f3
  772. FMUL f7, f21, f2
  773. FMADD f0, f20, f0, f4
  774. FMSUB f1, f20, f1, f5
  775. FMADD f2, f20, f2, f6
  776. FMSUB f3, f20, f3, f7
  777. #endif
  778. #endif
  779. #ifdef LN
  780. subi CO1, CO1, 4 * SIZE
  781. subi CO2, CO2, 4 * SIZE
  782. #endif
  783. #if defined(LN) || defined(LT)
  784. STFD f0, 0 * SIZE(BO)
  785. STFD f1, 1 * SIZE(BO)
  786. STFD f8, 2 * SIZE(BO)
  787. STFD f9, 3 * SIZE(BO)
  788. STFD f2, 4 * SIZE(BO)
  789. STFD f3, 5 * SIZE(BO)
  790. STFD f10, 6 * SIZE(BO)
  791. STFD f11, 7 * SIZE(BO)
  792. #else
  793. STFD f0, 0 * SIZE(AO)
  794. STFD f1, 1 * SIZE(AO)
  795. STFD f2, 2 * SIZE(AO)
  796. STFD f3, 3 * SIZE(AO)
  797. STFD f8, 4 * SIZE(AO)
  798. STFD f9, 5 * SIZE(AO)
  799. STFD f10, 6 * SIZE(AO)
  800. STFD f11, 7 * SIZE(AO)
  801. #endif
  802. STFD f0, 0 * SIZE(CO1)
  803. STFD f1, 1 * SIZE(CO1)
  804. STFD f2, 2 * SIZE(CO1)
  805. STFD f3, 3 * SIZE(CO1)
  806. STFD f8, 0 * SIZE(CO2)
  807. STFD f9, 1 * SIZE(CO2)
  808. STFD f10, 2 * SIZE(CO2)
  809. STFD f11, 3 * SIZE(CO2)
  810. #ifndef LN
  811. addi CO1, CO1, 4 * SIZE
  812. addi CO2, CO2, 4 * SIZE
  813. #endif
  814. #ifdef RT
  815. slwi r0, K, 1 + ZBASE_SHIFT
  816. add AORIG, AORIG, r0
  817. #endif
  818. #if defined(LT) || defined(RN)
  819. sub TEMP, K, KK
  820. slwi TEMP, TEMP, 1 + ZBASE_SHIFT
  821. add AO, AO, TEMP
  822. add BO, BO, TEMP
  823. #endif
  824. #ifdef LT
  825. addi KK, KK, 2
  826. #endif
  827. #ifdef LN
  828. subi KK, KK, 2
  829. #endif
  830. addic. I, I, -1
  831. bgt .L11
  832. .align 4
  833. .L20:
  834. andi. I, M, 1
  835. ble .L29
  836. #if defined(LT) || defined(RN)
  837. LFD f16, 0 * SIZE(AO)
  838. LFD f17, 1 * SIZE(AO)
  839. LFD f18, 2 * SIZE(AO)
  840. LFD f19, 3 * SIZE(AO)
  841. LFD f20, 0 * SIZE(B)
  842. LFD f21, 1 * SIZE(B)
  843. LFD f22, 2 * SIZE(B)
  844. LFD f23, 3 * SIZE(B)
  845. LFD f24, 4 * SIZE(B)
  846. LFD f25, 5 * SIZE(B)
  847. LFD f26, 6 * SIZE(B)
  848. LFD f27, 7 * SIZE(B)
  849. lfs f0, FZERO
  850. fmr f1, f0
  851. fmr f2, f0
  852. fmr f3, f0
  853. fmr f4, f0
  854. fmr f5, f0
  855. fmr f6, f0
  856. fmr f7, f0
  857. srawi. r0, KK, 2
  858. mr BO, B
  859. mtspr CTR, r0
  860. #else
  861. #ifdef LN
  862. slwi r0, K, 0 + ZBASE_SHIFT
  863. sub AORIG, AORIG, r0
  864. #endif
  865. slwi r0, KK, 0 + ZBASE_SHIFT
  866. slwi TEMP, KK, 1 + ZBASE_SHIFT
  867. add AO, AORIG, r0
  868. add BO, B, TEMP
  869. sub TEMP, K, KK
  870. LFD f16, 0 * SIZE(AO)
  871. LFD f17, 1 * SIZE(AO)
  872. LFD f18, 2 * SIZE(AO)
  873. LFD f19, 3 * SIZE(AO)
  874. LFD f20, 0 * SIZE(BO)
  875. LFD f21, 1 * SIZE(BO)
  876. LFD f22, 2 * SIZE(BO)
  877. LFD f23, 3 * SIZE(BO)
  878. LFD f24, 4 * SIZE(BO)
  879. LFD f25, 5 * SIZE(BO)
  880. LFD f26, 6 * SIZE(BO)
  881. LFD f27, 7 * SIZE(BO)
  882. lfs f0, FZERO
  883. fmr f1, f0
  884. fmr f2, f0
  885. fmr f3, f0
  886. fmr f4, f0
  887. fmr f5, f0
  888. fmr f6, f0
  889. fmr f7, f0
  890. srawi. r0, TEMP, 2
  891. mtspr CTR, r0
  892. #endif
  893. ble .L25
  /* .L22 - unrolled multiply-accumulate loop over k. CTR was loaded with the   */
  /* trip count before entry and bdnz repeats the body until it reaches zero.   */
  /* One pass performs four k-steps. Each step multiplies two values read via   */
  /* AO by four values read via BO and adds the eight cross products into       */
  /* f0-f7. Per pass AO advances by 8 * SIZE and BO by 16 * SIZE (LFDU lines).  */
  /* Loads for upcoming steps are interleaved with the fmadds. The nops look    */
  /* like issue-slot padding for the target pipeline - TODO confirm.            */
  894. .align 4
  895. .L22:
  /* step 1: f16/f17 (AO) times f20-f23 (BO) */
  896. fmadd f0, f16, f20, f0
  897. LFD f19, 3 * SIZE(AO)
  898. fmadd f1, f16, f21, f1
  899. nop
  900. fmadd f2, f16, f22, f2
  901. nop
  902. fmadd f3, f16, f23, f3
  903. LFD f16, 4 * SIZE(AO)
  904. fmadd f4, f17, f20, f4
  905. LFD f20, 8 * SIZE(BO)
  906. fmadd f5, f17, f21, f5
  907. LFD f21, 9 * SIZE(BO)
  908. fmadd f6, f17, f22, f6
  909. LFD f22, 10 * SIZE(BO)
  910. fmadd f7, f17, f23, f7
  911. LFD f23, 11 * SIZE(BO)
  /* step 2: f18/f19 (AO) times f24-f27 (BO) */
  912. fmadd f0, f18, f24, f0
  913. LFD f17, 5 * SIZE(AO)
  914. fmadd f1, f18, f25, f1
  915. nop
  916. fmadd f2, f18, f26, f2
  917. nop
  918. fmadd f3, f18, f27, f3
  919. LFD f18, 6 * SIZE(AO)
  920. fmadd f4, f19, f24, f4
  921. LFD f24, 12 * SIZE(BO)
  922. fmadd f5, f19, f25, f5
  923. LFD f25, 13 * SIZE(BO)
  924. fmadd f6, f19, f26, f6
  925. LFD f26, 14 * SIZE(BO)
  926. fmadd f7, f19, f27, f7
  927. LFD f27, 15 * SIZE(BO)
  /* step 3: same register roles as step 1, with the values loaded above */
  928. fmadd f0, f16, f20, f0
  929. LFD f19, 7 * SIZE(AO)
  930. fmadd f1, f16, f21, f1
  931. nop
  932. fmadd f2, f16, f22, f2
  933. nop
  934. fmadd f3, f16, f23, f3
  /* The two update-form loads below move AO and BO to the next group, so the   */
  /* remaining loads of this pass use small offsets from the new pointers.      */
  935. LFDU f16, 8 * SIZE(AO)
  936. fmadd f4, f17, f20, f4
  937. LFDU f20, 16 * SIZE(BO)
  938. fmadd f5, f17, f21, f5
  939. LFD f21, 1 * SIZE(BO)
  940. fmadd f6, f17, f22, f6
  941. LFD f22, 2 * SIZE(BO)
  942. fmadd f7, f17, f23, f7
  943. LFD f23, 3 * SIZE(BO)
  /* step 4: same register roles as step 2; also preloads for the next pass */
  944. fmadd f0, f18, f24, f0
  945. LFD f17, 1 * SIZE(AO)
  946. fmadd f1, f18, f25, f1
  947. nop
  948. fmadd f2, f18, f26, f2
  949. nop
  950. fmadd f3, f18, f27, f3
  951. LFD f18, 2 * SIZE(AO)
  952. fmadd f4, f19, f24, f4
  953. LFD f24, 4 * SIZE(BO)
  954. fmadd f5, f19, f25, f5
  955. LFD f25, 5 * SIZE(BO)
  956. fmadd f6, f19, f26, f6
  957. LFD f26, 6 * SIZE(BO)
  958. fmadd f7, f19, f27, f7
  959. LFD f27, 7 * SIZE(BO)
  960. bdnz .L22
  /* .L25 - set up the k remainder: CTR = (KK or TEMP) & 3, depending on the    */
  /* build variant. andi. sets CR0, so ble skips the loop when the count is 0.  */
  961. .align 4
  962. .L25:
  963. #if defined(LT) || defined(RN)
  964. andi. r0, KK, 3
  965. #else
  966. andi. r0, TEMP, 3
  967. #endif
  968. mtspr CTR, r0
  969. ble .L27
  970. .align 4
  /* .L26 - one k-step per pass: f16/f17 (AO) times f20-f23 (BO) accumulated    */
  /* into f0-f7. AO advances by 2 * SIZE and BO by 4 * SIZE per pass.           */
  971. .L26:
  972. fmadd f0, f16, f20, f0
  /* f17 is read before AO is advanced by the LFDU below */
  973. LFD f17, 1 * SIZE(AO)
  974. fmadd f1, f16, f21, f1
  975. nop
  976. fmadd f2, f16, f22, f2
  977. nop
  978. fmadd f3, f16, f23, f3
  979. LFDU f16, 2 * SIZE(AO)
  980. fmadd f4, f17, f20, f4
  981. LFDU f20, 4 * SIZE(BO)
  982. fmadd f5, f17, f21, f5
  983. LFD f21, 1 * SIZE(BO)
  984. fmadd f6, f17, f22, f6
  985. LFD f22, 2 * SIZE(BO)
  986. fmadd f7, f17, f23, f7
  987. LFD f23, 3 * SIZE(BO)
  988. bdnz .L26
  /* .L27 - finish the tile accumulated by .L22/.L26: fold the eight partial    */
  /* sums into f0-f3, subtract them from the stored values, apply the solve     */
  /* step selected by LN/LT/RN/RT, then write the results back and advance the  */
  /* pointers. The multiply sequences below read as complex products on         */
  /* (f0,f1) and (f2,f3), assuming the upper-case FMADD/FMSUB/FNMADD/FNMSUB     */
  /* macros map to the like-named PowerPC instructions - TODO confirm.          */
  989. .align 4
  990. .L27:
  /* Fold the cross products pairwise; the signs depend on CONJ and variant. */
  991. #ifndef CONJ
  992. FSUB f0, f0, f5
  993. FADD f1, f1, f4
  994. FSUB f2, f2, f7
  995. FADD f3, f3, f6
  996. #else
  997. #if defined(LN) || defined(LT)
  998. FADD f0, f0, f5
  999. FSUB f1, f1, f4
  1000. FADD f2, f2, f7
  1001. FSUB f3, f3, f6
  1002. #else
  1003. FADD f0, f0, f5
  1004. FSUB f1, f4, f1
  1005. FADD f2, f2, f7
  1006. FSUB f3, f6, f3
  1007. #endif
  1008. #endif
  /* LN/RT: re-point AO and BO relative to AORIG and B, using KK - 1 (LN) or    */
  /* KK - 2 (RT). AO is scaled by ZBASE_SHIFT, BO by 1 + ZBASE_SHIFT.           */
  1009. #if defined(LN) || defined(RT)
  1010. #ifdef LN
  1011. subi r0, KK, 1
  1012. #else
  1013. subi r0, KK, 2
  1014. #endif
  1015. slwi TEMP, r0, 0 + ZBASE_SHIFT
  1016. slwi r0, r0, 1 + ZBASE_SHIFT
  1017. add AO, AORIG, TEMP
  1018. add BO, B, r0
  1019. #endif
  /* Load four stored values (from BO for LN/LT, from AO otherwise) and replace */
  /* each accumulator with stored - accumulated.                                */
  1020. #if defined(LN) || defined(LT)
  1021. LFD f16, 0 * SIZE(BO)
  1022. LFD f17, 1 * SIZE(BO)
  1023. LFD f18, 2 * SIZE(BO)
  1024. LFD f19, 3 * SIZE(BO)
  1025. FSUB f0, f16, f0
  1026. FSUB f1, f17, f1
  1027. FSUB f2, f18, f2
  1028. FSUB f3, f19, f3
  1029. #else
  1030. LFD f16, 0 * SIZE(AO)
  1031. LFD f17, 1 * SIZE(AO)
  1032. LFD f20, 2 * SIZE(AO)
  1033. LFD f21, 3 * SIZE(AO)
  1034. FSUB f0, f16, f0
  1035. FSUB f1, f17, f1
  1036. FSUB f2, f20, f2
  1037. FSUB f3, f21, f3
  1038. #endif
  /* LN: scale (f0,f1) and (f2,f3) by the pair at AO[0..1]; CONJ flips the sign */
  /* of the second factor. Presumably this pair is a pre-inverted diagonal      */
  /* entry, since no divide is issued - TODO confirm against the packing code.  */
  1039. #ifdef LN
  1040. LFD f20, 0 * SIZE(AO)
  1041. LFD f21, 1 * SIZE(AO)
  1042. FMUL f4, f21, f1
  1043. FMUL f5, f21, f0
  1044. FMUL f12, f21, f3
  1045. FMUL f13, f21, f2
  1046. #ifndef CONJ
  1047. FMSUB f0, f20, f0, f4
  1048. FMADD f1, f20, f1, f5
  1049. FMSUB f2, f20, f2, f12
  1050. FMADD f3, f20, f3, f13
  1051. #else
  1052. FMADD f0, f20, f0, f4
  1053. FMSUB f1, f20, f1, f5
  1054. FMADD f2, f20, f2, f12
  1055. FMSUB f3, f20, f3, f13
  1056. #endif
  1057. #endif
  /* LT: same scaling as LN, with the pair loaded into f16/f17. */
  1058. #ifdef LT
  1059. LFD f16, 0 * SIZE(AO)
  1060. LFD f17, 1 * SIZE(AO)
  1061. FMUL f4, f17, f1
  1062. FMUL f5, f17, f0
  1063. FMUL f12, f17, f3
  1064. FMUL f13, f17, f2
  1065. #ifndef CONJ
  1066. FMSUB f0, f16, f0, f4
  1067. FMADD f1, f16, f1, f5
  1068. FMSUB f2, f16, f2, f12
  1069. FMADD f3, f16, f3, f13
  1070. #else
  1071. FMADD f0, f16, f0, f4
  1072. FMSUB f1, f16, f1, f5
  1073. FMADD f2, f16, f2, f12
  1074. FMSUB f3, f16, f3, f13
  1075. #endif
  1076. #endif
  /* RN: scale (f0,f1) by BO[0..1], subtract its product with BO[2..3] from     */
  /* (f2,f3), then scale (f2,f3) by BO[6..7].                                   */
  1077. #ifdef RN
  1078. LFD f16, 0 * SIZE(BO)
  1079. LFD f17, 1 * SIZE(BO)
  1080. LFD f18, 2 * SIZE(BO)
  1081. LFD f19, 3 * SIZE(BO)
  1082. LFD f20, 6 * SIZE(BO)
  1083. LFD f21, 7 * SIZE(BO)
  1084. FMUL f4, f17, f1
  1085. FMUL f5, f17, f0
  1086. #ifndef CONJ
  1087. FMSUB f0, f16, f0, f4
  1088. FMADD f1, f16, f1, f5
  1089. FMADD f2, f19, f1, f2
  1090. FNMSUB f3, f19, f0, f3
  1091. FNMSUB f2, f18, f0, f2
  1092. FNMSUB f3, f18, f1, f3
  1093. FMUL f4, f21, f3
  1094. FMUL f5, f21, f2
  1095. FMSUB f2, f20, f2, f4
  1096. FMADD f3, f20, f3, f5
  1097. #else
  1098. FMADD f0, f16, f0, f4
  1099. FMSUB f1, f16, f1, f5
  1100. FMSUB f2, f19, f1, f2
  1101. FNMADD f3, f19, f0, f3
  1102. FNMADD f2, f18, f0, f2
  1103. FNMADD f3, f18, f1, f3
  1104. FMUL f4, f21, f3
  1105. FMUL f5, f21, f2
  1106. FMADD f2, f20, f2, f4
  1107. FMSUB f3, f20, f3, f5
  1108. #endif
  1109. #endif
  /* RT: the reverse order of RN - scale (f2,f3) by BO[6..7], subtract its      */
  /* product with BO[4..5] from (f0,f1), then scale (f0,f1) by BO[0..1].        */
  1110. #ifdef RT
  1111. LFD f16, 6 * SIZE(BO)
  1112. LFD f17, 7 * SIZE(BO)
  1113. LFD f18, 4 * SIZE(BO)
  1114. LFD f19, 5 * SIZE(BO)
  1115. LFD f20, 0 * SIZE(BO)
  1116. LFD f21, 1 * SIZE(BO)
  1117. FMUL f12, f17, f3
  1118. FMUL f13, f17, f2
  1119. #ifndef CONJ
  1120. FMSUB f2, f16, f2, f12
  1121. FMADD f3, f16, f3, f13
  1122. FMADD f0, f19, f3, f0
  1123. FNMSUB f1, f19, f2, f1
  1124. FNMSUB f0, f18, f2, f0
  1125. FNMSUB f1, f18, f3, f1
  1126. FMUL f4, f21, f1
  1127. FMUL f5, f21, f0
  1128. FMSUB f0, f20, f0, f4
  1129. FMADD f1, f20, f1, f5
  1130. #else
  1131. FMADD f2, f16, f2, f12
  1132. FMSUB f3, f16, f3, f13
  1133. FMSUB f0, f19, f3, f0
  1134. FNMADD f1, f19, f2, f1
  1135. FNMADD f0, f18, f2, f0
  1136. FNMADD f1, f18, f3, f1
  1137. FMUL f4, f21, f1
  1138. FMUL f5, f21, f0
  1139. FMADD f0, f20, f0, f4
  1140. FMSUB f1, f20, f1, f5
  1141. #endif
  1142. #endif
  /* LN steps CO1/CO2 back before storing; the other variants step forward      */
  /* after the store (see the ifndef LN block below).                           */
  1143. #ifdef LN
  1144. subi CO1, CO1, 2 * SIZE
  1145. subi CO2, CO2, 2 * SIZE
  1146. #endif
  /* Write the results back to where the stored values were loaded from ...     */
  1147. #if defined(LN) || defined(LT)
  1148. STFD f0, 0 * SIZE(BO)
  1149. STFD f1, 1 * SIZE(BO)
  1150. STFD f2, 2 * SIZE(BO)
  1151. STFD f3, 3 * SIZE(BO)
  1152. #else
  1153. STFD f0, 0 * SIZE(AO)
  1154. STFD f1, 1 * SIZE(AO)
  1155. STFD f2, 2 * SIZE(AO)
  1156. STFD f3, 3 * SIZE(AO)
  1157. #endif
  /* ... and to the output: (f0,f1) via CO1, (f2,f3) via CO2.                   */
  1158. STFD f0, 0 * SIZE(CO1)
  1159. STFD f1, 1 * SIZE(CO1)
  1160. STFD f2, 0 * SIZE(CO2)
  1161. STFD f3, 1 * SIZE(CO2)
  1162. #ifndef LN
  1163. addi CO1, CO1, 2 * SIZE
  1164. addi CO2, CO2, 2 * SIZE
  1165. #endif
  /* RT: advance AORIG by K << ZBASE_SHIFT bytes. */
  1166. #ifdef RT
  1167. slwi r0, K, 0 + ZBASE_SHIFT
  1168. add AORIG, AORIG, r0
  1169. #endif
  /* LT/RN: skip the remaining K - KK steps in both AO and BO. */
  1170. #if defined(LT) || defined(RN)
  1171. sub TEMP, K, KK
  1172. slwi r0, TEMP, 0 + ZBASE_SHIFT
  1173. slwi TEMP, TEMP, 1 + ZBASE_SHIFT
  1174. add AO, AO, r0
  1175. add BO, BO, TEMP
  1176. #endif
  /* KK moves by one for this tile: up for LT, down for LN. */
  1177. #ifdef LT
  1178. addi KK, KK, 1
  1179. #endif
  1180. #ifdef LN
  1181. subi KK, KK, 1
  1182. #endif
  /* .L29 - end of one pass of the J loop. Updates B and KK as required by the  */
  /* build variant, then decrements J and branches back to .L10 (defined        */
  /* earlier in the file) while J is still greater than zero.                   */
  1183. .align 4
  1184. .L29:
  /* LN: advance B by K << (1 + ZBASE_SHIFT) bytes. */
  1185. #ifdef LN
  1186. slwi r0, K, 1 + ZBASE_SHIFT
  1187. add B, B, r0
  1188. #endif
  /* LT/RN: BO already points past the data consumed, so adopt it as B. */
  1189. #if defined(LT) || defined(RN)
  1190. mr B, BO
  1191. #endif
  /* KK moves by two per pass: up for RN, down for RT. */
  1192. #ifdef RN
  1193. addi KK, KK, 2
  1194. #endif
  1195. #ifdef RT
  1196. subi KK, KK, 2
  1197. #endif
  /* addic. sets CR0 from the decremented J, which bgt then tests. */
  1198. addic. J, J, -1
  1199. bgt .L10
  /* .L30 - entry to the N & 1 remainder. If the low bit of N is clear, control */
  /* goes straight to the exit at .L999. Otherwise the output pointer, KK and   */
  /* the A pointer are initialised and I is set to M >> 1.                      */
  1200. .align 4
  1201. .L30:
  1202. andi. J, N, 1
  1203. ble .L999
  /* RT: step B back by K << ZBASE_SHIFT bytes and C back by LDC first. */
  1204. #ifdef RT
  1205. slwi r0, K, 0 + ZBASE_SHIFT
  1206. sub B, B, r0
  1207. sub C, C, LDC
  1208. #endif
  1209. mr CO1, C
  1210. #ifdef LN
  1211. add KK, M, OFFSET
  1212. #endif
  1213. #ifdef LT
  1214. mr KK, OFFSET
  1215. #endif
  /* srawi. sets CR0 from I. None of the instructions before the ble below are  */
  /* record forms, so that branch still tests this result.                      */
  1216. srawi. I, M, 1
  1217. #if defined(LN) || defined(RT)
  1218. mr AORIG, A
  1219. #else
  1220. mr AO, A
  1221. #endif
  /* All variants except RT advance C by LDC here (RT stepped it back above). */
  1222. #ifndef RT
  1223. add C, C, LDC
  1224. #endif
  /* I <= 0: skip the loop at .L31 and go to the M & 1 handling at .L40. */
  1225. ble .L40
  /* .L31 - top of the I loop for the N & 1 case. Preloads eight values from AO */
  /* into f20-f27 and four from B/BO into f16-f19, clears the accumulators      */
  /* f0-f7 and loads CTR with the unrolled trip count (count >> 2).             */
  1226. .align 4
  1227. .L31:
  /* LT/RN: read directly from AO and B; the count is KK. */
  1228. #if defined(LT) || defined(RN)
  1229. LFD f20, 0 * SIZE(AO)
  1230. LFD f21, 1 * SIZE(AO)
  1231. LFD f22, 2 * SIZE(AO)
  1232. LFD f23, 3 * SIZE(AO)
  1233. LFD f24, 4 * SIZE(AO)
  1234. LFD f25, 5 * SIZE(AO)
  1235. LFD f26, 6 * SIZE(AO)
  1236. LFD f27, 7 * SIZE(AO)
  1237. LFD f16, 0 * SIZE(B)
  1238. LFD f17, 1 * SIZE(B)
  1239. LFD f18, 2 * SIZE(B)
  1240. LFD f19, 3 * SIZE(B)
  /* Zero f0 from the FZERO slot, then copy it into f1-f7. */
  1241. lfs f0, FZERO
  1242. fmr f1, f0
  1243. fmr f2, f0
  1244. fmr f3, f0
  1245. fmr f4, f0
  1246. fmr f5, f0
  1247. fmr f6, f0
  1248. fmr f7, f0
  1249. srawi. r0, KK, 2
  1250. mr BO, B
  1251. mtspr CTR, r0
  1252. #else
  /* LN/RT: LN first steps AORIG back by K << (1 + ZBASE_SHIFT) bytes. AO and   */
  /* BO then start KK steps into AORIG and B, and the count is K - KK.          */
  1253. #ifdef LN
  1254. slwi r0, K, 1 + ZBASE_SHIFT
  1255. sub AORIG, AORIG, r0
  1256. #endif
  1257. slwi r0, KK, 1 + ZBASE_SHIFT
  1258. slwi TEMP, KK, 0 + ZBASE_SHIFT
  1259. add AO, AORIG, r0
  1260. add BO, B, TEMP
  1261. sub TEMP, K, KK
  1262. LFD f20, 0 * SIZE(AO)
  1263. LFD f21, 1 * SIZE(AO)
  1264. LFD f22, 2 * SIZE(AO)
  1265. LFD f23, 3 * SIZE(AO)
  1266. LFD f24, 4 * SIZE(AO)
  1267. LFD f25, 5 * SIZE(AO)
  1268. LFD f26, 6 * SIZE(AO)
  1269. LFD f27, 7 * SIZE(AO)
  1270. LFD f16, 0 * SIZE(BO)
  1271. LFD f17, 1 * SIZE(BO)
  1272. LFD f18, 2 * SIZE(BO)
  1273. LFD f19, 3 * SIZE(BO)
  1274. lfs f0, FZERO
  1275. fmr f1, f0
  1276. fmr f2, f0
  1277. fmr f3, f0
  1278. fmr f4, f0
  1279. fmr f5, f0
  1280. fmr f6, f0
  1281. fmr f7, f0
  1282. srawi. r0, TEMP, 2
  1283. mtspr CTR, r0
  1284. #endif
  /* CR0 still holds the srawi. result: skip the unrolled loop when it is <= 0. */
  1285. ble .L35
  /* .L32 - unrolled multiply-accumulate loop for the tile set up at .L31.      */
  /* One pass performs four k-steps. Each step multiplies two values read via   */
  /* BO by four values read via AO and adds the eight cross products into       */
  /* f0-f7. Per pass BO advances by 8 * SIZE and AO by 16 * SIZE (LFDU lines).  */
  /* CTR holds the trip count. The nops look like issue-slot padding - TODO     */
  /* confirm.                                                                   */
  1286. .align 4
  1287. .L32:
  /* step 1: f16/f17 (BO) times f20-f23 (AO) */
  1288. fmadd f0, f16, f20, f0
  1289. LFD f19, 3 * SIZE(BO)
  1290. fmadd f1, f16, f21, f1
  1291. nop
  1292. fmadd f2, f16, f22, f2
  1293. nop
  1294. fmadd f3, f16, f23, f3
  1295. LFD f16, 4 * SIZE(BO)
  1296. fmadd f4, f17, f20, f4
  1297. LFD f20, 8 * SIZE(AO)
  1298. fmadd f5, f17, f21, f5
  1299. LFD f21, 9 * SIZE(AO)
  1300. fmadd f6, f17, f22, f6
  1301. LFD f22, 10 * SIZE(AO)
  1302. fmadd f7, f17, f23, f7
  1303. LFD f23, 11 * SIZE(AO)
  /* step 2: f18/f19 (BO) times f24-f27 (AO) */
  1304. fmadd f0, f18, f24, f0
  1305. LFD f17, 5 * SIZE(BO)
  1306. fmadd f1, f18, f25, f1
  1307. nop
  1308. fmadd f2, f18, f26, f2
  1309. nop
  1310. fmadd f3, f18, f27, f3
  1311. LFD f18, 6 * SIZE(BO)
  1312. fmadd f4, f19, f24, f4
  1313. LFD f24, 12 * SIZE(AO)
  1314. fmadd f5, f19, f25, f5
  1315. LFD f25, 13 * SIZE(AO)
  1316. fmadd f6, f19, f26, f6
  1317. LFD f26, 14 * SIZE(AO)
  1318. fmadd f7, f19, f27, f7
  1319. LFD f27, 15 * SIZE(AO)
  /* step 3: same register roles as step 1, with the values loaded above */
  1320. fmadd f0, f16, f20, f0
  1321. LFD f19, 7 * SIZE(BO)
  1322. fmadd f1, f16, f21, f1
  1323. nop
  1324. fmadd f2, f16, f22, f2
  1325. nop
  1326. fmadd f3, f16, f23, f3
  /* The two update-form loads below move BO and AO to the next group. */
  1327. LFDU f16, 8 * SIZE(BO)
  1328. fmadd f4, f17, f20, f4
  1329. LFDU f20, 16 * SIZE(AO)
  1330. fmadd f5, f17, f21, f5
  1331. LFD f21, 1 * SIZE(AO)
  1332. fmadd f6, f17, f22, f6
  1333. LFD f22, 2 * SIZE(AO)
  1334. fmadd f7, f17, f23, f7
  1335. LFD f23, 3 * SIZE(AO)
  /* step 4: same register roles as step 2; also preloads for the next pass */
  1336. fmadd f0, f18, f24, f0
  1337. LFD f17, 1 * SIZE(BO)
  1338. fmadd f1, f18, f25, f1
  1339. nop
  1340. fmadd f2, f18, f26, f2
  1341. nop
  1342. fmadd f3, f18, f27, f3
  1343. LFD f18, 2 * SIZE(BO)
  1344. fmadd f4, f19, f24, f4
  1345. LFD f24, 4 * SIZE(AO)
  1346. fmadd f5, f19, f25, f5
  1347. LFD f25, 5 * SIZE(AO)
  1348. fmadd f6, f19, f26, f6
  1349. LFD f26, 6 * SIZE(AO)
  1350. fmadd f7, f19, f27, f7
  1351. LFD f27, 7 * SIZE(AO)
  1352. bdnz .L32
  /* .L35 - set up the k remainder for the .L31 tile: CTR = (KK or TEMP) & 3.   */
  /* andi. sets CR0, so ble skips the loop when the count is 0.                 */
  1353. .align 4
  1354. .L35:
  1355. #if defined(LT) || defined(RN)
  1356. andi. r0, KK, 3
  1357. #else
  1358. andi. r0, TEMP, 3
  1359. #endif
  1360. mtspr CTR, r0
  1361. ble .L37
  1362. .align 4
  /* .L36 - one k-step per pass: f16/f17 (BO) times f20-f23 (AO) accumulated    */
  /* into f0-f7. BO advances by 2 * SIZE and AO by 4 * SIZE per pass.           */
  1363. .L36:
  1364. fmadd f0, f16, f20, f0
  /* f17 is read before BO is advanced by the LFDU below */
  1365. LFD f17, 1 * SIZE(BO)
  1366. fmadd f1, f16, f21, f1
  1367. nop
  1368. fmadd f2, f16, f22, f2
  1369. nop
  1370. fmadd f3, f16, f23, f3
  1371. LFDU f16, 2 * SIZE(BO)
  1372. fmadd f4, f17, f20, f4
  1373. LFDU f20, 4 * SIZE(AO)
  1374. fmadd f5, f17, f21, f5
  1375. LFD f21, 1 * SIZE(AO)
  1376. fmadd f6, f17, f22, f6
  1377. LFD f22, 2 * SIZE(AO)
  1378. fmadd f7, f17, f23, f7
  1379. LFD f23, 3 * SIZE(AO)
  1380. bdnz .L36
  /* .L37 - finish the tile accumulated by .L32/.L36: fold the eight partial    */
  /* sums into f0-f3, subtract them from the stored values, apply the solve     */
  /* step selected by LN/LT/RN/RT, write the results back, advance the pointers */
  /* and loop on I. The multiply sequences read as complex products on (f0,f1)  */
  /* and (f2,f3), assuming the upper-case FMADD/FMSUB/FNMADD/FNMSUB macros map  */
  /* to the like-named PowerPC instructions - TODO confirm.                     */
  1381. .align 4
  1382. .L37:
  /* Fold the cross products pairwise. Under CONJ the odd results are formed as */
  /* f4 - f1 and f6 - f3 for every variant.                                     */
  1383. #ifndef CONJ
  1384. FSUB f0, f0, f5
  1385. FADD f1, f1, f4
  1386. FSUB f2, f2, f7
  1387. FADD f3, f3, f6
  1388. #else
  1389. FADD f0, f0, f5
  1390. FSUB f1, f4, f1
  1391. FADD f2, f2, f7
  1392. FSUB f3, f6, f3
  1393. #endif
  /* LN/RT: re-point AO and BO relative to AORIG and B, using KK - 2 (LN) or    */
  /* KK - 1 (RT). AO is scaled by 1 + ZBASE_SHIFT, BO by ZBASE_SHIFT.           */
  1394. #if defined(LN) || defined(RT)
  1395. #ifdef LN
  1396. subi r0, KK, 2
  1397. #else
  1398. subi r0, KK, 1
  1399. #endif
  1400. slwi TEMP, r0, 1 + ZBASE_SHIFT
  1401. slwi r0, r0, 0 + ZBASE_SHIFT
  1402. add AO, AORIG, TEMP
  1403. add BO, B, r0
  1404. #endif
  /* Load four stored values (from BO for LN/LT, from AO otherwise) and combine */
  /* them with the accumulators. In the RN/RT path under CONJ the odd results   */
  /* are added rather than subtracted, matching the operand order used in the   */
  /* fold above.                                                                */
  1405. #if defined(LN) || defined(LT)
  1406. LFD f16, 0 * SIZE(BO)
  1407. LFD f17, 1 * SIZE(BO)
  1408. LFD f18, 2 * SIZE(BO)
  1409. LFD f19, 3 * SIZE(BO)
  1410. FSUB f0, f16, f0
  1411. FSUB f1, f17, f1
  1412. FSUB f2, f18, f2
  1413. FSUB f3, f19, f3
  1414. #else
  1415. LFD f16, 0 * SIZE(AO)
  1416. LFD f17, 1 * SIZE(AO)
  1417. LFD f18, 2 * SIZE(AO)
  1418. LFD f19, 3 * SIZE(AO)
  1419. #ifndef CONJ
  1420. FSUB f0, f16, f0
  1421. FSUB f1, f17, f1
  1422. FSUB f2, f18, f2
  1423. FSUB f3, f19, f3
  1424. #else
  1425. FSUB f0, f16, f0
  1426. FADD f1, f17, f1
  1427. FSUB f2, f18, f2
  1428. FADD f3, f19, f3
  1429. #endif
  1430. #endif
  /* LN: scale (f2,f3) by AO[6..7], subtract its product with AO[4..5] from     */
  /* (f0,f1), then scale (f0,f1) by AO[0..1]. The scaling pairs are presumably  */
  /* pre-inverted diagonal entries, since no divide is issued - TODO confirm.   */
  1431. #ifdef LN
  1432. LFD f16, 6 * SIZE(AO)
  1433. LFD f17, 7 * SIZE(AO)
  1434. LFD f18, 4 * SIZE(AO)
  1435. LFD f19, 5 * SIZE(AO)
  1436. LFD f20, 0 * SIZE(AO)
  1437. LFD f21, 1 * SIZE(AO)
  1438. FMUL f6, f17, f3
  1439. FMUL f7, f17, f2
  1440. #ifndef CONJ
  1441. FMSUB f2, f16, f2, f6
  1442. FMADD f3, f16, f3, f7
  1443. FMADD f0, f19, f3, f0
  1444. FNMSUB f1, f19, f2, f1
  1445. FNMSUB f0, f18, f2, f0
  1446. FNMSUB f1, f18, f3, f1
  1447. FMUL f4, f21, f1
  1448. FMUL f5, f21, f0
  1449. FMSUB f0, f20, f0, f4
  1450. FMADD f1, f20, f1, f5
  1451. #else
  1452. FMADD f2, f16, f2, f6
  1453. FMSUB f3, f16, f3, f7
  1454. FMSUB f0, f19, f3, f0
  1455. FNMADD f1, f19, f2, f1
  1456. FNMADD f0, f18, f2, f0
  1457. FNMADD f1, f18, f3, f1
  1458. FMUL f4, f21, f1
  1459. FMUL f5, f21, f0
  1460. FMADD f0, f20, f0, f4
  1461. FMSUB f1, f20, f1, f5
  1462. #endif
  1463. #endif
  /* LT: the forward order - scale (f0,f1) by AO[0..1], subtract its product    */
  /* with AO[2..3] from (f2,f3), then scale (f2,f3) by AO[6..7].                */
  1464. #ifdef LT
  1465. LFD f16, 0 * SIZE(AO)
  1466. LFD f17, 1 * SIZE(AO)
  1467. LFD f18, 2 * SIZE(AO)
  1468. LFD f19, 3 * SIZE(AO)
  1469. LFD f20, 6 * SIZE(AO)
  1470. LFD f21, 7 * SIZE(AO)
  1471. FMUL f4, f17, f1
  1472. FMUL f5, f17, f0
  1473. #ifndef CONJ
  1474. FMSUB f0, f16, f0, f4
  1475. FMADD f1, f16, f1, f5
  1476. FMADD f2, f19, f1, f2
  1477. FNMSUB f3, f19, f0, f3
  1478. FNMSUB f2, f18, f0, f2
  1479. FNMSUB f3, f18, f1, f3
  1480. FMUL f4, f21, f3
  1481. FMUL f5, f21, f2
  1482. FMSUB f2, f20, f2, f4
  1483. FMADD f3, f20, f3, f5
  1484. #else
  1485. FMADD f0, f16, f0, f4
  1486. FMSUB f1, f16, f1, f5
  1487. FMSUB f2, f19, f1, f2
  1488. FNMADD f3, f19, f0, f3
  1489. FNMADD f2, f18, f0, f2
  1490. FNMADD f3, f18, f1, f3
  1491. FMUL f4, f21, f3
  1492. FMUL f5, f21, f2
  1493. FMADD f2, f20, f2, f4
  1494. FMSUB f3, f20, f3, f5
  1495. #endif
  1496. #endif
  /* RN: scale both (f0,f1) and (f2,f3) by the single pair at BO[0..1]. */
  1497. #ifdef RN
  1498. LFD f16, 0 * SIZE(BO)
  1499. LFD f17, 1 * SIZE(BO)
  1500. FMUL f4, f17, f1
  1501. FMUL f5, f17, f0
  1502. FMUL f6, f17, f3
  1503. FMUL f7, f17, f2
  1504. #ifndef CONJ
  1505. FMSUB f0, f16, f0, f4
  1506. FMADD f1, f16, f1, f5
  1507. FMSUB f2, f16, f2, f6
  1508. FMADD f3, f16, f3, f7
  1509. #else
  1510. FMADD f0, f16, f0, f4
  1511. FMSUB f1, f16, f1, f5
  1512. FMADD f2, f16, f2, f6
  1513. FMSUB f3, f16, f3, f7
  1514. #endif
  1515. #endif
  /* RT: same scaling as RN, with the pair loaded into f20/f21. */
  1516. #ifdef RT
  1517. LFD f20, 0 * SIZE(BO)
  1518. LFD f21, 1 * SIZE(BO)
  1519. FMUL f4, f21, f1
  1520. FMUL f5, f21, f0
  1521. FMUL f6, f21, f3
  1522. FMUL f7, f21, f2
  1523. #ifndef CONJ
  1524. FMSUB f0, f20, f0, f4
  1525. FMADD f1, f20, f1, f5
  1526. FMSUB f2, f20, f2, f6
  1527. FMADD f3, f20, f3, f7
  1528. #else
  1529. FMADD f0, f20, f0, f4
  1530. FMSUB f1, f20, f1, f5
  1531. FMADD f2, f20, f2, f6
  1532. FMSUB f3, f20, f3, f7
  1533. #endif
  1534. #endif
  /* LN steps CO1 back before storing; the other variants step forward after    */
  /* the store (see the ifndef LN block below).                                 */
  1535. #ifdef LN
  1536. subi CO1, CO1, 4 * SIZE
  1537. #endif
  /* Write the results back to where the stored values were loaded from ...     */
  1538. #if defined(LN) || defined(LT)
  1539. STFD f0, 0 * SIZE(BO)
  1540. STFD f1, 1 * SIZE(BO)
  1541. STFD f2, 2 * SIZE(BO)
  1542. STFD f3, 3 * SIZE(BO)
  1543. #else
  1544. STFD f0, 0 * SIZE(AO)
  1545. STFD f1, 1 * SIZE(AO)
  1546. STFD f2, 2 * SIZE(AO)
  1547. STFD f3, 3 * SIZE(AO)
  1548. #endif
  /* ... and to four consecutive output slots at CO1.                           */
  1549. STFD f0, 0 * SIZE(CO1)
  1550. STFD f1, 1 * SIZE(CO1)
  1551. STFD f2, 2 * SIZE(CO1)
  1552. STFD f3, 3 * SIZE(CO1)
  1553. #ifndef LN
  1554. addi CO1, CO1, 4 * SIZE
  1555. #endif
  /* RT: advance AORIG by K << (1 + ZBASE_SHIFT) bytes. */
  1556. #ifdef RT
  1557. slwi r0, K, 1 + ZBASE_SHIFT
  1558. add AORIG, AORIG, r0
  1559. #endif
  /* LT/RN: skip the remaining K - KK steps in both AO and BO. */
  1560. #if defined(LT) || defined(RN)
  1561. sub TEMP, K, KK
  1562. slwi r0, TEMP, 1 + ZBASE_SHIFT
  1563. slwi TEMP, TEMP, 0 + ZBASE_SHIFT
  1564. add AO, AO, r0
  1565. add BO, BO, TEMP
  1566. #endif
  /* KK moves by two for this tile: up for LT, down for LN. */
  1567. #ifdef LT
  1568. addi KK, KK, 2
  1569. #endif
  1570. #ifdef LN
  1571. subi KK, KK, 2
  1572. #endif
  /* Next I iteration while the decremented I is still greater than zero. */
  1573. addic. I, I, -1
  1574. bgt .L31
  /* .L40 - M & 1 remainder within the N & 1 case. Skipped (to .L49) when the   */
  /* low bit of M is clear. Otherwise preloads four values each from AO and     */
  /* B/BO, clears f0-f7 and loads CTR with the unrolled trip count (count >> 2).*/
  1575. .align 4
  1576. .L40:
  1577. andi. I, M, 1
  1578. ble .L49
  /* LT/RN: read directly from AO and B; the count is KK. */
  1579. #if defined(LT) || defined(RN)
  1580. LFD f16, 0 * SIZE(AO)
  1581. LFD f17, 1 * SIZE(AO)
  1582. LFD f18, 2 * SIZE(AO)
  1583. LFD f19, 3 * SIZE(AO)
  1584. LFD f20, 0 * SIZE(B)
  1585. LFD f21, 1 * SIZE(B)
  1586. LFD f22, 2 * SIZE(B)
  1587. LFD f23, 3 * SIZE(B)
  /* Zero f0 from the FZERO slot, then copy it into f1-f7. */
  1588. lfs f0, FZERO
  1589. fmr f1, f0
  1590. fmr f2, f0
  1591. fmr f3, f0
  1592. fmr f4, f0
  1593. fmr f5, f0
  1594. fmr f6, f0
  1595. fmr f7, f0
  1596. srawi. r0, KK, 2
  1597. mr BO, B
  1598. mtspr CTR, r0
  1599. #else
  /* LN/RT: LN first steps AORIG back by K << ZBASE_SHIFT bytes. AO and BO then */
  /* start at the same byte offset (KK << ZBASE_SHIFT) into AORIG and B, and    */
  /* the count is K - KK.                                                       */
  1600. #ifdef LN
  1601. slwi r0, K, 0 + ZBASE_SHIFT
  1602. sub AORIG, AORIG, r0
  1603. #endif
  1604. slwi r0, KK, 0 + ZBASE_SHIFT
  1605. add AO, AORIG, r0
  1606. add BO, B, r0
  1607. sub TEMP, K, KK
  1608. LFD f16, 0 * SIZE(AO)
  1609. LFD f17, 1 * SIZE(AO)
  1610. LFD f18, 2 * SIZE(AO)
  1611. LFD f19, 3 * SIZE(AO)
  1612. LFD f20, 0 * SIZE(BO)
  1613. LFD f21, 1 * SIZE(BO)
  1614. LFD f22, 2 * SIZE(BO)
  1615. LFD f23, 3 * SIZE(BO)
  1616. lfs f0, FZERO
  1617. fmr f1, f0
  1618. fmr f2, f0
  1619. fmr f3, f0
  1620. fmr f4, f0
  1621. fmr f5, f0
  1622. fmr f6, f0
  1623. fmr f7, f0
  1624. srawi. r0, TEMP, 2
  1625. mtspr CTR, r0
  1626. #endif
  /* CR0 still holds the srawi. result: skip the unrolled loop when it is <= 0. */
  1627. ble .L45
  /* .L42 - unrolled multiply-accumulate loop for the tile set up at .L40.      */
  /* Each k-step forms the four cross products of two AO values with two BO     */
  /* values. Steps alternate between two accumulator sets: f0-f3 (using         */
  /* f16/f17 and f20/f21) and f4-f7 (using f18/f19 and f22/f23). One pass       */
  /* covers four k-steps, and AO and BO both advance by 8 * SIZE.               */
  1628. .align 4
  1629. .L42:
  /* step 1 into f0-f3 */
  1630. fmadd f0, f16, f20, f0
  1631. LFD f23, 3 * SIZE(BO)
  1632. fmadd f3, f16, f21, f3
  1633. LFD f16, 4 * SIZE(AO)
  1634. fmadd f2, f17, f20, f2
  1635. LFD f20, 4 * SIZE(BO)
  1636. fmadd f1, f17, f21, f1
  1637. LFD f17, 5 * SIZE(AO)
  /* step 2 into f4-f7 */
  1638. fmadd f4, f18, f22, f4
  1639. LFD f21, 5 * SIZE(BO)
  1640. fmadd f7, f18, f23, f7
  1641. LFD f18, 6 * SIZE(AO)
  1642. fmadd f6, f19, f22, f6
  1643. LFD f22, 6 * SIZE(BO)
  1644. fmadd f5, f19, f23, f5
  1645. LFD f19, 7 * SIZE(AO)
  /* step 3 into f0-f3; the LFDU loads move AO and BO to the next group */
  1646. fmadd f0, f16, f20, f0
  1647. LFD f23, 7 * SIZE(BO)
  1648. fmadd f3, f16, f21, f3
  1649. LFDU f16, 8 * SIZE(AO)
  1650. fmadd f2, f17, f20, f2
  1651. LFDU f20, 8 * SIZE(BO)
  1652. fmadd f1, f17, f21, f1
  1653. LFD f17, 1 * SIZE(AO)
  /* step 4 into f4-f7; also preloads for the next pass */
  1654. fmadd f4, f18, f22, f4
  1655. LFD f21, 1 * SIZE(BO)
  1656. fmadd f7, f18, f23, f7
  1657. LFD f18, 2 * SIZE(AO)
  1658. fmadd f6, f19, f22, f6
  1659. LFD f22, 2 * SIZE(BO)
  1660. fmadd f5, f19, f23, f5
  1661. LFD f19, 3 * SIZE(AO)
  1662. bdnz .L42
  /* .L45 - merge the second accumulator set (f4-f7) into the first (f0-f3),    */
  /* then run the k remainder, CTR = (KK or TEMP) & 3, one step per pass.       */
  1663. .align 4
  1664. .L45:
  1665. fadd f0, f0, f4
  1666. fadd f1, f1, f5
  1667. fadd f2, f2, f6
  1668. fadd f3, f3, f7
  1669. #if defined(LT) || defined(RN)
  1670. andi. r0, KK, 3
  1671. #else
  1672. andi. r0, TEMP, 3
  1673. #endif
  1674. mtspr CTR,r0
  /* andi. set CR0 above: skip the loop when the remainder count is 0. */
  1675. ble .L47
  1676. .align 4
  /* .L46 - one k-step per pass into f0-f3; AO and BO advance by 2 * SIZE. */
  1677. .L46:
  1678. fmadd f0, f16, f20, f0
  /* f21 is read before BO is advanced by the LFDU below */
  1679. LFD f21, 1 * SIZE(BO)
  1680. fmadd f3, f16, f21, f3
  1681. LFDU f16, 2 * SIZE(AO)
  1682. fmadd f2, f17, f20, f2
  1683. LFDU f20, 2 * SIZE(BO)
  1684. fmadd f1, f17, f21, f1
  1685. LFD f17, 1 * SIZE(AO)
  1686. bdnz .L46
  /* .L47 - finish the tile accumulated by .L42/.L46: fold f0-f3 into one       */
  /* (f0,f1) pair, subtract it from the stored pair, scale by the pair selected */
  /* by LN/LT/RN/RT, then write back and advance the pointers. The scaling      */
  /* reads as a complex product, assuming the upper-case FMADD/FMSUB macros map */
  /* to the like-named PowerPC instructions - TODO confirm.                     */
  1687. .align 4
  1688. .L47:
  /* Fold the four cross products; the signs and operand order depend on CONJ. */
  1689. #ifndef CONJ
  1690. FSUB f0, f0, f1
  1691. FADD f1, f2, f3
  1692. #else
  1693. FADD f0, f0, f1
  1694. FSUB f1, f3, f2
  1695. #endif
  /* LN/RT: re-point AO and BO at the same offset, (KK - 1) << ZBASE_SHIFT,     */
  /* from AORIG and B.                                                          */
  1696. #if defined(LN) || defined(RT)
  1697. subi r0, KK, 1
  1698. slwi r0, r0, 0 + ZBASE_SHIFT
  1699. add AO, AORIG, r0
  1700. add BO, B, r0
  1701. #endif
  /* Load the stored pair (from BO for LN/LT, from AO otherwise) and combine it */
  /* with the accumulators. In the RN/RT path under CONJ, f1 is added rather    */
  /* than subtracted, matching the operand order used in the fold above.        */
  1702. #if defined(LN) || defined(LT)
  1703. LFD f16, 0 * SIZE(BO)
  1704. LFD f17, 1 * SIZE(BO)
  1705. FSUB f0, f16, f0
  1706. FSUB f1, f17, f1
  1707. #else
  1708. LFD f16, 0 * SIZE(AO)
  1709. LFD f17, 1 * SIZE(AO)
  1710. #ifndef CONJ
  1711. FSUB f0, f16, f0
  1712. FSUB f1, f17, f1
  1713. #else
  1714. FSUB f0, f16, f0
  1715. FADD f1, f17, f1
  1716. #endif
  1717. #endif
  /* Each variant below scales (f0,f1) by one pair: AO[0..1] for LN/LT and      */
  /* BO[0..1] for RN/RT. The pair is presumably a pre-inverted diagonal entry,  */
  /* since no divide is issued - TODO confirm.                                  */
  1718. #ifdef LN
  1719. LFD f20, 0 * SIZE(AO)
  1720. LFD f21, 1 * SIZE(AO)
  1721. FMUL f4, f21, f1
  1722. FMUL f5, f21, f0
  1723. #ifndef CONJ
  1724. FMSUB f0, f20, f0, f4
  1725. FMADD f1, f20, f1, f5
  1726. #else
  1727. FMADD f0, f20, f0, f4
  1728. FMSUB f1, f20, f1, f5
  1729. #endif
  1730. #endif
  1731. #ifdef LT
  1732. LFD f16, 0 * SIZE(AO)
  1733. LFD f17, 1 * SIZE(AO)
  1734. FMUL f4, f17, f1
  1735. FMUL f5, f17, f0
  1736. #ifndef CONJ
  1737. FMSUB f0, f16, f0, f4
  1738. FMADD f1, f16, f1, f5
  1739. #else
  1740. FMADD f0, f16, f0, f4
  1741. FMSUB f1, f16, f1, f5
  1742. #endif
  1743. #endif
  1744. #ifdef RN
  1745. LFD f16, 0 * SIZE(BO)
  1746. LFD f17, 1 * SIZE(BO)
  1747. FMUL f4, f17, f1
  1748. FMUL f5, f17, f0
  1749. #ifndef CONJ
  1750. FMSUB f0, f16, f0, f4
  1751. FMADD f1, f16, f1, f5
  1752. #else
  1753. FMADD f0, f16, f0, f4
  1754. FMSUB f1, f16, f1, f5
  1755. #endif
  1756. #endif
  1757. #ifdef RT
  1758. LFD f20, 0 * SIZE(BO)
  1759. LFD f21, 1 * SIZE(BO)
  1760. FMUL f4, f21, f1
  1761. FMUL f5, f21, f0
  1762. #ifndef CONJ
  1763. FMSUB f0, f20, f0, f4
  1764. FMADD f1, f20, f1, f5
  1765. #else
  1766. FMADD f0, f20, f0, f4
  1767. FMSUB f1, f20, f1, f5
  1768. #endif
  1769. #endif
  /* LN steps CO1 back before storing; the other variants step forward after. */
  1770. #ifdef LN
  1771. subi CO1, CO1, 2 * SIZE
  1772. #endif
  /* Write the result back to where the stored pair was loaded from, and to CO1. */
  1773. #if defined(LN) || defined(LT)
  1774. STFD f0, 0 * SIZE(BO)
  1775. STFD f1, 1 * SIZE(BO)
  1776. #else
  1777. STFD f0, 0 * SIZE(AO)
  1778. STFD f1, 1 * SIZE(AO)
  1779. #endif
  1780. STFD f0, 0 * SIZE(CO1)
  1781. STFD f1, 1 * SIZE(CO1)
  1782. #ifndef LN
  1783. addi CO1, CO1, 2 * SIZE
  1784. #endif
  /* RT: advance AORIG by K << ZBASE_SHIFT bytes. */
  1785. #ifdef RT
  1786. slwi r0, K, 0 + ZBASE_SHIFT
  1787. add AORIG, AORIG, r0
  1788. #endif
  /* LT/RN: skip the remaining K - KK steps in both AO and BO. */
  1789. #if defined(LT) || defined(RN)
  1790. sub TEMP, K, KK
  1791. slwi TEMP, TEMP, 0 + ZBASE_SHIFT
  1792. add AO, AO, TEMP
  1793. add BO, BO, TEMP
  1794. #endif
  /* KK moves by one for this tile: up for LT, down for LN. */
  1795. #ifdef LT
  1796. addi KK, KK, 1
  1797. #endif
  1798. #ifdef LN
  1799. subi KK, KK, 1
  1800. #endif
  /* .L49 - end of the N & 1 pass: final B and KK updates for the build variant */
  /* before falling through to the exit at .L999.                               */
  1801. .align 4
  1802. .L49:
  /* LN: advance B by K << ZBASE_SHIFT bytes. */
  1803. #ifdef LN
  1804. slwi r0, K, 0 + ZBASE_SHIFT
  1805. add B, B, r0
  1806. #endif
  /* LT/RN: BO already points past the data consumed, so adopt it as B. */
  1807. #if defined(LT) || defined(RN)
  1808. mr B, BO
  1809. #endif
  /* KK moves by one: up for RN, down for RT. */
  1810. #ifdef RN
  1811. addi KK, KK, 1
  1812. #endif
  1813. #ifdef RT
  1814. subi KK, KK, 1
  1815. #endif
  /* .L999 - common exit. Sets r3 to 0, reloads f14-f31 and r21-r31 from the    */
  /* stack frame, releases the frame and returns. The matching saves are        */
  /* presumably in the prologue earlier in the file - not visible here.         */
  1816. .align 4
  1817. .L999:
  /* With rA = 0, addi uses the literal value 0, so this is r3 = 0. */
  1818. addi r3, 0, 0
  /* Floating-point registers f14-f31: 8 bytes each at SP + 0 .. SP + 136. */
  1819. lfd f14, 0(SP)
  1820. lfd f15, 8(SP)
  1821. lfd f16, 16(SP)
  1822. lfd f17, 24(SP)
  1823. lfd f18, 32(SP)
  1824. lfd f19, 40(SP)
  1825. lfd f20, 48(SP)
  1826. lfd f21, 56(SP)
  1827. lfd f22, 64(SP)
  1828. lfd f23, 72(SP)
  1829. lfd f24, 80(SP)
  1830. lfd f25, 88(SP)
  1831. lfd f26, 96(SP)
  1832. lfd f27, 104(SP)
  1833. lfd f28, 112(SP)
  1834. lfd f29, 120(SP)
  1835. lfd f30, 128(SP)
  1836. lfd f31, 136(SP)
  /* General registers r31 down to r21, starting at SP + 144: 8-byte slots in   */
  /* the 64-bit build, 4-byte slots otherwise.                                  */
  1837. #ifdef __64BIT__
  1838. ld r31, 144(SP)
  1839. ld r30, 152(SP)
  1840. ld r29, 160(SP)
  1841. ld r28, 168(SP)
  1842. ld r27, 176(SP)
  1843. ld r26, 184(SP)
  1844. ld r25, 192(SP)
  1845. ld r24, 200(SP)
  1846. ld r23, 208(SP)
  1847. ld r22, 216(SP)
  1848. ld r21, 224(SP)
  1849. #else
  1850. lwz r31, 144(SP)
  1851. lwz r30, 148(SP)
  1852. lwz r29, 152(SP)
  1853. lwz r28, 156(SP)
  1854. lwz r27, 160(SP)
  1855. lwz r26, 164(SP)
  1856. lwz r25, 168(SP)
  1857. lwz r24, 172(SP)
  1858. lwz r23, 176(SP)
  1859. lwz r22, 180(SP)
  1860. lwz r21, 184(SP)
  1861. #endif
  /* Release the STACKSIZE-byte frame and return to the caller. */
  1862. addi SP, SP, STACKSIZE
  1863. blr
  /* EPILOGUE is a macro defined outside this file section. */
  1864. EPILOGUE