You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

ztrsm_kernel_cell_RT.S 39 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* ASSEMBLER is defined before common.h is included; presumably the   */
  /* header uses it to select its assembler-side definitions (confirm   */
  /* in common.h).                                                      */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* LOAD is the integer load mnemonic matching the build's word size:  */
  /* lwz (32-bit word) when __64BIT__ is not defined, ld (64-bit        */
  /* doubleword) when it is.                                            */
  40. #ifndef __64BIT__
  41. #define LOAD lwz
  42. #else
  43. #define LOAD ld
  44. #endif
  /* Stack frame layout.                                                */
  /* STACKSIZE is the byte count the prologue subtracts from SP.        */
  /* The prologue saves f14-f31 at 0..136(SP) and r31..r21 starting at  */
  /* 144(SP); ALPHA_R, ALPHA_I and FZERO are SP-relative slots placed   */
  /* above that save area (offsets differ between 64-bit and 32-bit).   */
  /* FZERO receives a zero word in the prologue (stw r0 with r0 = 0)    */
  /* and is read back with lfs to clear the accumulator registers.      */
  /* ALPHA_R / ALPHA_I are not referenced in the portion of the file    */
  /* reviewed here.                                                     */
  45. #ifdef __64BIT__
  46. #define STACKSIZE 320
  47. #define ALPHA_R 296(SP)
  48. #define ALPHA_I 304(SP)
  49. #define FZERO 312(SP)
  50. #else
  51. #define STACKSIZE 256
  52. #define ALPHA_R 224(SP)
  53. #define ALPHA_I 232(SP)
  54. #define FZERO 240(SP)
  55. #endif
  /* Argument register names.                                           */
  /* M, N and K are used directly from r3, r4 and r5 on every platform. */
  56. #define M r3
  57. #define N r4
  58. #define K r5
  /* The homes of A, B, C, LDC and OFFSET depend on OS and word size.   */
  /* Where the prologue loads a value from FRAMESLOT(n) + STACKSIZE(SP) */
  /* the register named here is only the destination of that load, not  */
  /* an incoming argument register.                                     */
  /*                                                                    */
  /* linux / FreeBSD:                                                   */
  /*   32-bit: all five are taken from r6-r10 with no stack loads.      */
  /*   64-bit: A, B, C are in r8-r10; LDC and OFFSET are loaded from    */
  /*           frame slots 0 and 1 into r6 and r7.                      */
  59. #if defined(linux) || defined(__FreeBSD__)
  60. #ifndef __64BIT__
  61. #define A r6
  62. #define B r7
  63. #define C r8
  64. #define LDC r9
  65. #define OFFSET r10
  66. #else
  67. #define A r8
  68. #define B r9
  69. #define C r10
  70. #define LDC r6
  71. #define OFFSET r7
  72. #endif
  73. #endif
  /* AIX / Apple:                                                       */
  /*   32-bit with DOUBLE: only A is in a register (r10); B, C, LDC and */
  /*           OFFSET are loaded from frame slots 0-3 into r6-r9.       */
  /*   otherwise: A, B, C are in r8-r10; LDC and OFFSET are loaded from */
  /*           frame slots 0 and 1 into r6 and r7.                      */
  74. #if defined(_AIX) || defined(__APPLE__)
  75. #if !defined(__64BIT__) && defined(DOUBLE)
  76. #define A r10
  77. #define B r6
  78. #define C r7
  79. #define LDC r8
  80. #define OFFSET r9
  81. #else
  82. #define A r8
  83. #define B r9
  84. #define C r10
  85. #define LDC r6
  86. #define OFFSET r7
  87. #endif
  88. #endif
  /* Working registers. All of r21-r31 are saved to the stack frame by  */
  /* the prologue before being used.                                    */
  /*   AORIG  base of the A panel in the LN/RT paths; AO is recomputed  */
  /*          from it as AORIG + a KK-scaled offset                     */
  /*   TEMP   scratch for offsets and trip counts                       */
  /*   KK     running offset counter, initialised from OFFSET (with M   */
  /*          or N, or negated, depending on the LN/LT/RN/RT variant)   */
  /*   I      loop counter derived from M                               */
  /*   J      loop counter derived from N                               */
  /*   AO/BO  running pointers into A and B, advanced inside the loops  */
  /*   CO1    pointer into C where results are stored                   */
  /*   CO2    second C pointer, set to C + LDC in the two-column path   */
  /*   PREA   index operand for dcbt against AO/BO (16 * 12 * SIZE)     */
  /*   PREC   index operand for dcbt against CO1/CO2 (3 * SIZE)         */
  /*   PREB   alias of PREA, so A and B share one prefetch distance     */
  89. #define AORIG r21
  90. #define TEMP r22
  91. #define KK r23
  92. #define I r24
  93. #define J r25
  94. #define AO r26
  95. #define BO r27
  96. #define CO1 r28
  97. #define CO2 r29
  98. #define PREA r30
  99. #define PREC r31
  100. #define PREB PREA
  101. #ifndef NEEDPARAM
  102. #ifndef DOUBLE
  103. #include "cparam.h"
  104. #else
  105. #include "zparam.h"
  106. #endif
  107. PROLOGUE
  108. PROFCODE
  109. addi SP, SP, -STACKSIZE
  110. li r0, 0
  111. stfd f14, 0(SP)
  112. stfd f15, 8(SP)
  113. stfd f16, 16(SP)
  114. stfd f17, 24(SP)
  115. stfd f18, 32(SP)
  116. stfd f19, 40(SP)
  117. stfd f20, 48(SP)
  118. stfd f21, 56(SP)
  119. stfd f22, 64(SP)
  120. stfd f23, 72(SP)
  121. stfd f24, 80(SP)
  122. stfd f25, 88(SP)
  123. stfd f26, 96(SP)
  124. stfd f27, 104(SP)
  125. stfd f28, 112(SP)
  126. stfd f29, 120(SP)
  127. stfd f30, 128(SP)
  128. stfd f31, 136(SP)
  129. #ifdef __64BIT__
  130. std r31, 144(SP)
  131. std r30, 152(SP)
  132. std r29, 160(SP)
  133. std r28, 168(SP)
  134. std r27, 176(SP)
  135. std r26, 184(SP)
  136. std r25, 192(SP)
  137. std r24, 200(SP)
  138. std r23, 208(SP)
  139. std r22, 216(SP)
  140. std r21, 224(SP)
  141. #else
  142. stw r31, 144(SP)
  143. stw r30, 148(SP)
  144. stw r29, 152(SP)
  145. stw r28, 156(SP)
  146. stw r27, 160(SP)
  147. stw r26, 164(SP)
  148. stw r25, 168(SP)
  149. stw r24, 172(SP)
  150. stw r23, 176(SP)
  151. stw r22, 180(SP)
  152. stw r21, 184(SP)
  153. #endif
  154. stw r0, FZERO
  155. #if defined(linux) || defined(__FreeBSD__)
  156. #ifdef __64BIT__
  157. ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
  158. #endif
  159. #endif
  160. #if defined(_AIX) || defined(__APPLE__)
  161. #ifdef __64BIT__
  162. ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
  163. #else
  164. #ifdef DOUBLE
  165. lwz B, FRAMESLOT(0) + STACKSIZE(SP)
  166. lwz C, FRAMESLOT(1) + STACKSIZE(SP)
  167. lwz LDC, FRAMESLOT(2) + STACKSIZE(SP)
  168. #else
  169. lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
  170. #endif
  171. #endif
  172. #endif
  173. #if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
  174. ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
  175. #endif
  176. #if defined(_AIX) || defined(__APPLE__)
  177. #ifdef __64BIT__
  178. ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
  179. #else
  180. #ifdef DOUBLE
  181. lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP)
  182. #else
  183. lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
  184. #endif
  185. #endif
  186. #endif
  187. slwi LDC, LDC, ZBASE_SHIFT
  188. #ifdef LN
  189. mullw r0, M, K
  190. slwi r0, r0, ZBASE_SHIFT
  191. add A, A, r0
  192. slwi r0, M, ZBASE_SHIFT
  193. add C, C, r0
  194. #endif
  195. #ifdef RN
  196. neg KK, OFFSET
  197. #endif
  198. #ifdef RT
  199. mullw r0, N, K
  200. slwi r0, r0, ZBASE_SHIFT
  201. add B, B, r0
  202. mullw r0, N, LDC
  203. add C, C, r0
  204. sub KK, N, OFFSET
  205. #endif
  206. cmpwi cr0, M, 0
  207. ble LL(999)
  208. cmpwi cr0, N, 0
  209. ble LL(999)
  210. cmpwi cr0, K, 0
  211. ble LL(999)
  212. li PREC, 3 * SIZE
  213. li PREA, 16 * 12 * SIZE
  214. andi. J, N, 1
  215. ble LL(30)
  216. #ifdef RT
  217. slwi r0, K, 0 + ZBASE_SHIFT
  218. sub B, B, r0
  219. sub C, C, LDC
  220. #endif
  221. mr CO1, C
  222. #ifdef LN
  223. add KK, M, OFFSET
  224. #endif
  225. #ifdef LT
  226. mr KK, OFFSET
  227. #endif
  228. srawi. I, M, 1
  229. #if defined(LN) || defined(RT)
  230. mr AORIG, A
  231. #else
  232. mr AO, A
  233. #endif
  234. #ifndef RT
  235. add C, C, LDC
  236. #endif
  237. ble LL(40)
  238. .align 4
  239. LL(31):
  240. #if defined(LT) || defined(RN)
  241. LFD f20, 0 * SIZE(AO)
  242. LFD f21, 1 * SIZE(AO)
  243. LFD f22, 2 * SIZE(AO)
  244. LFD f23, 3 * SIZE(AO)
  245. LFD f24, 4 * SIZE(AO)
  246. LFD f25, 5 * SIZE(AO)
  247. LFD f26, 6 * SIZE(AO)
  248. LFD f27, 7 * SIZE(AO)
  249. LFD f16, 0 * SIZE(B)
  250. LFD f17, 1 * SIZE(B)
  251. LFD f18, 2 * SIZE(B)
  252. LFD f19, 3 * SIZE(B)
  253. lfs f0, FZERO
  254. fmr f1, f0
  255. fmr f2, f0
  256. fmr f3, f0
  257. fmr f4, f0
  258. fmr f5, f0
  259. fmr f6, f0
  260. fmr f7, f0
  261. dcbt CO1, PREC
  262. srawi. r0, KK, 2
  263. mr BO, B
  264. mtspr CTR, r0
  265. #else
  266. #ifdef LN
  267. slwi r0, K, 1 + ZBASE_SHIFT
  268. sub AORIG, AORIG, r0
  269. #endif
  270. slwi r0, KK, 1 + ZBASE_SHIFT
  271. slwi TEMP, KK, 0 + ZBASE_SHIFT
  272. add AO, AORIG, r0
  273. add BO, B, TEMP
  274. sub TEMP, K, KK
  275. LFD f20, 0 * SIZE(AO)
  276. LFD f21, 1 * SIZE(AO)
  277. LFD f22, 2 * SIZE(AO)
  278. LFD f23, 3 * SIZE(AO)
  279. LFD f24, 4 * SIZE(AO)
  280. LFD f25, 5 * SIZE(AO)
  281. LFD f26, 6 * SIZE(AO)
  282. LFD f27, 7 * SIZE(AO)
  283. LFD f16, 0 * SIZE(BO)
  284. LFD f17, 1 * SIZE(BO)
  285. LFD f18, 2 * SIZE(BO)
  286. LFD f19, 3 * SIZE(BO)
  287. lfs f0, FZERO
  288. fmr f1, f0
  289. fmr f2, f0
  290. fmr f3, f0
  291. fmr f4, f0
  292. fmr f5, f0
  293. fmr f6, f0
  294. fmr f7, f0
  295. srawi. r0, TEMP, 2
  296. mtspr CTR, r0
  297. #endif
  298. ble LL(35)
  299. .align 4
  300. LL(32):
  301. fmadd f0, f16, f20, f0
  302. fmadd f1, f16, f21, f1
  303. fmadd f2, f16, f22, f2
  304. fmadd f3, f16, f23, f3
  305. fmadd f4, f17, f20, f4
  306. fmadd f5, f17, f21, f5
  307. fmadd f6, f17, f22, f6
  308. fmadd f7, f17, f23, f7
  309. LFD f20, 8 * SIZE(AO)
  310. LFD f21, 9 * SIZE(AO)
  311. LFD f22, 10 * SIZE(AO)
  312. LFD f23, 11 * SIZE(AO)
  313. fmadd f0, f18, f24, f0
  314. fmadd f1, f18, f25, f1
  315. fmadd f2, f18, f26, f2
  316. fmadd f3, f18, f27, f3
  317. fmadd f4, f19, f24, f4
  318. fmadd f5, f19, f25, f5
  319. fmadd f6, f19, f26, f6
  320. fmadd f7, f19, f27, f7
  321. LFD f24, 12 * SIZE(AO)
  322. LFD f25, 13 * SIZE(AO)
  323. LFD f26, 14 * SIZE(AO)
  324. LFD f27, 15 * SIZE(AO)
  325. LFD f16, 4 * SIZE(BO)
  326. LFD f17, 5 * SIZE(BO)
  327. LFD f18, 6 * SIZE(BO)
  328. LFD f19, 7 * SIZE(BO)
  329. fmadd f0, f16, f20, f0
  330. fmadd f1, f16, f21, f1
  331. fmadd f2, f16, f22, f2
  332. fmadd f3, f16, f23, f3
  333. fmadd f4, f17, f20, f4
  334. fmadd f5, f17, f21, f5
  335. fmadd f6, f17, f22, f6
  336. fmadd f7, f17, f23, f7
  337. LFD f20, 16 * SIZE(AO)
  338. LFD f21, 17 * SIZE(AO)
  339. LFD f22, 18 * SIZE(AO)
  340. LFD f23, 19 * SIZE(AO)
  341. fmadd f0, f18, f24, f0
  342. fmadd f1, f18, f25, f1
  343. fmadd f2, f18, f26, f2
  344. fmadd f3, f18, f27, f3
  345. fmadd f4, f19, f24, f4
  346. fmadd f5, f19, f25, f5
  347. fmadd f6, f19, f26, f6
  348. fmadd f7, f19, f27, f7
  349. LFD f24, 20 * SIZE(AO)
  350. LFD f25, 21 * SIZE(AO)
  351. LFD f26, 22 * SIZE(AO)
  352. LFD f27, 23 * SIZE(AO)
  353. LFD f16, 8 * SIZE(BO)
  354. LFD f17, 9 * SIZE(BO)
  355. LFD f18, 10 * SIZE(BO)
  356. LFD f19, 11 * SIZE(BO)
  357. addi AO, AO, 16 * SIZE
  358. addi BO, BO, 8 * SIZE
  359. dcbt PREA, AO
  360. dcbt PREA, BO
  361. bdnz LL(32)
  362. .align 4
  363. LL(35):
  364. #if defined(LT) || defined(RN)
  365. andi. r0, KK, 3
  366. #else
  367. andi. r0, TEMP, 3
  368. #endif
  369. mtspr CTR, r0
  370. ble LL(37)
  371. .align 4
  372. LL(36):
  373. fmadd f0, f16, f20, f0
  374. fmadd f1, f16, f21, f1
  375. fmadd f2, f16, f22, f2
  376. fmadd f3, f16, f23, f3
  377. fmadd f4, f17, f20, f4
  378. fmadd f5, f17, f21, f5
  379. fmadd f6, f17, f22, f6
  380. fmadd f7, f17, f23, f7
  381. LFD f20, 4 * SIZE(AO)
  382. LFD f21, 5 * SIZE(AO)
  383. LFD f22, 6 * SIZE(AO)
  384. LFD f23, 7 * SIZE(AO)
  385. LFD f16, 2 * SIZE(BO)
  386. LFD f17, 3 * SIZE(BO)
  387. addi BO, BO, 2 * SIZE
  388. addi AO, AO, 4 * SIZE
  389. bdnz LL(36)
  390. .align 4
  391. LL(37):
  392. #ifndef CONJ
  393. FSUB f0, f0, f5
  394. FADD f1, f1, f4
  395. FSUB f2, f2, f7
  396. FADD f3, f3, f6
  397. #else
  398. FADD f0, f0, f5
  399. FSUB f1, f4, f1
  400. FADD f2, f2, f7
  401. FSUB f3, f6, f3
  402. #endif
  403. #if defined(LN) || defined(RT)
  404. #ifdef LN
  405. subi r0, KK, 2
  406. #else
  407. subi r0, KK, 1
  408. #endif
  409. slwi TEMP, r0, 1 + ZBASE_SHIFT
  410. slwi r0, r0, 0 + ZBASE_SHIFT
  411. add AO, AORIG, TEMP
  412. add BO, B, r0
  413. #endif
  414. #if defined(LN) || defined(LT)
  415. LFD f16, 0 * SIZE(BO)
  416. LFD f17, 1 * SIZE(BO)
  417. LFD f18, 2 * SIZE(BO)
  418. LFD f19, 3 * SIZE(BO)
  419. FSUB f0, f16, f0
  420. FSUB f1, f17, f1
  421. FSUB f2, f18, f2
  422. FSUB f3, f19, f3
  423. #else
  424. LFD f16, 0 * SIZE(AO)
  425. LFD f17, 1 * SIZE(AO)
  426. LFD f18, 2 * SIZE(AO)
  427. LFD f19, 3 * SIZE(AO)
  428. #ifndef CONJ
  429. FSUB f0, f16, f0
  430. FSUB f1, f17, f1
  431. FSUB f2, f18, f2
  432. FSUB f3, f19, f3
  433. #else
  434. FSUB f0, f16, f0
  435. FADD f1, f17, f1
  436. FSUB f2, f18, f2
  437. FADD f3, f19, f3
  438. #endif
  439. #endif
  440. #ifdef LN
  441. LFD f16, 6 * SIZE(AO)
  442. LFD f17, 7 * SIZE(AO)
  443. LFD f18, 4 * SIZE(AO)
  444. LFD f19, 5 * SIZE(AO)
  445. LFD f20, 0 * SIZE(AO)
  446. LFD f21, 1 * SIZE(AO)
  447. FMUL f6, f17, f3
  448. FMUL f7, f17, f2
  449. #ifndef CONJ
  450. FMSUB f2, f16, f2, f6
  451. FMADD f3, f16, f3, f7
  452. FMADD f0, f19, f3, f0
  453. FNMSUB f1, f19, f2, f1
  454. FNMSUB f0, f18, f2, f0
  455. FNMSUB f1, f18, f3, f1
  456. FMUL f4, f21, f1
  457. FMUL f5, f21, f0
  458. FMSUB f0, f20, f0, f4
  459. FMADD f1, f20, f1, f5
  460. #else
  461. FMADD f2, f16, f2, f6
  462. FMSUB f3, f16, f3, f7
  463. FMSUB f0, f19, f3, f0
  464. FNMADD f1, f19, f2, f1
  465. FNMADD f0, f18, f2, f0
  466. FNMADD f1, f18, f3, f1
  467. FMUL f4, f21, f1
  468. FMUL f5, f21, f0
  469. FMADD f0, f20, f0, f4
  470. FMSUB f1, f20, f1, f5
  471. #endif
  472. #endif
  473. #ifdef LT
  474. LFD f16, 0 * SIZE(AO)
  475. LFD f17, 1 * SIZE(AO)
  476. LFD f18, 2 * SIZE(AO)
  477. LFD f19, 3 * SIZE(AO)
  478. LFD f20, 6 * SIZE(AO)
  479. LFD f21, 7 * SIZE(AO)
  480. FMUL f4, f17, f1
  481. FMUL f5, f17, f0
  482. #ifndef CONJ
  483. FMSUB f0, f16, f0, f4
  484. FMADD f1, f16, f1, f5
  485. FMADD f2, f19, f1, f2
  486. FNMSUB f3, f19, f0, f3
  487. FNMSUB f2, f18, f0, f2
  488. FNMSUB f3, f18, f1, f3
  489. FMUL f4, f21, f3
  490. FMUL f5, f21, f2
  491. FMSUB f2, f20, f2, f4
  492. FMADD f3, f20, f3, f5
  493. #else
  494. FMADD f0, f16, f0, f4
  495. FMSUB f1, f16, f1, f5
  496. FMSUB f2, f19, f1, f2
  497. FNMADD f3, f19, f0, f3
  498. FNMADD f2, f18, f0, f2
  499. FNMADD f3, f18, f1, f3
  500. FMUL f4, f21, f3
  501. FMUL f5, f21, f2
  502. FMADD f2, f20, f2, f4
  503. FMSUB f3, f20, f3, f5
  504. #endif
  505. #endif
  506. #ifdef RN
  507. LFD f16, 0 * SIZE(BO)
  508. LFD f17, 1 * SIZE(BO)
  509. FMUL f4, f17, f1
  510. FMUL f5, f17, f0
  511. FMUL f6, f17, f3
  512. FMUL f7, f17, f2
  513. #ifndef CONJ
  514. FMSUB f0, f16, f0, f4
  515. FMADD f1, f16, f1, f5
  516. FMSUB f2, f16, f2, f6
  517. FMADD f3, f16, f3, f7
  518. #else
  519. FMADD f0, f16, f0, f4
  520. FMSUB f1, f16, f1, f5
  521. FMADD f2, f16, f2, f6
  522. FMSUB f3, f16, f3, f7
  523. #endif
  524. #endif
  525. #ifdef RT
  526. LFD f20, 0 * SIZE(BO)
  527. LFD f21, 1 * SIZE(BO)
  528. FMUL f4, f21, f1
  529. FMUL f5, f21, f0
  530. FMUL f6, f21, f3
  531. FMUL f7, f21, f2
  532. #ifndef CONJ
  533. FMSUB f0, f20, f0, f4
  534. FMADD f1, f20, f1, f5
  535. FMSUB f2, f20, f2, f6
  536. FMADD f3, f20, f3, f7
  537. #else
  538. FMADD f0, f20, f0, f4
  539. FMSUB f1, f20, f1, f5
  540. FMADD f2, f20, f2, f6
  541. FMSUB f3, f20, f3, f7
  542. #endif
  543. #endif
  544. #ifdef LN
  545. subi CO1, CO1, 4 * SIZE
  546. #endif
  547. #if defined(LN) || defined(LT)
  548. STFD f0, 0 * SIZE(BO)
  549. STFD f1, 1 * SIZE(BO)
  550. STFD f2, 2 * SIZE(BO)
  551. STFD f3, 3 * SIZE(BO)
  552. #else
  553. STFD f0, 0 * SIZE(AO)
  554. STFD f1, 1 * SIZE(AO)
  555. STFD f2, 2 * SIZE(AO)
  556. STFD f3, 3 * SIZE(AO)
  557. #endif
  558. STFD f0, 0 * SIZE(CO1)
  559. STFD f1, 1 * SIZE(CO1)
  560. STFD f2, 2 * SIZE(CO1)
  561. STFD f3, 3 * SIZE(CO1)
  562. #ifndef LN
  563. addi CO1, CO1, 4 * SIZE
  564. #endif
  565. #ifdef RT
  566. slwi r0, K, 1 + ZBASE_SHIFT
  567. add AORIG, AORIG, r0
  568. #endif
  569. #if defined(LT) || defined(RN)
  570. sub TEMP, K, KK
  571. slwi r0, TEMP, 1 + ZBASE_SHIFT
  572. slwi TEMP, TEMP, 0 + ZBASE_SHIFT
  573. add AO, AO, r0
  574. add BO, BO, TEMP
  575. #endif
  576. #ifdef LT
  577. addi KK, KK, 2
  578. #endif
  579. #ifdef LN
  580. subi KK, KK, 2
  581. #endif
  582. addic. I, I, -1
  583. bgt LL(31)
  584. .align 4
  585. LL(40):
  586. andi. I, M, 1
  587. ble LL(49)
  588. #if defined(LT) || defined(RN)
  589. LFD f16, 0 * SIZE(AO)
  590. LFD f17, 1 * SIZE(AO)
  591. LFD f18, 2 * SIZE(AO)
  592. LFD f19, 3 * SIZE(AO)
  593. LFD f20, 0 * SIZE(B)
  594. LFD f21, 1 * SIZE(B)
  595. LFD f22, 2 * SIZE(B)
  596. LFD f23, 3 * SIZE(B)
  597. lfs f0, FZERO
  598. fmr f1, f0
  599. fmr f2, f0
  600. fmr f3, f0
  601. fmr f4, f0
  602. fmr f5, f0
  603. fmr f6, f0
  604. fmr f7, f0
  605. srawi. r0, KK, 2
  606. mr BO, B
  607. mtspr CTR, r0
  608. #else
  609. #ifdef LN
  610. slwi r0, K, 0 + ZBASE_SHIFT
  611. sub AORIG, AORIG, r0
  612. #endif
  613. slwi r0, KK, 0 + ZBASE_SHIFT
  614. add AO, AORIG, r0
  615. add BO, B, r0
  616. sub TEMP, K, KK
  617. LFD f16, 0 * SIZE(AO)
  618. LFD f17, 1 * SIZE(AO)
  619. LFD f18, 2 * SIZE(AO)
  620. LFD f19, 3 * SIZE(AO)
  621. LFD f20, 0 * SIZE(BO)
  622. LFD f21, 1 * SIZE(BO)
  623. LFD f22, 2 * SIZE(BO)
  624. LFD f23, 3 * SIZE(BO)
  625. lfs f0, FZERO
  626. fmr f1, f0
  627. fmr f2, f0
  628. fmr f3, f0
  629. fmr f4, f0
  630. fmr f5, f0
  631. fmr f6, f0
  632. fmr f7, f0
  633. srawi. r0, TEMP, 2
  634. mtspr CTR, r0
  635. #endif
  636. ble LL(45)
  637. .align 4
  638. LL(42):
  639. fmadd f0, f16, f20, f0
  640. fmadd f1, f17, f21, f1
  641. fmadd f2, f17, f20, f2
  642. fmadd f3, f16, f21, f3
  643. LFD f16, 4 * SIZE(AO)
  644. LFD f17, 5 * SIZE(AO)
  645. LFD f20, 4 * SIZE(BO)
  646. LFD f21, 5 * SIZE(BO)
  647. fmadd f4, f18, f22, f4
  648. fmadd f5, f19, f23, f5
  649. fmadd f6, f19, f22, f6
  650. fmadd f7, f18, f23, f7
  651. LFD f18, 6 * SIZE(AO)
  652. LFD f19, 7 * SIZE(AO)
  653. LFD f22, 6 * SIZE(BO)
  654. LFD f23, 7 * SIZE(BO)
  655. fmadd f0, f16, f20, f0
  656. fmadd f1, f17, f21, f1
  657. fmadd f2, f17, f20, f2
  658. fmadd f3, f16, f21, f3
  659. LFD f16, 8 * SIZE(AO)
  660. LFD f17, 9 * SIZE(AO)
  661. LFD f20, 8 * SIZE(BO)
  662. LFD f21, 9 * SIZE(BO)
  663. fmadd f4, f18, f22, f4
  664. fmadd f5, f19, f23, f5
  665. fmadd f6, f19, f22, f6
  666. fmadd f7, f18, f23, f7
  667. LFD f18, 10 * SIZE(AO)
  668. LFD f19, 11 * SIZE(AO)
  669. LFD f22, 10 * SIZE(BO)
  670. LFD f23, 11 * SIZE(BO)
  671. addi AO, AO, 8 * SIZE
  672. addi BO, BO, 8 * SIZE
  673. bdnz LL(42)
  674. .align 4
  675. LL(45):
  676. fadd f0, f0, f4
  677. fadd f1, f1, f5
  678. fadd f2, f2, f6
  679. fadd f3, f3, f7
  680. #if defined(LT) || defined(RN)
  681. andi. r0, KK, 3
  682. #else
  683. andi. r0, TEMP, 3
  684. #endif
  685. mtspr CTR,r0
  686. ble LL(47)
  687. .align 4
  688. LL(46):
  689. fmadd f0, f16, f20, f0
  690. fmadd f1, f17, f21, f1
  691. fmadd f2, f17, f20, f2
  692. fmadd f3, f16, f21, f3
  693. LFD f16, 2 * SIZE(AO)
  694. LFD f17, 3 * SIZE(AO)
  695. LFD f20, 2 * SIZE(BO)
  696. LFD f21, 3 * SIZE(BO)
  697. addi AO, AO, 2 * SIZE
  698. addi BO, BO, 2 * SIZE
  699. bdnz LL(46)
  700. .align 4
  701. LL(47):
  702. #ifndef CONJ
  703. FSUB f0, f0, f1
  704. FADD f1, f2, f3
  705. #else
  706. FADD f0, f0, f1
  707. FSUB f1, f3, f2
  708. #endif
  709. #if defined(LN) || defined(RT)
  710. subi r0, KK, 1
  711. slwi r0, r0, 0 + ZBASE_SHIFT
  712. add AO, AORIG, r0
  713. add BO, B, r0
  714. #endif
  715. #if defined(LN) || defined(LT)
  716. LFD f16, 0 * SIZE(BO)
  717. LFD f17, 1 * SIZE(BO)
  718. FSUB f0, f16, f0
  719. FSUB f1, f17, f1
  720. #else
  721. LFD f16, 0 * SIZE(AO)
  722. LFD f17, 1 * SIZE(AO)
  723. #ifndef CONJ
  724. FSUB f0, f16, f0
  725. FSUB f1, f17, f1
  726. #else
  727. FSUB f0, f16, f0
  728. FADD f1, f17, f1
  729. #endif
  730. #endif
  731. #ifdef LN
  732. LFD f20, 0 * SIZE(AO)
  733. LFD f21, 1 * SIZE(AO)
  734. FMUL f4, f21, f1
  735. FMUL f5, f21, f0
  736. #ifndef CONJ
  737. FMSUB f0, f20, f0, f4
  738. FMADD f1, f20, f1, f5
  739. #else
  740. FMADD f0, f20, f0, f4
  741. FMSUB f1, f20, f1, f5
  742. #endif
  743. #endif
  744. #ifdef LT
  745. LFD f16, 0 * SIZE(AO)
  746. LFD f17, 1 * SIZE(AO)
  747. FMUL f4, f17, f1
  748. FMUL f5, f17, f0
  749. #ifndef CONJ
  750. FMSUB f0, f16, f0, f4
  751. FMADD f1, f16, f1, f5
  752. #else
  753. FMADD f0, f16, f0, f4
  754. FMSUB f1, f16, f1, f5
  755. #endif
  756. #endif
  757. #ifdef RN
  758. LFD f16, 0 * SIZE(BO)
  759. LFD f17, 1 * SIZE(BO)
  760. FMUL f4, f17, f1
  761. FMUL f5, f17, f0
  762. #ifndef CONJ
  763. FMSUB f0, f16, f0, f4
  764. FMADD f1, f16, f1, f5
  765. #else
  766. FMADD f0, f16, f0, f4
  767. FMSUB f1, f16, f1, f5
  768. #endif
  769. #endif
  770. #ifdef RT
  771. LFD f20, 0 * SIZE(BO)
  772. LFD f21, 1 * SIZE(BO)
  773. FMUL f4, f21, f1
  774. FMUL f5, f21, f0
  775. #ifndef CONJ
  776. FMSUB f0, f20, f0, f4
  777. FMADD f1, f20, f1, f5
  778. #else
  779. FMADD f0, f20, f0, f4
  780. FMSUB f1, f20, f1, f5
  781. #endif
  782. #endif
  783. #ifdef LN
  784. subi CO1, CO1, 2 * SIZE
  785. #endif
  786. #if defined(LN) || defined(LT)
  787. STFD f0, 0 * SIZE(BO)
  788. STFD f1, 1 * SIZE(BO)
  789. #else
  790. STFD f0, 0 * SIZE(AO)
  791. STFD f1, 1 * SIZE(AO)
  792. #endif
  793. STFD f0, 0 * SIZE(CO1)
  794. STFD f1, 1 * SIZE(CO1)
  795. #ifndef LN
  796. addi CO1, CO1, 2 * SIZE
  797. #endif
  798. #ifdef RT
  799. slwi r0, K, 0 + ZBASE_SHIFT
  800. add AORIG, AORIG, r0
  801. #endif
  802. #if defined(LT) || defined(RN)
  803. sub TEMP, K, KK
  804. slwi TEMP, TEMP, 0 + ZBASE_SHIFT
  805. add AO, AO, TEMP
  806. add BO, BO, TEMP
  807. #endif
  808. #ifdef LT
  809. addi KK, KK, 1
  810. #endif
  811. #ifdef LN
  812. subi KK, KK, 1
  813. #endif
  814. .align 4
  815. LL(49):
  816. #ifdef LN
  817. slwi r0, K, 0 + ZBASE_SHIFT
  818. add B, B, r0
  819. #endif
  820. #if defined(LT) || defined(RN)
  821. mr B, BO
  822. #endif
  823. #ifdef RN
  824. addi KK, KK, 1
  825. #endif
  826. #ifdef RT
  827. subi KK, KK, 1
  828. #endif
  829. .align 4
  830. LL(30):
  831. srawi. J, N, 1
  832. ble LL(999)
  833. .align 4
  834. LL(10):
  835. #ifdef RT
  836. slwi r0, K, 1 + ZBASE_SHIFT
  837. sub B, B, r0
  838. slwi r0, LDC, 1
  839. sub C, C, r0
  840. #endif
  841. mr CO1, C
  842. add CO2, C, LDC
  843. #ifdef LN
  844. add KK, M, OFFSET
  845. #endif
  846. #ifdef LT
  847. mr KK, OFFSET
  848. #endif
  849. srawi. I, M, 1
  850. #if defined(LN) || defined(RT)
  851. mr AORIG, A
  852. #else
  853. mr AO, A
  854. #endif
  855. #ifndef RT
  856. add C, CO2, LDC
  857. #endif
  858. ble LL(20)
  859. .align 4
  860. LL(11):
  861. #if defined(LT) || defined(RN)
  862. LFD f16, 0 * SIZE(AO)
  863. LFD f17, 1 * SIZE(AO)
  864. LFD f18, 2 * SIZE(AO)
  865. LFD f19, 3 * SIZE(AO)
  866. LFD f20, 0 * SIZE(B)
  867. LFD f21, 1 * SIZE(B)
  868. LFD f22, 2 * SIZE(B)
  869. LFD f23, 3 * SIZE(B)
  870. lfs f0, FZERO
  871. fmr f1, f0
  872. fmr f2, f0
  873. fmr f3, f0
  874. fmr f4, f0
  875. fmr f5, f0
  876. fmr f6, f0
  877. fmr f7, f0
  878. fmr f8, f0
  879. fmr f9, f0
  880. fmr f10, f0
  881. fmr f11, f0
  882. fmr f12, f0
  883. fmr f13, f0
  884. fmr f14, f0
  885. fmr f15, f0
  886. dcbt CO1, PREC
  887. dcbt CO2, PREC
  888. srawi. r0, KK, 2
  889. mtspr CTR, r0
  890. mr BO, B
  891. #else
  892. #ifdef LN
  893. slwi r0, K, 1 + ZBASE_SHIFT
  894. sub AORIG, AORIG, r0
  895. #endif
  896. slwi TEMP, KK, 1 + ZBASE_SHIFT
  897. add AO, AORIG, TEMP
  898. add BO, B, TEMP
  899. sub TEMP, K, KK
  900. LFD f16, 0 * SIZE(AO)
  901. LFD f17, 1 * SIZE(AO)
  902. LFD f18, 2 * SIZE(AO)
  903. LFD f19, 3 * SIZE(AO)
  904. LFD f20, 0 * SIZE(BO)
  905. LFD f21, 1 * SIZE(BO)
  906. LFD f22, 2 * SIZE(BO)
  907. LFD f23, 3 * SIZE(BO)
  908. LFD f24, 4 * SIZE(AO)
  909. LFD f25, 5 * SIZE(AO)
  910. LFD f26, 6 * SIZE(AO)
  911. LFD f28, 4 * SIZE(BO)
  912. LFD f29, 5 * SIZE(BO)
  913. LFD f30, 6 * SIZE(BO)
  914. lfs f0, FZERO
  915. fmr f1, f0
  916. fmr f2, f0
  917. fmr f3, f0
  918. fmr f4, f0
  919. fmr f5, f0
  920. fmr f6, f0
  921. fmr f7, f0
  922. fmr f8, f0
  923. fmr f9, f0
  924. fmr f10, f0
  925. fmr f11, f0
  926. fmr f12, f0
  927. fmr f13, f0
  928. fmr f14, f0
  929. fmr f15, f0
  930. dcbt CO1, PREC
  931. dcbt CO2, PREC
  932. srawi. r0, TEMP, 2
  933. mtspr CTR, r0
  934. #endif
  935. ble LL(15)
  936. .align 4
  /* NOP1 / NOP2: register self-moves with no architectural effect, used as
     filler between the FMADDs of LL(12); presumably there to shape
     instruction issue on the target core -- confirm. */
  937. #define NOP1 mr r18, r18
  938. #define NOP2 mr r19, r19
  /*
   * LL(12): 4x-unrolled inner product loop, executed CTR times.
   *
   * Each k-step multiplies four A values by four B values and accumulates
   * the 16 products into f0-f15.  Operand sets alternate between
   * (f16-f19, f20-f23) and (f24-f27, f28-f31); the loads for upcoming
   * k-steps are interleaved with the FMADDs of the current one.
   * On entry f16-f23, f24-f26 and f28-f30 must already hold valid operands
   * (f27 and f31 are loaded at the top of the loop).
   * AO and BO each advance by 16 * SIZE per iteration.
   */
  939. LL(12):
  /* k-step 1: f16-f19 x f20-f23 */
  940. FMADD f0, f16, f20, f0
  941. dcbt AO, PREA
  942. FMADD f4, f16, f21, f4
  943. dcbt BO, PREB
  944. FMADD f8, f16, f22, f8
  945. LFD f31, 7 * SIZE(BO)
  946. FMADD f12, f16, f23, f12
  947. LFD f27, 7 * SIZE(AO)
  948. FMADD f1, f17, f20, f1
  949. LFD f16, 8 * SIZE(AO)
  950. FMADD f5, f17, f21, f5
  951. NOP2
  952. FMADD f9, f17, f22, f9
  953. NOP1
  954. FMADD f13, f17, f23, f13
  955. LFD f17, 9 * SIZE(AO)
  956. FMADD f2, f18, f20, f2
  957. NOP1
  958. FMADD f6, f18, f21, f6
  959. NOP2
  960. FMADD f10, f18, f22, f10
  961. NOP1
  962. FMADD f14, f18, f23, f14
  963. LFD f18, 10 * SIZE(AO)
  964. FMADD f3, f19, f20, f3
  965. LFD f20, 8 * SIZE(BO)
  966. FMADD f7, f19, f21, f7
  967. LFD f21, 9 * SIZE(BO)
  968. FMADD f11, f19, f22, f11
  969. LFD f22, 10 * SIZE(BO)
  970. FMADD f15, f19, f23, f15
  971. LFD f19, 11 * SIZE(AO)
  /* k-step 2: f24-f27 x f28-f31 */
  972. FMADD f0, f24, f28, f0
  973. LFD f23, 11 * SIZE(BO)
  974. FMADD f4, f24, f29, f4
  975. NOP2
  976. FMADD f8, f24, f30, f8
  977. NOP1
  978. FMADD f12, f24, f31, f12
  979. LFD f24, 12 * SIZE(AO)
  980. FMADD f1, f25, f28, f1
  981. NOP1
  982. FMADD f5, f25, f29, f5
  983. NOP2
  984. FMADD f9, f25, f30, f9
  985. NOP1
  986. FMADD f13, f25, f31, f13
  987. LFD f25, 13 * SIZE(AO)
  988. FMADD f2, f26, f28, f2
  989. NOP1
  990. FMADD f6, f26, f29, f6
  991. NOP2
  992. FMADD f10, f26, f30, f10
  993. NOP1
  994. FMADD f14, f26, f31, f14
  995. LFD f26, 14 * SIZE(AO)
  996. FMADD f3, f27, f28, f3
  997. LFD f28, 12 * SIZE(BO)
  998. FMADD f7, f27, f29, f7
  999. LFD f29, 13 * SIZE(BO)
  1000. FMADD f11, f27, f30, f11
  1001. LFD f30, 14 * SIZE(BO)
  1002. FMADD f15, f27, f31, f15
  1003. LFD f27, 15 * SIZE(AO)
  /* k-step 3: f16-f19 x f20-f23 (reloaded from offsets 8..11) */
  1004. FMADD f0, f16, f20, f0
  1005. LFD f31, 15 * SIZE(BO)
  1006. FMADD f4, f16, f21, f4
  1007. NOP2
  1008. FMADD f8, f16, f22, f8
  1009. NOP1
  1010. FMADD f12, f16, f23, f12
  1011. LFD f16, 16 * SIZE(AO)
  1012. FMADD f1, f17, f20, f1
  1013. NOP1
  1014. FMADD f5, f17, f21, f5
  1015. NOP2
  1016. FMADD f9, f17, f22, f9
  1017. NOP1
  1018. FMADD f13, f17, f23, f13
  1019. LFD f17, 17 * SIZE(AO)
  1020. FMADD f2, f18, f20, f2
  1021. NOP1
  1022. FMADD f6, f18, f21, f6
  1023. NOP2
  1024. FMADD f10, f18, f22, f10
  1025. NOP1
  1026. FMADD f14, f18, f23, f14
  1027. LFD f18, 18 * SIZE(AO)
  1028. FMADD f3, f19, f20, f3
  1029. LFD f20, 16 * SIZE(BO)
  1030. FMADD f7, f19, f21, f7
  1031. LFD f21, 17 * SIZE(BO)
  1032. FMADD f11, f19, f22, f11
  1033. LFD f22, 18 * SIZE(BO)
  1034. FMADD f15, f19, f23, f15
  1035. LFD f19, 19 * SIZE(AO)
  /* k-step 4: f24-f27 x f28-f31 (reloaded from offsets 12..15) */
  1036. FMADD f0, f24, f28, f0
  1037. LFD f23, 19 * SIZE(BO)
  1038. FMADD f4, f24, f29, f4
  1039. NOP2
  1040. FMADD f8, f24, f30, f8
  1041. NOP1
  1042. FMADD f12, f24, f31, f12
  1043. LFD f24, 20 * SIZE(AO)
  1044. FMADD f1, f25, f28, f1
  1045. NOP1
  1046. FMADD f5, f25, f29, f5
  1047. NOP2
  1048. FMADD f9, f25, f30, f9
  1049. NOP1
  1050. FMADD f13, f25, f31, f13
  1051. LFD f25, 21 * SIZE(AO)
  1052. FMADD f2, f26, f28, f2
  1053. NOP1
  1054. FMADD f6, f26, f29, f6
  1055. NOP2
  1056. FMADD f10, f26, f30, f10
  1057. NOP1
  1058. FMADD f14, f26, f31, f14
  1059. LFD f26, 22 * SIZE(AO)
  1060. FMADD f3, f27, f28, f3
  1061. LFD f28, 20 * SIZE(BO)
  1062. FMADD f7, f27, f29, f7
  1063. LFD f29, 21 * SIZE(BO)
  1064. FMADD f11, f27, f30, f11
  1065. LFD f30, 22 * SIZE(BO)
  1066. FMADD f15, f27, f31, f15
  /* Four k-steps consumed: step both panels and loop on CTR. */
  1067. addi AO, AO, 16 * SIZE
  1068. addi BO, BO, 16 * SIZE
  1069. bdnz LL(12)
  1070. .align 4
  /* LL(15): remainder count for the two-row tile.  CTR = trip count & 3
     (KK for LT/RN, TEMP otherwise); when that is zero the single-step loop
     LL(16) is skipped.  mtspr does not alter the condition set by andi. */
  1071. LL(15):
  1072. #if defined(LT) || defined(RN)
  1073. andi. r0, KK, 3
  1074. #else
  1075. andi. r0, TEMP, 3
  1076. #endif
  1077. mtspr CTR, r0
  1078. ble LL(KERNEL_MainFinish)
  1079. .align 4
  /* LL(16): remainder loop, one k-step per iteration.  Accumulates the 16
     products of f16-f19 (from AO) with f20-f23 (from BO) into f0-f15, then
     loads the next four values of each panel and advances AO/BO by
     4 * SIZE.  Runs CTR times. */
  1080. LL(16):
  1081. fmadd f0, f16, f20, f0
  1082. fmadd f5, f17, f21, f5
  1083. fmadd f10, f18, f22, f10
  1084. fmadd f15, f19, f23, f15
  1085. fmadd f1, f17, f20, f1
  1086. fmadd f2, f18, f20, f2
  1087. fmadd f3, f19, f20, f3
  1088. fmadd f4, f16, f21, f4
  1089. fmadd f6, f18, f21, f6
  1090. fmadd f7, f19, f21, f7
  1091. fmadd f8, f16, f22, f8
  1092. fmadd f9, f17, f22, f9
  1093. fmadd f11, f19, f22, f11
  1094. fmadd f12, f16, f23, f12
  1095. fmadd f13, f17, f23, f13
  1096. fmadd f14, f18, f23, f14
  /* Operands for the next k-step (loaded before the pointers move). */
  1097. LFD f16, 4 * SIZE(AO)
  1098. LFD f17, 5 * SIZE(AO)
  1099. LFD f18, 6 * SIZE(AO)
  1100. LFD f19, 7 * SIZE(AO)
  1101. LFD f20, 4 * SIZE(BO)
  1102. LFD f21, 5 * SIZE(BO)
  1103. LFD f22, 6 * SIZE(BO)
  1104. LFD f23, 7 * SIZE(BO)
  1105. addi BO, BO, 4 * SIZE
  1106. addi AO, AO, 4 * SIZE
  1107. bdnz LL(16)
  1108. .align 4
  /*
   * LL(KERNEL_MainFinish): fold the 16 partial sums of the two-row tile
   * into eight values (f0-f3, f8-f11) and subtract them from the values
   * currently held in the panel that will receive the solution.
   * The (even, odd) register pairs are combined like the (re, im) parts of
   * a complex product; CONJ selects the mirrored sign pattern.
   */
  1109. LL(KERNEL_MainFinish):
  1110. #ifndef CONJ
  1111. FSUB f0, f0, f5
  1112. FADD f1, f1, f4
  1113. FSUB f2, f2, f7
  1114. FADD f3, f3, f6
  1115. FSUB f8, f8, f13
  1116. FADD f9, f9, f12
  1117. FSUB f10, f10, f15
  1118. FADD f11, f11, f14
  1119. #else
  1120. FADD f0, f0, f5
  1121. FSUB f1, f4, f1
  1122. FADD f2, f2, f7
  1123. FSUB f3, f6, f3
  1124. FADD f8, f8, f13
  1125. FSUB f9, f12, f9
  1126. FADD f10, f10, f15
  1127. FSUB f11, f14, f11
  1128. #endif
  1129. #if defined(LN) || defined(RT)
  /* LN/RT: reposition AO and BO at k-offset KK - 2 from AORIG / B. */
  1130. subi r0, KK, 2
  1131. slwi r0, r0, 1 + ZBASE_SHIFT
  1132. add AO, AORIG, r0
  1133. add BO, B, r0
  1134. #endif
  1135. #if defined(LN) || defined(LT)
  /* LN/LT: right-hand side is read from BO.  Note the BO ordering
     f0,f1,f8,f9,f2,f3,f10,f11 differs from the AO ordering used below. */
  1136. LFD f16, 0 * SIZE(BO)
  1137. LFD f17, 1 * SIZE(BO)
  1138. LFD f18, 2 * SIZE(BO)
  1139. LFD f19, 3 * SIZE(BO)
  1140. LFD f20, 4 * SIZE(BO)
  1141. LFD f21, 5 * SIZE(BO)
  1142. LFD f22, 6 * SIZE(BO)
  1143. LFD f23, 7 * SIZE(BO)
  1144. FSUB f0, f16, f0
  1145. FSUB f1, f17, f1
  1146. FSUB f8, f18, f8
  1147. FSUB f9, f19, f9
  1148. FSUB f2, f20, f2
  1149. FSUB f3, f21, f3
  1150. FSUB f10, f22, f10
  1151. FSUB f11, f23, f11
  1152. #else
  /* RN/RT: right-hand side is read from AO, in register order. */
  1153. LFD f16, 0 * SIZE(AO)
  1154. LFD f17, 1 * SIZE(AO)
  1155. LFD f18, 2 * SIZE(AO)
  1156. LFD f19, 3 * SIZE(AO)
  1157. LFD f20, 4 * SIZE(AO)
  1158. LFD f21, 5 * SIZE(AO)
  1159. LFD f22, 6 * SIZE(AO)
  1160. LFD f23, 7 * SIZE(AO)
  1161. #ifndef CONJ
  1162. FSUB f0, f16, f0
  1163. FSUB f1, f17, f1
  1164. FSUB f2, f18, f2
  1165. FSUB f3, f19, f3
  1166. FSUB f8, f20, f8
  1167. FSUB f9, f21, f9
  1168. FSUB f10, f22, f10
  1169. FSUB f11, f23, f11
  1170. #else
  /* CONJ: odd-numbered results are added rather than subtracted. */
  1171. FSUB f0, f16, f0
  1172. FADD f1, f17, f1
  1173. FSUB f2, f18, f2
  1174. FADD f3, f19, f3
  1175. FSUB f8, f20, f8
  1176. FADD f9, f21, f9
  1177. FSUB f10, f22, f10
  1178. FADD f11, f23, f11
  1179. #endif
  1180. #endif
  /*
   * LN solve for the two-row tile, using six values from AO.
   * Non-CONJ form, treating register pairs as (re, im):
   *   (f2,f3), (f10,f11)  *= (f16,f17)                    [AO 6,7]
   *   (f0,f1), (f8,f9)    -= (f18,f19) * (f2,f3)/(f10,f11) [AO 4,5]
   *   (f0,f1), (f8,f9)    *= (f20,f21)                    [AO 0,1]
   * The CONJ branch applies the same steps with mirrored signs.
   * NOTE(review): the pivots are multiplied, not divided, so they are
   * presumably stored pre-inverted by the packing code -- confirm.
   */
  1181. #ifdef LN
  1182. LFD f16, 6 * SIZE(AO)
  1183. LFD f17, 7 * SIZE(AO)
  1184. LFD f18, 4 * SIZE(AO)
  1185. LFD f19, 5 * SIZE(AO)
  1186. LFD f20, 0 * SIZE(AO)
  1187. LFD f21, 1 * SIZE(AO)
  /* Cross terms of the first multiply, shared by both CONJ variants. */
  1188. FMUL f6, f17, f3
  1189. FMUL f7, f17, f2
  1190. FMUL f14, f17, f11
  1191. FMUL f15, f17, f10
  1192. #ifndef CONJ
  1193. FMSUB f2, f16, f2, f6
  1194. FMADD f3, f16, f3, f7
  1195. FMSUB f10, f16, f10, f14
  1196. FMADD f11, f16, f11, f15
  1197. FMADD f0, f19, f3, f0
  1198. FNMSUB f1, f19, f2, f1
  1199. FMADD f8, f19, f11, f8
  1200. FNMSUB f9, f19, f10, f9
  1201. FNMSUB f0, f18, f2, f0
  1202. FNMSUB f1, f18, f3, f1
  1203. FNMSUB f8, f18, f10, f8
  1204. FNMSUB f9, f18, f11, f9
  1205. FMUL f4, f21, f1
  1206. FMUL f5, f21, f0
  1207. FMUL f12, f21, f9
  1208. FMUL f13, f21, f8
  1209. FMSUB f0, f20, f0, f4
  1210. FMADD f1, f20, f1, f5
  1211. FMSUB f8, f20, f8, f12
  1212. FMADD f9, f20, f9, f13
  1213. #else
  1214. FMADD f2, f16, f2, f6
  1215. FMSUB f3, f16, f3, f7
  1216. FMADD f10, f16, f10, f14
  1217. FMSUB f11, f16, f11, f15
  1218. FMSUB f0, f19, f3, f0
  1219. FNMADD f1, f19, f2, f1
  1220. FMSUB f8, f19, f11, f8
  1221. FNMADD f9, f19, f10, f9
  1222. FNMADD f0, f18, f2, f0
  1223. FNMADD f1, f18, f3, f1
  1224. FNMADD f8, f18, f10, f8
  1225. FNMADD f9, f18, f11, f9
  1226. FMUL f4, f21, f1
  1227. FMUL f5, f21, f0
  1228. FMUL f12, f21, f9
  1229. FMUL f13, f21, f8
  1230. FMADD f0, f20, f0, f4
  1231. FMSUB f1, f20, f1, f5
  1232. FMADD f8, f20, f8, f12
  1233. FMSUB f9, f20, f9, f13
  1234. #endif
  1235. #endif
  /*
   * LT solve for the two-row tile, using six values from AO.
   * Non-CONJ form, treating register pairs as (re, im):
   *   (f0,f1), (f8,f9)    *= (f16,f17)                   [AO 0,1]
   *   (f2,f3), (f10,f11)  -= (f18,f19) * (f0,f1)/(f8,f9)  [AO 2,3]
   *   (f2,f3), (f10,f11)  *= (f20,f21)                   [AO 6,7]
   * The CONJ branch applies the same steps with mirrored signs.
   * NOTE(review): pivots are multiplied, so presumably pre-inverted by the
   * packing code -- confirm.
   */
  1236. #ifdef LT
  1237. LFD f16, 0 * SIZE(AO)
  1238. LFD f17, 1 * SIZE(AO)
  1239. LFD f18, 2 * SIZE(AO)
  1240. LFD f19, 3 * SIZE(AO)
  1241. LFD f20, 6 * SIZE(AO)
  1242. LFD f21, 7 * SIZE(AO)
  /* Cross terms of the first multiply, shared by both CONJ variants. */
  1243. FMUL f4, f17, f1
  1244. FMUL f5, f17, f0
  1245. FMUL f12, f17, f9
  1246. FMUL f13, f17, f8
  1247. #ifndef CONJ
  1248. FMSUB f0, f16, f0, f4
  1249. FMADD f1, f16, f1, f5
  1250. FMSUB f8, f16, f8, f12
  1251. FMADD f9, f16, f9, f13
  1252. FMADD f2, f19, f1, f2
  1253. FNMSUB f3, f19, f0, f3
  1254. FMADD f10, f19, f9, f10
  1255. FNMSUB f11, f19, f8, f11
  1256. FNMSUB f2, f18, f0, f2
  1257. FNMSUB f3, f18, f1, f3
  1258. FNMSUB f10, f18, f8, f10
  1259. FNMSUB f11, f18, f9, f11
  1260. FMUL f4, f21, f3
  1261. FMUL f5, f21, f2
  1262. FMUL f12, f21, f11
  1263. FMUL f13, f21, f10
  1264. FMSUB f2, f20, f2, f4
  1265. FMADD f3, f20, f3, f5
  1266. FMSUB f10, f20, f10, f12
  1267. FMADD f11, f20, f11, f13
  1268. #else
  1269. FMADD f0, f16, f0, f4
  1270. FMSUB f1, f16, f1, f5
  1271. FMADD f8, f16, f8, f12
  1272. FMSUB f9, f16, f9, f13
  1273. FMSUB f2, f19, f1, f2
  1274. FNMADD f3, f19, f0, f3
  1275. FMSUB f10, f19, f9, f10
  1276. FNMADD f11, f19, f8, f11
  1277. FNMADD f2, f18, f0, f2
  1278. FNMADD f3, f18, f1, f3
  1279. FNMADD f10, f18, f8, f10
  1280. FNMADD f11, f18, f9, f11
  1281. FMUL f4, f21, f3
  1282. FMUL f5, f21, f2
  1283. FMUL f12, f21, f11
  1284. FMUL f13, f21, f10
  1285. FMADD f2, f20, f2, f4
  1286. FMSUB f3, f20, f3, f5
  1287. FMADD f10, f20, f10, f12
  1288. FMSUB f11, f20, f11, f13
  1289. #endif
  1290. #endif
  /*
   * RN solve for the two-row tile, using six values from BO.
   * Non-CONJ form, treating register pairs as (re, im):
   *   (f0,f1), (f2,f3)    *= (f16,f17)                   [BO 0,1]
   *   (f8,f9), (f10,f11)  -= (f18,f19) * (f0,f1)/(f2,f3)  [BO 2,3]
   *   (f8,f9), (f10,f11)  *= (f20,f21)                   [BO 6,7]
   * The CONJ branch applies the same steps with mirrored signs.
   * NOTE(review): pivots are multiplied, so presumably pre-inverted by the
   * packing code -- confirm.
   */
  1291. #ifdef RN
  1292. LFD f16, 0 * SIZE(BO)
  1293. LFD f17, 1 * SIZE(BO)
  1294. LFD f18, 2 * SIZE(BO)
  1295. LFD f19, 3 * SIZE(BO)
  1296. LFD f20, 6 * SIZE(BO)
  1297. LFD f21, 7 * SIZE(BO)
  /* Cross terms of the first multiply, shared by both CONJ variants. */
  1298. FMUL f4, f17, f1
  1299. FMUL f5, f17, f0
  1300. FMUL f6, f17, f3
  1301. FMUL f7, f17, f2
  1302. #ifndef CONJ
  1303. FMSUB f0, f16, f0, f4
  1304. FMADD f1, f16, f1, f5
  1305. FMSUB f2, f16, f2, f6
  1306. FMADD f3, f16, f3, f7
  1307. FMADD f8, f19, f1, f8
  1308. FNMSUB f9, f19, f0, f9
  1309. FMADD f10, f19, f3, f10
  1310. FNMSUB f11, f19, f2, f11
  1311. FNMSUB f8, f18, f0, f8
  1312. FNMSUB f9, f18, f1, f9
  1313. FNMSUB f10, f18, f2, f10
  1314. FNMSUB f11, f18, f3, f11
  1315. FMUL f4, f21, f9
  1316. FMUL f5, f21, f8
  1317. FMUL f6, f21, f11
  1318. FMUL f7, f21, f10
  1319. FMSUB f8, f20, f8, f4
  1320. FMADD f9, f20, f9, f5
  1321. FMSUB f10, f20, f10, f6
  1322. FMADD f11, f20, f11, f7
  1323. #else
  1324. FMADD f0, f16, f0, f4
  1325. FMSUB f1, f16, f1, f5
  1326. FMADD f2, f16, f2, f6
  1327. FMSUB f3, f16, f3, f7
  1328. FMSUB f8, f19, f1, f8
  1329. FNMADD f9, f19, f0, f9
  1330. FMSUB f10, f19, f3, f10
  1331. FNMADD f11, f19, f2, f11
  1332. FNMADD f8, f18, f0, f8
  1333. FNMADD f9, f18, f1, f9
  1334. FNMADD f10, f18, f2, f10
  1335. FNMADD f11, f18, f3, f11
  1336. FMUL f4, f21, f9
  1337. FMUL f5, f21, f8
  1338. FMUL f6, f21, f11
  1339. FMUL f7, f21, f10
  1340. FMADD f8, f20, f8, f4
  1341. FMSUB f9, f20, f9, f5
  1342. FMADD f10, f20, f10, f6
  1343. FMSUB f11, f20, f11, f7
  1344. #endif
  1345. #endif
  /*
   * RT solve for the two-row tile, using six values from BO.
   * Non-CONJ form, treating register pairs as (re, im):
   *   (f8,f9), (f10,f11)  *= (f16,f17)                    [BO 6,7]
   *   (f0,f1), (f2,f3)    -= (f18,f19) * (f8,f9)/(f10,f11) [BO 4,5]
   *   (f0,f1), (f2,f3)    *= (f20,f21)                    [BO 0,1]
   * The CONJ branch applies the same steps with mirrored signs.
   * NOTE(review): pivots are multiplied, so presumably pre-inverted by the
   * packing code -- confirm.
   */
  1346. #ifdef RT
  1347. LFD f16, 6 * SIZE(BO)
  1348. LFD f17, 7 * SIZE(BO)
  1349. LFD f18, 4 * SIZE(BO)
  1350. LFD f19, 5 * SIZE(BO)
  1351. LFD f20, 0 * SIZE(BO)
  1352. LFD f21, 1 * SIZE(BO)
  /* Cross terms of the first multiply, shared by both CONJ variants. */
  1353. FMUL f12, f17, f9
  1354. FMUL f13, f17, f8
  1355. FMUL f14, f17, f11
  1356. FMUL f15, f17, f10
  1357. #ifndef CONJ
  1358. FMSUB f8, f16, f8, f12
  1359. FMADD f9, f16, f9, f13
  1360. FMSUB f10, f16, f10, f14
  1361. FMADD f11, f16, f11, f15
  1362. FMADD f0, f19, f9, f0
  1363. FNMSUB f1, f19, f8, f1
  1364. FMADD f2, f19, f11, f2
  1365. FNMSUB f3, f19, f10, f3
  1366. FNMSUB f0, f18, f8, f0
  1367. FNMSUB f1, f18, f9, f1
  1368. FNMSUB f2, f18, f10, f2
  1369. FNMSUB f3, f18, f11, f3
  1370. FMUL f4, f21, f1
  1371. FMUL f5, f21, f0
  1372. FMUL f6, f21, f3
  1373. FMUL f7, f21, f2
  1374. FMSUB f0, f20, f0, f4
  1375. FMADD f1, f20, f1, f5
  1376. FMSUB f2, f20, f2, f6
  1377. FMADD f3, f20, f3, f7
  1378. #else
  1379. FMADD f8, f16, f8, f12
  1380. FMSUB f9, f16, f9, f13
  1381. FMADD f10, f16, f10, f14
  1382. FMSUB f11, f16, f11, f15
  1383. FMSUB f0, f19, f9, f0
  1384. FNMADD f1, f19, f8, f1
  1385. FMSUB f2, f19, f11, f2
  1386. FNMADD f3, f19, f10, f3
  1387. FNMADD f0, f18, f8, f0
  1388. FNMADD f1, f18, f9, f1
  1389. FNMADD f2, f18, f10, f2
  1390. FNMADD f3, f18, f11, f3
  1391. FMUL f4, f21, f1
  1392. FMUL f5, f21, f0
  1393. FMUL f6, f21, f3
  1394. FMUL f7, f21, f2
  1395. FMADD f0, f20, f0, f4
  1396. FMSUB f1, f20, f1, f5
  1397. FMADD f2, f20, f2, f6
  1398. FMSUB f3, f20, f3, f7
  1399. #endif
  1400. #endif
  /*
   * Store the solved two-row tile and advance to the next one.
   * The eight results are written twice: back into the panel they were
   * read from (BO for LN/LT, AO otherwise) and into the output columns
   * CO1 / CO2.  Then the pointers and KK are stepped and the loop over I
   * continues at LL(11).
   */
  1401. #ifdef LN
  /* LN walks the output backwards: step CO1/CO2 down before storing. */
  1402. subi CO1, CO1, 4 * SIZE
  1403. subi CO2, CO2, 4 * SIZE
  1404. #endif
  1405. #if defined(LN) || defined(LT)
  /* BO ordering matches the load order used for the LN/LT subtraction. */
  1406. STFD f0, 0 * SIZE(BO)
  1407. STFD f1, 1 * SIZE(BO)
  1408. STFD f8, 2 * SIZE(BO)
  1409. STFD f9, 3 * SIZE(BO)
  1410. STFD f2, 4 * SIZE(BO)
  1411. STFD f3, 5 * SIZE(BO)
  1412. STFD f10, 6 * SIZE(BO)
  1413. STFD f11, 7 * SIZE(BO)
  1414. #else
  1415. STFD f0, 0 * SIZE(AO)
  1416. STFD f1, 1 * SIZE(AO)
  1417. STFD f2, 2 * SIZE(AO)
  1418. STFD f3, 3 * SIZE(AO)
  1419. STFD f8, 4 * SIZE(AO)
  1420. STFD f9, 5 * SIZE(AO)
  1421. STFD f10, 6 * SIZE(AO)
  1422. STFD f11, 7 * SIZE(AO)
  1423. #endif
  /* f0-f3 go to column CO1, f8-f11 to column CO2. */
  1424. STFD f0, 0 * SIZE(CO1)
  1425. STFD f1, 1 * SIZE(CO1)
  1426. STFD f2, 2 * SIZE(CO1)
  1427. STFD f3, 3 * SIZE(CO1)
  1428. STFD f8, 0 * SIZE(CO2)
  1429. STFD f9, 1 * SIZE(CO2)
  1430. STFD f10, 2 * SIZE(CO2)
  1431. STFD f11, 3 * SIZE(CO2)
  1432. #ifndef LN
  1433. addi CO1, CO1, 4 * SIZE
  1434. addi CO2, CO2, 4 * SIZE
  1435. #endif
  1436. #ifdef RT
  /* RT: advance AORIG by K << (1 + ZBASE_SHIFT) bytes for the next tile. */
  1437. slwi r0, K, 1 + ZBASE_SHIFT
  1438. add AORIG, AORIG, r0
  1439. #endif
  1440. #if defined(LT) || defined(RN)
  /* LT/RN: skip the remaining K - KK k-steps of both panels. */
  1441. sub TEMP, K, KK
  1442. slwi TEMP, TEMP, 1 + ZBASE_SHIFT
  1443. add AO, AO, TEMP
  1444. add BO, BO, TEMP
  1445. #endif
  1446. #ifdef LT
  1447. addi KK, KK, 2
  1448. #endif
  1449. #ifdef LN
  1450. subi KK, KK, 2
  1451. #endif
  /* Next two-row tile while I > 0. */
  1452. addic. I, I, -1
  1453. bgt LL(11)
  1454. .align 4
  /*
   * LL(20): setup for the leftover single-row tile, taken only when M is
   * odd (I = M & 1; otherwise jump to LL(29)).
   * Loads four A values (two k-steps) and eight B values, clears the
   * accumulators f0-f7, and sets CTR to the number of 4x-unrolled LL(22)
   * iterations: KK >> 2 for LT/RN, (K - KK) >> 2 otherwise.
   * FZERO is defined outside this view; presumably it addresses a 0.0
   * constant -- confirm.
   */
  1455. LL(20):
  1456. andi. I, M, 1
  1457. ble LL(29)
  1458. #if defined(LT) || defined(RN)
  1459. LFD f16, 0 * SIZE(AO)
  1460. LFD f17, 1 * SIZE(AO)
  1461. LFD f18, 2 * SIZE(AO)
  1462. LFD f19, 3 * SIZE(AO)
  1463. LFD f20, 0 * SIZE(B)
  1464. LFD f21, 1 * SIZE(B)
  1465. LFD f22, 2 * SIZE(B)
  1466. LFD f23, 3 * SIZE(B)
  1467. LFD f24, 4 * SIZE(B)
  1468. LFD f25, 5 * SIZE(B)
  1469. LFD f26, 6 * SIZE(B)
  1470. LFD f27, 7 * SIZE(B)
  1471. lfs f0, FZERO
  1472. fmr f1, f0
  1473. fmr f2, f0
  1474. fmr f3, f0
  1475. fmr f4, f0
  1476. fmr f5, f0
  1477. fmr f6, f0
  1478. fmr f7, f0
  /* srawi. sets CR0 for the ble below; mr and mtspr leave it intact. */
  1479. srawi. r0, KK, 2
  1480. mr BO, B
  1481. mtspr CTR, r0
  1482. #else
  1483. #ifdef LN
  /* LN: move AORIG back by K << ZBASE_SHIFT bytes (one row of A). */
  1484. slwi r0, K, 0 + ZBASE_SHIFT
  1485. sub AORIG, AORIG, r0
  1486. #endif
  /* A advances at half the byte stride of B here: KK << ZBASE_SHIFT for AO
     versus KK << (1 + ZBASE_SHIFT) for BO. */
  1487. slwi r0, KK, 0 + ZBASE_SHIFT
  1488. slwi TEMP, KK, 1 + ZBASE_SHIFT
  1489. add AO, AORIG, r0
  1490. add BO, B, TEMP
  1491. sub TEMP, K, KK
  1492. LFD f16, 0 * SIZE(AO)
  1493. LFD f17, 1 * SIZE(AO)
  1494. LFD f18, 2 * SIZE(AO)
  1495. LFD f19, 3 * SIZE(AO)
  1496. LFD f20, 0 * SIZE(BO)
  1497. LFD f21, 1 * SIZE(BO)
  1498. LFD f22, 2 * SIZE(BO)
  1499. LFD f23, 3 * SIZE(BO)
  1500. LFD f24, 4 * SIZE(BO)
  1501. LFD f25, 5 * SIZE(BO)
  1502. LFD f26, 6 * SIZE(BO)
  1503. LFD f27, 7 * SIZE(BO)
  1504. lfs f0, FZERO
  1505. fmr f1, f0
  1506. fmr f2, f0
  1507. fmr f3, f0
  1508. fmr f4, f0
  1509. fmr f5, f0
  1510. fmr f6, f0
  1511. fmr f7, f0
  1512. srawi. r0, TEMP, 2
  1513. mtspr CTR, r0
  1514. #endif
  /* No unrolled iterations to run: go to the remainder handling. */
  1515. ble LL(25)
  1516. .align 4
  /*
   * LL(22): 4x-unrolled inner product loop for the single-row tile,
   * executed CTR times.  Each k-step multiplies two A values by four B
   * values into f0-f7.  (f16,f17) and (f18,f19) hold the A values of two
   * consecutive k-steps, paired with f20-f23 and f24-f27 respectively.
   * Per iteration AO advances by 8 * SIZE and BO by 16 * SIZE.
   */
  1517. LL(22):
  /* k-step 1 */
  1518. fmadd f0, f16, f20, f0
  1519. fmadd f1, f16, f21, f1
  1520. fmadd f2, f16, f22, f2
  1521. fmadd f3, f16, f23, f3
  1522. fmadd f4, f17, f20, f4
  1523. fmadd f5, f17, f21, f5
  1524. fmadd f6, f17, f22, f6
  1525. fmadd f7, f17, f23, f7
  1526. LFD f20, 8 * SIZE(BO)
  1527. LFD f21, 9 * SIZE(BO)
  1528. LFD f22, 10 * SIZE(BO)
  1529. LFD f23, 11 * SIZE(BO)
  /* k-step 2 */
  1530. fmadd f0, f18, f24, f0
  1531. fmadd f1, f18, f25, f1
  1532. fmadd f2, f18, f26, f2
  1533. fmadd f3, f18, f27, f3
  1534. fmadd f4, f19, f24, f4
  1535. fmadd f5, f19, f25, f5
  1536. fmadd f6, f19, f26, f6
  1537. fmadd f7, f19, f27, f7
  1538. LFD f24, 12 * SIZE(BO)
  1539. LFD f25, 13 * SIZE(BO)
  1540. LFD f26, 14 * SIZE(BO)
  1541. LFD f27, 15 * SIZE(BO)
  1542. LFD f16, 4 * SIZE(AO)
  1543. LFD f17, 5 * SIZE(AO)
  1544. LFD f18, 6 * SIZE(AO)
  1545. LFD f19, 7 * SIZE(AO)
  /* k-step 3 */
  1546. fmadd f0, f16, f20, f0
  1547. fmadd f1, f16, f21, f1
  1548. fmadd f2, f16, f22, f2
  1549. fmadd f3, f16, f23, f3
  1550. fmadd f4, f17, f20, f4
  1551. fmadd f5, f17, f21, f5
  1552. fmadd f6, f17, f22, f6
  1553. fmadd f7, f17, f23, f7
  1554. LFD f20, 16 * SIZE(BO)
  1555. LFD f21, 17 * SIZE(BO)
  1556. LFD f22, 18 * SIZE(BO)
  1557. LFD f23, 19 * SIZE(BO)
  /* k-step 4 */
  1558. fmadd f0, f18, f24, f0
  1559. fmadd f1, f18, f25, f1
  1560. fmadd f2, f18, f26, f2
  1561. fmadd f3, f18, f27, f3
  1562. fmadd f4, f19, f24, f4
  1563. fmadd f5, f19, f25, f5
  1564. fmadd f6, f19, f26, f6
  1565. fmadd f7, f19, f27, f7
  /* Operands for the first two k-steps of the next iteration. */
  1566. LFD f16, 8 * SIZE(AO)
  1567. LFD f17, 9 * SIZE(AO)
  1568. LFD f18, 10 * SIZE(AO)
  1569. LFD f19, 11 * SIZE(AO)
  1570. LFD f24, 20 * SIZE(BO)
  1571. LFD f25, 21 * SIZE(BO)
  1572. LFD f26, 22 * SIZE(BO)
  1573. LFD f27, 23 * SIZE(BO)
  1574. addi BO, BO, 16 * SIZE
  1575. addi AO, AO, 8 * SIZE
  1576. bdnz LL(22)
  1577. .align 4
  /* LL(25): remainder count for the single-row tile.  CTR = trip count & 3
     (KK for LT/RN, TEMP otherwise); when that is zero the single-step loop
     LL(26) is skipped.  mtspr does not alter the condition set by andi. */
  1578. LL(25):
  1579. #if defined(LT) || defined(RN)
  1580. andi. r0, KK, 3
  1581. #else
  1582. andi. r0, TEMP, 3
  1583. #endif
  1584. mtspr CTR, r0
  1585. ble LL(27)
  1586. .align 4
  /* LL(26): remainder loop for the single-row tile, one k-step per
     iteration.  Accumulates (f16,f17) x f20-f23 into f0-f7, then loads the
     next operands and advances AO by 2 * SIZE and BO by 4 * SIZE.
     Runs CTR times. */
  1587. LL(26):
  1588. fmadd f0, f16, f20, f0
  1589. fmadd f1, f16, f21, f1
  1590. fmadd f2, f16, f22, f2
  1591. fmadd f3, f16, f23, f3
  1592. fmadd f4, f17, f20, f4
  1593. fmadd f5, f17, f21, f5
  1594. fmadd f6, f17, f22, f6
  1595. fmadd f7, f17, f23, f7
  /* Operands for the next k-step (loaded before the pointers move). */
  1596. LFD f20, 4 * SIZE(BO)
  1597. LFD f21, 5 * SIZE(BO)
  1598. LFD f22, 6 * SIZE(BO)
  1599. LFD f23, 7 * SIZE(BO)
  1600. LFD f16, 2 * SIZE(AO)
  1601. LFD f17, 3 * SIZE(AO)
  1602. addi AO, AO, 2 * SIZE
  1603. addi BO, BO, 4 * SIZE
  1604. bdnz LL(26)
  1605. .align 4
  /*
   * LL(27): fold the eight partial sums of the single-row tile into four
   * values (f0-f3) and subtract them from the values currently held in the
   * panel that will receive the solution (BO for LN/LT, AO otherwise).
   * Unlike the two-row tile, the CONJ sign pattern here also depends on
   * whether the build is LN/LT or RN/RT.
   */
  1606. LL(27):
  1607. #ifndef CONJ
  1608. FSUB f0, f0, f5
  1609. FADD f1, f1, f4
  1610. FSUB f2, f2, f7
  1611. FADD f3, f3, f6
  1612. #else
  1613. #if defined(LN) || defined(LT)
  1614. FADD f0, f0, f5
  1615. FSUB f1, f1, f4
  1616. FADD f2, f2, f7
  1617. FSUB f3, f3, f6
  1618. #else
  1619. FADD f0, f0, f5
  1620. FSUB f1, f4, f1
  1621. FADD f2, f2, f7
  1622. FSUB f3, f6, f3
  1623. #endif
  1624. #endif
  1625. #if defined(LN) || defined(RT)
  /* LN/RT: reposition AO and BO from AORIG / B.  The k-offset is KK - 1
     for LN and KK - 2 for RT; AO uses the single-row byte stride
     (<< ZBASE_SHIFT), BO the two-column stride (<< (1 + ZBASE_SHIFT)). */
  1626. #ifdef LN
  1627. subi r0, KK, 1
  1628. #else
  1629. subi r0, KK, 2
  1630. #endif
  1631. slwi TEMP, r0, 0 + ZBASE_SHIFT
  1632. slwi r0, r0, 1 + ZBASE_SHIFT
  1633. add AO, AORIG, TEMP
  1634. add BO, B, r0
  1635. #endif
  1636. #if defined(LN) || defined(LT)
  1637. LFD f16, 0 * SIZE(BO)
  1638. LFD f17, 1 * SIZE(BO)
  1639. LFD f18, 2 * SIZE(BO)
  1640. LFD f19, 3 * SIZE(BO)
  1641. FSUB f0, f16, f0
  1642. FSUB f1, f17, f1
  1643. FSUB f2, f18, f2
  1644. FSUB f3, f19, f3
  1645. #else
  1646. LFD f16, 0 * SIZE(AO)
  1647. LFD f17, 1 * SIZE(AO)
  1648. LFD f20, 2 * SIZE(AO)
  1649. LFD f21, 3 * SIZE(AO)
  1650. FSUB f0, f16, f0
  1651. FSUB f1, f17, f1
  1652. FSUB f2, f20, f2
  1653. FSUB f3, f21, f3
  1654. #endif
  /* LN solve for the single-row tile: multiply (f0,f1) and (f2,f3), each
     treated as a (re, im) pair, by the pair (f20,f21) read from AO 0,1.
     CONJ uses the mirrored sign pattern.
     NOTE(review): multiplication rather than division suggests the pivot is
     stored pre-inverted -- confirm against the packing code. */
  1655. #ifdef LN
  1656. LFD f20, 0 * SIZE(AO)
  1657. LFD f21, 1 * SIZE(AO)
  1658. FMUL f4, f21, f1
  1659. FMUL f5, f21, f0
  1660. FMUL f12, f21, f3
  1661. FMUL f13, f21, f2
  1662. #ifndef CONJ
  1663. FMSUB f0, f20, f0, f4
  1664. FMADD f1, f20, f1, f5
  1665. FMSUB f2, f20, f2, f12
  1666. FMADD f3, f20, f3, f13
  1667. #else
  1668. FMADD f0, f20, f0, f4
  1669. FMSUB f1, f20, f1, f5
  1670. FMADD f2, f20, f2, f12
  1671. FMSUB f3, f20, f3, f13
  1672. #endif
  1673. #endif
  /* LT solve for the single-row tile: multiply (f0,f1) and (f2,f3), each
     treated as a (re, im) pair, by the pair (f16,f17) read from AO 0,1.
     CONJ uses the mirrored sign pattern.
     NOTE(review): multiplication rather than division suggests the pivot is
     stored pre-inverted -- confirm against the packing code. */
  1674. #ifdef LT
  1675. LFD f16, 0 * SIZE(AO)
  1676. LFD f17, 1 * SIZE(AO)
  1677. FMUL f4, f17, f1
  1678. FMUL f5, f17, f0
  1679. FMUL f12, f17, f3
  1680. FMUL f13, f17, f2
  1681. #ifndef CONJ
  1682. FMSUB f0, f16, f0, f4
  1683. FMADD f1, f16, f1, f5
  1684. FMSUB f2, f16, f2, f12
  1685. FMADD f3, f16, f3, f13
  1686. #else
  1687. FMADD f0, f16, f0, f4
  1688. FMSUB f1, f16, f1, f5
  1689. FMADD f2, f16, f2, f12
  1690. FMSUB f3, f16, f3, f13
  1691. #endif
  1692. #endif
  /*
   * RN solve for the single-row tile, using six values from BO.
   * Non-CONJ form, treating register pairs as (re, im):
   *   (f0,f1) *= (f16,f17)             [BO 0,1]
   *   (f2,f3) -= (f18,f19) * (f0,f1)   [BO 2,3]
   *   (f2,f3) *= (f20,f21)             [BO 6,7]
   * The CONJ branch applies the same steps with mirrored signs.
   * NOTE(review): pivots are multiplied, so presumably pre-inverted by the
   * packing code -- confirm.
   */
  1693. #ifdef RN
  1694. LFD f16, 0 * SIZE(BO)
  1695. LFD f17, 1 * SIZE(BO)
  1696. LFD f18, 2 * SIZE(BO)
  1697. LFD f19, 3 * SIZE(BO)
  1698. LFD f20, 6 * SIZE(BO)
  1699. LFD f21, 7 * SIZE(BO)
  1700. FMUL f4, f17, f1
  1701. FMUL f5, f17, f0
  1702. #ifndef CONJ
  1703. FMSUB f0, f16, f0, f4
  1704. FMADD f1, f16, f1, f5
  1705. FMADD f2, f19, f1, f2
  1706. FNMSUB f3, f19, f0, f3
  1707. FNMSUB f2, f18, f0, f2
  1708. FNMSUB f3, f18, f1, f3
  1709. FMUL f4, f21, f3
  1710. FMUL f5, f21, f2
  1711. FMSUB f2, f20, f2, f4
  1712. FMADD f3, f20, f3, f5
  1713. #else
  1714. FMADD f0, f16, f0, f4
  1715. FMSUB f1, f16, f1, f5
  1716. FMSUB f2, f19, f1, f2
  1717. FNMADD f3, f19, f0, f3
  1718. FNMADD f2, f18, f0, f2
  1719. FNMADD f3, f18, f1, f3
  1720. FMUL f4, f21, f3
  1721. FMUL f5, f21, f2
  1722. FMADD f2, f20, f2, f4
  1723. FMSUB f3, f20, f3, f5
  1724. #endif
  1725. #endif
  /*
   * RT solve for the single-row tile, using six values from BO.
   * Non-CONJ form, treating register pairs as (re, im):
   *   (f2,f3) *= (f16,f17)             [BO 6,7]
   *   (f0,f1) -= (f18,f19) * (f2,f3)   [BO 4,5]
   *   (f0,f1) *= (f20,f21)             [BO 0,1]
   * The CONJ branch applies the same steps with mirrored signs.
   * NOTE(review): pivots are multiplied, so presumably pre-inverted by the
   * packing code -- confirm.
   */
  1726. #ifdef RT
  1727. LFD f16, 6 * SIZE(BO)
  1728. LFD f17, 7 * SIZE(BO)
  1729. LFD f18, 4 * SIZE(BO)
  1730. LFD f19, 5 * SIZE(BO)
  1731. LFD f20, 0 * SIZE(BO)
  1732. LFD f21, 1 * SIZE(BO)
  1733. FMUL f12, f17, f3
  1734. FMUL f13, f17, f2
  1735. #ifndef CONJ
  1736. FMSUB f2, f16, f2, f12
  1737. FMADD f3, f16, f3, f13
  1738. FMADD f0, f19, f3, f0
  1739. FNMSUB f1, f19, f2, f1
  1740. FNMSUB f0, f18, f2, f0
  1741. FNMSUB f1, f18, f3, f1
  1742. FMUL f4, f21, f1
  1743. FMUL f5, f21, f0
  1744. FMSUB f0, f20, f0, f4
  1745. FMADD f1, f20, f1, f5
  1746. #else
  1747. FMADD f2, f16, f2, f12
  1748. FMSUB f3, f16, f3, f13
  1749. FMSUB f0, f19, f3, f0
  1750. FNMADD f1, f19, f2, f1
  1751. FNMADD f0, f18, f2, f0
  1752. FNMADD f1, f18, f3, f1
  1753. FMUL f4, f21, f1
  1754. FMUL f5, f21, f0
  1755. FMADD f0, f20, f0, f4
  1756. FMSUB f1, f20, f1, f5
  1757. #endif
  1758. #endif
  /*
   * Store the solved single-row tile and step the pointers.
   * The four results are written back into the panel they were read from
   * (BO for LN/LT, AO otherwise) and into the output columns: (f0,f1) to
   * CO1 and (f2,f3) to CO2.  There is no loop back here; control falls
   * through to LL(29).
   */
  1759. #ifdef LN
  /* LN walks the output backwards: step CO1/CO2 down before storing. */
  1760. subi CO1, CO1, 2 * SIZE
  1761. subi CO2, CO2, 2 * SIZE
  1762. #endif
  1763. #if defined(LN) || defined(LT)
  1764. STFD f0, 0 * SIZE(BO)
  1765. STFD f1, 1 * SIZE(BO)
  1766. STFD f2, 2 * SIZE(BO)
  1767. STFD f3, 3 * SIZE(BO)
  1768. #else
  1769. STFD f0, 0 * SIZE(AO)
  1770. STFD f1, 1 * SIZE(AO)
  1771. STFD f2, 2 * SIZE(AO)
  1772. STFD f3, 3 * SIZE(AO)
  1773. #endif
  1774. STFD f0, 0 * SIZE(CO1)
  1775. STFD f1, 1 * SIZE(CO1)
  1776. STFD f2, 0 * SIZE(CO2)
  1777. STFD f3, 1 * SIZE(CO2)
  1778. #ifndef LN
  1779. addi CO1, CO1, 2 * SIZE
  1780. addi CO2, CO2, 2 * SIZE
  1781. #endif
  1782. #ifdef RT
  /* RT: advance AORIG by K << ZBASE_SHIFT bytes (one row of A). */
  1783. slwi r0, K, 0 + ZBASE_SHIFT
  1784. add AORIG, AORIG, r0
  1785. #endif
  1786. #if defined(LT) || defined(RN)
  /* LT/RN: skip the remaining K - KK k-steps; AO uses the single-row
     stride, BO the two-column stride. */
  1787. sub TEMP, K, KK
  1788. slwi r0, TEMP, 0 + ZBASE_SHIFT
  1789. slwi TEMP, TEMP, 1 + ZBASE_SHIFT
  1790. add AO, AO, r0
  1791. add BO, BO, TEMP
  1792. #endif
  1793. #ifdef LT
  1794. addi KK, KK, 1
  1795. #endif
  1796. #ifdef LN
  1797. subi KK, KK, 1
  1798. #endif
  1799. .align 4
  /*
   * LL(29): end of one pass of the J loop (the pass that writes the CO1 and
   * CO2 columns).  Moves B to the next panel, steps KK by 2 for RN/RT,
   * decrements J and loops back to LL(10) (outside this view) while J > 0.
   */
  1800. LL(29):
  1801. #ifdef LN
  /* LN: B advances by K << (1 + ZBASE_SHIFT) bytes. */
  1802. slwi r0, K, 1 + ZBASE_SHIFT
  1803. add B, B, r0
  1804. #endif
  1805. #if defined(LT) || defined(RN)
  /* LT/RN: BO already points just past the panel that was consumed. */
  1806. mr B, BO
  1807. #endif
  1808. #ifdef RN
  1809. addi KK, KK, 2
  1810. #endif
  1811. #ifdef RT
  1812. subi KK, KK, 2
  1813. #endif
  1814. addic. J, J, -1
  1815. bgt LL(10)
  1816. .align 4
  /*
   * LL(999): common exit.  Sets r3 to 0 (the integer return-value register
   * in the PowerPC ABIs), reloads f14-f31 and r21-r31 from the stack frame
   * at SP, releases STACKSIZE bytes and returns.
   * The matching saves are in the prologue, outside this view.
   */
  1817. LL(999):
  /* addi with rA = 0 uses the literal 0, so this is r3 = 0. */
  1818. addi r3, 0, 0
  1819. lfd f14, 0(SP)
  1820. lfd f15, 8(SP)
  1821. lfd f16, 16(SP)
  1822. lfd f17, 24(SP)
  1823. lfd f18, 32(SP)
  1824. lfd f19, 40(SP)
  1825. lfd f20, 48(SP)
  1826. lfd f21, 56(SP)
  1827. lfd f22, 64(SP)
  1828. lfd f23, 72(SP)
  1829. lfd f24, 80(SP)
  1830. lfd f25, 88(SP)
  1831. lfd f26, 96(SP)
  1832. lfd f27, 104(SP)
  1833. lfd f28, 112(SP)
  1834. lfd f29, 120(SP)
  1835. lfd f30, 128(SP)
  1836. lfd f31, 136(SP)
  1837. #ifdef __64BIT__
  /* 64-bit build: integer registers occupy 8-byte slots from offset 144. */
  1838. ld r31, 144(SP)
  1839. ld r30, 152(SP)
  1840. ld r29, 160(SP)
  1841. ld r28, 168(SP)
  1842. ld r27, 176(SP)
  1843. ld r26, 184(SP)
  1844. ld r25, 192(SP)
  1845. ld r24, 200(SP)
  1846. ld r23, 208(SP)
  1847. ld r22, 216(SP)
  1848. ld r21, 224(SP)
  1849. #else
  /* 32-bit build: integer registers occupy 4-byte slots from offset 144. */
  1850. lwz r31, 144(SP)
  1851. lwz r30, 148(SP)
  1852. lwz r29, 152(SP)
  1853. lwz r28, 156(SP)
  1854. lwz r27, 160(SP)
  1855. lwz r26, 164(SP)
  1856. lwz r25, 168(SP)
  1857. lwz r24, 172(SP)
  1858. lwz r23, 176(SP)
  1859. lwz r22, 180(SP)
  1860. lwz r21, 184(SP)
  1861. #endif
  1862. addi SP, SP, STACKSIZE
  1863. blr
  /* EPILOGUE is a macro defined outside this view. */
  1864. EPILOGUE
  1865. #endif