You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

trsm_kernel_ppc440_RT.S 60 kB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761777177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999200020012002200320042005200620072008200920102011201220132014201520162017201820192020202120222023202420252026202720282029203020312032203320342035203620372038203920402041204220432044204520462047204820492050205120522053205420552056205720582059206020612062206320642065206620672068206920702071207220732074207520762077207820792080208120822083208420852086208720882089209020912092209320942095209620972098209921002101210221032104210521062107210821092110211121122113211421152116211721182119212021212122212321242125212621272128212921302131213221332134213521362137213821392140214121422143214421452146214721482149215021512152215321542155215621572158215921602161216221632164216521662167216821692170217121722173217421752176217721782179218021812182218321842185218621872188218921902191219221932194219521962197219821992200220122022203220422052206220722082209221022112212221322142215221622172218221922202221222222232224222522262227222822292230223122322233223422352236223722382239224022412242224322442245224622472248224922502251225222532254225522562257225822592260226122622263226422652266226722682269227022712272227322742275227622772278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277727782779278027812782278327842785278627872788278927902791279227932794279527962797279827992800280128022803280428052806280728082809281028112812281328142815281628172818281928202821282228232824282528262827282828292830283128322833283428352836283728382839284028412842284328442845284628472848284928502851285228532854285528562857285828592860286128622863286428652866286728682869287028712872287328742875287628772878287928802881288228832884288528862887288828892890289128922893289428952896289728982899290029012902290329042905290629072908290929102911291229132914291529162917291829192920292129222923292429252926292729282929293029312932293329342935293629372938293929402941294229432944294529462947294829492950295129522953295429552956295729582959296029612962296329642965296629672968296929702971297229732974297529762977297829792980298129822983298429852986298729882989299029912992299329942995299629972998299930003001300230033004300530063007300830093010301130123013301430153016301730183019302030213022302330243025302630273028302930303031303230333034303530363037303830393040304130423043304430453046304730483049305030513052305330543055305630573058305930603061306230633064306530663067306830693070307130723073307430753076307730783079308030813082308330843085308630873088308930903091309230933094309530963097309830993100310131023103310431053106310731083109311031113112311331143115311631173118311931203121312231233124312531263127312831293130313131323133313431353136313731383139314031413142314331443145314631473148314931503151315231533154315531563157315831593160316131623163316431653166316731683169317031713172317331743175317631773178317931803181318231833184318531863187318831893190319131923193319431953196319731983199320032013202320332043205320632073208320932103211321232133214321532163217321832193220322132223223322432253226322732283229323032313232323332343235323632373238323932403241324232433244324532463247324832493250325132523253325432553256325732583259326032613262326332643265326632673268326932703271327232733274327532763277327832793280328132823283328432853286328732883289329032913292329332943295329632973298329933003301330233033304330533063307330833093310331133123313331433153316331733183319332033213322332333243325332633273328332933303331333233333334333533363337333833393340334133423343334433453346334733483349335033513352335333543355335633573358335933603361336233633364336533663367336833693370337133723373337433753376337733783379338033813382338333843385338633873388338933903391339233933394339533963397339833993400340134023403340434053406340734083409341034113412341334143415341634173418341934203421342234233424342534263427342834293430343134323433343434353436343734383439344034413442344334443445344634473448344934503451345234533454345534563457345834593460346134623463346434653466346734683469347034713472347334743475347634773478347934803481348234833484348534863487348834893490349134923493349434953496
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #ifndef __64BIT__
  41. #define LOAD lwz
  42. #else
  43. #define LOAD ld
  44. #endif
  45. #ifdef __64BIT__
  46. #define STACKSIZE 320
  47. #define ALPHA 296(SP)
  48. #define FZERO 304(SP)
  49. #else
  50. #define STACKSIZE 240
  51. #define ALPHA 224(SP)
  52. #define FZERO 232(SP)
  53. #endif
  54. #define M r3
  55. #define N r4
  56. #define K r5
  57. #if defined(linux) || defined(__FreeBSD__)
  58. #ifndef __64BIT__
  59. #define A r6
  60. #define B r7
  61. #define C r8
  62. #define LDC r9
  63. #define OFFSET r10
  64. #else
  65. #define A r7
  66. #define B r8
  67. #define C r9
  68. #define LDC r10
  69. #define OFFSET r6
  70. #endif
  71. #endif
  72. #if defined(_AIX) || defined(__APPLE__)
  73. #if !defined(__64BIT__) && defined(DOUBLE)
  74. #define A r8
  75. #define B r9
  76. #define C r10
  77. #define LDC r7
  78. #define OFFSET r6
  79. #else
  80. #define A r7
  81. #define B r8
  82. #define C r9
  83. #define LDC r10
  84. #define OFFSET r6
  85. #endif
  86. #endif
  87. #define AORIG r18
  88. #define TEMP r19
  89. #define KK r20
  90. #define I r21
  91. #define J r22
  92. #define AO r23
  93. #define BO r24
  94. #define CO1 r25
  95. #define CO2 r26
  96. #define CO3 r27
  97. #define CO4 r28
  98. #define A1 f16
  99. #define A2 f17
  100. #define A3 f18
  101. #define A4 f19
  102. #define A5 f20
  103. #define A6 f21
  104. #define B1 f22
  105. #define B2 f23
  106. #define B3 f24
  107. #define B4 f25
  108. #define B5 f26
  109. #define B6 f27
  110. #define B7 f28
  111. #define B8 f29
  112. #define B9 f30
  113. #define B10 f31
  114. PROLOGUE
  115. PROFCODE
  116. addi SP, SP, -STACKSIZE
  117. li r0, 0
  118. stfd f14, 0(SP)
  119. stfd f15, 8(SP)
  120. stfd f16, 16(SP)
  121. stfd f17, 24(SP)
  122. stfd f18, 32(SP)
  123. stfd f19, 40(SP)
  124. stfd f20, 48(SP)
  125. stfd f21, 56(SP)
  126. stfd f22, 64(SP)
  127. stfd f23, 72(SP)
  128. stfd f24, 80(SP)
  129. stfd f25, 88(SP)
  130. stfd f26, 96(SP)
  131. stfd f27, 104(SP)
  132. stfd f28, 112(SP)
  133. stfd f29, 120(SP)
  134. stfd f30, 128(SP)
  135. stfd f31, 136(SP)
  136. #ifdef __64BIT__
  137. std r31, 144(SP)
  138. std r30, 152(SP)
  139. std r29, 160(SP)
  140. std r28, 168(SP)
  141. std r27, 176(SP)
  142. std r26, 184(SP)
  143. std r25, 192(SP)
  144. std r24, 200(SP)
  145. std r23, 208(SP)
  146. std r22, 216(SP)
  147. std r21, 224(SP)
  148. std r20, 232(SP)
  149. std r19, 240(SP)
  150. std r18, 248(SP)
  151. #else
  152. stw r31, 144(SP)
  153. stw r30, 148(SP)
  154. stw r29, 152(SP)
  155. stw r28, 156(SP)
  156. stw r27, 160(SP)
  157. stw r26, 164(SP)
  158. stw r25, 168(SP)
  159. stw r24, 172(SP)
  160. stw r23, 176(SP)
  161. stw r22, 180(SP)
  162. stw r21, 184(SP)
  163. stw r20, 188(SP)
  164. stw r19, 192(SP)
  165. stw r18, 196(SP)
  166. #endif
  167. stw r0, FZERO
  168. #if defined(_AIX) || defined(__APPLE__)
  169. #if !defined(__64BIT__) && defined(DOUBLE)
  170. lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
  171. #endif
  172. #endif
  173. slwi LDC, LDC, BASE_SHIFT
  174. #if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
  175. ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
  176. #endif
  177. #if defined(_AIX) || defined(__APPLE__)
  178. #ifdef __64BIT__
  179. ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
  180. #else
  181. #ifdef DOUBLE
  182. lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
  183. #else
  184. lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
  185. #endif
  186. #endif
  187. #endif
  188. #ifdef LN
  189. mullw r0, M, K
  190. slwi r0, r0, BASE_SHIFT
  191. add A, A, r0
  192. slwi r0, M, BASE_SHIFT
  193. add C, C, r0
  194. #endif
  195. #ifdef RN
  196. neg KK, OFFSET
  197. #endif
  198. #ifdef RT
  199. mullw r0, N, K
  200. slwi r0, r0, BASE_SHIFT
  201. add B, B, r0
  202. mullw r0, N, LDC
  203. add C, C, r0
  204. sub KK, N, OFFSET
  205. #endif
  206. cmpwi cr0, M, 0
  207. ble .L999
  208. cmpwi cr0, N, 0
  209. ble .L999
  210. cmpwi cr0, K, 0
  211. ble .L999
  212. lfs f0, FZERO
  213. .L70:
  214. andi. J, N, 1
  215. ble .L40
  216. #ifdef RT
  217. slwi r0, K, 0 + BASE_SHIFT
  218. sub B, B, r0
  219. sub C, C, LDC
  220. #endif
  221. mr CO1, C
  222. #ifdef LN
  223. add KK, M, OFFSET
  224. #endif
  225. #ifdef LT
  226. mr KK, OFFSET
  227. #endif
  228. fmr f1, f0
  229. fmr f2, f0
  230. fmr f3, f0
  231. srawi. I, M, 2
  232. #if defined(LN) || defined(RT)
  233. mr AORIG, A
  234. #else
  235. mr AO, A
  236. #endif
  237. #ifndef RT
  238. add C, CO1, LDC
  239. #endif
  240. ble .L80
  241. .align 4
  242. .L71:
  243. #if defined(LT) || defined(RN)
  244. LFD f16, 0 * SIZE(AO)
  245. LFD f17, 1 * SIZE(AO)
  246. LFD f18, 2 * SIZE(AO)
  247. LFD f19, 3 * SIZE(AO)
  248. LFD f20, 0 * SIZE(B)
  249. LFD f21, 1 * SIZE(B)
  250. LFD f22, 2 * SIZE(B)
  251. LFD f23, 3 * SIZE(B)
  252. srawi. r0, KK, 2
  253. mtspr CTR, r0
  254. mr BO, B
  255. #else
  256. #ifdef LN
  257. slwi r0, K, 2 + BASE_SHIFT
  258. sub AORIG, AORIG, r0
  259. #endif
  260. slwi r0, KK, 2 + BASE_SHIFT
  261. slwi TEMP, KK, 0 + BASE_SHIFT
  262. add AO, AORIG, r0
  263. add BO, B, TEMP
  264. sub TEMP, K, KK
  265. LFD f16, 0 * SIZE(AO)
  266. LFD f17, 1 * SIZE(AO)
  267. LFD f18, 2 * SIZE(AO)
  268. LFD f19, 3 * SIZE(AO)
  269. LFD f20, 0 * SIZE(BO)
  270. LFD f21, 1 * SIZE(BO)
  271. LFD f22, 2 * SIZE(BO)
  272. LFD f23, 3 * SIZE(BO)
  273. srawi. r0, TEMP, 2
  274. mtspr CTR, r0
  275. #endif
  276. ble .L75
  277. .align 5
  278. .L72:
  279. FMADD f0, f16, f20, f0
  280. LFD f16, 4 * SIZE(AO)
  281. FMADD f1, f17, f20, f1
  282. LFD f17, 5 * SIZE(AO)
  283. FMADD f2, f18, f20, f2
  284. LFD f18, 6 * SIZE(AO)
  285. FMADD f3, f19, f20, f3
  286. LFD f19, 7 * SIZE(AO)
  287. LFDU f20, 4 * SIZE(BO)
  288. FMADD f0, f16, f21, f0
  289. LFD f16, 8 * SIZE(AO)
  290. FMADD f1, f17, f21, f1
  291. LFD f17, 9 * SIZE(AO)
  292. FMADD f2, f18, f21, f2
  293. LFD f18, 10 * SIZE(AO)
  294. FMADD f3, f19, f21, f3
  295. LFD f19, 11 * SIZE(AO)
  296. LFD f21, 1 * SIZE(BO)
  297. FMADD f0, f16, f22, f0
  298. LFD f16, 12 * SIZE(AO)
  299. FMADD f1, f17, f22, f1
  300. LFD f17, 13 * SIZE(AO)
  301. FMADD f2, f18, f22, f2
  302. LFD f18, 14 * SIZE(AO)
  303. FMADD f3, f19, f22, f3
  304. LFD f19, 15 * SIZE(AO)
  305. LFD f22, 2 * SIZE(BO)
  306. FMADD f0, f16, f23, f0
  307. LFDU f16, 16 * SIZE(AO)
  308. FMADD f1, f17, f23, f1
  309. LFD f17, 1 * SIZE(AO)
  310. FMADD f2, f18, f23, f2
  311. LFD f18, 2 * SIZE(AO)
  312. FMADD f3, f19, f23, f3
  313. LFD f19, 3 * SIZE(AO)
  314. LFD f23, 3 * SIZE(BO)
  315. bdnz .L72
  316. .align 4
  317. .L75:
  318. #if defined(LT) || defined(RN)
  319. andi. r0, KK, 3
  320. #else
  321. andi. r0, TEMP, 3
  322. #endif
  323. mtspr CTR, r0
  324. ble+ .L78
  325. .align 4
  326. .L76:
  327. FMADD f0, f16, f20, f0
  328. LFDU f16, 4 * SIZE(AO)
  329. FMADD f1, f17, f20, f1
  330. LFD f17, 1 * SIZE(AO)
  331. FMADD f2, f18, f20, f2
  332. LFD f18, 2 * SIZE(AO)
  333. FMADD f3, f19, f20, f3
  334. LFDU f20, 1 * SIZE(BO)
  335. LFD f19, 3 * SIZE(AO)
  336. bdnz .L76
  337. .align 4
  338. .L78:
  339. #if defined(LN) || defined(RT)
  340. #ifdef LN
  341. subi r0, KK, 4
  342. #else
  343. subi r0, KK, 1
  344. #endif
  345. slwi TEMP, r0, 2 + BASE_SHIFT
  346. slwi r0, r0, 0 + BASE_SHIFT
  347. add AO, AORIG, TEMP
  348. add BO, B, r0
  349. #endif
  350. #if defined(LN) || defined(LT)
  351. LFD f16, 0 * SIZE(BO)
  352. LFD f20, 1 * SIZE(BO)
  353. LFD f24, 2 * SIZE(BO)
  354. LFD f28, 3 * SIZE(BO)
  355. FSUB f0, f16, f0
  356. FSUB f1, f20, f1
  357. FSUB f2, f24, f2
  358. FSUB f3, f28, f3
  359. #else
  360. LFD f16, 0 * SIZE(AO)
  361. LFD f17, 1 * SIZE(AO)
  362. LFD f18, 2 * SIZE(AO)
  363. LFD f19, 3 * SIZE(AO)
  364. FSUB f0, f16, f0
  365. FSUB f1, f17, f1
  366. FSUB f2, f18, f2
  367. FSUB f3, f19, f3
  368. #endif
  369. #ifdef LN
  370. LFD f16, 15 * SIZE(AO)
  371. LFD f17, 14 * SIZE(AO)
  372. LFD f18, 13 * SIZE(AO)
  373. LFD f19, 12 * SIZE(AO)
  374. FMUL f3, f16, f3
  375. FNMSUB f2, f17, f3, f2
  376. FNMSUB f1, f18, f3, f1
  377. FNMSUB f0, f19, f3, f0
  378. LFD f16, 10 * SIZE(AO)
  379. LFD f17, 9 * SIZE(AO)
  380. LFD f18, 8 * SIZE(AO)
  381. LFD f19, 5 * SIZE(AO)
  382. LFD f20, 4 * SIZE(AO)
  383. LFD f21, 0 * SIZE(AO)
  384. FMUL f2, f16, f2
  385. FNMSUB f1, f17, f2, f1
  386. FNMSUB f0, f18, f2, f0
  387. FMUL f1, f19, f1
  388. FNMSUB f0, f20, f1, f0
  389. FMUL f0, f21, f0
  390. #endif
  391. #ifdef LT
  392. LFD f16, 0 * SIZE(AO)
  393. LFD f17, 1 * SIZE(AO)
  394. LFD f18, 2 * SIZE(AO)
  395. LFD f19, 3 * SIZE(AO)
  396. FMUL f0, f16, f0
  397. FNMSUB f1, f17, f0, f1
  398. FNMSUB f2, f18, f0, f2
  399. FNMSUB f3, f19, f0, f3
  400. LFD f17, 5 * SIZE(AO)
  401. LFD f18, 6 * SIZE(AO)
  402. LFD f19, 7 * SIZE(AO)
  403. FMUL f1, f17, f1
  404. FNMSUB f2, f18, f1, f2
  405. FNMSUB f3, f19, f1, f3
  406. LFD f18, 10 * SIZE(AO)
  407. LFD f19, 11 * SIZE(AO)
  408. FMUL f2, f18, f2
  409. FNMSUB f3, f19, f2, f3
  410. LFD f19, 15 * SIZE(AO)
  411. FMUL f3, f19, f3
  412. #endif
  413. #ifdef RN
  414. LFD f16, 0 * SIZE(BO)
  415. FMUL f0, f16, f0
  416. FMUL f1, f16, f1
  417. FMUL f2, f16, f2
  418. FMUL f3, f16, f3
  419. #endif
  420. #ifdef RT
  421. LFD f21, 0 * SIZE(BO)
  422. FMUL f0, f21, f0
  423. FMUL f1, f21, f1
  424. FMUL f2, f21, f2
  425. FMUL f3, f21, f3
  426. #endif
  427. #ifdef LN
  428. subi CO1, CO1, 4 * SIZE
  429. #endif
  430. #if defined(LN) || defined(LT)
  431. STFD f0, 0 * SIZE(BO)
  432. STFD f1, 1 * SIZE(BO)
  433. STFD f2, 2 * SIZE(BO)
  434. STFD f3, 3 * SIZE(BO)
  435. #else
  436. STFD f0, 0 * SIZE(AO)
  437. STFD f1, 1 * SIZE(AO)
  438. STFD f2, 2 * SIZE(AO)
  439. STFD f3, 3 * SIZE(AO)
  440. #endif
  441. STFD f0, 0 * SIZE(CO1)
  442. STFD f1, 1 * SIZE(CO1)
  443. STFD f2, 2 * SIZE(CO1)
  444. STFD f3, 3 * SIZE(CO1)
  445. lfs f0, FZERO
  446. fmr f1, f0
  447. fmr f2, f0
  448. fmr f3, f0
  449. #ifndef LN
  450. addi CO1, CO1, 4 * SIZE
  451. #endif
  452. #ifdef RT
  453. slwi r0, K, 2 + BASE_SHIFT
  454. add AORIG, AORIG, r0
  455. #endif
  456. #if defined(LT) || defined(RN)
  457. sub TEMP, K, KK
  458. slwi r0, TEMP, 2 + BASE_SHIFT
  459. slwi TEMP, TEMP, 0 + BASE_SHIFT
  460. add AO, AO, r0
  461. add BO, BO, TEMP
  462. #endif
  463. #ifdef LN
  464. subi KK, KK, 4
  465. #endif
  466. #ifdef LT
  467. addi KK, KK, 4
  468. #endif
  469. addic. I, I, -1
  470. bgt+ .L71
  471. .align 4
  472. .L80:
  473. andi. I, M, 2
  474. ble .L90
  475. #if defined(LT) || defined(RN)
  476. LFD f16, 0 * SIZE(AO)
  477. LFD f17, 1 * SIZE(AO)
  478. LFD f18, 2 * SIZE(AO)
  479. LFD f19, 3 * SIZE(AO)
  480. LFD f20, 0 * SIZE(B)
  481. LFD f21, 1 * SIZE(B)
  482. LFD f22, 2 * SIZE(B)
  483. LFD f23, 3 * SIZE(B)
  484. srawi. r0, KK, 2
  485. mtspr CTR, r0
  486. mr BO, B
  487. #else
  488. #ifdef LN
  489. slwi r0, K, 1 + BASE_SHIFT
  490. sub AORIG, AORIG, r0
  491. #endif
  492. slwi r0, KK, 1 + BASE_SHIFT
  493. slwi TEMP, KK, 0 + BASE_SHIFT
  494. add AO, AORIG, r0
  495. add BO, B, TEMP
  496. sub TEMP, K, KK
  497. LFD f16, 0 * SIZE(AO)
  498. LFD f17, 1 * SIZE(AO)
  499. LFD f18, 2 * SIZE(AO)
  500. LFD f19, 3 * SIZE(AO)
  501. LFD f20, 0 * SIZE(BO)
  502. LFD f21, 1 * SIZE(BO)
  503. LFD f22, 2 * SIZE(BO)
  504. LFD f23, 3 * SIZE(BO)
  505. srawi. r0, TEMP, 2
  506. mtspr CTR, r0
  507. #endif
  508. ble .L85
  509. .align 5
  510. .L82:
  511. FMADD f0, f16, f20, f0
  512. LFD f16, 4 * SIZE(AO)
  513. FMADD f1, f17, f20, f1
  514. LFDU f20, 4 * SIZE(BO)
  515. LFD f17, 5 * SIZE(AO)
  516. FMADD f2, f18, f21, f2
  517. LFD f18, 6 * SIZE(AO)
  518. FMADD f3, f19, f21, f3
  519. LFD f21, 1 * SIZE(BO)
  520. LFD f19, 7 * SIZE(AO)
  521. FMADD f0, f16, f22, f0
  522. LFDU f16, 8 * SIZE(AO)
  523. FMADD f1, f17, f22, f1
  524. LFD f22, 2 * SIZE(BO)
  525. LFD f17, 1 * SIZE(AO)
  526. FMADD f2, f18, f23, f2
  527. LFD f18, 2 * SIZE(AO)
  528. FMADD f3, f19, f23, f3
  529. LFD f23, 3 * SIZE(BO)
  530. LFD f19, 3 * SIZE(AO)
  531. bdnz .L82
  532. .align 4
  533. .L85:
  534. #if defined(LT) || defined(RN)
  535. andi. r0, KK, 3
  536. #else
  537. andi. r0, TEMP, 3
  538. #endif
  539. mtspr CTR, r0
  540. ble+ .L88
  541. .align 4
  542. .L86:
  543. FMADD f0, f16, f20, f0
  544. LFDU f16, 2 * SIZE(AO)
  545. FMADD f1, f17, f20, f1
  546. LFDU f20, 1 * SIZE(BO)
  547. LFD f17, 1 * SIZE(AO)
  548. bdnz .L86
  549. .align 4
  550. .L88:
  551. FADD f0, f2, f0
  552. FADD f1, f3, f1
  553. #if defined(LN) || defined(RT)
  554. #ifdef LN
  555. subi r0, KK, 2
  556. #else
  557. subi r0, KK, 1
  558. #endif
  559. slwi TEMP, r0, 1 + BASE_SHIFT
  560. slwi r0, r0, 0 + BASE_SHIFT
  561. add AO, AORIG, TEMP
  562. add BO, B, r0
  563. #endif
  564. #if defined(LN) || defined(LT)
  565. LFD f16, 0 * SIZE(BO)
  566. LFD f20, 1 * SIZE(BO)
  567. FSUB f0, f16, f0
  568. FSUB f1, f20, f1
  569. #else
  570. LFD f16, 0 * SIZE(AO)
  571. LFD f17, 1 * SIZE(AO)
  572. FSUB f0, f16, f0
  573. FSUB f1, f17, f1
  574. #endif
  575. #ifdef LN
  576. LFD f19, 3 * SIZE(AO)
  577. LFD f20, 2 * SIZE(AO)
  578. LFD f21, 0 * SIZE(AO)
  579. FMUL f1, f19, f1
  580. FNMSUB f0, f20, f1, f0
  581. FMUL f0, f21, f0
  582. #endif
  583. #ifdef LT
  584. LFD f16, 0 * SIZE(AO)
  585. LFD f17, 1 * SIZE(AO)
  586. FMUL f0, f16, f0
  587. FNMSUB f1, f17, f0, f1
  588. LFD f17, 3 * SIZE(AO)
  589. FMUL f1, f17, f1
  590. #endif
  591. #ifdef RN
  592. LFD f16, 0 * SIZE(BO)
  593. FMUL f0, f16, f0
  594. FMUL f1, f16, f1
  595. #endif
  596. #ifdef RT
  597. LFD f21, 0 * SIZE(BO)
  598. FMUL f0, f21, f0
  599. FMUL f1, f21, f1
  600. #endif
  601. #ifdef LN
  602. subi CO1, CO1, 2 * SIZE
  603. #endif
  604. #if defined(LN) || defined(LT)
  605. STFD f0, 0 * SIZE(BO)
  606. STFD f1, 1 * SIZE(BO)
  607. #else
  608. STFD f0, 0 * SIZE(AO)
  609. STFD f1, 1 * SIZE(AO)
  610. #endif
  611. STFD f0, 0 * SIZE(CO1)
  612. STFD f1, 1 * SIZE(CO1)
  613. lfs f0, FZERO
  614. fmr f1, f0
  615. fmr f2, f0
  616. fmr f3, f0
  617. #ifndef LN
  618. addi CO1, CO1, 2 * SIZE
  619. #endif
  620. #ifdef RT
  621. slwi r0, K, 1 + BASE_SHIFT
  622. add AORIG, AORIG, r0
  623. #endif
  624. #if defined(LT) || defined(RN)
  625. sub TEMP, K, KK
  626. slwi r0, TEMP, 1 + BASE_SHIFT
  627. slwi TEMP, TEMP, 0 + BASE_SHIFT
  628. add AO, AO, r0
  629. add BO, BO, TEMP
  630. #endif
  631. #ifdef LN
  632. subi KK, KK, 2
  633. #endif
  634. #ifdef LT
  635. addi KK, KK, 2
  636. #endif
  637. .align 4
  638. .L90:
  639. andi. I, M, 1
  640. ble .L99
  641. #if defined(LT) || defined(RN)
  642. LFD f16, 0 * SIZE(AO)
  643. LFD f17, 1 * SIZE(AO)
  644. LFD f18, 2 * SIZE(AO)
  645. LFD f19, 3 * SIZE(AO)
  646. LFD f20, 0 * SIZE(B)
  647. LFD f21, 1 * SIZE(B)
  648. LFD f22, 2 * SIZE(B)
  649. LFD f23, 3 * SIZE(B)
  650. srawi. r0, KK, 3
  651. mtspr CTR, r0
  652. mr BO, B
  653. #else
  654. #ifdef LN
  655. slwi r0, K, BASE_SHIFT
  656. sub AORIG, AORIG, r0
  657. #endif
  658. slwi r0, KK, 0 + BASE_SHIFT
  659. slwi TEMP, KK, 0 + BASE_SHIFT
  660. add AO, AORIG, r0
  661. add BO, B, TEMP
  662. sub TEMP, K, KK
  663. LFD f16, 0 * SIZE(AO)
  664. LFD f17, 1 * SIZE(AO)
  665. LFD f18, 2 * SIZE(AO)
  666. LFD f19, 3 * SIZE(AO)
  667. LFD f20, 0 * SIZE(BO)
  668. LFD f21, 1 * SIZE(BO)
  669. LFD f22, 2 * SIZE(BO)
  670. LFD f23, 3 * SIZE(BO)
  671. srawi. r0, TEMP, 3
  672. mtspr CTR, r0
  673. #endif
  674. ble .L95
  675. .align 5
  676. .L92:
  677. FMADD f0, f16, f20, f0
  678. LFD f16, 4 * SIZE(AO)
  679. LFD f20, 4 * SIZE(BO)
  680. FMADD f1, f17, f21, f1
  681. LFD f17, 5 * SIZE(AO)
  682. LFD f21, 5 * SIZE(BO)
  683. FMADD f2, f18, f22, f2
  684. LFD f18, 6 * SIZE(AO)
  685. LFD f22, 6 * SIZE(BO)
  686. FMADD f3, f19, f23, f3
  687. LFD f19, 7 * SIZE(AO)
  688. LFD f23, 7 * SIZE(BO)
  689. FMADD f0, f16, f20, f0
  690. LFDU f16, 8 * SIZE(AO)
  691. LFDU f20, 8 * SIZE(BO)
  692. FMADD f1, f17, f21, f1
  693. LFD f17, 1 * SIZE(AO)
  694. LFD f21, 1 * SIZE(BO)
  695. FMADD f2, f18, f22, f2
  696. LFD f18, 2 * SIZE(AO)
  697. LFD f22, 2 * SIZE(BO)
  698. FMADD f3, f19, f23, f3
  699. LFD f19, 3 * SIZE(AO)
  700. LFD f23, 3 * SIZE(BO)
  701. bdnz .L92
  702. .align 4
  703. .L95:
  704. #if defined(LT) || defined(RN)
  705. andi. r0, KK, 7
  706. #else
  707. andi. r0, TEMP, 7
  708. #endif
  709. mtspr CTR, r0
  710. ble+ .L98
  711. .align 4
  712. .L96:
  713. FMADD f0, f16, f20, f0
  714. LFDU f16, 1 * SIZE(AO)
  715. LFDU f20, 1 * SIZE(BO)
  716. bdnz .L96
  717. .align 4
  718. .L98:
  719. FADD f0, f1, f0
  720. FADD f2, f3, f2
  721. FADD f0, f2, f0
  722. #if defined(LN) || defined(RT)
  723. #ifdef LN
  724. subi r0, KK, 1
  725. #else
  726. subi r0, KK, 1
  727. #endif
  728. slwi TEMP, r0, 0 + BASE_SHIFT
  729. slwi r0, r0, 0 + BASE_SHIFT
  730. add AO, AORIG, TEMP
  731. add BO, B, r0
  732. #endif
  733. #if defined(LN) || defined(LT)
  734. LFD f16, 0 * SIZE(BO)
  735. FSUB f0, f16, f0
  736. #else
  737. LFD f16, 0 * SIZE(AO)
  738. FSUB f0, f16, f0
  739. #endif
  740. #ifdef LN
  741. LFD f21, 0 * SIZE(AO)
  742. FMUL f0, f21, f0
  743. #endif
  744. #ifdef LT
  745. LFD f16, 0 * SIZE(AO)
  746. FMUL f0, f16, f0
  747. #endif
  748. #ifdef RN
  749. LFD f16, 0 * SIZE(BO)
  750. FMUL f0, f16, f0
  751. #endif
  752. #ifdef RT
  753. LFD f21, 0 * SIZE(BO)
  754. FMUL f0, f21, f0
  755. #endif
  756. #ifdef LN
  757. subi CO1, CO1, 1 * SIZE
  758. #endif
  759. #if defined(LN) || defined(LT)
  760. STFD f0, 0 * SIZE(BO)
  761. #else
  762. STFD f0, 0 * SIZE(AO)
  763. #endif
  764. STFD f0, 0 * SIZE(CO1)
  765. lfs f0, FZERO
  766. #ifndef LN
  767. addi CO1, CO1, 1 * SIZE
  768. #endif
  769. #ifdef RT
  770. slwi r0, K, 0 + BASE_SHIFT
  771. add AORIG, AORIG, r0
  772. #endif
  773. #if defined(LT) || defined(RN)
  774. sub TEMP, K, KK
  775. slwi r0, TEMP, 0 + BASE_SHIFT
  776. slwi TEMP, TEMP, 0 + BASE_SHIFT
  777. add AO, AO, r0
  778. add BO, BO, TEMP
  779. #endif
  780. #ifdef LN
  781. subi KK, KK, 1
  782. #endif
  783. #ifdef LT
  784. addi KK, KK, 1
  785. #endif
  786. .align 4
  787. .L99:
  788. #ifdef LN
  789. slwi r0, K, 0 + BASE_SHIFT
  790. add B, B, r0
  791. #endif
  792. #if defined(LT) || defined(RN)
  793. mr B, BO
  794. #endif
  795. #ifdef RN
  796. addi KK, KK, 1
  797. #endif
  798. #ifdef RT
  799. subi KK, KK, 1
  800. #endif
  801. .align 4
  802. .L40:
  803. andi. J, N, 2
  804. ble .L09
  805. #ifdef RT
  806. slwi r0, K, 1 + BASE_SHIFT
  807. sub B, B, r0
  808. slwi r0, LDC, 1
  809. sub C, C, r0
  810. #endif
  811. mr CO1, C
  812. add CO2, C, LDC
  813. #ifdef LN
  814. add KK, M, OFFSET
  815. #endif
  816. #ifdef LT
  817. mr KK, OFFSET
  818. #endif
  819. fmr f1, f0
  820. fmr f2, f0
  821. fmr f3, f0
  822. fmr f4, f0
  823. fmr f5, f0
  824. fmr f6, f0
  825. fmr f7, f0
  826. srawi. I, M, 2
  827. #if defined(LN) || defined(RT)
  828. mr AORIG, A
  829. #else
  830. mr AO, A
  831. #endif
  832. #ifndef RT
  833. add C, CO2, LDC
  834. #endif
  835. ble .L50
  836. .align 4
  837. .L41:
  838. #if defined(LT) || defined(RN)
  839. LFD f16, 0 * SIZE(AO)
  840. LFD f17, 1 * SIZE(AO)
  841. LFD f18, 2 * SIZE(AO)
  842. LFD f19, 3 * SIZE(AO)
  843. LFD f20, 0 * SIZE(B)
  844. LFD f21, 1 * SIZE(B)
  845. LFD f22, 2 * SIZE(B)
  846. LFD f23, 3 * SIZE(B)
  847. srawi. r0, KK, 2
  848. mtspr CTR, r0
  849. mr BO, B
  850. #else
  851. #ifdef LN
  852. slwi r0, K, 2 + BASE_SHIFT
  853. sub AORIG, AORIG, r0
  854. #endif
  855. slwi r0, KK, 2 + BASE_SHIFT
  856. slwi TEMP, KK, 1 + BASE_SHIFT
  857. add AO, AORIG, r0
  858. add BO, B, TEMP
  859. sub TEMP, K, KK
  860. LFD f16, 0 * SIZE(AO)
  861. LFD f17, 1 * SIZE(AO)
  862. LFD f18, 2 * SIZE(AO)
  863. LFD f19, 3 * SIZE(AO)
  864. LFD f20, 0 * SIZE(BO)
  865. LFD f21, 1 * SIZE(BO)
  866. LFD f22, 2 * SIZE(BO)
  867. LFD f23, 3 * SIZE(BO)
  868. srawi. r0, TEMP, 2
  869. mtspr CTR, r0
  870. #endif
  871. ble .L45
  872. .align 5
  873. .L42:
  874. FMADD f0, f16, f20, f0
  875. FMADD f1, f17, f20, f1
  876. FMADD f2, f18, f20, f2
  877. FMADD f3, f19, f20, f3
  878. LFD f20, 4 * SIZE(BO)
  879. FMADD f4, f16, f21, f4
  880. LFD f16, 4 * SIZE(AO)
  881. FMADD f5, f17, f21, f5
  882. LFD f17, 5 * SIZE(AO)
  883. FMADD f6, f18, f21, f6
  884. LFD f18, 6 * SIZE(AO)
  885. FMADD f7, f19, f21, f7
  886. LFD f19, 7 * SIZE(AO)
  887. FMADD f0, f16, f22, f0
  888. LFD f21, 5 * SIZE(BO)
  889. FMADD f1, f17, f22, f1
  890. FMADD f2, f18, f22, f2
  891. FMADD f3, f19, f22, f3
  892. LFD f22, 6 * SIZE(BO)
  893. FMADD f4, f16, f23, f4
  894. LFD f16, 8 * SIZE(AO)
  895. FMADD f5, f17, f23, f5
  896. LFD f17, 9 * SIZE(AO)
  897. FMADD f6, f18, f23, f6
  898. LFD f18, 10 * SIZE(AO)
  899. FMADD f7, f19, f23, f7
  900. LFD f19, 11 * SIZE(AO)
  901. FMADD f0, f16, f20, f0
  902. LFD f23, 7 * SIZE(BO)
  903. FMADD f1, f17, f20, f1
  904. FMADD f2, f18, f20, f2
  905. FMADD f3, f19, f20, f3
  906. LFDU f20, 8 * SIZE(BO)
  907. FMADD f4, f16, f21, f4
  908. LFD f16, 12 * SIZE(AO)
  909. FMADD f5, f17, f21, f5
  910. LFD f17, 13 * SIZE(AO)
  911. FMADD f6, f18, f21, f6
  912. LFD f18, 14 * SIZE(AO)
  913. FMADD f7, f19, f21, f7
  914. LFD f19, 15 * SIZE(AO)
  915. FMADD f0, f16, f22, f0
  916. LFD f21, 1 * SIZE(BO)
  917. FMADD f1, f17, f22, f1
  918. FMADD f2, f18, f22, f2
  919. FMADD f3, f19, f22, f3
  920. LFD f22, 2 * SIZE(BO)
  921. FMADD f4, f16, f23, f4
  922. LFDU f16, 16 * SIZE(AO)
  923. FMADD f5, f17, f23, f5
  924. LFD f17, 1 * SIZE(AO)
  925. FMADD f6, f18, f23, f6
  926. LFD f18, 2 * SIZE(AO)
  927. FMADD f7, f19, f23, f7
  928. LFD f19, 3 * SIZE(AO)
  929. LFD f23, 3 * SIZE(BO)
  930. bdnz .L42
  931. .align 4
  932. .L45:
  933. #if defined(LT) || defined(RN)
  934. andi. r0, KK, 3
  935. #else
  936. andi. r0, TEMP, 3
  937. #endif
  938. mtspr CTR, r0
  939. ble+ .L48
  940. .align 4
  941. .L46:
  942. FMADD f0, f16, f20, f0
  943. FMADD f1, f17, f20, f1
  944. FMADD f2, f18, f20, f2
  945. FMADD f3, f19, f20, f3
  946. LFDU f20, 2 * SIZE(BO)
  947. FMADD f4, f16, f21, f4
  948. LFDU f16, 4 * SIZE(AO)
  949. FMADD f5, f17, f21, f5
  950. LFD f17, 1 * SIZE(AO)
  951. FMADD f6, f18, f21, f6
  952. LFD f18, 2 * SIZE(AO)
  953. FMADD f7, f19, f21, f7
  954. LFD f19, 3 * SIZE(AO)
  955. LFD f21, 1 * SIZE(BO)
  956. bdnz .L46
  957. .align 4
  958. .L48:
  959. #if defined(LN) || defined(RT)
  960. #ifdef LN
  961. subi r0, KK, 4
  962. #else
  963. subi r0, KK, 2
  964. #endif
  965. slwi TEMP, r0, 2 + BASE_SHIFT
  966. slwi r0, r0, 1 + BASE_SHIFT
  967. add AO, AORIG, TEMP
  968. add BO, B, r0
  969. #endif
  970. #if defined(LN) || defined(LT)
  971. LFD f16, 0 * SIZE(BO)
  972. LFD f17, 1 * SIZE(BO)
  973. LFD f20, 2 * SIZE(BO)
  974. LFD f21, 3 * SIZE(BO)
  975. LFD f24, 4 * SIZE(BO)
  976. LFD f25, 5 * SIZE(BO)
  977. LFD f28, 6 * SIZE(BO)
  978. LFD f29, 7 * SIZE(BO)
  979. FSUB f0, f16, f0
  980. FSUB f4, f17, f4
  981. FSUB f1, f20, f1
  982. FSUB f5, f21, f5
  983. FSUB f2, f24, f2
  984. FSUB f6, f25, f6
  985. FSUB f3, f28, f3
  986. FSUB f7, f29, f7
  987. #else
  988. LFD f16, 0 * SIZE(AO)
  989. LFD f17, 1 * SIZE(AO)
  990. LFD f18, 2 * SIZE(AO)
  991. LFD f19, 3 * SIZE(AO)
  992. LFD f20, 4 * SIZE(AO)
  993. LFD f21, 5 * SIZE(AO)
  994. LFD f22, 6 * SIZE(AO)
  995. LFD f23, 7 * SIZE(AO)
  996. FSUB f0, f16, f0
  997. FSUB f1, f17, f1
  998. FSUB f2, f18, f2
  999. FSUB f3, f19, f3
  1000. FSUB f4, f20, f4
  1001. FSUB f5, f21, f5
  1002. FSUB f6, f22, f6
  1003. FSUB f7, f23, f7
  1004. #endif
  1005. #ifdef LN
  1006. LFD f16, 15 * SIZE(AO)
  1007. LFD f17, 14 * SIZE(AO)
  1008. LFD f18, 13 * SIZE(AO)
  1009. LFD f19, 12 * SIZE(AO)
  1010. FMUL f3, f16, f3
  1011. FMUL f7, f16, f7
  1012. FNMSUB f2, f17, f3, f2
  1013. FNMSUB f6, f17, f7, f6
  1014. FNMSUB f1, f18, f3, f1
  1015. FNMSUB f5, f18, f7, f5
  1016. FNMSUB f0, f19, f3, f0
  1017. FNMSUB f4, f19, f7, f4
  1018. LFD f16, 10 * SIZE(AO)
  1019. LFD f17, 9 * SIZE(AO)
  1020. LFD f18, 8 * SIZE(AO)
  1021. LFD f19, 5 * SIZE(AO)
  1022. LFD f20, 4 * SIZE(AO)
  1023. LFD f21, 0 * SIZE(AO)
  1024. FMUL f2, f16, f2
  1025. FMUL f6, f16, f6
  1026. FNMSUB f1, f17, f2, f1
  1027. FNMSUB f5, f17, f6, f5
  1028. FNMSUB f0, f18, f2, f0
  1029. FNMSUB f4, f18, f6, f4
  1030. FMUL f1, f19, f1
  1031. FMUL f5, f19, f5
  1032. FNMSUB f0, f20, f1, f0
  1033. FNMSUB f4, f20, f5, f4
  1034. FMUL f0, f21, f0
  1035. FMUL f4, f21, f4
  1036. #endif
  1037. #ifdef LT
  1038. LFD f16, 0 * SIZE(AO)
  1039. LFD f17, 1 * SIZE(AO)
  1040. LFD f18, 2 * SIZE(AO)
  1041. LFD f19, 3 * SIZE(AO)
  1042. FMUL f0, f16, f0
  1043. FMUL f4, f16, f4
  1044. FNMSUB f1, f17, f0, f1
  1045. FNMSUB f5, f17, f4, f5
  1046. FNMSUB f2, f18, f0, f2
  1047. FNMSUB f6, f18, f4, f6
  1048. FNMSUB f3, f19, f0, f3
  1049. FNMSUB f7, f19, f4, f7
  1050. LFD f17, 5 * SIZE(AO)
  1051. LFD f18, 6 * SIZE(AO)
  1052. LFD f19, 7 * SIZE(AO)
  1053. FMUL f1, f17, f1
  1054. FMUL f5, f17, f5
  1055. FNMSUB f2, f18, f1, f2
  1056. FNMSUB f6, f18, f5, f6
  1057. FNMSUB f3, f19, f1, f3
  1058. FNMSUB f7, f19, f5, f7
  1059. LFD f18, 10 * SIZE(AO)
  1060. LFD f19, 11 * SIZE(AO)
  1061. FMUL f2, f18, f2
  1062. FMUL f6, f18, f6
  1063. FNMSUB f3, f19, f2, f3
  1064. FNMSUB f7, f19, f6, f7
  1065. LFD f19, 15 * SIZE(AO)
  1066. FMUL f3, f19, f3
  1067. FMUL f7, f19, f7
  1068. #endif
  1069. #ifdef RN
  1070. LFD f16, 0 * SIZE(BO)
  1071. LFD f17, 1 * SIZE(BO)
  1072. LFD f18, 3 * SIZE(BO)
  1073. FMUL f0, f16, f0
  1074. FMUL f1, f16, f1
  1075. FMUL f2, f16, f2
  1076. FMUL f3, f16, f3
  1077. FNMSUB f4, f17, f0, f4
  1078. FNMSUB f5, f17, f1, f5
  1079. FNMSUB f6, f17, f2, f6
  1080. FNMSUB f7, f17, f3, f7
  1081. FMUL f4, f18, f4
  1082. FMUL f5, f18, f5
  1083. FMUL f6, f18, f6
  1084. FMUL f7, f18, f7
  1085. #endif
  1086. #ifdef RT
  1087. LFD f19, 3 * SIZE(BO)
  1088. LFD f20, 2 * SIZE(BO)
  1089. LFD f21, 0 * SIZE(BO)
  1090. FMUL f4, f19, f4
  1091. FMUL f5, f19, f5
  1092. FMUL f6, f19, f6
  1093. FMUL f7, f19, f7
  1094. FNMSUB f0, f20, f4, f0
  1095. FNMSUB f1, f20, f5, f1
  1096. FNMSUB f2, f20, f6, f2
  1097. FNMSUB f3, f20, f7, f3
  1098. FMUL f0, f21, f0
  1099. FMUL f1, f21, f1
  1100. FMUL f2, f21, f2
  1101. FMUL f3, f21, f3
  1102. #endif
  1103. #ifdef LN
  1104. subi CO1, CO1, 4 * SIZE
  1105. subi CO2, CO2, 4 * SIZE
  1106. #endif
  1107. #if defined(LN) || defined(LT)
  1108. STFD f0, 0 * SIZE(BO)
  1109. STFD f4, 1 * SIZE(BO)
  1110. STFD f1, 2 * SIZE(BO)
  1111. STFD f5, 3 * SIZE(BO)
  1112. STFD f2, 4 * SIZE(BO)
  1113. STFD f6, 5 * SIZE(BO)
  1114. STFD f3, 6 * SIZE(BO)
  1115. STFD f7, 7 * SIZE(BO)
  1116. #else
  1117. STFD f0, 0 * SIZE(AO)
  1118. STFD f1, 1 * SIZE(AO)
  1119. STFD f2, 2 * SIZE(AO)
  1120. STFD f3, 3 * SIZE(AO)
  1121. STFD f4, 4 * SIZE(AO)
  1122. STFD f5, 5 * SIZE(AO)
  1123. STFD f6, 6 * SIZE(AO)
  1124. STFD f7, 7 * SIZE(AO)
  1125. #endif
  1126. STFD f0, 0 * SIZE(CO1)
  1127. STFD f1, 1 * SIZE(CO1)
  1128. STFD f2, 2 * SIZE(CO1)
  1129. STFD f3, 3 * SIZE(CO1)
  1130. STFD f4, 0 * SIZE(CO2)
  1131. STFD f5, 1 * SIZE(CO2)
  1132. STFD f6, 2 * SIZE(CO2)
  1133. STFD f7, 3 * SIZE(CO2)
  1134. lfs f0, FZERO
  1135. fmr f1, f0
  1136. fmr f2, f0
  1137. fmr f3, f0
  1138. fmr f4, f0
  1139. fmr f5, f0
  1140. fmr f6, f0
  1141. fmr f7, f0
  1142. #ifndef LN
  1143. addi CO1, CO1, 4 * SIZE
  1144. addi CO2, CO2, 4 * SIZE
  1145. #endif
  1146. #ifdef RT
  1147. slwi r0, K, 2 + BASE_SHIFT
  1148. add AORIG, AORIG, r0
  1149. #endif
  1150. #if defined(LT) || defined(RN)
  1151. sub TEMP, K, KK
  1152. slwi r0, TEMP, 2 + BASE_SHIFT
  1153. slwi TEMP, TEMP, 1 + BASE_SHIFT
  1154. add AO, AO, r0
  1155. add BO, BO, TEMP
  1156. #endif
  1157. #ifdef LN
  1158. subi KK, KK, 4
  1159. #endif
  1160. #ifdef LT
  1161. addi KK, KK, 4
  1162. #endif
  1163. addic. I, I, -1
  1164. bgt+ .L41
  1165. .align 4
  1166. .L50:
  1167. andi. I, M, 2
  1168. ble .L60
  1169. #if defined(LT) || defined(RN)
  1170. LFD f16, 0 * SIZE(AO)
  1171. LFD f17, 1 * SIZE(AO)
  1172. LFD f18, 2 * SIZE(AO)
  1173. LFD f19, 3 * SIZE(AO)
  1174. LFD f20, 0 * SIZE(B)
  1175. LFD f21, 1 * SIZE(B)
  1176. LFD f22, 2 * SIZE(B)
  1177. LFD f23, 3 * SIZE(B)
  1178. LFD f24, 4 * SIZE(B)
  1179. LFD f25, 5 * SIZE(B)
  1180. LFD f26, 6 * SIZE(B)
  1181. LFD f27, 7 * SIZE(B)
  1182. srawi. r0, KK, 2
  1183. mtspr CTR, r0
  1184. mr BO, B
  1185. #else
  1186. #ifdef LN
  1187. slwi r0, K, 1 + BASE_SHIFT
  1188. sub AORIG, AORIG, r0
  1189. #endif
  1190. slwi r0, KK, 1 + BASE_SHIFT
  1191. slwi TEMP, KK, 1 + BASE_SHIFT
  1192. add AO, AORIG, r0
  1193. add BO, B, TEMP
  1194. sub TEMP, K, KK
  1195. LFD f16, 0 * SIZE(AO)
  1196. LFD f17, 1 * SIZE(AO)
  1197. LFD f18, 2 * SIZE(AO)
  1198. LFD f19, 3 * SIZE(AO)
  1199. LFD f20, 0 * SIZE(BO)
  1200. LFD f21, 1 * SIZE(BO)
  1201. LFD f22, 2 * SIZE(BO)
  1202. LFD f23, 3 * SIZE(BO)
  1203. LFD f24, 4 * SIZE(BO)
  1204. LFD f25, 5 * SIZE(BO)
  1205. LFD f26, 6 * SIZE(BO)
  1206. LFD f27, 7 * SIZE(BO)
  1207. srawi. r0, TEMP, 2
  1208. mtspr CTR, r0
  1209. #endif
  1210. ble .L55
  1211. .align 5
  1212. .L52:
  1213. FMADD f0, f16, f20, f0
  1214. FMADD f1, f17, f20, f1
  1215. LFDU f20, 8 * SIZE(BO)
  1216. FMADD f2, f16, f21, f2
  1217. LFD f16, 4 * SIZE(AO)
  1218. FMADD f3, f17, f21, f3
  1219. LFD f17, 5 * SIZE(AO)
  1220. FMADD f4, f18, f22, f4
  1221. LFD f21, 1 * SIZE(BO)
  1222. FMADD f5, f19, f22, f5
  1223. LFD f22, 2 * SIZE(BO)
  1224. FMADD f6, f18, f23, f6
  1225. LFD f18, 6 * SIZE(AO)
  1226. FMADD f7, f19, f23, f7
  1227. LFD f19, 7 * SIZE(AO)
  1228. FMADD f0, f16, f24, f0
  1229. LFD f23, 3 * SIZE(BO)
  1230. FMADD f1, f17, f24, f1
  1231. LFD f24, 4 * SIZE(BO)
  1232. FMADD f2, f16, f25, f2
  1233. LFDU f16, 8 * SIZE(AO)
  1234. FMADD f3, f17, f25, f3
  1235. LFD f17, 1 * SIZE(AO)
  1236. FMADD f4, f18, f26, f4
  1237. LFD f25, 5 * SIZE(BO)
  1238. FMADD f5, f19, f26, f5
  1239. LFD f26, 6 * SIZE(BO)
  1240. FMADD f6, f18, f27, f6
  1241. LFD f18, 2 * SIZE(AO)
  1242. FMADD f7, f19, f27, f7
  1243. LFD f19, 3 * SIZE(AO)
  1244. LFD f27, 7 * SIZE(BO)
  1245. bdnz .L52
  1246. .align 4
  1247. .L55:
  1248. #if defined(LT) || defined(RN)
  1249. andi. r0, KK, 3
  1250. #else
  1251. andi. r0, TEMP, 3
  1252. #endif
  1253. mtspr CTR, r0
  1254. ble+ .L58
  1255. .align 4
  1256. .L56:
  1257. FMADD f0, f16, f20, f0
  1258. FMADD f1, f17, f20, f1
  1259. LFDU f20, 2 * SIZE(BO)
  1260. FMADD f2, f16, f21, f2
  1261. LFDU f16, 2 * SIZE(AO)
  1262. FMADD f3, f17, f21, f3
  1263. LFD f17, 1 * SIZE(AO)
  1264. LFD f21, 1 * SIZE(BO)
  1265. bdnz .L56
  1266. .align 4
  1267. .L58:
  1268. FADD f0, f4, f0
  1269. FADD f1, f5, f1
  1270. FADD f2, f6, f2
  1271. FADD f3, f7, f3
  1272. #if defined(LN) || defined(RT)
  1273. #ifdef LN
  1274. subi r0, KK, 2
  1275. #else
  1276. subi r0, KK, 2
  1277. #endif
  1278. slwi TEMP, r0, 1 + BASE_SHIFT
  1279. slwi r0, r0, 1 + BASE_SHIFT
  1280. add AO, AORIG, TEMP
  1281. add BO, B, r0
  1282. #endif
  1283. #if defined(LN) || defined(LT)
  1284. LFD f16, 0 * SIZE(BO)
  1285. LFD f17, 1 * SIZE(BO)
  1286. LFD f20, 2 * SIZE(BO)
  1287. LFD f21, 3 * SIZE(BO)
  1288. FSUB f0, f16, f0
  1289. FSUB f2, f17, f2
  1290. FSUB f1, f20, f1
  1291. FSUB f3, f21, f3
  1292. #else
  1293. LFD f16, 0 * SIZE(AO)
  1294. LFD f17, 1 * SIZE(AO)
  1295. LFD f20, 2 * SIZE(AO)
  1296. LFD f21, 3 * SIZE(AO)
  1297. FSUB f0, f16, f0
  1298. FSUB f1, f17, f1
  1299. FSUB f2, f20, f2
  1300. FSUB f3, f21, f3
  1301. #endif
  1302. #ifdef LN
  1303. LFD f19, 3 * SIZE(AO)
  1304. LFD f20, 2 * SIZE(AO)
  1305. LFD f21, 0 * SIZE(AO)
  1306. FMUL f1, f19, f1
  1307. FMUL f3, f19, f3
  1308. FNMSUB f0, f20, f1, f0
  1309. FNMSUB f2, f20, f3, f2
  1310. FMUL f0, f21, f0
  1311. FMUL f2, f21, f2
  1312. #endif
  1313. #ifdef LT
  1314. LFD f16, 0 * SIZE(AO)
  1315. LFD f17, 1 * SIZE(AO)
  1316. FMUL f0, f16, f0
  1317. FMUL f2, f16, f2
  1318. FNMSUB f1, f17, f0, f1
  1319. FNMSUB f3, f17, f2, f3
  1320. LFD f17, 3 * SIZE(AO)
  1321. FMUL f1, f17, f1
  1322. FMUL f3, f17, f3
  1323. #endif
  1324. #ifdef RN
  1325. LFD f16, 0 * SIZE(BO)
  1326. LFD f17, 1 * SIZE(BO)
  1327. LFD f18, 3 * SIZE(BO)
  1328. FMUL f0, f16, f0
  1329. FMUL f1, f16, f1
  1330. FNMSUB f2, f17, f0, f2
  1331. FNMSUB f3, f17, f1, f3
  1332. FMUL f2, f18, f2
  1333. FMUL f3, f18, f3
  1334. #endif
  1335. #ifdef RT
  1336. LFD f19, 3 * SIZE(BO)
  1337. LFD f20, 2 * SIZE(BO)
  1338. LFD f21, 0 * SIZE(BO)
  1339. FMUL f2, f19, f2
  1340. FMUL f3, f19, f3
  1341. FNMSUB f0, f20, f2, f0
  1342. FNMSUB f1, f20, f3, f1
  1343. FMUL f0, f21, f0
  1344. FMUL f1, f21, f1
  1345. #endif
  1346. #ifdef LN
  1347. subi CO1, CO1, 2 * SIZE
  1348. subi CO2, CO2, 2 * SIZE
  1349. #endif
  1350. #if defined(LN) || defined(LT)
  1351. STFD f0, 0 * SIZE(BO)
  1352. STFD f2, 1 * SIZE(BO)
  1353. STFD f1, 2 * SIZE(BO)
  1354. STFD f3, 3 * SIZE(BO)
  1355. #else
  1356. STFD f0, 0 * SIZE(AO)
  1357. STFD f1, 1 * SIZE(AO)
  1358. STFD f2, 2 * SIZE(AO)
  1359. STFD f3, 3 * SIZE(AO)
  1360. #endif
  1361. STFD f0, 0 * SIZE(CO1)
  1362. STFD f1, 1 * SIZE(CO1)
  1363. STFD f2, 0 * SIZE(CO2)
  1364. STFD f3, 1 * SIZE(CO2)
  1365. lfs f0, FZERO
  1366. fmr f1, f0
  1367. fmr f2, f0
  1368. fmr f3, f0
  1369. fmr f4, f0
  1370. fmr f5, f0
  1371. fmr f6, f0
  1372. fmr f7, f0
  1373. #ifndef LN
  1374. addi CO1, CO1, 2 * SIZE
  1375. addi CO2, CO2, 2 * SIZE
  1376. #endif
  1377. #ifdef RT
  1378. slwi r0, K, 1 + BASE_SHIFT
  1379. add AORIG, AORIG, r0
  1380. #endif
  1381. #if defined(LT) || defined(RN)
  1382. sub TEMP, K, KK
  1383. slwi r0, TEMP, 1 + BASE_SHIFT
  1384. slwi TEMP, TEMP, 1 + BASE_SHIFT
  1385. add AO, AO, r0
  1386. add BO, BO, TEMP
  1387. #endif
  1388. #ifdef LN
  1389. subi KK, KK, 2
  1390. #endif
  1391. #ifdef LT
  1392. addi KK, KK, 2
  1393. #endif
  1394. .align 4
  1395. .L60:
  1396. andi. I, M, 1
  1397. ble .L69
  1398. #if defined(LT) || defined(RN)
  1399. LFD f16, 0 * SIZE(AO)
  1400. LFD f17, 1 * SIZE(AO)
  1401. LFD f18, 2 * SIZE(AO)
  1402. LFD f19, 3 * SIZE(AO)
  1403. LFD f20, 0 * SIZE(B)
  1404. LFD f21, 1 * SIZE(B)
  1405. LFD f22, 2 * SIZE(B)
  1406. LFD f23, 3 * SIZE(B)
  1407. LFD f24, 4 * SIZE(B)
  1408. LFD f25, 5 * SIZE(B)
  1409. LFD f26, 6 * SIZE(B)
  1410. LFD f27, 7 * SIZE(B)
  1411. srawi. r0, KK, 2
  1412. mtspr CTR, r0
  1413. mr BO, B
  1414. #else
  1415. #ifdef LN
  1416. slwi r0, K, BASE_SHIFT
  1417. sub AORIG, AORIG, r0
  1418. #endif
  1419. slwi r0, KK, 0 + BASE_SHIFT
  1420. slwi TEMP, KK, 1 + BASE_SHIFT
  1421. add AO, AORIG, r0
  1422. add BO, B, TEMP
  1423. sub TEMP, K, KK
  1424. LFD f16, 0 * SIZE(AO)
  1425. LFD f17, 1 * SIZE(AO)
  1426. LFD f18, 2 * SIZE(AO)
  1427. LFD f19, 3 * SIZE(AO)
  1428. LFD f20, 0 * SIZE(BO)
  1429. LFD f21, 1 * SIZE(BO)
  1430. LFD f22, 2 * SIZE(BO)
  1431. LFD f23, 3 * SIZE(BO)
  1432. LFD f24, 4 * SIZE(BO)
  1433. LFD f25, 5 * SIZE(BO)
  1434. LFD f26, 6 * SIZE(BO)
  1435. LFD f27, 7 * SIZE(BO)
  1436. srawi. r0, TEMP, 2
  1437. mtspr CTR, r0
  1438. #endif
  1439. ble .L65
  1440. .align 5
  1441. .L62:
  1442. FMADD f0, f16, f20, f0
  1443. LFDU f20, 8 * SIZE(BO)
  1444. FMADD f1, f16, f21, f1
  1445. LFDU f16, 4 * SIZE(AO)
  1446. LFD f21, 1 * SIZE(BO)
  1447. FMADD f2, f17, f22, f2
  1448. LFD f22, 2 * SIZE(BO)
  1449. FMADD f3, f17, f23, f3
  1450. LFD f17, 1 * SIZE(AO)
  1451. LFD f23, 3 * SIZE(BO)
  1452. FMADD f0, f18, f24, f0
  1453. LFD f24, 4 * SIZE(BO)
  1454. FMADD f1, f18, f25, f1
  1455. LFD f18, 2 * SIZE(AO)
  1456. LFD f25, 5 * SIZE(BO)
  1457. FMADD f2, f19, f26, f2
  1458. LFD f26, 6 * SIZE(BO)
  1459. FMADD f3, f19, f27, f3
  1460. LFD f19, 3 * SIZE(AO)
  1461. LFD f27, 7 * SIZE(BO)
  1462. bdnz .L62
  1463. .align 4
  1464. .L65:
  1465. #if defined(LT) || defined(RN)
  1466. andi. r0, KK, 3
  1467. #else
  1468. andi. r0, TEMP, 3
  1469. #endif
  1470. mtspr CTR, r0
  1471. ble+ .L68
  1472. .align 4
  1473. .L66:
  1474. FMADD f0, f16, f20, f0
  1475. LFDU f20, 2 * SIZE(BO)
  1476. FMADD f1, f16, f21, f1
  1477. LFDU f16, 1 * SIZE(AO)
  1478. LFD f21, 1 * SIZE(BO)
  1479. bdnz .L66
  1480. .align 4
  1481. .L68:
  1482. FADD f0, f2, f0
  1483. FADD f1, f3, f1
  1484. #if defined(LN) || defined(RT)
  1485. #ifdef LN
  1486. subi r0, KK, 1
  1487. #else
  1488. subi r0, KK, 2
  1489. #endif
  1490. slwi TEMP, r0, 0 + BASE_SHIFT
  1491. slwi r0, r0, 1 + BASE_SHIFT
  1492. add AO, AORIG, TEMP
  1493. add BO, B, r0
  1494. #endif
  1495. #if defined(LN) || defined(LT)
  1496. LFD f16, 0 * SIZE(BO)
  1497. LFD f17, 1 * SIZE(BO)
  1498. FSUB f0, f16, f0
  1499. FSUB f1, f17, f1
  1500. #else
  1501. LFD f16, 0 * SIZE(AO)
  1502. LFD f20, 1 * SIZE(AO)
  1503. FSUB f0, f16, f0
  1504. FSUB f1, f20, f1
  1505. #endif
  1506. #ifdef LN
  1507. LFD f21, 0 * SIZE(AO)
  1508. FMUL f0, f21, f0
  1509. FMUL f1, f21, f1
  1510. #endif
  1511. #ifdef LT
  1512. LFD f16, 0 * SIZE(AO)
  1513. FMUL f0, f16, f0
  1514. FMUL f1, f16, f1
  1515. #endif
  1516. #ifdef RN
  1517. LFD f16, 0 * SIZE(BO)
  1518. LFD f17, 1 * SIZE(BO)
  1519. LFD f18, 3 * SIZE(BO)
  1520. FMUL f0, f16, f0
  1521. FNMSUB f1, f17, f0, f1
  1522. FMUL f1, f18, f1
  1523. #endif
  1524. #ifdef RT
  1525. LFD f19, 3 * SIZE(BO)
  1526. LFD f20, 2 * SIZE(BO)
  1527. LFD f21, 0 * SIZE(BO)
  1528. FMUL f1, f19, f1
  1529. FNMSUB f0, f20, f1, f0
  1530. FMUL f0, f21, f0
  1531. #endif
  1532. #ifdef LN
  1533. subi CO1, CO1, 1 * SIZE
  1534. subi CO2, CO2, 1 * SIZE
  1535. #endif
  1536. #if defined(LN) || defined(LT)
  1537. STFD f0, 0 * SIZE(BO)
  1538. STFD f1, 1 * SIZE(BO)
  1539. #else
  1540. STFD f0, 0 * SIZE(AO)
  1541. STFD f1, 1 * SIZE(AO)
  1542. #endif
  1543. STFD f0, 0 * SIZE(CO1)
  1544. STFD f1, 0 * SIZE(CO2)
  1545. lfs f0, FZERO
  1546. fmr f1, f0
  1547. fmr f4, f0
  1548. fmr f5, f0
  1549. #ifndef LN
  1550. addi CO1, CO1, 1 * SIZE
  1551. addi CO2, CO2, 1 * SIZE
  1552. #endif
  1553. #ifdef RT
  1554. slwi r0, K, 0 + BASE_SHIFT
  1555. add AORIG, AORIG, r0
  1556. #endif
  1557. #if defined(LT) || defined(RN)
  1558. sub TEMP, K, KK
  1559. slwi r0, TEMP, 0 + BASE_SHIFT
  1560. slwi TEMP, TEMP, 1 + BASE_SHIFT
  1561. add AO, AO, r0
  1562. add BO, BO, TEMP
  1563. #endif
  1564. #ifdef LN
  1565. subi KK, KK, 1
  1566. #endif
  1567. #ifdef LT
  1568. addi KK, KK, 1
  1569. #endif
  1570. .align 4
  1571. .L69:
  1572. #ifdef LN
  1573. slwi r0, K, 1 + BASE_SHIFT
  1574. add B, B, r0
  1575. #endif
  1576. #if defined(LT) || defined(RN)
  1577. mr B, BO
  1578. #endif
  1579. #ifdef RN
  1580. addi KK, KK, 2
  1581. #endif
  1582. #ifdef RT
  1583. subi KK, KK, 2
  1584. #endif
  1585. lfs f0, FZERO
  1586. .align 4
  1587. .L09:
  1588. srawi. J, N, 2
  1589. ble .L999
  1590. .align 4
  1591. .L10:
  1592. #ifdef RT
  1593. slwi r0, K, 2 + BASE_SHIFT
  1594. sub B, B, r0
  1595. slwi r0, LDC, 2
  1596. sub C, C, r0
  1597. #endif
  1598. mr CO1, C
  1599. add CO2, C, LDC
  1600. add CO3, CO2, LDC
  1601. add CO4, CO3, LDC
  1602. #ifdef LN
  1603. add KK, M, OFFSET
  1604. #endif
  1605. #ifdef LT
  1606. mr KK, OFFSET
  1607. #endif
  1608. fmr f1, f0
  1609. fmr f2, f0
  1610. fmr f3, f0
  1611. fmr f4, f0
  1612. fmr f5, f0
  1613. fmr f6, f0
  1614. fmr f7, f0
  1615. fmr f8, f0
  1616. fmr f9, f0
  1617. fmr f10, f0
  1618. fmr f11, f0
  1619. fmr f12, f0
  1620. fmr f13, f0
  1621. fmr f14, f0
  1622. fmr f15, f0
  1623. srawi. I, M, 2
  1624. #if defined(LN) || defined(RT)
  1625. mr AORIG, A
  1626. #else
  1627. mr AO, A
  1628. #endif
  1629. #ifndef RT
  1630. add C, CO4, LDC
  1631. #endif
  1632. ble .L20
  1633. .align 4
  1634. .L11:
  1635. #if defined(LT) || defined(RN)
  1636. LFD A1, 0 * SIZE(AO)
  1637. LFD A2, 1 * SIZE(AO)
  1638. LFD A4, 4 * SIZE(AO)
  1639. LFD A5, 8 * SIZE(AO)
  1640. LFD B1, 0 * SIZE(B)
  1641. LFD B2, 1 * SIZE(B)
  1642. LFD B3, 2 * SIZE(B)
  1643. LFD B4, 3 * SIZE(B)
  1644. LFD B5, 4 * SIZE(B)
  1645. LFD B6, 8 * SIZE(B)
  1646. LFD B7, 12 * SIZE(B)
  1647. srawi. r0, KK, 2
  1648. mtspr CTR, r0
  1649. mr BO, B
  1650. #else
  1651. #ifdef LN
  1652. slwi r0, K, 2 + BASE_SHIFT
  1653. sub AORIG, AORIG, r0
  1654. #endif
  1655. slwi TEMP, KK, 2 + BASE_SHIFT
  1656. add AO, AORIG, TEMP
  1657. add BO, B, TEMP
  1658. sub TEMP, K, KK
  1659. LFD A1, 0 * SIZE(AO)
  1660. LFD A2, 1 * SIZE(AO)
  1661. LFD A4, 4 * SIZE(AO)
  1662. LFD A5, 8 * SIZE(AO)
  1663. LFD B1, 0 * SIZE(BO)
  1664. LFD B2, 1 * SIZE(BO)
  1665. LFD B3, 2 * SIZE(BO)
  1666. LFD B4, 3 * SIZE(BO)
  1667. LFD B5, 4 * SIZE(BO)
  1668. LFD B6, 8 * SIZE(BO)
  1669. LFD B7, 12 * SIZE(BO)
  1670. srawi. r0, TEMP, 2
  1671. mtspr CTR, r0
  1672. #endif
  1673. ble .L15
  1674. .align 4
  1675. .L12:
  1676. FMADD f0, A1, B1, f0
  1677. LFD A3, 2 * SIZE(AO)
  1678. FMADD f4, A1, B2, f4
  1679. LFD A6, 12 * SIZE(AO)
  1680. FMADD f8, A1, B3, f8
  1681. nop
  1682. FMADD f12, A1, B4, f12
  1683. nop
  1684. FMADD f1, A2, B1, f1
  1685. LFD A1, 3 * SIZE(AO)
  1686. FMADD f5, A2, B2, f5
  1687. nop
  1688. FMADD f9, A2, B3, f9
  1689. nop
  1690. FMADD f13, A2, B4, f13
  1691. nop
  1692. FMADD f2, A3, B1, f2
  1693. nop
  1694. FMADD f6, A3, B2, f6
  1695. LFD B8, 5 * SIZE(BO)
  1696. FMADD f10, A3, B3, f10
  1697. LFD B9, 6 * SIZE(BO)
  1698. FMADD f14, A3, B4, f14
  1699. LFD B10, 7 * SIZE(BO)
  1700. FMADD f3, A1, B1, f3
  1701. LFD A2, 5 * SIZE(AO)
  1702. FMADD f7, A1, B2, f7
  1703. LFD B1, 16 * SIZE(BO)
  1704. FMADD f11, A1, B3, f11
  1705. nop
  1706. FMADD f15, A1, B4, f15
  1707. nop
  1708. FMADD f0, A4, B5, f0
  1709. LFD A3, 6 * SIZE(AO)
  1710. FMADD f4, A4, B8, f4
  1711. LFD A1, 16 * SIZE(AO)
  1712. FMADD f8, A4, B9, f8
  1713. nop
  1714. FMADD f12, A4, B10, f12
  1715. nop
  1716. FMADD f1, A2, B5, f1
  1717. LFD A4, 7 * SIZE(AO)
  1718. FMADD f5, A2, B8, f5
  1719. nop
  1720. FMADD f9, A2, B9, f9
  1721. nop
  1722. FMADD f13, A2, B10, f13
  1723. nop
  1724. FMADD f2, A3, B5, f2
  1725. nop
  1726. FMADD f6, A3, B8, f6
  1727. LFD B2, 9 * SIZE(BO)
  1728. FMADD f10, A3, B9, f10
  1729. LFD B3, 10 * SIZE(BO)
  1730. FMADD f14, A3, B10, f14
  1731. LFD B4, 11 * SIZE(BO)
  1732. FMADD f3, A4, B5, f3
  1733. LFD A2, 9 * SIZE(AO)
  1734. FMADD f7, A4, B8, f7
  1735. LFD B5, 20 * SIZE(BO)
  1736. FMADD f11, A4, B9, f11
  1737. nop
  1738. FMADD f15, A4, B10, f15
  1739. nop
  1740. FMADD f0, A5, B6, f0
  1741. LFD A3, 10 * SIZE(AO)
  1742. FMADD f4, A5, B2, f4
  1743. LFD A4, 20 * SIZE(AO)
  1744. FMADD f8, A5, B3, f8
  1745. nop
  1746. FMADD f12, A5, B4, f12
  1747. nop
  1748. FMADD f1, A2, B6, f1
  1749. LFD A5, 11 * SIZE(AO)
  1750. FMADD f5, A2, B2, f5
  1751. nop
  1752. FMADD f9, A2, B3, f9
  1753. nop
  1754. FMADD f13, A2, B4, f13
  1755. nop
  1756. FMADD f2, A3, B6, f2
  1757. nop
  1758. FMADD f6, A3, B2, f6
  1759. LFD B8, 13 * SIZE(BO)
  1760. FMADD f10, A3, B3, f10
  1761. LFD B9, 14 * SIZE(BO)
  1762. FMADD f14, A3, B4, f14
  1763. LFD B10,15 * SIZE(BO)
  1764. FMADD f3, A5, B6, f3
  1765. LFD A2, 13 * SIZE(AO)
  1766. FMADD f7, A5, B2, f7
  1767. LFD B6, 24 * SIZE(BO)
  1768. FMADD f11, A5, B3, f11
  1769. nop
  1770. FMADD f15, A5, B4, f15
  1771. nop
  1772. FMADD f0, A6, B7, f0
  1773. LFD A3, 14 * SIZE(AO)
  1774. FMADD f4, A6, B8, f4
  1775. LFD A5, 24 * SIZE(AO)
  1776. FMADD f8, A6, B9, f8
  1777. nop
  1778. FMADD f12, A6, B10, f12
  1779. nop
  1780. FMADD f1, A2, B7, f1
  1781. LFD A6, 15 * SIZE(AO)
  1782. FMADD f5, A2, B8, f5
  1783. nop
  1784. FMADD f9, A2, B9, f9
  1785. nop
  1786. FMADD f13, A2, B10, f13
  1787. nop
  1788. FMADD f2, A3, B7, f2
  1789. addi AO, AO, 16 * SIZE
  1790. FMADD f6, A3, B8, f6
  1791. LFD B2, 17 * SIZE(BO)
  1792. FMADD f10, A3, B9, f10
  1793. LFD B3, 18 * SIZE(BO)
  1794. FMADD f14, A3, B10, f14
  1795. LFD B4, 19 * SIZE(BO)
  1796. FMADD f3, A6, B7, f3
  1797. LFD A2, 1 * SIZE(AO)
  1798. FMADD f7, A6, B8, f7
  1799. LFD B7, 28 * SIZE(BO)
  1800. FMADD f11, A6, B9, f11
  1801. addi BO, BO, 16 * SIZE
  1802. FMADD f15, A6, B10, f15
  1803. bdnz .L12
  1804. .align 4
  1805. .L15:
  1806. #if defined(LT) || defined(RN)
  1807. andi. r0, KK, 3
  1808. #else
  1809. andi. r0, TEMP, 3
  1810. #endif
  1811. mtspr CTR, r0
  1812. ble+ .L18
  1813. .align 4
  1814. .L16:
  1815. FMADD f0, A1, B1, f0
  1816. LFD A3, 2 * SIZE(AO)
  1817. FMADD f4, A1, B2, f4
  1818. FMADD f8, A1, B3, f8
  1819. FMADD f12, A1, B4, f12
  1820. LFD A4, 3 * SIZE(AO)
  1821. FMADD f1, A2, B1, f1
  1822. FMADD f5, A2, B2, f5
  1823. FMADD f9, A2, B3, f9
  1824. FMADD f13, A2, B4, f13
  1825. LFDU A1, 4 * SIZE(AO)
  1826. FMADD f2, A3, B1, f2
  1827. FMADD f6, A3, B2, f6
  1828. FMADD f10, A3, B3, f10
  1829. FMADD f14, A3, B4, f14
  1830. LFD A2, 1 * SIZE(AO)
  1831. FMADD f3, A4, B1, f3
  1832. LFDU B1, 4 * SIZE(BO)
  1833. FMADD f7, A4, B2, f7
  1834. LFD B2, 1 * SIZE(BO)
  1835. FMADD f11, A4, B3, f11
  1836. LFD B3, 2 * SIZE(BO)
  1837. FMADD f15, A4, B4, f15
  1838. LFD B4, 3 * SIZE(BO)
  1839. bdnz .L16
  1840. .align 4
  1841. .L18:
  1842. #if defined(LN) || defined(RT)
  1843. subi r0, KK, 4
  1844. slwi r0, r0, 2 + BASE_SHIFT
  1845. add AO, AORIG, r0
  1846. add BO, B, r0
  1847. #endif
  1848. #if defined(LN) || defined(LT)
  1849. LFD f16, 0 * SIZE(BO)
  1850. LFD f17, 1 * SIZE(BO)
  1851. LFD f18, 2 * SIZE(BO)
  1852. LFD f19, 3 * SIZE(BO)
  1853. LFD f20, 4 * SIZE(BO)
  1854. LFD f21, 5 * SIZE(BO)
  1855. LFD f22, 6 * SIZE(BO)
  1856. LFD f23, 7 * SIZE(BO)
  1857. LFD f24, 8 * SIZE(BO)
  1858. LFD f25, 9 * SIZE(BO)
  1859. LFD f26, 10 * SIZE(BO)
  1860. LFD f27, 11 * SIZE(BO)
  1861. LFD f28, 12 * SIZE(BO)
  1862. LFD f29, 13 * SIZE(BO)
  1863. LFD f30, 14 * SIZE(BO)
  1864. LFD f31, 15 * SIZE(BO)
  1865. FSUB f0, f16, f0
  1866. FSUB f4, f17, f4
  1867. FSUB f8, f18, f8
  1868. FSUB f12, f19, f12
  1869. FSUB f1, f20, f1
  1870. FSUB f5, f21, f5
  1871. FSUB f9, f22, f9
  1872. FSUB f13, f23, f13
  1873. FSUB f2, f24, f2
  1874. FSUB f6, f25, f6
  1875. FSUB f10, f26, f10
  1876. FSUB f14, f27, f14
  1877. FSUB f3, f28, f3
  1878. FSUB f7, f29, f7
  1879. FSUB f11, f30, f11
  1880. FSUB f15, f31, f15
  1881. #else
  1882. LFD f16, 0 * SIZE(AO)
  1883. LFD f17, 1 * SIZE(AO)
  1884. LFD f18, 2 * SIZE(AO)
  1885. LFD f19, 3 * SIZE(AO)
  1886. LFD f20, 4 * SIZE(AO)
  1887. LFD f21, 5 * SIZE(AO)
  1888. LFD f22, 6 * SIZE(AO)
  1889. LFD f23, 7 * SIZE(AO)
  1890. LFD f24, 8 * SIZE(AO)
  1891. LFD f25, 9 * SIZE(AO)
  1892. LFD f26, 10 * SIZE(AO)
  1893. LFD f27, 11 * SIZE(AO)
  1894. LFD f28, 12 * SIZE(AO)
  1895. LFD f29, 13 * SIZE(AO)
  1896. LFD f30, 14 * SIZE(AO)
  1897. LFD f31, 15 * SIZE(AO)
  1898. FSUB f0, f16, f0
  1899. FSUB f1, f17, f1
  1900. FSUB f2, f18, f2
  1901. FSUB f3, f19, f3
  1902. FSUB f4, f20, f4
  1903. FSUB f5, f21, f5
  1904. FSUB f6, f22, f6
  1905. FSUB f7, f23, f7
  1906. FSUB f8, f24, f8
  1907. FSUB f9, f25, f9
  1908. FSUB f10, f26, f10
  1909. FSUB f11, f27, f11
  1910. FSUB f12, f28, f12
  1911. FSUB f13, f29, f13
  1912. FSUB f14, f30, f14
  1913. FSUB f15, f31, f15
  1914. #endif
  1915. #ifdef LN
  1916. LFD f16, 15 * SIZE(AO)
  1917. LFD f17, 14 * SIZE(AO)
  1918. LFD f18, 13 * SIZE(AO)
  1919. LFD f19, 12 * SIZE(AO)
  1920. FMUL f3, f16, f3
  1921. FMUL f7, f16, f7
  1922. FMUL f11, f16, f11
  1923. FMUL f15, f16, f15
  1924. FNMSUB f2, f17, f3, f2
  1925. FNMSUB f6, f17, f7, f6
  1926. FNMSUB f10, f17, f11, f10
  1927. FNMSUB f14, f17, f15, f14
  1928. FNMSUB f1, f18, f3, f1
  1929. FNMSUB f5, f18, f7, f5
  1930. FNMSUB f9, f18, f11, f9
  1931. FNMSUB f13, f18, f15, f13
  1932. FNMSUB f0, f19, f3, f0
  1933. FNMSUB f4, f19, f7, f4
  1934. FNMSUB f8, f19, f11, f8
  1935. FNMSUB f12, f19, f15, f12
  1936. LFD f16, 10 * SIZE(AO)
  1937. LFD f17, 9 * SIZE(AO)
  1938. LFD f18, 8 * SIZE(AO)
  1939. LFD f19, 5 * SIZE(AO)
  1940. FMUL f2, f16, f2
  1941. FMUL f6, f16, f6
  1942. FMUL f10, f16, f10
  1943. FMUL f14, f16, f14
  1944. LFD f20, 4 * SIZE(AO)
  1945. LFD f21, 0 * SIZE(AO)
  1946. FNMSUB f1, f17, f2, f1
  1947. FNMSUB f5, f17, f6, f5
  1948. FNMSUB f9, f17, f10, f9
  1949. FNMSUB f13, f17, f14, f13
  1950. FNMSUB f0, f18, f2, f0
  1951. FNMSUB f4, f18, f6, f4
  1952. FNMSUB f8, f18, f10, f8
  1953. FNMSUB f12, f18, f14, f12
  1954. FMUL f1, f19, f1
  1955. FMUL f5, f19, f5
  1956. FMUL f9, f19, f9
  1957. FMUL f13, f19, f13
  1958. FNMSUB f0, f20, f1, f0
  1959. FNMSUB f4, f20, f5, f4
  1960. FNMSUB f8, f20, f9, f8
  1961. FNMSUB f12, f20, f13, f12
  1962. FMUL f0, f21, f0
  1963. FMUL f4, f21, f4
  1964. FMUL f8, f21, f8
  1965. FMUL f12, f21, f12
  1966. #endif
  1967. #ifdef LT
  1968. LFD f16, 0 * SIZE(AO)
  1969. LFD f17, 1 * SIZE(AO)
  1970. LFD f18, 2 * SIZE(AO)
  1971. LFD f19, 3 * SIZE(AO)
  1972. FMUL f0, f16, f0
  1973. FMUL f4, f16, f4
  1974. FMUL f8, f16, f8
  1975. FMUL f12, f16, f12
  1976. FNMSUB f1, f17, f0, f1
  1977. FNMSUB f5, f17, f4, f5
  1978. FNMSUB f9, f17, f8, f9
  1979. FNMSUB f13, f17, f12, f13
  1980. FNMSUB f2, f18, f0, f2
  1981. FNMSUB f6, f18, f4, f6
  1982. FNMSUB f10, f18, f8, f10
  1983. FNMSUB f14, f18, f12, f14
  1984. FNMSUB f3, f19, f0, f3
  1985. FNMSUB f7, f19, f4, f7
  1986. FNMSUB f11, f19, f8, f11
  1987. FNMSUB f15, f19, f12, f15
  1988. LFD f16, 5 * SIZE(AO)
  1989. LFD f17, 6 * SIZE(AO)
  1990. LFD f18, 7 * SIZE(AO)
  1991. LFD f19, 10 * SIZE(AO)
  1992. FMUL f1, f16, f1
  1993. FMUL f5, f16, f5
  1994. FMUL f9, f16, f9
  1995. FMUL f13, f16, f13
  1996. LFD f20, 11 * SIZE(AO)
  1997. LFD f21, 15 * SIZE(AO)
  1998. FNMSUB f2, f17, f1, f2
  1999. FNMSUB f6, f17, f5, f6
  2000. FNMSUB f10, f17, f9, f10
  2001. FNMSUB f14, f17, f13, f14
  2002. FNMSUB f3, f18, f1, f3
  2003. FNMSUB f7, f18, f5, f7
  2004. FNMSUB f11, f18, f9, f11
  2005. FNMSUB f15, f18, f13, f15
  2006. FMUL f2, f19, f2
  2007. FMUL f6, f19, f6
  2008. FMUL f10, f19, f10
  2009. FMUL f14, f19, f14
  2010. FNMSUB f3, f20, f2, f3
  2011. FNMSUB f7, f20, f6, f7
  2012. FNMSUB f11, f20, f10, f11
  2013. FNMSUB f15, f20, f14, f15
  2014. FMUL f3, f21, f3
  2015. FMUL f7, f21, f7
  2016. FMUL f11, f21, f11
  2017. FMUL f15, f21, f15
  2018. #endif
  2019. #ifdef RN
  2020. LFD f16, 0 * SIZE(BO)
  2021. LFD f17, 1 * SIZE(BO)
  2022. LFD f18, 2 * SIZE(BO)
  2023. LFD f19, 3 * SIZE(BO)
  2024. FMUL f0, f16, f0
  2025. FMUL f1, f16, f1
  2026. FMUL f2, f16, f2
  2027. FMUL f3, f16, f3
  2028. FNMSUB f4, f17, f0, f4
  2029. FNMSUB f5, f17, f1, f5
  2030. FNMSUB f6, f17, f2, f6
  2031. FNMSUB f7, f17, f3, f7
  2032. FNMSUB f8, f18, f0, f8
  2033. FNMSUB f9, f18, f1, f9
  2034. FNMSUB f10, f18, f2, f10
  2035. FNMSUB f11, f18, f3, f11
  2036. FNMSUB f12, f19, f0, f12
  2037. FNMSUB f13, f19, f1, f13
  2038. FNMSUB f14, f19, f2, f14
  2039. FNMSUB f15, f19, f3, f15
  2040. LFD f16, 5 * SIZE(BO)
  2041. LFD f17, 6 * SIZE(BO)
  2042. LFD f18, 7 * SIZE(BO)
  2043. LFD f19, 10 * SIZE(BO)
  2044. FMUL f4, f16, f4
  2045. FMUL f5, f16, f5
  2046. FMUL f6, f16, f6
  2047. FMUL f7, f16, f7
  2048. LFD f20, 11 * SIZE(BO)
  2049. LFD f21, 15 * SIZE(BO)
  2050. FNMSUB f8, f17, f4, f8
  2051. FNMSUB f9, f17, f5, f9
  2052. FNMSUB f10, f17, f6, f10
  2053. FNMSUB f11, f17, f7, f11
  2054. FNMSUB f12, f18, f4, f12
  2055. FNMSUB f13, f18, f5, f13
  2056. FNMSUB f14, f18, f6, f14
  2057. FNMSUB f15, f18, f7, f15
  2058. FMUL f8, f19, f8
  2059. FMUL f9, f19, f9
  2060. FMUL f10, f19, f10
  2061. FMUL f11, f19, f11
  2062. FNMSUB f12, f20, f8, f12
  2063. FNMSUB f13, f20, f9, f13
  2064. FNMSUB f14, f20, f10, f14
  2065. FNMSUB f15, f20, f11, f15
  2066. FMUL f12, f21, f12
  2067. FMUL f13, f21, f13
  2068. FMUL f14, f21, f14
  2069. FMUL f15, f21, f15
  2070. #endif
  2071. #ifdef RT
  2072. LFD f16, 15 * SIZE(BO)
  2073. LFD f17, 14 * SIZE(BO)
  2074. LFD f18, 13 * SIZE(BO)
  2075. LFD f19, 12 * SIZE(BO)
  2076. FMUL f12, f16, f12
  2077. FMUL f13, f16, f13
  2078. FMUL f14, f16, f14
  2079. FMUL f15, f16, f15
  2080. FNMSUB f8, f17, f12, f8
  2081. FNMSUB f9, f17, f13, f9
  2082. FNMSUB f10, f17, f14, f10
  2083. FNMSUB f11, f17, f15, f11
  2084. FNMSUB f4, f18, f12, f4
  2085. FNMSUB f5, f18, f13, f5
  2086. FNMSUB f6, f18, f14, f6
  2087. FNMSUB f7, f18, f15, f7
  2088. FNMSUB f0, f19, f12, f0
  2089. FNMSUB f1, f19, f13, f1
  2090. FNMSUB f2, f19, f14, f2
  2091. FNMSUB f3, f19, f15, f3
  2092. LFD f16, 10 * SIZE(BO)
  2093. LFD f17, 9 * SIZE(BO)
  2094. LFD f18, 8 * SIZE(BO)
  2095. LFD f19, 5 * SIZE(BO)
  2096. FMUL f8, f16, f8
  2097. FMUL f9, f16, f9
  2098. FMUL f10, f16, f10
  2099. FMUL f11, f16, f11
  2100. LFD f20, 4 * SIZE(BO)
  2101. LFD f21, 0 * SIZE(BO)
  2102. FNMSUB f4, f17, f8, f4
  2103. FNMSUB f5, f17, f9, f5
  2104. FNMSUB f6, f17, f10, f6
  2105. FNMSUB f7, f17, f11, f7
  2106. FNMSUB f0, f18, f8, f0
  2107. FNMSUB f1, f18, f9, f1
  2108. FNMSUB f2, f18, f10, f2
  2109. FNMSUB f3, f18, f11, f3
  2110. FMUL f4, f19, f4
  2111. FMUL f5, f19, f5
  2112. FMUL f6, f19, f6
  2113. FMUL f7, f19, f7
  2114. FNMSUB f0, f20, f4, f0
  2115. FNMSUB f1, f20, f5, f1
  2116. FNMSUB f2, f20, f6, f2
  2117. FNMSUB f3, f20, f7, f3
  2118. FMUL f0, f21, f0
  2119. FMUL f1, f21, f1
  2120. FMUL f2, f21, f2
  2121. FMUL f3, f21, f3
  2122. #endif
  2123. #ifdef LN
  2124. subi CO1, CO1, 4 * SIZE
  2125. subi CO2, CO2, 4 * SIZE
  2126. subi CO3, CO3, 4 * SIZE
  2127. subi CO4, CO4, 4 * SIZE
  2128. #endif
  2129. #if defined(LN) || defined(LT)
  2130. STFD f0, 0 * SIZE(BO)
  2131. STFD f4, 1 * SIZE(BO)
  2132. STFD f8, 2 * SIZE(BO)
  2133. STFD f12, 3 * SIZE(BO)
  2134. STFD f1, 4 * SIZE(BO)
  2135. STFD f5, 5 * SIZE(BO)
  2136. STFD f9, 6 * SIZE(BO)
  2137. STFD f13, 7 * SIZE(BO)
  2138. STFD f2, 8 * SIZE(BO)
  2139. STFD f6, 9 * SIZE(BO)
  2140. STFD f10, 10 * SIZE(BO)
  2141. STFD f14, 11 * SIZE(BO)
  2142. STFD f3, 12 * SIZE(BO)
  2143. STFD f7, 13 * SIZE(BO)
  2144. STFD f11, 14 * SIZE(BO)
  2145. STFD f15, 15 * SIZE(BO)
  2146. #else
  2147. STFD f0, 0 * SIZE(AO)
  2148. STFD f1, 1 * SIZE(AO)
  2149. STFD f2, 2 * SIZE(AO)
  2150. STFD f3, 3 * SIZE(AO)
  2151. STFD f4, 4 * SIZE(AO)
  2152. STFD f5, 5 * SIZE(AO)
  2153. STFD f6, 6 * SIZE(AO)
  2154. STFD f7, 7 * SIZE(AO)
  2155. STFD f8, 8 * SIZE(AO)
  2156. STFD f9, 9 * SIZE(AO)
  2157. STFD f10, 10 * SIZE(AO)
  2158. STFD f11, 11 * SIZE(AO)
  2159. STFD f12, 12 * SIZE(AO)
  2160. STFD f13, 13 * SIZE(AO)
  2161. STFD f14, 14 * SIZE(AO)
  2162. STFD f15, 15 * SIZE(AO)
  2163. #endif
  2164. STFD f0, 0 * SIZE(CO1)
  2165. STFD f1, 1 * SIZE(CO1)
  2166. STFD f2, 2 * SIZE(CO1)
  2167. STFD f3, 3 * SIZE(CO1)
  2168. STFD f4, 0 * SIZE(CO2)
  2169. STFD f5, 1 * SIZE(CO2)
  2170. STFD f6, 2 * SIZE(CO2)
  2171. STFD f7, 3 * SIZE(CO2)
  2172. STFD f8, 0 * SIZE(CO3)
  2173. STFD f9, 1 * SIZE(CO3)
  2174. STFD f10, 2 * SIZE(CO3)
  2175. STFD f11, 3 * SIZE(CO3)
  2176. STFD f12, 0 * SIZE(CO4)
  2177. STFD f13, 1 * SIZE(CO4)
  2178. STFD f14, 2 * SIZE(CO4)
  2179. STFD f15, 3 * SIZE(CO4)
  2180. lfs f0, FZERO
  2181. fmr f1, f0
  2182. fmr f2, f0
  2183. fmr f3, f0
  2184. fmr f4, f0
  2185. fmr f5, f0
  2186. fmr f6, f0
  2187. fmr f7, f0
  2188. fmr f8, f0
  2189. fmr f9, f0
  2190. fmr f10, f0
  2191. fmr f11, f0
  2192. fmr f12, f0
  2193. fmr f13, f0
  2194. fmr f14, f0
  2195. fmr f15, f0
  2196. #ifndef LN
  2197. addi CO1, CO1, 4 * SIZE
  2198. addi CO2, CO2, 4 * SIZE
  2199. addi CO3, CO3, 4 * SIZE
  2200. addi CO4, CO4, 4 * SIZE
  2201. #endif
  2202. #ifdef RT
  2203. slwi r0, K, 2 + BASE_SHIFT
  2204. add AORIG, AORIG, r0
  2205. #endif
  2206. #if defined(LT) || defined(RN)
  2207. sub TEMP, K, KK
  2208. slwi TEMP, TEMP, 2 + BASE_SHIFT
  2209. add AO, AO, TEMP
  2210. add BO, BO, TEMP
  2211. #endif
  2212. #ifdef LT
  2213. addi KK, KK, 4
  2214. #endif
  2215. #ifdef LN
  2216. subi KK, KK, 4
  2217. #endif
  2218. addic. I, I, -1
  2219. bgt+ .L11
  2220. .align 4
  2221. .L20:
  2222. andi. I, M, 2
  2223. ble .L30
  2224. #if defined(LT) || defined(RN)
  2225. LFD f16, 0 * SIZE(AO)
  2226. LFD f17, 1 * SIZE(AO)
  2227. LFD f18, 2 * SIZE(AO)
  2228. LFD f19, 3 * SIZE(AO)
  2229. LFD f20, 0 * SIZE(B)
  2230. LFD f21, 1 * SIZE(B)
  2231. LFD f22, 2 * SIZE(B)
  2232. LFD f23, 3 * SIZE(B)
  2233. LFD f24, 4 * SIZE(B)
  2234. LFD f25, 5 * SIZE(B)
  2235. LFD f26, 6 * SIZE(B)
  2236. LFD f27, 7 * SIZE(B)
  2237. srawi. r0, KK, 2
  2238. mtspr CTR, r0
  2239. mr BO, B
  2240. #else
  2241. #ifdef LN
  2242. slwi r0, K, 1 + BASE_SHIFT
  2243. sub AORIG, AORIG, r0
  2244. #endif
  2245. slwi r0, KK, 1 + BASE_SHIFT
  2246. slwi TEMP, KK, 2 + BASE_SHIFT
  2247. add AO, AORIG, r0
  2248. add BO, B, TEMP
  2249. sub TEMP, K, KK
  2250. LFD f16, 0 * SIZE(AO)
  2251. LFD f17, 1 * SIZE(AO)
  2252. LFD f18, 2 * SIZE(AO)
  2253. LFD f19, 3 * SIZE(AO)
  2254. LFD f20, 0 * SIZE(BO)
  2255. LFD f21, 1 * SIZE(BO)
  2256. LFD f22, 2 * SIZE(BO)
  2257. LFD f23, 3 * SIZE(BO)
  2258. LFD f24, 4 * SIZE(BO)
  2259. LFD f25, 5 * SIZE(BO)
  2260. LFD f26, 6 * SIZE(BO)
  2261. LFD f27, 7 * SIZE(BO)
  2262. srawi. r0, TEMP, 2
  2263. mtspr CTR, r0
  2264. #endif
  2265. ble .L25
  2266. .align 5
  2267. .L22:
  2268. FMADD f0, f16, f20, f0
  2269. nop
  2270. FMADD f1, f17, f20, f1
  2271. LFD f20, 8 * SIZE(BO)
  2272. FMADD f4, f16, f21, f4
  2273. nop
  2274. FMADD f5, f17, f21, f5
  2275. LFD f21, 9 * SIZE(BO)
  2276. FMADD f8, f16, f22, f8
  2277. nop
  2278. FMADD f9, f17, f22, f9
  2279. LFD f22, 10 * SIZE(BO)
  2280. FMADD f12, f16, f23, f12
  2281. LFD f16, 4 * SIZE(AO)
  2282. FMADD f13, f17, f23, f13
  2283. LFD f23, 11 * SIZE(BO)
  2284. FMADD f2, f18, f24, f2
  2285. LFD f17, 5 * SIZE(AO)
  2286. FMADD f3, f19, f24, f3
  2287. LFD f24, 12 * SIZE(BO)
  2288. FMADD f6, f18, f25, f6
  2289. nop
  2290. FMADD f7, f19, f25, f7
  2291. LFD f25, 13 * SIZE(BO)
  2292. FMADD f10, f18, f26, f10
  2293. nop
  2294. FMADD f11, f19, f26, f11
  2295. LFD f26, 14 * SIZE(BO)
  2296. FMADD f14, f18, f27, f14
  2297. LFD f18, 6 * SIZE(AO)
  2298. FMADD f15, f19, f27, f15
  2299. LFD f27, 15 * SIZE(BO)
  2300. FMADD f0, f16, f20, f0
  2301. LFD f19, 7 * SIZE(AO)
  2302. FMADD f1, f17, f20, f1
  2303. LFDU f20, 16 * SIZE(BO)
  2304. FMADD f4, f16, f21, f4
  2305. nop
  2306. FMADD f5, f17, f21, f5
  2307. LFD f21, 1 * SIZE(BO)
  2308. FMADD f8, f16, f22, f8
  2309. nop
  2310. FMADD f9, f17, f22, f9
  2311. LFD f22, 2 * SIZE(BO)
  2312. FMADD f12, f16, f23, f12
  2313. LFDU f16, 8 * SIZE(AO)
  2314. FMADD f13, f17, f23, f13
  2315. LFD f23, 3 * SIZE(BO)
  2316. FMADD f2, f18, f24, f2
  2317. LFD f17, 1 * SIZE(AO)
  2318. FMADD f3, f19, f24, f3
  2319. LFD f24, 4 * SIZE(BO)
  2320. FMADD f6, f18, f25, f6
  2321. nop
  2322. FMADD f7, f19, f25, f7
  2323. LFD f25, 5 * SIZE(BO)
  2324. FMADD f10, f18, f26, f10
  2325. nop
  2326. FMADD f11, f19, f26, f11
  2327. LFD f26, 6 * SIZE(BO)
  2328. FMADD f14, f18, f27, f14
  2329. LFD f18, 2 * SIZE(AO)
  2330. FMADD f15, f19, f27, f15
  2331. LFD f19, 3 * SIZE(AO)
  2332. LFD f27, 7 * SIZE(BO)
  2333. bdnz .L22
  2334. fadd f0, f2, f0
  2335. fadd f1, f3, f1
  2336. fadd f4, f6, f4
  2337. fadd f5, f7, f5
  2338. fadd f8, f10, f8
  2339. fadd f9, f11, f9
  2340. fadd f12, f14, f12
  2341. fadd f13, f15, f13
  2342. .align 4
  2343. .L25:
  2344. #if defined(LT) || defined(RN)
  2345. andi. r0, KK, 3
  2346. #else
  2347. andi. r0, TEMP, 3
  2348. #endif
  2349. mtspr CTR, r0
  2350. ble+ .L28
  2351. .align 4
  2352. .L26:
  2353. FMADD f0, f16, f20, f0
  2354. nop
  2355. FMADD f1, f17, f20, f1
  2356. LFDU f20, 4 * SIZE(BO)
  2357. FMADD f4, f16, f21, f4
  2358. nop
  2359. FMADD f5, f17, f21, f5
  2360. LFD f21, 1 * SIZE(BO)
  2361. FMADD f8, f16, f22, f8
  2362. nop
  2363. FMADD f9, f17, f22, f9
  2364. LFD f22, 2 * SIZE(BO)
  2365. FMADD f12, f16, f23, f12
  2366. LFDU f16, 2 * SIZE(AO)
  2367. FMADD f13, f17, f23, f13
  2368. LFD f17, 1 * SIZE(AO)
  2369. LFD f23, 3 * SIZE(BO)
  2370. bdnz .L26
  2371. .align 4
  2372. .L28:
  2373. #if defined(LN) || defined(RT)
  2374. #ifdef LN
  2375. subi r0, KK, 2
  2376. #else
  2377. subi r0, KK, 4
  2378. #endif
  2379. slwi TEMP, r0, 1 + BASE_SHIFT
  2380. slwi r0, r0, 2 + BASE_SHIFT
  2381. add AO, AORIG, TEMP
  2382. add BO, B, r0
  2383. #endif
  2384. #if defined(LN) || defined(LT)
  2385. LFD f16, 0 * SIZE(BO)
  2386. LFD f17, 1 * SIZE(BO)
  2387. LFD f18, 2 * SIZE(BO)
  2388. LFD f19, 3 * SIZE(BO)
  2389. LFD f20, 4 * SIZE(BO)
  2390. LFD f21, 5 * SIZE(BO)
  2391. LFD f22, 6 * SIZE(BO)
  2392. LFD f23, 7 * SIZE(BO)
  2393. FSUB f0, f16, f0
  2394. FSUB f4, f17, f4
  2395. FSUB f8, f18, f8
  2396. FSUB f12, f19, f12
  2397. FSUB f1, f20, f1
  2398. FSUB f5, f21, f5
  2399. FSUB f9, f22, f9
  2400. FSUB f13, f23, f13
  2401. #else
  2402. LFD f16, 0 * SIZE(AO)
  2403. LFD f17, 1 * SIZE(AO)
  2404. LFD f20, 2 * SIZE(AO)
  2405. LFD f21, 3 * SIZE(AO)
  2406. LFD f24, 4 * SIZE(AO)
  2407. LFD f25, 5 * SIZE(AO)
  2408. LFD f28, 6 * SIZE(AO)
  2409. LFD f29, 7 * SIZE(AO)
  2410. FSUB f0, f16, f0
  2411. FSUB f1, f17, f1
  2412. FSUB f4, f20, f4
  2413. FSUB f5, f21, f5
  2414. FSUB f8, f24, f8
  2415. FSUB f9, f25, f9
  2416. FSUB f12, f28, f12
  2417. FSUB f13, f29, f13
  2418. #endif
  2419. #ifdef LN
  2420. LFD f19, 3 * SIZE(AO)
  2421. LFD f20, 2 * SIZE(AO)
  2422. LFD f21, 0 * SIZE(AO)
  2423. FMUL f1, f19, f1
  2424. FMUL f5, f19, f5
  2425. FMUL f9, f19, f9
  2426. FMUL f13, f19, f13
  2427. FNMSUB f0, f20, f1, f0
  2428. FNMSUB f4, f20, f5, f4
  2429. FNMSUB f8, f20, f9, f8
  2430. FNMSUB f12, f20, f13, f12
  2431. FMUL f0, f21, f0
  2432. FMUL f4, f21, f4
  2433. FMUL f8, f21, f8
  2434. FMUL f12, f21, f12
  2435. #endif
  2436. #ifdef LT
  2437. LFD f16, 0 * SIZE(AO)
  2438. LFD f17, 1 * SIZE(AO)
  2439. FMUL f0, f16, f0
  2440. FMUL f4, f16, f4
  2441. FMUL f8, f16, f8
  2442. FMUL f12, f16, f12
  2443. FNMSUB f1, f17, f0, f1
  2444. FNMSUB f5, f17, f4, f5
  2445. FNMSUB f9, f17, f8, f9
  2446. FNMSUB f13, f17, f12, f13
  2447. LFD f17, 3 * SIZE(AO)
  2448. FMUL f1, f17, f1
  2449. FMUL f5, f17, f5
  2450. FMUL f9, f17, f9
  2451. FMUL f13, f17, f13
  2452. #endif
  2453. #ifdef RN
  2454. LFD f16, 0 * SIZE(BO)
  2455. LFD f17, 1 * SIZE(BO)
  2456. LFD f18, 2 * SIZE(BO)
  2457. LFD f19, 3 * SIZE(BO)
  2458. FMUL f0, f16, f0
  2459. FMUL f1, f16, f1
  2460. FNMSUB f4, f17, f0, f4
  2461. FNMSUB f5, f17, f1, f5
  2462. FNMSUB f8, f18, f0, f8
  2463. FNMSUB f9, f18, f1, f9
  2464. FNMSUB f12, f19, f0, f12
  2465. FNMSUB f13, f19, f1, f13
  2466. LFD f16, 5 * SIZE(BO)
  2467. LFD f17, 6 * SIZE(BO)
  2468. LFD f18, 7 * SIZE(BO)
  2469. LFD f19, 10 * SIZE(BO)
  2470. LFD f20, 11 * SIZE(BO)
  2471. LFD f21, 15 * SIZE(BO)
  2472. FMUL f4, f16, f4
  2473. FMUL f5, f16, f5
  2474. FNMSUB f8, f17, f4, f8
  2475. FNMSUB f9, f17, f5, f9
  2476. FNMSUB f12, f18, f4, f12
  2477. FNMSUB f13, f18, f5, f13
  2478. FMUL f8, f19, f8
  2479. FMUL f9, f19, f9
  2480. FNMSUB f12, f20, f8, f12
  2481. FNMSUB f13, f20, f9, f13
  2482. FMUL f12, f21, f12
  2483. FMUL f13, f21, f13
  2484. #endif
  2485. #ifdef RT
  2486. LFD f16, 15 * SIZE(BO)
  2487. LFD f17, 14 * SIZE(BO)
  2488. LFD f18, 13 * SIZE(BO)
  2489. LFD f19, 12 * SIZE(BO)
  2490. FMUL f12, f16, f12
  2491. FMUL f13, f16, f13
  2492. FNMSUB f8, f17, f12, f8
  2493. FNMSUB f9, f17, f13, f9
  2494. FNMSUB f4, f18, f12, f4
  2495. FNMSUB f5, f18, f13, f5
  2496. FNMSUB f0, f19, f12, f0
  2497. FNMSUB f1, f19, f13, f1
  2498. LFD f16, 10 * SIZE(BO)
  2499. LFD f17, 9 * SIZE(BO)
  2500. LFD f18, 8 * SIZE(BO)
  2501. LFD f19, 5 * SIZE(BO)
  2502. LFD f20, 4 * SIZE(BO)
  2503. LFD f21, 0 * SIZE(BO)
  2504. FMUL f8, f16, f8
  2505. FMUL f9, f16, f9
  2506. FNMSUB f4, f17, f8, f4
  2507. FNMSUB f5, f17, f9, f5
  2508. FNMSUB f0, f18, f8, f0
  2509. FNMSUB f1, f18, f9, f1
  2510. FMUL f4, f19, f4
  2511. FMUL f5, f19, f5
  2512. FNMSUB f0, f20, f4, f0
  2513. FNMSUB f1, f20, f5, f1
  2514. FMUL f0, f21, f0
  2515. FMUL f1, f21, f1
  2516. #endif
  2517. #ifdef LN
  2518. subi CO1, CO1, 2 * SIZE
  2519. subi CO2, CO2, 2 * SIZE
  2520. subi CO3, CO3, 2 * SIZE
  2521. subi CO4, CO4, 2 * SIZE
  2522. #endif
  2523. #if defined(LN) || defined(LT)
  2524. STFD f0, 0 * SIZE(BO)
  2525. STFD f4, 1 * SIZE(BO)
  2526. STFD f8, 2 * SIZE(BO)
  2527. STFD f12, 3 * SIZE(BO)
  2528. STFD f1, 4 * SIZE(BO)
  2529. STFD f5, 5 * SIZE(BO)
  2530. STFD f9, 6 * SIZE(BO)
  2531. STFD f13, 7 * SIZE(BO)
  2532. #else
  2533. STFD f0, 0 * SIZE(AO)
  2534. STFD f1, 1 * SIZE(AO)
  2535. STFD f4, 2 * SIZE(AO)
  2536. STFD f5, 3 * SIZE(AO)
  2537. STFD f8, 4 * SIZE(AO)
  2538. STFD f9, 5 * SIZE(AO)
  2539. STFD f12, 6 * SIZE(AO)
  2540. STFD f13, 7 * SIZE(AO)
  2541. #endif
  2542. STFD f0, 0 * SIZE(CO1)
  2543. STFD f1, 1 * SIZE(CO1)
  2544. STFD f4, 0 * SIZE(CO2)
  2545. STFD f5, 1 * SIZE(CO2)
  2546. STFD f8, 0 * SIZE(CO3)
  2547. STFD f9, 1 * SIZE(CO3)
  2548. STFD f12, 0 * SIZE(CO4)
  2549. STFD f13, 1 * SIZE(CO4)
  2550. lfs f0, FZERO
  2551. fmr f1, f0
  2552. fmr f2, f0
  2553. fmr f3, f0
  2554. fmr f4, f0
  2555. fmr f5, f0
  2556. fmr f6, f0
  2557. fmr f7, f0
  2558. fmr f8, f0
  2559. fmr f9, f0
  2560. fmr f10, f0
  2561. fmr f11, f0
  2562. fmr f12, f0
  2563. fmr f13, f0
  2564. fmr f14, f0
  2565. fmr f15, f0
  2566. #ifndef LN
  2567. addi CO1, CO1, 2 * SIZE
  2568. addi CO2, CO2, 2 * SIZE
  2569. addi CO3, CO3, 2 * SIZE
  2570. addi CO4, CO4, 2 * SIZE
  2571. #endif
  2572. #ifdef RT
  2573. slwi r0, K, 1 + BASE_SHIFT
  2574. add AORIG, AORIG, r0
  2575. #endif
  2576. #if defined(LT) || defined(RN)
  2577. sub TEMP, K, KK
  2578. slwi r0, TEMP, 1 + BASE_SHIFT
  2579. slwi TEMP, TEMP, 2 + BASE_SHIFT
  2580. add AO, AO, r0
  2581. add BO, BO, TEMP
  2582. #endif
  2583. #ifdef LN
  2584. subi KK, KK, 2
  2585. #endif
  2586. #ifdef LT
  2587. addi KK, KK, 2
  2588. #endif
  2589. .align 4
  2590. .L30:
  2591. andi. I, M, 1
  2592. ble .L39
  2593. #if defined(LT) || defined(RN)
  2594. LFD f16, 0 * SIZE(AO)
  2595. LFD f17, 1 * SIZE(AO)
  2596. LFD f18, 2 * SIZE(AO)
  2597. LFD f19, 3 * SIZE(AO)
  2598. LFD f20, 0 * SIZE(B)
  2599. LFD f21, 1 * SIZE(B)
  2600. LFD f22, 2 * SIZE(B)
  2601. LFD f23, 3 * SIZE(B)
  2602. LFD f24, 4 * SIZE(B)
  2603. LFD f25, 5 * SIZE(B)
  2604. LFD f26, 6 * SIZE(B)
  2605. LFD f27, 7 * SIZE(B)
  2606. srawi. r0, KK, 2
  2607. mtspr CTR, r0
  2608. mr BO, B
  2609. #else
  2610. #ifdef LN
  2611. slwi r0, K, BASE_SHIFT
  2612. sub AORIG, AORIG, r0
  2613. #endif
  2614. slwi r0, KK, 0 + BASE_SHIFT
  2615. slwi TEMP, KK, 2 + BASE_SHIFT
  2616. add AO, AORIG, r0
  2617. add BO, B, TEMP
  2618. sub TEMP, K, KK
  2619. LFD f16, 0 * SIZE(AO)
  2620. LFD f17, 1 * SIZE(AO)
  2621. LFD f18, 2 * SIZE(AO)
  2622. LFD f19, 3 * SIZE(AO)
  2623. LFD f20, 0 * SIZE(BO)
  2624. LFD f21, 1 * SIZE(BO)
  2625. LFD f22, 2 * SIZE(BO)
  2626. LFD f23, 3 * SIZE(BO)
  2627. LFD f24, 4 * SIZE(BO)
  2628. LFD f25, 5 * SIZE(BO)
  2629. LFD f26, 6 * SIZE(BO)
  2630. LFD f27, 7 * SIZE(BO)
  2631. srawi. r0, TEMP, 2
  2632. mtspr CTR, r0
  2633. #endif
  2634. ble .L35
  2635. .align 5
  2636. .L32:
  2637. FMADD f0, f16, f20, f0
  2638. LFD f20, 8 * SIZE(BO)
  2639. FMADD f4, f16, f21, f4
  2640. LFD f21, 9 * SIZE(BO)
  2641. FMADD f8, f16, f22, f8
  2642. LFD f22, 10 * SIZE(BO)
  2643. FMADD f12, f16, f23, f12
  2644. LFD f23, 11 * SIZE(BO)
  2645. LFDU f16, 4 * SIZE(AO)
  2646. FMADD f1, f17, f24, f1
  2647. LFD f24, 12 * SIZE(BO)
  2648. FMADD f5, f17, f25, f5
  2649. LFD f25, 13 * SIZE(BO)
  2650. FMADD f9, f17, f26, f9
  2651. LFD f26, 14 * SIZE(BO)
  2652. FMADD f13, f17, f27, f13
  2653. LFD f27, 15 * SIZE(BO)
  2654. LFD f17, 1 * SIZE(AO)
  2655. FMADD f0, f18, f20, f0
  2656. LFDU f20, 16 * SIZE(BO)
  2657. FMADD f4, f18, f21, f4
  2658. LFD f21, 1 * SIZE(BO)
  2659. FMADD f8, f18, f22, f8
  2660. LFD f22, 2 * SIZE(BO)
  2661. FMADD f12, f18, f23, f12
  2662. LFD f23, 3 * SIZE(BO)
  2663. LFD f18, 2 * SIZE(AO)
  2664. FMADD f1, f19, f24, f1
  2665. LFD f24, 4 * SIZE(BO)
  2666. FMADD f5, f19, f25, f5
  2667. LFD f25, 5 * SIZE(BO)
  2668. FMADD f9, f19, f26, f9
  2669. LFD f26, 6 * SIZE(BO)
  2670. FMADD f13, f19, f27, f13
  2671. LFD f27, 7 * SIZE(BO)
  2672. LFD f19, 3 * SIZE(AO)
  2673. bdnz .L32
  2674. fadd f0, f1, f0
  2675. fadd f4, f5, f4
  2676. fadd f8, f9, f8
  2677. fadd f12, f13, f12
  2678. .align 4
  2679. .L35:
  2680. #if defined(LT) || defined(RN)
  2681. andi. r0, KK, 3
  2682. #else
  2683. andi. r0, TEMP, 3
  2684. #endif
  2685. mtspr CTR, r0
  2686. ble+ .L38
  2687. .align 4
  2688. .L36:
  2689. FMADD f0, f16, f20, f0
  2690. LFDU f20, 4 * SIZE(BO)
  2691. FMADD f4, f16, f21, f4
  2692. LFD f21, 1 * SIZE(BO)
  2693. FMADD f8, f16, f22, f8
  2694. LFD f22, 2 * SIZE(BO)
  2695. FMADD f12, f16, f23, f12
  2696. LFDU f16, 1 * SIZE(AO)
  2697. LFD f23, 3 * SIZE(BO)
  2698. bdnz .L36
  2699. .align 4
  2700. .L38:
  2701. #if defined(LN) || defined(RT)
  2702. #ifdef LN
  2703. subi r0, KK, 1
  2704. #else
  2705. subi r0, KK, 4
  2706. #endif
  2707. slwi TEMP, r0, 0 + BASE_SHIFT
  2708. slwi r0, r0, 2 + BASE_SHIFT
  2709. add AO, AORIG, TEMP
  2710. add BO, B, r0
  2711. #endif
  2712. #if defined(LN) || defined(LT)
  2713. LFD f16, 0 * SIZE(BO)
  2714. LFD f17, 1 * SIZE(BO)
  2715. LFD f18, 2 * SIZE(BO)
  2716. LFD f19, 3 * SIZE(BO)
  2717. FSUB f0, f16, f0
  2718. FSUB f4, f17, f4
  2719. FSUB f8, f18, f8
  2720. FSUB f12, f19, f12
  2721. #else
  2722. LFD f16, 0 * SIZE(AO)
  2723. LFD f20, 1 * SIZE(AO)
  2724. LFD f24, 2 * SIZE(AO)
  2725. LFD f28, 3 * SIZE(AO)
  2726. FSUB f0, f16, f0
  2727. FSUB f4, f20, f4
  2728. FSUB f8, f24, f8
  2729. FSUB f12, f28, f12
  2730. #endif
  2731. #ifdef LN
  2732. LFD f21, 0 * SIZE(AO)
  2733. FMUL f0, f21, f0
  2734. FMUL f4, f21, f4
  2735. FMUL f8, f21, f8
  2736. FMUL f12, f21, f12
  2737. #endif
  2738. #ifdef LT
  2739. LFD f16, 0 * SIZE(AO)
  2740. FMUL f0, f16, f0
  2741. FMUL f4, f16, f4
  2742. FMUL f8, f16, f8
  2743. FMUL f12, f16, f12
  2744. #endif
  2745. #ifdef RN
  2746. LFD f16, 0 * SIZE(BO)
  2747. LFD f17, 1 * SIZE(BO)
  2748. LFD f18, 2 * SIZE(BO)
  2749. LFD f19, 3 * SIZE(BO)
  2750. FMUL f0, f16, f0
  2751. FNMSUB f4, f17, f0, f4
  2752. FNMSUB f8, f18, f0, f8
  2753. FNMSUB f12, f19, f0, f12
  2754. LFD f16, 5 * SIZE(BO)
  2755. LFD f17, 6 * SIZE(BO)
  2756. LFD f18, 7 * SIZE(BO)
  2757. LFD f19, 10 * SIZE(BO)
  2758. LFD f20, 11 * SIZE(BO)
  2759. LFD f21, 15 * SIZE(BO)
  2760. FMUL f4, f16, f4
  2761. FNMSUB f8, f17, f4, f8
  2762. FNMSUB f12, f18, f4, f12
  2763. FMUL f8, f19, f8
  2764. FNMSUB f12, f20, f8, f12
  2765. FMUL f12, f21, f12
  2766. #endif
  2767. #ifdef RT
  2768. LFD f16, 15 * SIZE(BO)
  2769. LFD f17, 14 * SIZE(BO)
  2770. LFD f18, 13 * SIZE(BO)
  2771. LFD f19, 12 * SIZE(BO)
  2772. FMUL f12, f16, f12
  2773. FNMSUB f8, f17, f12, f8
  2774. FNMSUB f4, f18, f12, f4
  2775. FNMSUB f0, f19, f12, f0
  2776. LFD f16, 10 * SIZE(BO)
  2777. LFD f17, 9 * SIZE(BO)
  2778. LFD f18, 8 * SIZE(BO)
  2779. LFD f19, 5 * SIZE(BO)
  2780. FMUL f8, f16, f8
  2781. LFD f20, 4 * SIZE(BO)
  2782. LFD f21, 0 * SIZE(BO)
  2783. FNMSUB f4, f17, f8, f4
  2784. FNMSUB f0, f18, f8, f0
  2785. FMUL f4, f19, f4
  2786. FNMSUB f0, f20, f4, f0
  2787. FMUL f0, f21, f0
  2788. #endif
  2789. #ifdef LN
  2790. subi CO1, CO1, 1 * SIZE
  2791. subi CO2, CO2, 1 * SIZE
  2792. subi CO3, CO3, 1 * SIZE
  2793. subi CO4, CO4, 1 * SIZE
  2794. #endif
  2795. #if defined(LN) || defined(LT)
  2796. STFD f0, 0 * SIZE(BO)
  2797. STFD f4, 1 * SIZE(BO)
  2798. STFD f8, 2 * SIZE(BO)
  2799. STFD f12, 3 * SIZE(BO)
  2800. #else
  2801. STFD f0, 0 * SIZE(AO)
  2802. STFD f4, 1 * SIZE(AO)
  2803. STFD f8, 2 * SIZE(AO)
  2804. STFD f12, 3 * SIZE(AO)
  2805. #endif
  2806. STFD f0, 0 * SIZE(CO1)
  2807. STFD f4, 0 * SIZE(CO2)
  2808. STFD f8, 0 * SIZE(CO3)
  2809. STFD f12, 0 * SIZE(CO4)
  2810. lfs f0, FZERO
  2811. fmr f1, f0
  2812. fmr f4, f0
  2813. fmr f5, f0
  2814. fmr f8, f0
  2815. fmr f9, f0
  2816. fmr f12, f0
  2817. fmr f13, f0
  2818. #ifndef LN
  2819. addi CO1, CO1, 1 * SIZE
  2820. addi CO2, CO2, 1 * SIZE
  2821. addi CO3, CO3, 1 * SIZE
  2822. addi CO4, CO4, 1 * SIZE
  2823. #endif
  2824. #ifdef RT
  2825. slwi r0, K, 0 + BASE_SHIFT
  2826. add AORIG, AORIG, r0
  2827. #endif
  2828. #if defined(LT) || defined(RN)
  2829. sub TEMP, K, KK
  2830. slwi r0, TEMP, 0 + BASE_SHIFT
  2831. slwi TEMP, TEMP, 2 + BASE_SHIFT
  2832. add AO, AO, r0
  2833. add BO, BO, TEMP
  2834. #endif
  2835. #ifdef LN
  2836. subi KK, KK, 1
  2837. #endif
  2838. #ifdef LT
  2839. addi KK, KK, 1
  2840. #endif
  2841. .align 4
  2842. .L39:
  2843. #ifdef LN
  2844. slwi r0, K, 2 + BASE_SHIFT
  2845. add B, B, r0
  2846. #endif
  2847. #if defined(LT) || defined(RN)
  2848. mr B, BO
  2849. #endif
  2850. #ifdef RN
  2851. addi KK, KK, 4
  2852. #endif
  2853. #ifdef RT
  2854. subi KK, KK, 4
  2855. #endif
  2856. addic. J, J, -1
  2857. lfs f0, FZERO
  2858. bgt .L10
  2859. .align 4
  2860. .L999:
  2861. addi r3, 0, 0
  2862. lfd f14, 0(SP)
  2863. lfd f15, 8(SP)
  2864. lfd f16, 16(SP)
  2865. lfd f17, 24(SP)
  2866. lfd f18, 32(SP)
  2867. lfd f19, 40(SP)
  2868. lfd f20, 48(SP)
  2869. lfd f21, 56(SP)
  2870. lfd f22, 64(SP)
  2871. lfd f23, 72(SP)
  2872. lfd f24, 80(SP)
  2873. lfd f25, 88(SP)
  2874. lfd f26, 96(SP)
  2875. lfd f27, 104(SP)
  2876. lfd f28, 112(SP)
  2877. lfd f29, 120(SP)
  2878. lfd f30, 128(SP)
  2879. lfd f31, 136(SP)
  2880. #ifdef __64BIT__
  2881. ld r31, 144(SP)
  2882. ld r30, 152(SP)
  2883. ld r29, 160(SP)
  2884. ld r28, 168(SP)
  2885. ld r27, 176(SP)
  2886. ld r26, 184(SP)
  2887. ld r25, 192(SP)
  2888. ld r24, 200(SP)
  2889. ld r23, 208(SP)
  2890. ld r22, 216(SP)
  2891. ld r21, 224(SP)
  2892. ld r20, 232(SP)
  2893. ld r19, 240(SP)
  2894. ld r18, 248(SP)
  2895. #else
  2896. lwz r31, 144(SP)
  2897. lwz r30, 148(SP)
  2898. lwz r29, 152(SP)
  2899. lwz r28, 156(SP)
  2900. lwz r27, 160(SP)
  2901. lwz r26, 164(SP)
  2902. lwz r25, 168(SP)
  2903. lwz r24, 172(SP)
  2904. lwz r23, 176(SP)
  2905. lwz r22, 180(SP)
  2906. lwz r21, 184(SP)
  2907. lwz r20, 188(SP)
  2908. lwz r19, 192(SP)
  2909. lwz r18, 196(SP)
  2910. #endif
  2911. addi SP, SP, STACKSIZE
  2912. blr
  2913. EPILOGUE