You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

trsm_kernel_ppc440_LN.S 59 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
7327832793280328132823283328432853286328732883289329032913292329332943295329632973298329933003301330233033304330533063307330833093310331133123313331433153316331733183319332033213322332333243325332633273328332933303331333233333334333533363337333833393340334133423343334433453346334733483349335033513352335333543355335633573358335933603361336233633364336533663367336833693370337133723373337433753376337733783379338033813382338333843385338633873388338933903391339233933394339533963397339833993400340134023403340434053406340734083409341034113412341334143415341634173418341934203421342234233424342534263427342834293430343134323433343434353436343734383439344034413442344334443445344634473448344934503451345234533454345534563457345834593460346134623463346434653466346734683469347034713472347334743475347634773478347934803481348234833484348534863487
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* ASSEMBLER is defined ahead of common.h; presumably that header keys its */
  /* assembler-only definitions off it -- confirm in common.h. */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* LOAD: integer load mnemonic matching the build's word size */
  /* (lwz for 32-bit builds, ld when __64BIT__ is defined). */
  40. #ifndef __64BIT__
  41. #define LOAD lwz
  42. #else
  43. #define LOAD ld
  44. #endif
  /* Stack frame layout. STACKSIZE is the amount the prologue subtracts from SP. */
  /* ALPHA and FZERO name SP-relative slots inside that frame. The prologue */
  /* stores a zero word to FZERO and the kernel reloads it with lfs to clear */
  /* the floating-point accumulators. ALPHA is not referenced in the portion */
  /* of the kernel visible here. */
  45. #ifdef __64BIT__
  46. #define STACKSIZE 320
  47. #define ALPHA 296(SP)
  48. #define FZERO 304(SP)
  49. #else
  50. #define STACKSIZE 240
  51. #define ALPHA 224(SP)
  52. #define FZERO 232(SP)
  53. #endif
  /* Incoming arguments. M, N and K are used straight from r3-r5; the kernel */
  /* returns early when any of them is <= 0. The registers that carry A, B, C, */
  /* LDC and OFFSET depend on the platform macro and word size selected below. */
  /* NOTE(review): if neither linux nor _AIX/__APPLE__ is defined, A, B, C, */
  /* LDC and OFFSET are left undefined by this section. */
  54. #define M r3
  55. #define N r4
  56. #define K r5
  /* linux, 32-bit: A, B, C, LDC, OFFSET are taken from r6-r10 as-is. */
  /* linux, 64-bit: A, B, C, LDC are in r7-r10; OFFSET is named r6 but the */
  /* prologue reloads it from the stack at 112 + STACKSIZE(SP). */
  57. #ifdef linux
  58. #ifndef __64BIT__
  59. #define A r6
  60. #define B r7
  61. #define C r8
  62. #define LDC r9
  63. #define OFFSET r10
  64. #else
  65. #define A r7
  66. #define B r8
  67. #define C r9
  68. #define LDC r10
  69. #define OFFSET r6
  70. #endif
  71. #endif
  /* AIX / Apple, 32-bit DOUBLE: A, B, C are in r8-r10; the prologue loads LDC */
  /* (into r7) and OFFSET (into r6) from the stack at 56 and 60 + STACKSIZE(SP). */
  /* AIX / Apple, otherwise: A, B, C, LDC are in r7-r10; the prologue loads */
  /* OFFSET from the stack (112 + STACKSIZE(SP) on 64-bit, 56 + STACKSIZE(SP) */
  /* on 32-bit non-DOUBLE). */
  72. #if defined(_AIX) || defined(__APPLE__)
  73. #if !defined(__64BIT__) && defined(DOUBLE)
  74. #define A r8
  75. #define B r9
  76. #define C r10
  77. #define LDC r7
  78. #define OFFSET r6
  79. #else
  80. #define A r7
  81. #define B r8
  82. #define C r9
  83. #define LDC r10
  84. #define OFFSET r6
  85. #endif
  86. #endif
  /* Integer working registers. The prologue saves r18-r31 to the stack frame */
  /* before any of these aliases is written. */
  /* AORIG - copy of A taken at the top of each .L10 pass in the LN / RT builds */
  /* TEMP - scratch for address and count arithmetic */
  /* KK - running offset counter, initialised from OFFSET */
  /* I - result of the M bit tests (andi. I, M, 1 / andi. I, M, 2) */
  /* J - N >> 2, computed once before .L10 */
  /* AO, BO - current read/write pointers into A and B */
  /* CO1..4 - four output pointers, starting at C and spaced LDC apart */
  87. #define AORIG r18
  88. #define TEMP r19
  89. #define KK r20
  90. #define I r21
  91. #define J r22
  92. #define AO r23
  93. #define BO r24
  94. #define CO1 r25
  95. #define CO2 r26
  96. #define CO3 r27
  97. #define CO4 r28
  /* Names for floating-point registers f16-f31 (the prologue saves f14-f31). */
  /* The portion of the kernel visible here refers to these registers by */
  /* number (f16, f17, ...) rather than through the A1..A6 / B1..B10 names. */
  98. #define A1 f16
  99. #define A2 f17
  100. #define A3 f18
  101. #define A4 f19
  102. #define A5 f20
  103. #define A6 f21
  104. #define B1 f22
  105. #define B2 f23
  106. #define B3 f24
  107. #define B4 f25
  108. #define B5 f26
  109. #define B6 f27
  110. #define B7 f28
  111. #define B8 f29
  112. #define B9 f30
  113. #define B10 f31
  114. PROLOGUE
  115. PROFCODE
  116. addi SP, SP, -STACKSIZE
  117. li r0, 0
  118. stfd f14, 0(SP)
  119. stfd f15, 8(SP)
  120. stfd f16, 16(SP)
  121. stfd f17, 24(SP)
  122. stfd f18, 32(SP)
  123. stfd f19, 40(SP)
  124. stfd f20, 48(SP)
  125. stfd f21, 56(SP)
  126. stfd f22, 64(SP)
  127. stfd f23, 72(SP)
  128. stfd f24, 80(SP)
  129. stfd f25, 88(SP)
  130. stfd f26, 96(SP)
  131. stfd f27, 104(SP)
  132. stfd f28, 112(SP)
  133. stfd f29, 120(SP)
  134. stfd f30, 128(SP)
  135. stfd f31, 136(SP)
  136. #ifdef __64BIT__
  137. std r31, 144(SP)
  138. std r30, 152(SP)
  139. std r29, 160(SP)
  140. std r28, 168(SP)
  141. std r27, 176(SP)
  142. std r26, 184(SP)
  143. std r25, 192(SP)
  144. std r24, 200(SP)
  145. std r23, 208(SP)
  146. std r22, 216(SP)
  147. std r21, 224(SP)
  148. std r20, 232(SP)
  149. std r19, 240(SP)
  150. std r18, 248(SP)
  151. #else
  152. stw r31, 144(SP)
  153. stw r30, 148(SP)
  154. stw r29, 152(SP)
  155. stw r28, 156(SP)
  156. stw r27, 160(SP)
  157. stw r26, 164(SP)
  158. stw r25, 168(SP)
  159. stw r24, 172(SP)
  160. stw r23, 176(SP)
  161. stw r22, 180(SP)
  162. stw r21, 184(SP)
  163. stw r20, 188(SP)
  164. stw r19, 192(SP)
  165. stw r18, 196(SP)
  166. #endif
  167. stw r0, FZERO
  168. #if defined(_AIX) || defined(__APPLE__)
  169. #if !defined(__64BIT__) && defined(DOUBLE)
  170. lwz LDC, 56 + STACKSIZE(SP)
  171. #endif
  172. #endif
  173. slwi LDC, LDC, BASE_SHIFT
  174. #if defined(linux) && defined(__64BIT__)
  175. ld OFFSET, 112 + STACKSIZE(SP)
  176. #endif
  177. #if defined(_AIX) || defined(__APPLE__)
  178. #ifdef __64BIT__
  179. ld OFFSET, 112 + STACKSIZE(SP)
  180. #else
  181. #ifdef DOUBLE
  182. lwz OFFSET, 60 + STACKSIZE(SP)
  183. #else
  184. lwz OFFSET, 56 + STACKSIZE(SP)
  185. #endif
  186. #endif
  187. #endif
  188. #ifdef LN
  189. mullw r0, M, K
  190. slwi r0, r0, BASE_SHIFT
  191. add A, A, r0
  192. slwi r0, M, BASE_SHIFT
  193. add C, C, r0
  194. #endif
  195. #ifdef RN
  196. neg KK, OFFSET
  197. #endif
  198. #ifdef RT
  199. mullw r0, N, K
  200. slwi r0, r0, BASE_SHIFT
  201. add B, B, r0
  202. mullw r0, N, LDC
  203. add C, C, r0
  204. sub KK, N, OFFSET
  205. #endif
  206. cmpwi cr0, M, 0
  207. ble .L999
  208. cmpwi cr0, N, 0
  209. ble .L999
  210. cmpwi cr0, K, 0
  211. ble .L999
  212. lfs f0, FZERO
  213. srawi. J, N, 2
  214. ble .L40
  215. .align 4
  216. .L10:
  217. #ifdef RT
  218. slwi r0, K, 2 + BASE_SHIFT
  219. sub B, B, r0
  220. slwi r0, LDC, 2
  221. sub C, C, r0
  222. #endif
  223. mr CO1, C
  224. add CO2, C, LDC
  225. add CO3, CO2, LDC
  226. add CO4, CO3, LDC
  227. #ifdef LN
  228. add KK, M, OFFSET
  229. #endif
  230. #ifdef LT
  231. mr KK, OFFSET
  232. #endif
  233. fmr f1, f0
  234. fmr f2, f0
  235. fmr f3, f0
  236. fmr f4, f0
  237. fmr f5, f0
  238. fmr f6, f0
  239. fmr f7, f0
  240. fmr f8, f0
  241. fmr f9, f0
  242. fmr f10, f0
  243. fmr f11, f0
  244. fmr f12, f0
  245. fmr f13, f0
  246. fmr f14, f0
  247. fmr f15, f0
  248. #if defined(LN) || defined(RT)
  249. mr AORIG, A
  250. #else
  251. mr AO, A
  252. #endif
  253. #ifndef RT
  254. add C, CO4, LDC
  255. #endif
  256. .L30:
  257. andi. I, M, 1
  258. ble .L20
  259. #if defined(LT) || defined(RN)
  260. LFD f16, 0 * SIZE(AO)
  261. LFD f17, 1 * SIZE(AO)
  262. LFD f18, 2 * SIZE(AO)
  263. LFD f19, 3 * SIZE(AO)
  264. LFD f20, 0 * SIZE(B)
  265. LFD f21, 1 * SIZE(B)
  266. LFD f22, 2 * SIZE(B)
  267. LFD f23, 3 * SIZE(B)
  268. LFD f24, 4 * SIZE(B)
  269. LFD f25, 5 * SIZE(B)
  270. LFD f26, 6 * SIZE(B)
  271. LFD f27, 7 * SIZE(B)
  272. srawi. r0, KK, 2
  273. mtspr CTR, r0
  274. mr BO, B
  275. #else
  276. #ifdef LN
  277. slwi r0, K, BASE_SHIFT
  278. sub AORIG, AORIG, r0
  279. #endif
  280. slwi r0, KK, 0 + BASE_SHIFT
  281. slwi TEMP, KK, 2 + BASE_SHIFT
  282. add AO, AORIG, r0
  283. add BO, B, TEMP
  284. sub TEMP, K, KK
  285. LFD f16, 0 * SIZE(AO)
  286. LFD f17, 1 * SIZE(AO)
  287. LFD f18, 2 * SIZE(AO)
  288. LFD f19, 3 * SIZE(AO)
  289. LFD f20, 0 * SIZE(BO)
  290. LFD f21, 1 * SIZE(BO)
  291. LFD f22, 2 * SIZE(BO)
  292. LFD f23, 3 * SIZE(BO)
  293. LFD f24, 4 * SIZE(BO)
  294. LFD f25, 5 * SIZE(BO)
  295. LFD f26, 6 * SIZE(BO)
  296. LFD f27, 7 * SIZE(BO)
  297. srawi. r0, TEMP, 2
  298. mtspr CTR, r0
  299. #endif
  300. ble .L35
  301. .align 5
  302. .L32:
  303. FMADD f0, f16, f20, f0
  304. LFD f20, 8 * SIZE(BO)
  305. FMADD f4, f16, f21, f4
  306. LFD f21, 9 * SIZE(BO)
  307. FMADD f8, f16, f22, f8
  308. LFD f22, 10 * SIZE(BO)
  309. FMADD f12, f16, f23, f12
  310. LFD f23, 11 * SIZE(BO)
  311. LFDU f16, 4 * SIZE(AO)
  312. FMADD f1, f17, f24, f1
  313. LFD f24, 12 * SIZE(BO)
  314. FMADD f5, f17, f25, f5
  315. LFD f25, 13 * SIZE(BO)
  316. FMADD f9, f17, f26, f9
  317. LFD f26, 14 * SIZE(BO)
  318. FMADD f13, f17, f27, f13
  319. LFD f27, 15 * SIZE(BO)
  320. LFD f17, 1 * SIZE(AO)
  321. FMADD f0, f18, f20, f0
  322. LFDU f20, 16 * SIZE(BO)
  323. FMADD f4, f18, f21, f4
  324. LFD f21, 1 * SIZE(BO)
  325. FMADD f8, f18, f22, f8
  326. LFD f22, 2 * SIZE(BO)
  327. FMADD f12, f18, f23, f12
  328. LFD f23, 3 * SIZE(BO)
  329. LFD f18, 2 * SIZE(AO)
  330. FMADD f1, f19, f24, f1
  331. LFD f24, 4 * SIZE(BO)
  332. FMADD f5, f19, f25, f5
  333. LFD f25, 5 * SIZE(BO)
  334. FMADD f9, f19, f26, f9
  335. LFD f26, 6 * SIZE(BO)
  336. FMADD f13, f19, f27, f13
  337. LFD f27, 7 * SIZE(BO)
  338. LFD f19, 3 * SIZE(AO)
  339. bdnz .L32
  340. fadd f0, f1, f0
  341. fadd f4, f5, f4
  342. fadd f8, f9, f8
  343. fadd f12, f13, f12
  344. .align 4
  345. .L35:
  346. #if defined(LT) || defined(RN)
  347. andi. r0, KK, 3
  348. #else
  349. andi. r0, TEMP, 3
  350. #endif
  351. mtspr CTR, r0
  352. ble+ .L38
  353. .align 4
  354. .L36:
  355. FMADD f0, f16, f20, f0
  356. LFDU f20, 4 * SIZE(BO)
  357. FMADD f4, f16, f21, f4
  358. LFD f21, 1 * SIZE(BO)
  359. FMADD f8, f16, f22, f8
  360. LFD f22, 2 * SIZE(BO)
  361. FMADD f12, f16, f23, f12
  362. LFDU f16, 1 * SIZE(AO)
  363. LFD f23, 3 * SIZE(BO)
  364. bdnz .L36
  365. .align 4
  366. .L38:
  367. #if defined(LN) || defined(RT)
  368. #ifdef LN
  369. subi r0, KK, 1
  370. #else
  371. subi r0, KK, 4
  372. #endif
  373. slwi TEMP, r0, 0 + BASE_SHIFT
  374. slwi r0, r0, 2 + BASE_SHIFT
  375. add AO, AORIG, TEMP
  376. add BO, B, r0
  377. #endif
  378. #if defined(LN) || defined(LT)
  379. LFD f16, 0 * SIZE(BO)
  380. LFD f17, 1 * SIZE(BO)
  381. LFD f18, 2 * SIZE(BO)
  382. LFD f19, 3 * SIZE(BO)
  383. FSUB f0, f16, f0
  384. FSUB f4, f17, f4
  385. FSUB f8, f18, f8
  386. FSUB f12, f19, f12
  387. #else
  388. LFD f16, 0 * SIZE(AO)
  389. LFD f20, 1 * SIZE(AO)
  390. LFD f24, 2 * SIZE(AO)
  391. LFD f28, 3 * SIZE(AO)
  392. FSUB f0, f16, f0
  393. FSUB f4, f20, f4
  394. FSUB f8, f24, f8
  395. FSUB f12, f28, f12
  396. #endif
  397. #ifdef LN
  398. LFD f21, 0 * SIZE(AO)
  399. FMUL f0, f21, f0
  400. FMUL f4, f21, f4
  401. FMUL f8, f21, f8
  402. FMUL f12, f21, f12
  403. #endif
  404. #ifdef LT
  405. LFD f16, 0 * SIZE(AO)
  406. FMUL f0, f16, f0
  407. FMUL f4, f16, f4
  408. FMUL f8, f16, f8
  409. FMUL f12, f16, f12
  410. #endif
  411. #ifdef RN
  412. LFD f16, 0 * SIZE(BO)
  413. LFD f17, 1 * SIZE(BO)
  414. LFD f18, 2 * SIZE(BO)
  415. LFD f19, 3 * SIZE(BO)
  416. FMUL f0, f16, f0
  417. FNMSUB f4, f17, f0, f4
  418. FNMSUB f8, f18, f0, f8
  419. FNMSUB f12, f19, f0, f12
  420. LFD f16, 5 * SIZE(BO)
  421. LFD f17, 6 * SIZE(BO)
  422. LFD f18, 7 * SIZE(BO)
  423. LFD f19, 10 * SIZE(BO)
  424. LFD f20, 11 * SIZE(BO)
  425. LFD f21, 15 * SIZE(BO)
  426. FMUL f4, f16, f4
  427. FNMSUB f8, f17, f4, f8
  428. FNMSUB f12, f18, f4, f12
  429. FMUL f8, f19, f8
  430. FNMSUB f12, f20, f8, f12
  431. FMUL f12, f21, f12
  432. #endif
  433. #ifdef RT
  434. LFD f16, 15 * SIZE(BO)
  435. LFD f17, 14 * SIZE(BO)
  436. LFD f18, 13 * SIZE(BO)
  437. LFD f19, 12 * SIZE(BO)
  438. FMUL f12, f16, f12
  439. FNMSUB f8, f17, f12, f8
  440. FNMSUB f4, f18, f12, f4
  441. FNMSUB f0, f19, f12, f0
  442. LFD f16, 10 * SIZE(BO)
  443. LFD f17, 9 * SIZE(BO)
  444. LFD f18, 8 * SIZE(BO)
  445. LFD f19, 5 * SIZE(BO)
  446. FMUL f8, f16, f8
  447. LFD f20, 4 * SIZE(BO)
  448. LFD f21, 0 * SIZE(BO)
  449. FNMSUB f4, f17, f8, f4
  450. FNMSUB f0, f18, f8, f0
  451. FMUL f4, f19, f4
  452. FNMSUB f0, f20, f4, f0
  453. FMUL f0, f21, f0
  454. #endif
  455. #ifdef LN
  456. subi CO1, CO1, 1 * SIZE
  457. subi CO2, CO2, 1 * SIZE
  458. subi CO3, CO3, 1 * SIZE
  459. subi CO4, CO4, 1 * SIZE
  460. #endif
  461. #if defined(LN) || defined(LT)
  462. STFD f0, 0 * SIZE(BO)
  463. STFD f4, 1 * SIZE(BO)
  464. STFD f8, 2 * SIZE(BO)
  465. STFD f12, 3 * SIZE(BO)
  466. #else
  467. STFD f0, 0 * SIZE(AO)
  468. STFD f4, 1 * SIZE(AO)
  469. STFD f8, 2 * SIZE(AO)
  470. STFD f12, 3 * SIZE(AO)
  471. #endif
  472. STFD f0, 0 * SIZE(CO1)
  473. STFD f4, 0 * SIZE(CO2)
  474. STFD f8, 0 * SIZE(CO3)
  475. STFD f12, 0 * SIZE(CO4)
  476. lfs f0, FZERO
  477. fmr f1, f0
  478. fmr f4, f0
  479. fmr f5, f0
  480. fmr f8, f0
  481. fmr f9, f0
  482. fmr f12, f0
  483. fmr f13, f0
  484. #ifndef LN
  485. addi CO1, CO1, 1 * SIZE
  486. addi CO2, CO2, 1 * SIZE
  487. addi CO3, CO3, 1 * SIZE
  488. addi CO4, CO4, 1 * SIZE
  489. #endif
  490. #ifdef RT
  491. slwi r0, K, 0 + BASE_SHIFT
  492. add AORIG, AORIG, r0
  493. #endif
  494. #if defined(LT) || defined(RN)
  495. sub TEMP, K, KK
  496. slwi r0, TEMP, 0 + BASE_SHIFT
  497. slwi TEMP, TEMP, 2 + BASE_SHIFT
  498. add AO, AO, r0
  499. add BO, BO, TEMP
  500. #endif
  501. #ifdef LN
  502. subi KK, KK, 1
  503. #endif
  504. #ifdef LT
  505. addi KK, KK, 1
  506. #endif
  507. .align 4
  508. .L20:
  509. andi. I, M, 2
  510. ble .L09
  511. #if defined(LT) || defined(RN)
  512. LFD f16, 0 * SIZE(AO)
  513. LFD f17, 1 * SIZE(AO)
  514. LFD f18, 2 * SIZE(AO)
  515. LFD f19, 3 * SIZE(AO)
  516. LFD f20, 0 * SIZE(B)
  517. LFD f21, 1 * SIZE(B)
  518. LFD f22, 2 * SIZE(B)
  519. LFD f23, 3 * SIZE(B)
  520. LFD f24, 4 * SIZE(B)
  521. LFD f25, 5 * SIZE(B)
  522. LFD f26, 6 * SIZE(B)
  523. LFD f27, 7 * SIZE(B)
  524. srawi. r0, KK, 2
  525. mtspr CTR, r0
  526. mr BO, B
  527. #else
  528. #ifdef LN
  529. slwi r0, K, 1 + BASE_SHIFT
  530. sub AORIG, AORIG, r0
  531. #endif
  532. slwi r0, KK, 1 + BASE_SHIFT
  533. slwi TEMP, KK, 2 + BASE_SHIFT
  534. add AO, AORIG, r0
  535. add BO, B, TEMP
  536. sub TEMP, K, KK
  537. LFD f16, 0 * SIZE(AO)
  538. LFD f17, 1 * SIZE(AO)
  539. LFD f18, 2 * SIZE(AO)
  540. LFD f19, 3 * SIZE(AO)
  541. LFD f20, 0 * SIZE(BO)
  542. LFD f21, 1 * SIZE(BO)
  543. LFD f22, 2 * SIZE(BO)
  544. LFD f23, 3 * SIZE(BO)
  545. LFD f24, 4 * SIZE(BO)
  546. LFD f25, 5 * SIZE(BO)
  547. LFD f26, 6 * SIZE(BO)
  548. LFD f27, 7 * SIZE(BO)
  549. srawi. r0, TEMP, 2
  550. mtspr CTR, r0
  551. #endif
  552. ble .L25
  553. .align 5
  554. .L22:
  555. FMADD f0, f16, f20, f0
  556. nop
  557. FMADD f1, f17, f20, f1
  558. LFD f20, 8 * SIZE(BO)
  559. FMADD f4, f16, f21, f4
  560. nop
  561. FMADD f5, f17, f21, f5
  562. LFD f21, 9 * SIZE(BO)
  563. FMADD f8, f16, f22, f8
  564. nop
  565. FMADD f9, f17, f22, f9
  566. LFD f22, 10 * SIZE(BO)
  567. FMADD f12, f16, f23, f12
  568. LFD f16, 4 * SIZE(AO)
  569. FMADD f13, f17, f23, f13
  570. LFD f23, 11 * SIZE(BO)
  571. FMADD f2, f18, f24, f2
  572. LFD f17, 5 * SIZE(AO)
  573. FMADD f3, f19, f24, f3
  574. LFD f24, 12 * SIZE(BO)
  575. FMADD f6, f18, f25, f6
  576. nop
  577. FMADD f7, f19, f25, f7
  578. LFD f25, 13 * SIZE(BO)
  579. FMADD f10, f18, f26, f10
  580. nop
  581. FMADD f11, f19, f26, f11
  582. LFD f26, 14 * SIZE(BO)
  583. FMADD f14, f18, f27, f14
  584. LFD f18, 6 * SIZE(AO)
  585. FMADD f15, f19, f27, f15
  586. LFD f27, 15 * SIZE(BO)
  587. FMADD f0, f16, f20, f0
  588. LFD f19, 7 * SIZE(AO)
  589. FMADD f1, f17, f20, f1
  590. LFDU f20, 16 * SIZE(BO)
  591. FMADD f4, f16, f21, f4
  592. nop
  593. FMADD f5, f17, f21, f5
  594. LFD f21, 1 * SIZE(BO)
  595. FMADD f8, f16, f22, f8
  596. nop
  597. FMADD f9, f17, f22, f9
  598. LFD f22, 2 * SIZE(BO)
  599. FMADD f12, f16, f23, f12
  600. LFDU f16, 8 * SIZE(AO)
  601. FMADD f13, f17, f23, f13
  602. LFD f23, 3 * SIZE(BO)
  603. FMADD f2, f18, f24, f2
  604. LFD f17, 1 * SIZE(AO)
  605. FMADD f3, f19, f24, f3
  606. LFD f24, 4 * SIZE(BO)
  607. FMADD f6, f18, f25, f6
  608. nop
  609. FMADD f7, f19, f25, f7
  610. LFD f25, 5 * SIZE(BO)
  611. FMADD f10, f18, f26, f10
  612. nop
  613. FMADD f11, f19, f26, f11
  614. LFD f26, 6 * SIZE(BO)
  615. FMADD f14, f18, f27, f14
  616. LFD f18, 2 * SIZE(AO)
  617. FMADD f15, f19, f27, f15
  618. LFD f19, 3 * SIZE(AO)
  619. LFD f27, 7 * SIZE(BO)
  620. bdnz .L22
  621. fadd f0, f2, f0
  622. fadd f1, f3, f1
  623. fadd f4, f6, f4
  624. fadd f5, f7, f5
  625. fadd f8, f10, f8
  626. fadd f9, f11, f9
  627. fadd f12, f14, f12
  628. fadd f13, f15, f13
  629. .align 4
  630. .L25:
  631. #if defined(LT) || defined(RN)
  632. andi. r0, KK, 3
  633. #else
  634. andi. r0, TEMP, 3
  635. #endif
  636. mtspr CTR, r0
  637. ble+ .L28
  638. .align 4
  639. .L26:
  640. FMADD f0, f16, f20, f0
  641. nop
  642. FMADD f1, f17, f20, f1
  643. LFDU f20, 4 * SIZE(BO)
  644. FMADD f4, f16, f21, f4
  645. nop
  646. FMADD f5, f17, f21, f5
  647. LFD f21, 1 * SIZE(BO)
  648. FMADD f8, f16, f22, f8
  649. nop
  650. FMADD f9, f17, f22, f9
  651. LFD f22, 2 * SIZE(BO)
  652. FMADD f12, f16, f23, f12
  653. LFDU f16, 2 * SIZE(AO)
  654. FMADD f13, f17, f23, f13
  655. LFD f17, 1 * SIZE(AO)
  656. LFD f23, 3 * SIZE(BO)
  657. bdnz .L26
  /*
   * .L28: finish the tile whose accumulators are f0/f1, f4/f5, f8/f9 and
   * f12/f13 (two values for each of CO1..CO4).  Steps: reposition AO/BO
   * (LN/RT), subtract the accumulated products from the packed values,
   * apply the LN/LT/RN/RT triangular solve, write the result back to the
   * packed buffer and to C, clear the accumulators and advance pointers.
   * FSUB/FMUL/FNMSUB/LFD/STFD are macros defined outside this chunk;
   * presumably FSUB d,a,b is d = a - b and FNMSUB d,a,c,b is d = b - a*c.
   */
  658. .align 4
  659. .L28:
  660. #if defined(LN) || defined(RT)
  /* r0 = KK - 2 (LN) or KK - 4 (RT); AO/BO are rebuilt from AORIG/B. */
  661. #ifdef LN
  662. subi r0, KK, 2
  663. #else
  664. subi r0, KK, 4
  665. #endif
  666. slwi TEMP, r0, 1 + BASE_SHIFT
  667. slwi r0, r0, 2 + BASE_SHIFT
  668. add AO, AORIG, TEMP
  669. add BO, B, r0
  670. #endif
  671. #if defined(LN) || defined(LT)
  /* Right-hand side is read from BO: four column values per row. */
  672. LFD f16, 0 * SIZE(BO)
  673. LFD f17, 1 * SIZE(BO)
  674. LFD f18, 2 * SIZE(BO)
  675. LFD f19, 3 * SIZE(BO)
  676. LFD f20, 4 * SIZE(BO)
  677. LFD f21, 5 * SIZE(BO)
  678. LFD f22, 6 * SIZE(BO)
  679. LFD f23, 7 * SIZE(BO)
  680. FSUB f0, f16, f0
  681. FSUB f4, f17, f4
  682. FSUB f8, f18, f8
  683. FSUB f12, f19, f12
  684. FSUB f1, f20, f1
  685. FSUB f5, f21, f5
  686. FSUB f9, f22, f9
  687. FSUB f13, f23, f13
  688. #else
  /* Right-hand side is read from AO: two row values per column. */
  689. LFD f16, 0 * SIZE(AO)
  690. LFD f17, 1 * SIZE(AO)
  691. LFD f20, 2 * SIZE(AO)
  692. LFD f21, 3 * SIZE(AO)
  693. LFD f24, 4 * SIZE(AO)
  694. LFD f25, 5 * SIZE(AO)
  695. LFD f28, 6 * SIZE(AO)
  696. LFD f29, 7 * SIZE(AO)
  697. FSUB f0, f16, f0
  698. FSUB f1, f17, f1
  699. FSUB f4, f20, f4
  700. FSUB f5, f21, f5
  701. FSUB f8, f24, f8
  702. FSUB f9, f25, f9
  703. FSUB f12, f28, f12
  704. FSUB f13, f29, f13
  705. #endif
  706. #ifdef LN
  /* LN: second row first (scaled by AO[3]), then eliminate with AO[2]
     and scale the first row by AO[0].  Diagonal entries are multiplied,
     not divided, so they are presumably stored pre-inverted. */
  707. LFD f19, 3 * SIZE(AO)
  708. LFD f20, 2 * SIZE(AO)
  709. LFD f21, 0 * SIZE(AO)
  710. FMUL f1, f19, f1
  711. FMUL f5, f19, f5
  712. FMUL f9, f19, f9
  713. FMUL f13, f19, f13
  714. FNMSUB f0, f20, f1, f0
  715. FNMSUB f4, f20, f5, f4
  716. FNMSUB f8, f20, f9, f8
  717. FNMSUB f12, f20, f13, f12
  718. FMUL f0, f21, f0
  719. FMUL f4, f21, f4
  720. FMUL f8, f21, f8
  721. FMUL f12, f21, f12
  722. #endif
  723. #ifdef LT
  /* LT: first row first (AO[0]), eliminate with AO[1], scale by AO[3]. */
  724. LFD f16, 0 * SIZE(AO)
  725. LFD f17, 1 * SIZE(AO)
  726. FMUL f0, f16, f0
  727. FMUL f4, f16, f4
  728. FMUL f8, f16, f8
  729. FMUL f12, f16, f12
  730. FNMSUB f1, f17, f0, f1
  731. FNMSUB f5, f17, f4, f5
  732. FNMSUB f9, f17, f8, f9
  733. FNMSUB f13, f17, f12, f13
  734. LFD f17, 3 * SIZE(AO)
  735. FMUL f1, f17, f1
  736. FMUL f5, f17, f5
  737. FMUL f9, f17, f9
  738. FMUL f13, f17, f13
  739. #endif
  740. #ifdef RN
  /* RN: columns are solved in order f0/f1 -> f4/f5 -> f8/f9 -> f12/f13
     using BO[0..3], BO[5..7], BO[10..11], BO[15]. */
  741. LFD f16, 0 * SIZE(BO)
  742. LFD f17, 1 * SIZE(BO)
  743. LFD f18, 2 * SIZE(BO)
  744. LFD f19, 3 * SIZE(BO)
  745. FMUL f0, f16, f0
  746. FMUL f1, f16, f1
  747. FNMSUB f4, f17, f0, f4
  748. FNMSUB f5, f17, f1, f5
  749. FNMSUB f8, f18, f0, f8
  750. FNMSUB f9, f18, f1, f9
  751. FNMSUB f12, f19, f0, f12
  752. FNMSUB f13, f19, f1, f13
  753. LFD f16, 5 * SIZE(BO)
  754. LFD f17, 6 * SIZE(BO)
  755. LFD f18, 7 * SIZE(BO)
  756. LFD f19, 10 * SIZE(BO)
  757. LFD f20, 11 * SIZE(BO)
  758. LFD f21, 15 * SIZE(BO)
  759. FMUL f4, f16, f4
  760. FMUL f5, f16, f5
  761. FNMSUB f8, f17, f4, f8
  762. FNMSUB f9, f17, f5, f9
  763. FNMSUB f12, f18, f4, f12
  764. FNMSUB f13, f18, f5, f13
  765. FMUL f8, f19, f8
  766. FMUL f9, f19, f9
  767. FNMSUB f12, f20, f8, f12
  768. FNMSUB f13, f20, f9, f13
  769. FMUL f12, f21, f12
  770. FMUL f13, f21, f13
  771. #endif
  772. #ifdef RT
  /* RT: columns are solved in reverse order, f12/f13 first, using
     BO[15..12], BO[10..8], BO[5..4], BO[0]. */
  773. LFD f16, 15 * SIZE(BO)
  774. LFD f17, 14 * SIZE(BO)
  775. LFD f18, 13 * SIZE(BO)
  776. LFD f19, 12 * SIZE(BO)
  777. FMUL f12, f16, f12
  778. FMUL f13, f16, f13
  779. FNMSUB f8, f17, f12, f8
  780. FNMSUB f9, f17, f13, f9
  781. FNMSUB f4, f18, f12, f4
  782. FNMSUB f5, f18, f13, f5
  783. FNMSUB f0, f19, f12, f0
  784. FNMSUB f1, f19, f13, f1
  785. LFD f16, 10 * SIZE(BO)
  786. LFD f17, 9 * SIZE(BO)
  787. LFD f18, 8 * SIZE(BO)
  788. LFD f19, 5 * SIZE(BO)
  789. LFD f20, 4 * SIZE(BO)
  790. LFD f21, 0 * SIZE(BO)
  791. FMUL f8, f16, f8
  792. FMUL f9, f16, f9
  793. FNMSUB f4, f17, f8, f4
  794. FNMSUB f5, f17, f9, f5
  795. FNMSUB f0, f18, f8, f0
  796. FNMSUB f1, f18, f9, f1
  797. FMUL f4, f19, f4
  798. FMUL f5, f19, f5
  799. FNMSUB f0, f20, f4, f0
  800. FNMSUB f1, f20, f5, f1
  801. FMUL f0, f21, f0
  802. FMUL f1, f21, f1
  803. #endif
  804. #ifdef LN
  /* LN moves the C pointers back by two elements before storing. */
  805. subi CO1, CO1, 2 * SIZE
  806. subi CO2, CO2, 2 * SIZE
  807. subi CO3, CO3, 2 * SIZE
  808. subi CO4, CO4, 2 * SIZE
  809. #endif
  /* Write the solved values back to the buffer they were read from. */
  810. #if defined(LN) || defined(LT)
  811. STFD f0, 0 * SIZE(BO)
  812. STFD f4, 1 * SIZE(BO)
  813. STFD f8, 2 * SIZE(BO)
  814. STFD f12, 3 * SIZE(BO)
  815. STFD f1, 4 * SIZE(BO)
  816. STFD f5, 5 * SIZE(BO)
  817. STFD f9, 6 * SIZE(BO)
  818. STFD f13, 7 * SIZE(BO)
  819. #else
  820. STFD f0, 0 * SIZE(AO)
  821. STFD f1, 1 * SIZE(AO)
  822. STFD f4, 2 * SIZE(AO)
  823. STFD f5, 3 * SIZE(AO)
  824. STFD f8, 4 * SIZE(AO)
  825. STFD f9, 5 * SIZE(AO)
  826. STFD f12, 6 * SIZE(AO)
  827. STFD f13, 7 * SIZE(AO)
  828. #endif
  /* Store the tile to the four C column pointers. */
  829. STFD f0, 0 * SIZE(CO1)
  830. STFD f1, 1 * SIZE(CO1)
  831. STFD f4, 0 * SIZE(CO2)
  832. STFD f5, 1 * SIZE(CO2)
  833. STFD f8, 0 * SIZE(CO3)
  834. STFD f9, 1 * SIZE(CO3)
  835. STFD f12, 0 * SIZE(CO4)
  836. STFD f13, 1 * SIZE(CO4)
  /* Reload FZERO into f0 and copy it to f1..f15 for the next tile. */
  837. lfs f0, FZERO
  838. fmr f1, f0
  839. fmr f2, f0
  840. fmr f3, f0
  841. fmr f4, f0
  842. fmr f5, f0
  843. fmr f6, f0
  844. fmr f7, f0
  845. fmr f8, f0
  846. fmr f9, f0
  847. fmr f10, f0
  848. fmr f11, f0
  849. fmr f12, f0
  850. fmr f13, f0
  851. fmr f14, f0
  852. fmr f15, f0
  853. #ifndef LN
  854. addi CO1, CO1, 2 * SIZE
  855. addi CO2, CO2, 2 * SIZE
  856. addi CO3, CO3, 2 * SIZE
  857. addi CO4, CO4, 2 * SIZE
  858. #endif
  859. #ifdef RT
  860. slwi r0, K, 1 + BASE_SHIFT
  861. add AORIG, AORIG, r0
  862. #endif
  863. #if defined(LT) || defined(RN)
  /* Skip the remaining K - KK entries of both packed buffers. */
  864. sub TEMP, K, KK
  865. slwi r0, TEMP, 1 + BASE_SHIFT
  866. slwi TEMP, TEMP, 2 + BASE_SHIFT
  867. add AO, AO, r0
  868. add BO, BO, TEMP
  869. #endif
  870. #ifdef LN
  871. subi KK, KK, 2
  872. #endif
  873. #ifdef LT
  874. addi KK, KK, 2
  875. #endif
  /* .L09: I = M >> 2 (number of 4-row tiles); skip to .L39 when I <= 0. */
  876. .align 4
  877. .L09:
  878. srawi. I, M, 2
  879. ble .L39
  /*
   * .L11: set up one 4x4 tile.  Positions AO/BO, preloads the first
   * operands (A1, A2, A4, A5 and B1..B7) and loads CTR with the count for
   * the unrolled loop at .L12.  The final "ble .L15" tests the condition
   * set by the preceding "srawi." (the loads and mtspr in between do not
   * write CR0).
   */
  880. .align 4
  881. .L11:
  882. #if defined(LT) || defined(RN)
  /* LT/RN: B operands are read from B directly; CTR = KK >> 2. */
  883. LFD A1, 0 * SIZE(AO)
  884. LFD A2, 1 * SIZE(AO)
  885. LFD A4, 4 * SIZE(AO)
  886. LFD A5, 8 * SIZE(AO)
  887. LFD B1, 0 * SIZE(B)
  888. LFD B2, 1 * SIZE(B)
  889. LFD B3, 2 * SIZE(B)
  890. LFD B4, 3 * SIZE(B)
  891. LFD B5, 4 * SIZE(B)
  892. LFD B6, 8 * SIZE(B)
  893. LFD B7, 12 * SIZE(B)
  894. srawi. r0, KK, 2
  895. mtspr CTR, r0
  896. mr BO, B
  897. #else
  898. #ifdef LN
  /* LN: move AORIG back by K << (2 + BASE_SHIFT) bytes. */
  899. slwi r0, K, 2 + BASE_SHIFT
  900. sub AORIG, AORIG, r0
  901. #endif
  /* LN/RT: start both streams KK << (2 + BASE_SHIFT) bytes in and
     iterate over the remaining K - KK entries (kept in TEMP). */
  902. slwi TEMP, KK, 2 + BASE_SHIFT
  903. add AO, AORIG, TEMP
  904. add BO, B, TEMP
  905. sub TEMP, K, KK
  906. LFD A1, 0 * SIZE(AO)
  907. LFD A2, 1 * SIZE(AO)
  908. LFD A4, 4 * SIZE(AO)
  909. LFD A5, 8 * SIZE(AO)
  910. LFD B1, 0 * SIZE(BO)
  911. LFD B2, 1 * SIZE(BO)
  912. LFD B3, 2 * SIZE(BO)
  913. LFD B4, 3 * SIZE(BO)
  914. LFD B5, 4 * SIZE(BO)
  915. LFD B6, 8 * SIZE(BO)
  916. LFD B7, 12 * SIZE(BO)
  917. srawi. r0, TEMP, 2
  918. mtspr CTR, r0
  919. #endif
  920. ble .L15
  /*
   * .L12: unrolled inner loop of the 4x4 tile, run CTR times.  Each
   * iteration issues four groups of 16 FMADDs into f0..f15 and advances
   * AO and BO by 16 * SIZE.  Loads for later groups and for the next
   * iteration are interleaved with the FMADDs; the nops are presumably
   * issue-slot padding for the target core (not verifiable from here).
   * Because the loads read ahead with offsets up to 28 * SIZE, the
   * statement order must not be changed.
   */
  921. .align 4
  922. .L12:
  /* Group 1: A values at offsets 0..3, B1..B4. */
  923. FMADD f0, A1, B1, f0
  924. LFD A3, 2 * SIZE(AO)
  925. FMADD f4, A1, B2, f4
  926. LFD A6, 12 * SIZE(AO)
  927. FMADD f8, A1, B3, f8
  928. nop
  929. FMADD f12, A1, B4, f12
  930. nop
  931. FMADD f1, A2, B1, f1
  932. LFD A1, 3 * SIZE(AO)
  933. FMADD f5, A2, B2, f5
  934. nop
  935. FMADD f9, A2, B3, f9
  936. nop
  937. FMADD f13, A2, B4, f13
  938. nop
  939. FMADD f2, A3, B1, f2
  940. nop
  941. FMADD f6, A3, B2, f6
  942. LFD B8, 5 * SIZE(BO)
  943. FMADD f10, A3, B3, f10
  944. LFD B9, 6 * SIZE(BO)
  945. FMADD f14, A3, B4, f14
  946. LFD B10, 7 * SIZE(BO)
  947. FMADD f3, A1, B1, f3
  948. LFD A2, 5 * SIZE(AO)
  949. FMADD f7, A1, B2, f7
  950. LFD B1, 16 * SIZE(BO)
  951. FMADD f11, A1, B3, f11
  952. nop
  953. FMADD f15, A1, B4, f15
  954. nop
  /* Group 2: A values at offsets 4..7, B5 and B8..B10. */
  955. FMADD f0, A4, B5, f0
  956. LFD A3, 6 * SIZE(AO)
  957. FMADD f4, A4, B8, f4
  958. LFD A1, 16 * SIZE(AO)
  959. FMADD f8, A4, B9, f8
  960. nop
  961. FMADD f12, A4, B10, f12
  962. nop
  963. FMADD f1, A2, B5, f1
  964. LFD A4, 7 * SIZE(AO)
  965. FMADD f5, A2, B8, f5
  966. nop
  967. FMADD f9, A2, B9, f9
  968. nop
  969. FMADD f13, A2, B10, f13
  970. nop
  971. FMADD f2, A3, B5, f2
  972. nop
  973. FMADD f6, A3, B8, f6
  974. LFD B2, 9 * SIZE(BO)
  975. FMADD f10, A3, B9, f10
  976. LFD B3, 10 * SIZE(BO)
  977. FMADD f14, A3, B10, f14
  978. LFD B4, 11 * SIZE(BO)
  979. FMADD f3, A4, B5, f3
  980. LFD A2, 9 * SIZE(AO)
  981. FMADD f7, A4, B8, f7
  982. LFD B5, 20 * SIZE(BO)
  983. FMADD f11, A4, B9, f11
  984. nop
  985. FMADD f15, A4, B10, f15
  986. nop
  /* Group 3: A values at offsets 8..11, B6 and B2..B4. */
  987. FMADD f0, A5, B6, f0
  988. LFD A3, 10 * SIZE(AO)
  989. FMADD f4, A5, B2, f4
  990. LFD A4, 20 * SIZE(AO)
  991. FMADD f8, A5, B3, f8
  992. nop
  993. FMADD f12, A5, B4, f12
  994. nop
  995. FMADD f1, A2, B6, f1
  996. LFD A5, 11 * SIZE(AO)
  997. FMADD f5, A2, B2, f5
  998. nop
  999. FMADD f9, A2, B3, f9
  1000. nop
  1001. FMADD f13, A2, B4, f13
  1002. nop
  1003. FMADD f2, A3, B6, f2
  1004. nop
  1005. FMADD f6, A3, B2, f6
  1006. LFD B8, 13 * SIZE(BO)
  1007. FMADD f10, A3, B3, f10
  1008. LFD B9, 14 * SIZE(BO)
  1009. FMADD f14, A3, B4, f14
  1010. LFD B10,15 * SIZE(BO)
  1011. FMADD f3, A5, B6, f3
  1012. LFD A2, 13 * SIZE(AO)
  1013. FMADD f7, A5, B2, f7
  1014. LFD B6, 24 * SIZE(BO)
  1015. FMADD f11, A5, B3, f11
  1016. nop
  1017. FMADD f15, A5, B4, f15
  1018. nop
  /* Group 4: A values at offsets 12..15, B7 and B8..B10. */
  1019. FMADD f0, A6, B7, f0
  1020. LFD A3, 14 * SIZE(AO)
  1021. FMADD f4, A6, B8, f4
  1022. LFD A5, 24 * SIZE(AO)
  1023. FMADD f8, A6, B9, f8
  1024. nop
  1025. FMADD f12, A6, B10, f12
  1026. nop
  1027. FMADD f1, A2, B7, f1
  1028. LFD A6, 15 * SIZE(AO)
  1029. FMADD f5, A2, B8, f5
  1030. nop
  1031. FMADD f9, A2, B9, f9
  1032. nop
  1033. FMADD f13, A2, B10, f13
  1034. nop
  1035. FMADD f2, A3, B7, f2
  /* AO is advanced here; the A2 load below uses the new base. */
  1036. addi AO, AO, 16 * SIZE
  1037. FMADD f6, A3, B8, f6
  1038. LFD B2, 17 * SIZE(BO)
  1039. FMADD f10, A3, B9, f10
  1040. LFD B3, 18 * SIZE(BO)
  1041. FMADD f14, A3, B10, f14
  1042. LFD B4, 19 * SIZE(BO)
  1043. FMADD f3, A6, B7, f3
  1044. LFD A2, 1 * SIZE(AO)
  1045. FMADD f7, A6, B8, f7
  1046. LFD B7, 28 * SIZE(BO)
  1047. FMADD f11, A6, B9, f11
  1048. addi BO, BO, 16 * SIZE
  1049. FMADD f15, A6, B10, f15
  1050. bdnz .L12
  /* .L15: CTR = low two bits of the loop count (KK for LT/RN, TEMP
     otherwise); skip the remainder loop .L16 when that is zero. */
  1051. .align 4
  1052. .L15:
  1053. #if defined(LT) || defined(RN)
  1054. andi. r0, KK, 3
  1055. #else
  1056. andi. r0, TEMP, 3
  1057. #endif
  1058. mtspr CTR, r0
  1059. ble+ .L18
  /* .L16: remainder loop of the 4x4 tile.  Each pass issues 16 FMADDs
     (A1..A4 times B1..B4) and steps AO and BO by 4 * SIZE via LFDU. */
  1060. .align 4
  1061. .L16:
  1062. FMADD f0, A1, B1, f0
  1063. LFD A3, 2 * SIZE(AO)
  1064. FMADD f4, A1, B2, f4
  1065. FMADD f8, A1, B3, f8
  1066. FMADD f12, A1, B4, f12
  1067. LFD A4, 3 * SIZE(AO)
  1068. FMADD f1, A2, B1, f1
  1069. FMADD f5, A2, B2, f5
  1070. FMADD f9, A2, B3, f9
  1071. FMADD f13, A2, B4, f13
  /* LFDU updates AO; A1/A2 now hold the operands for the next pass. */
  1072. LFDU A1, 4 * SIZE(AO)
  1073. FMADD f2, A3, B1, f2
  1074. FMADD f6, A3, B2, f6
  1075. FMADD f10, A3, B3, f10
  1076. FMADD f14, A3, B4, f14
  1077. LFD A2, 1 * SIZE(AO)
  1078. FMADD f3, A4, B1, f3
  1079. LFDU B1, 4 * SIZE(BO)
  1080. FMADD f7, A4, B2, f7
  1081. LFD B2, 1 * SIZE(BO)
  1082. FMADD f11, A4, B3, f11
  1083. LFD B3, 2 * SIZE(BO)
  1084. FMADD f15, A4, B4, f15
  1085. LFD B4, 3 * SIZE(BO)
  1086. bdnz .L16
  /*
   * .L18: finish the 4x4 tile held in f0..f15 (f0..f3 -> CO1,
   * f4..f7 -> CO2, f8..f11 -> CO3, f12..f15 -> CO4).  Steps: reposition
   * AO/BO (LN/RT), subtract the accumulated products from the packed
   * values, run the LN/LT/RN/RT triangular solve, write the result to the
   * packed buffer and to C, clear the accumulators, advance pointers and
   * KK, then loop back to .L11 while I > 0.
   * FSUB/FMUL/FNMSUB are macros defined outside this chunk; presumably
   * FSUB d,a,b is d = a - b and FNMSUB d,a,c,b is d = b - a*c.  Diagonal
   * coefficients are applied with FMUL, so they are presumably stored
   * pre-inverted -- confirm against the packing routine.
   */
  1087. .align 4
  1088. .L18:
  1089. #if defined(LN) || defined(RT)
  /* Rewind both streams to entry KK - 4. */
  1090. subi r0, KK, 4
  1091. slwi r0, r0, 2 + BASE_SHIFT
  1092. add AO, AORIG, r0
  1093. add BO, B, r0
  1094. #endif
  1095. #if defined(LN) || defined(LT)
  /* Right-hand side from BO, four column values per row. */
  1096. LFD f16, 0 * SIZE(BO)
  1097. LFD f17, 1 * SIZE(BO)
  1098. LFD f18, 2 * SIZE(BO)
  1099. LFD f19, 3 * SIZE(BO)
  1100. LFD f20, 4 * SIZE(BO)
  1101. LFD f21, 5 * SIZE(BO)
  1102. LFD f22, 6 * SIZE(BO)
  1103. LFD f23, 7 * SIZE(BO)
  1104. LFD f24, 8 * SIZE(BO)
  1105. LFD f25, 9 * SIZE(BO)
  1106. LFD f26, 10 * SIZE(BO)
  1107. LFD f27, 11 * SIZE(BO)
  1108. LFD f28, 12 * SIZE(BO)
  1109. LFD f29, 13 * SIZE(BO)
  1110. LFD f30, 14 * SIZE(BO)
  1111. LFD f31, 15 * SIZE(BO)
  1112. FSUB f0, f16, f0
  1113. FSUB f4, f17, f4
  1114. FSUB f8, f18, f8
  1115. FSUB f12, f19, f12
  1116. FSUB f1, f20, f1
  1117. FSUB f5, f21, f5
  1118. FSUB f9, f22, f9
  1119. FSUB f13, f23, f13
  1120. FSUB f2, f24, f2
  1121. FSUB f6, f25, f6
  1122. FSUB f10, f26, f10
  1123. FSUB f14, f27, f14
  1124. FSUB f3, f28, f3
  1125. FSUB f7, f29, f7
  1126. FSUB f11, f30, f11
  1127. FSUB f15, f31, f15
  1128. #else
  /* Right-hand side from AO, four row values per column. */
  1129. LFD f16, 0 * SIZE(AO)
  1130. LFD f17, 1 * SIZE(AO)
  1131. LFD f18, 2 * SIZE(AO)
  1132. LFD f19, 3 * SIZE(AO)
  1133. LFD f20, 4 * SIZE(AO)
  1134. LFD f21, 5 * SIZE(AO)
  1135. LFD f22, 6 * SIZE(AO)
  1136. LFD f23, 7 * SIZE(AO)
  1137. LFD f24, 8 * SIZE(AO)
  1138. LFD f25, 9 * SIZE(AO)
  1139. LFD f26, 10 * SIZE(AO)
  1140. LFD f27, 11 * SIZE(AO)
  1141. LFD f28, 12 * SIZE(AO)
  1142. LFD f29, 13 * SIZE(AO)
  1143. LFD f30, 14 * SIZE(AO)
  1144. LFD f31, 15 * SIZE(AO)
  1145. FSUB f0, f16, f0
  1146. FSUB f1, f17, f1
  1147. FSUB f2, f18, f2
  1148. FSUB f3, f19, f3
  1149. FSUB f4, f20, f4
  1150. FSUB f5, f21, f5
  1151. FSUB f6, f22, f6
  1152. FSUB f7, f23, f7
  1153. FSUB f8, f24, f8
  1154. FSUB f9, f25, f9
  1155. FSUB f10, f26, f10
  1156. FSUB f11, f27, f11
  1157. FSUB f12, f28, f12
  1158. FSUB f13, f29, f13
  1159. FSUB f14, f30, f14
  1160. FSUB f15, f31, f15
  1161. #endif
  1162. #ifdef LN
  /* LN: rows are solved from the last (f3,f7,f11,f15) to the first,
     using AO[15..12], AO[10..8], AO[5..4], AO[0]. */
  1163. LFD f16, 15 * SIZE(AO)
  1164. LFD f17, 14 * SIZE(AO)
  1165. LFD f18, 13 * SIZE(AO)
  1166. LFD f19, 12 * SIZE(AO)
  1167. FMUL f3, f16, f3
  1168. FMUL f7, f16, f7
  1169. FMUL f11, f16, f11
  1170. FMUL f15, f16, f15
  1171. FNMSUB f2, f17, f3, f2
  1172. FNMSUB f6, f17, f7, f6
  1173. FNMSUB f10, f17, f11, f10
  1174. FNMSUB f14, f17, f15, f14
  1175. FNMSUB f1, f18, f3, f1
  1176. FNMSUB f5, f18, f7, f5
  1177. FNMSUB f9, f18, f11, f9
  1178. FNMSUB f13, f18, f15, f13
  1179. FNMSUB f0, f19, f3, f0
  1180. FNMSUB f4, f19, f7, f4
  1181. FNMSUB f8, f19, f11, f8
  1182. FNMSUB f12, f19, f15, f12
  1183. LFD f16, 10 * SIZE(AO)
  1184. LFD f17, 9 * SIZE(AO)
  1185. LFD f18, 8 * SIZE(AO)
  1186. LFD f19, 5 * SIZE(AO)
  1187. FMUL f2, f16, f2
  1188. FMUL f6, f16, f6
  1189. FMUL f10, f16, f10
  1190. FMUL f14, f16, f14
  1191. LFD f20, 4 * SIZE(AO)
  1192. LFD f21, 0 * SIZE(AO)
  1193. FNMSUB f1, f17, f2, f1
  1194. FNMSUB f5, f17, f6, f5
  1195. FNMSUB f9, f17, f10, f9
  1196. FNMSUB f13, f17, f14, f13
  1197. FNMSUB f0, f18, f2, f0
  1198. FNMSUB f4, f18, f6, f4
  1199. FNMSUB f8, f18, f10, f8
  1200. FNMSUB f12, f18, f14, f12
  1201. FMUL f1, f19, f1
  1202. FMUL f5, f19, f5
  1203. FMUL f9, f19, f9
  1204. FMUL f13, f19, f13
  1205. FNMSUB f0, f20, f1, f0
  1206. FNMSUB f4, f20, f5, f4
  1207. FNMSUB f8, f20, f9, f8
  1208. FNMSUB f12, f20, f13, f12
  1209. FMUL f0, f21, f0
  1210. FMUL f4, f21, f4
  1211. FMUL f8, f21, f8
  1212. FMUL f12, f21, f12
  1213. #endif
  1214. #ifdef LT
  /* LT: rows are solved from the first (f0,f4,f8,f12) to the last,
     using AO[0..3], AO[5..7], AO[10..11], AO[15]. */
  1215. LFD f16, 0 * SIZE(AO)
  1216. LFD f17, 1 * SIZE(AO)
  1217. LFD f18, 2 * SIZE(AO)
  1218. LFD f19, 3 * SIZE(AO)
  1219. FMUL f0, f16, f0
  1220. FMUL f4, f16, f4
  1221. FMUL f8, f16, f8
  1222. FMUL f12, f16, f12
  1223. FNMSUB f1, f17, f0, f1
  1224. FNMSUB f5, f17, f4, f5
  1225. FNMSUB f9, f17, f8, f9
  1226. FNMSUB f13, f17, f12, f13
  1227. FNMSUB f2, f18, f0, f2
  1228. FNMSUB f6, f18, f4, f6
  1229. FNMSUB f10, f18, f8, f10
  1230. FNMSUB f14, f18, f12, f14
  1231. FNMSUB f3, f19, f0, f3
  1232. FNMSUB f7, f19, f4, f7
  1233. FNMSUB f11, f19, f8, f11
  1234. FNMSUB f15, f19, f12, f15
  1235. LFD f16, 5 * SIZE(AO)
  1236. LFD f17, 6 * SIZE(AO)
  1237. LFD f18, 7 * SIZE(AO)
  1238. LFD f19, 10 * SIZE(AO)
  1239. FMUL f1, f16, f1
  1240. FMUL f5, f16, f5
  1241. FMUL f9, f16, f9
  1242. FMUL f13, f16, f13
  1243. LFD f20, 11 * SIZE(AO)
  1244. LFD f21, 15 * SIZE(AO)
  1245. FNMSUB f2, f17, f1, f2
  1246. FNMSUB f6, f17, f5, f6
  1247. FNMSUB f10, f17, f9, f10
  1248. FNMSUB f14, f17, f13, f14
  1249. FNMSUB f3, f18, f1, f3
  1250. FNMSUB f7, f18, f5, f7
  1251. FNMSUB f11, f18, f9, f11
  1252. FNMSUB f15, f18, f13, f15
  1253. FMUL f2, f19, f2
  1254. FMUL f6, f19, f6
  1255. FMUL f10, f19, f10
  1256. FMUL f14, f19, f14
  1257. FNMSUB f3, f20, f2, f3
  1258. FNMSUB f7, f20, f6, f7
  1259. FNMSUB f11, f20, f10, f11
  1260. FNMSUB f15, f20, f14, f15
  1261. FMUL f3, f21, f3
  1262. FMUL f7, f21, f7
  1263. FMUL f11, f21, f11
  1264. FMUL f15, f21, f15
  1265. #endif
  1266. #ifdef RN
  /* RN: columns are solved from the first (f0..f3) to the last
     (f12..f15), using BO[0..3], BO[5..7], BO[10..11], BO[15]. */
  1267. LFD f16, 0 * SIZE(BO)
  1268. LFD f17, 1 * SIZE(BO)
  1269. LFD f18, 2 * SIZE(BO)
  1270. LFD f19, 3 * SIZE(BO)
  1271. FMUL f0, f16, f0
  1272. FMUL f1, f16, f1
  1273. FMUL f2, f16, f2
  1274. FMUL f3, f16, f3
  1275. FNMSUB f4, f17, f0, f4
  1276. FNMSUB f5, f17, f1, f5
  1277. FNMSUB f6, f17, f2, f6
  1278. FNMSUB f7, f17, f3, f7
  1279. FNMSUB f8, f18, f0, f8
  1280. FNMSUB f9, f18, f1, f9
  1281. FNMSUB f10, f18, f2, f10
  1282. FNMSUB f11, f18, f3, f11
  1283. FNMSUB f12, f19, f0, f12
  1284. FNMSUB f13, f19, f1, f13
  1285. FNMSUB f14, f19, f2, f14
  1286. FNMSUB f15, f19, f3, f15
  1287. LFD f16, 5 * SIZE(BO)
  1288. LFD f17, 6 * SIZE(BO)
  1289. LFD f18, 7 * SIZE(BO)
  1290. LFD f19, 10 * SIZE(BO)
  1291. FMUL f4, f16, f4
  1292. FMUL f5, f16, f5
  1293. FMUL f6, f16, f6
  1294. FMUL f7, f16, f7
  1295. LFD f20, 11 * SIZE(BO)
  1296. LFD f21, 15 * SIZE(BO)
  1297. FNMSUB f8, f17, f4, f8
  1298. FNMSUB f9, f17, f5, f9
  1299. FNMSUB f10, f17, f6, f10
  1300. FNMSUB f11, f17, f7, f11
  1301. FNMSUB f12, f18, f4, f12
  1302. FNMSUB f13, f18, f5, f13
  1303. FNMSUB f14, f18, f6, f14
  1304. FNMSUB f15, f18, f7, f15
  1305. FMUL f8, f19, f8
  1306. FMUL f9, f19, f9
  1307. FMUL f10, f19, f10
  1308. FMUL f11, f19, f11
  1309. FNMSUB f12, f20, f8, f12
  1310. FNMSUB f13, f20, f9, f13
  1311. FNMSUB f14, f20, f10, f14
  1312. FNMSUB f15, f20, f11, f15
  1313. FMUL f12, f21, f12
  1314. FMUL f13, f21, f13
  1315. FMUL f14, f21, f14
  1316. FMUL f15, f21, f15
  1317. #endif
  1318. #ifdef RT
  /* RT: columns are solved from the last (f12..f15) to the first,
     using BO[15..12], BO[10..8], BO[5..4], BO[0]. */
  1319. LFD f16, 15 * SIZE(BO)
  1320. LFD f17, 14 * SIZE(BO)
  1321. LFD f18, 13 * SIZE(BO)
  1322. LFD f19, 12 * SIZE(BO)
  1323. FMUL f12, f16, f12
  1324. FMUL f13, f16, f13
  1325. FMUL f14, f16, f14
  1326. FMUL f15, f16, f15
  1327. FNMSUB f8, f17, f12, f8
  1328. FNMSUB f9, f17, f13, f9
  1329. FNMSUB f10, f17, f14, f10
  1330. FNMSUB f11, f17, f15, f11
  1331. FNMSUB f4, f18, f12, f4
  1332. FNMSUB f5, f18, f13, f5
  1333. FNMSUB f6, f18, f14, f6
  1334. FNMSUB f7, f18, f15, f7
  1335. FNMSUB f0, f19, f12, f0
  1336. FNMSUB f1, f19, f13, f1
  1337. FNMSUB f2, f19, f14, f2
  1338. FNMSUB f3, f19, f15, f3
  1339. LFD f16, 10 * SIZE(BO)
  1340. LFD f17, 9 * SIZE(BO)
  1341. LFD f18, 8 * SIZE(BO)
  1342. LFD f19, 5 * SIZE(BO)
  1343. FMUL f8, f16, f8
  1344. FMUL f9, f16, f9
  1345. FMUL f10, f16, f10
  1346. FMUL f11, f16, f11
  1347. LFD f20, 4 * SIZE(BO)
  1348. LFD f21, 0 * SIZE(BO)
  1349. FNMSUB f4, f17, f8, f4
  1350. FNMSUB f5, f17, f9, f5
  1351. FNMSUB f6, f17, f10, f6
  1352. FNMSUB f7, f17, f11, f7
  1353. FNMSUB f0, f18, f8, f0
  1354. FNMSUB f1, f18, f9, f1
  1355. FNMSUB f2, f18, f10, f2
  1356. FNMSUB f3, f18, f11, f3
  1357. FMUL f4, f19, f4
  1358. FMUL f5, f19, f5
  1359. FMUL f6, f19, f6
  1360. FMUL f7, f19, f7
  1361. FNMSUB f0, f20, f4, f0
  1362. FNMSUB f1, f20, f5, f1
  1363. FNMSUB f2, f20, f6, f2
  1364. FNMSUB f3, f20, f7, f3
  1365. FMUL f0, f21, f0
  1366. FMUL f1, f21, f1
  1367. FMUL f2, f21, f2
  1368. FMUL f3, f21, f3
  1369. #endif
  1370. #ifdef LN
  /* LN moves the C pointers back by four elements before storing. */
  1371. subi CO1, CO1, 4 * SIZE
  1372. subi CO2, CO2, 4 * SIZE
  1373. subi CO3, CO3, 4 * SIZE
  1374. subi CO4, CO4, 4 * SIZE
  1375. #endif
  /* Write the solved values back to the buffer they were read from. */
  1376. #if defined(LN) || defined(LT)
  1377. STFD f0, 0 * SIZE(BO)
  1378. STFD f4, 1 * SIZE(BO)
  1379. STFD f8, 2 * SIZE(BO)
  1380. STFD f12, 3 * SIZE(BO)
  1381. STFD f1, 4 * SIZE(BO)
  1382. STFD f5, 5 * SIZE(BO)
  1383. STFD f9, 6 * SIZE(BO)
  1384. STFD f13, 7 * SIZE(BO)
  1385. STFD f2, 8 * SIZE(BO)
  1386. STFD f6, 9 * SIZE(BO)
  1387. STFD f10, 10 * SIZE(BO)
  1388. STFD f14, 11 * SIZE(BO)
  1389. STFD f3, 12 * SIZE(BO)
  1390. STFD f7, 13 * SIZE(BO)
  1391. STFD f11, 14 * SIZE(BO)
  1392. STFD f15, 15 * SIZE(BO)
  1393. #else
  1394. STFD f0, 0 * SIZE(AO)
  1395. STFD f1, 1 * SIZE(AO)
  1396. STFD f2, 2 * SIZE(AO)
  1397. STFD f3, 3 * SIZE(AO)
  1398. STFD f4, 4 * SIZE(AO)
  1399. STFD f5, 5 * SIZE(AO)
  1400. STFD f6, 6 * SIZE(AO)
  1401. STFD f7, 7 * SIZE(AO)
  1402. STFD f8, 8 * SIZE(AO)
  1403. STFD f9, 9 * SIZE(AO)
  1404. STFD f10, 10 * SIZE(AO)
  1405. STFD f11, 11 * SIZE(AO)
  1406. STFD f12, 12 * SIZE(AO)
  1407. STFD f13, 13 * SIZE(AO)
  1408. STFD f14, 14 * SIZE(AO)
  1409. STFD f15, 15 * SIZE(AO)
  1410. #endif
  /* Store the tile to the four C column pointers. */
  1411. STFD f0, 0 * SIZE(CO1)
  1412. STFD f1, 1 * SIZE(CO1)
  1413. STFD f2, 2 * SIZE(CO1)
  1414. STFD f3, 3 * SIZE(CO1)
  1415. STFD f4, 0 * SIZE(CO2)
  1416. STFD f5, 1 * SIZE(CO2)
  1417. STFD f6, 2 * SIZE(CO2)
  1418. STFD f7, 3 * SIZE(CO2)
  1419. STFD f8, 0 * SIZE(CO3)
  1420. STFD f9, 1 * SIZE(CO3)
  1421. STFD f10, 2 * SIZE(CO3)
  1422. STFD f11, 3 * SIZE(CO3)
  1423. STFD f12, 0 * SIZE(CO4)
  1424. STFD f13, 1 * SIZE(CO4)
  1425. STFD f14, 2 * SIZE(CO4)
  1426. STFD f15, 3 * SIZE(CO4)
  /* Reload FZERO into f0 and copy it to f1..f15 for the next tile. */
  1427. lfs f0, FZERO
  1428. fmr f1, f0
  1429. fmr f2, f0
  1430. fmr f3, f0
  1431. fmr f4, f0
  1432. fmr f5, f0
  1433. fmr f6, f0
  1434. fmr f7, f0
  1435. fmr f8, f0
  1436. fmr f9, f0
  1437. fmr f10, f0
  1438. fmr f11, f0
  1439. fmr f12, f0
  1440. fmr f13, f0
  1441. fmr f14, f0
  1442. fmr f15, f0
  1443. #ifndef LN
  1444. addi CO1, CO1, 4 * SIZE
  1445. addi CO2, CO2, 4 * SIZE
  1446. addi CO3, CO3, 4 * SIZE
  1447. addi CO4, CO4, 4 * SIZE
  1448. #endif
  1449. #ifdef RT
  1450. slwi r0, K, 2 + BASE_SHIFT
  1451. add AORIG, AORIG, r0
  1452. #endif
  1453. #if defined(LT) || defined(RN)
  /* Skip the remaining K - KK entries of both packed buffers. */
  1454. sub TEMP, K, KK
  1455. slwi TEMP, TEMP, 2 + BASE_SHIFT
  1456. add AO, AO, TEMP
  1457. add BO, BO, TEMP
  1458. #endif
  1459. #ifdef LT
  1460. addi KK, KK, 4
  1461. #endif
  1462. #ifdef LN
  1463. subi KK, KK, 4
  1464. #endif
  /* Next 4-row tile while I > 0. */
  1465. addic. I, I, -1
  1466. bgt+ .L11
  /* .L39: end of one 4-column panel.  Advances B (LN: by
     K << (2 + BASE_SHIFT) bytes; LT/RN: to BO), steps KK by 4 for RN/RT,
     reloads FZERO into f0 and loops to .L10 while J > 0.  .L10 lies
     outside this chunk. */
  1467. .align 4
  1468. .L39:
  1469. #ifdef LN
  1470. slwi r0, K, 2 + BASE_SHIFT
  1471. add B, B, r0
  1472. #endif
  1473. #if defined(LT) || defined(RN)
  1474. mr B, BO
  1475. #endif
  1476. #ifdef RN
  1477. addi KK, KK, 4
  1478. #endif
  1479. #ifdef RT
  1480. subi KK, KK, 4
  1481. #endif
  1482. addic. J, J, -1
  1483. lfs f0, FZERO
  1484. bgt .L10
  /* .L40: entry for the 2-column panel (taken when N & 2 is set,
     otherwise branch to .L70).  Sets CO1/CO2, initialises KK for LN/LT,
     clears f1..f7 from f0, and resets the A pointer. */
  1485. .align 4
  1486. .L40:
  1487. andi. J, N, 2
  1488. ble .L70
  1489. #ifdef RT
  /* RT: step B back by K << (1 + BASE_SHIFT) bytes and C back by
     LDC << 1. */
  1490. slwi r0, K, 1 + BASE_SHIFT
  1491. sub B, B, r0
  1492. slwi r0, LDC, 1
  1493. sub C, C, r0
  1494. #endif
  1495. mr CO1, C
  1496. add CO2, C, LDC
  1497. #ifdef LN
  1498. add KK, M, OFFSET
  1499. #endif
  1500. #ifdef LT
  1501. mr KK, OFFSET
  1502. #endif
  /* f0 is copied into f1..f7; the paths that reach this point in this
     chunk reload FZERO into f0 beforehand (see .L39). */
  1503. fmr f1, f0
  1504. fmr f2, f0
  1505. fmr f3, f0
  1506. fmr f4, f0
  1507. fmr f5, f0
  1508. fmr f6, f0
  1509. fmr f7, f0
  1510. #if defined(LN) || defined(RT)
  1511. mr AORIG, A
  1512. #else
  1513. mr AO, A
  1514. #endif
  1515. #ifndef RT
  /* Non-RT: C moves forward past the two columns just claimed. */
  1516. add C, CO2, LDC
  1517. #endif
  /* .L60: 1-row x 2-column tile, taken when M & 1 is set (otherwise
     branch to .L50).  Positions AO/BO, preloads f16..f19 from A and
     f20..f27 from B, and sets CTR for the unrolled loop .L62.  The final
     "ble .L65" tests the condition from the preceding "srawi.". */
  1518. .L60:
  1519. andi. I, M, 1
  1520. ble .L50
  1521. #if defined(LT) || defined(RN)
  1522. LFD f16, 0 * SIZE(AO)
  1523. LFD f17, 1 * SIZE(AO)
  1524. LFD f18, 2 * SIZE(AO)
  1525. LFD f19, 3 * SIZE(AO)
  1526. LFD f20, 0 * SIZE(B)
  1527. LFD f21, 1 * SIZE(B)
  1528. LFD f22, 2 * SIZE(B)
  1529. LFD f23, 3 * SIZE(B)
  1530. LFD f24, 4 * SIZE(B)
  1531. LFD f25, 5 * SIZE(B)
  1532. LFD f26, 6 * SIZE(B)
  1533. LFD f27, 7 * SIZE(B)
  1534. srawi. r0, KK, 2
  1535. mtspr CTR, r0
  1536. mr BO, B
  1537. #else
  1538. #ifdef LN
  /* LN: move AORIG back by K << BASE_SHIFT bytes. */
  1539. slwi r0, K, BASE_SHIFT
  1540. sub AORIG, AORIG, r0
  1541. #endif
  /* AO starts KK entries in with a stride of one value; BO with a
     stride of two values.  TEMP = K - KK is the loop length. */
  1542. slwi r0, KK, 0 + BASE_SHIFT
  1543. slwi TEMP, KK, 1 + BASE_SHIFT
  1544. add AO, AORIG, r0
  1545. add BO, B, TEMP
  1546. sub TEMP, K, KK
  1547. LFD f16, 0 * SIZE(AO)
  1548. LFD f17, 1 * SIZE(AO)
  1549. LFD f18, 2 * SIZE(AO)
  1550. LFD f19, 3 * SIZE(AO)
  1551. LFD f20, 0 * SIZE(BO)
  1552. LFD f21, 1 * SIZE(BO)
  1553. LFD f22, 2 * SIZE(BO)
  1554. LFD f23, 3 * SIZE(BO)
  1555. LFD f24, 4 * SIZE(BO)
  1556. LFD f25, 5 * SIZE(BO)
  1557. LFD f26, 6 * SIZE(BO)
  1558. LFD f27, 7 * SIZE(BO)
  1559. srawi. r0, TEMP, 2
  1560. mtspr CTR, r0
  1561. #endif
  1562. ble .L65
  /* .L62: unrolled loop of the 1x2 tile.  Each pass consumes four A
     values (f16..f19) and eight B values (f20..f27), accumulating into
     f0/f1 and, alternately, f2/f3; AO advances by 4 * SIZE and BO by
     8 * SIZE through the LFDU instructions.  f2/f3 are folded into
     f0/f1 at .L68. */
  1563. .align 5
  1564. .L62:
  1565. FMADD f0, f16, f20, f0
  1566. LFDU f20, 8 * SIZE(BO)
  1567. FMADD f1, f16, f21, f1
  1568. LFDU f16, 4 * SIZE(AO)
  1569. LFD f21, 1 * SIZE(BO)
  1570. FMADD f2, f17, f22, f2
  1571. LFD f22, 2 * SIZE(BO)
  1572. FMADD f3, f17, f23, f3
  1573. LFD f17, 1 * SIZE(AO)
  1574. LFD f23, 3 * SIZE(BO)
  1575. FMADD f0, f18, f24, f0
  1576. LFD f24, 4 * SIZE(BO)
  1577. FMADD f1, f18, f25, f1
  1578. LFD f18, 2 * SIZE(AO)
  1579. LFD f25, 5 * SIZE(BO)
  1580. FMADD f2, f19, f26, f2
  1581. LFD f26, 6 * SIZE(BO)
  1582. FMADD f3, f19, f27, f3
  1583. LFD f19, 3 * SIZE(AO)
  1584. LFD f27, 7 * SIZE(BO)
  1585. bdnz .L62
  /* .L65: CTR = low two bits of the loop count (KK for LT/RN, TEMP
     otherwise); skip the remainder loop .L66 when that is zero. */
  1586. .align 4
  1587. .L65:
  1588. #if defined(LT) || defined(RN)
  1589. andi. r0, KK, 3
  1590. #else
  1591. andi. r0, TEMP, 3
  1592. #endif
  1593. mtspr CTR, r0
  1594. ble+ .L68
  /* .L66: remainder loop of the 1x2 tile: one A value times two B
     values per pass; AO advances by 1 * SIZE and BO by 2 * SIZE. */
  1595. .align 4
  1596. .L66:
  1597. FMADD f0, f16, f20, f0
  1598. LFDU f20, 2 * SIZE(BO)
  1599. FMADD f1, f16, f21, f1
  1600. LFDU f16, 1 * SIZE(AO)
  1601. LFD f21, 1 * SIZE(BO)
  1602. bdnz .L66
  /*
   * .L68: finish the 1-row x 2-column tile.  f2/f3 (second accumulator
   * pair used by .L62) are folded into f0/f1, the packed right-hand side
   * is loaded and the accumulated products subtracted, the LN/LT/RN/RT
   * solve is applied, and the result is written to the packed buffer and
   * to CO1/CO2.  Afterwards the accumulators are cleared and AO/BO/KK are
   * advanced for the next tile.
   * FADD/FSUB/FMUL/FNMSUB are macros defined outside this chunk;
   * presumably FSUB d,a,b is d = a - b and FNMSUB d,a,c,b is d = b - a*c.
   */
  1603. .align 4
  1604. .L68:
  1605. FADD f0, f2, f0
  1606. FADD f1, f3, f1
  1607. #if defined(LN) || defined(RT)
  /* r0 = KK - 1 (LN) or KK - 2 (RT); AO/BO are rebuilt from AORIG/B. */
  1608. #ifdef LN
  1609. subi r0, KK, 1
  1610. #else
  1611. subi r0, KK, 2
  1612. #endif
  1613. slwi TEMP, r0, 0 + BASE_SHIFT
  1614. slwi r0, r0, 1 + BASE_SHIFT
  1615. add AO, AORIG, TEMP
  1616. add BO, B, r0
  1617. #endif
  1618. #if defined(LN) || defined(LT)
  1619. LFD f16, 0 * SIZE(BO)
  1620. LFD f17, 1 * SIZE(BO)
  1621. FSUB f0, f16, f0
  1622. FSUB f1, f17, f1
  1623. #else
  1624. LFD f16, 0 * SIZE(AO)
  1625. LFD f20, 1 * SIZE(AO)
  1626. FSUB f0, f16, f0
  1627. FSUB f1, f20, f1
  1628. #endif
  1629. #ifdef LN
  /* LN/LT: a single coefficient AO[0] scales both columns. */
  1630. LFD f21, 0 * SIZE(AO)
  1631. FMUL f0, f21, f0
  1632. FMUL f1, f21, f1
  1633. #endif
  1634. #ifdef LT
  1635. LFD f16, 0 * SIZE(AO)
  1636. FMUL f0, f16, f0
  1637. FMUL f1, f16, f1
  1638. #endif
  1639. #ifdef RN
  /* RN: first column with BO[0], eliminate with BO[1], scale by BO[3]. */
  1640. LFD f16, 0 * SIZE(BO)
  1641. LFD f17, 1 * SIZE(BO)
  1642. LFD f18, 3 * SIZE(BO)
  1643. FMUL f0, f16, f0
  1644. FNMSUB f1, f17, f0, f1
  1645. FMUL f1, f18, f1
  1646. #endif
  1647. #ifdef RT
  /* RT: second column with BO[3], eliminate with BO[2], scale by BO[0]. */
  1648. LFD f19, 3 * SIZE(BO)
  1649. LFD f20, 2 * SIZE(BO)
  1650. LFD f21, 0 * SIZE(BO)
  1651. FMUL f1, f19, f1
  1652. FNMSUB f0, f20, f1, f0
  1653. FMUL f0, f21, f0
  1654. #endif
  1655. #ifdef LN
  1656. subi CO1, CO1, 1 * SIZE
  1657. subi CO2, CO2, 1 * SIZE
  1658. #endif
  1659. #if defined(LN) || defined(LT)
  1660. STFD f0, 0 * SIZE(BO)
  1661. STFD f1, 1 * SIZE(BO)
  1662. #else
  1663. STFD f0, 0 * SIZE(AO)
  1664. STFD f1, 1 * SIZE(AO)
  1665. #endif
  1666. STFD f0, 0 * SIZE(CO1)
  1667. STFD f1, 0 * SIZE(CO2)
  /* Clear the accumulators for the next tile. */
  1668. lfs f0, FZERO
  1669. fmr f1, f0
  /* Fix: f2/f3 are written by the unrolled loop .L62 and were only
     folded into f0/f1 above, never cleared.  The 2x2 tile at .L50 runs
     next and accumulates into f2/f3 without clearing them first, so
     stale partial sums would leak into its result whenever .L62
     executed at least once.  (The two lines below are new and carry no
     viewer line-number prefix.) */
  fmr f2, f0
  fmr f3, f0
  1670. fmr f4, f0
  1671. fmr f5, f0
  1672. #ifndef LN
  1673. addi CO1, CO1, 1 * SIZE
  1674. addi CO2, CO2, 1 * SIZE
  1675. #endif
  1676. #ifdef RT
  1677. slwi r0, K, 0 + BASE_SHIFT
  1678. add AORIG, AORIG, r0
  1679. #endif
  1680. #if defined(LT) || defined(RN)
  /* Skip the remaining K - KK entries of both packed buffers. */
  1681. sub TEMP, K, KK
  1682. slwi r0, TEMP, 0 + BASE_SHIFT
  1683. slwi TEMP, TEMP, 1 + BASE_SHIFT
  1684. add AO, AO, r0
  1685. add BO, BO, TEMP
  1686. #endif
  1687. #ifdef LN
  1688. subi KK, KK, 1
  1689. #endif
  1690. #ifdef LT
  1691. addi KK, KK, 1
  1692. #endif
  /* .L50: 2-row x 2-column tile, taken when M & 2 is set (otherwise
     branch to .L41).  Positions AO/BO, preloads f16..f19 from A and
     f20..f27 from B, and sets CTR for the unrolled loop .L52.  The
     accumulators f0..f7 are not cleared here; the code relies on them
     already being zero on entry. */
  1693. .align 4
  1694. .L50:
  1695. andi. I, M, 2
  1696. ble .L41
  1697. #if defined(LT) || defined(RN)
  1698. LFD f16, 0 * SIZE(AO)
  1699. LFD f17, 1 * SIZE(AO)
  1700. LFD f18, 2 * SIZE(AO)
  1701. LFD f19, 3 * SIZE(AO)
  1702. LFD f20, 0 * SIZE(B)
  1703. LFD f21, 1 * SIZE(B)
  1704. LFD f22, 2 * SIZE(B)
  1705. LFD f23, 3 * SIZE(B)
  1706. LFD f24, 4 * SIZE(B)
  1707. LFD f25, 5 * SIZE(B)
  1708. LFD f26, 6 * SIZE(B)
  1709. LFD f27, 7 * SIZE(B)
  1710. srawi. r0, KK, 2
  1711. mtspr CTR, r0
  1712. mr BO, B
  1713. #else
  1714. #ifdef LN
  /* LN: move AORIG back by K << (1 + BASE_SHIFT) bytes. */
  1715. slwi r0, K, 1 + BASE_SHIFT
  1716. sub AORIG, AORIG, r0
  1717. #endif
  /* Both streams use a stride of two values here, so r0 and TEMP hold
     the same byte offset. */
  1718. slwi r0, KK, 1 + BASE_SHIFT
  1719. slwi TEMP, KK, 1 + BASE_SHIFT
  1720. add AO, AORIG, r0
  1721. add BO, B, TEMP
  1722. sub TEMP, K, KK
  1723. LFD f16, 0 * SIZE(AO)
  1724. LFD f17, 1 * SIZE(AO)
  1725. LFD f18, 2 * SIZE(AO)
  1726. LFD f19, 3 * SIZE(AO)
  1727. LFD f20, 0 * SIZE(BO)
  1728. LFD f21, 1 * SIZE(BO)
  1729. LFD f22, 2 * SIZE(BO)
  1730. LFD f23, 3 * SIZE(BO)
  1731. LFD f24, 4 * SIZE(BO)
  1732. LFD f25, 5 * SIZE(BO)
  1733. LFD f26, 6 * SIZE(BO)
  1734. LFD f27, 7 * SIZE(BO)
  1735. srawi. r0, TEMP, 2
  1736. mtspr CTR, r0
  1737. #endif
  1738. ble .L55
  /* .L52: unrolled loop of the 2x2 tile.  Each pass issues 16 FMADDs,
     alternating between the accumulator sets f0..f3 and f4..f7, and
     advances both AO and BO by 8 * SIZE through the LFDU instructions.
     f4..f7 are folded into f0..f3 at .L58. */
  1739. .align 5
  1740. .L52:
  1741. FMADD f0, f16, f20, f0
  1742. FMADD f1, f17, f20, f1
  /* BO is updated here; later BO offsets are relative to the new base. */
  1743. LFDU f20, 8 * SIZE(BO)
  1744. FMADD f2, f16, f21, f2
  1745. LFD f16, 4 * SIZE(AO)
  1746. FMADD f3, f17, f21, f3
  1747. LFD f17, 5 * SIZE(AO)
  1748. FMADD f4, f18, f22, f4
  1749. LFD f21, 1 * SIZE(BO)
  1750. FMADD f5, f19, f22, f5
  1751. LFD f22, 2 * SIZE(BO)
  1752. FMADD f6, f18, f23, f6
  1753. LFD f18, 6 * SIZE(AO)
  1754. FMADD f7, f19, f23, f7
  1755. LFD f19, 7 * SIZE(AO)
  1756. FMADD f0, f16, f24, f0
  1757. LFD f23, 3 * SIZE(BO)
  1758. FMADD f1, f17, f24, f1
  1759. LFD f24, 4 * SIZE(BO)
  1760. FMADD f2, f16, f25, f2
  /* AO is updated here; later AO offsets are relative to the new base. */
  1761. LFDU f16, 8 * SIZE(AO)
  1762. FMADD f3, f17, f25, f3
  1763. LFD f17, 1 * SIZE(AO)
  1764. FMADD f4, f18, f26, f4
  1765. LFD f25, 5 * SIZE(BO)
  1766. FMADD f5, f19, f26, f5
  1767. LFD f26, 6 * SIZE(BO)
  1768. FMADD f6, f18, f27, f6
  1769. LFD f18, 2 * SIZE(AO)
  1770. FMADD f7, f19, f27, f7
  1771. LFD f19, 3 * SIZE(AO)
  1772. LFD f27, 7 * SIZE(BO)
  1773. bdnz .L52
  /* .L55: CTR = low two bits of the loop count (KK for LT/RN, TEMP
     otherwise); skip the remainder loop .L56 when that is zero. */
  1774. .align 4
  1775. .L55:
  1776. #if defined(LT) || defined(RN)
  1777. andi. r0, KK, 3
  1778. #else
  1779. andi. r0, TEMP, 3
  1780. #endif
  1781. mtspr CTR, r0
  1782. ble+ .L58
  /* .L56: remainder loop of the 2x2 tile: two A values times two B
     values per pass into f0..f3; AO and BO each advance by 2 * SIZE. */
  1783. .align 4
  1784. .L56:
  1785. FMADD f0, f16, f20, f0
  1786. FMADD f1, f17, f20, f1
  1787. LFDU f20, 2 * SIZE(BO)
  1788. FMADD f2, f16, f21, f2
  1789. LFDU f16, 2 * SIZE(AO)
  1790. FMADD f3, f17, f21, f3
  1791. LFD f17, 1 * SIZE(AO)
  1792. LFD f21, 1 * SIZE(BO)
  1793. bdnz .L56
  /*
   * .L58: finish the 2x2 tile.  f4..f7 are folded into f0..f3 (f0/f1 go
   * to CO1, f2/f3 to CO2), the packed right-hand side is loaded and the
   * accumulated products subtracted, the LN/LT/RN/RT solve is applied,
   * and the result is written to the packed buffer and to C.  Afterwards
   * f0..f7 are cleared and AO/BO/KK are advanced.
   * FADD/FSUB/FMUL/FNMSUB are macros defined outside this chunk;
   * presumably FSUB d,a,b is d = a - b and FNMSUB d,a,c,b is d = b - a*c.
   */
  1794. .align 4
  1795. .L58:
  1796. FADD f0, f4, f0
  1797. FADD f1, f5, f1
  1798. FADD f2, f6, f2
  1799. FADD f3, f7, f3
  1800. #if defined(LN) || defined(RT)
  /* Both branches subtract 2 because the tile is 2 wide in each
     direction. */
  1801. #ifdef LN
  1802. subi r0, KK, 2
  1803. #else
  1804. subi r0, KK, 2
  1805. #endif
  1806. slwi TEMP, r0, 1 + BASE_SHIFT
  1807. slwi r0, r0, 1 + BASE_SHIFT
  1808. add AO, AORIG, TEMP
  1809. add BO, B, r0
  1810. #endif
  1811. #if defined(LN) || defined(LT)
  /* Right-hand side from BO: two column values per row. */
  1812. LFD f16, 0 * SIZE(BO)
  1813. LFD f17, 1 * SIZE(BO)
  1814. LFD f20, 2 * SIZE(BO)
  1815. LFD f21, 3 * SIZE(BO)
  1816. FSUB f0, f16, f0
  1817. FSUB f2, f17, f2
  1818. FSUB f1, f20, f1
  1819. FSUB f3, f21, f3
  1820. #else
  /* Right-hand side from AO: two row values per column. */
  1821. LFD f16, 0 * SIZE(AO)
  1822. LFD f17, 1 * SIZE(AO)
  1823. LFD f20, 2 * SIZE(AO)
  1824. LFD f21, 3 * SIZE(AO)
  1825. FSUB f0, f16, f0
  1826. FSUB f1, f17, f1
  1827. FSUB f2, f20, f2
  1828. FSUB f3, f21, f3
  1829. #endif
  1830. #ifdef LN
  /* LN: second row with AO[3], eliminate with AO[2], scale by AO[0]. */
  1831. LFD f19, 3 * SIZE(AO)
  1832. LFD f20, 2 * SIZE(AO)
  1833. LFD f21, 0 * SIZE(AO)
  1834. FMUL f1, f19, f1
  1835. FMUL f3, f19, f3
  1836. FNMSUB f0, f20, f1, f0
  1837. FNMSUB f2, f20, f3, f2
  1838. FMUL f0, f21, f0
  1839. FMUL f2, f21, f2
  1840. #endif
  1841. #ifdef LT
  /* LT: first row with AO[0], eliminate with AO[1], scale by AO[3]. */
  1842. LFD f16, 0 * SIZE(AO)
  1843. LFD f17, 1 * SIZE(AO)
  1844. FMUL f0, f16, f0
  1845. FMUL f2, f16, f2
  1846. FNMSUB f1, f17, f0, f1
  1847. FNMSUB f3, f17, f2, f3
  1848. LFD f17, 3 * SIZE(AO)
  1849. FMUL f1, f17, f1
  1850. FMUL f3, f17, f3
  1851. #endif
  1852. #ifdef RN
  /* RN: first column with BO[0], eliminate with BO[1], scale by BO[3]. */
  1853. LFD f16, 0 * SIZE(BO)
  1854. LFD f17, 1 * SIZE(BO)
  1855. LFD f18, 3 * SIZE(BO)
  1856. FMUL f0, f16, f0
  1857. FMUL f1, f16, f1
  1858. FNMSUB f2, f17, f0, f2
  1859. FNMSUB f3, f17, f1, f3
  1860. FMUL f2, f18, f2
  1861. FMUL f3, f18, f3
  1862. #endif
  1863. #ifdef RT
  /* RT: second column with BO[3], eliminate with BO[2], scale by BO[0]. */
  1864. LFD f19, 3 * SIZE(BO)
  1865. LFD f20, 2 * SIZE(BO)
  1866. LFD f21, 0 * SIZE(BO)
  1867. FMUL f2, f19, f2
  1868. FMUL f3, f19, f3
  1869. FNMSUB f0, f20, f2, f0
  1870. FNMSUB f1, f20, f3, f1
  1871. FMUL f0, f21, f0
  1872. FMUL f1, f21, f1
  1873. #endif
  1874. #ifdef LN
  1875. subi CO1, CO1, 2 * SIZE
  1876. subi CO2, CO2, 2 * SIZE
  1877. #endif
  /* Write the solved values back to the buffer they were read from. */
  1878. #if defined(LN) || defined(LT)
  1879. STFD f0, 0 * SIZE(BO)
  1880. STFD f2, 1 * SIZE(BO)
  1881. STFD f1, 2 * SIZE(BO)
  1882. STFD f3, 3 * SIZE(BO)
  1883. #else
  1884. STFD f0, 0 * SIZE(AO)
  1885. STFD f1, 1 * SIZE(AO)
  1886. STFD f2, 2 * SIZE(AO)
  1887. STFD f3, 3 * SIZE(AO)
  1888. #endif
  1889. STFD f0, 0 * SIZE(CO1)
  1890. STFD f1, 1 * SIZE(CO1)
  1891. STFD f2, 0 * SIZE(CO2)
  1892. STFD f3, 1 * SIZE(CO2)
  /* Reload FZERO into f0 and copy it to f1..f7 for the next tile. */
  1893. lfs f0, FZERO
  1894. fmr f1, f0
  1895. fmr f2, f0
  1896. fmr f3, f0
  1897. fmr f4, f0
  1898. fmr f5, f0
  1899. fmr f6, f0
  1900. fmr f7, f0
  1901. #ifndef LN
  1902. addi CO1, CO1, 2 * SIZE
  1903. addi CO2, CO2, 2 * SIZE
  1904. #endif
  1905. #ifdef RT
  1906. slwi r0, K, 1 + BASE_SHIFT
  1907. add AORIG, AORIG, r0
  1908. #endif
  1909. #if defined(LT) || defined(RN)
  /* Skip the remaining K - KK entries of both packed buffers. */
  1910. sub TEMP, K, KK
  1911. slwi r0, TEMP, 1 + BASE_SHIFT
  1912. slwi TEMP, TEMP, 1 + BASE_SHIFT
  1913. add AO, AO, r0
  1914. add BO, BO, TEMP
  1915. #endif
  1916. #ifdef LN
  1917. subi KK, KK, 2
  1918. #endif
  1919. #ifdef LT
  1920. addi KK, KK, 2
  1921. #endif
  /* .L41: I = M >> 2 (number of 4-row tiles in the 2-column panel);
     skip to .L69 when I <= 0. */
  1922. .align 4
  1923. .L41:
  1924. srawi. I, M, 2
  1925. ble .L69
  /* .L42: set up one 4-row x 2-column tile.  Positions AO/BO, preloads
     f16..f19 from A and f20..f23 from B, and sets CTR for the unrolled
     loop .L43.  The final "ble .L45" tests the condition from the
     preceding "srawi.". */
  1926. .align 4
  1927. .L42:
  1928. #if defined(LT) || defined(RN)
  1929. LFD f16, 0 * SIZE(AO)
  1930. LFD f17, 1 * SIZE(AO)
  1931. LFD f18, 2 * SIZE(AO)
  1932. LFD f19, 3 * SIZE(AO)
  1933. LFD f20, 0 * SIZE(B)
  1934. LFD f21, 1 * SIZE(B)
  1935. LFD f22, 2 * SIZE(B)
  1936. LFD f23, 3 * SIZE(B)
  1937. srawi. r0, KK, 2
  1938. mtspr CTR, r0
  1939. mr BO, B
  1940. #else
  1941. #ifdef LN
  /* LN: move AORIG back by K << (2 + BASE_SHIFT) bytes. */
  1942. slwi r0, K, 2 + BASE_SHIFT
  1943. sub AORIG, AORIG, r0
  1944. #endif
  /* AO uses a stride of four values, BO a stride of two. */
  1945. slwi r0, KK, 2 + BASE_SHIFT
  1946. slwi TEMP, KK, 1 + BASE_SHIFT
  1947. add AO, AORIG, r0
  1948. add BO, B, TEMP
  1949. sub TEMP, K, KK
  1950. LFD f16, 0 * SIZE(AO)
  1951. LFD f17, 1 * SIZE(AO)
  1952. LFD f18, 2 * SIZE(AO)
  1953. LFD f19, 3 * SIZE(AO)
  1954. LFD f20, 0 * SIZE(BO)
  1955. LFD f21, 1 * SIZE(BO)
  1956. LFD f22, 2 * SIZE(BO)
  1957. LFD f23, 3 * SIZE(BO)
  1958. srawi. r0, TEMP, 2
  1959. mtspr CTR, r0
  1960. #endif
  1961. ble .L45
  /* 4x2 inner loop, four k-steps per iteration.  Each step adds
     (f16..f19) times one B value into f0-f3 and times the next B value into
     f4-f7.  Loads for later steps are interleaved with the multiply-adds, so
     the instruction order matters.  The two LFDU instructions advance BO by
     8 * SIZE and AO by 16 * SIZE once per iteration; bdnz counts CTR down. */
  1962. .align 5
  1963. .L43:
  /* step 1: uses f20 / f21 */
  1964. FMADD f0, f16, f20, f0
  1965. FMADD f1, f17, f20, f1
  1966. FMADD f2, f18, f20, f2
  1967. FMADD f3, f19, f20, f3
  1968. LFD f20, 4 * SIZE(BO)
  1969. FMADD f4, f16, f21, f4
  1970. LFD f16, 4 * SIZE(AO)
  1971. FMADD f5, f17, f21, f5
  1972. LFD f17, 5 * SIZE(AO)
  1973. FMADD f6, f18, f21, f6
  1974. LFD f18, 6 * SIZE(AO)
  1975. FMADD f7, f19, f21, f7
  1976. LFD f19, 7 * SIZE(AO)
  /* step 2: uses f22 / f23 */
  1977. FMADD f0, f16, f22, f0
  1978. LFD f21, 5 * SIZE(BO)
  1979. FMADD f1, f17, f22, f1
  1980. FMADD f2, f18, f22, f2
  1981. FMADD f3, f19, f22, f3
  1982. LFD f22, 6 * SIZE(BO)
  1983. FMADD f4, f16, f23, f4
  1984. LFD f16, 8 * SIZE(AO)
  1985. FMADD f5, f17, f23, f5
  1986. LFD f17, 9 * SIZE(AO)
  1987. FMADD f6, f18, f23, f6
  1988. LFD f18, 10 * SIZE(AO)
  1989. FMADD f7, f19, f23, f7
  1990. LFD f19, 11 * SIZE(AO)
  /* step 3: uses the f20 / f21 values loaded from offsets 4 and 5 */
  1991. FMADD f0, f16, f20, f0
  1992. LFD f23, 7 * SIZE(BO)
  1993. FMADD f1, f17, f20, f1
  1994. FMADD f2, f18, f20, f2
  1995. FMADD f3, f19, f20, f3
  /* BO moves forward here; the BO offsets below are relative to the new BO. */
  1996. LFDU f20, 8 * SIZE(BO)
  1997. FMADD f4, f16, f21, f4
  1998. LFD f16, 12 * SIZE(AO)
  1999. FMADD f5, f17, f21, f5
  2000. LFD f17, 13 * SIZE(AO)
  2001. FMADD f6, f18, f21, f6
  2002. LFD f18, 14 * SIZE(AO)
  2003. FMADD f7, f19, f21, f7
  2004. LFD f19, 15 * SIZE(AO)
  /* step 4: uses the f22 / f23 values loaded from offsets 6 and 7 */
  2005. FMADD f0, f16, f22, f0
  2006. LFD f21, 1 * SIZE(BO)
  2007. FMADD f1, f17, f22, f1
  2008. FMADD f2, f18, f22, f2
  2009. FMADD f3, f19, f22, f3
  2010. LFD f22, 2 * SIZE(BO)
  2011. FMADD f4, f16, f23, f4
  /* AO moves forward here; f16-f19 and f20-f23 end up preloaded for the
     next iteration (or for the remainder loop). */
  2012. LFDU f16, 16 * SIZE(AO)
  2013. FMADD f5, f17, f23, f5
  2014. LFD f17, 1 * SIZE(AO)
  2015. FMADD f6, f18, f23, f6
  2016. LFD f18, 2 * SIZE(AO)
  2017. FMADD f7, f19, f23, f7
  2018. LFD f19, 3 * SIZE(AO)
  2019. LFD f23, 3 * SIZE(BO)
  2020. bdnz .L43
  /* 4x2 remainder: the low two bits of the step count (KK for LT/RN,
     TEMP for LN/RT) give the steps left over after the unrolled loop. */
  2021. .align 4
  2022. .L45:
  2023. #if defined(LT) || defined(RN)
  2024. andi. r0, KK, 3
  2025. #else
  2026. andi. r0, TEMP, 3
  2027. #endif
  /* andi. set CR0 and mtspr does not change it; the masked value cannot be
     negative, so ble is taken exactly when no steps remain.  The "+" suffix
     is a taken-branch prediction hint. */
  2028. mtspr CTR, r0
  2029. ble+ .L48
  2030. .align 4
  /* One k-step per iteration: eight multiply-adds into f0-f7, then BO
     advances by 2 * SIZE and AO by 4 * SIZE through the LFDU loads. */
  2031. .L46:
  2032. FMADD f0, f16, f20, f0
  2033. FMADD f1, f17, f20, f1
  2034. FMADD f2, f18, f20, f2
  2035. FMADD f3, f19, f20, f3
  2036. LFDU f20, 2 * SIZE(BO)
  2037. FMADD f4, f16, f21, f4
  2038. LFDU f16, 4 * SIZE(AO)
  2039. FMADD f5, f17, f21, f5
  2040. LFD f17, 1 * SIZE(AO)
  2041. FMADD f6, f18, f21, f6
  2042. LFD f18, 2 * SIZE(AO)
  2043. FMADD f7, f19, f21, f7
  2044. LFD f19, 3 * SIZE(AO)
  2045. LFD f21, 1 * SIZE(BO)
  2046. bdnz .L46
  /* 4x2 solve and write-back.  f0-f3 and f4-f7 hold the accumulated products
     for the two columns.  The comments below assume the FSUB / FMUL / FNMSUB
     macros expand to the like-named PowerPC instructions (d = a - b,
     d = a * c, d = b - a * c) - confirm against the macro definitions. */
  2047. .align 4
  2048. .L48:
  2049. #if defined(LN) || defined(RT)
  /* LN / RT: re-point AO and BO at the block KK - 4 (LN) or KK - 2 (RT)
     steps from the panel starts. */
  2050. #ifdef LN
  2051. subi r0, KK, 4
  2052. #else
  2053. subi r0, KK, 2
  2054. #endif
  2055. slwi TEMP, r0, 2 + BASE_SHIFT
  2056. slwi r0, r0, 1 + BASE_SHIFT
  2057. add AO, AORIG, TEMP
  2058. add BO, B, r0
  2059. #endif
  2060. #if defined(LN) || defined(LT)
  /* LN / LT: the eight right-hand-side values come from BO, stored with the
     two columns interleaved (f0, f4, f1, f5, ...).  Each accumulator becomes
     loaded value minus accumulator. */
  2061. LFD f16, 0 * SIZE(BO)
  2062. LFD f17, 1 * SIZE(BO)
  2063. LFD f20, 2 * SIZE(BO)
  2064. LFD f21, 3 * SIZE(BO)
  2065. LFD f24, 4 * SIZE(BO)
  2066. LFD f25, 5 * SIZE(BO)
  2067. LFD f28, 6 * SIZE(BO)
  2068. LFD f29, 7 * SIZE(BO)
  2069. FSUB f0, f16, f0
  2070. FSUB f4, f17, f4
  2071. FSUB f1, f20, f1
  2072. FSUB f5, f21, f5
  2073. FSUB f2, f24, f2
  2074. FSUB f6, f25, f6
  2075. FSUB f3, f28, f3
  2076. FSUB f7, f29, f7
  2077. #else
  /* RN / RT: the eight values come from AO in plain f0..f7 order. */
  2078. LFD f16, 0 * SIZE(AO)
  2079. LFD f17, 1 * SIZE(AO)
  2080. LFD f18, 2 * SIZE(AO)
  2081. LFD f19, 3 * SIZE(AO)
  2082. LFD f20, 4 * SIZE(AO)
  2083. LFD f21, 5 * SIZE(AO)
  2084. LFD f22, 6 * SIZE(AO)
  2085. LFD f23, 7 * SIZE(AO)
  2086. FSUB f0, f16, f0
  2087. FSUB f1, f17, f1
  2088. FSUB f2, f18, f2
  2089. FSUB f3, f19, f3
  2090. FSUB f4, f20, f4
  2091. FSUB f5, f21, f5
  2092. FSUB f6, f22, f6
  2093. FSUB f7, f23, f7
  2094. #endif
  2095. #ifdef LN
  /* LN: substitution from the last row up to the first, using entries
     15,14,13,12 / 10,9,8 / 5,4 / 0 of the 4x4 block at AO, applied to both
     columns.  Entries 15, 10, 5 and 0 are used as multipliers, so they
     presumably hold pre-inverted diagonal values - confirm. */
  2096. LFD f16, 15 * SIZE(AO)
  2097. LFD f17, 14 * SIZE(AO)
  2098. LFD f18, 13 * SIZE(AO)
  2099. LFD f19, 12 * SIZE(AO)
  2100. FMUL f3, f16, f3
  2101. FMUL f7, f16, f7
  2102. FNMSUB f2, f17, f3, f2
  2103. FNMSUB f6, f17, f7, f6
  2104. FNMSUB f1, f18, f3, f1
  2105. FNMSUB f5, f18, f7, f5
  2106. FNMSUB f0, f19, f3, f0
  2107. FNMSUB f4, f19, f7, f4
  2108. LFD f16, 10 * SIZE(AO)
  2109. LFD f17, 9 * SIZE(AO)
  2110. LFD f18, 8 * SIZE(AO)
  2111. LFD f19, 5 * SIZE(AO)
  2112. LFD f20, 4 * SIZE(AO)
  2113. LFD f21, 0 * SIZE(AO)
  2114. FMUL f2, f16, f2
  2115. FMUL f6, f16, f6
  2116. FNMSUB f1, f17, f2, f1
  2117. FNMSUB f5, f17, f6, f5
  2118. FNMSUB f0, f18, f2, f0
  2119. FNMSUB f4, f18, f6, f4
  2120. FMUL f1, f19, f1
  2121. FMUL f5, f19, f5
  2122. FNMSUB f0, f20, f1, f0
  2123. FNMSUB f4, f20, f5, f4
  2124. FMUL f0, f21, f0
  2125. FMUL f4, f21, f4
  2126. #endif
  2127. #ifdef LT
  /* LT: substitution from the first row down to the last, using entries
     0,1,2,3 / 5,6,7 / 10,11 / 15 of the 4x4 block at AO. */
  2128. LFD f16, 0 * SIZE(AO)
  2129. LFD f17, 1 * SIZE(AO)
  2130. LFD f18, 2 * SIZE(AO)
  2131. LFD f19, 3 * SIZE(AO)
  2132. FMUL f0, f16, f0
  2133. FMUL f4, f16, f4
  2134. FNMSUB f1, f17, f0, f1
  2135. FNMSUB f5, f17, f4, f5
  2136. FNMSUB f2, f18, f0, f2
  2137. FNMSUB f6, f18, f4, f6
  2138. FNMSUB f3, f19, f0, f3
  2139. FNMSUB f7, f19, f4, f7
  2140. LFD f17, 5 * SIZE(AO)
  2141. LFD f18, 6 * SIZE(AO)
  2142. LFD f19, 7 * SIZE(AO)
  2143. FMUL f1, f17, f1
  2144. FMUL f5, f17, f5
  2145. FNMSUB f2, f18, f1, f2
  2146. FNMSUB f6, f18, f5, f6
  2147. FNMSUB f3, f19, f1, f3
  2148. FNMSUB f7, f19, f5, f7
  2149. LFD f18, 10 * SIZE(AO)
  2150. LFD f19, 11 * SIZE(AO)
  2151. FMUL f2, f18, f2
  2152. FMUL f6, f18, f6
  2153. FNMSUB f3, f19, f2, f3
  2154. FNMSUB f7, f19, f6, f7
  2155. LFD f19, 15 * SIZE(AO)
  2156. FMUL f3, f19, f3
  2157. FMUL f7, f19, f7
  2158. #endif
  2159. #ifdef RN
  /* RN: solve against the 2x2 block at BO using entries 0, 1 and 3: the
     first column (f0-f3) is scaled first, then eliminated from the second. */
  2160. LFD f16, 0 * SIZE(BO)
  2161. LFD f17, 1 * SIZE(BO)
  2162. LFD f18, 3 * SIZE(BO)
  2163. FMUL f0, f16, f0
  2164. FMUL f1, f16, f1
  2165. FMUL f2, f16, f2
  2166. FMUL f3, f16, f3
  2167. FNMSUB f4, f17, f0, f4
  2168. FNMSUB f5, f17, f1, f5
  2169. FNMSUB f6, f17, f2, f6
  2170. FNMSUB f7, f17, f3, f7
  2171. FMUL f4, f18, f4
  2172. FMUL f5, f18, f5
  2173. FMUL f6, f18, f6
  2174. FMUL f7, f18, f7
  2175. #endif
  2176. #ifdef RT
  /* RT: same 2x2 block, opposite order, using entries 3, 2 and 0: the
     second column (f4-f7) is scaled first, then eliminated from the first. */
  2177. LFD f19, 3 * SIZE(BO)
  2178. LFD f20, 2 * SIZE(BO)
  2179. LFD f21, 0 * SIZE(BO)
  2180. FMUL f4, f19, f4
  2181. FMUL f5, f19, f5
  2182. FMUL f6, f19, f6
  2183. FMUL f7, f19, f7
  2184. FNMSUB f0, f20, f4, f0
  2185. FNMSUB f1, f20, f5, f1
  2186. FNMSUB f2, f20, f6, f2
  2187. FNMSUB f3, f20, f7, f3
  2188. FMUL f0, f21, f0
  2189. FMUL f1, f21, f1
  2190. FMUL f2, f21, f2
  2191. FMUL f3, f21, f3
  2192. #endif
  2193. #ifdef LN
  /* LN moves the output pointers back by four elements before storing. */
  2194. subi CO1, CO1, 4 * SIZE
  2195. subi CO2, CO2, 4 * SIZE
  2196. #endif
  /* Write the solved values back to the buffer they were loaded from, in
     the same layout as the loads above ... */
  2197. #if defined(LN) || defined(LT)
  2198. STFD f0, 0 * SIZE(BO)
  2199. STFD f4, 1 * SIZE(BO)
  2200. STFD f1, 2 * SIZE(BO)
  2201. STFD f5, 3 * SIZE(BO)
  2202. STFD f2, 4 * SIZE(BO)
  2203. STFD f6, 5 * SIZE(BO)
  2204. STFD f3, 6 * SIZE(BO)
  2205. STFD f7, 7 * SIZE(BO)
  2206. #else
  2207. STFD f0, 0 * SIZE(AO)
  2208. STFD f1, 1 * SIZE(AO)
  2209. STFD f2, 2 * SIZE(AO)
  2210. STFD f3, 3 * SIZE(AO)
  2211. STFD f4, 4 * SIZE(AO)
  2212. STFD f5, 5 * SIZE(AO)
  2213. STFD f6, 6 * SIZE(AO)
  2214. STFD f7, 7 * SIZE(AO)
  2215. #endif
  /* ... and to the output: f0-f3 through CO1, f4-f7 through CO2. */
  2216. STFD f0, 0 * SIZE(CO1)
  2217. STFD f1, 1 * SIZE(CO1)
  2218. STFD f2, 2 * SIZE(CO1)
  2219. STFD f3, 3 * SIZE(CO1)
  2220. STFD f4, 0 * SIZE(CO2)
  2221. STFD f5, 1 * SIZE(CO2)
  2222. STFD f6, 2 * SIZE(CO2)
  2223. STFD f7, 3 * SIZE(CO2)
  /* Reset all eight accumulators from FZERO (presumably a stored 0.0). */
  2224. lfs f0, FZERO
  2225. fmr f1, f0
  2226. fmr f2, f0
  2227. fmr f3, f0
  2228. fmr f4, f0
  2229. fmr f5, f0
  2230. fmr f6, f0
  2231. fmr f7, f0
  /* Pointer and counter bookkeeping for the next tile. */
  2232. #ifndef LN
  2233. addi CO1, CO1, 4 * SIZE
  2234. addi CO2, CO2, 4 * SIZE
  2235. #endif
  2236. #ifdef RT
  2237. slwi r0, K, 2 + BASE_SHIFT
  2238. add AORIG, AORIG, r0
  2239. #endif
  2240. #if defined(LT) || defined(RN)
  /* Skip the K - KK steps of each panel that the product loop did not read. */
  2241. sub TEMP, K, KK
  2242. slwi r0, TEMP, 2 + BASE_SHIFT
  2243. slwi TEMP, TEMP, 1 + BASE_SHIFT
  2244. add AO, AO, r0
  2245. add BO, BO, TEMP
  2246. #endif
  2247. #ifdef LN
  2248. subi KK, KK, 4
  2249. #endif
  2250. #ifdef LT
  2251. addi KK, KK, 4
  2252. #endif
  /* Next 4-row tile while the tile counter stays positive. */
  2253. addic. I, I, -1
  2254. bgt+ .L42
  /* End of the two-column pass: move B and KK on according to the variant,
     then reload f0 from FZERO for the code that follows. */
  2255. .align 4
  2256. .L69:
  2257. #ifdef LN
  /* LN: advance B by K shifted left by (1 + BASE_SHIFT) bytes. */
  2258. slwi r0, K, 1 + BASE_SHIFT
  2259. add B, B, r0
  2260. #endif
  2261. #if defined(LT) || defined(RN)
  /* LT / RN: B continues from wherever BO ended up. */
  2262. mr B, BO
  2263. #endif
  2264. #ifdef RN
  2265. addi KK, KK, 2
  2266. #endif
  2267. #ifdef RT
  2268. subi KK, KK, 2
  2269. #endif
  2270. lfs f0, FZERO
  /* Single-column pass, taken only when the low bit of N is set; otherwise
     control goes straight to the exit at .L999. */
  2271. .align 4
  2272. .L70:
  2273. andi. J, N, 1
  2274. ble .L999
  2275. #ifdef RT
  /* RT: step B back by K shifted left by BASE_SHIFT bytes and C back by LDC
     before using them. */
  2276. slwi r0, K, 0 + BASE_SHIFT
  2277. sub B, B, r0
  2278. sub C, C, LDC
  2279. #endif
  2280. mr CO1, C
  /* Starting value of KK for the left-side variants. */
  2281. #ifdef LN
  2282. add KK, M, OFFSET
  2283. #endif
  2284. #ifdef LT
  2285. mr KK, OFFSET
  2286. #endif
  /* f0 was loaded from FZERO just before .L70; copy it to f1-f3 so all four
     accumulators used in this pass start from that value. */
  2287. fmr f1, f0
  2288. fmr f2, f0
  2289. fmr f3, f0
  2290. #if defined(LN) || defined(RT)
  2291. mr AORIG, A
  2292. #else
  2293. mr AO, A
  2294. #endif
  2295. #ifndef RT
  2296. add C, CO1, LDC
  2297. #endif
  /* 1x1 tile, handled only when the low bit of M is set.  Sets up AO/BO,
     preloads f16-f19 and f20-f23, and loads CTR with the step count shifted
     right by 3 (the loop at .L92 consumes eight steps per iteration). */
  2298. .align 4
  2299. .L90:
  2300. andi. I, M, 1
  2301. ble .L80
  2302. #if defined(LT) || defined(RN)
  /* LT / RN: KK steps, reading B from its start. */
  2303. LFD f16, 0 * SIZE(AO)
  2304. LFD f17, 1 * SIZE(AO)
  2305. LFD f18, 2 * SIZE(AO)
  2306. LFD f19, 3 * SIZE(AO)
  2307. LFD f20, 0 * SIZE(B)
  2308. LFD f21, 1 * SIZE(B)
  2309. LFD f22, 2 * SIZE(B)
  2310. LFD f23, 3 * SIZE(B)
  2311. srawi. r0, KK, 3
  2312. mtspr CTR, r0
  2313. mr BO, B
  2314. #else
  2315. #ifdef LN
  /* LN only: step AORIG back by K shifted left by BASE_SHIFT bytes. */
  2316. slwi r0, K, BASE_SHIFT
  2317. sub AORIG, AORIG, r0
  2318. #endif
  /* LN / RT: start KK steps into both panels and run K - KK steps. */
  2319. slwi r0, KK, 0 + BASE_SHIFT
  2320. slwi TEMP, KK, 0 + BASE_SHIFT
  2321. add AO, AORIG, r0
  2322. add BO, B, TEMP
  2323. sub TEMP, K, KK
  2324. LFD f16, 0 * SIZE(AO)
  2325. LFD f17, 1 * SIZE(AO)
  2326. LFD f18, 2 * SIZE(AO)
  2327. LFD f19, 3 * SIZE(AO)
  2328. LFD f20, 0 * SIZE(BO)
  2329. LFD f21, 1 * SIZE(BO)
  2330. LFD f22, 2 * SIZE(BO)
  2331. LFD f23, 3 * SIZE(BO)
  2332. srawi. r0, TEMP, 3
  2333. mtspr CTR, r0
  2334. #endif
  /* CR0 still holds the srawi. result (mtspr / mr do not touch it). */
  2335. ble .L95
  /* 1x1 inner loop, eight k-steps per iteration.  The products are spread
     over four independent accumulators (f0-f3), two products each per
     iteration; they are summed later at .L98.  AO and BO each advance by
     8 * SIZE through the LFDU loads. */
  2336. .align 5
  2337. .L92:
  2338. FMADD f0, f16, f20, f0
  2339. LFD f16, 4 * SIZE(AO)
  2340. LFD f20, 4 * SIZE(BO)
  2341. FMADD f1, f17, f21, f1
  2342. LFD f17, 5 * SIZE(AO)
  2343. LFD f21, 5 * SIZE(BO)
  2344. FMADD f2, f18, f22, f2
  2345. LFD f18, 6 * SIZE(AO)
  2346. LFD f22, 6 * SIZE(BO)
  2347. FMADD f3, f19, f23, f3
  2348. LFD f19, 7 * SIZE(AO)
  2349. LFD f23, 7 * SIZE(BO)
  2350. FMADD f0, f16, f20, f0
  /* Both pointers move here; the offsets below are relative to the new
     AO / BO and preload the next iteration. */
  2351. LFDU f16, 8 * SIZE(AO)
  2352. LFDU f20, 8 * SIZE(BO)
  2353. FMADD f1, f17, f21, f1
  2354. LFD f17, 1 * SIZE(AO)
  2355. LFD f21, 1 * SIZE(BO)
  2356. FMADD f2, f18, f22, f2
  2357. LFD f18, 2 * SIZE(AO)
  2358. LFD f22, 2 * SIZE(BO)
  2359. FMADD f3, f19, f23, f3
  2360. LFD f19, 3 * SIZE(AO)
  2361. LFD f23, 3 * SIZE(BO)
  2362. bdnz .L92
  /* 1x1 remainder: the low three bits of the step count are the steps left
     over after the eight-way unrolled loop. */
  2363. .align 4
  2364. .L95:
  2365. #if defined(LT) || defined(RN)
  2366. andi. r0, KK, 7
  2367. #else
  2368. andi. r0, TEMP, 7
  2369. #endif
  /* mtspr leaves CR0 alone, so ble+ tests the andi. result (taken when the
     remainder is zero). */
  2370. mtspr CTR, r0
  2371. ble+ .L98
  2372. .align 4
  /* One product per iteration into f0; AO and BO each advance by one
     element through the LFDU loads. */
  2373. .L96:
  2374. FMADD f0, f16, f20, f0
  2375. LFDU f16, 1 * SIZE(AO)
  2376. LFDU f20, 1 * SIZE(BO)
  2377. bdnz .L96
  /* 1x1 solve and write-back.  First fold the four partial sums into f0
     (assuming FADD expands to fadd: d = a + b). */
  2378. .align 4
  2379. .L98:
  2380. FADD f0, f1, f0
  2381. FADD f2, f3, f2
  2382. FADD f0, f2, f0
  2383. #if defined(LN) || defined(RT)
  /* LN / RT: re-point AO and BO at element KK - 1.  Both arms of the inner
     #ifdef compute the same value here. */
  2384. #ifdef LN
  2385. subi r0, KK, 1
  2386. #else
  2387. subi r0, KK, 1
  2388. #endif
  2389. slwi TEMP, r0, 0 + BASE_SHIFT
  2390. slwi r0, r0, 0 + BASE_SHIFT
  2391. add AO, AORIG, TEMP
  2392. add BO, B, r0
  2393. #endif
  /* f0 = stored right-hand-side value minus the accumulated product; the
     value comes from BO for LN/LT and from AO otherwise. */
  2394. #if defined(LN) || defined(LT)
  2395. LFD f16, 0 * SIZE(BO)
  2396. FSUB f0, f16, f0
  2397. #else
  2398. LFD f16, 0 * SIZE(AO)
  2399. FSUB f0, f16, f0
  2400. #endif
  /* Scale by the single triangular entry: taken from AO for LN/LT and from
     BO for RN/RT.  It is used as a multiplier, so it is presumably stored
     pre-inverted - confirm. */
  2401. #ifdef LN
  2402. LFD f21, 0 * SIZE(AO)
  2403. FMUL f0, f21, f0
  2404. #endif
  2405. #ifdef LT
  2406. LFD f16, 0 * SIZE(AO)
  2407. FMUL f0, f16, f0
  2408. #endif
  2409. #ifdef RN
  2410. LFD f16, 0 * SIZE(BO)
  2411. FMUL f0, f16, f0
  2412. #endif
  2413. #ifdef RT
  2414. LFD f21, 0 * SIZE(BO)
  2415. FMUL f0, f21, f0
  2416. #endif
  2417. #ifdef LN
  /* LN moves the output pointer back by one element before storing. */
  2418. subi CO1, CO1, 1 * SIZE
  2419. #endif
  /* Store the result to the buffer it was loaded from and to the output. */
  2420. #if defined(LN) || defined(LT)
  2421. STFD f0, 0 * SIZE(BO)
  2422. #else
  2423. STFD f0, 0 * SIZE(AO)
  2424. #endif
  2425. STFD f0, 0 * SIZE(CO1)
  /* Reset the four accumulators from FZERO. */
  2426. lfs f0, FZERO
  2427. fmr f1, f0
  2428. fmr f2, f0
  2429. fmr f3, f0
  /* Pointer and counter bookkeeping for the next tile. */
  2430. #ifndef LN
  2431. addi CO1, CO1, 1 * SIZE
  2432. #endif
  2433. #ifdef RT
  2434. slwi r0, K, 0 + BASE_SHIFT
  2435. add AORIG, AORIG, r0
  2436. #endif
  2437. #if defined(LT) || defined(RN)
  /* Skip the K - KK steps that the product loop did not read. */
  2438. sub TEMP, K, KK
  2439. slwi r0, TEMP, 0 + BASE_SHIFT
  2440. slwi TEMP, TEMP, 0 + BASE_SHIFT
  2441. add AO, AO, r0
  2442. add BO, BO, TEMP
  2443. #endif
  2444. #ifdef LN
  2445. subi KK, KK, 1
  2446. #endif
  2447. #ifdef LT
  2448. addi KK, KK, 1
  2449. #endif
  /* 2x1 tile, handled only when bit 1 of M is set.  Sets up AO/BO, preloads
     f16-f19 and f20-f23, and loads CTR with the step count shifted right
     by 2 (the loop at .L82 consumes four steps per iteration). */
  2450. .align 4
  2451. .L80:
  2452. andi. I, M, 2
  2453. ble .L71
  2454. #if defined(LT) || defined(RN)
  /* LT / RN: KK steps, reading B from its start. */
  2455. LFD f16, 0 * SIZE(AO)
  2456. LFD f17, 1 * SIZE(AO)
  2457. LFD f18, 2 * SIZE(AO)
  2458. LFD f19, 3 * SIZE(AO)
  2459. LFD f20, 0 * SIZE(B)
  2460. LFD f21, 1 * SIZE(B)
  2461. LFD f22, 2 * SIZE(B)
  2462. LFD f23, 3 * SIZE(B)
  2463. srawi. r0, KK, 2
  2464. mtspr CTR, r0
  2465. mr BO, B
  2466. #else
  2467. #ifdef LN
  /* LN only: step AORIG back by K shifted left by (1 + BASE_SHIFT) bytes. */
  2468. slwi r0, K, 1 + BASE_SHIFT
  2469. sub AORIG, AORIG, r0
  2470. #endif
  /* LN / RT: start KK steps into both panels (A offset scaled by
     1 + BASE_SHIFT, B offset by BASE_SHIFT) and run K - KK steps. */
  2471. slwi r0, KK, 1 + BASE_SHIFT
  2472. slwi TEMP, KK, 0 + BASE_SHIFT
  2473. add AO, AORIG, r0
  2474. add BO, B, TEMP
  2475. sub TEMP, K, KK
  2476. LFD f16, 0 * SIZE(AO)
  2477. LFD f17, 1 * SIZE(AO)
  2478. LFD f18, 2 * SIZE(AO)
  2479. LFD f19, 3 * SIZE(AO)
  2480. LFD f20, 0 * SIZE(BO)
  2481. LFD f21, 1 * SIZE(BO)
  2482. LFD f22, 2 * SIZE(BO)
  2483. LFD f23, 3 * SIZE(BO)
  2484. srawi. r0, TEMP, 2
  2485. mtspr CTR, r0
  2486. #endif
  /* CR0 still holds the srawi. result (mtspr / mr do not touch it). */
  2487. ble .L85
  /* 2x1 inner loop, four k-steps per iteration.  Steps 1 and 3 accumulate
     into f0 / f1, steps 2 and 4 into f2 / f3; the two pairs are added
     together later at .L88.  AO advances by 8 * SIZE and BO by 4 * SIZE
     through the LFDU loads, which sit mid-body, so the BO / AO offsets that
     follow each LFDU are relative to the updated pointer. */
  2488. .align 5
  2489. .L82:
  2490. FMADD f0, f16, f20, f0
  2491. LFD f16, 4 * SIZE(AO)
  2492. FMADD f1, f17, f20, f1
  2493. LFDU f20, 4 * SIZE(BO)
  2494. LFD f17, 5 * SIZE(AO)
  2495. FMADD f2, f18, f21, f2
  2496. LFD f18, 6 * SIZE(AO)
  2497. FMADD f3, f19, f21, f3
  2498. LFD f21, 1 * SIZE(BO)
  2499. LFD f19, 7 * SIZE(AO)
  2500. FMADD f0, f16, f22, f0
  2501. LFDU f16, 8 * SIZE(AO)
  2502. FMADD f1, f17, f22, f1
  2503. LFD f22, 2 * SIZE(BO)
  2504. LFD f17, 1 * SIZE(AO)
  2505. FMADD f2, f18, f23, f2
  2506. LFD f18, 2 * SIZE(AO)
  2507. FMADD f3, f19, f23, f3
  2508. LFD f23, 3 * SIZE(BO)
  2509. LFD f19, 3 * SIZE(AO)
  2510. bdnz .L82
  /* 2x1 remainder: the low two bits of the step count are the steps left
     over after the four-way unrolled loop. */
  2511. .align 4
  2512. .L85:
  2513. #if defined(LT) || defined(RN)
  2514. andi. r0, KK, 3
  2515. #else
  2516. andi. r0, TEMP, 3
  2517. #endif
  /* mtspr leaves CR0 alone, so ble+ tests the andi. result (taken when the
     remainder is zero). */
  2518. mtspr CTR, r0
  2519. ble+ .L88
  2520. .align 4
  /* One k-step per iteration: two multiply-adds into f0 / f1; AO advances
     by 2 * SIZE and BO by 1 * SIZE through the LFDU loads. */
  2521. .L86:
  2522. FMADD f0, f16, f20, f0
  2523. LFDU f16, 2 * SIZE(AO)
  2524. FMADD f1, f17, f20, f1
  2525. LFDU f20, 1 * SIZE(BO)
  2526. LFD f17, 1 * SIZE(AO)
  2527. bdnz .L86
  /* 2x1 solve and write-back.  First merge the two accumulator pairs:
     f0 += f2, f1 += f3.  The comments below assume the FADD / FSUB / FMUL /
     FNMSUB macros expand to the like-named PowerPC instructions
     (FNMSUB d, a, c, b giving b - a * c) - confirm. */
  2528. .align 4
  2529. .L88:
  2530. FADD f0, f2, f0
  2531. FADD f1, f3, f1
  2532. #if defined(LN) || defined(RT)
  /* LN / RT: re-point AO and BO at the block KK - 2 (LN) or KK - 1 (RT)
     steps from the panel starts. */
  2533. #ifdef LN
  2534. subi r0, KK, 2
  2535. #else
  2536. subi r0, KK, 1
  2537. #endif
  2538. slwi TEMP, r0, 1 + BASE_SHIFT
  2539. slwi r0, r0, 0 + BASE_SHIFT
  2540. add AO, AORIG, TEMP
  2541. add BO, B, r0
  2542. #endif
  /* Accumulators become stored right-hand-side value minus accumulated
     product; the values come from BO for LN/LT and from AO otherwise. */
  2543. #if defined(LN) || defined(LT)
  2544. LFD f16, 0 * SIZE(BO)
  2545. LFD f20, 1 * SIZE(BO)
  2546. FSUB f0, f16, f0
  2547. FSUB f1, f20, f1
  2548. #else
  2549. LFD f16, 0 * SIZE(AO)
  2550. LFD f17, 1 * SIZE(AO)
  2551. FSUB f0, f16, f0
  2552. FSUB f1, f17, f1
  2553. #endif
  2554. #ifdef LN
  /* LN: second row first, using entries 3, 2 and 0 of the 2x2 block at AO.
     Entries 3 and 0 are used as multipliers (presumably pre-inverted
     diagonal values - confirm). */
  2555. LFD f19, 3 * SIZE(AO)
  2556. LFD f20, 2 * SIZE(AO)
  2557. LFD f21, 0 * SIZE(AO)
  2558. FMUL f1, f19, f1
  2559. FNMSUB f0, f20, f1, f0
  2560. FMUL f0, f21, f0
  2561. #endif
  2562. #ifdef LT
  /* LT: first row first, using entries 0, 1 and 3 of the 2x2 block at AO. */
  2563. LFD f16, 0 * SIZE(AO)
  2564. LFD f17, 1 * SIZE(AO)
  2565. FMUL f0, f16, f0
  2566. FNMSUB f1, f17, f0, f1
  2567. LFD f17, 3 * SIZE(AO)
  2568. FMUL f1, f17, f1
  2569. #endif
  /* RN / RT: a single entry at BO scales both values. */
  2570. #ifdef RN
  2571. LFD f16, 0 * SIZE(BO)
  2572. FMUL f0, f16, f0
  2573. FMUL f1, f16, f1
  2574. #endif
  2575. #ifdef RT
  2576. LFD f21, 0 * SIZE(BO)
  2577. FMUL f0, f21, f0
  2578. FMUL f1, f21, f1
  2579. #endif
  2580. #ifdef LN
  /* LN moves the output pointer back by two elements before storing. */
  2581. subi CO1, CO1, 2 * SIZE
  2582. #endif
  /* Store the results to the buffer they were loaded from and to the output. */
  2583. #if defined(LN) || defined(LT)
  2584. STFD f0, 0 * SIZE(BO)
  2585. STFD f1, 1 * SIZE(BO)
  2586. #else
  2587. STFD f0, 0 * SIZE(AO)
  2588. STFD f1, 1 * SIZE(AO)
  2589. #endif
  2590. STFD f0, 0 * SIZE(CO1)
  2591. STFD f1, 1 * SIZE(CO1)
  /* Reset the four accumulators from FZERO. */
  2592. lfs f0, FZERO
  2593. fmr f1, f0
  2594. fmr f2, f0
  2595. fmr f3, f0
  /* Pointer and counter bookkeeping for the next tile. */
  2596. #ifndef LN
  2597. addi CO1, CO1, 2 * SIZE
  2598. #endif
  2599. #ifdef RT
  2600. slwi r0, K, 1 + BASE_SHIFT
  2601. add AORIG, AORIG, r0
  2602. #endif
  2603. #if defined(LT) || defined(RN)
  /* Skip the K - KK steps of each panel that the product loop did not read. */
  2604. sub TEMP, K, KK
  2605. slwi r0, TEMP, 1 + BASE_SHIFT
  2606. slwi TEMP, TEMP, 0 + BASE_SHIFT
  2607. add AO, AO, r0
  2608. add BO, BO, TEMP
  2609. #endif
  2610. #ifdef LN
  2611. subi KK, KK, 2
  2612. #endif
  2613. #ifdef LT
  2614. addi KK, KK, 2
  2615. #endif
  /* 4-row by 1-column tiles.  I = M shifted right by 2 is the tile count;
     when it is not positive the routine goes to the exit at .L999. */
  2616. .align 4
  2617. .L71:
  2618. srawi. I, M, 2
  2619. ble .L999
  2620. .align 4
  /* Per-tile setup: point AO/BO at the operands, preload f16-f19 and
     f20-f23, and put the step count shifted right by 2 in CTR. */
  2621. .L72:
  2622. #if defined(LT) || defined(RN)
  /* LT / RN: KK steps, reading B from its start. */
  2623. LFD f16, 0 * SIZE(AO)
  2624. LFD f17, 1 * SIZE(AO)
  2625. LFD f18, 2 * SIZE(AO)
  2626. LFD f19, 3 * SIZE(AO)
  2627. LFD f20, 0 * SIZE(B)
  2628. LFD f21, 1 * SIZE(B)
  2629. LFD f22, 2 * SIZE(B)
  2630. LFD f23, 3 * SIZE(B)
  2631. srawi. r0, KK, 2
  2632. mtspr CTR, r0
  2633. mr BO, B
  2634. #else
  2635. #ifdef LN
  /* LN only: step AORIG back by K shifted left by (2 + BASE_SHIFT) bytes. */
  2636. slwi r0, K, 2 + BASE_SHIFT
  2637. sub AORIG, AORIG, r0
  2638. #endif
  /* LN / RT: start KK steps into both panels (A offset scaled by
     2 + BASE_SHIFT, B offset by BASE_SHIFT) and run K - KK steps. */
  2639. slwi r0, KK, 2 + BASE_SHIFT
  2640. slwi TEMP, KK, 0 + BASE_SHIFT
  2641. add AO, AORIG, r0
  2642. add BO, B, TEMP
  2643. sub TEMP, K, KK
  2644. LFD f16, 0 * SIZE(AO)
  2645. LFD f17, 1 * SIZE(AO)
  2646. LFD f18, 2 * SIZE(AO)
  2647. LFD f19, 3 * SIZE(AO)
  2648. LFD f20, 0 * SIZE(BO)
  2649. LFD f21, 1 * SIZE(BO)
  2650. LFD f22, 2 * SIZE(BO)
  2651. LFD f23, 3 * SIZE(BO)
  2652. srawi. r0, TEMP, 2
  2653. mtspr CTR, r0
  2654. #endif
  /* CR0 still holds the srawi. result (mtspr / mr do not touch it). */
  2655. ble .L75
  /* 4x1 inner loop, four k-steps per iteration.  Each step adds
     (f16..f19) times one B value (f20, f21, f22, f23 in turn) into f0-f3.
     BO advances by 4 * SIZE after the first step and AO by 16 * SIZE during
     the last, both through LFDU, so later BO / AO offsets are relative to
     the updated pointers. */
  2656. .align 5
  2657. .L73:
  /* step 1: B value in f20 */
  2658. FMADD f0, f16, f20, f0
  2659. LFD f16, 4 * SIZE(AO)
  2660. FMADD f1, f17, f20, f1
  2661. LFD f17, 5 * SIZE(AO)
  2662. FMADD f2, f18, f20, f2
  2663. LFD f18, 6 * SIZE(AO)
  2664. FMADD f3, f19, f20, f3
  2665. LFD f19, 7 * SIZE(AO)
  2666. LFDU f20, 4 * SIZE(BO)
  /* step 2: B value in f21 */
  2667. FMADD f0, f16, f21, f0
  2668. LFD f16, 8 * SIZE(AO)
  2669. FMADD f1, f17, f21, f1
  2670. LFD f17, 9 * SIZE(AO)
  2671. FMADD f2, f18, f21, f2
  2672. LFD f18, 10 * SIZE(AO)
  2673. FMADD f3, f19, f21, f3
  2674. LFD f19, 11 * SIZE(AO)
  2675. LFD f21, 1 * SIZE(BO)
  /* step 3: B value in f22 */
  2676. FMADD f0, f16, f22, f0
  2677. LFD f16, 12 * SIZE(AO)
  2678. FMADD f1, f17, f22, f1
  2679. LFD f17, 13 * SIZE(AO)
  2680. FMADD f2, f18, f22, f2
  2681. LFD f18, 14 * SIZE(AO)
  2682. FMADD f3, f19, f22, f3
  2683. LFD f19, 15 * SIZE(AO)
  2684. LFD f22, 2 * SIZE(BO)
  /* step 4: B value in f23 */
  2685. FMADD f0, f16, f23, f0
  2686. LFDU f16, 16 * SIZE(AO)
  2687. FMADD f1, f17, f23, f1
  2688. LFD f17, 1 * SIZE(AO)
  2689. FMADD f2, f18, f23, f2
  2690. LFD f18, 2 * SIZE(AO)
  2691. FMADD f3, f19, f23, f3
  2692. LFD f19, 3 * SIZE(AO)
  2693. LFD f23, 3 * SIZE(BO)
  2694. bdnz .L73
  /* 4x1 remainder: the low two bits of the step count are the steps left
     over after the four-way unrolled loop. */
  2695. .align 4
  2696. .L75:
  2697. #if defined(LT) || defined(RN)
  2698. andi. r0, KK, 3
  2699. #else
  2700. andi. r0, TEMP, 3
  2701. #endif
  /* mtspr leaves CR0 alone, so ble+ tests the andi. result (taken when the
     remainder is zero). */
  2702. mtspr CTR, r0
  2703. ble+ .L78
  2704. .align 4
  /* One k-step per iteration: four multiply-adds into f0-f3; AO advances
     by 4 * SIZE and BO by 1 * SIZE through the LFDU loads. */
  2705. .L76:
  2706. FMADD f0, f16, f20, f0
  2707. LFDU f16, 4 * SIZE(AO)
  2708. FMADD f1, f17, f20, f1
  2709. LFD f17, 1 * SIZE(AO)
  2710. FMADD f2, f18, f20, f2
  2711. LFD f18, 2 * SIZE(AO)
  2712. FMADD f3, f19, f20, f3
  2713. LFDU f20, 1 * SIZE(BO)
  2714. LFD f19, 3 * SIZE(AO)
  2715. bdnz .L76
  /* 4x1 solve and write-back.  f0-f3 hold the accumulated products.  The
     comments below assume the FSUB / FMUL / FNMSUB macros expand to the
     like-named PowerPC instructions (d = a - b, d = a * c, d = b - a * c)
     - confirm against the macro definitions. */
  2716. .align 4
  2717. .L78:
  2718. #if defined(LN) || defined(RT)
  /* LN / RT: re-point AO and BO at the block KK - 4 (LN) or KK - 1 (RT)
     steps from the panel starts. */
  2719. #ifdef LN
  2720. subi r0, KK, 4
  2721. #else
  2722. subi r0, KK, 1
  2723. #endif
  2724. slwi TEMP, r0, 2 + BASE_SHIFT
  2725. slwi r0, r0, 0 + BASE_SHIFT
  2726. add AO, AORIG, TEMP
  2727. add BO, B, r0
  2728. #endif
  /* Accumulators become stored right-hand-side value minus accumulated
     product; the values come from BO for LN/LT and from AO otherwise. */
  2729. #if defined(LN) || defined(LT)
  2730. LFD f16, 0 * SIZE(BO)
  2731. LFD f20, 1 * SIZE(BO)
  2732. LFD f24, 2 * SIZE(BO)
  2733. LFD f28, 3 * SIZE(BO)
  2734. FSUB f0, f16, f0
  2735. FSUB f1, f20, f1
  2736. FSUB f2, f24, f2
  2737. FSUB f3, f28, f3
  2738. #else
  2739. LFD f16, 0 * SIZE(AO)
  2740. LFD f17, 1 * SIZE(AO)
  2741. LFD f18, 2 * SIZE(AO)
  2742. LFD f19, 3 * SIZE(AO)
  2743. FSUB f0, f16, f0
  2744. FSUB f1, f17, f1
  2745. FSUB f2, f18, f2
  2746. FSUB f3, f19, f3
  2747. #endif
  2748. #ifdef LN
  /* LN: substitution from the last row up to the first, using entries
     15,14,13,12 / 10,9,8 / 5,4 / 0 of the 4x4 block at AO.  Entries 15, 10,
     5 and 0 are used as multipliers, so they presumably hold pre-inverted
     diagonal values - confirm. */
  2749. LFD f16, 15 * SIZE(AO)
  2750. LFD f17, 14 * SIZE(AO)
  2751. LFD f18, 13 * SIZE(AO)
  2752. LFD f19, 12 * SIZE(AO)
  2753. FMUL f3, f16, f3
  2754. FNMSUB f2, f17, f3, f2
  2755. FNMSUB f1, f18, f3, f1
  2756. FNMSUB f0, f19, f3, f0
  2757. LFD f16, 10 * SIZE(AO)
  2758. LFD f17, 9 * SIZE(AO)
  2759. LFD f18, 8 * SIZE(AO)
  2760. LFD f19, 5 * SIZE(AO)
  2761. LFD f20, 4 * SIZE(AO)
  2762. LFD f21, 0 * SIZE(AO)
  2763. FMUL f2, f16, f2
  2764. FNMSUB f1, f17, f2, f1
  2765. FNMSUB f0, f18, f2, f0
  2766. FMUL f1, f19, f1
  2767. FNMSUB f0, f20, f1, f0
  2768. FMUL f0, f21, f0
  2769. #endif
  2770. #ifdef LT
  /* LT: substitution from the first row down to the last, using entries
     0,1,2,3 / 5,6,7 / 10,11 / 15 of the 4x4 block at AO. */
  2771. LFD f16, 0 * SIZE(AO)
  2772. LFD f17, 1 * SIZE(AO)
  2773. LFD f18, 2 * SIZE(AO)
  2774. LFD f19, 3 * SIZE(AO)
  2775. FMUL f0, f16, f0
  2776. FNMSUB f1, f17, f0, f1
  2777. FNMSUB f2, f18, f0, f2
  2778. FNMSUB f3, f19, f0, f3
  2779. LFD f17, 5 * SIZE(AO)
  2780. LFD f18, 6 * SIZE(AO)
  2781. LFD f19, 7 * SIZE(AO)
  2782. FMUL f1, f17, f1
  2783. FNMSUB f2, f18, f1, f2
  2784. FNMSUB f3, f19, f1, f3
  2785. LFD f18, 10 * SIZE(AO)
  2786. LFD f19, 11 * SIZE(AO)
  2787. FMUL f2, f18, f2
  2788. FNMSUB f3, f19, f2, f3
  2789. LFD f19, 15 * SIZE(AO)
  2790. FMUL f3, f19, f3
  2791. #endif
  /* RN / RT: a single entry at BO scales all four values. */
  2792. #ifdef RN
  2793. LFD f16, 0 * SIZE(BO)
  2794. FMUL f0, f16, f0
  2795. FMUL f1, f16, f1
  2796. FMUL f2, f16, f2
  2797. FMUL f3, f16, f3
  2798. #endif
  2799. #ifdef RT
  2800. LFD f21, 0 * SIZE(BO)
  2801. FMUL f0, f21, f0
  2802. FMUL f1, f21, f1
  2803. FMUL f2, f21, f2
  2804. FMUL f3, f21, f3
  2805. #endif
  2806. #ifdef LN
  /* LN moves the output pointer back by four elements before storing. */
  2807. subi CO1, CO1, 4 * SIZE
  2808. #endif
  /* Store the results to the buffer they were loaded from and to the output. */
  2809. #if defined(LN) || defined(LT)
  2810. STFD f0, 0 * SIZE(BO)
  2811. STFD f1, 1 * SIZE(BO)
  2812. STFD f2, 2 * SIZE(BO)
  2813. STFD f3, 3 * SIZE(BO)
  2814. #else
  2815. STFD f0, 0 * SIZE(AO)
  2816. STFD f1, 1 * SIZE(AO)
  2817. STFD f2, 2 * SIZE(AO)
  2818. STFD f3, 3 * SIZE(AO)
  2819. #endif
  2820. STFD f0, 0 * SIZE(CO1)
  2821. STFD f1, 1 * SIZE(CO1)
  2822. STFD f2, 2 * SIZE(CO1)
  2823. STFD f3, 3 * SIZE(CO1)
  /* Reset the four accumulators from FZERO. */
  2824. lfs f0, FZERO
  2825. fmr f1, f0
  2826. fmr f2, f0
  2827. fmr f3, f0
  /* Pointer and counter bookkeeping for the next tile. */
  2828. #ifndef LN
  2829. addi CO1, CO1, 4 * SIZE
  2830. #endif
  2831. #ifdef RT
  2832. slwi r0, K, 2 + BASE_SHIFT
  2833. add AORIG, AORIG, r0
  2834. #endif
  2835. #if defined(LT) || defined(RN)
  /* Skip the K - KK steps of each panel that the product loop did not read. */
  2836. sub TEMP, K, KK
  2837. slwi r0, TEMP, 2 + BASE_SHIFT
  2838. slwi TEMP, TEMP, 0 + BASE_SHIFT
  2839. add AO, AO, r0
  2840. add BO, BO, TEMP
  2841. #endif
  2842. #ifdef LN
  2843. subi KK, KK, 4
  2844. #endif
  2845. #ifdef LT
  2846. addi KK, KK, 4
  2847. #endif
  /* Next 4-row tile while the tile counter stays positive. */
  2848. addic. I, I, -1
  2849. bgt+ .L72
  /* Common exit: set r3 to 0, reload f14-f31 and r18-r31 from the stack
     frame, release the frame and return.  The matching stores are presumably
     in the prologue, which is outside this part of the file. */
  2850. .align 4
  2851. .L999:
  /* addi with 0 as the base operand adds to literal zero, so r3 = 0. */
  2852. addi r3, 0, 0
  /* Floating-point registers: 18 doubles at SP + 0 .. SP + 136. */
  2853. lfd f14, 0(SP)
  2854. lfd f15, 8(SP)
  2855. lfd f16, 16(SP)
  2856. lfd f17, 24(SP)
  2857. lfd f18, 32(SP)
  2858. lfd f19, 40(SP)
  2859. lfd f20, 48(SP)
  2860. lfd f21, 56(SP)
  2861. lfd f22, 64(SP)
  2862. lfd f23, 72(SP)
  2863. lfd f24, 80(SP)
  2864. lfd f25, 88(SP)
  2865. lfd f26, 96(SP)
  2866. lfd f27, 104(SP)
  2867. lfd f28, 112(SP)
  2868. lfd f29, 120(SP)
  2869. lfd f30, 128(SP)
  2870. lfd f31, 136(SP)
  /* General registers r31 down to r18, starting at SP + 144: 8-byte slots
     loaded with ld in 64-bit builds, 4-byte slots loaded with lwz otherwise. */
  2871. #ifdef __64BIT__
  2872. ld r31, 144(SP)
  2873. ld r30, 152(SP)
  2874. ld r29, 160(SP)
  2875. ld r28, 168(SP)
  2876. ld r27, 176(SP)
  2877. ld r26, 184(SP)
  2878. ld r25, 192(SP)
  2879. ld r24, 200(SP)
  2880. ld r23, 208(SP)
  2881. ld r22, 216(SP)
  2882. ld r21, 224(SP)
  2883. ld r20, 232(SP)
  2884. ld r19, 240(SP)
  2885. ld r18, 248(SP)
  2886. #else
  2887. lwz r31, 144(SP)
  2888. lwz r30, 148(SP)
  2889. lwz r29, 152(SP)
  2890. lwz r28, 156(SP)
  2891. lwz r27, 160(SP)
  2892. lwz r26, 164(SP)
  2893. lwz r25, 168(SP)
  2894. lwz r24, 172(SP)
  2895. lwz r23, 176(SP)
  2896. lwz r22, 180(SP)
  2897. lwz r21, 184(SP)
  2898. lwz r20, 188(SP)
  2899. lwz r19, 192(SP)
  2900. lwz r18, 196(SP)
  2901. #endif
  /* Release the STACKSIZE-byte frame and return to the caller. */
  2902. addi SP, SP, STACKSIZE
  2903. blr
  2904. EPILOGUE