You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

trsm_kernel_ppc440_LT.S 59 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
732783279328032813282328332843285328632873288328932903291329232933294329532963297329832993300330133023303330433053306330733083309331033113312331333143315331633173318331933203321332233233324332533263327332833293330333133323333333433353336333733383339334033413342334333443345334633473348334933503351335233533354335533563357335833593360336133623363336433653366336733683369337033713372337333743375337633773378337933803381338233833384338533863387338833893390339133923393339433953396339733983399340034013402340334043405340634073408340934103411341234133414341534163417341834193420342134223423342434253426342734283429343034313432343334343435343634373438343934403441344234433444344534463447344834493450345134523453345434553456345734583459346034613462346334643465346634673468346934703471347234733474347534763477
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* LOAD selects the integer load mnemonic by word size: lwz (32-bit  */
  /* word) when __64BIT__ is not defined, ld (64-bit doubleword) when   */
  /* it is. It is not referenced in the portion of the file shown here. */
  40. #ifndef __64BIT__
  41. #define LOAD lwz
  42. #else
  43. #define LOAD ld
  44. #endif
  /* Stack frame layout.                                                */
  /*   STACKSIZE - bytes the prologue subtracts from SP.                */
  /*   ALPHA     - SP-relative offset inside that frame (not referenced */
  /*               in the portion of the file shown here).              */
  /*   FZERO     - SP-relative slot the prologue fills with integer 0   */
  /*               (stw r0) and that is later read back with lfs to     */
  /*               obtain 0.0 in f0.                                    */
  /* The offsets differ between 32- and 64-bit builds because the saved */
  /* general-purpose registers take 4 or 8 bytes each in the prologue.  */
  45. #ifdef __64BIT__
  46. #define STACKSIZE 320
  47. #define ALPHA 296(SP)
  48. #define FZERO 304(SP)
  49. #else
  50. #define STACKSIZE 240
  51. #define ALPHA 224(SP)
  52. #define FZERO 232(SP)
  53. #endif
  /* Incoming argument registers.                                       */
  /* M, N and K are always r3, r4 and r5. The registers holding A, B,   */
  /* C, LDC and OFFSET depend on the OS and word size, as selected      */
  /* below. NOTE(review): the shift is presumably caused by how each    */
  /* ABI passes the floating-point scalar argument - confirm against    */
  /* the ABI documents.                                                 */
  54. #define M r3
  55. #define N r4
  56. #define K r5
  /* Linux / FreeBSD.                                                   */
  /*   32-bit: all five values arrive in r6-r10.                        */
  /*   64-bit: A, B, C, LDC are in r7-r10; OFFSET is not an incoming    */
  /*           register - the prologue loads it from the caller's       */
  /*           frame (FRAMESLOT(0)) into r6.                            */
  57. #if defined(linux) || defined(__FreeBSD__)
  58. #ifndef __64BIT__
  59. #define A r6
  60. #define B r7
  61. #define C r8
  62. #define LDC r9
  63. #define OFFSET r10
  64. #else
  65. #define A r7
  66. #define B r8
  67. #define C r9
  68. #define LDC r10
  69. #define OFFSET r6
  70. #endif
  71. #endif
  /* AIX / Apple.                                                       */
  /*   32-bit DOUBLE: A, B, C are in r8-r10; the prologue loads LDC     */
  /*                  from FRAMESLOT(0) into r7 and OFFSET from         */
  /*                  FRAMESLOT(1) into r6.                             */
  /*   otherwise:     A, B, C, LDC are in r7-r10; the prologue loads    */
  /*                  OFFSET from FRAMESLOT(0) into r6.                 */
  72. #if defined(_AIX) || defined(__APPLE__)
  73. #if !defined(__64BIT__) && defined(DOUBLE)
  74. #define A r8
  75. #define B r9
  76. #define C r10
  77. #define LDC r7
  78. #define OFFSET r6
  79. #else
  80. #define A r7
  81. #define B r8
  82. #define C r9
  83. #define LDC r10
  84. #define OFFSET r6
  85. #endif
  86. #endif
  /* Working registers. All lie in r18-r28, which the prologue saves to */
  /* the stack (it stores r18-r31) before they are used.                */
  /*   AORIG - copy of A kept in LN/RT builds; AO is recomputed from it */
  /*   TEMP  - scratch (byte offsets, K - KK)                           */
  /*   KK    - running offset derived from OFFSET; in LT/RN builds      */
  /*           KK >> 2 is the trip count of the unrolled inner loop     */
  /*   I     - initialised to M >> 2                                    */
  /*   J     - initialised to N >> 2                                    */
  /*   AO/BO - current read pointers into A and B for the inner loops   */
  /*   CO1-4 - four pointers into C, each LDC bytes after the previous  */
  /*           one (CO1 = C, CO2 = C + LDC, ...)                        */
  87. #define AORIG r18
  88. #define TEMP r19
  89. #define KK r20
  90. #define I r21
  91. #define J r22
  92. #define AO r23
  93. #define BO r24
  94. #define CO1 r25
  95. #define CO2 r26
  96. #define CO3 r27
  97. #define CO4 r28
  98. #ifndef NEEDPARAM
  99. PROLOGUE
  100. PROFCODE
  101. addi SP, SP, -STACKSIZE
  102. li r0, 0
  103. stfd f14, 0(SP)
  104. stfd f15, 8(SP)
  105. stfd f16, 16(SP)
  106. stfd f17, 24(SP)
  107. stfd f18, 32(SP)
  108. stfd f19, 40(SP)
  109. stfd f20, 48(SP)
  110. stfd f21, 56(SP)
  111. stfd f22, 64(SP)
  112. stfd f23, 72(SP)
  113. stfd f24, 80(SP)
  114. stfd f25, 88(SP)
  115. stfd f26, 96(SP)
  116. stfd f27, 104(SP)
  117. stfd f28, 112(SP)
  118. stfd f29, 120(SP)
  119. stfd f30, 128(SP)
  120. stfd f31, 136(SP)
  121. #ifdef __64BIT__
  122. std r31, 144(SP)
  123. std r30, 152(SP)
  124. std r29, 160(SP)
  125. std r28, 168(SP)
  126. std r27, 176(SP)
  127. std r26, 184(SP)
  128. std r25, 192(SP)
  129. std r24, 200(SP)
  130. std r23, 208(SP)
  131. std r22, 216(SP)
  132. std r21, 224(SP)
  133. std r20, 232(SP)
  134. std r19, 240(SP)
  135. std r18, 248(SP)
  136. #else
  137. stw r31, 144(SP)
  138. stw r30, 148(SP)
  139. stw r29, 152(SP)
  140. stw r28, 156(SP)
  141. stw r27, 160(SP)
  142. stw r26, 164(SP)
  143. stw r25, 168(SP)
  144. stw r24, 172(SP)
  145. stw r23, 176(SP)
  146. stw r22, 180(SP)
  147. stw r21, 184(SP)
  148. stw r20, 188(SP)
  149. stw r19, 192(SP)
  150. stw r18, 196(SP)
  151. #endif
  152. stw r0, FZERO
  153. #if defined(_AIX) || defined(__APPLE__)
  154. #if !defined(__64BIT__) && defined(DOUBLE)
  155. lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
  156. #endif
  157. #endif
  158. slwi LDC, LDC, BASE_SHIFT
  159. #if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
  160. ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
  161. #endif
  162. #if defined(_AIX) || defined(__APPLE__)
  163. #ifdef __64BIT__
  164. ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
  165. #else
  166. #ifdef DOUBLE
  167. lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
  168. #else
  169. lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
  170. #endif
  171. #endif
  172. #endif
  /* Floating-point register aliases for the multiply-accumulate loops. */
  /*   A1-A6  (f16-f21) hold elements loaded from the A panel via AO.   */
  /*   B1-B10 (f22-f31) hold elements loaded from the B panel via B/BO. */
  /* f0-f15 serve as the 4x4 block of accumulators (they are cleared    */
  /* from f0 before the loop and updated by FMADD inside it).           */
  /* The prologue saves f14-f31 to the stack, which covers all of these */
  /* aliases. After the loops, the solve phase reuses f16-f31 directly  */
  /* by register number rather than through these names.                */
  173. #define A1 f16
  174. #define A2 f17
  175. #define A3 f18
  176. #define A4 f19
  177. #define A5 f20
  178. #define A6 f21
  179. #define B1 f22
  180. #define B2 f23
  181. #define B3 f24
  182. #define B4 f25
  183. #define B5 f26
  184. #define B6 f27
  185. #define B7 f28
  186. #define B8 f29
  187. #define B9 f30
  188. #define B10 f31
  189. #ifdef LN
  190. mullw r0, M, K
  191. slwi r0, r0, BASE_SHIFT
  192. add A, A, r0
  193. slwi r0, M, BASE_SHIFT
  194. add C, C, r0
  195. #endif
  196. #ifdef RN
  197. neg KK, OFFSET
  198. #endif
  199. #ifdef RT
  200. mullw r0, N, K
  201. slwi r0, r0, BASE_SHIFT
  202. add B, B, r0
  203. mullw r0, N, LDC
  204. add C, C, r0
  205. sub KK, N, OFFSET
  206. #endif
  207. cmpwi cr0, M, 0
  208. ble .L999
  209. cmpwi cr0, N, 0
  210. ble .L999
  211. cmpwi cr0, K, 0
  212. ble .L999
  213. lfs f0, FZERO
  214. srawi. J, N, 2
  215. ble .L40
  216. .align 4
  217. .L10:
  218. #ifdef RT
  219. slwi r0, K, 2 + BASE_SHIFT
  220. sub B, B, r0
  221. slwi r0, LDC, 2
  222. sub C, C, r0
  223. #endif
  224. mr CO1, C
  225. add CO2, C, LDC
  226. add CO3, CO2, LDC
  227. add CO4, CO3, LDC
  228. #ifdef LN
  229. add KK, M, OFFSET
  230. #endif
  231. #ifdef LT
  232. mr KK, OFFSET
  233. #endif
  234. fmr f1, f0
  235. fmr f2, f0
  236. fmr f3, f0
  237. fmr f4, f0
  238. fmr f5, f0
  239. fmr f6, f0
  240. fmr f7, f0
  241. fmr f8, f0
  242. fmr f9, f0
  243. fmr f10, f0
  244. fmr f11, f0
  245. fmr f12, f0
  246. fmr f13, f0
  247. fmr f14, f0
  248. fmr f15, f0
  249. srawi. I, M, 2
  250. #if defined(LN) || defined(RT)
  251. mr AORIG, A
  252. #else
  253. mr AO, A
  254. #endif
  255. #ifndef RT
  256. add C, CO4, LDC
  257. #endif
  258. ble .L20
  259. .align 4
  260. .L11:
  261. #if defined(LT) || defined(RN)
  262. LFD A1, 0 * SIZE(AO)
  263. LFD A2, 1 * SIZE(AO)
  264. LFD A4, 4 * SIZE(AO)
  265. LFD A5, 8 * SIZE(AO)
  266. LFD B1, 0 * SIZE(B)
  267. LFD B2, 1 * SIZE(B)
  268. LFD B3, 2 * SIZE(B)
  269. LFD B4, 3 * SIZE(B)
  270. LFD B5, 4 * SIZE(B)
  271. LFD B6, 8 * SIZE(B)
  272. LFD B7, 12 * SIZE(B)
  273. srawi. r0, KK, 2
  274. mtspr CTR, r0
  275. mr BO, B
  276. #else
  277. #ifdef LN
  278. slwi r0, K, 2 + BASE_SHIFT
  279. sub AORIG, AORIG, r0
  280. #endif
  281. slwi TEMP, KK, 2 + BASE_SHIFT
  282. add AO, AORIG, TEMP
  283. add BO, B, TEMP
  284. sub TEMP, K, KK
  285. LFD A1, 0 * SIZE(AO)
  286. LFD A2, 1 * SIZE(AO)
  287. LFD A4, 4 * SIZE(AO)
  288. LFD A5, 8 * SIZE(AO)
  289. LFD B1, 0 * SIZE(BO)
  290. LFD B2, 1 * SIZE(BO)
  291. LFD B3, 2 * SIZE(BO)
  292. LFD B4, 3 * SIZE(BO)
  293. LFD B5, 4 * SIZE(BO)
  294. LFD B6, 8 * SIZE(BO)
  295. LFD B7, 12 * SIZE(BO)
  296. srawi. r0, TEMP, 2
  297. mtspr CTR, r0
  298. #endif
  299. ble .L15
  300. .align 4
  301. .L12:
  302. FMADD f0, A1, B1, f0
  303. LFD A3, 2 * SIZE(AO)
  304. FMADD f4, A1, B2, f4
  305. LFD A6, 12 * SIZE(AO)
  306. FMADD f8, A1, B3, f8
  307. nop
  308. FMADD f12, A1, B4, f12
  309. nop
  310. FMADD f1, A2, B1, f1
  311. LFD A1, 3 * SIZE(AO)
  312. FMADD f5, A2, B2, f5
  313. nop
  314. FMADD f9, A2, B3, f9
  315. nop
  316. FMADD f13, A2, B4, f13
  317. nop
  318. FMADD f2, A3, B1, f2
  319. nop
  320. FMADD f6, A3, B2, f6
  321. LFD B8, 5 * SIZE(BO)
  322. FMADD f10, A3, B3, f10
  323. LFD B9, 6 * SIZE(BO)
  324. FMADD f14, A3, B4, f14
  325. LFD B10, 7 * SIZE(BO)
  326. FMADD f3, A1, B1, f3
  327. LFD A2, 5 * SIZE(AO)
  328. FMADD f7, A1, B2, f7
  329. LFD B1, 16 * SIZE(BO)
  330. FMADD f11, A1, B3, f11
  331. nop
  332. FMADD f15, A1, B4, f15
  333. nop
  334. FMADD f0, A4, B5, f0
  335. LFD A3, 6 * SIZE(AO)
  336. FMADD f4, A4, B8, f4
  337. LFD A1, 16 * SIZE(AO)
  338. FMADD f8, A4, B9, f8
  339. nop
  340. FMADD f12, A4, B10, f12
  341. nop
  342. FMADD f1, A2, B5, f1
  343. LFD A4, 7 * SIZE(AO)
  344. FMADD f5, A2, B8, f5
  345. nop
  346. FMADD f9, A2, B9, f9
  347. nop
  348. FMADD f13, A2, B10, f13
  349. nop
  350. FMADD f2, A3, B5, f2
  351. nop
  352. FMADD f6, A3, B8, f6
  353. LFD B2, 9 * SIZE(BO)
  354. FMADD f10, A3, B9, f10
  355. LFD B3, 10 * SIZE(BO)
  356. FMADD f14, A3, B10, f14
  357. LFD B4, 11 * SIZE(BO)
  358. FMADD f3, A4, B5, f3
  359. LFD A2, 9 * SIZE(AO)
  360. FMADD f7, A4, B8, f7
  361. LFD B5, 20 * SIZE(BO)
  362. FMADD f11, A4, B9, f11
  363. nop
  364. FMADD f15, A4, B10, f15
  365. nop
  366. FMADD f0, A5, B6, f0
  367. LFD A3, 10 * SIZE(AO)
  368. FMADD f4, A5, B2, f4
  369. LFD A4, 20 * SIZE(AO)
  370. FMADD f8, A5, B3, f8
  371. nop
  372. FMADD f12, A5, B4, f12
  373. nop
  374. FMADD f1, A2, B6, f1
  375. LFD A5, 11 * SIZE(AO)
  376. FMADD f5, A2, B2, f5
  377. nop
  378. FMADD f9, A2, B3, f9
  379. nop
  380. FMADD f13, A2, B4, f13
  381. nop
  382. FMADD f2, A3, B6, f2
  383. nop
  384. FMADD f6, A3, B2, f6
  385. LFD B8, 13 * SIZE(BO)
  386. FMADD f10, A3, B3, f10
  387. LFD B9, 14 * SIZE(BO)
  388. FMADD f14, A3, B4, f14
  389. LFD B10,15 * SIZE(BO)
  390. FMADD f3, A5, B6, f3
  391. LFD A2, 13 * SIZE(AO)
  392. FMADD f7, A5, B2, f7
  393. LFD B6, 24 * SIZE(BO)
  394. FMADD f11, A5, B3, f11
  395. nop
  396. FMADD f15, A5, B4, f15
  397. nop
  398. FMADD f0, A6, B7, f0
  399. LFD A3, 14 * SIZE(AO)
  400. FMADD f4, A6, B8, f4
  401. LFD A5, 24 * SIZE(AO)
  402. FMADD f8, A6, B9, f8
  403. nop
  404. FMADD f12, A6, B10, f12
  405. nop
  406. FMADD f1, A2, B7, f1
  407. LFD A6, 15 * SIZE(AO)
  408. FMADD f5, A2, B8, f5
  409. nop
  410. FMADD f9, A2, B9, f9
  411. nop
  412. FMADD f13, A2, B10, f13
  413. nop
  414. FMADD f2, A3, B7, f2
  415. addi AO, AO, 16 * SIZE
  416. FMADD f6, A3, B8, f6
  417. LFD B2, 17 * SIZE(BO)
  418. FMADD f10, A3, B9, f10
  419. LFD B3, 18 * SIZE(BO)
  420. FMADD f14, A3, B10, f14
  421. LFD B4, 19 * SIZE(BO)
  422. FMADD f3, A6, B7, f3
  423. LFD A2, 1 * SIZE(AO)
  424. FMADD f7, A6, B8, f7
  425. LFD B7, 28 * SIZE(BO)
  426. FMADD f11, A6, B9, f11
  427. addi BO, BO, 16 * SIZE
  428. FMADD f15, A6, B10, f15
  429. bdnz .L12
  430. .align 4
  431. .L15:
  432. #if defined(LT) || defined(RN)
  433. andi. r0, KK, 3
  434. #else
  435. andi. r0, TEMP, 3
  436. #endif
  437. mtspr CTR, r0
  438. ble+ .L18
  439. .align 4
  440. .L16:
  441. FMADD f0, A1, B1, f0
  442. LFD A3, 2 * SIZE(AO)
  443. FMADD f4, A1, B2, f4
  444. FMADD f8, A1, B3, f8
  445. FMADD f12, A1, B4, f12
  446. LFD A4, 3 * SIZE(AO)
  447. FMADD f1, A2, B1, f1
  448. FMADD f5, A2, B2, f5
  449. FMADD f9, A2, B3, f9
  450. FMADD f13, A2, B4, f13
  451. LFDU A1, 4 * SIZE(AO)
  452. FMADD f2, A3, B1, f2
  453. FMADD f6, A3, B2, f6
  454. FMADD f10, A3, B3, f10
  455. FMADD f14, A3, B4, f14
  456. LFD A2, 1 * SIZE(AO)
  457. FMADD f3, A4, B1, f3
  458. LFDU B1, 4 * SIZE(BO)
  459. FMADD f7, A4, B2, f7
  460. LFD B2, 1 * SIZE(BO)
  461. FMADD f11, A4, B3, f11
  462. LFD B3, 2 * SIZE(BO)
  463. FMADD f15, A4, B4, f15
  464. LFD B4, 3 * SIZE(BO)
  465. bdnz .L16
  466. .align 4
  467. .L18:
  468. #if defined(LN) || defined(RT)
  469. subi r0, KK, 4
  470. slwi r0, r0, 2 + BASE_SHIFT
  471. add AO, AORIG, r0
  472. add BO, B, r0
  473. #endif
  474. #if defined(LN) || defined(LT)
  475. LFD f16, 0 * SIZE(BO)
  476. LFD f17, 1 * SIZE(BO)
  477. LFD f18, 2 * SIZE(BO)
  478. LFD f19, 3 * SIZE(BO)
  479. LFD f20, 4 * SIZE(BO)
  480. LFD f21, 5 * SIZE(BO)
  481. LFD f22, 6 * SIZE(BO)
  482. LFD f23, 7 * SIZE(BO)
  483. LFD f24, 8 * SIZE(BO)
  484. LFD f25, 9 * SIZE(BO)
  485. LFD f26, 10 * SIZE(BO)
  486. LFD f27, 11 * SIZE(BO)
  487. LFD f28, 12 * SIZE(BO)
  488. LFD f29, 13 * SIZE(BO)
  489. LFD f30, 14 * SIZE(BO)
  490. LFD f31, 15 * SIZE(BO)
  491. FSUB f0, f16, f0
  492. FSUB f4, f17, f4
  493. FSUB f8, f18, f8
  494. FSUB f12, f19, f12
  495. FSUB f1, f20, f1
  496. FSUB f5, f21, f5
  497. FSUB f9, f22, f9
  498. FSUB f13, f23, f13
  499. FSUB f2, f24, f2
  500. FSUB f6, f25, f6
  501. FSUB f10, f26, f10
  502. FSUB f14, f27, f14
  503. FSUB f3, f28, f3
  504. FSUB f7, f29, f7
  505. FSUB f11, f30, f11
  506. FSUB f15, f31, f15
  507. #else
  508. LFD f16, 0 * SIZE(AO)
  509. LFD f17, 1 * SIZE(AO)
  510. LFD f18, 2 * SIZE(AO)
  511. LFD f19, 3 * SIZE(AO)
  512. LFD f20, 4 * SIZE(AO)
  513. LFD f21, 5 * SIZE(AO)
  514. LFD f22, 6 * SIZE(AO)
  515. LFD f23, 7 * SIZE(AO)
  516. LFD f24, 8 * SIZE(AO)
  517. LFD f25, 9 * SIZE(AO)
  518. LFD f26, 10 * SIZE(AO)
  519. LFD f27, 11 * SIZE(AO)
  520. LFD f28, 12 * SIZE(AO)
  521. LFD f29, 13 * SIZE(AO)
  522. LFD f30, 14 * SIZE(AO)
  523. LFD f31, 15 * SIZE(AO)
  524. FSUB f0, f16, f0
  525. FSUB f1, f17, f1
  526. FSUB f2, f18, f2
  527. FSUB f3, f19, f3
  528. FSUB f4, f20, f4
  529. FSUB f5, f21, f5
  530. FSUB f6, f22, f6
  531. FSUB f7, f23, f7
  532. FSUB f8, f24, f8
  533. FSUB f9, f25, f9
  534. FSUB f10, f26, f10
  535. FSUB f11, f27, f11
  536. FSUB f12, f28, f12
  537. FSUB f13, f29, f13
  538. FSUB f14, f30, f14
  539. FSUB f15, f31, f15
  540. #endif
  541. #ifdef LN
  542. LFD f16, 15 * SIZE(AO)
  543. LFD f17, 14 * SIZE(AO)
  544. LFD f18, 13 * SIZE(AO)
  545. LFD f19, 12 * SIZE(AO)
  546. FMUL f3, f16, f3
  547. FMUL f7, f16, f7
  548. FMUL f11, f16, f11
  549. FMUL f15, f16, f15
  550. FNMSUB f2, f17, f3, f2
  551. FNMSUB f6, f17, f7, f6
  552. FNMSUB f10, f17, f11, f10
  553. FNMSUB f14, f17, f15, f14
  554. FNMSUB f1, f18, f3, f1
  555. FNMSUB f5, f18, f7, f5
  556. FNMSUB f9, f18, f11, f9
  557. FNMSUB f13, f18, f15, f13
  558. FNMSUB f0, f19, f3, f0
  559. FNMSUB f4, f19, f7, f4
  560. FNMSUB f8, f19, f11, f8
  561. FNMSUB f12, f19, f15, f12
  562. LFD f16, 10 * SIZE(AO)
  563. LFD f17, 9 * SIZE(AO)
  564. LFD f18, 8 * SIZE(AO)
  565. LFD f19, 5 * SIZE(AO)
  566. FMUL f2, f16, f2
  567. FMUL f6, f16, f6
  568. FMUL f10, f16, f10
  569. FMUL f14, f16, f14
  570. LFD f20, 4 * SIZE(AO)
  571. LFD f21, 0 * SIZE(AO)
  572. FNMSUB f1, f17, f2, f1
  573. FNMSUB f5, f17, f6, f5
  574. FNMSUB f9, f17, f10, f9
  575. FNMSUB f13, f17, f14, f13
  576. FNMSUB f0, f18, f2, f0
  577. FNMSUB f4, f18, f6, f4
  578. FNMSUB f8, f18, f10, f8
  579. FNMSUB f12, f18, f14, f12
  580. FMUL f1, f19, f1
  581. FMUL f5, f19, f5
  582. FMUL f9, f19, f9
  583. FMUL f13, f19, f13
  584. FNMSUB f0, f20, f1, f0
  585. FNMSUB f4, f20, f5, f4
  586. FNMSUB f8, f20, f9, f8
  587. FNMSUB f12, f20, f13, f12
  588. FMUL f0, f21, f0
  589. FMUL f4, f21, f4
  590. FMUL f8, f21, f8
  591. FMUL f12, f21, f12
  592. #endif
  593. #ifdef LT
  594. LFD f16, 0 * SIZE(AO)
  595. LFD f17, 1 * SIZE(AO)
  596. LFD f18, 2 * SIZE(AO)
  597. LFD f19, 3 * SIZE(AO)
  598. FMUL f0, f16, f0
  599. FMUL f4, f16, f4
  600. FMUL f8, f16, f8
  601. FMUL f12, f16, f12
  602. FNMSUB f1, f17, f0, f1
  603. FNMSUB f5, f17, f4, f5
  604. FNMSUB f9, f17, f8, f9
  605. FNMSUB f13, f17, f12, f13
  606. FNMSUB f2, f18, f0, f2
  607. FNMSUB f6, f18, f4, f6
  608. FNMSUB f10, f18, f8, f10
  609. FNMSUB f14, f18, f12, f14
  610. FNMSUB f3, f19, f0, f3
  611. FNMSUB f7, f19, f4, f7
  612. FNMSUB f11, f19, f8, f11
  613. FNMSUB f15, f19, f12, f15
  614. LFD f16, 5 * SIZE(AO)
  615. LFD f17, 6 * SIZE(AO)
  616. LFD f18, 7 * SIZE(AO)
  617. LFD f19, 10 * SIZE(AO)
  618. FMUL f1, f16, f1
  619. FMUL f5, f16, f5
  620. FMUL f9, f16, f9
  621. FMUL f13, f16, f13
  622. LFD f20, 11 * SIZE(AO)
  623. LFD f21, 15 * SIZE(AO)
  624. FNMSUB f2, f17, f1, f2
  625. FNMSUB f6, f17, f5, f6
  626. FNMSUB f10, f17, f9, f10
  627. FNMSUB f14, f17, f13, f14
  628. FNMSUB f3, f18, f1, f3
  629. FNMSUB f7, f18, f5, f7
  630. FNMSUB f11, f18, f9, f11
  631. FNMSUB f15, f18, f13, f15
  632. FMUL f2, f19, f2
  633. FMUL f6, f19, f6
  634. FMUL f10, f19, f10
  635. FMUL f14, f19, f14
  636. FNMSUB f3, f20, f2, f3
  637. FNMSUB f7, f20, f6, f7
  638. FNMSUB f11, f20, f10, f11
  639. FNMSUB f15, f20, f14, f15
  640. FMUL f3, f21, f3
  641. FMUL f7, f21, f7
  642. FMUL f11, f21, f11
  643. FMUL f15, f21, f15
  644. #endif
  645. #ifdef RN
  646. LFD f16, 0 * SIZE(BO)
  647. LFD f17, 1 * SIZE(BO)
  648. LFD f18, 2 * SIZE(BO)
  649. LFD f19, 3 * SIZE(BO)
  650. FMUL f0, f16, f0
  651. FMUL f1, f16, f1
  652. FMUL f2, f16, f2
  653. FMUL f3, f16, f3
  654. FNMSUB f4, f17, f0, f4
  655. FNMSUB f5, f17, f1, f5
  656. FNMSUB f6, f17, f2, f6
  657. FNMSUB f7, f17, f3, f7
  658. FNMSUB f8, f18, f0, f8
  659. FNMSUB f9, f18, f1, f9
  660. FNMSUB f10, f18, f2, f10
  661. FNMSUB f11, f18, f3, f11
  662. FNMSUB f12, f19, f0, f12
  663. FNMSUB f13, f19, f1, f13
  664. FNMSUB f14, f19, f2, f14
  665. FNMSUB f15, f19, f3, f15
  666. LFD f16, 5 * SIZE(BO)
  667. LFD f17, 6 * SIZE(BO)
  668. LFD f18, 7 * SIZE(BO)
  669. LFD f19, 10 * SIZE(BO)
  670. FMUL f4, f16, f4
  671. FMUL f5, f16, f5
  672. FMUL f6, f16, f6
  673. FMUL f7, f16, f7
  674. LFD f20, 11 * SIZE(BO)
  675. LFD f21, 15 * SIZE(BO)
  676. FNMSUB f8, f17, f4, f8
  677. FNMSUB f9, f17, f5, f9
  678. FNMSUB f10, f17, f6, f10
  679. FNMSUB f11, f17, f7, f11
  680. FNMSUB f12, f18, f4, f12
  681. FNMSUB f13, f18, f5, f13
  682. FNMSUB f14, f18, f6, f14
  683. FNMSUB f15, f18, f7, f15
  684. FMUL f8, f19, f8
  685. FMUL f9, f19, f9
  686. FMUL f10, f19, f10
  687. FMUL f11, f19, f11
  688. FNMSUB f12, f20, f8, f12
  689. FNMSUB f13, f20, f9, f13
  690. FNMSUB f14, f20, f10, f14
  691. FNMSUB f15, f20, f11, f15
  692. FMUL f12, f21, f12
  693. FMUL f13, f21, f13
  694. FMUL f14, f21, f14
  695. FMUL f15, f21, f15
  696. #endif
  697. #ifdef RT
  698. LFD f16, 15 * SIZE(BO)
  699. LFD f17, 14 * SIZE(BO)
  700. LFD f18, 13 * SIZE(BO)
  701. LFD f19, 12 * SIZE(BO)
  702. FMUL f12, f16, f12
  703. FMUL f13, f16, f13
  704. FMUL f14, f16, f14
  705. FMUL f15, f16, f15
  706. FNMSUB f8, f17, f12, f8
  707. FNMSUB f9, f17, f13, f9
  708. FNMSUB f10, f17, f14, f10
  709. FNMSUB f11, f17, f15, f11
  710. FNMSUB f4, f18, f12, f4
  711. FNMSUB f5, f18, f13, f5
  712. FNMSUB f6, f18, f14, f6
  713. FNMSUB f7, f18, f15, f7
  714. FNMSUB f0, f19, f12, f0
  715. FNMSUB f1, f19, f13, f1
  716. FNMSUB f2, f19, f14, f2
  717. FNMSUB f3, f19, f15, f3
  718. LFD f16, 10 * SIZE(BO)
  719. LFD f17, 9 * SIZE(BO)
  720. LFD f18, 8 * SIZE(BO)
  721. LFD f19, 5 * SIZE(BO)
  722. FMUL f8, f16, f8
  723. FMUL f9, f16, f9
  724. FMUL f10, f16, f10
  725. FMUL f11, f16, f11
  726. LFD f20, 4 * SIZE(BO)
  727. LFD f21, 0 * SIZE(BO)
  728. FNMSUB f4, f17, f8, f4
  729. FNMSUB f5, f17, f9, f5
  730. FNMSUB f6, f17, f10, f6
  731. FNMSUB f7, f17, f11, f7
  732. FNMSUB f0, f18, f8, f0
  733. FNMSUB f1, f18, f9, f1
  734. FNMSUB f2, f18, f10, f2
  735. FNMSUB f3, f18, f11, f3
  736. FMUL f4, f19, f4
  737. FMUL f5, f19, f5
  738. FMUL f6, f19, f6
  739. FMUL f7, f19, f7
  740. FNMSUB f0, f20, f4, f0
  741. FNMSUB f1, f20, f5, f1
  742. FNMSUB f2, f20, f6, f2
  743. FNMSUB f3, f20, f7, f3
  744. FMUL f0, f21, f0
  745. FMUL f1, f21, f1
  746. FMUL f2, f21, f2
  747. FMUL f3, f21, f3
  748. #endif
  749. #ifdef LN
  750. subi CO1, CO1, 4 * SIZE
  751. subi CO2, CO2, 4 * SIZE
  752. subi CO3, CO3, 4 * SIZE
  753. subi CO4, CO4, 4 * SIZE
  754. #endif
  755. #if defined(LN) || defined(LT)
  756. STFD f0, 0 * SIZE(BO)
  757. STFD f4, 1 * SIZE(BO)
  758. STFD f8, 2 * SIZE(BO)
  759. STFD f12, 3 * SIZE(BO)
  760. STFD f1, 4 * SIZE(BO)
  761. STFD f5, 5 * SIZE(BO)
  762. STFD f9, 6 * SIZE(BO)
  763. STFD f13, 7 * SIZE(BO)
  764. STFD f2, 8 * SIZE(BO)
  765. STFD f6, 9 * SIZE(BO)
  766. STFD f10, 10 * SIZE(BO)
  767. STFD f14, 11 * SIZE(BO)
  768. STFD f3, 12 * SIZE(BO)
  769. STFD f7, 13 * SIZE(BO)
  770. STFD f11, 14 * SIZE(BO)
  771. STFD f15, 15 * SIZE(BO)
  772. #else
  773. STFD f0, 0 * SIZE(AO)
  774. STFD f1, 1 * SIZE(AO)
  775. STFD f2, 2 * SIZE(AO)
  776. STFD f3, 3 * SIZE(AO)
  777. STFD f4, 4 * SIZE(AO)
  778. STFD f5, 5 * SIZE(AO)
  779. STFD f6, 6 * SIZE(AO)
  780. STFD f7, 7 * SIZE(AO)
  781. STFD f8, 8 * SIZE(AO)
  782. STFD f9, 9 * SIZE(AO)
  783. STFD f10, 10 * SIZE(AO)
  784. STFD f11, 11 * SIZE(AO)
  785. STFD f12, 12 * SIZE(AO)
  786. STFD f13, 13 * SIZE(AO)
  787. STFD f14, 14 * SIZE(AO)
  788. STFD f15, 15 * SIZE(AO)
  789. #endif
  790. STFD f0, 0 * SIZE(CO1)
  791. STFD f1, 1 * SIZE(CO1)
  792. STFD f2, 2 * SIZE(CO1)
  793. STFD f3, 3 * SIZE(CO1)
  794. STFD f4, 0 * SIZE(CO2)
  795. STFD f5, 1 * SIZE(CO2)
  796. STFD f6, 2 * SIZE(CO2)
  797. STFD f7, 3 * SIZE(CO2)
  798. STFD f8, 0 * SIZE(CO3)
  799. STFD f9, 1 * SIZE(CO3)
  800. STFD f10, 2 * SIZE(CO3)
  801. STFD f11, 3 * SIZE(CO3)
  802. STFD f12, 0 * SIZE(CO4)
  803. STFD f13, 1 * SIZE(CO4)
  804. STFD f14, 2 * SIZE(CO4)
  805. STFD f15, 3 * SIZE(CO4)
  806. lfs f0, FZERO
  807. fmr f1, f0
  808. fmr f2, f0
  809. fmr f3, f0
  810. fmr f4, f0
  811. fmr f5, f0
  812. fmr f6, f0
  813. fmr f7, f0
  814. fmr f8, f0
  815. fmr f9, f0
  816. fmr f10, f0
  817. fmr f11, f0
  818. fmr f12, f0
  819. fmr f13, f0
  820. fmr f14, f0
  821. fmr f15, f0
  822. #ifndef LN
  823. addi CO1, CO1, 4 * SIZE
  824. addi CO2, CO2, 4 * SIZE
  825. addi CO3, CO3, 4 * SIZE
  826. addi CO4, CO4, 4 * SIZE
  827. #endif
  828. #ifdef RT
  829. slwi r0, K, 2 + BASE_SHIFT
  830. add AORIG, AORIG, r0
  831. #endif
  832. #if defined(LT) || defined(RN)
  833. sub TEMP, K, KK
  834. slwi TEMP, TEMP, 2 + BASE_SHIFT
  835. add AO, AO, TEMP
  836. add BO, BO, TEMP
  837. #endif
  838. #ifdef LT
  839. addi KK, KK, 4
  840. #endif
  841. #ifdef LN
  842. subi KK, KK, 4
  843. #endif
  844. addic. I, I, -1
  845. bgt+ .L11
  846. .align 4
  /* .L20: set up the 2-row tile of the current 4-column panel; only runs when bit 1 of M is set. */
  847. .L20:
  848. andi. I, M, 2
  /* andi. records into CR0; a zero result (bit clear) takes the ble to .L30. */
  849. ble .L30
  850. #if defined(LT) || defined(RN)
  /* LT/RN: preload four A values and eight B values for the first pass of .L22. */
  851. LFD f16, 0 * SIZE(AO)
  852. LFD f17, 1 * SIZE(AO)
  853. LFD f18, 2 * SIZE(AO)
  854. LFD f19, 3 * SIZE(AO)
  855. LFD f20, 0 * SIZE(B)
  856. LFD f21, 1 * SIZE(B)
  857. LFD f22, 2 * SIZE(B)
  858. LFD f23, 3 * SIZE(B)
  859. LFD f24, 4 * SIZE(B)
  860. LFD f25, 5 * SIZE(B)
  861. LFD f26, 6 * SIZE(B)
  862. LFD f27, 7 * SIZE(B)
  /* CTR = KK >> 2 (number of 4-step passes); srawi. also sets CR0 for the ble below. */
  863. srawi. r0, KK, 2
  864. mtspr CTR, r0
  865. mr BO, B
  866. #else
  867. #ifdef LN
  /* LN: step AORIG back by K << (1 + BASE_SHIFT) bytes before addressing this tile. */
  868. slwi r0, K, 1 + BASE_SHIFT
  869. sub AORIG, AORIG, r0
  870. #endif
  /* LN/RT: AO = AORIG + (KK << (1 + BASE_SHIFT)), BO = B + (KK << (2 + BASE_SHIFT)), TEMP = K - KK. */
  871. slwi r0, KK, 1 + BASE_SHIFT
  872. slwi TEMP, KK, 2 + BASE_SHIFT
  873. add AO, AORIG, r0
  874. add BO, B, TEMP
  875. sub TEMP, K, KK
  876. LFD f16, 0 * SIZE(AO)
  877. LFD f17, 1 * SIZE(AO)
  878. LFD f18, 2 * SIZE(AO)
  879. LFD f19, 3 * SIZE(AO)
  880. LFD f20, 0 * SIZE(BO)
  881. LFD f21, 1 * SIZE(BO)
  882. LFD f22, 2 * SIZE(BO)
  883. LFD f23, 3 * SIZE(BO)
  884. LFD f24, 4 * SIZE(BO)
  885. LFD f25, 5 * SIZE(BO)
  886. LFD f26, 6 * SIZE(BO)
  887. LFD f27, 7 * SIZE(BO)
  /* CTR = TEMP >> 2; CR0 from this srawi. is what the ble below tests. */
  888. srawi. r0, TEMP, 2
  889. mtspr CTR, r0
  890. #endif
  /* Skip the unrolled loop when the pass count is not positive. */
  891. ble .L25
  892. .align 5
  /*
   * .L22: unrolled multiply-accumulate loop for the 2x4 tile.
   * Each pass advances AO by 8 elements and BO by 16 elements (via the two
   * LFDU update-form loads), i.e. four k-steps of 2 A values x 4 B values.
   * Products using f16/f17 accumulate into f0,f1,f4,f5,f8,f9,f12,f13 and
   * products using f18/f19 accumulate into f2,f3,f6,f7,f10,f11,f14,f15;
   * the two sets are summed after the loop.
   * NOTE(review): the nop instructions look like issue-slot padding for the
   * target core -- confirm before removing or moving them.
   */
  893. .L22:
  894. FMADD f0, f16, f20, f0
  895. nop
  896. FMADD f1, f17, f20, f1
  897. LFD f20, 8 * SIZE(BO)
  898. FMADD f4, f16, f21, f4
  899. nop
  900. FMADD f5, f17, f21, f5
  901. LFD f21, 9 * SIZE(BO)
  902. FMADD f8, f16, f22, f8
  903. nop
  904. FMADD f9, f17, f22, f9
  905. LFD f22, 10 * SIZE(BO)
  906. FMADD f12, f16, f23, f12
  907. LFD f16, 4 * SIZE(AO)
  908. FMADD f13, f17, f23, f13
  909. LFD f23, 11 * SIZE(BO)
  910. FMADD f2, f18, f24, f2
  911. LFD f17, 5 * SIZE(AO)
  912. FMADD f3, f19, f24, f3
  913. LFD f24, 12 * SIZE(BO)
  914. FMADD f6, f18, f25, f6
  915. nop
  916. FMADD f7, f19, f25, f7
  917. LFD f25, 13 * SIZE(BO)
  918. FMADD f10, f18, f26, f10
  919. nop
  920. FMADD f11, f19, f26, f11
  921. LFD f26, 14 * SIZE(BO)
  922. FMADD f14, f18, f27, f14
  923. LFD f18, 6 * SIZE(AO)
  924. FMADD f15, f19, f27, f15
  925. LFD f27, 15 * SIZE(BO)
  926. FMADD f0, f16, f20, f0
  927. LFD f19, 7 * SIZE(AO)
  928. FMADD f1, f17, f20, f1
  /* Update-form load: BO moves forward 16 elements here, so the BO offsets below are relative to the new BO. */
  929. LFDU f20, 16 * SIZE(BO)
  930. FMADD f4, f16, f21, f4
  931. nop
  932. FMADD f5, f17, f21, f5
  933. LFD f21, 1 * SIZE(BO)
  934. FMADD f8, f16, f22, f8
  935. nop
  936. FMADD f9, f17, f22, f9
  937. LFD f22, 2 * SIZE(BO)
  938. FMADD f12, f16, f23, f12
  /* Update-form load: AO moves forward 8 elements here, so the AO offsets below are relative to the new AO. */
  939. LFDU f16, 8 * SIZE(AO)
  940. FMADD f13, f17, f23, f13
  941. LFD f23, 3 * SIZE(BO)
  942. FMADD f2, f18, f24, f2
  943. LFD f17, 1 * SIZE(AO)
  944. FMADD f3, f19, f24, f3
  945. LFD f24, 4 * SIZE(BO)
  946. FMADD f6, f18, f25, f6
  947. nop
  948. FMADD f7, f19, f25, f7
  949. LFD f25, 5 * SIZE(BO)
  950. FMADD f10, f18, f26, f10
  951. nop
  952. FMADD f11, f19, f26, f11
  953. LFD f26, 6 * SIZE(BO)
  954. FMADD f14, f18, f27, f14
  955. LFD f18, 2 * SIZE(AO)
  956. FMADD f15, f19, f27, f15
  957. LFD f19, 3 * SIZE(AO)
  958. LFD f27, 7 * SIZE(BO)
  /* Decrement CTR and repeat while it is non-zero. */
  959. bdnz .L22
  /* Fold the second accumulator set into the first; only reached when the loop ran at least once. */
  960. fadd f0, f2, f0
  961. fadd f1, f3, f1
  962. fadd f4, f6, f4
  963. fadd f5, f7, f5
  964. fadd f8, f10, f8
  965. fadd f9, f11, f9
  966. fadd f12, f14, f12
  967. fadd f13, f15, f13
  968. .align 4
  /* .L25: load CTR with the k-steps left over after the 4x-unrolled loop (count & 3). */
  969. .L25:
  970. #if defined(LT) || defined(RN)
  971. andi. r0, KK, 3
  972. #else
  973. andi. r0, TEMP, 3
  974. #endif
  975. mtspr CTR, r0
  /* CR0 still holds the andi. result; no remainder means go straight to the solve at .L28. */
  976. ble+ .L28
  977. .align 4
  /*
   * .L26: remainder loop for the 2x4 tile, one k-step per pass.
   * Uses f16/f17 (A) against f20..f23 (B) and accumulates into
   * f0,f1,f4,f5,f8,f9,f12,f13. The update-form loads advance BO by 4
   * elements and AO by 2 elements per pass.
   */
  978. .L26:
  979. FMADD f0, f16, f20, f0
  980. nop
  981. FMADD f1, f17, f20, f1
  982. LFDU f20, 4 * SIZE(BO)
  983. FMADD f4, f16, f21, f4
  984. nop
  985. FMADD f5, f17, f21, f5
  986. LFD f21, 1 * SIZE(BO)
  987. FMADD f8, f16, f22, f8
  988. nop
  989. FMADD f9, f17, f22, f9
  990. LFD f22, 2 * SIZE(BO)
  991. FMADD f12, f16, f23, f12
  992. LFDU f16, 2 * SIZE(AO)
  993. FMADD f13, f17, f23, f13
  994. LFD f17, 1 * SIZE(AO)
  995. LFD f23, 3 * SIZE(BO)
  996. bdnz .L26
  997. .align 4
  /*
   * .L28: finish the 2x4 tile -- subtract the accumulated products from the
   * stored right-hand side, apply the triangular solve for the selected
   * variant (LN/LT use AO, RN/RT use BO), write the result back to the
   * packed buffer and to C, then advance the pointers and KK.
   * Comments below assume the FSUB/FMUL/FNMSUB macros map to the like-named
   * PowerPC instructions (FSUB d,a,b: d = a - b; FNMSUB d,a,c,b: d = b - a*c).
   * NOTE(review): diagonal coefficients are applied with FMUL, so the packed
   * triangular data presumably holds reciprocals of the diagonal -- confirm
   * against the packing routine.
   */
  998. .L28:
  999. #if defined(LN) || defined(RT)
  /* LN/RT: re-point AO/BO at offset (KK - 2) for LN or (KK - 4) for RT, scaled by the tile widths. */
  1000. #ifdef LN
  1001. subi r0, KK, 2
  1002. #else
  1003. subi r0, KK, 4
  1004. #endif
  1005. slwi TEMP, r0, 1 + BASE_SHIFT
  1006. slwi r0, r0, 2 + BASE_SHIFT
  1007. add AO, AORIG, TEMP
  1008. add BO, B, r0
  1009. #endif
  1010. #if defined(LN) || defined(LT)
  /* LN/LT: right-hand side comes from BO; BO[0..3] pair with f0,f4,f8,f12 and BO[4..7] with f1,f5,f9,f13. */
  1011. LFD f16, 0 * SIZE(BO)
  1012. LFD f17, 1 * SIZE(BO)
  1013. LFD f18, 2 * SIZE(BO)
  1014. LFD f19, 3 * SIZE(BO)
  1015. LFD f20, 4 * SIZE(BO)
  1016. LFD f21, 5 * SIZE(BO)
  1017. LFD f22, 6 * SIZE(BO)
  1018. LFD f23, 7 * SIZE(BO)
  1019. FSUB f0, f16, f0
  1020. FSUB f4, f17, f4
  1021. FSUB f8, f18, f8
  1022. FSUB f12, f19, f12
  1023. FSUB f1, f20, f1
  1024. FSUB f5, f21, f5
  1025. FSUB f9, f22, f9
  1026. FSUB f13, f23, f13
  1027. #else
  /* RN/RT: right-hand side comes from AO; consecutive pairs map to (f0,f1), (f4,f5), (f8,f9), (f12,f13). */
  1028. LFD f16, 0 * SIZE(AO)
  1029. LFD f17, 1 * SIZE(AO)
  1030. LFD f20, 2 * SIZE(AO)
  1031. LFD f21, 3 * SIZE(AO)
  1032. LFD f24, 4 * SIZE(AO)
  1033. LFD f25, 5 * SIZE(AO)
  1034. LFD f28, 6 * SIZE(AO)
  1035. LFD f29, 7 * SIZE(AO)
  1036. FSUB f0, f16, f0
  1037. FSUB f1, f17, f1
  1038. FSUB f4, f20, f4
  1039. FSUB f5, f21, f5
  1040. FSUB f8, f24, f8
  1041. FSUB f9, f25, f9
  1042. FSUB f12, f28, f12
  1043. FSUB f13, f29, f13
  1044. #endif
  1045. #ifdef LN
  /* LN: 2x2 substitution from the last row up, using AO[3], AO[2], AO[0]. */
  1046. LFD f19, 3 * SIZE(AO)
  1047. LFD f20, 2 * SIZE(AO)
  1048. LFD f21, 0 * SIZE(AO)
  1049. FMUL f1, f19, f1
  1050. FMUL f5, f19, f5
  1051. FMUL f9, f19, f9
  1052. FMUL f13, f19, f13
  1053. FNMSUB f0, f20, f1, f0
  1054. FNMSUB f4, f20, f5, f4
  1055. FNMSUB f8, f20, f9, f8
  1056. FNMSUB f12, f20, f13, f12
  1057. FMUL f0, f21, f0
  1058. FMUL f4, f21, f4
  1059. FMUL f8, f21, f8
  1060. FMUL f12, f21, f12
  1061. #endif
  1062. #ifdef LT
  /* LT: 2x2 substitution from the first row down, using AO[0], AO[1], AO[3]. */
  1063. LFD f16, 0 * SIZE(AO)
  1064. LFD f17, 1 * SIZE(AO)
  1065. FMUL f0, f16, f0
  1066. FMUL f4, f16, f4
  1067. FMUL f8, f16, f8
  1068. FMUL f12, f16, f12
  1069. FNMSUB f1, f17, f0, f1
  1070. FNMSUB f5, f17, f4, f5
  1071. FNMSUB f9, f17, f8, f9
  1072. FNMSUB f13, f17, f12, f13
  1073. LFD f17, 3 * SIZE(AO)
  1074. FMUL f1, f17, f1
  1075. FMUL f5, f17, f5
  1076. FMUL f9, f17, f9
  1077. FMUL f13, f17, f13
  1078. #endif
  1079. #ifdef RN
  /* RN: 4x4 substitution across columns in order (f0,f1) -> (f4,f5) -> (f8,f9) -> (f12,f13), using BO[0..3], BO[5..7], BO[10], BO[11], BO[15]. */
  1080. LFD f16, 0 * SIZE(BO)
  1081. LFD f17, 1 * SIZE(BO)
  1082. LFD f18, 2 * SIZE(BO)
  1083. LFD f19, 3 * SIZE(BO)
  1084. FMUL f0, f16, f0
  1085. FMUL f1, f16, f1
  1086. FNMSUB f4, f17, f0, f4
  1087. FNMSUB f5, f17, f1, f5
  1088. FNMSUB f8, f18, f0, f8
  1089. FNMSUB f9, f18, f1, f9
  1090. FNMSUB f12, f19, f0, f12
  1091. FNMSUB f13, f19, f1, f13
  1092. LFD f16, 5 * SIZE(BO)
  1093. LFD f17, 6 * SIZE(BO)
  1094. LFD f18, 7 * SIZE(BO)
  1095. LFD f19, 10 * SIZE(BO)
  1096. LFD f20, 11 * SIZE(BO)
  1097. LFD f21, 15 * SIZE(BO)
  1098. FMUL f4, f16, f4
  1099. FMUL f5, f16, f5
  1100. FNMSUB f8, f17, f4, f8
  1101. FNMSUB f9, f17, f5, f9
  1102. FNMSUB f12, f18, f4, f12
  1103. FNMSUB f13, f18, f5, f13
  1104. FMUL f8, f19, f8
  1105. FMUL f9, f19, f9
  1106. FNMSUB f12, f20, f8, f12
  1107. FNMSUB f13, f20, f9, f13
  1108. FMUL f12, f21, f12
  1109. FMUL f13, f21, f13
  1110. #endif
  1111. #ifdef RT
  /* RT: 4x4 substitution in the reverse column order (f12,f13) -> (f8,f9) -> (f4,f5) -> (f0,f1), using BO[15..12], BO[10..8], BO[5], BO[4], BO[0]. */
  1112. LFD f16, 15 * SIZE(BO)
  1113. LFD f17, 14 * SIZE(BO)
  1114. LFD f18, 13 * SIZE(BO)
  1115. LFD f19, 12 * SIZE(BO)
  1116. FMUL f12, f16, f12
  1117. FMUL f13, f16, f13
  1118. FNMSUB f8, f17, f12, f8
  1119. FNMSUB f9, f17, f13, f9
  1120. FNMSUB f4, f18, f12, f4
  1121. FNMSUB f5, f18, f13, f5
  1122. FNMSUB f0, f19, f12, f0
  1123. FNMSUB f1, f19, f13, f1
  1124. LFD f16, 10 * SIZE(BO)
  1125. LFD f17, 9 * SIZE(BO)
  1126. LFD f18, 8 * SIZE(BO)
  1127. LFD f19, 5 * SIZE(BO)
  1128. LFD f20, 4 * SIZE(BO)
  1129. LFD f21, 0 * SIZE(BO)
  1130. FMUL f8, f16, f8
  1131. FMUL f9, f16, f9
  1132. FNMSUB f4, f17, f8, f4
  1133. FNMSUB f5, f17, f9, f5
  1134. FNMSUB f0, f18, f8, f0
  1135. FNMSUB f1, f18, f9, f1
  1136. FMUL f4, f19, f4
  1137. FMUL f5, f19, f5
  1138. FNMSUB f0, f20, f4, f0
  1139. FNMSUB f1, f20, f5, f1
  1140. FMUL f0, f21, f0
  1141. FMUL f1, f21, f1
  1142. #endif
  1143. #ifdef LN
  /* LN: the C pointers are moved back by two elements before the store (and are not advanced afterwards). */
  1144. subi CO1, CO1, 2 * SIZE
  1145. subi CO2, CO2, 2 * SIZE
  1146. subi CO3, CO3, 2 * SIZE
  1147. subi CO4, CO4, 2 * SIZE
  1148. #endif
  1149. #if defined(LN) || defined(LT)
  /* Write the solved values back over the right-hand side, in the same layout they were loaded from. */
  1150. STFD f0, 0 * SIZE(BO)
  1151. STFD f4, 1 * SIZE(BO)
  1152. STFD f8, 2 * SIZE(BO)
  1153. STFD f12, 3 * SIZE(BO)
  1154. STFD f1, 4 * SIZE(BO)
  1155. STFD f5, 5 * SIZE(BO)
  1156. STFD f9, 6 * SIZE(BO)
  1157. STFD f13, 7 * SIZE(BO)
  1158. #else
  1159. STFD f0, 0 * SIZE(AO)
  1160. STFD f1, 1 * SIZE(AO)
  1161. STFD f4, 2 * SIZE(AO)
  1162. STFD f5, 3 * SIZE(AO)
  1163. STFD f8, 4 * SIZE(AO)
  1164. STFD f9, 5 * SIZE(AO)
  1165. STFD f12, 6 * SIZE(AO)
  1166. STFD f13, 7 * SIZE(AO)
  1167. #endif
  /* Store the tile to C: two elements through each of CO1..CO4. */
  1168. STFD f0, 0 * SIZE(CO1)
  1169. STFD f1, 1 * SIZE(CO1)
  1170. STFD f4, 0 * SIZE(CO2)
  1171. STFD f5, 1 * SIZE(CO2)
  1172. STFD f8, 0 * SIZE(CO3)
  1173. STFD f9, 1 * SIZE(CO3)
  1174. STFD f12, 0 * SIZE(CO4)
  1175. STFD f13, 1 * SIZE(CO4)
  /* Reload f0 from FZERO and copy it into f1..f15 so the next tile starts with cleared accumulators. */
  1176. lfs f0, FZERO
  1177. fmr f1, f0
  1178. fmr f2, f0
  1179. fmr f3, f0
  1180. fmr f4, f0
  1181. fmr f5, f0
  1182. fmr f6, f0
  1183. fmr f7, f0
  1184. fmr f8, f0
  1185. fmr f9, f0
  1186. fmr f10, f0
  1187. fmr f11, f0
  1188. fmr f12, f0
  1189. fmr f13, f0
  1190. fmr f14, f0
  1191. fmr f15, f0
  1192. #ifndef LN
  1193. addi CO1, CO1, 2 * SIZE
  1194. addi CO2, CO2, 2 * SIZE
  1195. addi CO3, CO3, 2 * SIZE
  1196. addi CO4, CO4, 2 * SIZE
  1197. #endif
  1198. #ifdef RT
  /* RT: advance AORIG by K << (1 + BASE_SHIFT) bytes. */
  1199. slwi r0, K, 1 + BASE_SHIFT
  1200. add AORIG, AORIG, r0
  1201. #endif
  1202. #if defined(LT) || defined(RN)
  /* LT/RN: skip the remaining (K - KK) steps in both buffers: 2 elements per step in AO, 4 per step in BO. */
  1203. sub TEMP, K, KK
  1204. slwi r0, TEMP, 1 + BASE_SHIFT
  1205. slwi TEMP, TEMP, 2 + BASE_SHIFT
  1206. add AO, AO, r0
  1207. add BO, BO, TEMP
  1208. #endif
  1209. #ifdef LN
  1210. subi KK, KK, 2
  1211. #endif
  1212. #ifdef LT
  1213. addi KK, KK, 2
  1214. #endif
  1215. .align 4
  /* .L30: set up the 1-row tile of the current 4-column panel; only runs when bit 0 of M is set. */
  1216. .L30:
  1217. andi. I, M, 1
  /* A zero andi. result (bit clear) takes the ble to .L39. */
  1218. ble .L39
  1219. #if defined(LT) || defined(RN)
  /* LT/RN: preload four A values (one per k-step) and eight B values for the first pass of .L32. */
  1220. LFD f16, 0 * SIZE(AO)
  1221. LFD f17, 1 * SIZE(AO)
  1222. LFD f18, 2 * SIZE(AO)
  1223. LFD f19, 3 * SIZE(AO)
  1224. LFD f20, 0 * SIZE(B)
  1225. LFD f21, 1 * SIZE(B)
  1226. LFD f22, 2 * SIZE(B)
  1227. LFD f23, 3 * SIZE(B)
  1228. LFD f24, 4 * SIZE(B)
  1229. LFD f25, 5 * SIZE(B)
  1230. LFD f26, 6 * SIZE(B)
  1231. LFD f27, 7 * SIZE(B)
  /* CTR = KK >> 2; srawi. also sets CR0 for the ble below. */
  1232. srawi. r0, KK, 2
  1233. mtspr CTR, r0
  1234. mr BO, B
  1235. #else
  1236. #ifdef LN
  /* LN: step AORIG back by K << BASE_SHIFT bytes before addressing this tile. */
  1237. slwi r0, K, BASE_SHIFT
  1238. sub AORIG, AORIG, r0
  1239. #endif
  /* LN/RT: AO = AORIG + (KK << BASE_SHIFT), BO = B + (KK << (2 + BASE_SHIFT)), TEMP = K - KK. */
  1240. slwi r0, KK, 0 + BASE_SHIFT
  1241. slwi TEMP, KK, 2 + BASE_SHIFT
  1242. add AO, AORIG, r0
  1243. add BO, B, TEMP
  1244. sub TEMP, K, KK
  1245. LFD f16, 0 * SIZE(AO)
  1246. LFD f17, 1 * SIZE(AO)
  1247. LFD f18, 2 * SIZE(AO)
  1248. LFD f19, 3 * SIZE(AO)
  1249. LFD f20, 0 * SIZE(BO)
  1250. LFD f21, 1 * SIZE(BO)
  1251. LFD f22, 2 * SIZE(BO)
  1252. LFD f23, 3 * SIZE(BO)
  1253. LFD f24, 4 * SIZE(BO)
  1254. LFD f25, 5 * SIZE(BO)
  1255. LFD f26, 6 * SIZE(BO)
  1256. LFD f27, 7 * SIZE(BO)
  /* CTR = TEMP >> 2; CR0 from this srawi. is what the ble below tests. */
  1257. srawi. r0, TEMP, 2
  1258. mtspr CTR, r0
  1259. #endif
  /* Skip the unrolled loop when the pass count is not positive. */
  1260. ble .L35
  1261. .align 5
  /*
   * .L32: unrolled multiply-accumulate loop for the 1x4 tile.
   * Each pass advances AO by 4 elements and BO by 16 elements (four k-steps).
   * Products using f16 and f18 accumulate into f0,f4,f8,f12; products using
   * f17 and f19 accumulate into f1,f5,f9,f13; the two sets are summed after
   * the loop.
   */
  1262. .L32:
  1263. FMADD f0, f16, f20, f0
  1264. LFD f20, 8 * SIZE(BO)
  1265. FMADD f4, f16, f21, f4
  1266. LFD f21, 9 * SIZE(BO)
  1267. FMADD f8, f16, f22, f8
  1268. LFD f22, 10 * SIZE(BO)
  1269. FMADD f12, f16, f23, f12
  1270. LFD f23, 11 * SIZE(BO)
  /* Update-form load: AO moves forward 4 elements here, so the AO offsets below are relative to the new AO. */
  1271. LFDU f16, 4 * SIZE(AO)
  1272. FMADD f1, f17, f24, f1
  1273. LFD f24, 12 * SIZE(BO)
  1274. FMADD f5, f17, f25, f5
  1275. LFD f25, 13 * SIZE(BO)
  1276. FMADD f9, f17, f26, f9
  1277. LFD f26, 14 * SIZE(BO)
  1278. FMADD f13, f17, f27, f13
  1279. LFD f27, 15 * SIZE(BO)
  1280. LFD f17, 1 * SIZE(AO)
  1281. FMADD f0, f18, f20, f0
  /* Update-form load: BO moves forward 16 elements here, so the BO offsets below are relative to the new BO. */
  1282. LFDU f20, 16 * SIZE(BO)
  1283. FMADD f4, f18, f21, f4
  1284. LFD f21, 1 * SIZE(BO)
  1285. FMADD f8, f18, f22, f8
  1286. LFD f22, 2 * SIZE(BO)
  1287. FMADD f12, f18, f23, f12
  1288. LFD f23, 3 * SIZE(BO)
  1289. LFD f18, 2 * SIZE(AO)
  1290. FMADD f1, f19, f24, f1
  1291. LFD f24, 4 * SIZE(BO)
  1292. FMADD f5, f19, f25, f5
  1293. LFD f25, 5 * SIZE(BO)
  1294. FMADD f9, f19, f26, f9
  1295. LFD f26, 6 * SIZE(BO)
  1296. FMADD f13, f19, f27, f13
  1297. LFD f27, 7 * SIZE(BO)
  1298. LFD f19, 3 * SIZE(AO)
  /* Decrement CTR and repeat while it is non-zero. */
  1299. bdnz .L32
  /* Fold the second accumulator set into the first; only reached when the loop ran at least once. */
  1300. fadd f0, f1, f0
  1301. fadd f4, f5, f4
  1302. fadd f8, f9, f8
  1303. fadd f12, f13, f12
  1304. .align 4
  /* .L35: load CTR with the k-steps left over after the 4x-unrolled loop (count & 3). */
  1305. .L35:
  1306. #if defined(LT) || defined(RN)
  1307. andi. r0, KK, 3
  1308. #else
  1309. andi. r0, TEMP, 3
  1310. #endif
  1311. mtspr CTR, r0
  /* CR0 still holds the andi. result; no remainder means go straight to the solve at .L38. */
  1312. ble+ .L38
  1313. .align 4
  /*
   * .L36: remainder loop for the 1x4 tile, one k-step per pass.
   * Multiplies f16 (A) by f20..f23 (B) into f0,f4,f8,f12; the update-form
   * loads advance BO by 4 elements and AO by 1 element per pass.
   */
  1314. .L36:
  1315. FMADD f0, f16, f20, f0
  1316. LFDU f20, 4 * SIZE(BO)
  1317. FMADD f4, f16, f21, f4
  1318. LFD f21, 1 * SIZE(BO)
  1319. FMADD f8, f16, f22, f8
  1320. LFD f22, 2 * SIZE(BO)
  1321. FMADD f12, f16, f23, f12
  1322. LFDU f16, 1 * SIZE(AO)
  1323. LFD f23, 3 * SIZE(BO)
  1324. bdnz .L36
  1325. .align 4
  /*
   * .L38: finish the 1x4 tile -- subtract the accumulated products from the
   * stored right-hand side, apply the triangular solve for the selected
   * variant, write the result back to the packed buffer and to C, then
   * advance the pointers and KK.
   * Comments below assume the FSUB/FMUL/FNMSUB macros map to the like-named
   * PowerPC instructions (FSUB d,a,b: d = a - b; FNMSUB d,a,c,b: d = b - a*c).
   * NOTE(review): diagonal coefficients are applied with FMUL, so the packed
   * triangular data presumably holds reciprocals of the diagonal -- confirm.
   */
  1326. .L38:
  1327. #if defined(LN) || defined(RT)
  /* LN/RT: re-point AO/BO at offset (KK - 1) for LN or (KK - 4) for RT, scaled by the tile widths. */
  1328. #ifdef LN
  1329. subi r0, KK, 1
  1330. #else
  1331. subi r0, KK, 4
  1332. #endif
  1333. slwi TEMP, r0, 0 + BASE_SHIFT
  1334. slwi r0, r0, 2 + BASE_SHIFT
  1335. add AO, AORIG, TEMP
  1336. add BO, B, r0
  1337. #endif
  1338. #if defined(LN) || defined(LT)
  /* LN/LT: right-hand side comes from BO[0..3]. */
  1339. LFD f16, 0 * SIZE(BO)
  1340. LFD f17, 1 * SIZE(BO)
  1341. LFD f18, 2 * SIZE(BO)
  1342. LFD f19, 3 * SIZE(BO)
  1343. FSUB f0, f16, f0
  1344. FSUB f4, f17, f4
  1345. FSUB f8, f18, f8
  1346. FSUB f12, f19, f12
  1347. #else
  /* RN/RT: right-hand side comes from AO[0..3]. */
  1348. LFD f16, 0 * SIZE(AO)
  1349. LFD f20, 1 * SIZE(AO)
  1350. LFD f24, 2 * SIZE(AO)
  1351. LFD f28, 3 * SIZE(AO)
  1352. FSUB f0, f16, f0
  1353. FSUB f4, f20, f4
  1354. FSUB f8, f24, f8
  1355. FSUB f12, f28, f12
  1356. #endif
  1357. #ifdef LN
  /* LN: single-row case, just scale all four values by AO[0]. */
  1358. LFD f21, 0 * SIZE(AO)
  1359. FMUL f0, f21, f0
  1360. FMUL f4, f21, f4
  1361. FMUL f8, f21, f8
  1362. FMUL f12, f21, f12
  1363. #endif
  1364. #ifdef LT
  /* LT: single-row case, just scale all four values by AO[0]. */
  1365. LFD f16, 0 * SIZE(AO)
  1366. FMUL f0, f16, f0
  1367. FMUL f4, f16, f4
  1368. FMUL f8, f16, f8
  1369. FMUL f12, f16, f12
  1370. #endif
  1371. #ifdef RN
  /* RN: 4x4 substitution in the order f0 -> f4 -> f8 -> f12, using BO[0..3], BO[5..7], BO[10], BO[11], BO[15]. */
  1372. LFD f16, 0 * SIZE(BO)
  1373. LFD f17, 1 * SIZE(BO)
  1374. LFD f18, 2 * SIZE(BO)
  1375. LFD f19, 3 * SIZE(BO)
  1376. FMUL f0, f16, f0
  1377. FNMSUB f4, f17, f0, f4
  1378. FNMSUB f8, f18, f0, f8
  1379. FNMSUB f12, f19, f0, f12
  1380. LFD f16, 5 * SIZE(BO)
  1381. LFD f17, 6 * SIZE(BO)
  1382. LFD f18, 7 * SIZE(BO)
  1383. LFD f19, 10 * SIZE(BO)
  1384. LFD f20, 11 * SIZE(BO)
  1385. LFD f21, 15 * SIZE(BO)
  1386. FMUL f4, f16, f4
  1387. FNMSUB f8, f17, f4, f8
  1388. FNMSUB f12, f18, f4, f12
  1389. FMUL f8, f19, f8
  1390. FNMSUB f12, f20, f8, f12
  1391. FMUL f12, f21, f12
  1392. #endif
  1393. #ifdef RT
  /* RT: 4x4 substitution in the reverse order f12 -> f8 -> f4 -> f0, using BO[15..12], BO[10..8], BO[5], BO[4], BO[0]. */
  1394. LFD f16, 15 * SIZE(BO)
  1395. LFD f17, 14 * SIZE(BO)
  1396. LFD f18, 13 * SIZE(BO)
  1397. LFD f19, 12 * SIZE(BO)
  1398. FMUL f12, f16, f12
  1399. FNMSUB f8, f17, f12, f8
  1400. FNMSUB f4, f18, f12, f4
  1401. FNMSUB f0, f19, f12, f0
  1402. LFD f16, 10 * SIZE(BO)
  1403. LFD f17, 9 * SIZE(BO)
  1404. LFD f18, 8 * SIZE(BO)
  1405. LFD f19, 5 * SIZE(BO)
  1406. FMUL f8, f16, f8
  1407. LFD f20, 4 * SIZE(BO)
  1408. LFD f21, 0 * SIZE(BO)
  1409. FNMSUB f4, f17, f8, f4
  1410. FNMSUB f0, f18, f8, f0
  1411. FMUL f4, f19, f4
  1412. FNMSUB f0, f20, f4, f0
  1413. FMUL f0, f21, f0
  1414. #endif
  1415. #ifdef LN
  /* LN: the C pointers are moved back by one element before the store (and are not advanced afterwards). */
  1416. subi CO1, CO1, 1 * SIZE
  1417. subi CO2, CO2, 1 * SIZE
  1418. subi CO3, CO3, 1 * SIZE
  1419. subi CO4, CO4, 1 * SIZE
  1420. #endif
  1421. #if defined(LN) || defined(LT)
  /* Write the solved values back over the right-hand side they were loaded from. */
  1422. STFD f0, 0 * SIZE(BO)
  1423. STFD f4, 1 * SIZE(BO)
  1424. STFD f8, 2 * SIZE(BO)
  1425. STFD f12, 3 * SIZE(BO)
  1426. #else
  1427. STFD f0, 0 * SIZE(AO)
  1428. STFD f4, 1 * SIZE(AO)
  1429. STFD f8, 2 * SIZE(AO)
  1430. STFD f12, 3 * SIZE(AO)
  1431. #endif
  /* Store the tile to C: one element through each of CO1..CO4. */
  1432. STFD f0, 0 * SIZE(CO1)
  1433. STFD f4, 0 * SIZE(CO2)
  1434. STFD f8, 0 * SIZE(CO3)
  1435. STFD f12, 0 * SIZE(CO4)
  /* Clear only the accumulators this tile used: f0,f1,f4,f5,f8,f9,f12,f13. */
  1436. lfs f0, FZERO
  1437. fmr f1, f0
  1438. fmr f4, f0
  1439. fmr f5, f0
  1440. fmr f8, f0
  1441. fmr f9, f0
  1442. fmr f12, f0
  1443. fmr f13, f0
  1444. #ifndef LN
  1445. addi CO1, CO1, 1 * SIZE
  1446. addi CO2, CO2, 1 * SIZE
  1447. addi CO3, CO3, 1 * SIZE
  1448. addi CO4, CO4, 1 * SIZE
  1449. #endif
  1450. #ifdef RT
  /* RT: advance AORIG by K << BASE_SHIFT bytes. */
  1451. slwi r0, K, 0 + BASE_SHIFT
  1452. add AORIG, AORIG, r0
  1453. #endif
  1454. #if defined(LT) || defined(RN)
  /* LT/RN: skip the remaining (K - KK) steps in both buffers: 1 element per step in AO, 4 per step in BO. */
  1455. sub TEMP, K, KK
  1456. slwi r0, TEMP, 0 + BASE_SHIFT
  1457. slwi TEMP, TEMP, 2 + BASE_SHIFT
  1458. add AO, AO, r0
  1459. add BO, BO, TEMP
  1460. #endif
  1461. #ifdef LN
  1462. subi KK, KK, 1
  1463. #endif
  1464. #ifdef LT
  1465. addi KK, KK, 1
  1466. #endif
  1467. .align 4
  /* .L39: end of one 4-column panel -- advance B and KK for the active variant, then loop on J. */
  1468. .L39:
  1469. #ifdef LN
  /* LN: step B forward by K << (2 + BASE_SHIFT) bytes. */
  1470. slwi r0, K, 2 + BASE_SHIFT
  1471. add B, B, r0
  1472. #endif
  1473. #if defined(LT) || defined(RN)
  /* LT/RN: B takes over the position BO has reached. */
  1474. mr B, BO
  1475. #endif
  1476. #ifdef RN
  1477. addi KK, KK, 4
  1478. #endif
  1479. #ifdef RT
  1480. subi KK, KK, 4
  1481. #endif
  /* addic. decrements J and records into CR0; the lfs in between leaves CR0 untouched, so bgt tests the new J. */
  1482. addic. J, J, -1
  1483. lfs f0, FZERO
  1484. bgt .L10
  1485. .align 4
  /* .L40: set up the 2-column panel; only runs when bit 1 of N is set, otherwise branches to .L70. */
  1486. .L40:
  1487. andi. J, N, 2
  1488. ble .L70
  1489. #ifdef RT
  /* RT: move B back by K << (1 + BASE_SHIFT) bytes and C back by LDC << 1 before use. */
  1490. slwi r0, K, 1 + BASE_SHIFT
  1491. sub B, B, r0
  1492. slwi r0, LDC, 1
  1493. sub C, C, r0
  1494. #endif
  /* Column pointers for this panel: CO1 = C, CO2 = C + LDC. */
  1495. mr CO1, C
  1496. add CO2, C, LDC
  1497. #ifdef LN
  1498. add KK, M, OFFSET
  1499. #endif
  1500. #ifdef LT
  1501. mr KK, OFFSET
  1502. #endif
  /* Copy f0 into f1..f7. On the fall-through path from the J loop f0 was just loaded from FZERO; */
  /* NOTE(review): other entry paths to .L40 are outside this excerpt -- confirm f0 is zero there too. */
  1503. fmr f1, f0
  1504. fmr f2, f0
  1505. fmr f3, f0
  1506. fmr f4, f0
  1507. fmr f5, f0
  1508. fmr f6, f0
  1509. fmr f7, f0
  /* I = M >> 2 (count of 4-row tiles); CR0 from this srawi. is consumed by the ble at the end of the block. */
  1510. srawi. I, M, 2
  1511. #if defined(LN) || defined(RT)
  1512. mr AORIG, A
  1513. #else
  1514. mr AO, A
  1515. #endif
  1516. #ifndef RT
  /* Non-RT: move C past the two columns of this panel. */
  1517. add C, CO2, LDC
  1518. #endif
  /* No 4-row tiles: go to the 2-row remainder at .L50. */
  1519. ble .L50
  1520. .align 4
  /* .L41: per-tile setup for a 4-row x 2-column tile -- position AO/BO, preload operands, load CTR. */
  1521. .L41:
  1522. #if defined(LT) || defined(RN)
  /* LT/RN: preload A[0..3] and B[0..3]; the loop runs KK >> 2 unrolled passes starting at B. */
  1523. LFD f16, 0 * SIZE(AO)
  1524. LFD f17, 1 * SIZE(AO)
  1525. LFD f18, 2 * SIZE(AO)
  1526. LFD f19, 3 * SIZE(AO)
  1527. LFD f20, 0 * SIZE(B)
  1528. LFD f21, 1 * SIZE(B)
  1529. LFD f22, 2 * SIZE(B)
  1530. LFD f23, 3 * SIZE(B)
  1531. srawi. r0, KK, 2
  1532. mtspr CTR, r0
  1533. mr BO, B
  1534. #else
  1535. #ifdef LN
  /* LN: step AORIG back by K << (2 + BASE_SHIFT) bytes before addressing this tile. */
  1536. slwi r0, K, 2 + BASE_SHIFT
  1537. sub AORIG, AORIG, r0
  1538. #endif
  /* LN/RT: AO = AORIG + (KK << (2 + BASE_SHIFT)), BO = B + (KK << (1 + BASE_SHIFT)), TEMP = K - KK. */
  1539. slwi r0, KK, 2 + BASE_SHIFT
  1540. slwi TEMP, KK, 1 + BASE_SHIFT
  1541. add AO, AORIG, r0
  1542. add BO, B, TEMP
  1543. sub TEMP, K, KK
  1544. LFD f16, 0 * SIZE(AO)
  1545. LFD f17, 1 * SIZE(AO)
  1546. LFD f18, 2 * SIZE(AO)
  1547. LFD f19, 3 * SIZE(AO)
  1548. LFD f20, 0 * SIZE(BO)
  1549. LFD f21, 1 * SIZE(BO)
  1550. LFD f22, 2 * SIZE(BO)
  1551. LFD f23, 3 * SIZE(BO)
  1552. srawi. r0, TEMP, 2
  1553. mtspr CTR, r0
  1554. #endif
  /* CR0 comes from whichever srawi. ran above; skip the unrolled loop when the pass count is not positive. */
  1555. ble .L45
  1556. .align 5
  /*
   * .L42: unrolled multiply-accumulate loop for the 4x2 tile.
   * Each pass advances AO by 16 elements and BO by 8 elements (four k-steps
   * of 4 A values x 2 B values). f0..f3 accumulate the products with the
   * first B value of each step, f4..f7 the products with the second.
   */
  1557. .L42:
  1558. FMADD f0, f16, f20, f0
  1559. FMADD f1, f17, f20, f1
  1560. FMADD f2, f18, f20, f2
  1561. FMADD f3, f19, f20, f3
  1562. LFD f20, 4 * SIZE(BO)
  1563. FMADD f4, f16, f21, f4
  1564. LFD f16, 4 * SIZE(AO)
  1565. FMADD f5, f17, f21, f5
  1566. LFD f17, 5 * SIZE(AO)
  1567. FMADD f6, f18, f21, f6
  1568. LFD f18, 6 * SIZE(AO)
  1569. FMADD f7, f19, f21, f7
  1570. LFD f19, 7 * SIZE(AO)
  1571. FMADD f0, f16, f22, f0
  1572. LFD f21, 5 * SIZE(BO)
  1573. FMADD f1, f17, f22, f1
  1574. FMADD f2, f18, f22, f2
  1575. FMADD f3, f19, f22, f3
  1576. LFD f22, 6 * SIZE(BO)
  1577. FMADD f4, f16, f23, f4
  1578. LFD f16, 8 * SIZE(AO)
  1579. FMADD f5, f17, f23, f5
  1580. LFD f17, 9 * SIZE(AO)
  1581. FMADD f6, f18, f23, f6
  1582. LFD f18, 10 * SIZE(AO)
  1583. FMADD f7, f19, f23, f7
  1584. LFD f19, 11 * SIZE(AO)
  1585. FMADD f0, f16, f20, f0
  1586. LFD f23, 7 * SIZE(BO)
  1587. FMADD f1, f17, f20, f1
  1588. FMADD f2, f18, f20, f2
  1589. FMADD f3, f19, f20, f3
  /* Update-form load: BO moves forward 8 elements here, so the BO offsets below are relative to the new BO. */
  1590. LFDU f20, 8 * SIZE(BO)
  1591. FMADD f4, f16, f21, f4
  1592. LFD f16, 12 * SIZE(AO)
  1593. FMADD f5, f17, f21, f5
  1594. LFD f17, 13 * SIZE(AO)
  1595. FMADD f6, f18, f21, f6
  1596. LFD f18, 14 * SIZE(AO)
  1597. FMADD f7, f19, f21, f7
  1598. LFD f19, 15 * SIZE(AO)
  1599. FMADD f0, f16, f22, f0
  1600. LFD f21, 1 * SIZE(BO)
  1601. FMADD f1, f17, f22, f1
  1602. FMADD f2, f18, f22, f2
  1603. FMADD f3, f19, f22, f3
  1604. LFD f22, 2 * SIZE(BO)
  1605. FMADD f4, f16, f23, f4
  /* Update-form load: AO moves forward 16 elements here, so the AO offsets below are relative to the new AO. */
  1606. LFDU f16, 16 * SIZE(AO)
  1607. FMADD f5, f17, f23, f5
  1608. LFD f17, 1 * SIZE(AO)
  1609. FMADD f6, f18, f23, f6
  1610. LFD f18, 2 * SIZE(AO)
  1611. FMADD f7, f19, f23, f7
  1612. LFD f19, 3 * SIZE(AO)
  1613. LFD f23, 3 * SIZE(BO)
  /* Decrement CTR and repeat while it is non-zero. */
  1614. bdnz .L42
  1615. .align 4
  /* .L45: load CTR with the k-steps left over after the 4x-unrolled loop (count & 3). */
  1616. .L45:
  1617. #if defined(LT) || defined(RN)
  1618. andi. r0, KK, 3
  1619. #else
  1620. andi. r0, TEMP, 3
  1621. #endif
  1622. mtspr CTR, r0
  /* CR0 still holds the andi. result; no remainder means go straight to the solve at .L48. */
  1623. ble+ .L48
  1624. .align 4
  /*
   * .L46: remainder loop for the 4x2 tile, one k-step per pass.
   * f16..f19 (A) times f20 go into f0..f3 and times f21 into f4..f7; the
   * update-form loads advance BO by 2 elements and AO by 4 elements per pass.
   */
  1625. .L46:
  1626. FMADD f0, f16, f20, f0
  1627. FMADD f1, f17, f20, f1
  1628. FMADD f2, f18, f20, f2
  1629. FMADD f3, f19, f20, f3
  1630. LFDU f20, 2 * SIZE(BO)
  1631. FMADD f4, f16, f21, f4
  1632. LFDU f16, 4 * SIZE(AO)
  1633. FMADD f5, f17, f21, f5
  1634. LFD f17, 1 * SIZE(AO)
  1635. FMADD f6, f18, f21, f6
  1636. LFD f18, 2 * SIZE(AO)
  1637. FMADD f7, f19, f21, f7
  1638. LFD f19, 3 * SIZE(AO)
  1639. LFD f21, 1 * SIZE(BO)
  1640. bdnz .L46
  1641. .align 4
  /*
   * .L48: finish the 4x2 tile -- subtract the accumulated products from the
   * stored right-hand side, apply the triangular solve for the selected
   * variant (LN/LT use a 4x4 block from AO, RN/RT a 2x2 block from BO),
   * write the result back to the packed buffer and to C, advance the
   * pointers and KK, and loop to .L41 while I tiles remain.
   * Comments below assume the FSUB/FMUL/FNMSUB macros map to the like-named
   * PowerPC instructions (FSUB d,a,b: d = a - b; FNMSUB d,a,c,b: d = b - a*c).
   * NOTE(review): diagonal coefficients are applied with FMUL, so the packed
   * triangular data presumably holds reciprocals of the diagonal -- confirm.
   */
  1642. .L48:
  1643. #if defined(LN) || defined(RT)
  /* LN/RT: re-point AO/BO at offset (KK - 4) for LN or (KK - 2) for RT, scaled by the tile widths. */
  1644. #ifdef LN
  1645. subi r0, KK, 4
  1646. #else
  1647. subi r0, KK, 2
  1648. #endif
  1649. slwi TEMP, r0, 2 + BASE_SHIFT
  1650. slwi r0, r0, 1 + BASE_SHIFT
  1651. add AO, AORIG, TEMP
  1652. add BO, B, r0
  1653. #endif
  1654. #if defined(LN) || defined(LT)
  /* LN/LT: right-hand side comes from BO, stored as pairs (f0,f4), (f1,f5), (f2,f6), (f3,f7). */
  1655. LFD f16, 0 * SIZE(BO)
  1656. LFD f17, 1 * SIZE(BO)
  1657. LFD f20, 2 * SIZE(BO)
  1658. LFD f21, 3 * SIZE(BO)
  1659. LFD f24, 4 * SIZE(BO)
  1660. LFD f25, 5 * SIZE(BO)
  1661. LFD f28, 6 * SIZE(BO)
  1662. LFD f29, 7 * SIZE(BO)
  1663. FSUB f0, f16, f0
  1664. FSUB f4, f17, f4
  1665. FSUB f1, f20, f1
  1666. FSUB f5, f21, f5
  1667. FSUB f2, f24, f2
  1668. FSUB f6, f25, f6
  1669. FSUB f3, f28, f3
  1670. FSUB f7, f29, f7
  1671. #else
  /* RN/RT: right-hand side comes from AO; AO[0..3] pair with f0..f3 and AO[4..7] with f4..f7. */
  1672. LFD f16, 0 * SIZE(AO)
  1673. LFD f17, 1 * SIZE(AO)
  1674. LFD f18, 2 * SIZE(AO)
  1675. LFD f19, 3 * SIZE(AO)
  1676. LFD f20, 4 * SIZE(AO)
  1677. LFD f21, 5 * SIZE(AO)
  1678. LFD f22, 6 * SIZE(AO)
  1679. LFD f23, 7 * SIZE(AO)
  1680. FSUB f0, f16, f0
  1681. FSUB f1, f17, f1
  1682. FSUB f2, f18, f2
  1683. FSUB f3, f19, f3
  1684. FSUB f4, f20, f4
  1685. FSUB f5, f21, f5
  1686. FSUB f6, f22, f6
  1687. FSUB f7, f23, f7
  1688. #endif
  1689. #ifdef LN
  /* LN: 4x4 substitution from the last row up (f3/f7 first, f0/f4 last), using AO[15..12], AO[10..8], AO[5], AO[4], AO[0]. */
  1690. LFD f16, 15 * SIZE(AO)
  1691. LFD f17, 14 * SIZE(AO)
  1692. LFD f18, 13 * SIZE(AO)
  1693. LFD f19, 12 * SIZE(AO)
  1694. FMUL f3, f16, f3
  1695. FMUL f7, f16, f7
  1696. FNMSUB f2, f17, f3, f2
  1697. FNMSUB f6, f17, f7, f6
  1698. FNMSUB f1, f18, f3, f1
  1699. FNMSUB f5, f18, f7, f5
  1700. FNMSUB f0, f19, f3, f0
  1701. FNMSUB f4, f19, f7, f4
  1702. LFD f16, 10 * SIZE(AO)
  1703. LFD f17, 9 * SIZE(AO)
  1704. LFD f18, 8 * SIZE(AO)
  1705. LFD f19, 5 * SIZE(AO)
  1706. LFD f20, 4 * SIZE(AO)
  1707. LFD f21, 0 * SIZE(AO)
  1708. FMUL f2, f16, f2
  1709. FMUL f6, f16, f6
  1710. FNMSUB f1, f17, f2, f1
  1711. FNMSUB f5, f17, f6, f5
  1712. FNMSUB f0, f18, f2, f0
  1713. FNMSUB f4, f18, f6, f4
  1714. FMUL f1, f19, f1
  1715. FMUL f5, f19, f5
  1716. FNMSUB f0, f20, f1, f0
  1717. FNMSUB f4, f20, f5, f4
  1718. FMUL f0, f21, f0
  1719. FMUL f4, f21, f4
  1720. #endif
  1721. #ifdef LT
  /* LT: 4x4 substitution from the first row down (f0/f4 first, f3/f7 last), using AO[0..3], AO[5..7], AO[10], AO[11], AO[15]. */
  1722. LFD f16, 0 * SIZE(AO)
  1723. LFD f17, 1 * SIZE(AO)
  1724. LFD f18, 2 * SIZE(AO)
  1725. LFD f19, 3 * SIZE(AO)
  1726. FMUL f0, f16, f0
  1727. FMUL f4, f16, f4
  1728. FNMSUB f1, f17, f0, f1
  1729. FNMSUB f5, f17, f4, f5
  1730. FNMSUB f2, f18, f0, f2
  1731. FNMSUB f6, f18, f4, f6
  1732. FNMSUB f3, f19, f0, f3
  1733. FNMSUB f7, f19, f4, f7
  1734. LFD f17, 5 * SIZE(AO)
  1735. LFD f18, 6 * SIZE(AO)
  1736. LFD f19, 7 * SIZE(AO)
  1737. FMUL f1, f17, f1
  1738. FMUL f5, f17, f5
  1739. FNMSUB f2, f18, f1, f2
  1740. FNMSUB f6, f18, f5, f6
  1741. FNMSUB f3, f19, f1, f3
  1742. FNMSUB f7, f19, f5, f7
  1743. LFD f18, 10 * SIZE(AO)
  1744. LFD f19, 11 * SIZE(AO)
  1745. FMUL f2, f18, f2
  1746. FMUL f6, f18, f6
  1747. FNMSUB f3, f19, f2, f3
  1748. FNMSUB f7, f19, f6, f7
  1749. LFD f19, 15 * SIZE(AO)
  1750. FMUL f3, f19, f3
  1751. FMUL f7, f19, f7
  1752. #endif
  1753. #ifdef RN
  /* RN: 2x2 substitution, f0..f3 first then f4..f7, using BO[0], BO[1], BO[3]. */
  1754. LFD f16, 0 * SIZE(BO)
  1755. LFD f17, 1 * SIZE(BO)
  1756. LFD f18, 3 * SIZE(BO)
  1757. FMUL f0, f16, f0
  1758. FMUL f1, f16, f1
  1759. FMUL f2, f16, f2
  1760. FMUL f3, f16, f3
  1761. FNMSUB f4, f17, f0, f4
  1762. FNMSUB f5, f17, f1, f5
  1763. FNMSUB f6, f17, f2, f6
  1764. FNMSUB f7, f17, f3, f7
  1765. FMUL f4, f18, f4
  1766. FMUL f5, f18, f5
  1767. FMUL f6, f18, f6
  1768. FMUL f7, f18, f7
  1769. #endif
  1770. #ifdef RT
  /* RT: 2x2 substitution, f4..f7 first then f0..f3, using BO[3], BO[2], BO[0]. */
  1771. LFD f19, 3 * SIZE(BO)
  1772. LFD f20, 2 * SIZE(BO)
  1773. LFD f21, 0 * SIZE(BO)
  1774. FMUL f4, f19, f4
  1775. FMUL f5, f19, f5
  1776. FMUL f6, f19, f6
  1777. FMUL f7, f19, f7
  1778. FNMSUB f0, f20, f4, f0
  1779. FNMSUB f1, f20, f5, f1
  1780. FNMSUB f2, f20, f6, f2
  1781. FNMSUB f3, f20, f7, f3
  1782. FMUL f0, f21, f0
  1783. FMUL f1, f21, f1
  1784. FMUL f2, f21, f2
  1785. FMUL f3, f21, f3
  1786. #endif
  1787. #ifdef LN
  /* LN: the C pointers are moved back by four elements before the store (and are not advanced afterwards). */
  1788. subi CO1, CO1, 4 * SIZE
  1789. subi CO2, CO2, 4 * SIZE
  1790. #endif
  1791. #if defined(LN) || defined(LT)
  /* Write the solved values back over the right-hand side, in the same layout they were loaded from. */
  1792. STFD f0, 0 * SIZE(BO)
  1793. STFD f4, 1 * SIZE(BO)
  1794. STFD f1, 2 * SIZE(BO)
  1795. STFD f5, 3 * SIZE(BO)
  1796. STFD f2, 4 * SIZE(BO)
  1797. STFD f6, 5 * SIZE(BO)
  1798. STFD f3, 6 * SIZE(BO)
  1799. STFD f7, 7 * SIZE(BO)
  1800. #else
  1801. STFD f0, 0 * SIZE(AO)
  1802. STFD f1, 1 * SIZE(AO)
  1803. STFD f2, 2 * SIZE(AO)
  1804. STFD f3, 3 * SIZE(AO)
  1805. STFD f4, 4 * SIZE(AO)
  1806. STFD f5, 5 * SIZE(AO)
  1807. STFD f6, 6 * SIZE(AO)
  1808. STFD f7, 7 * SIZE(AO)
  1809. #endif
  /* Store the tile to C: four elements through CO1 (f0..f3) and four through CO2 (f4..f7). */
  1810. STFD f0, 0 * SIZE(CO1)
  1811. STFD f1, 1 * SIZE(CO1)
  1812. STFD f2, 2 * SIZE(CO1)
  1813. STFD f3, 3 * SIZE(CO1)
  1814. STFD f4, 0 * SIZE(CO2)
  1815. STFD f5, 1 * SIZE(CO2)
  1816. STFD f6, 2 * SIZE(CO2)
  1817. STFD f7, 3 * SIZE(CO2)
  /* Reload f0 from FZERO and copy it into f1..f7 so the next tile starts with cleared accumulators. */
  1818. lfs f0, FZERO
  1819. fmr f1, f0
  1820. fmr f2, f0
  1821. fmr f3, f0
  1822. fmr f4, f0
  1823. fmr f5, f0
  1824. fmr f6, f0
  1825. fmr f7, f0
  1826. #ifndef LN
  1827. addi CO1, CO1, 4 * SIZE
  1828. addi CO2, CO2, 4 * SIZE
  1829. #endif
  1830. #ifdef RT
  /* RT: advance AORIG by K << (2 + BASE_SHIFT) bytes. */
  1831. slwi r0, K, 2 + BASE_SHIFT
  1832. add AORIG, AORIG, r0
  1833. #endif
  1834. #if defined(LT) || defined(RN)
  /* LT/RN: skip the remaining (K - KK) steps in both buffers: 4 elements per step in AO, 2 per step in BO. */
  1835. sub TEMP, K, KK
  1836. slwi r0, TEMP, 2 + BASE_SHIFT
  1837. slwi TEMP, TEMP, 1 + BASE_SHIFT
  1838. add AO, AO, r0
  1839. add BO, BO, TEMP
  1840. #endif
  1841. #ifdef LN
  1842. subi KK, KK, 4
  1843. #endif
  1844. #ifdef LT
  1845. addi KK, KK, 4
  1846. #endif
  /* Decrement the tile counter I (recording into CR0) and repeat while it is still positive. */
  1847. addic. I, I, -1
  1848. bgt+ .L41
  1849. .align 4
  /* .L50: set up the 2-row tile of the 2-column panel; only runs when bit 1 of M is set. */
  1850. .L50:
  1851. andi. I, M, 2
  /* A zero andi. result (bit clear) takes the ble to .L60. */
  1852. ble .L60
  1853. #if defined(LT) || defined(RN)
  /* LT/RN: preload four A values and eight B values for the first pass of .L52. */
  1854. LFD f16, 0 * SIZE(AO)
  1855. LFD f17, 1 * SIZE(AO)
  1856. LFD f18, 2 * SIZE(AO)
  1857. LFD f19, 3 * SIZE(AO)
  1858. LFD f20, 0 * SIZE(B)
  1859. LFD f21, 1 * SIZE(B)
  1860. LFD f22, 2 * SIZE(B)
  1861. LFD f23, 3 * SIZE(B)
  1862. LFD f24, 4 * SIZE(B)
  1863. LFD f25, 5 * SIZE(B)
  1864. LFD f26, 6 * SIZE(B)
  1865. LFD f27, 7 * SIZE(B)
  /* CTR = KK >> 2; srawi. also sets CR0 for the ble below. */
  1866. srawi. r0, KK, 2
  1867. mtspr CTR, r0
  1868. mr BO, B
  1869. #else
  1870. #ifdef LN
  /* LN: step AORIG back by K << (1 + BASE_SHIFT) bytes before addressing this tile. */
  1871. slwi r0, K, 1 + BASE_SHIFT
  1872. sub AORIG, AORIG, r0
  1873. #endif
  /* LN/RT: AO = AORIG + (KK << (1 + BASE_SHIFT)), BO = B + (KK << (1 + BASE_SHIFT)), TEMP = K - KK. */
  1874. slwi r0, KK, 1 + BASE_SHIFT
  1875. slwi TEMP, KK, 1 + BASE_SHIFT
  1876. add AO, AORIG, r0
  1877. add BO, B, TEMP
  1878. sub TEMP, K, KK
  1879. LFD f16, 0 * SIZE(AO)
  1880. LFD f17, 1 * SIZE(AO)
  1881. LFD f18, 2 * SIZE(AO)
  1882. LFD f19, 3 * SIZE(AO)
  1883. LFD f20, 0 * SIZE(BO)
  1884. LFD f21, 1 * SIZE(BO)
  1885. LFD f22, 2 * SIZE(BO)
  1886. LFD f23, 3 * SIZE(BO)
  1887. LFD f24, 4 * SIZE(BO)
  1888. LFD f25, 5 * SIZE(BO)
  1889. LFD f26, 6 * SIZE(BO)
  1890. LFD f27, 7 * SIZE(BO)
  /* CTR = TEMP >> 2; CR0 from this srawi. is what the ble below tests. */
  1891. srawi. r0, TEMP, 2
  1892. mtspr CTR, r0
  1893. #endif
  /* Skip the unrolled loop when the pass count is not positive. */
  1894. ble .L55
  1895. .align 5
  /*
   * .L52: unrolled multiply-accumulate loop for the 2x2 tile.
   * Each pass advances AO by 8 elements and BO by 8 elements (four k-steps
   * of 2 A values x 2 B values). Products using f16/f17 accumulate into
   * f0..f3 and products using f18/f19 into f4..f7; the two sets are summed
   * by the FADDs at .L58.
   */
  1896. .L52:
  1897. FMADD f0, f16, f20, f0
  1898. FMADD f1, f17, f20, f1
  /* Update-form load: BO moves forward 8 elements here, so every later BO offset in this pass is relative to the new BO. */
  1899. LFDU f20, 8 * SIZE(BO)
  1900. FMADD f2, f16, f21, f2
  1901. LFD f16, 4 * SIZE(AO)
  1902. FMADD f3, f17, f21, f3
  1903. LFD f17, 5 * SIZE(AO)
  1904. FMADD f4, f18, f22, f4
  1905. LFD f21, 1 * SIZE(BO)
  1906. FMADD f5, f19, f22, f5
  1907. LFD f22, 2 * SIZE(BO)
  1908. FMADD f6, f18, f23, f6
  1909. LFD f18, 6 * SIZE(AO)
  1910. FMADD f7, f19, f23, f7
  1911. LFD f19, 7 * SIZE(AO)
  1912. FMADD f0, f16, f24, f0
  1913. LFD f23, 3 * SIZE(BO)
  1914. FMADD f1, f17, f24, f1
  1915. LFD f24, 4 * SIZE(BO)
  1916. FMADD f2, f16, f25, f2
  /* Update-form load: AO moves forward 8 elements here, so the AO offsets below are relative to the new AO. */
  1917. LFDU f16, 8 * SIZE(AO)
  1918. FMADD f3, f17, f25, f3
  1919. LFD f17, 1 * SIZE(AO)
  1920. FMADD f4, f18, f26, f4
  1921. LFD f25, 5 * SIZE(BO)
  1922. FMADD f5, f19, f26, f5
  1923. LFD f26, 6 * SIZE(BO)
  1924. FMADD f6, f18, f27, f6
  1925. LFD f18, 2 * SIZE(AO)
  1926. FMADD f7, f19, f27, f7
  1927. LFD f19, 3 * SIZE(AO)
  1928. LFD f27, 7 * SIZE(BO)
  /* Decrement CTR and repeat while it is non-zero. */
  1929. bdnz .L52
  1930. .align 4
  1931. .L55:
  1932. #if defined(LT) || defined(RN)
  1933. andi. r0, KK, 3
  1934. #else
  1935. andi. r0, TEMP, 3
  1936. #endif
  1937. mtspr CTR, r0
  1938. ble+ .L58
  1939. .align 4
  1940. .L56:
  1941. FMADD f0, f16, f20, f0
  1942. FMADD f1, f17, f20, f1
  1943. LFDU f20, 2 * SIZE(BO)
  1944. FMADD f2, f16, f21, f2
  1945. LFDU f16, 2 * SIZE(AO)
  1946. FMADD f3, f17, f21, f3
  1947. LFD f17, 1 * SIZE(AO)
  1948. LFD f21, 1 * SIZE(BO)
  1949. bdnz .L56
  1950. .align 4
  1951. .L58:
  1952. FADD f0, f4, f0
  1953. FADD f1, f5, f1
  1954. FADD f2, f6, f2
  1955. FADD f3, f7, f3
  1956. #if defined(LN) || defined(RT)
  1957. #ifdef LN
  1958. subi r0, KK, 2
  1959. #else
  1960. subi r0, KK, 2
  1961. #endif
  1962. slwi TEMP, r0, 1 + BASE_SHIFT
  1963. slwi r0, r0, 1 + BASE_SHIFT
  1964. add AO, AORIG, TEMP
  1965. add BO, B, r0
  1966. #endif
  1967. #if defined(LN) || defined(LT)
  1968. LFD f16, 0 * SIZE(BO)
  1969. LFD f17, 1 * SIZE(BO)
  1970. LFD f20, 2 * SIZE(BO)
  1971. LFD f21, 3 * SIZE(BO)
  1972. FSUB f0, f16, f0
  1973. FSUB f2, f17, f2
  1974. FSUB f1, f20, f1
  1975. FSUB f3, f21, f3
  1976. #else
  1977. LFD f16, 0 * SIZE(AO)
  1978. LFD f17, 1 * SIZE(AO)
  1979. LFD f20, 2 * SIZE(AO)
  1980. LFD f21, 3 * SIZE(AO)
  1981. FSUB f0, f16, f0
  1982. FSUB f1, f17, f1
  1983. FSUB f2, f20, f2
  1984. FSUB f3, f21, f3
  1985. #endif
  1986. #ifdef LN
  1987. LFD f19, 3 * SIZE(AO)
  1988. LFD f20, 2 * SIZE(AO)
  1989. LFD f21, 0 * SIZE(AO)
  1990. FMUL f1, f19, f1
  1991. FMUL f3, f19, f3
  1992. FNMSUB f0, f20, f1, f0
  1993. FNMSUB f2, f20, f3, f2
  1994. FMUL f0, f21, f0
  1995. FMUL f2, f21, f2
  1996. #endif
  1997. #ifdef LT
  1998. LFD f16, 0 * SIZE(AO)
  1999. LFD f17, 1 * SIZE(AO)
  2000. FMUL f0, f16, f0
  2001. FMUL f2, f16, f2
  2002. FNMSUB f1, f17, f0, f1
  2003. FNMSUB f3, f17, f2, f3
  2004. LFD f17, 3 * SIZE(AO)
  2005. FMUL f1, f17, f1
  2006. FMUL f3, f17, f3
  2007. #endif
  2008. #ifdef RN
  2009. LFD f16, 0 * SIZE(BO)
  2010. LFD f17, 1 * SIZE(BO)
  2011. LFD f18, 3 * SIZE(BO)
  2012. FMUL f0, f16, f0
  2013. FMUL f1, f16, f1
  2014. FNMSUB f2, f17, f0, f2
  2015. FNMSUB f3, f17, f1, f3
  2016. FMUL f2, f18, f2
  2017. FMUL f3, f18, f3
  2018. #endif
  2019. #ifdef RT
  2020. LFD f19, 3 * SIZE(BO)
  2021. LFD f20, 2 * SIZE(BO)
  2022. LFD f21, 0 * SIZE(BO)
  2023. FMUL f2, f19, f2
  2024. FMUL f3, f19, f3
  2025. FNMSUB f0, f20, f2, f0
  2026. FNMSUB f1, f20, f3, f1
  2027. FMUL f0, f21, f0
  2028. FMUL f1, f21, f1
  2029. #endif
  2030. #ifdef LN
  2031. subi CO1, CO1, 2 * SIZE
  2032. subi CO2, CO2, 2 * SIZE
  2033. #endif
  2034. #if defined(LN) || defined(LT)
  2035. STFD f0, 0 * SIZE(BO)
  2036. STFD f2, 1 * SIZE(BO)
  2037. STFD f1, 2 * SIZE(BO)
  2038. STFD f3, 3 * SIZE(BO)
  2039. #else
  2040. STFD f0, 0 * SIZE(AO)
  2041. STFD f1, 1 * SIZE(AO)
  2042. STFD f2, 2 * SIZE(AO)
  2043. STFD f3, 3 * SIZE(AO)
  2044. #endif
  2045. STFD f0, 0 * SIZE(CO1)
  2046. STFD f1, 1 * SIZE(CO1)
  2047. STFD f2, 0 * SIZE(CO2)
  2048. STFD f3, 1 * SIZE(CO2)
  2049. lfs f0, FZERO
  2050. fmr f1, f0
  2051. fmr f2, f0
  2052. fmr f3, f0
  2053. fmr f4, f0
  2054. fmr f5, f0
  2055. fmr f6, f0
  2056. fmr f7, f0
  2057. #ifndef LN
  2058. addi CO1, CO1, 2 * SIZE
  2059. addi CO2, CO2, 2 * SIZE
  2060. #endif
  2061. #ifdef RT
  2062. slwi r0, K, 1 + BASE_SHIFT
  2063. add AORIG, AORIG, r0
  2064. #endif
  2065. #if defined(LT) || defined(RN)
  2066. sub TEMP, K, KK
  2067. slwi r0, TEMP, 1 + BASE_SHIFT
  2068. slwi TEMP, TEMP, 1 + BASE_SHIFT
  2069. add AO, AO, r0
  2070. add BO, BO, TEMP
  2071. #endif
  2072. #ifdef LN
  2073. subi KK, KK, 2
  2074. #endif
  2075. #ifdef LT
  2076. addi KK, KK, 2
  2077. #endif
  2078. .align 4
  /*
   * .L60: 1x2 tile, executed only when bit 0 of M is set.
   *
   * Per step one value from AO is multiplied by two values from BO.
   * f0/f1 and f2/f3 are alternating accumulator pairs, summed at .L68.
   *
   * Upper-case mnemonics (LFD, LFDU, STFD, FMADD, FNMSUB, FMUL, FADD,
   * FSUB) and SIZE/BASE_SHIFT/FZERO are macros defined outside this
   * chunk; the comments assume they expand to the like-named PowerPC
   * floating-point instructions -- TODO confirm.
   */
  2079. .L60:
  2080. andi. I, M, 1
  /* Taken when bit 0 of M is clear (masked result is zero). */
  2081. ble .L69
  2082. #if defined(LT) || defined(RN)
  /* Preload 4 values from AO and 8 from B; the product loop runs over KK steps. */
  2083. LFD f16, 0 * SIZE(AO)
  2084. LFD f17, 1 * SIZE(AO)
  2085. LFD f18, 2 * SIZE(AO)
  2086. LFD f19, 3 * SIZE(AO)
  2087. LFD f20, 0 * SIZE(B)
  2088. LFD f21, 1 * SIZE(B)
  2089. LFD f22, 2 * SIZE(B)
  2090. LFD f23, 3 * SIZE(B)
  2091. LFD f24, 4 * SIZE(B)
  2092. LFD f25, 5 * SIZE(B)
  2093. LFD f26, 6 * SIZE(B)
  2094. LFD f27, 7 * SIZE(B)
  /* CTR = KK / 4. */
  2095. srawi. r0, KK, 2
  2096. mtspr CTR, r0
  2097. mr BO, B
  2098. #else
  2099. #ifdef LN
  /* LN: step AORIG back by K << BASE_SHIFT bytes. */
  2100. slwi r0, K, BASE_SHIFT
  2101. sub AORIG, AORIG, r0
  2102. #endif
  /* AO offset scales KK by (0 + BASE_SHIFT), BO offset by (1 + BASE_SHIFT); the loop covers K - KK steps. */
  2103. slwi r0, KK, 0 + BASE_SHIFT
  2104. slwi TEMP, KK, 1 + BASE_SHIFT
  2105. add AO, AORIG, r0
  2106. add BO, B, TEMP
  2107. sub TEMP, K, KK
  2108. LFD f16, 0 * SIZE(AO)
  2109. LFD f17, 1 * SIZE(AO)
  2110. LFD f18, 2 * SIZE(AO)
  2111. LFD f19, 3 * SIZE(AO)
  2112. LFD f20, 0 * SIZE(BO)
  2113. LFD f21, 1 * SIZE(BO)
  2114. LFD f22, 2 * SIZE(BO)
  2115. LFD f23, 3 * SIZE(BO)
  2116. LFD f24, 4 * SIZE(BO)
  2117. LFD f25, 5 * SIZE(BO)
  2118. LFD f26, 6 * SIZE(BO)
  2119. LFD f27, 7 * SIZE(BO)
  /* CTR = (K - KK) / 4. */
  2120. srawi. r0, TEMP, 2
  2121. mtspr CTR, r0
  2122. #endif
  /* CR0 still reflects the srawi. result: skip the unrolled loop when the count is <= 0. */
  2123. ble .L65
  2124. .align 5
  /*
   * .L62: four steps per iteration (8 FMADDs); AO advances by 4 * SIZE
   * and BO by 8 * SIZE via the LFDU instructions.  Loads for the next
   * iteration are interleaved with the arithmetic -- do not reorder.
   */
  2125. .L62:
  2126. FMADD f0, f16, f20, f0
  2127. LFDU f20, 8 * SIZE(BO)
  2128. FMADD f1, f16, f21, f1
  2129. LFDU f16, 4 * SIZE(AO)
  2130. LFD f21, 1 * SIZE(BO)
  2131. FMADD f2, f17, f22, f2
  2132. LFD f22, 2 * SIZE(BO)
  2133. FMADD f3, f17, f23, f3
  2134. LFD f17, 1 * SIZE(AO)
  2135. LFD f23, 3 * SIZE(BO)
  2136. FMADD f0, f18, f24, f0
  2137. LFD f24, 4 * SIZE(BO)
  2138. FMADD f1, f18, f25, f1
  2139. LFD f18, 2 * SIZE(AO)
  2140. LFD f25, 5 * SIZE(BO)
  2141. FMADD f2, f19, f26, f2
  2142. LFD f26, 6 * SIZE(BO)
  2143. FMADD f3, f19, f27, f3
  2144. LFD f19, 3 * SIZE(AO)
  2145. LFD f27, 7 * SIZE(BO)
  2146. bdnz .L62
  2147. .align 4
  /* .L65: remainder loop count = (step count) & 3. */
  2148. .L65:
  2149. #if defined(LT) || defined(RN)
  2150. andi. r0, KK, 3
  2151. #else
  2152. andi. r0, TEMP, 3
  2153. #endif
  2154. mtspr CTR, r0
  2155. ble+ .L68
  2156. .align 4
  /* .L66: one step per iteration; AO advances by 1 * SIZE, BO by 2 * SIZE. */
  2157. .L66:
  2158. FMADD f0, f16, f20, f0
  2159. LFDU f20, 2 * SIZE(BO)
  2160. FMADD f1, f16, f21, f1
  2161. LFDU f16, 1 * SIZE(AO)
  2162. LFD f21, 1 * SIZE(BO)
  2163. bdnz .L66
  2164. .align 4
  /* .L68: fold the second accumulator pair (f2, f3) into f0, f1. */
  2165. .L68:
  2166. FADD f0, f2, f0
  2167. FADD f1, f3, f1
  2168. #if defined(LN) || defined(RT)
  /* Re-point AO/BO: LN backs up by the tile's 1 row, RT by its 2 columns. */
  2169. #ifdef LN
  2170. subi r0, KK, 1
  2171. #else
  2172. subi r0, KK, 2
  2173. #endif
  2174. slwi TEMP, r0, 0 + BASE_SHIFT
  2175. slwi r0, r0, 1 + BASE_SHIFT
  2176. add AO, AORIG, TEMP
  2177. add BO, B, r0
  2178. #endif
  /* Load the right-hand side (from BO for LN/LT, from AO for RN/RT) and subtract the accumulated products. */
  2179. #if defined(LN) || defined(LT)
  2180. LFD f16, 0 * SIZE(BO)
  2181. LFD f17, 1 * SIZE(BO)
  2182. FSUB f0, f16, f0
  2183. FSUB f1, f17, f1
  2184. #else
  2185. LFD f16, 0 * SIZE(AO)
  2186. LFD f20, 1 * SIZE(AO)
  2187. FSUB f0, f16, f0
  2188. FSUB f1, f20, f1
  2189. #endif
  /*
   * Solve step.  For the left-side variants the A block is a single
   * element, so both results are just scaled by AO[0].  The diagonal is
   * applied with FMUL, so it is presumably stored pre-inverted --
   * TODO confirm against the packing code.
   */
  2190. #ifdef LN
  2191. LFD f21, 0 * SIZE(AO)
  2192. FMUL f0, f21, f0
  2193. FMUL f1, f21, f1
  2194. #endif
  2195. #ifdef LT
  2196. LFD f16, 0 * SIZE(AO)
  2197. FMUL f0, f16, f0
  2198. FMUL f1, f16, f1
  2199. #endif
  2200. #ifdef RN
  /* Forward over the 2x2 BO triangle: BO[0], BO[1], BO[3]. */
  2201. LFD f16, 0 * SIZE(BO)
  2202. LFD f17, 1 * SIZE(BO)
  2203. LFD f18, 3 * SIZE(BO)
  2204. FMUL f0, f16, f0
  2205. FNMSUB f1, f17, f0, f1
  2206. FMUL f1, f18, f1
  2207. #endif
  2208. #ifdef RT
  /* Backward over the 2x2 BO triangle: BO[3], BO[2], BO[0]. */
  2209. LFD f19, 3 * SIZE(BO)
  2210. LFD f20, 2 * SIZE(BO)
  2211. LFD f21, 0 * SIZE(BO)
  2212. FMUL f1, f19, f1
  2213. FNMSUB f0, f20, f1, f0
  2214. FMUL f0, f21, f0
  2215. #endif
  2216. #ifdef LN
  /* LN moves through C backwards: step the output pointers back before storing. */
  2217. subi CO1, CO1, 1 * SIZE
  2218. subi CO2, CO2, 1 * SIZE
  2219. #endif
  /* Write the result back into the packed buffer it was read from, then into C (one element per column pointer). */
  2220. #if defined(LN) || defined(LT)
  2221. STFD f0, 0 * SIZE(BO)
  2222. STFD f1, 1 * SIZE(BO)
  2223. #else
  2224. STFD f0, 0 * SIZE(AO)
  2225. STFD f1, 1 * SIZE(AO)
  2226. #endif
  2227. STFD f0, 0 * SIZE(CO1)
  2228. STFD f1, 0 * SIZE(CO2)
  /*
   * Reload f0 from FZERO (presumably a 0.0 constant) and clear f1, f4, f5.
   * NOTE(review): f2/f3, which this section used as accumulators, are
   * not cleared here.  The only code that follows on this path is .L69
   * and .L70, and .L70 copies f0 into f1-f3 before they are used again,
   * so this looks harmless -- confirm if control flow is ever changed.
   */
  2229. lfs f0, FZERO
  2230. fmr f1, f0
  2231. fmr f4, f0
  2232. fmr f5, f0
  2233. #ifndef LN
  2234. addi CO1, CO1, 1 * SIZE
  2235. addi CO2, CO2, 1 * SIZE
  2236. #endif
  2237. #ifdef RT
  /* RT: advance AORIG by K << BASE_SHIFT bytes. */
  2238. slwi r0, K, 0 + BASE_SHIFT
  2239. add AORIG, AORIG, r0
  2240. #endif
  2241. #if defined(LT) || defined(RN)
  /* Skip the K - KK steps that the product loop did not read, in both AO and BO. */
  2242. sub TEMP, K, KK
  2243. slwi r0, TEMP, 0 + BASE_SHIFT
  2244. slwi TEMP, TEMP, 1 + BASE_SHIFT
  2245. add AO, AO, r0
  2246. add BO, BO, TEMP
  2247. #endif
  /* The left-side variants adjust KK by the one row just handled. */
  2248. #ifdef LN
  2249. subi KK, KK, 1
  2250. #endif
  2251. #ifdef LT
  2252. addi KK, KK, 1
  2253. #endif
  2254. .align 4
  /*
   * .L69: bookkeeping after the two-column panel has been processed.
   * Each build variant moves B and/or KK past the two columns.
   */
  2255. .L69:
  2256. #ifdef LN
  /* LN: advance B by K << (1 + BASE_SHIFT) bytes. */
  2257. slwi r0, K, 1 + BASE_SHIFT
  2258. add B, B, r0
  2259. #endif
  2260. #if defined(LT) || defined(RN)
  /* LT/RN: BO has already been walked to the end of the panel, so adopt it as the new B. */
  2261. mr B, BO
  2262. #endif
  2263. #ifdef RN
  2264. addi KK, KK, 2
  2265. #endif
  2266. #ifdef RT
  2267. subi KK, KK, 2
  2268. #endif
  /* Reload f0 from FZERO (presumably a 0.0 constant); the code at .L70 copies it into f1-f3. */
  2269. lfs f0, FZERO
  2270. .align 4
  /*
   * .L70: set-up for the single-column panel, executed only when bit 0
   * of N is set.  Initialises the C/A/B pointers, KK and the
   * accumulators, then computes the number of 4-row tiles in I.
   */
  2271. .L70:
  2272. andi. J, N, 1
  /* Taken when bit 0 of N is clear: nothing left to do, go to the epilogue. */
  2273. ble .L999
  2274. #ifdef RT
  /* RT: step B back by K << BASE_SHIFT bytes and C back by LDC. */
  2275. slwi r0, K, 0 + BASE_SHIFT
  2276. sub B, B, r0
  2277. sub C, C, LDC
  2278. #endif
  2279. mr CO1, C
  2280. #ifdef LN
  2281. add KK, M, OFFSET
  2282. #endif
  2283. #ifdef LT
  2284. mr KK, OFFSET
  2285. #endif
  /* Clear accumulators f1-f3 from f0 (f0 was loaded from FZERO just before this label on the fall-through path). */
  2286. fmr f1, f0
  2287. fmr f2, f0
  2288. fmr f3, f0
  /* I = M / 4; the CR0 result set here is consumed by the ble below (mr/add do not alter it). */
  2289. srawi. I, M, 2
  2290. #if defined(LN) || defined(RT)
  2291. mr AORIG, A
  2292. #else
  2293. mr AO, A
  2294. #endif
  2295. #ifndef RT
  /* Non-RT variants advance C by LDC for whatever follows. */
  2296. add C, CO1, LDC
  2297. #endif
  /* No full 4-row tiles: skip to the 2-row remainder. */
  2298. ble .L80
  2299. .align 4
  /*
   * .L71: 4x1 tile loop, repeated I times.
   *
   * Per step four values from AO are multiplied by one value from BO
   * and accumulated in f0-f3 (a single accumulator set, so no reduction
   * is needed before the solve).
   *
   * Upper-case mnemonics (LFD, LFDU, STFD, FMADD, FNMSUB, FMUL, FSUB)
   * and SIZE/BASE_SHIFT/FZERO are macros defined outside this chunk;
   * the comments assume they expand to the like-named PowerPC
   * floating-point instructions -- TODO confirm.
   */
  2300. .L71:
  2301. #if defined(LT) || defined(RN)
  /* Preload 4 values from AO and 4 from B; the product loop runs over KK steps. */
  2302. LFD f16, 0 * SIZE(AO)
  2303. LFD f17, 1 * SIZE(AO)
  2304. LFD f18, 2 * SIZE(AO)
  2305. LFD f19, 3 * SIZE(AO)
  2306. LFD f20, 0 * SIZE(B)
  2307. LFD f21, 1 * SIZE(B)
  2308. LFD f22, 2 * SIZE(B)
  2309. LFD f23, 3 * SIZE(B)
  /* CTR = KK / 4. */
  2310. srawi. r0, KK, 2
  2311. mtspr CTR, r0
  2312. mr BO, B
  2313. #else
  2314. #ifdef LN
  /* LN: step AORIG back by K << (2 + BASE_SHIFT) bytes. */
  2315. slwi r0, K, 2 + BASE_SHIFT
  2316. sub AORIG, AORIG, r0
  2317. #endif
  /* AO offset scales KK by (2 + BASE_SHIFT), BO offset by (0 + BASE_SHIFT); the loop covers K - KK steps. */
  2318. slwi r0, KK, 2 + BASE_SHIFT
  2319. slwi TEMP, KK, 0 + BASE_SHIFT
  2320. add AO, AORIG, r0
  2321. add BO, B, TEMP
  2322. sub TEMP, K, KK
  2323. LFD f16, 0 * SIZE(AO)
  2324. LFD f17, 1 * SIZE(AO)
  2325. LFD f18, 2 * SIZE(AO)
  2326. LFD f19, 3 * SIZE(AO)
  2327. LFD f20, 0 * SIZE(BO)
  2328. LFD f21, 1 * SIZE(BO)
  2329. LFD f22, 2 * SIZE(BO)
  2330. LFD f23, 3 * SIZE(BO)
  /* CTR = (K - KK) / 4. */
  2331. srawi. r0, TEMP, 2
  2332. mtspr CTR, r0
  2333. #endif
  /* CR0 still reflects the srawi. result: skip the unrolled loop when the count is <= 0. */
  2334. ble .L75
  2335. .align 5
  /*
   * .L72: four steps per iteration (16 FMADDs); AO advances by
   * 16 * SIZE and BO by 4 * SIZE via the LFDU instructions.  Loads for
   * later steps are interleaved with the arithmetic -- do not reorder.
   */
  2336. .L72:
  2337. FMADD f0, f16, f20, f0
  2338. LFD f16, 4 * SIZE(AO)
  2339. FMADD f1, f17, f20, f1
  2340. LFD f17, 5 * SIZE(AO)
  2341. FMADD f2, f18, f20, f2
  2342. LFD f18, 6 * SIZE(AO)
  2343. FMADD f3, f19, f20, f3
  2344. LFD f19, 7 * SIZE(AO)
  2345. LFDU f20, 4 * SIZE(BO)
  2346. FMADD f0, f16, f21, f0
  2347. LFD f16, 8 * SIZE(AO)
  2348. FMADD f1, f17, f21, f1
  2349. LFD f17, 9 * SIZE(AO)
  2350. FMADD f2, f18, f21, f2
  2351. LFD f18, 10 * SIZE(AO)
  2352. FMADD f3, f19, f21, f3
  2353. LFD f19, 11 * SIZE(AO)
  2354. LFD f21, 1 * SIZE(BO)
  2355. FMADD f0, f16, f22, f0
  2356. LFD f16, 12 * SIZE(AO)
  2357. FMADD f1, f17, f22, f1
  2358. LFD f17, 13 * SIZE(AO)
  2359. FMADD f2, f18, f22, f2
  2360. LFD f18, 14 * SIZE(AO)
  2361. FMADD f3, f19, f22, f3
  2362. LFD f19, 15 * SIZE(AO)
  2363. LFD f22, 2 * SIZE(BO)
  2364. FMADD f0, f16, f23, f0
  2365. LFDU f16, 16 * SIZE(AO)
  2366. FMADD f1, f17, f23, f1
  2367. LFD f17, 1 * SIZE(AO)
  2368. FMADD f2, f18, f23, f2
  2369. LFD f18, 2 * SIZE(AO)
  2370. FMADD f3, f19, f23, f3
  2371. LFD f19, 3 * SIZE(AO)
  2372. LFD f23, 3 * SIZE(BO)
  2373. bdnz .L72
  2374. .align 4
  /* .L75: remainder loop count = (step count) & 3. */
  2375. .L75:
  2376. #if defined(LT) || defined(RN)
  2377. andi. r0, KK, 3
  2378. #else
  2379. andi. r0, TEMP, 3
  2380. #endif
  2381. mtspr CTR, r0
  2382. ble+ .L78
  2383. .align 4
  /* .L76: one step per iteration; AO advances by 4 * SIZE, BO by 1 * SIZE. */
  2384. .L76:
  2385. FMADD f0, f16, f20, f0
  2386. LFDU f16, 4 * SIZE(AO)
  2387. FMADD f1, f17, f20, f1
  2388. LFD f17, 1 * SIZE(AO)
  2389. FMADD f2, f18, f20, f2
  2390. LFD f18, 2 * SIZE(AO)
  2391. FMADD f3, f19, f20, f3
  2392. LFDU f20, 1 * SIZE(BO)
  2393. LFD f19, 3 * SIZE(AO)
  2394. bdnz .L76
  2395. .align 4
  /* .L78: products are complete in f0-f3; locate the block to solve against. */
  2396. .L78:
  2397. #if defined(LN) || defined(RT)
  /* Re-point AO/BO: LN backs up by the tile's 4 rows, RT by its 1 column. */
  2398. #ifdef LN
  2399. subi r0, KK, 4
  2400. #else
  2401. subi r0, KK, 1
  2402. #endif
  2403. slwi TEMP, r0, 2 + BASE_SHIFT
  2404. slwi r0, r0, 0 + BASE_SHIFT
  2405. add AO, AORIG, TEMP
  2406. add BO, B, r0
  2407. #endif
  /* Load the right-hand side (from BO for LN/LT, from AO for RN/RT) and subtract the accumulated products. */
  2408. #if defined(LN) || defined(LT)
  2409. LFD f16, 0 * SIZE(BO)
  2410. LFD f20, 1 * SIZE(BO)
  2411. LFD f24, 2 * SIZE(BO)
  2412. LFD f28, 3 * SIZE(BO)
  2413. FSUB f0, f16, f0
  2414. FSUB f1, f20, f1
  2415. FSUB f2, f24, f2
  2416. FSUB f3, f28, f3
  2417. #else
  2418. LFD f16, 0 * SIZE(AO)
  2419. LFD f17, 1 * SIZE(AO)
  2420. LFD f18, 2 * SIZE(AO)
  2421. LFD f19, 3 * SIZE(AO)
  2422. FSUB f0, f16, f0
  2423. FSUB f1, f17, f1
  2424. FSUB f2, f18, f2
  2425. FSUB f3, f19, f3
  2426. #endif
  /*
   * Solve step.  Diagonal entries are applied with FMUL rather than a
   * divide, so they are presumably stored pre-inverted by the packing
   * code -- TODO confirm.
   */
  2427. #ifdef LN
  /* Backward substitution over the 4x4 block at AO, starting from f3 (AO[15]) and ending at f0 (AO[0]). */
  2428. LFD f16, 15 * SIZE(AO)
  2429. LFD f17, 14 * SIZE(AO)
  2430. LFD f18, 13 * SIZE(AO)
  2431. LFD f19, 12 * SIZE(AO)
  2432. FMUL f3, f16, f3
  2433. FNMSUB f2, f17, f3, f2
  2434. FNMSUB f1, f18, f3, f1
  2435. FNMSUB f0, f19, f3, f0
  2436. LFD f16, 10 * SIZE(AO)
  2437. LFD f17, 9 * SIZE(AO)
  2438. LFD f18, 8 * SIZE(AO)
  2439. LFD f19, 5 * SIZE(AO)
  2440. LFD f20, 4 * SIZE(AO)
  2441. LFD f21, 0 * SIZE(AO)
  2442. FMUL f2, f16, f2
  2443. FNMSUB f1, f17, f2, f1
  2444. FNMSUB f0, f18, f2, f0
  2445. FMUL f1, f19, f1
  2446. FNMSUB f0, f20, f1, f0
  2447. FMUL f0, f21, f0
  2448. #endif
  2449. #ifdef LT
  /* Forward substitution over the 4x4 block at AO, starting from f0 (AO[0]) and ending at f3 (AO[15]). */
  2450. LFD f16, 0 * SIZE(AO)
  2451. LFD f17, 1 * SIZE(AO)
  2452. LFD f18, 2 * SIZE(AO)
  2453. LFD f19, 3 * SIZE(AO)
  2454. FMUL f0, f16, f0
  2455. FNMSUB f1, f17, f0, f1
  2456. FNMSUB f2, f18, f0, f2
  2457. FNMSUB f3, f19, f0, f3
  2458. LFD f17, 5 * SIZE(AO)
  2459. LFD f18, 6 * SIZE(AO)
  2460. LFD f19, 7 * SIZE(AO)
  2461. FMUL f1, f17, f1
  2462. FNMSUB f2, f18, f1, f2
  2463. FNMSUB f3, f19, f1, f3
  2464. LFD f18, 10 * SIZE(AO)
  2465. LFD f19, 11 * SIZE(AO)
  2466. FMUL f2, f18, f2
  2467. FNMSUB f3, f19, f2, f3
  2468. LFD f19, 15 * SIZE(AO)
  2469. FMUL f3, f19, f3
  2470. #endif
  /* Right-side variants: the B block is a single element, so all four results are scaled by BO[0]. */
  2471. #ifdef RN
  2472. LFD f16, 0 * SIZE(BO)
  2473. FMUL f0, f16, f0
  2474. FMUL f1, f16, f1
  2475. FMUL f2, f16, f2
  2476. FMUL f3, f16, f3
  2477. #endif
  2478. #ifdef RT
  2479. LFD f21, 0 * SIZE(BO)
  2480. FMUL f0, f21, f0
  2481. FMUL f1, f21, f1
  2482. FMUL f2, f21, f2
  2483. FMUL f3, f21, f3
  2484. #endif
  2485. #ifdef LN
  /* LN moves through C backwards: step the output pointer back before storing. */
  2486. subi CO1, CO1, 4 * SIZE
  2487. #endif
  /* Write the solved tile back into the packed buffer it was read from, then into C. */
  2488. #if defined(LN) || defined(LT)
  2489. STFD f0, 0 * SIZE(BO)
  2490. STFD f1, 1 * SIZE(BO)
  2491. STFD f2, 2 * SIZE(BO)
  2492. STFD f3, 3 * SIZE(BO)
  2493. #else
  2494. STFD f0, 0 * SIZE(AO)
  2495. STFD f1, 1 * SIZE(AO)
  2496. STFD f2, 2 * SIZE(AO)
  2497. STFD f3, 3 * SIZE(AO)
  2498. #endif
  2499. STFD f0, 0 * SIZE(CO1)
  2500. STFD f1, 1 * SIZE(CO1)
  2501. STFD f2, 2 * SIZE(CO1)
  2502. STFD f3, 3 * SIZE(CO1)
  /* Reload f0 from FZERO (presumably a 0.0 constant) and clear f1-f3 for the next tile. */
  2503. lfs f0, FZERO
  2504. fmr f1, f0
  2505. fmr f2, f0
  2506. fmr f3, f0
  2507. #ifndef LN
  2508. addi CO1, CO1, 4 * SIZE
  2509. #endif
  2510. #ifdef RT
  /* RT: advance AORIG by K << (2 + BASE_SHIFT) bytes. */
  2511. slwi r0, K, 2 + BASE_SHIFT
  2512. add AORIG, AORIG, r0
  2513. #endif
  2514. #if defined(LT) || defined(RN)
  /* Skip the K - KK steps that the product loop did not read, in both AO and BO. */
  2515. sub TEMP, K, KK
  2516. slwi r0, TEMP, 2 + BASE_SHIFT
  2517. slwi TEMP, TEMP, 0 + BASE_SHIFT
  2518. add AO, AO, r0
  2519. add BO, BO, TEMP
  2520. #endif
  /* The left-side variants adjust KK by the four rows just handled. */
  2521. #ifdef LN
  2522. subi KK, KK, 4
  2523. #endif
  2524. #ifdef LT
  2525. addi KK, KK, 4
  2526. #endif
  /* Next 4-row tile while I > 0 ('+' is a branch-prediction hint). */
  2527. addic. I, I, -1
  2528. bgt+ .L71
  2529. .align 4
  /*
   * .L80: 2x1 tile, executed only when bit 1 of M is set.
   *
   * Per step two values from AO are multiplied by one value from BO.
   * f0/f1 and f2/f3 are alternating accumulator pairs, summed at .L88.
   *
   * Upper-case mnemonics (LFD, LFDU, STFD, FMADD, FNMSUB, FMUL, FADD,
   * FSUB) and SIZE/BASE_SHIFT/FZERO are macros defined outside this
   * chunk; the comments assume they expand to the like-named PowerPC
   * floating-point instructions -- TODO confirm.
   */
  2530. .L80:
  2531. andi. I, M, 2
  /* Taken when bit 1 of M is clear (masked result is zero). */
  2532. ble .L90
  2533. #if defined(LT) || defined(RN)
  /* Preload 4 values from AO and 4 from B; the product loop runs over KK steps. */
  2534. LFD f16, 0 * SIZE(AO)
  2535. LFD f17, 1 * SIZE(AO)
  2536. LFD f18, 2 * SIZE(AO)
  2537. LFD f19, 3 * SIZE(AO)
  2538. LFD f20, 0 * SIZE(B)
  2539. LFD f21, 1 * SIZE(B)
  2540. LFD f22, 2 * SIZE(B)
  2541. LFD f23, 3 * SIZE(B)
  /* CTR = KK / 4. */
  2542. srawi. r0, KK, 2
  2543. mtspr CTR, r0
  2544. mr BO, B
  2545. #else
  2546. #ifdef LN
  /* LN: step AORIG back by K << (1 + BASE_SHIFT) bytes. */
  2547. slwi r0, K, 1 + BASE_SHIFT
  2548. sub AORIG, AORIG, r0
  2549. #endif
  /* AO offset scales KK by (1 + BASE_SHIFT), BO offset by (0 + BASE_SHIFT); the loop covers K - KK steps. */
  2550. slwi r0, KK, 1 + BASE_SHIFT
  2551. slwi TEMP, KK, 0 + BASE_SHIFT
  2552. add AO, AORIG, r0
  2553. add BO, B, TEMP
  2554. sub TEMP, K, KK
  2555. LFD f16, 0 * SIZE(AO)
  2556. LFD f17, 1 * SIZE(AO)
  2557. LFD f18, 2 * SIZE(AO)
  2558. LFD f19, 3 * SIZE(AO)
  2559. LFD f20, 0 * SIZE(BO)
  2560. LFD f21, 1 * SIZE(BO)
  2561. LFD f22, 2 * SIZE(BO)
  2562. LFD f23, 3 * SIZE(BO)
  /* CTR = (K - KK) / 4. */
  2563. srawi. r0, TEMP, 2
  2564. mtspr CTR, r0
  2565. #endif
  /* CR0 still reflects the srawi. result: skip the unrolled loop when the count is <= 0. */
  2566. ble .L85
  2567. .align 5
  /*
   * .L82: four steps per iteration (8 FMADDs); AO advances by 8 * SIZE
   * and BO by 4 * SIZE via the LFDU instructions.  Loads for later
   * steps are interleaved with the arithmetic -- do not reorder.
   */
  2568. .L82:
  2569. FMADD f0, f16, f20, f0
  2570. LFD f16, 4 * SIZE(AO)
  2571. FMADD f1, f17, f20, f1
  2572. LFDU f20, 4 * SIZE(BO)
  2573. LFD f17, 5 * SIZE(AO)
  2574. FMADD f2, f18, f21, f2
  2575. LFD f18, 6 * SIZE(AO)
  2576. FMADD f3, f19, f21, f3
  2577. LFD f21, 1 * SIZE(BO)
  2578. LFD f19, 7 * SIZE(AO)
  2579. FMADD f0, f16, f22, f0
  2580. LFDU f16, 8 * SIZE(AO)
  2581. FMADD f1, f17, f22, f1
  2582. LFD f22, 2 * SIZE(BO)
  2583. LFD f17, 1 * SIZE(AO)
  2584. FMADD f2, f18, f23, f2
  2585. LFD f18, 2 * SIZE(AO)
  2586. FMADD f3, f19, f23, f3
  2587. LFD f23, 3 * SIZE(BO)
  2588. LFD f19, 3 * SIZE(AO)
  2589. bdnz .L82
  2590. .align 4
  /* .L85: remainder loop count = (step count) & 3. */
  2591. .L85:
  2592. #if defined(LT) || defined(RN)
  2593. andi. r0, KK, 3
  2594. #else
  2595. andi. r0, TEMP, 3
  2596. #endif
  2597. mtspr CTR, r0
  2598. ble+ .L88
  2599. .align 4
  /* .L86: one step per iteration; AO advances by 2 * SIZE, BO by 1 * SIZE. */
  2600. .L86:
  2601. FMADD f0, f16, f20, f0
  2602. LFDU f16, 2 * SIZE(AO)
  2603. FMADD f1, f17, f20, f1
  2604. LFDU f20, 1 * SIZE(BO)
  2605. LFD f17, 1 * SIZE(AO)
  2606. bdnz .L86
  2607. .align 4
  /* .L88: fold the second accumulator pair (f2, f3) into f0, f1. */
  2608. .L88:
  2609. FADD f0, f2, f0
  2610. FADD f1, f3, f1
  2611. #if defined(LN) || defined(RT)
  /* Re-point AO/BO: LN backs up by the tile's 2 rows, RT by its 1 column. */
  2612. #ifdef LN
  2613. subi r0, KK, 2
  2614. #else
  2615. subi r0, KK, 1
  2616. #endif
  2617. slwi TEMP, r0, 1 + BASE_SHIFT
  2618. slwi r0, r0, 0 + BASE_SHIFT
  2619. add AO, AORIG, TEMP
  2620. add BO, B, r0
  2621. #endif
  /* Load the right-hand side (from BO for LN/LT, from AO for RN/RT) and subtract the accumulated products. */
  2622. #if defined(LN) || defined(LT)
  2623. LFD f16, 0 * SIZE(BO)
  2624. LFD f20, 1 * SIZE(BO)
  2625. FSUB f0, f16, f0
  2626. FSUB f1, f20, f1
  2627. #else
  2628. LFD f16, 0 * SIZE(AO)
  2629. LFD f17, 1 * SIZE(AO)
  2630. FSUB f0, f16, f0
  2631. FSUB f1, f17, f1
  2632. #endif
  /*
   * Solve step.  Diagonal entries are applied with FMUL rather than a
   * divide, so they are presumably stored pre-inverted by the packing
   * code -- TODO confirm.
   */
  2633. #ifdef LN
  /* Backward over the 2x2 AO block: AO[3], AO[2], AO[0]. */
  2634. LFD f19, 3 * SIZE(AO)
  2635. LFD f20, 2 * SIZE(AO)
  2636. LFD f21, 0 * SIZE(AO)
  2637. FMUL f1, f19, f1
  2638. FNMSUB f0, f20, f1, f0
  2639. FMUL f0, f21, f0
  2640. #endif
  2641. #ifdef LT
  /* Forward over the 2x2 AO block: AO[0], AO[1], AO[3]. */
  2642. LFD f16, 0 * SIZE(AO)
  2643. LFD f17, 1 * SIZE(AO)
  2644. FMUL f0, f16, f0
  2645. FNMSUB f1, f17, f0, f1
  2646. LFD f17, 3 * SIZE(AO)
  2647. FMUL f1, f17, f1
  2648. #endif
  /* Right-side variants: the B block is a single element, so both results are scaled by BO[0]. */
  2649. #ifdef RN
  2650. LFD f16, 0 * SIZE(BO)
  2651. FMUL f0, f16, f0
  2652. FMUL f1, f16, f1
  2653. #endif
  2654. #ifdef RT
  2655. LFD f21, 0 * SIZE(BO)
  2656. FMUL f0, f21, f0
  2657. FMUL f1, f21, f1
  2658. #endif
  2659. #ifdef LN
  /* LN moves through C backwards: step the output pointer back before storing. */
  2660. subi CO1, CO1, 2 * SIZE
  2661. #endif
  /* Write the solved tile back into the packed buffer it was read from, then into C. */
  2662. #if defined(LN) || defined(LT)
  2663. STFD f0, 0 * SIZE(BO)
  2664. STFD f1, 1 * SIZE(BO)
  2665. #else
  2666. STFD f0, 0 * SIZE(AO)
  2667. STFD f1, 1 * SIZE(AO)
  2668. #endif
  2669. STFD f0, 0 * SIZE(CO1)
  2670. STFD f1, 1 * SIZE(CO1)
  /* Reload f0 from FZERO (presumably a 0.0 constant) and clear f1-f3 for the next tile. */
  2671. lfs f0, FZERO
  2672. fmr f1, f0
  2673. fmr f2, f0
  2674. fmr f3, f0
  2675. #ifndef LN
  2676. addi CO1, CO1, 2 * SIZE
  2677. #endif
  2678. #ifdef RT
  /* RT: advance AORIG by K << (1 + BASE_SHIFT) bytes. */
  2679. slwi r0, K, 1 + BASE_SHIFT
  2680. add AORIG, AORIG, r0
  2681. #endif
  2682. #if defined(LT) || defined(RN)
  /* Skip the K - KK steps that the product loop did not read, in both AO and BO. */
  2683. sub TEMP, K, KK
  2684. slwi r0, TEMP, 1 + BASE_SHIFT
  2685. slwi TEMP, TEMP, 0 + BASE_SHIFT
  2686. add AO, AO, r0
  2687. add BO, BO, TEMP
  2688. #endif
  /* The left-side variants adjust KK by the two rows just handled. */
  2689. #ifdef LN
  2690. subi KK, KK, 2
  2691. #endif
  2692. #ifdef LT
  2693. addi KK, KK, 2
  2694. #endif
  2695. .align 4
  /*
   * .L90: 1x1 tile, executed only when bit 0 of M is set.
   *
   * A plain dot product of the AO and BO streams.  The main loop is
   * unrolled by eight steps (shift by 3, remainder mask 7) and spreads
   * the sum over f0-f3, which are combined at .L98.
   *
   * Upper-case mnemonics (LFD, LFDU, STFD, FMADD, FMUL, FADD, FSUB) and
   * SIZE/BASE_SHIFT are macros defined outside this chunk; the comments
   * assume they expand to the like-named PowerPC floating-point
   * instructions -- TODO confirm.
   */
  2696. .L90:
  2697. andi. I, M, 1
  /* Taken when bit 0 of M is clear: nothing left, go to the epilogue. */
  2698. ble .L999
  2699. #if defined(LT) || defined(RN)
  /* Preload 4 values from AO and 4 from B; the product loop runs over KK steps. */
  2700. LFD f16, 0 * SIZE(AO)
  2701. LFD f17, 1 * SIZE(AO)
  2702. LFD f18, 2 * SIZE(AO)
  2703. LFD f19, 3 * SIZE(AO)
  2704. LFD f20, 0 * SIZE(B)
  2705. LFD f21, 1 * SIZE(B)
  2706. LFD f22, 2 * SIZE(B)
  2707. LFD f23, 3 * SIZE(B)
  /* CTR = KK / 8. */
  2708. srawi. r0, KK, 3
  2709. mtspr CTR, r0
  2710. mr BO, B
  2711. #else
  2712. #ifdef LN
  /* LN: step AORIG back by K << BASE_SHIFT bytes. */
  2713. slwi r0, K, BASE_SHIFT
  2714. sub AORIG, AORIG, r0
  2715. #endif
  /* Both offsets scale KK by (0 + BASE_SHIFT); the loop covers K - KK steps. */
  2716. slwi r0, KK, 0 + BASE_SHIFT
  2717. slwi TEMP, KK, 0 + BASE_SHIFT
  2718. add AO, AORIG, r0
  2719. add BO, B, TEMP
  2720. sub TEMP, K, KK
  2721. LFD f16, 0 * SIZE(AO)
  2722. LFD f17, 1 * SIZE(AO)
  2723. LFD f18, 2 * SIZE(AO)
  2724. LFD f19, 3 * SIZE(AO)
  2725. LFD f20, 0 * SIZE(BO)
  2726. LFD f21, 1 * SIZE(BO)
  2727. LFD f22, 2 * SIZE(BO)
  2728. LFD f23, 3 * SIZE(BO)
  /* CTR = (K - KK) / 8. */
  2729. srawi. r0, TEMP, 3
  2730. mtspr CTR, r0
  2731. #endif
  /* CR0 still reflects the srawi. result: skip the unrolled loop when the count is <= 0. */
  2732. ble .L95
  2733. .align 5
  /*
   * .L92: eight steps per iteration (8 FMADDs); AO and BO each advance
   * by 8 * SIZE via the LFDU instructions in the second half.
   */
  2734. .L92:
  2735. FMADD f0, f16, f20, f0
  2736. LFD f16, 4 * SIZE(AO)
  2737. LFD f20, 4 * SIZE(BO)
  2738. FMADD f1, f17, f21, f1
  2739. LFD f17, 5 * SIZE(AO)
  2740. LFD f21, 5 * SIZE(BO)
  2741. FMADD f2, f18, f22, f2
  2742. LFD f18, 6 * SIZE(AO)
  2743. LFD f22, 6 * SIZE(BO)
  2744. FMADD f3, f19, f23, f3
  2745. LFD f19, 7 * SIZE(AO)
  2746. LFD f23, 7 * SIZE(BO)
  2747. FMADD f0, f16, f20, f0
  2748. LFDU f16, 8 * SIZE(AO)
  2749. LFDU f20, 8 * SIZE(BO)
  2750. FMADD f1, f17, f21, f1
  2751. LFD f17, 1 * SIZE(AO)
  2752. LFD f21, 1 * SIZE(BO)
  2753. FMADD f2, f18, f22, f2
  2754. LFD f18, 2 * SIZE(AO)
  2755. LFD f22, 2 * SIZE(BO)
  2756. FMADD f3, f19, f23, f3
  2757. LFD f19, 3 * SIZE(AO)
  2758. LFD f23, 3 * SIZE(BO)
  2759. bdnz .L92
  2760. .align 4
  /* .L95: remainder loop count = (step count) & 7. */
  2761. .L95:
  2762. #if defined(LT) || defined(RN)
  2763. andi. r0, KK, 7
  2764. #else
  2765. andi. r0, TEMP, 7
  2766. #endif
  2767. mtspr CTR, r0
  2768. ble+ .L98
  2769. .align 4
  /* .L96: one step per iteration; AO and BO each advance by 1 * SIZE. */
  2770. .L96:
  2771. FMADD f0, f16, f20, f0
  2772. LFDU f16, 1 * SIZE(AO)
  2773. LFDU f20, 1 * SIZE(BO)
  2774. bdnz .L96
  2775. .align 4
  /* .L98: reduce the four partial sums into f0. */
  2776. .L98:
  2777. FADD f0, f1, f0
  2778. FADD f2, f3, f2
  2779. FADD f0, f2, f0
  2780. #if defined(LN) || defined(RT)
  /* Re-point AO/BO at offset (KK - 1); both branches use 1 because the tile is 1x1. */
  2781. #ifdef LN
  2782. subi r0, KK, 1
  2783. #else
  2784. subi r0, KK, 1
  2785. #endif
  2786. slwi TEMP, r0, 0 + BASE_SHIFT
  2787. slwi r0, r0, 0 + BASE_SHIFT
  2788. add AO, AORIG, TEMP
  2789. add BO, B, r0
  2790. #endif
  /* Load the right-hand side (from BO for LN/LT, from AO for RN/RT) and subtract the dot product. */
  2791. #if defined(LN) || defined(LT)
  2792. LFD f16, 0 * SIZE(BO)
  2793. FSUB f0, f16, f0
  2794. #else
  2795. LFD f16, 0 * SIZE(AO)
  2796. FSUB f0, f16, f0
  2797. #endif
  /*
   * Solve step: a single multiply by the diagonal element (AO[0] for
   * the left-side variants, BO[0] for the right-side ones).  Using FMUL
   * suggests the diagonal is stored pre-inverted -- TODO confirm.
   */
  2798. #ifdef LN
  2799. LFD f21, 0 * SIZE(AO)
  2800. FMUL f0, f21, f0
  2801. #endif
  2802. #ifdef LT
  2803. LFD f16, 0 * SIZE(AO)
  2804. FMUL f0, f16, f0
  2805. #endif
  2806. #ifdef RN
  2807. LFD f16, 0 * SIZE(BO)
  2808. FMUL f0, f16, f0
  2809. #endif
  2810. #ifdef RT
  2811. LFD f21, 0 * SIZE(BO)
  2812. FMUL f0, f21, f0
  2813. #endif
  2814. #ifdef LN
  /* LN moves through C backwards: step the output pointer back before storing. */
  2815. subi CO1, CO1, 1 * SIZE
  2816. #endif
  /* Write the result back into the packed buffer it was read from, then into C. */
  2817. #if defined(LN) || defined(LT)
  2818. STFD f0, 0 * SIZE(BO)
  2819. #else
  2820. STFD f0, 0 * SIZE(AO)
  2821. #endif
  2822. STFD f0, 0 * SIZE(CO1)
  /* The accumulators are not cleared here: execution falls through to the epilogue at .L999. */
  2823. #ifndef LN
  2824. addi CO1, CO1, 1 * SIZE
  2825. #endif
  2826. #ifdef RT
  /* RT: advance AORIG by K << BASE_SHIFT bytes. */
  2827. slwi r0, K, 0 + BASE_SHIFT
  2828. add AORIG, AORIG, r0
  2829. #endif
  2830. #if defined(LT) || defined(RN)
  /* Skip the K - KK steps that the product loop did not read, in both AO and BO. */
  2831. sub TEMP, K, KK
  2832. slwi r0, TEMP, 0 + BASE_SHIFT
  2833. slwi TEMP, TEMP, 0 + BASE_SHIFT
  2834. add AO, AO, r0
  2835. add BO, BO, TEMP
  2836. #endif
  /* The left-side variants adjust KK by the one row just handled. */
  2837. #ifdef LN
  2838. subi KK, KK, 1
  2839. #endif
  2840. #ifdef LT
  2841. addi KK, KK, 1
  2842. #endif
  2843. .align 4
  /*
   * .L999: common exit.  Sets the return value to 0, reloads f14-f31
   * and r18-r31 from the stack frame, releases the frame and returns.
   * The offsets must mirror the save sequence in the prologue, which is
   * outside this chunk -- keep the two in sync.
   */
  2844. .L999:
  /* addi with rA = 0 uses the literal 0, so this is r3 = 0. */
  2845. addi r3, 0, 0
  /* Floating-point registers f14-f31: 8 bytes each at SP + 0 .. SP + 136. */
  2846. lfd f14, 0(SP)
  2847. lfd f15, 8(SP)
  2848. lfd f16, 16(SP)
  2849. lfd f17, 24(SP)
  2850. lfd f18, 32(SP)
  2851. lfd f19, 40(SP)
  2852. lfd f20, 48(SP)
  2853. lfd f21, 56(SP)
  2854. lfd f22, 64(SP)
  2855. lfd f23, 72(SP)
  2856. lfd f24, 80(SP)
  2857. lfd f25, 88(SP)
  2858. lfd f26, 96(SP)
  2859. lfd f27, 104(SP)
  2860. lfd f28, 112(SP)
  2861. lfd f29, 120(SP)
  2862. lfd f30, 128(SP)
  2863. lfd f31, 136(SP)
  2864. #ifdef __64BIT__
  /* 64-bit build: r31 down to r18 as doublewords, starting at SP + 144. */
  2865. ld r31, 144(SP)
  2866. ld r30, 152(SP)
  2867. ld r29, 160(SP)
  2868. ld r28, 168(SP)
  2869. ld r27, 176(SP)
  2870. ld r26, 184(SP)
  2871. ld r25, 192(SP)
  2872. ld r24, 200(SP)
  2873. ld r23, 208(SP)
  2874. ld r22, 216(SP)
  2875. ld r21, 224(SP)
  2876. ld r20, 232(SP)
  2877. ld r19, 240(SP)
  2878. ld r18, 248(SP)
  2879. #else
  /* 32-bit build: the same registers as words, starting at SP + 144. */
  2880. lwz r31, 144(SP)
  2881. lwz r30, 148(SP)
  2882. lwz r29, 152(SP)
  2883. lwz r28, 156(SP)
  2884. lwz r27, 160(SP)
  2885. lwz r26, 164(SP)
  2886. lwz r25, 168(SP)
  2887. lwz r24, 172(SP)
  2888. lwz r23, 176(SP)
  2889. lwz r22, 180(SP)
  2890. lwz r21, 184(SP)
  2891. lwz r20, 188(SP)
  2892. lwz r19, 192(SP)
  2893. lwz r18, 196(SP)
  2894. #endif
  /* Release the STACKSIZE-byte frame and return to the caller. */
  2895. addi SP, SP, STACKSIZE
  2896. blr
  2897. EPILOGUE
  2898. #endif