You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

ztrsm_kernel_power6_RT.S 84 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
742784279428042814282428342844285428642874288428942904291429242934294429542964297429842994300430143024303430443054306430743084309431043114312431343144315431643174318431943204321432243234324432543264327432843294330433143324333433443354336433743384339434043414342434343444345434643474348434943504351435243534354435543564357435843594360436143624363436443654366436743684369437043714372437343744375437643774378437943804381438243834384438543864387438843894390439143924393439443954396439743984399440044014402440344044405440644074408440944104411441244134414441544164417441844194420442144224423442444254426442744284429443044314432443344344435443644374438443944404441444244434444444544464447444844494450445144524453445444554456445744584459446044614462446344644465446644674468446944704471447244734474447544764477447844794480448144824483448444854486448744884489449044914492449344944495449644974498449945004501450245034504450545064507450845094510451145124513451445154516451745184519452045214522452345244525452645274528452945304531453245334534453545364537453845394540454145424543454445454546454745484549455045514552455345544555455645574558455945604561456245634564456545664567456845694570457145724573457445754576457745784579458045814582458345844585458645874588458945904591459245934594459545964597459845994600460146024603460446054606460746084609461046114612461346144615461646174618461946204621462246234624462546264627462846294630463146324633463446354636463746384639464046414642464346444645464646474648464946504651465246534654465546564657465846594660466146624663466446654666466746684669467046714672467346744675467646774678467946804681468246834684468546864687468846894690469146924693469446954696
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Word-size dependent helpers.                                        */
  /* LOAD is the load mnemonic matching the register width: lwz for      */
  /* 32-bit builds, ld for 64-bit builds. It is not referenced in the    */
  /* visible part of this file.                                          */
  40. #ifndef __64BIT__
  41. #define LOAD lwz
  42. #else
  43. #define LOAD ld
  44. #endif
  /* Stack frame constants.                                              */
  /* STACKSIZE is the number of bytes the prologue subtracts from SP.    */
  /* The prologue stores f14-f31 at offsets 0..136 and r19-r31 from      */
  /* offset 144 upward inside that area.                                 */
  /* FZERO names a slot in the same frame: the prologue stores a zero    */
  /* word there (stw r0, FZERO) and the kernel reloads it with lfs to    */
  /* clear the floating-point accumulators.                              */
  45. #ifdef __64BIT__
  46. #define STACKSIZE 320
  47. #define FZERO 312(SP)
  48. #else
  49. #define STACKSIZE 256
  50. #define FZERO 240(SP)
  51. #endif
  /* Incoming argument registers.                                        */
  /* M, N and K are always r3, r4 and r5. The registers used for A, B,   */
  /* C, LDC and OFFSET depend on the platform and word size selected     */
  /* below.                                                              */
  /* NOTE(review): the shifted positions are presumably caused by        */
  /* floating-point arguments that precede A in the C prototype -        */
  /* confirm against the caller and the platform ABI.                    */
  52. #define M r3
  53. #define N r4
  54. #define K r5
  /* linux: 32-bit builds take A, B, C, LDC and OFFSET in r6-r10.        */
  /* 64-bit builds take A, B, C in r8-r10; LDC and OFFSET are only       */
  /* aliases for r6/r7 here and are loaded from the caller's frame       */
  /* (FRAMESLOT(0) and FRAMESLOT(1)) by the prologue.                    */
  55. #ifdef linux
  56. #ifndef __64BIT__
  57. #define A r6
  58. #define B r7
  59. #define C r8
  60. #define LDC r9
  61. #define OFFSET r10
  62. #else
  63. #define A r8
  64. #define B r9
  65. #define C r10
  66. #define LDC r6
  67. #define OFFSET r7
  68. #endif
  69. #endif
  /* AIX / Apple: for 32-bit DOUBLE builds only A arrives in a register  */
  /* (r10); the prologue loads B, C, LDC and OFFSET from FRAMESLOT(0)    */
  /* through FRAMESLOT(3). All other AIX / Apple builds take A, B, C in  */
  /* r8-r10 and load LDC and OFFSET from FRAMESLOT(0) and FRAMESLOT(1).  */
  /* NOTE(review): if none of linux, _AIX or __APPLE__ is defined, this  */
  /* block leaves A, B, C, LDC and OFFSET undefined.                     */
  70. #if defined(_AIX) || defined(__APPLE__)
  71. #if !defined(__64BIT__) && defined(DOUBLE)
  72. #define A r10
  73. #define B r6
  74. #define C r7
  75. #define LDC r8
  76. #define OFFSET r9
  77. #else
  78. #define A r8
  79. #define B r9
  80. #define C r10
  81. #define LDC r6
  82. #define OFFSET r7
  83. #endif
  84. #endif
  /* Working registers r19-r31. The prologue saves all of them on the    */
  /* stack before the kernel uses them.                                  */
  /*   AORIG - copy of A taken with "mr AORIG, A" in LN / RT builds;     */
  /*           AO is recomputed from it                                  */
  /*   TEMP  - scratch for shifted offsets and for K - KK                */
  /*   KK    - running position counter initialised from OFFSET          */
  /*   I     - loop counter over M (M >> 1 blocks, then M & 1)           */
  /*   J     - receives N & 1 in the visible part of the file            */
  /*   AO/BO - current pointers into the A and B data                    */
  /*   CO1   - current output pointer into C                             */
  /*   CO2-CO4 - not referenced in the visible part of this file         */
  /*   PREA  - dcbt prefetch offset for AO / BO (48 * SIZE)              */
  /*   PREC  - dcbt prefetch offset for CO1 (4 * SIZE)                   */
  85. #define AORIG r19
  86. #define TEMP r20
  87. #define KK r21
  88. #define I r22
  89. #define J r23
  90. #define AO r24
  91. #define BO r25
  92. #define CO1 r26
  93. #define CO2 r27
  94. #define CO3 r28
  95. #define CO4 r29
  96. #define PREA r30
  97. #define PREC r31
  /* FMA1..FMA4 each expand to FMADD or FNMSUB depending on the build:   */
  /*   no CONJ              : FMA3 is FNMSUB, the others are FMADD       */
  /*   CONJ with LN or LT   : FMA4 is FNMSUB, the others are FMADD       */
  /*   CONJ otherwise       : FMA2 is FNMSUB, the others are FMADD       */
  /* NOTE(review): presumably these pick the sign of each partial        */
  /* product of the complex multiply-accumulate; none of them is used    */
  /* in the visible part of this file, so confirm against the uses       */
  /* further down.                                                       */
  98. #ifndef CONJ
  99. #define FMA1 FMADD
  100. #define FMA2 FMADD
  101. #define FMA3 FNMSUB
  102. #define FMA4 FMADD
  103. #elif defined(LN) || defined(LT)
  104. #define FMA1 FMADD
  105. #define FMA2 FMADD
  106. #define FMA3 FMADD
  107. #define FMA4 FNMSUB
  108. #else
  109. #define FMA1 FMADD
  110. #define FMA2 FNMSUB
  111. #define FMA3 FMADD
  112. #define FMA4 FMADD
  113. #endif
  114. #ifndef NEEDPARAM
  115. PROLOGUE
  116. PROFCODE
  117. addi SP, SP, -STACKSIZE
  118. li r0, 0
  119. stfd f14, 0(SP)
  120. stfd f15, 8(SP)
  121. stfd f16, 16(SP)
  122. stfd f17, 24(SP)
  123. stfd f18, 32(SP)
  124. stfd f19, 40(SP)
  125. stfd f20, 48(SP)
  126. stfd f21, 56(SP)
  127. stfd f22, 64(SP)
  128. stfd f23, 72(SP)
  129. stfd f24, 80(SP)
  130. stfd f25, 88(SP)
  131. stfd f26, 96(SP)
  132. stfd f27, 104(SP)
  133. stfd f28, 112(SP)
  134. stfd f29, 120(SP)
  135. stfd f30, 128(SP)
  136. stfd f31, 136(SP)
  137. #ifdef __64BIT__
  138. std r31, 144(SP)
  139. std r30, 152(SP)
  140. std r29, 160(SP)
  141. std r28, 168(SP)
  142. std r27, 176(SP)
  143. std r26, 184(SP)
  144. std r25, 192(SP)
  145. std r24, 200(SP)
  146. std r23, 208(SP)
  147. std r22, 216(SP)
  148. std r21, 224(SP)
  149. std r20, 232(SP)
  150. std r19, 240(SP)
  151. #else
  152. stw r31, 144(SP)
  153. stw r30, 148(SP)
  154. stw r29, 152(SP)
  155. stw r28, 156(SP)
  156. stw r27, 160(SP)
  157. stw r26, 164(SP)
  158. stw r25, 168(SP)
  159. stw r24, 172(SP)
  160. stw r23, 176(SP)
  161. stw r22, 180(SP)
  162. stw r21, 184(SP)
  163. stw r20, 188(SP)
  164. stw r19, 192(SP)
  165. #endif
  166. stw r0, FZERO
  167. #ifdef linux
  168. #ifdef __64BIT__
  169. ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
  170. #endif
  171. #endif
  172. #if defined(_AIX) || defined(__APPLE__)
  173. #ifdef __64BIT__
  174. ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
  175. #else
  176. #ifdef DOUBLE
  177. lwz B, FRAMESLOT(0) + STACKSIZE(SP)
  178. lwz C, FRAMESLOT(1) + STACKSIZE(SP)
  179. lwz LDC, FRAMESLOT(2) + STACKSIZE(SP)
  180. #else
  181. lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
  182. #endif
  183. #endif
  184. #endif
  185. #if defined(linux) && defined(__64BIT__)
  186. ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
  187. #endif
  188. #if defined(_AIX) || defined(__APPLE__)
  189. #ifdef __64BIT__
  190. ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
  191. #else
  192. #ifdef DOUBLE
  193. lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP)
  194. #else
  195. lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
  196. #endif
  197. #endif
  198. #endif
  199. slwi LDC, LDC, ZBASE_SHIFT
  200. #ifdef LN
  201. mullw r0, M, K
  202. slwi r0, r0, ZBASE_SHIFT
  203. add A, A, r0
  204. slwi r0, M, ZBASE_SHIFT
  205. add C, C, r0
  206. #endif
  207. #ifdef RN
  208. neg KK, OFFSET
  209. #endif
  210. #ifdef RT
  211. mullw r0, N, K
  212. slwi r0, r0, ZBASE_SHIFT
  213. add B, B, r0
  214. mullw r0, N, LDC
  215. add C, C, r0
  216. sub KK, N, OFFSET
  217. #endif
  218. cmpwi cr0, M, 0
  219. ble LL(999)
  220. cmpwi cr0, N, 0
  221. ble LL(999)
  222. cmpwi cr0, K, 0
  223. ble LL(999)
  224. li PREA, 48 * SIZE
  225. li PREC, 4 * SIZE
  226. andi. J, N, 1
  227. ble LL(30)
  228. #ifdef RT
  229. slwi r0, K, 0 + ZBASE_SHIFT
  230. sub B, B, r0
  231. sub C, C, LDC
  232. #endif
  233. mr CO1, C
  234. #ifdef LN
  235. add KK, M, OFFSET
  236. #endif
  237. #ifdef LT
  238. mr KK, OFFSET
  239. #endif
  240. srawi. I, M, 1
  241. #if defined(LN) || defined(RT)
  242. mr AORIG, A
  243. #else
  244. mr AO, A
  245. #endif
  246. #ifndef RT
  247. add C, C, LDC
  248. #endif
  249. ble LL(60)
  250. .align 4
  251. LL(51):
  252. #if defined(LT) || defined(RN)
  253. LFD f20, 0 * SIZE(AO)
  254. LFD f21, 1 * SIZE(AO)
  255. LFD f22, 2 * SIZE(AO)
  256. LFD f23, 3 * SIZE(AO)
  257. LFD f24, 4 * SIZE(AO)
  258. LFD f25, 5 * SIZE(AO)
  259. LFD f26, 6 * SIZE(AO)
  260. LFD f27, 7 * SIZE(AO)
  261. LFD f16, 0 * SIZE(B)
  262. LFD f17, 1 * SIZE(B)
  263. LFD f18, 2 * SIZE(B)
  264. LFD f19, 3 * SIZE(B)
  265. lfs f0, FZERO
  266. fmr f1, f0
  267. fmr f2, f0
  268. fmr f3, f0
  269. fmr f4, f0
  270. fmr f5, f0
  271. fmr f6, f0
  272. fmr f7, f0
  273. dcbt CO1, PREC
  274. srawi. r0, KK, 2
  275. mr BO, B
  276. mtspr CTR, r0
  277. #else
  278. #ifdef LN
  279. slwi r0, K, 1 + ZBASE_SHIFT
  280. sub AORIG, AORIG, r0
  281. #endif
  282. slwi r0, KK, 1 + ZBASE_SHIFT
  283. slwi TEMP, KK, 0 + ZBASE_SHIFT
  284. add AO, AORIG, r0
  285. add BO, B, TEMP
  286. sub TEMP, K, KK
  287. LFD f20, 0 * SIZE(AO)
  288. LFD f21, 1 * SIZE(AO)
  289. LFD f22, 2 * SIZE(AO)
  290. LFD f23, 3 * SIZE(AO)
  291. LFD f24, 4 * SIZE(AO)
  292. LFD f25, 5 * SIZE(AO)
  293. LFD f26, 6 * SIZE(AO)
  294. LFD f27, 7 * SIZE(AO)
  295. LFD f16, 0 * SIZE(BO)
  296. LFD f17, 1 * SIZE(BO)
  297. LFD f18, 2 * SIZE(BO)
  298. LFD f19, 3 * SIZE(BO)
  299. lfs f0, FZERO
  300. fmr f1, f0
  301. fmr f2, f0
  302. fmr f3, f0
  303. fmr f4, f0
  304. fmr f5, f0
  305. fmr f6, f0
  306. fmr f7, f0
  307. srawi. r0, TEMP, 2
  308. mtspr CTR, r0
  309. #endif
  310. ble LL(55)
  311. .align 4
  312. LL(52):
  313. FMADD f0, f16, f20, f0
  314. FMADD f1, f16, f21, f1
  315. FMADD f2, f16, f22, f2
  316. FMADD f3, f16, f23, f3
  317. FMADD f4, f17, f20, f4
  318. FMADD f5, f17, f21, f5
  319. FMADD f6, f17, f22, f6
  320. FMADD f7, f17, f23, f7
  321. LFD f20, 8 * SIZE(AO)
  322. LFD f21, 9 * SIZE(AO)
  323. LFD f22, 10 * SIZE(AO)
  324. LFD f23, 11 * SIZE(AO)
  325. FMADD f0, f18, f24, f0
  326. FMADD f1, f18, f25, f1
  327. FMADD f2, f18, f26, f2
  328. FMADD f3, f18, f27, f3
  329. FMADD f4, f19, f24, f4
  330. FMADD f5, f19, f25, f5
  331. FMADD f6, f19, f26, f6
  332. FMADD f7, f19, f27, f7
  333. LFD f24, 12 * SIZE(AO)
  334. LFD f25, 13 * SIZE(AO)
  335. LFD f26, 14 * SIZE(AO)
  336. LFD f27, 15 * SIZE(AO)
  337. LFD f16, 4 * SIZE(BO)
  338. LFD f17, 5 * SIZE(BO)
  339. LFD f18, 6 * SIZE(BO)
  340. LFD f19, 7 * SIZE(BO)
  341. FMADD f0, f16, f20, f0
  342. FMADD f1, f16, f21, f1
  343. FMADD f2, f16, f22, f2
  344. FMADD f3, f16, f23, f3
  345. FMADD f4, f17, f20, f4
  346. FMADD f5, f17, f21, f5
  347. FMADD f6, f17, f22, f6
  348. FMADD f7, f17, f23, f7
  349. LFD f20, 16 * SIZE(AO)
  350. LFD f21, 17 * SIZE(AO)
  351. LFD f22, 18 * SIZE(AO)
  352. LFD f23, 19 * SIZE(AO)
  353. FMADD f0, f18, f24, f0
  354. FMADD f1, f18, f25, f1
  355. FMADD f2, f18, f26, f2
  356. FMADD f3, f18, f27, f3
  357. FMADD f4, f19, f24, f4
  358. FMADD f5, f19, f25, f5
  359. FMADD f6, f19, f26, f6
  360. FMADD f7, f19, f27, f7
  361. LFD f24, 20 * SIZE(AO)
  362. LFD f25, 21 * SIZE(AO)
  363. LFD f26, 22 * SIZE(AO)
  364. LFD f27, 23 * SIZE(AO)
  365. LFD f16, 8 * SIZE(BO)
  366. LFD f17, 9 * SIZE(BO)
  367. LFD f18, 10 * SIZE(BO)
  368. LFD f19, 11 * SIZE(BO)
  369. addi AO, AO, 16 * SIZE
  370. addi BO, BO, 8 * SIZE
  371. dcbt PREA, AO
  372. dcbt PREA, BO
  373. bdnz LL(52)
  374. .align 4
  375. LL(55):
  376. #if defined(LT) || defined(RN)
  377. andi. r0, KK, 3
  378. #else
  379. andi. r0, TEMP, 3
  380. #endif
  381. mtspr CTR, r0
  382. ble LL(57)
  383. .align 4
  384. LL(56):
  385. FMADD f0, f16, f20, f0
  386. FMADD f1, f16, f21, f1
  387. FMADD f2, f16, f22, f2
  388. FMADD f3, f16, f23, f3
  389. FMADD f4, f17, f20, f4
  390. FMADD f5, f17, f21, f5
  391. FMADD f6, f17, f22, f6
  392. FMADD f7, f17, f23, f7
  393. LFD f20, 4 * SIZE(AO)
  394. LFD f21, 5 * SIZE(AO)
  395. LFD f22, 6 * SIZE(AO)
  396. LFD f23, 7 * SIZE(AO)
  397. LFD f16, 2 * SIZE(BO)
  398. LFD f17, 3 * SIZE(BO)
  399. addi BO, BO, 2 * SIZE
  400. addi AO, AO, 4 * SIZE
  401. bdnz LL(56)
  402. .align 4
  403. LL(57):
  404. #ifndef CONJ
  405. FSUB f0, f0, f5
  406. FADD f1, f1, f4
  407. FSUB f2, f2, f7
  408. FADD f3, f3, f6
  409. #else
  410. FADD f0, f0, f5
  411. FSUB f1, f4, f1
  412. FADD f2, f2, f7
  413. FSUB f3, f6, f3
  414. #endif
  415. #if defined(LN) || defined(RT)
  416. #ifdef LN
  417. subi r0, KK, 2
  418. #else
  419. subi r0, KK, 1
  420. #endif
  421. slwi TEMP, r0, 1 + ZBASE_SHIFT
  422. slwi r0, r0, 0 + ZBASE_SHIFT
  423. add AO, AORIG, TEMP
  424. add BO, B, r0
  425. #endif
  426. #if defined(LN) || defined(LT)
  427. LFD f16, 0 * SIZE(BO)
  428. LFD f17, 1 * SIZE(BO)
  429. LFD f18, 2 * SIZE(BO)
  430. LFD f19, 3 * SIZE(BO)
  431. FSUB f0, f16, f0
  432. FSUB f1, f17, f1
  433. FSUB f2, f18, f2
  434. FSUB f3, f19, f3
  435. #else
  436. LFD f16, 0 * SIZE(AO)
  437. LFD f17, 1 * SIZE(AO)
  438. LFD f18, 2 * SIZE(AO)
  439. LFD f19, 3 * SIZE(AO)
  440. #ifndef CONJ
  441. FSUB f0, f16, f0
  442. FSUB f1, f17, f1
  443. FSUB f2, f18, f2
  444. FSUB f3, f19, f3
  445. #else
  446. FSUB f0, f16, f0
  447. FADD f1, f17, f1
  448. FSUB f2, f18, f2
  449. FADD f3, f19, f3
  450. #endif
  451. #endif
  452. #ifdef LN
  453. LFD f16, 6 * SIZE(AO)
  454. LFD f17, 7 * SIZE(AO)
  455. LFD f18, 4 * SIZE(AO)
  456. LFD f19, 5 * SIZE(AO)
  457. LFD f20, 0 * SIZE(AO)
  458. LFD f21, 1 * SIZE(AO)
  459. FMUL f6, f17, f3
  460. FMUL f7, f17, f2
  461. #ifndef CONJ
  462. FMSUB f2, f16, f2, f6
  463. FMADD f3, f16, f3, f7
  464. FMADD f0, f19, f3, f0
  465. FNMSUB f1, f19, f2, f1
  466. FNMSUB f0, f18, f2, f0
  467. FNMSUB f1, f18, f3, f1
  468. FMUL f4, f21, f1
  469. FMUL f5, f21, f0
  470. FMSUB f0, f20, f0, f4
  471. FMADD f1, f20, f1, f5
  472. #else
  473. FMADD f2, f16, f2, f6
  474. FMSUB f3, f16, f3, f7
  475. FMSUB f0, f19, f3, f0
  476. FNMADD f1, f19, f2, f1
  477. FNMADD f0, f18, f2, f0
  478. FNMADD f1, f18, f3, f1
  479. FMUL f4, f21, f1
  480. FMUL f5, f21, f0
  481. FMADD f0, f20, f0, f4
  482. FMSUB f1, f20, f1, f5
  483. #endif
  484. #endif
  485. #ifdef LT
  486. LFD f16, 0 * SIZE(AO)
  487. LFD f17, 1 * SIZE(AO)
  488. LFD f18, 2 * SIZE(AO)
  489. LFD f19, 3 * SIZE(AO)
  490. LFD f20, 6 * SIZE(AO)
  491. LFD f21, 7 * SIZE(AO)
  492. FMUL f4, f17, f1
  493. FMUL f5, f17, f0
  494. #ifndef CONJ
  495. FMSUB f0, f16, f0, f4
  496. FMADD f1, f16, f1, f5
  497. FMADD f2, f19, f1, f2
  498. FNMSUB f3, f19, f0, f3
  499. FNMSUB f2, f18, f0, f2
  500. FNMSUB f3, f18, f1, f3
  501. FMUL f4, f21, f3
  502. FMUL f5, f21, f2
  503. FMSUB f2, f20, f2, f4
  504. FMADD f3, f20, f3, f5
  505. #else
  506. FMADD f0, f16, f0, f4
  507. FMSUB f1, f16, f1, f5
  508. FMSUB f2, f19, f1, f2
  509. FNMADD f3, f19, f0, f3
  510. FNMADD f2, f18, f0, f2
  511. FNMADD f3, f18, f1, f3
  512. FMUL f4, f21, f3
  513. FMUL f5, f21, f2
  514. FMADD f2, f20, f2, f4
  515. FMSUB f3, f20, f3, f5
  516. #endif
  517. #endif
  518. #ifdef RN
  519. LFD f16, 0 * SIZE(BO)
  520. LFD f17, 1 * SIZE(BO)
  521. FMUL f4, f17, f1
  522. FMUL f5, f17, f0
  523. FMUL f6, f17, f3
  524. FMUL f7, f17, f2
  525. #ifndef CONJ
  526. FMSUB f0, f16, f0, f4
  527. FMADD f1, f16, f1, f5
  528. FMSUB f2, f16, f2, f6
  529. FMADD f3, f16, f3, f7
  530. #else
  531. FMADD f0, f16, f0, f4
  532. FMSUB f1, f16, f1, f5
  533. FMADD f2, f16, f2, f6
  534. FMSUB f3, f16, f3, f7
  535. #endif
  536. #endif
  537. #ifdef RT
  538. LFD f20, 0 * SIZE(BO)
  539. LFD f21, 1 * SIZE(BO)
  540. FMUL f4, f21, f1
  541. FMUL f5, f21, f0
  542. FMUL f6, f21, f3
  543. FMUL f7, f21, f2
  544. #ifndef CONJ
  545. FMSUB f0, f20, f0, f4
  546. FMADD f1, f20, f1, f5
  547. FMSUB f2, f20, f2, f6
  548. FMADD f3, f20, f3, f7
  549. #else
  550. FMADD f0, f20, f0, f4
  551. FMSUB f1, f20, f1, f5
  552. FMADD f2, f20, f2, f6
  553. FMSUB f3, f20, f3, f7
  554. #endif
  555. #endif
  556. #ifdef LN
  557. subi CO1, CO1, 4 * SIZE
  558. #endif
  559. #if defined(LN) || defined(LT)
  560. STFD f0, 0 * SIZE(BO)
  561. STFD f1, 1 * SIZE(BO)
  562. STFD f2, 2 * SIZE(BO)
  563. STFD f3, 3 * SIZE(BO)
  564. #else
  565. STFD f0, 0 * SIZE(AO)
  566. STFD f1, 1 * SIZE(AO)
  567. STFD f2, 2 * SIZE(AO)
  568. STFD f3, 3 * SIZE(AO)
  569. #endif
  570. STFD f0, 0 * SIZE(CO1)
  571. STFD f1, 1 * SIZE(CO1)
  572. STFD f2, 2 * SIZE(CO1)
  573. STFD f3, 3 * SIZE(CO1)
  574. #ifndef LN
  575. addi CO1, CO1, 4 * SIZE
  576. #endif
  577. #ifdef RT
  578. slwi r0, K, 1 + ZBASE_SHIFT
  579. add AORIG, AORIG, r0
  580. #endif
  581. #if defined(LT) || defined(RN)
  582. sub TEMP, K, KK
  583. slwi r0, TEMP, 1 + ZBASE_SHIFT
  584. slwi TEMP, TEMP, 0 + ZBASE_SHIFT
  585. add AO, AO, r0
  586. add BO, BO, TEMP
  587. #endif
  588. #ifdef LT
  589. addi KK, KK, 2
  590. #endif
  591. #ifdef LN
  592. subi KK, KK, 2
  593. #endif
  594. addic. I, I, -1
  595. bgt LL(51)
  /*
   * LL(60): set-up for the leftover single row (M & 1) of the current panel.
   * Loads the first operands from AO and B/BO into f16-f23, clears the eight
   * accumulators f0-f7 and programs CTR with the number of 4x-unrolled
   * iterations that LL(62) will run.
   * NOTE(review): LFD, SIZE, ZBASE_SHIFT, FZERO and the register aliases
   * (AO, BO, AORIG, KK, TEMP, ...) are macros defined outside this section;
   * LFD is presumably a floating-point load and FZERO a zero constant.
   */
  596. .align 4
  597. LL(60):
  598. andi. I, M, 1
  /* M even: the masked value is zero, so skip straight to the epilogue. */
  599. ble LL(69)
  600. #if defined(LT) || defined(RN)
  /* LT/RN: operands are read from the current AO and from the start of B. */
  601. LFD f16, 0 * SIZE(AO)
  602. LFD f17, 1 * SIZE(AO)
  603. LFD f18, 2 * SIZE(AO)
  604. LFD f19, 3 * SIZE(AO)
  605. LFD f20, 0 * SIZE(B)
  606. LFD f21, 1 * SIZE(B)
  607. LFD f22, 2 * SIZE(B)
  608. LFD f23, 3 * SIZE(B)
  /* Seed f0 from FZERO and copy it into the other accumulators f1-f7. */
  609. lfs f0, FZERO
  610. fmr f1, f0
  611. fmr f2, f0
  612. fmr f3, f0
  613. fmr f4, f0
  614. fmr f5, f0
  615. fmr f6, f0
  616. fmr f7, f0
  /* r0 = KK / 4; the record form sets CR0, which the ble below consumes. */
  617. srawi. r0, KK, 2
  618. mr BO, B
  619. mtspr CTR, r0
  620. #else
  621. #ifdef LN
  /* LN: step AORIG back by K << ZBASE_SHIFT before deriving AO from it. */
  622. slwi r0, K, 0 + ZBASE_SHIFT
  623. sub AORIG, AORIG, r0
  624. #endif
  /* LN/RT: start KK steps into both operands; TEMP = K - KK steps remain. */
  625. slwi r0, KK, 0 + ZBASE_SHIFT
  626. add AO, AORIG, r0
  627. add BO, B, r0
  628. sub TEMP, K, KK
  629. LFD f16, 0 * SIZE(AO)
  630. LFD f17, 1 * SIZE(AO)
  631. LFD f18, 2 * SIZE(AO)
  632. LFD f19, 3 * SIZE(AO)
  633. LFD f20, 0 * SIZE(BO)
  634. LFD f21, 1 * SIZE(BO)
  635. LFD f22, 2 * SIZE(BO)
  636. LFD f23, 3 * SIZE(BO)
  /* Seed f0 from FZERO and copy it into the other accumulators f1-f7. */
  637. lfs f0, FZERO
  638. fmr f1, f0
  639. fmr f2, f0
  640. fmr f3, f0
  641. fmr f4, f0
  642. fmr f5, f0
  643. fmr f6, f0
  644. fmr f7, f0
  /* r0 = TEMP / 4; the record form sets CR0, which the ble below consumes. */
  645. srawi. r0, TEMP, 2
  646. mtspr CTR, r0
  647. #endif
  /* No full group of four steps: bypass the unrolled loop. */
  648. ble LL(65)
  /*
   * LL(62): accumulate loop for the single-row tile, unrolled four times.
   * Each iteration consumes 8 * SIZE from both AO and BO and runs CTR times.
   * Products of the (f16,f17) x (f20,f21) operands go to f0-f3, products of
   * the (f18,f19) x (f22,f23) operands go to f4-f7; LL(65) adds the two sets.
   * NOTE(review): the offset pairs suggest interleaved real/imaginary values
   * -- confirm against the packing routines. FMADD is assumed to expand to a
   * fused multiply-add (d = a * b + c).
   */
  649. .align 4
  650. LL(62):
  /* Step 1: the four cross products of (f16,f17) and (f20,f21). */
  651. FMADD f0, f16, f20, f0
  652. FMADD f1, f17, f21, f1
  653. FMADD f2, f17, f20, f2
  654. FMADD f3, f16, f21, f3
  655. LFD f16, 4 * SIZE(AO)
  656. LFD f17, 5 * SIZE(AO)
  657. LFD f20, 4 * SIZE(BO)
  658. LFD f21, 5 * SIZE(BO)
  /* Step 2: same pattern on (f18,f19) and (f22,f23), second accumulator set. */
  659. FMADD f4, f18, f22, f4
  660. FMADD f5, f19, f23, f5
  661. FMADD f6, f19, f22, f6
  662. FMADD f7, f18, f23, f7
  663. LFD f18, 6 * SIZE(AO)
  664. LFD f19, 7 * SIZE(AO)
  665. LFD f22, 6 * SIZE(BO)
  666. LFD f23, 7 * SIZE(BO)
  /* Step 3. */
  667. FMADD f0, f16, f20, f0
  668. FMADD f1, f17, f21, f1
  669. FMADD f2, f17, f20, f2
  670. FMADD f3, f16, f21, f3
  /* Offsets 8-11 become offsets 0-3 after the pointer bump below, i.e. these
     loads fetch the operands for the next iteration or the remainder loop. */
  671. LFD f16, 8 * SIZE(AO)
  672. LFD f17, 9 * SIZE(AO)
  673. LFD f20, 8 * SIZE(BO)
  674. LFD f21, 9 * SIZE(BO)
  /* Step 4. */
  675. FMADD f4, f18, f22, f4
  676. FMADD f5, f19, f23, f5
  677. FMADD f6, f19, f22, f6
  678. FMADD f7, f18, f23, f7
  679. LFD f18, 10 * SIZE(AO)
  680. LFD f19, 11 * SIZE(AO)
  681. LFD f22, 10 * SIZE(BO)
  682. LFD f23, 11 * SIZE(BO)
  683. addi AO, AO, 8 * SIZE
  684. addi BO, BO, 8 * SIZE
  685. bdnz LL(62)
  /*
   * LL(65): merge the second accumulator set (f4-f7) into the first (f0-f3),
   * then load CTR with the step count left over after the 4x-unrolled loop
   * (count & 3).
   */
  686. .align 4
  687. LL(65):
  688. fadd f0, f0, f4
  689. fadd f1, f1, f5
  690. fadd f2, f2, f6
  691. fadd f3, f3, f7
  692. #if defined(LT) || defined(RN)
  693. andi. r0, KK, 3
  694. #else
  695. andi. r0, TEMP, 3
  696. #endif
  697. mtspr CTR,r0
  /* CR0 comes from the andi. above: no leftover steps, skip the tail loop. */
  698. ble LL(67)
  /*
   * LL(66): tail of the accumulate loop for the single-row tile. One step per
   * iteration: four multiply-adds into f0-f3, then AO and BO each advance by
   * 2 * SIZE. Runs CTR times (set in LL(65)).
   */
  699. .align 4
  700. LL(66):
  701. FMADD f0, f16, f20, f0
  702. FMADD f1, f17, f21, f1
  703. FMADD f2, f17, f20, f2
  704. FMADD f3, f16, f21, f3
  /* Offsets 2-3 are the operands of the following step. */
  705. LFD f16, 2 * SIZE(AO)
  706. LFD f17, 3 * SIZE(AO)
  707. LFD f20, 2 * SIZE(BO)
  708. LFD f21, 3 * SIZE(BO)
  709. addi AO, AO, 2 * SIZE
  710. addi BO, BO, 2 * SIZE
  711. bdnz LL(66)
  /*
   * LL(67): finish the single-row tile.
   *   1. Collapse the four partial sums f0-f3 into one pair (f0,f1).
   *   2. Subtract that pair from the values stored in the packed buffer.
   *   3. Multiply the result by a two-value coefficient chosen by LN/LT/RN/RT.
   *   4. Write the result back to the packed buffer and to CO1.
   *   5. Advance AO/BO/AORIG/KK for the next tile.
   * NOTE(review): (f0,f1) is presumably the real/imaginary pair of one complex
   * value, and FMSUB/FMADD are assumed to expand to a*b-c / a*b+c -- confirm
   * against the macro definitions.
   */
  712. .align 4
  713. LL(67):
  714. #ifndef CONJ
  /* f0 = f0 - f1, f1 = f2 + f3 (the usual complex-product combination). */
  715. FSUB f0, f0, f1
  716. FADD f1, f2, f3
  717. #else
  /* CONJ variant: f0 = f0 + f1, f1 = f3 - f2. */
  718. FADD f0, f0, f1
  719. FSUB f1, f3, f2
  720. #endif
  721. #if defined(LN) || defined(RT)
  /* LN/RT: re-point AO and BO at element KK - 1 of their buffers. */
  722. subi r0, KK, 1
  723. slwi r0, r0, 0 + ZBASE_SHIFT
  724. add AO, AORIG, r0
  725. add BO, B, r0
  726. #endif
  727. #if defined(LN) || defined(LT)
  /* LN/LT: the right-hand side is read from BO; result = stored - sum. */
  728. LFD f16, 0 * SIZE(BO)
  729. LFD f17, 1 * SIZE(BO)
  730. FSUB f0, f16, f0
  731. FSUB f1, f17, f1
  732. #else
  /* RN/RT: the right-hand side is read from AO. */
  733. LFD f16, 0 * SIZE(AO)
  734. LFD f17, 1 * SIZE(AO)
  735. #ifndef CONJ
  736. FSUB f0, f16, f0
  737. FSUB f1, f17, f1
  738. #else
  /* CONJ: the second component is added instead of subtracted. */
  739. FSUB f0, f16, f0
  740. FADD f1, f17, f1
  741. #endif
  742. #endif
  743. #ifdef LN
  /* LN: multiply (f0,f1) by the pair at AO[0..1].
     NOTE(review): presumably a pre-inverted diagonal entry -- TODO confirm. */
  744. LFD f20, 0 * SIZE(AO)
  745. LFD f21, 1 * SIZE(AO)
  746. FMUL f4, f21, f1
  747. FMUL f5, f21, f0
  748. #ifndef CONJ
  749. FMSUB f0, f20, f0, f4
  750. FMADD f1, f20, f1, f5
  751. #else
  752. FMADD f0, f20, f0, f4
  753. FMSUB f1, f20, f1, f5
  754. #endif
  755. #endif
  756. #ifdef LT
  /* LT: multiply (f0,f1) by the pair at AO[0..1]. */
  757. LFD f16, 0 * SIZE(AO)
  758. LFD f17, 1 * SIZE(AO)
  759. FMUL f4, f17, f1
  760. FMUL f5, f17, f0
  761. #ifndef CONJ
  762. FMSUB f0, f16, f0, f4
  763. FMADD f1, f16, f1, f5
  764. #else
  765. FMADD f0, f16, f0, f4
  766. FMSUB f1, f16, f1, f5
  767. #endif
  768. #endif
  769. #ifdef RN
  /* RN: multiply (f0,f1) by the pair at BO[0..1]. */
  770. LFD f16, 0 * SIZE(BO)
  771. LFD f17, 1 * SIZE(BO)
  772. FMUL f4, f17, f1
  773. FMUL f5, f17, f0
  774. #ifndef CONJ
  775. FMSUB f0, f16, f0, f4
  776. FMADD f1, f16, f1, f5
  777. #else
  778. FMADD f0, f16, f0, f4
  779. FMSUB f1, f16, f1, f5
  780. #endif
  781. #endif
  782. #ifdef RT
  /* RT: multiply (f0,f1) by the pair at BO[0..1]. */
  783. LFD f20, 0 * SIZE(BO)
  784. LFD f21, 1 * SIZE(BO)
  785. FMUL f4, f21, f1
  786. FMUL f5, f21, f0
  787. #ifndef CONJ
  788. FMSUB f0, f20, f0, f4
  789. FMADD f1, f20, f1, f5
  790. #else
  791. FMADD f0, f20, f0, f4
  792. FMSUB f1, f20, f1, f5
  793. #endif
  794. #endif
  795. #ifdef LN
  /* LN walks the output backwards: move CO1 down before storing. */
  796. subi CO1, CO1, 2 * SIZE
  797. #endif
  798. #if defined(LN) || defined(LT)
  /* Write the result back into the packed buffer it was read from ... */
  799. STFD f0, 0 * SIZE(BO)
  800. STFD f1, 1 * SIZE(BO)
  801. #else
  802. STFD f0, 0 * SIZE(AO)
  803. STFD f1, 1 * SIZE(AO)
  804. #endif
  /* ... and into the output location addressed by CO1. */
  805. STFD f0, 0 * SIZE(CO1)
  806. STFD f1, 1 * SIZE(CO1)
  807. #ifndef LN
  808. addi CO1, CO1, 2 * SIZE
  809. #endif
  810. #ifdef RT
  /* RT: advance AORIG by K << ZBASE_SHIFT. */
  811. slwi r0, K, 0 + ZBASE_SHIFT
  812. add AORIG, AORIG, r0
  813. #endif
  814. #if defined(LT) || defined(RN)
  /* LT/RN: skip the K - KK steps that were not consumed in either buffer. */
  815. sub TEMP, K, KK
  816. slwi TEMP, TEMP, 0 + ZBASE_SHIFT
  817. add AO, AO, TEMP
  818. add BO, BO, TEMP
  819. #endif
  820. #ifdef LT
  821. addi KK, KK, 1
  822. #endif
  823. #ifdef LN
  824. subi KK, KK, 1
  825. #endif
  /*
   * LL(69): end-of-panel bookkeeping after the tiles handled above.
   * Updates B and KK according to the solve variant before falling through
   * to the next panel size at LL(30).
   */
  826. .align 4
  827. LL(69):
  828. #ifdef LN
  /* LN: advance B by K << ZBASE_SHIFT. */
  829. slwi r0, K, 0 + ZBASE_SHIFT
  830. add B, B, r0
  831. #endif
  832. #if defined(LT) || defined(RN)
  /* LT/RN: B continues from wherever BO ended up. */
  833. mr B, BO
  834. #endif
  835. #ifdef RN
  836. addi KK, KK, 1
  837. #endif
  838. #ifdef RT
  839. subi KK, KK, 1
  840. #endif
  /*
   * LL(30): set-up for the panel selected by bit 1 of N (N & 2).
   * Establishes the two output pointers CO1/CO2, resets KK for the LN/LT
   * variants, selects the A base pointer and computes I = M / 2, the number
   * of two-row tiles processed by the LL(31) loop.
   */
  841. .align 4
  842. LL(30):
  843. andi. J, N, 2
  /* Bit 1 of N clear: this panel size is not needed. */
  844. ble LL(50)
  845. #ifdef RT
  /* RT moves backwards: step B back by K << (1 + ZBASE_SHIFT) and C back by
     2 * LDC before use. */
  846. slwi r0, K, 1 + ZBASE_SHIFT
  847. sub B, B, r0
  848. slwi r0, LDC, 1
  849. sub C, C, r0
  850. #endif
  /* CO1 and CO2 address two output lines that are LDC apart. */
  851. mr CO1, C
  852. add CO2, C, LDC
  853. #ifdef LN
  854. add KK, M, OFFSET
  855. #endif
  856. #ifdef LT
  857. mr KK, OFFSET
  858. #endif
  /* I = M / 2; the record form sets CR0 for the ble at the end of the block
     (mr and add below leave CR0 untouched). */
  859. srawi. I, M, 1
  860. #if defined(LN) || defined(RT)
  861. mr AORIG, A
  862. #else
  863. mr AO, A
  864. #endif
  865. #ifndef RT
  /* Forward variants: C moves on to the line after CO2. */
  866. add C, CO2, LDC
  867. #endif
  /* No two-row tiles: go straight to the leftover-row handling. */
  868. ble LL(40)
  /*
   * LL(31): set-up for one two-row tile of the N & 2 panel (loop head, the
   * loop counter is I). Loads the first operands into f16-f23, clears the
   * sixteen accumulators f0-f15, issues cache hints for the two output lines
   * and programs CTR with the number of 8x-unrolled iterations for LL(32).
   * NOTE(review): PREC is defined elsewhere; presumably a prefetch distance.
   */
  869. .align 4
  870. LL(31):
  871. #if defined(LT) || defined(RN)
  /* LT/RN: operands come from the current AO and from the start of B. */
  872. LFD f16, 0 * SIZE(AO)
  873. LFD f17, 1 * SIZE(AO)
  874. LFD f18, 2 * SIZE(AO)
  875. LFD f19, 3 * SIZE(AO)
  876. LFD f20, 0 * SIZE(B)
  877. LFD f21, 1 * SIZE(B)
  878. LFD f22, 2 * SIZE(B)
  879. LFD f23, 3 * SIZE(B)
  /* Seed f0 from FZERO and copy it into accumulators f1-f15. */
  880. lfs f0, FZERO
  881. fmr f1, f0
  882. fmr f2, f0
  883. fmr f3, f0
  884. fmr f4, f0
  885. fmr f5, f0
  886. fmr f6, f0
  887. fmr f7, f0
  888. fmr f8, f0
  889. fmr f9, f0
  890. fmr f10, f0
  891. fmr f11, f0
  892. fmr f12, f0
  893. fmr f13, f0
  894. fmr f14, f0
  895. fmr f15, f0
  /* Cache hints (touch for store) on the two output lines. */
  896. dcbtst CO1, PREC
  897. dcbtst CO2, PREC
  /* r0 = KK / 8; CR0 from this record form feeds the ble after the #endif. */
  898. srawi. r0, KK, 3
  899. mtspr CTR, r0
  900. mr BO, B
  901. #else
  902. #ifdef LN
  /* LN: step AORIG back by K << (1 + ZBASE_SHIFT) first. */
  903. slwi r0, K, 1 + ZBASE_SHIFT
  904. sub AORIG, AORIG, r0
  905. #endif
  /* LN/RT: start KK steps into both buffers; TEMP = K - KK steps remain. */
  906. slwi TEMP, KK, 1 + ZBASE_SHIFT
  907. add AO, AORIG, TEMP
  908. add BO, B, TEMP
  909. sub TEMP, K, KK
  910. LFD f16, 0 * SIZE(AO)
  911. LFD f17, 1 * SIZE(AO)
  912. LFD f18, 2 * SIZE(AO)
  913. LFD f19, 3 * SIZE(AO)
  914. LFD f20, 0 * SIZE(BO)
  915. LFD f21, 1 * SIZE(BO)
  916. LFD f22, 2 * SIZE(BO)
  917. LFD f23, 3 * SIZE(BO)
  /* Seed f0 from FZERO and copy it into accumulators f1-f15. */
  918. lfs f0, FZERO
  919. fmr f1, f0
  920. fmr f2, f0
  921. fmr f3, f0
  922. fmr f4, f0
  923. fmr f5, f0
  924. fmr f6, f0
  925. fmr f7, f0
  926. fmr f8, f0
  927. fmr f9, f0
  928. fmr f10, f0
  929. fmr f11, f0
  930. fmr f12, f0
  931. fmr f13, f0
  932. fmr f14, f0
  933. fmr f15, f0
  /* Cache hints (touch for store) on the two output lines. */
  934. dcbtst CO1, PREC
  935. dcbtst CO2, PREC
  /* r0 = TEMP / 8; CR0 from this record form feeds the ble below. */
  936. srawi. r0, TEMP, 3
  937. mtspr CTR, r0
  938. #endif
  /* Fewer than eight steps: bypass the unrolled loop. */
  939. ble LL(35)
  /*
   * LL(32): accumulate loop for the two-row tile, unrolled eight times.
   * Every step multiplies four A-side values by four B-side values, giving
   * sixteen multiply-adds into f0-f15:
   *   f0..f3   += A[0..3] * B[0]      f4..f7   += A[0..3] * B[1]
   *   f8..f11  += A[0..3] * B[2]      f12..f15 += A[0..3] * B[3]
   * Steps alternate between operand registers f16-f23 and f24-f31 so that the
   * loads for the next step overlap the arithmetic of the current one.
   * Each iteration consumes 32 * SIZE from both AO and BO and runs CTR times.
   * NOTE(review): PREA is defined elsewhere; presumably a prefetch distance.
   */
  940. .align 4
  941. LL(32):
  /* Cache hints for upcoming operand data. */
  942. dcbt AO, PREA
  943. dcbtst BO, PREA
  /* Step 1 of 8: operands f16-f19 x f20-f23; loads fill f24-f31. */
  944. FMADD f0, f16, f20, f0
  945. FMADD f4, f16, f21, f4
  946. FMADD f8, f16, f22, f8
  947. FMADD f12, f16, f23, f12
  948. LFD f24, 4 * SIZE(AO)
  949. LFD f28, 4 * SIZE(BO)
  950. LFD f25, 5 * SIZE(AO)
  951. LFD f29, 5 * SIZE(BO)
  952. FMADD f1, f17, f20, f1
  953. FMADD f5, f17, f21, f5
  954. FMADD f9, f17, f22, f9
  955. FMADD f13, f17, f23, f13
  956. FMADD f2, f18, f20, f2
  957. FMADD f6, f18, f21, f6
  958. FMADD f10, f18, f22, f10
  959. FMADD f14, f18, f23, f14
  960. LFD f26, 6 * SIZE(AO)
  961. LFD f30, 6 * SIZE(BO)
  962. LFD f27, 7 * SIZE(AO)
  963. LFD f31, 7 * SIZE(BO)
  964. FMADD f3, f19, f20, f3
  965. FMADD f7, f19, f21, f7
  966. FMADD f11, f19, f22, f11
  967. FMADD f15, f19, f23, f15
  /* Step 2 of 8: operands f24-f27 x f28-f31; loads refill f16-f23. */
  968. FMADD f0, f24, f28, f0
  969. FMADD f4, f24, f29, f4
  970. FMADD f8, f24, f30, f8
  971. FMADD f12, f24, f31, f12
  972. LFD f16, 8 * SIZE(AO)
  973. LFD f20, 8 * SIZE(BO)
  974. LFD f17, 9 * SIZE(AO)
  975. LFD f21, 9 * SIZE(BO)
  976. FMADD f1, f25, f28, f1
  977. FMADD f5, f25, f29, f5
  978. FMADD f9, f25, f30, f9
  979. FMADD f13, f25, f31, f13
  980. FMADD f2, f26, f28, f2
  981. FMADD f6, f26, f29, f6
  982. FMADD f10, f26, f30, f10
  983. FMADD f14, f26, f31, f14
  984. LFD f18, 10 * SIZE(AO)
  985. LFD f22, 10 * SIZE(BO)
  986. LFD f19, 11 * SIZE(AO)
  987. LFD f23, 11 * SIZE(BO)
  988. FMADD f3, f27, f28, f3
  989. FMADD f7, f27, f29, f7
  990. FMADD f11, f27, f30, f11
  991. FMADD f15, f27, f31, f15
  /* Step 3 of 8. */
  992. FMADD f0, f16, f20, f0
  993. FMADD f4, f16, f21, f4
  994. FMADD f8, f16, f22, f8
  995. FMADD f12, f16, f23, f12
  996. LFD f24, 12 * SIZE(AO)
  997. LFD f28, 12 * SIZE(BO)
  998. LFD f25, 13 * SIZE(AO)
  999. LFD f29, 13 * SIZE(BO)
  1000. FMADD f1, f17, f20, f1
  1001. FMADD f5, f17, f21, f5
  1002. FMADD f9, f17, f22, f9
  1003. FMADD f13, f17, f23, f13
  1004. FMADD f2, f18, f20, f2
  1005. FMADD f6, f18, f21, f6
  1006. FMADD f10, f18, f22, f10
  1007. FMADD f14, f18, f23, f14
  1008. LFD f26, 14 * SIZE(AO)
  1009. LFD f30, 14 * SIZE(BO)
  1010. LFD f27, 15 * SIZE(AO)
  1011. LFD f31, 15 * SIZE(BO)
  1012. FMADD f3, f19, f20, f3
  1013. FMADD f7, f19, f21, f7
  1014. FMADD f11, f19, f22, f11
  1015. FMADD f15, f19, f23, f15
  /* Step 4 of 8. */
  1016. FMADD f0, f24, f28, f0
  1017. FMADD f4, f24, f29, f4
  1018. FMADD f8, f24, f30, f8
  1019. FMADD f12, f24, f31, f12
  1020. LFD f16, 16 * SIZE(AO)
  1021. LFD f20, 16 * SIZE(BO)
  1022. LFD f17, 17 * SIZE(AO)
  1023. LFD f21, 17 * SIZE(BO)
  1024. FMADD f1, f25, f28, f1
  1025. FMADD f5, f25, f29, f5
  1026. FMADD f9, f25, f30, f9
  1027. FMADD f13, f25, f31, f13
  1028. FMADD f2, f26, f28, f2
  1029. FMADD f6, f26, f29, f6
  1030. FMADD f10, f26, f30, f10
  1031. FMADD f14, f26, f31, f14
  1032. LFD f18, 18 * SIZE(AO)
  1033. LFD f22, 18 * SIZE(BO)
  1034. LFD f19, 19 * SIZE(AO)
  1035. LFD f23, 19 * SIZE(BO)
  1036. FMADD f3, f27, f28, f3
  1037. FMADD f7, f27, f29, f7
  1038. FMADD f11, f27, f30, f11
  1039. FMADD f15, f27, f31, f15
  /* Step 5 of 8. */
  1040. FMADD f0, f16, f20, f0
  1041. FMADD f4, f16, f21, f4
  1042. FMADD f8, f16, f22, f8
  1043. FMADD f12, f16, f23, f12
  1044. LFD f24, 20 * SIZE(AO)
  1045. LFD f28, 20 * SIZE(BO)
  1046. LFD f25, 21 * SIZE(AO)
  1047. LFD f29, 21 * SIZE(BO)
  1048. FMADD f1, f17, f20, f1
  1049. FMADD f5, f17, f21, f5
  1050. FMADD f9, f17, f22, f9
  1051. FMADD f13, f17, f23, f13
  1052. FMADD f2, f18, f20, f2
  1053. FMADD f6, f18, f21, f6
  1054. FMADD f10, f18, f22, f10
  1055. FMADD f14, f18, f23, f14
  1056. LFD f26, 22 * SIZE(AO)
  1057. LFD f30, 22 * SIZE(BO)
  1058. LFD f27, 23 * SIZE(AO)
  1059. LFD f31, 23 * SIZE(BO)
  1060. FMADD f3, f19, f20, f3
  1061. FMADD f7, f19, f21, f7
  1062. FMADD f11, f19, f22, f11
  1063. FMADD f15, f19, f23, f15
  /* Step 6 of 8. */
  1064. FMADD f0, f24, f28, f0
  1065. FMADD f4, f24, f29, f4
  1066. FMADD f8, f24, f30, f8
  1067. FMADD f12, f24, f31, f12
  1068. LFD f16, 24 * SIZE(AO)
  1069. LFD f20, 24 * SIZE(BO)
  1070. LFD f17, 25 * SIZE(AO)
  1071. LFD f21, 25 * SIZE(BO)
  1072. FMADD f1, f25, f28, f1
  1073. FMADD f5, f25, f29, f5
  1074. FMADD f9, f25, f30, f9
  1075. FMADD f13, f25, f31, f13
  1076. FMADD f2, f26, f28, f2
  1077. FMADD f6, f26, f29, f6
  1078. FMADD f10, f26, f30, f10
  1079. FMADD f14, f26, f31, f14
  1080. LFD f18, 26 * SIZE(AO)
  1081. LFD f22, 26 * SIZE(BO)
  1082. LFD f19, 27 * SIZE(AO)
  1083. LFD f23, 27 * SIZE(BO)
  1084. FMADD f3, f27, f28, f3
  1085. FMADD f7, f27, f29, f7
  1086. FMADD f11, f27, f30, f11
  1087. FMADD f15, f27, f31, f15
  /* Step 7 of 8. */
  1088. FMADD f0, f16, f20, f0
  1089. FMADD f4, f16, f21, f4
  1090. FMADD f8, f16, f22, f8
  1091. FMADD f12, f16, f23, f12
  1092. LFD f24, 28 * SIZE(AO)
  1093. LFD f28, 28 * SIZE(BO)
  1094. LFD f25, 29 * SIZE(AO)
  1095. LFD f29, 29 * SIZE(BO)
  1096. FMADD f1, f17, f20, f1
  1097. FMADD f5, f17, f21, f5
  1098. FMADD f9, f17, f22, f9
  1099. FMADD f13, f17, f23, f13
  1100. FMADD f2, f18, f20, f2
  1101. FMADD f6, f18, f21, f6
  1102. FMADD f10, f18, f22, f10
  1103. FMADD f14, f18, f23, f14
  1104. LFD f26, 30 * SIZE(AO)
  1105. LFD f30, 30 * SIZE(BO)
  1106. LFD f27, 31 * SIZE(AO)
  1107. LFD f31, 31 * SIZE(BO)
  1108. FMADD f3, f19, f20, f3
  1109. FMADD f7, f19, f21, f7
  1110. FMADD f11, f19, f22, f11
  1111. FMADD f15, f19, f23, f15
  /* Step 8 of 8. The loads at offsets 32-35 become offsets 0-3 after the
     pointer bump, i.e. the operands for the next iteration or for LL(36). */
  1112. FMADD f0, f24, f28, f0
  1113. FMADD f4, f24, f29, f4
  1114. FMADD f8, f24, f30, f8
  1115. FMADD f12, f24, f31, f12
  1116. LFD f16, 32 * SIZE(AO)
  1117. LFD f20, 32 * SIZE(BO)
  1118. LFD f17, 33 * SIZE(AO)
  1119. LFD f21, 33 * SIZE(BO)
  1120. FMADD f1, f25, f28, f1
  1121. FMADD f5, f25, f29, f5
  1122. FMADD f9, f25, f30, f9
  1123. FMADD f13, f25, f31, f13
  1124. FMADD f2, f26, f28, f2
  1125. FMADD f6, f26, f29, f6
  1126. FMADD f10, f26, f30, f10
  1127. FMADD f14, f26, f31, f14
  1128. LFD f18, 34 * SIZE(AO)
  1129. LFD f22, 34 * SIZE(BO)
  1130. LFD f19, 35 * SIZE(AO)
  1131. LFD f23, 35 * SIZE(BO)
  1132. addi AO, AO, 32 * SIZE
  1133. addi BO, BO, 32 * SIZE
  1134. FMADD f3, f27, f28, f3
  1135. FMADD f7, f27, f29, f7
  1136. FMADD f11, f27, f30, f11
  1137. FMADD f15, f27, f31, f15
  1138. bdnz LL(32)
  /*
   * LL(35): load CTR with the number of steps left over after the 8x-unrolled
   * loop (count & 7) and skip the tail loop when there are none.
   */
  1139. .align 4
  1140. LL(35):
  1141. #if defined(LT) || defined(RN)
  1142. andi. r0, KK, 7
  1143. #else
  1144. andi. r0, TEMP, 7
  1145. #endif
  1146. mtspr CTR, r0
  /* CR0 comes from the andi. above. */
  1147. ble LL(38)
  /*
   * LL(36): tail of the accumulate loop for the two-row tile. One step per
   * iteration: the same sixteen multiply-adds as a single step of LL(32),
   * after which AO and BO each advance by 4 * SIZE. Runs CTR times.
   */
  1148. .align 4
  1149. LL(36):
  1150. FMADD f0, f16, f20, f0
  1151. FMADD f4, f16, f21, f4
  1152. FMADD f8, f16, f22, f8
  1153. FMADD f12, f16, f23, f12
  1154. FMADD f1, f17, f20, f1
  1155. FMADD f5, f17, f21, f5
  1156. FMADD f9, f17, f22, f9
  1157. FMADD f13, f17, f23, f13
  1158. FMADD f2, f18, f20, f2
  1159. FMADD f6, f18, f21, f6
  1160. FMADD f10, f18, f22, f10
  1161. FMADD f14, f18, f23, f14
  1162. FMADD f3, f19, f20, f3
  1163. FMADD f7, f19, f21, f7
  1164. FMADD f11, f19, f22, f11
  1165. FMADD f15, f19, f23, f15
  /* Offsets 4-7 are the operands of the following step. */
  1166. LFD f16, 4 * SIZE(AO)
  1167. LFD f17, 5 * SIZE(AO)
  1168. LFD f18, 6 * SIZE(AO)
  1169. LFD f19, 7 * SIZE(AO)
  1170. LFD f20, 4 * SIZE(BO)
  1171. LFD f21, 5 * SIZE(BO)
  1172. LFD f22, 6 * SIZE(BO)
  1173. LFD f23, 7 * SIZE(BO)
  1174. addi BO, BO, 4 * SIZE
  1175. addi AO, AO, 4 * SIZE
  1176. bdnz LL(36)
  /*
   * LL(38): finish one two-row tile of the N & 2 panel.
   *   1. Collapse the sixteen partial sums into four pairs:
   *      (f0,f1), (f2,f3) for the CO1 line and (f8,f9), (f10,f11) for CO2.
   *   2. Subtract them from the values held in the packed buffer.
   *   3. Run the 2x2 triangular substitution selected by LN/LT/RN/RT.
   *   4. Store the results to the packed buffer and to CO1/CO2.
   *   5. Advance the pointers and KK, then loop while I > 0.
   * NOTE(review): each pair is presumably the real/imaginary parts of one
   * complex value, the coefficients read in step 3 are presumably pre-inverted
   * diagonal entries, and FMSUB/FMADD/FNMSUB/FNMADD are assumed to expand to
   * the PowerPC fused operations -- confirm against the macro definitions
   * and the packing routines.
   */
  1177. .align 4
  1178. LL(38):
  1179. #ifndef CONJ
  /* Pair up the partial sums: first of each pair by subtraction, second by
     addition. */
  1180. FSUB f0, f0, f5
  1181. FADD f1, f1, f4
  1182. FSUB f2, f2, f7
  1183. FADD f3, f3, f6
  1184. FSUB f8, f8, f13
  1185. FADD f9, f9, f12
  1186. FSUB f10, f10, f15
  1187. FADD f11, f11, f14
  1188. #else
  /* CONJ variant: signs are swapped relative to the branch above. */
  1189. FADD f0, f0, f5
  1190. FSUB f1, f4, f1
  1191. FADD f2, f2, f7
  1192. FSUB f3, f6, f3
  1193. FADD f8, f8, f13
  1194. FSUB f9, f12, f9
  1195. FADD f10, f10, f15
  1196. FSUB f11, f14, f11
  1197. #endif
  1198. #if defined(LN) || defined(RT)
  /* LN/RT: re-point AO and BO at step KK - 2 of their buffers. */
  1199. subi r0, KK, 2
  1200. slwi r0, r0, 1 + ZBASE_SHIFT
  1201. add AO, AORIG, r0
  1202. add BO, B, r0
  1203. #endif
  1204. #if defined(LN) || defined(LT)
  /* LN/LT: right-hand side is read from BO. Note the BO ordering:
     (f0,f1), (f8,f9), (f2,f3), (f10,f11). */
  1205. LFD f16, 0 * SIZE(BO)
  1206. LFD f17, 1 * SIZE(BO)
  1207. LFD f18, 2 * SIZE(BO)
  1208. LFD f19, 3 * SIZE(BO)
  1209. LFD f20, 4 * SIZE(BO)
  1210. LFD f21, 5 * SIZE(BO)
  1211. LFD f22, 6 * SIZE(BO)
  1212. LFD f23, 7 * SIZE(BO)
  1213. FSUB f0, f16, f0
  1214. FSUB f1, f17, f1
  1215. FSUB f8, f18, f8
  1216. FSUB f9, f19, f9
  1217. FSUB f2, f20, f2
  1218. FSUB f3, f21, f3
  1219. FSUB f10, f22, f10
  1220. FSUB f11, f23, f11
  1221. #else
  /* RN/RT: right-hand side is read from AO, in the order
     (f0,f1), (f2,f3), (f8,f9), (f10,f11). */
  1222. LFD f16, 0 * SIZE(AO)
  1223. LFD f17, 1 * SIZE(AO)
  1224. LFD f18, 2 * SIZE(AO)
  1225. LFD f19, 3 * SIZE(AO)
  1226. LFD f20, 4 * SIZE(AO)
  1227. LFD f21, 5 * SIZE(AO)
  1228. LFD f22, 6 * SIZE(AO)
  1229. LFD f23, 7 * SIZE(AO)
  1230. #ifndef CONJ
  1231. FSUB f0, f16, f0
  1232. FSUB f1, f17, f1
  1233. FSUB f2, f18, f2
  1234. FSUB f3, f19, f3
  1235. FSUB f8, f20, f8
  1236. FSUB f9, f21, f9
  1237. FSUB f10, f22, f10
  1238. FSUB f11, f23, f11
  1239. #else
  /* CONJ: the second component of each pair is added instead. */
  1240. FSUB f0, f16, f0
  1241. FADD f1, f17, f1
  1242. FSUB f2, f18, f2
  1243. FADD f3, f19, f3
  1244. FSUB f8, f20, f8
  1245. FADD f9, f21, f9
  1246. FSUB f10, f22, f10
  1247. FADD f11, f23, f11
  1248. #endif
  1249. #endif
  1250. #ifdef LN
  /* LN: backward substitution over the two rows.
     (f16,f17) = AO[6..7] scales the second row (f2,f3)/(f10,f11),
     (f18,f19) = AO[4..5] eliminates it from the first row,
     (f20,f21) = AO[0..1] scales the first row (f0,f1)/(f8,f9). */
  1251. LFD f16, 6 * SIZE(AO)
  1252. LFD f17, 7 * SIZE(AO)
  1253. LFD f18, 4 * SIZE(AO)
  1254. LFD f19, 5 * SIZE(AO)
  1255. LFD f20, 0 * SIZE(AO)
  1256. LFD f21, 1 * SIZE(AO)
  1257. FMUL f6, f17, f3
  1258. FMUL f7, f17, f2
  1259. FMUL f14, f17, f11
  1260. FMUL f15, f17, f10
  1261. #ifndef CONJ
  1262. FMSUB f2, f16, f2, f6
  1263. FMADD f3, f16, f3, f7
  1264. FMSUB f10, f16, f10, f14
  1265. FMADD f11, f16, f11, f15
  1266. FMADD f0, f19, f3, f0
  1267. FNMSUB f1, f19, f2, f1
  1268. FMADD f8, f19, f11, f8
  1269. FNMSUB f9, f19, f10, f9
  1270. FNMSUB f0, f18, f2, f0
  1271. FNMSUB f1, f18, f3, f1
  1272. FNMSUB f8, f18, f10, f8
  1273. FNMSUB f9, f18, f11, f9
  1274. FMUL f4, f21, f1
  1275. FMUL f5, f21, f0
  1276. FMUL f12, f21, f9
  1277. FMUL f13, f21, f8
  1278. FMSUB f0, f20, f0, f4
  1279. FMADD f1, f20, f1, f5
  1280. FMSUB f8, f20, f8, f12
  1281. FMADD f9, f20, f9, f13
  1282. #else
  1283. FMADD f2, f16, f2, f6
  1284. FMSUB f3, f16, f3, f7
  1285. FMADD f10, f16, f10, f14
  1286. FMSUB f11, f16, f11, f15
  1287. FMSUB f0, f19, f3, f0
  1288. FNMADD f1, f19, f2, f1
  1289. FMSUB f8, f19, f11, f8
  1290. FNMADD f9, f19, f10, f9
  1291. FNMADD f0, f18, f2, f0
  1292. FNMADD f1, f18, f3, f1
  1293. FNMADD f8, f18, f10, f8
  1294. FNMADD f9, f18, f11, f9
  1295. FMUL f4, f21, f1
  1296. FMUL f5, f21, f0
  1297. FMUL f12, f21, f9
  1298. FMUL f13, f21, f8
  1299. FMADD f0, f20, f0, f4
  1300. FMSUB f1, f20, f1, f5
  1301. FMADD f8, f20, f8, f12
  1302. FMSUB f9, f20, f9, f13
  1303. #endif
  1304. #endif
  1305. #ifdef LT
  /* LT: forward substitution over the two rows.
     (f16,f17) = AO[0..1] scales the first row (f0,f1)/(f8,f9),
     (f18,f19) = AO[2..3] eliminates it from the second row,
     (f20,f21) = AO[6..7] scales the second row (f2,f3)/(f10,f11). */
  1306. LFD f16, 0 * SIZE(AO)
  1307. LFD f17, 1 * SIZE(AO)
  1308. LFD f18, 2 * SIZE(AO)
  1309. LFD f19, 3 * SIZE(AO)
  1310. LFD f20, 6 * SIZE(AO)
  1311. LFD f21, 7 * SIZE(AO)
  1312. FMUL f4, f17, f1
  1313. FMUL f5, f17, f0
  1314. FMUL f12, f17, f9
  1315. FMUL f13, f17, f8
  1316. #ifndef CONJ
  1317. FMSUB f0, f16, f0, f4
  1318. FMADD f1, f16, f1, f5
  1319. FMSUB f8, f16, f8, f12
  1320. FMADD f9, f16, f9, f13
  1321. FMADD f2, f19, f1, f2
  1322. FNMSUB f3, f19, f0, f3
  1323. FMADD f10, f19, f9, f10
  1324. FNMSUB f11, f19, f8, f11
  1325. FNMSUB f2, f18, f0, f2
  1326. FNMSUB f3, f18, f1, f3
  1327. FNMSUB f10, f18, f8, f10
  1328. FNMSUB f11, f18, f9, f11
  1329. FMUL f4, f21, f3
  1330. FMUL f5, f21, f2
  1331. FMUL f12, f21, f11
  1332. FMUL f13, f21, f10
  1333. FMSUB f2, f20, f2, f4
  1334. FMADD f3, f20, f3, f5
  1335. FMSUB f10, f20, f10, f12
  1336. FMADD f11, f20, f11, f13
  1337. #else
  1338. FMADD f0, f16, f0, f4
  1339. FMSUB f1, f16, f1, f5
  1340. FMADD f8, f16, f8, f12
  1341. FMSUB f9, f16, f9, f13
  1342. FMSUB f2, f19, f1, f2
  1343. FNMADD f3, f19, f0, f3
  1344. FMSUB f10, f19, f9, f10
  1345. FNMADD f11, f19, f8, f11
  1346. FNMADD f2, f18, f0, f2
  1347. FNMADD f3, f18, f1, f3
  1348. FNMADD f10, f18, f8, f10
  1349. FNMADD f11, f18, f9, f11
  1350. FMUL f4, f21, f3
  1351. FMUL f5, f21, f2
  1352. FMUL f12, f21, f11
  1353. FMUL f13, f21, f10
  1354. FMADD f2, f20, f2, f4
  1355. FMSUB f3, f20, f3, f5
  1356. FMADD f10, f20, f10, f12
  1357. FMSUB f11, f20, f11, f13
  1358. #endif
  1359. #endif
  1360. #ifdef RN
  /* RN: forward substitution over the two output lines.
     (f16,f17) = BO[0..1] scales the CO1 values f0-f3,
     (f18,f19) = BO[2..3] eliminates them from the CO2 values f8-f11,
     (f20,f21) = BO[6..7] scales the CO2 values. */
  1361. LFD f16, 0 * SIZE(BO)
  1362. LFD f17, 1 * SIZE(BO)
  1363. LFD f18, 2 * SIZE(BO)
  1364. LFD f19, 3 * SIZE(BO)
  1365. LFD f20, 6 * SIZE(BO)
  1366. LFD f21, 7 * SIZE(BO)
  1367. FMUL f4, f17, f1
  1368. FMUL f5, f17, f0
  1369. FMUL f6, f17, f3
  1370. FMUL f7, f17, f2
  1371. #ifndef CONJ
  1372. FMSUB f0, f16, f0, f4
  1373. FMADD f1, f16, f1, f5
  1374. FMSUB f2, f16, f2, f6
  1375. FMADD f3, f16, f3, f7
  1376. FMADD f8, f19, f1, f8
  1377. FNMSUB f9, f19, f0, f9
  1378. FMADD f10, f19, f3, f10
  1379. FNMSUB f11, f19, f2, f11
  1380. FNMSUB f8, f18, f0, f8
  1381. FNMSUB f9, f18, f1, f9
  1382. FNMSUB f10, f18, f2, f10
  1383. FNMSUB f11, f18, f3, f11
  1384. FMUL f4, f21, f9
  1385. FMUL f5, f21, f8
  1386. FMUL f6, f21, f11
  1387. FMUL f7, f21, f10
  1388. FMSUB f8, f20, f8, f4
  1389. FMADD f9, f20, f9, f5
  1390. FMSUB f10, f20, f10, f6
  1391. FMADD f11, f20, f11, f7
  1392. #else
  1393. FMADD f0, f16, f0, f4
  1394. FMSUB f1, f16, f1, f5
  1395. FMADD f2, f16, f2, f6
  1396. FMSUB f3, f16, f3, f7
  1397. FMSUB f8, f19, f1, f8
  1398. FNMADD f9, f19, f0, f9
  1399. FMSUB f10, f19, f3, f10
  1400. FNMADD f11, f19, f2, f11
  1401. FNMADD f8, f18, f0, f8
  1402. FNMADD f9, f18, f1, f9
  1403. FNMADD f10, f18, f2, f10
  1404. FNMADD f11, f18, f3, f11
  1405. FMUL f4, f21, f9
  1406. FMUL f5, f21, f8
  1407. FMUL f6, f21, f11
  1408. FMUL f7, f21, f10
  1409. FMADD f8, f20, f8, f4
  1410. FMSUB f9, f20, f9, f5
  1411. FMADD f10, f20, f10, f6
  1412. FMSUB f11, f20, f11, f7
  1413. #endif
  1414. #endif
  1415. #ifdef RT
  /* RT: backward substitution over the two output lines.
     (f16,f17) = BO[6..7] scales the CO2 values f8-f11,
     (f18,f19) = BO[4..5] eliminates them from the CO1 values f0-f3,
     (f20,f21) = BO[0..1] scales the CO1 values. */
  1416. LFD f16, 6 * SIZE(BO)
  1417. LFD f17, 7 * SIZE(BO)
  1418. LFD f18, 4 * SIZE(BO)
  1419. LFD f19, 5 * SIZE(BO)
  1420. LFD f20, 0 * SIZE(BO)
  1421. LFD f21, 1 * SIZE(BO)
  1422. FMUL f12, f17, f9
  1423. FMUL f13, f17, f8
  1424. FMUL f14, f17, f11
  1425. FMUL f15, f17, f10
  1426. #ifndef CONJ
  1427. FMSUB f8, f16, f8, f12
  1428. FMADD f9, f16, f9, f13
  1429. FMSUB f10, f16, f10, f14
  1430. FMADD f11, f16, f11, f15
  1431. FMADD f0, f19, f9, f0
  1432. FNMSUB f1, f19, f8, f1
  1433. FMADD f2, f19, f11, f2
  1434. FNMSUB f3, f19, f10, f3
  1435. FNMSUB f0, f18, f8, f0
  1436. FNMSUB f1, f18, f9, f1
  1437. FNMSUB f2, f18, f10, f2
  1438. FNMSUB f3, f18, f11, f3
  1439. FMUL f4, f21, f1
  1440. FMUL f5, f21, f0
  1441. FMUL f6, f21, f3
  1442. FMUL f7, f21, f2
  1443. FMSUB f0, f20, f0, f4
  1444. FMADD f1, f20, f1, f5
  1445. FMSUB f2, f20, f2, f6
  1446. FMADD f3, f20, f3, f7
  1447. #else
  1448. FMADD f8, f16, f8, f12
  1449. FMSUB f9, f16, f9, f13
  1450. FMADD f10, f16, f10, f14
  1451. FMSUB f11, f16, f11, f15
  1452. FMSUB f0, f19, f9, f0
  1453. FNMADD f1, f19, f8, f1
  1454. FMSUB f2, f19, f11, f2
  1455. FNMADD f3, f19, f10, f3
  1456. FNMADD f0, f18, f8, f0
  1457. FNMADD f1, f18, f9, f1
  1458. FNMADD f2, f18, f10, f2
  1459. FNMADD f3, f18, f11, f3
  1460. FMUL f4, f21, f1
  1461. FMUL f5, f21, f0
  1462. FMUL f6, f21, f3
  1463. FMUL f7, f21, f2
  1464. FMADD f0, f20, f0, f4
  1465. FMSUB f1, f20, f1, f5
  1466. FMADD f2, f20, f2, f6
  1467. FMSUB f3, f20, f3, f7
  1468. #endif
  1469. #endif
  1470. #ifdef LN
  /* LN walks the output backwards: move CO1/CO2 down before storing. */
  1471. subi CO1, CO1, 4 * SIZE
  1472. subi CO2, CO2, 4 * SIZE
  1473. #endif
  1474. #if defined(LN) || defined(LT)
  /* Write back to BO in the same order it was read:
     (f0,f1), (f8,f9), (f2,f3), (f10,f11). */
  1475. STFD f0, 0 * SIZE(BO)
  1476. STFD f1, 1 * SIZE(BO)
  1477. STFD f8, 2 * SIZE(BO)
  1478. STFD f9, 3 * SIZE(BO)
  1479. STFD f2, 4 * SIZE(BO)
  1480. STFD f3, 5 * SIZE(BO)
  1481. STFD f10, 6 * SIZE(BO)
  1482. STFD f11, 7 * SIZE(BO)
  1483. #else
  /* RN/RT: write back to AO in the order it was read. */
  1484. STFD f0, 0 * SIZE(AO)
  1485. STFD f1, 1 * SIZE(AO)
  1486. STFD f2, 2 * SIZE(AO)
  1487. STFD f3, 3 * SIZE(AO)
  1488. STFD f8, 4 * SIZE(AO)
  1489. STFD f9, 5 * SIZE(AO)
  1490. STFD f10, 6 * SIZE(AO)
  1491. STFD f11, 7 * SIZE(AO)
  1492. #endif
  /* Store the solved values to the two output lines. */
  1493. STFD f0, 0 * SIZE(CO1)
  1494. STFD f1, 1 * SIZE(CO1)
  1495. STFD f2, 2 * SIZE(CO1)
  1496. STFD f3, 3 * SIZE(CO1)
  1497. STFD f8, 0 * SIZE(CO2)
  1498. STFD f9, 1 * SIZE(CO2)
  1499. STFD f10, 2 * SIZE(CO2)
  1500. STFD f11, 3 * SIZE(CO2)
  1501. #ifndef LN
  1502. addi CO1, CO1, 4 * SIZE
  1503. addi CO2, CO2, 4 * SIZE
  1504. #endif
  1505. #ifdef RT
  /* RT: advance AORIG by K << (1 + ZBASE_SHIFT). */
  1506. slwi r0, K, 1 + ZBASE_SHIFT
  1507. add AORIG, AORIG, r0
  1508. #endif
  1509. #if defined(LT) || defined(RN)
  /* LT/RN: skip the K - KK steps that were not consumed in either buffer. */
  1510. sub TEMP, K, KK
  1511. slwi TEMP, TEMP, 1 + ZBASE_SHIFT
  1512. add AO, AO, TEMP
  1513. add BO, BO, TEMP
  1514. #endif
  1515. #ifdef LT
  1516. addi KK, KK, 2
  1517. #endif
  1518. #ifdef LN
  1519. subi KK, KK, 2
  1520. #endif
  /* Next two-row tile while I is still positive after the decrement. */
  1521. addic. I, I, -1
  1522. bgt LL(31)
  /*
   * LL(40): set-up for the leftover single row (M & 1) of the N & 2 panel.
   * Loads four A-side values into f16-f19 and eight B-side values into
   * f20-f27, clears accumulators f0-f7 and programs CTR with the number of
   * 4x-unrolled iterations for LL(42).
   */
  1523. .align 4
  1524. LL(40):
  1525. andi. I, M, 1
  /* M even: nothing left over in this panel. */
  1526. ble LL(49)
  1527. #if defined(LT) || defined(RN)
  /* LT/RN: operands come from the current AO and from the start of B. */
  1528. LFD f16, 0 * SIZE(AO)
  1529. LFD f17, 1 * SIZE(AO)
  1530. LFD f18, 2 * SIZE(AO)
  1531. LFD f19, 3 * SIZE(AO)
  1532. LFD f20, 0 * SIZE(B)
  1533. LFD f21, 1 * SIZE(B)
  1534. LFD f22, 2 * SIZE(B)
  1535. LFD f23, 3 * SIZE(B)
  1536. LFD f24, 4 * SIZE(B)
  1537. LFD f25, 5 * SIZE(B)
  1538. LFD f26, 6 * SIZE(B)
  1539. LFD f27, 7 * SIZE(B)
  /* Seed f0 from FZERO and copy it into accumulators f1-f7. */
  1540. lfs f0, FZERO
  1541. fmr f1, f0
  1542. fmr f2, f0
  1543. fmr f3, f0
  1544. fmr f4, f0
  1545. fmr f5, f0
  1546. fmr f6, f0
  1547. fmr f7, f0
  /* r0 = KK / 4; CR0 from this record form feeds the ble after the #endif. */
  1548. srawi. r0, KK, 2
  1549. mr BO, B
  1550. mtspr CTR, r0
  1551. #else
  1552. #ifdef LN
  /* LN: step AORIG back by K << ZBASE_SHIFT first. */
  1553. slwi r0, K, 0 + ZBASE_SHIFT
  1554. sub AORIG, AORIG, r0
  1555. #endif
  /* LN/RT: start KK steps in; the B-side stride is twice the A-side stride.
     TEMP = K - KK steps remain. */
  1556. slwi r0, KK, 0 + ZBASE_SHIFT
  1557. slwi TEMP, KK, 1 + ZBASE_SHIFT
  1558. add AO, AORIG, r0
  1559. add BO, B, TEMP
  1560. sub TEMP, K, KK
  1561. LFD f16, 0 * SIZE(AO)
  1562. LFD f17, 1 * SIZE(AO)
  1563. LFD f18, 2 * SIZE(AO)
  1564. LFD f19, 3 * SIZE(AO)
  1565. LFD f20, 0 * SIZE(BO)
  1566. LFD f21, 1 * SIZE(BO)
  1567. LFD f22, 2 * SIZE(BO)
  1568. LFD f23, 3 * SIZE(BO)
  1569. LFD f24, 4 * SIZE(BO)
  1570. LFD f25, 5 * SIZE(BO)
  1571. LFD f26, 6 * SIZE(BO)
  1572. LFD f27, 7 * SIZE(BO)
  /* Seed f0 from FZERO and copy it into accumulators f1-f7. */
  1573. lfs f0, FZERO
  1574. fmr f1, f0
  1575. fmr f2, f0
  1576. fmr f3, f0
  1577. fmr f4, f0
  1578. fmr f5, f0
  1579. fmr f6, f0
  1580. fmr f7, f0
  /* r0 = TEMP / 4; CR0 from this record form feeds the ble below. */
  1581. srawi. r0, TEMP, 2
  1582. mtspr CTR, r0
  1583. #endif
  /* No full group of four steps: bypass the unrolled loop. */
  1584. ble LL(45)
  /*
   * LL(42): accumulate loop for the single-row tile of the N & 2 panel,
   * unrolled four times. Each step multiplies two A-side values by four
   * B-side values:
   *   f0..f3 += A[0] * B[0..3]      f4..f7 += A[1] * B[0..3]
   * Steps alternate between (f16,f17) x f20-f23 and (f18,f19) x f24-f27.
   * Each iteration consumes 8 * SIZE from AO and 16 * SIZE from BO and runs
   * CTR times.
   */
  1585. .align 4
  1586. LL(42):
  /* Step 1: (f16,f17) x f20-f23. */
  1587. FMADD f0, f16, f20, f0
  1588. FMADD f1, f16, f21, f1
  1589. FMADD f2, f16, f22, f2
  1590. FMADD f3, f16, f23, f3
  1591. FMADD f4, f17, f20, f4
  1592. FMADD f5, f17, f21, f5
  1593. FMADD f6, f17, f22, f6
  1594. FMADD f7, f17, f23, f7
  1595. LFD f20, 8 * SIZE(BO)
  1596. LFD f21, 9 * SIZE(BO)
  1597. LFD f22, 10 * SIZE(BO)
  1598. LFD f23, 11 * SIZE(BO)
  /* Step 2: (f18,f19) x f24-f27. */
  1599. FMADD f0, f18, f24, f0
  1600. FMADD f1, f18, f25, f1
  1601. FMADD f2, f18, f26, f2
  1602. FMADD f3, f18, f27, f3
  1603. FMADD f4, f19, f24, f4
  1604. FMADD f5, f19, f25, f5
  1605. FMADD f6, f19, f26, f6
  1606. FMADD f7, f19, f27, f7
  1607. LFD f24, 12 * SIZE(BO)
  1608. LFD f25, 13 * SIZE(BO)
  1609. LFD f26, 14 * SIZE(BO)
  1610. LFD f27, 15 * SIZE(BO)
  1611. LFD f16, 4 * SIZE(AO)
  1612. LFD f17, 5 * SIZE(AO)
  1613. LFD f18, 6 * SIZE(AO)
  1614. LFD f19, 7 * SIZE(AO)
  /* Step 3. */
  1615. FMADD f0, f16, f20, f0
  1616. FMADD f1, f16, f21, f1
  1617. FMADD f2, f16, f22, f2
  1618. FMADD f3, f16, f23, f3
  1619. FMADD f4, f17, f20, f4
  1620. FMADD f5, f17, f21, f5
  1621. FMADD f6, f17, f22, f6
  1622. FMADD f7, f17, f23, f7
  1623. LFD f20, 16 * SIZE(BO)
  1624. LFD f21, 17 * SIZE(BO)
  1625. LFD f22, 18 * SIZE(BO)
  1626. LFD f23, 19 * SIZE(BO)
  /* Step 4. */
  1627. FMADD f0, f18, f24, f0
  1628. FMADD f1, f18, f25, f1
  1629. FMADD f2, f18, f26, f2
  1630. FMADD f3, f18, f27, f3
  1631. FMADD f4, f19, f24, f4
  1632. FMADD f5, f19, f25, f5
  1633. FMADD f6, f19, f26, f6
  1634. FMADD f7, f19, f27, f7
  /* These loads land on offsets 0-3 (AO) and 0-7 (BO) after the pointer
     bumps below: operands for the next iteration or for the tail loop. */
  1635. LFD f16, 8 * SIZE(AO)
  1636. LFD f17, 9 * SIZE(AO)
  1637. LFD f18, 10 * SIZE(AO)
  1638. LFD f19, 11 * SIZE(AO)
  1639. LFD f24, 20 * SIZE(BO)
  1640. LFD f25, 21 * SIZE(BO)
  1641. LFD f26, 22 * SIZE(BO)
  1642. LFD f27, 23 * SIZE(BO)
  1643. addi BO, BO, 16 * SIZE
  1644. addi AO, AO, 8 * SIZE
  1645. bdnz LL(42)
  /*
   * LL(45): load CTR with the number of steps left over after the 4x-unrolled
   * loop (count & 3) and skip the tail loop when there are none.
   */
  1646. .align 4
  1647. LL(45):
  1648. #if defined(LT) || defined(RN)
  1649. andi. r0, KK, 3
  1650. #else
  1651. andi. r0, TEMP, 3
  1652. #endif
  1653. mtspr CTR, r0
  /* CR0 comes from the andi. above. */
  1654. ble LL(47)
  /*
   * LL(46): tail of the accumulate loop for the single-row tile of the N & 2
   * panel. One step per iteration: eight multiply-adds into f0-f7, then AO
   * advances by 2 * SIZE and BO by 4 * SIZE. Runs CTR times.
   */
  1655. .align 4
  1656. LL(46):
  1657. FMADD f0, f16, f20, f0
  1658. FMADD f1, f16, f21, f1
  1659. FMADD f2, f16, f22, f2
  1660. FMADD f3, f16, f23, f3
  1661. FMADD f4, f17, f20, f4
  1662. FMADD f5, f17, f21, f5
  1663. FMADD f6, f17, f22, f6
  1664. FMADD f7, f17, f23, f7
  /* Fetch the operands of the following step. */
  1665. LFD f20, 4 * SIZE(BO)
  1666. LFD f21, 5 * SIZE(BO)
  1667. LFD f22, 6 * SIZE(BO)
  1668. LFD f23, 7 * SIZE(BO)
  1669. LFD f16, 2 * SIZE(AO)
  1670. LFD f17, 3 * SIZE(AO)
  1671. addi AO, AO, 2 * SIZE
  1672. addi BO, BO, 4 * SIZE
  1673. bdnz LL(46)
  1674. .align 4
  /*
   * LL(47): finish, solve and store one tile held in f0..f3
   * (f0/f1 are written to CO1, f2/f3 to CO2).
   * NOTE(review): each even/odd register pair appears to be one complex
   * value (real, imaginary) -- inferred from the multiply patterns below;
   * confirm against the packing code.  FADD/FSUB/FMUL/FMADD/FMSUB/FNMADD/
   * FNMSUB/LFD/STFD are macros defined outside this chunk; the comments
   * assume they map to the PowerPC instructions of the same name.
   */
  1675. LL(47):
  /* Fold the partial sums f4..f7 into f0..f3; the signs depend on CONJ
     and, when CONJ is set, on whether this is a left (LN/LT) variant. */
  1676. #ifndef CONJ
  1677. FSUB f0, f0, f5
  1678. FADD f1, f1, f4
  1679. FSUB f2, f2, f7
  1680. FADD f3, f3, f6
  1681. #else
  1682. #if defined(LN) || defined(LT)
  1683. FADD f0, f0, f5
  1684. FSUB f1, f1, f4
  1685. FADD f2, f2, f7
  1686. FSUB f3, f3, f6
  1687. #else
  1688. FADD f0, f0, f5
  1689. FSUB f1, f4, f1
  1690. FADD f2, f2, f7
  1691. FSUB f3, f6, f3
  1692. #endif
  1693. #endif
  /* LN/RT: reposition the pointers.  r0 = KK - 1 (LN) or KK - 2 (RT);
     AO = AORIG + (r0 << ZBASE_SHIFT), BO = B + (r0 << (1 + ZBASE_SHIFT)). */
  1694. #if defined(LN) || defined(RT)
  1695. #ifdef LN
  1696. subi r0, KK, 1
  1697. #else
  1698. subi r0, KK, 2
  1699. #endif
  1700. slwi TEMP, r0, 0 + ZBASE_SHIFT
  1701. slwi r0, r0, 1 + ZBASE_SHIFT
  1702. add AO, AORIG, TEMP
  1703. add BO, B, r0
  1704. #endif
  /* Replace each accumulator by (stored value - accumulator).  The four
     stored values come from BO for LN/LT and from AO otherwise. */
  1705. #if defined(LN) || defined(LT)
  1706. LFD f16, 0 * SIZE(BO)
  1707. LFD f17, 1 * SIZE(BO)
  1708. LFD f18, 2 * SIZE(BO)
  1709. LFD f19, 3 * SIZE(BO)
  1710. FSUB f0, f16, f0
  1711. FSUB f1, f17, f1
  1712. FSUB f2, f18, f2
  1713. FSUB f3, f19, f3
  1714. #else
  1715. LFD f16, 0 * SIZE(AO)
  1716. LFD f17, 1 * SIZE(AO)
  1717. LFD f20, 2 * SIZE(AO)
  1718. LFD f21, 3 * SIZE(AO)
  1719. FSUB f0, f16, f0
  1720. FSUB f1, f17, f1
  1721. FSUB f2, f20, f2
  1722. FSUB f3, f21, f3
  1723. #endif
  /* LN: multiply both pairs (f0,f1) and (f2,f3) by the pair (f20,f21)
     loaded from AO[0..1]; the CONJ branch flips the cross-term signs.
     NOTE(review): the multiplier is presumably a pre-inverted diagonal
     entry, since no division is performed here -- confirm. */
  1724. #ifdef LN
  1725. LFD f20, 0 * SIZE(AO)
  1726. LFD f21, 1 * SIZE(AO)
  1727. FMUL f4, f21, f1
  1728. FMUL f5, f21, f0
  1729. FMUL f12, f21, f3
  1730. FMUL f13, f21, f2
  1731. #ifndef CONJ
  1732. FMSUB f0, f20, f0, f4
  1733. FMADD f1, f20, f1, f5
  1734. FMSUB f2, f20, f2, f12
  1735. FMADD f3, f20, f3, f13
  1736. #else
  1737. FMADD f0, f20, f0, f4
  1738. FMSUB f1, f20, f1, f5
  1739. FMADD f2, f20, f2, f12
  1740. FMSUB f3, f20, f3, f13
  1741. #endif
  1742. #endif
  /* LT: same multiply as the LN case, with the multiplier pair held in
     (f16,f17), also loaded from AO[0..1]. */
  1743. #ifdef LT
  1744. LFD f16, 0 * SIZE(AO)
  1745. LFD f17, 1 * SIZE(AO)
  1746. FMUL f4, f17, f1
  1747. FMUL f5, f17, f0
  1748. FMUL f12, f17, f3
  1749. FMUL f13, f17, f2
  1750. #ifndef CONJ
  1751. FMSUB f0, f16, f0, f4
  1752. FMADD f1, f16, f1, f5
  1753. FMSUB f2, f16, f2, f12
  1754. FMADD f3, f16, f3, f13
  1755. #else
  1756. FMADD f0, f16, f0, f4
  1757. FMSUB f1, f16, f1, f5
  1758. FMADD f2, f16, f2, f12
  1759. FMSUB f3, f16, f3, f13
  1760. #endif
  1761. #endif
  /* RN: forward order.  Multiply (f0,f1) by BO[0..1], update (f2,f3) with
     the product of BO[2..3] and the new (f0,f1), then multiply (f2,f3)
     by BO[6..7]. */
  1762. #ifdef RN
  1763. LFD f16, 0 * SIZE(BO)
  1764. LFD f17, 1 * SIZE(BO)
  1765. LFD f18, 2 * SIZE(BO)
  1766. LFD f19, 3 * SIZE(BO)
  1767. LFD f20, 6 * SIZE(BO)
  1768. LFD f21, 7 * SIZE(BO)
  1769. FMUL f4, f17, f1
  1770. FMUL f5, f17, f0
  1771. #ifndef CONJ
  1772. FMSUB f0, f16, f0, f4
  1773. FMADD f1, f16, f1, f5
  1774. FMADD f2, f19, f1, f2
  1775. FNMSUB f3, f19, f0, f3
  1776. FNMSUB f2, f18, f0, f2
  1777. FNMSUB f3, f18, f1, f3
  1778. FMUL f4, f21, f3
  1779. FMUL f5, f21, f2
  1780. FMSUB f2, f20, f2, f4
  1781. FMADD f3, f20, f3, f5
  1782. #else
  1783. FMADD f0, f16, f0, f4
  1784. FMSUB f1, f16, f1, f5
  1785. FMSUB f2, f19, f1, f2
  1786. FNMADD f3, f19, f0, f3
  1787. FNMADD f2, f18, f0, f2
  1788. FNMADD f3, f18, f1, f3
  1789. FMUL f4, f21, f3
  1790. FMUL f5, f21, f2
  1791. FMADD f2, f20, f2, f4
  1792. FMSUB f3, f20, f3, f5
  1793. #endif
  1794. #endif
  /* RT: reverse order.  Multiply (f2,f3) by BO[6..7], update (f0,f1) with
     the product of BO[4..5] and the new (f2,f3), then multiply (f0,f1)
     by BO[0..1]. */
  1795. #ifdef RT
  1796. LFD f16, 6 * SIZE(BO)
  1797. LFD f17, 7 * SIZE(BO)
  1798. LFD f18, 4 * SIZE(BO)
  1799. LFD f19, 5 * SIZE(BO)
  1800. LFD f20, 0 * SIZE(BO)
  1801. LFD f21, 1 * SIZE(BO)
  1802. FMUL f12, f17, f3
  1803. FMUL f13, f17, f2
  1804. #ifndef CONJ
  1805. FMSUB f2, f16, f2, f12
  1806. FMADD f3, f16, f3, f13
  1807. FMADD f0, f19, f3, f0
  1808. FNMSUB f1, f19, f2, f1
  1809. FNMSUB f0, f18, f2, f0
  1810. FNMSUB f1, f18, f3, f1
  1811. FMUL f4, f21, f1
  1812. FMUL f5, f21, f0
  1813. FMSUB f0, f20, f0, f4
  1814. FMADD f1, f20, f1, f5
  1815. #else
  1816. FMADD f2, f16, f2, f12
  1817. FMSUB f3, f16, f3, f13
  1818. FMSUB f0, f19, f3, f0
  1819. FNMADD f1, f19, f2, f1
  1820. FNMADD f0, f18, f2, f0
  1821. FNMADD f1, f18, f3, f1
  1822. FMUL f4, f21, f1
  1823. FMUL f5, f21, f0
  1824. FMADD f0, f20, f0, f4
  1825. FMSUB f1, f20, f1, f5
  1826. #endif
  1827. #endif
  /* LN: step CO1/CO2 back by 2 * SIZE before storing. */
  1828. #ifdef LN
  1829. subi CO1, CO1, 2 * SIZE
  1830. subi CO2, CO2, 2 * SIZE
  1831. #endif
  /* Write the results back to the buffer the stored values were read
     from (BO for LN/LT, AO otherwise). */
  1832. #if defined(LN) || defined(LT)
  1833. STFD f0, 0 * SIZE(BO)
  1834. STFD f1, 1 * SIZE(BO)
  1835. STFD f2, 2 * SIZE(BO)
  1836. STFD f3, 3 * SIZE(BO)
  1837. #else
  1838. STFD f0, 0 * SIZE(AO)
  1839. STFD f1, 1 * SIZE(AO)
  1840. STFD f2, 2 * SIZE(AO)
  1841. STFD f3, 3 * SIZE(AO)
  1842. #endif
  /* Store the results through CO1 and CO2. */
  1843. STFD f0, 0 * SIZE(CO1)
  1844. STFD f1, 1 * SIZE(CO1)
  1845. STFD f2, 0 * SIZE(CO2)
  1846. STFD f3, 1 * SIZE(CO2)
  /* All variants except LN advance CO1/CO2 by 2 * SIZE after the store. */
  1847. #ifndef LN
  1848. addi CO1, CO1, 2 * SIZE
  1849. addi CO2, CO2, 2 * SIZE
  1850. #endif
  /* RT: AORIG += K << ZBASE_SHIFT. */
  1851. #ifdef RT
  1852. slwi r0, K, 0 + ZBASE_SHIFT
  1853. add AORIG, AORIG, r0
  1854. #endif
  /* LT/RN: advance AO by (K - KK) << ZBASE_SHIFT and BO by
     (K - KK) << (1 + ZBASE_SHIFT). */
  1855. #if defined(LT) || defined(RN)
  1856. sub TEMP, K, KK
  1857. slwi r0, TEMP, 0 + ZBASE_SHIFT
  1858. slwi TEMP, TEMP, 1 + ZBASE_SHIFT
  1859. add AO, AO, r0
  1860. add BO, BO, TEMP
  1861. #endif
  /* KK moves by one for the left variants: +1 for LT, -1 for LN. */
  1862. #ifdef LT
  1863. addi KK, KK, 1
  1864. #endif
  1865. #ifdef LN
  1866. subi KK, KK, 1
  1867. #endif
  1868. .align 4
  /*
   * LL(49): bookkeeping at the end of this column section; updates B and KK.
   * NOTE(review): the adjustments of 2 suggest a two-column panel --
   * confirm against the code preceding this chunk.
   */
  1869. LL(49):
  /* LN: B += K << (1 + ZBASE_SHIFT). */
  1870. #ifdef LN
  1871. slwi r0, K, 1 + ZBASE_SHIFT
  1872. add B, B, r0
  1873. #endif
  /* LT/RN: B takes the current value of BO. */
  1874. #if defined(LT) || defined(RN)
  1875. mr B, BO
  1876. #endif
  /* Right-side variants move KK by 2: +2 for RN, -2 for RT. */
  1877. #ifdef RN
  1878. addi KK, KK, 2
  1879. #endif
  1880. #ifdef RT
  1881. subi KK, KK, 2
  1882. #endif
  1883. .align 4
  /*
   * LL(50): J = N >> 2 (arithmetic shift; the dot form also sets CR0).
   * Branch to LL(999) when the result is <= 0, i.e. when there is no
   * group of four to process.
   */
  1884. LL(50):
  1885. srawi. J, N, 2
  1886. ble LL(999)
  1887. .align 4
  /*
   * LL(10): setup for a panel of four columns of C (CO1..CO4).
   * NOTE(review): presumably the head of the loop counted by J (set at
   * LL(50)); the loop-back branch is outside this chunk.
   */
  1888. LL(10):
  /* RT: step B back by K << (2 + ZBASE_SHIFT) and C back by LDC << 2
     before the column pointers are derived. */
  1889. #ifdef RT
  1890. slwi r0, K, 2 + ZBASE_SHIFT
  1891. sub B, B, r0
  1892. slwi r0, LDC, 2
  1893. sub C, C, r0
  1894. #endif
  /* Four column pointers, each LDC apart, starting at C. */
  1895. mr CO1, C
  1896. add CO2, C, LDC
  1897. add CO3, CO2, LDC
  1898. add CO4, CO3, LDC
  /* Initial KK for the left variants: M + OFFSET (LN) or OFFSET (LT). */
  1899. #ifdef LN
  1900. add KK, M, OFFSET
  1901. #endif
  1902. #ifdef LT
  1903. mr KK, OFFSET
  1904. #endif
  /* Load a single-precision value from FZERO into f0 and copy it to
     f1..f15.  NOTE(review): FZERO is defined outside this chunk;
     presumably a memory slot holding 0.0, so this clears the sixteen
     accumulators -- confirm. */
  1905. lfs f0, FZERO
  1906. fmr f1, f0
  1907. fmr f2, f0
  1908. fmr f3, f0
  1909. fmr f4, f0
  1910. fmr f5, f0
  1911. fmr f6, f0
  1912. fmr f7, f0
  1913. fmr f8, f0
  1914. fmr f9, f0
  1915. fmr f10, f0
  1916. fmr f11, f0
  1917. fmr f12, f0
  1918. fmr f13, f0
  1919. fmr f14, f0
  1920. fmr f15, f0
  /* I = M >> 1; the CR0 result set here is the one tested by the ble
     below (mr and add without the dot suffix leave CR0 unchanged). */
  1921. srawi. I, M, 1
  1922. #if defined(LN) || defined(RT)
  1923. mr AORIG, A
  1924. #else
  1925. mr AO, A
  1926. #endif
  /* All variants except RT move C past the four columns (C = CO4 + LDC). */
  1927. #ifndef RT
  1928. add C, CO4, LDC
  1929. #endif
  /* Skip to LL(20) when I <= 0. */
  1930. ble LL(20)
  1931. .align 4
  /*
   * LL(11): per-tile setup before the unrolled loop at LL(12).
   * Positions AO/BO, preloads four values of A into f16..f19 and eight
   * values of B into f20..f27, touches the four C columns for store
   * (dcbtst with offset PREC), and puts (count >> 3) into CTR.
   * The count is KK for LT/RN and K - KK (kept in TEMP) for LN/RT.
   */
  1932. LL(11):
  1933. #if defined(LT) || defined(RN)
  /* LT/RN: A is read from the current AO, B from the start of the panel. */
  1934. LFD f16, 0 * SIZE(AO)
  1935. LFD f20, 0 * SIZE(B)
  1936. LFD f17, 1 * SIZE(AO)
  1937. LFD f21, 1 * SIZE(B)
  1938. LFD f18, 2 * SIZE(AO)
  1939. LFD f22, 2 * SIZE(B)
  1940. LFD f19, 3 * SIZE(AO)
  1941. LFD f23, 3 * SIZE(B)
  1942. LFD f24, 4 * SIZE(B)
  1943. LFD f25, 5 * SIZE(B)
  1944. LFD f26, 6 * SIZE(B)
  1945. LFD f27, 7 * SIZE(B)
  1946. dcbtst CO1, PREC
  1947. dcbtst CO2, PREC
  1948. dcbtst CO3, PREC
  1949. dcbtst CO4, PREC
  /* CTR = KK >> 3; CR0 from this srawi. feeds the ble after the #endif. */
  1950. srawi. r0, KK, 3
  1951. mtspr CTR, r0
  1952. mr BO, B
  1953. #else
  /* LN only: AORIG -= K << (1 + ZBASE_SHIFT). */
  1954. #ifdef LN
  1955. slwi r0, K, 1 + ZBASE_SHIFT
  1956. sub AORIG, AORIG, r0
  1957. #endif
  /* LN/RT: AO = AORIG + (KK << (1 + ZBASE_SHIFT)),
     BO = B + (KK << (2 + ZBASE_SHIFT)), TEMP = K - KK. */
  1958. slwi r0, KK, 1 + ZBASE_SHIFT
  1959. slwi TEMP, KK, 2 + ZBASE_SHIFT
  1960. add AO, AORIG, r0
  1961. add BO, B, TEMP
  1962. sub TEMP, K, KK
  1963. LFD f16, 0 * SIZE(AO)
  1964. LFD f20, 0 * SIZE(BO)
  1965. LFD f17, 1 * SIZE(AO)
  1966. LFD f21, 1 * SIZE(BO)
  1967. LFD f18, 2 * SIZE(AO)
  1968. LFD f22, 2 * SIZE(BO)
  1969. LFD f19, 3 * SIZE(AO)
  1970. LFD f23, 3 * SIZE(BO)
  1971. LFD f24, 4 * SIZE(BO)
  1972. LFD f25, 5 * SIZE(BO)
  1973. LFD f26, 6 * SIZE(BO)
  1974. LFD f27, 7 * SIZE(BO)
  1975. dcbtst CO1, PREC
  1976. dcbtst CO2, PREC
  1977. dcbtst CO3, PREC
  1978. dcbtst CO4, PREC
  /* CTR = TEMP >> 3; CR0 from this srawi. feeds the ble below. */
  1979. srawi. r0, TEMP, 3
  1980. mtspr CTR, r0
  1981. #endif
  /* No full group of eight steps: go straight to the remainder at LL(15). */
  1982. ble LL(15)
  1983. .align 4
  /*
   * LL(12): main accumulation loop, unrolled eight times; CTR counts passes.
   * Each step combines four A values (f16..f19 or f28..f31, alternating)
   * with eight B values (f20..f27) into the sixteen accumulators:
   *   f0..f3   use B pair f20/f21,   f4..f7   use B pair f22/f23,
   *   f8..f11  use B pair f24/f25,   f12..f15 use B pair f26/f27.
   * While one A register set is in use the other is loaded for the next
   * step, and the B registers are reloaded as soon as they are free.
   * One pass consumes 32 * SIZE of A and 64 * SIZE of B.
   * NOTE(review): FMA1..FMA4 are macros defined outside this chunk;
   * presumably fused multiply-adds whose sign depends on the conjugation
   * variant -- confirm.  The instruction order looks hand-scheduled; do
   * not reorder without measuring.
   */
  1984. LL(12):
  /* Prefetch hints PREA ahead of AO (for load) and BO (for store, dcbtst). */
  1985. dcbt AO, PREA
  1986. dcbtst BO, PREA
  /* Step 1 of 8: A in f16..f19; loads f28..f31 from AO[4..7], B from BO[8..15]. */
  1987. FMA1 f0, f16, f20, f0
  1988. FMA1 f2, f18, f20, f2
  1989. FMA2 f1, f16, f21, f1
  1990. FMA2 f3, f18, f21, f3
  1991. LFD f28, 4 * SIZE(AO)
  1992. LFD f29, 5 * SIZE(AO)
  1993. LFD f30, 6 * SIZE(AO)
  1994. LFD f31, 7 * SIZE(AO)
  1995. FMA1 f4, f16, f22, f4
  1996. FMA1 f6, f18, f22, f6
  1997. FMA2 f5, f16, f23, f5
  1998. FMA2 f7, f18, f23, f7
  1999. FMA1 f8, f16, f24, f8
  2000. FMA1 f10, f18, f24, f10
  2001. FMA2 f9, f16, f25, f9
  2002. FMA2 f11, f18, f25, f11
  2003. FMA1 f12, f16, f26, f12
  2004. FMA1 f14, f18, f26, f14
  2005. FMA2 f13, f16, f27, f13
  2006. FMA2 f15, f18, f27, f15
  2007. FMA4 f1, f17, f20, f1
  2008. FMA4 f3, f19, f20, f3
  2009. FMA3 f0, f17, f21, f0
  2010. FMA3 f2, f19, f21, f2
  2011. FMA4 f5, f17, f22, f5
  2012. FMA4 f7, f19, f22, f7
  2013. FMA3 f4, f17, f23, f4
  2014. FMA3 f6, f19, f23, f6
  2015. LFD f20, 8 * SIZE(BO)
  2016. LFD f21, 9 * SIZE(BO)
  2017. LFD f22, 10 * SIZE(BO)
  2018. LFD f23, 11 * SIZE(BO)
  2019. FMA4 f9, f17, f24, f9
  2020. FMA4 f11, f19, f24, f11
  2021. FMA3 f8, f17, f25, f8
  2022. FMA3 f10, f19, f25, f10
  2023. FMA4 f13, f17, f26, f13
  2024. FMA4 f15, f19, f26, f15
  2025. FMA3 f12, f17, f27, f12
  2026. FMA3 f14, f19, f27, f14
  2027. LFD f24, 12 * SIZE(BO)
  2028. LFD f25, 13 * SIZE(BO)
  2029. LFD f26, 14 * SIZE(BO)
  2030. LFD f27, 15 * SIZE(BO)
  /* Step 2 of 8: A in f28..f31; loads f16..f19 from AO[8..11], B from BO[16..23]. */
  2031. FMA1 f0, f28, f20, f0
  2032. FMA1 f2, f30, f20, f2
  2033. FMA2 f1, f28, f21, f1
  2034. FMA2 f3, f30, f21, f3
  2035. LFD f16, 8 * SIZE(AO)
  2036. LFD f17, 9 * SIZE(AO)
  2037. LFD f18, 10 * SIZE(AO)
  2038. LFD f19, 11 * SIZE(AO)
  2039. FMA1 f4, f28, f22, f4
  2040. FMA1 f6, f30, f22, f6
  2041. FMA2 f5, f28, f23, f5
  2042. FMA2 f7, f30, f23, f7
  2043. FMA1 f8, f28, f24, f8
  2044. FMA1 f10, f30, f24, f10
  2045. FMA2 f9, f28, f25, f9
  2046. FMA2 f11, f30, f25, f11
  2047. FMA1 f12, f28, f26, f12
  2048. FMA1 f14, f30, f26, f14
  2049. FMA2 f13, f28, f27, f13
  2050. FMA2 f15, f30, f27, f15
  2051. FMA4 f1, f29, f20, f1
  2052. FMA4 f3, f31, f20, f3
  2053. FMA3 f0, f29, f21, f0
  2054. FMA3 f2, f31, f21, f2
  2055. FMA4 f5, f29, f22, f5
  2056. FMA4 f7, f31, f22, f7
  2057. FMA3 f4, f29, f23, f4
  2058. FMA3 f6, f31, f23, f6
  2059. LFD f20, 16 * SIZE(BO)
  2060. LFD f21, 17 * SIZE(BO)
  2061. LFD f22, 18 * SIZE(BO)
  2062. LFD f23, 19 * SIZE(BO)
  2063. FMA4 f9, f29, f24, f9
  2064. FMA4 f11, f31, f24, f11
  2065. FMA3 f8, f29, f25, f8
  2066. FMA3 f10, f31, f25, f10
  2067. FMA4 f13, f29, f26, f13
  2068. FMA4 f15, f31, f26, f15
  2069. FMA3 f12, f29, f27, f12
  2070. FMA3 f14, f31, f27, f14
  2071. LFD f24, 20 * SIZE(BO)
  2072. LFD f25, 21 * SIZE(BO)
  2073. LFD f26, 22 * SIZE(BO)
  2074. LFD f27, 23 * SIZE(BO)
  /* Step 3 of 8: A in f16..f19; loads f28..f31 from AO[12..15], B from BO[24..31]. */
  2075. FMA1 f0, f16, f20, f0
  2076. FMA1 f2, f18, f20, f2
  2077. FMA2 f1, f16, f21, f1
  2078. FMA2 f3, f18, f21, f3
  2079. LFD f28, 12 * SIZE(AO)
  2080. LFD f29, 13 * SIZE(AO)
  2081. LFD f30, 14 * SIZE(AO)
  2082. LFD f31, 15 * SIZE(AO)
  2083. FMA1 f4, f16, f22, f4
  2084. FMA1 f6, f18, f22, f6
  2085. FMA2 f5, f16, f23, f5
  2086. FMA2 f7, f18, f23, f7
  2087. FMA1 f8, f16, f24, f8
  2088. FMA1 f10, f18, f24, f10
  2089. FMA2 f9, f16, f25, f9
  2090. FMA2 f11, f18, f25, f11
  2091. FMA1 f12, f16, f26, f12
  2092. FMA1 f14, f18, f26, f14
  2093. FMA2 f13, f16, f27, f13
  2094. FMA2 f15, f18, f27, f15
  2095. FMA4 f1, f17, f20, f1
  2096. FMA4 f3, f19, f20, f3
  2097. FMA3 f0, f17, f21, f0
  2098. FMA3 f2, f19, f21, f2
  2099. FMA4 f5, f17, f22, f5
  2100. FMA4 f7, f19, f22, f7
  2101. FMA3 f4, f17, f23, f4
  2102. FMA3 f6, f19, f23, f6
  2103. LFD f20, 24 * SIZE(BO)
  2104. LFD f21, 25 * SIZE(BO)
  2105. LFD f22, 26 * SIZE(BO)
  2106. LFD f23, 27 * SIZE(BO)
  2107. FMA4 f9, f17, f24, f9
  2108. FMA4 f11, f19, f24, f11
  2109. FMA3 f8, f17, f25, f8
  2110. FMA3 f10, f19, f25, f10
  2111. FMA4 f13, f17, f26, f13
  2112. FMA4 f15, f19, f26, f15
  2113. FMA3 f12, f17, f27, f12
  2114. FMA3 f14, f19, f27, f14
  2115. LFD f24, 28 * SIZE(BO)
  2116. LFD f25, 29 * SIZE(BO)
  2117. LFD f26, 30 * SIZE(BO)
  2118. LFD f27, 31 * SIZE(BO)
  /* Step 4 of 8: A in f28..f31; loads f16..f19 from AO[16..19], B from BO[32..39]. */
  2119. FMA1 f0, f28, f20, f0
  2120. FMA1 f2, f30, f20, f2
  2121. FMA2 f1, f28, f21, f1
  2122. FMA2 f3, f30, f21, f3
  2123. LFD f16, 16 * SIZE(AO)
  2124. LFD f17, 17 * SIZE(AO)
  2125. LFD f18, 18 * SIZE(AO)
  2126. LFD f19, 19 * SIZE(AO)
  2127. FMA1 f4, f28, f22, f4
  2128. FMA1 f6, f30, f22, f6
  2129. FMA2 f5, f28, f23, f5
  2130. FMA2 f7, f30, f23, f7
  2131. FMA1 f8, f28, f24, f8
  2132. FMA1 f10, f30, f24, f10
  2133. FMA2 f9, f28, f25, f9
  2134. FMA2 f11, f30, f25, f11
  2135. FMA1 f12, f28, f26, f12
  2136. FMA1 f14, f30, f26, f14
  2137. FMA2 f13, f28, f27, f13
  2138. FMA2 f15, f30, f27, f15
  2139. FMA4 f1, f29, f20, f1
  2140. FMA4 f3, f31, f20, f3
  2141. FMA3 f0, f29, f21, f0
  2142. FMA3 f2, f31, f21, f2
  2143. FMA4 f5, f29, f22, f5
  2144. FMA4 f7, f31, f22, f7
  2145. FMA3 f4, f29, f23, f4
  2146. FMA3 f6, f31, f23, f6
  2147. LFD f20, 32 * SIZE(BO)
  2148. LFD f21, 33 * SIZE(BO)
  2149. LFD f22, 34 * SIZE(BO)
  2150. LFD f23, 35 * SIZE(BO)
  2151. FMA4 f9, f29, f24, f9
  2152. FMA4 f11, f31, f24, f11
  2153. FMA3 f8, f29, f25, f8
  2154. FMA3 f10, f31, f25, f10
  2155. FMA4 f13, f29, f26, f13
  2156. FMA4 f15, f31, f26, f15
  2157. FMA3 f12, f29, f27, f12
  2158. FMA3 f14, f31, f27, f14
  2159. LFD f24, 36 * SIZE(BO)
  2160. LFD f25, 37 * SIZE(BO)
  2161. LFD f26, 38 * SIZE(BO)
  2162. LFD f27, 39 * SIZE(BO)
  /* Step 5 of 8: A in f16..f19; loads f28..f31 from AO[20..23], B from BO[40..47]. */
  2163. FMA1 f0, f16, f20, f0
  2164. FMA1 f2, f18, f20, f2
  2165. FMA2 f1, f16, f21, f1
  2166. FMA2 f3, f18, f21, f3
  2167. LFD f28, 20 * SIZE(AO)
  2168. LFD f29, 21 * SIZE(AO)
  2169. LFD f30, 22 * SIZE(AO)
  2170. LFD f31, 23 * SIZE(AO)
  2171. FMA1 f4, f16, f22, f4
  2172. FMA1 f6, f18, f22, f6
  2173. FMA2 f5, f16, f23, f5
  2174. FMA2 f7, f18, f23, f7
  2175. FMA1 f8, f16, f24, f8
  2176. FMA1 f10, f18, f24, f10
  2177. FMA2 f9, f16, f25, f9
  2178. FMA2 f11, f18, f25, f11
  2179. FMA1 f12, f16, f26, f12
  2180. FMA1 f14, f18, f26, f14
  2181. FMA2 f13, f16, f27, f13
  2182. FMA2 f15, f18, f27, f15
  2183. FMA4 f1, f17, f20, f1
  2184. FMA4 f3, f19, f20, f3
  2185. FMA3 f0, f17, f21, f0
  2186. FMA3 f2, f19, f21, f2
  2187. FMA4 f5, f17, f22, f5
  2188. FMA4 f7, f19, f22, f7
  2189. FMA3 f4, f17, f23, f4
  2190. FMA3 f6, f19, f23, f6
  2191. LFD f20, 40 * SIZE(BO)
  2192. LFD f21, 41 * SIZE(BO)
  2193. LFD f22, 42 * SIZE(BO)
  2194. LFD f23, 43 * SIZE(BO)
  2195. FMA4 f9, f17, f24, f9
  2196. FMA4 f11, f19, f24, f11
  2197. FMA3 f8, f17, f25, f8
  2198. FMA3 f10, f19, f25, f10
  2199. FMA4 f13, f17, f26, f13
  2200. FMA4 f15, f19, f26, f15
  2201. FMA3 f12, f17, f27, f12
  2202. FMA3 f14, f19, f27, f14
  2203. LFD f24, 44 * SIZE(BO)
  2204. LFD f25, 45 * SIZE(BO)
  2205. LFD f26, 46 * SIZE(BO)
  2206. LFD f27, 47 * SIZE(BO)
  /* Step 6 of 8: A in f28..f31; loads f16..f19 from AO[24..27], B from BO[48..55]. */
  2207. FMA1 f0, f28, f20, f0
  2208. FMA1 f2, f30, f20, f2
  2209. FMA2 f1, f28, f21, f1
  2210. FMA2 f3, f30, f21, f3
  2211. LFD f16, 24 * SIZE(AO)
  2212. LFD f17, 25 * SIZE(AO)
  2213. LFD f18, 26 * SIZE(AO)
  2214. LFD f19, 27 * SIZE(AO)
  2215. FMA1 f4, f28, f22, f4
  2216. FMA1 f6, f30, f22, f6
  2217. FMA2 f5, f28, f23, f5
  2218. FMA2 f7, f30, f23, f7
  2219. FMA1 f8, f28, f24, f8
  2220. FMA1 f10, f30, f24, f10
  2221. FMA2 f9, f28, f25, f9
  2222. FMA2 f11, f30, f25, f11
  2223. FMA1 f12, f28, f26, f12
  2224. FMA1 f14, f30, f26, f14
  2225. FMA2 f13, f28, f27, f13
  2226. FMA2 f15, f30, f27, f15
  2227. FMA4 f1, f29, f20, f1
  2228. FMA4 f3, f31, f20, f3
  2229. FMA3 f0, f29, f21, f0
  2230. FMA3 f2, f31, f21, f2
  2231. FMA4 f5, f29, f22, f5
  2232. FMA4 f7, f31, f22, f7
  2233. FMA3 f4, f29, f23, f4
  2234. FMA3 f6, f31, f23, f6
  2235. LFD f20, 48 * SIZE(BO)
  2236. LFD f21, 49 * SIZE(BO)
  2237. LFD f22, 50 * SIZE(BO)
  2238. LFD f23, 51 * SIZE(BO)
  2239. FMA4 f9, f29, f24, f9
  2240. FMA4 f11, f31, f24, f11
  2241. FMA3 f8, f29, f25, f8
  2242. FMA3 f10, f31, f25, f10
  2243. FMA4 f13, f29, f26, f13
  2244. FMA4 f15, f31, f26, f15
  2245. FMA3 f12, f29, f27, f12
  2246. FMA3 f14, f31, f27, f14
  2247. LFD f24, 52 * SIZE(BO)
  2248. LFD f25, 53 * SIZE(BO)
  2249. LFD f26, 54 * SIZE(BO)
  2250. LFD f27, 55 * SIZE(BO)
  /* Step 7 of 8: A in f16..f19; loads f28..f31 from AO[28..31], B from BO[56..63]. */
  2251. FMA1 f0, f16, f20, f0
  2252. FMA1 f2, f18, f20, f2
  2253. FMA2 f1, f16, f21, f1
  2254. FMA2 f3, f18, f21, f3
  2255. LFD f28, 28 * SIZE(AO)
  2256. LFD f29, 29 * SIZE(AO)
  2257. LFD f30, 30 * SIZE(AO)
  2258. LFD f31, 31 * SIZE(AO)
  2259. FMA1 f4, f16, f22, f4
  2260. FMA1 f6, f18, f22, f6
  2261. FMA2 f5, f16, f23, f5
  2262. FMA2 f7, f18, f23, f7
  2263. FMA1 f8, f16, f24, f8
  2264. FMA1 f10, f18, f24, f10
  2265. FMA2 f9, f16, f25, f9
  2266. FMA2 f11, f18, f25, f11
  2267. FMA1 f12, f16, f26, f12
  2268. FMA1 f14, f18, f26, f14
  2269. FMA2 f13, f16, f27, f13
  2270. FMA2 f15, f18, f27, f15
  2271. FMA4 f1, f17, f20, f1
  2272. FMA4 f3, f19, f20, f3
  2273. FMA3 f0, f17, f21, f0
  2274. FMA3 f2, f19, f21, f2
  2275. FMA4 f5, f17, f22, f5
  2276. FMA4 f7, f19, f22, f7
  2277. FMA3 f4, f17, f23, f4
  2278. FMA3 f6, f19, f23, f6
  2279. LFD f20, 56 * SIZE(BO)
  2280. LFD f21, 57 * SIZE(BO)
  2281. LFD f22, 58 * SIZE(BO)
  2282. LFD f23, 59 * SIZE(BO)
  2283. FMA4 f9, f17, f24, f9
  2284. FMA4 f11, f19, f24, f11
  2285. FMA3 f8, f17, f25, f8
  2286. FMA3 f10, f19, f25, f10
  2287. FMA4 f13, f17, f26, f13
  2288. FMA4 f15, f19, f26, f15
  2289. FMA3 f12, f17, f27, f12
  2290. FMA3 f14, f19, f27, f14
  2291. LFD f24, 60 * SIZE(BO)
  2292. LFD f25, 61 * SIZE(BO)
  2293. LFD f26, 62 * SIZE(BO)
  2294. LFD f27, 63 * SIZE(BO)
  /* Step 8 of 8: A in f28..f31; loads f16..f19 from AO[32..35] and B from
     BO[64..71], which are offsets 0..3 and 0..7 after the pointer bumps below. */
  2295. FMA1 f0, f28, f20, f0
  2296. FMA1 f2, f30, f20, f2
  2297. FMA2 f1, f28, f21, f1
  2298. FMA2 f3, f30, f21, f3
  2299. LFD f16, 32 * SIZE(AO)
  2300. LFD f17, 33 * SIZE(AO)
  2301. LFD f18, 34 * SIZE(AO)
  2302. LFD f19, 35 * SIZE(AO)
  2303. FMA1 f4, f28, f22, f4
  2304. FMA1 f6, f30, f22, f6
  2305. FMA2 f5, f28, f23, f5
  2306. FMA2 f7, f30, f23, f7
  2307. FMA1 f8, f28, f24, f8
  2308. FMA1 f10, f30, f24, f10
  2309. FMA2 f9, f28, f25, f9
  2310. FMA2 f11, f30, f25, f11
  2311. FMA1 f12, f28, f26, f12
  2312. FMA1 f14, f30, f26, f14
  2313. FMA2 f13, f28, f27, f13
  2314. FMA2 f15, f30, f27, f15
  2315. FMA4 f1, f29, f20, f1
  2316. FMA4 f3, f31, f20, f3
  2317. FMA3 f0, f29, f21, f0
  2318. FMA3 f2, f31, f21, f2
  2319. FMA4 f5, f29, f22, f5
  2320. FMA4 f7, f31, f22, f7
  2321. FMA3 f4, f29, f23, f4
  2322. FMA3 f6, f31, f23, f6
  2323. LFD f20, 64 * SIZE(BO)
  2324. LFD f21, 65 * SIZE(BO)
  2325. LFD f22, 66 * SIZE(BO)
  2326. LFD f23, 67 * SIZE(BO)
  2327. FMA4 f9, f29, f24, f9
  2328. FMA4 f11, f31, f24, f11
  2329. FMA3 f8, f29, f25, f8
  2330. FMA3 f10, f31, f25, f10
  2331. FMA4 f13, f29, f26, f13
  2332. FMA4 f15, f31, f26, f15
  2333. FMA3 f12, f29, f27, f12
  2334. FMA3 f14, f31, f27, f14
  2335. LFD f24, 68 * SIZE(BO)
  2336. LFD f25, 69 * SIZE(BO)
  2337. LFD f26, 70 * SIZE(BO)
  2338. LFD f27, 71 * SIZE(BO)
  /* Advance past the eight consumed steps and loop while CTR is non-zero. */
  2339. addi AO, AO, 32 * SIZE
  2340. addi BO, BO, 64 * SIZE
  2341. bdnz LL(12)
  2342. .align 4
  /*
   * LL(15): set CTR to the count modulo 8 (the steps left over after the
   * 8-way unrolled loop).  The count is KK for LT/RN and TEMP otherwise.
   * andi. sets CR0, and mtspr leaves it untouched, so the ble skips to
   * LL(18) when the remainder is zero.
   */
  2343. LL(15):
  2344. #if defined(LT) || defined(RN)
  2345. andi. r0, KK, 7
  2346. #else
  2347. andi. r0, TEMP, 7
  2348. #endif
  2349. mtspr CTR, r0
  2350. ble LL(18)
  2351. .align 4
  /*
   * LL(16): remainder loop, one accumulation step per pass; CTR holds the
   * number of passes (set at LL(15)).
   * Uses A values f16..f19 and B values f20..f27 with accumulators
   * f0..f15, then loads the next four A values and eight B values and
   * advances AO by 4 * SIZE and BO by 8 * SIZE.
   * NOTE(review): FMA1..FMA4 are macros defined outside this chunk;
   * presumably fused multiply-adds whose sign depends on the conjugation
   * variant -- confirm.
   */
  2352. LL(16):
  /* Products with the first value of each A pair (f16, f18). */
  2353. FMA1 f0, f16, f20, f0
  2354. FMA1 f2, f18, f20, f2
  2355. FMA2 f1, f16, f21, f1
  2356. FMA2 f3, f18, f21, f3
  2357. FMA1 f4, f16, f22, f4
  2358. FMA1 f6, f18, f22, f6
  2359. FMA2 f5, f16, f23, f5
  2360. FMA2 f7, f18, f23, f7
  2361. FMA1 f8, f16, f24, f8
  2362. FMA1 f10, f18, f24, f10
  2363. FMA2 f9, f16, f25, f9
  2364. FMA2 f11, f18, f25, f11
  2365. FMA1 f12, f16, f26, f12
  2366. FMA1 f14, f18, f26, f14
  2367. FMA2 f13, f16, f27, f13
  2368. FMA2 f15, f18, f27, f15
  /* Products with the second value of each A pair (f17, f19). */
  2369. FMA4 f1, f17, f20, f1
  2370. FMA4 f3, f19, f20, f3
  2371. FMA3 f0, f17, f21, f0
  2372. FMA3 f2, f19, f21, f2
  2373. FMA4 f5, f17, f22, f5
  2374. FMA4 f7, f19, f22, f7
  2375. FMA3 f4, f17, f23, f4
  2376. FMA3 f6, f19, f23, f6
  2377. FMA4 f9, f17, f24, f9
  2378. FMA4 f11, f19, f24, f11
  2379. FMA3 f8, f17, f25, f8
  2380. FMA3 f10, f19, f25, f10
  2381. FMA4 f13, f17, f26, f13
  2382. FMA4 f15, f19, f26, f15
  2383. FMA3 f12, f17, f27, f12
  2384. FMA3 f14, f19, f27, f14
  /* Load the operands for the next step (offsets are relative to the
     pointers before the bumps below). */
  2385. LFD f16, 4 * SIZE(AO)
  2386. LFD f17, 5 * SIZE(AO)
  2387. LFD f18, 6 * SIZE(AO)
  2388. LFD f19, 7 * SIZE(AO)
  2389. LFD f20, 8 * SIZE(BO)
  2390. LFD f21, 9 * SIZE(BO)
  2391. LFD f22, 10 * SIZE(BO)
  2392. LFD f23, 11 * SIZE(BO)
  2393. LFD f24, 12 * SIZE(BO)
  2394. LFD f25, 13 * SIZE(BO)
  2395. LFD f26, 14 * SIZE(BO)
  2396. LFD f27, 15 * SIZE(BO)
  2397. addi AO, AO, 4 * SIZE
  2398. addi BO, BO, 8 * SIZE
  2399. bdnz LL(16)
  2400. .align 4
  2401. LL(18):
  2402. #if defined(LN) || defined(RT)
  2403. #ifdef LN
  2404. subi r0, KK, 2
  2405. #else
  2406. subi r0, KK, 4
  2407. #endif
  2408. slwi TEMP, r0, 1 + ZBASE_SHIFT
  2409. slwi r0, r0, 2 + ZBASE_SHIFT
  2410. add AO, AORIG, TEMP
  2411. add BO, B, r0
  2412. #endif
  2413. #if defined(LN) || defined(LT)
  2414. LFD f16, 0 * SIZE(BO)
  2415. LFD f17, 1 * SIZE(BO)
  2416. LFD f18, 2 * SIZE(BO)
  2417. LFD f19, 3 * SIZE(BO)
  2418. FSUB f0, f16, f0
  2419. FSUB f1, f17, f1
  2420. FSUB f4, f18, f4
  2421. FSUB f5, f19, f5
  2422. LFD f20, 4 * SIZE(BO)
  2423. LFD f21, 5 * SIZE(BO)
  2424. LFD f22, 6 * SIZE(BO)
  2425. LFD f23, 7 * SIZE(BO)
  2426. FSUB f8, f20, f8
  2427. FSUB f9, f21, f9
  2428. FSUB f12, f22, f12
  2429. FSUB f13, f23, f13
  2430. LFD f24, 8 * SIZE(BO)
  2431. LFD f25, 9 * SIZE(BO)
  2432. LFD f26, 10 * SIZE(BO)
  2433. LFD f27, 11 * SIZE(BO)
  2434. FSUB f2, f24, f2
  2435. FSUB f3, f25, f3
  2436. FSUB f6, f26, f6
  2437. FSUB f7, f27, f7
  2438. LFD f28, 12 * SIZE(BO)
  2439. LFD f29, 13 * SIZE(BO)
  2440. LFD f30, 14 * SIZE(BO)
  2441. LFD f31, 15 * SIZE(BO)
  2442. FSUB f10, f28, f10
  2443. FSUB f11, f29, f11
  2444. FSUB f14, f30, f14
  2445. FSUB f15, f31, f15
  2446. #else
  2447. LFD f16, 0 * SIZE(AO)
  2448. LFD f17, 1 * SIZE(AO)
  2449. LFD f18, 2 * SIZE(AO)
  2450. LFD f19, 3 * SIZE(AO)
  2451. FSUB f0, f16, f0
  2452. FSUB f1, f17, f1
  2453. FSUB f2, f18, f2
  2454. FSUB f3, f19, f3
  2455. LFD f20, 4 * SIZE(AO)
  2456. LFD f21, 5 * SIZE(AO)
  2457. LFD f22, 6 * SIZE(AO)
  2458. LFD f23, 7 * SIZE(AO)
  2459. FSUB f4, f20, f4
  2460. FSUB f5, f21, f5
  2461. FSUB f6, f22, f6
  2462. FSUB f7, f23, f7
  2463. LFD f24, 8 * SIZE(AO)
  2464. LFD f25, 9 * SIZE(AO)
  2465. LFD f26, 10 * SIZE(AO)
  2466. LFD f27, 11 * SIZE(AO)
  2467. FSUB f8, f24, f8
  2468. FSUB f9, f25, f9
  2469. FSUB f10, f26, f10
  2470. FSUB f11, f27, f11
  2471. LFD f28, 12 * SIZE(AO)
  2472. LFD f29, 13 * SIZE(AO)
  2473. LFD f30, 14 * SIZE(AO)
  2474. LFD f31, 15 * SIZE(AO)
  2475. FSUB f12, f28, f12
  2476. FSUB f13, f29, f13
  2477. FSUB f14, f30, f14
  2478. FSUB f15, f31, f15
  2479. #endif
  2480. #ifdef LN
  2481. LFD f24, 6 * SIZE(AO)
  2482. LFD f25, 7 * SIZE(AO)
  2483. LFD f26, 4 * SIZE(AO)
  2484. LFD f27, 5 * SIZE(AO)
  2485. LFD f28, 0 * SIZE(AO)
  2486. LFD f29, 1 * SIZE(AO)
  2487. FMUL f16, f25, f3
  2488. FMUL f17, f25, f2
  2489. FMUL f18, f25, f7
  2490. FMUL f19, f25, f6
  2491. FMUL f20, f25, f11
  2492. FMUL f21, f25, f10
  2493. FMUL f22, f25, f15
  2494. FMUL f23, f25, f14
  2495. #ifndef CONJ
  2496. FMSUB f2, f24, f2, f16
  2497. FMADD f3, f24, f3, f17
  2498. FMSUB f6, f24, f6, f18
  2499. FMADD f7, f24, f7, f19
  2500. FMSUB f10, f24, f10, f20
  2501. FMADD f11, f24, f11, f21
  2502. FMSUB f14, f24, f14, f22
  2503. FMADD f15, f24, f15, f23
  2504. FMADD f0, f27, f3, f0
  2505. FNMSUB f1, f27, f2, f1
  2506. FMADD f4, f27, f7, f4
  2507. FNMSUB f5, f27, f6, f5
  2508. FMADD f8, f27, f11, f8
  2509. FNMSUB f9, f27, f10, f9
  2510. FMADD f12, f27, f15, f12
  2511. FNMSUB f13, f27, f14, f13
  2512. FNMSUB f0, f26, f2, f0
  2513. FNMSUB f1, f26, f3, f1
  2514. FNMSUB f4, f26, f6, f4
  2515. FNMSUB f5, f26, f7, f5
  2516. FNMSUB f8, f26, f10, f8
  2517. FNMSUB f9, f26, f11, f9
  2518. FNMSUB f12, f26, f14, f12
  2519. FNMSUB f13, f26, f15, f13
  2520. FMUL f16, f29, f1
  2521. FMUL f17, f29, f0
  2522. FMUL f18, f29, f5
  2523. FMUL f19, f29, f4
  2524. FMUL f20, f29, f9
  2525. FMUL f21, f29, f8
  2526. FMUL f22, f29, f13
  2527. FMUL f23, f29, f12
  2528. FMSUB f0, f28, f0, f16
  2529. FMADD f1, f28, f1, f17
  2530. FMSUB f4, f28, f4, f18
  2531. FMADD f5, f28, f5, f19
  2532. FMSUB f8, f28, f8, f20
  2533. FMADD f9, f28, f9, f21
  2534. FMSUB f12, f28, f12, f22
  2535. FMADD f13, f28, f13, f23
  2536. #else
  2537. FMADD f2, f24, f2, f16
  2538. FMSUB f3, f24, f3, f17
  2539. FMADD f6, f24, f6, f18
  2540. FMSUB f7, f24, f7, f19
  2541. FMADD f10, f24, f10, f20
  2542. FMSUB f11, f24, f11, f21
  2543. FMADD f14, f24, f14, f22
  2544. FMSUB f15, f24, f15, f23
  2545. FMSUB f0, f27, f3, f0
  2546. FNMADD f1, f27, f2, f1
  2547. FMSUB f4, f27, f7, f4
  2548. FNMADD f5, f27, f6, f5
  2549. FMSUB f8, f27, f11, f8
  2550. FNMADD f9, f27, f10, f9
  2551. FMSUB f12, f27, f15, f12
  2552. FNMADD f13, f27, f14, f13
  2553. FNMADD f0, f26, f2, f0
  2554. FNMADD f1, f26, f3, f1
  2555. FNMADD f4, f26, f6, f4
  2556. FNMADD f5, f26, f7, f5
  2557. FNMADD f8, f26, f10, f8
  2558. FNMADD f9, f26, f11, f9
  2559. FNMADD f12, f26, f14, f12
  2560. FNMADD f13, f26, f15, f13
  2561. FMUL f16, f29, f1
  2562. FMUL f17, f29, f0
  2563. FMUL f18, f29, f5
  2564. FMUL f19, f29, f4
  2565. FMUL f20, f29, f9
  2566. FMUL f21, f29, f8
  2567. FMUL f22, f29, f13
  2568. FMUL f23, f29, f12
  2569. FMADD f0, f28, f0, f16
  2570. FMSUB f1, f28, f1, f17
  2571. FMADD f4, f28, f4, f18
  2572. FMSUB f5, f28, f5, f19
  2573. FMADD f8, f28, f8, f20
  2574. FMSUB f9, f28, f9, f21
  2575. FMADD f12, f28, f12, f22
  2576. FMSUB f13, f28, f13, f23
  2577. #endif
  2578. #endif
  2579. #ifdef LT
  2580. LFD f24, 0 * SIZE(AO)
  2581. LFD f25, 1 * SIZE(AO)
  2582. LFD f26, 2 * SIZE(AO)
  2583. LFD f27, 3 * SIZE(AO)
  2584. LFD f28, 6 * SIZE(AO)
  2585. LFD f29, 7 * SIZE(AO)
  2586. FMUL f16, f25, f1
  2587. FMUL f17, f25, f0
  2588. FMUL f18, f25, f5
  2589. FMUL f19, f25, f4
  2590. FMUL f20, f25, f9
  2591. FMUL f21, f25, f8
  2592. FMUL f22, f25, f13
  2593. FMUL f23, f25, f12
  2594. #ifndef CONJ
  2595. FMSUB f0, f24, f0, f16
  2596. FMADD f1, f24, f1, f17
  2597. FMSUB f4, f24, f4, f18
  2598. FMADD f5, f24, f5, f19
  2599. FMSUB f8, f24, f8, f20
  2600. FMADD f9, f24, f9, f21
  2601. FMSUB f12, f24, f12, f22
  2602. FMADD f13, f24, f13, f23
  2603. FMADD f2, f27, f1, f2
  2604. FNMSUB f3, f27, f0, f3
  2605. FMADD f6, f27, f5, f6
  2606. FNMSUB f7, f27, f4, f7
  2607. FMADD f10, f27, f9, f10
  2608. FNMSUB f11, f27, f8, f11
  2609. FMADD f14, f27, f13, f14
  2610. FNMSUB f15, f27, f12, f15
  2611. FNMSUB f2, f26, f0, f2
  2612. FNMSUB f3, f26, f1, f3
  2613. FNMSUB f6, f26, f4, f6
  2614. FNMSUB f7, f26, f5, f7
  2615. FNMSUB f10, f26, f8, f10
  2616. FNMSUB f11, f26, f9, f11
  2617. FNMSUB f14, f26, f12, f14
  2618. FNMSUB f15, f26, f13, f15
  2619. FMUL f16, f29, f3
  2620. FMUL f17, f29, f2
  2621. FMUL f18, f29, f7
  2622. FMUL f19, f29, f6
  2623. FMUL f20, f29, f11
  2624. FMUL f21, f29, f10
  2625. FMUL f22, f29, f15
  2626. FMUL f23, f29, f14
  2627. FMSUB f2, f28, f2, f16
  2628. FMADD f3, f28, f3, f17
  2629. FMSUB f6, f28, f6, f18
  2630. FMADD f7, f28, f7, f19
  2631. FMSUB f10, f28, f10, f20
  2632. FMADD f11, f28, f11, f21
  2633. FMSUB f14, f28, f14, f22
  2634. FMADD f15, f28, f15, f23
  2635. #else
  2636. FMADD f0, f24, f0, f16
  2637. FMSUB f1, f24, f1, f17
  2638. FMADD f4, f24, f4, f18
  2639. FMSUB f5, f24, f5, f19
  2640. FMADD f8, f24, f8, f20
  2641. FMSUB f9, f24, f9, f21
  2642. FMADD f12, f24, f12, f22
  2643. FMSUB f13, f24, f13, f23
  2644. FMSUB f2, f27, f1, f2
  2645. FNMADD f3, f27, f0, f3
  2646. FMSUB f6, f27, f5, f6
  2647. FNMADD f7, f27, f4, f7
  2648. FMSUB f10, f27, f9, f10
  2649. FNMADD f11, f27, f8, f11
  2650. FMSUB f14, f27, f13, f14
  2651. FNMADD f15, f27, f12, f15
  2652. FNMADD f2, f26, f0, f2
  2653. FNMADD f3, f26, f1, f3
  2654. FNMADD f6, f26, f4, f6
  2655. FNMADD f7, f26, f5, f7
  2656. FNMADD f10, f26, f8, f10
  2657. FNMADD f11, f26, f9, f11
  2658. FNMADD f14, f26, f12, f14
  2659. FNMADD f15, f26, f13, f15
  2660. FMUL f16, f29, f3
  2661. FMUL f17, f29, f2
  2662. FMUL f18, f29, f7
  2663. FMUL f19, f29, f6
  2664. FMUL f20, f29, f11
  2665. FMUL f21, f29, f10
  2666. FMUL f22, f29, f15
  2667. FMUL f23, f29, f14
  2668. FMADD f2, f28, f2, f16
  2669. FMSUB f3, f28, f3, f17
  2670. FMADD f6, f28, f6, f18
  2671. FMSUB f7, f28, f7, f19
  2672. FMADD f10, f28, f10, f20
  2673. FMSUB f11, f28, f11, f21
  2674. FMADD f14, f28, f14, f22
  2675. FMSUB f15, f28, f15, f23
  2676. #endif
  2677. #endif
  2678. #ifdef RN
  2679. LFD f24, 0 * SIZE(BO)
  2680. LFD f25, 1 * SIZE(BO)
  2681. LFD f26, 2 * SIZE(BO)
  2682. LFD f27, 3 * SIZE(BO)
  2683. LFD f28, 4 * SIZE(BO)
  2684. LFD f29, 5 * SIZE(BO)
  2685. LFD f30, 6 * SIZE(BO)
  2686. LFD f31, 7 * SIZE(BO)
  2687. FMUL f16, f25, f1
  2688. FMUL f17, f25, f0
  2689. FMUL f18, f25, f3
  2690. FMUL f19, f25, f2
  2691. #ifndef CONJ
  2692. FMSUB f0, f24, f0, f16
  2693. FMADD f1, f24, f1, f17
  2694. FMSUB f2, f24, f2, f18
  2695. FMADD f3, f24, f3, f19
  2696. FMADD f4, f27, f1, f4
  2697. FNMSUB f5, f27, f0, f5
  2698. FMADD f6, f27, f3, f6
  2699. FNMSUB f7, f27, f2, f7
  2700. FNMSUB f4, f26, f0, f4
  2701. FNMSUB f5, f26, f1, f5
  2702. FNMSUB f6, f26, f2, f6
  2703. FNMSUB f7, f26, f3, f7
  2704. FMADD f8, f29, f1, f8
  2705. FNMSUB f9, f29, f0, f9
  2706. FMADD f10, f29, f3, f10
  2707. FNMSUB f11, f29, f2, f11
  2708. FNMSUB f8, f28, f0, f8
  2709. FNMSUB f9, f28, f1, f9
  2710. FNMSUB f10, f28, f2, f10
  2711. FNMSUB f11, f28, f3, f11
  2712. FMADD f12, f31, f1, f12
  2713. FNMSUB f13, f31, f0, f13
  2714. FMADD f14, f31, f3, f14
  2715. FNMSUB f15, f31, f2, f15
  2716. FNMSUB f12, f30, f0, f12
  2717. FNMSUB f13, f30, f1, f13
  2718. FNMSUB f14, f30, f2, f14
  2719. FNMSUB f15, f30, f3, f15
  2720. LFD f26, 10 * SIZE(BO)
  2721. LFD f27, 11 * SIZE(BO)
  2722. LFD f28, 12 * SIZE(BO)
  2723. LFD f29, 13 * SIZE(BO)
  2724. LFD f30, 14 * SIZE(BO)
  2725. LFD f31, 15 * SIZE(BO)
  2726. FMUL f16, f27, f5
  2727. FMUL f17, f27, f4
  2728. FMUL f18, f27, f7
  2729. FMUL f19, f27, f6
  2730. FMSUB f4, f26, f4, f16
  2731. FMADD f5, f26, f5, f17
  2732. FMSUB f6, f26, f6, f18
  2733. FMADD f7, f26, f7, f19
  2734. FMADD f8, f29, f5, f8
  2735. FNMSUB f9, f29, f4, f9
  2736. FMADD f10, f29, f7, f10
  2737. FNMSUB f11, f29, f6, f11
  2738. FNMSUB f8, f28, f4, f8
  2739. FNMSUB f9, f28, f5, f9
  2740. FNMSUB f10, f28, f6, f10
  2741. FNMSUB f11, f28, f7, f11
  2742. FMADD f12, f31, f5, f12
  2743. FNMSUB f13, f31, f4, f13
  2744. FMADD f14, f31, f7, f14
  2745. FNMSUB f15, f31, f6, f15
  2746. FNMSUB f12, f30, f4, f12
  2747. FNMSUB f13, f30, f5, f13
  2748. FNMSUB f14, f30, f6, f14
  2749. FNMSUB f15, f30, f7, f15
  2750. LFD f26, 20 * SIZE(BO)
  2751. LFD f27, 21 * SIZE(BO)
  2752. LFD f28, 22 * SIZE(BO)
  2753. LFD f29, 23 * SIZE(BO)
  2754. LFD f30, 30 * SIZE(BO)
  2755. LFD f31, 31 * SIZE(BO)
  2756. FMUL f16, f27, f9
  2757. FMUL f17, f27, f8
  2758. FMUL f18, f27, f11
  2759. FMUL f19, f27, f10
  2760. FMSUB f8, f26, f8, f16
  2761. FMADD f9, f26, f9, f17
  2762. FMSUB f10, f26, f10, f18
  2763. FMADD f11, f26, f11, f19
  2764. FMADD f12, f29, f9, f12
  2765. FNMSUB f13, f29, f8, f13
  2766. FMADD f14, f29, f11, f14
  2767. FNMSUB f15, f29, f10, f15
  2768. FNMSUB f12, f28, f8, f12
  2769. FNMSUB f13, f28, f9, f13
  2770. FNMSUB f14, f28, f10, f14
  2771. FNMSUB f15, f28, f11, f15
  2772. FMUL f16, f31, f13
  2773. FMUL f17, f31, f12
  2774. FMUL f18, f31, f15
  2775. FMUL f19, f31, f14
  2776. FMSUB f12, f30, f12, f16
  2777. FMADD f13, f30, f13, f17
  2778. FMSUB f14, f30, f14, f18
  2779. FMADD f15, f30, f15, f19
  2780. #else
  2781. FMADD f0, f24, f0, f16
  2782. FMSUB f1, f24, f1, f17
  2783. FMADD f2, f24, f2, f18
  2784. FMSUB f3, f24, f3, f19
  2785. FMSUB f4, f27, f1, f4
  2786. FNMADD f5, f27, f0, f5
  2787. FMSUB f6, f27, f3, f6
  2788. FNMADD f7, f27, f2, f7
  2789. FNMADD f4, f26, f0, f4
  2790. FNMADD f5, f26, f1, f5
  2791. FNMADD f6, f26, f2, f6
  2792. FNMADD f7, f26, f3, f7
  2793. FMSUB f8, f29, f1, f8
  2794. FNMADD f9, f29, f0, f9
  2795. FMSUB f10, f29, f3, f10
  2796. FNMADD f11, f29, f2, f11
  2797. FNMADD f8, f28, f0, f8
  2798. FNMADD f9, f28, f1, f9
  2799. FNMADD f10, f28, f2, f10
  2800. FNMADD f11, f28, f3, f11
  2801. FMSUB f12, f31, f1, f12
  2802. FNMADD f13, f31, f0, f13
  2803. FMSUB f14, f31, f3, f14
  2804. FNMADD f15, f31, f2, f15
  2805. FNMADD f12, f30, f0, f12
  2806. FNMADD f13, f30, f1, f13
  2807. FNMADD f14, f30, f2, f14
  2808. FNMADD f15, f30, f3, f15
  2809. LFD f26, 10 * SIZE(BO)
  2810. LFD f27, 11 * SIZE(BO)
  2811. LFD f28, 12 * SIZE(BO)
  2812. LFD f29, 13 * SIZE(BO)
  2813. LFD f30, 14 * SIZE(BO)
  2814. LFD f31, 15 * SIZE(BO)
  2815. FMUL f16, f27, f5
  2816. FMUL f17, f27, f4
  2817. FMUL f18, f27, f7
  2818. FMUL f19, f27, f6
  2819. FMADD f4, f26, f4, f16
  2820. FMSUB f5, f26, f5, f17
  2821. FMADD f6, f26, f6, f18
  2822. FMSUB f7, f26, f7, f19
  2823. FMSUB f8, f29, f5, f8
  2824. FNMADD f9, f29, f4, f9
  2825. FMSUB f10, f29, f7, f10
  2826. FNMADD f11, f29, f6, f11
  2827. FNMADD f8, f28, f4, f8
  2828. FNMADD f9, f28, f5, f9
  2829. FNMADD f10, f28, f6, f10
  2830. FNMADD f11, f28, f7, f11
  2831. FMSUB f12, f31, f5, f12
  2832. FNMADD f13, f31, f4, f13
  2833. FMSUB f14, f31, f7, f14
  2834. FNMADD f15, f31, f6, f15
  2835. FNMADD f12, f30, f4, f12
  2836. FNMADD f13, f30, f5, f13
  2837. FNMADD f14, f30, f6, f14
  2838. FNMADD f15, f30, f7, f15
  2839. LFD f26, 20 * SIZE(BO)
  2840. LFD f27, 21 * SIZE(BO)
  2841. LFD f28, 22 * SIZE(BO)
  2842. LFD f29, 23 * SIZE(BO)
  2843. LFD f30, 30 * SIZE(BO)
  2844. LFD f31, 31 * SIZE(BO)
  2845. FMUL f16, f27, f9
  2846. FMUL f17, f27, f8
  2847. FMUL f18, f27, f11
  2848. FMUL f19, f27, f10
  2849. FMADD f8, f26, f8, f16
  2850. FMSUB f9, f26, f9, f17
  2851. FMADD f10, f26, f10, f18
  2852. FMSUB f11, f26, f11, f19
  2853. FMSUB f12, f29, f9, f12
  2854. FNMADD f13, f29, f8, f13
  2855. FMSUB f14, f29, f11, f14
  2856. FNMADD f15, f29, f10, f15
  2857. FNMADD f12, f28, f8, f12
  2858. FNMADD f13, f28, f9, f13
  2859. FNMADD f14, f28, f10, f14
  2860. FNMADD f15, f28, f11, f15
  2861. FMUL f16, f31, f13
  2862. FMUL f17, f31, f12
  2863. FMUL f18, f31, f15
  2864. FMUL f19, f31, f14
  2865. FMADD f12, f30, f12, f16
  2866. FMSUB f13, f30, f13, f17
  2867. FMADD f14, f30, f14, f18
  2868. FMSUB f15, f30, f15, f19
  2869. #endif
  2870. #endif
  2871. #ifdef RT
  2872. LFD f24, 30 * SIZE(BO)
  2873. LFD f25, 31 * SIZE(BO)
  2874. LFD f26, 28 * SIZE(BO)
  2875. LFD f27, 29 * SIZE(BO)
  2876. LFD f28, 26 * SIZE(BO)
  2877. LFD f29, 27 * SIZE(BO)
  2878. LFD f30, 24 * SIZE(BO)
  2879. LFD f31, 25 * SIZE(BO)
  2880. FMUL f16, f25, f13
  2881. FMUL f17, f25, f12
  2882. FMUL f18, f25, f15
  2883. FMUL f19, f25, f14
  2884. #ifndef CONJ
  2885. FMSUB f12, f24, f12, f16
  2886. FMADD f13, f24, f13, f17
  2887. FMSUB f14, f24, f14, f18
  2888. FMADD f15, f24, f15, f19
  2889. FMADD f8, f27, f13, f8
  2890. FNMSUB f9, f27, f12, f9
  2891. FMADD f10, f27, f15, f10
  2892. FNMSUB f11, f27, f14, f11
  2893. FNMSUB f8, f26, f12, f8
  2894. FNMSUB f9, f26, f13, f9
  2895. FNMSUB f10, f26, f14, f10
  2896. FNMSUB f11, f26, f15, f11
  2897. FMADD f4, f29, f13, f4
  2898. FNMSUB f5, f29, f12, f5
  2899. FMADD f6, f29, f15, f6
  2900. FNMSUB f7, f29, f14, f7
  2901. FNMSUB f4, f28, f12, f4
  2902. FNMSUB f5, f28, f13, f5
  2903. FNMSUB f6, f28, f14, f6
  2904. FNMSUB f7, f28, f15, f7
  2905. FMADD f0, f31, f13, f0
  2906. FNMSUB f1, f31, f12, f1
  2907. FMADD f2, f31, f15, f2
  2908. FNMSUB f3, f31, f14, f3
  2909. FNMSUB f0, f30, f12, f0
  2910. FNMSUB f1, f30, f13, f1
  2911. FNMSUB f2, f30, f14, f2
  2912. FNMSUB f3, f30, f15, f3
  2913. LFD f26, 20 * SIZE(BO)
  2914. LFD f27, 21 * SIZE(BO)
  2915. LFD f28, 18 * SIZE(BO)
  2916. LFD f29, 19 * SIZE(BO)
  2917. LFD f30, 16 * SIZE(BO)
  2918. LFD f31, 17 * SIZE(BO)
  2919. FMUL f16, f27, f9
  2920. FMUL f17, f27, f8
  2921. FMUL f18, f27, f11
  2922. FMUL f19, f27, f10
  2923. FMSUB f8, f26, f8, f16
  2924. FMADD f9, f26, f9, f17
  2925. FMSUB f10, f26, f10, f18
  2926. FMADD f11, f26, f11, f19
  2927. FMADD f4, f29, f9, f4
  2928. FNMSUB f5, f29, f8, f5
  2929. FMADD f6, f29, f11, f6
  2930. FNMSUB f7, f29, f10, f7
  2931. FNMSUB f4, f28, f8, f4
  2932. FNMSUB f5, f28, f9, f5
  2933. FNMSUB f6, f28, f10, f6
  2934. FNMSUB f7, f28, f11, f7
  2935. FMADD f0, f31, f9, f0
  2936. FNMSUB f1, f31, f8, f1
  2937. FMADD f2, f31, f11, f2
  2938. FNMSUB f3, f31, f10, f3
  2939. FNMSUB f0, f30, f8, f0
  2940. FNMSUB f1, f30, f9, f1
  2941. FNMSUB f2, f30, f10, f2
  2942. FNMSUB f3, f30, f11, f3
  2943. LFD f26, 10 * SIZE(BO)
  2944. LFD f27, 11 * SIZE(BO)
  2945. LFD f28, 8 * SIZE(BO)
  2946. LFD f29, 9 * SIZE(BO)
  2947. LFD f30, 0 * SIZE(BO)
  2948. LFD f31, 1 * SIZE(BO)
  2949. FMUL f16, f27, f5
  2950. FMUL f17, f27, f4
  2951. FMUL f18, f27, f7
  2952. FMUL f19, f27, f6
  2953. FMSUB f4, f26, f4, f16
  2954. FMADD f5, f26, f5, f17
  2955. FMSUB f6, f26, f6, f18
  2956. FMADD f7, f26, f7, f19
  2957. FMADD f0, f29, f5, f0
  2958. FNMSUB f1, f29, f4, f1
  2959. FMADD f2, f29, f7, f2
  2960. FNMSUB f3, f29, f6, f3
  2961. FNMSUB f0, f28, f4, f0
  2962. FNMSUB f1, f28, f5, f1
  2963. FNMSUB f2, f28, f6, f2
  2964. FNMSUB f3, f28, f7, f3
  2965. FMUL f16, f31, f1
  2966. FMUL f17, f31, f0
  2967. FMUL f18, f31, f3
  2968. FMUL f19, f31, f2
  2969. FMSUB f0, f30, f0, f16
  2970. FMADD f1, f30, f1, f17
  2971. FMSUB f2, f30, f2, f18
  2972. FMADD f3, f30, f3, f19
  2973. #else
  2974. FMADD f12, f24, f12, f16
  2975. FMSUB f13, f24, f13, f17
  2976. FMADD f14, f24, f14, f18
  2977. FMSUB f15, f24, f15, f19
  2978. FMSUB f8, f27, f13, f8
  2979. FNMADD f9, f27, f12, f9
  2980. FMSUB f10, f27, f15, f10
  2981. FNMADD f11, f27, f14, f11
  2982. FNMADD f8, f26, f12, f8
  2983. FNMADD f9, f26, f13, f9
  2984. FNMADD f10, f26, f14, f10
  2985. FNMADD f11, f26, f15, f11
  2986. FMSUB f4, f29, f13, f4
  2987. FNMADD f5, f29, f12, f5
  2988. FMSUB f6, f29, f15, f6
  2989. FNMADD f7, f29, f14, f7
  2990. FNMADD f4, f28, f12, f4
  2991. FNMADD f5, f28, f13, f5
  2992. FNMADD f6, f28, f14, f6
  2993. FNMADD f7, f28, f15, f7
  2994. FMSUB f0, f31, f13, f0
  2995. FNMADD f1, f31, f12, f1
  2996. FMSUB f2, f31, f15, f2
  2997. FNMADD f3, f31, f14, f3
  2998. FNMADD f0, f30, f12, f0
  2999. FNMADD f1, f30, f13, f1
  3000. FNMADD f2, f30, f14, f2
  3001. FNMADD f3, f30, f15, f3
  3002. LFD f26, 20 * SIZE(BO)
  3003. LFD f27, 21 * SIZE(BO)
  3004. LFD f28, 18 * SIZE(BO)
  3005. LFD f29, 19 * SIZE(BO)
  3006. LFD f30, 16 * SIZE(BO)
  3007. LFD f31, 17 * SIZE(BO)
  3008. FMUL f16, f27, f9
  3009. FMUL f17, f27, f8
  3010. FMUL f18, f27, f11
  3011. FMUL f19, f27, f10
  3012. FMADD f8, f26, f8, f16
  3013. FMSUB f9, f26, f9, f17
  3014. FMADD f10, f26, f10, f18
  3015. FMSUB f11, f26, f11, f19
  3016. FMSUB f4, f29, f9, f4
  3017. FNMADD f5, f29, f8, f5
  3018. FMSUB f6, f29, f11, f6
  3019. FNMADD f7, f29, f10, f7
  3020. FNMADD f4, f28, f8, f4
  3021. FNMADD f5, f28, f9, f5
  3022. FNMADD f6, f28, f10, f6
  3023. FNMADD f7, f28, f11, f7
  3024. FMSUB f0, f31, f9, f0
  3025. FNMADD f1, f31, f8, f1
  3026. FMSUB f2, f31, f11, f2
  3027. FNMADD f3, f31, f10, f3
  3028. FNMADD f0, f30, f8, f0
  3029. FNMADD f1, f30, f9, f1
  3030. FNMADD f2, f30, f10, f2
  3031. FNMADD f3, f30, f11, f3
  3032. LFD f26, 10 * SIZE(BO)
  3033. LFD f27, 11 * SIZE(BO)
  3034. LFD f28, 8 * SIZE(BO)
  3035. LFD f29, 9 * SIZE(BO)
  3036. LFD f30, 0 * SIZE(BO)
  3037. LFD f31, 1 * SIZE(BO)
  3038. FMUL f16, f27, f5
  3039. FMUL f17, f27, f4
  3040. FMUL f18, f27, f7
  3041. FMUL f19, f27, f6
  3042. FMADD f4, f26, f4, f16
  3043. FMSUB f5, f26, f5, f17
  3044. FMADD f6, f26, f6, f18
  3045. FMSUB f7, f26, f7, f19
  3046. FMSUB f0, f29, f5, f0
  3047. FNMADD f1, f29, f4, f1
  3048. FMSUB f2, f29, f7, f2
  3049. FNMADD f3, f29, f6, f3
  3050. FNMADD f0, f28, f4, f0
  3051. FNMADD f1, f28, f5, f1
  3052. FNMADD f2, f28, f6, f2
  3053. FNMADD f3, f28, f7, f3
  3054. FMUL f16, f31, f1
  3055. FMUL f17, f31, f0
  3056. FMUL f18, f31, f3
  3057. FMUL f19, f31, f2
  3058. FMADD f0, f30, f0, f16
  3059. FMSUB f1, f30, f1, f17
  3060. FMADD f2, f30, f2, f18
  3061. FMSUB f3, f30, f3, f19
  3062. #endif
  3063. #endif
  3064. #ifdef LN
  3065. subi CO1, CO1, 4 * SIZE
  3066. subi CO2, CO2, 4 * SIZE
  3067. subi CO3, CO3, 4 * SIZE
  3068. subi CO4, CO4, 4 * SIZE
  3069. #endif
  3070. #if defined(LN) || defined(LT)
  3071. STFD f0, 0 * SIZE(BO)
  3072. STFD f1, 1 * SIZE(BO)
  3073. STFD f4, 2 * SIZE(BO)
  3074. STFD f5, 3 * SIZE(BO)
  3075. STFD f8, 4 * SIZE(BO)
  3076. STFD f9, 5 * SIZE(BO)
  3077. STFD f12, 6 * SIZE(BO)
  3078. STFD f13, 7 * SIZE(BO)
  3079. STFD f2, 8 * SIZE(BO)
  3080. STFD f3, 9 * SIZE(BO)
  3081. STFD f6, 10 * SIZE(BO)
  3082. STFD f7, 11 * SIZE(BO)
  3083. STFD f10, 12 * SIZE(BO)
  3084. STFD f11, 13 * SIZE(BO)
  3085. STFD f14, 14 * SIZE(BO)
  3086. STFD f15, 15 * SIZE(BO)
  3087. #else
  3088. STFD f0, 0 * SIZE(AO)
  3089. STFD f1, 1 * SIZE(AO)
  3090. STFD f2, 2 * SIZE(AO)
  3091. STFD f3, 3 * SIZE(AO)
  3092. STFD f4, 4 * SIZE(AO)
  3093. STFD f5, 5 * SIZE(AO)
  3094. STFD f6, 6 * SIZE(AO)
  3095. STFD f7, 7 * SIZE(AO)
  3096. STFD f8, 8 * SIZE(AO)
  3097. STFD f9, 9 * SIZE(AO)
  3098. STFD f10, 10 * SIZE(AO)
  3099. STFD f11, 11 * SIZE(AO)
  3100. STFD f12, 12 * SIZE(AO)
  3101. STFD f13, 13 * SIZE(AO)
  3102. STFD f14, 14 * SIZE(AO)
  3103. STFD f15, 15 * SIZE(AO)
  3104. #endif
  3105. STFD f0, 0 * SIZE(CO1)
  3106. STFD f1, 1 * SIZE(CO1)
  3107. STFD f2, 2 * SIZE(CO1)
  3108. STFD f3, 3 * SIZE(CO1)
  3109. lfs f0, FZERO
  3110. fmr f1, f0
  3111. fmr f2, f0
  3112. fmr f3, f0
  3113. STFD f4, 0 * SIZE(CO2)
  3114. STFD f5, 1 * SIZE(CO2)
  3115. STFD f6, 2 * SIZE(CO2)
  3116. STFD f7, 3 * SIZE(CO2)
  3117. fmr f4, f0
  3118. fmr f5, f0
  3119. fmr f6, f0
  3120. fmr f7, f0
  3121. STFD f8, 0 * SIZE(CO3)
  3122. STFD f9, 1 * SIZE(CO3)
  3123. STFD f10, 2 * SIZE(CO3)
  3124. STFD f11, 3 * SIZE(CO3)
  3125. fmr f8, f0
  3126. fmr f9, f0
  3127. fmr f10, f0
  3128. fmr f11, f0
  3129. STFD f12, 0 * SIZE(CO4)
  3130. STFD f13, 1 * SIZE(CO4)
  3131. STFD f14, 2 * SIZE(CO4)
  3132. STFD f15, 3 * SIZE(CO4)
  3133. fmr f12, f0
  3134. fmr f13, f0
  3135. fmr f14, f0
  3136. fmr f15, f0
  3137. #ifndef LN
  3138. addi CO1, CO1, 4 * SIZE
  3139. addi CO2, CO2, 4 * SIZE
  3140. addi CO3, CO3, 4 * SIZE
  3141. addi CO4, CO4, 4 * SIZE
  3142. #endif
  3143. #ifdef RT
  3144. slwi r0, K, 1 + ZBASE_SHIFT
  3145. add AORIG, AORIG, r0
  3146. #endif
  3147. #if defined(LT) || defined(RN)
  3148. sub TEMP, K, KK
  3149. slwi r0, TEMP, 1 + ZBASE_SHIFT
  3150. slwi TEMP, TEMP, 2 + ZBASE_SHIFT
  3151. add AO, AO, r0
  3152. add BO, BO, TEMP
  3153. #endif
  3154. #ifdef LT
  3155. addi KK, KK, 2
  3156. #endif
  3157. #ifdef LN
  3158. subi KK, KK, 2
  3159. #endif
  3160. addic. I, I, -1
  3161. bgt LL(11)
  3162. .align 4
  /* LL(20): entry of the "M & 1" tail for the current group of four columns. */
  /* I = M & 1; when that is zero the whole tail is skipped (branch to LL(29)). */
  /* Otherwise AO/BO are positioned, f16-f19 are preloaded from AO and f20-f27 */
  /* from BO, and CTR receives the trip count of the unrolled loop at LL(22). */
  3163. LL(20):
  3164. andi. I, M, 1
  3165. ble LL(29)
  3166. #if defined(LT) || defined(RN)
  /* LT/RN: AO is used as it stands; the B preload reads through B itself and */
  /* BO is then reset to B. Trip count = KK >> 2 (arithmetic shift, sets CR0). */
  3167. LFD f16, 0 * SIZE(AO)
  3168. LFD f17, 1 * SIZE(AO)
  3169. LFD f18, 2 * SIZE(AO)
  3170. LFD f19, 3 * SIZE(AO)
  3171. LFD f20, 0 * SIZE(B)
  3172. LFD f21, 1 * SIZE(B)
  3173. LFD f22, 2 * SIZE(B)
  3174. LFD f23, 3 * SIZE(B)
  3175. LFD f24, 4 * SIZE(B)
  3176. LFD f25, 5 * SIZE(B)
  3177. LFD f26, 6 * SIZE(B)
  3178. LFD f27, 7 * SIZE(B)
  3179. srawi. r0, KK, 2
  3180. mr BO, B
  3181. mtspr CTR, r0
  3182. #else
  /* Other variants: (LN only) AORIG -= K << ZBASE_SHIFT, then */
  /* AO = AORIG + (KK << ZBASE_SHIFT), BO = B + (KK << (2 + ZBASE_SHIFT)), */
  /* TEMP = K - KK, and the trip count is TEMP >> 2. */
  3183. #ifdef LN
  3184. slwi r0, K, 0 + ZBASE_SHIFT
  3185. sub AORIG, AORIG, r0
  3186. #endif
  3187. slwi r0, KK, 0 + ZBASE_SHIFT
  3188. slwi TEMP, KK, 2 + ZBASE_SHIFT
  3189. add AO, AORIG, r0
  3190. add BO, B, TEMP
  3191. sub TEMP, K, KK
  3192. LFD f16, 0 * SIZE(AO)
  3193. LFD f17, 1 * SIZE(AO)
  3194. LFD f18, 2 * SIZE(AO)
  3195. LFD f19, 3 * SIZE(AO)
  3196. LFD f20, 0 * SIZE(BO)
  3197. LFD f21, 1 * SIZE(BO)
  3198. LFD f22, 2 * SIZE(BO)
  3199. LFD f23, 3 * SIZE(BO)
  3200. LFD f24, 4 * SIZE(BO)
  3201. LFD f25, 5 * SIZE(BO)
  3202. LFD f26, 6 * SIZE(BO)
  3203. LFD f27, 7 * SIZE(BO)
  3204. srawi. r0, TEMP, 2
  3205. mtspr CTR, r0
  3206. #endif
  /* CR0 still reflects the srawi. result: skip the unrolled loop when the count is <= 0. */
  3207. ble LL(25)
  3208. .align 4
  /* LL(22): unrolled accumulation loop, executed CTR times. */
  /* Each pass uses four (A-pair) operands in turn -- (f16,f17), (f18,f19), */
  /* (f28,f29), (f30,f31) -- against eight B values in f20-f27, accumulating */
  /* into f0-f15, and advances AO by 8 * SIZE and BO by 32 * SIZE. */
  /* Loads for later steps are interleaved with the arithmetic of earlier ones. */
  /* NOTE(review): FMA1..FMA4 are macros defined outside this chunk; they */
  /* presumably select fused multiply-add/subtract forms per build variant -- confirm. */
  3209. LL(22):
  /* Step 1: A pair (f16, f17). */
  3210. FMA1 f0, f16, f20, f0
  3211. FMA4 f3, f17, f20, f3
  3212. FMA2 f1, f16, f21, f1
  3213. FMA3 f2, f17, f21, f2
  /* Fetch the A pairs for steps 3 and 4. */
  3214. LFD f28, 4 * SIZE(AO)
  3215. LFD f29, 5 * SIZE(AO)
  3216. LFD f30, 6 * SIZE(AO)
  3217. LFD f31, 7 * SIZE(AO)
  3218. FMA1 f4, f16, f22, f4
  3219. FMA4 f7, f17, f22, f7
  3220. FMA2 f5, f16, f23, f5
  3221. FMA3 f6, f17, f23, f6
  3222. LFD f20, 8 * SIZE(BO)
  3223. LFD f21, 9 * SIZE(BO)
  3224. LFD f22, 10 * SIZE(BO)
  3225. LFD f23, 11 * SIZE(BO)
  3226. FMA1 f8, f16, f24, f8
  3227. FMA4 f11, f17, f24, f11
  3228. FMA2 f9, f16, f25, f9
  3229. FMA3 f10, f17, f25, f10
  3230. FMA1 f12, f16, f26, f12
  3231. FMA4 f15, f17, f26, f15
  3232. FMA2 f13, f16, f27, f13
  3233. FMA3 f14, f17, f27, f14
  3234. LFD f24, 12 * SIZE(BO)
  3235. LFD f25, 13 * SIZE(BO)
  3236. LFD f26, 14 * SIZE(BO)
  3237. LFD f27, 15 * SIZE(BO)
  /* Step 2: A pair (f18, f19) with B values from BO + 8..15. */
  3238. FMA1 f0, f18, f20, f0
  3239. FMA4 f3, f19, f20, f3
  3240. FMA2 f1, f18, f21, f1
  3241. FMA3 f2, f19, f21, f2
  3242. FMA1 f4, f18, f22, f4
  3243. FMA4 f7, f19, f22, f7
  3244. FMA2 f5, f18, f23, f5
  3245. FMA3 f6, f19, f23, f6
  3246. LFD f20, 16 * SIZE(BO)
  3247. LFD f21, 17 * SIZE(BO)
  3248. LFD f22, 18 * SIZE(BO)
  3249. LFD f23, 19 * SIZE(BO)
  3250. FMA1 f8, f18, f24, f8
  3251. FMA4 f11, f19, f24, f11
  3252. FMA2 f9, f18, f25, f9
  3253. FMA3 f10, f19, f25, f10
  3254. FMA1 f12, f18, f26, f12
  3255. FMA4 f15, f19, f26, f15
  3256. FMA2 f13, f18, f27, f13
  3257. FMA3 f14, f19, f27, f14
  3258. LFD f24, 20 * SIZE(BO)
  3259. LFD f25, 21 * SIZE(BO)
  3260. LFD f26, 22 * SIZE(BO)
  3261. LFD f27, 23 * SIZE(BO)
  /* Step 3: A pair (f28, f29) with B values from BO + 16..23. */
  3262. FMA1 f0, f28, f20, f0
  3263. FMA4 f3, f29, f20, f3
  3264. FMA2 f1, f28, f21, f1
  3265. FMA3 f2, f29, f21, f2
  /* Preload f16-f19 for the next pass (offsets are relative to the not-yet-advanced AO). */
  3266. LFD f16, 8 * SIZE(AO)
  3267. LFD f17, 9 * SIZE(AO)
  3268. LFD f18, 10 * SIZE(AO)
  3269. LFD f19, 11 * SIZE(AO)
  3270. FMA1 f4, f28, f22, f4
  3271. FMA4 f7, f29, f22, f7
  3272. FMA2 f5, f28, f23, f5
  3273. FMA3 f6, f29, f23, f6
  3274. LFD f20, 24 * SIZE(BO)
  3275. LFD f21, 25 * SIZE(BO)
  3276. LFD f22, 26 * SIZE(BO)
  3277. LFD f23, 27 * SIZE(BO)
  3278. FMA1 f8, f28, f24, f8
  3279. FMA4 f11, f29, f24, f11
  3280. FMA2 f9, f28, f25, f9
  3281. FMA3 f10, f29, f25, f10
  3282. FMA1 f12, f28, f26, f12
  3283. FMA4 f15, f29, f26, f15
  3284. FMA2 f13, f28, f27, f13
  3285. FMA3 f14, f29, f27, f14
  3286. LFD f24, 28 * SIZE(BO)
  3287. LFD f25, 29 * SIZE(BO)
  3288. LFD f26, 30 * SIZE(BO)
  3289. LFD f27, 31 * SIZE(BO)
  /* Step 4: A pair (f30, f31) with B values from BO + 24..31. */
  3290. FMA1 f0, f30, f20, f0
  3291. FMA4 f3, f31, f20, f3
  3292. FMA2 f1, f30, f21, f1
  3293. FMA3 f2, f31, f21, f2
  3294. FMA1 f4, f30, f22, f4
  3295. FMA4 f7, f31, f22, f7
  3296. FMA2 f5, f30, f23, f5
  3297. FMA3 f6, f31, f23, f6
  /* Preload f20-f27 for the next pass from BO + 32..39. */
  3298. LFD f20, 32 * SIZE(BO)
  3299. LFD f21, 33 * SIZE(BO)
  3300. LFD f22, 34 * SIZE(BO)
  3301. LFD f23, 35 * SIZE(BO)
  3302. FMA1 f8, f30, f24, f8
  3303. FMA4 f11, f31, f24, f11
  3304. FMA2 f9, f30, f25, f9
  3305. FMA3 f10, f31, f25, f10
  3306. FMA1 f12, f30, f26, f12
  3307. FMA4 f15, f31, f26, f15
  3308. FMA2 f13, f30, f27, f13
  3309. FMA3 f14, f31, f27, f14
  3310. LFD f24, 36 * SIZE(BO)
  3311. LFD f25, 37 * SIZE(BO)
  3312. LFD f26, 38 * SIZE(BO)
  3313. LFD f27, 39 * SIZE(BO)
  /* Advance both streams past the data consumed by this pass, then loop while --CTR != 0. */
  3314. addi AO, AO, 8 * SIZE
  3315. addi BO, BO, 32 * SIZE
  3316. bdnz LL(22)
  3317. .align 4
  /* LL(25): compute the remainder trip count left over from the 4-step loop. */
  /* r0 = count & 3, where count is KK for LT/RN and TEMP otherwise; */
  /* CTR = r0, and the remainder loop is skipped (to LL(27)) when r0 is zero. */
  3318. LL(25):
  3319. #if defined(LT) || defined(RN)
  3320. andi. r0, KK, 3
  3321. #else
  3322. andi. r0, TEMP, 3
  3323. #endif
  3324. mtspr CTR, r0
  /* The branch tests CR0 as set by andi. above. */
  3325. ble LL(27)
  3326. .align 4
  /* LL(26): remainder loop, one step per pass, executed CTR times. */
  /* Uses the A pair (f16, f17) against the eight B values in f20-f27, */
  /* accumulating into f0-f15, then reloads those registers for the next pass */
  /* and advances AO by 2 * SIZE and BO by 8 * SIZE. */
  /* NOTE(review): FMA1..FMA4 are macros defined outside this chunk -- confirm their expansion. */
  3327. LL(26):
  3328. FMA1 f0, f16, f20, f0
  3329. FMA4 f3, f17, f20, f3
  3330. FMA2 f1, f16, f21, f1
  3331. FMA3 f2, f17, f21, f2
  3332. FMA1 f4, f16, f22, f4
  3333. FMA4 f7, f17, f22, f7
  3334. FMA2 f5, f16, f23, f5
  3335. FMA3 f6, f17, f23, f6
  /* f20-f23 are no longer needed for this step: refill them for the next one. */
  3336. LFD f20, 8 * SIZE(BO)
  3337. LFD f21, 9 * SIZE(BO)
  3338. LFD f22, 10 * SIZE(BO)
  3339. LFD f23, 11 * SIZE(BO)
  3340. FMA1 f8, f16, f24, f8
  3341. FMA4 f11, f17, f24, f11
  3342. FMA2 f9, f16, f25, f9
  3343. FMA3 f10, f17, f25, f10
  3344. FMA1 f12, f16, f26, f12
  3345. FMA4 f15, f17, f26, f15
  3346. FMA2 f13, f16, f27, f13
  3347. FMA3 f14, f17, f27, f14
  /* Refill the A pair and the rest of the B values (offsets relative to the old AO/BO). */
  3348. LFD f16, 2 * SIZE(AO)
  3349. LFD f17, 3 * SIZE(AO)
  3350. LFD f24, 12 * SIZE(BO)
  3351. LFD f25, 13 * SIZE(BO)
  3352. LFD f26, 14 * SIZE(BO)
  3353. LFD f27, 15 * SIZE(BO)
  3354. addi AO, AO, 2 * SIZE
  3355. addi BO, BO, 8 * SIZE
  3356. bdnz LL(26)
  3357. .align 4
  /* LL(27): finish the M & 1 tail for four columns. */
  /* Sequence: reposition AO/BO (LN/RT), fold the paired accumulators, subtract */
  /* them from the values loaded from the packed buffer, apply the variant- */
  /* specific solve step (LN / LT / RN / RT, each with a CONJ flavour), write */
  /* the results back to the packed buffer and to CO1..CO4, then update the */
  /* pointers and KK. */
  /* NOTE(review): the arithmetic notes below assume FMADD / FMSUB / FNMADD / */
  /* FNMSUB expand to the PowerPC fmadd / fmsub / fnmadd / fnmsub instructions; */
  /* those macro definitions are outside this chunk -- confirm. */
  3358. LL(27):
  3359. #if defined(LN) || defined(RT)
  /* r0 = KK - 1 (LN) or KK - 4 (RT); AO = AORIG + (r0 << ZBASE_SHIFT), */
  /* BO = B + (r0 << (2 + ZBASE_SHIFT)). */
  3360. #ifdef LN
  3361. subi r0, KK, 1
  3362. #else
  3363. subi r0, KK, 4
  3364. #endif
  3365. slwi TEMP, r0, 0 + ZBASE_SHIFT
  3366. slwi r0, r0, 2 + ZBASE_SHIFT
  3367. add AO, AORIG, TEMP
  3368. add BO, B, r0
  3369. #endif
  /* Fold the two partial sums kept per result component into one register each; */
  /* results live in (f0,f1), (f4,f5), (f8,f9), (f12,f13) from here on. */
  3370. FADD f0, f0, f2
  3371. FADD f1, f1, f3
  3372. FADD f4, f4, f6
  3373. FADD f5, f5, f7
  3374. FADD f8, f8, f10
  3375. FADD f9, f9, f11
  3376. FADD f12, f12, f14
  3377. FADD f13, f13, f15
  3378. #if defined(LN) || defined(LT)
  /* LN/LT: the eight right-hand-side values come from BO; result = loaded - accumulated. */
  3379. LFD f16, 0 * SIZE(BO)
  3380. LFD f17, 1 * SIZE(BO)
  3381. LFD f18, 2 * SIZE(BO)
  3382. LFD f19, 3 * SIZE(BO)
  3383. FSUB f0, f16, f0
  3384. FSUB f1, f17, f1
  3385. FSUB f4, f18, f4
  3386. FSUB f5, f19, f5
  3387. LFD f20, 4 * SIZE(BO)
  3388. LFD f21, 5 * SIZE(BO)
  3389. LFD f22, 6 * SIZE(BO)
  3390. LFD f23, 7 * SIZE(BO)
  3391. FSUB f8, f20, f8
  3392. FSUB f9, f21, f9
  3393. FSUB f12, f22, f12
  3394. FSUB f13, f23, f13
  3395. #else
  /* RN/RT: the eight right-hand-side values come from AO instead. */
  3396. LFD f16, 0 * SIZE(AO)
  3397. LFD f17, 1 * SIZE(AO)
  3398. LFD f20, 2 * SIZE(AO)
  3399. LFD f21, 3 * SIZE(AO)
  3400. FSUB f0, f16, f0
  3401. FSUB f1, f17, f1
  3402. FSUB f4, f20, f4
  3403. FSUB f5, f21, f5
  3404. LFD f24, 4 * SIZE(AO)
  3405. LFD f25, 5 * SIZE(AO)
  3406. LFD f28, 6 * SIZE(AO)
  3407. LFD f29, 7 * SIZE(AO)
  3408. FSUB f8, f24, f8
  3409. FSUB f9, f25, f9
  3410. FSUB f12, f28, f12
  3411. FSUB f13, f29, f13
  3412. #endif
  3413. #ifdef LN
  /* LN: scale all four result pairs by the pair (f28, f29) read from AO[0..1]. */
  /* The cross products with f29 are formed first (f16-f23) so the in-place */
  /* updates below still see the old values. */
  3414. LFD f28, 0 * SIZE(AO)
  3415. LFD f29, 1 * SIZE(AO)
  3416. FMUL f16, f29, f1
  3417. FMUL f17, f29, f0
  3418. FMUL f18, f29, f5
  3419. FMUL f19, f29, f4
  3420. FMUL f20, f29, f9
  3421. FMUL f21, f29, f8
  3422. FMUL f22, f29, f13
  3423. FMUL f23, f29, f12
  3424. #ifndef CONJ
  /* Non-CONJ: first = f28 * first - f29 * second, second = f28 * second + f29 * first. */
  3425. FMSUB f0, f28, f0, f16
  3426. FMADD f1, f28, f1, f17
  3427. FMSUB f4, f28, f4, f18
  3428. FMADD f5, f28, f5, f19
  3429. FMSUB f8, f28, f8, f20
  3430. FMADD f9, f28, f9, f21
  3431. FMSUB f12, f28, f12, f22
  3432. FMADD f13, f28, f13, f23
  3433. #else
  /* CONJ: same products with the sign of the f29 terms flipped. */
  3434. FMADD f0, f28, f0, f16
  3435. FMSUB f1, f28, f1, f17
  3436. FMADD f4, f28, f4, f18
  3437. FMSUB f5, f28, f5, f19
  3438. FMADD f8, f28, f8, f20
  3439. FMSUB f9, f28, f9, f21
  3440. FMADD f12, f28, f12, f22
  3441. FMSUB f13, f28, f13, f23
  3442. #endif
  3443. #endif
  3444. #ifdef LT
  /* LT: same scaling as the LN case, with the pair held in (f24, f25). */
  3445. LFD f24, 0 * SIZE(AO)
  3446. LFD f25, 1 * SIZE(AO)
  3447. FMUL f16, f25, f1
  3448. FMUL f17, f25, f0
  3449. FMUL f18, f25, f5
  3450. FMUL f19, f25, f4
  3451. FMUL f20, f25, f9
  3452. FMUL f21, f25, f8
  3453. FMUL f22, f25, f13
  3454. FMUL f23, f25, f12
  3455. #ifndef CONJ
  3456. FMSUB f0, f24, f0, f16
  3457. FMADD f1, f24, f1, f17
  3458. FMSUB f4, f24, f4, f18
  3459. FMADD f5, f24, f5, f19
  3460. FMSUB f8, f24, f8, f20
  3461. FMADD f9, f24, f9, f21
  3462. FMSUB f12, f24, f12, f22
  3463. FMADD f13, f24, f13, f23
  3464. #else
  3465. FMADD f0, f24, f0, f16
  3466. FMSUB f1, f24, f1, f17
  3467. FMADD f4, f24, f4, f18
  3468. FMSUB f5, f24, f5, f19
  3469. FMADD f8, f24, f8, f20
  3470. FMSUB f9, f24, f9, f21
  3471. FMADD f12, f24, f12, f22
  3472. FMSUB f13, f24, f13, f23
  3473. #endif
  3474. #endif
  3475. #ifdef RN
  /* RN: columns are processed in ascending order: (f0,f1), (f4,f5), (f8,f9), (f12,f13). */
  /* Each stage scales one pair by a coefficient pair from BO and then updates */
  /* the pairs that follow using further coefficients from BO. */
  /* Stage 1 reads BO[0..7]. */
  3476. LFD f24, 0 * SIZE(BO)
  3477. LFD f25, 1 * SIZE(BO)
  3478. LFD f26, 2 * SIZE(BO)
  3479. LFD f27, 3 * SIZE(BO)
  3480. LFD f28, 4 * SIZE(BO)
  3481. LFD f29, 5 * SIZE(BO)
  3482. LFD f30, 6 * SIZE(BO)
  3483. LFD f31, 7 * SIZE(BO)
  3484. FMUL f16, f25, f1
  3485. FMUL f17, f25, f0
  3486. #ifndef CONJ
  3487. FMSUB f0, f24, f0, f16
  3488. FMADD f1, f24, f1, f17
  /* Update the three later pairs with the freshly scaled (f0, f1). */
  3489. FMADD f4, f27, f1, f4
  3490. FNMSUB f5, f27, f0, f5
  3491. FNMSUB f4, f26, f0, f4
  3492. FNMSUB f5, f26, f1, f5
  3493. FMADD f8, f29, f1, f8
  3494. FNMSUB f9, f29, f0, f9
  3495. FNMSUB f8, f28, f0, f8
  3496. FNMSUB f9, f28, f1, f9
  3497. FMADD f12, f31, f1, f12
  3498. FNMSUB f13, f31, f0, f13
  3499. FNMSUB f12, f30, f0, f12
  3500. FNMSUB f13, f30, f1, f13
  /* Stage 2 reads BO[10..15]: scale (f4, f5), update (f8, f9) and (f12, f13). */
  3501. LFD f26, 10 * SIZE(BO)
  3502. LFD f27, 11 * SIZE(BO)
  3503. LFD f28, 12 * SIZE(BO)
  3504. LFD f29, 13 * SIZE(BO)
  3505. LFD f30, 14 * SIZE(BO)
  3506. LFD f31, 15 * SIZE(BO)
  3507. FMUL f16, f27, f5
  3508. FMUL f17, f27, f4
  3509. FMSUB f4, f26, f4, f16
  3510. FMADD f5, f26, f5, f17
  3511. FMADD f8, f29, f5, f8
  3512. FNMSUB f9, f29, f4, f9
  3513. FNMSUB f8, f28, f4, f8
  3514. FNMSUB f9, f28, f5, f9
  3515. FMADD f12, f31, f5, f12
  3516. FNMSUB f13, f31, f4, f13
  3517. FNMSUB f12, f30, f4, f12
  3518. FNMSUB f13, f30, f5, f13
  /* Stage 3 reads BO[20..23]; the stage-4 coefficient pair is BO[30..31]. */
  3519. LFD f26, 20 * SIZE(BO)
  3520. LFD f27, 21 * SIZE(BO)
  3521. LFD f28, 22 * SIZE(BO)
  3522. LFD f29, 23 * SIZE(BO)
  3523. LFD f30, 30 * SIZE(BO)
  3524. LFD f31, 31 * SIZE(BO)
  3525. FMUL f16, f27, f9
  3526. FMUL f17, f27, f8
  3527. FMSUB f8, f26, f8, f16
  3528. FMADD f9, f26, f9, f17
  3529. FMADD f12, f29, f9, f12
  3530. FNMSUB f13, f29, f8, f13
  3531. FNMSUB f12, f28, f8, f12
  3532. FNMSUB f13, f28, f9, f13
  /* Stage 4: scale the last pair (f12, f13). */
  3533. FMUL f16, f31, f13
  3534. FMUL f17, f31, f12
  3535. FMSUB f12, f30, f12, f16
  3536. FMADD f13, f30, f13, f17
  3537. #else
  /* CONJ flavour of the same four stages (signs of the cross terms differ). */
  3538. FMADD f0, f24, f0, f16
  3539. FMSUB f1, f24, f1, f17
  3540. FMSUB f4, f27, f1, f4
  3541. FNMADD f5, f27, f0, f5
  3542. FNMADD f4, f26, f0, f4
  3543. FNMADD f5, f26, f1, f5
  3544. FMSUB f8, f29, f1, f8
  3545. FNMADD f9, f29, f0, f9
  3546. FNMADD f8, f28, f0, f8
  3547. FNMADD f9, f28, f1, f9
  3548. FMSUB f12, f31, f1, f12
  3549. FNMADD f13, f31, f0, f13
  3550. FNMADD f12, f30, f0, f12
  3551. FNMADD f13, f30, f1, f13
  3552. LFD f26, 10 * SIZE(BO)
  3553. LFD f27, 11 * SIZE(BO)
  3554. LFD f28, 12 * SIZE(BO)
  3555. LFD f29, 13 * SIZE(BO)
  3556. LFD f30, 14 * SIZE(BO)
  3557. LFD f31, 15 * SIZE(BO)
  3558. FMUL f16, f27, f5
  3559. FMUL f17, f27, f4
  3560. FMADD f4, f26, f4, f16
  3561. FMSUB f5, f26, f5, f17
  3562. FMSUB f8, f29, f5, f8
  3563. FNMADD f9, f29, f4, f9
  3564. FNMADD f8, f28, f4, f8
  3565. FNMADD f9, f28, f5, f9
  3566. FMSUB f12, f31, f5, f12
  3567. FNMADD f13, f31, f4, f13
  3568. FNMADD f12, f30, f4, f12
  3569. FNMADD f13, f30, f5, f13
  3570. LFD f26, 20 * SIZE(BO)
  3571. LFD f27, 21 * SIZE(BO)
  3572. LFD f28, 22 * SIZE(BO)
  3573. LFD f29, 23 * SIZE(BO)
  3574. LFD f30, 30 * SIZE(BO)
  3575. LFD f31, 31 * SIZE(BO)
  3576. FMUL f16, f27, f9
  3577. FMUL f17, f27, f8
  3578. FMADD f8, f26, f8, f16
  3579. FMSUB f9, f26, f9, f17
  3580. FMSUB f12, f29, f9, f12
  3581. FNMADD f13, f29, f8, f13
  3582. FNMADD f12, f28, f8, f12
  3583. FNMADD f13, f28, f9, f13
  3584. FMUL f16, f31, f13
  3585. FMUL f17, f31, f12
  3586. FMADD f12, f30, f12, f16
  3587. FMSUB f13, f30, f13, f17
  3588. #endif
  3589. #endif
  3590. #ifdef RT
  /* RT: mirror image of RN -- columns are processed in descending order, */
  /* starting with (f12, f13) and ending with (f0, f1). */
  /* Stage 1 reads BO[24..31]. */
  3591. LFD f24, 30 * SIZE(BO)
  3592. LFD f25, 31 * SIZE(BO)
  3593. LFD f26, 28 * SIZE(BO)
  3594. LFD f27, 29 * SIZE(BO)
  3595. LFD f28, 26 * SIZE(BO)
  3596. LFD f29, 27 * SIZE(BO)
  3597. LFD f30, 24 * SIZE(BO)
  3598. LFD f31, 25 * SIZE(BO)
  3599. FMUL f16, f25, f13
  3600. FMUL f17, f25, f12
  3601. #ifndef CONJ
  3602. FMSUB f12, f24, f12, f16
  3603. FMADD f13, f24, f13, f17
  /* Update the three earlier pairs with the freshly scaled (f12, f13). */
  3604. FMADD f8, f27, f13, f8
  3605. FNMSUB f9, f27, f12, f9
  3606. FNMSUB f8, f26, f12, f8
  3607. FNMSUB f9, f26, f13, f9
  3608. FMADD f4, f29, f13, f4
  3609. FNMSUB f5, f29, f12, f5
  3610. FNMSUB f4, f28, f12, f4
  3611. FNMSUB f5, f28, f13, f5
  3612. FMADD f0, f31, f13, f0
  3613. FNMSUB f1, f31, f12, f1
  3614. FNMSUB f0, f30, f12, f0
  3615. FNMSUB f1, f30, f13, f1
  /* Stage 2 reads BO[16..21]: scale (f8, f9), update (f4, f5) and (f0, f1). */
  3616. LFD f26, 20 * SIZE(BO)
  3617. LFD f27, 21 * SIZE(BO)
  3618. LFD f28, 18 * SIZE(BO)
  3619. LFD f29, 19 * SIZE(BO)
  3620. LFD f30, 16 * SIZE(BO)
  3621. LFD f31, 17 * SIZE(BO)
  3622. FMUL f16, f27, f9
  3623. FMUL f17, f27, f8
  3624. FMSUB f8, f26, f8, f16
  3625. FMADD f9, f26, f9, f17
  3626. FMADD f4, f29, f9, f4
  3627. FNMSUB f5, f29, f8, f5
  3628. FNMSUB f4, f28, f8, f4
  3629. FNMSUB f5, f28, f9, f5
  3630. FMADD f0, f31, f9, f0
  3631. FNMSUB f1, f31, f8, f1
  3632. FNMSUB f0, f30, f8, f0
  3633. FNMSUB f1, f30, f9, f1
  /* Stage 3 reads BO[8..11]; the stage-4 coefficient pair is BO[0..1]. */
  3634. LFD f26, 10 * SIZE(BO)
  3635. LFD f27, 11 * SIZE(BO)
  3636. LFD f28, 8 * SIZE(BO)
  3637. LFD f29, 9 * SIZE(BO)
  3638. LFD f30, 0 * SIZE(BO)
  3639. LFD f31, 1 * SIZE(BO)
  3640. FMUL f16, f27, f5
  3641. FMUL f17, f27, f4
  3642. FMSUB f4, f26, f4, f16
  3643. FMADD f5, f26, f5, f17
  3644. FMADD f0, f29, f5, f0
  3645. FNMSUB f1, f29, f4, f1
  3646. FNMSUB f0, f28, f4, f0
  3647. FNMSUB f1, f28, f5, f1
  /* Stage 4: scale the first pair (f0, f1). */
  3648. FMUL f16, f31, f1
  3649. FMUL f17, f31, f0
  3650. FMSUB f0, f30, f0, f16
  3651. FMADD f1, f30, f1, f17
  3652. #else
  /* CONJ flavour of the same four stages (signs of the cross terms differ). */
  3653. FMADD f12, f24, f12, f16
  3654. FMSUB f13, f24, f13, f17
  3655. FMSUB f8, f27, f13, f8
  3656. FNMADD f9, f27, f12, f9
  3657. FNMADD f8, f26, f12, f8
  3658. FNMADD f9, f26, f13, f9
  3659. FMSUB f4, f29, f13, f4
  3660. FNMADD f5, f29, f12, f5
  3661. FNMADD f4, f28, f12, f4
  3662. FNMADD f5, f28, f13, f5
  3663. FMSUB f0, f31, f13, f0
  3664. FNMADD f1, f31, f12, f1
  3665. FNMADD f0, f30, f12, f0
  3666. FNMADD f1, f30, f13, f1
  3667. LFD f26, 20 * SIZE(BO)
  3668. LFD f27, 21 * SIZE(BO)
  3669. LFD f28, 18 * SIZE(BO)
  3670. LFD f29, 19 * SIZE(BO)
  3671. LFD f30, 16 * SIZE(BO)
  3672. LFD f31, 17 * SIZE(BO)
  3673. FMUL f16, f27, f9
  3674. FMUL f17, f27, f8
  3675. FMADD f8, f26, f8, f16
  3676. FMSUB f9, f26, f9, f17
  3677. FMSUB f4, f29, f9, f4
  3678. FNMADD f5, f29, f8, f5
  3679. FNMADD f4, f28, f8, f4
  3680. FNMADD f5, f28, f9, f5
  3681. FMSUB f0, f31, f9, f0
  3682. FNMADD f1, f31, f8, f1
  3683. FNMADD f0, f30, f8, f0
  3684. FNMADD f1, f30, f9, f1
  3685. LFD f26, 10 * SIZE(BO)
  3686. LFD f27, 11 * SIZE(BO)
  3687. LFD f28, 8 * SIZE(BO)
  3688. LFD f29, 9 * SIZE(BO)
  3689. LFD f30, 0 * SIZE(BO)
  3690. LFD f31, 1 * SIZE(BO)
  3691. FMUL f16, f27, f5
  3692. FMUL f17, f27, f4
  3693. FMADD f4, f26, f4, f16
  3694. FMSUB f5, f26, f5, f17
  3695. FMSUB f0, f29, f5, f0
  3696. FNMADD f1, f29, f4, f1
  3697. FNMADD f0, f28, f4, f0
  3698. FNMADD f1, f28, f5, f1
  3699. FMUL f16, f31, f1
  3700. FMUL f17, f31, f0
  3701. FMADD f0, f30, f0, f16
  3702. FMSUB f1, f30, f1, f17
  3703. #endif
  3704. #endif
  3705. #ifdef LN
  /* LN: step the four output pointers back by 2 * SIZE before storing. */
  3706. subi CO1, CO1, 2 * SIZE
  3707. subi CO2, CO2, 2 * SIZE
  3708. subi CO3, CO3, 2 * SIZE
  3709. subi CO4, CO4, 2 * SIZE
  3710. #endif
  /* Write the eight results back to the buffer they were read from */
  /* (BO for LN/LT, AO otherwise), using the same slot layout as the loads above. */
  3711. #if defined(LN) || defined(LT)
  3712. STFD f0, 0 * SIZE(BO)
  3713. STFD f1, 1 * SIZE(BO)
  3714. STFD f4, 2 * SIZE(BO)
  3715. STFD f5, 3 * SIZE(BO)
  3716. STFD f8, 4 * SIZE(BO)
  3717. STFD f9, 5 * SIZE(BO)
  3718. STFD f12, 6 * SIZE(BO)
  3719. STFD f13, 7 * SIZE(BO)
  3720. #else
  3721. STFD f0, 0 * SIZE(AO)
  3722. STFD f1, 1 * SIZE(AO)
  3723. STFD f4, 2 * SIZE(AO)
  3724. STFD f5, 3 * SIZE(AO)
  3725. STFD f8, 4 * SIZE(AO)
  3726. STFD f9, 5 * SIZE(AO)
  3727. STFD f12, 6 * SIZE(AO)
  3728. STFD f13, 7 * SIZE(AO)
  3729. #endif
  /* Store one result pair through each of the four output pointers. */
  3730. STFD f0, 0 * SIZE(CO1)
  3731. STFD f1, 1 * SIZE(CO1)
  3732. STFD f4, 0 * SIZE(CO2)
  3733. STFD f5, 1 * SIZE(CO2)
  3734. STFD f8, 0 * SIZE(CO3)
  3735. STFD f9, 1 * SIZE(CO3)
  3736. STFD f12, 0 * SIZE(CO4)
  3737. STFD f13, 1 * SIZE(CO4)
  /* NOTE(review): unlike the stores at the end of the preceding two-row block, */
  /* f0-f15 are not reset to zero here -- presumably they are re-initialised at */
  /* LL(10), which is outside this chunk; confirm. */
  3738. #ifndef LN
  3739. addi CO1, CO1, 2 * SIZE
  3740. addi CO2, CO2, 2 * SIZE
  3741. addi CO3, CO3, 2 * SIZE
  3742. addi CO4, CO4, 2 * SIZE
  3743. #endif
  3744. #ifdef RT
  /* RT: AORIG += K << ZBASE_SHIFT. */
  3745. slwi r0, K, 0 + ZBASE_SHIFT
  3746. add AORIG, AORIG, r0
  3747. #endif
  3748. #if defined(LT) || defined(RN)
  /* LT/RN: advance AO by (K - KK) << ZBASE_SHIFT and BO by (K - KK) << (2 + ZBASE_SHIFT). */
  3749. sub TEMP, K, KK
  3750. slwi r0, TEMP, 0 + ZBASE_SHIFT
  3751. slwi TEMP, TEMP, 2 + ZBASE_SHIFT
  3752. add AO, AO, r0
  3753. add BO, BO, TEMP
  3754. #endif
  3755. #ifdef LT
  3756. addi KK, KK, 1
  3757. #endif
  3758. #ifdef LN
  3759. subi KK, KK, 1
  3760. #endif
  3761. .align 4
  /* LL(29): bookkeeping at the end of one J iteration (a group of four columns). */
  /* Updates B and KK according to the build variant, then decrements J and */
  /* loops back to LL(10) while J is still positive. */
  3762. LL(29):
  3763. #ifdef LN
  /* LN: B += K << (2 + ZBASE_SHIFT). */
  3764. slwi r0, K, 2 + ZBASE_SHIFT
  3765. add B, B, r0
  3766. #endif
  3767. #if defined(LT) || defined(RN)
  /* LT/RN: B takes over the current BO position. */
  3768. mr B, BO
  3769. #endif
  3770. #ifdef RN
  3771. addi KK, KK, 4
  3772. #endif
  3773. #ifdef RT
  3774. subi KK, KK, 4
  3775. #endif
  /* addic. sets CR0 from the decremented J; bgt repeats the column loop while J > 0. */
  3776. addic. J, J, -1
  3777. bgt LL(10)
  3778. .align 4
  /* LL(999): common exit path. */
  /* Sets r3 to 0 (r3 is the integer return register in the PowerPC ABIs), */
  /* reloads f14-f31 and r19-r31 from the save area at SP, releases the */
  /* STACKSIZE-byte frame and returns. */
  3779. LL(999):
  3780. addi r3, 0, 0
  /* Floating-point registers f14-f31: eight bytes each at SP + 0 .. SP + 136. */
  3781. lfd f14, 0(SP)
  3782. lfd f15, 8(SP)
  3783. lfd f16, 16(SP)
  3784. lfd f17, 24(SP)
  3785. lfd f18, 32(SP)
  3786. lfd f19, 40(SP)
  3787. lfd f20, 48(SP)
  3788. lfd f21, 56(SP)
  3789. lfd f22, 64(SP)
  3790. lfd f23, 72(SP)
  3791. lfd f24, 80(SP)
  3792. lfd f25, 88(SP)
  3793. lfd f26, 96(SP)
  3794. lfd f27, 104(SP)
  3795. lfd f28, 112(SP)
  3796. lfd f29, 120(SP)
  3797. lfd f30, 128(SP)
  3798. lfd f31, 136(SP)
  /* Integer registers r31 down to r19, starting at SP + 144: */
  /* eight-byte slots (ld) on 64-bit builds, four-byte slots (lwz) otherwise. */
  3799. #ifdef __64BIT__
  3800. ld r31, 144(SP)
  3801. ld r30, 152(SP)
  3802. ld r29, 160(SP)
  3803. ld r28, 168(SP)
  3804. ld r27, 176(SP)
  3805. ld r26, 184(SP)
  3806. ld r25, 192(SP)
  3807. ld r24, 200(SP)
  3808. ld r23, 208(SP)
  3809. ld r22, 216(SP)
  3810. ld r21, 224(SP)
  3811. ld r20, 232(SP)
  3812. ld r19, 240(SP)
  3813. #else
  3814. lwz r31, 144(SP)
  3815. lwz r30, 148(SP)
  3816. lwz r29, 152(SP)
  3817. lwz r28, 156(SP)
  3818. lwz r27, 160(SP)
  3819. lwz r26, 164(SP)
  3820. lwz r25, 168(SP)
  3821. lwz r24, 172(SP)
  3822. lwz r23, 176(SP)
  3823. lwz r22, 180(SP)
  3824. lwz r21, 184(SP)
  3825. lwz r20, 188(SP)
  3826. lwz r19, 192(SP)
  3827. #endif
  /* Pop the frame and return to the caller; EPILOGUE is a macro defined outside this chunk. */
  3828. addi SP, SP, STACKSIZE
  3829. blr
  3830. EPILOGUE
  3831. #endif