You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

ztrsm_kernel_power6_LN.S 85 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
742784279428042814282428342844285428642874288428942904291429242934294429542964297429842994300430143024303430443054306430743084309431043114312431343144315431643174318431943204321432243234324432543264327432843294330433143324333433443354336433743384339434043414342434343444345434643474348434943504351435243534354435543564357435843594360436143624363436443654366436743684369437043714372437343744375437643774378437943804381438243834384438543864387438843894390439143924393439443954396439743984399440044014402440344044405440644074408440944104411441244134414441544164417441844194420442144224423442444254426442744284429443044314432443344344435443644374438443944404441444244434444444544464447444844494450445144524453445444554456445744584459446044614462446344644465446644674468446944704471447244734474447544764477447844794480448144824483448444854486448744884489449044914492449344944495449644974498449945004501450245034504450545064507450845094510451145124513451445154516451745184519452045214522452345244525452645274528452945304531453245334534453545364537453845394540454145424543454445454546454745484549455045514552455345544555455645574558455945604561456245634564456545664567456845694570457145724573457445754576457745784579458045814582458345844585458645874588458945904591459245934594459545964597459845994600460146024603460446054606460746084609461046114612461346144615461646174618461946204621462246234624462546264627462846294630463146324633463446354636463746384639464046414642464346444645464646474648464946504651465246534654465546564657465846594660466146624663466446654666466746684669467046714672467346744675467646774678467946804681468246834684468546864687468846894690469146924693469446954696469746984699470047014702470347044705470647074708470947104711471247134714471547164717471847194720
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Word-size dependent settings. */
  /* LOAD is the integer load mnemonic: lwz when __64BIT__ is not defined, */
  /* ld when it is. */
  40. #ifndef __64BIT__
  41. #define LOAD lwz
  42. #else
  43. #define LOAD ld
  44. #endif
  /* STACKSIZE is the byte count the prologue subtracts from SP. */
  /* FZERO is the frame slot the prologue fills with a zero word (li r0, 0 */
  /* followed by stw r0, FZERO); it is read back with lfs to clear the */
  /* accumulators f0-f15 at the top of LL(10). */
  45. #ifdef __64BIT__
  46. #define STACKSIZE 320
  47. #define FZERO 312(SP)
  48. #else
  49. #define STACKSIZE 256
  50. #define FZERO 240(SP)
  51. #endif
  /* Argument register aliases. */
  /* M, N and K are r3, r4 and r5 in every configuration. Each one is */
  /* compared against 0 after the prologue and the code branches to LL(999) */
  /* when it is not positive. Presumably the problem dimensions; confirm */
  /* against the caller. */
  52. #define M r3
  53. #define N r4
  54. #define K r5
  /* linux / FreeBSD mapping. */
  /* 32-bit: A, B, C, LDC, OFFSET are r6-r10; the prologue loads none of */
  /* them from the stack. */
  /* 64-bit: A, B, C are r8-r10; LDC (r6) and OFFSET (r7) are loaded by the */
  /* prologue from FRAMESLOT(0) and FRAMESLOT(1) above the new frame. */
  55. #if defined(linux) || defined(__FreeBSD__)
  56. #ifndef __64BIT__
  57. #define A r6
  58. #define B r7
  59. #define C r8
  60. #define LDC r9
  61. #define OFFSET r10
  62. #else
  63. #define A r8
  64. #define B r9
  65. #define C r10
  66. #define LDC r6
  67. #define OFFSET r7
  68. #endif
  69. #endif
  /* AIX / Apple mapping. */
  /* 32-bit with DOUBLE: A is r10; B, C, LDC, OFFSET are r6-r9 and are */
  /* loaded by the prologue from FRAMESLOT(0) through FRAMESLOT(3). */
  /* Otherwise: A, B, C are r8-r10; LDC (r6) and OFFSET (r7) are loaded */
  /* from FRAMESLOT(0) and FRAMESLOT(1). */
  /* NOTE(review): if neither platform group is defined, A, B, C, LDC and */
  /* OFFSET are not defined by this block. */
  70. #if defined(_AIX) || defined(__APPLE__)
  71. #if !defined(__64BIT__) && defined(DOUBLE)
  72. #define A r10
  73. #define B r6
  74. #define C r7
  75. #define LDC r8
  76. #define OFFSET r9
  77. #else
  78. #define A r8
  79. #define B r9
  80. #define C r10
  81. #define LDC r6
  82. #define OFFSET r7
  83. #endif
  84. #endif
  /* Working register aliases, r19-r31. The prologue stores all thirteen of */
  /* these registers into the frame before they are used. */
  /* AORIG: receives a copy of A in the LN and RT variants. */
  /* TEMP:  scratch for shifted offsets and the K - KK count. */
  /* KK:    derived from OFFSET (negated, N - OFFSET, M + OFFSET, or copied, */
  /*        depending on RN / RT / LN / LT). */
  85. #define AORIG r19
  86. #define TEMP r20
  87. #define KK r21
  /* I is set from M (andi. with 1) and J from N (srawi. by 2). */
  88. #define I r22
  89. #define J r23
  /* AO and BO are the base registers for the LFD loads in the inner loops. */
  90. #define AO r24
  91. #define BO r25
  /* CO1 is set to C; CO2, CO3, CO4 each add LDC to the previous one. */
  92. #define CO1 r26
  93. #define CO2 r27
  94. #define CO3 r28
  95. #define CO4 r29
  /* PREA and PREC are loaded with 48 * SIZE and -4 * SIZE. Presumably */
  /* prefetch distances; no use is visible in this part of the file, so */
  /* confirm further down. */
  96. #define PREA r30
  97. #define PREC r31
  /* FMA1..FMA4 choose between FMADD and FNMSUB for the four multiply- */
  /* accumulate terms issued per operand pair in the inner loops LL(22) and */
  /* LL(26). Three variants are selected: */
  /*   CONJ not defined:           FMA3 is FNMSUB, the rest FMADD. */
  /*   CONJ with LN or LT:         FMA4 is FNMSUB, the rest FMADD. */
  /*   CONJ without LN or LT:      FMA2 is FNMSUB, the rest FMADD. */
  /* NOTE(review): this presumably encodes the sign pattern of a complex */
  /* product, plain or conjugated. FMADD and FNMSUB are not defined in this */
  /* file; presumably they come from common.h. Confirm both. */
  98. #ifndef CONJ
  99. #define FMA1 FMADD
  100. #define FMA2 FMADD
  101. #define FMA3 FNMSUB
  102. #define FMA4 FMADD
  103. #elif defined(LN) || defined(LT)
  104. #define FMA1 FMADD
  105. #define FMA2 FMADD
  106. #define FMA3 FMADD
  107. #define FMA4 FNMSUB
  108. #else
  109. #define FMA1 FMADD
  110. #define FMA2 FNMSUB
  111. #define FMA3 FMADD
  112. #define FMA4 FMADD
  113. #endif
  114. #ifndef NEEDPARAM
  115. PROLOGUE
  116. PROFCODE
  117. addi SP, SP, -STACKSIZE
  118. li r0, 0
  119. stfd f14, 0(SP)
  120. stfd f15, 8(SP)
  121. stfd f16, 16(SP)
  122. stfd f17, 24(SP)
  123. stfd f18, 32(SP)
  124. stfd f19, 40(SP)
  125. stfd f20, 48(SP)
  126. stfd f21, 56(SP)
  127. stfd f22, 64(SP)
  128. stfd f23, 72(SP)
  129. stfd f24, 80(SP)
  130. stfd f25, 88(SP)
  131. stfd f26, 96(SP)
  132. stfd f27, 104(SP)
  133. stfd f28, 112(SP)
  134. stfd f29, 120(SP)
  135. stfd f30, 128(SP)
  136. stfd f31, 136(SP)
  137. #ifdef __64BIT__
  138. std r31, 144(SP)
  139. std r30, 152(SP)
  140. std r29, 160(SP)
  141. std r28, 168(SP)
  142. std r27, 176(SP)
  143. std r26, 184(SP)
  144. std r25, 192(SP)
  145. std r24, 200(SP)
  146. std r23, 208(SP)
  147. std r22, 216(SP)
  148. std r21, 224(SP)
  149. std r20, 232(SP)
  150. std r19, 240(SP)
  151. #else
  152. stw r31, 144(SP)
  153. stw r30, 148(SP)
  154. stw r29, 152(SP)
  155. stw r28, 156(SP)
  156. stw r27, 160(SP)
  157. stw r26, 164(SP)
  158. stw r25, 168(SP)
  159. stw r24, 172(SP)
  160. stw r23, 176(SP)
  161. stw r22, 180(SP)
  162. stw r21, 184(SP)
  163. stw r20, 188(SP)
  164. stw r19, 192(SP)
  165. #endif
  166. stw r0, FZERO
  167. #if defined(linux) || defined(__FreeBSD__)
  168. #ifdef __64BIT__
  169. ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
  170. #endif
  171. #endif
  172. #if defined(_AIX) || defined(__APPLE__)
  173. #ifdef __64BIT__
  174. ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
  175. #else
  176. #ifdef DOUBLE
  177. lwz B, FRAMESLOT(0) + STACKSIZE(SP)
  178. lwz C, FRAMESLOT(1) + STACKSIZE(SP)
  179. lwz LDC, FRAMESLOT(2) + STACKSIZE(SP)
  180. #else
  181. lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
  182. #endif
  183. #endif
  184. #endif
  185. #if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
  186. ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
  187. #endif
  188. #if defined(_AIX) || defined(__APPLE__)
  189. #ifdef __64BIT__
  190. ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
  191. #else
  192. #ifdef DOUBLE
  193. lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP)
  194. #else
  195. lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
  196. #endif
  197. #endif
  198. #endif
  199. slwi LDC, LDC, ZBASE_SHIFT
  200. #ifdef LN
  201. mullw r0, M, K
  202. slwi r0, r0, ZBASE_SHIFT
  203. add A, A, r0
  204. slwi r0, M, ZBASE_SHIFT
  205. add C, C, r0
  206. #endif
  207. #ifdef RN
  208. neg KK, OFFSET
  209. #endif
  210. #ifdef RT
  211. mullw r0, N, K
  212. slwi r0, r0, ZBASE_SHIFT
  213. add B, B, r0
  214. mullw r0, N, LDC
  215. add C, C, r0
  216. sub KK, N, OFFSET
  217. #endif
  218. cmpwi cr0, M, 0
  219. ble LL(999)
  220. cmpwi cr0, N, 0
  221. ble LL(999)
  222. cmpwi cr0, K, 0
  223. ble LL(999)
  224. li PREA, 48 * SIZE
  225. li PREC, -4 * SIZE
  226. srawi. J, N, 2
  227. ble LL(30)
  228. .align 4
  229. LL(10):
  230. #ifdef RT
  231. slwi r0, K, 2 + ZBASE_SHIFT
  232. sub B, B, r0
  233. slwi r0, LDC, 2
  234. sub C, C, r0
  235. #endif
  236. mr CO1, C
  237. add CO2, C, LDC
  238. add CO3, CO2, LDC
  239. add CO4, CO3, LDC
  240. #ifdef LN
  241. add KK, M, OFFSET
  242. #endif
  243. #ifdef LT
  244. mr KK, OFFSET
  245. #endif
  246. lfs f0, FZERO
  247. fmr f1, f0
  248. fmr f2, f0
  249. fmr f3, f0
  250. fmr f4, f0
  251. fmr f5, f0
  252. fmr f6, f0
  253. fmr f7, f0
  254. fmr f8, f0
  255. fmr f9, f0
  256. fmr f10, f0
  257. fmr f11, f0
  258. fmr f12, f0
  259. fmr f13, f0
  260. fmr f14, f0
  261. fmr f15, f0
  262. #if defined(LN) || defined(RT)
  263. mr AORIG, A
  264. #else
  265. mr AO, A
  266. #endif
  267. #ifndef RT
  268. add C, CO4, LDC
  269. #endif
  270. andi. I, M, 1
  271. ble LL(20)
  272. #if defined(LT) || defined(RN)
  273. LFD f16, 0 * SIZE(AO)
  274. LFD f17, 1 * SIZE(AO)
  275. LFD f18, 2 * SIZE(AO)
  276. LFD f19, 3 * SIZE(AO)
  277. LFD f20, 0 * SIZE(B)
  278. LFD f21, 1 * SIZE(B)
  279. LFD f22, 2 * SIZE(B)
  280. LFD f23, 3 * SIZE(B)
  281. LFD f24, 4 * SIZE(B)
  282. LFD f25, 5 * SIZE(B)
  283. LFD f26, 6 * SIZE(B)
  284. LFD f27, 7 * SIZE(B)
  285. srawi. r0, KK, 2
  286. mr BO, B
  287. mtspr CTR, r0
  288. #else
  289. #ifdef LN
  290. slwi r0, K, 0 + ZBASE_SHIFT
  291. sub AORIG, AORIG, r0
  292. #endif
  293. slwi r0, KK, 0 + ZBASE_SHIFT
  294. slwi TEMP, KK, 2 + ZBASE_SHIFT
  295. add AO, AORIG, r0
  296. add BO, B, TEMP
  297. sub TEMP, K, KK
  298. LFD f16, 0 * SIZE(AO)
  299. LFD f17, 1 * SIZE(AO)
  300. LFD f18, 2 * SIZE(AO)
  301. LFD f19, 3 * SIZE(AO)
  302. LFD f20, 0 * SIZE(BO)
  303. LFD f21, 1 * SIZE(BO)
  304. LFD f22, 2 * SIZE(BO)
  305. LFD f23, 3 * SIZE(BO)
  306. LFD f24, 4 * SIZE(BO)
  307. LFD f25, 5 * SIZE(BO)
  308. LFD f26, 6 * SIZE(BO)
  309. LFD f27, 7 * SIZE(BO)
  310. srawi. r0, TEMP, 2
  311. mtspr CTR, r0
  312. #endif
  313. ble LL(25)
  314. .align 4
  315. LL(22):
  316. FMA1 f0, f16, f20, f0
  317. FMA4 f3, f17, f20, f3
  318. FMA2 f1, f16, f21, f1
  319. FMA3 f2, f17, f21, f2
  320. LFD f28, 4 * SIZE(AO)
  321. LFD f29, 5 * SIZE(AO)
  322. LFD f30, 6 * SIZE(AO)
  323. LFD f31, 7 * SIZE(AO)
  324. FMA1 f4, f16, f22, f4
  325. FMA4 f7, f17, f22, f7
  326. FMA2 f5, f16, f23, f5
  327. FMA3 f6, f17, f23, f6
  328. LFD f20, 8 * SIZE(BO)
  329. LFD f21, 9 * SIZE(BO)
  330. LFD f22, 10 * SIZE(BO)
  331. LFD f23, 11 * SIZE(BO)
  332. FMA1 f8, f16, f24, f8
  333. FMA4 f11, f17, f24, f11
  334. FMA2 f9, f16, f25, f9
  335. FMA3 f10, f17, f25, f10
  336. FMA1 f12, f16, f26, f12
  337. FMA4 f15, f17, f26, f15
  338. FMA2 f13, f16, f27, f13
  339. FMA3 f14, f17, f27, f14
  340. LFD f24, 12 * SIZE(BO)
  341. LFD f25, 13 * SIZE(BO)
  342. LFD f26, 14 * SIZE(BO)
  343. LFD f27, 15 * SIZE(BO)
  344. FMA1 f0, f18, f20, f0
  345. FMA4 f3, f19, f20, f3
  346. FMA2 f1, f18, f21, f1
  347. FMA3 f2, f19, f21, f2
  348. FMA1 f4, f18, f22, f4
  349. FMA4 f7, f19, f22, f7
  350. FMA2 f5, f18, f23, f5
  351. FMA3 f6, f19, f23, f6
  352. LFD f20, 16 * SIZE(BO)
  353. LFD f21, 17 * SIZE(BO)
  354. LFD f22, 18 * SIZE(BO)
  355. LFD f23, 19 * SIZE(BO)
  356. FMA1 f8, f18, f24, f8
  357. FMA4 f11, f19, f24, f11
  358. FMA2 f9, f18, f25, f9
  359. FMA3 f10, f19, f25, f10
  360. FMA1 f12, f18, f26, f12
  361. FMA4 f15, f19, f26, f15
  362. FMA2 f13, f18, f27, f13
  363. FMA3 f14, f19, f27, f14
  364. LFD f24, 20 * SIZE(BO)
  365. LFD f25, 21 * SIZE(BO)
  366. LFD f26, 22 * SIZE(BO)
  367. LFD f27, 23 * SIZE(BO)
  368. FMA1 f0, f28, f20, f0
  369. FMA4 f3, f29, f20, f3
  370. FMA2 f1, f28, f21, f1
  371. FMA3 f2, f29, f21, f2
  372. LFD f16, 8 * SIZE(AO)
  373. LFD f17, 9 * SIZE(AO)
  374. LFD f18, 10 * SIZE(AO)
  375. LFD f19, 11 * SIZE(AO)
  376. FMA1 f4, f28, f22, f4
  377. FMA4 f7, f29, f22, f7
  378. FMA2 f5, f28, f23, f5
  379. FMA3 f6, f29, f23, f6
  380. LFD f20, 24 * SIZE(BO)
  381. LFD f21, 25 * SIZE(BO)
  382. LFD f22, 26 * SIZE(BO)
  383. LFD f23, 27 * SIZE(BO)
  384. FMA1 f8, f28, f24, f8
  385. FMA4 f11, f29, f24, f11
  386. FMA2 f9, f28, f25, f9
  387. FMA3 f10, f29, f25, f10
  388. FMA1 f12, f28, f26, f12
  389. FMA4 f15, f29, f26, f15
  390. FMA2 f13, f28, f27, f13
  391. FMA3 f14, f29, f27, f14
  392. LFD f24, 28 * SIZE(BO)
  393. LFD f25, 29 * SIZE(BO)
  394. LFD f26, 30 * SIZE(BO)
  395. LFD f27, 31 * SIZE(BO)
  396. FMA1 f0, f30, f20, f0
  397. FMA4 f3, f31, f20, f3
  398. FMA2 f1, f30, f21, f1
  399. FMA3 f2, f31, f21, f2
  400. FMA1 f4, f30, f22, f4
  401. FMA4 f7, f31, f22, f7
  402. FMA2 f5, f30, f23, f5
  403. FMA3 f6, f31, f23, f6
  404. LFD f20, 32 * SIZE(BO)
  405. LFD f21, 33 * SIZE(BO)
  406. LFD f22, 34 * SIZE(BO)
  407. LFD f23, 35 * SIZE(BO)
  408. FMA1 f8, f30, f24, f8
  409. FMA4 f11, f31, f24, f11
  410. FMA2 f9, f30, f25, f9
  411. FMA3 f10, f31, f25, f10
  412. FMA1 f12, f30, f26, f12
  413. FMA4 f15, f31, f26, f15
  414. FMA2 f13, f30, f27, f13
  415. FMA3 f14, f31, f27, f14
  416. LFD f24, 36 * SIZE(BO)
  417. LFD f25, 37 * SIZE(BO)
  418. LFD f26, 38 * SIZE(BO)
  419. LFD f27, 39 * SIZE(BO)
  420. addi AO, AO, 8 * SIZE
  421. addi BO, BO, 32 * SIZE
  422. bdnz LL(22)
  423. .align 4
  424. LL(25):
  425. #if defined(LT) || defined(RN)
  426. andi. r0, KK, 3
  427. #else
  428. andi. r0, TEMP, 3
  429. #endif
  430. mtspr CTR, r0
  431. ble LL(27)
  432. .align 4
  433. LL(26):
  434. FMA1 f0, f16, f20, f0
  435. FMA4 f3, f17, f20, f3
  436. FMA2 f1, f16, f21, f1
  437. FMA3 f2, f17, f21, f2
  438. FMA1 f4, f16, f22, f4
  439. FMA4 f7, f17, f22, f7
  440. FMA2 f5, f16, f23, f5
  441. FMA3 f6, f17, f23, f6
  442. LFD f20, 8 * SIZE(BO)
  443. LFD f21, 9 * SIZE(BO)
  444. LFD f22, 10 * SIZE(BO)
  445. LFD f23, 11 * SIZE(BO)
  446. FMA1 f8, f16, f24, f8
  447. FMA4 f11, f17, f24, f11
  448. FMA2 f9, f16, f25, f9
  449. FMA3 f10, f17, f25, f10
  450. FMA1 f12, f16, f26, f12
  451. FMA4 f15, f17, f26, f15
  452. FMA2 f13, f16, f27, f13
  453. FMA3 f14, f17, f27, f14
  454. LFD f16, 2 * SIZE(AO)
  455. LFD f17, 3 * SIZE(AO)
  456. LFD f24, 12 * SIZE(BO)
  457. LFD f25, 13 * SIZE(BO)
  458. LFD f26, 14 * SIZE(BO)
  459. LFD f27, 15 * SIZE(BO)
  460. addi AO, AO, 2 * SIZE
  461. addi BO, BO, 8 * SIZE
  462. bdnz LL(26)
  463. .align 4
  /*
   * LL(27): solve-and-store tail for the tile that produces one pair of
   * values (offsets 0 and 1) for each of CO1..CO4.
   *
   * Steps, in the order they appear below:
   *   1. LN/RT only: re-point AO and BO from AORIG and B using KK-1 (LN)
   *      or KK-4 (RT).
   *   2. Fold the partial accumulators f2/f3, f6/f7, f10/f11, f14/f15 into
   *      f0/f1, f4/f5, f8/f9, f12/f13.
   *   3. Replace each accumulator with (value loaded from the packed
   *      buffer) - (accumulator); the buffer is BO for LN/LT, AO otherwise.
   *   4. Run the variant-specific (LN / LT / RN / RT, CONJ or not)
   *      multiply and elimination sequence.
   *   5. Store the results back to the packed buffer and to CO1..CO4,
   *      reset f0..f15 from FZERO, then advance pointers and KK.
   *
   * NOTE(review): LFD/STFD/FADD/FSUB/FMUL/FMADD/FMSUB/FNMADD/FNMSUB are
   * macros defined outside this chunk. The comments below assume they
   * expand to the PowerPC instructions of the same names -- confirm.
   */
  464. LL(27):
  465. #if defined(LN) || defined(RT)
  466. #ifdef LN
  467. subi r0, KK, 1
  468. #else
  469. subi r0, KK, 4
  470. #endif
  /* TEMP = r0 << ZBASE_SHIFT is the AO byte offset; r0 << (2 + ZBASE_SHIFT)
     is the BO byte offset, i.e. four times as large. */
  471. slwi TEMP, r0, 0 + ZBASE_SHIFT
  472. slwi r0, r0, 2 + ZBASE_SHIFT
  473. add AO, AORIG, TEMP
  474. add BO, B, r0
  475. #endif
  /* Step 2: combine the two partial sums kept for every output value. */
  476. FADD f0, f0, f2
  477. FADD f1, f1, f3
  478. FADD f4, f4, f6
  479. FADD f5, f5, f7
  480. FADD f8, f8, f10
  481. FADD f9, f9, f11
  482. FADD f12, f12, f14
  483. FADD f13, f13, f15
  /* Step 3: result = packed value - accumulated sum. */
  484. #if defined(LN) || defined(LT)
  485. LFD f16, 0 * SIZE(BO)
  486. LFD f17, 1 * SIZE(BO)
  487. LFD f18, 2 * SIZE(BO)
  488. LFD f19, 3 * SIZE(BO)
  489. FSUB f0, f16, f0
  490. FSUB f1, f17, f1
  491. FSUB f4, f18, f4
  492. FSUB f5, f19, f5
  493. LFD f20, 4 * SIZE(BO)
  494. LFD f21, 5 * SIZE(BO)
  495. LFD f22, 6 * SIZE(BO)
  496. LFD f23, 7 * SIZE(BO)
  497. FSUB f8, f20, f8
  498. FSUB f9, f21, f9
  499. FSUB f12, f22, f12
  500. FSUB f13, f23, f13
  501. #else
  502. LFD f16, 0 * SIZE(AO)
  503. LFD f17, 1 * SIZE(AO)
  504. LFD f20, 2 * SIZE(AO)
  505. LFD f21, 3 * SIZE(AO)
  506. FSUB f0, f16, f0
  507. FSUB f1, f17, f1
  508. FSUB f4, f20, f4
  509. FSUB f5, f21, f5
  510. LFD f24, 4 * SIZE(AO)
  511. LFD f25, 5 * SIZE(AO)
  512. LFD f28, 6 * SIZE(AO)
  513. LFD f29, 7 * SIZE(AO)
  514. FSUB f8, f24, f8
  515. FSUB f9, f25, f9
  516. FSUB f12, f28, f12
  517. FSUB f13, f29, f13
  518. #endif
  /*
   * Step 4, LN: multiply each pair (f0,f1), (f4,f5), (f8,f9), (f12,f13) by
   * the pair (f28,f29) loaded from AO offsets 0 and 1.
   * Non-CONJ: first = f28*first - f29*second, second = f28*second + f29*first.
   * CONJ:     first = f28*first + f29*second, second = f28*second - f29*first.
   * The cross products are formed first in f16..f23, so each update uses the
   * un-modified partner value.
   * NOTE(review): (f28,f29) is presumably a pre-inverted diagonal entry --
   * verify against the packing routine.
   */
  519. #ifdef LN
  520. LFD f28, 0 * SIZE(AO)
  521. LFD f29, 1 * SIZE(AO)
  522. FMUL f16, f29, f1
  523. FMUL f17, f29, f0
  524. FMUL f18, f29, f5
  525. FMUL f19, f29, f4
  526. FMUL f20, f29, f9
  527. FMUL f21, f29, f8
  528. FMUL f22, f29, f13
  529. FMUL f23, f29, f12
  530. #ifndef CONJ
  531. FMSUB f0, f28, f0, f16
  532. FMADD f1, f28, f1, f17
  533. FMSUB f4, f28, f4, f18
  534. FMADD f5, f28, f5, f19
  535. FMSUB f8, f28, f8, f20
  536. FMADD f9, f28, f9, f21
  537. FMSUB f12, f28, f12, f22
  538. FMADD f13, f28, f13, f23
  539. #else
  540. FMADD f0, f28, f0, f16
  541. FMSUB f1, f28, f1, f17
  542. FMADD f4, f28, f4, f18
  543. FMSUB f5, f28, f5, f19
  544. FMADD f8, f28, f8, f20
  545. FMSUB f9, f28, f9, f21
  546. FMADD f12, f28, f12, f22
  547. FMSUB f13, f28, f13, f23
  548. #endif
  549. #endif
  /* Step 4, LT: same instruction sequence as the LN case, with the
     multiplier pair held in (f24,f25) instead of (f28,f29). */
  550. #ifdef LT
  551. LFD f24, 0 * SIZE(AO)
  552. LFD f25, 1 * SIZE(AO)
  553. FMUL f16, f25, f1
  554. FMUL f17, f25, f0
  555. FMUL f18, f25, f5
  556. FMUL f19, f25, f4
  557. FMUL f20, f25, f9
  558. FMUL f21, f25, f8
  559. FMUL f22, f25, f13
  560. FMUL f23, f25, f12
  561. #ifndef CONJ
  562. FMSUB f0, f24, f0, f16
  563. FMADD f1, f24, f1, f17
  564. FMSUB f4, f24, f4, f18
  565. FMADD f5, f24, f5, f19
  566. FMSUB f8, f24, f8, f20
  567. FMADD f9, f24, f9, f21
  568. FMSUB f12, f24, f12, f22
  569. FMADD f13, f24, f13, f23
  570. #else
  571. FMADD f0, f24, f0, f16
  572. FMSUB f1, f24, f1, f17
  573. FMADD f4, f24, f4, f18
  574. FMSUB f5, f24, f5, f19
  575. FMADD f8, f24, f8, f20
  576. FMSUB f9, f24, f9, f21
  577. FMADD f12, f24, f12, f22
  578. FMSUB f13, f24, f13, f23
  579. #endif
  580. #endif
  /*
   * Step 4, RN: pairs are processed in the order (f0,f1), (f4,f5), (f8,f9),
   * (f12,f13). Each pair is first multiplied by a coefficient pair from BO
   * (offsets 0/1, 10/11, 20/21, 30/31), then its product with further BO
   * pairs is subtracted from every later pair.
   */
  581. #ifdef RN
  582. LFD f24, 0 * SIZE(BO)
  583. LFD f25, 1 * SIZE(BO)
  584. LFD f26, 2 * SIZE(BO)
  585. LFD f27, 3 * SIZE(BO)
  586. LFD f28, 4 * SIZE(BO)
  587. LFD f29, 5 * SIZE(BO)
  588. LFD f30, 6 * SIZE(BO)
  589. LFD f31, 7 * SIZE(BO)
  590. FMUL f16, f25, f1
  591. FMUL f17, f25, f0
  592. #ifndef CONJ
  /* Scale (f0,f1) by (f24,f25), then update the other three pairs with it. */
  593. FMSUB f0, f24, f0, f16
  594. FMADD f1, f24, f1, f17
  595. FMADD f4, f27, f1, f4
  596. FNMSUB f5, f27, f0, f5
  597. FNMSUB f4, f26, f0, f4
  598. FNMSUB f5, f26, f1, f5
  599. FMADD f8, f29, f1, f8
  600. FNMSUB f9, f29, f0, f9
  601. FNMSUB f8, f28, f0, f8
  602. FNMSUB f9, f28, f1, f9
  603. FMADD f12, f31, f1, f12
  604. FNMSUB f13, f31, f0, f13
  605. FNMSUB f12, f30, f0, f12
  606. FNMSUB f13, f30, f1, f13
  /* Scale (f4,f5) by the pair at BO offsets 10/11, then update (f8,f9) and
     (f12,f13). */
  607. LFD f26, 10 * SIZE(BO)
  608. LFD f27, 11 * SIZE(BO)
  609. LFD f28, 12 * SIZE(BO)
  610. LFD f29, 13 * SIZE(BO)
  611. LFD f30, 14 * SIZE(BO)
  612. LFD f31, 15 * SIZE(BO)
  613. FMUL f16, f27, f5
  614. FMUL f17, f27, f4
  615. FMSUB f4, f26, f4, f16
  616. FMADD f5, f26, f5, f17
  617. FMADD f8, f29, f5, f8
  618. FNMSUB f9, f29, f4, f9
  619. FNMSUB f8, f28, f4, f8
  620. FNMSUB f9, f28, f5, f9
  621. FMADD f12, f31, f5, f12
  622. FNMSUB f13, f31, f4, f13
  623. FNMSUB f12, f30, f4, f12
  624. FNMSUB f13, f30, f5, f13
  /* Scale (f8,f9) by the pair at 20/21, update (f12,f13) with the pair at
     22/23, and finally scale (f12,f13) by the pair at 30/31. */
  625. LFD f26, 20 * SIZE(BO)
  626. LFD f27, 21 * SIZE(BO)
  627. LFD f28, 22 * SIZE(BO)
  628. LFD f29, 23 * SIZE(BO)
  629. LFD f30, 30 * SIZE(BO)
  630. LFD f31, 31 * SIZE(BO)
  631. FMUL f16, f27, f9
  632. FMUL f17, f27, f8
  633. FMSUB f8, f26, f8, f16
  634. FMADD f9, f26, f9, f17
  635. FMADD f12, f29, f9, f12
  636. FNMSUB f13, f29, f8, f13
  637. FNMSUB f12, f28, f8, f12
  638. FNMSUB f13, f28, f9, f13
  639. FMUL f16, f31, f13
  640. FMUL f17, f31, f12
  641. FMSUB f12, f30, f12, f16
  642. FMADD f13, f30, f13, f17
  643. #else
  /* CONJ build: same load offsets and pair order as above; only the
     FMADD/FMSUB/FNMADD selection (the sign pattern) differs. */
  644. FMADD f0, f24, f0, f16
  645. FMSUB f1, f24, f1, f17
  646. FMSUB f4, f27, f1, f4
  647. FNMADD f5, f27, f0, f5
  648. FNMADD f4, f26, f0, f4
  649. FNMADD f5, f26, f1, f5
  650. FMSUB f8, f29, f1, f8
  651. FNMADD f9, f29, f0, f9
  652. FNMADD f8, f28, f0, f8
  653. FNMADD f9, f28, f1, f9
  654. FMSUB f12, f31, f1, f12
  655. FNMADD f13, f31, f0, f13
  656. FNMADD f12, f30, f0, f12
  657. FNMADD f13, f30, f1, f13
  658. LFD f26, 10 * SIZE(BO)
  659. LFD f27, 11 * SIZE(BO)
  660. LFD f28, 12 * SIZE(BO)
  661. LFD f29, 13 * SIZE(BO)
  662. LFD f30, 14 * SIZE(BO)
  663. LFD f31, 15 * SIZE(BO)
  664. FMUL f16, f27, f5
  665. FMUL f17, f27, f4
  666. FMADD f4, f26, f4, f16
  667. FMSUB f5, f26, f5, f17
  668. FMSUB f8, f29, f5, f8
  669. FNMADD f9, f29, f4, f9
  670. FNMADD f8, f28, f4, f8
  671. FNMADD f9, f28, f5, f9
  672. FMSUB f12, f31, f5, f12
  673. FNMADD f13, f31, f4, f13
  674. FNMADD f12, f30, f4, f12
  675. FNMADD f13, f30, f5, f13
  676. LFD f26, 20 * SIZE(BO)
  677. LFD f27, 21 * SIZE(BO)
  678. LFD f28, 22 * SIZE(BO)
  679. LFD f29, 23 * SIZE(BO)
  680. LFD f30, 30 * SIZE(BO)
  681. LFD f31, 31 * SIZE(BO)
  682. FMUL f16, f27, f9
  683. FMUL f17, f27, f8
  684. FMADD f8, f26, f8, f16
  685. FMSUB f9, f26, f9, f17
  686. FMSUB f12, f29, f9, f12
  687. FNMADD f13, f29, f8, f13
  688. FNMADD f12, f28, f8, f12
  689. FNMADD f13, f28, f9, f13
  690. FMUL f16, f31, f13
  691. FMUL f17, f31, f12
  692. FMADD f12, f30, f12, f16
  693. FMSUB f13, f30, f13, f17
  694. #endif
  695. #endif
  /*
   * Step 4, RT: mirror image of the RN case. Pairs are processed in the
   * order (f12,f13), (f8,f9), (f4,f5), (f0,f1), and the coefficient pairs
   * are read from BO walking downwards: 30/31 then 28..24, 20/21 then
   * 18..16, 10/11 then 8/9, and finally 0/1.
   */
  696. #ifdef RT
  697. LFD f24, 30 * SIZE(BO)
  698. LFD f25, 31 * SIZE(BO)
  699. LFD f26, 28 * SIZE(BO)
  700. LFD f27, 29 * SIZE(BO)
  701. LFD f28, 26 * SIZE(BO)
  702. LFD f29, 27 * SIZE(BO)
  703. LFD f30, 24 * SIZE(BO)
  704. LFD f31, 25 * SIZE(BO)
  705. FMUL f16, f25, f13
  706. FMUL f17, f25, f12
  707. #ifndef CONJ
  /* Scale (f12,f13) by (f24,f25), then update the three earlier pairs. */
  708. FMSUB f12, f24, f12, f16
  709. FMADD f13, f24, f13, f17
  710. FMADD f8, f27, f13, f8
  711. FNMSUB f9, f27, f12, f9
  712. FNMSUB f8, f26, f12, f8
  713. FNMSUB f9, f26, f13, f9
  714. FMADD f4, f29, f13, f4
  715. FNMSUB f5, f29, f12, f5
  716. FNMSUB f4, f28, f12, f4
  717. FNMSUB f5, f28, f13, f5
  718. FMADD f0, f31, f13, f0
  719. FNMSUB f1, f31, f12, f1
  720. FNMSUB f0, f30, f12, f0
  721. FNMSUB f1, f30, f13, f1
  /* Scale (f8,f9) by the pair at 20/21, then update (f4,f5) and (f0,f1). */
  722. LFD f26, 20 * SIZE(BO)
  723. LFD f27, 21 * SIZE(BO)
  724. LFD f28, 18 * SIZE(BO)
  725. LFD f29, 19 * SIZE(BO)
  726. LFD f30, 16 * SIZE(BO)
  727. LFD f31, 17 * SIZE(BO)
  728. FMUL f16, f27, f9
  729. FMUL f17, f27, f8
  730. FMSUB f8, f26, f8, f16
  731. FMADD f9, f26, f9, f17
  732. FMADD f4, f29, f9, f4
  733. FNMSUB f5, f29, f8, f5
  734. FNMSUB f4, f28, f8, f4
  735. FNMSUB f5, f28, f9, f5
  736. FMADD f0, f31, f9, f0
  737. FNMSUB f1, f31, f8, f1
  738. FNMSUB f0, f30, f8, f0
  739. FNMSUB f1, f30, f9, f1
  /* Scale (f4,f5) by the pair at 10/11, update (f0,f1) with the pair at
     8/9, and finally scale (f0,f1) by the pair at 0/1. */
  740. LFD f26, 10 * SIZE(BO)
  741. LFD f27, 11 * SIZE(BO)
  742. LFD f28, 8 * SIZE(BO)
  743. LFD f29, 9 * SIZE(BO)
  744. LFD f30, 0 * SIZE(BO)
  745. LFD f31, 1 * SIZE(BO)
  746. FMUL f16, f27, f5
  747. FMUL f17, f27, f4
  748. FMSUB f4, f26, f4, f16
  749. FMADD f5, f26, f5, f17
  750. FMADD f0, f29, f5, f0
  751. FNMSUB f1, f29, f4, f1
  752. FNMSUB f0, f28, f4, f0
  753. FNMSUB f1, f28, f5, f1
  754. FMUL f16, f31, f1
  755. FMUL f17, f31, f0
  756. FMSUB f0, f30, f0, f16
  757. FMADD f1, f30, f1, f17
  758. #else
  /* CONJ build: same load offsets and pair order as above; only the
     FMADD/FMSUB/FNMADD selection (the sign pattern) differs. */
  759. FMADD f12, f24, f12, f16
  760. FMSUB f13, f24, f13, f17
  761. FMSUB f8, f27, f13, f8
  762. FNMADD f9, f27, f12, f9
  763. FNMADD f8, f26, f12, f8
  764. FNMADD f9, f26, f13, f9
  765. FMSUB f4, f29, f13, f4
  766. FNMADD f5, f29, f12, f5
  767. FNMADD f4, f28, f12, f4
  768. FNMADD f5, f28, f13, f5
  769. FMSUB f0, f31, f13, f0
  770. FNMADD f1, f31, f12, f1
  771. FNMADD f0, f30, f12, f0
  772. FNMADD f1, f30, f13, f1
  773. LFD f26, 20 * SIZE(BO)
  774. LFD f27, 21 * SIZE(BO)
  775. LFD f28, 18 * SIZE(BO)
  776. LFD f29, 19 * SIZE(BO)
  777. LFD f30, 16 * SIZE(BO)
  778. LFD f31, 17 * SIZE(BO)
  779. FMUL f16, f27, f9
  780. FMUL f17, f27, f8
  781. FMADD f8, f26, f8, f16
  782. FMSUB f9, f26, f9, f17
  783. FMSUB f4, f29, f9, f4
  784. FNMADD f5, f29, f8, f5
  785. FNMADD f4, f28, f8, f4
  786. FNMADD f5, f28, f9, f5
  787. FMSUB f0, f31, f9, f0
  788. FNMADD f1, f31, f8, f1
  789. FNMADD f0, f30, f8, f0
  790. FNMADD f1, f30, f9, f1
  791. LFD f26, 10 * SIZE(BO)
  792. LFD f27, 11 * SIZE(BO)
  793. LFD f28, 8 * SIZE(BO)
  794. LFD f29, 9 * SIZE(BO)
  795. LFD f30, 0 * SIZE(BO)
  796. LFD f31, 1 * SIZE(BO)
  797. FMUL f16, f27, f5
  798. FMUL f17, f27, f4
  799. FMADD f4, f26, f4, f16
  800. FMSUB f5, f26, f5, f17
  801. FMSUB f0, f29, f5, f0
  802. FNMADD f1, f29, f4, f1
  803. FNMADD f0, f28, f4, f0
  804. FNMADD f1, f28, f5, f1
  805. FMUL f16, f31, f1
  806. FMUL f17, f31, f0
  807. FMADD f0, f30, f0, f16
  808. FMSUB f1, f30, f1, f17
  809. #endif
  810. #endif
  /* Step 5. LN moves the four output pointers back by 2 * SIZE before the
     stores; every other variant advances them after the stores instead. */
  811. #ifdef LN
  812. subi CO1, CO1, 2 * SIZE
  813. subi CO2, CO2, 2 * SIZE
  814. subi CO3, CO3, 2 * SIZE
  815. subi CO4, CO4, 2 * SIZE
  816. #endif
  /* Write the solved values back to the same packed buffer they were read
     from in step 3 (BO for LN/LT, AO otherwise). */
  817. #if defined(LN) || defined(LT)
  818. STFD f0, 0 * SIZE(BO)
  819. STFD f1, 1 * SIZE(BO)
  820. STFD f4, 2 * SIZE(BO)
  821. STFD f5, 3 * SIZE(BO)
  822. STFD f8, 4 * SIZE(BO)
  823. STFD f9, 5 * SIZE(BO)
  824. STFD f12, 6 * SIZE(BO)
  825. STFD f13, 7 * SIZE(BO)
  826. #else
  827. STFD f0, 0 * SIZE(AO)
  828. STFD f1, 1 * SIZE(AO)
  829. STFD f4, 2 * SIZE(AO)
  830. STFD f5, 3 * SIZE(AO)
  831. STFD f8, 4 * SIZE(AO)
  832. STFD f9, 5 * SIZE(AO)
  833. STFD f12, 6 * SIZE(AO)
  834. STFD f13, 7 * SIZE(AO)
  835. #endif
  /* Store to the output columns. f0..f7 are reset from FZERO as soon as
     their values have been stored; f8..f15 follow after the CO3/CO4 stores. */
  836. STFD f0, 0 * SIZE(CO1)
  837. STFD f1, 1 * SIZE(CO1)
  838. STFD f4, 0 * SIZE(CO2)
  839. STFD f5, 1 * SIZE(CO2)
  840. lfs f0, FZERO
  841. fmr f1, f0
  842. fmr f2, f0
  843. fmr f3, f0
  844. fmr f4, f0
  845. fmr f5, f0
  846. fmr f6, f0
  847. fmr f7, f0
  848. STFD f8, 0 * SIZE(CO3)
  849. STFD f9, 1 * SIZE(CO3)
  850. STFD f12, 0 * SIZE(CO4)
  851. STFD f13, 1 * SIZE(CO4)
  852. fmr f8, f0
  853. fmr f9, f0
  854. fmr f10, f0
  855. fmr f11, f0
  856. fmr f12, f0
  857. fmr f13, f0
  858. fmr f14, f0
  859. fmr f15, f0
  860. #ifndef LN
  861. addi CO1, CO1, 2 * SIZE
  862. addi CO2, CO2, 2 * SIZE
  863. addi CO3, CO3, 2 * SIZE
  864. addi CO4, CO4, 2 * SIZE
  865. #endif
  /* RT: advance AORIG by K << ZBASE_SHIFT bytes. */
  866. #ifdef RT
  867. slwi r0, K, 0 + ZBASE_SHIFT
  868. add AORIG, AORIG, r0
  869. #endif
  /* LT/RN: skip the remaining K - KK steps in both packed buffers; the BO
     stride is four times the AO stride. */
  870. #if defined(LT) || defined(RN)
  871. sub TEMP, K, KK
  872. slwi r0, TEMP, 0 + ZBASE_SHIFT
  873. slwi TEMP, TEMP, 2 + ZBASE_SHIFT
  874. add AO, AO, r0
  875. add BO, BO, TEMP
  876. #endif
  /* KK moves by one for this tile: up for LT, down for LN. */
  877. #ifdef LT
  878. addi KK, KK, 1
  879. #endif
  880. #ifdef LN
  881. subi KK, KK, 1
  882. #endif
  883. .align 4
  /* LL(20): I = M >> 1 (arithmetic shift, record form). If the result is
     zero or negative, branch to LL(29), which is outside this chunk;
     otherwise fall through into LL(11). */
  884. LL(20):
  885. srawi. I, M, 1
  886. ble LL(29)
  887. .align 4
  /*
   * LL(11): set up one pass of the tile whose inner loop consumes four AO
   * values and eight BO values per step.
   *   - Preloads f16..f19 from AO offsets 0..3 and f20..f27 from the
   *     B-side buffer offsets 0..7.
   *   - Issues dcbtst on CO1..CO4 at displacement PREC.
   *   - Loads CTR with (step count) >> 3, the trip count of the unrolled
   *     loop LL(12); the step count is KK for LT/RN and K - KK otherwise.
   * The final "ble" tests the CR0 result of the record-form "srawi.";
   * the mtspr/mr in between do not modify CR0.
   */
  888. LL(11):
  889. #if defined(LT) || defined(RN)
  /* LT/RN: operands are read straight from AO and B; BO is set to B only
     after the preloads. */
  890. LFD f16, 0 * SIZE(AO)
  891. LFD f20, 0 * SIZE(B)
  892. LFD f17, 1 * SIZE(AO)
  893. LFD f21, 1 * SIZE(B)
  894. LFD f18, 2 * SIZE(AO)
  895. LFD f22, 2 * SIZE(B)
  896. LFD f19, 3 * SIZE(AO)
  897. LFD f23, 3 * SIZE(B)
  898. LFD f24, 4 * SIZE(B)
  899. LFD f25, 5 * SIZE(B)
  900. LFD f26, 6 * SIZE(B)
  901. LFD f27, 7 * SIZE(B)
  902. dcbtst CO1, PREC
  903. dcbtst CO2, PREC
  904. dcbtst CO3, PREC
  905. dcbtst CO4, PREC
  906. srawi. r0, KK, 3
  907. mtspr CTR, r0
  908. mr BO, B
  909. #else
  /* LN: first move AORIG back by K << (1 + ZBASE_SHIFT) bytes. */
  910. #ifdef LN
  911. slwi r0, K, 1 + ZBASE_SHIFT
  912. sub AORIG, AORIG, r0
  913. #endif
  /* Start at step KK in both buffers: AO = AORIG + KK << (1 + ZBASE_SHIFT),
     BO = B + KK << (2 + ZBASE_SHIFT). TEMP = K - KK is the step count. */
  914. slwi r0, KK, 1 + ZBASE_SHIFT
  915. slwi TEMP, KK, 2 + ZBASE_SHIFT
  916. add AO, AORIG, r0
  917. add BO, B, TEMP
  918. sub TEMP, K, KK
  919. LFD f16, 0 * SIZE(AO)
  920. LFD f20, 0 * SIZE(BO)
  921. LFD f17, 1 * SIZE(AO)
  922. LFD f21, 1 * SIZE(BO)
  923. LFD f18, 2 * SIZE(AO)
  924. LFD f22, 2 * SIZE(BO)
  925. LFD f19, 3 * SIZE(AO)
  926. LFD f23, 3 * SIZE(BO)
  927. LFD f24, 4 * SIZE(BO)
  928. LFD f25, 5 * SIZE(BO)
  929. LFD f26, 6 * SIZE(BO)
  930. LFD f27, 7 * SIZE(BO)
  931. dcbtst CO1, PREC
  932. dcbtst CO2, PREC
  933. dcbtst CO3, PREC
  934. dcbtst CO4, PREC
  935. srawi. r0, TEMP, 3
  936. mtspr CTR, r0
  937. #endif
  /* No full group of eight steps: go straight to the remainder handling. */
  938. ble LL(15)
  939. .align 4
  /*
   * LL(12): main accumulation loop, unrolled eight times. One pass consumes
   * 32 * SIZE of AO and 64 * SIZE of BO (see the addi pair at the bottom)
   * and is repeated CTR times via bdnz.
   *
   * Every unrolled step issues 32 FMA1..FMA4 updates on the accumulators
   * f0..f15. The AO operands alternate between f16..f19 (odd steps) and
   * f28..f31 (even steps); the BO operands are always f20..f27. While a
   * step computes, the loads interleaved with it fetch the operands of the
   * following step, so the instruction order matters.
   *
   * NOTE(review): FMA1..FMA4 are macros defined outside this chunk;
   * presumably they select fmadd/fnmsub-style forms depending on the
   * build variant -- confirm.
   */
  940. LL(12):
  /* Cache-touch hints at displacement PREA for both input streams. */
  941. dcbt AO, PREA
  942. dcbtst BO, PREA
  /* Unrolled step 1 of 8: AO operands f16..f19; prefetches f28..f31 from
     AO offsets 4..7 and f20..f27 from BO offsets 8..15. */
  943. FMA1 f0, f16, f20, f0
  944. FMA1 f2, f18, f20, f2
  945. FMA2 f1, f16, f21, f1
  946. FMA2 f3, f18, f21, f3
  947. LFD f28, 4 * SIZE(AO)
  948. LFD f29, 5 * SIZE(AO)
  949. LFD f30, 6 * SIZE(AO)
  950. LFD f31, 7 * SIZE(AO)
  951. FMA1 f4, f16, f22, f4
  952. FMA1 f6, f18, f22, f6
  953. FMA2 f5, f16, f23, f5
  954. FMA2 f7, f18, f23, f7
  955. FMA1 f8, f16, f24, f8
  956. FMA1 f10, f18, f24, f10
  957. FMA2 f9, f16, f25, f9
  958. FMA2 f11, f18, f25, f11
  959. FMA1 f12, f16, f26, f12
  960. FMA1 f14, f18, f26, f14
  961. FMA2 f13, f16, f27, f13
  962. FMA2 f15, f18, f27, f15
  963. FMA4 f1, f17, f20, f1
  964. FMA4 f3, f19, f20, f3
  965. FMA3 f0, f17, f21, f0
  966. FMA3 f2, f19, f21, f2
  967. FMA4 f5, f17, f22, f5
  968. FMA4 f7, f19, f22, f7
  969. FMA3 f4, f17, f23, f4
  970. FMA3 f6, f19, f23, f6
  971. LFD f20, 8 * SIZE(BO)
  972. LFD f21, 9 * SIZE(BO)
  973. LFD f22, 10 * SIZE(BO)
  974. LFD f23, 11 * SIZE(BO)
  975. FMA4 f9, f17, f24, f9
  976. FMA4 f11, f19, f24, f11
  977. FMA3 f8, f17, f25, f8
  978. FMA3 f10, f19, f25, f10
  979. FMA4 f13, f17, f26, f13
  980. FMA4 f15, f19, f26, f15
  981. FMA3 f12, f17, f27, f12
  982. FMA3 f14, f19, f27, f14
  983. LFD f24, 12 * SIZE(BO)
  984. LFD f25, 13 * SIZE(BO)
  985. LFD f26, 14 * SIZE(BO)
  986. LFD f27, 15 * SIZE(BO)
  /* Unrolled step 2 of 8: AO operands f28..f31; prefetches f16..f19 from
     AO offsets 8..11 and f20..f27 from BO offsets 16..23. */
  987. FMA1 f0, f28, f20, f0
  988. FMA1 f2, f30, f20, f2
  989. FMA2 f1, f28, f21, f1
  990. FMA2 f3, f30, f21, f3
  991. LFD f16, 8 * SIZE(AO)
  992. LFD f17, 9 * SIZE(AO)
  993. LFD f18, 10 * SIZE(AO)
  994. LFD f19, 11 * SIZE(AO)
  995. FMA1 f4, f28, f22, f4
  996. FMA1 f6, f30, f22, f6
  997. FMA2 f5, f28, f23, f5
  998. FMA2 f7, f30, f23, f7
  999. FMA1 f8, f28, f24, f8
  1000. FMA1 f10, f30, f24, f10
  1001. FMA2 f9, f28, f25, f9
  1002. FMA2 f11, f30, f25, f11
  1003. FMA1 f12, f28, f26, f12
  1004. FMA1 f14, f30, f26, f14
  1005. FMA2 f13, f28, f27, f13
  1006. FMA2 f15, f30, f27, f15
  1007. FMA4 f1, f29, f20, f1
  1008. FMA4 f3, f31, f20, f3
  1009. FMA3 f0, f29, f21, f0
  1010. FMA3 f2, f31, f21, f2
  1011. FMA4 f5, f29, f22, f5
  1012. FMA4 f7, f31, f22, f7
  1013. FMA3 f4, f29, f23, f4
  1014. FMA3 f6, f31, f23, f6
  1015. LFD f20, 16 * SIZE(BO)
  1016. LFD f21, 17 * SIZE(BO)
  1017. LFD f22, 18 * SIZE(BO)
  1018. LFD f23, 19 * SIZE(BO)
  1019. FMA4 f9, f29, f24, f9
  1020. FMA4 f11, f31, f24, f11
  1021. FMA3 f8, f29, f25, f8
  1022. FMA3 f10, f31, f25, f10
  1023. FMA4 f13, f29, f26, f13
  1024. FMA4 f15, f31, f26, f15
  1025. FMA3 f12, f29, f27, f12
  1026. FMA3 f14, f31, f27, f14
  1027. LFD f24, 20 * SIZE(BO)
  1028. LFD f25, 21 * SIZE(BO)
  1029. LFD f26, 22 * SIZE(BO)
  1030. LFD f27, 23 * SIZE(BO)
  /* Unrolled step 3 of 8: AO operands f16..f19; prefetches f28..f31 from
     AO offsets 12..15 and f20..f27 from BO offsets 24..31. */
  1031. FMA1 f0, f16, f20, f0
  1032. FMA1 f2, f18, f20, f2
  1033. FMA2 f1, f16, f21, f1
  1034. FMA2 f3, f18, f21, f3
  1035. LFD f28, 12 * SIZE(AO)
  1036. LFD f29, 13 * SIZE(AO)
  1037. LFD f30, 14 * SIZE(AO)
  1038. LFD f31, 15 * SIZE(AO)
  1039. FMA1 f4, f16, f22, f4
  1040. FMA1 f6, f18, f22, f6
  1041. FMA2 f5, f16, f23, f5
  1042. FMA2 f7, f18, f23, f7
  1043. FMA1 f8, f16, f24, f8
  1044. FMA1 f10, f18, f24, f10
  1045. FMA2 f9, f16, f25, f9
  1046. FMA2 f11, f18, f25, f11
  1047. FMA1 f12, f16, f26, f12
  1048. FMA1 f14, f18, f26, f14
  1049. FMA2 f13, f16, f27, f13
  1050. FMA2 f15, f18, f27, f15
  1051. FMA4 f1, f17, f20, f1
  1052. FMA4 f3, f19, f20, f3
  1053. FMA3 f0, f17, f21, f0
  1054. FMA3 f2, f19, f21, f2
  1055. FMA4 f5, f17, f22, f5
  1056. FMA4 f7, f19, f22, f7
  1057. FMA3 f4, f17, f23, f4
  1058. FMA3 f6, f19, f23, f6
  1059. LFD f20, 24 * SIZE(BO)
  1060. LFD f21, 25 * SIZE(BO)
  1061. LFD f22, 26 * SIZE(BO)
  1062. LFD f23, 27 * SIZE(BO)
  1063. FMA4 f9, f17, f24, f9
  1064. FMA4 f11, f19, f24, f11
  1065. FMA3 f8, f17, f25, f8
  1066. FMA3 f10, f19, f25, f10
  1067. FMA4 f13, f17, f26, f13
  1068. FMA4 f15, f19, f26, f15
  1069. FMA3 f12, f17, f27, f12
  1070. FMA3 f14, f19, f27, f14
  1071. LFD f24, 28 * SIZE(BO)
  1072. LFD f25, 29 * SIZE(BO)
  1073. LFD f26, 30 * SIZE(BO)
  1074. LFD f27, 31 * SIZE(BO)
  /* Unrolled step 4 of 8: AO operands f28..f31; prefetches f16..f19 from
     AO offsets 16..19 and f20..f27 from BO offsets 32..39. */
  1075. FMA1 f0, f28, f20, f0
  1076. FMA1 f2, f30, f20, f2
  1077. FMA2 f1, f28, f21, f1
  1078. FMA2 f3, f30, f21, f3
  1079. LFD f16, 16 * SIZE(AO)
  1080. LFD f17, 17 * SIZE(AO)
  1081. LFD f18, 18 * SIZE(AO)
  1082. LFD f19, 19 * SIZE(AO)
  1083. FMA1 f4, f28, f22, f4
  1084. FMA1 f6, f30, f22, f6
  1085. FMA2 f5, f28, f23, f5
  1086. FMA2 f7, f30, f23, f7
  1087. FMA1 f8, f28, f24, f8
  1088. FMA1 f10, f30, f24, f10
  1089. FMA2 f9, f28, f25, f9
  1090. FMA2 f11, f30, f25, f11
  1091. FMA1 f12, f28, f26, f12
  1092. FMA1 f14, f30, f26, f14
  1093. FMA2 f13, f28, f27, f13
  1094. FMA2 f15, f30, f27, f15
  1095. FMA4 f1, f29, f20, f1
  1096. FMA4 f3, f31, f20, f3
  1097. FMA3 f0, f29, f21, f0
  1098. FMA3 f2, f31, f21, f2
  1099. FMA4 f5, f29, f22, f5
  1100. FMA4 f7, f31, f22, f7
  1101. FMA3 f4, f29, f23, f4
  1102. FMA3 f6, f31, f23, f6
  1103. LFD f20, 32 * SIZE(BO)
  1104. LFD f21, 33 * SIZE(BO)
  1105. LFD f22, 34 * SIZE(BO)
  1106. LFD f23, 35 * SIZE(BO)
  1107. FMA4 f9, f29, f24, f9
  1108. FMA4 f11, f31, f24, f11
  1109. FMA3 f8, f29, f25, f8
  1110. FMA3 f10, f31, f25, f10
  1111. FMA4 f13, f29, f26, f13
  1112. FMA4 f15, f31, f26, f15
  1113. FMA3 f12, f29, f27, f12
  1114. FMA3 f14, f31, f27, f14
  1115. LFD f24, 36 * SIZE(BO)
  1116. LFD f25, 37 * SIZE(BO)
  1117. LFD f26, 38 * SIZE(BO)
  1118. LFD f27, 39 * SIZE(BO)
  /* Unrolled step 5 of 8: AO operands f16..f19; prefetches f28..f31 from
     AO offsets 20..23 and f20..f27 from BO offsets 40..47. */
  1119. FMA1 f0, f16, f20, f0
  1120. FMA1 f2, f18, f20, f2
  1121. FMA2 f1, f16, f21, f1
  1122. FMA2 f3, f18, f21, f3
  1123. LFD f28, 20 * SIZE(AO)
  1124. LFD f29, 21 * SIZE(AO)
  1125. LFD f30, 22 * SIZE(AO)
  1126. LFD f31, 23 * SIZE(AO)
  1127. FMA1 f4, f16, f22, f4
  1128. FMA1 f6, f18, f22, f6
  1129. FMA2 f5, f16, f23, f5
  1130. FMA2 f7, f18, f23, f7
  1131. FMA1 f8, f16, f24, f8
  1132. FMA1 f10, f18, f24, f10
  1133. FMA2 f9, f16, f25, f9
  1134. FMA2 f11, f18, f25, f11
  1135. FMA1 f12, f16, f26, f12
  1136. FMA1 f14, f18, f26, f14
  1137. FMA2 f13, f16, f27, f13
  1138. FMA2 f15, f18, f27, f15
  1139. FMA4 f1, f17, f20, f1
  1140. FMA4 f3, f19, f20, f3
  1141. FMA3 f0, f17, f21, f0
  1142. FMA3 f2, f19, f21, f2
  1143. FMA4 f5, f17, f22, f5
  1144. FMA4 f7, f19, f22, f7
  1145. FMA3 f4, f17, f23, f4
  1146. FMA3 f6, f19, f23, f6
  1147. LFD f20, 40 * SIZE(BO)
  1148. LFD f21, 41 * SIZE(BO)
  1149. LFD f22, 42 * SIZE(BO)
  1150. LFD f23, 43 * SIZE(BO)
  1151. FMA4 f9, f17, f24, f9
  1152. FMA4 f11, f19, f24, f11
  1153. FMA3 f8, f17, f25, f8
  1154. FMA3 f10, f19, f25, f10
  1155. FMA4 f13, f17, f26, f13
  1156. FMA4 f15, f19, f26, f15
  1157. FMA3 f12, f17, f27, f12
  1158. FMA3 f14, f19, f27, f14
  1159. LFD f24, 44 * SIZE(BO)
  1160. LFD f25, 45 * SIZE(BO)
  1161. LFD f26, 46 * SIZE(BO)
  1162. LFD f27, 47 * SIZE(BO)
  /* Unrolled step 6 of 8: AO operands f28..f31; prefetches f16..f19 from
     AO offsets 24..27 and f20..f27 from BO offsets 48..55. */
  1163. FMA1 f0, f28, f20, f0
  1164. FMA1 f2, f30, f20, f2
  1165. FMA2 f1, f28, f21, f1
  1166. FMA2 f3, f30, f21, f3
  1167. LFD f16, 24 * SIZE(AO)
  1168. LFD f17, 25 * SIZE(AO)
  1169. LFD f18, 26 * SIZE(AO)
  1170. LFD f19, 27 * SIZE(AO)
  1171. FMA1 f4, f28, f22, f4
  1172. FMA1 f6, f30, f22, f6
  1173. FMA2 f5, f28, f23, f5
  1174. FMA2 f7, f30, f23, f7
  1175. FMA1 f8, f28, f24, f8
  1176. FMA1 f10, f30, f24, f10
  1177. FMA2 f9, f28, f25, f9
  1178. FMA2 f11, f30, f25, f11
  1179. FMA1 f12, f28, f26, f12
  1180. FMA1 f14, f30, f26, f14
  1181. FMA2 f13, f28, f27, f13
  1182. FMA2 f15, f30, f27, f15
  1183. FMA4 f1, f29, f20, f1
  1184. FMA4 f3, f31, f20, f3
  1185. FMA3 f0, f29, f21, f0
  1186. FMA3 f2, f31, f21, f2
  1187. FMA4 f5, f29, f22, f5
  1188. FMA4 f7, f31, f22, f7
  1189. FMA3 f4, f29, f23, f4
  1190. FMA3 f6, f31, f23, f6
  1191. LFD f20, 48 * SIZE(BO)
  1192. LFD f21, 49 * SIZE(BO)
  1193. LFD f22, 50 * SIZE(BO)
  1194. LFD f23, 51 * SIZE(BO)
  1195. FMA4 f9, f29, f24, f9
  1196. FMA4 f11, f31, f24, f11
  1197. FMA3 f8, f29, f25, f8
  1198. FMA3 f10, f31, f25, f10
  1199. FMA4 f13, f29, f26, f13
  1200. FMA4 f15, f31, f26, f15
  1201. FMA3 f12, f29, f27, f12
  1202. FMA3 f14, f31, f27, f14
  1203. LFD f24, 52 * SIZE(BO)
  1204. LFD f25, 53 * SIZE(BO)
  1205. LFD f26, 54 * SIZE(BO)
  1206. LFD f27, 55 * SIZE(BO)
  /* Unrolled step 7 of 8: AO operands f16..f19; prefetches f28..f31 from
     AO offsets 28..31 and f20..f27 from BO offsets 56..63. */
  1207. FMA1 f0, f16, f20, f0
  1208. FMA1 f2, f18, f20, f2
  1209. FMA2 f1, f16, f21, f1
  1210. FMA2 f3, f18, f21, f3
  1211. LFD f28, 28 * SIZE(AO)
  1212. LFD f29, 29 * SIZE(AO)
  1213. LFD f30, 30 * SIZE(AO)
  1214. LFD f31, 31 * SIZE(AO)
  1215. FMA1 f4, f16, f22, f4
  1216. FMA1 f6, f18, f22, f6
  1217. FMA2 f5, f16, f23, f5
  1218. FMA2 f7, f18, f23, f7
  1219. FMA1 f8, f16, f24, f8
  1220. FMA1 f10, f18, f24, f10
  1221. FMA2 f9, f16, f25, f9
  1222. FMA2 f11, f18, f25, f11
  1223. FMA1 f12, f16, f26, f12
  1224. FMA1 f14, f18, f26, f14
  1225. FMA2 f13, f16, f27, f13
  1226. FMA2 f15, f18, f27, f15
  1227. FMA4 f1, f17, f20, f1
  1228. FMA4 f3, f19, f20, f3
  1229. FMA3 f0, f17, f21, f0
  1230. FMA3 f2, f19, f21, f2
  1231. FMA4 f5, f17, f22, f5
  1232. FMA4 f7, f19, f22, f7
  1233. FMA3 f4, f17, f23, f4
  1234. FMA3 f6, f19, f23, f6
  1235. LFD f20, 56 * SIZE(BO)
  1236. LFD f21, 57 * SIZE(BO)
  1237. LFD f22, 58 * SIZE(BO)
  1238. LFD f23, 59 * SIZE(BO)
  1239. FMA4 f9, f17, f24, f9
  1240. FMA4 f11, f19, f24, f11
  1241. FMA3 f8, f17, f25, f8
  1242. FMA3 f10, f19, f25, f10
  1243. FMA4 f13, f17, f26, f13
  1244. FMA4 f15, f19, f26, f15
  1245. FMA3 f12, f17, f27, f12
  1246. FMA3 f14, f19, f27, f14
  1247. LFD f24, 60 * SIZE(BO)
  1248. LFD f25, 61 * SIZE(BO)
  1249. LFD f26, 62 * SIZE(BO)
  1250. LFD f27, 63 * SIZE(BO)
  /* Unrolled step 8 of 8: AO operands f28..f31. Its prefetches (AO offsets
     32..35, BO offsets 64..71) are the offset-0 operands of the next pass
     once AO and BO have been advanced below. */
  1251. FMA1 f0, f28, f20, f0
  1252. FMA1 f2, f30, f20, f2
  1253. FMA2 f1, f28, f21, f1
  1254. FMA2 f3, f30, f21, f3
  1255. LFD f16, 32 * SIZE(AO)
  1256. LFD f17, 33 * SIZE(AO)
  1257. LFD f18, 34 * SIZE(AO)
  1258. LFD f19, 35 * SIZE(AO)
  1259. FMA1 f4, f28, f22, f4
  1260. FMA1 f6, f30, f22, f6
  1261. FMA2 f5, f28, f23, f5
  1262. FMA2 f7, f30, f23, f7
  1263. FMA1 f8, f28, f24, f8
  1264. FMA1 f10, f30, f24, f10
  1265. FMA2 f9, f28, f25, f9
  1266. FMA2 f11, f30, f25, f11
  1267. FMA1 f12, f28, f26, f12
  1268. FMA1 f14, f30, f26, f14
  1269. FMA2 f13, f28, f27, f13
  1270. FMA2 f15, f30, f27, f15
  1271. FMA4 f1, f29, f20, f1
  1272. FMA4 f3, f31, f20, f3
  1273. FMA3 f0, f29, f21, f0
  1274. FMA3 f2, f31, f21, f2
  1275. FMA4 f5, f29, f22, f5
  1276. FMA4 f7, f31, f22, f7
  1277. FMA3 f4, f29, f23, f4
  1278. FMA3 f6, f31, f23, f6
  1279. LFD f20, 64 * SIZE(BO)
  1280. LFD f21, 65 * SIZE(BO)
  1281. LFD f22, 66 * SIZE(BO)
  1282. LFD f23, 67 * SIZE(BO)
  1283. FMA4 f9, f29, f24, f9
  1284. FMA4 f11, f31, f24, f11
  1285. FMA3 f8, f29, f25, f8
  1286. FMA3 f10, f31, f25, f10
  1287. FMA4 f13, f29, f26, f13
  1288. FMA4 f15, f31, f26, f15
  1289. FMA3 f12, f29, f27, f12
  1290. FMA3 f14, f31, f27, f14
  1291. LFD f24, 68 * SIZE(BO)
  1292. LFD f25, 69 * SIZE(BO)
  1293. LFD f26, 70 * SIZE(BO)
  1294. LFD f27, 71 * SIZE(BO)
  /* Advance past the eight steps just consumed and loop while CTR != 0. */
  1295. addi AO, AO, 32 * SIZE
  1296. addi BO, BO, 64 * SIZE
  1297. bdnz LL(12)
  1298. .align 4
  /* LL(15): remainder setup. CTR = (step count) & 7, where the step count
     is KK for LT/RN and TEMP otherwise. The record-form "andi." sets CR0,
     and the result cannot be negative, so "ble" skips the remainder loop
     exactly when no steps are left. */
  1299. LL(15):
  1300. #if defined(LT) || defined(RN)
  1301. andi. r0, KK, 7
  1302. #else
  1303. andi. r0, TEMP, 7
  1304. #endif
  1305. mtspr CTR, r0
  1306. ble LL(18)
  1307. .align 4
  /*
   * LL(16): remainder loop, one step per CTR iteration.
   * Uses the operands already held in f16..f19 (AO side) and f20..f27
   * (BO side) for 32 FMA1..FMA4 updates of f0..f15, then loads the next
   * step's operands (AO offsets 4..7, BO offsets 8..15) and advances AO by
   * 4 * SIZE and BO by 8 * SIZE.
   * The loads run on the final iteration as well, so they read one step
   * beyond the last one consumed.
   * NOTE(review): this assumes the packed buffers are readable there --
   * confirm against the allocation in the caller.
   */
  1308. LL(16):
  1309. FMA1 f0, f16, f20, f0
  1310. FMA1 f2, f18, f20, f2
  1311. FMA2 f1, f16, f21, f1
  1312. FMA2 f3, f18, f21, f3
  1313. FMA1 f4, f16, f22, f4
  1314. FMA1 f6, f18, f22, f6
  1315. FMA2 f5, f16, f23, f5
  1316. FMA2 f7, f18, f23, f7
  1317. FMA1 f8, f16, f24, f8
  1318. FMA1 f10, f18, f24, f10
  1319. FMA2 f9, f16, f25, f9
  1320. FMA2 f11, f18, f25, f11
  1321. FMA1 f12, f16, f26, f12
  1322. FMA1 f14, f18, f26, f14
  1323. FMA2 f13, f16, f27, f13
  1324. FMA2 f15, f18, f27, f15
  /* Second half: updates that use the other AO operand of each pair
     (f17 and f19). */
  1325. FMA4 f1, f17, f20, f1
  1326. FMA4 f3, f19, f20, f3
  1327. FMA3 f0, f17, f21, f0
  1328. FMA3 f2, f19, f21, f2
  1329. FMA4 f5, f17, f22, f5
  1330. FMA4 f7, f19, f22, f7
  1331. FMA3 f4, f17, f23, f4
  1332. FMA3 f6, f19, f23, f6
  1333. FMA4 f9, f17, f24, f9
  1334. FMA4 f11, f19, f24, f11
  1335. FMA3 f8, f17, f25, f8
  1336. FMA3 f10, f19, f25, f10
  1337. FMA4 f13, f17, f26, f13
  1338. FMA4 f15, f19, f26, f15
  1339. FMA3 f12, f17, f27, f12
  1340. FMA3 f14, f19, f27, f14
  /* Fetch the operands for the next step. */
  1341. LFD f16, 4 * SIZE(AO)
  1342. LFD f17, 5 * SIZE(AO)
  1343. LFD f18, 6 * SIZE(AO)
  1344. LFD f19, 7 * SIZE(AO)
  1345. LFD f20, 8 * SIZE(BO)
  1346. LFD f21, 9 * SIZE(BO)
  1347. LFD f22, 10 * SIZE(BO)
  1348. LFD f23, 11 * SIZE(BO)
  1349. LFD f24, 12 * SIZE(BO)
  1350. LFD f25, 13 * SIZE(BO)
  1351. LFD f26, 14 * SIZE(BO)
  1352. LFD f27, 15 * SIZE(BO)
  1353. addi AO, AO, 4 * SIZE
  1354. addi BO, BO, 8 * SIZE
  1355. bdnz LL(16)
  1356. .align 4
  1357. LL(18):
  1358. #if defined(LN) || defined(RT)
  1359. #ifdef LN
  1360. subi r0, KK, 2
  1361. #else
  1362. subi r0, KK, 4
  1363. #endif
  1364. slwi TEMP, r0, 1 + ZBASE_SHIFT
  1365. slwi r0, r0, 2 + ZBASE_SHIFT
  1366. add AO, AORIG, TEMP
  1367. add BO, B, r0
  1368. #endif
  1369. #if defined(LN) || defined(LT)
  1370. LFD f16, 0 * SIZE(BO)
  1371. LFD f17, 1 * SIZE(BO)
  1372. LFD f18, 2 * SIZE(BO)
  1373. LFD f19, 3 * SIZE(BO)
  1374. FSUB f0, f16, f0
  1375. FSUB f1, f17, f1
  1376. FSUB f4, f18, f4
  1377. FSUB f5, f19, f5
  1378. LFD f20, 4 * SIZE(BO)
  1379. LFD f21, 5 * SIZE(BO)
  1380. LFD f22, 6 * SIZE(BO)
  1381. LFD f23, 7 * SIZE(BO)
  1382. FSUB f8, f20, f8
  1383. FSUB f9, f21, f9
  1384. FSUB f12, f22, f12
  1385. FSUB f13, f23, f13
  1386. LFD f24, 8 * SIZE(BO)
  1387. LFD f25, 9 * SIZE(BO)
  1388. LFD f26, 10 * SIZE(BO)
  1389. LFD f27, 11 * SIZE(BO)
  1390. FSUB f2, f24, f2
  1391. FSUB f3, f25, f3
  1392. FSUB f6, f26, f6
  1393. FSUB f7, f27, f7
  1394. LFD f28, 12 * SIZE(BO)
  1395. LFD f29, 13 * SIZE(BO)
  1396. LFD f30, 14 * SIZE(BO)
  1397. LFD f31, 15 * SIZE(BO)
  1398. FSUB f10, f28, f10
  1399. FSUB f11, f29, f11
  1400. FSUB f14, f30, f14
  1401. FSUB f15, f31, f15
  1402. #else
  1403. LFD f16, 0 * SIZE(AO)
  1404. LFD f17, 1 * SIZE(AO)
  1405. LFD f18, 2 * SIZE(AO)
  1406. LFD f19, 3 * SIZE(AO)
  1407. FSUB f0, f16, f0
  1408. FSUB f1, f17, f1
  1409. FSUB f2, f18, f2
  1410. FSUB f3, f19, f3
  1411. LFD f20, 4 * SIZE(AO)
  1412. LFD f21, 5 * SIZE(AO)
  1413. LFD f22, 6 * SIZE(AO)
  1414. LFD f23, 7 * SIZE(AO)
  1415. FSUB f4, f20, f4
  1416. FSUB f5, f21, f5
  1417. FSUB f6, f22, f6
  1418. FSUB f7, f23, f7
  1419. LFD f24, 8 * SIZE(AO)
  1420. LFD f25, 9 * SIZE(AO)
  1421. LFD f26, 10 * SIZE(AO)
  1422. LFD f27, 11 * SIZE(AO)
  1423. FSUB f8, f24, f8
  1424. FSUB f9, f25, f9
  1425. FSUB f10, f26, f10
  1426. FSUB f11, f27, f11
  1427. LFD f28, 12 * SIZE(AO)
  1428. LFD f29, 13 * SIZE(AO)
  1429. LFD f30, 14 * SIZE(AO)
  1430. LFD f31, 15 * SIZE(AO)
  1431. FSUB f12, f28, f12
  1432. FSUB f13, f29, f13
  1433. FSUB f14, f30, f14
  1434. FSUB f15, f31, f15
  1435. #endif
  1436. #ifdef LN
  1437. LFD f24, 6 * SIZE(AO)
  1438. LFD f25, 7 * SIZE(AO)
  1439. LFD f26, 4 * SIZE(AO)
  1440. LFD f27, 5 * SIZE(AO)
  1441. LFD f28, 0 * SIZE(AO)
  1442. LFD f29, 1 * SIZE(AO)
  1443. FMUL f16, f25, f3
  1444. FMUL f17, f25, f2
  1445. FMUL f18, f25, f7
  1446. FMUL f19, f25, f6
  1447. FMUL f20, f25, f11
  1448. FMUL f21, f25, f10
  1449. FMUL f22, f25, f15
  1450. FMUL f23, f25, f14
  1451. #ifndef CONJ
  1452. FMSUB f2, f24, f2, f16
  1453. FMADD f3, f24, f3, f17
  1454. FMSUB f6, f24, f6, f18
  1455. FMADD f7, f24, f7, f19
  1456. FMSUB f10, f24, f10, f20
  1457. FMADD f11, f24, f11, f21
  1458. FMSUB f14, f24, f14, f22
  1459. FMADD f15, f24, f15, f23
  1460. FMADD f0, f27, f3, f0
  1461. FNMSUB f1, f27, f2, f1
  1462. FMADD f4, f27, f7, f4
  1463. FNMSUB f5, f27, f6, f5
  1464. FMADD f8, f27, f11, f8
  1465. FNMSUB f9, f27, f10, f9
  1466. FMADD f12, f27, f15, f12
  1467. FNMSUB f13, f27, f14, f13
  1468. FNMSUB f0, f26, f2, f0
  1469. FNMSUB f1, f26, f3, f1
  1470. FNMSUB f4, f26, f6, f4
  1471. FNMSUB f5, f26, f7, f5
  1472. FNMSUB f8, f26, f10, f8
  1473. FNMSUB f9, f26, f11, f9
  1474. FNMSUB f12, f26, f14, f12
  1475. FNMSUB f13, f26, f15, f13
  1476. FMUL f16, f29, f1
  1477. FMUL f17, f29, f0
  1478. FMUL f18, f29, f5
  1479. FMUL f19, f29, f4
  1480. FMUL f20, f29, f9
  1481. FMUL f21, f29, f8
  1482. FMUL f22, f29, f13
  1483. FMUL f23, f29, f12
  1484. FMSUB f0, f28, f0, f16
  1485. FMADD f1, f28, f1, f17
  1486. FMSUB f4, f28, f4, f18
  1487. FMADD f5, f28, f5, f19
  1488. FMSUB f8, f28, f8, f20
  1489. FMADD f9, f28, f9, f21
  1490. FMSUB f12, f28, f12, f22
  1491. FMADD f13, f28, f13, f23
  1492. #else
  1493. FMADD f2, f24, f2, f16
  1494. FMSUB f3, f24, f3, f17
  1495. FMADD f6, f24, f6, f18
  1496. FMSUB f7, f24, f7, f19
  1497. FMADD f10, f24, f10, f20
  1498. FMSUB f11, f24, f11, f21
  1499. FMADD f14, f24, f14, f22
  1500. FMSUB f15, f24, f15, f23
  1501. FMSUB f0, f27, f3, f0
  1502. FNMADD f1, f27, f2, f1
  1503. FMSUB f4, f27, f7, f4
  1504. FNMADD f5, f27, f6, f5
  1505. FMSUB f8, f27, f11, f8
  1506. FNMADD f9, f27, f10, f9
  1507. FMSUB f12, f27, f15, f12
  1508. FNMADD f13, f27, f14, f13
  1509. FNMADD f0, f26, f2, f0
  1510. FNMADD f1, f26, f3, f1
  1511. FNMADD f4, f26, f6, f4
  1512. FNMADD f5, f26, f7, f5
  1513. FNMADD f8, f26, f10, f8
  1514. FNMADD f9, f26, f11, f9
  1515. FNMADD f12, f26, f14, f12
  1516. FNMADD f13, f26, f15, f13
  1517. FMUL f16, f29, f1
  1518. FMUL f17, f29, f0
  1519. FMUL f18, f29, f5
  1520. FMUL f19, f29, f4
  1521. FMUL f20, f29, f9
  1522. FMUL f21, f29, f8
  1523. FMUL f22, f29, f13
  1524. FMUL f23, f29, f12
  1525. FMADD f0, f28, f0, f16
  1526. FMSUB f1, f28, f1, f17
  1527. FMADD f4, f28, f4, f18
  1528. FMSUB f5, f28, f5, f19
  1529. FMADD f8, f28, f8, f20
  1530. FMSUB f9, f28, f9, f21
  1531. FMADD f12, f28, f12, f22
  1532. FMSUB f13, f28, f13, f23
  1533. #endif
  1534. #endif
  1535. #ifdef LT
  1536. LFD f24, 0 * SIZE(AO)
  1537. LFD f25, 1 * SIZE(AO)
  1538. LFD f26, 2 * SIZE(AO)
  1539. LFD f27, 3 * SIZE(AO)
  1540. LFD f28, 6 * SIZE(AO)
  1541. LFD f29, 7 * SIZE(AO)
  1542. FMUL f16, f25, f1
  1543. FMUL f17, f25, f0
  1544. FMUL f18, f25, f5
  1545. FMUL f19, f25, f4
  1546. FMUL f20, f25, f9
  1547. FMUL f21, f25, f8
  1548. FMUL f22, f25, f13
  1549. FMUL f23, f25, f12
  1550. #ifndef CONJ
  1551. FMSUB f0, f24, f0, f16
  1552. FMADD f1, f24, f1, f17
  1553. FMSUB f4, f24, f4, f18
  1554. FMADD f5, f24, f5, f19
  1555. FMSUB f8, f24, f8, f20
  1556. FMADD f9, f24, f9, f21
  1557. FMSUB f12, f24, f12, f22
  1558. FMADD f13, f24, f13, f23
  1559. FMADD f2, f27, f1, f2
  1560. FNMSUB f3, f27, f0, f3
  1561. FMADD f6, f27, f5, f6
  1562. FNMSUB f7, f27, f4, f7
  1563. FMADD f10, f27, f9, f10
  1564. FNMSUB f11, f27, f8, f11
  1565. FMADD f14, f27, f13, f14
  1566. FNMSUB f15, f27, f12, f15
  1567. FNMSUB f2, f26, f0, f2
  1568. FNMSUB f3, f26, f1, f3
  1569. FNMSUB f6, f26, f4, f6
  1570. FNMSUB f7, f26, f5, f7
  1571. FNMSUB f10, f26, f8, f10
  1572. FNMSUB f11, f26, f9, f11
  1573. FNMSUB f14, f26, f12, f14
  1574. FNMSUB f15, f26, f13, f15
  1575. FMUL f16, f29, f3
  1576. FMUL f17, f29, f2
  1577. FMUL f18, f29, f7
  1578. FMUL f19, f29, f6
  1579. FMUL f20, f29, f11
  1580. FMUL f21, f29, f10
  1581. FMUL f22, f29, f15
  1582. FMUL f23, f29, f14
  1583. FMSUB f2, f28, f2, f16
  1584. FMADD f3, f28, f3, f17
  1585. FMSUB f6, f28, f6, f18
  1586. FMADD f7, f28, f7, f19
  1587. FMSUB f10, f28, f10, f20
  1588. FMADD f11, f28, f11, f21
  1589. FMSUB f14, f28, f14, f22
  1590. FMADD f15, f28, f15, f23
  1591. #else
  1592. FMADD f0, f24, f0, f16
  1593. FMSUB f1, f24, f1, f17
  1594. FMADD f4, f24, f4, f18
  1595. FMSUB f5, f24, f5, f19
  1596. FMADD f8, f24, f8, f20
  1597. FMSUB f9, f24, f9, f21
  1598. FMADD f12, f24, f12, f22
  1599. FMSUB f13, f24, f13, f23
  1600. FMSUB f2, f27, f1, f2
  1601. FNMADD f3, f27, f0, f3
  1602. FMSUB f6, f27, f5, f6
  1603. FNMADD f7, f27, f4, f7
  1604. FMSUB f10, f27, f9, f10
  1605. FNMADD f11, f27, f8, f11
  1606. FMSUB f14, f27, f13, f14
  1607. FNMADD f15, f27, f12, f15
  1608. FNMADD f2, f26, f0, f2
  1609. FNMADD f3, f26, f1, f3
  1610. FNMADD f6, f26, f4, f6
  1611. FNMADD f7, f26, f5, f7
  1612. FNMADD f10, f26, f8, f10
  1613. FNMADD f11, f26, f9, f11
  1614. FNMADD f14, f26, f12, f14
  1615. FNMADD f15, f26, f13, f15
  1616. FMUL f16, f29, f3
  1617. FMUL f17, f29, f2
  1618. FMUL f18, f29, f7
  1619. FMUL f19, f29, f6
  1620. FMUL f20, f29, f11
  1621. FMUL f21, f29, f10
  1622. FMUL f22, f29, f15
  1623. FMUL f23, f29, f14
  1624. FMADD f2, f28, f2, f16
  1625. FMSUB f3, f28, f3, f17
  1626. FMADD f6, f28, f6, f18
  1627. FMSUB f7, f28, f7, f19
  1628. FMADD f10, f28, f10, f20
  1629. FMSUB f11, f28, f11, f21
  1630. FMADD f14, f28, f14, f22
  1631. FMSUB f15, f28, f15, f23
  1632. #endif
  1633. #endif
  1634. #ifdef RN
  1635. LFD f24, 0 * SIZE(BO)
  1636. LFD f25, 1 * SIZE(BO)
  1637. LFD f26, 2 * SIZE(BO)
  1638. LFD f27, 3 * SIZE(BO)
  1639. LFD f28, 4 * SIZE(BO)
  1640. LFD f29, 5 * SIZE(BO)
  1641. LFD f30, 6 * SIZE(BO)
  1642. LFD f31, 7 * SIZE(BO)
  1643. FMUL f16, f25, f1
  1644. FMUL f17, f25, f0
  1645. FMUL f18, f25, f3
  1646. FMUL f19, f25, f2
  1647. #ifndef CONJ
  1648. FMSUB f0, f24, f0, f16
  1649. FMADD f1, f24, f1, f17
  1650. FMSUB f2, f24, f2, f18
  1651. FMADD f3, f24, f3, f19
  1652. FMADD f4, f27, f1, f4
  1653. FNMSUB f5, f27, f0, f5
  1654. FMADD f6, f27, f3, f6
  1655. FNMSUB f7, f27, f2, f7
  1656. FNMSUB f4, f26, f0, f4
  1657. FNMSUB f5, f26, f1, f5
  1658. FNMSUB f6, f26, f2, f6
  1659. FNMSUB f7, f26, f3, f7
  1660. FMADD f8, f29, f1, f8
  1661. FNMSUB f9, f29, f0, f9
  1662. FMADD f10, f29, f3, f10
  1663. FNMSUB f11, f29, f2, f11
  1664. FNMSUB f8, f28, f0, f8
  1665. FNMSUB f9, f28, f1, f9
  1666. FNMSUB f10, f28, f2, f10
  1667. FNMSUB f11, f28, f3, f11
  1668. FMADD f12, f31, f1, f12
  1669. FNMSUB f13, f31, f0, f13
  1670. FMADD f14, f31, f3, f14
  1671. FNMSUB f15, f31, f2, f15
  1672. FNMSUB f12, f30, f0, f12
  1673. FNMSUB f13, f30, f1, f13
  1674. FNMSUB f14, f30, f2, f14
  1675. FNMSUB f15, f30, f3, f15
  1676. LFD f26, 10 * SIZE(BO)
  1677. LFD f27, 11 * SIZE(BO)
  1678. LFD f28, 12 * SIZE(BO)
  1679. LFD f29, 13 * SIZE(BO)
  1680. LFD f30, 14 * SIZE(BO)
  1681. LFD f31, 15 * SIZE(BO)
  1682. FMUL f16, f27, f5
  1683. FMUL f17, f27, f4
  1684. FMUL f18, f27, f7
  1685. FMUL f19, f27, f6
  1686. FMSUB f4, f26, f4, f16
  1687. FMADD f5, f26, f5, f17
  1688. FMSUB f6, f26, f6, f18
  1689. FMADD f7, f26, f7, f19
  1690. FMADD f8, f29, f5, f8
  1691. FNMSUB f9, f29, f4, f9
  1692. FMADD f10, f29, f7, f10
  1693. FNMSUB f11, f29, f6, f11
  1694. FNMSUB f8, f28, f4, f8
  1695. FNMSUB f9, f28, f5, f9
  1696. FNMSUB f10, f28, f6, f10
  1697. FNMSUB f11, f28, f7, f11
  1698. FMADD f12, f31, f5, f12
  1699. FNMSUB f13, f31, f4, f13
  1700. FMADD f14, f31, f7, f14
  1701. FNMSUB f15, f31, f6, f15
  1702. FNMSUB f12, f30, f4, f12
  1703. FNMSUB f13, f30, f5, f13
  1704. FNMSUB f14, f30, f6, f14
  1705. FNMSUB f15, f30, f7, f15
  1706. LFD f26, 20 * SIZE(BO)
  1707. LFD f27, 21 * SIZE(BO)
  1708. LFD f28, 22 * SIZE(BO)
  1709. LFD f29, 23 * SIZE(BO)
  1710. LFD f30, 30 * SIZE(BO)
  1711. LFD f31, 31 * SIZE(BO)
  1712. FMUL f16, f27, f9
  1713. FMUL f17, f27, f8
  1714. FMUL f18, f27, f11
  1715. FMUL f19, f27, f10
  1716. FMSUB f8, f26, f8, f16
  1717. FMADD f9, f26, f9, f17
  1718. FMSUB f10, f26, f10, f18
  1719. FMADD f11, f26, f11, f19
  1720. FMADD f12, f29, f9, f12
  1721. FNMSUB f13, f29, f8, f13
  1722. FMADD f14, f29, f11, f14
  1723. FNMSUB f15, f29, f10, f15
  1724. FNMSUB f12, f28, f8, f12
  1725. FNMSUB f13, f28, f9, f13
  1726. FNMSUB f14, f28, f10, f14
  1727. FNMSUB f15, f28, f11, f15
  1728. FMUL f16, f31, f13
  1729. FMUL f17, f31, f12
  1730. FMUL f18, f31, f15
  1731. FMUL f19, f31, f14
  1732. FMSUB f12, f30, f12, f16
  1733. FMADD f13, f30, f13, f17
  1734. FMSUB f14, f30, f14, f18
  1735. FMADD f15, f30, f15, f19
  1736. #else
  1737. FMADD f0, f24, f0, f16
  1738. FMSUB f1, f24, f1, f17
  1739. FMADD f2, f24, f2, f18
  1740. FMSUB f3, f24, f3, f19
  1741. FMSUB f4, f27, f1, f4
  1742. FNMADD f5, f27, f0, f5
  1743. FMSUB f6, f27, f3, f6
  1744. FNMADD f7, f27, f2, f7
  1745. FNMADD f4, f26, f0, f4
  1746. FNMADD f5, f26, f1, f5
  1747. FNMADD f6, f26, f2, f6
  1748. FNMADD f7, f26, f3, f7
  1749. FMSUB f8, f29, f1, f8
  1750. FNMADD f9, f29, f0, f9
  1751. FMSUB f10, f29, f3, f10
  1752. FNMADD f11, f29, f2, f11
  1753. FNMADD f8, f28, f0, f8
  1754. FNMADD f9, f28, f1, f9
  1755. FNMADD f10, f28, f2, f10
  1756. FNMADD f11, f28, f3, f11
  1757. FMSUB f12, f31, f1, f12
  1758. FNMADD f13, f31, f0, f13
  1759. FMSUB f14, f31, f3, f14
  1760. FNMADD f15, f31, f2, f15
  1761. FNMADD f12, f30, f0, f12
  1762. FNMADD f13, f30, f1, f13
  1763. FNMADD f14, f30, f2, f14
  1764. FNMADD f15, f30, f3, f15
  1765. LFD f26, 10 * SIZE(BO)
  1766. LFD f27, 11 * SIZE(BO)
  1767. LFD f28, 12 * SIZE(BO)
  1768. LFD f29, 13 * SIZE(BO)
  1769. LFD f30, 14 * SIZE(BO)
  1770. LFD f31, 15 * SIZE(BO)
  1771. FMUL f16, f27, f5
  1772. FMUL f17, f27, f4
  1773. FMUL f18, f27, f7
  1774. FMUL f19, f27, f6
  1775. FMADD f4, f26, f4, f16
  1776. FMSUB f5, f26, f5, f17
  1777. FMADD f6, f26, f6, f18
  1778. FMSUB f7, f26, f7, f19
  1779. FMSUB f8, f29, f5, f8
  1780. FNMADD f9, f29, f4, f9
  1781. FMSUB f10, f29, f7, f10
  1782. FNMADD f11, f29, f6, f11
  1783. FNMADD f8, f28, f4, f8
  1784. FNMADD f9, f28, f5, f9
  1785. FNMADD f10, f28, f6, f10
  1786. FNMADD f11, f28, f7, f11
  1787. FMSUB f12, f31, f5, f12
  1788. FNMADD f13, f31, f4, f13
  1789. FMSUB f14, f31, f7, f14
  1790. FNMADD f15, f31, f6, f15
  1791. FNMADD f12, f30, f4, f12
  1792. FNMADD f13, f30, f5, f13
  1793. FNMADD f14, f30, f6, f14
  1794. FNMADD f15, f30, f7, f15
  1795. LFD f26, 20 * SIZE(BO)
  1796. LFD f27, 21 * SIZE(BO)
  1797. LFD f28, 22 * SIZE(BO)
  1798. LFD f29, 23 * SIZE(BO)
  1799. LFD f30, 30 * SIZE(BO)
  1800. LFD f31, 31 * SIZE(BO)
  1801. FMUL f16, f27, f9
  1802. FMUL f17, f27, f8
  1803. FMUL f18, f27, f11
  1804. FMUL f19, f27, f10
  1805. FMADD f8, f26, f8, f16
  1806. FMSUB f9, f26, f9, f17
  1807. FMADD f10, f26, f10, f18
  1808. FMSUB f11, f26, f11, f19
  1809. FMSUB f12, f29, f9, f12
  1810. FNMADD f13, f29, f8, f13
  1811. FMSUB f14, f29, f11, f14
  1812. FNMADD f15, f29, f10, f15
  1813. FNMADD f12, f28, f8, f12
  1814. FNMADD f13, f28, f9, f13
  1815. FNMADD f14, f28, f10, f14
  1816. FNMADD f15, f28, f11, f15
  1817. FMUL f16, f31, f13
  1818. FMUL f17, f31, f12
  1819. FMUL f18, f31, f15
  1820. FMUL f19, f31, f14
  1821. FMADD f12, f30, f12, f16
  1822. FMSUB f13, f30, f13, f17
  1823. FMADD f14, f30, f14, f18
  1824. FMSUB f15, f30, f15, f19
  1825. #endif
  1826. #endif
  1827. #ifdef RT
  1828. LFD f24, 30 * SIZE(BO)
  1829. LFD f25, 31 * SIZE(BO)
  1830. LFD f26, 28 * SIZE(BO)
  1831. LFD f27, 29 * SIZE(BO)
  1832. LFD f28, 26 * SIZE(BO)
  1833. LFD f29, 27 * SIZE(BO)
  1834. LFD f30, 24 * SIZE(BO)
  1835. LFD f31, 25 * SIZE(BO)
  1836. FMUL f16, f25, f13
  1837. FMUL f17, f25, f12
  1838. FMUL f18, f25, f15
  1839. FMUL f19, f25, f14
  1840. #ifndef CONJ
  1841. FMSUB f12, f24, f12, f16
  1842. FMADD f13, f24, f13, f17
  1843. FMSUB f14, f24, f14, f18
  1844. FMADD f15, f24, f15, f19
  1845. FMADD f8, f27, f13, f8
  1846. FNMSUB f9, f27, f12, f9
  1847. FMADD f10, f27, f15, f10
  1848. FNMSUB f11, f27, f14, f11
  1849. FNMSUB f8, f26, f12, f8
  1850. FNMSUB f9, f26, f13, f9
  1851. FNMSUB f10, f26, f14, f10
  1852. FNMSUB f11, f26, f15, f11
  1853. FMADD f4, f29, f13, f4
  1854. FNMSUB f5, f29, f12, f5
  1855. FMADD f6, f29, f15, f6
  1856. FNMSUB f7, f29, f14, f7
  1857. FNMSUB f4, f28, f12, f4
  1858. FNMSUB f5, f28, f13, f5
  1859. FNMSUB f6, f28, f14, f6
  1860. FNMSUB f7, f28, f15, f7
  1861. FMADD f0, f31, f13, f0
  1862. FNMSUB f1, f31, f12, f1
  1863. FMADD f2, f31, f15, f2
  1864. FNMSUB f3, f31, f14, f3
  1865. FNMSUB f0, f30, f12, f0
  1866. FNMSUB f1, f30, f13, f1
  1867. FNMSUB f2, f30, f14, f2
  1868. FNMSUB f3, f30, f15, f3
  1869. LFD f26, 20 * SIZE(BO)
  1870. LFD f27, 21 * SIZE(BO)
  1871. LFD f28, 18 * SIZE(BO)
  1872. LFD f29, 19 * SIZE(BO)
  1873. LFD f30, 16 * SIZE(BO)
  1874. LFD f31, 17 * SIZE(BO)
  1875. FMUL f16, f27, f9
  1876. FMUL f17, f27, f8
  1877. FMUL f18, f27, f11
  1878. FMUL f19, f27, f10
  1879. FMSUB f8, f26, f8, f16
  1880. FMADD f9, f26, f9, f17
  1881. FMSUB f10, f26, f10, f18
  1882. FMADD f11, f26, f11, f19
  1883. FMADD f4, f29, f9, f4
  1884. FNMSUB f5, f29, f8, f5
  1885. FMADD f6, f29, f11, f6
  1886. FNMSUB f7, f29, f10, f7
  1887. FNMSUB f4, f28, f8, f4
  1888. FNMSUB f5, f28, f9, f5
  1889. FNMSUB f6, f28, f10, f6
  1890. FNMSUB f7, f28, f11, f7
  1891. FMADD f0, f31, f9, f0
  1892. FNMSUB f1, f31, f8, f1
  1893. FMADD f2, f31, f11, f2
  1894. FNMSUB f3, f31, f10, f3
  1895. FNMSUB f0, f30, f8, f0
  1896. FNMSUB f1, f30, f9, f1
  1897. FNMSUB f2, f30, f10, f2
  1898. FNMSUB f3, f30, f11, f3
  1899. LFD f26, 10 * SIZE(BO)
  1900. LFD f27, 11 * SIZE(BO)
  1901. LFD f28, 8 * SIZE(BO)
  1902. LFD f29, 9 * SIZE(BO)
  1903. LFD f30, 0 * SIZE(BO)
  1904. LFD f31, 1 * SIZE(BO)
  1905. FMUL f16, f27, f5
  1906. FMUL f17, f27, f4
  1907. FMUL f18, f27, f7
  1908. FMUL f19, f27, f6
  1909. FMSUB f4, f26, f4, f16
  1910. FMADD f5, f26, f5, f17
  1911. FMSUB f6, f26, f6, f18
  1912. FMADD f7, f26, f7, f19
  1913. FMADD f0, f29, f5, f0
  1914. FNMSUB f1, f29, f4, f1
  1915. FMADD f2, f29, f7, f2
  1916. FNMSUB f3, f29, f6, f3
  1917. FNMSUB f0, f28, f4, f0
  1918. FNMSUB f1, f28, f5, f1
  1919. FNMSUB f2, f28, f6, f2
  1920. FNMSUB f3, f28, f7, f3
  1921. FMUL f16, f31, f1
  1922. FMUL f17, f31, f0
  1923. FMUL f18, f31, f3
  1924. FMUL f19, f31, f2
  1925. FMSUB f0, f30, f0, f16
  1926. FMADD f1, f30, f1, f17
  1927. FMSUB f2, f30, f2, f18
  1928. FMADD f3, f30, f3, f19
  1929. #else
  1930. FMADD f12, f24, f12, f16
  1931. FMSUB f13, f24, f13, f17
  1932. FMADD f14, f24, f14, f18
  1933. FMSUB f15, f24, f15, f19
  1934. FMSUB f8, f27, f13, f8
  1935. FNMADD f9, f27, f12, f9
  1936. FMSUB f10, f27, f15, f10
  1937. FNMADD f11, f27, f14, f11
  1938. FNMADD f8, f26, f12, f8
  1939. FNMADD f9, f26, f13, f9
  1940. FNMADD f10, f26, f14, f10
  1941. FNMADD f11, f26, f15, f11
  1942. FMSUB f4, f29, f13, f4
  1943. FNMADD f5, f29, f12, f5
  1944. FMSUB f6, f29, f15, f6
  1945. FNMADD f7, f29, f14, f7
  1946. FNMADD f4, f28, f12, f4
  1947. FNMADD f5, f28, f13, f5
  1948. FNMADD f6, f28, f14, f6
  1949. FNMADD f7, f28, f15, f7
  1950. FMSUB f0, f31, f13, f0
  1951. FNMADD f1, f31, f12, f1
  1952. FMSUB f2, f31, f15, f2
  1953. FNMADD f3, f31, f14, f3
  1954. FNMADD f0, f30, f12, f0
  1955. FNMADD f1, f30, f13, f1
  1956. FNMADD f2, f30, f14, f2
  1957. FNMADD f3, f30, f15, f3
  1958. LFD f26, 20 * SIZE(BO)
  1959. LFD f27, 21 * SIZE(BO)
  1960. LFD f28, 18 * SIZE(BO)
  1961. LFD f29, 19 * SIZE(BO)
  1962. LFD f30, 16 * SIZE(BO)
  1963. LFD f31, 17 * SIZE(BO)
  1964. FMUL f16, f27, f9
  1965. FMUL f17, f27, f8
  1966. FMUL f18, f27, f11
  1967. FMUL f19, f27, f10
  1968. FMADD f8, f26, f8, f16
  1969. FMSUB f9, f26, f9, f17
  1970. FMADD f10, f26, f10, f18
  1971. FMSUB f11, f26, f11, f19
  1972. FMSUB f4, f29, f9, f4
  1973. FNMADD f5, f29, f8, f5
  1974. FMSUB f6, f29, f11, f6
  1975. FNMADD f7, f29, f10, f7
  1976. FNMADD f4, f28, f8, f4
  1977. FNMADD f5, f28, f9, f5
  1978. FNMADD f6, f28, f10, f6
  1979. FNMADD f7, f28, f11, f7
  1980. FMSUB f0, f31, f9, f0
  1981. FNMADD f1, f31, f8, f1
  1982. FMSUB f2, f31, f11, f2
  1983. FNMADD f3, f31, f10, f3
  1984. FNMADD f0, f30, f8, f0
  1985. FNMADD f1, f30, f9, f1
  1986. FNMADD f2, f30, f10, f2
  1987. FNMADD f3, f30, f11, f3
  1988. LFD f26, 10 * SIZE(BO)
  1989. LFD f27, 11 * SIZE(BO)
  1990. LFD f28, 8 * SIZE(BO)
  1991. LFD f29, 9 * SIZE(BO)
  1992. LFD f30, 0 * SIZE(BO)
  1993. LFD f31, 1 * SIZE(BO)
  1994. FMUL f16, f27, f5
  1995. FMUL f17, f27, f4
  1996. FMUL f18, f27, f7
  1997. FMUL f19, f27, f6
  1998. FMADD f4, f26, f4, f16
  1999. FMSUB f5, f26, f5, f17
  2000. FMADD f6, f26, f6, f18
  2001. FMSUB f7, f26, f7, f19
  2002. FMSUB f0, f29, f5, f0
  2003. FNMADD f1, f29, f4, f1
  2004. FMSUB f2, f29, f7, f2
  2005. FNMADD f3, f29, f6, f3
  2006. FNMADD f0, f28, f4, f0
  2007. FNMADD f1, f28, f5, f1
  2008. FNMADD f2, f28, f6, f2
  2009. FNMADD f3, f28, f7, f3
  2010. FMUL f16, f31, f1
  2011. FMUL f17, f31, f0
  2012. FMUL f18, f31, f3
  2013. FMUL f19, f31, f2
  2014. FMADD f0, f30, f0, f16
  2015. FMSUB f1, f30, f1, f17
  2016. FMADD f2, f30, f2, f18
  2017. FMSUB f3, f30, f3, f19
  2018. #endif
  2019. #endif
  2020. #ifdef LN
  2021. subi CO1, CO1, 4 * SIZE
  2022. subi CO2, CO2, 4 * SIZE
  2023. subi CO3, CO3, 4 * SIZE
  2024. subi CO4, CO4, 4 * SIZE
  2025. #endif
  2026. #if defined(LN) || defined(LT)
  2027. STFD f0, 0 * SIZE(BO)
  2028. STFD f1, 1 * SIZE(BO)
  2029. STFD f4, 2 * SIZE(BO)
  2030. STFD f5, 3 * SIZE(BO)
  2031. STFD f8, 4 * SIZE(BO)
  2032. STFD f9, 5 * SIZE(BO)
  2033. STFD f12, 6 * SIZE(BO)
  2034. STFD f13, 7 * SIZE(BO)
  2035. STFD f2, 8 * SIZE(BO)
  2036. STFD f3, 9 * SIZE(BO)
  2037. STFD f6, 10 * SIZE(BO)
  2038. STFD f7, 11 * SIZE(BO)
  2039. STFD f10, 12 * SIZE(BO)
  2040. STFD f11, 13 * SIZE(BO)
  2041. STFD f14, 14 * SIZE(BO)
  2042. STFD f15, 15 * SIZE(BO)
  2043. #else
  2044. STFD f0, 0 * SIZE(AO)
  2045. STFD f1, 1 * SIZE(AO)
  2046. STFD f2, 2 * SIZE(AO)
  2047. STFD f3, 3 * SIZE(AO)
  2048. STFD f4, 4 * SIZE(AO)
  2049. STFD f5, 5 * SIZE(AO)
  2050. STFD f6, 6 * SIZE(AO)
  2051. STFD f7, 7 * SIZE(AO)
  2052. STFD f8, 8 * SIZE(AO)
  2053. STFD f9, 9 * SIZE(AO)
  2054. STFD f10, 10 * SIZE(AO)
  2055. STFD f11, 11 * SIZE(AO)
  2056. STFD f12, 12 * SIZE(AO)
  2057. STFD f13, 13 * SIZE(AO)
  2058. STFD f14, 14 * SIZE(AO)
  2059. STFD f15, 15 * SIZE(AO)
  2060. #endif
  2061. STFD f0, 0 * SIZE(CO1)
  2062. STFD f1, 1 * SIZE(CO1)
  2063. STFD f2, 2 * SIZE(CO1)
  2064. STFD f3, 3 * SIZE(CO1)
  2065. lfs f0, FZERO
  2066. fmr f1, f0
  2067. fmr f2, f0
  2068. fmr f3, f0
  2069. STFD f4, 0 * SIZE(CO2)
  2070. STFD f5, 1 * SIZE(CO2)
  2071. STFD f6, 2 * SIZE(CO2)
  2072. STFD f7, 3 * SIZE(CO2)
  2073. fmr f4, f0
  2074. fmr f5, f0
  2075. fmr f6, f0
  2076. fmr f7, f0
  2077. STFD f8, 0 * SIZE(CO3)
  2078. STFD f9, 1 * SIZE(CO3)
  2079. STFD f10, 2 * SIZE(CO3)
  2080. STFD f11, 3 * SIZE(CO3)
  2081. fmr f8, f0
  2082. fmr f9, f0
  2083. fmr f10, f0
  2084. fmr f11, f0
  2085. STFD f12, 0 * SIZE(CO4)
  2086. STFD f13, 1 * SIZE(CO4)
  2087. STFD f14, 2 * SIZE(CO4)
  2088. STFD f15, 3 * SIZE(CO4)
  2089. fmr f12, f0
  2090. fmr f13, f0
  2091. fmr f14, f0
  2092. fmr f15, f0
  2093. #ifndef LN
  2094. addi CO1, CO1, 4 * SIZE
  2095. addi CO2, CO2, 4 * SIZE
  2096. addi CO3, CO3, 4 * SIZE
  2097. addi CO4, CO4, 4 * SIZE
  2098. #endif
  2099. #ifdef RT
  2100. slwi r0, K, 1 + ZBASE_SHIFT
  2101. add AORIG, AORIG, r0
  2102. #endif
  2103. #if defined(LT) || defined(RN)
  2104. sub TEMP, K, KK
  2105. slwi r0, TEMP, 1 + ZBASE_SHIFT
  2106. slwi TEMP, TEMP, 2 + ZBASE_SHIFT
  2107. add AO, AO, r0
  2108. add BO, BO, TEMP
  2109. #endif
  2110. #ifdef LT
  2111. addi KK, KK, 2
  2112. #endif
  2113. #ifdef LN
  2114. subi KK, KK, 2
  2115. #endif
  2116. addic. I, I, -1
  2117. bgt LL(11)
  /*
   * LL(29): tail of the loop that is re-entered at LL(10) while J stays
   * positive.  Adjusts B and KK for the active variant (LN/LT/RN/RT) and
   * then counts J down by one.
   * NOTE(review): the shift of 2 + ZBASE_SHIFT and the KK step of 4 look
   * like "four columns per pass" -- confirm against the code at LL(10).
   */
  2118. .align 4
  2119. LL(29):
  2120. #ifdef LN
  /* LN: B += K << (2 + ZBASE_SHIFT). */
  2121. slwi r0, K, 2 + ZBASE_SHIFT
  2122. add B, B, r0
  2123. #endif
  2124. #if defined(LT) || defined(RN)
  /* LT/RN: B simply takes over the current value of BO. */
  2125. mr B, BO
  2126. #endif
  2127. #ifdef RN
  2128. addi KK, KK, 4
  2129. #endif
  2130. #ifdef RT
  2131. subi KK, KK, 4
  2132. #endif
  /* J -= 1 (record form); loop back to LL(10) while the result is > 0. */
  2133. addic. J, J, -1
  2134. bgt LL(10)
  /*
   * LL(30): section executed when (N & 2) is non-zero; otherwise control
   * skips to LL(50).  It sets up the two output pointers CO1/CO2, KK and
   * the A pointer, then -- if (M & 1) is non-zero -- prepares the operands
   * and accumulators f0..f7 for the loops at LL(42)/LL(46).  When (M & 1)
   * is zero control goes straight to LL(40).
   */
  2135. .align 4
  2136. LL(30):
  2137. andi. J, N, 2
  2138. ble LL(50)
  2139. .align 4
  2140. #ifdef RT
  /* RT: step B back by K << (1 + ZBASE_SHIFT) and C back by LDC << 1. */
  2141. slwi r0, K, 1 + ZBASE_SHIFT
  2142. sub B, B, r0
  2143. slwi r0, LDC, 1
  2144. sub C, C, r0
  2145. #endif
  /* CO1 = C, CO2 = C + LDC. */
  2146. mr CO1, C
  2147. add CO2, C, LDC
  2148. #ifdef LN
  2149. add KK, M, OFFSET
  2150. #endif
  2151. #ifdef LT
  2152. mr KK, OFFSET
  2153. #endif
  2154. #if defined(LN) || defined(RT)
  /* LN/RT keep the base of A in AORIG; AO is derived from it later. */
  2155. mr AORIG, A
  2156. #else
  2157. mr AO, A
  2158. #endif
  2159. #ifndef RT
  /* Non-RT variants move C forward past CO2 (C = CO2 + LDC). */
  2160. add C, CO2, LDC
  2161. #endif
  /* I = M & 1; nothing to do in this part when it is zero. */
  2162. andi. I, M, 1
  2163. ble LL(40)
  2164. #if defined(LT) || defined(RN)
  /* Preload four values from AO and eight from B. */
  2165. LFD f16, 0 * SIZE(AO)
  2166. LFD f17, 1 * SIZE(AO)
  2167. LFD f18, 2 * SIZE(AO)
  2168. LFD f19, 3 * SIZE(AO)
  2169. LFD f20, 0 * SIZE(B)
  2170. LFD f21, 1 * SIZE(B)
  2171. LFD f22, 2 * SIZE(B)
  2172. LFD f23, 3 * SIZE(B)
  2173. LFD f24, 4 * SIZE(B)
  2174. LFD f25, 5 * SIZE(B)
  2175. LFD f26, 6 * SIZE(B)
  2176. LFD f27, 7 * SIZE(B)
  /*
   * f0..f7 all receive the value loaded from FZERO
   * (presumably 0.0 -- FZERO is defined outside this view).
   */
  2177. lfs f0, FZERO
  2178. fmr f1, f0
  2179. fmr f2, f0
  2180. fmr f3, f0
  2181. fmr f4, f0
  2182. fmr f5, f0
  2183. fmr f6, f0
  2184. fmr f7, f0
  /*
   * CTR = KK >> 2.  The record-form shift also produces the condition
   * consumed by the "ble LL(45)" after the #endif; the mr and mtspr in
   * between are not record forms.
   */
  2185. srawi. r0, KK, 2
  2186. mr BO, B
  2187. mtspr CTR, r0
  2188. #else
  2189. #ifdef LN
  /* LN: AORIG -= K << (0 + ZBASE_SHIFT). */
  2190. slwi r0, K, 0 + ZBASE_SHIFT
  2191. sub AORIG, AORIG, r0
  2192. #endif
  /*
   * AO = AORIG + (KK << (0 + ZBASE_SHIFT)),
   * BO = B     + (KK << (1 + ZBASE_SHIFT)),
   * TEMP = K - KK (used below as the trip count).
   */
  2193. slwi r0, KK, 0 + ZBASE_SHIFT
  2194. slwi TEMP, KK, 1 + ZBASE_SHIFT
  2195. add AO, AORIG, r0
  2196. add BO, B, TEMP
  2197. sub TEMP, K, KK
  /* Preload four values from AO and eight from BO. */
  2198. LFD f16, 0 * SIZE(AO)
  2199. LFD f17, 1 * SIZE(AO)
  2200. LFD f18, 2 * SIZE(AO)
  2201. LFD f19, 3 * SIZE(AO)
  2202. LFD f20, 0 * SIZE(BO)
  2203. LFD f21, 1 * SIZE(BO)
  2204. LFD f22, 2 * SIZE(BO)
  2205. LFD f23, 3 * SIZE(BO)
  2206. LFD f24, 4 * SIZE(BO)
  2207. LFD f25, 5 * SIZE(BO)
  2208. LFD f26, 6 * SIZE(BO)
  2209. LFD f27, 7 * SIZE(BO)
  /* Same accumulator initialisation as in the LT/RN branch. */
  2210. lfs f0, FZERO
  2211. fmr f1, f0
  2212. fmr f2, f0
  2213. fmr f3, f0
  2214. fmr f4, f0
  2215. fmr f5, f0
  2216. fmr f6, f0
  2217. fmr f7, f0
  /* CTR = TEMP >> 2; the record form feeds the ble below. */
  2218. srawi. r0, TEMP, 2
  2219. mtspr CTR, r0
  2220. #endif
  /* Skip the unrolled loop when the shifted count is <= 0. */
  2221. ble LL(45)
  /*
   * LL(42): CTR-controlled loop, unrolled four steps per pass.
   * Each pass advances AO by 8 * SIZE and BO by 16 * SIZE, i.e. one step
   * uses two values from AO and four from BO.
   *   f0..f3 += (first  AO value of the step) * (four BO values)
   *   f4..f7 += (second AO value of the step) * (four BO values)
   * Operand registers for later steps are reloaded between the FMADD
   * groups, and the operands for the first two steps of the NEXT pass are
   * fetched before the pointers are bumped (offsets 8..11 of AO and 16..23
   * of BO), so the instruction order here is significant.
   * NOTE(review): presumably each AO pair is the (real, imag) part of one
   * complex A element and each BO quad is two complex B elements --
   * confirm against the packing routines.
   */
  2222. .align 4
  2223. LL(42):
  /* Step 0: AO pair f16/f17 with BO quad f20..f23. */
  2224. FMADD f0, f16, f20, f0
  2225. FMADD f1, f16, f21, f1
  2226. FMADD f2, f16, f22, f2
  2227. FMADD f3, f16, f23, f3
  2228. FMADD f4, f17, f20, f4
  2229. FMADD f5, f17, f21, f5
  2230. FMADD f6, f17, f22, f6
  2231. FMADD f7, f17, f23, f7
  /* Fetch the BO quad used by step 2. */
  2232. LFD f20, 8 * SIZE(BO)
  2233. LFD f21, 9 * SIZE(BO)
  2234. LFD f22, 10 * SIZE(BO)
  2235. LFD f23, 11 * SIZE(BO)
  /* Step 1: AO pair f18/f19 with BO quad f24..f27. */
  2236. FMADD f0, f18, f24, f0
  2237. FMADD f1, f18, f25, f1
  2238. FMADD f2, f18, f26, f2
  2239. FMADD f3, f18, f27, f3
  2240. FMADD f4, f19, f24, f4
  2241. FMADD f5, f19, f25, f5
  2242. FMADD f6, f19, f26, f6
  2243. FMADD f7, f19, f27, f7
  /* Fetch the BO quad for step 3 and both AO pairs for steps 2 and 3. */
  2244. LFD f24, 12 * SIZE(BO)
  2245. LFD f25, 13 * SIZE(BO)
  2246. LFD f26, 14 * SIZE(BO)
  2247. LFD f27, 15 * SIZE(BO)
  2248. LFD f16, 4 * SIZE(AO)
  2249. LFD f17, 5 * SIZE(AO)
  2250. LFD f18, 6 * SIZE(AO)
  2251. LFD f19, 7 * SIZE(AO)
  /* Step 2. */
  2252. FMADD f0, f16, f20, f0
  2253. FMADD f1, f16, f21, f1
  2254. FMADD f2, f16, f22, f2
  2255. FMADD f3, f16, f23, f3
  2256. FMADD f4, f17, f20, f4
  2257. FMADD f5, f17, f21, f5
  2258. FMADD f6, f17, f22, f6
  2259. FMADD f7, f17, f23, f7
  /* Preload the BO quad for step 0 of the next pass. */
  2260. LFD f20, 16 * SIZE(BO)
  2261. LFD f21, 17 * SIZE(BO)
  2262. LFD f22, 18 * SIZE(BO)
  2263. LFD f23, 19 * SIZE(BO)
  /* Step 3. */
  2264. FMADD f0, f18, f24, f0
  2265. FMADD f1, f18, f25, f1
  2266. FMADD f2, f18, f26, f2
  2267. FMADD f3, f18, f27, f3
  2268. FMADD f4, f19, f24, f4
  2269. FMADD f5, f19, f25, f5
  2270. FMADD f6, f19, f26, f6
  2271. FMADD f7, f19, f27, f7
  /* Preload the remaining operands for the next pass, then advance. */
  2272. LFD f16, 8 * SIZE(AO)
  2273. LFD f17, 9 * SIZE(AO)
  2274. LFD f18, 10 * SIZE(AO)
  2275. LFD f19, 11 * SIZE(AO)
  2276. LFD f24, 20 * SIZE(BO)
  2277. LFD f25, 21 * SIZE(BO)
  2278. LFD f26, 22 * SIZE(BO)
  2279. LFD f27, 23 * SIZE(BO)
  2280. addi BO, BO, 16 * SIZE
  2281. addi AO, AO, 8 * SIZE
  2282. bdnz LL(42)
  /*
   * LL(45): set up the remainder loop that follows the 4-step unrolled
   * loop at LL(42).  CTR = (KK or TEMP) & 3, matching the ">> 2" used for
   * the unrolled trip count; jump to LL(47) when nothing is left.
   */
  2283. .align 4
  2284. LL(45):
  2285. #if defined(LT) || defined(RN)
  2286. andi. r0, KK, 3
  2287. #else
  2288. andi. r0, TEMP, 3
  2289. #endif
  /* mtspr does not touch the condition set by andi. above. */
  2290. mtspr CTR, r0
  2291. ble LL(47)
  /*
   * LL(46): remainder loop, one step per pass (CTR-controlled).
   * Accumulates f16 * {f20..f23} into f0..f3 and f17 * {f20..f23} into
   * f4..f7, then fetches the operands of the following step and advances
   * AO by 2 * SIZE and BO by 4 * SIZE.
   */
  2292. .align 4
  2293. LL(46):
  2294. FMADD f0, f16, f20, f0
  2295. FMADD f1, f16, f21, f1
  2296. FMADD f2, f16, f22, f2
  2297. FMADD f3, f16, f23, f3
  2298. FMADD f4, f17, f20, f4
  2299. FMADD f5, f17, f21, f5
  2300. FMADD f6, f17, f22, f6
  2301. FMADD f7, f17, f23, f7
  /* Operands for the next step (read before the pointers move). */
  2302. LFD f20, 4 * SIZE(BO)
  2303. LFD f21, 5 * SIZE(BO)
  2304. LFD f22, 6 * SIZE(BO)
  2305. LFD f23, 7 * SIZE(BO)
  2306. LFD f16, 2 * SIZE(AO)
  2307. LFD f17, 3 * SIZE(AO)
  2308. addi AO, AO, 2 * SIZE
  2309. addi BO, BO, 4 * SIZE
  2310. bdnz LL(46)
  /*
   * LL(47): finish the block accumulated by LL(42)/LL(46).
   *   1. Fold the eight partial sums f0..f7 into four values f0..f3.
   *   2. LN/RT only: recompute AO and BO from AORIG, B and KK.
   *   3. Replace f0..f3 by (stored value - f0..f3), the stored values
   *      coming from BO (LN/LT) or AO (RN/RT).
   *   4. Apply the variant-specific multiply/eliminate sequence
   *      (LN, LT, RN or RT; each has a CONJ and a non-CONJ form).
   *   5. Store f0..f3 back to BO (LN/LT) or AO (RN/RT) and to CO1/CO2.
   *   6. Advance or rewind CO1/CO2, AORIG/AO/BO and KK.
   * NOTE(review): the arithmetic reads as complex multiply-by-diagonal and
   * back-substitution of a triangular solve, with (f0,f1) and (f2,f3) as
   * (real, imag) pairs -- inferred from the sign patterns, confirm against
   * the reference implementation.
   */
  2311. .align 4
  2312. LL(47):
  2313. #ifndef CONJ
  /* f0 -= f5, f1 += f4, f2 -= f7, f3 += f6. */
  2314. FSUB f0, f0, f5
  2315. FADD f1, f1, f4
  2316. FSUB f2, f2, f7
  2317. FADD f3, f3, f6
  2318. #else
  2319. #if defined(LN) || defined(LT)
  /* CONJ, LN/LT: f0 += f5, f1 -= f4, f2 += f7, f3 -= f6. */
  2320. FADD f0, f0, f5
  2321. FSUB f1, f1, f4
  2322. FADD f2, f2, f7
  2323. FSUB f3, f3, f6
  2324. #else
  /* CONJ, RN/RT: like the branch above but f1 = f4 - f1, f3 = f6 - f3. */
  2325. FADD f0, f0, f5
  2326. FSUB f1, f4, f1
  2327. FADD f2, f2, f7
  2328. FSUB f3, f6, f3
  2329. #endif
  2330. #endif
  2331. #if defined(LN) || defined(RT)
  /*
   * r0 = KK - 1 (LN) or KK - 2 (RT);
   * AO = AORIG + (r0 << (0 + ZBASE_SHIFT)),
   * BO = B     + (r0 << (1 + ZBASE_SHIFT)).
   */
  2332. #ifdef LN
  2333. subi r0, KK, 1
  2334. #else
  2335. subi r0, KK, 2
  2336. #endif
  2337. slwi TEMP, r0, 0 + ZBASE_SHIFT
  2338. slwi r0, r0, 1 + ZBASE_SHIFT
  2339. add AO, AORIG, TEMP
  2340. add BO, B, r0
  2341. #endif
  2342. #if defined(LN) || defined(LT)
  /* f0..f3 = BO[0..3] - f0..f3. */
  2343. LFD f16, 0 * SIZE(BO)
  2344. LFD f17, 1 * SIZE(BO)
  2345. LFD f18, 2 * SIZE(BO)
  2346. LFD f19, 3 * SIZE(BO)
  2347. FSUB f0, f16, f0
  2348. FSUB f1, f17, f1
  2349. FSUB f2, f18, f2
  2350. FSUB f3, f19, f3
  2351. #else
  /* f0..f3 = AO[0..3] - f0..f3. */
  2352. LFD f16, 0 * SIZE(AO)
  2353. LFD f17, 1 * SIZE(AO)
  2354. LFD f20, 2 * SIZE(AO)
  2355. LFD f21, 3 * SIZE(AO)
  2356. FSUB f0, f16, f0
  2357. FSUB f1, f17, f1
  2358. FSUB f2, f20, f2
  2359. FSUB f3, f21, f3
  2360. #endif
  2361. #ifdef LN
  /*
   * LN: multiply both (f0,f1) and (f2,f3) by the pair (f20,f21) read from
   * AO[0..1].  The cross products with f21 are formed first, then merged
   * with the f20 products; CONJ flips the signs of the merge.
   */
  2362. LFD f20, 0 * SIZE(AO)
  2363. LFD f21, 1 * SIZE(AO)
  2364. FMUL f4, f21, f1
  2365. FMUL f5, f21, f0
  2366. FMUL f12, f21, f3
  2367. FMUL f13, f21, f2
  2368. #ifndef CONJ
  2369. FMSUB f0, f20, f0, f4
  2370. FMADD f1, f20, f1, f5
  2371. FMSUB f2, f20, f2, f12
  2372. FMADD f3, f20, f3, f13
  2373. #else
  2374. FMADD f0, f20, f0, f4
  2375. FMSUB f1, f20, f1, f5
  2376. FMADD f2, f20, f2, f12
  2377. FMSUB f3, f20, f3, f13
  2378. #endif
  2379. #endif
  2380. #ifdef LT
  /* LT: same computation as LN, with the AO pair held in f16/f17. */
  2381. LFD f16, 0 * SIZE(AO)
  2382. LFD f17, 1 * SIZE(AO)
  2383. FMUL f4, f17, f1
  2384. FMUL f5, f17, f0
  2385. FMUL f12, f17, f3
  2386. FMUL f13, f17, f2
  2387. #ifndef CONJ
  2388. FMSUB f0, f16, f0, f4
  2389. FMADD f1, f16, f1, f5
  2390. FMSUB f2, f16, f2, f12
  2391. FMADD f3, f16, f3, f13
  2392. #else
  2393. FMADD f0, f16, f0, f4
  2394. FMSUB f1, f16, f1, f5
  2395. FMADD f2, f16, f2, f12
  2396. FMSUB f3, f16, f3, f13
  2397. #endif
  2398. #endif
  2399. #ifdef RN
  /*
   * RN: forward order.  (f0,f1) is scaled by BO[0..1]; its contribution
   * through BO[2..3] is then removed from (f2,f3), which is finally
   * scaled by BO[6..7].
   */
  2400. LFD f16, 0 * SIZE(BO)
  2401. LFD f17, 1 * SIZE(BO)
  2402. LFD f18, 2 * SIZE(BO)
  2403. LFD f19, 3 * SIZE(BO)
  2404. LFD f20, 6 * SIZE(BO)
  2405. LFD f21, 7 * SIZE(BO)
  2406. FMUL f4, f17, f1
  2407. FMUL f5, f17, f0
  2408. #ifndef CONJ
  2409. FMSUB f0, f16, f0, f4
  2410. FMADD f1, f16, f1, f5
  2411. FMADD f2, f19, f1, f2
  2412. FNMSUB f3, f19, f0, f3
  2413. FNMSUB f2, f18, f0, f2
  2414. FNMSUB f3, f18, f1, f3
  2415. FMUL f4, f21, f3
  2416. FMUL f5, f21, f2
  2417. FMSUB f2, f20, f2, f4
  2418. FMADD f3, f20, f3, f5
  2419. #else
  2420. FMADD f0, f16, f0, f4
  2421. FMSUB f1, f16, f1, f5
  2422. FMSUB f2, f19, f1, f2
  2423. FNMADD f3, f19, f0, f3
  2424. FNMADD f2, f18, f0, f2
  2425. FNMADD f3, f18, f1, f3
  2426. FMUL f4, f21, f3
  2427. FMUL f5, f21, f2
  2428. FMADD f2, f20, f2, f4
  2429. FMSUB f3, f20, f3, f5
  2430. #endif
  2431. #endif
  2432. #ifdef RT
  /*
   * RT: reverse order.  (f2,f3) is scaled by BO[6..7]; its contribution
   * through BO[4..5] is then removed from (f0,f1), which is finally
   * scaled by BO[0..1].
   */
  2433. LFD f16, 6 * SIZE(BO)
  2434. LFD f17, 7 * SIZE(BO)
  2435. LFD f18, 4 * SIZE(BO)
  2436. LFD f19, 5 * SIZE(BO)
  2437. LFD f20, 0 * SIZE(BO)
  2438. LFD f21, 1 * SIZE(BO)
  2439. FMUL f12, f17, f3
  2440. FMUL f13, f17, f2
  2441. #ifndef CONJ
  2442. FMSUB f2, f16, f2, f12
  2443. FMADD f3, f16, f3, f13
  2444. FMADD f0, f19, f3, f0
  2445. FNMSUB f1, f19, f2, f1
  2446. FNMSUB f0, f18, f2, f0
  2447. FNMSUB f1, f18, f3, f1
  2448. FMUL f4, f21, f1
  2449. FMUL f5, f21, f0
  2450. FMSUB f0, f20, f0, f4
  2451. FMADD f1, f20, f1, f5
  2452. #else
  2453. FMADD f2, f16, f2, f12
  2454. FMSUB f3, f16, f3, f13
  2455. FMSUB f0, f19, f3, f0
  2456. FNMADD f1, f19, f2, f1
  2457. FNMADD f0, f18, f2, f0
  2458. FNMADD f1, f18, f3, f1
  2459. FMUL f4, f21, f1
  2460. FMUL f5, f21, f0
  2461. FMADD f0, f20, f0, f4
  2462. FMSUB f1, f20, f1, f5
  2463. #endif
  2464. #endif
  2465. #ifdef LN
  /* LN writes the output at CO - 2 * SIZE: rewind before the stores. */
  2466. subi CO1, CO1, 2 * SIZE
  2467. subi CO2, CO2, 2 * SIZE
  2468. #endif
  /* Write the result back into the packed buffer ... */
  2469. #if defined(LN) || defined(LT)
  2470. STFD f0, 0 * SIZE(BO)
  2471. STFD f1, 1 * SIZE(BO)
  2472. STFD f2, 2 * SIZE(BO)
  2473. STFD f3, 3 * SIZE(BO)
  2474. #else
  2475. STFD f0, 0 * SIZE(AO)
  2476. STFD f1, 1 * SIZE(AO)
  2477. STFD f2, 2 * SIZE(AO)
  2478. STFD f3, 3 * SIZE(AO)
  2479. #endif
  /* ... and into the output: (f0,f1) to CO1, (f2,f3) to CO2. */
  2480. STFD f0, 0 * SIZE(CO1)
  2481. STFD f1, 1 * SIZE(CO1)
  2482. STFD f2, 0 * SIZE(CO2)
  2483. STFD f3, 1 * SIZE(CO2)
  2484. #ifndef LN
  /* All variants except LN step the output pointers forward. */
  2485. addi CO1, CO1, 2 * SIZE
  2486. addi CO2, CO2, 2 * SIZE
  2487. #endif
  2488. #ifdef RT
  /* RT: AORIG += K << (0 + ZBASE_SHIFT). */
  2489. slwi r0, K, 0 + ZBASE_SHIFT
  2490. add AORIG, AORIG, r0
  2491. #endif
  2492. #if defined(LT) || defined(RN)
  /*
   * LT/RN: with TEMP = K - KK,
   * AO += TEMP << (0 + ZBASE_SHIFT), BO += TEMP << (1 + ZBASE_SHIFT).
   */
  2493. sub TEMP, K, KK
  2494. slwi r0, TEMP, 0 + ZBASE_SHIFT
  2495. slwi TEMP, TEMP, 1 + ZBASE_SHIFT
  2496. add AO, AO, r0
  2497. add BO, BO, TEMP
  2498. #endif
  2499. #ifdef LT
  2500. addi KK, KK, 1
  2501. #endif
  2502. #ifdef LN
  2503. subi KK, KK, 1
  2504. #endif
  /*
   * LL(40): I = M >> 1 (arithmetic shift, record form).  When the result
   * is <= 0 there is no two-wide work and control skips to LL(49).
   */
  2505. .align 4
  2506. LL(40):
  2507. srawi. I, M, 1
  2508. ble LL(49)
  /*
   * LL(31): set-up for one block processed by the loops at LL(32)/LL(36).
   * Positions AO/BO, preloads four values from each, sets the sixteen
   * accumulators f0..f15 to the value read from FZERO (presumably 0.0 --
   * FZERO is defined outside this view), issues touch-for-store hints on
   * the two output pointers, and loads CTR with the trip count divided by
   * eight.  Branches to LL(35) when that count is <= 0.
   */
  2509. .align 4
  2510. LL(31):
  2511. #if defined(LT) || defined(RN)
  /* LT/RN: AO is used as it stands; B operands are read through B. */
  2512. LFD f16, 0 * SIZE(AO)
  2513. LFD f17, 1 * SIZE(AO)
  2514. LFD f18, 2 * SIZE(AO)
  2515. LFD f19, 3 * SIZE(AO)
  2516. LFD f20, 0 * SIZE(B)
  2517. LFD f21, 1 * SIZE(B)
  2518. LFD f22, 2 * SIZE(B)
  2519. LFD f23, 3 * SIZE(B)
  2520. lfs f0, FZERO
  2521. fmr f1, f0
  2522. fmr f2, f0
  2523. fmr f3, f0
  2524. fmr f4, f0
  2525. fmr f5, f0
  2526. fmr f6, f0
  2527. fmr f7, f0
  2528. fmr f8, f0
  2529. fmr f9, f0
  2530. fmr f10, f0
  2531. fmr f11, f0
  2532. fmr f12, f0
  2533. fmr f13, f0
  2534. fmr f14, f0
  2535. fmr f15, f0
  2536. dcbtst CO1, PREC
  2537. dcbtst CO2, PREC
  /*
   * CTR = KK >> 3.  The record form provides the condition for the
   * "ble LL(35)" after the #endif; mtspr and mr leave it untouched.
   */
  2538. srawi. r0, KK, 3
  2539. mtspr CTR, r0
  2540. mr BO, B
  2541. #else
  2542. #ifdef LN
  /* LN: AORIG -= K << (1 + ZBASE_SHIFT). */
  2543. slwi r0, K, 1 + ZBASE_SHIFT
  2544. sub AORIG, AORIG, r0
  2545. #endif
  /*
   * Both pointers are offset by the same amount KK << (1 + ZBASE_SHIFT):
   * AO = AORIG + offset, BO = B + offset.  TEMP = K - KK afterwards.
   */
  2546. slwi TEMP, KK, 1 + ZBASE_SHIFT
  2547. add AO, AORIG, TEMP
  2548. add BO, B, TEMP
  2549. sub TEMP, K, KK
  2550. LFD f16, 0 * SIZE(AO)
  2551. LFD f17, 1 * SIZE(AO)
  2552. LFD f18, 2 * SIZE(AO)
  2553. LFD f19, 3 * SIZE(AO)
  2554. LFD f20, 0 * SIZE(BO)
  2555. LFD f21, 1 * SIZE(BO)
  2556. LFD f22, 2 * SIZE(BO)
  2557. LFD f23, 3 * SIZE(BO)
  2558. lfs f0, FZERO
  2559. fmr f1, f0
  2560. fmr f2, f0
  2561. fmr f3, f0
  2562. fmr f4, f0
  2563. fmr f5, f0
  2564. fmr f6, f0
  2565. fmr f7, f0
  2566. fmr f8, f0
  2567. fmr f9, f0
  2568. fmr f10, f0
  2569. fmr f11, f0
  2570. fmr f12, f0
  2571. fmr f13, f0
  2572. fmr f14, f0
  2573. fmr f15, f0
  2574. dcbtst CO1, PREC
  2575. dcbtst CO2, PREC
  /* CTR = TEMP >> 3; the record form feeds the ble below. */
  2576. srawi. r0, TEMP, 3
  2577. mtspr CTR, r0
  2578. #endif
  2579. ble LL(35)
  /*
   * LL(32): CTR-controlled loop, unrolled eight steps per pass.
   * Every step uses four AO values a0..a3 and four BO values b0..b3 and
   * performs sixteen FMADDs:
   *   f0,f4,f8,f12  += a0 * b0..b3      f1,f5,f9,f13  += a1 * b0..b3
   *   f2,f6,f10,f14 += a2 * b0..b3      f3,f7,f11,f15 += a3 * b0..b3
   * The operand registers alternate between two sets: even steps compute
   * with f16..f19 / f20..f23 while loading f24..f27 / f28..f31, odd steps
   * do the opposite.  The last step loads from offsets 32..35, i.e. the
   * operands of the next pass, before AO and BO are each advanced by
   * 32 * SIZE.  Loads are interleaved with the arithmetic, so the
   * instruction order is significant.
   * NOTE(review): dcbtst (touch for store) is applied to BO even though
   * this loop only loads through BO -- possibly intentional; confirm.
   */
  2580. .align 4
  2581. LL(32):
  2582. dcbt AO, PREA
  2583. dcbtst BO, PREA
  /* Step 0 (operands at offsets 0..3); loads offsets 4..7. */
  2584. FMADD f0, f16, f20, f0
  2585. FMADD f4, f16, f21, f4
  2586. FMADD f8, f16, f22, f8
  2587. FMADD f12, f16, f23, f12
  2588. LFD f24, 4 * SIZE(AO)
  2589. LFD f28, 4 * SIZE(BO)
  2590. LFD f25, 5 * SIZE(AO)
  2591. LFD f29, 5 * SIZE(BO)
  2592. FMADD f1, f17, f20, f1
  2593. FMADD f5, f17, f21, f5
  2594. FMADD f9, f17, f22, f9
  2595. FMADD f13, f17, f23, f13
  2596. FMADD f2, f18, f20, f2
  2597. FMADD f6, f18, f21, f6
  2598. FMADD f10, f18, f22, f10
  2599. FMADD f14, f18, f23, f14
  2600. LFD f26, 6 * SIZE(AO)
  2601. LFD f30, 6 * SIZE(BO)
  2602. LFD f27, 7 * SIZE(AO)
  2603. LFD f31, 7 * SIZE(BO)
  2604. FMADD f3, f19, f20, f3
  2605. FMADD f7, f19, f21, f7
  2606. FMADD f11, f19, f22, f11
  2607. FMADD f15, f19, f23, f15
  /* Step 1 (offsets 4..7); loads offsets 8..11. */
  2608. FMADD f0, f24, f28, f0
  2609. FMADD f4, f24, f29, f4
  2610. FMADD f8, f24, f30, f8
  2611. FMADD f12, f24, f31, f12
  2612. LFD f16, 8 * SIZE(AO)
  2613. LFD f20, 8 * SIZE(BO)
  2614. LFD f17, 9 * SIZE(AO)
  2615. LFD f21, 9 * SIZE(BO)
  2616. FMADD f1, f25, f28, f1
  2617. FMADD f5, f25, f29, f5
  2618. FMADD f9, f25, f30, f9
  2619. FMADD f13, f25, f31, f13
  2620. FMADD f2, f26, f28, f2
  2621. FMADD f6, f26, f29, f6
  2622. FMADD f10, f26, f30, f10
  2623. FMADD f14, f26, f31, f14
  2624. LFD f18, 10 * SIZE(AO)
  2625. LFD f22, 10 * SIZE(BO)
  2626. LFD f19, 11 * SIZE(AO)
  2627. LFD f23, 11 * SIZE(BO)
  2628. FMADD f3, f27, f28, f3
  2629. FMADD f7, f27, f29, f7
  2630. FMADD f11, f27, f30, f11
  2631. FMADD f15, f27, f31, f15
  /* Step 2 (offsets 8..11); loads offsets 12..15. */
  2632. FMADD f0, f16, f20, f0
  2633. FMADD f4, f16, f21, f4
  2634. FMADD f8, f16, f22, f8
  2635. FMADD f12, f16, f23, f12
  2636. LFD f24, 12 * SIZE(AO)
  2637. LFD f28, 12 * SIZE(BO)
  2638. LFD f25, 13 * SIZE(AO)
  2639. LFD f29, 13 * SIZE(BO)
  2640. FMADD f1, f17, f20, f1
  2641. FMADD f5, f17, f21, f5
  2642. FMADD f9, f17, f22, f9
  2643. FMADD f13, f17, f23, f13
  2644. FMADD f2, f18, f20, f2
  2645. FMADD f6, f18, f21, f6
  2646. FMADD f10, f18, f22, f10
  2647. FMADD f14, f18, f23, f14
  2648. LFD f26, 14 * SIZE(AO)
  2649. LFD f30, 14 * SIZE(BO)
  2650. LFD f27, 15 * SIZE(AO)
  2651. LFD f31, 15 * SIZE(BO)
  2652. FMADD f3, f19, f20, f3
  2653. FMADD f7, f19, f21, f7
  2654. FMADD f11, f19, f22, f11
  2655. FMADD f15, f19, f23, f15
  /* Step 3 (offsets 12..15); loads offsets 16..19. */
  2656. FMADD f0, f24, f28, f0
  2657. FMADD f4, f24, f29, f4
  2658. FMADD f8, f24, f30, f8
  2659. FMADD f12, f24, f31, f12
  2660. LFD f16, 16 * SIZE(AO)
  2661. LFD f20, 16 * SIZE(BO)
  2662. LFD f17, 17 * SIZE(AO)
  2663. LFD f21, 17 * SIZE(BO)
  2664. FMADD f1, f25, f28, f1
  2665. FMADD f5, f25, f29, f5
  2666. FMADD f9, f25, f30, f9
  2667. FMADD f13, f25, f31, f13
  2668. FMADD f2, f26, f28, f2
  2669. FMADD f6, f26, f29, f6
  2670. FMADD f10, f26, f30, f10
  2671. FMADD f14, f26, f31, f14
  2672. LFD f18, 18 * SIZE(AO)
  2673. LFD f22, 18 * SIZE(BO)
  2674. LFD f19, 19 * SIZE(AO)
  2675. LFD f23, 19 * SIZE(BO)
  2676. FMADD f3, f27, f28, f3
  2677. FMADD f7, f27, f29, f7
  2678. FMADD f11, f27, f30, f11
  2679. FMADD f15, f27, f31, f15
  /* Step 4 (offsets 16..19); loads offsets 20..23. */
  2680. FMADD f0, f16, f20, f0
  2681. FMADD f4, f16, f21, f4
  2682. FMADD f8, f16, f22, f8
  2683. FMADD f12, f16, f23, f12
  2684. LFD f24, 20 * SIZE(AO)
  2685. LFD f28, 20 * SIZE(BO)
  2686. LFD f25, 21 * SIZE(AO)
  2687. LFD f29, 21 * SIZE(BO)
  2688. FMADD f1, f17, f20, f1
  2689. FMADD f5, f17, f21, f5
  2690. FMADD f9, f17, f22, f9
  2691. FMADD f13, f17, f23, f13
  2692. FMADD f2, f18, f20, f2
  2693. FMADD f6, f18, f21, f6
  2694. FMADD f10, f18, f22, f10
  2695. FMADD f14, f18, f23, f14
  2696. LFD f26, 22 * SIZE(AO)
  2697. LFD f30, 22 * SIZE(BO)
  2698. LFD f27, 23 * SIZE(AO)
  2699. LFD f31, 23 * SIZE(BO)
  2700. FMADD f3, f19, f20, f3
  2701. FMADD f7, f19, f21, f7
  2702. FMADD f11, f19, f22, f11
  2703. FMADD f15, f19, f23, f15
  /* Step 5 (offsets 20..23); loads offsets 24..27. */
  2704. FMADD f0, f24, f28, f0
  2705. FMADD f4, f24, f29, f4
  2706. FMADD f8, f24, f30, f8
  2707. FMADD f12, f24, f31, f12
  2708. LFD f16, 24 * SIZE(AO)
  2709. LFD f20, 24 * SIZE(BO)
  2710. LFD f17, 25 * SIZE(AO)
  2711. LFD f21, 25 * SIZE(BO)
  2712. FMADD f1, f25, f28, f1
  2713. FMADD f5, f25, f29, f5
  2714. FMADD f9, f25, f30, f9
  2715. FMADD f13, f25, f31, f13
  2716. FMADD f2, f26, f28, f2
  2717. FMADD f6, f26, f29, f6
  2718. FMADD f10, f26, f30, f10
  2719. FMADD f14, f26, f31, f14
  2720. LFD f18, 26 * SIZE(AO)
  2721. LFD f22, 26 * SIZE(BO)
  2722. LFD f19, 27 * SIZE(AO)
  2723. LFD f23, 27 * SIZE(BO)
  2724. FMADD f3, f27, f28, f3
  2725. FMADD f7, f27, f29, f7
  2726. FMADD f11, f27, f30, f11
  2727. FMADD f15, f27, f31, f15
  /* Step 6 (offsets 24..27); loads offsets 28..31. */
  2728. FMADD f0, f16, f20, f0
  2729. FMADD f4, f16, f21, f4
  2730. FMADD f8, f16, f22, f8
  2731. FMADD f12, f16, f23, f12
  2732. LFD f24, 28 * SIZE(AO)
  2733. LFD f28, 28 * SIZE(BO)
  2734. LFD f25, 29 * SIZE(AO)
  2735. LFD f29, 29 * SIZE(BO)
  2736. FMADD f1, f17, f20, f1
  2737. FMADD f5, f17, f21, f5
  2738. FMADD f9, f17, f22, f9
  2739. FMADD f13, f17, f23, f13
  2740. FMADD f2, f18, f20, f2
  2741. FMADD f6, f18, f21, f6
  2742. FMADD f10, f18, f22, f10
  2743. FMADD f14, f18, f23, f14
  2744. LFD f26, 30 * SIZE(AO)
  2745. LFD f30, 30 * SIZE(BO)
  2746. LFD f27, 31 * SIZE(AO)
  2747. LFD f31, 31 * SIZE(BO)
  2748. FMADD f3, f19, f20, f3
  2749. FMADD f7, f19, f21, f7
  2750. FMADD f11, f19, f22, f11
  2751. FMADD f15, f19, f23, f15
  /* Step 7 (offsets 28..31); preloads offsets 32..35 for the next pass. */
  2752. FMADD f0, f24, f28, f0
  2753. FMADD f4, f24, f29, f4
  2754. FMADD f8, f24, f30, f8
  2755. FMADD f12, f24, f31, f12
  2756. LFD f16, 32 * SIZE(AO)
  2757. LFD f20, 32 * SIZE(BO)
  2758. LFD f17, 33 * SIZE(AO)
  2759. LFD f21, 33 * SIZE(BO)
  2760. FMADD f1, f25, f28, f1
  2761. FMADD f5, f25, f29, f5
  2762. FMADD f9, f25, f30, f9
  2763. FMADD f13, f25, f31, f13
  2764. FMADD f2, f26, f28, f2
  2765. FMADD f6, f26, f29, f6
  2766. FMADD f10, f26, f30, f10
  2767. FMADD f14, f26, f31, f14
  2768. LFD f18, 34 * SIZE(AO)
  2769. LFD f22, 34 * SIZE(BO)
  2770. LFD f19, 35 * SIZE(AO)
  2771. LFD f23, 35 * SIZE(BO)
  /*
   * Pointers move here; the four FMADDs that follow use registers only,
   * so they are unaffected by the bump.
   */
  2772. addi AO, AO, 32 * SIZE
  2773. addi BO, BO, 32 * SIZE
  2774. FMADD f3, f27, f28, f3
  2775. FMADD f7, f27, f29, f7
  2776. FMADD f11, f27, f30, f11
  2777. FMADD f15, f27, f31, f15
  2778. bdnz LL(32)
  2779. .align 4
  /* LL(35): set up the leftover pass that follows the unrolled LL(32) loop. */
  /* CTR = count & 7, where the count is KK for LT/RN and TEMP otherwise. */
  2780. LL(35):
  2781. #if defined(LT) || defined(RN)
  2782. andi. r0, KK, 7
  2783. #else
  2784. andi. r0, TEMP, 7
  2785. #endif
  2786. mtspr CTR, r0
  /* CR0 still holds the andi. result (mtspr does not touch it), so a zero */
  /* remainder skips the LL(36) loop entirely. */
  2787. ble LL(38)
  2788. .align 4
  /* LL(36): remainder loop, one step per iteration, CTR iterations. */
  /* Operands f16-f19 (from AO) and f20-f23 (from BO) are already loaded on entry. */
  2789. LL(36):
  /* All 16 products of {f16..f19} x {f20..f23} are accumulated into f0-f15. */
  2790. FMADD f0, f16, f20, f0
  2791. FMADD f4, f16, f21, f4
  2792. FMADD f8, f16, f22, f8
  2793. FMADD f12, f16, f23, f12
  2794. FMADD f1, f17, f20, f1
  2795. FMADD f5, f17, f21, f5
  2796. FMADD f9, f17, f22, f9
  2797. FMADD f13, f17, f23, f13
  2798. FMADD f2, f18, f20, f2
  2799. FMADD f6, f18, f21, f6
  2800. FMADD f10, f18, f22, f10
  2801. FMADD f14, f18, f23, f14
  2802. FMADD f3, f19, f20, f3
  2803. FMADD f7, f19, f21, f7
  2804. FMADD f11, f19, f22, f11
  2805. FMADD f15, f19, f23, f15
  /* Load the operands for the next step (offsets 4..7 relative to the current */
  /* pointers), then advance AO and BO by 4 * SIZE each. */
  2806. LFD f16, 4 * SIZE(AO)
  2807. LFD f17, 5 * SIZE(AO)
  2808. LFD f18, 6 * SIZE(AO)
  2809. LFD f19, 7 * SIZE(AO)
  2810. LFD f20, 4 * SIZE(BO)
  2811. LFD f21, 5 * SIZE(BO)
  2812. LFD f22, 6 * SIZE(BO)
  2813. LFD f23, 7 * SIZE(BO)
  2814. addi BO, BO, 4 * SIZE
  2815. addi AO, AO, 4 * SIZE
  2816. bdnz LL(36)
  2817. .align 4
  /* LL(38): finish one tile of the two-column pass. */
  /* Steps: fold the 16 partial sums in f0-f15 into 8 results, subtract them */
  /* from the stored right-hand side, apply the 2x2 solve for the active */
  /* variant (LN / LT / RN / RT), write the result to the packed buffer and */
  /* to C, update pointers and KK, and loop to LL(31) while I > 0. */
  2818. LL(38):
  /* Combine cross products into complex results. Reading f16/f17 and f20/f21 */
  /* as (re, im) pairs - layout presumed interleaved, TODO confirm - the */
  /* non-CONJ form is re = re*re - im*im, im = im*re + re*im. */
  2819. #ifndef CONJ
  2820. FSUB f0, f0, f5
  2821. FADD f1, f1, f4
  2822. FSUB f2, f2, f7
  2823. FADD f3, f3, f6
  2824. FSUB f8, f8, f13
  2825. FADD f9, f9, f12
  2826. FSUB f10, f10, f15
  2827. FADD f11, f11, f14
  2828. #else
  /* CONJ form: the sign of the im*im term and of one cross term is flipped. */
  2829. FADD f0, f0, f5
  2830. FSUB f1, f4, f1
  2831. FADD f2, f2, f7
  2832. FSUB f3, f6, f3
  2833. FADD f8, f8, f13
  2834. FSUB f9, f12, f9
  2835. FADD f10, f10, f15
  2836. FSUB f11, f14, f11
  2837. #endif
  /* LN/RT: reposition AO and BO at byte offset (KK - 2) << (1 + ZBASE_SHIFT) */
  /* from AORIG and B respectively. */
  2838. #if defined(LN) || defined(RT)
  2839. subi r0, KK, 2
  2840. slwi r0, r0, 1 + ZBASE_SHIFT
  2841. add AO, AORIG, r0
  2842. add BO, B, r0
  2843. #endif
  /* Form (stored value - accumulated product). LN/LT read the stored values */
  /* from BO, in the order f0,f1,f8,f9,f2,f3,f10,f11. */
  2844. #if defined(LN) || defined(LT)
  2845. LFD f16, 0 * SIZE(BO)
  2846. LFD f17, 1 * SIZE(BO)
  2847. LFD f18, 2 * SIZE(BO)
  2848. LFD f19, 3 * SIZE(BO)
  2849. LFD f20, 4 * SIZE(BO)
  2850. LFD f21, 5 * SIZE(BO)
  2851. LFD f22, 6 * SIZE(BO)
  2852. LFD f23, 7 * SIZE(BO)
  2853. FSUB f0, f16, f0
  2854. FSUB f1, f17, f1
  2855. FSUB f8, f18, f8
  2856. FSUB f9, f19, f9
  2857. FSUB f2, f20, f2
  2858. FSUB f3, f21, f3
  2859. FSUB f10, f22, f10
  2860. FSUB f11, f23, f11
  2861. #else
  /* RN/RT read the stored values from AO, in the order f0..f3, f8..f11. */
  2862. LFD f16, 0 * SIZE(AO)
  2863. LFD f17, 1 * SIZE(AO)
  2864. LFD f18, 2 * SIZE(AO)
  2865. LFD f19, 3 * SIZE(AO)
  2866. LFD f20, 4 * SIZE(AO)
  2867. LFD f21, 5 * SIZE(AO)
  2868. LFD f22, 6 * SIZE(AO)
  2869. LFD f23, 7 * SIZE(AO)
  2870. #ifndef CONJ
  2871. FSUB f0, f16, f0
  2872. FSUB f1, f17, f1
  2873. FSUB f2, f18, f2
  2874. FSUB f3, f19, f3
  2875. FSUB f8, f20, f8
  2876. FSUB f9, f21, f9
  2877. FSUB f10, f22, f10
  2878. FSUB f11, f23, f11
  2879. #else
  /* CONJ: the odd-numbered (imaginary-slot) registers are added instead. */
  2880. FSUB f0, f16, f0
  2881. FADD f1, f17, f1
  2882. FSUB f2, f18, f2
  2883. FADD f3, f19, f3
  2884. FSUB f8, f20, f8
  2885. FADD f9, f21, f9
  2886. FSUB f10, f22, f10
  2887. FADD f11, f23, f11
  2888. #endif
  2889. #endif
  /* LN: coefficients come from AO. Order of work: */
  /* 1. (f2,f3) and (f10,f11) are multiplied by the pair at AO[6..7]; */
  /* 2. the pair at AO[4..5] times those results is removed from (f0,f1) and (f8,f9); */
  /* 3. (f0,f1) and (f8,f9) are multiplied by the pair at AO[0..1]. */
  /* NOTE(review): the diagonal pairs are multiplied, not divided - presumably */
  /* the packed data already stores their inverses; confirm against the packer. */
  2890. #ifdef LN
  2891. LFD f16, 6 * SIZE(AO)
  2892. LFD f17, 7 * SIZE(AO)
  2893. LFD f18, 4 * SIZE(AO)
  2894. LFD f19, 5 * SIZE(AO)
  2895. LFD f20, 0 * SIZE(AO)
  2896. LFD f21, 1 * SIZE(AO)
  2897. FMUL f6, f17, f3
  2898. FMUL f7, f17, f2
  2899. FMUL f14, f17, f11
  2900. FMUL f15, f17, f10
  2901. #ifndef CONJ
  2902. FMSUB f2, f16, f2, f6
  2903. FMADD f3, f16, f3, f7
  2904. FMSUB f10, f16, f10, f14
  2905. FMADD f11, f16, f11, f15
  2906. FMADD f0, f19, f3, f0
  2907. FNMSUB f1, f19, f2, f1
  2908. FMADD f8, f19, f11, f8
  2909. FNMSUB f9, f19, f10, f9
  2910. FNMSUB f0, f18, f2, f0
  2911. FNMSUB f1, f18, f3, f1
  2912. FNMSUB f8, f18, f10, f8
  2913. FNMSUB f9, f18, f11, f9
  2914. FMUL f4, f21, f1
  2915. FMUL f5, f21, f0
  2916. FMUL f12, f21, f9
  2917. FMUL f13, f21, f8
  2918. FMSUB f0, f20, f0, f4
  2919. FMADD f1, f20, f1, f5
  2920. FMSUB f8, f20, f8, f12
  2921. FMADD f9, f20, f9, f13
  2922. #else
  /* CONJ: same three steps with the sign pattern of a conjugated coefficient. */
  2923. FMADD f2, f16, f2, f6
  2924. FMSUB f3, f16, f3, f7
  2925. FMADD f10, f16, f10, f14
  2926. FMSUB f11, f16, f11, f15
  2927. FMSUB f0, f19, f3, f0
  2928. FNMADD f1, f19, f2, f1
  2929. FMSUB f8, f19, f11, f8
  2930. FNMADD f9, f19, f10, f9
  2931. FNMADD f0, f18, f2, f0
  2932. FNMADD f1, f18, f3, f1
  2933. FNMADD f8, f18, f10, f8
  2934. FNMADD f9, f18, f11, f9
  2935. FMUL f4, f21, f1
  2936. FMUL f5, f21, f0
  2937. FMUL f12, f21, f9
  2938. FMUL f13, f21, f8
  2939. FMADD f0, f20, f0, f4
  2940. FMSUB f1, f20, f1, f5
  2941. FMADD f8, f20, f8, f12
  2942. FMSUB f9, f20, f9, f13
  2943. #endif
  2944. #endif
  /* LT: coefficients come from AO, processed in the opposite order to LN: */
  /* multiply (f0,f1),(f8,f9) by AO[0..1]; remove AO[2..3] times those from */
  /* (f2,f3),(f10,f11); multiply (f2,f3),(f10,f11) by AO[6..7]. */
  2945. #ifdef LT
  2946. LFD f16, 0 * SIZE(AO)
  2947. LFD f17, 1 * SIZE(AO)
  2948. LFD f18, 2 * SIZE(AO)
  2949. LFD f19, 3 * SIZE(AO)
  2950. LFD f20, 6 * SIZE(AO)
  2951. LFD f21, 7 * SIZE(AO)
  2952. FMUL f4, f17, f1
  2953. FMUL f5, f17, f0
  2954. FMUL f12, f17, f9
  2955. FMUL f13, f17, f8
  2956. #ifndef CONJ
  2957. FMSUB f0, f16, f0, f4
  2958. FMADD f1, f16, f1, f5
  2959. FMSUB f8, f16, f8, f12
  2960. FMADD f9, f16, f9, f13
  2961. FMADD f2, f19, f1, f2
  2962. FNMSUB f3, f19, f0, f3
  2963. FMADD f10, f19, f9, f10
  2964. FNMSUB f11, f19, f8, f11
  2965. FNMSUB f2, f18, f0, f2
  2966. FNMSUB f3, f18, f1, f3
  2967. FNMSUB f10, f18, f8, f10
  2968. FNMSUB f11, f18, f9, f11
  2969. FMUL f4, f21, f3
  2970. FMUL f5, f21, f2
  2971. FMUL f12, f21, f11
  2972. FMUL f13, f21, f10
  2973. FMSUB f2, f20, f2, f4
  2974. FMADD f3, f20, f3, f5
  2975. FMSUB f10, f20, f10, f12
  2976. FMADD f11, f20, f11, f13
  2977. #else
  2978. FMADD f0, f16, f0, f4
  2979. FMSUB f1, f16, f1, f5
  2980. FMADD f8, f16, f8, f12
  2981. FMSUB f9, f16, f9, f13
  2982. FMSUB f2, f19, f1, f2
  2983. FNMADD f3, f19, f0, f3
  2984. FMSUB f10, f19, f9, f10
  2985. FNMADD f11, f19, f8, f11
  2986. FNMADD f2, f18, f0, f2
  2987. FNMADD f3, f18, f1, f3
  2988. FNMADD f10, f18, f8, f10
  2989. FNMADD f11, f18, f9, f11
  2990. FMUL f4, f21, f3
  2991. FMUL f5, f21, f2
  2992. FMUL f12, f21, f11
  2993. FMUL f13, f21, f10
  2994. FMADD f2, f20, f2, f4
  2995. FMSUB f3, f20, f3, f5
  2996. FMADD f10, f20, f10, f12
  2997. FMSUB f11, f20, f11, f13
  2998. #endif
  2999. #endif
  /* RN: coefficients come from BO. Multiply (f0,f1),(f2,f3) by BO[0..1]; */
  /* remove BO[2..3] times those from (f8,f9),(f10,f11); multiply */
  /* (f8,f9),(f10,f11) by BO[6..7]. */
  3000. #ifdef RN
  3001. LFD f16, 0 * SIZE(BO)
  3002. LFD f17, 1 * SIZE(BO)
  3003. LFD f18, 2 * SIZE(BO)
  3004. LFD f19, 3 * SIZE(BO)
  3005. LFD f20, 6 * SIZE(BO)
  3006. LFD f21, 7 * SIZE(BO)
  3007. FMUL f4, f17, f1
  3008. FMUL f5, f17, f0
  3009. FMUL f6, f17, f3
  3010. FMUL f7, f17, f2
  3011. #ifndef CONJ
  3012. FMSUB f0, f16, f0, f4
  3013. FMADD f1, f16, f1, f5
  3014. FMSUB f2, f16, f2, f6
  3015. FMADD f3, f16, f3, f7
  3016. FMADD f8, f19, f1, f8
  3017. FNMSUB f9, f19, f0, f9
  3018. FMADD f10, f19, f3, f10
  3019. FNMSUB f11, f19, f2, f11
  3020. FNMSUB f8, f18, f0, f8
  3021. FNMSUB f9, f18, f1, f9
  3022. FNMSUB f10, f18, f2, f10
  3023. FNMSUB f11, f18, f3, f11
  3024. FMUL f4, f21, f9
  3025. FMUL f5, f21, f8
  3026. FMUL f6, f21, f11
  3027. FMUL f7, f21, f10
  3028. FMSUB f8, f20, f8, f4
  3029. FMADD f9, f20, f9, f5
  3030. FMSUB f10, f20, f10, f6
  3031. FMADD f11, f20, f11, f7
  3032. #else
  3033. FMADD f0, f16, f0, f4
  3034. FMSUB f1, f16, f1, f5
  3035. FMADD f2, f16, f2, f6
  3036. FMSUB f3, f16, f3, f7
  3037. FMSUB f8, f19, f1, f8
  3038. FNMADD f9, f19, f0, f9
  3039. FMSUB f10, f19, f3, f10
  3040. FNMADD f11, f19, f2, f11
  3041. FNMADD f8, f18, f0, f8
  3042. FNMADD f9, f18, f1, f9
  3043. FNMADD f10, f18, f2, f10
  3044. FNMADD f11, f18, f3, f11
  3045. FMUL f4, f21, f9
  3046. FMUL f5, f21, f8
  3047. FMUL f6, f21, f11
  3048. FMUL f7, f21, f10
  3049. FMADD f8, f20, f8, f4
  3050. FMSUB f9, f20, f9, f5
  3051. FMADD f10, f20, f10, f6
  3052. FMSUB f11, f20, f11, f7
  3053. #endif
  3054. #endif
  /* RT: coefficients come from BO, processed in the opposite order to RN: */
  /* multiply (f8,f9),(f10,f11) by BO[6..7]; remove BO[4..5] times those from */
  /* (f0,f1),(f2,f3); multiply (f0,f1),(f2,f3) by BO[0..1]. */
  3055. #ifdef RT
  3056. LFD f16, 6 * SIZE(BO)
  3057. LFD f17, 7 * SIZE(BO)
  3058. LFD f18, 4 * SIZE(BO)
  3059. LFD f19, 5 * SIZE(BO)
  3060. LFD f20, 0 * SIZE(BO)
  3061. LFD f21, 1 * SIZE(BO)
  3062. FMUL f12, f17, f9
  3063. FMUL f13, f17, f8
  3064. FMUL f14, f17, f11
  3065. FMUL f15, f17, f10
  3066. #ifndef CONJ
  3067. FMSUB f8, f16, f8, f12
  3068. FMADD f9, f16, f9, f13
  3069. FMSUB f10, f16, f10, f14
  3070. FMADD f11, f16, f11, f15
  3071. FMADD f0, f19, f9, f0
  3072. FNMSUB f1, f19, f8, f1
  3073. FMADD f2, f19, f11, f2
  3074. FNMSUB f3, f19, f10, f3
  3075. FNMSUB f0, f18, f8, f0
  3076. FNMSUB f1, f18, f9, f1
  3077. FNMSUB f2, f18, f10, f2
  3078. FNMSUB f3, f18, f11, f3
  3079. FMUL f4, f21, f1
  3080. FMUL f5, f21, f0
  3081. FMUL f6, f21, f3
  3082. FMUL f7, f21, f2
  3083. FMSUB f0, f20, f0, f4
  3084. FMADD f1, f20, f1, f5
  3085. FMSUB f2, f20, f2, f6
  3086. FMADD f3, f20, f3, f7
  3087. #else
  3088. FMADD f8, f16, f8, f12
  3089. FMSUB f9, f16, f9, f13
  3090. FMADD f10, f16, f10, f14
  3091. FMSUB f11, f16, f11, f15
  3092. FMSUB f0, f19, f9, f0
  3093. FNMADD f1, f19, f8, f1
  3094. FMSUB f2, f19, f11, f2
  3095. FNMADD f3, f19, f10, f3
  3096. FNMADD f0, f18, f8, f0
  3097. FNMADD f1, f18, f9, f1
  3098. FNMADD f2, f18, f10, f2
  3099. FNMADD f3, f18, f11, f3
  3100. FMUL f4, f21, f1
  3101. FMUL f5, f21, f0
  3102. FMUL f6, f21, f3
  3103. FMUL f7, f21, f2
  3104. FMADD f0, f20, f0, f4
  3105. FMSUB f1, f20, f1, f5
  3106. FMADD f2, f20, f2, f6
  3107. FMSUB f3, f20, f3, f7
  3108. #endif
  3109. #endif
  /* LN steps the C pointers back by 4 * SIZE before storing; every other */
  /* variant advances them after the store below. */
  3110. #ifdef LN
  3111. subi CO1, CO1, 4 * SIZE
  3112. subi CO2, CO2, 4 * SIZE
  3113. #endif
  /* Write the solved values back where the right-hand side was read from, */
  /* using the same register order as the loads above. */
  3114. #if defined(LN) || defined(LT)
  3115. STFD f0, 0 * SIZE(BO)
  3116. STFD f1, 1 * SIZE(BO)
  3117. STFD f8, 2 * SIZE(BO)
  3118. STFD f9, 3 * SIZE(BO)
  3119. STFD f2, 4 * SIZE(BO)
  3120. STFD f3, 5 * SIZE(BO)
  3121. STFD f10, 6 * SIZE(BO)
  3122. STFD f11, 7 * SIZE(BO)
  3123. #else
  3124. STFD f0, 0 * SIZE(AO)
  3125. STFD f1, 1 * SIZE(AO)
  3126. STFD f2, 2 * SIZE(AO)
  3127. STFD f3, 3 * SIZE(AO)
  3128. STFD f8, 4 * SIZE(AO)
  3129. STFD f9, 5 * SIZE(AO)
  3130. STFD f10, 6 * SIZE(AO)
  3131. STFD f11, 7 * SIZE(AO)
  3132. #endif
  /* Store the same values to C: f0-f3 through CO1, f8-f11 through CO2. */
  3133. STFD f0, 0 * SIZE(CO1)
  3134. STFD f1, 1 * SIZE(CO1)
  3135. STFD f2, 2 * SIZE(CO1)
  3136. STFD f3, 3 * SIZE(CO1)
  3137. STFD f8, 0 * SIZE(CO2)
  3138. STFD f9, 1 * SIZE(CO2)
  3139. STFD f10, 2 * SIZE(CO2)
  3140. STFD f11, 3 * SIZE(CO2)
  3141. #ifndef LN
  3142. addi CO1, CO1, 4 * SIZE
  3143. addi CO2, CO2, 4 * SIZE
  3144. #endif
  /* RT: AORIG += K << (1 + ZBASE_SHIFT). */
  3145. #ifdef RT
  3146. slwi r0, K, 1 + ZBASE_SHIFT
  3147. add AORIG, AORIG, r0
  3148. #endif
  /* LT/RN: move AO and BO forward by (K - KK) << (1 + ZBASE_SHIFT) bytes. */
  3149. #if defined(LT) || defined(RN)
  3150. sub TEMP, K, KK
  3151. slwi TEMP, TEMP, 1 + ZBASE_SHIFT
  3152. add AO, AO, TEMP
  3153. add BO, BO, TEMP
  3154. #endif
  /* KK moves by 2 per tile: up for LT, down for LN. */
  3155. #ifdef LT
  3156. addi KK, KK, 2
  3157. #endif
  3158. #ifdef LN
  3159. subi KK, KK, 2
  3160. #endif
  /* Decrement the tile counter I and repeat from LL(31) while it is positive. */
  3161. addic. I, I, -1
  3162. bgt LL(31)
  3163. .align 4
  /* LL(49): bookkeeping after the tile loop above finishes (fall-through once */
  /* I reaches zero), before execution continues into LL(50). */
  3164. LL(49):
  /* LN: B += K << (1 + ZBASE_SHIFT). */
  3165. #ifdef LN
  3166. slwi r0, K, 1 + ZBASE_SHIFT
  3167. add B, B, r0
  3168. #endif
  /* LT/RN: B continues from wherever BO ended up. */
  3169. #if defined(LT) || defined(RN)
  3170. mr B, BO
  3171. #endif
  /* KK moves by 2 here for the right-side variants: up for RN, down for RT. */
  3172. #ifdef RN
  3173. addi KK, KK, 2
  3174. #endif
  3175. #ifdef RT
  3176. subi KK, KK, 2
  3177. #endif
  3178. .align 4
  /* LL(50): single-column pass, taken only when N is odd (J = N & 1); */
  /* otherwise control goes straight to the exit at LL(999). */
  3179. LL(50):
  3180. andi. J, N, 1
  3181. ble LL(999)
  /* RT: step B back by K << ZBASE_SHIFT and C back by LDC before use. */
  3182. #ifdef RT
  3183. slwi r0, K, 0 + ZBASE_SHIFT
  3184. sub B, B, r0
  3185. sub C, C, LDC
  3186. #endif
  3187. mr CO1, C
  /* Starting KK: M + OFFSET for LN, OFFSET for LT (RN/RT leave KK unchanged). */
  3188. #ifdef LN
  3189. add KK, M, OFFSET
  3190. #endif
  3191. #ifdef LT
  3192. mr KK, OFFSET
  3193. #endif
  /* LN/RT address A through AORIG; LT/RN walk AO directly. */
  3194. #if defined(LN) || defined(RT)
  3195. mr AORIG, A
  3196. #else
  3197. mr AO, A
  3198. #endif
  3199. #ifndef RT
  3200. add C, C, LDC
  3201. #endif
  /* The odd-row tile (M & 1) is processed first; if M is even skip to LL(60). */
  3202. andi. I, M, 1
  3203. ble LL(60)
  /* LT/RN: preload 4 values from AO and 4 from B, clear accumulators f0-f7, */
  /* and set CTR = KK >> 2 for the 4-step unrolled loop at LL(62). */
  /* FZERO is presumably a memory slot holding 0.0 - defined outside this view. */
  3204. #if defined(LT) || defined(RN)
  3205. LFD f16, 0 * SIZE(AO)
  3206. LFD f17, 1 * SIZE(AO)
  3207. LFD f18, 2 * SIZE(AO)
  3208. LFD f19, 3 * SIZE(AO)
  3209. LFD f20, 0 * SIZE(B)
  3210. LFD f21, 1 * SIZE(B)
  3211. LFD f22, 2 * SIZE(B)
  3212. LFD f23, 3 * SIZE(B)
  3213. lfs f0, FZERO
  3214. fmr f1, f0
  3215. fmr f2, f0
  3216. fmr f3, f0
  3217. fmr f4, f0
  3218. fmr f5, f0
  3219. fmr f6, f0
  3220. fmr f7, f0
  3221. srawi. r0, KK, 2
  3222. mr BO, B
  3223. mtspr CTR, r0
  3224. #else
  /* LN/RT: (LN only) AORIG -= K << ZBASE_SHIFT; then AO = AORIG + (KK << ZBASE_SHIFT), */
  /* BO = B + (KK << ZBASE_SHIFT), and the loop length is TEMP = K - KK. */
  3225. #ifdef LN
  3226. slwi r0, K, 0 + ZBASE_SHIFT
  3227. sub AORIG, AORIG, r0
  3228. #endif
  3229. slwi r0, KK, 0 + ZBASE_SHIFT
  3230. add AO, AORIG, r0
  3231. add BO, B, r0
  3232. sub TEMP, K, KK
  3233. LFD f16, 0 * SIZE(AO)
  3234. LFD f17, 1 * SIZE(AO)
  3235. LFD f18, 2 * SIZE(AO)
  3236. LFD f19, 3 * SIZE(AO)
  3237. LFD f20, 0 * SIZE(BO)
  3238. LFD f21, 1 * SIZE(BO)
  3239. LFD f22, 2 * SIZE(BO)
  3240. LFD f23, 3 * SIZE(BO)
  3241. lfs f0, FZERO
  3242. fmr f1, f0
  3243. fmr f2, f0
  3244. fmr f3, f0
  3245. fmr f4, f0
  3246. fmr f5, f0
  3247. fmr f6, f0
  3248. fmr f7, f0
  3249. srawi. r0, TEMP, 2
  3250. mtspr CTR, r0
  3251. #endif
  /* The srawi. result is still in CR0: no full group of 4 steps means skip LL(62). */
  3252. ble LL(65)
  3253. .align 4
  /* LL(62): unrolled loop for the single-element tile, 4 steps per iteration. */
  /* Two accumulator sets are alternated: f0-f3 take the steps held in */
  /* f16/f17 x f20/f21, f4-f7 take the steps held in f18/f19 x f22/f23. */
  3254. LL(62):
  3255. FMADD f0, f16, f20, f0
  3256. FMADD f1, f17, f21, f1
  3257. FMADD f2, f17, f20, f2
  3258. FMADD f3, f16, f21, f3
  3259. LFD f16, 4 * SIZE(AO)
  3260. LFD f17, 5 * SIZE(AO)
  3261. LFD f20, 4 * SIZE(BO)
  3262. LFD f21, 5 * SIZE(BO)
  3263. FMADD f4, f18, f22, f4
  3264. FMADD f5, f19, f23, f5
  3265. FMADD f6, f19, f22, f6
  3266. FMADD f7, f18, f23, f7
  3267. LFD f18, 6 * SIZE(AO)
  3268. LFD f19, 7 * SIZE(AO)
  3269. LFD f22, 6 * SIZE(BO)
  3270. LFD f23, 7 * SIZE(BO)
  3271. FMADD f0, f16, f20, f0
  3272. FMADD f1, f17, f21, f1
  3273. FMADD f2, f17, f20, f2
  3274. FMADD f3, f16, f21, f3
  /* Offsets 8..11 are the first two steps of the next iteration (both */
  /* pointers advance by 8 * SIZE just below). */
  3275. LFD f16, 8 * SIZE(AO)
  3276. LFD f17, 9 * SIZE(AO)
  3277. LFD f20, 8 * SIZE(BO)
  3278. LFD f21, 9 * SIZE(BO)
  3279. FMADD f4, f18, f22, f4
  3280. FMADD f5, f19, f23, f5
  3281. FMADD f6, f19, f22, f6
  3282. FMADD f7, f18, f23, f7
  3283. LFD f18, 10 * SIZE(AO)
  3284. LFD f19, 11 * SIZE(AO)
  3285. LFD f22, 10 * SIZE(BO)
  3286. LFD f23, 11 * SIZE(BO)
  3287. addi AO, AO, 8 * SIZE
  3288. addi BO, BO, 8 * SIZE
  3289. bdnz LL(62)
  3290. .align 4
  /* LL(65): merge the second accumulator set (f4-f7) into f0-f3, then set */
  /* CTR to the leftover step count (count & 3) for the loop at LL(66). */
  3291. LL(65):
  3292. fadd f0, f0, f4
  3293. fadd f1, f1, f5
  3294. fadd f2, f2, f6
  3295. fadd f3, f3, f7
  3296. #if defined(LT) || defined(RN)
  3297. andi. r0, KK, 3
  3298. #else
  3299. andi. r0, TEMP, 3
  3300. #endif
  3301. mtspr CTR,r0
  /* Zero leftover steps: go directly to the solve at LL(67). */
  3302. ble LL(67)
  3303. .align 4
  /* LL(66): leftover loop for the single-element tile, one step per iteration. */
  /* f16/f17 (from AO) and f20/f21 (from BO) are already loaded on entry. */
  3304. LL(66):
  3305. FMADD f0, f16, f20, f0
  3306. FMADD f1, f17, f21, f1
  3307. FMADD f2, f17, f20, f2
  3308. FMADD f3, f16, f21, f3
  /* Load the next step's pair from each stream and advance by 2 * SIZE. */
  3309. LFD f16, 2 * SIZE(AO)
  3310. LFD f17, 3 * SIZE(AO)
  3311. LFD f20, 2 * SIZE(BO)
  3312. LFD f21, 3 * SIZE(BO)
  3313. addi AO, AO, 2 * SIZE
  3314. addi BO, BO, 2 * SIZE
  3315. bdnz LL(66)
  3316. .align 4
  /* LL(67): finish the single-element tile of the single-column pass: */
  /* reduce the four partial sums to one (f0,f1) pair, subtract it from the */
  /* stored right-hand side, multiply by the variant's coefficient pair, and */
  /* store the result to the packed buffer and to C. */
  3317. LL(67):
  /* f0 = sum(f16*f20), f1 = sum(f17*f21), f2 = sum(f17*f20), f3 = sum(f16*f21). */
  /* Read as (re, im) pairs - layout presumed, TODO confirm - this yields the */
  /* complex product; the CONJ form flips the sign pattern. */
  3318. #ifndef CONJ
  3319. FSUB f0, f0, f1
  3320. FADD f1, f2, f3
  3321. #else
  3322. FADD f0, f0, f1
  3323. FSUB f1, f3, f2
  3324. #endif
  /* LN/RT: reposition AO and BO at byte offset (KK - 1) << ZBASE_SHIFT. */
  3325. #if defined(LN) || defined(RT)
  3326. subi r0, KK, 1
  3327. slwi r0, r0, 0 + ZBASE_SHIFT
  3328. add AO, AORIG, r0
  3329. add BO, B, r0
  3330. #endif
  /* (stored value - product): LN/LT read it from BO, RN/RT from AO. */
  3331. #if defined(LN) || defined(LT)
  3332. LFD f16, 0 * SIZE(BO)
  3333. LFD f17, 1 * SIZE(BO)
  3334. FSUB f0, f16, f0
  3335. FSUB f1, f17, f1
  3336. #else
  3337. LFD f16, 0 * SIZE(AO)
  3338. LFD f17, 1 * SIZE(AO)
  3339. #ifndef CONJ
  3340. FSUB f0, f16, f0
  3341. FSUB f1, f17, f1
  3342. #else
  3343. FSUB f0, f16, f0
  3344. FADD f1, f17, f1
  3345. #endif
  3346. #endif
  /* Each variant multiplies (f0,f1) by one coefficient pair: AO[0..1] for */
  /* LN/LT, BO[0..1] for RN/RT. */
  /* NOTE(review): a multiply, not a divide - presumably the packed coefficient */
  /* is already inverted; confirm against the packing routine. */
  3347. #ifdef LN
  3348. LFD f20, 0 * SIZE(AO)
  3349. LFD f21, 1 * SIZE(AO)
  3350. FMUL f4, f21, f1
  3351. FMUL f5, f21, f0
  3352. #ifndef CONJ
  3353. FMSUB f0, f20, f0, f4
  3354. FMADD f1, f20, f1, f5
  3355. #else
  3356. FMADD f0, f20, f0, f4
  3357. FMSUB f1, f20, f1, f5
  3358. #endif
  3359. #endif
  3360. #ifdef LT
  3361. LFD f16, 0 * SIZE(AO)
  3362. LFD f17, 1 * SIZE(AO)
  3363. FMUL f4, f17, f1
  3364. FMUL f5, f17, f0
  3365. #ifndef CONJ
  3366. FMSUB f0, f16, f0, f4
  3367. FMADD f1, f16, f1, f5
  3368. #else
  3369. FMADD f0, f16, f0, f4
  3370. FMSUB f1, f16, f1, f5
  3371. #endif
  3372. #endif
  3373. #ifdef RN
  3374. LFD f16, 0 * SIZE(BO)
  3375. LFD f17, 1 * SIZE(BO)
  3376. FMUL f4, f17, f1
  3377. FMUL f5, f17, f0
  3378. #ifndef CONJ
  3379. FMSUB f0, f16, f0, f4
  3380. FMADD f1, f16, f1, f5
  3381. #else
  3382. FMADD f0, f16, f0, f4
  3383. FMSUB f1, f16, f1, f5
  3384. #endif
  3385. #endif
  3386. #ifdef RT
  3387. LFD f20, 0 * SIZE(BO)
  3388. LFD f21, 1 * SIZE(BO)
  3389. FMUL f4, f21, f1
  3390. FMUL f5, f21, f0
  3391. #ifndef CONJ
  3392. FMSUB f0, f20, f0, f4
  3393. FMADD f1, f20, f1, f5
  3394. #else
  3395. FMADD f0, f20, f0, f4
  3396. FMSUB f1, f20, f1, f5
  3397. #endif
  3398. #endif
  /* LN steps CO1 back by 2 * SIZE before the store; other variants advance after. */
  3399. #ifdef LN
  3400. subi CO1, CO1, 2 * SIZE
  3401. #endif
  /* Write the result back where the right-hand side was read, then to C. */
  3402. #if defined(LN) || defined(LT)
  3403. STFD f0, 0 * SIZE(BO)
  3404. STFD f1, 1 * SIZE(BO)
  3405. #else
  3406. STFD f0, 0 * SIZE(AO)
  3407. STFD f1, 1 * SIZE(AO)
  3408. #endif
  3409. STFD f0, 0 * SIZE(CO1)
  3410. STFD f1, 1 * SIZE(CO1)
  3411. #ifndef LN
  3412. addi CO1, CO1, 2 * SIZE
  3413. #endif
  /* RT: AORIG += K << ZBASE_SHIFT. */
  3414. #ifdef RT
  3415. slwi r0, K, 0 + ZBASE_SHIFT
  3416. add AORIG, AORIG, r0
  3417. #endif
  /* LT/RN: move AO and BO forward by (K - KK) << ZBASE_SHIFT bytes. */
  3418. #if defined(LT) || defined(RN)
  3419. sub TEMP, K, KK
  3420. slwi TEMP, TEMP, 0 + ZBASE_SHIFT
  3421. add AO, AO, TEMP
  3422. add BO, BO, TEMP
  3423. #endif
  /* KK moves by 1 for this tile: up for LT, down for LN. */
  3424. #ifdef LT
  3425. addi KK, KK, 1
  3426. #endif
  3427. #ifdef LN
  3428. subi KK, KK, 1
  3429. #endif
  3430. .align 4
  /* LL(60): I = M >> 1 is the number of two-row tiles in this column; */
  /* if there are none, skip to the end-of-column bookkeeping at LL(69). */
  3431. LL(60):
  3432. srawi. I, M, 1
  3433. ble LL(69)
  3434. .align 4
  /* LL(51): top of the two-row tile loop of the single-column pass. */
  /* Preloads 8 values from AO (f20-f27, i.e. two steps) and 4 values from the */
  /* B stream (f16-f19), clears accumulators f0-f7 and sets CTR to the number */
  /* of 4-step groups. FZERO is presumably a slot holding 0.0 (defined elsewhere). */
  3435. LL(51):
  3436. #if defined(LT) || defined(RN)
  3437. LFD f20, 0 * SIZE(AO)
  3438. LFD f21, 1 * SIZE(AO)
  3439. LFD f22, 2 * SIZE(AO)
  3440. LFD f23, 3 * SIZE(AO)
  3441. LFD f24, 4 * SIZE(AO)
  3442. LFD f25, 5 * SIZE(AO)
  3443. LFD f26, 6 * SIZE(AO)
  3444. LFD f27, 7 * SIZE(AO)
  3445. LFD f16, 0 * SIZE(B)
  3446. LFD f17, 1 * SIZE(B)
  3447. LFD f18, 2 * SIZE(B)
  3448. LFD f19, 3 * SIZE(B)
  3449. lfs f0, FZERO
  3450. fmr f1, f0
  3451. fmr f2, f0
  3452. fmr f3, f0
  3453. fmr f4, f0
  3454. fmr f5, f0
  3455. fmr f6, f0
  3456. fmr f7, f0
  /* Cache-touch hint for the C location (only issued on the LT/RN path). */
  3457. dcbt CO1, PREC
  3458. srawi. r0, KK, 2
  3459. mr BO, B
  3460. mtspr CTR, r0
  3461. #else
  /* LN/RT: (LN only) AORIG -= K << (1 + ZBASE_SHIFT). AO is offset by */
  /* KK << (1 + ZBASE_SHIFT) and BO by KK << ZBASE_SHIFT, because the A stream */
  /* carries twice as many values per step as the B stream here. */
  /* The loop length is TEMP = K - KK. */
  3462. #ifdef LN
  3463. slwi r0, K, 1 + ZBASE_SHIFT
  3464. sub AORIG, AORIG, r0
  3465. #endif
  3466. slwi r0, KK, 1 + ZBASE_SHIFT
  3467. slwi TEMP, KK, 0 + ZBASE_SHIFT
  3468. add AO, AORIG, r0
  3469. add BO, B, TEMP
  3470. sub TEMP, K, KK
  3471. LFD f20, 0 * SIZE(AO)
  3472. LFD f21, 1 * SIZE(AO)
  3473. LFD f22, 2 * SIZE(AO)
  3474. LFD f23, 3 * SIZE(AO)
  3475. LFD f24, 4 * SIZE(AO)
  3476. LFD f25, 5 * SIZE(AO)
  3477. LFD f26, 6 * SIZE(AO)
  3478. LFD f27, 7 * SIZE(AO)
  3479. LFD f16, 0 * SIZE(BO)
  3480. LFD f17, 1 * SIZE(BO)
  3481. LFD f18, 2 * SIZE(BO)
  3482. LFD f19, 3 * SIZE(BO)
  3483. lfs f0, FZERO
  3484. fmr f1, f0
  3485. fmr f2, f0
  3486. fmr f3, f0
  3487. fmr f4, f0
  3488. fmr f5, f0
  3489. fmr f6, f0
  3490. fmr f7, f0
  3491. srawi. r0, TEMP, 2
  3492. mtspr CTR, r0
  3493. #endif
  /* The srawi. result is still in CR0: no full group of 4 steps means skip LL(52). */
  3494. ble LL(55)
  3495. .align 4
  /* LL(52): unrolled loop for the two-row tile, 4 steps per iteration. */
  /* Per step, two B values (f16/f17 or f18/f19) are multiplied with four A */
  /* values (f20-f23 or f24-f27) and accumulated into f0-f7. */
  3496. LL(52):
  3497. FMADD f0, f16, f20, f0
  3498. FMADD f1, f16, f21, f1
  3499. FMADD f2, f16, f22, f2
  3500. FMADD f3, f16, f23, f3
  3501. FMADD f4, f17, f20, f4
  3502. FMADD f5, f17, f21, f5
  3503. FMADD f6, f17, f22, f6
  3504. FMADD f7, f17, f23, f7
  3505. LFD f20, 8 * SIZE(AO)
  3506. LFD f21, 9 * SIZE(AO)
  3507. LFD f22, 10 * SIZE(AO)
  3508. LFD f23, 11 * SIZE(AO)
  3509. FMADD f0, f18, f24, f0
  3510. FMADD f1, f18, f25, f1
  3511. FMADD f2, f18, f26, f2
  3512. FMADD f3, f18, f27, f3
  3513. FMADD f4, f19, f24, f4
  3514. FMADD f5, f19, f25, f5
  3515. FMADD f6, f19, f26, f6
  3516. FMADD f7, f19, f27, f7
  3517. LFD f24, 12 * SIZE(AO)
  3518. LFD f25, 13 * SIZE(AO)
  3519. LFD f26, 14 * SIZE(AO)
  3520. LFD f27, 15 * SIZE(AO)
  3521. LFD f16, 4 * SIZE(BO)
  3522. LFD f17, 5 * SIZE(BO)
  3523. LFD f18, 6 * SIZE(BO)
  3524. LFD f19, 7 * SIZE(BO)
  3525. FMADD f0, f16, f20, f0
  3526. FMADD f1, f16, f21, f1
  3527. FMADD f2, f16, f22, f2
  3528. FMADD f3, f16, f23, f3
  3529. FMADD f4, f17, f20, f4
  3530. FMADD f5, f17, f21, f5
  3531. FMADD f6, f17, f22, f6
  3532. FMADD f7, f17, f23, f7
  /* Offsets 16..23 (AO) and 8..11 (BO) are the operands for the first two */
  /* steps of the next iteration. */
  3533. LFD f20, 16 * SIZE(AO)
  3534. LFD f21, 17 * SIZE(AO)
  3535. LFD f22, 18 * SIZE(AO)
  3536. LFD f23, 19 * SIZE(AO)
  3537. FMADD f0, f18, f24, f0
  3538. FMADD f1, f18, f25, f1
  3539. FMADD f2, f18, f26, f2
  3540. FMADD f3, f18, f27, f3
  3541. FMADD f4, f19, f24, f4
  3542. FMADD f5, f19, f25, f5
  3543. FMADD f6, f19, f26, f6
  3544. FMADD f7, f19, f27, f7
  3545. LFD f24, 20 * SIZE(AO)
  3546. LFD f25, 21 * SIZE(AO)
  3547. LFD f26, 22 * SIZE(AO)
  3548. LFD f27, 23 * SIZE(AO)
  3549. LFD f16, 8 * SIZE(BO)
  3550. LFD f17, 9 * SIZE(BO)
  3551. LFD f18, 10 * SIZE(BO)
  3552. LFD f19, 11 * SIZE(BO)
  /* AO advances twice as far as BO: 16 * SIZE versus 8 * SIZE per iteration. */
  3553. addi AO, AO, 16 * SIZE
  3554. addi BO, BO, 8 * SIZE
  /* Cache-touch hints at PREA + AO and PREA + BO. */
  3555. dcbt PREA, AO
  3556. dcbt PREA, BO
  3557. bdnz LL(52)
  3558. .align 4
  /* LL(55): CTR = leftover step count (count & 3) for the loop at LL(56); */
  /* the count is KK for LT/RN and TEMP otherwise. */
  3559. LL(55):
  3560. #if defined(LT) || defined(RN)
  3561. andi. r0, KK, 3
  3562. #else
  3563. andi. r0, TEMP, 3
  3564. #endif
  3565. mtspr CTR, r0
  /* Zero leftover steps: go directly to the solve at LL(57). */
  3566. ble LL(57)
  3567. .align 4
  /* LL(56): leftover loop for the two-row tile, one step per iteration. */
  /* f16/f17 (from BO) times f20-f23 (from AO), accumulated into f0-f7. */
  3568. LL(56):
  3569. FMADD f0, f16, f20, f0
  3570. FMADD f1, f16, f21, f1
  3571. FMADD f2, f16, f22, f2
  3572. FMADD f3, f16, f23, f3
  3573. FMADD f4, f17, f20, f4
  3574. FMADD f5, f17, f21, f5
  3575. FMADD f6, f17, f22, f6
  3576. FMADD f7, f17, f23, f7
  /* Load the next step's operands, then advance AO by 4 * SIZE and BO by 2 * SIZE. */
  3577. LFD f20, 4 * SIZE(AO)
  3578. LFD f21, 5 * SIZE(AO)
  3579. LFD f22, 6 * SIZE(AO)
  3580. LFD f23, 7 * SIZE(AO)
  3581. LFD f16, 2 * SIZE(BO)
  3582. LFD f17, 3 * SIZE(BO)
  3583. addi BO, BO, 2 * SIZE
  3584. addi AO, AO, 4 * SIZE
  3585. bdnz LL(56)
  3586. .align 4
  /* LL(57): finish one two-row tile of the single-column pass: fold the 8 */
  /* partial sums into (f0,f1) and (f2,f3), subtract them from the stored */
  /* right-hand side, apply the variant's solve, store to the packed buffer */
  /* and to C, update pointers and KK, and loop to LL(51) while I > 0. */
  3587. LL(57):
  /* Combine cross products; reading the operands as (re, im) pairs - layout */
  /* presumed, TODO confirm - this is the complex product (CONJ flips signs). */
  3588. #ifndef CONJ
  3589. FSUB f0, f0, f5
  3590. FADD f1, f1, f4
  3591. FSUB f2, f2, f7
  3592. FADD f3, f3, f6
  3593. #else
  3594. FADD f0, f0, f5
  3595. FSUB f1, f4, f1
  3596. FADD f2, f2, f7
  3597. FSUB f3, f6, f3
  3598. #endif
  /* LN/RT: reposition the pointers. The index is KK - 2 for LN and KK - 1 */
  /* for RT; AO uses it shifted by (1 + ZBASE_SHIFT), BO by ZBASE_SHIFT. */
  3599. #if defined(LN) || defined(RT)
  3600. #ifdef LN
  3601. subi r0, KK, 2
  3602. #else
  3603. subi r0, KK, 1
  3604. #endif
  3605. slwi TEMP, r0, 1 + ZBASE_SHIFT
  3606. slwi r0, r0, 0 + ZBASE_SHIFT
  3607. add AO, AORIG, TEMP
  3608. add BO, B, r0
  3609. #endif
  /* (stored value - product): LN/LT read it from BO, RN/RT from AO. */
  3610. #if defined(LN) || defined(LT)
  3611. LFD f16, 0 * SIZE(BO)
  3612. LFD f17, 1 * SIZE(BO)
  3613. LFD f18, 2 * SIZE(BO)
  3614. LFD f19, 3 * SIZE(BO)
  3615. FSUB f0, f16, f0
  3616. FSUB f1, f17, f1
  3617. FSUB f2, f18, f2
  3618. FSUB f3, f19, f3
  3619. #else
  3620. LFD f16, 0 * SIZE(AO)
  3621. LFD f17, 1 * SIZE(AO)
  3622. LFD f18, 2 * SIZE(AO)
  3623. LFD f19, 3 * SIZE(AO)
  3624. #ifndef CONJ
  3625. FSUB f0, f16, f0
  3626. FSUB f1, f17, f1
  3627. FSUB f2, f18, f2
  3628. FSUB f3, f19, f3
  3629. #else
  3630. FSUB f0, f16, f0
  3631. FADD f1, f17, f1
  3632. FSUB f2, f18, f2
  3633. FADD f3, f19, f3
  3634. #endif
  3635. #endif
  /* LN: multiply (f2,f3) by AO[6..7]; remove AO[4..5] times that result from */
  /* (f0,f1); multiply (f0,f1) by AO[0..1]. */
  /* NOTE(review): diagonal pairs are multiplied, not divided - presumably */
  /* stored pre-inverted by the packing routine; confirm. */
  3636. #ifdef LN
  3637. LFD f16, 6 * SIZE(AO)
  3638. LFD f17, 7 * SIZE(AO)
  3639. LFD f18, 4 * SIZE(AO)
  3640. LFD f19, 5 * SIZE(AO)
  3641. LFD f20, 0 * SIZE(AO)
  3642. LFD f21, 1 * SIZE(AO)
  3643. FMUL f6, f17, f3
  3644. FMUL f7, f17, f2
  3645. #ifndef CONJ
  3646. FMSUB f2, f16, f2, f6
  3647. FMADD f3, f16, f3, f7
  3648. FMADD f0, f19, f3, f0
  3649. FNMSUB f1, f19, f2, f1
  3650. FNMSUB f0, f18, f2, f0
  3651. FNMSUB f1, f18, f3, f1
  3652. FMUL f4, f21, f1
  3653. FMUL f5, f21, f0
  3654. FMSUB f0, f20, f0, f4
  3655. FMADD f1, f20, f1, f5
  3656. #else
  3657. FMADD f2, f16, f2, f6
  3658. FMSUB f3, f16, f3, f7
  3659. FMSUB f0, f19, f3, f0
  3660. FNMADD f1, f19, f2, f1
  3661. FNMADD f0, f18, f2, f0
  3662. FNMADD f1, f18, f3, f1
  3663. FMUL f4, f21, f1
  3664. FMUL f5, f21, f0
  3665. FMADD f0, f20, f0, f4
  3666. FMSUB f1, f20, f1, f5
  3667. #endif
  3668. #endif
  /* LT: multiply (f0,f1) by AO[0..1]; remove AO[2..3] times that result from */
  /* (f2,f3); multiply (f2,f3) by AO[6..7]. */
  3669. #ifdef LT
  3670. LFD f16, 0 * SIZE(AO)
  3671. LFD f17, 1 * SIZE(AO)
  3672. LFD f18, 2 * SIZE(AO)
  3673. LFD f19, 3 * SIZE(AO)
  3674. LFD f20, 6 * SIZE(AO)
  3675. LFD f21, 7 * SIZE(AO)
  3676. FMUL f4, f17, f1
  3677. FMUL f5, f17, f0
  3678. #ifndef CONJ
  3679. FMSUB f0, f16, f0, f4
  3680. FMADD f1, f16, f1, f5
  3681. FMADD f2, f19, f1, f2
  3682. FNMSUB f3, f19, f0, f3
  3683. FNMSUB f2, f18, f0, f2
  3684. FNMSUB f3, f18, f1, f3
  3685. FMUL f4, f21, f3
  3686. FMUL f5, f21, f2
  3687. FMSUB f2, f20, f2, f4
  3688. FMADD f3, f20, f3, f5
  3689. #else
  3690. FMADD f0, f16, f0, f4
  3691. FMSUB f1, f16, f1, f5
  3692. FMSUB f2, f19, f1, f2
  3693. FNMADD f3, f19, f0, f3
  3694. FNMADD f2, f18, f0, f2
  3695. FNMADD f3, f18, f1, f3
  3696. FMUL f4, f21, f3
  3697. FMUL f5, f21, f2
  3698. FMADD f2, f20, f2, f4
  3699. FMSUB f3, f20, f3, f5
  3700. #endif
  3701. #endif
  /* RN and RT: with a single column there is one coefficient pair, BO[0..1]; */
  /* both (f0,f1) and (f2,f3) are multiplied by it. */
  3702. #ifdef RN
  3703. LFD f16, 0 * SIZE(BO)
  3704. LFD f17, 1 * SIZE(BO)
  3705. FMUL f4, f17, f1
  3706. FMUL f5, f17, f0
  3707. FMUL f6, f17, f3
  3708. FMUL f7, f17, f2
  3709. #ifndef CONJ
  3710. FMSUB f0, f16, f0, f4
  3711. FMADD f1, f16, f1, f5
  3712. FMSUB f2, f16, f2, f6
  3713. FMADD f3, f16, f3, f7
  3714. #else
  3715. FMADD f0, f16, f0, f4
  3716. FMSUB f1, f16, f1, f5
  3717. FMADD f2, f16, f2, f6
  3718. FMSUB f3, f16, f3, f7
  3719. #endif
  3720. #endif
  3721. #ifdef RT
  3722. LFD f20, 0 * SIZE(BO)
  3723. LFD f21, 1 * SIZE(BO)
  3724. FMUL f4, f21, f1
  3725. FMUL f5, f21, f0
  3726. FMUL f6, f21, f3
  3727. FMUL f7, f21, f2
  3728. #ifndef CONJ
  3729. FMSUB f0, f20, f0, f4
  3730. FMADD f1, f20, f1, f5
  3731. FMSUB f2, f20, f2, f6
  3732. FMADD f3, f20, f3, f7
  3733. #else
  3734. FMADD f0, f20, f0, f4
  3735. FMSUB f1, f20, f1, f5
  3736. FMADD f2, f20, f2, f6
  3737. FMSUB f3, f20, f3, f7
  3738. #endif
  3739. #endif
  /* LN steps CO1 back by 4 * SIZE before the store; other variants advance after. */
  3740. #ifdef LN
  3741. subi CO1, CO1, 4 * SIZE
  3742. #endif
  /* Write the results back where the right-hand side was read, then to C. */
  3743. #if defined(LN) || defined(LT)
  3744. STFD f0, 0 * SIZE(BO)
  3745. STFD f1, 1 * SIZE(BO)
  3746. STFD f2, 2 * SIZE(BO)
  3747. STFD f3, 3 * SIZE(BO)
  3748. #else
  3749. STFD f0, 0 * SIZE(AO)
  3750. STFD f1, 1 * SIZE(AO)
  3751. STFD f2, 2 * SIZE(AO)
  3752. STFD f3, 3 * SIZE(AO)
  3753. #endif
  3754. STFD f0, 0 * SIZE(CO1)
  3755. STFD f1, 1 * SIZE(CO1)
  3756. STFD f2, 2 * SIZE(CO1)
  3757. STFD f3, 3 * SIZE(CO1)
  3758. #ifndef LN
  3759. addi CO1, CO1, 4 * SIZE
  3760. #endif
  /* RT: AORIG += K << (1 + ZBASE_SHIFT). */
  3761. #ifdef RT
  3762. slwi r0, K, 1 + ZBASE_SHIFT
  3763. add AORIG, AORIG, r0
  3764. #endif
  /* LT/RN: with TEMP = K - KK, AO moves forward by TEMP << (1 + ZBASE_SHIFT) */
  /* and BO by TEMP << ZBASE_SHIFT. */
  3765. #if defined(LT) || defined(RN)
  3766. sub TEMP, K, KK
  3767. slwi r0, TEMP, 1 + ZBASE_SHIFT
  3768. slwi TEMP, TEMP, 0 + ZBASE_SHIFT
  3769. add AO, AO, r0
  3770. add BO, BO, TEMP
  3771. #endif
  /* KK moves by 2 per tile: up for LT, down for LN. */
  3772. #ifdef LT
  3773. addi KK, KK, 2
  3774. #endif
  3775. #ifdef LN
  3776. subi KK, KK, 2
  3777. #endif
  /* Decrement the tile counter I and repeat from LL(51) while it is positive. */
  3778. addic. I, I, -1
  3779. bgt LL(51)
  3780. .align 4
  /* LL(69): bookkeeping at the end of the single-column pass, before falling */
  /* through to the exit at LL(999). */
  3781. LL(69):
  /* LN: B += K << ZBASE_SHIFT. */
  3782. #ifdef LN
  3783. slwi r0, K, 0 + ZBASE_SHIFT
  3784. add B, B, r0
  3785. #endif
  /* LT/RN: B continues from wherever BO ended up. */
  3786. #if defined(LT) || defined(RN)
  3787. mr B, BO
  3788. #endif
  /* KK moves by 1 here for the right-side variants: up for RN, down for RT. */
  3789. #ifdef RN
  3790. addi KK, KK, 1
  3791. #endif
  3792. #ifdef RT
  3793. subi KK, KK, 1
  3794. #endif
  3795. .align 4
  /* LL(999): common exit. Sets the return value, reloads the floating-point */
  /* and general registers from the frame at SP (presumably saved there by the */
  /* prologue, which is outside this view), releases the frame and returns. */
  3796. LL(999):
  /* r3 = 0 (an RA field of 0 in addi means the literal zero, not r0). */
  3797. addi r3, 0, 0
  /* f14-f31: 18 doubles at SP + 0 .. SP + 136. */
  3798. lfd f14, 0(SP)
  3799. lfd f15, 8(SP)
  3800. lfd f16, 16(SP)
  3801. lfd f17, 24(SP)
  3802. lfd f18, 32(SP)
  3803. lfd f19, 40(SP)
  3804. lfd f20, 48(SP)
  3805. lfd f21, 56(SP)
  3806. lfd f22, 64(SP)
  3807. lfd f23, 72(SP)
  3808. lfd f24, 80(SP)
  3809. lfd f25, 88(SP)
  3810. lfd f26, 96(SP)
  3811. lfd f27, 104(SP)
  3812. lfd f28, 112(SP)
  3813. lfd f29, 120(SP)
  3814. lfd f30, 128(SP)
  3815. lfd f31, 136(SP)
  /* r31 down to r19: 8-byte slots from SP + 144 in 64-bit builds, 4-byte */
  /* slots from the same base otherwise. */
  3816. #ifdef __64BIT__
  3817. ld r31, 144(SP)
  3818. ld r30, 152(SP)
  3819. ld r29, 160(SP)
  3820. ld r28, 168(SP)
  3821. ld r27, 176(SP)
  3822. ld r26, 184(SP)
  3823. ld r25, 192(SP)
  3824. ld r24, 200(SP)
  3825. ld r23, 208(SP)
  3826. ld r22, 216(SP)
  3827. ld r21, 224(SP)
  3828. ld r20, 232(SP)
  3829. ld r19, 240(SP)
  3830. #else
  3831. lwz r31, 144(SP)
  3832. lwz r30, 148(SP)
  3833. lwz r29, 152(SP)
  3834. lwz r28, 156(SP)
  3835. lwz r27, 160(SP)
  3836. lwz r26, 164(SP)
  3837. lwz r25, 168(SP)
  3838. lwz r24, 172(SP)
  3839. lwz r23, 176(SP)
  3840. lwz r22, 180(SP)
  3841. lwz r21, 184(SP)
  3842. lwz r20, 188(SP)
  3843. lwz r19, 192(SP)
  3844. #endif
  /* Release the STACKSIZE-byte frame and return to the caller. */
  3845. addi SP, SP, STACKSIZE
  3846. blr
  3847. EPILOGUE
  3848. #endif