You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

ztrsm_kernel_power6_LT.S 84 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
7427842794280428142824283428442854286428742884289429042914292429342944295429642974298429943004301430243034304430543064307430843094310431143124313431443154316431743184319432043214322432343244325432643274328432943304331433243334334433543364337433843394340434143424343434443454346434743484349435043514352435343544355435643574358435943604361436243634364436543664367436843694370437143724373437443754376437743784379438043814382438343844385438643874388438943904391439243934394439543964397439843994400440144024403440444054406440744084409441044114412441344144415441644174418441944204421442244234424442544264427442844294430443144324433443444354436443744384439444044414442444344444445444644474448444944504451445244534454445544564457445844594460446144624463446444654466446744684469447044714472447344744475447644774478447944804481448244834484448544864487448844894490449144924493449444954496449744984499450045014502450345044505450645074508450945104511451245134514451545164517451845194520452145224523452445254526452745284529453045314532453345344535453645374538453945404541454245434544454545464547454845494550455145524553455445554556455745584559456045614562456345644565456645674568456945704571457245734574457545764577457845794580458145824583458445854586458745884589459045914592459345944595459645974598459946004601460246034604460546064607460846094610461146124613461446154616461746184619462046214622462346244625462646274628462946304631463246334634463546364637463846394640464146424643464446454646464746484649465046514652465346544655465646574658465946604661466246634664466546664667466846694670467146724673467446754676467746784679468046814682468346844685468646874688468946904691469246934694469546964697
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* LOAD: integer load mnemonic selected by build width -- lwz when */
  /* __64BIT__ is not defined, ld when it is. */
  40. #ifndef __64BIT__
  41. #define LOAD lwz
  42. #else
  43. #define LOAD ld
  44. #endif
  /* STACKSIZE: number of bytes the prologue subtracts from SP to make */
  /* the local frame. The prologue stores f14-f31 at offsets 0..136 and */
  /* r31..r19 from offset 144 upward (8-byte std slots ending at 240 in */
  /* the 64-bit build, 4-byte stw slots ending at 192 otherwise). */
  /* FZERO: a frame slot above that save area. The prologue writes a */
  /* 32-bit zero there (stw r0 after li r0, 0) and the kernel reloads it */
  /* with lfs to obtain a floating-point 0.0 for clearing accumulators. */
  45. #ifdef __64BIT__
  46. #define STACKSIZE 320
  47. #define FZERO 312(SP)
  48. #else
  49. #define STACKSIZE 256
  50. #define FZERO 240(SP)
  51. #endif
  /* Argument register map. */
  /* M, N and K are always taken from r3, r4 and r5. The registers used */
  /* for A, B, C, LDC and OFFSET depend on the platform macros below. */
  /* Some of these names are only destination registers: the prologue */
  /* loads them from FRAMESLOT(n) + STACKSIZE(SP), i.e. from the */
  /* caller's frame, rather than receiving them in the register itself. */
  /* NOTE(review): the shifted layouts presumably come from the */
  /* floating-point arguments that precede A in the C prototype taking */
  /* up argument slots -- confirm against the platform ABI. */
  /* If none of linux, __FreeBSD__, _AIX or __APPLE__ is defined, A, B, */
  /* C, LDC and OFFSET are left undefined by this block. */
  52. #define M r3
  53. #define N r4
  54. #define K r5
  /* Linux / FreeBSD. In the 64-bit case the prologue loads LDC and */
  /* OFFSET from FRAMESLOT(0) and FRAMESLOT(1); in the 32-bit case no */
  /* stack load is issued for these five names. */
  55. #if defined(linux) || defined(__FreeBSD__)
  56. #ifndef __64BIT__
  57. #define A r6
  58. #define B r7
  59. #define C r8
  60. #define LDC r9
  61. #define OFFSET r10
  62. #else
  63. #define A r8
  64. #define B r9
  65. #define C r10
  66. #define LDC r6
  67. #define OFFSET r7
  68. #endif
  69. #endif
  /* AIX / Apple. For 32-bit DOUBLE builds only A is used as received */
  /* (r10); the prologue loads B, C, LDC and OFFSET from FRAMESLOT(0) */
  /* through FRAMESLOT(3). In every other AIX/Apple build the prologue */
  /* loads LDC and OFFSET from FRAMESLOT(0) and FRAMESLOT(1). */
  70. #if defined(_AIX) || defined(__APPLE__)
  71. #if !defined(__64BIT__) && defined(DOUBLE)
  72. #define A r10
  73. #define B r6
  74. #define C r7
  75. #define LDC r8
  76. #define OFFSET r9
  77. #else
  78. #define A r8
  79. #define B r9
  80. #define C r10
  81. #define LDC r6
  82. #define OFFSET r7
  83. #endif
  84. #endif
  /* Working-register aliases. All of them are in r19-r31, which the */
  /* prologue saves on the stack before they are used. */
  /* AORIG: copy of A kept when LN or RT is defined; AO is derived from it. */
  85. #define AORIG r19
  /* TEMP: scratch for byte offsets and for K - KK. */
  86. #define TEMP r20
  /* KK: running offset, seeded from OFFSET in a variant-specific way */
  /* (LN: M + OFFSET, LT: OFFSET, RN: -OFFSET, RT: N - OFFSET). */
  87. #define KK r21
  /* I: set to M >> 1 (srawi. I, M, 1) before the LL(11) block. */
  88. #define I r22
  /* J: set to N >> 2 (srawi. J, N, 2) before the LL(10) block. */
  89. #define J r23
  /* AO / BO: current read pointers into A and B used by the inner loop. */
  90. #define AO r24
  91. #define BO r25
  /* CO1..CO4: four output pointers, C, C + LDC, C + 2*LDC, C + 3*LDC. */
  92. #define CO1 r26
  93. #define CO2 r27
  94. #define CO3 r28
  95. #define CO4 r29
  /* PREA: cache-touch distance for AO/BO (48 * SIZE bytes, used by dcbt/dcbtst). */
  96. #define PREA r30
  /* PREC: cache-touch distance for CO1..CO4 (4 * SIZE bytes, used by dcbtst). */
  97. #define PREC r31
  /* FMA1..FMA4: per-variant choice between FMADD and FNMSUB for the */
  /* four partial products of each A-pair x B-pair in the inner loop. */
  /* FMADD and FNMSUB are not defined in this file (presumably they */
  /* come from common.h -- confirm). */
  /* As used in LL(12), with a0/a1 the two values loaded per A element */
  /* and b0/b1 the two values loaded per B element: */
  /*   FMA1: a0 * b0 into the even accumulator */
  /*   FMA3: a1 * b1 into the even accumulator */
  /*   FMA2: a0 * b1 into the odd accumulator */
  /*   FMA4: a1 * b0 into the odd accumulator */
  /* NOTE(review): assuming interleaved (real, imag) storage, the even */
  /* accumulator is the real part and the odd one the imaginary part of */
  /* a complex product -- confirm against the packing routines. */
  /* Exactly one of the four is FNMSUB in each configuration: */
  /*   CONJ not defined          -> FMA3 */
  /*   CONJ with LN or LT        -> FMA4 */
  /*   CONJ otherwise            -> FMA2 */
  98. #ifndef CONJ
  99. #define FMA1 FMADD
  100. #define FMA2 FMADD
  101. #define FMA3 FNMSUB
  102. #define FMA4 FMADD
  103. #elif defined(LN) || defined(LT)
  104. #define FMA1 FMADD
  105. #define FMA2 FMADD
  106. #define FMA3 FMADD
  107. #define FMA4 FNMSUB
  108. #else
  109. #define FMA1 FMADD
  110. #define FMA2 FNMSUB
  111. #define FMA3 FMADD
  112. #define FMA4 FMADD
  113. #endif
  114. #ifndef NEEDPARAM
  115. PROLOGUE
  116. PROFCODE
  117. addi SP, SP, -STACKSIZE
  118. li r0, 0
  119. stfd f14, 0(SP)
  120. stfd f15, 8(SP)
  121. stfd f16, 16(SP)
  122. stfd f17, 24(SP)
  123. stfd f18, 32(SP)
  124. stfd f19, 40(SP)
  125. stfd f20, 48(SP)
  126. stfd f21, 56(SP)
  127. stfd f22, 64(SP)
  128. stfd f23, 72(SP)
  129. stfd f24, 80(SP)
  130. stfd f25, 88(SP)
  131. stfd f26, 96(SP)
  132. stfd f27, 104(SP)
  133. stfd f28, 112(SP)
  134. stfd f29, 120(SP)
  135. stfd f30, 128(SP)
  136. stfd f31, 136(SP)
  137. #ifdef __64BIT__
  138. std r31, 144(SP)
  139. std r30, 152(SP)
  140. std r29, 160(SP)
  141. std r28, 168(SP)
  142. std r27, 176(SP)
  143. std r26, 184(SP)
  144. std r25, 192(SP)
  145. std r24, 200(SP)
  146. std r23, 208(SP)
  147. std r22, 216(SP)
  148. std r21, 224(SP)
  149. std r20, 232(SP)
  150. std r19, 240(SP)
  151. #else
  152. stw r31, 144(SP)
  153. stw r30, 148(SP)
  154. stw r29, 152(SP)
  155. stw r28, 156(SP)
  156. stw r27, 160(SP)
  157. stw r26, 164(SP)
  158. stw r25, 168(SP)
  159. stw r24, 172(SP)
  160. stw r23, 176(SP)
  161. stw r22, 180(SP)
  162. stw r21, 184(SP)
  163. stw r20, 188(SP)
  164. stw r19, 192(SP)
  165. #endif
  166. stw r0, FZERO
  167. #if defined(linux) || defined(__FreeBSD__)
  168. #ifdef __64BIT__
  169. ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
  170. #endif
  171. #endif
  172. #if defined(_AIX) || defined(__APPLE__)
  173. #ifdef __64BIT__
  174. ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
  175. #else
  176. #ifdef DOUBLE
  177. lwz B, FRAMESLOT(0) + STACKSIZE(SP)
  178. lwz C, FRAMESLOT(1) + STACKSIZE(SP)
  179. lwz LDC, FRAMESLOT(2) + STACKSIZE(SP)
  180. #else
  181. lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
  182. #endif
  183. #endif
  184. #endif
  185. #if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
  186. ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
  187. #endif
  188. #if defined(_AIX) || defined(__APPLE__)
  189. #ifdef __64BIT__
  190. ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
  191. #else
  192. #ifdef DOUBLE
  193. lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP)
  194. #else
  195. lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
  196. #endif
  197. #endif
  198. #endif
  199. slwi LDC, LDC, ZBASE_SHIFT
  200. #ifdef LN
  201. mullw r0, M, K
  202. slwi r0, r0, ZBASE_SHIFT
  203. add A, A, r0
  204. slwi r0, M, ZBASE_SHIFT
  205. add C, C, r0
  206. #endif
  207. #ifdef RN
  208. neg KK, OFFSET
  209. #endif
  210. #ifdef RT
  211. mullw r0, N, K
  212. slwi r0, r0, ZBASE_SHIFT
  213. add B, B, r0
  214. mullw r0, N, LDC
  215. add C, C, r0
  216. sub KK, N, OFFSET
  217. #endif
  218. cmpwi cr0, M, 0
  219. ble LL(999)
  220. cmpwi cr0, N, 0
  221. ble LL(999)
  222. cmpwi cr0, K, 0
  223. ble LL(999)
  224. li PREA, 48 * SIZE
  225. li PREC, 4 * SIZE
  226. srawi. J, N, 2
  227. ble LL(30)
  228. .align 4
  229. LL(10):
  230. #ifdef RT
  231. slwi r0, K, 2 + ZBASE_SHIFT
  232. sub B, B, r0
  233. slwi r0, LDC, 2
  234. sub C, C, r0
  235. #endif
  236. mr CO1, C
  237. add CO2, C, LDC
  238. add CO3, CO2, LDC
  239. add CO4, CO3, LDC
  240. #ifdef LN
  241. add KK, M, OFFSET
  242. #endif
  243. #ifdef LT
  244. mr KK, OFFSET
  245. #endif
  246. lfs f0, FZERO
  247. fmr f1, f0
  248. fmr f2, f0
  249. fmr f3, f0
  250. fmr f4, f0
  251. fmr f5, f0
  252. fmr f6, f0
  253. fmr f7, f0
  254. fmr f8, f0
  255. fmr f9, f0
  256. fmr f10, f0
  257. fmr f11, f0
  258. fmr f12, f0
  259. fmr f13, f0
  260. fmr f14, f0
  261. fmr f15, f0
  262. srawi. I, M, 1
  263. #if defined(LN) || defined(RT)
  264. mr AORIG, A
  265. #else
  266. mr AO, A
  267. #endif
  268. #ifndef RT
  269. add C, CO4, LDC
  270. #endif
  271. ble LL(20)
  272. .align 4
  273. LL(11):
  274. #if defined(LT) || defined(RN)
  275. LFD f16, 0 * SIZE(AO)
  276. LFD f20, 0 * SIZE(B)
  277. LFD f17, 1 * SIZE(AO)
  278. LFD f21, 1 * SIZE(B)
  279. LFD f18, 2 * SIZE(AO)
  280. LFD f22, 2 * SIZE(B)
  281. LFD f19, 3 * SIZE(AO)
  282. LFD f23, 3 * SIZE(B)
  283. LFD f24, 4 * SIZE(B)
  284. LFD f25, 5 * SIZE(B)
  285. LFD f26, 6 * SIZE(B)
  286. LFD f27, 7 * SIZE(B)
  287. dcbtst CO1, PREC
  288. dcbtst CO2, PREC
  289. dcbtst CO3, PREC
  290. dcbtst CO4, PREC
  291. srawi. r0, KK, 3
  292. mtspr CTR, r0
  293. mr BO, B
  294. #else
  295. #ifdef LN
  296. slwi r0, K, 1 + ZBASE_SHIFT
  297. sub AORIG, AORIG, r0
  298. #endif
  299. slwi r0, KK, 1 + ZBASE_SHIFT
  300. slwi TEMP, KK, 2 + ZBASE_SHIFT
  301. add AO, AORIG, r0
  302. add BO, B, TEMP
  303. sub TEMP, K, KK
  304. LFD f16, 0 * SIZE(AO)
  305. LFD f20, 0 * SIZE(BO)
  306. LFD f17, 1 * SIZE(AO)
  307. LFD f21, 1 * SIZE(BO)
  308. LFD f18, 2 * SIZE(AO)
  309. LFD f22, 2 * SIZE(BO)
  310. LFD f19, 3 * SIZE(AO)
  311. LFD f23, 3 * SIZE(BO)
  312. LFD f24, 4 * SIZE(BO)
  313. LFD f25, 5 * SIZE(BO)
  314. LFD f26, 6 * SIZE(BO)
  315. LFD f27, 7 * SIZE(BO)
  316. dcbtst CO1, PREC
  317. dcbtst CO2, PREC
  318. dcbtst CO3, PREC
  319. dcbtst CO4, PREC
  320. srawi. r0, TEMP, 3
  321. mtspr CTR, r0
  322. #endif
  323. ble LL(15)
  324. .align 4
  325. LL(12):
  326. dcbt AO, PREA
  327. dcbtst BO, PREA
  328. FMA1 f0, f16, f20, f0
  329. FMA1 f2, f18, f20, f2
  330. FMA2 f1, f16, f21, f1
  331. FMA2 f3, f18, f21, f3
  332. LFD f28, 4 * SIZE(AO)
  333. LFD f29, 5 * SIZE(AO)
  334. LFD f30, 6 * SIZE(AO)
  335. LFD f31, 7 * SIZE(AO)
  336. FMA1 f4, f16, f22, f4
  337. FMA1 f6, f18, f22, f6
  338. FMA2 f5, f16, f23, f5
  339. FMA2 f7, f18, f23, f7
  340. FMA1 f8, f16, f24, f8
  341. FMA1 f10, f18, f24, f10
  342. FMA2 f9, f16, f25, f9
  343. FMA2 f11, f18, f25, f11
  344. FMA1 f12, f16, f26, f12
  345. FMA1 f14, f18, f26, f14
  346. FMA2 f13, f16, f27, f13
  347. FMA2 f15, f18, f27, f15
  348. FMA4 f1, f17, f20, f1
  349. FMA4 f3, f19, f20, f3
  350. FMA3 f0, f17, f21, f0
  351. FMA3 f2, f19, f21, f2
  352. FMA4 f5, f17, f22, f5
  353. FMA4 f7, f19, f22, f7
  354. FMA3 f4, f17, f23, f4
  355. FMA3 f6, f19, f23, f6
  356. LFD f20, 8 * SIZE(BO)
  357. LFD f21, 9 * SIZE(BO)
  358. LFD f22, 10 * SIZE(BO)
  359. LFD f23, 11 * SIZE(BO)
  360. FMA4 f9, f17, f24, f9
  361. FMA4 f11, f19, f24, f11
  362. FMA3 f8, f17, f25, f8
  363. FMA3 f10, f19, f25, f10
  364. FMA4 f13, f17, f26, f13
  365. FMA4 f15, f19, f26, f15
  366. FMA3 f12, f17, f27, f12
  367. FMA3 f14, f19, f27, f14
  368. LFD f24, 12 * SIZE(BO)
  369. LFD f25, 13 * SIZE(BO)
  370. LFD f26, 14 * SIZE(BO)
  371. LFD f27, 15 * SIZE(BO)
  372. FMA1 f0, f28, f20, f0
  373. FMA1 f2, f30, f20, f2
  374. FMA2 f1, f28, f21, f1
  375. FMA2 f3, f30, f21, f3
  376. LFD f16, 8 * SIZE(AO)
  377. LFD f17, 9 * SIZE(AO)
  378. LFD f18, 10 * SIZE(AO)
  379. LFD f19, 11 * SIZE(AO)
  380. FMA1 f4, f28, f22, f4
  381. FMA1 f6, f30, f22, f6
  382. FMA2 f5, f28, f23, f5
  383. FMA2 f7, f30, f23, f7
  384. FMA1 f8, f28, f24, f8
  385. FMA1 f10, f30, f24, f10
  386. FMA2 f9, f28, f25, f9
  387. FMA2 f11, f30, f25, f11
  388. FMA1 f12, f28, f26, f12
  389. FMA1 f14, f30, f26, f14
  390. FMA2 f13, f28, f27, f13
  391. FMA2 f15, f30, f27, f15
  392. FMA4 f1, f29, f20, f1
  393. FMA4 f3, f31, f20, f3
  394. FMA3 f0, f29, f21, f0
  395. FMA3 f2, f31, f21, f2
  396. FMA4 f5, f29, f22, f5
  397. FMA4 f7, f31, f22, f7
  398. FMA3 f4, f29, f23, f4
  399. FMA3 f6, f31, f23, f6
  400. LFD f20, 16 * SIZE(BO)
  401. LFD f21, 17 * SIZE(BO)
  402. LFD f22, 18 * SIZE(BO)
  403. LFD f23, 19 * SIZE(BO)
  404. FMA4 f9, f29, f24, f9
  405. FMA4 f11, f31, f24, f11
  406. FMA3 f8, f29, f25, f8
  407. FMA3 f10, f31, f25, f10
  408. FMA4 f13, f29, f26, f13
  409. FMA4 f15, f31, f26, f15
  410. FMA3 f12, f29, f27, f12
  411. FMA3 f14, f31, f27, f14
  412. LFD f24, 20 * SIZE(BO)
  413. LFD f25, 21 * SIZE(BO)
  414. LFD f26, 22 * SIZE(BO)
  415. LFD f27, 23 * SIZE(BO)
  416. FMA1 f0, f16, f20, f0
  417. FMA1 f2, f18, f20, f2
  418. FMA2 f1, f16, f21, f1
  419. FMA2 f3, f18, f21, f3
  420. LFD f28, 12 * SIZE(AO)
  421. LFD f29, 13 * SIZE(AO)
  422. LFD f30, 14 * SIZE(AO)
  423. LFD f31, 15 * SIZE(AO)
  424. FMA1 f4, f16, f22, f4
  425. FMA1 f6, f18, f22, f6
  426. FMA2 f5, f16, f23, f5
  427. FMA2 f7, f18, f23, f7
  428. FMA1 f8, f16, f24, f8
  429. FMA1 f10, f18, f24, f10
  430. FMA2 f9, f16, f25, f9
  431. FMA2 f11, f18, f25, f11
  432. FMA1 f12, f16, f26, f12
  433. FMA1 f14, f18, f26, f14
  434. FMA2 f13, f16, f27, f13
  435. FMA2 f15, f18, f27, f15
  436. FMA4 f1, f17, f20, f1
  437. FMA4 f3, f19, f20, f3
  438. FMA3 f0, f17, f21, f0
  439. FMA3 f2, f19, f21, f2
  440. FMA4 f5, f17, f22, f5
  441. FMA4 f7, f19, f22, f7
  442. FMA3 f4, f17, f23, f4
  443. FMA3 f6, f19, f23, f6
  444. LFD f20, 24 * SIZE(BO)
  445. LFD f21, 25 * SIZE(BO)
  446. LFD f22, 26 * SIZE(BO)
  447. LFD f23, 27 * SIZE(BO)
  448. FMA4 f9, f17, f24, f9
  449. FMA4 f11, f19, f24, f11
  450. FMA3 f8, f17, f25, f8
  451. FMA3 f10, f19, f25, f10
  452. FMA4 f13, f17, f26, f13
  453. FMA4 f15, f19, f26, f15
  454. FMA3 f12, f17, f27, f12
  455. FMA3 f14, f19, f27, f14
  456. LFD f24, 28 * SIZE(BO)
  457. LFD f25, 29 * SIZE(BO)
  458. LFD f26, 30 * SIZE(BO)
  459. LFD f27, 31 * SIZE(BO)
  460. FMA1 f0, f28, f20, f0
  461. FMA1 f2, f30, f20, f2
  462. FMA2 f1, f28, f21, f1
  463. FMA2 f3, f30, f21, f3
  464. LFD f16, 16 * SIZE(AO)
  465. LFD f17, 17 * SIZE(AO)
  466. LFD f18, 18 * SIZE(AO)
  467. LFD f19, 19 * SIZE(AO)
  468. FMA1 f4, f28, f22, f4
  469. FMA1 f6, f30, f22, f6
  470. FMA2 f5, f28, f23, f5
  471. FMA2 f7, f30, f23, f7
  472. FMA1 f8, f28, f24, f8
  473. FMA1 f10, f30, f24, f10
  474. FMA2 f9, f28, f25, f9
  475. FMA2 f11, f30, f25, f11
  476. FMA1 f12, f28, f26, f12
  477. FMA1 f14, f30, f26, f14
  478. FMA2 f13, f28, f27, f13
  479. FMA2 f15, f30, f27, f15
  480. FMA4 f1, f29, f20, f1
  481. FMA4 f3, f31, f20, f3
  482. FMA3 f0, f29, f21, f0
  483. FMA3 f2, f31, f21, f2
  484. FMA4 f5, f29, f22, f5
  485. FMA4 f7, f31, f22, f7
  486. FMA3 f4, f29, f23, f4
  487. FMA3 f6, f31, f23, f6
  488. LFD f20, 32 * SIZE(BO)
  489. LFD f21, 33 * SIZE(BO)
  490. LFD f22, 34 * SIZE(BO)
  491. LFD f23, 35 * SIZE(BO)
  492. FMA4 f9, f29, f24, f9
  493. FMA4 f11, f31, f24, f11
  494. FMA3 f8, f29, f25, f8
  495. FMA3 f10, f31, f25, f10
  496. FMA4 f13, f29, f26, f13
  497. FMA4 f15, f31, f26, f15
  498. FMA3 f12, f29, f27, f12
  499. FMA3 f14, f31, f27, f14
  500. LFD f24, 36 * SIZE(BO)
  501. LFD f25, 37 * SIZE(BO)
  502. LFD f26, 38 * SIZE(BO)
  503. LFD f27, 39 * SIZE(BO)
  504. FMA1 f0, f16, f20, f0
  505. FMA1 f2, f18, f20, f2
  506. FMA2 f1, f16, f21, f1
  507. FMA2 f3, f18, f21, f3
  508. LFD f28, 20 * SIZE(AO)
  509. LFD f29, 21 * SIZE(AO)
  510. LFD f30, 22 * SIZE(AO)
  511. LFD f31, 23 * SIZE(AO)
  512. FMA1 f4, f16, f22, f4
  513. FMA1 f6, f18, f22, f6
  514. FMA2 f5, f16, f23, f5
  515. FMA2 f7, f18, f23, f7
  516. FMA1 f8, f16, f24, f8
  517. FMA1 f10, f18, f24, f10
  518. FMA2 f9, f16, f25, f9
  519. FMA2 f11, f18, f25, f11
  520. FMA1 f12, f16, f26, f12
  521. FMA1 f14, f18, f26, f14
  522. FMA2 f13, f16, f27, f13
  523. FMA2 f15, f18, f27, f15
  524. FMA4 f1, f17, f20, f1
  525. FMA4 f3, f19, f20, f3
  526. FMA3 f0, f17, f21, f0
  527. FMA3 f2, f19, f21, f2
  528. FMA4 f5, f17, f22, f5
  529. FMA4 f7, f19, f22, f7
  530. FMA3 f4, f17, f23, f4
  531. FMA3 f6, f19, f23, f6
  532. LFD f20, 40 * SIZE(BO)
  533. LFD f21, 41 * SIZE(BO)
  534. LFD f22, 42 * SIZE(BO)
  535. LFD f23, 43 * SIZE(BO)
  536. FMA4 f9, f17, f24, f9
  537. FMA4 f11, f19, f24, f11
  538. FMA3 f8, f17, f25, f8
  539. FMA3 f10, f19, f25, f10
  540. FMA4 f13, f17, f26, f13
  541. FMA4 f15, f19, f26, f15
  542. FMA3 f12, f17, f27, f12
  543. FMA3 f14, f19, f27, f14
  544. LFD f24, 44 * SIZE(BO)
  545. LFD f25, 45 * SIZE(BO)
  546. LFD f26, 46 * SIZE(BO)
  547. LFD f27, 47 * SIZE(BO)
  548. FMA1 f0, f28, f20, f0
  549. FMA1 f2, f30, f20, f2
  550. FMA2 f1, f28, f21, f1
  551. FMA2 f3, f30, f21, f3
  552. LFD f16, 24 * SIZE(AO)
  553. LFD f17, 25 * SIZE(AO)
  554. LFD f18, 26 * SIZE(AO)
  555. LFD f19, 27 * SIZE(AO)
  556. FMA1 f4, f28, f22, f4
  557. FMA1 f6, f30, f22, f6
  558. FMA2 f5, f28, f23, f5
  559. FMA2 f7, f30, f23, f7
  560. FMA1 f8, f28, f24, f8
  561. FMA1 f10, f30, f24, f10
  562. FMA2 f9, f28, f25, f9
  563. FMA2 f11, f30, f25, f11
  564. FMA1 f12, f28, f26, f12
  565. FMA1 f14, f30, f26, f14
  566. FMA2 f13, f28, f27, f13
  567. FMA2 f15, f30, f27, f15
  568. FMA4 f1, f29, f20, f1
  569. FMA4 f3, f31, f20, f3
  570. FMA3 f0, f29, f21, f0
  571. FMA3 f2, f31, f21, f2
  572. FMA4 f5, f29, f22, f5
  573. FMA4 f7, f31, f22, f7
  574. FMA3 f4, f29, f23, f4
  575. FMA3 f6, f31, f23, f6
  576. LFD f20, 48 * SIZE(BO)
  577. LFD f21, 49 * SIZE(BO)
  578. LFD f22, 50 * SIZE(BO)
  579. LFD f23, 51 * SIZE(BO)
  580. FMA4 f9, f29, f24, f9
  581. FMA4 f11, f31, f24, f11
  582. FMA3 f8, f29, f25, f8
  583. FMA3 f10, f31, f25, f10
  584. FMA4 f13, f29, f26, f13
  585. FMA4 f15, f31, f26, f15
  586. FMA3 f12, f29, f27, f12
  587. FMA3 f14, f31, f27, f14
  588. LFD f24, 52 * SIZE(BO)
  589. LFD f25, 53 * SIZE(BO)
  590. LFD f26, 54 * SIZE(BO)
  591. LFD f27, 55 * SIZE(BO)
  592. FMA1 f0, f16, f20, f0
  593. FMA1 f2, f18, f20, f2
  594. FMA2 f1, f16, f21, f1
  595. FMA2 f3, f18, f21, f3
  596. LFD f28, 28 * SIZE(AO)
  597. LFD f29, 29 * SIZE(AO)
  598. LFD f30, 30 * SIZE(AO)
  599. LFD f31, 31 * SIZE(AO)
  600. FMA1 f4, f16, f22, f4
  601. FMA1 f6, f18, f22, f6
  602. FMA2 f5, f16, f23, f5
  603. FMA2 f7, f18, f23, f7
  604. FMA1 f8, f16, f24, f8
  605. FMA1 f10, f18, f24, f10
  606. FMA2 f9, f16, f25, f9
  607. FMA2 f11, f18, f25, f11
  608. FMA1 f12, f16, f26, f12
  609. FMA1 f14, f18, f26, f14
  610. FMA2 f13, f16, f27, f13
  611. FMA2 f15, f18, f27, f15
  612. FMA4 f1, f17, f20, f1
  613. FMA4 f3, f19, f20, f3
  614. FMA3 f0, f17, f21, f0
  615. FMA3 f2, f19, f21, f2
  616. FMA4 f5, f17, f22, f5
  617. FMA4 f7, f19, f22, f7
  618. FMA3 f4, f17, f23, f4
  619. FMA3 f6, f19, f23, f6
  620. LFD f20, 56 * SIZE(BO)
  621. LFD f21, 57 * SIZE(BO)
  622. LFD f22, 58 * SIZE(BO)
  623. LFD f23, 59 * SIZE(BO)
  624. FMA4 f9, f17, f24, f9
  625. FMA4 f11, f19, f24, f11
  626. FMA3 f8, f17, f25, f8
  627. FMA3 f10, f19, f25, f10
  628. FMA4 f13, f17, f26, f13
  629. FMA4 f15, f19, f26, f15
  630. FMA3 f12, f17, f27, f12
  631. FMA3 f14, f19, f27, f14
  632. LFD f24, 60 * SIZE(BO)
  633. LFD f25, 61 * SIZE(BO)
  634. LFD f26, 62 * SIZE(BO)
  635. LFD f27, 63 * SIZE(BO)
  636. FMA1 f0, f28, f20, f0
  637. FMA1 f2, f30, f20, f2
  638. FMA2 f1, f28, f21, f1
  639. FMA2 f3, f30, f21, f3
  640. LFD f16, 32 * SIZE(AO)
  641. LFD f17, 33 * SIZE(AO)
  642. LFD f18, 34 * SIZE(AO)
  643. LFD f19, 35 * SIZE(AO)
  644. FMA1 f4, f28, f22, f4
  645. FMA1 f6, f30, f22, f6
  646. FMA2 f5, f28, f23, f5
  647. FMA2 f7, f30, f23, f7
  648. FMA1 f8, f28, f24, f8
  649. FMA1 f10, f30, f24, f10
  650. FMA2 f9, f28, f25, f9
  651. FMA2 f11, f30, f25, f11
  652. FMA1 f12, f28, f26, f12
  653. FMA1 f14, f30, f26, f14
  654. FMA2 f13, f28, f27, f13
  655. FMA2 f15, f30, f27, f15
  656. FMA4 f1, f29, f20, f1
  657. FMA4 f3, f31, f20, f3
  658. FMA3 f0, f29, f21, f0
  659. FMA3 f2, f31, f21, f2
  660. FMA4 f5, f29, f22, f5
  661. FMA4 f7, f31, f22, f7
  662. FMA3 f4, f29, f23, f4
  663. FMA3 f6, f31, f23, f6
  664. LFD f20, 64 * SIZE(BO)
  665. LFD f21, 65 * SIZE(BO)
  666. LFD f22, 66 * SIZE(BO)
  667. LFD f23, 67 * SIZE(BO)
  668. FMA4 f9, f29, f24, f9
  669. FMA4 f11, f31, f24, f11
  670. FMA3 f8, f29, f25, f8
  671. FMA3 f10, f31, f25, f10
  672. FMA4 f13, f29, f26, f13
  673. FMA4 f15, f31, f26, f15
  674. FMA3 f12, f29, f27, f12
  675. FMA3 f14, f31, f27, f14
  676. LFD f24, 68 * SIZE(BO)
  677. LFD f25, 69 * SIZE(BO)
  678. LFD f26, 70 * SIZE(BO)
  679. LFD f27, 71 * SIZE(BO)
  680. addi AO, AO, 32 * SIZE
  681. addi BO, BO, 64 * SIZE
  682. bdnz LL(12)
  683. .align 4
  /* LL(15): set up the remainder loop.  The low three bits of the step
     count (KK for LT/RN, TEMP otherwise) are the steps left over after the
     unrolled loop above, which advances AO by 32*SIZE and BO by 64*SIZE per
     pass -- eight times what one LL(16) step advances them. */
  684. LL(15):
  685. #if defined(LT) || defined(RN)
  686. andi. r0, KK, 7
  687. #else
  688. andi. r0, TEMP, 7
  689. #endif
  /* CTR = number of leftover steps.  The ble still tests CR0 as set by the
     record-form andi. above: skip LL(16) when nothing is left. */
  690. mtspr CTR, r0
  691. ble LL(18)
  692. .align 4
  /* LL(16): remainder loop, one step per iteration, repeated CTR times.
     Accumulates into f0-f15 using the A operands held in f16-f19 and the
     B operands held in f20-f27.  FMA1..FMA4 are macros defined outside this
     excerpt; presumably they select the add/subtract forms needed for
     complex (and conjugated) products -- confirm against their definitions. */
  693. LL(16):
  /* Products with f16 / f18 (A words 0 and 2). */
  694. FMA1 f0, f16, f20, f0
  695. FMA1 f2, f18, f20, f2
  696. FMA2 f1, f16, f21, f1
  697. FMA2 f3, f18, f21, f3
  698. FMA1 f4, f16, f22, f4
  699. FMA1 f6, f18, f22, f6
  700. FMA2 f5, f16, f23, f5
  701. FMA2 f7, f18, f23, f7
  702. FMA1 f8, f16, f24, f8
  703. FMA1 f10, f18, f24, f10
  704. FMA2 f9, f16, f25, f9
  705. FMA2 f11, f18, f25, f11
  706. FMA1 f12, f16, f26, f12
  707. FMA1 f14, f18, f26, f14
  708. FMA2 f13, f16, f27, f13
  709. FMA2 f15, f18, f27, f15
  /* Products with f17 / f19 (A words 1 and 3). */
  710. FMA4 f1, f17, f20, f1
  711. FMA4 f3, f19, f20, f3
  712. FMA3 f0, f17, f21, f0
  713. FMA3 f2, f19, f21, f2
  714. FMA4 f5, f17, f22, f5
  715. FMA4 f7, f19, f22, f7
  716. FMA3 f4, f17, f23, f4
  717. FMA3 f6, f19, f23, f6
  718. FMA4 f9, f17, f24, f9
  719. FMA4 f11, f19, f24, f11
  720. FMA3 f8, f17, f25, f8
  721. FMA3 f10, f19, f25, f10
  722. FMA4 f13, f17, f26, f13
  723. FMA4 f15, f19, f26, f15
  724. FMA3 f12, f17, f27, f12
  725. FMA3 f14, f19, f27, f14
  /* Fetch the operands for the next step (4 words of A, 8 words of B). */
  726. LFD f16, 4 * SIZE(AO)
  727. LFD f17, 5 * SIZE(AO)
  728. LFD f18, 6 * SIZE(AO)
  729. LFD f19, 7 * SIZE(AO)
  730. LFD f20, 8 * SIZE(BO)
  731. LFD f21, 9 * SIZE(BO)
  732. LFD f22, 10 * SIZE(BO)
  733. LFD f23, 11 * SIZE(BO)
  734. LFD f24, 12 * SIZE(BO)
  735. LFD f25, 13 * SIZE(BO)
  736. LFD f26, 14 * SIZE(BO)
  737. LFD f27, 15 * SIZE(BO)
  /* Advance both panels by one step and loop while CTR is non-zero. */
  738. addi AO, AO, 4 * SIZE
  739. addi BO, BO, 8 * SIZE
  740. bdnz LL(16)
  741. .align 4
  /* LL(18): accumulation for this tile is finished.  For LN/RT, reposition
     the panel pointers before the solve:
       r0 = KK - 2 (LN)  or  KK - 4 (RT)
       AO = AORIG + (r0 << (1 + ZBASE_SHIFT))
       BO = B     + (r0 << (2 + ZBASE_SHIFT))
     LT/RN leave AO and BO where the loops above left them. */
  742. LL(18):
  743. #if defined(LN) || defined(RT)
  744. #ifdef LN
  745. subi r0, KK, 2
  746. #else
  747. subi r0, KK, 4
  748. #endif
  749. slwi TEMP, r0, 1 + ZBASE_SHIFT
  750. slwi r0, r0, 2 + ZBASE_SHIFT
  751. add AO, AORIG, TEMP
  752. add BO, B, r0
  753. #endif
  /* Replace every accumulator with (stored value - accumulator).
     LN/LT read the 16 stored words from BO; RN/RT read them from AO.
     The register order differs between the two paths:
       BO path: words 0-7  -> f0,f1,f4,f5,f8,f9,f12,f13
                words 8-15 -> f2,f3,f6,f7,f10,f11,f14,f15
       AO path: words 0-15 -> f0..f15 in order.
     FSUB is a macro defined outside this excerpt; presumably
     "FSUB d, a, b" computes d = a - b -- confirm. */
  754. #if defined(LN) || defined(LT)
  755. LFD f16, 0 * SIZE(BO)
  756. LFD f17, 1 * SIZE(BO)
  757. LFD f18, 2 * SIZE(BO)
  758. LFD f19, 3 * SIZE(BO)
  759. FSUB f0, f16, f0
  760. FSUB f1, f17, f1
  761. FSUB f4, f18, f4
  762. FSUB f5, f19, f5
  763. LFD f20, 4 * SIZE(BO)
  764. LFD f21, 5 * SIZE(BO)
  765. LFD f22, 6 * SIZE(BO)
  766. LFD f23, 7 * SIZE(BO)
  767. FSUB f8, f20, f8
  768. FSUB f9, f21, f9
  769. FSUB f12, f22, f12
  770. FSUB f13, f23, f13
  771. LFD f24, 8 * SIZE(BO)
  772. LFD f25, 9 * SIZE(BO)
  773. LFD f26, 10 * SIZE(BO)
  774. LFD f27, 11 * SIZE(BO)
  775. FSUB f2, f24, f2
  776. FSUB f3, f25, f3
  777. FSUB f6, f26, f6
  778. FSUB f7, f27, f7
  779. LFD f28, 12 * SIZE(BO)
  780. LFD f29, 13 * SIZE(BO)
  781. LFD f30, 14 * SIZE(BO)
  782. LFD f31, 15 * SIZE(BO)
  783. FSUB f10, f28, f10
  784. FSUB f11, f29, f11
  785. FSUB f14, f30, f14
  786. FSUB f15, f31, f15
  787. #else
  788. LFD f16, 0 * SIZE(AO)
  789. LFD f17, 1 * SIZE(AO)
  790. LFD f18, 2 * SIZE(AO)
  791. LFD f19, 3 * SIZE(AO)
  792. FSUB f0, f16, f0
  793. FSUB f1, f17, f1
  794. FSUB f2, f18, f2
  795. FSUB f3, f19, f3
  796. LFD f20, 4 * SIZE(AO)
  797. LFD f21, 5 * SIZE(AO)
  798. LFD f22, 6 * SIZE(AO)
  799. LFD f23, 7 * SIZE(AO)
  800. FSUB f4, f20, f4
  801. FSUB f5, f21, f5
  802. FSUB f6, f22, f6
  803. FSUB f7, f23, f7
  804. LFD f24, 8 * SIZE(AO)
  805. LFD f25, 9 * SIZE(AO)
  806. LFD f26, 10 * SIZE(AO)
  807. LFD f27, 11 * SIZE(AO)
  808. FSUB f8, f24, f8
  809. FSUB f9, f25, f9
  810. FSUB f10, f26, f10
  811. FSUB f11, f27, f11
  812. LFD f28, 12 * SIZE(AO)
  813. LFD f29, 13 * SIZE(AO)
  814. LFD f30, 14 * SIZE(AO)
  815. LFD f31, 15 * SIZE(AO)
  816. FSUB f12, f28, f12
  817. FSUB f13, f29, f13
  818. FSUB f14, f30, f14
  819. FSUB f15, f31, f15
  820. #endif
  /* LN solve.  Adjacent register pairs (f0,f1), (f2,f3), ... are combined
     with the complex-product formula, i.e. handled as (real, imag).
       (f24,f25) = AO words 6,7    (f26,f27) = AO words 4,5
       (f28,f29) = AO words 0,1
     Step 1: (f2,f3) (f6,f7) (f10,f11) (f14,f15)  *= (f24,f25)
     Step 2: (f0,f1) (f4,f5) (f8,f9)  (f12,f13)   -= (f26,f27) * step-1 results
     Step 3: (f0,f1) (f4,f5) (f8,f9)  (f12,f13)   *= (f28,f29)
     The CONJ branch performs the same steps with the multipliers conjugated.
     NOTE(review): the diagonal words are multiplied in, not divided by, so
     they are presumably stored pre-inverted -- confirm against the packing
     routine.  FMUL/FMADD/FMSUB/FNMADD/FNMSUB are macros defined outside this
     excerpt, presumably the like-named PowerPC instructions. */
  821. #ifdef LN
  822. LFD f24, 6 * SIZE(AO)
  823. LFD f25, 7 * SIZE(AO)
  824. LFD f26, 4 * SIZE(AO)
  825. LFD f27, 5 * SIZE(AO)
  826. LFD f28, 0 * SIZE(AO)
  827. LFD f29, 1 * SIZE(AO)
  /* Cross products with f25 are formed first, so both halves of each pair
     are updated from the pre-update values. */
  828. FMUL f16, f25, f3
  829. FMUL f17, f25, f2
  830. FMUL f18, f25, f7
  831. FMUL f19, f25, f6
  832. FMUL f20, f25, f11
  833. FMUL f21, f25, f10
  834. FMUL f22, f25, f15
  835. FMUL f23, f25, f14
  836. #ifndef CONJ
  /* Step 1: finish the multiply by (f24,f25). */
  837. FMSUB f2, f24, f2, f16
  838. FMADD f3, f24, f3, f17
  839. FMSUB f6, f24, f6, f18
  840. FMADD f7, f24, f7, f19
  841. FMSUB f10, f24, f10, f20
  842. FMADD f11, f24, f11, f21
  843. FMSUB f14, f24, f14, f22
  844. FMADD f15, f24, f15, f23
  /* Step 2: subtract (f26,f27) times the step-1 results. */
  845. FMADD f0, f27, f3, f0
  846. FNMSUB f1, f27, f2, f1
  847. FMADD f4, f27, f7, f4
  848. FNMSUB f5, f27, f6, f5
  849. FMADD f8, f27, f11, f8
  850. FNMSUB f9, f27, f10, f9
  851. FMADD f12, f27, f15, f12
  852. FNMSUB f13, f27, f14, f13
  853. FNMSUB f0, f26, f2, f0
  854. FNMSUB f1, f26, f3, f1
  855. FNMSUB f4, f26, f6, f4
  856. FNMSUB f5, f26, f7, f5
  857. FNMSUB f8, f26, f10, f8
  858. FNMSUB f9, f26, f11, f9
  859. FNMSUB f12, f26, f14, f12
  860. FNMSUB f13, f26, f15, f13
  /* Step 3: multiply by (f28,f29). */
  861. FMUL f16, f29, f1
  862. FMUL f17, f29, f0
  863. FMUL f18, f29, f5
  864. FMUL f19, f29, f4
  865. FMUL f20, f29, f9
  866. FMUL f21, f29, f8
  867. FMUL f22, f29, f13
  868. FMUL f23, f29, f12
  869. FMSUB f0, f28, f0, f16
  870. FMADD f1, f28, f1, f17
  871. FMSUB f4, f28, f4, f18
  872. FMADD f5, f28, f5, f19
  873. FMSUB f8, f28, f8, f20
  874. FMADD f9, f28, f9, f21
  875. FMSUB f12, f28, f12, f22
  876. FMADD f13, f28, f13, f23
  877. #else
  /* Step 1 (conjugated multiplier). */
  878. FMADD f2, f24, f2, f16
  879. FMSUB f3, f24, f3, f17
  880. FMADD f6, f24, f6, f18
  881. FMSUB f7, f24, f7, f19
  882. FMADD f10, f24, f10, f20
  883. FMSUB f11, f24, f11, f21
  884. FMADD f14, f24, f14, f22
  885. FMSUB f15, f24, f15, f23
  /* Step 2 (conjugated multiplier).  The first instruction of each pair
     leaves the target negated; the FNMADD that follows negates it back, so
     the net effect is a plain subtraction of the product. */
  886. FMSUB f0, f27, f3, f0
  887. FNMADD f1, f27, f2, f1
  888. FMSUB f4, f27, f7, f4
  889. FNMADD f5, f27, f6, f5
  890. FMSUB f8, f27, f11, f8
  891. FNMADD f9, f27, f10, f9
  892. FMSUB f12, f27, f15, f12
  893. FNMADD f13, f27, f14, f13
  894. FNMADD f0, f26, f2, f0
  895. FNMADD f1, f26, f3, f1
  896. FNMADD f4, f26, f6, f4
  897. FNMADD f5, f26, f7, f5
  898. FNMADD f8, f26, f10, f8
  899. FNMADD f9, f26, f11, f9
  900. FNMADD f12, f26, f14, f12
  901. FNMADD f13, f26, f15, f13
  /* Step 3 (conjugated multiplier). */
  902. FMUL f16, f29, f1
  903. FMUL f17, f29, f0
  904. FMUL f18, f29, f5
  905. FMUL f19, f29, f4
  906. FMUL f20, f29, f9
  907. FMUL f21, f29, f8
  908. FMUL f22, f29, f13
  909. FMUL f23, f29, f12
  910. FMADD f0, f28, f0, f16
  911. FMSUB f1, f28, f1, f17
  912. FMADD f4, f28, f4, f18
  913. FMSUB f5, f28, f5, f19
  914. FMADD f8, f28, f8, f20
  915. FMSUB f9, f28, f9, f21
  916. FMADD f12, f28, f12, f22
  917. FMSUB f13, f28, f13, f23
  918. #endif
  919. #endif
  /* LT solve: same three steps as the LN case, in the opposite order.
       (f24,f25) = AO words 0,1    (f26,f27) = AO words 2,3
       (f28,f29) = AO words 6,7
     Step 1: (f0,f1) (f4,f5) (f8,f9)  (f12,f13)   *= (f24,f25)
     Step 2: (f2,f3) (f6,f7) (f10,f11) (f14,f15)  -= (f26,f27) * step-1 results
     Step 3: (f2,f3) (f6,f7) (f10,f11) (f14,f15)  *= (f28,f29)
     Register pairs are combined with the complex-product formula; the CONJ
     branch uses the conjugated multipliers.
     NOTE(review): diagonal words are multiplied in, so presumably stored
     pre-inverted -- confirm. */
  920. #ifdef LT
  921. LFD f24, 0 * SIZE(AO)
  922. LFD f25, 1 * SIZE(AO)
  923. LFD f26, 2 * SIZE(AO)
  924. LFD f27, 3 * SIZE(AO)
  925. LFD f28, 6 * SIZE(AO)
  926. LFD f29, 7 * SIZE(AO)
  /* Cross products with f25, taken from the pre-update values. */
  927. FMUL f16, f25, f1
  928. FMUL f17, f25, f0
  929. FMUL f18, f25, f5
  930. FMUL f19, f25, f4
  931. FMUL f20, f25, f9
  932. FMUL f21, f25, f8
  933. FMUL f22, f25, f13
  934. FMUL f23, f25, f12
  935. #ifndef CONJ
  /* Step 1: finish the multiply by (f24,f25). */
  936. FMSUB f0, f24, f0, f16
  937. FMADD f1, f24, f1, f17
  938. FMSUB f4, f24, f4, f18
  939. FMADD f5, f24, f5, f19
  940. FMSUB f8, f24, f8, f20
  941. FMADD f9, f24, f9, f21
  942. FMSUB f12, f24, f12, f22
  943. FMADD f13, f24, f13, f23
  /* Step 2: subtract (f26,f27) times the step-1 results. */
  944. FMADD f2, f27, f1, f2
  945. FNMSUB f3, f27, f0, f3
  946. FMADD f6, f27, f5, f6
  947. FNMSUB f7, f27, f4, f7
  948. FMADD f10, f27, f9, f10
  949. FNMSUB f11, f27, f8, f11
  950. FMADD f14, f27, f13, f14
  951. FNMSUB f15, f27, f12, f15
  952. FNMSUB f2, f26, f0, f2
  953. FNMSUB f3, f26, f1, f3
  954. FNMSUB f6, f26, f4, f6
  955. FNMSUB f7, f26, f5, f7
  956. FNMSUB f10, f26, f8, f10
  957. FNMSUB f11, f26, f9, f11
  958. FNMSUB f14, f26, f12, f14
  959. FNMSUB f15, f26, f13, f15
  /* Step 3: multiply by (f28,f29). */
  960. FMUL f16, f29, f3
  961. FMUL f17, f29, f2
  962. FMUL f18, f29, f7
  963. FMUL f19, f29, f6
  964. FMUL f20, f29, f11
  965. FMUL f21, f29, f10
  966. FMUL f22, f29, f15
  967. FMUL f23, f29, f14
  968. FMSUB f2, f28, f2, f16
  969. FMADD f3, f28, f3, f17
  970. FMSUB f6, f28, f6, f18
  971. FMADD f7, f28, f7, f19
  972. FMSUB f10, f28, f10, f20
  973. FMADD f11, f28, f11, f21
  974. FMSUB f14, f28, f14, f22
  975. FMADD f15, f28, f15, f23
  976. #else
  /* Step 1 (conjugated multiplier). */
  977. FMADD f0, f24, f0, f16
  978. FMSUB f1, f24, f1, f17
  979. FMADD f4, f24, f4, f18
  980. FMSUB f5, f24, f5, f19
  981. FMADD f8, f24, f8, f20
  982. FMSUB f9, f24, f9, f21
  983. FMADD f12, f24, f12, f22
  984. FMSUB f13, f24, f13, f23
  /* Step 2 (conjugated multiplier): the first instruction of each pair
     leaves the target negated, the following FNMADD negates it back. */
  985. FMSUB f2, f27, f1, f2
  986. FNMADD f3, f27, f0, f3
  987. FMSUB f6, f27, f5, f6
  988. FNMADD f7, f27, f4, f7
  989. FMSUB f10, f27, f9, f10
  990. FNMADD f11, f27, f8, f11
  991. FMSUB f14, f27, f13, f14
  992. FNMADD f15, f27, f12, f15
  993. FNMADD f2, f26, f0, f2
  994. FNMADD f3, f26, f1, f3
  995. FNMADD f6, f26, f4, f6
  996. FNMADD f7, f26, f5, f7
  997. FNMADD f10, f26, f8, f10
  998. FNMADD f11, f26, f9, f11
  999. FNMADD f14, f26, f12, f14
  1000. FNMADD f15, f26, f13, f15
  /* Step 3 (conjugated multiplier). */
  1001. FMUL f16, f29, f3
  1002. FMUL f17, f29, f2
  1003. FMUL f18, f29, f7
  1004. FMUL f19, f29, f6
  1005. FMUL f20, f29, f11
  1006. FMUL f21, f29, f10
  1007. FMUL f22, f29, f15
  1008. FMUL f23, f29, f14
  1009. FMADD f2, f28, f2, f16
  1010. FMSUB f3, f28, f3, f17
  1011. FMADD f6, f28, f6, f18
  1012. FMSUB f7, f28, f7, f19
  1013. FMADD f10, f28, f10, f20
  1014. FMSUB f11, f28, f11, f21
  1015. FMADD f14, f28, f14, f22
  1016. FMSUB f15, f28, f15, f23
  1017. #endif
  1018. #endif
  /* RN solve: forward pass over four register groups
       G0 = f0-f3,  G1 = f4-f7,  G2 = f8-f11,  G3 = f12-f15
     (each group holds two pairs that are combined with the complex-product
     formula).  BO words used, as pairs:
       G0 *= BO[0,1];   G1 -= BO[2,3]*G0;  G2 -= BO[4,5]*G0;  G3 -= BO[6,7]*G0
       G1 *= BO[10,11]; G2 -= BO[12,13]*G1; G3 -= BO[14,15]*G1
       G2 *= BO[20,21]; G3 -= BO[22,23]*G2
       G3 *= BO[30,31]
     The CONJ branch performs the same sequence with conjugated multipliers.
     NOTE(review): the diagonal pairs (0, 10, 20, 30) are multiplied in, so
     they are presumably stored pre-inverted -- confirm. */
  1019. #ifdef RN
  1020. LFD f24, 0 * SIZE(BO)
  1021. LFD f25, 1 * SIZE(BO)
  1022. LFD f26, 2 * SIZE(BO)
  1023. LFD f27, 3 * SIZE(BO)
  1024. LFD f28, 4 * SIZE(BO)
  1025. LFD f29, 5 * SIZE(BO)
  1026. LFD f30, 6 * SIZE(BO)
  1027. LFD f31, 7 * SIZE(BO)
  /* Cross products for G0 *= (f24,f25), from the pre-update values. */
  1028. FMUL f16, f25, f1
  1029. FMUL f17, f25, f0
  1030. FMUL f18, f25, f3
  1031. FMUL f19, f25, f2
  1032. #ifndef CONJ
  /* G0 *= BO[0,1] */
  1033. FMSUB f0, f24, f0, f16
  1034. FMADD f1, f24, f1, f17
  1035. FMSUB f2, f24, f2, f18
  1036. FMADD f3, f24, f3, f19
  /* G1 -= BO[2,3] * G0 */
  1037. FMADD f4, f27, f1, f4
  1038. FNMSUB f5, f27, f0, f5
  1039. FMADD f6, f27, f3, f6
  1040. FNMSUB f7, f27, f2, f7
  1041. FNMSUB f4, f26, f0, f4
  1042. FNMSUB f5, f26, f1, f5
  1043. FNMSUB f6, f26, f2, f6
  1044. FNMSUB f7, f26, f3, f7
  /* G2 -= BO[4,5] * G0 */
  1045. FMADD f8, f29, f1, f8
  1046. FNMSUB f9, f29, f0, f9
  1047. FMADD f10, f29, f3, f10
  1048. FNMSUB f11, f29, f2, f11
  1049. FNMSUB f8, f28, f0, f8
  1050. FNMSUB f9, f28, f1, f9
  1051. FNMSUB f10, f28, f2, f10
  1052. FNMSUB f11, f28, f3, f11
  /* G3 -= BO[6,7] * G0 */
  1053. FMADD f12, f31, f1, f12
  1054. FNMSUB f13, f31, f0, f13
  1055. FMADD f14, f31, f3, f14
  1056. FNMSUB f15, f31, f2, f15
  1057. FNMSUB f12, f30, f0, f12
  1058. FNMSUB f13, f30, f1, f13
  1059. FNMSUB f14, f30, f2, f14
  1060. FNMSUB f15, f30, f3, f15
  /* Second row of multipliers: BO words 10-15. */
  1061. LFD f26, 10 * SIZE(BO)
  1062. LFD f27, 11 * SIZE(BO)
  1063. LFD f28, 12 * SIZE(BO)
  1064. LFD f29, 13 * SIZE(BO)
  1065. LFD f30, 14 * SIZE(BO)
  1066. LFD f31, 15 * SIZE(BO)
  /* G1 *= BO[10,11] */
  1067. FMUL f16, f27, f5
  1068. FMUL f17, f27, f4
  1069. FMUL f18, f27, f7
  1070. FMUL f19, f27, f6
  1071. FMSUB f4, f26, f4, f16
  1072. FMADD f5, f26, f5, f17
  1073. FMSUB f6, f26, f6, f18
  1074. FMADD f7, f26, f7, f19
  /* G2 -= BO[12,13] * G1 */
  1075. FMADD f8, f29, f5, f8
  1076. FNMSUB f9, f29, f4, f9
  1077. FMADD f10, f29, f7, f10
  1078. FNMSUB f11, f29, f6, f11
  1079. FNMSUB f8, f28, f4, f8
  1080. FNMSUB f9, f28, f5, f9
  1081. FNMSUB f10, f28, f6, f10
  1082. FNMSUB f11, f28, f7, f11
  /* G3 -= BO[14,15] * G1 */
  1083. FMADD f12, f31, f5, f12
  1084. FNMSUB f13, f31, f4, f13
  1085. FMADD f14, f31, f7, f14
  1086. FNMSUB f15, f31, f6, f15
  1087. FNMSUB f12, f30, f4, f12
  1088. FNMSUB f13, f30, f5, f13
  1089. FNMSUB f14, f30, f6, f14
  1090. FNMSUB f15, f30, f7, f15
  /* Third and fourth rows: BO words 20-23 and 30-31. */
  1091. LFD f26, 20 * SIZE(BO)
  1092. LFD f27, 21 * SIZE(BO)
  1093. LFD f28, 22 * SIZE(BO)
  1094. LFD f29, 23 * SIZE(BO)
  1095. LFD f30, 30 * SIZE(BO)
  1096. LFD f31, 31 * SIZE(BO)
  /* G2 *= BO[20,21] */
  1097. FMUL f16, f27, f9
  1098. FMUL f17, f27, f8
  1099. FMUL f18, f27, f11
  1100. FMUL f19, f27, f10
  1101. FMSUB f8, f26, f8, f16
  1102. FMADD f9, f26, f9, f17
  1103. FMSUB f10, f26, f10, f18
  1104. FMADD f11, f26, f11, f19
  /* G3 -= BO[22,23] * G2 */
  1105. FMADD f12, f29, f9, f12
  1106. FNMSUB f13, f29, f8, f13
  1107. FMADD f14, f29, f11, f14
  1108. FNMSUB f15, f29, f10, f15
  1109. FNMSUB f12, f28, f8, f12
  1110. FNMSUB f13, f28, f9, f13
  1111. FNMSUB f14, f28, f10, f14
  1112. FNMSUB f15, f28, f11, f15
  /* G3 *= BO[30,31] */
  1113. FMUL f16, f31, f13
  1114. FMUL f17, f31, f12
  1115. FMUL f18, f31, f15
  1116. FMUL f19, f31, f14
  1117. FMSUB f12, f30, f12, f16
  1118. FMADD f13, f30, f13, f17
  1119. FMSUB f14, f30, f14, f18
  1120. FMADD f15, f30, f15, f19
  1121. #else
  /* CONJ branch: same sequence with conjugated multipliers.  In each
     subtraction the first instruction of a pair leaves the target negated
     and the FNMADD that follows negates it back. */
  /* G0 *= conj(BO[0,1]) */
  1122. FMADD f0, f24, f0, f16
  1123. FMSUB f1, f24, f1, f17
  1124. FMADD f2, f24, f2, f18
  1125. FMSUB f3, f24, f3, f19
  /* G1 -= conj(BO[2,3]) * G0 */
  1126. FMSUB f4, f27, f1, f4
  1127. FNMADD f5, f27, f0, f5
  1128. FMSUB f6, f27, f3, f6
  1129. FNMADD f7, f27, f2, f7
  1130. FNMADD f4, f26, f0, f4
  1131. FNMADD f5, f26, f1, f5
  1132. FNMADD f6, f26, f2, f6
  1133. FNMADD f7, f26, f3, f7
  /* G2 -= conj(BO[4,5]) * G0 */
  1134. FMSUB f8, f29, f1, f8
  1135. FNMADD f9, f29, f0, f9
  1136. FMSUB f10, f29, f3, f10
  1137. FNMADD f11, f29, f2, f11
  1138. FNMADD f8, f28, f0, f8
  1139. FNMADD f9, f28, f1, f9
  1140. FNMADD f10, f28, f2, f10
  1141. FNMADD f11, f28, f3, f11
  /* G3 -= conj(BO[6,7]) * G0 */
  1142. FMSUB f12, f31, f1, f12
  1143. FNMADD f13, f31, f0, f13
  1144. FMSUB f14, f31, f3, f14
  1145. FNMADD f15, f31, f2, f15
  1146. FNMADD f12, f30, f0, f12
  1147. FNMADD f13, f30, f1, f13
  1148. FNMADD f14, f30, f2, f14
  1149. FNMADD f15, f30, f3, f15
  /* Second row of multipliers: BO words 10-15. */
  1150. LFD f26, 10 * SIZE(BO)
  1151. LFD f27, 11 * SIZE(BO)
  1152. LFD f28, 12 * SIZE(BO)
  1153. LFD f29, 13 * SIZE(BO)
  1154. LFD f30, 14 * SIZE(BO)
  1155. LFD f31, 15 * SIZE(BO)
  /* G1 *= conj(BO[10,11]) */
  1156. FMUL f16, f27, f5
  1157. FMUL f17, f27, f4
  1158. FMUL f18, f27, f7
  1159. FMUL f19, f27, f6
  1160. FMADD f4, f26, f4, f16
  1161. FMSUB f5, f26, f5, f17
  1162. FMADD f6, f26, f6, f18
  1163. FMSUB f7, f26, f7, f19
  /* G2 -= conj(BO[12,13]) * G1 */
  1164. FMSUB f8, f29, f5, f8
  1165. FNMADD f9, f29, f4, f9
  1166. FMSUB f10, f29, f7, f10
  1167. FNMADD f11, f29, f6, f11
  1168. FNMADD f8, f28, f4, f8
  1169. FNMADD f9, f28, f5, f9
  1170. FNMADD f10, f28, f6, f10
  1171. FNMADD f11, f28, f7, f11
  /* G3 -= conj(BO[14,15]) * G1 */
  1172. FMSUB f12, f31, f5, f12
  1173. FNMADD f13, f31, f4, f13
  1174. FMSUB f14, f31, f7, f14
  1175. FNMADD f15, f31, f6, f15
  1176. FNMADD f12, f30, f4, f12
  1177. FNMADD f13, f30, f5, f13
  1178. FNMADD f14, f30, f6, f14
  1179. FNMADD f15, f30, f7, f15
  /* Third and fourth rows: BO words 20-23 and 30-31. */
  1180. LFD f26, 20 * SIZE(BO)
  1181. LFD f27, 21 * SIZE(BO)
  1182. LFD f28, 22 * SIZE(BO)
  1183. LFD f29, 23 * SIZE(BO)
  1184. LFD f30, 30 * SIZE(BO)
  1185. LFD f31, 31 * SIZE(BO)
  /* G2 *= conj(BO[20,21]) */
  1186. FMUL f16, f27, f9
  1187. FMUL f17, f27, f8
  1188. FMUL f18, f27, f11
  1189. FMUL f19, f27, f10
  1190. FMADD f8, f26, f8, f16
  1191. FMSUB f9, f26, f9, f17
  1192. FMADD f10, f26, f10, f18
  1193. FMSUB f11, f26, f11, f19
  /* G3 -= conj(BO[22,23]) * G2 */
  1194. FMSUB f12, f29, f9, f12
  1195. FNMADD f13, f29, f8, f13
  1196. FMSUB f14, f29, f11, f14
  1197. FNMADD f15, f29, f10, f15
  1198. FNMADD f12, f28, f8, f12
  1199. FNMADD f13, f28, f9, f13
  1200. FNMADD f14, f28, f10, f14
  1201. FNMADD f15, f28, f11, f15
  /* G3 *= conj(BO[30,31]) */
  1202. FMUL f16, f31, f13
  1203. FMUL f17, f31, f12
  1204. FMUL f18, f31, f15
  1205. FMUL f19, f31, f14
  1206. FMADD f12, f30, f12, f16
  1207. FMSUB f13, f30, f13, f17
  1208. FMADD f14, f30, f14, f18
  1209. FMSUB f15, f30, f15, f19
  1210. #endif
  1211. #endif
  /* RT solve: backward pass over the four register groups
       G0 = f0-f3,  G1 = f4-f7,  G2 = f8-f11,  G3 = f12-f15
     (each group holds two pairs that are combined with the complex-product
     formula).  BO words used, as pairs:
       G3 *= BO[30,31]; G2 -= BO[28,29]*G3; G1 -= BO[26,27]*G3; G0 -= BO[24,25]*G3
       G2 *= BO[20,21]; G1 -= BO[18,19]*G2; G0 -= BO[16,17]*G2
       G1 *= BO[10,11]; G0 -= BO[8,9]*G1
       G0 *= BO[0,1]
     The CONJ branch performs the same sequence with conjugated multipliers.
     NOTE(review): the diagonal pairs (30, 20, 10, 0) are multiplied in, so
     they are presumably stored pre-inverted -- confirm. */
  1212. #ifdef RT
  1213. LFD f24, 30 * SIZE(BO)
  1214. LFD f25, 31 * SIZE(BO)
  1215. LFD f26, 28 * SIZE(BO)
  1216. LFD f27, 29 * SIZE(BO)
  1217. LFD f28, 26 * SIZE(BO)
  1218. LFD f29, 27 * SIZE(BO)
  1219. LFD f30, 24 * SIZE(BO)
  1220. LFD f31, 25 * SIZE(BO)
  /* Cross products for G3 *= (f24,f25), from the pre-update values. */
  1221. FMUL f16, f25, f13
  1222. FMUL f17, f25, f12
  1223. FMUL f18, f25, f15
  1224. FMUL f19, f25, f14
  1225. #ifndef CONJ
  /* G3 *= BO[30,31] */
  1226. FMSUB f12, f24, f12, f16
  1227. FMADD f13, f24, f13, f17
  1228. FMSUB f14, f24, f14, f18
  1229. FMADD f15, f24, f15, f19
  /* G2 -= BO[28,29] * G3 */
  1230. FMADD f8, f27, f13, f8
  1231. FNMSUB f9, f27, f12, f9
  1232. FMADD f10, f27, f15, f10
  1233. FNMSUB f11, f27, f14, f11
  1234. FNMSUB f8, f26, f12, f8
  1235. FNMSUB f9, f26, f13, f9
  1236. FNMSUB f10, f26, f14, f10
  1237. FNMSUB f11, f26, f15, f11
  /* G1 -= BO[26,27] * G3 */
  1238. FMADD f4, f29, f13, f4
  1239. FNMSUB f5, f29, f12, f5
  1240. FMADD f6, f29, f15, f6
  1241. FNMSUB f7, f29, f14, f7
  1242. FNMSUB f4, f28, f12, f4
  1243. FNMSUB f5, f28, f13, f5
  1244. FNMSUB f6, f28, f14, f6
  1245. FNMSUB f7, f28, f15, f7
  /* G0 -= BO[24,25] * G3 */
  1246. FMADD f0, f31, f13, f0
  1247. FNMSUB f1, f31, f12, f1
  1248. FMADD f2, f31, f15, f2
  1249. FNMSUB f3, f31, f14, f3
  1250. FNMSUB f0, f30, f12, f0
  1251. FNMSUB f1, f30, f13, f1
  1252. FNMSUB f2, f30, f14, f2
  1253. FNMSUB f3, f30, f15, f3
  /* Next row of multipliers: BO words 16-21. */
  1254. LFD f26, 20 * SIZE(BO)
  1255. LFD f27, 21 * SIZE(BO)
  1256. LFD f28, 18 * SIZE(BO)
  1257. LFD f29, 19 * SIZE(BO)
  1258. LFD f30, 16 * SIZE(BO)
  1259. LFD f31, 17 * SIZE(BO)
  /* G2 *= BO[20,21] */
  1260. FMUL f16, f27, f9
  1261. FMUL f17, f27, f8
  1262. FMUL f18, f27, f11
  1263. FMUL f19, f27, f10
  1264. FMSUB f8, f26, f8, f16
  1265. FMADD f9, f26, f9, f17
  1266. FMSUB f10, f26, f10, f18
  1267. FMADD f11, f26, f11, f19
  /* G1 -= BO[18,19] * G2 */
  1268. FMADD f4, f29, f9, f4
  1269. FNMSUB f5, f29, f8, f5
  1270. FMADD f6, f29, f11, f6
  1271. FNMSUB f7, f29, f10, f7
  1272. FNMSUB f4, f28, f8, f4
  1273. FNMSUB f5, f28, f9, f5
  1274. FNMSUB f6, f28, f10, f6
  1275. FNMSUB f7, f28, f11, f7
  /* G0 -= BO[16,17] * G2 */
  1276. FMADD f0, f31, f9, f0
  1277. FNMSUB f1, f31, f8, f1
  1278. FMADD f2, f31, f11, f2
  1279. FNMSUB f3, f31, f10, f3
  1280. FNMSUB f0, f30, f8, f0
  1281. FNMSUB f1, f30, f9, f1
  1282. FNMSUB f2, f30, f10, f2
  1283. FNMSUB f3, f30, f11, f3
  /* Last rows: BO words 8-11 and 0-1. */
  1284. LFD f26, 10 * SIZE(BO)
  1285. LFD f27, 11 * SIZE(BO)
  1286. LFD f28, 8 * SIZE(BO)
  1287. LFD f29, 9 * SIZE(BO)
  1288. LFD f30, 0 * SIZE(BO)
  1289. LFD f31, 1 * SIZE(BO)
  /* G1 *= BO[10,11] */
  1290. FMUL f16, f27, f5
  1291. FMUL f17, f27, f4
  1292. FMUL f18, f27, f7
  1293. FMUL f19, f27, f6
  1294. FMSUB f4, f26, f4, f16
  1295. FMADD f5, f26, f5, f17
  1296. FMSUB f6, f26, f6, f18
  1297. FMADD f7, f26, f7, f19
  /* G0 -= BO[8,9] * G1 */
  1298. FMADD f0, f29, f5, f0
  1299. FNMSUB f1, f29, f4, f1
  1300. FMADD f2, f29, f7, f2
  1301. FNMSUB f3, f29, f6, f3
  1302. FNMSUB f0, f28, f4, f0
  1303. FNMSUB f1, f28, f5, f1
  1304. FNMSUB f2, f28, f6, f2
  1305. FNMSUB f3, f28, f7, f3
  /* G0 *= BO[0,1] */
  1306. FMUL f16, f31, f1
  1307. FMUL f17, f31, f0
  1308. FMUL f18, f31, f3
  1309. FMUL f19, f31, f2
  1310. FMSUB f0, f30, f0, f16
  1311. FMADD f1, f30, f1, f17
  1312. FMSUB f2, f30, f2, f18
  1313. FMADD f3, f30, f3, f19
  1314. #else
  /* CONJ branch: same sequence with conjugated multipliers.  In each
     subtraction the first instruction of a pair leaves the target negated
     and the FNMADD that follows negates it back. */
  /* G3 *= conj(BO[30,31]) */
  1315. FMADD f12, f24, f12, f16
  1316. FMSUB f13, f24, f13, f17
  1317. FMADD f14, f24, f14, f18
  1318. FMSUB f15, f24, f15, f19
  /* G2 -= conj(BO[28,29]) * G3 */
  1319. FMSUB f8, f27, f13, f8
  1320. FNMADD f9, f27, f12, f9
  1321. FMSUB f10, f27, f15, f10
  1322. FNMADD f11, f27, f14, f11
  1323. FNMADD f8, f26, f12, f8
  1324. FNMADD f9, f26, f13, f9
  1325. FNMADD f10, f26, f14, f10
  1326. FNMADD f11, f26, f15, f11
  /* G1 -= conj(BO[26,27]) * G3 */
  1327. FMSUB f4, f29, f13, f4
  1328. FNMADD f5, f29, f12, f5
  1329. FMSUB f6, f29, f15, f6
  1330. FNMADD f7, f29, f14, f7
  1331. FNMADD f4, f28, f12, f4
  1332. FNMADD f5, f28, f13, f5
  1333. FNMADD f6, f28, f14, f6
  1334. FNMADD f7, f28, f15, f7
  /* G0 -= conj(BO[24,25]) * G3 */
  1335. FMSUB f0, f31, f13, f0
  1336. FNMADD f1, f31, f12, f1
  1337. FMSUB f2, f31, f15, f2
  1338. FNMADD f3, f31, f14, f3
  1339. FNMADD f0, f30, f12, f0
  1340. FNMADD f1, f30, f13, f1
  1341. FNMADD f2, f30, f14, f2
  1342. FNMADD f3, f30, f15, f3
  /* Next row of multipliers: BO words 16-21. */
  1343. LFD f26, 20 * SIZE(BO)
  1344. LFD f27, 21 * SIZE(BO)
  1345. LFD f28, 18 * SIZE(BO)
  1346. LFD f29, 19 * SIZE(BO)
  1347. LFD f30, 16 * SIZE(BO)
  1348. LFD f31, 17 * SIZE(BO)
  /* G2 *= conj(BO[20,21]) */
  1349. FMUL f16, f27, f9
  1350. FMUL f17, f27, f8
  1351. FMUL f18, f27, f11
  1352. FMUL f19, f27, f10
  1353. FMADD f8, f26, f8, f16
  1354. FMSUB f9, f26, f9, f17
  1355. FMADD f10, f26, f10, f18
  1356. FMSUB f11, f26, f11, f19
  /* G1 -= conj(BO[18,19]) * G2 */
  1357. FMSUB f4, f29, f9, f4
  1358. FNMADD f5, f29, f8, f5
  1359. FMSUB f6, f29, f11, f6
  1360. FNMADD f7, f29, f10, f7
  1361. FNMADD f4, f28, f8, f4
  1362. FNMADD f5, f28, f9, f5
  1363. FNMADD f6, f28, f10, f6
  1364. FNMADD f7, f28, f11, f7
  /* G0 -= conj(BO[16,17]) * G2 */
  1365. FMSUB f0, f31, f9, f0
  1366. FNMADD f1, f31, f8, f1
  1367. FMSUB f2, f31, f11, f2
  1368. FNMADD f3, f31, f10, f3
  1369. FNMADD f0, f30, f8, f0
  1370. FNMADD f1, f30, f9, f1
  1371. FNMADD f2, f30, f10, f2
  1372. FNMADD f3, f30, f11, f3
  /* Last rows: BO words 8-11 and 0-1. */
  1373. LFD f26, 10 * SIZE(BO)
  1374. LFD f27, 11 * SIZE(BO)
  1375. LFD f28, 8 * SIZE(BO)
  1376. LFD f29, 9 * SIZE(BO)
  1377. LFD f30, 0 * SIZE(BO)
  1378. LFD f31, 1 * SIZE(BO)
  /* G1 *= conj(BO[10,11]) */
  1379. FMUL f16, f27, f5
  1380. FMUL f17, f27, f4
  1381. FMUL f18, f27, f7
  1382. FMUL f19, f27, f6
  1383. FMADD f4, f26, f4, f16
  1384. FMSUB f5, f26, f5, f17
  1385. FMADD f6, f26, f6, f18
  1386. FMSUB f7, f26, f7, f19
  /* G0 -= conj(BO[8,9]) * G1 */
  1387. FMSUB f0, f29, f5, f0
  1388. FNMADD f1, f29, f4, f1
  1389. FMSUB f2, f29, f7, f2
  1390. FNMADD f3, f29, f6, f3
  1391. FNMADD f0, f28, f4, f0
  1392. FNMADD f1, f28, f5, f1
  1393. FNMADD f2, f28, f6, f2
  1394. FNMADD f3, f28, f7, f3
  /* G0 *= conj(BO[0,1]) */
  1395. FMUL f16, f31, f1
  1396. FMUL f17, f31, f0
  1397. FMUL f18, f31, f3
  1398. FMUL f19, f31, f2
  1399. FMADD f0, f30, f0, f16
  1400. FMSUB f1, f30, f1, f17
  1401. FMADD f2, f30, f2, f18
  1402. FMSUB f3, f30, f3, f19
  1403. #endif
  1404. #endif
  /* Write the solved tile back, clear the accumulators, and step to the
     next tile.  For LN the output pointers are moved back by 4*SIZE before
     the stores; every other variant moves them forward after the stores. */
  1405. #ifdef LN
  1406. subi CO1, CO1, 4 * SIZE
  1407. subi CO2, CO2, 4 * SIZE
  1408. subi CO3, CO3, 4 * SIZE
  1409. subi CO4, CO4, 4 * SIZE
  1410. #endif
  /* Store the results back into the packed panel: BO for LN/LT, AO for
     RN/RT.  The register order mirrors the order used when these words were
     loaded and subtracted before the solve. */
  1411. #if defined(LN) || defined(LT)
  1412. STFD f0, 0 * SIZE(BO)
  1413. STFD f1, 1 * SIZE(BO)
  1414. STFD f4, 2 * SIZE(BO)
  1415. STFD f5, 3 * SIZE(BO)
  1416. STFD f8, 4 * SIZE(BO)
  1417. STFD f9, 5 * SIZE(BO)
  1418. STFD f12, 6 * SIZE(BO)
  1419. STFD f13, 7 * SIZE(BO)
  1420. STFD f2, 8 * SIZE(BO)
  1421. STFD f3, 9 * SIZE(BO)
  1422. STFD f6, 10 * SIZE(BO)
  1423. STFD f7, 11 * SIZE(BO)
  1424. STFD f10, 12 * SIZE(BO)
  1425. STFD f11, 13 * SIZE(BO)
  1426. STFD f14, 14 * SIZE(BO)
  1427. STFD f15, 15 * SIZE(BO)
  1428. #else
  1429. STFD f0, 0 * SIZE(AO)
  1430. STFD f1, 1 * SIZE(AO)
  1431. STFD f2, 2 * SIZE(AO)
  1432. STFD f3, 3 * SIZE(AO)
  1433. STFD f4, 4 * SIZE(AO)
  1434. STFD f5, 5 * SIZE(AO)
  1435. STFD f6, 6 * SIZE(AO)
  1436. STFD f7, 7 * SIZE(AO)
  1437. STFD f8, 8 * SIZE(AO)
  1438. STFD f9, 9 * SIZE(AO)
  1439. STFD f10, 10 * SIZE(AO)
  1440. STFD f11, 11 * SIZE(AO)
  1441. STFD f12, 12 * SIZE(AO)
  1442. STFD f13, 13 * SIZE(AO)
  1443. STFD f14, 14 * SIZE(AO)
  1444. STFD f15, 15 * SIZE(AO)
  1445. #endif
  /* Store four words to each of CO1..CO4.  Each group of registers is
     cleared only after it has been stored: f0 is reloaded from FZERO and
     copied into the others.  FZERO is defined outside this excerpt;
     presumably it addresses a 0.0 constant -- confirm. */
  1446. STFD f0, 0 * SIZE(CO1)
  1447. STFD f1, 1 * SIZE(CO1)
  1448. STFD f2, 2 * SIZE(CO1)
  1449. STFD f3, 3 * SIZE(CO1)
  1450. lfs f0, FZERO
  1451. fmr f1, f0
  1452. fmr f2, f0
  1453. fmr f3, f0
  1454. STFD f4, 0 * SIZE(CO2)
  1455. STFD f5, 1 * SIZE(CO2)
  1456. STFD f6, 2 * SIZE(CO2)
  1457. STFD f7, 3 * SIZE(CO2)
  1458. fmr f4, f0
  1459. fmr f5, f0
  1460. fmr f6, f0
  1461. fmr f7, f0
  1462. STFD f8, 0 * SIZE(CO3)
  1463. STFD f9, 1 * SIZE(CO3)
  1464. STFD f10, 2 * SIZE(CO3)
  1465. STFD f11, 3 * SIZE(CO3)
  1466. fmr f8, f0
  1467. fmr f9, f0
  1468. fmr f10, f0
  1469. fmr f11, f0
  1470. STFD f12, 0 * SIZE(CO4)
  1471. STFD f13, 1 * SIZE(CO4)
  1472. STFD f14, 2 * SIZE(CO4)
  1473. STFD f15, 3 * SIZE(CO4)
  1474. fmr f12, f0
  1475. fmr f13, f0
  1476. fmr f14, f0
  1477. fmr f15, f0
  1478. #ifndef LN
  1479. addi CO1, CO1, 4 * SIZE
  1480. addi CO2, CO2, 4 * SIZE
  1481. addi CO3, CO3, 4 * SIZE
  1482. addi CO4, CO4, 4 * SIZE
  1483. #endif
  /* RT: AORIG += K << (1 + ZBASE_SHIFT). */
  1484. #ifdef RT
  1485. slwi r0, K, 1 + ZBASE_SHIFT
  1486. add AORIG, AORIG, r0
  1487. #endif
  /* LT/RN: skip the K - KK steps that were not consumed:
     AO += (K - KK) << (1 + ZBASE_SHIFT), BO += (K - KK) << (2 + ZBASE_SHIFT). */
  1488. #if defined(LT) || defined(RN)
  1489. sub TEMP, K, KK
  1490. slwi r0, TEMP, 1 + ZBASE_SHIFT
  1491. slwi TEMP, TEMP, 2 + ZBASE_SHIFT
  1492. add AO, AO, r0
  1493. add BO, BO, TEMP
  1494. #endif
  /* KK moves by 2 per tile: up for LT, down for LN. */
  1495. #ifdef LT
  1496. addi KK, KK, 2
  1497. #endif
  1498. #ifdef LN
  1499. subi KK, KK, 2
  1500. #endif
  /* I -= 1 (record form sets CR0); go back to LL(11) while I > 0. */
  1501. addic. I, I, -1
  1502. bgt LL(11)
  1503. .align 4
  /* LL(20): handle the odd-M tile.  I = M & 1; when that is zero, branch to
     LL(29).  Otherwise position AO/BO, preload the first operands
     (f16-f19 from A, f20-f27 from B) and set CTR to the number of passes of
     the 4-step unrolled loop LL(22). */
  1504. LL(20):
  1505. andi. I, M, 1
  1506. ble LL(29)
  1507. #if defined(LT) || defined(RN)
  /* LT/RN: start at the beginning of B; the step count is KK. */
  1508. LFD f16, 0 * SIZE(AO)
  1509. LFD f17, 1 * SIZE(AO)
  1510. LFD f18, 2 * SIZE(AO)
  1511. LFD f19, 3 * SIZE(AO)
  1512. LFD f20, 0 * SIZE(B)
  1513. LFD f21, 1 * SIZE(B)
  1514. LFD f22, 2 * SIZE(B)
  1515. LFD f23, 3 * SIZE(B)
  1516. LFD f24, 4 * SIZE(B)
  1517. LFD f25, 5 * SIZE(B)
  1518. LFD f26, 6 * SIZE(B)
  1519. LFD f27, 7 * SIZE(B)
  1520. srawi. r0, KK, 2
  1521. mr BO, B
  1522. mtspr CTR, r0
  1523. #else
  /* LN/RT: for LN first move AORIG back by K << ZBASE_SHIFT.  Then
       AO = AORIG + (KK << ZBASE_SHIFT)
       BO = B     + (KK << (2 + ZBASE_SHIFT))
     and the step count is TEMP = K - KK. */
  1524. #ifdef LN
  1525. slwi r0, K, 0 + ZBASE_SHIFT
  1526. sub AORIG, AORIG, r0
  1527. #endif
  1528. slwi r0, KK, 0 + ZBASE_SHIFT
  1529. slwi TEMP, KK, 2 + ZBASE_SHIFT
  1530. add AO, AORIG, r0
  1531. add BO, B, TEMP
  1532. sub TEMP, K, KK
  1533. LFD f16, 0 * SIZE(AO)
  1534. LFD f17, 1 * SIZE(AO)
  1535. LFD f18, 2 * SIZE(AO)
  1536. LFD f19, 3 * SIZE(AO)
  1537. LFD f20, 0 * SIZE(BO)
  1538. LFD f21, 1 * SIZE(BO)
  1539. LFD f22, 2 * SIZE(BO)
  1540. LFD f23, 3 * SIZE(BO)
  1541. LFD f24, 4 * SIZE(BO)
  1542. LFD f25, 5 * SIZE(BO)
  1543. LFD f26, 6 * SIZE(BO)
  1544. LFD f27, 7 * SIZE(BO)
  1545. srawi. r0, TEMP, 2
  1546. mtspr CTR, r0
  1547. #endif
  /* CR0 still holds the result of the record-form srawi. above: skip the
     unrolled loop when the count / 4 is not positive. */
  1548. ble LL(25)
  1549. .align 4
  /* LL(22): unrolled loop for the odd-M tile, four steps per pass, repeated
     CTR times.  The A operands for the four steps are (f16,f17), (f18,f19),
     (f28,f29) and (f30,f31); the B operands are reloaded into f20-f27 as
     each group is used up.  Results accumulate in f0-f15.  Each pass advances
     AO by 8*SIZE and BO by 32*SIZE.  FMA1..FMA4 are macros defined outside
     this excerpt; presumably they select the add/subtract forms needed for
     complex (and conjugated) products -- confirm. */
  1550. LL(22):
  /* Step 1: A operand (f16,f17). */
  1551. FMA1 f0, f16, f20, f0
  1552. FMA4 f3, f17, f20, f3
  1553. FMA2 f1, f16, f21, f1
  1554. FMA3 f2, f17, f21, f2
  1555. LFD f28, 4 * SIZE(AO)
  1556. LFD f29, 5 * SIZE(AO)
  1557. LFD f30, 6 * SIZE(AO)
  1558. LFD f31, 7 * SIZE(AO)
  1559. FMA1 f4, f16, f22, f4
  1560. FMA4 f7, f17, f22, f7
  1561. FMA2 f5, f16, f23, f5
  1562. FMA3 f6, f17, f23, f6
  1563. LFD f20, 8 * SIZE(BO)
  1564. LFD f21, 9 * SIZE(BO)
  1565. LFD f22, 10 * SIZE(BO)
  1566. LFD f23, 11 * SIZE(BO)
  1567. FMA1 f8, f16, f24, f8
  1568. FMA4 f11, f17, f24, f11
  1569. FMA2 f9, f16, f25, f9
  1570. FMA3 f10, f17, f25, f10
  1571. FMA1 f12, f16, f26, f12
  1572. FMA4 f15, f17, f26, f15
  1573. FMA2 f13, f16, f27, f13
  1574. FMA3 f14, f17, f27, f14
  1575. LFD f24, 12 * SIZE(BO)
  1576. LFD f25, 13 * SIZE(BO)
  1577. LFD f26, 14 * SIZE(BO)
  1578. LFD f27, 15 * SIZE(BO)
  /* Step 2: A operand (f18,f19). */
  1579. FMA1 f0, f18, f20, f0
  1580. FMA4 f3, f19, f20, f3
  1581. FMA2 f1, f18, f21, f1
  1582. FMA3 f2, f19, f21, f2
  1583. FMA1 f4, f18, f22, f4
  1584. FMA4 f7, f19, f22, f7
  1585. FMA2 f5, f18, f23, f5
  1586. FMA3 f6, f19, f23, f6
  1587. LFD f20, 16 * SIZE(BO)
  1588. LFD f21, 17 * SIZE(BO)
  1589. LFD f22, 18 * SIZE(BO)
  1590. LFD f23, 19 * SIZE(BO)
  1591. FMA1 f8, f18, f24, f8
  1592. FMA4 f11, f19, f24, f11
  1593. FMA2 f9, f18, f25, f9
  1594. FMA3 f10, f19, f25, f10
  1595. FMA1 f12, f18, f26, f12
  1596. FMA4 f15, f19, f26, f15
  1597. FMA2 f13, f18, f27, f13
  1598. FMA3 f14, f19, f27, f14
  1599. LFD f24, 20 * SIZE(BO)
  1600. LFD f25, 21 * SIZE(BO)
  1601. LFD f26, 22 * SIZE(BO)
  1602. LFD f27, 23 * SIZE(BO)
  /* Step 3: A operand (f28,f29); f16-f19 are refilled for the next pass. */
  1603. FMA1 f0, f28, f20, f0
  1604. FMA4 f3, f29, f20, f3
  1605. FMA2 f1, f28, f21, f1
  1606. FMA3 f2, f29, f21, f2
  1607. LFD f16, 8 * SIZE(AO)
  1608. LFD f17, 9 * SIZE(AO)
  1609. LFD f18, 10 * SIZE(AO)
  1610. LFD f19, 11 * SIZE(AO)
  1611. FMA1 f4, f28, f22, f4
  1612. FMA4 f7, f29, f22, f7
  1613. FMA2 f5, f28, f23, f5
  1614. FMA3 f6, f29, f23, f6
  1615. LFD f20, 24 * SIZE(BO)
  1616. LFD f21, 25 * SIZE(BO)
  1617. LFD f22, 26 * SIZE(BO)
  1618. LFD f23, 27 * SIZE(BO)
  1619. FMA1 f8, f28, f24, f8
  1620. FMA4 f11, f29, f24, f11
  1621. FMA2 f9, f28, f25, f9
  1622. FMA3 f10, f29, f25, f10
  1623. FMA1 f12, f28, f26, f12
  1624. FMA4 f15, f29, f26, f15
  1625. FMA2 f13, f28, f27, f13
  1626. FMA3 f14, f29, f27, f14
  1627. LFD f24, 28 * SIZE(BO)
  1628. LFD f25, 29 * SIZE(BO)
  1629. LFD f26, 30 * SIZE(BO)
  1630. LFD f27, 31 * SIZE(BO)
  /* Step 4: A operand (f30,f31); f20-f27 are refilled for the next pass. */
  1631. FMA1 f0, f30, f20, f0
  1632. FMA4 f3, f31, f20, f3
  1633. FMA2 f1, f30, f21, f1
  1634. FMA3 f2, f31, f21, f2
  1635. FMA1 f4, f30, f22, f4
  1636. FMA4 f7, f31, f22, f7
  1637. FMA2 f5, f30, f23, f5
  1638. FMA3 f6, f31, f23, f6
  1639. LFD f20, 32 * SIZE(BO)
  1640. LFD f21, 33 * SIZE(BO)
  1641. LFD f22, 34 * SIZE(BO)
  1642. LFD f23, 35 * SIZE(BO)
  1643. FMA1 f8, f30, f24, f8
  1644. FMA4 f11, f31, f24, f11
  1645. FMA2 f9, f30, f25, f9
  1646. FMA3 f10, f31, f25, f10
  1647. FMA1 f12, f30, f26, f12
  1648. FMA4 f15, f31, f26, f15
  1649. FMA2 f13, f30, f27, f13
  1650. FMA3 f14, f31, f27, f14
  1651. LFD f24, 36 * SIZE(BO)
  1652. LFD f25, 37 * SIZE(BO)
  1653. LFD f26, 38 * SIZE(BO)
  1654. LFD f27, 39 * SIZE(BO)
  /* Advance both panels by four steps and loop while CTR is non-zero. */
  1655. addi AO, AO, 8 * SIZE
  1656. addi BO, BO, 32 * SIZE
  1657. bdnz LL(22)
  1658. .align 4
  /*
   * LL(25): set up the leftover steps that the 4-way unrolled loop LL(22)
   * did not cover.  The count is the low two bits of KK (LT/RN) or of TEMP
   * (other variants); it goes into CTR for the loop at LL(26).
   */
  1659. LL(25):
  1660. #if defined(LT) || defined(RN)
  1661. andi. r0, KK, 3
  1662. #else
  1663. andi. r0, TEMP, 3
  1664. #endif
  1665. mtspr CTR, r0
  /* andi. recorded the comparison of the masked count with zero; the mask
     makes the value 0..3, so this branch is taken exactly when it is 0 */
  1666. ble LL(27)
  1667. .align 4
  /*
   * LL(26): remainder loop, one step per pass, CTR passes.  Each pass
   * accumulates the AO pair f16/f17 against the eight BO values f20..f27
   * into f0-f15, then advances AO by 2 * SIZE and BO by 8 * SIZE.  The
   * operands for the following pass are loaded before the pointers move,
   * hence the offsets 2..3 (AO) and 8..15 (BO).
   * NOTE(review): FMA1..FMA4 are macros defined outside this chunk.
   */
  1668. LL(26):
  1669. FMA1 f0, f16, f20, f0
  1670. FMA4 f3, f17, f20, f3
  1671. FMA2 f1, f16, f21, f1
  1672. FMA3 f2, f17, f21, f2
  1673. FMA1 f4, f16, f22, f4
  1674. FMA4 f7, f17, f22, f7
  1675. FMA2 f5, f16, f23, f5
  1676. FMA3 f6, f17, f23, f6
  /* f20..f23 are consumed: reload them for the next pass */
  1677. LFD f20, 8 * SIZE(BO)
  1678. LFD f21, 9 * SIZE(BO)
  1679. LFD f22, 10 * SIZE(BO)
  1680. LFD f23, 11 * SIZE(BO)
  1681. FMA1 f8, f16, f24, f8
  1682. FMA4 f11, f17, f24, f11
  1683. FMA2 f9, f16, f25, f9
  1684. FMA3 f10, f17, f25, f10
  1685. FMA1 f12, f16, f26, f12
  1686. FMA4 f15, f17, f26, f15
  1687. FMA2 f13, f16, f27, f13
  1688. FMA3 f14, f17, f27, f14
  /* next pass's AO pair and remaining BO values */
  1689. LFD f16, 2 * SIZE(AO)
  1690. LFD f17, 3 * SIZE(AO)
  1691. LFD f24, 12 * SIZE(BO)
  1692. LFD f25, 13 * SIZE(BO)
  1693. LFD f26, 14 * SIZE(BO)
  1694. LFD f27, 15 * SIZE(BO)
  1695. addi AO, AO, 2 * SIZE
  1696. addi BO, BO, 8 * SIZE
  1697. bdnz LL(26)
  1698. .align 4
  /*
   * LL(27): accumulation is finished for this tile.  This part
   *   1. (LN/RT only) re-points AO and BO at the entries to be solved,
   *   2. folds the sixteen accumulators f0-f15 down to eight values,
   *   3. replaces those eight values by (stored value - accumulated sum).
   * The results stay in f0,f1,f4,f5,f8,f9,f12,f13 for the solve code that
   * follows.
   */
  1699. LL(27):
  1700. #if defined(LN) || defined(RT)
  /* r0 = KK - 1 (LN) or KK - 4 (RT) */
  1701. #ifdef LN
  1702. subi r0, KK, 1
  1703. #else
  1704. subi r0, KK, 4
  1705. #endif
  /* AO = AORIG + (r0 << ZBASE_SHIFT); BO = B + (r0 << (2 + ZBASE_SHIFT)),
     i.e. the BO stride per index is four times the AO stride */
  1706. slwi TEMP, r0, 0 + ZBASE_SHIFT
  1707. slwi r0, r0, 2 + ZBASE_SHIFT
  1708. add AO, AORIG, TEMP
  1709. add BO, B, r0
  1710. #endif
  /* fold the accumulator pairs: f0+=f2, f1+=f3, f4+=f6, f5+=f7, ... */
  1711. FADD f0, f0, f2
  1712. FADD f1, f1, f3
  1713. FADD f4, f4, f6
  1714. FADD f5, f5, f7
  1715. FADD f8, f8, f10
  1716. FADD f9, f9, f11
  1717. FADD f12, f12, f14
  1718. FADD f13, f13, f15
  1719. #if defined(LN) || defined(LT)
  /* LN/LT: the eight right-hand-side values are read from BO[0..7] */
  1720. LFD f16, 0 * SIZE(BO)
  1721. LFD f17, 1 * SIZE(BO)
  1722. LFD f18, 2 * SIZE(BO)
  1723. LFD f19, 3 * SIZE(BO)
  /* fN = loaded value - accumulated sum */
  1724. FSUB f0, f16, f0
  1725. FSUB f1, f17, f1
  1726. FSUB f4, f18, f4
  1727. FSUB f5, f19, f5
  1728. LFD f20, 4 * SIZE(BO)
  1729. LFD f21, 5 * SIZE(BO)
  1730. LFD f22, 6 * SIZE(BO)
  1731. LFD f23, 7 * SIZE(BO)
  1732. FSUB f8, f20, f8
  1733. FSUB f9, f21, f9
  1734. FSUB f12, f22, f12
  1735. FSUB f13, f23, f13
  1736. #else
  /* RN/RT: the eight right-hand-side values are read from AO[0..7] */
  1737. LFD f16, 0 * SIZE(AO)
  1738. LFD f17, 1 * SIZE(AO)
  1739. LFD f20, 2 * SIZE(AO)
  1740. LFD f21, 3 * SIZE(AO)
  1741. FSUB f0, f16, f0
  1742. FSUB f1, f17, f1
  1743. FSUB f4, f20, f4
  1744. FSUB f5, f21, f5
  1745. LFD f24, 4 * SIZE(AO)
  1746. LFD f25, 5 * SIZE(AO)
  1747. LFD f28, 6 * SIZE(AO)
  1748. LFD f29, 7 * SIZE(AO)
  1749. FSUB f8, f24, f8
  1750. FSUB f9, f25, f9
  1751. FSUB f12, f28, f12
  1752. FSUB f13, f29, f13
  1753. #endif
  /*
   * LN solve for this tile: scale each of the four value pairs
   * (f0,f1) (f4,f5) (f8,f9) (f12,f13) by the pair (f28,f29) read from
   * AO[0..1].  With x = (fa,fb) the non-CONJ result is
   *     fa' = f28*fa - f29*fb,   fb' = f28*fb + f29*fa
   * and the CONJ result flips the sign of the f29 terms.
   * Assumes FMADD/FMSUB expand to the like-named PowerPC instructions
   * (a*b + c / a*b - c) -- the macros are defined outside this chunk.
   * NOTE(review): (f28,f29) is presumably a pre-inverted diagonal entry,
   * since it is multiplied rather than divided -- confirm with the packing
   * code.
   */
  1754. #ifdef LN
  1755. LFD f28, 0 * SIZE(AO)
  1756. LFD f29, 1 * SIZE(AO)
  /* cross terms first, so they use the values from before the update */
  1757. FMUL f16, f29, f1
  1758. FMUL f17, f29, f0
  1759. FMUL f18, f29, f5
  1760. FMUL f19, f29, f4
  1761. FMUL f20, f29, f9
  1762. FMUL f21, f29, f8
  1763. FMUL f22, f29, f13
  1764. FMUL f23, f29, f12
  1765. #ifndef CONJ
  1766. FMSUB f0, f28, f0, f16
  1767. FMADD f1, f28, f1, f17
  1768. FMSUB f4, f28, f4, f18
  1769. FMADD f5, f28, f5, f19
  1770. FMSUB f8, f28, f8, f20
  1771. FMADD f9, f28, f9, f21
  1772. FMSUB f12, f28, f12, f22
  1773. FMADD f13, f28, f13, f23
  1774. #else
  1775. FMADD f0, f28, f0, f16
  1776. FMSUB f1, f28, f1, f17
  1777. FMADD f4, f28, f4, f18
  1778. FMSUB f5, f28, f5, f19
  1779. FMADD f8, f28, f8, f20
  1780. FMSUB f9, f28, f9, f21
  1781. FMADD f12, f28, f12, f22
  1782. FMSUB f13, f28, f13, f23
  1783. #endif
  1784. #endif
  /*
   * LT solve for this tile: identical arithmetic to the LN case, but the
   * scaling pair is held in (f24,f25), again read from AO[0..1].  For each
   * pair (fa,fb) in (f0,f1) (f4,f5) (f8,f9) (f12,f13):
   *     non-CONJ: fa' = f24*fa - f25*fb,   fb' = f24*fb + f25*fa
   *     CONJ:     fa' = f24*fa + f25*fb,   fb' = f24*fb - f25*fa
   * Assumes FMADD/FMSUB expand to the like-named PowerPC instructions;
   * the macros are defined outside this chunk.
   */
  1785. #ifdef LT
  1786. LFD f24, 0 * SIZE(AO)
  1787. LFD f25, 1 * SIZE(AO)
  /* cross terms are formed from the not-yet-updated values */
  1788. FMUL f16, f25, f1
  1789. FMUL f17, f25, f0
  1790. FMUL f18, f25, f5
  1791. FMUL f19, f25, f4
  1792. FMUL f20, f25, f9
  1793. FMUL f21, f25, f8
  1794. FMUL f22, f25, f13
  1795. FMUL f23, f25, f12
  1796. #ifndef CONJ
  1797. FMSUB f0, f24, f0, f16
  1798. FMADD f1, f24, f1, f17
  1799. FMSUB f4, f24, f4, f18
  1800. FMADD f5, f24, f5, f19
  1801. FMSUB f8, f24, f8, f20
  1802. FMADD f9, f24, f9, f21
  1803. FMSUB f12, f24, f12, f22
  1804. FMADD f13, f24, f13, f23
  1805. #else
  1806. FMADD f0, f24, f0, f16
  1807. FMSUB f1, f24, f1, f17
  1808. FMADD f4, f24, f4, f18
  1809. FMSUB f5, f24, f5, f19
  1810. FMADD f8, f24, f8, f20
  1811. FMSUB f9, f24, f9, f21
  1812. FMADD f12, f24, f12, f22
  1813. FMSUB f13, f24, f13, f23
  1814. #endif
  1815. #endif
  /*
   * RN solve for this tile: forward elimination over the four value pairs
   * in the order (f0,f1) -> (f4,f5) -> (f8,f9) -> (f12,f13).  Each stage
   *   - scales the current pair by a coefficient pair from BO, then
   *   - subtracts (coefficient * current pair) from every later pair.
   * Coefficients are read from BO offsets 0..7, 10..15, 20..23 and 30..31.
   * NOTE(review): that pattern is consistent with a 4x4 block of 2*SIZE
   * entries with a row stride of 8*SIZE, walked from the diagonal to the
   * right -- confirm against the packing code.
   * Assumes FMADD/FMSUB/FNMADD/FNMSUB expand to the like-named PowerPC
   * instructions; the macros are defined outside this chunk.  The CONJ
   * branch performs the same steps with the sign of the second
   * ("imaginary") coefficient flipped.
   */
  1816. #ifdef RN
  1817. LFD f24, 0 * SIZE(BO)
  1818. LFD f25, 1 * SIZE(BO)
  1819. LFD f26, 2 * SIZE(BO)
  1820. LFD f27, 3 * SIZE(BO)
  1821. LFD f28, 4 * SIZE(BO)
  1822. LFD f29, 5 * SIZE(BO)
  1823. LFD f30, 6 * SIZE(BO)
  1824. LFD f31, 7 * SIZE(BO)
  /* cross terms for scaling (f0,f1) by (f24,f25); shared by both branches */
  1825. FMUL f16, f25, f1
  1826. FMUL f17, f25, f0
  1827. #ifndef CONJ
  /* stage 1: (f0,f1) *= (f24,f25) */
  1828. FMSUB f0, f24, f0, f16
  1829. FMADD f1, f24, f1, f17
  /* (f4,f5) -= (f26,f27)*(f0,f1): f4 -= f26*f0 - f27*f1, f5 -= f27*f0 + f26*f1 */
  1830. FMADD f4, f27, f1, f4
  1831. FNMSUB f5, f27, f0, f5
  1832. FNMSUB f4, f26, f0, f4
  1833. FNMSUB f5, f26, f1, f5
  /* (f8,f9) -= (f28,f29)*(f0,f1) */
  1834. FMADD f8, f29, f1, f8
  1835. FNMSUB f9, f29, f0, f9
  1836. FNMSUB f8, f28, f0, f8
  1837. FNMSUB f9, f28, f1, f9
  /* (f12,f13) -= (f30,f31)*(f0,f1) */
  1838. FMADD f12, f31, f1, f12
  1839. FNMSUB f13, f31, f0, f13
  1840. FNMSUB f12, f30, f0, f12
  1841. FNMSUB f13, f30, f1, f13
  /* stage 2 coefficients */
  1842. LFD f26, 10 * SIZE(BO)
  1843. LFD f27, 11 * SIZE(BO)
  1844. LFD f28, 12 * SIZE(BO)
  1845. LFD f29, 13 * SIZE(BO)
  1846. LFD f30, 14 * SIZE(BO)
  1847. LFD f31, 15 * SIZE(BO)
  /* (f4,f5) *= (f26,f27), then eliminate from (f8,f9) and (f12,f13) */
  1848. FMUL f16, f27, f5
  1849. FMUL f17, f27, f4
  1850. FMSUB f4, f26, f4, f16
  1851. FMADD f5, f26, f5, f17
  1852. FMADD f8, f29, f5, f8
  1853. FNMSUB f9, f29, f4, f9
  1854. FNMSUB f8, f28, f4, f8
  1855. FNMSUB f9, f28, f5, f9
  1856. FMADD f12, f31, f5, f12
  1857. FNMSUB f13, f31, f4, f13
  1858. FNMSUB f12, f30, f4, f12
  1859. FNMSUB f13, f30, f5, f13
  /* stage 3 and stage 4 coefficients */
  1860. LFD f26, 20 * SIZE(BO)
  1861. LFD f27, 21 * SIZE(BO)
  1862. LFD f28, 22 * SIZE(BO)
  1863. LFD f29, 23 * SIZE(BO)
  1864. LFD f30, 30 * SIZE(BO)
  1865. LFD f31, 31 * SIZE(BO)
  /* (f8,f9) *= (f26,f27), then eliminate from (f12,f13) */
  1866. FMUL f16, f27, f9
  1867. FMUL f17, f27, f8
  1868. FMSUB f8, f26, f8, f16
  1869. FMADD f9, f26, f9, f17
  1870. FMADD f12, f29, f9, f12
  1871. FNMSUB f13, f29, f8, f13
  1872. FNMSUB f12, f28, f8, f12
  1873. FNMSUB f13, f28, f9, f13
  /* stage 4: (f12,f13) *= (f30,f31) */
  1874. FMUL f16, f31, f13
  1875. FMUL f17, f31, f12
  1876. FMSUB f12, f30, f12, f16
  1877. FMADD f13, f30, f13, f17
  1878. #else
  /* CONJ variant of stage 1 (uses the f16/f17 cross terms computed above) */
  1879. FMADD f0, f24, f0, f16
  1880. FMSUB f1, f24, f1, f17
  /* f4 -= f26*f0 + f27*f1, f5 -= f26*f1 - f27*f0; likewise below */
  1881. FMSUB f4, f27, f1, f4
  1882. FNMADD f5, f27, f0, f5
  1883. FNMADD f4, f26, f0, f4
  1884. FNMADD f5, f26, f1, f5
  1885. FMSUB f8, f29, f1, f8
  1886. FNMADD f9, f29, f0, f9
  1887. FNMADD f8, f28, f0, f8
  1888. FNMADD f9, f28, f1, f9
  1889. FMSUB f12, f31, f1, f12
  1890. FNMADD f13, f31, f0, f13
  1891. FNMADD f12, f30, f0, f12
  1892. FNMADD f13, f30, f1, f13
  /* stage 2 coefficients */
  1893. LFD f26, 10 * SIZE(BO)
  1894. LFD f27, 11 * SIZE(BO)
  1895. LFD f28, 12 * SIZE(BO)
  1896. LFD f29, 13 * SIZE(BO)
  1897. LFD f30, 14 * SIZE(BO)
  1898. LFD f31, 15 * SIZE(BO)
  1899. FMUL f16, f27, f5
  1900. FMUL f17, f27, f4
  1901. FMADD f4, f26, f4, f16
  1902. FMSUB f5, f26, f5, f17
  1903. FMSUB f8, f29, f5, f8
  1904. FNMADD f9, f29, f4, f9
  1905. FNMADD f8, f28, f4, f8
  1906. FNMADD f9, f28, f5, f9
  1907. FMSUB f12, f31, f5, f12
  1908. FNMADD f13, f31, f4, f13
  1909. FNMADD f12, f30, f4, f12
  1910. FNMADD f13, f30, f5, f13
  /* stage 3 and stage 4 coefficients */
  1911. LFD f26, 20 * SIZE(BO)
  1912. LFD f27, 21 * SIZE(BO)
  1913. LFD f28, 22 * SIZE(BO)
  1914. LFD f29, 23 * SIZE(BO)
  1915. LFD f30, 30 * SIZE(BO)
  1916. LFD f31, 31 * SIZE(BO)
  1917. FMUL f16, f27, f9
  1918. FMUL f17, f27, f8
  1919. FMADD f8, f26, f8, f16
  1920. FMSUB f9, f26, f9, f17
  1921. FMSUB f12, f29, f9, f12
  1922. FNMADD f13, f29, f8, f13
  1923. FNMADD f12, f28, f8, f12
  1924. FNMADD f13, f28, f9, f13
  1925. FMUL f16, f31, f13
  1926. FMUL f17, f31, f12
  1927. FMADD f12, f30, f12, f16
  1928. FMSUB f13, f30, f13, f17
  1929. #endif
  1930. #endif
  /*
   * RT solve for this tile: backward elimination, the mirror image of the
   * RN case.  Pairs are processed in the order
   * (f12,f13) -> (f8,f9) -> (f4,f5) -> (f0,f1); each stage scales the
   * current pair by a coefficient pair from BO and subtracts
   * (coefficient * current pair) from every earlier pair.
   * Coefficients are read from BO offsets 30..24, 20..16, 10..8 and 0..1.
   * NOTE(review): consistent with a 4x4 block of 2*SIZE entries (row
   * stride 8*SIZE) walked from the last diagonal entry leftwards and
   * upwards -- confirm against the packing code.
   * Assumes FMADD/FMSUB/FNMADD/FNMSUB expand to the like-named PowerPC
   * instructions; the macros are defined outside this chunk.
   */
  1931. #ifdef RT
  1932. LFD f24, 30 * SIZE(BO)
  1933. LFD f25, 31 * SIZE(BO)
  1934. LFD f26, 28 * SIZE(BO)
  1935. LFD f27, 29 * SIZE(BO)
  1936. LFD f28, 26 * SIZE(BO)
  1937. LFD f29, 27 * SIZE(BO)
  1938. LFD f30, 24 * SIZE(BO)
  1939. LFD f31, 25 * SIZE(BO)
  /* cross terms for scaling (f12,f13) by (f24,f25); shared by both branches */
  1940. FMUL f16, f25, f13
  1941. FMUL f17, f25, f12
  1942. #ifndef CONJ
  /* stage 1: (f12,f13) *= (f24,f25) */
  1943. FMSUB f12, f24, f12, f16
  1944. FMADD f13, f24, f13, f17
  /* (f8,f9) -= (f26,f27)*(f12,f13) */
  1945. FMADD f8, f27, f13, f8
  1946. FNMSUB f9, f27, f12, f9
  1947. FNMSUB f8, f26, f12, f8
  1948. FNMSUB f9, f26, f13, f9
  /* (f4,f5) -= (f28,f29)*(f12,f13) */
  1949. FMADD f4, f29, f13, f4
  1950. FNMSUB f5, f29, f12, f5
  1951. FNMSUB f4, f28, f12, f4
  1952. FNMSUB f5, f28, f13, f5
  /* (f0,f1) -= (f30,f31)*(f12,f13) */
  1953. FMADD f0, f31, f13, f0
  1954. FNMSUB f1, f31, f12, f1
  1955. FNMSUB f0, f30, f12, f0
  1956. FNMSUB f1, f30, f13, f1
  /* stage 2 coefficients */
  1957. LFD f26, 20 * SIZE(BO)
  1958. LFD f27, 21 * SIZE(BO)
  1959. LFD f28, 18 * SIZE(BO)
  1960. LFD f29, 19 * SIZE(BO)
  1961. LFD f30, 16 * SIZE(BO)
  1962. LFD f31, 17 * SIZE(BO)
  /* (f8,f9) *= (f26,f27), then eliminate from (f4,f5) and (f0,f1) */
  1963. FMUL f16, f27, f9
  1964. FMUL f17, f27, f8
  1965. FMSUB f8, f26, f8, f16
  1966. FMADD f9, f26, f9, f17
  1967. FMADD f4, f29, f9, f4
  1968. FNMSUB f5, f29, f8, f5
  1969. FNMSUB f4, f28, f8, f4
  1970. FNMSUB f5, f28, f9, f5
  1971. FMADD f0, f31, f9, f0
  1972. FNMSUB f1, f31, f8, f1
  1973. FNMSUB f0, f30, f8, f0
  1974. FNMSUB f1, f30, f9, f1
  /* stage 3 and stage 4 coefficients */
  1975. LFD f26, 10 * SIZE(BO)
  1976. LFD f27, 11 * SIZE(BO)
  1977. LFD f28, 8 * SIZE(BO)
  1978. LFD f29, 9 * SIZE(BO)
  1979. LFD f30, 0 * SIZE(BO)
  1980. LFD f31, 1 * SIZE(BO)
  /* (f4,f5) *= (f26,f27), then eliminate from (f0,f1) */
  1981. FMUL f16, f27, f5
  1982. FMUL f17, f27, f4
  1983. FMSUB f4, f26, f4, f16
  1984. FMADD f5, f26, f5, f17
  1985. FMADD f0, f29, f5, f0
  1986. FNMSUB f1, f29, f4, f1
  1987. FNMSUB f0, f28, f4, f0
  1988. FNMSUB f1, f28, f5, f1
  /* stage 4: (f0,f1) *= (f30,f31) */
  1989. FMUL f16, f31, f1
  1990. FMUL f17, f31, f0
  1991. FMSUB f0, f30, f0, f16
  1992. FMADD f1, f30, f1, f17
  1993. #else
  /* CONJ variant: same stages with the sign of the second coefficient of
     each pair flipped (uses the f16/f17 cross terms computed above) */
  1994. FMADD f12, f24, f12, f16
  1995. FMSUB f13, f24, f13, f17
  1996. FMSUB f8, f27, f13, f8
  1997. FNMADD f9, f27, f12, f9
  1998. FNMADD f8, f26, f12, f8
  1999. FNMADD f9, f26, f13, f9
  2000. FMSUB f4, f29, f13, f4
  2001. FNMADD f5, f29, f12, f5
  2002. FNMADD f4, f28, f12, f4
  2003. FNMADD f5, f28, f13, f5
  2004. FMSUB f0, f31, f13, f0
  2005. FNMADD f1, f31, f12, f1
  2006. FNMADD f0, f30, f12, f0
  2007. FNMADD f1, f30, f13, f1
  /* stage 2 coefficients */
  2008. LFD f26, 20 * SIZE(BO)
  2009. LFD f27, 21 * SIZE(BO)
  2010. LFD f28, 18 * SIZE(BO)
  2011. LFD f29, 19 * SIZE(BO)
  2012. LFD f30, 16 * SIZE(BO)
  2013. LFD f31, 17 * SIZE(BO)
  2014. FMUL f16, f27, f9
  2015. FMUL f17, f27, f8
  2016. FMADD f8, f26, f8, f16
  2017. FMSUB f9, f26, f9, f17
  2018. FMSUB f4, f29, f9, f4
  2019. FNMADD f5, f29, f8, f5
  2020. FNMADD f4, f28, f8, f4
  2021. FNMADD f5, f28, f9, f5
  2022. FMSUB f0, f31, f9, f0
  2023. FNMADD f1, f31, f8, f1
  2024. FNMADD f0, f30, f8, f0
  2025. FNMADD f1, f30, f9, f1
  /* stage 3 and stage 4 coefficients */
  2026. LFD f26, 10 * SIZE(BO)
  2027. LFD f27, 11 * SIZE(BO)
  2028. LFD f28, 8 * SIZE(BO)
  2029. LFD f29, 9 * SIZE(BO)
  2030. LFD f30, 0 * SIZE(BO)
  2031. LFD f31, 1 * SIZE(BO)
  2032. FMUL f16, f27, f5
  2033. FMUL f17, f27, f4
  2034. FMADD f4, f26, f4, f16
  2035. FMSUB f5, f26, f5, f17
  2036. FMSUB f0, f29, f5, f0
  2037. FNMADD f1, f29, f4, f1
  2038. FNMADD f0, f28, f4, f0
  2039. FNMADD f1, f28, f5, f1
  2040. FMUL f16, f31, f1
  2041. FMUL f17, f31, f0
  2042. FMADD f0, f30, f0, f16
  2043. FMSUB f1, f30, f1, f17
  2044. #endif
  2045. #endif
  /*
   * Write-back for this tile.  The eight solved values
   * f0,f1,f4,f5,f8,f9,f12,f13 are stored twice: into the packed buffer
   * (BO for LN/LT, AO for RN/RT) and into the output through CO1..CO4,
   * two values per pointer.  Afterwards the pointers and KK are advanced
   * for the next tile.
   */
  2046. #ifdef LN
  /* LN moves backwards through the output: step back before storing */
  2047. subi CO1, CO1, 2 * SIZE
  2048. subi CO2, CO2, 2 * SIZE
  2049. subi CO3, CO3, 2 * SIZE
  2050. subi CO4, CO4, 2 * SIZE
  2051. #endif
  2052. #if defined(LN) || defined(LT)
  2053. STFD f0, 0 * SIZE(BO)
  2054. STFD f1, 1 * SIZE(BO)
  2055. STFD f4, 2 * SIZE(BO)
  2056. STFD f5, 3 * SIZE(BO)
  2057. STFD f8, 4 * SIZE(BO)
  2058. STFD f9, 5 * SIZE(BO)
  2059. STFD f12, 6 * SIZE(BO)
  2060. STFD f13, 7 * SIZE(BO)
  2061. #else
  2062. STFD f0, 0 * SIZE(AO)
  2063. STFD f1, 1 * SIZE(AO)
  2064. STFD f4, 2 * SIZE(AO)
  2065. STFD f5, 3 * SIZE(AO)
  2066. STFD f8, 4 * SIZE(AO)
  2067. STFD f9, 5 * SIZE(AO)
  2068. STFD f12, 6 * SIZE(AO)
  2069. STFD f13, 7 * SIZE(AO)
  2070. #endif
  /* one value pair per output pointer */
  2071. STFD f0, 0 * SIZE(CO1)
  2072. STFD f1, 1 * SIZE(CO1)
  2073. STFD f4, 0 * SIZE(CO2)
  2074. STFD f5, 1 * SIZE(CO2)
  2075. STFD f8, 0 * SIZE(CO3)
  2076. STFD f9, 1 * SIZE(CO3)
  2077. STFD f12, 0 * SIZE(CO4)
  2078. STFD f13, 1 * SIZE(CO4)
  2079. #ifndef LN
  /* every variant except LN moves forwards: step past what was stored */
  2080. addi CO1, CO1, 2 * SIZE
  2081. addi CO2, CO2, 2 * SIZE
  2082. addi CO3, CO3, 2 * SIZE
  2083. addi CO4, CO4, 2 * SIZE
  2084. #endif
  2085. #ifdef RT
  /* AORIG += K << ZBASE_SHIFT */
  2086. slwi r0, K, 0 + ZBASE_SHIFT
  2087. add AORIG, AORIG, r0
  2088. #endif
  2089. #if defined(LT) || defined(RN)
  /* skip the K - KK entries that were not consumed:
     AO += (K - KK) << ZBASE_SHIFT, BO += (K - KK) << (2 + ZBASE_SHIFT) */
  2090. sub TEMP, K, KK
  2091. slwi r0, TEMP, 0 + ZBASE_SHIFT
  2092. slwi TEMP, TEMP, 2 + ZBASE_SHIFT
  2093. add AO, AO, r0
  2094. add BO, BO, TEMP
  2095. #endif
  2096. #ifdef LT
  2097. addi KK, KK, 1
  2098. #endif
  2099. #ifdef LN
  2100. subi KK, KK, 1
  2101. #endif
  2102. .align 4
  /*
   * LL(29): bottom of the J loop body.  Moves B on to the next panel,
   * adjusts KK by 4 for the right-side variants, then decrements J and
   * branches back to LL(10) (outside this chunk) while J > 0.
   */
  2103. LL(29):
  2104. #ifdef LN
  /* B += K << (2 + ZBASE_SHIFT) */
  2105. slwi r0, K, 2 + ZBASE_SHIFT
  2106. add B, B, r0
  2107. #endif
  2108. #if defined(LT) || defined(RN)
  /* BO has already been advanced past the panel: adopt it as the new B */
  2109. mr B, BO
  2110. #endif
  2111. #ifdef RN
  2112. addi KK, KK, 4
  2113. #endif
  2114. #ifdef RT
  2115. subi KK, KK, 4
  2116. #endif
  /* addic. records J - 1 against zero for the bgt below */
  2117. addic. J, J, -1
  2118. bgt LL(10)
  2119. .align 4
  /*
   * LL(30): entry for the part handled when bit 1 of N is set (J = N & 2);
   * otherwise control skips to LL(50) (outside this chunk).  Sets up the
   * two output pointers CO1/CO2, the starting KK for the left-side
   * variants, the A pointer, and the tile counter I = M >> 1.
   */
  2120. LL(30):
  2121. andi. J, N, 2
  2122. ble LL(50)
  2123. .align 4
  2124. #ifdef RT
  /* RT walks backwards: B -= K << (1 + ZBASE_SHIFT), C -= LDC << 1 */
  2125. slwi r0, K, 1 + ZBASE_SHIFT
  2126. sub B, B, r0
  2127. slwi r0, LDC, 1
  2128. sub C, C, r0
  2129. #endif
  /* CO1 = C, CO2 = C + LDC */
  2130. mr CO1, C
  2131. add CO2, C, LDC
  2132. #ifdef LN
  2133. add KK, M, OFFSET
  2134. #endif
  2135. #ifdef LT
  2136. mr KK, OFFSET
  2137. #endif
  /* I = M >> 1; the record form also sets the condition tested by the
     ble LL(40) at the end of this block (nothing in between is a record
     form) */
  2138. srawi. I, M, 1
  2139. #if defined(LN) || defined(RT)
  2140. mr AORIG, A
  2141. #else
  2142. mr AO, A
  2143. #endif
  2144. #ifndef RT
  /* all variants except RT move C forward past the two lines just claimed */
  2145. add C, CO2, LDC
  2146. #endif
  /* no full tiles (I <= 0): go straight to the M & 1 remainder */
  2147. ble LL(40)
  2148. .align 4
  /*
   * LL(31): per-tile setup for the loop at LL(32).  Loads the first four
   * AO values (f16..f19) and the first four B values (f20..f23), clears
   * the sixteen accumulators f0..f15, touches the output lines at CO1 and
   * CO2 for store, and puts (count >> 3) into CTR.
   *   LT/RN:  count = KK, AO is used as is, BO starts at B.
   *   LN/RT:  AO = AORIG + (KK << (1 + ZBASE_SHIFT)),
   *           BO = B     + (KK << (1 + ZBASE_SHIFT)), count = K - KK;
   *           LN first moves AORIG back by K << (1 + ZBASE_SHIFT).
   */
  2149. LL(31):
  2150. #if defined(LT) || defined(RN)
  2151. LFD f16, 0 * SIZE(AO)
  2152. LFD f17, 1 * SIZE(AO)
  2153. LFD f18, 2 * SIZE(AO)
  2154. LFD f19, 3 * SIZE(AO)
  2155. LFD f20, 0 * SIZE(B)
  2156. LFD f21, 1 * SIZE(B)
  2157. LFD f22, 2 * SIZE(B)
  2158. LFD f23, 3 * SIZE(B)
  /* f0 = FZERO, then copy it into f1..f15 */
  2159. lfs f0, FZERO
  2160. fmr f1, f0
  2161. fmr f2, f0
  2162. fmr f3, f0
  2163. fmr f4, f0
  2164. fmr f5, f0
  2165. fmr f6, f0
  2166. fmr f7, f0
  2167. fmr f8, f0
  2168. fmr f9, f0
  2169. fmr f10, f0
  2170. fmr f11, f0
  2171. fmr f12, f0
  2172. fmr f13, f0
  2173. fmr f14, f0
  2174. fmr f15, f0
  2175. dcbtst CO1, PREC
  2176. dcbtst CO2, PREC
  /* CTR = KK >> 3; the record form feeds the ble LL(35) below */
  2177. srawi. r0, KK, 3
  2178. mtspr CTR, r0
  2179. mr BO, B
  2180. #else
  2181. #ifdef LN
  2182. slwi r0, K, 1 + ZBASE_SHIFT
  2183. sub AORIG, AORIG, r0
  2184. #endif
  /* skip the first KK entries of both streams; TEMP then becomes K - KK */
  2185. slwi TEMP, KK, 1 + ZBASE_SHIFT
  2186. add AO, AORIG, TEMP
  2187. add BO, B, TEMP
  2188. sub TEMP, K, KK
  2189. LFD f16, 0 * SIZE(AO)
  2190. LFD f17, 1 * SIZE(AO)
  2191. LFD f18, 2 * SIZE(AO)
  2192. LFD f19, 3 * SIZE(AO)
  2193. LFD f20, 0 * SIZE(BO)
  2194. LFD f21, 1 * SIZE(BO)
  2195. LFD f22, 2 * SIZE(BO)
  2196. LFD f23, 3 * SIZE(BO)
  /* f0 = FZERO, then copy it into f1..f15 */
  2197. lfs f0, FZERO
  2198. fmr f1, f0
  2199. fmr f2, f0
  2200. fmr f3, f0
  2201. fmr f4, f0
  2202. fmr f5, f0
  2203. fmr f6, f0
  2204. fmr f7, f0
  2205. fmr f8, f0
  2206. fmr f9, f0
  2207. fmr f10, f0
  2208. fmr f11, f0
  2209. fmr f12, f0
  2210. fmr f13, f0
  2211. fmr f14, f0
  2212. fmr f15, f0
  2213. dcbtst CO1, PREC
  2214. dcbtst CO2, PREC
  /* CTR = (K - KK) >> 3; the record form feeds the ble LL(35) below */
  2215. srawi. r0, TEMP, 3
  2216. mtspr CTR, r0
  2217. #endif
  /* fewer than eight steps: skip the unrolled loop */
  2218. ble LL(35)
  2219. .align 4
  /*
   * LL(32): accumulation loop unrolled eight times, repeated CTR times by
   * the bdnz at the bottom.  One step multiplies four AO values by four BO
   * values (16 FMADDs) and consumes 4 * SIZE from each stream; one pass
   * therefore advances AO and BO by 32 * SIZE.
   * Accumulator layout: f0..f3 take the four AO values times the first BO
   * value, f4..f7 times the second, f8..f11 the third, f12..f15 the
   * fourth.  Odd steps use operands f16..f19 / f20..f23, even steps use
   * f24..f27 / f28..f31, so each register set is reloaded while the other
   * is in use -- the instruction order matters.
   * Assumes FMADD d,a,b,c expands to the PowerPC fmadd (d = a*b + c); the
   * macro is defined outside this chunk.
   */
  2220. LL(32):
  /* prefetch hints for both streams, PREA ahead of the current position */
  2221. dcbt AO, PREA
  2222. dcbtst BO, PREA
  /* step 1 (operands from offsets 0..3) */
  2223. FMADD f0, f16, f20, f0
  2224. FMADD f4, f16, f21, f4
  2225. FMADD f8, f16, f22, f8
  2226. FMADD f12, f16, f23, f12
  2227. LFD f24, 4 * SIZE(AO)
  2228. LFD f28, 4 * SIZE(BO)
  2229. LFD f25, 5 * SIZE(AO)
  2230. LFD f29, 5 * SIZE(BO)
  2231. FMADD f1, f17, f20, f1
  2232. FMADD f5, f17, f21, f5
  2233. FMADD f9, f17, f22, f9
  2234. FMADD f13, f17, f23, f13
  2235. FMADD f2, f18, f20, f2
  2236. FMADD f6, f18, f21, f6
  2237. FMADD f10, f18, f22, f10
  2238. FMADD f14, f18, f23, f14
  2239. LFD f26, 6 * SIZE(AO)
  2240. LFD f30, 6 * SIZE(BO)
  2241. LFD f27, 7 * SIZE(AO)
  2242. LFD f31, 7 * SIZE(BO)
  2243. FMADD f3, f19, f20, f3
  2244. FMADD f7, f19, f21, f7
  2245. FMADD f11, f19, f22, f11
  2246. FMADD f15, f19, f23, f15
  /* step 2 (operands from offsets 4..7) */
  2247. FMADD f0, f24, f28, f0
  2248. FMADD f4, f24, f29, f4
  2249. FMADD f8, f24, f30, f8
  2250. FMADD f12, f24, f31, f12
  2251. LFD f16, 8 * SIZE(AO)
  2252. LFD f20, 8 * SIZE(BO)
  2253. LFD f17, 9 * SIZE(AO)
  2254. LFD f21, 9 * SIZE(BO)
  2255. FMADD f1, f25, f28, f1
  2256. FMADD f5, f25, f29, f5
  2257. FMADD f9, f25, f30, f9
  2258. FMADD f13, f25, f31, f13
  2259. FMADD f2, f26, f28, f2
  2260. FMADD f6, f26, f29, f6
  2261. FMADD f10, f26, f30, f10
  2262. FMADD f14, f26, f31, f14
  2263. LFD f18, 10 * SIZE(AO)
  2264. LFD f22, 10 * SIZE(BO)
  2265. LFD f19, 11 * SIZE(AO)
  2266. LFD f23, 11 * SIZE(BO)
  2267. FMADD f3, f27, f28, f3
  2268. FMADD f7, f27, f29, f7
  2269. FMADD f11, f27, f30, f11
  2270. FMADD f15, f27, f31, f15
  /* step 3 (operands from offsets 8..11) */
  2271. FMADD f0, f16, f20, f0
  2272. FMADD f4, f16, f21, f4
  2273. FMADD f8, f16, f22, f8
  2274. FMADD f12, f16, f23, f12
  2275. LFD f24, 12 * SIZE(AO)
  2276. LFD f28, 12 * SIZE(BO)
  2277. LFD f25, 13 * SIZE(AO)
  2278. LFD f29, 13 * SIZE(BO)
  2279. FMADD f1, f17, f20, f1
  2280. FMADD f5, f17, f21, f5
  2281. FMADD f9, f17, f22, f9
  2282. FMADD f13, f17, f23, f13
  2283. FMADD f2, f18, f20, f2
  2284. FMADD f6, f18, f21, f6
  2285. FMADD f10, f18, f22, f10
  2286. FMADD f14, f18, f23, f14
  2287. LFD f26, 14 * SIZE(AO)
  2288. LFD f30, 14 * SIZE(BO)
  2289. LFD f27, 15 * SIZE(AO)
  2290. LFD f31, 15 * SIZE(BO)
  2291. FMADD f3, f19, f20, f3
  2292. FMADD f7, f19, f21, f7
  2293. FMADD f11, f19, f22, f11
  2294. FMADD f15, f19, f23, f15
  /* step 4 (operands from offsets 12..15) */
  2295. FMADD f0, f24, f28, f0
  2296. FMADD f4, f24, f29, f4
  2297. FMADD f8, f24, f30, f8
  2298. FMADD f12, f24, f31, f12
  2299. LFD f16, 16 * SIZE(AO)
  2300. LFD f20, 16 * SIZE(BO)
  2301. LFD f17, 17 * SIZE(AO)
  2302. LFD f21, 17 * SIZE(BO)
  2303. FMADD f1, f25, f28, f1
  2304. FMADD f5, f25, f29, f5
  2305. FMADD f9, f25, f30, f9
  2306. FMADD f13, f25, f31, f13
  2307. FMADD f2, f26, f28, f2
  2308. FMADD f6, f26, f29, f6
  2309. FMADD f10, f26, f30, f10
  2310. FMADD f14, f26, f31, f14
  2311. LFD f18, 18 * SIZE(AO)
  2312. LFD f22, 18 * SIZE(BO)
  2313. LFD f19, 19 * SIZE(AO)
  2314. LFD f23, 19 * SIZE(BO)
  2315. FMADD f3, f27, f28, f3
  2316. FMADD f7, f27, f29, f7
  2317. FMADD f11, f27, f30, f11
  2318. FMADD f15, f27, f31, f15
  /* step 5 (operands from offsets 16..19) */
  2319. FMADD f0, f16, f20, f0
  2320. FMADD f4, f16, f21, f4
  2321. FMADD f8, f16, f22, f8
  2322. FMADD f12, f16, f23, f12
  2323. LFD f24, 20 * SIZE(AO)
  2324. LFD f28, 20 * SIZE(BO)
  2325. LFD f25, 21 * SIZE(AO)
  2326. LFD f29, 21 * SIZE(BO)
  2327. FMADD f1, f17, f20, f1
  2328. FMADD f5, f17, f21, f5
  2329. FMADD f9, f17, f22, f9
  2330. FMADD f13, f17, f23, f13
  2331. FMADD f2, f18, f20, f2
  2332. FMADD f6, f18, f21, f6
  2333. FMADD f10, f18, f22, f10
  2334. FMADD f14, f18, f23, f14
  2335. LFD f26, 22 * SIZE(AO)
  2336. LFD f30, 22 * SIZE(BO)
  2337. LFD f27, 23 * SIZE(AO)
  2338. LFD f31, 23 * SIZE(BO)
  2339. FMADD f3, f19, f20, f3
  2340. FMADD f7, f19, f21, f7
  2341. FMADD f11, f19, f22, f11
  2342. FMADD f15, f19, f23, f15
  /* step 6 (operands from offsets 20..23) */
  2343. FMADD f0, f24, f28, f0
  2344. FMADD f4, f24, f29, f4
  2345. FMADD f8, f24, f30, f8
  2346. FMADD f12, f24, f31, f12
  2347. LFD f16, 24 * SIZE(AO)
  2348. LFD f20, 24 * SIZE(BO)
  2349. LFD f17, 25 * SIZE(AO)
  2350. LFD f21, 25 * SIZE(BO)
  2351. FMADD f1, f25, f28, f1
  2352. FMADD f5, f25, f29, f5
  2353. FMADD f9, f25, f30, f9
  2354. FMADD f13, f25, f31, f13
  2355. FMADD f2, f26, f28, f2
  2356. FMADD f6, f26, f29, f6
  2357. FMADD f10, f26, f30, f10
  2358. FMADD f14, f26, f31, f14
  2359. LFD f18, 26 * SIZE(AO)
  2360. LFD f22, 26 * SIZE(BO)
  2361. LFD f19, 27 * SIZE(AO)
  2362. LFD f23, 27 * SIZE(BO)
  2363. FMADD f3, f27, f28, f3
  2364. FMADD f7, f27, f29, f7
  2365. FMADD f11, f27, f30, f11
  2366. FMADD f15, f27, f31, f15
  /* step 7 (operands from offsets 24..27) */
  2367. FMADD f0, f16, f20, f0
  2368. FMADD f4, f16, f21, f4
  2369. FMADD f8, f16, f22, f8
  2370. FMADD f12, f16, f23, f12
  2371. LFD f24, 28 * SIZE(AO)
  2372. LFD f28, 28 * SIZE(BO)
  2373. LFD f25, 29 * SIZE(AO)
  2374. LFD f29, 29 * SIZE(BO)
  2375. FMADD f1, f17, f20, f1
  2376. FMADD f5, f17, f21, f5
  2377. FMADD f9, f17, f22, f9
  2378. FMADD f13, f17, f23, f13
  2379. FMADD f2, f18, f20, f2
  2380. FMADD f6, f18, f21, f6
  2381. FMADD f10, f18, f22, f10
  2382. FMADD f14, f18, f23, f14
  2383. LFD f26, 30 * SIZE(AO)
  2384. LFD f30, 30 * SIZE(BO)
  2385. LFD f27, 31 * SIZE(AO)
  2386. LFD f31, 31 * SIZE(BO)
  2387. FMADD f3, f19, f20, f3
  2388. FMADD f7, f19, f21, f7
  2389. FMADD f11, f19, f22, f11
  2390. FMADD f15, f19, f23, f15
  /* step 8 (operands from offsets 28..31); the loads below fetch
     offsets 32..35, i.e. the step-1 operands of the next pass or of the
     remainder loop */
  2391. FMADD f0, f24, f28, f0
  2392. FMADD f4, f24, f29, f4
  2393. FMADD f8, f24, f30, f8
  2394. FMADD f12, f24, f31, f12
  2395. LFD f16, 32 * SIZE(AO)
  2396. LFD f20, 32 * SIZE(BO)
  2397. LFD f17, 33 * SIZE(AO)
  2398. LFD f21, 33 * SIZE(BO)
  2399. FMADD f1, f25, f28, f1
  2400. FMADD f5, f25, f29, f5
  2401. FMADD f9, f25, f30, f9
  2402. FMADD f13, f25, f31, f13
  2403. FMADD f2, f26, f28, f2
  2404. FMADD f6, f26, f29, f6
  2405. FMADD f10, f26, f30, f10
  2406. FMADD f14, f26, f31, f14
  2407. LFD f18, 34 * SIZE(AO)
  2408. LFD f22, 34 * SIZE(BO)
  2409. LFD f19, 35 * SIZE(AO)
  2410. LFD f23, 35 * SIZE(BO)
  /* all loads of this pass are issued: advance the pointers now; the
     four FMADDs that remain only use registers */
  2411. addi AO, AO, 32 * SIZE
  2412. addi BO, BO, 32 * SIZE
  2413. FMADD f3, f27, f28, f3
  2414. FMADD f7, f27, f29, f7
  2415. FMADD f11, f27, f30, f11
  2416. FMADD f15, f27, f31, f15
  2417. bdnz LL(32)
  2418. .align 4
  /*
   * LL(35): set up the leftover steps that the 8-way unrolled loop LL(32)
   * did not cover.  The count is the low three bits of KK (LT/RN) or of
   * TEMP (other variants); it goes into CTR for the loop at LL(36).
   */
  2419. LL(35):
  2420. #if defined(LT) || defined(RN)
  2421. andi. r0, KK, 7
  2422. #else
  2423. andi. r0, TEMP, 7
  2424. #endif
  2425. mtspr CTR, r0
  /* the masked count is 0..7, so this branch is taken exactly when it is 0 */
  2426. ble LL(38)
  2427. .align 4
  /*
   * LL(36): remainder loop, one step per pass, CTR passes.  Accumulates
   * the four AO values f16..f19 against the four BO values f20..f23 into
   * f0..f15 (same layout as LL(32)), then loads the operands of the next
   * step from offsets 4..7 and advances both pointers by 4 * SIZE.
   * Assumes FMADD d,a,b,c expands to the PowerPC fmadd (d = a*b + c).
   */
  2428. LL(36):
  2429. FMADD f0, f16, f20, f0
  2430. FMADD f4, f16, f21, f4
  2431. FMADD f8, f16, f22, f8
  2432. FMADD f12, f16, f23, f12
  2433. FMADD f1, f17, f20, f1
  2434. FMADD f5, f17, f21, f5
  2435. FMADD f9, f17, f22, f9
  2436. FMADD f13, f17, f23, f13
  2437. FMADD f2, f18, f20, f2
  2438. FMADD f6, f18, f21, f6
  2439. FMADD f10, f18, f22, f10
  2440. FMADD f14, f18, f23, f14
  2441. FMADD f3, f19, f20, f3
  2442. FMADD f7, f19, f21, f7
  2443. FMADD f11, f19, f22, f11
  2444. FMADD f15, f19, f23, f15
  /* operands for the next pass, read before the pointers move */
  2445. LFD f16, 4 * SIZE(AO)
  2446. LFD f17, 5 * SIZE(AO)
  2447. LFD f18, 6 * SIZE(AO)
  2448. LFD f19, 7 * SIZE(AO)
  2449. LFD f20, 4 * SIZE(BO)
  2450. LFD f21, 5 * SIZE(BO)
  2451. LFD f22, 6 * SIZE(BO)
  2452. LFD f23, 7 * SIZE(BO)
  2453. addi BO, BO, 4 * SIZE
  2454. addi AO, AO, 4 * SIZE
  2455. bdnz LL(36)
  2456. .align 4
  /*
   * LL(38): accumulation is finished for this tile.  This part
   *   1. combines the sixteen partial sums f0..f15 into eight values held
   *      in f0..f3 and f8..f11,
   *   2. (LN/RT only) re-points AO and BO at the entries to be solved,
   *   3. replaces the eight values by (stored value -/+ combined sum).
   * NOTE(review): the combination pattern (f0 -/+ f5, f1 +/- f4, ...)
   * matches forming the real and imaginary parts of complex products from
   * the four real partial products, with CONJ selecting the conjugated
   * form -- confirm against the operand layout.
   */
  2457. LL(38):
  2458. #ifndef CONJ
  2459. FSUB f0, f0, f5
  2460. FADD f1, f1, f4
  2461. FSUB f2, f2, f7
  2462. FADD f3, f3, f6
  2463. FSUB f8, f8, f13
  2464. FADD f9, f9, f12
  2465. FSUB f10, f10, f15
  2466. FADD f11, f11, f14
  2467. #else
  2468. FADD f0, f0, f5
  2469. FSUB f1, f4, f1
  2470. FADD f2, f2, f7
  2471. FSUB f3, f6, f3
  2472. FADD f8, f8, f13
  2473. FSUB f9, f12, f9
  2474. FADD f10, f10, f15
  2475. FSUB f11, f14, f11
  2476. #endif
  2477. #if defined(LN) || defined(RT)
  /* AO = AORIG + ((KK - 2) << (1 + ZBASE_SHIFT)), BO likewise from B */
  2478. subi r0, KK, 2
  2479. slwi r0, r0, 1 + ZBASE_SHIFT
  2480. add AO, AORIG, r0
  2481. add BO, B, r0
  2482. #endif
  2483. #if defined(LN) || defined(LT)
  /* LN/LT: right-hand side comes from BO[0..7]; note the register order
     f0,f1,f8,f9,f2,f3,f10,f11 against offsets 0..7 */
  2484. LFD f16, 0 * SIZE(BO)
  2485. LFD f17, 1 * SIZE(BO)
  2486. LFD f18, 2 * SIZE(BO)
  2487. LFD f19, 3 * SIZE(BO)
  2488. LFD f20, 4 * SIZE(BO)
  2489. LFD f21, 5 * SIZE(BO)
  2490. LFD f22, 6 * SIZE(BO)
  2491. LFD f23, 7 * SIZE(BO)
  2492. FSUB f0, f16, f0
  2493. FSUB f1, f17, f1
  2494. FSUB f8, f18, f8
  2495. FSUB f9, f19, f9
  2496. FSUB f2, f20, f2
  2497. FSUB f3, f21, f3
  2498. FSUB f10, f22, f10
  2499. FSUB f11, f23, f11
  2500. #else
  /* RN/RT: right-hand side comes from AO[0..7], register order
     f0,f1,f2,f3,f8,f9,f10,f11 against offsets 0..7 */
  2501. LFD f16, 0 * SIZE(AO)
  2502. LFD f17, 1 * SIZE(AO)
  2503. LFD f18, 2 * SIZE(AO)
  2504. LFD f19, 3 * SIZE(AO)
  2505. LFD f20, 4 * SIZE(AO)
  2506. LFD f21, 5 * SIZE(AO)
  2507. LFD f22, 6 * SIZE(AO)
  2508. LFD f23, 7 * SIZE(AO)
  2509. #ifndef CONJ
  2510. FSUB f0, f16, f0
  2511. FSUB f1, f17, f1
  2512. FSUB f2, f18, f2
  2513. FSUB f3, f19, f3
  2514. FSUB f8, f20, f8
  2515. FSUB f9, f21, f9
  2516. FSUB f10, f22, f10
  2517. FSUB f11, f23, f11
  2518. #else
  /* with CONJ the odd-numbered ("second of each pair") sums are added
     instead of subtracted, compensating for the f4 - f1 style ordering
     used in the CONJ combination above */
  2519. FSUB f0, f16, f0
  2520. FADD f1, f17, f1
  2521. FSUB f2, f18, f2
  2522. FADD f3, f19, f3
  2523. FSUB f8, f20, f8
  2524. FADD f9, f21, f9
  2525. FSUB f10, f22, f10
  2526. FADD f11, f23, f11
  2527. #endif
  2528. #endif
  /*
   * LN solve for this tile: a two-stage backward substitution applied to
   * both value columns at once.
   *   stage 1: (f2,f3) and (f10,f11) are scaled by (f16,f17) = AO[6..7]
   *   update : (f0,f1) -= (f18,f19)*(f2,f3), (f8,f9) -= (f18,f19)*(f10,f11)
   *            with (f18,f19) = AO[4..5]
   *   stage 2: (f0,f1) and (f8,f9) are scaled by (f20,f21) = AO[0..1]
   * The CONJ branch does the same with the sign of the second coefficient
   * of each pair flipped.
   * Assumes FMADD/FMSUB/FNMADD/FNMSUB expand to the like-named PowerPC
   * instructions; the macros are defined outside this chunk.
   */
  2529. #ifdef LN
  2530. LFD f16, 6 * SIZE(AO)
  2531. LFD f17, 7 * SIZE(AO)
  2532. LFD f18, 4 * SIZE(AO)
  2533. LFD f19, 5 * SIZE(AO)
  2534. LFD f20, 0 * SIZE(AO)
  2535. LFD f21, 1 * SIZE(AO)
  /* cross terms for stage 1; f6,f7,f14,f15 are free scratch by now */
  2536. FMUL f6, f17, f3
  2537. FMUL f7, f17, f2
  2538. FMUL f14, f17, f11
  2539. FMUL f15, f17, f10
  2540. #ifndef CONJ
  2541. FMSUB f2, f16, f2, f6
  2542. FMADD f3, f16, f3, f7
  2543. FMSUB f10, f16, f10, f14
  2544. FMADD f11, f16, f11, f15
  /* eliminate the solved pairs from (f0,f1) and (f8,f9) */
  2545. FMADD f0, f19, f3, f0
  2546. FNMSUB f1, f19, f2, f1
  2547. FMADD f8, f19, f11, f8
  2548. FNMSUB f9, f19, f10, f9
  2549. FNMSUB f0, f18, f2, f0
  2550. FNMSUB f1, f18, f3, f1
  2551. FNMSUB f8, f18, f10, f8
  2552. FNMSUB f9, f18, f11, f9
  /* stage 2 */
  2553. FMUL f4, f21, f1
  2554. FMUL f5, f21, f0
  2555. FMUL f12, f21, f9
  2556. FMUL f13, f21, f8
  2557. FMSUB f0, f20, f0, f4
  2558. FMADD f1, f20, f1, f5
  2559. FMSUB f8, f20, f8, f12
  2560. FMADD f9, f20, f9, f13
  2561. #else
  2562. FMADD f2, f16, f2, f6
  2563. FMSUB f3, f16, f3, f7
  2564. FMADD f10, f16, f10, f14
  2565. FMSUB f11, f16, f11, f15
  2566. FMSUB f0, f19, f3, f0
  2567. FNMADD f1, f19, f2, f1
  2568. FMSUB f8, f19, f11, f8
  2569. FNMADD f9, f19, f10, f9
  2570. FNMADD f0, f18, f2, f0
  2571. FNMADD f1, f18, f3, f1
  2572. FNMADD f8, f18, f10, f8
  2573. FNMADD f9, f18, f11, f9
  2574. FMUL f4, f21, f1
  2575. FMUL f5, f21, f0
  2576. FMUL f12, f21, f9
  2577. FMUL f13, f21, f8
  2578. FMADD f0, f20, f0, f4
  2579. FMSUB f1, f20, f1, f5
  2580. FMADD f8, f20, f8, f12
  2581. FMSUB f9, f20, f9, f13
  2582. #endif
  2583. #endif
  /*
   * LT solve for this tile: a two-stage forward substitution applied to
   * both value columns at once.
   *   stage 1: (f0,f1) and (f8,f9) are scaled by (f16,f17) = AO[0..1]
   *   update : (f2,f3) -= (f18,f19)*(f0,f1), (f10,f11) -= (f18,f19)*(f8,f9)
   *            with (f18,f19) = AO[2..3]
   *   stage 2: (f2,f3) and (f10,f11) are scaled by (f20,f21) = AO[6..7]
   * The CONJ branch does the same with the sign of the second coefficient
   * of each pair flipped.
   * Assumes FMADD/FMSUB/FNMADD/FNMSUB expand to the like-named PowerPC
   * instructions; the macros are defined outside this chunk.
   */
  2584. #ifdef LT
  2585. LFD f16, 0 * SIZE(AO)
  2586. LFD f17, 1 * SIZE(AO)
  2587. LFD f18, 2 * SIZE(AO)
  2588. LFD f19, 3 * SIZE(AO)
  2589. LFD f20, 6 * SIZE(AO)
  2590. LFD f21, 7 * SIZE(AO)
  /* cross terms for stage 1; f4,f5,f12,f13 are free scratch by now */
  2591. FMUL f4, f17, f1
  2592. FMUL f5, f17, f0
  2593. FMUL f12, f17, f9
  2594. FMUL f13, f17, f8
  2595. #ifndef CONJ
  2596. FMSUB f0, f16, f0, f4
  2597. FMADD f1, f16, f1, f5
  2598. FMSUB f8, f16, f8, f12
  2599. FMADD f9, f16, f9, f13
  /* eliminate the solved pairs from (f2,f3) and (f10,f11) */
  2600. FMADD f2, f19, f1, f2
  2601. FNMSUB f3, f19, f0, f3
  2602. FMADD f10, f19, f9, f10
  2603. FNMSUB f11, f19, f8, f11
  2604. FNMSUB f2, f18, f0, f2
  2605. FNMSUB f3, f18, f1, f3
  2606. FNMSUB f10, f18, f8, f10
  2607. FNMSUB f11, f18, f9, f11
  /* stage 2 */
  2608. FMUL f4, f21, f3
  2609. FMUL f5, f21, f2
  2610. FMUL f12, f21, f11
  2611. FMUL f13, f21, f10
  2612. FMSUB f2, f20, f2, f4
  2613. FMADD f3, f20, f3, f5
  2614. FMSUB f10, f20, f10, f12
  2615. FMADD f11, f20, f11, f13
  2616. #else
  2617. FMADD f0, f16, f0, f4
  2618. FMSUB f1, f16, f1, f5
  2619. FMADD f8, f16, f8, f12
  2620. FMSUB f9, f16, f9, f13
  2621. FMSUB f2, f19, f1, f2
  2622. FNMADD f3, f19, f0, f3
  2623. FMSUB f10, f19, f9, f10
  2624. FNMADD f11, f19, f8, f11
  2625. FNMADD f2, f18, f0, f2
  2626. FNMADD f3, f18, f1, f3
  2627. FNMADD f10, f18, f8, f10
  2628. FNMADD f11, f18, f9, f11
  2629. FMUL f4, f21, f3
  2630. FMUL f5, f21, f2
  2631. FMUL f12, f21, f11
  2632. FMUL f13, f21, f10
  2633. FMADD f2, f20, f2, f4
  2634. FMSUB f3, f20, f3, f5
  2635. FMADD f10, f20, f10, f12
  2636. FMSUB f11, f20, f11, f13
  2637. #endif
  2638. #endif
  /*
   * RN solve for this tile: a two-stage forward substitution across the
   * two output columns, applied to both value rows at once.
   *   stage 1: (f0,f1) and (f2,f3) are scaled by (f16,f17) = BO[0..1]
   *   update : (f8,f9) -= (f18,f19)*(f0,f1), (f10,f11) -= (f18,f19)*(f2,f3)
   *            with (f18,f19) = BO[2..3]
   *   stage 2: (f8,f9) and (f10,f11) are scaled by (f20,f21) = BO[6..7]
   * The CONJ branch does the same with the sign of the second coefficient
   * of each pair flipped.
   * Assumes FMADD/FMSUB/FNMADD/FNMSUB expand to the like-named PowerPC
   * instructions; the macros are defined outside this chunk.
   */
  2639. #ifdef RN
  2640. LFD f16, 0 * SIZE(BO)
  2641. LFD f17, 1 * SIZE(BO)
  2642. LFD f18, 2 * SIZE(BO)
  2643. LFD f19, 3 * SIZE(BO)
  2644. LFD f20, 6 * SIZE(BO)
  2645. LFD f21, 7 * SIZE(BO)
  /* cross terms for stage 1; f4..f7 are free scratch by now */
  2646. FMUL f4, f17, f1
  2647. FMUL f5, f17, f0
  2648. FMUL f6, f17, f3
  2649. FMUL f7, f17, f2
  2650. #ifndef CONJ
  2651. FMSUB f0, f16, f0, f4
  2652. FMADD f1, f16, f1, f5
  2653. FMSUB f2, f16, f2, f6
  2654. FMADD f3, f16, f3, f7
  /* eliminate the solved pairs from (f8,f9) and (f10,f11) */
  2655. FMADD f8, f19, f1, f8
  2656. FNMSUB f9, f19, f0, f9
  2657. FMADD f10, f19, f3, f10
  2658. FNMSUB f11, f19, f2, f11
  2659. FNMSUB f8, f18, f0, f8
  2660. FNMSUB f9, f18, f1, f9
  2661. FNMSUB f10, f18, f2, f10
  2662. FNMSUB f11, f18, f3, f11
  /* stage 2 */
  2663. FMUL f4, f21, f9
  2664. FMUL f5, f21, f8
  2665. FMUL f6, f21, f11
  2666. FMUL f7, f21, f10
  2667. FMSUB f8, f20, f8, f4
  2668. FMADD f9, f20, f9, f5
  2669. FMSUB f10, f20, f10, f6
  2670. FMADD f11, f20, f11, f7
  2671. #else
  2672. FMADD f0, f16, f0, f4
  2673. FMSUB f1, f16, f1, f5
  2674. FMADD f2, f16, f2, f6
  2675. FMSUB f3, f16, f3, f7
  2676. FMSUB f8, f19, f1, f8
  2677. FNMADD f9, f19, f0, f9
  2678. FMSUB f10, f19, f3, f10
  2679. FNMADD f11, f19, f2, f11
  2680. FNMADD f8, f18, f0, f8
  2681. FNMADD f9, f18, f1, f9
  2682. FNMADD f10, f18, f2, f10
  2683. FNMADD f11, f18, f3, f11
  2684. FMUL f4, f21, f9
  2685. FMUL f5, f21, f8
  2686. FMUL f6, f21, f11
  2687. FMUL f7, f21, f10
  2688. FMADD f8, f20, f8, f4
  2689. FMSUB f9, f20, f9, f5
  2690. FMADD f10, f20, f10, f6
  2691. FMSUB f11, f20, f11, f7
  2692. #endif
  2693. #endif
  /*
   * RT solve for this tile: a two-stage backward substitution across the
   * two output columns, applied to both value rows at once.
   *   stage 1: (f8,f9) and (f10,f11) are scaled by (f16,f17) = BO[6..7]
   *   update : (f0,f1) -= (f18,f19)*(f8,f9), (f2,f3) -= (f18,f19)*(f10,f11)
   *            with (f18,f19) = BO[4..5]
   *   stage 2: (f0,f1) and (f2,f3) are scaled by (f20,f21) = BO[0..1]
   * The CONJ branch does the same with the sign of the second coefficient
   * of each pair flipped.
   * Assumes FMADD/FMSUB/FNMADD/FNMSUB expand to the like-named PowerPC
   * instructions; the macros are defined outside this chunk.
   */
  2694. #ifdef RT
  2695. LFD f16, 6 * SIZE(BO)
  2696. LFD f17, 7 * SIZE(BO)
  2697. LFD f18, 4 * SIZE(BO)
  2698. LFD f19, 5 * SIZE(BO)
  2699. LFD f20, 0 * SIZE(BO)
  2700. LFD f21, 1 * SIZE(BO)
  /* cross terms for stage 1; f12..f15 are free scratch by now */
  2701. FMUL f12, f17, f9
  2702. FMUL f13, f17, f8
  2703. FMUL f14, f17, f11
  2704. FMUL f15, f17, f10
  2705. #ifndef CONJ
  2706. FMSUB f8, f16, f8, f12
  2707. FMADD f9, f16, f9, f13
  2708. FMSUB f10, f16, f10, f14
  2709. FMADD f11, f16, f11, f15
  /* eliminate the solved pairs from (f0,f1) and (f2,f3) */
  2710. FMADD f0, f19, f9, f0
  2711. FNMSUB f1, f19, f8, f1
  2712. FMADD f2, f19, f11, f2
  2713. FNMSUB f3, f19, f10, f3
  2714. FNMSUB f0, f18, f8, f0
  2715. FNMSUB f1, f18, f9, f1
  2716. FNMSUB f2, f18, f10, f2
  2717. FNMSUB f3, f18, f11, f3
  /* stage 2 */
  2718. FMUL f4, f21, f1
  2719. FMUL f5, f21, f0
  2720. FMUL f6, f21, f3
  2721. FMUL f7, f21, f2
  2722. FMSUB f0, f20, f0, f4
  2723. FMADD f1, f20, f1, f5
  2724. FMSUB f2, f20, f2, f6
  2725. FMADD f3, f20, f3, f7
  2726. #else
  2727. FMADD f8, f16, f8, f12
  2728. FMSUB f9, f16, f9, f13
  2729. FMADD f10, f16, f10, f14
  2730. FMSUB f11, f16, f11, f15
  2731. FMSUB f0, f19, f9, f0
  2732. FNMADD f1, f19, f8, f1
  2733. FMSUB f2, f19, f11, f2
  2734. FNMADD f3, f19, f10, f3
  2735. FNMADD f0, f18, f8, f0
  2736. FNMADD f1, f18, f9, f1
  2737. FNMADD f2, f18, f10, f2
  2738. FNMADD f3, f18, f11, f3
  2739. FMUL f4, f21, f1
  2740. FMUL f5, f21, f0
  2741. FMUL f6, f21, f3
  2742. FMUL f7, f21, f2
  2743. FMADD f0, f20, f0, f4
  2744. FMSUB f1, f20, f1, f5
  2745. FMADD f2, f20, f2, f6
  2746. FMSUB f3, f20, f3, f7
  2747. #endif
  2748. #endif
  2749. #ifdef LN
  2750. subi CO1, CO1, 4 * SIZE
  2751. subi CO2, CO2, 4 * SIZE
  2752. #endif
  2753. #if defined(LN) || defined(LT)
  2754. STFD f0, 0 * SIZE(BO)
  2755. STFD f1, 1 * SIZE(BO)
  2756. STFD f8, 2 * SIZE(BO)
  2757. STFD f9, 3 * SIZE(BO)
  2758. STFD f2, 4 * SIZE(BO)
  2759. STFD f3, 5 * SIZE(BO)
  2760. STFD f10, 6 * SIZE(BO)
  2761. STFD f11, 7 * SIZE(BO)
  2762. #else
  2763. STFD f0, 0 * SIZE(AO)
  2764. STFD f1, 1 * SIZE(AO)
  2765. STFD f2, 2 * SIZE(AO)
  2766. STFD f3, 3 * SIZE(AO)
  2767. STFD f8, 4 * SIZE(AO)
  2768. STFD f9, 5 * SIZE(AO)
  2769. STFD f10, 6 * SIZE(AO)
  2770. STFD f11, 7 * SIZE(AO)
  2771. #endif
  2772. STFD f0, 0 * SIZE(CO1)
  2773. STFD f1, 1 * SIZE(CO1)
  2774. STFD f2, 2 * SIZE(CO1)
  2775. STFD f3, 3 * SIZE(CO1)
  2776. STFD f8, 0 * SIZE(CO2)
  2777. STFD f9, 1 * SIZE(CO2)
  2778. STFD f10, 2 * SIZE(CO2)
  2779. STFD f11, 3 * SIZE(CO2)
  2780. #ifndef LN
  2781. addi CO1, CO1, 4 * SIZE
  2782. addi CO2, CO2, 4 * SIZE
  2783. #endif
  2784. #ifdef RT
  2785. slwi r0, K, 1 + ZBASE_SHIFT
  2786. add AORIG, AORIG, r0
  2787. #endif
  2788. #if defined(LT) || defined(RN)
  2789. sub TEMP, K, KK
  2790. slwi TEMP, TEMP, 1 + ZBASE_SHIFT
  2791. add AO, AO, TEMP
  2792. add BO, BO, TEMP
  2793. #endif
  2794. #ifdef LT
  2795. addi KK, KK, 2
  2796. #endif
  2797. #ifdef LN
  2798. subi KK, KK, 2
  2799. #endif
  2800. addic. I, I, -1
  2801. bgt LL(31)
  2802. .align 4
  /*
   * LL(40): set-up for the leftover work when M is odd.
   * Skips to LL(49) when (M & 1) == 0.  Otherwise it preloads four values
   * from AO (f16-f19) and eight values from B/BO (f20-f27), clears the
   * accumulators f0-f7 and loads CTR with the trip count of the 4x
   * unrolled loop LL(42).
   * SIZE, ZBASE_SHIFT, FZERO and the upper-case LFD macro are defined
   * outside this excerpt; FZERO is presumably an operand holding 0.0.
   */
  2803. LL(40):
  2804. andi. I, M, 1
  2805. ble LL(49)
  2806. #if defined(LT) || defined(RN)
  /* LT/RN: AO is used as it stands, BO restarts at B, and the count is KK. */
  2807. LFD f16, 0 * SIZE(AO)
  2808. LFD f17, 1 * SIZE(AO)
  2809. LFD f18, 2 * SIZE(AO)
  2810. LFD f19, 3 * SIZE(AO)
  2811. LFD f20, 0 * SIZE(B)
  2812. LFD f21, 1 * SIZE(B)
  2813. LFD f22, 2 * SIZE(B)
  2814. LFD f23, 3 * SIZE(B)
  2815. LFD f24, 4 * SIZE(B)
  2816. LFD f25, 5 * SIZE(B)
  2817. LFD f26, 6 * SIZE(B)
  2818. LFD f27, 7 * SIZE(B)
  /* f0 is loaded from FZERO and copied into f1-f7. */
  2819. lfs f0, FZERO
  2820. fmr f1, f0
  2821. fmr f2, f0
  2822. fmr f3, f0
  2823. fmr f4, f0
  2824. fmr f5, f0
  2825. fmr f6, f0
  2826. fmr f7, f0
  /* r0 = KK >> 2; the record form sets CR0, which the ble after #endif tests. */
  2827. srawi. r0, KK, 2
  2828. mr BO, B
  2829. mtspr CTR, r0
  2830. #else
  /* LN/RT: derive AO and BO from AORIG and B, offset by KK; the count is K - KK. */
  2831. #ifdef LN
  /* LN only: step AORIG back by K << ZBASE_SHIFT bytes first. */
  2832. slwi r0, K, 0 + ZBASE_SHIFT
  2833. sub AORIG, AORIG, r0
  2834. #endif
  /* AO = AORIG + (KK << ZBASE_SHIFT); BO = B + (KK << (1 + ZBASE_SHIFT)). */
  2835. slwi r0, KK, 0 + ZBASE_SHIFT
  2836. slwi TEMP, KK, 1 + ZBASE_SHIFT
  2837. add AO, AORIG, r0
  2838. add BO, B, TEMP
  /* TEMP = K - KK is kept for the remainder count taken at LL(45). */
  2839. sub TEMP, K, KK
  2840. LFD f16, 0 * SIZE(AO)
  2841. LFD f17, 1 * SIZE(AO)
  2842. LFD f18, 2 * SIZE(AO)
  2843. LFD f19, 3 * SIZE(AO)
  2844. LFD f20, 0 * SIZE(BO)
  2845. LFD f21, 1 * SIZE(BO)
  2846. LFD f22, 2 * SIZE(BO)
  2847. LFD f23, 3 * SIZE(BO)
  2848. LFD f24, 4 * SIZE(BO)
  2849. LFD f25, 5 * SIZE(BO)
  2850. LFD f26, 6 * SIZE(BO)
  2851. LFD f27, 7 * SIZE(BO)
  2852. lfs f0, FZERO
  2853. fmr f1, f0
  2854. fmr f2, f0
  2855. fmr f3, f0
  2856. fmr f4, f0
  2857. fmr f5, f0
  2858. fmr f6, f0
  2859. fmr f7, f0
  2860. srawi. r0, TEMP, 2
  2861. mtspr CTR, r0
  2862. #endif
  /* Shifted count <= 0: skip the unrolled loop and go to the remainder code. */
  2863. ble LL(45)
  2864. .align 4
  /*
   * LL(42): accumulate loop, unrolled four steps per pass, repeated CTR
   * times (bdnz).  Each step multiplies one pair of AO values
   * (f16/f17 or f18/f19) by four BO values (f20-f23 or f24-f27):
   *   f0..f3 += a_first  * b[0..3]
   *   f4..f7 += a_second * b[0..3]
   * Per pass AO advances by 8*SIZE and BO by 16*SIZE.  Loads for later
   * steps are interleaved with the multiply-adds, and the loads at the
   * bottom (offsets 8..11 of AO, 16..23 of BO) fetch the operands that
   * the next pass, or the code after the loop, starts with.
   * Assumes the FMADD macro (defined outside this excerpt) behaves like
   * PowerPC fmadd: FMADD d, a, c, b gives d = a*c + b -- TODO confirm.
   */
  2865. LL(42):
  2866. FMADD f0, f16, f20, f0
  2867. FMADD f1, f16, f21, f1
  2868. FMADD f2, f16, f22, f2
  2869. FMADD f3, f16, f23, f3
  2870. FMADD f4, f17, f20, f4
  2871. FMADD f5, f17, f21, f5
  2872. FMADD f6, f17, f22, f6
  2873. FMADD f7, f17, f23, f7
  2874. LFD f20, 8 * SIZE(BO)
  2875. LFD f21, 9 * SIZE(BO)
  2876. LFD f22, 10 * SIZE(BO)
  2877. LFD f23, 11 * SIZE(BO)
  2878. FMADD f0, f18, f24, f0
  2879. FMADD f1, f18, f25, f1
  2880. FMADD f2, f18, f26, f2
  2881. FMADD f3, f18, f27, f3
  2882. FMADD f4, f19, f24, f4
  2883. FMADD f5, f19, f25, f5
  2884. FMADD f6, f19, f26, f6
  2885. FMADD f7, f19, f27, f7
  2886. LFD f24, 12 * SIZE(BO)
  2887. LFD f25, 13 * SIZE(BO)
  2888. LFD f26, 14 * SIZE(BO)
  2889. LFD f27, 15 * SIZE(BO)
  2890. LFD f16, 4 * SIZE(AO)
  2891. LFD f17, 5 * SIZE(AO)
  2892. LFD f18, 6 * SIZE(AO)
  2893. LFD f19, 7 * SIZE(AO)
  2894. FMADD f0, f16, f20, f0
  2895. FMADD f1, f16, f21, f1
  2896. FMADD f2, f16, f22, f2
  2897. FMADD f3, f16, f23, f3
  2898. FMADD f4, f17, f20, f4
  2899. FMADD f5, f17, f21, f5
  2900. FMADD f6, f17, f22, f6
  2901. FMADD f7, f17, f23, f7
  2902. LFD f20, 16 * SIZE(BO)
  2903. LFD f21, 17 * SIZE(BO)
  2904. LFD f22, 18 * SIZE(BO)
  2905. LFD f23, 19 * SIZE(BO)
  2906. FMADD f0, f18, f24, f0
  2907. FMADD f1, f18, f25, f1
  2908. FMADD f2, f18, f26, f2
  2909. FMADD f3, f18, f27, f3
  2910. FMADD f4, f19, f24, f4
  2911. FMADD f5, f19, f25, f5
  2912. FMADD f6, f19, f26, f6
  2913. FMADD f7, f19, f27, f7
  /* Operands for the first step of the next pass (offsets are pre-advance). */
  2914. LFD f16, 8 * SIZE(AO)
  2915. LFD f17, 9 * SIZE(AO)
  2916. LFD f18, 10 * SIZE(AO)
  2917. LFD f19, 11 * SIZE(AO)
  2918. LFD f24, 20 * SIZE(BO)
  2919. LFD f25, 21 * SIZE(BO)
  2920. LFD f26, 22 * SIZE(BO)
  2921. LFD f27, 23 * SIZE(BO)
  2922. addi BO, BO, 16 * SIZE
  2923. addi AO, AO, 8 * SIZE
  2924. bdnz LL(42)
  2925. .align 4
  /*
   * LL(45)/LL(46): remainder of the accumulation after the 4x unrolled
   * loop.  The count is the low two bits of KK (LT/RN) or of TEMP
   * (other variants); when it is zero control goes straight to LL(47).
   */
  2926. LL(45):
  2927. #if defined(LT) || defined(RN)
  2928. andi. r0, KK, 3
  2929. #else
  2930. andi. r0, TEMP, 3
  2931. #endif
  /* mtspr leaves CR0 alone, so ble still tests the andi. result. */
  2932. mtspr CTR, r0
  2933. ble LL(47)
  2934. .align 4
  /*
   * LL(46): one step per pass: f0..f3 += f16 * f20..f23 and
   * f4..f7 += f17 * f20..f23, then reload the operands for the next step
   * and advance AO by 2*SIZE and BO by 4*SIZE.
   */
  2935. LL(46):
  2936. FMADD f0, f16, f20, f0
  2937. FMADD f1, f16, f21, f1
  2938. FMADD f2, f16, f22, f2
  2939. FMADD f3, f16, f23, f3
  2940. FMADD f4, f17, f20, f4
  2941. FMADD f5, f17, f21, f5
  2942. FMADD f6, f17, f22, f6
  2943. FMADD f7, f17, f23, f7
  2944. LFD f20, 4 * SIZE(BO)
  2945. LFD f21, 5 * SIZE(BO)
  2946. LFD f22, 6 * SIZE(BO)
  2947. LFD f23, 7 * SIZE(BO)
  2948. LFD f16, 2 * SIZE(AO)
  2949. LFD f17, 3 * SIZE(AO)
  2950. addi AO, AO, 2 * SIZE
  2951. addi BO, BO, 4 * SIZE
  2952. bdnz LL(46)
  2953. .align 4
  /*
   * LL(47): finish the odd-M leftover of the two-column pass.
   *   1. Fold the eight partial sums f0-f7 into two (re, im) pairs:
   *      (f0, f1) and (f2, f3).
   *   2. Subtract them from the values stored in the packed buffer
   *      (BO for LN/LT, AO otherwise).
   *   3. Multiply by the coefficients read from the other buffer
   *      (a single complex factor for LN/LT, a 2x2 triangular set for
   *      RN/RT).
   *   4. Store the result back into the packed buffer and to CO1/CO2,
   *      then advance the pointers and KK.
   * Notes assume the upper-case arithmetic macros map to the same-named
   * PowerPC instructions (FMSUB d,a,c,b = a*c - b, FNMSUB = b - a*c,
   * FNMADD = -(a*c + b)) -- TODO confirm against the macro definitions.
   */
  2954. LL(47):
  /*
   * With a = (f16, f17) and b = (f20, f21) in the accumulate loops,
   * f0 = sum(ar*br), f1 = sum(ar*bi), f4 = sum(ai*br), f5 = sum(ai*bi)
   * (f2, f3, f6, f7 likewise for the second b pair).
   */
  2955. #ifndef CONJ
  /* re = ar*br - ai*bi, im = ar*bi + ai*br. */
  2956. FSUB f0, f0, f5
  2957. FADD f1, f1, f4
  2958. FSUB f2, f2, f7
  2959. FADD f3, f3, f6
  2960. #else
  2961. #if defined(LN) || defined(LT)
  /* re = ar*br + ai*bi, im = ar*bi - ai*br. */
  2962. FADD f0, f0, f5
  2963. FSUB f1, f1, f4
  2964. FADD f2, f2, f7
  2965. FSUB f3, f3, f6
  2966. #else
  /* re = ar*br + ai*bi, im = ai*br - ar*bi. */
  2967. FADD f0, f0, f5
  2968. FSUB f1, f4, f1
  2969. FADD f2, f2, f7
  2970. FSUB f3, f6, f3
  2971. #endif
  2972. #endif
  2973. #if defined(LN) || defined(RT)
  /* Re-point AO/BO at index KK-1 (LN) or KK-2 (RT) of the packed buffers. */
  2974. #ifdef LN
  2975. subi r0, KK, 1
  2976. #else
  2977. subi r0, KK, 2
  2978. #endif
  2979. slwi TEMP, r0, 0 + ZBASE_SHIFT
  2980. slwi r0, r0, 1 + ZBASE_SHIFT
  2981. add AO, AORIG, TEMP
  2982. add BO, B, r0
  2983. #endif
  2984. #if defined(LN) || defined(LT)
  /* value = stored(BO) - accumulated product. */
  2985. LFD f16, 0 * SIZE(BO)
  2986. LFD f17, 1 * SIZE(BO)
  2987. LFD f18, 2 * SIZE(BO)
  2988. LFD f19, 3 * SIZE(BO)
  2989. FSUB f0, f16, f0
  2990. FSUB f1, f17, f1
  2991. FSUB f2, f18, f2
  2992. FSUB f3, f19, f3
  2993. #else
  /* value = stored(AO) - accumulated product. */
  2994. LFD f16, 0 * SIZE(AO)
  2995. LFD f17, 1 * SIZE(AO)
  2996. LFD f20, 2 * SIZE(AO)
  2997. LFD f21, 3 * SIZE(AO)
  2998. FSUB f0, f16, f0
  2999. FSUB f1, f17, f1
  3000. FSUB f2, f20, f2
  3001. FSUB f3, f21, f3
  3002. #endif
  3003. #ifdef LN
  /*
   * Multiply both pairs by the complex factor (f20, f21) read from AO:
   * without CONJ (x, y) -> (f20*x - f21*y, f20*y + f21*x); with CONJ the
   * sign of the f21 terms is flipped.
   */
  3004. LFD f20, 0 * SIZE(AO)
  3005. LFD f21, 1 * SIZE(AO)
  3006. FMUL f4, f21, f1
  3007. FMUL f5, f21, f0
  3008. FMUL f12, f21, f3
  3009. FMUL f13, f21, f2
  3010. #ifndef CONJ
  3011. FMSUB f0, f20, f0, f4
  3012. FMADD f1, f20, f1, f5
  3013. FMSUB f2, f20, f2, f12
  3014. FMADD f3, f20, f3, f13
  3015. #else
  3016. FMADD f0, f20, f0, f4
  3017. FMSUB f1, f20, f1, f5
  3018. FMADD f2, f20, f2, f12
  3019. FMSUB f3, f20, f3, f13
  3020. #endif
  3021. #endif
  3022. #ifdef LT
  /* Same arithmetic as the LN case, with the factor held in (f16, f17). */
  3023. LFD f16, 0 * SIZE(AO)
  3024. LFD f17, 1 * SIZE(AO)
  3025. FMUL f4, f17, f1
  3026. FMUL f5, f17, f0
  3027. FMUL f12, f17, f3
  3028. FMUL f13, f17, f2
  3029. #ifndef CONJ
  3030. FMSUB f0, f16, f0, f4
  3031. FMADD f1, f16, f1, f5
  3032. FMSUB f2, f16, f2, f12
  3033. FMADD f3, f16, f3, f13
  3034. #else
  3035. FMADD f0, f16, f0, f4
  3036. FMSUB f1, f16, f1, f5
  3037. FMADD f2, f16, f2, f12
  3038. FMSUB f3, f16, f3, f13
  3039. #endif
  3040. #endif
  3041. #ifdef RN
  /*
   * Forward elimination over the two columns using BO entries 0-1, 2-3
   * and 6-7: scale (f0, f1) by (f16, f17), remove its contribution via
   * (f18, f19) from (f2, f3), then scale (f2, f3) by (f20, f21).
   */
  3042. LFD f16, 0 * SIZE(BO)
  3043. LFD f17, 1 * SIZE(BO)
  3044. LFD f18, 2 * SIZE(BO)
  3045. LFD f19, 3 * SIZE(BO)
  3046. LFD f20, 6 * SIZE(BO)
  3047. LFD f21, 7 * SIZE(BO)
  3048. FMUL f4, f17, f1
  3049. FMUL f5, f17, f0
  3050. #ifndef CONJ
  3051. FMSUB f0, f16, f0, f4
  3052. FMADD f1, f16, f1, f5
  3053. FMADD f2, f19, f1, f2
  3054. FNMSUB f3, f19, f0, f3
  3055. FNMSUB f2, f18, f0, f2
  3056. FNMSUB f3, f18, f1, f3
  3057. FMUL f4, f21, f3
  3058. FMUL f5, f21, f2
  3059. FMSUB f2, f20, f2, f4
  3060. FMADD f3, f20, f3, f5
  3061. #else
  3062. FMADD f0, f16, f0, f4
  3063. FMSUB f1, f16, f1, f5
  3064. FMSUB f2, f19, f1, f2
  3065. FNMADD f3, f19, f0, f3
  3066. FNMADD f2, f18, f0, f2
  3067. FNMADD f3, f18, f1, f3
  3068. FMUL f4, f21, f3
  3069. FMUL f5, f21, f2
  3070. FMADD f2, f20, f2, f4
  3071. FMSUB f3, f20, f3, f5
  3072. #endif
  3073. #endif
  3074. #ifdef RT
  /*
   * Backward elimination using BO entries 6-7, 4-5 and 0-1: scale
   * (f2, f3) by (f16, f17), remove its contribution via (f18, f19) from
   * (f0, f1), then scale (f0, f1) by (f20, f21).
   */
  3075. LFD f16, 6 * SIZE(BO)
  3076. LFD f17, 7 * SIZE(BO)
  3077. LFD f18, 4 * SIZE(BO)
  3078. LFD f19, 5 * SIZE(BO)
  3079. LFD f20, 0 * SIZE(BO)
  3080. LFD f21, 1 * SIZE(BO)
  3081. FMUL f12, f17, f3
  3082. FMUL f13, f17, f2
  3083. #ifndef CONJ
  3084. FMSUB f2, f16, f2, f12
  3085. FMADD f3, f16, f3, f13
  3086. FMADD f0, f19, f3, f0
  3087. FNMSUB f1, f19, f2, f1
  3088. FNMSUB f0, f18, f2, f0
  3089. FNMSUB f1, f18, f3, f1
  3090. FMUL f4, f21, f1
  3091. FMUL f5, f21, f0
  3092. FMSUB f0, f20, f0, f4
  3093. FMADD f1, f20, f1, f5
  3094. #else
  3095. FMADD f2, f16, f2, f12
  3096. FMSUB f3, f16, f3, f13
  3097. FMSUB f0, f19, f3, f0
  3098. FNMADD f1, f19, f2, f1
  3099. FNMADD f0, f18, f2, f0
  3100. FNMADD f1, f18, f3, f1
  3101. FMUL f4, f21, f1
  3102. FMUL f5, f21, f0
  3103. FMADD f0, f20, f0, f4
  3104. FMSUB f1, f20, f1, f5
  3105. #endif
  3106. #endif
  3107. #ifdef LN
  /* LN moves the output pointers back before storing. */
  3108. subi CO1, CO1, 2 * SIZE
  3109. subi CO2, CO2, 2 * SIZE
  3110. #endif
  /* Write the solved values back into the packed buffer they were read from. */
  3111. #if defined(LN) || defined(LT)
  3112. STFD f0, 0 * SIZE(BO)
  3113. STFD f1, 1 * SIZE(BO)
  3114. STFD f2, 2 * SIZE(BO)
  3115. STFD f3, 3 * SIZE(BO)
  3116. #else
  3117. STFD f0, 0 * SIZE(AO)
  3118. STFD f1, 1 * SIZE(AO)
  3119. STFD f2, 2 * SIZE(AO)
  3120. STFD f3, 3 * SIZE(AO)
  3121. #endif
  /* (f0, f1) goes to CO1 and (f2, f3) to CO2. */
  3122. STFD f0, 0 * SIZE(CO1)
  3123. STFD f1, 1 * SIZE(CO1)
  3124. STFD f2, 0 * SIZE(CO2)
  3125. STFD f3, 1 * SIZE(CO2)
  3126. #ifndef LN
  3127. addi CO1, CO1, 2 * SIZE
  3128. addi CO2, CO2, 2 * SIZE
  3129. #endif
  3130. #ifdef RT
  /* RT: AORIG += K << ZBASE_SHIFT. */
  3131. slwi r0, K, 0 + ZBASE_SHIFT
  3132. add AORIG, AORIG, r0
  3133. #endif
  3134. #if defined(LT) || defined(RN)
  /* LT/RN: skip the K - KK steps that were not consumed by the loops. */
  3135. sub TEMP, K, KK
  3136. slwi r0, TEMP, 0 + ZBASE_SHIFT
  3137. slwi TEMP, TEMP, 1 + ZBASE_SHIFT
  3138. add AO, AO, r0
  3139. add BO, BO, TEMP
  3140. #endif
  3141. #ifdef LT
  3142. addi KK, KK, 1
  3143. #endif
  3144. #ifdef LN
  3145. subi KK, KK, 1
  3146. #endif
  3147. .align 4
  /*
   * LL(49): bookkeeping at the end of the two-column pass.
   *   LN    : B += K << (1 + ZBASE_SHIFT)
   *   LT/RN : B takes the current value of BO
   *   RN    : KK += 2
   *   RT    : KK -= 2
   */
  3148. LL(49):
  3149. #ifdef LN
  3150. slwi r0, K, 1 + ZBASE_SHIFT
  3151. add B, B, r0
  3152. #endif
  3153. #if defined(LT) || defined(RN)
  3154. mr B, BO
  3155. #endif
  3156. #ifdef RN
  3157. addi KK, KK, 2
  3158. #endif
  3159. #ifdef RT
  3160. subi KK, KK, 2
  3161. #endif
  3162. .align 4
  /*
   * LL(50): set-up for the single leftover column when N is odd.
   * Jumps to the exit code at LL(999) when (N & 1) == 0.  Otherwise it
   * positions B, C and CO1, initialises KK for the LN/LT variants,
   * sets I = M >> 1 and picks the starting A pointer.
   */
  3163. LL(50):
  3164. andi. J, N, 1
  3165. ble LL(999)
  3166. #ifdef RT
  /* RT: step B back by K << ZBASE_SHIFT bytes and C back by LDC. */
  3167. slwi r0, K, 0 + ZBASE_SHIFT
  3168. sub B, B, r0
  3169. sub C, C, LDC
  3170. #endif
  3171. mr CO1, C
  3172. #ifdef LN
  3173. add KK, M, OFFSET
  3174. #endif
  3175. #ifdef LT
  3176. mr KK, OFFSET
  3177. #endif
  /* Record form: CR0 from this shift is what the ble LL(60) below tests. */
  3178. srawi. I, M, 1
  3179. #if defined(LN) || defined(RT)
  3180. mr AORIG, A
  3181. #else
  3182. mr AO, A
  3183. #endif
  3184. #ifndef RT
  3185. add C, C, LDC
  3186. #endif
  /* No full pair (M >> 1 <= 0): go to the odd-M handling. */
  3187. ble LL(60)
  3188. .align 4
  /*
   * LL(51): per-iteration set-up of the I loop in the single-column pass.
   * Preloads eight values from AO (f20-f27) and four from B/BO (f16-f19),
   * clears the accumulators f0-f7 and loads CTR with the trip count of
   * the 4x unrolled loop LL(52).
   * FZERO, PREC, SIZE and ZBASE_SHIFT are defined outside this excerpt;
   * FZERO is presumably an operand holding 0.0.
   */
  3189. LL(51):
  3190. #if defined(LT) || defined(RN)
  /* LT/RN: AO is used as it stands, BO restarts at B, and the count is KK. */
  3191. LFD f20, 0 * SIZE(AO)
  3192. LFD f21, 1 * SIZE(AO)
  3193. LFD f22, 2 * SIZE(AO)
  3194. LFD f23, 3 * SIZE(AO)
  3195. LFD f24, 4 * SIZE(AO)
  3196. LFD f25, 5 * SIZE(AO)
  3197. LFD f26, 6 * SIZE(AO)
  3198. LFD f27, 7 * SIZE(AO)
  3199. LFD f16, 0 * SIZE(B)
  3200. LFD f17, 1 * SIZE(B)
  3201. LFD f18, 2 * SIZE(B)
  3202. LFD f19, 3 * SIZE(B)
  3203. lfs f0, FZERO
  3204. fmr f1, f0
  3205. fmr f2, f0
  3206. fmr f3, f0
  3207. fmr f4, f0
  3208. fmr f5, f0
  3209. fmr f6, f0
  3210. fmr f7, f0
  /* Cache-touch hint for the output location at CO1 + PREC. */
  3211. dcbt CO1, PREC
  /* r0 = KK >> 2; CR0 from this record form feeds the ble after #endif. */
  3212. srawi. r0, KK, 2
  3213. mr BO, B
  3214. mtspr CTR, r0
  3215. #else
  /* LN/RT: derive AO and BO from AORIG and B, offset by KK; the count is K - KK. */
  3216. #ifdef LN
  /* LN only: step AORIG back by K << (1 + ZBASE_SHIFT) bytes first. */
  3217. slwi r0, K, 1 + ZBASE_SHIFT
  3218. sub AORIG, AORIG, r0
  3219. #endif
  /* AO = AORIG + (KK << (1 + ZBASE_SHIFT)); BO = B + (KK << ZBASE_SHIFT). */
  3220. slwi r0, KK, 1 + ZBASE_SHIFT
  3221. slwi TEMP, KK, 0 + ZBASE_SHIFT
  3222. add AO, AORIG, r0
  3223. add BO, B, TEMP
  /* TEMP = K - KK is kept for the remainder count taken at LL(55). */
  3224. sub TEMP, K, KK
  3225. LFD f20, 0 * SIZE(AO)
  3226. LFD f21, 1 * SIZE(AO)
  3227. LFD f22, 2 * SIZE(AO)
  3228. LFD f23, 3 * SIZE(AO)
  3229. LFD f24, 4 * SIZE(AO)
  3230. LFD f25, 5 * SIZE(AO)
  3231. LFD f26, 6 * SIZE(AO)
  3232. LFD f27, 7 * SIZE(AO)
  3233. LFD f16, 0 * SIZE(BO)
  3234. LFD f17, 1 * SIZE(BO)
  3235. LFD f18, 2 * SIZE(BO)
  3236. LFD f19, 3 * SIZE(BO)
  3237. lfs f0, FZERO
  3238. fmr f1, f0
  3239. fmr f2, f0
  3240. fmr f3, f0
  3241. fmr f4, f0
  3242. fmr f5, f0
  3243. fmr f6, f0
  3244. fmr f7, f0
  3245. srawi. r0, TEMP, 2
  3246. mtspr CTR, r0
  3247. #endif
  /* Shifted count <= 0: skip the unrolled loop and go to the remainder code. */
  3248. ble LL(55)
  3249. .align 4
  /*
   * LL(52): accumulate loop, unrolled four steps per pass, repeated CTR
   * times (bdnz).  Each step multiplies one pair of BO values
   * (f16/f17 or f18/f19) by four AO values (f20-f23 or f24-f27):
   *   f0..f3 += b_first  * a[0..3]
   *   f4..f7 += b_second * a[0..3]
   * Per pass AO advances by 16*SIZE and BO by 8*SIZE; loads for later
   * steps are interleaved with the multiply-adds.
   * Assumes the FMADD macro (defined outside this excerpt) behaves like
   * PowerPC fmadd: FMADD d, a, c, b gives d = a*c + b -- TODO confirm.
   */
  3250. LL(52):
  3251. FMADD f0, f16, f20, f0
  3252. FMADD f1, f16, f21, f1
  3253. FMADD f2, f16, f22, f2
  3254. FMADD f3, f16, f23, f3
  3255. FMADD f4, f17, f20, f4
  3256. FMADD f5, f17, f21, f5
  3257. FMADD f6, f17, f22, f6
  3258. FMADD f7, f17, f23, f7
  3259. LFD f20, 8 * SIZE(AO)
  3260. LFD f21, 9 * SIZE(AO)
  3261. LFD f22, 10 * SIZE(AO)
  3262. LFD f23, 11 * SIZE(AO)
  3263. FMADD f0, f18, f24, f0
  3264. FMADD f1, f18, f25, f1
  3265. FMADD f2, f18, f26, f2
  3266. FMADD f3, f18, f27, f3
  3267. FMADD f4, f19, f24, f4
  3268. FMADD f5, f19, f25, f5
  3269. FMADD f6, f19, f26, f6
  3270. FMADD f7, f19, f27, f7
  3271. LFD f24, 12 * SIZE(AO)
  3272. LFD f25, 13 * SIZE(AO)
  3273. LFD f26, 14 * SIZE(AO)
  3274. LFD f27, 15 * SIZE(AO)
  3275. LFD f16, 4 * SIZE(BO)
  3276. LFD f17, 5 * SIZE(BO)
  3277. LFD f18, 6 * SIZE(BO)
  3278. LFD f19, 7 * SIZE(BO)
  3279. FMADD f0, f16, f20, f0
  3280. FMADD f1, f16, f21, f1
  3281. FMADD f2, f16, f22, f2
  3282. FMADD f3, f16, f23, f3
  3283. FMADD f4, f17, f20, f4
  3284. FMADD f5, f17, f21, f5
  3285. FMADD f6, f17, f22, f6
  3286. FMADD f7, f17, f23, f7
  3287. LFD f20, 16 * SIZE(AO)
  3288. LFD f21, 17 * SIZE(AO)
  3289. LFD f22, 18 * SIZE(AO)
  3290. LFD f23, 19 * SIZE(AO)
  3291. FMADD f0, f18, f24, f0
  3292. FMADD f1, f18, f25, f1
  3293. FMADD f2, f18, f26, f2
  3294. FMADD f3, f18, f27, f3
  3295. FMADD f4, f19, f24, f4
  3296. FMADD f5, f19, f25, f5
  3297. FMADD f6, f19, f26, f6
  3298. FMADD f7, f19, f27, f7
  /* Operands for the first step of the next pass (offsets are pre-advance). */
  3299. LFD f24, 20 * SIZE(AO)
  3300. LFD f25, 21 * SIZE(AO)
  3301. LFD f26, 22 * SIZE(AO)
  3302. LFD f27, 23 * SIZE(AO)
  3303. LFD f16, 8 * SIZE(BO)
  3304. LFD f17, 9 * SIZE(BO)
  3305. LFD f18, 10 * SIZE(BO)
  3306. LFD f19, 11 * SIZE(BO)
  3307. addi AO, AO, 16 * SIZE
  3308. addi BO, BO, 8 * SIZE
  /* Cache-touch hints at PREA + AO and PREA + BO (PREA is set outside this excerpt). */
  3309. dcbt PREA, AO
  3310. dcbt PREA, BO
  3311. bdnz LL(52)
  3312. .align 4
  /*
   * LL(55)/LL(56): remainder of the accumulation after the 4x unrolled
   * loop.  The count is the low two bits of KK (LT/RN) or of TEMP
   * (other variants); when it is zero control goes straight to LL(57).
   */
  3313. LL(55):
  3314. #if defined(LT) || defined(RN)
  3315. andi. r0, KK, 3
  3316. #else
  3317. andi. r0, TEMP, 3
  3318. #endif
  /* mtspr leaves CR0 alone, so ble still tests the andi. result. */
  3319. mtspr CTR, r0
  3320. ble LL(57)
  3321. .align 4
  /*
   * LL(56): one step per pass: f0..f3 += f16 * f20..f23 and
   * f4..f7 += f17 * f20..f23, then reload the operands for the next step
   * and advance AO by 4*SIZE and BO by 2*SIZE.
   */
  3322. LL(56):
  3323. FMADD f0, f16, f20, f0
  3324. FMADD f1, f16, f21, f1
  3325. FMADD f2, f16, f22, f2
  3326. FMADD f3, f16, f23, f3
  3327. FMADD f4, f17, f20, f4
  3328. FMADD f5, f17, f21, f5
  3329. FMADD f6, f17, f22, f6
  3330. FMADD f7, f17, f23, f7
  3331. LFD f20, 4 * SIZE(AO)
  3332. LFD f21, 5 * SIZE(AO)
  3333. LFD f22, 6 * SIZE(AO)
  3334. LFD f23, 7 * SIZE(AO)
  3335. LFD f16, 2 * SIZE(BO)
  3336. LFD f17, 3 * SIZE(BO)
  3337. addi BO, BO, 2 * SIZE
  3338. addi AO, AO, 4 * SIZE
  3339. bdnz LL(56)
  3340. .align 4
  /*
   * LL(57): finish one iteration of the I loop in the single-column pass.
   *   1. Fold the eight partial sums f0-f7 into two (re, im) pairs:
   *      (f0, f1) and (f2, f3).
   *   2. Combine them with the values stored in the packed buffer
   *      (BO for LN/LT, AO otherwise).
   *   3. Multiply by the coefficients read from the other buffer
   *      (a 2x2 triangular set for LN/LT, a single complex factor for
   *      RN/RT).
   *   4. Store the result back into the packed buffer and to CO1,
   *      advance the pointers and KK, and loop to LL(51) while I > 0.
   * Notes assume the upper-case arithmetic macros map to the same-named
   * PowerPC instructions (FMSUB d,a,c,b = a*c - b, FNMSUB = b - a*c,
   * FNMADD = -(a*c + b)) -- TODO confirm against the macro definitions.
   */
  3341. LL(57):
  /*
   * With b = (f16, f17) and a = (f20, f21) in the accumulate loops,
   * f0 = sum(br*ar), f1 = sum(br*ai), f4 = sum(bi*ar), f5 = sum(bi*ai)
   * (f2, f3, f6, f7 likewise for the second a pair).
   */
  3342. #ifndef CONJ
  /* re = f0 - f5, im = f1 + f4. */
  3343. FSUB f0, f0, f5
  3344. FADD f1, f1, f4
  3345. FSUB f2, f2, f7
  3346. FADD f3, f3, f6
  3347. #else
  /* re = f0 + f5, im = f4 - f1; the RN/RT branch below adds this im instead of subtracting it. */
  3348. FADD f0, f0, f5
  3349. FSUB f1, f4, f1
  3350. FADD f2, f2, f7
  3351. FSUB f3, f6, f3
  3352. #endif
  3353. #if defined(LN) || defined(RT)
  /* Re-point AO/BO at index KK-2 (LN) or KK-1 (RT) of the packed buffers. */
  3354. #ifdef LN
  3355. subi r0, KK, 2
  3356. #else
  3357. subi r0, KK, 1
  3358. #endif
  3359. slwi TEMP, r0, 1 + ZBASE_SHIFT
  3360. slwi r0, r0, 0 + ZBASE_SHIFT
  3361. add AO, AORIG, TEMP
  3362. add BO, B, r0
  3363. #endif
  3364. #if defined(LN) || defined(LT)
  /* value = stored(BO) - accumulated product. */
  3365. LFD f16, 0 * SIZE(BO)
  3366. LFD f17, 1 * SIZE(BO)
  3367. LFD f18, 2 * SIZE(BO)
  3368. LFD f19, 3 * SIZE(BO)
  3369. FSUB f0, f16, f0
  3370. FSUB f1, f17, f1
  3371. FSUB f2, f18, f2
  3372. FSUB f3, f19, f3
  3373. #else
  3374. LFD f16, 0 * SIZE(AO)
  3375. LFD f17, 1 * SIZE(AO)
  3376. LFD f18, 2 * SIZE(AO)
  3377. LFD f19, 3 * SIZE(AO)
  3378. #ifndef CONJ
  3379. FSUB f0, f16, f0
  3380. FSUB f1, f17, f1
  3381. FSUB f2, f18, f2
  3382. FSUB f3, f19, f3
  3383. #else
  /* The CONJ fold above left im with the opposite sign, hence FADD here. */
  3384. FSUB f0, f16, f0
  3385. FADD f1, f17, f1
  3386. FSUB f2, f18, f2
  3387. FADD f3, f19, f3
  3388. #endif
  3389. #endif
  3390. #ifdef LN
  /*
   * Backward elimination using AO entries 6-7, 4-5 and 0-1: scale
   * (f2, f3) by (f16, f17), remove its contribution via (f18, f19) from
   * (f0, f1), then scale (f0, f1) by (f20, f21).
   */
  3391. LFD f16, 6 * SIZE(AO)
  3392. LFD f17, 7 * SIZE(AO)
  3393. LFD f18, 4 * SIZE(AO)
  3394. LFD f19, 5 * SIZE(AO)
  3395. LFD f20, 0 * SIZE(AO)
  3396. LFD f21, 1 * SIZE(AO)
  3397. FMUL f6, f17, f3
  3398. FMUL f7, f17, f2
  3399. #ifndef CONJ
  3400. FMSUB f2, f16, f2, f6
  3401. FMADD f3, f16, f3, f7
  3402. FMADD f0, f19, f3, f0
  3403. FNMSUB f1, f19, f2, f1
  3404. FNMSUB f0, f18, f2, f0
  3405. FNMSUB f1, f18, f3, f1
  3406. FMUL f4, f21, f1
  3407. FMUL f5, f21, f0
  3408. FMSUB f0, f20, f0, f4
  3409. FMADD f1, f20, f1, f5
  3410. #else
  3411. FMADD f2, f16, f2, f6
  3412. FMSUB f3, f16, f3, f7
  3413. FMSUB f0, f19, f3, f0
  3414. FNMADD f1, f19, f2, f1
  3415. FNMADD f0, f18, f2, f0
  3416. FNMADD f1, f18, f3, f1
  3417. FMUL f4, f21, f1
  3418. FMUL f5, f21, f0
  3419. FMADD f0, f20, f0, f4
  3420. FMSUB f1, f20, f1, f5
  3421. #endif
  3422. #endif
  3423. #ifdef LT
  /*
   * Forward elimination using AO entries 0-1, 2-3 and 6-7: scale
   * (f0, f1) by (f16, f17), remove its contribution via (f18, f19) from
   * (f2, f3), then scale (f2, f3) by (f20, f21).
   */
  3424. LFD f16, 0 * SIZE(AO)
  3425. LFD f17, 1 * SIZE(AO)
  3426. LFD f18, 2 * SIZE(AO)
  3427. LFD f19, 3 * SIZE(AO)
  3428. LFD f20, 6 * SIZE(AO)
  3429. LFD f21, 7 * SIZE(AO)
  3430. FMUL f4, f17, f1
  3431. FMUL f5, f17, f0
  3432. #ifndef CONJ
  3433. FMSUB f0, f16, f0, f4
  3434. FMADD f1, f16, f1, f5
  3435. FMADD f2, f19, f1, f2
  3436. FNMSUB f3, f19, f0, f3
  3437. FNMSUB f2, f18, f0, f2
  3438. FNMSUB f3, f18, f1, f3
  3439. FMUL f4, f21, f3
  3440. FMUL f5, f21, f2
  3441. FMSUB f2, f20, f2, f4
  3442. FMADD f3, f20, f3, f5
  3443. #else
  3444. FMADD f0, f16, f0, f4
  3445. FMSUB f1, f16, f1, f5
  3446. FMSUB f2, f19, f1, f2
  3447. FNMADD f3, f19, f0, f3
  3448. FNMADD f2, f18, f0, f2
  3449. FNMADD f3, f18, f1, f3
  3450. FMUL f4, f21, f3
  3451. FMUL f5, f21, f2
  3452. FMADD f2, f20, f2, f4
  3453. FMSUB f3, f20, f3, f5
  3454. #endif
  3455. #endif
  3456. #ifdef RN
  /*
   * Multiply both pairs by the complex factor (f16, f17) read from BO:
   * without CONJ (x, y) -> (f16*x - f17*y, f16*y + f17*x); with CONJ the
   * sign of the f17 terms is flipped.
   */
  3457. LFD f16, 0 * SIZE(BO)
  3458. LFD f17, 1 * SIZE(BO)
  3459. FMUL f4, f17, f1
  3460. FMUL f5, f17, f0
  3461. FMUL f6, f17, f3
  3462. FMUL f7, f17, f2
  3463. #ifndef CONJ
  3464. FMSUB f0, f16, f0, f4
  3465. FMADD f1, f16, f1, f5
  3466. FMSUB f2, f16, f2, f6
  3467. FMADD f3, f16, f3, f7
  3468. #else
  3469. FMADD f0, f16, f0, f4
  3470. FMSUB f1, f16, f1, f5
  3471. FMADD f2, f16, f2, f6
  3472. FMSUB f3, f16, f3, f7
  3473. #endif
  3474. #endif
  3475. #ifdef RT
  /* Same arithmetic as the RN case, with the factor held in (f20, f21). */
  3476. LFD f20, 0 * SIZE(BO)
  3477. LFD f21, 1 * SIZE(BO)
  3478. FMUL f4, f21, f1
  3479. FMUL f5, f21, f0
  3480. FMUL f6, f21, f3
  3481. FMUL f7, f21, f2
  3482. #ifndef CONJ
  3483. FMSUB f0, f20, f0, f4
  3484. FMADD f1, f20, f1, f5
  3485. FMSUB f2, f20, f2, f6
  3486. FMADD f3, f20, f3, f7
  3487. #else
  3488. FMADD f0, f20, f0, f4
  3489. FMSUB f1, f20, f1, f5
  3490. FMADD f2, f20, f2, f6
  3491. FMSUB f3, f20, f3, f7
  3492. #endif
  3493. #endif
  3494. #ifdef LN
  /* LN moves the output pointer back before storing. */
  3495. subi CO1, CO1, 4 * SIZE
  3496. #endif
  /* Write the solved values back into the packed buffer they were read from. */
  3497. #if defined(LN) || defined(LT)
  3498. STFD f0, 0 * SIZE(BO)
  3499. STFD f1, 1 * SIZE(BO)
  3500. STFD f2, 2 * SIZE(BO)
  3501. STFD f3, 3 * SIZE(BO)
  3502. #else
  3503. STFD f0, 0 * SIZE(AO)
  3504. STFD f1, 1 * SIZE(AO)
  3505. STFD f2, 2 * SIZE(AO)
  3506. STFD f3, 3 * SIZE(AO)
  3507. #endif
  3508. STFD f0, 0 * SIZE(CO1)
  3509. STFD f1, 1 * SIZE(CO1)
  3510. STFD f2, 2 * SIZE(CO1)
  3511. STFD f3, 3 * SIZE(CO1)
  3512. #ifndef LN
  3513. addi CO1, CO1, 4 * SIZE
  3514. #endif
  3515. #ifdef RT
  /* RT: AORIG += K << (1 + ZBASE_SHIFT). */
  3516. slwi r0, K, 1 + ZBASE_SHIFT
  3517. add AORIG, AORIG, r0
  3518. #endif
  3519. #if defined(LT) || defined(RN)
  /* LT/RN: skip the K - KK steps that were not consumed by the loops. */
  3520. sub TEMP, K, KK
  3521. slwi r0, TEMP, 1 + ZBASE_SHIFT
  3522. slwi TEMP, TEMP, 0 + ZBASE_SHIFT
  3523. add AO, AO, r0
  3524. add BO, BO, TEMP
  3525. #endif
  3526. #ifdef LT
  3527. addi KK, KK, 2
  3528. #endif
  3529. #ifdef LN
  3530. subi KK, KK, 2
  3531. #endif
  /* Decrement I and repeat from LL(51) while it is still positive. */
  3532. addic. I, I, -1
  3533. bgt LL(51)
  3534. .align 4
  /*
   * LL(60): set-up for the leftover work when M is odd, single-column
   * pass.  Skips to LL(69) when (M & 1) == 0.  Otherwise it preloads four
   * values from AO (f16-f19) and four from B/BO (f20-f23), clears the
   * accumulators f0-f7 and loads CTR with the trip count of the 4x
   * unrolled loop LL(62).
   * FZERO, SIZE and ZBASE_SHIFT are defined outside this excerpt; FZERO
   * is presumably an operand holding 0.0.
   */
  3535. LL(60):
  3536. andi. I, M, 1
  3537. ble LL(69)
  3538. #if defined(LT) || defined(RN)
  /* LT/RN: AO is used as it stands, BO restarts at B, and the count is KK. */
  3539. LFD f16, 0 * SIZE(AO)
  3540. LFD f17, 1 * SIZE(AO)
  3541. LFD f18, 2 * SIZE(AO)
  3542. LFD f19, 3 * SIZE(AO)
  3543. LFD f20, 0 * SIZE(B)
  3544. LFD f21, 1 * SIZE(B)
  3545. LFD f22, 2 * SIZE(B)
  3546. LFD f23, 3 * SIZE(B)
  3547. lfs f0, FZERO
  3548. fmr f1, f0
  3549. fmr f2, f0
  3550. fmr f3, f0
  3551. fmr f4, f0
  3552. fmr f5, f0
  3553. fmr f6, f0
  3554. fmr f7, f0
  /* r0 = KK >> 2; CR0 from this record form feeds the ble after #endif. */
  3555. srawi. r0, KK, 2
  3556. mr BO, B
  3557. mtspr CTR, r0
  3558. #else
  /* LN/RT: derive AO and BO from AORIG and B, offset by KK; the count is K - KK. */
  3559. #ifdef LN
  /* LN only: step AORIG back by K << ZBASE_SHIFT bytes first. */
  3560. slwi r0, K, 0 + ZBASE_SHIFT
  3561. sub AORIG, AORIG, r0
  3562. #endif
  /* Both buffers use the same byte offset, KK << ZBASE_SHIFT. */
  3563. slwi r0, KK, 0 + ZBASE_SHIFT
  3564. add AO, AORIG, r0
  3565. add BO, B, r0
  /* TEMP = K - KK is kept for the remainder count taken at LL(65). */
  3566. sub TEMP, K, KK
  3567. LFD f16, 0 * SIZE(AO)
  3568. LFD f17, 1 * SIZE(AO)
  3569. LFD f18, 2 * SIZE(AO)
  3570. LFD f19, 3 * SIZE(AO)
  3571. LFD f20, 0 * SIZE(BO)
  3572. LFD f21, 1 * SIZE(BO)
  3573. LFD f22, 2 * SIZE(BO)
  3574. LFD f23, 3 * SIZE(BO)
  3575. lfs f0, FZERO
  3576. fmr f1, f0
  3577. fmr f2, f0
  3578. fmr f3, f0
  3579. fmr f4, f0
  3580. fmr f5, f0
  3581. fmr f6, f0
  3582. fmr f7, f0
  3583. srawi. r0, TEMP, 2
  3584. mtspr CTR, r0
  3585. #endif
  /* Shifted count <= 0: skip the unrolled loop and go to LL(65). */
  3586. ble LL(65)
  3587. .align 4
  /*
   * LL(62): accumulate loop, unrolled four steps per pass, repeated CTR
   * times (bdnz).  Each step forms the four cross products of one AO pair
   * (x, y) with one BO pair (u, v): x*u, y*v, y*u, x*v.
   * Steps alternate between two accumulator sets so consecutive steps do
   * not depend on each other: (f16, f17, f20, f21) feed f0-f3 and
   * (f18, f19, f22, f23) feed f4-f7.  The two sets are summed at LL(65).
   * AO and BO each advance by 8*SIZE per pass.
   */
  3588. LL(62):
  3589. FMADD f0, f16, f20, f0
  3590. FMADD f1, f17, f21, f1
  3591. FMADD f2, f17, f20, f2
  3592. FMADD f3, f16, f21, f3
  3593. LFD f16, 4 * SIZE(AO)
  3594. LFD f17, 5 * SIZE(AO)
  3595. LFD f20, 4 * SIZE(BO)
  3596. LFD f21, 5 * SIZE(BO)
  3597. FMADD f4, f18, f22, f4
  3598. FMADD f5, f19, f23, f5
  3599. FMADD f6, f19, f22, f6
  3600. FMADD f7, f18, f23, f7
  3601. LFD f18, 6 * SIZE(AO)
  3602. LFD f19, 7 * SIZE(AO)
  3603. LFD f22, 6 * SIZE(BO)
  3604. LFD f23, 7 * SIZE(BO)
  3605. FMADD f0, f16, f20, f0
  3606. FMADD f1, f17, f21, f1
  3607. FMADD f2, f17, f20, f2
  3608. FMADD f3, f16, f21, f3
  /* Operands for the next pass (offsets are relative to the pre-advance pointers). */
  3609. LFD f16, 8 * SIZE(AO)
  3610. LFD f17, 9 * SIZE(AO)
  3611. LFD f20, 8 * SIZE(BO)
  3612. LFD f21, 9 * SIZE(BO)
  3613. FMADD f4, f18, f22, f4
  3614. FMADD f5, f19, f23, f5
  3615. FMADD f6, f19, f22, f6
  3616. FMADD f7, f18, f23, f7
  3617. LFD f18, 10 * SIZE(AO)
  3618. LFD f19, 11 * SIZE(AO)
  3619. LFD f22, 10 * SIZE(BO)
  3620. LFD f23, 11 * SIZE(BO)
  3621. addi AO, AO, 8 * SIZE
  3622. addi BO, BO, 8 * SIZE
  3623. bdnz LL(62)
  3624. .align 4
  /*
   * LL(65)/LL(66): merge the two accumulator sets of the unrolled loop
   * (f0-f3 += f4-f7), then run the remaining steps one at a time.  The
   * remainder count is the low two bits of KK (LT/RN) or of TEMP (other
   * variants); when it is zero control goes straight to LL(67).
   */
  3625. LL(65):
  /* Non-record fadd: CR0 is not touched here; the andi. below sets it. */
  3626. fadd f0, f0, f4
  3627. fadd f1, f1, f5
  3628. fadd f2, f2, f6
  3629. fadd f3, f3, f7
  3630. #if defined(LT) || defined(RN)
  3631. andi. r0, KK, 3
  3632. #else
  3633. andi. r0, TEMP, 3
  3634. #endif
  3635. mtspr CTR,r0
  3636. ble LL(67)
  3637. .align 4
  /*
   * LL(66): one step per pass: accumulate the four cross products of
   * (f16, f17) and (f20, f21) into f0-f3, reload both pairs and advance
   * AO and BO by 2*SIZE each.
   */
  3638. LL(66):
  3639. FMADD f0, f16, f20, f0
  3640. FMADD f1, f17, f21, f1
  3641. FMADD f2, f17, f20, f2
  3642. FMADD f3, f16, f21, f3
  3643. LFD f16, 2 * SIZE(AO)
  3644. LFD f17, 3 * SIZE(AO)
  3645. LFD f20, 2 * SIZE(BO)
  3646. LFD f21, 3 * SIZE(BO)
  3647. addi AO, AO, 2 * SIZE
  3648. addi BO, BO, 2 * SIZE
  3649. bdnz LL(66)
  3650. .align 4
  /*
   * LL(67): finish the odd-M leftover of the single-column pass.
   *   1. Fold the four partial sums into one (re, im) pair in (f0, f1).
   *   2. Combine it with the value stored in the packed buffer
   *      (BO for LN/LT, AO otherwise).
   *   3. Multiply by the complex factor read from the other buffer.
   *   4. Store the result back into the packed buffer and to CO1, then
   *      advance the pointers and KK.
   * Notes assume the upper-case arithmetic macros map to the same-named
   * PowerPC instructions (FMSUB d,a,c,b = a*c - b, FMADD = a*c + b)
   * -- TODO confirm against the macro definitions.
   */
  3651. LL(67):
  /*
   * With a = (f16, f17) and b = (f20, f21) in the accumulate loops,
   * f0 = sum(ar*br), f1 = sum(ai*bi), f2 = sum(ai*br), f3 = sum(ar*bi).
   */
  3652. #ifndef CONJ
  /* re = ar*br - ai*bi, im = ai*br + ar*bi. */
  3653. FSUB f0, f0, f1
  3654. FADD f1, f2, f3
  3655. #else
  /* re = ar*br + ai*bi, im = ar*bi - ai*br; the RN/RT branch below adds this im. */
  3656. FADD f0, f0, f1
  3657. FSUB f1, f3, f2
  3658. #endif
  3659. #if defined(LN) || defined(RT)
  /* Re-point AO and BO at index KK-1 of their packed buffers. */
  3660. subi r0, KK, 1
  3661. slwi r0, r0, 0 + ZBASE_SHIFT
  3662. add AO, AORIG, r0
  3663. add BO, B, r0
  3664. #endif
  3665. #if defined(LN) || defined(LT)
  /* value = stored(BO) - accumulated product. */
  3666. LFD f16, 0 * SIZE(BO)
  3667. LFD f17, 1 * SIZE(BO)
  3668. FSUB f0, f16, f0
  3669. FSUB f1, f17, f1
  3670. #else
  3671. LFD f16, 0 * SIZE(AO)
  3672. LFD f17, 1 * SIZE(AO)
  3673. #ifndef CONJ
  3674. FSUB f0, f16, f0
  3675. FSUB f1, f17, f1
  3676. #else
  /* The CONJ fold above left im with the opposite sign, hence FADD here. */
  3677. FSUB f0, f16, f0
  3678. FADD f1, f17, f1
  3679. #endif
  3680. #endif
  /*
   * Each variant below multiplies (f0, f1) by one complex factor (p, q):
   * without CONJ (x, y) -> (p*x - q*y, p*y + q*x); with CONJ the sign of
   * the q terms is flipped.  LN/LT read the factor from AO, RN/RT from BO.
   */
  3681. #ifdef LN
  3682. LFD f20, 0 * SIZE(AO)
  3683. LFD f21, 1 * SIZE(AO)
  3684. FMUL f4, f21, f1
  3685. FMUL f5, f21, f0
  3686. #ifndef CONJ
  3687. FMSUB f0, f20, f0, f4
  3688. FMADD f1, f20, f1, f5
  3689. #else
  3690. FMADD f0, f20, f0, f4
  3691. FMSUB f1, f20, f1, f5
  3692. #endif
  3693. #endif
  3694. #ifdef LT
  3695. LFD f16, 0 * SIZE(AO)
  3696. LFD f17, 1 * SIZE(AO)
  3697. FMUL f4, f17, f1
  3698. FMUL f5, f17, f0
  3699. #ifndef CONJ
  3700. FMSUB f0, f16, f0, f4
  3701. FMADD f1, f16, f1, f5
  3702. #else
  3703. FMADD f0, f16, f0, f4
  3704. FMSUB f1, f16, f1, f5
  3705. #endif
  3706. #endif
  3707. #ifdef RN
  3708. LFD f16, 0 * SIZE(BO)
  3709. LFD f17, 1 * SIZE(BO)
  3710. FMUL f4, f17, f1
  3711. FMUL f5, f17, f0
  3712. #ifndef CONJ
  3713. FMSUB f0, f16, f0, f4
  3714. FMADD f1, f16, f1, f5
  3715. #else
  3716. FMADD f0, f16, f0, f4
  3717. FMSUB f1, f16, f1, f5
  3718. #endif
  3719. #endif
  3720. #ifdef RT
  3721. LFD f20, 0 * SIZE(BO)
  3722. LFD f21, 1 * SIZE(BO)
  3723. FMUL f4, f21, f1
  3724. FMUL f5, f21, f0
  3725. #ifndef CONJ
  3726. FMSUB f0, f20, f0, f4
  3727. FMADD f1, f20, f1, f5
  3728. #else
  3729. FMADD f0, f20, f0, f4
  3730. FMSUB f1, f20, f1, f5
  3731. #endif
  3732. #endif
  3733. #ifdef LN
  /* LN moves the output pointer back before storing. */
  3734. subi CO1, CO1, 2 * SIZE
  3735. #endif
  /* Write the solved value back into the packed buffer it was read from. */
  3736. #if defined(LN) || defined(LT)
  3737. STFD f0, 0 * SIZE(BO)
  3738. STFD f1, 1 * SIZE(BO)
  3739. #else
  3740. STFD f0, 0 * SIZE(AO)
  3741. STFD f1, 1 * SIZE(AO)
  3742. #endif
  3743. STFD f0, 0 * SIZE(CO1)
  3744. STFD f1, 1 * SIZE(CO1)
  3745. #ifndef LN
  3746. addi CO1, CO1, 2 * SIZE
  3747. #endif
  3748. #ifdef RT
  /* RT: AORIG += K << ZBASE_SHIFT. */
  3749. slwi r0, K, 0 + ZBASE_SHIFT
  3750. add AORIG, AORIG, r0
  3751. #endif
  3752. #if defined(LT) || defined(RN)
  /* LT/RN: skip the K - KK steps that were not consumed by the loops. */
  3753. sub TEMP, K, KK
  3754. slwi TEMP, TEMP, 0 + ZBASE_SHIFT
  3755. add AO, AO, TEMP
  3756. add BO, BO, TEMP
  3757. #endif
  3758. #ifdef LT
  3759. addi KK, KK, 1
  3760. #endif
  3761. #ifdef LN
  3762. subi KK, KK, 1
  3763. #endif
  3764. .align 4
  /*
   * LL(69): bookkeeping at the end of the single-column pass.
   *   LN    : B += K << ZBASE_SHIFT
   *   LT/RN : B takes the current value of BO
   *   RN    : KK += 1
   *   RT    : KK -= 1
   */
  3765. LL(69):
  3766. #ifdef LN
  3767. slwi r0, K, 0 + ZBASE_SHIFT
  3768. add B, B, r0
  3769. #endif
  3770. #if defined(LT) || defined(RN)
  3771. mr B, BO
  3772. #endif
  3773. #ifdef RN
  3774. addi KK, KK, 1
  3775. #endif
  3776. #ifdef RT
  3777. subi KK, KK, 1
  3778. #endif
  3779. .align 4
  /*
   * LL(999): common exit.  Sets r3 to 0, reloads f14-f31 and r19-r31 from
   * the stack frame, releases the frame (SP += STACKSIZE) and returns.
   * The matching stores are outside this excerpt -- presumably in the
   * function prologue; verify that the offsets agree.
   */
  3780. LL(999):
  /* addi with rA = 0 uses the literal 0, so this sets r3 = 0. */
  3781. addi r3, 0, 0
  /* Floating-point registers: 8-byte slots at SP+0 .. SP+136. */
  3782. lfd f14, 0(SP)
  3783. lfd f15, 8(SP)
  3784. lfd f16, 16(SP)
  3785. lfd f17, 24(SP)
  3786. lfd f18, 32(SP)
  3787. lfd f19, 40(SP)
  3788. lfd f20, 48(SP)
  3789. lfd f21, 56(SP)
  3790. lfd f22, 64(SP)
  3791. lfd f23, 72(SP)
  3792. lfd f24, 80(SP)
  3793. lfd f25, 88(SP)
  3794. lfd f26, 96(SP)
  3795. lfd f27, 104(SP)
  3796. lfd f28, 112(SP)
  3797. lfd f29, 120(SP)
  3798. lfd f30, 128(SP)
  3799. lfd f31, 136(SP)
  3800. #ifdef __64BIT__
  /* 64-bit build: general registers in 8-byte slots at SP+144 .. SP+240. */
  3801. ld r31, 144(SP)
  3802. ld r30, 152(SP)
  3803. ld r29, 160(SP)
  3804. ld r28, 168(SP)
  3805. ld r27, 176(SP)
  3806. ld r26, 184(SP)
  3807. ld r25, 192(SP)
  3808. ld r24, 200(SP)
  3809. ld r23, 208(SP)
  3810. ld r22, 216(SP)
  3811. ld r21, 224(SP)
  3812. ld r20, 232(SP)
  3813. ld r19, 240(SP)
  3814. #else
  /* 32-bit build: general registers in 4-byte slots at SP+144 .. SP+192. */
  3815. lwz r31, 144(SP)
  3816. lwz r30, 148(SP)
  3817. lwz r29, 152(SP)
  3818. lwz r28, 156(SP)
  3819. lwz r27, 160(SP)
  3820. lwz r26, 164(SP)
  3821. lwz r25, 168(SP)
  3822. lwz r24, 172(SP)
  3823. lwz r23, 176(SP)
  3824. lwz r22, 180(SP)
  3825. lwz r21, 184(SP)
  3826. lwz r20, 188(SP)
  3827. lwz r19, 192(SP)
  3828. #endif
  3829. addi SP, SP, STACKSIZE
  3830. blr
  3831. EPILOGUE
  3832. #endif