You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

trsm_kernel_LT.S 62 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Word-size dependent definitions. */
  /* LOAD selects the integer load mnemonic: lwz (load word) for 32-bit */
  /* builds, ld (load doubleword) when __64BIT__ is defined. */
  40. #ifndef __64BIT__
  41. #define LOAD lwz
  42. #else
  43. #define LOAD ld
  44. #endif
  /* Stack frame layout. STACKSIZE is the number of bytes the prologue */
  /* subtracts from SP. The prologue stores f14-f31 at 0..136(SP) and */
  /* r18-r31 from 144(SP) upward (8-byte slots ending at 248(SP) in the */
  /* 64-bit build, 4-byte slots ending at 196(SP) in the 32-bit build), so */
  /* the two slots named here lie above the register save area. */
  /* FZERO: the prologue writes the 32-bit zero held in r0 to this slot */
  /* with stw and later reads it back with lfs to obtain 0.0 in f0. */
  /* ALPHA: named stack slot; it is not referenced in the part of the file */
  /* reviewed here - NOTE(review): confirm whether it is still used. */
  45. #ifdef __64BIT__
  46. #define STACKSIZE 320
  47. #define ALPHA 296(SP)
  48. #define FZERO 304(SP)
  49. #else
  50. #define STACKSIZE 240
  51. #define ALPHA 224(SP)
  52. #define FZERO 232(SP)
  53. #endif
  /* Register names for the kernel arguments. */
  /* M, N and K are always r3, r4 and r5. The registers for A, B, C, LDC */
  /* and OFFSET depend on the platform macros, the word size and DOUBLE. */
  /* Some of these names are only holding registers: the prologue fills */
  /* them from the caller's frame (FRAMESLOT(n) + STACKSIZE(SP)) instead of */
  /* receiving the value in the register itself. */
  /* NOTE(review): r6 is skipped for the pointer arguments in most layouts, */
  /* presumably because a floating-point argument occupies that position - */
  /* confirm against the target ABI. */
  /* NOTE(review): if neither linux nor _AIX/__APPLE__ is defined, this */
  /* block leaves A, B, C, LDC and OFFSET undefined. */
  54. #define M r3
  55. #define N r4
  56. #define K r5
  /* linux: 32-bit uses r6-r10 in order. 64-bit uses r7-r10 for A, B, C, */
  /* LDC; OFFSET is kept in r6 and loaded by the prologue from */
  /* FRAMESLOT(0) of the caller's frame. */
  57. #ifdef linux
  58. #ifndef __64BIT__
  59. #define A r6
  60. #define B r7
  61. #define C r8
  62. #define LDC r9
  63. #define OFFSET r10
  64. #else
  65. #define A r7
  66. #define B r8
  67. #define C r9
  68. #define LDC r10
  69. #define OFFSET r6
  70. #endif
  71. #endif
  /* AIX / Apple: for 32-bit DOUBLE builds A, B, C are r8-r10, and the */
  /* prologue loads LDC (r7) from FRAMESLOT(0) and OFFSET (r6) from */
  /* FRAMESLOT(1). In every other AIX / Apple build A, B, C, LDC are */
  /* r7-r10 and OFFSET (r6) is loaded from FRAMESLOT(0). */
  72. #if defined(_AIX) || defined(__APPLE__)
  73. #if !defined(__64BIT__) && defined(DOUBLE)
  74. #define A r8
  75. #define B r9
  76. #define C r10
  77. #define LDC r7
  78. #define OFFSET r6
  79. #else
  80. #define A r7
  81. #define B r8
  82. #define C r9
  83. #define LDC r10
  84. #define OFFSET r6
  85. #endif
  86. #endif
  /* Working registers r18-r31. The prologue saves all of them on the */
  /* stack before they are used. */
  /* AORIG: copy of A taken in the LN / RT builds; AO is recomputed from it. */
  /* TEMP: scratch, e.g. KK << (2 + BASE_SHIFT) and later K - KK. */
  /* KK: counter derived from OFFSET (M + OFFSET for LN, OFFSET for LT, */
  /* -OFFSET for RN, N - OFFSET for RT). */
  /* I, J: initialised to M >> 2 and N >> 2 respectively. */
  /* AO, BO: running pointers into the A and B data used by the inner loop. */
  /* CO1-CO4: pointers to C, C + LDC, C + 2*LDC and C + 3*LDC. */
  /* PREA, PREB: offset operands of the DCBT prefetches on AO and BO. */
  /* PREC: offset operand of the dcbt prefetches on CO1-CO4. */
  87. #define AORIG r18
  88. #define TEMP r19
  89. #define KK r20
  90. #define I r21
  91. #define J r22
  92. #define AO r23
  93. #define BO r24
  94. #define CO1 r25
  95. #define CO2 r26
  96. #define CO3 r27
  97. #define CO4 r28
  98. #define PREA r29
  99. #define PREB r30
  100. #define PREC r31
  101. #ifndef NEEDPARAM
  102. PROLOGUE
  103. PROFCODE
  104. addi SP, SP, -STACKSIZE
  105. li r0, 0
  106. stfd f14, 0(SP)
  107. stfd f15, 8(SP)
  108. stfd f16, 16(SP)
  109. stfd f17, 24(SP)
  110. stfd f18, 32(SP)
  111. stfd f19, 40(SP)
  112. stfd f20, 48(SP)
  113. stfd f21, 56(SP)
  114. stfd f22, 64(SP)
  115. stfd f23, 72(SP)
  116. stfd f24, 80(SP)
  117. stfd f25, 88(SP)
  118. stfd f26, 96(SP)
  119. stfd f27, 104(SP)
  120. stfd f28, 112(SP)
  121. stfd f29, 120(SP)
  122. stfd f30, 128(SP)
  123. stfd f31, 136(SP)
  124. #ifdef __64BIT__
  125. std r31, 144(SP)
  126. std r30, 152(SP)
  127. std r29, 160(SP)
  128. std r28, 168(SP)
  129. std r27, 176(SP)
  130. std r26, 184(SP)
  131. std r25, 192(SP)
  132. std r24, 200(SP)
  133. std r23, 208(SP)
  134. std r22, 216(SP)
  135. std r21, 224(SP)
  136. std r20, 232(SP)
  137. std r19, 240(SP)
  138. std r18, 248(SP)
  139. #else
  140. stw r31, 144(SP)
  141. stw r30, 148(SP)
  142. stw r29, 152(SP)
  143. stw r28, 156(SP)
  144. stw r27, 160(SP)
  145. stw r26, 164(SP)
  146. stw r25, 168(SP)
  147. stw r24, 172(SP)
  148. stw r23, 176(SP)
  149. stw r22, 180(SP)
  150. stw r21, 184(SP)
  151. stw r20, 188(SP)
  152. stw r19, 192(SP)
  153. stw r18, 196(SP)
  154. #endif
  155. stw r0, FZERO
  156. #if defined(_AIX) || defined(__APPLE__)
  157. #if !defined(__64BIT__) && defined(DOUBLE)
  158. lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
  159. #endif
  160. #endif
  161. slwi LDC, LDC, BASE_SHIFT
  162. #if defined(linux) && defined(__64BIT__)
  163. ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
  164. #endif
  165. #if defined(_AIX) || defined(__APPLE__)
  166. #ifdef __64BIT__
  167. ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
  168. #else
  169. #ifdef DOUBLE
  170. lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
  171. #else
  172. lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
  173. #endif
  174. #endif
  175. #endif
  176. #ifdef LN
  177. mullw r0, M, K
  178. slwi r0, r0, BASE_SHIFT
  179. add A, A, r0
  180. slwi r0, M, BASE_SHIFT
  181. add C, C, r0
  182. #endif
  183. #ifdef RN
  184. neg KK, OFFSET
  185. #endif
  186. #ifdef RT
  187. mullw r0, N, K
  188. slwi r0, r0, BASE_SHIFT
  189. add B, B, r0
  190. mullw r0, N, LDC
  191. add C, C, r0
  192. sub KK, N, OFFSET
  193. #endif
  194. cmpwi cr0, M, 0
  195. ble LL(999)
  196. cmpwi cr0, N, 0
  197. ble LL(999)
  198. cmpwi cr0, K, 0
  199. ble LL(999)
  200. #ifndef PREFETCHTEST
  201. #if defined(TRSMKERNEL) && defined(LN)
  202. /* Direction is special */
  203. #ifdef PPC970
  204. li PREC, -4 * SIZE
  205. #endif
  206. #ifdef POWER4
  207. li PREC, -4 * SIZE
  208. #endif
  209. #ifdef POWER5
  210. li PREC, -4 * SIZE
  211. #endif
  212. #ifdef CELL
  213. li PREC, -4 * SIZE
  214. #endif
  215. #else
  216. /* Normal prefetch */
  217. #ifdef PPC970
  218. li PREC, 4 * SIZE
  219. #endif
  220. #ifdef POWER4
  221. li PREC, 4 * SIZE /* is 12 best? */
  222. #endif
  223. #ifdef POWER5
  224. li PREC, 3 * SIZE
  225. #endif
  226. #endif
  227. #else
  228. #ifdef linux
  229. #ifndef __64BIT__
  230. mr PREA, r10
  231. lwz PREB, FRAMESLOT(0) + STACKSIZE(SP)
  232. lwz PREC, FRAMESLOT(1) + STACKSIZE(SP)
  233. #else
  234. ld PREA, FRAMESLOT(0) + STACKSIZE(SP)
  235. ld PREB, FRAMESLOT(1) + STACKSIZE(SP)
  236. ld PREC, FRAMESLOT(2) + STACKSIZE(SP)
  237. #endif
  238. #endif
  239. #if defined(_AIX) || defined(__APPLE__)
  240. #ifdef __64BIT__
  241. ld PREA, FRAMESLOT(0) + STACKSIZE(SP)
  242. ld PREB, FRAMESLOT(1) + STACKSIZE(SP)
  243. ld PREC, FRAMESLOT(2) + STACKSIZE(SP)
  244. #else
  245. #ifdef DOUBLE
  246. lwz PREA, FRAMESLOT(1) + STACKSIZE(SP)
  247. lwz PREB, FRAMESLOT(2) + STACKSIZE(SP)
  248. lwz PREC, FRAMESLOT(3) + STACKSIZE(SP)
  249. #else
  250. lwz PREA, FRAMESLOT(0) + STACKSIZE(SP)
  251. lwz PREB, FRAMESLOT(1) + STACKSIZE(SP)
  252. lwz PREC, FRAMESLOT(2) + STACKSIZE(SP)
  253. #endif
  254. #endif
  255. #endif
  256. #endif
  257. #ifndef PREFETCHTEST
  258. #ifdef PPC970
  259. #ifdef ALLOC_HUGETLB
  260. li PREA, (16 * 5 * SIZE | 1)
  261. li PREB, (16 * 5 * SIZE | 3)
  262. #else
  263. li PREA, (16 * 14 * SIZE | 1)
  264. li PREB, (16 * 8 * SIZE | 3)
  265. #endif
  266. #endif
  267. #ifdef POWER4
  268. #ifdef ALLOC_HUGETLB
  269. li PREA, (16 * 1 * SIZE + 16)
  270. li PREB, (16 * 1 * SIZE + 16)
  271. #else
  272. li PREA, (16 * 2 * SIZE + 16)
  273. li PREB, (16 * 2 * SIZE + 16)
  274. #endif
  275. #endif
  276. #ifdef POWER5
  277. #ifdef ALLOC_HUGETLB
  278. li PREA, (16 * 7 * SIZE | 1)
  279. li PREB, (16 * 7 * SIZE | 3)
  280. #else
  281. li PREA, (16 * 12 * SIZE | 1)
  282. li PREB, (16 * 6 * SIZE | 3)
  283. #endif
  284. #endif
  285. #ifdef CELL
  286. li PREA, (16 * 12 * SIZE)
  287. li PREB, (16 * 12 * SIZE)
  288. #endif
  289. #endif
  290. lfs f0, FZERO
  291. srawi. J, N, 2
  292. ble LL(40)
  293. .align 4
  294. LL(10):
  295. #ifdef RT
  296. slwi r0, K, 2 + BASE_SHIFT
  297. sub B, B, r0
  298. slwi r0, LDC, 2
  299. sub C, C, r0
  300. #endif
  301. mr CO1, C
  302. add CO2, C, LDC
  303. add CO3, CO2, LDC
  304. add CO4, CO3, LDC
  305. #ifdef LN
  306. add KK, M, OFFSET
  307. #endif
  308. #ifdef LT
  309. mr KK, OFFSET
  310. #endif
  311. fmr f1, f0
  312. fmr f2, f0
  313. fmr f3, f0
  314. fmr f4, f0
  315. fmr f5, f0
  316. fmr f6, f0
  317. fmr f7, f0
  318. fmr f8, f0
  319. fmr f9, f0
  320. fmr f10, f0
  321. fmr f11, f0
  322. fmr f12, f0
  323. fmr f13, f0
  324. fmr f14, f0
  325. fmr f15, f0
  326. srawi. I, M, 2
  327. #if defined(LN) || defined(RT)
  328. mr AORIG, A
  329. #else
  330. mr AO, A
  331. #endif
  332. #ifndef RT
  333. add C, CO4, LDC
  334. #endif
  335. ble LL(20)
  336. .align 4
  337. LL(11):
  338. #if defined(LT) || defined(RN)
  339. LFD f16, 0 * SIZE(AO)
  340. LFD f17, 1 * SIZE(AO)
  341. LFD f18, 2 * SIZE(AO)
  342. LFD f19, 3 * SIZE(AO)
  343. LFD f20, 0 * SIZE(B)
  344. LFD f21, 1 * SIZE(B)
  345. LFD f22, 2 * SIZE(B)
  346. LFD f23, 3 * SIZE(B)
  347. dcbt CO1, PREC
  348. dcbt CO2, PREC
  349. dcbt CO3, PREC
  350. dcbt CO4, PREC
  351. srawi. r0, KK, 2
  352. mtspr CTR, r0
  353. mr BO, B
  354. #else
  355. #ifdef LN
  356. slwi r0, K, 2 + BASE_SHIFT
  357. sub AORIG, AORIG, r0
  358. #endif
  359. slwi TEMP, KK, 2 + BASE_SHIFT
  360. add AO, AORIG, TEMP
  361. add BO, B, TEMP
  362. sub TEMP, K, KK
  363. LFD f16, 0 * SIZE(AO)
  364. LFD f17, 1 * SIZE(AO)
  365. LFD f18, 2 * SIZE(AO)
  366. LFD f19, 3 * SIZE(AO)
  367. LFD f20, 0 * SIZE(BO)
  368. LFD f21, 1 * SIZE(BO)
  369. LFD f22, 2 * SIZE(BO)
  370. LFD f23, 3 * SIZE(BO)
  371. dcbt CO1, PREC
  372. dcbt CO2, PREC
  373. dcbt CO3, PREC
  374. dcbt CO4, PREC
  375. srawi. r0, TEMP, 2
  376. mtspr CTR, r0
  377. #endif
  378. ble LL(15)
  379. .align 4
  380. LL(12):
  381. FMADD f0, f16, f20, f0
  382. FMADD f5, f17, f21, f5
  383. FMADD f10, f18, f22, f10
  384. FMADD f15, f19, f23, f15
  385. LFD f28, 4 * SIZE(BO)
  386. LFD f29, 5 * SIZE(BO)
  387. LFD f30, 6 * SIZE(BO)
  388. LFD f31, 7 * SIZE(BO)
  389. FMADD f1, f17, f20, f1
  390. FMADD f2, f18, f20, f2
  391. FMADD f3, f19, f20, f3
  392. FMADD f4, f16, f21, f4
  393. LFD f24, 4 * SIZE(AO)
  394. LFD f25, 5 * SIZE(AO)
  395. LFD f26, 6 * SIZE(AO)
  396. LFD f27, 7 * SIZE(AO)
  397. FMADD f6, f18, f21, f6
  398. FMADD f7, f19, f21, f7
  399. FMADD f8, f16, f22, f8
  400. FMADD f9, f17, f22, f9
  401. FMADD f11, f19, f22, f11
  402. FMADD f12, f16, f23, f12
  403. FMADD f13, f17, f23, f13
  404. FMADD f14, f18, f23, f14
  405. LFD f20, 8 * SIZE(BO)
  406. LFD f21, 9 * SIZE(BO)
  407. LFD f22, 10 * SIZE(BO)
  408. LFD f23, 11 * SIZE(BO)
  409. FMADD f0, f24, f28, f0
  410. FMADD f5, f25, f29, f5
  411. FMADD f10, f26, f30, f10
  412. FMADD f15, f27, f31, f15
  413. LFD f16, 8 * SIZE(AO)
  414. LFD f17, 9 * SIZE(AO)
  415. LFD f18, 10 * SIZE(AO)
  416. LFD f19, 11 * SIZE(AO)
  417. FMADD f1, f25, f28, f1
  418. FMADD f2, f26, f28, f2
  419. FMADD f3, f27, f28, f3
  420. FMADD f4, f24, f29, f4
  421. FMADD f6, f26, f29, f6
  422. FMADD f7, f27, f29, f7
  423. FMADD f8, f24, f30, f8
  424. FMADD f9, f25, f30, f9
  425. FMADD f11, f27, f30, f11
  426. FMADD f12, f24, f31, f12
  427. FMADD f13, f25, f31, f13
  428. FMADD f14, f26, f31, f14
  429. LFD f28, 12 * SIZE(BO)
  430. LFD f29, 13 * SIZE(BO)
  431. LFD f30, 14 * SIZE(BO)
  432. LFD f31, 15 * SIZE(BO)
  433. FMADD f0, f16, f20, f0
  434. FMADD f5, f17, f21, f5
  435. FMADD f10, f18, f22, f10
  436. FMADD f15, f19, f23, f15
  437. LFD f24, 12 * SIZE(AO)
  438. LFD f25, 13 * SIZE(AO)
  439. LFD f26, 14 * SIZE(AO)
  440. LFD f27, 15 * SIZE(AO)
  441. FMADD f1, f17, f20, f1
  442. FMADD f2, f18, f20, f2
  443. FMADD f3, f19, f20, f3
  444. FMADD f4, f16, f21, f4
  445. FMADD f6, f18, f21, f6
  446. FMADD f7, f19, f21, f7
  447. FMADD f8, f16, f22, f8
  448. FMADD f9, f17, f22, f9
  449. FMADD f11, f19, f22, f11
  450. FMADD f12, f16, f23, f12
  451. FMADD f13, f17, f23, f13
  452. FMADD f14, f18, f23, f14
  453. LFD f20, 16 * SIZE(BO)
  454. LFD f21, 17 * SIZE(BO)
  455. LFD f22, 18 * SIZE(BO)
  456. LFD f23, 19 * SIZE(BO)
  457. FMADD f0, f24, f28, f0
  458. FMADD f5, f25, f29, f5
  459. FMADD f10, f26, f30, f10
  460. FMADD f15, f27, f31, f15
  461. LFD f16, 16 * SIZE(AO)
  462. LFD f17, 17 * SIZE(AO)
  463. LFD f18, 18 * SIZE(AO)
  464. LFD f19, 19 * SIZE(AO)
  465. FMADD f1, f25, f28, f1
  466. FMADD f2, f26, f28, f2
  467. FMADD f3, f27, f28, f3
  468. FMADD f4, f24, f29, f4
  469. FMADD f6, f26, f29, f6
  470. FMADD f7, f27, f29, f7
  471. FMADD f8, f24, f30, f8
  472. FMADD f9, f25, f30, f9
  473. FMADD f11, f27, f30, f11
  474. FMADD f12, f24, f31, f12
  475. FMADD f13, f25, f31, f13
  476. FMADD f14, f26, f31, f14
  477. addi AO, AO, 16 * SIZE
  478. addi BO, BO, 16 * SIZE
  479. #ifdef PPC970
  480. #ifndef ALLOC_HUGETLB
  481. DCBT(AO, PREA)
  482. #endif
  483. DCBT(BO, PREB)
  484. #endif
  485. #ifdef POWER4
  486. #ifndef ALLOC_HUGETLB
  487. DCBT(AO, PREA)
  488. #endif
  489. DCBT(BO, PREB)
  490. #endif
  491. #ifdef POWER5
  492. DCBT(AO, PREA)
  493. DCBT(BO, PREB)
  494. #endif
  495. bdnz LL(12)
  496. .align 4
  497. LL(15):
  498. #if defined(LT) || defined(RN)
  499. andi. r0, KK, 3
  500. #else
  501. andi. r0, TEMP, 3
  502. #endif
  503. mtspr CTR, r0
  504. ble+ LL(18)
  505. .align 4
  506. LL(16):
  507. FMADD f0, f16, f20, f0
  508. FMADD f5, f17, f21, f5
  509. FMADD f10, f18, f22, f10
  510. FMADD f15, f19, f23, f15
  511. FMADD f1, f17, f20, f1
  512. FMADD f2, f18, f20, f2
  513. FMADD f3, f19, f20, f3
  514. FMADD f4, f16, f21, f4
  515. FMADD f6, f18, f21, f6
  516. FMADD f7, f19, f21, f7
  517. FMADD f8, f16, f22, f8
  518. FMADD f9, f17, f22, f9
  519. FMADD f11, f19, f22, f11
  520. FMADD f12, f16, f23, f12
  521. FMADD f13, f17, f23, f13
  522. FMADD f14, f18, f23, f14
  523. LFD f16, 4 * SIZE(AO)
  524. LFD f17, 5 * SIZE(AO)
  525. LFD f18, 6 * SIZE(AO)
  526. LFD f19, 7 * SIZE(AO)
  527. LFD f20, 4 * SIZE(BO)
  528. LFD f21, 5 * SIZE(BO)
  529. LFD f22, 6 * SIZE(BO)
  530. LFD f23, 7 * SIZE(BO)
  531. addi BO, BO, 4 * SIZE
  532. addi AO, AO, 4 * SIZE
  533. bdnz LL(16)
  534. .align 4
  535. LL(18):
  536. #if defined(LN) || defined(RT)
  537. subi r0, KK, 4
  538. slwi r0, r0, 2 + BASE_SHIFT
  539. add AO, AORIG, r0
  540. add BO, B, r0
  541. #endif
  542. #if defined(LN) || defined(LT)
  543. LFD f16, 0 * SIZE(BO)
  544. LFD f17, 1 * SIZE(BO)
  545. LFD f18, 2 * SIZE(BO)
  546. LFD f19, 3 * SIZE(BO)
  547. LFD f20, 4 * SIZE(BO)
  548. LFD f21, 5 * SIZE(BO)
  549. LFD f22, 6 * SIZE(BO)
  550. LFD f23, 7 * SIZE(BO)
  551. LFD f24, 8 * SIZE(BO)
  552. LFD f25, 9 * SIZE(BO)
  553. LFD f26, 10 * SIZE(BO)
  554. LFD f27, 11 * SIZE(BO)
  555. LFD f28, 12 * SIZE(BO)
  556. LFD f29, 13 * SIZE(BO)
  557. LFD f30, 14 * SIZE(BO)
  558. LFD f31, 15 * SIZE(BO)
  559. FSUB f0, f16, f0
  560. FSUB f4, f17, f4
  561. FSUB f8, f18, f8
  562. FSUB f12, f19, f12
  563. FSUB f1, f20, f1
  564. FSUB f5, f21, f5
  565. FSUB f9, f22, f9
  566. FSUB f13, f23, f13
  567. FSUB f2, f24, f2
  568. FSUB f6, f25, f6
  569. FSUB f10, f26, f10
  570. FSUB f14, f27, f14
  571. FSUB f3, f28, f3
  572. FSUB f7, f29, f7
  573. FSUB f11, f30, f11
  574. FSUB f15, f31, f15
  575. #else
  576. LFD f16, 0 * SIZE(AO)
  577. LFD f17, 1 * SIZE(AO)
  578. LFD f18, 2 * SIZE(AO)
  579. LFD f19, 3 * SIZE(AO)
  580. LFD f20, 4 * SIZE(AO)
  581. LFD f21, 5 * SIZE(AO)
  582. LFD f22, 6 * SIZE(AO)
  583. LFD f23, 7 * SIZE(AO)
  584. LFD f24, 8 * SIZE(AO)
  585. LFD f25, 9 * SIZE(AO)
  586. LFD f26, 10 * SIZE(AO)
  587. LFD f27, 11 * SIZE(AO)
  588. LFD f28, 12 * SIZE(AO)
  589. LFD f29, 13 * SIZE(AO)
  590. LFD f30, 14 * SIZE(AO)
  591. LFD f31, 15 * SIZE(AO)
  592. FSUB f0, f16, f0
  593. FSUB f1, f17, f1
  594. FSUB f2, f18, f2
  595. FSUB f3, f19, f3
  596. FSUB f4, f20, f4
  597. FSUB f5, f21, f5
  598. FSUB f6, f22, f6
  599. FSUB f7, f23, f7
  600. FSUB f8, f24, f8
  601. FSUB f9, f25, f9
  602. FSUB f10, f26, f10
  603. FSUB f11, f27, f11
  604. FSUB f12, f28, f12
  605. FSUB f13, f29, f13
  606. FSUB f14, f30, f14
  607. FSUB f15, f31, f15
  608. #endif
  609. #ifdef LN
  610. LFD f16, 15 * SIZE(AO)
  611. LFD f17, 14 * SIZE(AO)
  612. LFD f18, 13 * SIZE(AO)
  613. LFD f19, 12 * SIZE(AO)
  614. FMUL f3, f16, f3
  615. FMUL f7, f16, f7
  616. FMUL f11, f16, f11
  617. FMUL f15, f16, f15
  618. FNMSUB f2, f17, f3, f2
  619. FNMSUB f6, f17, f7, f6
  620. FNMSUB f10, f17, f11, f10
  621. FNMSUB f14, f17, f15, f14
  622. FNMSUB f1, f18, f3, f1
  623. FNMSUB f5, f18, f7, f5
  624. FNMSUB f9, f18, f11, f9
  625. FNMSUB f13, f18, f15, f13
  626. FNMSUB f0, f19, f3, f0
  627. FNMSUB f4, f19, f7, f4
  628. FNMSUB f8, f19, f11, f8
  629. FNMSUB f12, f19, f15, f12
  630. LFD f16, 10 * SIZE(AO)
  631. LFD f17, 9 * SIZE(AO)
  632. LFD f18, 8 * SIZE(AO)
  633. LFD f19, 5 * SIZE(AO)
  634. FMUL f2, f16, f2
  635. FMUL f6, f16, f6
  636. FMUL f10, f16, f10
  637. FMUL f14, f16, f14
  638. LFD f20, 4 * SIZE(AO)
  639. LFD f21, 0 * SIZE(AO)
  640. FNMSUB f1, f17, f2, f1
  641. FNMSUB f5, f17, f6, f5
  642. FNMSUB f9, f17, f10, f9
  643. FNMSUB f13, f17, f14, f13
  644. FNMSUB f0, f18, f2, f0
  645. FNMSUB f4, f18, f6, f4
  646. FNMSUB f8, f18, f10, f8
  647. FNMSUB f12, f18, f14, f12
  648. FMUL f1, f19, f1
  649. FMUL f5, f19, f5
  650. FMUL f9, f19, f9
  651. FMUL f13, f19, f13
  652. FNMSUB f0, f20, f1, f0
  653. FNMSUB f4, f20, f5, f4
  654. FNMSUB f8, f20, f9, f8
  655. FNMSUB f12, f20, f13, f12
  656. FMUL f0, f21, f0
  657. FMUL f4, f21, f4
  658. FMUL f8, f21, f8
  659. FMUL f12, f21, f12
  660. #endif
  661. #ifdef LT
  662. LFD f16, 0 * SIZE(AO)
  663. LFD f17, 1 * SIZE(AO)
  664. LFD f18, 2 * SIZE(AO)
  665. LFD f19, 3 * SIZE(AO)
  666. FMUL f0, f16, f0
  667. FMUL f4, f16, f4
  668. FMUL f8, f16, f8
  669. FMUL f12, f16, f12
  670. FNMSUB f1, f17, f0, f1
  671. FNMSUB f5, f17, f4, f5
  672. FNMSUB f9, f17, f8, f9
  673. FNMSUB f13, f17, f12, f13
  674. FNMSUB f2, f18, f0, f2
  675. FNMSUB f6, f18, f4, f6
  676. FNMSUB f10, f18, f8, f10
  677. FNMSUB f14, f18, f12, f14
  678. FNMSUB f3, f19, f0, f3
  679. FNMSUB f7, f19, f4, f7
  680. FNMSUB f11, f19, f8, f11
  681. FNMSUB f15, f19, f12, f15
  682. LFD f16, 5 * SIZE(AO)
  683. LFD f17, 6 * SIZE(AO)
  684. LFD f18, 7 * SIZE(AO)
  685. LFD f19, 10 * SIZE(AO)
  686. FMUL f1, f16, f1
  687. FMUL f5, f16, f5
  688. FMUL f9, f16, f9
  689. FMUL f13, f16, f13
  690. LFD f20, 11 * SIZE(AO)
  691. LFD f21, 15 * SIZE(AO)
  692. FNMSUB f2, f17, f1, f2
  693. FNMSUB f6, f17, f5, f6
  694. FNMSUB f10, f17, f9, f10
  695. FNMSUB f14, f17, f13, f14
  696. FNMSUB f3, f18, f1, f3
  697. FNMSUB f7, f18, f5, f7
  698. FNMSUB f11, f18, f9, f11
  699. FNMSUB f15, f18, f13, f15
  700. FMUL f2, f19, f2
  701. FMUL f6, f19, f6
  702. FMUL f10, f19, f10
  703. FMUL f14, f19, f14
  704. FNMSUB f3, f20, f2, f3
  705. FNMSUB f7, f20, f6, f7
  706. FNMSUB f11, f20, f10, f11
  707. FNMSUB f15, f20, f14, f15
  708. FMUL f3, f21, f3
  709. FMUL f7, f21, f7
  710. FMUL f11, f21, f11
  711. FMUL f15, f21, f15
  712. #endif
  713. #ifdef RN
  714. LFD f16, 0 * SIZE(BO)
  715. LFD f17, 1 * SIZE(BO)
  716. LFD f18, 2 * SIZE(BO)
  717. LFD f19, 3 * SIZE(BO)
  718. FMUL f0, f16, f0
  719. FMUL f1, f16, f1
  720. FMUL f2, f16, f2
  721. FMUL f3, f16, f3
  722. FNMSUB f4, f17, f0, f4
  723. FNMSUB f5, f17, f1, f5
  724. FNMSUB f6, f17, f2, f6
  725. FNMSUB f7, f17, f3, f7
  726. FNMSUB f8, f18, f0, f8
  727. FNMSUB f9, f18, f1, f9
  728. FNMSUB f10, f18, f2, f10
  729. FNMSUB f11, f18, f3, f11
  730. FNMSUB f12, f19, f0, f12
  731. FNMSUB f13, f19, f1, f13
  732. FNMSUB f14, f19, f2, f14
  733. FNMSUB f15, f19, f3, f15
  734. LFD f16, 5 * SIZE(BO)
  735. LFD f17, 6 * SIZE(BO)
  736. LFD f18, 7 * SIZE(BO)
  737. LFD f19, 10 * SIZE(BO)
  738. FMUL f4, f16, f4
  739. FMUL f5, f16, f5
  740. FMUL f6, f16, f6
  741. FMUL f7, f16, f7
  742. LFD f20, 11 * SIZE(BO)
  743. LFD f21, 15 * SIZE(BO)
  744. FNMSUB f8, f17, f4, f8
  745. FNMSUB f9, f17, f5, f9
  746. FNMSUB f10, f17, f6, f10
  747. FNMSUB f11, f17, f7, f11
  748. FNMSUB f12, f18, f4, f12
  749. FNMSUB f13, f18, f5, f13
  750. FNMSUB f14, f18, f6, f14
  751. FNMSUB f15, f18, f7, f15
  752. FMUL f8, f19, f8
  753. FMUL f9, f19, f9
  754. FMUL f10, f19, f10
  755. FMUL f11, f19, f11
  756. FNMSUB f12, f20, f8, f12
  757. FNMSUB f13, f20, f9, f13
  758. FNMSUB f14, f20, f10, f14
  759. FNMSUB f15, f20, f11, f15
  760. FMUL f12, f21, f12
  761. FMUL f13, f21, f13
  762. FMUL f14, f21, f14
  763. FMUL f15, f21, f15
  764. #endif
  765. #ifdef RT
  766. LFD f16, 15 * SIZE(BO)
  767. LFD f17, 14 * SIZE(BO)
  768. LFD f18, 13 * SIZE(BO)
  769. LFD f19, 12 * SIZE(BO)
  770. FMUL f12, f16, f12
  771. FMUL f13, f16, f13
  772. FMUL f14, f16, f14
  773. FMUL f15, f16, f15
  774. FNMSUB f8, f17, f12, f8
  775. FNMSUB f9, f17, f13, f9
  776. FNMSUB f10, f17, f14, f10
  777. FNMSUB f11, f17, f15, f11
  778. FNMSUB f4, f18, f12, f4
  779. FNMSUB f5, f18, f13, f5
  780. FNMSUB f6, f18, f14, f6
  781. FNMSUB f7, f18, f15, f7
  782. FNMSUB f0, f19, f12, f0
  783. FNMSUB f1, f19, f13, f1
  784. FNMSUB f2, f19, f14, f2
  785. FNMSUB f3, f19, f15, f3
  786. LFD f16, 10 * SIZE(BO)
  787. LFD f17, 9 * SIZE(BO)
  788. LFD f18, 8 * SIZE(BO)
  789. LFD f19, 5 * SIZE(BO)
  790. FMUL f8, f16, f8
  791. FMUL f9, f16, f9
  792. FMUL f10, f16, f10
  793. FMUL f11, f16, f11
  794. LFD f20, 4 * SIZE(BO)
  795. LFD f21, 0 * SIZE(BO)
  796. FNMSUB f4, f17, f8, f4
  797. FNMSUB f5, f17, f9, f5
  798. FNMSUB f6, f17, f10, f6
  799. FNMSUB f7, f17, f11, f7
  800. FNMSUB f0, f18, f8, f0
  801. FNMSUB f1, f18, f9, f1
  802. FNMSUB f2, f18, f10, f2
  803. FNMSUB f3, f18, f11, f3
  804. FMUL f4, f19, f4
  805. FMUL f5, f19, f5
  806. FMUL f6, f19, f6
  807. FMUL f7, f19, f7
  808. FNMSUB f0, f20, f4, f0
  809. FNMSUB f1, f20, f5, f1
  810. FNMSUB f2, f20, f6, f2
  811. FNMSUB f3, f20, f7, f3
  812. FMUL f0, f21, f0
  813. FMUL f1, f21, f1
  814. FMUL f2, f21, f2
  815. FMUL f3, f21, f3
  816. #endif
  817. #ifdef LN
  818. subi CO1, CO1, 4 * SIZE
  819. subi CO2, CO2, 4 * SIZE
  820. subi CO3, CO3, 4 * SIZE
  821. subi CO4, CO4, 4 * SIZE
  822. #endif
  823. #if defined(LN) || defined(LT)
  824. STFD f0, 0 * SIZE(BO)
  825. STFD f4, 1 * SIZE(BO)
  826. STFD f8, 2 * SIZE(BO)
  827. STFD f12, 3 * SIZE(BO)
  828. STFD f1, 4 * SIZE(BO)
  829. STFD f5, 5 * SIZE(BO)
  830. STFD f9, 6 * SIZE(BO)
  831. STFD f13, 7 * SIZE(BO)
  832. STFD f2, 8 * SIZE(BO)
  833. STFD f6, 9 * SIZE(BO)
  834. STFD f10, 10 * SIZE(BO)
  835. STFD f14, 11 * SIZE(BO)
  836. STFD f3, 12 * SIZE(BO)
  837. STFD f7, 13 * SIZE(BO)
  838. STFD f11, 14 * SIZE(BO)
  839. STFD f15, 15 * SIZE(BO)
  840. #else
  841. STFD f0, 0 * SIZE(AO)
  842. STFD f1, 1 * SIZE(AO)
  843. STFD f2, 2 * SIZE(AO)
  844. STFD f3, 3 * SIZE(AO)
  845. STFD f4, 4 * SIZE(AO)
  846. STFD f5, 5 * SIZE(AO)
  847. STFD f6, 6 * SIZE(AO)
  848. STFD f7, 7 * SIZE(AO)
  849. STFD f8, 8 * SIZE(AO)
  850. STFD f9, 9 * SIZE(AO)
  851. STFD f10, 10 * SIZE(AO)
  852. STFD f11, 11 * SIZE(AO)
  853. STFD f12, 12 * SIZE(AO)
  854. STFD f13, 13 * SIZE(AO)
  855. STFD f14, 14 * SIZE(AO)
  856. STFD f15, 15 * SIZE(AO)
  857. #endif
  858. STFD f0, 0 * SIZE(CO1)
  859. STFD f1, 1 * SIZE(CO1)
  860. STFD f2, 2 * SIZE(CO1)
  861. STFD f3, 3 * SIZE(CO1)
  862. STFD f4, 0 * SIZE(CO2)
  863. STFD f5, 1 * SIZE(CO2)
  864. STFD f6, 2 * SIZE(CO2)
  865. STFD f7, 3 * SIZE(CO2)
  866. STFD f8, 0 * SIZE(CO3)
  867. STFD f9, 1 * SIZE(CO3)
  868. STFD f10, 2 * SIZE(CO3)
  869. STFD f11, 3 * SIZE(CO3)
  870. STFD f12, 0 * SIZE(CO4)
  871. STFD f13, 1 * SIZE(CO4)
  872. STFD f14, 2 * SIZE(CO4)
  873. STFD f15, 3 * SIZE(CO4)
  874. lfs f0, FZERO
  875. fmr f1, f0
  876. fmr f2, f0
  877. fmr f3, f0
  878. fmr f4, f0
  879. fmr f5, f0
  880. fmr f6, f0
  881. fmr f7, f0
  882. fmr f8, f0
  883. fmr f9, f0
  884. fmr f10, f0
  885. fmr f11, f0
  886. fmr f12, f0
  887. fmr f13, f0
  888. fmr f14, f0
  889. fmr f15, f0
  890. #ifndef LN
  891. addi CO1, CO1, 4 * SIZE
  892. addi CO2, CO2, 4 * SIZE
  893. addi CO3, CO3, 4 * SIZE
  894. addi CO4, CO4, 4 * SIZE
  895. #endif
  896. #ifdef RT
  897. slwi r0, K, 2 + BASE_SHIFT
  898. add AORIG, AORIG, r0
  899. #endif
  900. #if defined(LT) || defined(RN)
  901. sub TEMP, K, KK
  902. slwi TEMP, TEMP, 2 + BASE_SHIFT
  903. add AO, AO, TEMP
  904. add BO, BO, TEMP
  905. #endif
  906. #ifdef LT
  907. addi KK, KK, 4
  908. #endif
  909. #ifdef LN
  910. subi KK, KK, 4
  911. #endif
  912. addic. I, I, -1
  913. bgt+ LL(11)
  914. .align 4
  /* LL(20): set-up for the 2-row remainder tile of the current 4-column panel.
     Runs only when bit 1 of M is set. Preloads four A values (f16-f19) and
     eight B values (f20-f27), points AO/BO at the data to accumulate over and
     loads CTR with the number of 4-step unrolled iterations.
     NOTE(review): LFD is a macro defined outside this view; presumably a
     precision-dependent floating-point load - confirm. */
  915. LL(20):
  /* I = M & 2; andi. records CR0, so ble skips to LL(30) when the bit is clear. */
  916. andi. I, M, 2
  917. ble LL(30)
  918. #if defined(LT) || defined(RN)
  /* LT/RN: AO is used as-is, B values are read from the panel start B. */
  919. LFD f16, 0 * SIZE(AO)
  920. LFD f17, 1 * SIZE(AO)
  921. LFD f18, 2 * SIZE(AO)
  922. LFD f19, 3 * SIZE(AO)
  923. LFD f20, 0 * SIZE(B)
  924. LFD f21, 1 * SIZE(B)
  925. LFD f22, 2 * SIZE(B)
  926. LFD f23, 3 * SIZE(B)
  927. LFD f24, 4 * SIZE(B)
  928. LFD f25, 5 * SIZE(B)
  929. LFD f26, 6 * SIZE(B)
  930. LFD f27, 7 * SIZE(B)
  /* CTR = KK >> 2 (unrolled iteration count); CR0 from srawi. is consumed by
     the ble LL(25) below - mtspr and mr do not modify it. */
  931. srawi. r0, KK, 2
  932. mtspr CTR, r0
  933. mr BO, B
  934. #else
  935. #ifdef LN
  /* LN: step AORIG back by K << (1 + BASE_SHIFT) bytes. */
  936. slwi r0, K, 1 + BASE_SHIFT
  937. sub AORIG, AORIG, r0
  938. #endif
  /* AO = AORIG + (KK << (1 + BASE_SHIFT)), BO = B + (KK << (2 + BASE_SHIFT)),
     then TEMP = K - KK is the number of steps to accumulate. */
  939. slwi r0, KK, 1 + BASE_SHIFT
  940. slwi TEMP, KK, 2 + BASE_SHIFT
  941. add AO, AORIG, r0
  942. add BO, B, TEMP
  943. sub TEMP, K, KK
  944. LFD f16, 0 * SIZE(AO)
  945. LFD f17, 1 * SIZE(AO)
  946. LFD f18, 2 * SIZE(AO)
  947. LFD f19, 3 * SIZE(AO)
  948. LFD f20, 0 * SIZE(BO)
  949. LFD f21, 1 * SIZE(BO)
  950. LFD f22, 2 * SIZE(BO)
  951. LFD f23, 3 * SIZE(BO)
  952. LFD f24, 4 * SIZE(BO)
  953. LFD f25, 5 * SIZE(BO)
  954. LFD f26, 6 * SIZE(BO)
  955. LFD f27, 7 * SIZE(BO)
  /* CTR = TEMP >> 2; CR0 from srawi. feeds the ble below. */
  956. srawi. r0, TEMP, 2
  957. mtspr CTR, r0
  958. #endif
  /* No full unrolled iteration: go straight to the remainder handling. */
  959. ble LL(25)
  960. .align 5
  /* LL(22): 2x4 accumulation loop, unrolled four steps per iteration.
     Each iteration consumes 8 A values and 16 B values (see the addi at the
     bottom). Steps 1 and 3 accumulate into f0,f1,f4,f5,f8,f9,f12,f13; steps 2
     and 4 accumulate into f2,f3,f6,f7,f10,f11,f14,f15. The two sets are
     summed by the fadd sequence after the loop. Loads for later steps are
     interleaved with the arithmetic.
     NOTE(review): FMADD is a macro defined outside this view; comments assume
     "FMADD d, a, b, c" computes d = a * b + c - confirm. */
  961. LL(22):
  /* step 1: A pair f16,f17 times B values f20-f23 */
  962. FMADD f0, f16, f20, f0
  963. FMADD f1, f17, f20, f1
  964. FMADD f4, f16, f21, f4
  965. FMADD f5, f17, f21, f5
  966. FMADD f8, f16, f22, f8
  967. FMADD f9, f17, f22, f9
  968. FMADD f12, f16, f23, f12
  969. FMADD f13, f17, f23, f13
  /* B values for step 3 */
  970. LFD f20, 8 * SIZE(BO)
  971. LFD f21, 9 * SIZE(BO)
  972. LFD f22, 10 * SIZE(BO)
  973. LFD f23, 11 * SIZE(BO)
  /* step 2: A pair f18,f19 times B values f24-f27, into the second accumulator set */
  974. FMADD f2, f18, f24, f2
  975. FMADD f3, f19, f24, f3
  976. FMADD f6, f18, f25, f6
  977. FMADD f7, f19, f25, f7
  978. FMADD f10, f18, f26, f10
  979. FMADD f11, f19, f26, f11
  980. FMADD f14, f18, f27, f14
  981. FMADD f15, f19, f27, f15
  /* A values for steps 3 and 4 */
  982. LFD f16, 4 * SIZE(AO)
  983. LFD f17, 5 * SIZE(AO)
  984. LFD f18, 6 * SIZE(AO)
  985. LFD f19, 7 * SIZE(AO)
  /* step 3 (first half) */
  986. FMADD f0, f16, f20, f0
  987. FMADD f1, f17, f20, f1
  988. FMADD f4, f16, f21, f4
  989. FMADD f5, f17, f21, f5
  /* B values for step 4 */
  990. LFD f24, 12 * SIZE(BO)
  991. LFD f25, 13 * SIZE(BO)
  992. LFD f26, 14 * SIZE(BO)
  993. LFD f27, 15 * SIZE(BO)
  /* step 3 (second half) */
  994. FMADD f8, f16, f22, f8
  995. FMADD f9, f17, f22, f9
  996. FMADD f12, f16, f23, f12
  997. FMADD f13, f17, f23, f13
  /* preload B values for step 1 of the next iteration (or for LL(26)) */
  998. LFD f20, 16 * SIZE(BO)
  999. LFD f21, 17 * SIZE(BO)
  1000. LFD f22, 18 * SIZE(BO)
  1001. LFD f23, 19 * SIZE(BO)
  /* step 4 */
  1002. FMADD f2, f18, f24, f2
  1003. FMADD f3, f19, f24, f3
  1004. FMADD f6, f18, f25, f6
  1005. FMADD f7, f19, f25, f7
  1006. FMADD f10, f18, f26, f10
  1007. FMADD f11, f19, f26, f11
  1008. FMADD f14, f18, f27, f14
  1009. FMADD f15, f19, f27, f15
  /* preload A and remaining B values for the next iteration */
  1010. LFD f16, 8 * SIZE(AO)
  1011. LFD f17, 9 * SIZE(AO)
  1012. LFD f18, 10 * SIZE(AO)
  1013. LFD f19, 11 * SIZE(AO)
  1014. LFD f24, 20 * SIZE(BO)
  1015. LFD f25, 21 * SIZE(BO)
  1016. LFD f26, 22 * SIZE(BO)
  1017. LFD f27, 23 * SIZE(BO)
  /* advance past the 8 A and 16 B values consumed */
  1018. addi AO, AO, 8 * SIZE
  1019. addi BO, BO, 16 * SIZE
  /* NOTE(review): DCBT is a macro defined elsewhere; presumably a data-cache
     prefetch relative to BO - confirm. */
  1020. DCBT(BO, PREB)
  /* decrement CTR and loop while it is non-zero */
  1021. bdnz LL(22)
  /* Merge the second accumulator set into the first. This is only reached by
     falling out of the loop; the ble LL(25) taken before the loop jumps past it. */
  1022. fadd f0, f2, f0
  1023. fadd f1, f3, f1
  1024. fadd f4, f6, f4
  1025. fadd f5, f7, f5
  1026. fadd f8, f10, f8
  1027. fadd f9, f11, f9
  1028. fadd f12, f14, f12
  1029. fadd f13, f15, f13
  1030. .align 4
  /* LL(25): load CTR with the 0-3 steps left over after the 4x-unrolled loop.
     The count is the low two bits of KK (LT/RN) or of TEMP (other variants). */
  1031. LL(25):
  1032. #if defined(LT) || defined(RN)
  1033. andi. r0, KK, 3
  1034. #else
  1035. andi. r0, TEMP, 3
  1036. #endif
  1037. mtspr CTR, r0
  /* CR0 comes from andi.; skip the remainder loop when no steps are left
     ('+' is the assembler's branch-prediction hint suffix). */
  1038. ble+ LL(28)
  1039. .align 4
  /* LL(26): remainder loop for the 2x4 tile, one step per iteration.
     Multiplies the A pair f16,f17 by the four B values f20-f23 and accumulates
     into f0,f1,f4,f5,f8,f9,f12,f13, then reloads the next A pair and B quad.
     NOTE(review): assumes "FMADD d, a, b, c" computes d = a * b + c - confirm
     against the macro definition. */
  1040. LL(26):
  1041. FMADD f0, f16, f20, f0
  1042. FMADD f1, f17, f20, f1
  1043. FMADD f4, f16, f21, f4
  1044. FMADD f5, f17, f21, f5
  1045. FMADD f8, f16, f22, f8
  1046. FMADD f9, f17, f22, f9
  1047. FMADD f12, f16, f23, f12
  1048. FMADD f13, f17, f23, f13
  /* operands for the next step */
  1049. LFD f16, 2 * SIZE(AO)
  1050. LFD f17, 3 * SIZE(AO)
  1051. LFD f20, 4 * SIZE(BO)
  1052. LFD f21, 5 * SIZE(BO)
  1053. LFD f22, 6 * SIZE(BO)
  1054. LFD f23, 7 * SIZE(BO)
  /* advance by one step: 4 B values and 2 A values */
  1055. addi BO, BO, 4 * SIZE
  1056. addi AO, AO, 2 * SIZE
  /* decrement CTR and loop while it is non-zero */
  1057. bdnz LL(26)
  1058. .align 4
  /* LL(28): solve and store for the 2x4 tile.
     The accumulated products (f0,f1 / f4,f5 / f8,f9 / f12,f13) are subtracted
     from the values loaded from the packed buffer, the variant-specific
     substitution (LN, LT, RN or RT) is applied, and the result is written back
     to the packed buffer and to the four output pointers CO1..CO4. Afterwards
     f0-f15 are cleared and the pointers / KK are updated.
     NOTE(review): FSUB, FMUL, FNMSUB, LFD and STFD are macros defined outside
     this view. Comments assume "FSUB d, a, b" is d = a - b, "FMUL d, a, b" is
     d = a * b and "FNMSUB d, a, c, b" is d = b - a * c - confirm.
     NOTE(review): diagonal entries are applied with FMUL, so they are
     presumably stored pre-inverted by the packing code - confirm. */
  1059. LL(28):
  1060. #if defined(LN) || defined(RT)
  /* LN/RT: re-derive AO and BO from AORIG and B using KK minus the tile
     extent (2 for LN, 4 for RT). */
  1061. #ifdef LN
  1062. subi r0, KK, 2
  1063. #else
  1064. subi r0, KK, 4
  1065. #endif
  1066. slwi TEMP, r0, 1 + BASE_SHIFT
  1067. slwi r0, r0, 2 + BASE_SHIFT
  1068. add AO, AORIG, TEMP
  1069. add BO, B, r0
  1070. #endif
  1071. #if defined(LN) || defined(LT)
  /* LN/LT: the eight values come from BO, in the order
     f0,f4,f8,f12,f1,f5,f9,f13. */
  1072. LFD f16, 0 * SIZE(BO)
  1073. LFD f17, 1 * SIZE(BO)
  1074. LFD f18, 2 * SIZE(BO)
  1075. LFD f19, 3 * SIZE(BO)
  1076. LFD f20, 4 * SIZE(BO)
  1077. LFD f21, 5 * SIZE(BO)
  1078. LFD f22, 6 * SIZE(BO)
  1079. LFD f23, 7 * SIZE(BO)
  1080. FSUB f0, f16, f0
  1081. FSUB f4, f17, f4
  1082. FSUB f8, f18, f8
  1083. FSUB f12, f19, f12
  1084. FSUB f1, f20, f1
  1085. FSUB f5, f21, f5
  1086. FSUB f9, f22, f9
  1087. FSUB f13, f23, f13
  1088. #else
  /* RN/RT: the eight values come from AO, in the order
     f0,f1,f4,f5,f8,f9,f12,f13. */
  1089. LFD f16, 0 * SIZE(AO)
  1090. LFD f17, 1 * SIZE(AO)
  1091. LFD f20, 2 * SIZE(AO)
  1092. LFD f21, 3 * SIZE(AO)
  1093. LFD f24, 4 * SIZE(AO)
  1094. LFD f25, 5 * SIZE(AO)
  1095. LFD f28, 6 * SIZE(AO)
  1096. LFD f29, 7 * SIZE(AO)
  1097. FSUB f0, f16, f0
  1098. FSUB f1, f17, f1
  1099. FSUB f4, f20, f4
  1100. FSUB f5, f21, f5
  1101. FSUB f8, f24, f8
  1102. FSUB f9, f25, f9
  1103. FSUB f12, f28, f12
  1104. FSUB f13, f29, f13
  1105. #endif
  1106. #ifdef LN
  /* LN: 2x2 substitution using AO[3], AO[2], AO[0]; second row first, then
     eliminate it from the first row. */
  1107. LFD f19, 3 * SIZE(AO)
  1108. LFD f20, 2 * SIZE(AO)
  1109. LFD f21, 0 * SIZE(AO)
  1110. FMUL f1, f19, f1
  1111. FMUL f5, f19, f5
  1112. FMUL f9, f19, f9
  1113. FMUL f13, f19, f13
  1114. FNMSUB f0, f20, f1, f0
  1115. FNMSUB f4, f20, f5, f4
  1116. FNMSUB f8, f20, f9, f8
  1117. FNMSUB f12, f20, f13, f12
  1118. FMUL f0, f21, f0
  1119. FMUL f4, f21, f4
  1120. FMUL f8, f21, f8
  1121. FMUL f12, f21, f12
  1122. #endif
  1123. #ifdef LT
  /* LT: 2x2 substitution using AO[0], AO[1], AO[3]; first row first, then
     eliminate it from the second row. */
  1124. LFD f16, 0 * SIZE(AO)
  1125. LFD f17, 1 * SIZE(AO)
  1126. FMUL f0, f16, f0
  1127. FMUL f4, f16, f4
  1128. FMUL f8, f16, f8
  1129. FMUL f12, f16, f12
  1130. FNMSUB f1, f17, f0, f1
  1131. FNMSUB f5, f17, f4, f5
  1132. FNMSUB f9, f17, f8, f9
  1133. FNMSUB f13, f17, f12, f13
  1134. LFD f17, 3 * SIZE(AO)
  1135. FMUL f1, f17, f1
  1136. FMUL f5, f17, f5
  1137. FMUL f9, f17, f9
  1138. FMUL f13, f17, f13
  1139. #endif
  1140. #ifdef RN
  /* RN: 4x4 substitution over the columns, front to back, using BO[0..3],
     BO[5..7], BO[10..11] and BO[15]. */
  1141. LFD f16, 0 * SIZE(BO)
  1142. LFD f17, 1 * SIZE(BO)
  1143. LFD f18, 2 * SIZE(BO)
  1144. LFD f19, 3 * SIZE(BO)
  1145. FMUL f0, f16, f0
  1146. FMUL f1, f16, f1
  1147. FNMSUB f4, f17, f0, f4
  1148. FNMSUB f5, f17, f1, f5
  1149. FNMSUB f8, f18, f0, f8
  1150. FNMSUB f9, f18, f1, f9
  1151. FNMSUB f12, f19, f0, f12
  1152. FNMSUB f13, f19, f1, f13
  1153. LFD f16, 5 * SIZE(BO)
  1154. LFD f17, 6 * SIZE(BO)
  1155. LFD f18, 7 * SIZE(BO)
  1156. LFD f19, 10 * SIZE(BO)
  1157. LFD f20, 11 * SIZE(BO)
  1158. LFD f21, 15 * SIZE(BO)
  1159. FMUL f4, f16, f4
  1160. FMUL f5, f16, f5
  1161. FNMSUB f8, f17, f4, f8
  1162. FNMSUB f9, f17, f5, f9
  1163. FNMSUB f12, f18, f4, f12
  1164. FNMSUB f13, f18, f5, f13
  1165. FMUL f8, f19, f8
  1166. FMUL f9, f19, f9
  1167. FNMSUB f12, f20, f8, f12
  1168. FNMSUB f13, f20, f9, f13
  1169. FMUL f12, f21, f12
  1170. FMUL f13, f21, f13
  1171. #endif
  1172. #ifdef RT
  /* RT: 4x4 substitution over the columns, back to front, using BO[15..12],
     BO[10..8], BO[5..4] and BO[0]. */
  1173. LFD f16, 15 * SIZE(BO)
  1174. LFD f17, 14 * SIZE(BO)
  1175. LFD f18, 13 * SIZE(BO)
  1176. LFD f19, 12 * SIZE(BO)
  1177. FMUL f12, f16, f12
  1178. FMUL f13, f16, f13
  1179. FNMSUB f8, f17, f12, f8
  1180. FNMSUB f9, f17, f13, f9
  1181. FNMSUB f4, f18, f12, f4
  1182. FNMSUB f5, f18, f13, f5
  1183. FNMSUB f0, f19, f12, f0
  1184. FNMSUB f1, f19, f13, f1
  1185. LFD f16, 10 * SIZE(BO)
  1186. LFD f17, 9 * SIZE(BO)
  1187. LFD f18, 8 * SIZE(BO)
  1188. LFD f19, 5 * SIZE(BO)
  1189. LFD f20, 4 * SIZE(BO)
  1190. LFD f21, 0 * SIZE(BO)
  1191. FMUL f8, f16, f8
  1192. FMUL f9, f16, f9
  1193. FNMSUB f4, f17, f8, f4
  1194. FNMSUB f5, f17, f9, f5
  1195. FNMSUB f0, f18, f8, f0
  1196. FNMSUB f1, f18, f9, f1
  1197. FMUL f4, f19, f4
  1198. FMUL f5, f19, f5
  1199. FNMSUB f0, f20, f4, f0
  1200. FNMSUB f1, f20, f5, f1
  1201. FMUL f0, f21, f0
  1202. FMUL f1, f21, f1
  1203. #endif
  1204. #ifdef LN
  /* LN: move the output pointers back by two elements before storing. */
  1205. subi CO1, CO1, 2 * SIZE
  1206. subi CO2, CO2, 2 * SIZE
  1207. subi CO3, CO3, 2 * SIZE
  1208. subi CO4, CO4, 2 * SIZE
  1209. #endif
  /* Write the solved values back to the packed buffer, in the same order in
     which they were loaded above. */
  1210. #if defined(LN) || defined(LT)
  1211. STFD f0, 0 * SIZE(BO)
  1212. STFD f4, 1 * SIZE(BO)
  1213. STFD f8, 2 * SIZE(BO)
  1214. STFD f12, 3 * SIZE(BO)
  1215. STFD f1, 4 * SIZE(BO)
  1216. STFD f5, 5 * SIZE(BO)
  1217. STFD f9, 6 * SIZE(BO)
  1218. STFD f13, 7 * SIZE(BO)
  1219. #else
  1220. STFD f0, 0 * SIZE(AO)
  1221. STFD f1, 1 * SIZE(AO)
  1222. STFD f4, 2 * SIZE(AO)
  1223. STFD f5, 3 * SIZE(AO)
  1224. STFD f8, 4 * SIZE(AO)
  1225. STFD f9, 5 * SIZE(AO)
  1226. STFD f12, 6 * SIZE(AO)
  1227. STFD f13, 7 * SIZE(AO)
  1228. #endif
  /* Store the 2x4 result: two elements to each of CO1..CO4. */
  1229. STFD f0, 0 * SIZE(CO1)
  1230. STFD f1, 1 * SIZE(CO1)
  1231. STFD f4, 0 * SIZE(CO2)
  1232. STFD f5, 1 * SIZE(CO2)
  1233. STFD f8, 0 * SIZE(CO3)
  1234. STFD f9, 1 * SIZE(CO3)
  1235. STFD f12, 0 * SIZE(CO4)
  1236. STFD f13, 1 * SIZE(CO4)
  /* Reset all sixteen accumulators from FZERO for the next tile. */
  1237. lfs f0, FZERO
  1238. fmr f1, f0
  1239. fmr f2, f0
  1240. fmr f3, f0
  1241. fmr f4, f0
  1242. fmr f5, f0
  1243. fmr f6, f0
  1244. fmr f7, f0
  1245. fmr f8, f0
  1246. fmr f9, f0
  1247. fmr f10, f0
  1248. fmr f11, f0
  1249. fmr f12, f0
  1250. fmr f13, f0
  1251. fmr f14, f0
  1252. fmr f15, f0
  1253. #ifndef LN
  /* Non-LN variants: advance the output pointers by two elements. */
  1254. addi CO1, CO1, 2 * SIZE
  1255. addi CO2, CO2, 2 * SIZE
  1256. addi CO3, CO3, 2 * SIZE
  1257. addi CO4, CO4, 2 * SIZE
  1258. #endif
  1259. #ifdef RT
  /* RT: AORIG += K << (1 + BASE_SHIFT). */
  1260. slwi r0, K, 1 + BASE_SHIFT
  1261. add AORIG, AORIG, r0
  1262. #endif
  1263. #if defined(LT) || defined(RN)
  /* LT/RN: skip the K - KK steps not visited by the accumulate loops:
     AO += (K - KK) << (1 + BASE_SHIFT), BO += (K - KK) << (2 + BASE_SHIFT). */
  1264. sub TEMP, K, KK
  1265. slwi r0, TEMP, 1 + BASE_SHIFT
  1266. slwi TEMP, TEMP, 2 + BASE_SHIFT
  1267. add AO, AO, r0
  1268. add BO, BO, TEMP
  1269. #endif
  /* KK moves by the two rows just handled: down for LN, up for LT. */
  1270. #ifdef LN
  1271. subi KK, KK, 2
  1272. #endif
  1273. #ifdef LT
  1274. addi KK, KK, 2
  1275. #endif
  1276. .align 4
  /* LL(30): set-up for the 1-row remainder tile of the current 4-column panel.
     Runs only when bit 0 of M is set. Preloads four A values (f16-f19) and
     eight B values (f20-f27), positions AO/BO and loads CTR with the number of
     4-step unrolled iterations.
     NOTE(review): LFD is a macro defined outside this view; presumably a
     precision-dependent floating-point load - confirm. */
  1277. LL(30):
  /* I = M & 1; andi. records CR0, so ble skips to LL(39) when the bit is clear. */
  1278. andi. I, M, 1
  1279. ble LL(39)
  1280. #if defined(LT) || defined(RN)
  /* LT/RN: AO is used as-is, B values are read from the panel start B. */
  1281. LFD f16, 0 * SIZE(AO)
  1282. LFD f17, 1 * SIZE(AO)
  1283. LFD f18, 2 * SIZE(AO)
  1284. LFD f19, 3 * SIZE(AO)
  1285. LFD f20, 0 * SIZE(B)
  1286. LFD f21, 1 * SIZE(B)
  1287. LFD f22, 2 * SIZE(B)
  1288. LFD f23, 3 * SIZE(B)
  1289. LFD f24, 4 * SIZE(B)
  1290. LFD f25, 5 * SIZE(B)
  1291. LFD f26, 6 * SIZE(B)
  1292. LFD f27, 7 * SIZE(B)
  /* CTR = KK >> 2; CR0 from srawi. is consumed by the ble LL(35) below. */
  1293. srawi. r0, KK, 2
  1294. mtspr CTR, r0
  1295. mr BO, B
  1296. #else
  1297. #ifdef LN
  /* LN: step AORIG back by K << BASE_SHIFT bytes. */
  1298. slwi r0, K, BASE_SHIFT
  1299. sub AORIG, AORIG, r0
  1300. #endif
  /* AO = AORIG + (KK << BASE_SHIFT), BO = B + (KK << (2 + BASE_SHIFT)),
     then TEMP = K - KK is the number of steps to accumulate. */
  1301. slwi r0, KK, 0 + BASE_SHIFT
  1302. slwi TEMP, KK, 2 + BASE_SHIFT
  1303. add AO, AORIG, r0
  1304. add BO, B, TEMP
  1305. sub TEMP, K, KK
  1306. LFD f16, 0 * SIZE(AO)
  1307. LFD f17, 1 * SIZE(AO)
  1308. LFD f18, 2 * SIZE(AO)
  1309. LFD f19, 3 * SIZE(AO)
  1310. LFD f20, 0 * SIZE(BO)
  1311. LFD f21, 1 * SIZE(BO)
  1312. LFD f22, 2 * SIZE(BO)
  1313. LFD f23, 3 * SIZE(BO)
  1314. LFD f24, 4 * SIZE(BO)
  1315. LFD f25, 5 * SIZE(BO)
  1316. LFD f26, 6 * SIZE(BO)
  1317. LFD f27, 7 * SIZE(BO)
  /* CTR = TEMP >> 2; CR0 from srawi. feeds the ble below. */
  1318. srawi. r0, TEMP, 2
  1319. mtspr CTR, r0
  1320. #endif
  /* No full unrolled iteration: go straight to the remainder handling. */
  1321. ble LL(35)
  1322. .align 5
  /* LL(32): 1x4 accumulation loop, unrolled four steps per iteration.
     Each iteration consumes 4 A values and 16 B values (see the addi at the
     bottom). Steps 1 and 3 accumulate into f0,f4,f8,f12; steps 2 and 4
     accumulate into f1,f5,f9,f13. The two sets are summed by the fadd sequence
     after the loop.
     NOTE(review): FMADD is a macro defined outside this view; comments assume
     "FMADD d, a, b, c" computes d = a * b + c - confirm. */
  1323. LL(32):
  /* step 1: A value f16 times B values f20-f23 */
  1324. FMADD f0, f16, f20, f0
  1325. FMADD f4, f16, f21, f4
  1326. FMADD f8, f16, f22, f8
  1327. FMADD f12, f16, f23, f12
  /* B values for step 3 */
  1328. LFD f20, 8 * SIZE(BO)
  1329. LFD f21, 9 * SIZE(BO)
  1330. LFD f22, 10 * SIZE(BO)
  1331. LFD f23, 11 * SIZE(BO)
  /* step 2: A value f17 times B values f24-f27, into the second accumulator set */
  1332. FMADD f1, f17, f24, f1
  1333. FMADD f5, f17, f25, f5
  1334. FMADD f9, f17, f26, f9
  1335. FMADD f13, f17, f27, f13
  /* B values for step 4 */
  1336. LFD f24, 12 * SIZE(BO)
  1337. LFD f25, 13 * SIZE(BO)
  1338. LFD f26, 14 * SIZE(BO)
  1339. LFD f27, 15 * SIZE(BO)
  /* step 3: A value f18 */
  1340. FMADD f0, f18, f20, f0
  1341. FMADD f4, f18, f21, f4
  1342. FMADD f8, f18, f22, f8
  1343. FMADD f12, f18, f23, f12
  /* preload B values for step 1 of the next iteration (or for LL(36)) */
  1344. LFD f20, 16 * SIZE(BO)
  1345. LFD f21, 17 * SIZE(BO)
  1346. LFD f22, 18 * SIZE(BO)
  1347. LFD f23, 19 * SIZE(BO)
  /* step 4: A value f19 */
  1348. FMADD f1, f19, f24, f1
  1349. FMADD f5, f19, f25, f5
  1350. FMADD f9, f19, f26, f9
  1351. FMADD f13, f19, f27, f13
  /* preload A and remaining B values for the next iteration */
  1352. LFD f16, 4 * SIZE(AO)
  1353. LFD f17, 5 * SIZE(AO)
  1354. LFD f18, 6 * SIZE(AO)
  1355. LFD f19, 7 * SIZE(AO)
  1356. LFD f24, 20 * SIZE(BO)
  1357. LFD f25, 21 * SIZE(BO)
  1358. LFD f26, 22 * SIZE(BO)
  1359. LFD f27, 23 * SIZE(BO)
  /* advance past the 4 A and 16 B values consumed */
  1360. addi AO, AO, 4 * SIZE
  1361. addi BO, BO, 16 * SIZE
  /* NOTE(review): DCBT is a macro defined elsewhere; presumably a data-cache
     prefetch relative to BO - confirm. */
  1362. DCBT(BO, PREB)
  /* decrement CTR and loop while it is non-zero */
  1363. bdnz LL(32)
  /* Merge the second accumulator set into the first; reached only by falling
     out of the loop. */
  1364. fadd f0, f1, f0
  1365. fadd f4, f5, f4
  1366. fadd f8, f9, f8
  1367. fadd f12, f13, f12
  1368. .align 4
  /* LL(35): load CTR with the 0-3 steps left over after the 4x-unrolled loop.
     The count is the low two bits of KK (LT/RN) or of TEMP (other variants). */
  1369. LL(35):
  1370. #if defined(LT) || defined(RN)
  1371. andi. r0, KK, 3
  1372. #else
  1373. andi. r0, TEMP, 3
  1374. #endif
  1375. mtspr CTR, r0
  /* CR0 comes from andi.; skip the remainder loop when no steps are left
     ('+' is the assembler's branch-prediction hint suffix). */
  1376. ble+ LL(38)
  1377. .align 4
  /* LL(36): remainder loop for the 1x4 tile, one step per iteration.
     Multiplies the A value f16 by the four B values f20-f23 and accumulates
     into f0,f4,f8,f12, then reloads the next A value and B quad.
     NOTE(review): assumes "FMADD d, a, b, c" computes d = a * b + c - confirm
     against the macro definition. */
  1378. LL(36):
  1379. FMADD f0, f16, f20, f0
  1380. FMADD f4, f16, f21, f4
  1381. FMADD f8, f16, f22, f8
  1382. FMADD f12, f16, f23, f12
  /* operands for the next step */
  1383. LFD f16, 1 * SIZE(AO)
  1384. LFD f20, 4 * SIZE(BO)
  1385. LFD f21, 5 * SIZE(BO)
  1386. LFD f22, 6 * SIZE(BO)
  1387. LFD f23, 7 * SIZE(BO)
  /* advance by one step: 4 B values and 1 A value */
  1388. addi BO, BO, 4 * SIZE
  1389. addi AO, AO, 1 * SIZE
  /* decrement CTR and loop while it is non-zero */
  1390. bdnz LL(36)
  1391. .align 4
  /* LL(38): solve and store for the 1x4 tile.
     The accumulated products (f0, f4, f8, f12) are subtracted from the values
     loaded from the packed buffer, the variant-specific substitution (LN, LT,
     RN or RT) is applied, and the result is written back to the packed buffer
     and to CO1..CO4. Afterwards the accumulators used by this tile are
     cleared and the pointers / KK are updated.
     NOTE(review): FSUB, FMUL, FNMSUB, LFD and STFD are macros defined outside
     this view. Comments assume "FSUB d, a, b" is d = a - b, "FMUL d, a, b" is
     d = a * b and "FNMSUB d, a, c, b" is d = b - a * c - confirm.
     NOTE(review): diagonal entries are applied with FMUL, so they are
     presumably stored pre-inverted by the packing code - confirm. */
  1392. LL(38):
  1393. #if defined(LN) || defined(RT)
  /* LN/RT: re-derive AO and BO from AORIG and B using KK minus the tile
     extent (1 for LN, 4 for RT). */
  1394. #ifdef LN
  1395. subi r0, KK, 1
  1396. #else
  1397. subi r0, KK, 4
  1398. #endif
  1399. slwi TEMP, r0, 0 + BASE_SHIFT
  1400. slwi r0, r0, 2 + BASE_SHIFT
  1401. add AO, AORIG, TEMP
  1402. add BO, B, r0
  1403. #endif
  1404. #if defined(LN) || defined(LT)
  /* LN/LT: the four values come from BO. */
  1405. LFD f16, 0 * SIZE(BO)
  1406. LFD f17, 1 * SIZE(BO)
  1407. LFD f18, 2 * SIZE(BO)
  1408. LFD f19, 3 * SIZE(BO)
  1409. FSUB f0, f16, f0
  1410. FSUB f4, f17, f4
  1411. FSUB f8, f18, f8
  1412. FSUB f12, f19, f12
  1413. #else
  /* RN/RT: the four values come from AO. */
  1414. LFD f16, 0 * SIZE(AO)
  1415. LFD f20, 1 * SIZE(AO)
  1416. LFD f24, 2 * SIZE(AO)
  1417. LFD f28, 3 * SIZE(AO)
  1418. FSUB f0, f16, f0
  1419. FSUB f4, f20, f4
  1420. FSUB f8, f24, f8
  1421. FSUB f12, f28, f12
  1422. #endif
  1423. #ifdef LN
  /* LN: single row, so only a scale by AO[0]. */
  1424. LFD f21, 0 * SIZE(AO)
  1425. FMUL f0, f21, f0
  1426. FMUL f4, f21, f4
  1427. FMUL f8, f21, f8
  1428. FMUL f12, f21, f12
  1429. #endif
  1430. #ifdef LT
  /* LT: single row, so only a scale by AO[0]. */
  1431. LFD f16, 0 * SIZE(AO)
  1432. FMUL f0, f16, f0
  1433. FMUL f4, f16, f4
  1434. FMUL f8, f16, f8
  1435. FMUL f12, f16, f12
  1436. #endif
  1437. #ifdef RN
  /* RN: 4x4 substitution over the columns, front to back, using BO[0..3],
     BO[5..7], BO[10..11] and BO[15]. */
  1438. LFD f16, 0 * SIZE(BO)
  1439. LFD f17, 1 * SIZE(BO)
  1440. LFD f18, 2 * SIZE(BO)
  1441. LFD f19, 3 * SIZE(BO)
  1442. FMUL f0, f16, f0
  1443. FNMSUB f4, f17, f0, f4
  1444. FNMSUB f8, f18, f0, f8
  1445. FNMSUB f12, f19, f0, f12
  1446. LFD f16, 5 * SIZE(BO)
  1447. LFD f17, 6 * SIZE(BO)
  1448. LFD f18, 7 * SIZE(BO)
  1449. LFD f19, 10 * SIZE(BO)
  1450. LFD f20, 11 * SIZE(BO)
  1451. LFD f21, 15 * SIZE(BO)
  1452. FMUL f4, f16, f4
  1453. FNMSUB f8, f17, f4, f8
  1454. FNMSUB f12, f18, f4, f12
  1455. FMUL f8, f19, f8
  1456. FNMSUB f12, f20, f8, f12
  1457. FMUL f12, f21, f12
  1458. #endif
  1459. #ifdef RT
  /* RT: 4x4 substitution over the columns, back to front, using BO[15..12],
     BO[10..8], BO[5..4] and BO[0]. */
  1460. LFD f16, 15 * SIZE(BO)
  1461. LFD f17, 14 * SIZE(BO)
  1462. LFD f18, 13 * SIZE(BO)
  1463. LFD f19, 12 * SIZE(BO)
  1464. FMUL f12, f16, f12
  1465. FNMSUB f8, f17, f12, f8
  1466. FNMSUB f4, f18, f12, f4
  1467. FNMSUB f0, f19, f12, f0
  1468. LFD f16, 10 * SIZE(BO)
  1469. LFD f17, 9 * SIZE(BO)
  1470. LFD f18, 8 * SIZE(BO)
  1471. LFD f19, 5 * SIZE(BO)
  1472. FMUL f8, f16, f8
  1473. LFD f20, 4 * SIZE(BO)
  1474. LFD f21, 0 * SIZE(BO)
  1475. FNMSUB f4, f17, f8, f4
  1476. FNMSUB f0, f18, f8, f0
  1477. FMUL f4, f19, f4
  1478. FNMSUB f0, f20, f4, f0
  1479. FMUL f0, f21, f0
  1480. #endif
  1481. #ifdef LN
  /* LN: move the output pointers back by one element before storing. */
  1482. subi CO1, CO1, 1 * SIZE
  1483. subi CO2, CO2, 1 * SIZE
  1484. subi CO3, CO3, 1 * SIZE
  1485. subi CO4, CO4, 1 * SIZE
  1486. #endif
  /* Write the solved values back to the packed buffer they were loaded from. */
  1487. #if defined(LN) || defined(LT)
  1488. STFD f0, 0 * SIZE(BO)
  1489. STFD f4, 1 * SIZE(BO)
  1490. STFD f8, 2 * SIZE(BO)
  1491. STFD f12, 3 * SIZE(BO)
  1492. #else
  1493. STFD f0, 0 * SIZE(AO)
  1494. STFD f4, 1 * SIZE(AO)
  1495. STFD f8, 2 * SIZE(AO)
  1496. STFD f12, 3 * SIZE(AO)
  1497. #endif
  /* Store the 1x4 result: one element to each of CO1..CO4. */
  1498. STFD f0, 0 * SIZE(CO1)
  1499. STFD f4, 0 * SIZE(CO2)
  1500. STFD f8, 0 * SIZE(CO3)
  1501. STFD f12, 0 * SIZE(CO4)
  /* Clear only the eight accumulators this tile's loops wrote
     (f0,f1,f4,f5,f8,f9,f12,f13). */
  1502. lfs f0, FZERO
  1503. fmr f1, f0
  1504. fmr f4, f0
  1505. fmr f5, f0
  1506. fmr f8, f0
  1507. fmr f9, f0
  1508. fmr f12, f0
  1509. fmr f13, f0
  1510. #ifndef LN
  /* Non-LN variants: advance the output pointers by one element. */
  1511. addi CO1, CO1, 1 * SIZE
  1512. addi CO2, CO2, 1 * SIZE
  1513. addi CO3, CO3, 1 * SIZE
  1514. addi CO4, CO4, 1 * SIZE
  1515. #endif
  1516. #ifdef RT
  /* RT: AORIG += K << BASE_SHIFT. */
  1517. slwi r0, K, 0 + BASE_SHIFT
  1518. add AORIG, AORIG, r0
  1519. #endif
  1520. #if defined(LT) || defined(RN)
  /* LT/RN: skip the K - KK steps not visited by the accumulate loops:
     AO += (K - KK) << BASE_SHIFT, BO += (K - KK) << (2 + BASE_SHIFT). */
  1521. sub TEMP, K, KK
  1522. slwi r0, TEMP, 0 + BASE_SHIFT
  1523. slwi TEMP, TEMP, 2 + BASE_SHIFT
  1524. add AO, AO, r0
  1525. add BO, BO, TEMP
  1526. #endif
  /* KK moves by the single row just handled: down for LN, up for LT. */
  1527. #ifdef LN
  1528. subi KK, KK, 1
  1529. #endif
  1530. #ifdef LT
  1531. addi KK, KK, 1
  1532. #endif
  1533. .align 4
  /* LL(39): end of one 4-column panel iteration.
     Advances B to the next panel, adjusts KK for the right-side variants,
     decrements the panel counter J and loops back to LL(10) (defined outside
     this view) while panels remain. */
  1534. LL(39):
  1535. #ifdef LN
  /* LN: B += K << (2 + BASE_SHIFT). */
  1536. slwi r0, K, 2 + BASE_SHIFT
  1537. add B, B, r0
  1538. #endif
  1539. #if defined(LT) || defined(RN)
  /* LT/RN: BO already sits past this panel, so B simply takes its value. */
  1540. mr B, BO
  1541. #endif
  /* KK moves by the four columns just handled: up for RN, down for RT. */
  1542. #ifdef RN
  1543. addi KK, KK, 4
  1544. #endif
  1545. #ifdef RT
  1546. subi KK, KK, 4
  1547. #endif
  /* J -= 1 with CR0 recorded; lfs does not touch CR0, so bgt tests the new J.
     f0 is reloaded from FZERO on both the loop-back and the fall-through path. */
  1548. addic. J, J, -1
  1549. lfs f0, FZERO
  1550. bgt LL(10)
  1551. .align 4
  /* LL(40): entry for the 2-column panel.
     Runs only when bit 1 of N is set; otherwise control goes to LL(70)
     (outside this view). Sets up the two output column pointers, the starting
     KK, clears accumulators f1-f7 and computes the number of 4-row tiles. */
  1552. LL(40):
  /* J = N & 2; andi. records CR0, so ble skips the panel when the bit is clear. */
  1553. andi. J, N, 2
  1554. ble LL(70)
  1555. #ifdef RT
  /* RT: step B back by K << (1 + BASE_SHIFT) and C back by LDC << 1. */
  1556. slwi r0, K, 1 + BASE_SHIFT
  1557. sub B, B, r0
  1558. slwi r0, LDC, 1
  1559. sub C, C, r0
  1560. #endif
  /* CO1 and CO2 address the two output columns, LDC apart. */
  1561. mr CO1, C
  1562. add CO2, C, LDC
  1563. #ifdef LN
  /* LN: KK starts at M + OFFSET. */
  1564. add KK, M, OFFSET
  1565. #endif
  1566. #ifdef LT
  /* LT: KK starts at OFFSET. */
  1567. mr KK, OFFSET
  1568. #endif
  /* Copy f0 into f1-f7. NOTE(review): this relies on f0 already holding zero
     on entry; the fall-through from the preceding lfs f0, FZERO provides that,
     other entry paths are outside this view - confirm. */
  1569. fmr f1, f0
  1570. fmr f2, f0
  1571. fmr f3, f0
  1572. fmr f4, f0
  1573. fmr f5, f0
  1574. fmr f6, f0
  1575. fmr f7, f0
  /* I = M >> 2 (number of 4-row tiles). CR0 from srawi. is consumed by the
     ble LL(50) below; the mr/add instructions in between do not modify it. */
  1576. srawi. I, M, 2
  1577. #if defined(LN) || defined(RT)
  1578. mr AORIG, A
  1579. #else
  1580. mr AO, A
  1581. #endif
  1582. #ifndef RT
  /* Non-RT variants: advance C past the two columns of this panel. */
  1583. add C, CO2, LDC
  1584. #endif
  /* No full 4-row tile: continue with the 2-row remainder at LL(50). */
  1585. ble LL(50)
  1586. .align 4
  /* LL(41): set-up for one 4x2 tile of the 2-column panel.
     Preloads four A values (f16-f19) and four B values (f20-f23), issues dcbt
     cache-touch hints for the two output columns, positions AO/BO and loads
     CTR with the number of 4-step unrolled iterations.
     NOTE(review): LFD is a macro defined outside this view; presumably a
     precision-dependent floating-point load - confirm. */
  1587. LL(41):
  1588. #if defined(LT) || defined(RN)
  /* LT/RN: AO is used as-is, B values are read from the panel start B. */
  1589. LFD f16, 0 * SIZE(AO)
  1590. LFD f17, 1 * SIZE(AO)
  1591. LFD f18, 2 * SIZE(AO)
  1592. LFD f19, 3 * SIZE(AO)
  1593. LFD f20, 0 * SIZE(B)
  1594. LFD f21, 1 * SIZE(B)
  1595. LFD f22, 2 * SIZE(B)
  1596. LFD f23, 3 * SIZE(B)
  /* cache-touch hints at CO1 + PREC and CO2 + PREC */
  1597. dcbt CO1, PREC
  1598. dcbt CO2, PREC
  /* CTR = KK >> 2; CR0 from srawi. is consumed by the ble LL(45) below. */
  1599. srawi. r0, KK, 2
  1600. mtspr CTR, r0
  1601. mr BO, B
  1602. #else
  1603. #ifdef LN
  /* LN: step AORIG back by K << (2 + BASE_SHIFT) bytes. */
  1604. slwi r0, K, 2 + BASE_SHIFT
  1605. sub AORIG, AORIG, r0
  1606. #endif
  /* AO = AORIG + (KK << (2 + BASE_SHIFT)), BO = B + (KK << (1 + BASE_SHIFT)),
     then TEMP = K - KK is the number of steps to accumulate. */
  1607. slwi r0, KK, 2 + BASE_SHIFT
  1608. slwi TEMP, KK, 1 + BASE_SHIFT
  1609. add AO, AORIG, r0
  1610. add BO, B, TEMP
  1611. sub TEMP, K, KK
  1612. LFD f16, 0 * SIZE(AO)
  1613. LFD f17, 1 * SIZE(AO)
  1614. LFD f18, 2 * SIZE(AO)
  1615. LFD f19, 3 * SIZE(AO)
  1616. LFD f20, 0 * SIZE(BO)
  1617. LFD f21, 1 * SIZE(BO)
  1618. LFD f22, 2 * SIZE(BO)
  1619. LFD f23, 3 * SIZE(BO)
  /* cache-touch hints at CO1 + PREC and CO2 + PREC */
  1620. dcbt CO1, PREC
  1621. dcbt CO2, PREC
  /* CTR = TEMP >> 2; CR0 from srawi. feeds the ble below. */
  1622. srawi. r0, TEMP, 2
  1623. mtspr CTR, r0
  1624. #endif
  /* No full unrolled iteration: go straight to the remainder handling. */
  1625. ble LL(45)
  1626. .align 5
  /* LL(42): 4x2 accumulation loop, unrolled four steps per iteration.
     Each iteration consumes 16 A values and 8 B values (see the addi at the
     bottom). All four steps accumulate into the same registers: f0-f3 for the
     first B column and f4-f7 for the second.
     NOTE(review): FMADD is a macro defined outside this view; comments assume
     "FMADD d, a, b, c" computes d = a * b + c - confirm. */
  1627. LL(42):
  /* step 1: A quad f16-f19 times B values f20, f21 */
  1628. FMADD f0, f16, f20, f0
  1629. FMADD f1, f17, f20, f1
  1630. FMADD f2, f18, f20, f2
  1631. FMADD f3, f19, f20, f3
  1632. FMADD f4, f16, f21, f4
  1633. FMADD f5, f17, f21, f5
  1634. FMADD f6, f18, f21, f6
  1635. FMADD f7, f19, f21, f7
  /* A quad for step 2 */
  1636. LFD f16, 4 * SIZE(AO)
  1637. LFD f17, 5 * SIZE(AO)
  1638. LFD f18, 6 * SIZE(AO)
  1639. LFD f19, 7 * SIZE(AO)
  /* step 2: B values f22, f23 */
  1640. FMADD f0, f16, f22, f0
  1641. FMADD f1, f17, f22, f1
  1642. FMADD f2, f18, f22, f2
  1643. FMADD f3, f19, f22, f3
  1644. FMADD f4, f16, f23, f4
  1645. FMADD f5, f17, f23, f5
  1646. FMADD f6, f18, f23, f6
  1647. FMADD f7, f19, f23, f7
  /* A quad for step 3 and B values for steps 3 and 4 */
  1648. LFD f16, 8 * SIZE(AO)
  1649. LFD f17, 9 * SIZE(AO)
  1650. LFD f18, 10 * SIZE(AO)
  1651. LFD f19, 11 * SIZE(AO)
  1652. LFD f20, 4 * SIZE(BO)
  1653. LFD f21, 5 * SIZE(BO)
  1654. LFD f22, 6 * SIZE(BO)
  1655. LFD f23, 7 * SIZE(BO)
  /* step 3 */
  1656. FMADD f0, f16, f20, f0
  1657. FMADD f1, f17, f20, f1
  1658. FMADD f2, f18, f20, f2
  1659. FMADD f3, f19, f20, f3
  1660. FMADD f4, f16, f21, f4
  1661. FMADD f5, f17, f21, f5
  1662. FMADD f6, f18, f21, f6
  1663. FMADD f7, f19, f21, f7
  /* A quad for step 4 */
  1664. LFD f16, 12 * SIZE(AO)
  1665. LFD f17, 13 * SIZE(AO)
  1666. LFD f18, 14 * SIZE(AO)
  1667. LFD f19, 15 * SIZE(AO)
  /* step 4 */
  1668. FMADD f0, f16, f22, f0
  1669. FMADD f1, f17, f22, f1
  1670. FMADD f2, f18, f22, f2
  1671. FMADD f3, f19, f22, f3
  1672. FMADD f4, f16, f23, f4
  1673. FMADD f5, f17, f23, f5
  1674. FMADD f6, f18, f23, f6
  1675. FMADD f7, f19, f23, f7
  /* preload operands for the next iteration (or for LL(46)) */
  1676. LFD f16, 16 * SIZE(AO)
  1677. LFD f17, 17 * SIZE(AO)
  1678. LFD f18, 18 * SIZE(AO)
  1679. LFD f19, 19 * SIZE(AO)
  1680. LFD f20, 8 * SIZE(BO)
  1681. LFD f21, 9 * SIZE(BO)
  1682. LFD f22, 10 * SIZE(BO)
  1683. LFD f23, 11 * SIZE(BO)
  /* advance past the 16 A and 8 B values consumed */
  1684. addi AO, AO, 16 * SIZE
  1685. addi BO, BO, 8 * SIZE
  /* NOTE(review): DCBT is a macro defined elsewhere; presumably a data-cache
     prefetch relative to BO - confirm. */
  1686. DCBT(BO, PREB)
  /* decrement CTR and loop while it is non-zero */
  1687. bdnz LL(42)
  1688. .align 4
  /* LL(45): load CTR with the 0-3 steps left over after the 4x-unrolled loop.
     The count is the low two bits of KK (LT/RN) or of TEMP (other variants). */
  1689. LL(45):
  1690. #if defined(LT) || defined(RN)
  1691. andi. r0, KK, 3
  1692. #else
  1693. andi. r0, TEMP, 3
  1694. #endif
  1695. mtspr CTR, r0
  /* CR0 comes from andi.; skip the remainder loop when no steps are left
     ('+' is the assembler's branch-prediction hint suffix). */
  1696. ble+ LL(48)
  1697. .align 4
  /* LL(46): remainder loop for the 4x2 tile, one step per iteration.
     Multiplies the A quad f16-f19 by the B pair f20,f21 and accumulates into
     f0-f3 and f4-f7, then reloads the next A quad and B pair.
     NOTE(review): assumes "FMADD d, a, b, c" computes d = a * b + c - confirm
     against the macro definition. */
  1698. LL(46):
  1699. FMADD f0, f16, f20, f0
  1700. FMADD f1, f17, f20, f1
  1701. FMADD f2, f18, f20, f2
  1702. FMADD f3, f19, f20, f3
  1703. FMADD f4, f16, f21, f4
  1704. FMADD f5, f17, f21, f5
  1705. FMADD f6, f18, f21, f6
  1706. FMADD f7, f19, f21, f7
  /* operands for the next step */
  1707. LFD f16, 4 * SIZE(AO)
  1708. LFD f17, 5 * SIZE(AO)
  1709. LFD f18, 6 * SIZE(AO)
  1710. LFD f19, 7 * SIZE(AO)
  1711. LFD f20, 2 * SIZE(BO)
  1712. LFD f21, 3 * SIZE(BO)
  /* advance by one step: 2 B values and 4 A values */
  1713. addi BO, BO, 2 * SIZE
  1714. addi AO, AO, 4 * SIZE
  /* decrement CTR and loop while it is non-zero */
  1715. bdnz LL(46)
  1716. .align 4
  /* LL(48): solve and store for the 4x2 tile.
     The accumulated products (f0-f3 for the first column, f4-f7 for the
     second) are subtracted from the values loaded from the packed buffer, the
     variant-specific substitution (LN, LT, RN or RT) is applied, and the
     result is written back to the packed buffer and to CO1/CO2. Afterwards
     f0-f7 are cleared, pointers and KK are updated, and the tile counter I is
     decremented to loop back to LL(41).
     NOTE(review): FSUB, FMUL, FNMSUB, LFD and STFD are macros defined outside
     this view. Comments assume "FSUB d, a, b" is d = a - b, "FMUL d, a, b" is
     d = a * b and "FNMSUB d, a, c, b" is d = b - a * c - confirm.
     NOTE(review): diagonal entries are applied with FMUL, so they are
     presumably stored pre-inverted by the packing code - confirm. */
  1717. LL(48):
  1718. #if defined(LN) || defined(RT)
  /* LN/RT: re-derive AO and BO from AORIG and B using KK minus the tile
     extent (4 for LN, 2 for RT). */
  1719. #ifdef LN
  1720. subi r0, KK, 4
  1721. #else
  1722. subi r0, KK, 2
  1723. #endif
  1724. slwi TEMP, r0, 2 + BASE_SHIFT
  1725. slwi r0, r0, 1 + BASE_SHIFT
  1726. add AO, AORIG, TEMP
  1727. add BO, B, r0
  1728. #endif
  1729. #if defined(LN) || defined(LT)
  /* LN/LT: the eight values come from BO, in the order
     f0,f4,f1,f5,f2,f6,f3,f7. */
  1730. LFD f16, 0 * SIZE(BO)
  1731. LFD f17, 1 * SIZE(BO)
  1732. LFD f20, 2 * SIZE(BO)
  1733. LFD f21, 3 * SIZE(BO)
  1734. LFD f24, 4 * SIZE(BO)
  1735. LFD f25, 5 * SIZE(BO)
  1736. LFD f28, 6 * SIZE(BO)
  1737. LFD f29, 7 * SIZE(BO)
  1738. FSUB f0, f16, f0
  1739. FSUB f4, f17, f4
  1740. FSUB f1, f20, f1
  1741. FSUB f5, f21, f5
  1742. FSUB f2, f24, f2
  1743. FSUB f6, f25, f6
  1744. FSUB f3, f28, f3
  1745. FSUB f7, f29, f7
  1746. #else
  /* RN/RT: the eight values come from AO, in the order f0..f7. */
  1747. LFD f16, 0 * SIZE(AO)
  1748. LFD f17, 1 * SIZE(AO)
  1749. LFD f18, 2 * SIZE(AO)
  1750. LFD f19, 3 * SIZE(AO)
  1751. LFD f20, 4 * SIZE(AO)
  1752. LFD f21, 5 * SIZE(AO)
  1753. LFD f22, 6 * SIZE(AO)
  1754. LFD f23, 7 * SIZE(AO)
  1755. FSUB f0, f16, f0
  1756. FSUB f1, f17, f1
  1757. FSUB f2, f18, f2
  1758. FSUB f3, f19, f3
  1759. FSUB f4, f20, f4
  1760. FSUB f5, f21, f5
  1761. FSUB f6, f22, f6
  1762. FSUB f7, f23, f7
  1763. #endif
  1764. #ifdef LN
  /* LN: 4x4 substitution over the rows, last row first, using AO[15..12],
     AO[10..8], AO[5..4] and AO[0]. */
  1765. LFD f16, 15 * SIZE(AO)
  1766. LFD f17, 14 * SIZE(AO)
  1767. LFD f18, 13 * SIZE(AO)
  1768. LFD f19, 12 * SIZE(AO)
  1769. FMUL f3, f16, f3
  1770. FMUL f7, f16, f7
  1771. FNMSUB f2, f17, f3, f2
  1772. FNMSUB f6, f17, f7, f6
  1773. FNMSUB f1, f18, f3, f1
  1774. FNMSUB f5, f18, f7, f5
  1775. FNMSUB f0, f19, f3, f0
  1776. FNMSUB f4, f19, f7, f4
  1777. LFD f16, 10 * SIZE(AO)
  1778. LFD f17, 9 * SIZE(AO)
  1779. LFD f18, 8 * SIZE(AO)
  1780. LFD f19, 5 * SIZE(AO)
  1781. LFD f20, 4 * SIZE(AO)
  1782. LFD f21, 0 * SIZE(AO)
  1783. FMUL f2, f16, f2
  1784. FMUL f6, f16, f6
  1785. FNMSUB f1, f17, f2, f1
  1786. FNMSUB f5, f17, f6, f5
  1787. FNMSUB f0, f18, f2, f0
  1788. FNMSUB f4, f18, f6, f4
  1789. FMUL f1, f19, f1
  1790. FMUL f5, f19, f5
  1791. FNMSUB f0, f20, f1, f0
  1792. FNMSUB f4, f20, f5, f4
  1793. FMUL f0, f21, f0
  1794. FMUL f4, f21, f4
  1795. #endif
  1796. #ifdef LT
  /* LT: 4x4 substitution over the rows, first row first, using AO[0..3],
     AO[5..7], AO[10..11] and AO[15]. */
  1797. LFD f16, 0 * SIZE(AO)
  1798. LFD f17, 1 * SIZE(AO)
  1799. LFD f18, 2 * SIZE(AO)
  1800. LFD f19, 3 * SIZE(AO)
  1801. FMUL f0, f16, f0
  1802. FMUL f4, f16, f4
  1803. FNMSUB f1, f17, f0, f1
  1804. FNMSUB f5, f17, f4, f5
  1805. FNMSUB f2, f18, f0, f2
  1806. FNMSUB f6, f18, f4, f6
  1807. FNMSUB f3, f19, f0, f3
  1808. FNMSUB f7, f19, f4, f7
  1809. LFD f17, 5 * SIZE(AO)
  1810. LFD f18, 6 * SIZE(AO)
  1811. LFD f19, 7 * SIZE(AO)
  1812. FMUL f1, f17, f1
  1813. FMUL f5, f17, f5
  1814. FNMSUB f2, f18, f1, f2
  1815. FNMSUB f6, f18, f5, f6
  1816. FNMSUB f3, f19, f1, f3
  1817. FNMSUB f7, f19, f5, f7
  1818. LFD f18, 10 * SIZE(AO)
  1819. LFD f19, 11 * SIZE(AO)
  1820. FMUL f2, f18, f2
  1821. FMUL f6, f18, f6
  1822. FNMSUB f3, f19, f2, f3
  1823. FNMSUB f7, f19, f6, f7
  1824. LFD f19, 15 * SIZE(AO)
  1825. FMUL f3, f19, f3
  1826. FMUL f7, f19, f7
  1827. #endif
  1828. #ifdef RN
  /* RN: 2x2 substitution over the columns using BO[0], BO[1], BO[3];
     first column first, then eliminate it from the second. */
  1829. LFD f16, 0 * SIZE(BO)
  1830. LFD f17, 1 * SIZE(BO)
  1831. LFD f18, 3 * SIZE(BO)
  1832. FMUL f0, f16, f0
  1833. FMUL f1, f16, f1
  1834. FMUL f2, f16, f2
  1835. FMUL f3, f16, f3
  1836. FNMSUB f4, f17, f0, f4
  1837. FNMSUB f5, f17, f1, f5
  1838. FNMSUB f6, f17, f2, f6
  1839. FNMSUB f7, f17, f3, f7
  1840. FMUL f4, f18, f4
  1841. FMUL f5, f18, f5
  1842. FMUL f6, f18, f6
  1843. FMUL f7, f18, f7
  1844. #endif
  1845. #ifdef RT
  /* RT: 2x2 substitution over the columns using BO[3], BO[2], BO[0];
     second column first, then eliminate it from the first. */
  1846. LFD f19, 3 * SIZE(BO)
  1847. LFD f20, 2 * SIZE(BO)
  1848. LFD f21, 0 * SIZE(BO)
  1849. FMUL f4, f19, f4
  1850. FMUL f5, f19, f5
  1851. FMUL f6, f19, f6
  1852. FMUL f7, f19, f7
  1853. FNMSUB f0, f20, f4, f0
  1854. FNMSUB f1, f20, f5, f1
  1855. FNMSUB f2, f20, f6, f2
  1856. FNMSUB f3, f20, f7, f3
  1857. FMUL f0, f21, f0
  1858. FMUL f1, f21, f1
  1859. FMUL f2, f21, f2
  1860. FMUL f3, f21, f3
  1861. #endif
  1862. #ifdef LN
  /* LN: move the output pointers back by four elements before storing. */
  1863. subi CO1, CO1, 4 * SIZE
  1864. subi CO2, CO2, 4 * SIZE
  1865. #endif
  /* Write the solved values back to the packed buffer, in the same order in
     which they were loaded above. */
  1866. #if defined(LN) || defined(LT)
  1867. STFD f0, 0 * SIZE(BO)
  1868. STFD f4, 1 * SIZE(BO)
  1869. STFD f1, 2 * SIZE(BO)
  1870. STFD f5, 3 * SIZE(BO)
  1871. STFD f2, 4 * SIZE(BO)
  1872. STFD f6, 5 * SIZE(BO)
  1873. STFD f3, 6 * SIZE(BO)
  1874. STFD f7, 7 * SIZE(BO)
  1875. #else
  1876. STFD f0, 0 * SIZE(AO)
  1877. STFD f1, 1 * SIZE(AO)
  1878. STFD f2, 2 * SIZE(AO)
  1879. STFD f3, 3 * SIZE(AO)
  1880. STFD f4, 4 * SIZE(AO)
  1881. STFD f5, 5 * SIZE(AO)
  1882. STFD f6, 6 * SIZE(AO)
  1883. STFD f7, 7 * SIZE(AO)
  1884. #endif
  /* Store the 4x2 result: four elements to each of CO1 and CO2. */
  1885. STFD f0, 0 * SIZE(CO1)
  1886. STFD f1, 1 * SIZE(CO1)
  1887. STFD f2, 2 * SIZE(CO1)
  1888. STFD f3, 3 * SIZE(CO1)
  1889. STFD f4, 0 * SIZE(CO2)
  1890. STFD f5, 1 * SIZE(CO2)
  1891. STFD f6, 2 * SIZE(CO2)
  1892. STFD f7, 3 * SIZE(CO2)
  /* Reset the eight accumulators used by this tile from FZERO. */
  1893. lfs f0, FZERO
  1894. fmr f1, f0
  1895. fmr f2, f0
  1896. fmr f3, f0
  1897. fmr f4, f0
  1898. fmr f5, f0
  1899. fmr f6, f0
  1900. fmr f7, f0
  1901. #ifndef LN
  /* Non-LN variants: advance the output pointers by four elements. */
  1902. addi CO1, CO1, 4 * SIZE
  1903. addi CO2, CO2, 4 * SIZE
  1904. #endif
  1905. #ifdef RT
  /* RT: AORIG += K << (2 + BASE_SHIFT). */
  1906. slwi r0, K, 2 + BASE_SHIFT
  1907. add AORIG, AORIG, r0
  1908. #endif
  1909. #if defined(LT) || defined(RN)
  /* LT/RN: skip the K - KK steps not visited by the accumulate loops:
     AO += (K - KK) << (2 + BASE_SHIFT), BO += (K - KK) << (1 + BASE_SHIFT). */
  1910. sub TEMP, K, KK
  1911. slwi r0, TEMP, 2 + BASE_SHIFT
  1912. slwi TEMP, TEMP, 1 + BASE_SHIFT
  1913. add AO, AO, r0
  1914. add BO, BO, TEMP
  1915. #endif
  /* KK moves by the four rows just handled: down for LN, up for LT. */
  1916. #ifdef LN
  1917. subi KK, KK, 4
  1918. #endif
  1919. #ifdef LT
  1920. addi KK, KK, 4
  1921. #endif
  /* I -= 1 with CR0 recorded; loop back for the next 4-row tile while I > 0
     ('+' is the assembler's branch-prediction hint suffix). */
  1922. addic. I, I, -1
  1923. bgt+ LL(41)
  1924. .align 4
  /*
   * LL(50): set-up for the tile taken when bit 1 of M is set (results are
   * later stored to CO1 and CO2); otherwise control goes to LL(60).
   * Preloads four A values (f16-f19) and eight B values (f20-f27) and puts
   * count >> 2 into CTR for the 4-step unrolled loop LL(52):
   *   LT/RN: BO = B, count = KK
   *   LN/RT: AO = AORIG + (KK << (1 + BASE_SHIFT)),
   *          BO = B     + (KK << (1 + BASE_SHIFT)),
   *          count = K - KK (left in TEMP for LL(55))
   * In LN, AORIG is first moved back by K << (1 + BASE_SHIFT).
   * The record-form srawi. sets CR0, and the closing ble skips LL(52) when
   * count >> 2 is not positive.
   * NOTE(review): SIZE and BASE_SHIFT are defined outside this view;
   * presumably the element size and its log2 - confirm.
   */
  1925. LL(50):
  1926. andi. I, M, 2
  1927. ble LL(60)
  1928. #if defined(LT) || defined(RN)
  1929. LFD f16, 0 * SIZE(AO)
  1930. LFD f17, 1 * SIZE(AO)
  1931. LFD f18, 2 * SIZE(AO)
  1932. LFD f19, 3 * SIZE(AO)
  1933. LFD f20, 0 * SIZE(B)
  1934. LFD f21, 1 * SIZE(B)
  1935. LFD f22, 2 * SIZE(B)
  1936. LFD f23, 3 * SIZE(B)
  1937. LFD f24, 4 * SIZE(B)
  1938. LFD f25, 5 * SIZE(B)
  1939. LFD f26, 6 * SIZE(B)
  1940. LFD f27, 7 * SIZE(B)
  1941. srawi. r0, KK, 2
  1942. mtspr CTR, r0
  1943. mr BO, B
  1944. #else
  1945. #ifdef LN
  1946. slwi r0, K, 1 + BASE_SHIFT
  1947. sub AORIG, AORIG, r0
  1948. #endif
  1949. slwi r0, KK, 1 + BASE_SHIFT
  1950. slwi TEMP, KK, 1 + BASE_SHIFT
  1951. add AO, AORIG, r0
  1952. add BO, B, TEMP
  1953. sub TEMP, K, KK
  1954. LFD f16, 0 * SIZE(AO)
  1955. LFD f17, 1 * SIZE(AO)
  1956. LFD f18, 2 * SIZE(AO)
  1957. LFD f19, 3 * SIZE(AO)
  1958. LFD f20, 0 * SIZE(BO)
  1959. LFD f21, 1 * SIZE(BO)
  1960. LFD f22, 2 * SIZE(BO)
  1961. LFD f23, 3 * SIZE(BO)
  1962. LFD f24, 4 * SIZE(BO)
  1963. LFD f25, 5 * SIZE(BO)
  1964. LFD f26, 6 * SIZE(BO)
  1965. LFD f27, 7 * SIZE(BO)
  1966. srawi. r0, TEMP, 2
  1967. mtspr CTR, r0
  1968. #endif
  1969. ble LL(55)
  1970. .align 5
  /*
   * LL(52): CTR-driven multiply-accumulate loop, four k-steps per pass.
   * Each step consumes two A values and two B values (4 FMADDs).
   * Steps 0 and 2 accumulate into f0-f3, steps 1 and 3 into f4-f7; the two
   * sets are added together at LL(58).
   * Loads are issued ahead of use: after the pointer bump (AO and BO each
   * advance by 8 elements) f16-f19 hold AO[0..3] and f20-f27 hold BO[0..7],
   * ready for the next pass or for the remainder loop LL(56).
   * NOTE(review): DCBT(BO, PREB) is a macro defined outside this view;
   * presumably a data-cache prefetch hint - confirm.
   */
  1971. LL(52):
  /* k-step 0 -> f0-f3, k-step 1 -> f4-f7 */
  1972. FMADD f0, f16, f20, f0
  1973. FMADD f1, f17, f20, f1
  1974. FMADD f2, f16, f21, f2
  1975. FMADD f3, f17, f21, f3
  1976. FMADD f4, f18, f22, f4
  1977. FMADD f5, f19, f22, f5
  1978. FMADD f6, f18, f23, f6
  1979. FMADD f7, f19, f23, f7
  1980. LFD f16, 4 * SIZE(AO)
  1981. LFD f17, 5 * SIZE(AO)
  1982. LFD f18, 6 * SIZE(AO)
  1983. LFD f19, 7 * SIZE(AO)
  1984. LFD f20, 8 * SIZE(BO)
  1985. LFD f21, 9 * SIZE(BO)
  1986. LFD f22, 10 * SIZE(BO)
  1987. LFD f23, 11 * SIZE(BO)
  /* k-step 2 -> f0-f3, k-step 3 -> f4-f7 (B operands preloaded in f24-f27) */
  1988. FMADD f0, f16, f24, f0
  1989. FMADD f1, f17, f24, f1
  1990. FMADD f2, f16, f25, f2
  1991. FMADD f3, f17, f25, f3
  1992. FMADD f4, f18, f26, f4
  1993. FMADD f5, f19, f26, f5
  1994. FMADD f6, f18, f27, f6
  1995. FMADD f7, f19, f27, f7
  1996. LFD f16, 8 * SIZE(AO)
  1997. LFD f17, 9 * SIZE(AO)
  1998. LFD f18, 10 * SIZE(AO)
  1999. LFD f19, 11 * SIZE(AO)
  2000. LFD f24, 12 * SIZE(BO)
  2001. LFD f25, 13 * SIZE(BO)
  2002. LFD f26, 14 * SIZE(BO)
  2003. LFD f27, 15 * SIZE(BO)
  2004. addi AO, AO, 8 * SIZE
  2005. addi BO, BO, 8 * SIZE
  2006. DCBT(BO, PREB)
  2007. bdnz LL(52)
  2008. .align 4
  /*
   * LL(55): remainder of the k loop. CTR = count & 3, where count is KK for
   * LT/RN and K - KK (still in TEMP) for LN/RT; andi. sets CR0 so that ble+
   * goes straight to LL(58) when nothing is left.
   * LL(56): one k-step per pass - four FMADDs into f0-f3, then the next A
   * pair and B pair are loaded and AO/BO advance by 2 elements each.
   */
  2009. LL(55):
  2010. #if defined(LT) || defined(RN)
  2011. andi. r0, KK, 3
  2012. #else
  2013. andi. r0, TEMP, 3
  2014. #endif
  2015. mtspr CTR, r0
  2016. ble+ LL(58)
  2017. .align 4
  2018. LL(56):
  2019. FMADD f0, f16, f20, f0
  2020. FMADD f1, f17, f20, f1
  2021. FMADD f2, f16, f21, f2
  2022. FMADD f3, f17, f21, f3
  2023. LFD f16, 2 * SIZE(AO)
  2024. LFD f17, 3 * SIZE(AO)
  2025. LFD f20, 2 * SIZE(BO)
  2026. LFD f21, 3 * SIZE(BO)
  2027. addi BO, BO, 2 * SIZE
  2028. addi AO, AO, 2 * SIZE
  2029. bdnz LL(56)
  2030. .align 4
  /*
   * LL(58): finish the tile whose products were accumulated by LL(52)/LL(56).
   *  1. Fold the odd-step accumulators f4-f7 into f0-f3.
   *  2. LN/RT: reposition AO = AORIG + ((KK - 2) << (1 + BASE_SHIFT)) and
   *     BO = B + ((KK - 2) << (1 + BASE_SHIFT)). Both arms of the inner
   *     #ifdef LN subtract 2.
   *  3. Load four values from BO (LN/LT, order f0,f2,f1,f3) or from AO
   *     (RN/RT, order f0,f1,f2,f3) and subtract the accumulators from them.
   *  4. Apply the variant-specific 2x2 solve using coefficients at offsets
   *     0, 1/2 and 3 of AO (LN/LT) or BO (RN/RT).
   *  5. Write the results back to BO (LN/LT) or AO (RN/RT) and to CO1/CO2.
   *  6. Clear f0-f7 and advance CO1/CO2, AO/BO (LT/RN), AORIG (RT) and
   *     KK (LN/LT) for the next tile.
   * NOTE(review): the diagonal coefficients are applied with FMUL, so they
   * are presumably stored pre-inverted by the packing code - confirm.
   * NOTE(review): FSUB d,a,b and FNMSUB d,a,c,b are macros from outside
   * this view; presumably d = a - b and d = b - a*c respectively.
   */
  2031. LL(58):
  2032. FADD f0, f4, f0
  2033. FADD f1, f5, f1
  2034. FADD f2, f6, f2
  2035. FADD f3, f7, f3
  2036. #if defined(LN) || defined(RT)
  2037. #ifdef LN
  2038. subi r0, KK, 2
  2039. #else
  2040. subi r0, KK, 2
  2041. #endif
  2042. slwi TEMP, r0, 1 + BASE_SHIFT
  2043. slwi r0, r0, 1 + BASE_SHIFT
  2044. add AO, AORIG, TEMP
  2045. add BO, B, r0
  2046. #endif
  2047. #if defined(LN) || defined(LT)
  /* BO holds the values interleaved: f0, f2, f1, f3 */
  2048. LFD f16, 0 * SIZE(BO)
  2049. LFD f17, 1 * SIZE(BO)
  2050. LFD f20, 2 * SIZE(BO)
  2051. LFD f21, 3 * SIZE(BO)
  2052. FSUB f0, f16, f0
  2053. FSUB f2, f17, f2
  2054. FSUB f1, f20, f1
  2055. FSUB f3, f21, f3
  2056. #else
  2057. LFD f16, 0 * SIZE(AO)
  2058. LFD f17, 1 * SIZE(AO)
  2059. LFD f20, 2 * SIZE(AO)
  2060. LFD f21, 3 * SIZE(AO)
  2061. FSUB f0, f16, f0
  2062. FSUB f1, f17, f1
  2063. FSUB f2, f20, f2
  2064. FSUB f3, f21, f3
  2065. #endif
  2066. #ifdef LN
  /* LN: scale f1/f3 by AO[3], eliminate into f0/f2 with AO[2], scale by AO[0] */
  2067. LFD f19, 3 * SIZE(AO)
  2068. LFD f20, 2 * SIZE(AO)
  2069. LFD f21, 0 * SIZE(AO)
  2070. FMUL f1, f19, f1
  2071. FMUL f3, f19, f3
  2072. FNMSUB f0, f20, f1, f0
  2073. FNMSUB f2, f20, f3, f2
  2074. FMUL f0, f21, f0
  2075. FMUL f2, f21, f2
  2076. #endif
  2077. #ifdef LT
  /* LT: scale f0/f2 by AO[0], eliminate into f1/f3 with AO[1], scale by AO[3] */
  2078. LFD f16, 0 * SIZE(AO)
  2079. LFD f17, 1 * SIZE(AO)
  2080. FMUL f0, f16, f0
  2081. FMUL f2, f16, f2
  2082. FNMSUB f1, f17, f0, f1
  2083. FNMSUB f3, f17, f2, f3
  2084. LFD f17, 3 * SIZE(AO)
  2085. FMUL f1, f17, f1
  2086. FMUL f3, f17, f3
  2087. #endif
  2088. #ifdef RN
  /* RN: scale f0/f1 by BO[0], eliminate into f2/f3 with BO[1], scale by BO[3] */
  2089. LFD f16, 0 * SIZE(BO)
  2090. LFD f17, 1 * SIZE(BO)
  2091. LFD f18, 3 * SIZE(BO)
  2092. FMUL f0, f16, f0
  2093. FMUL f1, f16, f1
  2094. FNMSUB f2, f17, f0, f2
  2095. FNMSUB f3, f17, f1, f3
  2096. FMUL f2, f18, f2
  2097. FMUL f3, f18, f3
  2098. #endif
  2099. #ifdef RT
  /* RT: scale f2/f3 by BO[3], eliminate into f0/f1 with BO[2], scale by BO[0] */
  2100. LFD f19, 3 * SIZE(BO)
  2101. LFD f20, 2 * SIZE(BO)
  2102. LFD f21, 0 * SIZE(BO)
  2103. FMUL f2, f19, f2
  2104. FMUL f3, f19, f3
  2105. FNMSUB f0, f20, f2, f0
  2106. FNMSUB f1, f20, f3, f1
  2107. FMUL f0, f21, f0
  2108. FMUL f1, f21, f1
  2109. #endif
  2110. #ifdef LN
  /* LN steps CO1/CO2 back before the store; the other variants advance after it */
  2111. subi CO1, CO1, 2 * SIZE
  2112. subi CO2, CO2, 2 * SIZE
  2113. #endif
  2114. #if defined(LN) || defined(LT)
  2115. STFD f0, 0 * SIZE(BO)
  2116. STFD f2, 1 * SIZE(BO)
  2117. STFD f1, 2 * SIZE(BO)
  2118. STFD f3, 3 * SIZE(BO)
  2119. #else
  2120. STFD f0, 0 * SIZE(AO)
  2121. STFD f1, 1 * SIZE(AO)
  2122. STFD f2, 2 * SIZE(AO)
  2123. STFD f3, 3 * SIZE(AO)
  2124. #endif
  2125. STFD f0, 0 * SIZE(CO1)
  2126. STFD f1, 1 * SIZE(CO1)
  2127. STFD f2, 0 * SIZE(CO2)
  2128. STFD f3, 1 * SIZE(CO2)
  /* reset all eight accumulators from the FZERO constant */
  2129. lfs f0, FZERO
  2130. fmr f1, f0
  2131. fmr f2, f0
  2132. fmr f3, f0
  2133. fmr f4, f0
  2134. fmr f5, f0
  2135. fmr f6, f0
  2136. fmr f7, f0
  2137. #ifndef LN
  2138. addi CO1, CO1, 2 * SIZE
  2139. addi CO2, CO2, 2 * SIZE
  2140. #endif
  2141. #ifdef RT
  2142. slwi r0, K, 1 + BASE_SHIFT
  2143. add AORIG, AORIG, r0
  2144. #endif
  2145. #if defined(LT) || defined(RN)
  /* skip the remaining K - KK steps of both packed operands */
  2146. sub TEMP, K, KK
  2147. slwi r0, TEMP, 1 + BASE_SHIFT
  2148. slwi TEMP, TEMP, 1 + BASE_SHIFT
  2149. add AO, AO, r0
  2150. add BO, BO, TEMP
  2151. #endif
  2152. #ifdef LN
  2153. subi KK, KK, 2
  2154. #endif
  2155. #ifdef LT
  2156. addi KK, KK, 2
  2157. #endif
  2158. .align 4
  /*
   * LL(60): set-up for the tile taken when bit 0 of M is set (results are
   * later stored to one element each of CO1 and CO2); otherwise control goes
   * to LL(69).
   * Preloads four A values (f16-f19) and eight B values (f20-f27) and puts
   * count >> 2 into CTR for LL(62):
   *   LT/RN: BO = B, count = KK
   *   LN/RT: AO = AORIG + (KK << BASE_SHIFT),
   *          BO = B     + (KK << (1 + BASE_SHIFT)),
   *          count = K - KK (left in TEMP for LL(65))
   * In LN, AORIG is first moved back by K << BASE_SHIFT.
   * The closing ble uses CR0 from srawi. and skips LL(62) when count >> 2
   * is not positive.
   */
  2159. LL(60):
  2160. andi. I, M, 1
  2161. ble LL(69)
  2162. #if defined(LT) || defined(RN)
  2163. LFD f16, 0 * SIZE(AO)
  2164. LFD f17, 1 * SIZE(AO)
  2165. LFD f18, 2 * SIZE(AO)
  2166. LFD f19, 3 * SIZE(AO)
  2167. LFD f20, 0 * SIZE(B)
  2168. LFD f21, 1 * SIZE(B)
  2169. LFD f22, 2 * SIZE(B)
  2170. LFD f23, 3 * SIZE(B)
  2171. LFD f24, 4 * SIZE(B)
  2172. LFD f25, 5 * SIZE(B)
  2173. LFD f26, 6 * SIZE(B)
  2174. LFD f27, 7 * SIZE(B)
  2175. srawi. r0, KK, 2
  2176. mtspr CTR, r0
  2177. mr BO, B
  2178. #else
  2179. #ifdef LN
  2180. slwi r0, K, BASE_SHIFT
  2181. sub AORIG, AORIG, r0
  2182. #endif
  2183. slwi r0, KK, 0 + BASE_SHIFT
  2184. slwi TEMP, KK, 1 + BASE_SHIFT
  2185. add AO, AORIG, r0
  2186. add BO, B, TEMP
  2187. sub TEMP, K, KK
  2188. LFD f16, 0 * SIZE(AO)
  2189. LFD f17, 1 * SIZE(AO)
  2190. LFD f18, 2 * SIZE(AO)
  2191. LFD f19, 3 * SIZE(AO)
  2192. LFD f20, 0 * SIZE(BO)
  2193. LFD f21, 1 * SIZE(BO)
  2194. LFD f22, 2 * SIZE(BO)
  2195. LFD f23, 3 * SIZE(BO)
  2196. LFD f24, 4 * SIZE(BO)
  2197. LFD f25, 5 * SIZE(BO)
  2198. LFD f26, 6 * SIZE(BO)
  2199. LFD f27, 7 * SIZE(BO)
  2200. srawi. r0, TEMP, 2
  2201. mtspr CTR, r0
  2202. #endif
  2203. ble LL(65)
  2204. .align 5
  /*
   * LL(62): CTR-driven multiply-accumulate loop, four k-steps per pass.
   * Each step consumes one A value and two B values (2 FMADDs).
   * Steps 0 and 2 accumulate into f0/f1, steps 1 and 3 into f2/f3; the two
   * pairs are added together at LL(68).
   * AO advances by 4 elements and BO by 8; the loads are issued before the
   * pointer bump so f16-f19 / f20-f27 hold the operands for the next pass.
   */
  2205. LL(62):
  2206. FMADD f0, f16, f20, f0
  2207. FMADD f1, f16, f21, f1
  2208. FMADD f2, f17, f22, f2
  2209. FMADD f3, f17, f23, f3
  2210. LFD f20, 8 * SIZE(BO)
  2211. LFD f21, 9 * SIZE(BO)
  2212. LFD f22, 10 * SIZE(BO)
  2213. LFD f23, 11 * SIZE(BO)
  2214. FMADD f0, f18, f24, f0
  2215. FMADD f1, f18, f25, f1
  2216. FMADD f2, f19, f26, f2
  2217. FMADD f3, f19, f27, f3
  2218. LFD f16, 4 * SIZE(AO)
  2219. LFD f17, 5 * SIZE(AO)
  2220. LFD f18, 6 * SIZE(AO)
  2221. LFD f19, 7 * SIZE(AO)
  2222. LFD f24, 12 * SIZE(BO)
  2223. LFD f25, 13 * SIZE(BO)
  2224. LFD f26, 14 * SIZE(BO)
  2225. LFD f27, 15 * SIZE(BO)
  2226. addi AO, AO, 4 * SIZE
  2227. addi BO, BO, 8 * SIZE
  2228. bdnz LL(62)
  2229. .align 4
  /*
   * LL(65): remainder of the k loop. CTR = count & 3 (KK for LT/RN, TEMP =
   * K - KK for LN/RT); ble+ goes to LL(68) when the remainder is zero.
   * LL(66): one k-step per pass - two FMADDs into f0/f1 using one A value
   * and two B values; AO advances by 1 element and BO by 2.
   */
  2230. LL(65):
  2231. #if defined(LT) || defined(RN)
  2232. andi. r0, KK, 3
  2233. #else
  2234. andi. r0, TEMP, 3
  2235. #endif
  2236. mtspr CTR, r0
  2237. ble+ LL(68)
  2238. .align 4
  2239. LL(66):
  2240. FMADD f0, f16, f20, f0
  2241. FMADD f1, f16, f21, f1
  2242. LFD f16, 1 * SIZE(AO)
  2243. LFD f20, 2 * SIZE(BO)
  2244. LFD f21, 3 * SIZE(BO)
  2245. addi BO, BO, 2 * SIZE
  2246. addi AO, AO, 1 * SIZE
  2247. bdnz LL(66)
  2248. .align 4
  /*
   * LL(68): finish the tile accumulated by LL(62)/LL(66).
   *  1. Fold f2/f3 into f0/f1.
   *  2. LN/RT: reposition AO = AORIG + (r0 << BASE_SHIFT) and
   *     BO = B + (r0 << (1 + BASE_SHIFT)), with r0 = KK - 1 for LN and
   *     KK - 2 for RT.
   *  3. Load two values from BO (LN/LT) or AO (RN/RT) and subtract the
   *     accumulators from them.
   *  4. Solve: LN/LT scale both values by AO[0]; RN/RT apply a 2x2 solve
   *     with BO[0], BO[1] or BO[2], and BO[3].
   *  5. Write the results back to BO or AO and to CO1[0] / CO2[0].
   *  6. Advance CO1/CO2, AO/BO (LT/RN), AORIG (RT) and KK (LN/LT).
   * NOTE(review): only f0, f1, f4 and f5 are cleared afterwards although
   * LL(62) accumulates into f2/f3 as well; the code at LL(70) re-clears
   * f1-f3 before they are used again.
   * NOTE(review): FMUL by the diagonal suggests it is stored pre-inverted -
   * confirm against the packing code.
   */
  2249. LL(68):
  2250. FADD f0, f2, f0
  2251. FADD f1, f3, f1
  2252. #if defined(LN) || defined(RT)
  2253. #ifdef LN
  2254. subi r0, KK, 1
  2255. #else
  2256. subi r0, KK, 2
  2257. #endif
  2258. slwi TEMP, r0, 0 + BASE_SHIFT
  2259. slwi r0, r0, 1 + BASE_SHIFT
  2260. add AO, AORIG, TEMP
  2261. add BO, B, r0
  2262. #endif
  2263. #if defined(LN) || defined(LT)
  2264. LFD f16, 0 * SIZE(BO)
  2265. LFD f17, 1 * SIZE(BO)
  2266. FSUB f0, f16, f0
  2267. FSUB f1, f17, f1
  2268. #else
  2269. LFD f16, 0 * SIZE(AO)
  2270. LFD f20, 1 * SIZE(AO)
  2271. FSUB f0, f16, f0
  2272. FSUB f1, f20, f1
  2273. #endif
  2274. #ifdef LN
  2275. LFD f21, 0 * SIZE(AO)
  2276. FMUL f0, f21, f0
  2277. FMUL f1, f21, f1
  2278. #endif
  2279. #ifdef LT
  2280. LFD f16, 0 * SIZE(AO)
  2281. FMUL f0, f16, f0
  2282. FMUL f1, f16, f1
  2283. #endif
  2284. #ifdef RN
  /* RN: f0 *= BO[0]; f1 reduced by BO[1]*f0; f1 *= BO[3] */
  2285. LFD f16, 0 * SIZE(BO)
  2286. LFD f17, 1 * SIZE(BO)
  2287. LFD f18, 3 * SIZE(BO)
  2288. FMUL f0, f16, f0
  2289. FNMSUB f1, f17, f0, f1
  2290. FMUL f1, f18, f1
  2291. #endif
  2292. #ifdef RT
  /* RT: f1 *= BO[3]; f0 reduced by BO[2]*f1; f0 *= BO[0] */
  2293. LFD f19, 3 * SIZE(BO)
  2294. LFD f20, 2 * SIZE(BO)
  2295. LFD f21, 0 * SIZE(BO)
  2296. FMUL f1, f19, f1
  2297. FNMSUB f0, f20, f1, f0
  2298. FMUL f0, f21, f0
  2299. #endif
  2300. #ifdef LN
  2301. subi CO1, CO1, 1 * SIZE
  2302. subi CO2, CO2, 1 * SIZE
  2303. #endif
  2304. #if defined(LN) || defined(LT)
  2305. STFD f0, 0 * SIZE(BO)
  2306. STFD f1, 1 * SIZE(BO)
  2307. #else
  2308. STFD f0, 0 * SIZE(AO)
  2309. STFD f1, 1 * SIZE(AO)
  2310. #endif
  2311. STFD f0, 0 * SIZE(CO1)
  2312. STFD f1, 0 * SIZE(CO2)
  2313. lfs f0, FZERO
  2314. fmr f1, f0
  2315. fmr f4, f0
  2316. fmr f5, f0
  2317. #ifndef LN
  2318. addi CO1, CO1, 1 * SIZE
  2319. addi CO2, CO2, 1 * SIZE
  2320. #endif
  2321. #ifdef RT
  2322. slwi r0, K, 0 + BASE_SHIFT
  2323. add AORIG, AORIG, r0
  2324. #endif
  2325. #if defined(LT) || defined(RN)
  2326. sub TEMP, K, KK
  2327. slwi r0, TEMP, 0 + BASE_SHIFT
  2328. slwi TEMP, TEMP, 1 + BASE_SHIFT
  2329. add AO, AO, r0
  2330. add BO, BO, TEMP
  2331. #endif
  2332. #ifdef LN
  2333. subi KK, KK, 1
  2334. #endif
  2335. #ifdef LT
  2336. addi KK, KK, 1
  2337. #endif
  2338. .align 4
  /*
   * LL(69): bookkeeping at the end of the two-column panel.
   *   LN:    B advances by K << (1 + BASE_SHIFT).
   *   LT/RN: B takes the current value of BO.
   *   RN:    KK += 2;  RT: KK -= 2.
   * f0 is reloaded from FZERO so the code at LL(70) can copy it into the
   * other accumulators.
   */
  2339. LL(69):
  2340. #ifdef LN
  2341. slwi r0, K, 1 + BASE_SHIFT
  2342. add B, B, r0
  2343. #endif
  2344. #if defined(LT) || defined(RN)
  2345. mr B, BO
  2346. #endif
  2347. #ifdef RN
  2348. addi KK, KK, 2
  2349. #endif
  2350. #ifdef RT
  2351. subi KK, KK, 2
  2352. #endif
  2353. lfs f0, FZERO
  2354. .align 4
  /*
   * LL(70): entry to the single-column panel, taken when bit 0 of N is set;
   * otherwise control goes to the epilogue at LL(999).
   *   RT:     B -= K << BASE_SHIFT and C -= LDC before use.
   *   CO1 = C (only one output column pointer is used from here on).
   *   LN:     KK = M + OFFSET;  LT: KK = OFFSET.
   *   LN/RT:  AORIG = A;  LT/RN: AO = A.
   *   not RT: C = CO1 + LDC.
   * f1-f3 are cleared by copying f0, which holds FZERO on entry from LL(69).
   * I = M >> 2 is the number of 4-row tiles; CR0 from srawi. is still live at
   * the final ble (mr/add in between are not record forms), which skips to
   * LL(80) when there are none.
   */
  2355. LL(70):
  2356. andi. J, N, 1
  2357. ble LL(999)
  2358. #ifdef RT
  2359. slwi r0, K, 0 + BASE_SHIFT
  2360. sub B, B, r0
  2361. sub C, C, LDC
  2362. #endif
  2363. mr CO1, C
  2364. #ifdef LN
  2365. add KK, M, OFFSET
  2366. #endif
  2367. #ifdef LT
  2368. mr KK, OFFSET
  2369. #endif
  2370. fmr f1, f0
  2371. fmr f2, f0
  2372. fmr f3, f0
  2373. srawi. I, M, 2
  2374. #if defined(LN) || defined(RT)
  2375. mr AORIG, A
  2376. #else
  2377. mr AO, A
  2378. #endif
  2379. #ifndef RT
  2380. add C, CO1, LDC
  2381. #endif
  2382. ble LL(80)
  2383. .align 4
  /*
   * LL(71): per-tile set-up for the 4-row tiles of the single-column panel
   * (loop head; re-entered from the bgt+ at the end of LL(78)).
   * Preloads four A values (f16-f19) and four B values (f20-f23), touches the
   * output line with dcbt CO1, PREC, and puts count >> 2 into CTR for LL(72):
   *   LT/RN: BO = B, count = KK
   *   LN/RT: AO = AORIG + (KK << (2 + BASE_SHIFT)),
   *          BO = B     + (KK << BASE_SHIFT),
   *          count = K - KK (left in TEMP for LL(75))
   * In LN, AORIG is first moved back by K << (2 + BASE_SHIFT).
   * The closing ble uses CR0 from srawi. (dcbt sits before it in both arms)
   * and skips LL(72) when count >> 2 is not positive.
   */
  2384. LL(71):
  2385. #if defined(LT) || defined(RN)
  2386. LFD f16, 0 * SIZE(AO)
  2387. LFD f17, 1 * SIZE(AO)
  2388. LFD f18, 2 * SIZE(AO)
  2389. LFD f19, 3 * SIZE(AO)
  2390. LFD f20, 0 * SIZE(B)
  2391. LFD f21, 1 * SIZE(B)
  2392. LFD f22, 2 * SIZE(B)
  2393. LFD f23, 3 * SIZE(B)
  2394. dcbt CO1, PREC
  2395. srawi. r0, KK, 2
  2396. mtspr CTR, r0
  2397. mr BO, B
  2398. #else
  2399. #ifdef LN
  2400. slwi r0, K, 2 + BASE_SHIFT
  2401. sub AORIG, AORIG, r0
  2402. #endif
  2403. slwi r0, KK, 2 + BASE_SHIFT
  2404. slwi TEMP, KK, 0 + BASE_SHIFT
  2405. add AO, AORIG, r0
  2406. add BO, B, TEMP
  2407. sub TEMP, K, KK
  2408. LFD f16, 0 * SIZE(AO)
  2409. LFD f17, 1 * SIZE(AO)
  2410. LFD f18, 2 * SIZE(AO)
  2411. LFD f19, 3 * SIZE(AO)
  2412. LFD f20, 0 * SIZE(BO)
  2413. LFD f21, 1 * SIZE(BO)
  2414. LFD f22, 2 * SIZE(BO)
  2415. LFD f23, 3 * SIZE(BO)
  2416. dcbt CO1, PREC
  2417. srawi. r0, TEMP, 2
  2418. mtspr CTR, r0
  2419. #endif
  2420. ble LL(75)
  2421. .align 5
  /*
   * LL(72): CTR-driven multiply-accumulate loop, four k-steps per pass.
   * Each step multiplies four A values (f16-f19) by one B value (f20, f21,
   * f22, f23 in turn) and accumulates into f0-f3.
   * AO advances by 16 elements and BO by 4; the final loads fetch the
   * operands for the next pass (or for LL(76)) before the pointer bump.
   * NOTE(review): DCBT(BO, PREB) is a macro from outside this view;
   * presumably a prefetch hint - confirm.
   */
  2422. LL(72):
  2423. FMADD f0, f16, f20, f0
  2424. FMADD f1, f17, f20, f1
  2425. FMADD f2, f18, f20, f2
  2426. FMADD f3, f19, f20, f3
  2427. LFD f16, 4 * SIZE(AO)
  2428. LFD f17, 5 * SIZE(AO)
  2429. LFD f18, 6 * SIZE(AO)
  2430. LFD f19, 7 * SIZE(AO)
  2431. FMADD f0, f16, f21, f0
  2432. FMADD f1, f17, f21, f1
  2433. FMADD f2, f18, f21, f2
  2434. FMADD f3, f19, f21, f3
  2435. LFD f16, 8 * SIZE(AO)
  2436. LFD f17, 9 * SIZE(AO)
  2437. LFD f18, 10 * SIZE(AO)
  2438. LFD f19, 11 * SIZE(AO)
  2439. FMADD f0, f16, f22, f0
  2440. FMADD f1, f17, f22, f1
  2441. FMADD f2, f18, f22, f2
  2442. FMADD f3, f19, f22, f3
  2443. LFD f16, 12 * SIZE(AO)
  2444. LFD f17, 13 * SIZE(AO)
  2445. LFD f18, 14 * SIZE(AO)
  2446. LFD f19, 15 * SIZE(AO)
  2447. FMADD f0, f16, f23, f0
  2448. FMADD f1, f17, f23, f1
  2449. FMADD f2, f18, f23, f2
  2450. FMADD f3, f19, f23, f3
  2451. LFD f16, 16 * SIZE(AO)
  2452. LFD f17, 17 * SIZE(AO)
  2453. LFD f18, 18 * SIZE(AO)
  2454. LFD f19, 19 * SIZE(AO)
  2455. LFD f20, 4 * SIZE(BO)
  2456. LFD f21, 5 * SIZE(BO)
  2457. LFD f22, 6 * SIZE(BO)
  2458. LFD f23, 7 * SIZE(BO)
  2459. addi AO, AO, 16 * SIZE
  2460. addi BO, BO, 4 * SIZE
  2461. DCBT(BO, PREB)
  2462. bdnz LL(72)
  2463. .align 4
  /*
   * LL(75): remainder of the k loop. CTR = count & 3 (KK for LT/RN, TEMP =
   * K - KK for LN/RT); ble+ goes to LL(78) when the remainder is zero.
   * LL(76): one k-step per pass - four FMADDs into f0-f3 using four A values
   * and one B value; AO advances by 4 elements and BO by 1.
   */
  2464. LL(75):
  2465. #if defined(LT) || defined(RN)
  2466. andi. r0, KK, 3
  2467. #else
  2468. andi. r0, TEMP, 3
  2469. #endif
  2470. mtspr CTR, r0
  2471. ble+ LL(78)
  2472. .align 4
  2473. LL(76):
  2474. FMADD f0, f16, f20, f0
  2475. FMADD f1, f17, f20, f1
  2476. FMADD f2, f18, f20, f2
  2477. FMADD f3, f19, f20, f3
  2478. LFD f16, 4 * SIZE(AO)
  2479. LFD f17, 5 * SIZE(AO)
  2480. LFD f18, 6 * SIZE(AO)
  2481. LFD f19, 7 * SIZE(AO)
  2482. LFD f20, 1 * SIZE(BO)
  2483. addi BO, BO, 1 * SIZE
  2484. addi AO, AO, 4 * SIZE
  2485. bdnz LL(76)
  2486. .align 4
  /*
   * LL(78): finish the 4-row tile accumulated in f0-f3 by LL(72)/LL(76).
   *  1. LN/RT: reposition AO = AORIG + (r0 << (2 + BASE_SHIFT)) and
   *     BO = B + (r0 << BASE_SHIFT), with r0 = KK - 4 for LN and KK - 1
   *     for RT.
   *  2. Load four values from BO (LN/LT) or AO (RN/RT) and subtract the
   *     accumulators from them.
   *  3. Solve:
   *       LN: processes f3, f2, f1, f0 in that order using AO offsets
   *           15,14,13,12 / 10,9,8 / 5,4 / 0.
   *       LT: processes f0, f1, f2, f3 in that order using AO offsets
   *           0,1,2,3 / 5,6,7 / 10,11 / 15.
   *       RN/RT: scale all four values by BO[0].
   *  4. Write the results back to BO or AO and to CO1[0..3].
   *  5. Clear f0-f3, advance CO1, AO/BO (LT/RN), AORIG (RT) and KK (LN/LT),
   *     then decrement I and loop back to LL(71) while it stays positive.
   * NOTE(review): the diagonal entries (AO offsets 0, 5, 10, 15) are applied
   * with FMUL, so they are presumably stored pre-inverted - confirm.
   */
  2487. LL(78):
  2488. #if defined(LN) || defined(RT)
  2489. #ifdef LN
  2490. subi r0, KK, 4
  2491. #else
  2492. subi r0, KK, 1
  2493. #endif
  2494. slwi TEMP, r0, 2 + BASE_SHIFT
  2495. slwi r0, r0, 0 + BASE_SHIFT
  2496. add AO, AORIG, TEMP
  2497. add BO, B, r0
  2498. #endif
  2499. #if defined(LN) || defined(LT)
  2500. LFD f16, 0 * SIZE(BO)
  2501. LFD f20, 1 * SIZE(BO)
  2502. LFD f24, 2 * SIZE(BO)
  2503. LFD f28, 3 * SIZE(BO)
  2504. FSUB f0, f16, f0
  2505. FSUB f1, f20, f1
  2506. FSUB f2, f24, f2
  2507. FSUB f3, f28, f3
  2508. #else
  2509. LFD f16, 0 * SIZE(AO)
  2510. LFD f17, 1 * SIZE(AO)
  2511. LFD f18, 2 * SIZE(AO)
  2512. LFD f19, 3 * SIZE(AO)
  2513. FSUB f0, f16, f0
  2514. FSUB f1, f17, f1
  2515. FSUB f2, f18, f2
  2516. FSUB f3, f19, f3
  2517. #endif
  2518. #ifdef LN
  /* LN: start from f3 and work down to f0 */
  2519. LFD f16, 15 * SIZE(AO)
  2520. LFD f17, 14 * SIZE(AO)
  2521. LFD f18, 13 * SIZE(AO)
  2522. LFD f19, 12 * SIZE(AO)
  2523. FMUL f3, f16, f3
  2524. FNMSUB f2, f17, f3, f2
  2525. FNMSUB f1, f18, f3, f1
  2526. FNMSUB f0, f19, f3, f0
  2527. LFD f16, 10 * SIZE(AO)
  2528. LFD f17, 9 * SIZE(AO)
  2529. LFD f18, 8 * SIZE(AO)
  2530. LFD f19, 5 * SIZE(AO)
  2531. LFD f20, 4 * SIZE(AO)
  2532. LFD f21, 0 * SIZE(AO)
  2533. FMUL f2, f16, f2
  2534. FNMSUB f1, f17, f2, f1
  2535. FNMSUB f0, f18, f2, f0
  2536. FMUL f1, f19, f1
  2537. FNMSUB f0, f20, f1, f0
  2538. FMUL f0, f21, f0
  2539. #endif
  2540. #ifdef LT
  /* LT: start from f0 and work up to f3 */
  2541. LFD f16, 0 * SIZE(AO)
  2542. LFD f17, 1 * SIZE(AO)
  2543. LFD f18, 2 * SIZE(AO)
  2544. LFD f19, 3 * SIZE(AO)
  2545. FMUL f0, f16, f0
  2546. FNMSUB f1, f17, f0, f1
  2547. FNMSUB f2, f18, f0, f2
  2548. FNMSUB f3, f19, f0, f3
  2549. LFD f17, 5 * SIZE(AO)
  2550. LFD f18, 6 * SIZE(AO)
  2551. LFD f19, 7 * SIZE(AO)
  2552. FMUL f1, f17, f1
  2553. FNMSUB f2, f18, f1, f2
  2554. FNMSUB f3, f19, f1, f3
  2555. LFD f18, 10 * SIZE(AO)
  2556. LFD f19, 11 * SIZE(AO)
  2557. FMUL f2, f18, f2
  2558. FNMSUB f3, f19, f2, f3
  2559. LFD f19, 15 * SIZE(AO)
  2560. FMUL f3, f19, f3
  2561. #endif
  2562. #ifdef RN
  2563. LFD f16, 0 * SIZE(BO)
  2564. FMUL f0, f16, f0
  2565. FMUL f1, f16, f1
  2566. FMUL f2, f16, f2
  2567. FMUL f3, f16, f3
  2568. #endif
  2569. #ifdef RT
  2570. LFD f21, 0 * SIZE(BO)
  2571. FMUL f0, f21, f0
  2572. FMUL f1, f21, f1
  2573. FMUL f2, f21, f2
  2574. FMUL f3, f21, f3
  2575. #endif
  2576. #ifdef LN
  2577. subi CO1, CO1, 4 * SIZE
  2578. #endif
  2579. #if defined(LN) || defined(LT)
  2580. STFD f0, 0 * SIZE(BO)
  2581. STFD f1, 1 * SIZE(BO)
  2582. STFD f2, 2 * SIZE(BO)
  2583. STFD f3, 3 * SIZE(BO)
  2584. #else
  2585. STFD f0, 0 * SIZE(AO)
  2586. STFD f1, 1 * SIZE(AO)
  2587. STFD f2, 2 * SIZE(AO)
  2588. STFD f3, 3 * SIZE(AO)
  2589. #endif
  2590. STFD f0, 0 * SIZE(CO1)
  2591. STFD f1, 1 * SIZE(CO1)
  2592. STFD f2, 2 * SIZE(CO1)
  2593. STFD f3, 3 * SIZE(CO1)
  2594. lfs f0, FZERO
  2595. fmr f1, f0
  2596. fmr f2, f0
  2597. fmr f3, f0
  2598. #ifndef LN
  2599. addi CO1, CO1, 4 * SIZE
  2600. #endif
  2601. #ifdef RT
  2602. slwi r0, K, 2 + BASE_SHIFT
  2603. add AORIG, AORIG, r0
  2604. #endif
  2605. #if defined(LT) || defined(RN)
  2606. sub TEMP, K, KK
  2607. slwi r0, TEMP, 2 + BASE_SHIFT
  2608. slwi TEMP, TEMP, 0 + BASE_SHIFT
  2609. add AO, AO, r0
  2610. add BO, BO, TEMP
  2611. #endif
  2612. #ifdef LN
  2613. subi KK, KK, 4
  2614. #endif
  2615. #ifdef LT
  2616. addi KK, KK, 4
  2617. #endif
  /* next 4-row tile while the tile counter I stays positive */
  2618. addic. I, I, -1
  2619. bgt+ LL(71)
  2620. .align 4
  /*
   * LL(80): set-up for the 2-row tile of the single-column panel, taken when
   * bit 1 of M is set; otherwise control goes to LL(90).
   * Preloads four A values (f16-f19) and four B values (f20-f23) and puts
   * count >> 2 into CTR for LL(82):
   *   LT/RN: BO = B, count = KK
   *   LN/RT: AO = AORIG + (KK << (1 + BASE_SHIFT)),
   *          BO = B     + (KK << BASE_SHIFT),
   *          count = K - KK (left in TEMP for LL(85))
   * In LN, AORIG is first moved back by K << (1 + BASE_SHIFT).
   * The closing ble uses CR0 from srawi. and skips LL(82) when count >> 2
   * is not positive.
   */
  2621. LL(80):
  2622. andi. I, M, 2
  2623. ble LL(90)
  2624. #if defined(LT) || defined(RN)
  2625. LFD f16, 0 * SIZE(AO)
  2626. LFD f17, 1 * SIZE(AO)
  2627. LFD f18, 2 * SIZE(AO)
  2628. LFD f19, 3 * SIZE(AO)
  2629. LFD f20, 0 * SIZE(B)
  2630. LFD f21, 1 * SIZE(B)
  2631. LFD f22, 2 * SIZE(B)
  2632. LFD f23, 3 * SIZE(B)
  2633. srawi. r0, KK, 2
  2634. mtspr CTR, r0
  2635. mr BO, B
  2636. #else
  2637. #ifdef LN
  2638. slwi r0, K, 1 + BASE_SHIFT
  2639. sub AORIG, AORIG, r0
  2640. #endif
  2641. slwi r0, KK, 1 + BASE_SHIFT
  2642. slwi TEMP, KK, 0 + BASE_SHIFT
  2643. add AO, AORIG, r0
  2644. add BO, B, TEMP
  2645. sub TEMP, K, KK
  2646. LFD f16, 0 * SIZE(AO)
  2647. LFD f17, 1 * SIZE(AO)
  2648. LFD f18, 2 * SIZE(AO)
  2649. LFD f19, 3 * SIZE(AO)
  2650. LFD f20, 0 * SIZE(BO)
  2651. LFD f21, 1 * SIZE(BO)
  2652. LFD f22, 2 * SIZE(BO)
  2653. LFD f23, 3 * SIZE(BO)
  2654. srawi. r0, TEMP, 2
  2655. mtspr CTR, r0
  2656. #endif
  2657. ble LL(85)
  2658. .align 5
  /*
   * LL(82): CTR-driven multiply-accumulate loop, four k-steps per pass.
   * Each step consumes two A values and one B value (2 FMADDs).
   * Steps 0 and 2 accumulate into f0/f1, steps 1 and 3 into f2/f3; the two
   * pairs are added together at LL(88).
   * AO advances by 8 elements and BO by 4; the trailing loads fetch the
   * operands for the next pass (or for LL(86)) before the pointer bump.
   * NOTE(review): DCBT(BO, PREB) is a macro from outside this view;
   * presumably a prefetch hint - confirm.
   */
  2659. LL(82):
  2660. FMADD f0, f16, f20, f0
  2661. FMADD f1, f17, f20, f1
  2662. FMADD f2, f18, f21, f2
  2663. FMADD f3, f19, f21, f3
  2664. LFD f16, 4 * SIZE(AO)
  2665. LFD f17, 5 * SIZE(AO)
  2666. LFD f18, 6 * SIZE(AO)
  2667. LFD f19, 7 * SIZE(AO)
  2668. FMADD f0, f16, f22, f0
  2669. FMADD f1, f17, f22, f1
  2670. FMADD f2, f18, f23, f2
  2671. FMADD f3, f19, f23, f3
  2672. LFD f16, 8 * SIZE(AO)
  2673. LFD f17, 9 * SIZE(AO)
  2674. LFD f18, 10 * SIZE(AO)
  2675. LFD f19, 11 * SIZE(AO)
  2676. LFD f20, 4 * SIZE(BO)
  2677. LFD f21, 5 * SIZE(BO)
  2678. LFD f22, 6 * SIZE(BO)
  2679. LFD f23, 7 * SIZE(BO)
  2680. addi AO, AO, 8 * SIZE
  2681. addi BO, BO, 4 * SIZE
  2682. DCBT(BO, PREB)
  2683. bdnz LL(82)
  2684. .align 4
  /*
   * LL(85): remainder of the k loop. CTR = count & 3 (KK for LT/RN, TEMP =
   * K - KK for LN/RT); ble+ goes to LL(88) when the remainder is zero.
   * LL(86): one k-step per pass - two FMADDs into f0/f1 using two A values
   * and one B value; AO advances by 2 elements and BO by 1.
   */
  2685. LL(85):
  2686. #if defined(LT) || defined(RN)
  2687. andi. r0, KK, 3
  2688. #else
  2689. andi. r0, TEMP, 3
  2690. #endif
  2691. mtspr CTR, r0
  2692. ble+ LL(88)
  2693. .align 4
  2694. LL(86):
  2695. FMADD f0, f16, f20, f0
  2696. FMADD f1, f17, f20, f1
  2697. LFD f16, 2 * SIZE(AO)
  2698. LFD f17, 3 * SIZE(AO)
  2699. LFD f20, 1 * SIZE(BO)
  2700. addi BO, BO, 1 * SIZE
  2701. addi AO, AO, 2 * SIZE
  2702. bdnz LL(86)
  2703. .align 4
  /*
   * LL(88): finish the 2-row tile accumulated by LL(82)/LL(86).
   *  1. Fold f2/f3 into f0/f1.
   *  2. LN/RT: reposition AO = AORIG + (r0 << (1 + BASE_SHIFT)) and
   *     BO = B + (r0 << BASE_SHIFT), with r0 = KK - 2 for LN and KK - 1
   *     for RT.
   *  3. Load two values from BO (LN/LT) or AO (RN/RT) and subtract the
   *     accumulators from them.
   *  4. Solve: LN/LT apply a 2x2 solve with AO[0], AO[1] or AO[2], and AO[3];
   *     RN/RT scale both values by BO[0].
   *  5. Write the results back to BO or AO and to CO1[0..1].
   *  6. Clear f0-f3 and advance CO1, AO/BO (LT/RN), AORIG (RT), KK (LN/LT).
   * NOTE(review): FMUL by the diagonal suggests it is stored pre-inverted -
   * confirm against the packing code.
   */
  2704. LL(88):
  2705. FADD f0, f2, f0
  2706. FADD f1, f3, f1
  2707. #if defined(LN) || defined(RT)
  2708. #ifdef LN
  2709. subi r0, KK, 2
  2710. #else
  2711. subi r0, KK, 1
  2712. #endif
  2713. slwi TEMP, r0, 1 + BASE_SHIFT
  2714. slwi r0, r0, 0 + BASE_SHIFT
  2715. add AO, AORIG, TEMP
  2716. add BO, B, r0
  2717. #endif
  2718. #if defined(LN) || defined(LT)
  2719. LFD f16, 0 * SIZE(BO)
  2720. LFD f20, 1 * SIZE(BO)
  2721. FSUB f0, f16, f0
  2722. FSUB f1, f20, f1
  2723. #else
  2724. LFD f16, 0 * SIZE(AO)
  2725. LFD f17, 1 * SIZE(AO)
  2726. FSUB f0, f16, f0
  2727. FSUB f1, f17, f1
  2728. #endif
  2729. #ifdef LN
  /* LN: f1 *= AO[3]; f0 reduced by AO[2]*f1; f0 *= AO[0] */
  2730. LFD f19, 3 * SIZE(AO)
  2731. LFD f20, 2 * SIZE(AO)
  2732. LFD f21, 0 * SIZE(AO)
  2733. FMUL f1, f19, f1
  2734. FNMSUB f0, f20, f1, f0
  2735. FMUL f0, f21, f0
  2736. #endif
  2737. #ifdef LT
  /* LT: f0 *= AO[0]; f1 reduced by AO[1]*f0; f1 *= AO[3] */
  2738. LFD f16, 0 * SIZE(AO)
  2739. LFD f17, 1 * SIZE(AO)
  2740. FMUL f0, f16, f0
  2741. FNMSUB f1, f17, f0, f1
  2742. LFD f17, 3 * SIZE(AO)
  2743. FMUL f1, f17, f1
  2744. #endif
  2745. #ifdef RN
  2746. LFD f16, 0 * SIZE(BO)
  2747. FMUL f0, f16, f0
  2748. FMUL f1, f16, f1
  2749. #endif
  2750. #ifdef RT
  2751. LFD f21, 0 * SIZE(BO)
  2752. FMUL f0, f21, f0
  2753. FMUL f1, f21, f1
  2754. #endif
  2755. #ifdef LN
  2756. subi CO1, CO1, 2 * SIZE
  2757. #endif
  2758. #if defined(LN) || defined(LT)
  2759. STFD f0, 0 * SIZE(BO)
  2760. STFD f1, 1 * SIZE(BO)
  2761. #else
  2762. STFD f0, 0 * SIZE(AO)
  2763. STFD f1, 1 * SIZE(AO)
  2764. #endif
  2765. STFD f0, 0 * SIZE(CO1)
  2766. STFD f1, 1 * SIZE(CO1)
  2767. lfs f0, FZERO
  2768. fmr f1, f0
  2769. fmr f2, f0
  2770. fmr f3, f0
  2771. #ifndef LN
  2772. addi CO1, CO1, 2 * SIZE
  2773. #endif
  2774. #ifdef RT
  2775. slwi r0, K, 1 + BASE_SHIFT
  2776. add AORIG, AORIG, r0
  2777. #endif
  2778. #if defined(LT) || defined(RN)
  2779. sub TEMP, K, KK
  2780. slwi r0, TEMP, 1 + BASE_SHIFT
  2781. slwi TEMP, TEMP, 0 + BASE_SHIFT
  2782. add AO, AO, r0
  2783. add BO, BO, TEMP
  2784. #endif
  2785. #ifdef LN
  2786. subi KK, KK, 2
  2787. #endif
  2788. #ifdef LT
  2789. addi KK, KK, 2
  2790. #endif
  2791. .align 4
  /*
   * LL(90): set-up for the final single-row, single-column tile, taken when
   * bit 0 of M is set; otherwise control goes to the epilogue at LL(999).
   * Preloads four A values (f16-f19) and four B values (f20-f23) and puts
   * count >> 3 into CTR - LL(92) is unrolled eight k-steps deep, unlike the
   * 4-step loops of the wider tiles:
   *   LT/RN: BO = B, count = KK
   *   LN/RT: AO = AORIG + (KK << BASE_SHIFT),
   *          BO = B     + (KK << BASE_SHIFT),
   *          count = K - KK (left in TEMP for LL(95))
   * In LN, AORIG is first moved back by K << BASE_SHIFT.
   * The closing ble uses CR0 from srawi. and skips LL(92) when count >> 3
   * is not positive.
   */
  2792. LL(90):
  2793. andi. I, M, 1
  2794. ble LL(999)
  2795. #if defined(LT) || defined(RN)
  2796. LFD f16, 0 * SIZE(AO)
  2797. LFD f17, 1 * SIZE(AO)
  2798. LFD f18, 2 * SIZE(AO)
  2799. LFD f19, 3 * SIZE(AO)
  2800. LFD f20, 0 * SIZE(B)
  2801. LFD f21, 1 * SIZE(B)
  2802. LFD f22, 2 * SIZE(B)
  2803. LFD f23, 3 * SIZE(B)
  2804. srawi. r0, KK, 3
  2805. mtspr CTR, r0
  2806. mr BO, B
  2807. #else
  2808. #ifdef LN
  2809. slwi r0, K, BASE_SHIFT
  2810. sub AORIG, AORIG, r0
  2811. #endif
  2812. slwi r0, KK, 0 + BASE_SHIFT
  2813. slwi TEMP, KK, 0 + BASE_SHIFT
  2814. add AO, AORIG, r0
  2815. add BO, B, TEMP
  2816. sub TEMP, K, KK
  2817. LFD f16, 0 * SIZE(AO)
  2818. LFD f17, 1 * SIZE(AO)
  2819. LFD f18, 2 * SIZE(AO)
  2820. LFD f19, 3 * SIZE(AO)
  2821. LFD f20, 0 * SIZE(BO)
  2822. LFD f21, 1 * SIZE(BO)
  2823. LFD f22, 2 * SIZE(BO)
  2824. LFD f23, 3 * SIZE(BO)
  2825. srawi. r0, TEMP, 3
  2826. mtspr CTR, r0
  2827. #endif
  2828. ble LL(95)
  2829. .align 5
  /*
   * LL(92): CTR-driven dot-product loop, eight k-steps per pass.
   * Each FMADD multiplies one A value by one B value; the four accumulators
   * f0-f3 each take every fourth k-step and are summed at LL(98).
   * AO and BO each advance by 8 elements; the trailing loads fetch the
   * operands for the next pass (or for LL(96)) before the pointer bump.
   */
  2830. LL(92):
  2831. FMADD f0, f16, f20, f0
  2832. FMADD f1, f17, f21, f1
  2833. FMADD f2, f18, f22, f2
  2834. FMADD f3, f19, f23, f3
  2835. LFD f16, 4 * SIZE(AO)
  2836. LFD f17, 5 * SIZE(AO)
  2837. LFD f18, 6 * SIZE(AO)
  2838. LFD f19, 7 * SIZE(AO)
  2839. LFD f20, 4 * SIZE(BO)
  2840. LFD f21, 5 * SIZE(BO)
  2841. LFD f22, 6 * SIZE(BO)
  2842. LFD f23, 7 * SIZE(BO)
  2843. FMADD f0, f16, f20, f0
  2844. FMADD f1, f17, f21, f1
  2845. FMADD f2, f18, f22, f2
  2846. FMADD f3, f19, f23, f3
  2847. LFD f16, 8 * SIZE(AO)
  2848. LFD f17, 9 * SIZE(AO)
  2849. LFD f18, 10 * SIZE(AO)
  2850. LFD f19, 11 * SIZE(AO)
  2851. LFD f20, 8 * SIZE(BO)
  2852. LFD f21, 9 * SIZE(BO)
  2853. LFD f22, 10 * SIZE(BO)
  2854. LFD f23, 11 * SIZE(BO)
  2855. addi AO, AO, 8 * SIZE
  2856. addi BO, BO, 8 * SIZE
  2857. bdnz LL(92)
  2858. .align 4
  /*
   * LL(95): remainder of the k loop. CTR = count & 7 (mask matches the 8-step
   * unroll of LL(92)); count is KK for LT/RN and TEMP = K - KK for LN/RT.
   * ble+ goes to LL(98) when the remainder is zero.
   * LL(96): one k-step per pass - a single FMADD into f0; AO and BO each
   * advance by 1 element.
   */
  2859. LL(95):
  2860. #if defined(LT) || defined(RN)
  2861. andi. r0, KK, 7
  2862. #else
  2863. andi. r0, TEMP, 7
  2864. #endif
  2865. mtspr CTR, r0
  2866. ble+ LL(98)
  2867. .align 4
  2868. LL(96):
  2869. FMADD f0, f16, f20, f0
  2870. LFD f16, 1 * SIZE(AO)
  2871. LFD f20, 1 * SIZE(BO)
  2872. addi BO, BO, 1 * SIZE
  2873. addi AO, AO, 1 * SIZE
  2874. bdnz LL(96)
  2875. .align 4
  /*
   * LL(98): finish the single-element tile accumulated by LL(92)/LL(96).
   *  1. Reduce the four partial sums: f0 = (f0 + f1) + (f2 + f3).
   *  2. LN/RT: reposition AO = AORIG + ((KK - 1) << BASE_SHIFT) and
   *     BO = B + ((KK - 1) << BASE_SHIFT). Both arms of the inner #ifdef LN
   *     subtract 1.
   *  3. Load one value from BO (LN/LT) or AO (RN/RT) and subtract f0 from it.
   *  4. Scale by AO[0] (LN/LT) or BO[0] (RN/RT).
   *  5. Write the result back to BO or AO and to CO1[0].
   *  6. Advance CO1, AO/BO (LT/RN), AORIG (RT), KK (LN/LT); control then
   *     falls through to the epilogue at LL(999).
   * NOTE(review): the accumulators are not cleared here; nothing after this
   * point in the visible code reads them before the return.
   * NOTE(review): FMUL by the diagonal suggests it is stored pre-inverted -
   * confirm against the packing code.
   */
  2876. LL(98):
  2877. FADD f0, f1, f0
  2878. FADD f2, f3, f2
  2879. FADD f0, f2, f0
  2880. #if defined(LN) || defined(RT)
  2881. #ifdef LN
  2882. subi r0, KK, 1
  2883. #else
  2884. subi r0, KK, 1
  2885. #endif
  2886. slwi TEMP, r0, 0 + BASE_SHIFT
  2887. slwi r0, r0, 0 + BASE_SHIFT
  2888. add AO, AORIG, TEMP
  2889. add BO, B, r0
  2890. #endif
  2891. #if defined(LN) || defined(LT)
  2892. LFD f16, 0 * SIZE(BO)
  2893. FSUB f0, f16, f0
  2894. #else
  2895. LFD f16, 0 * SIZE(AO)
  2896. FSUB f0, f16, f0
  2897. #endif
  2898. #ifdef LN
  2899. LFD f21, 0 * SIZE(AO)
  2900. FMUL f0, f21, f0
  2901. #endif
  2902. #ifdef LT
  2903. LFD f16, 0 * SIZE(AO)
  2904. FMUL f0, f16, f0
  2905. #endif
  2906. #ifdef RN
  2907. LFD f16, 0 * SIZE(BO)
  2908. FMUL f0, f16, f0
  2909. #endif
  2910. #ifdef RT
  2911. LFD f21, 0 * SIZE(BO)
  2912. FMUL f0, f21, f0
  2913. #endif
  2914. #ifdef LN
  2915. subi CO1, CO1, 1 * SIZE
  2916. #endif
  2917. #if defined(LN) || defined(LT)
  2918. STFD f0, 0 * SIZE(BO)
  2919. #else
  2920. STFD f0, 0 * SIZE(AO)
  2921. #endif
  2922. STFD f0, 0 * SIZE(CO1)
  2923. #ifndef LN
  2924. addi CO1, CO1, 1 * SIZE
  2925. #endif
  2926. #ifdef RT
  2927. slwi r0, K, 0 + BASE_SHIFT
  2928. add AORIG, AORIG, r0
  2929. #endif
  2930. #if defined(LT) || defined(RN)
  2931. sub TEMP, K, KK
  2932. slwi r0, TEMP, 0 + BASE_SHIFT
  2933. slwi TEMP, TEMP, 0 + BASE_SHIFT
  2934. add AO, AO, r0
  2935. add BO, BO, TEMP
  2936. #endif
  2937. #ifdef LN
  2938. subi KK, KK, 1
  2939. #endif
  2940. #ifdef LT
  2941. addi KK, KK, 1
  2942. #endif
  2943. .align 4
  /*
   * LL(999): common exit path.
   * Sets the return value r3 to 0, reloads f14-f31 from offsets 0..136 of the
   * frame at SP and r18-r31 from offset 144 upward (8-byte slots with ld when
   * __64BIT__ is defined, 4-byte slots with lwz otherwise), releases the
   * frame by adding STACKSIZE to SP, and returns with blr.
   * NOTE(review): the matching saves are in the prologue outside this view;
   * the slot layout here must stay in sync with it.
   */
  2944. LL(999):
  /* addi with base 0 loads the immediate: r3 = 0 */
  2945. addi r3, 0, 0
  2946. lfd f14, 0(SP)
  2947. lfd f15, 8(SP)
  2948. lfd f16, 16(SP)
  2949. lfd f17, 24(SP)
  2950. lfd f18, 32(SP)
  2951. lfd f19, 40(SP)
  2952. lfd f20, 48(SP)
  2953. lfd f21, 56(SP)
  2954. lfd f22, 64(SP)
  2955. lfd f23, 72(SP)
  2956. lfd f24, 80(SP)
  2957. lfd f25, 88(SP)
  2958. lfd f26, 96(SP)
  2959. lfd f27, 104(SP)
  2960. lfd f28, 112(SP)
  2961. lfd f29, 120(SP)
  2962. lfd f30, 128(SP)
  2963. lfd f31, 136(SP)
  2964. #ifdef __64BIT__
  2965. ld r31, 144(SP)
  2966. ld r30, 152(SP)
  2967. ld r29, 160(SP)
  2968. ld r28, 168(SP)
  2969. ld r27, 176(SP)
  2970. ld r26, 184(SP)
  2971. ld r25, 192(SP)
  2972. ld r24, 200(SP)
  2973. ld r23, 208(SP)
  2974. ld r22, 216(SP)
  2975. ld r21, 224(SP)
  2976. ld r20, 232(SP)
  2977. ld r19, 240(SP)
  2978. ld r18, 248(SP)
  2979. #else
  2980. lwz r31, 144(SP)
  2981. lwz r30, 148(SP)
  2982. lwz r29, 152(SP)
  2983. lwz r28, 156(SP)
  2984. lwz r27, 160(SP)
  2985. lwz r26, 164(SP)
  2986. lwz r25, 168(SP)
  2987. lwz r24, 172(SP)
  2988. lwz r23, 176(SP)
  2989. lwz r22, 180(SP)
  2990. lwz r21, 184(SP)
  2991. lwz r20, 188(SP)
  2992. lwz r19, 192(SP)
  2993. lwz r18, 196(SP)
  2994. #endif
  2995. addi SP, SP, STACKSIZE
  2996. blr
  2997. EPILOGUE
  2998. #endif