/* gemv_n.S */
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /*
   * Register aliases and prefetch distances for the GEMV "N" kernel.
   *
   * Argument registers
   *   M, N       element counts: the kernel walks N in groups of eight
   *              (x entries / columns of A) and M in groups of sixteen
   *              (y entries / rows of A).
   *   A, LDA     matrix base pointer and its leading dimension.
   *   X, INCX    input vector and its stride.
   *   Y, INCY    output vector and its stride.
   * Which GPR carries which argument depends on the OS, the word size and
   * (32-bit AIX/Darwin only) on DOUBLE.  Arguments that do not arrive in a
   * register are loaded from the caller's frame into the register named
   * here by the prologue, so these conditions must stay in step with the
   * matching #if ladder in the prologue.
   */
#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define M       r3
#define N       r4
#define A       r6
#define LDA     r7
#define X       r8
#define INCX    r9
#define Y       r10
#define INCY    r5
#else
#define M       r3
#define N       r4
#define A       r7
#define LDA     r8
#define X       r9
#define INCX    r10
#define Y       r5
#define INCY    r6
#endif
#endif

#if defined(_AIX) || defined(__APPLE__)
#if !defined(__64BIT__) && defined(DOUBLE)
#define M       r3
#define N       r4
#define A       r8
#define LDA     r9
#define X       r10
#define INCX    r5
#define Y       r6
#define INCY    r7
#else
#define M       r3
#define N       r4
#define A       r7
#define LDA     r8
#define X       r9
#define INCX    r10
#define Y       r5
#define INCY    r6
#endif
#endif

/* Loop counters. */
#define I       r11
#define J       r12

/* Pointers to the eight columns of A handled per outer iteration
   (AO(k+1) = AO(k) + LDA). */
#define AO1     r14
#define AO2     r15
#define AO3     r16
#define AO4     r17
#define AO5     r18
#define AO6     r19
#define AO7     r20
#define AO8     r21

#define LDA8    r22     /* LDA scaled by 8 columns, in bytes              */
#define Y1      r23     /* running pointer into the y working vector      */
#define PREA    r24     /* byte offset used when prefetching A columns    */
#define PREC    r25     /* byte offset used when prefetching y            */
#define YY      r26     /* base of the y working vector (Y or BUFFER)     */
#define BUFFER  r27     /* scratch vector used when INCY != SIZE          */

/* Sixteen y accumulators (one unrolled row block). */
#define y01     f0
#define y02     f1
#define y03     f2
#define y04     f3
#define y05     f4
#define y06     f5
#define y07     f6
#define y08     f7
#define y09     f8
#define y10     f9
#define y11     f10
#define y12     f11
#define y13     f12
#define y14     f13
#define y15     f14
#define y16     f15

/* alpha * x[j] for the eight columns of the current group. */
#define alpha1  f16
#define alpha2  f17
#define alpha3  f18
#define alpha4  f19
#define alpha5  f20
#define alpha6  f21
#define alpha7  f22
#define alpha8  f23

/* Elements of A in flight. */
#define a1      f24
#define a2      f25
#define a3      f26
#define a4      f27
#define a5      f28
#define a6      f29
#define a7      f30
#define a8      f31

/* NOTE: alpha deliberately aliases a8 (both f31).  The register copy of
   alpha is destroyed by the first load into a8; the prologue also spills
   alpha to the ALPHA stack slot, presumably so it can be reloaded from
   there (the reload is outside this section -- confirm before changing
   either alias). */
#define alpha   f31

/*
 * Prefetch distances, in elements (scaled by SIZE when loaded into
 * PREA / PREC).  PREFETCHSIZE_A applies to the columns of A,
 * PREFETCHSIZE_C to the y working vector.
 */
#if defined(PPCG4)
#define PREFETCHSIZE_A  24
#define PREFETCHSIZE_C  16
#endif

#if defined(PPC440) || defined(PPC440FP2)
#define PREFETCHSIZE_A  24
#define PREFETCHSIZE_C  16
#endif

#ifdef PPC970
#define PREFETCHSIZE_A  16
#define PREFETCHSIZE_C  16
#endif

#ifdef CELL
#define PREFETCHSIZE_A  16
#define PREFETCHSIZE_C  16
#endif

#ifdef POWER3
#define PREFETCHSIZE_A  16
#define PREFETCHSIZE_C  16
#endif

#ifdef POWER4
#define PREFETCHSIZE_A  16
#define PREFETCHSIZE_C  16
#endif

#ifdef POWER5
#define PREFETCHSIZE_A  40
#define PREFETCHSIZE_C  24
#endif

#ifdef POWER6
#define PREFETCHSIZE_A  96
#define PREFETCHSIZE_C  40
#endif

#ifdef POWER8
#define PREFETCHSIZE_A  96
#define PREFETCHSIZE_C  40
#endif

/*
 * Fallback for targets not listed above.  Without it the kernel fails to
 * assemble on any other CPU because PREA / PREC are initialised from these
 * macros.  The distance only feeds a cache-touch hint, so it affects speed,
 * not results; 16 is the most common value in the table above and is a
 * placeholder -- tune per target.
 */
#ifndef PREFETCHSIZE_A
#define PREFETCHSIZE_A  16
#endif

#ifndef PREFETCHSIZE_C
#define PREFETCHSIZE_C  16
#endif
  167. #ifndef NEEDPARAM
  168. #ifndef __64BIT__
  169. #define STACKSIZE 224
  170. #define ALPHA 200(SP)
  171. #define FZERO 208(SP)
  172. #else
  173. #define STACKSIZE 280
  174. #define ALPHA 256(SP)
  175. #define FZERO 264(SP)
  176. #endif
  177. PROLOGUE
  178. PROFCODE
  179. addi SP, SP, -STACKSIZE
  180. li r0, 0
  181. stfd f14, 0(SP)
  182. stfd f15, 8(SP)
  183. stfd f16, 16(SP)
  184. stfd f17, 24(SP)
  185. stfd f18, 32(SP)
  186. stfd f19, 40(SP)
  187. stfd f20, 48(SP)
  188. stfd f21, 56(SP)
  189. stfd f22, 64(SP)
  190. stfd f23, 72(SP)
  191. stfd f24, 80(SP)
  192. stfd f25, 88(SP)
  193. stfd f26, 96(SP)
  194. stfd f27, 104(SP)
  195. stfd f28, 112(SP)
  196. stfd f29, 120(SP)
  197. stfd f30, 128(SP)
  198. stfd f31, 136(SP)
  199. #ifdef __64BIT__
  200. std r0, FZERO
  201. std r14, 144(SP)
  202. std r15, 152(SP)
  203. std r16, 160(SP)
  204. std r17, 168(SP)
  205. std r18, 176(SP)
  206. std r19, 184(SP)
  207. std r20, 192(SP)
  208. std r21, 200(SP)
  209. std r22, 208(SP)
  210. std r23, 216(SP)
  211. std r24, 224(SP)
  212. std r25, 232(SP)
  213. std r26, 240(SP)
  214. std r27, 248(SP)
  215. #else
  216. stw r0, 0 + FZERO
  217. stw r0, 4 + FZERO
  218. stw r14, 144(SP)
  219. stw r15, 148(SP)
  220. stw r16, 152(SP)
  221. stw r17, 156(SP)
  222. stw r18, 160(SP)
  223. stw r19, 164(SP)
  224. stw r20, 168(SP)
  225. stw r21, 172(SP)
  226. stw r22, 176(SP)
  227. stw r23, 180(SP)
  228. stw r24, 184(SP)
  229. stw r25, 188(SP)
  230. stw r26, 192(SP)
  231. stw r27, 196(SP)
  232. #endif
  233. #if defined(linux) || defined(__FreeBSD__)
  234. #ifndef __64BIT__
  235. lwz INCY, FRAMESLOT(0) + STACKSIZE(SP)
  236. lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP)
  237. #else
  238. ld Y, FRAMESLOT(0) + STACKSIZE(SP)
  239. ld INCY, FRAMESLOT(1) + STACKSIZE(SP)
  240. ld BUFFER, FRAMESLOT(2) + STACKSIZE(SP)
  241. #endif
  242. #endif
  243. #if defined(_AIX) || defined(__APPLE__)
  244. #ifndef __64BIT__
  245. #ifdef DOUBLE
  246. lwz INCX, FRAMESLOT(0) + STACKSIZE(SP)
  247. lwz Y, FRAMESLOT(1) + STACKSIZE(SP)
  248. lwz INCY, FRAMESLOT(2) + STACKSIZE(SP)
  249. lwz BUFFER, FRAMESLOT(3) + STACKSIZE(SP)
  250. #else
  251. lwz Y, FRAMESLOT(0) + STACKSIZE(SP)
  252. lwz INCY, FRAMESLOT(1) + STACKSIZE(SP)
  253. lwz BUFFER, FRAMESLOT(2) + STACKSIZE(SP)
  254. #endif
  255. #else
  256. ld Y, FRAMESLOT(0) + STACKSIZE(SP)
  257. ld INCY, FRAMESLOT(1) + STACKSIZE(SP)
  258. ld BUFFER, FRAMESLOT(2) + STACKSIZE(SP)
  259. #endif
  260. #endif
  261. stfd f1, ALPHA
  262. fmr alpha, f1
  263. slwi LDA8, LDA, BASE_SHIFT + 3
  264. slwi LDA, LDA, BASE_SHIFT
  265. slwi INCX, INCX, BASE_SHIFT
  266. slwi INCY, INCY, BASE_SHIFT
  267. li PREA, PREFETCHSIZE_A * SIZE
  268. li PREC, PREFETCHSIZE_C * SIZE
  269. cmpwi cr0, M, 0
  270. ble- LL(999)
  271. cmpwi cr0, N, 0
  272. ble- LL(999)
  273. mr YY, Y
  274. lfd f0, FZERO
  275. cmpi cr0, 0, INCY, SIZE
  276. beq LL(10)
  277. mr YY, BUFFER
  278. mr Y1, BUFFER
  279. addi r0, M, 7
  280. srawi. r0, r0, 3
  281. mtspr CTR, r0
  282. .align 4
  283. LL(02):
  284. STFD f0, 0 * SIZE(Y1)
  285. STFD f0, 1 * SIZE(Y1)
  286. STFD f0, 2 * SIZE(Y1)
  287. STFD f0, 3 * SIZE(Y1)
  288. STFD f0, 4 * SIZE(Y1)
  289. STFD f0, 5 * SIZE(Y1)
  290. STFD f0, 6 * SIZE(Y1)
  291. STFD f0, 7 * SIZE(Y1)
  292. addi Y1, Y1, 8 * SIZE
  293. bdnz LL(02)
  294. .align 4
  295. LL(10):
  296. srawi. J, N, 3
  297. ble LL(20)
  298. .align 4
  299. LL(11):
  300. LFD alpha1, 0 * SIZE(X)
  301. add X, X, INCX
  302. LFD alpha2, 0 * SIZE(X)
  303. add X, X, INCX
  304. LFD alpha3, 0 * SIZE(X)
  305. add X, X, INCX
  306. LFD alpha4, 0 * SIZE(X)
  307. add X, X, INCX
  308. LFD alpha5, 0 * SIZE(X)
  309. add X, X, INCX
  310. LFD alpha6, 0 * SIZE(X)
  311. add X, X, INCX
  312. LFD alpha7, 0 * SIZE(X)
  313. add X, X, INCX
  314. LFD alpha8, 0 * SIZE(X)
  315. add X, X, INCX
  316. FMUL alpha1, alpha, alpha1
  317. FMUL alpha2, alpha, alpha2
  318. FMUL alpha3, alpha, alpha3
  319. FMUL alpha4, alpha, alpha4
  320. FMUL alpha5, alpha, alpha5
  321. FMUL alpha6, alpha, alpha6
  322. FMUL alpha7, alpha, alpha7
  323. FMUL alpha8, alpha, alpha8
  324. mr AO1, A
  325. add AO2, A, LDA
  326. add AO3, AO2, LDA
  327. add AO4, AO3, LDA
  328. add AO5, AO4, LDA
  329. add AO6, AO5, LDA
  330. add AO7, AO6, LDA
  331. add AO8, AO7, LDA
  332. add A, AO8, LDA
  333. mr Y1, YY
  334. srawi. r0, M, 4
  335. mtspr CTR, r0
  336. ble LL(15)
  337. LFD y01, 0 * SIZE(Y1)
  338. LFD y02, 1 * SIZE(Y1)
  339. LFD y03, 2 * SIZE(Y1)
  340. LFD y04, 3 * SIZE(Y1)
  341. LFD y05, 4 * SIZE(Y1)
  342. LFD y06, 5 * SIZE(Y1)
  343. LFD y07, 6 * SIZE(Y1)
  344. LFD y08, 7 * SIZE(Y1)
  345. LFD a1, 0 * SIZE(AO1)
  346. LFD a2, 1 * SIZE(AO1)
  347. LFD a3, 2 * SIZE(AO1)
  348. LFD a4, 3 * SIZE(AO1)
  349. LFD a5, 4 * SIZE(AO1)
  350. LFD a6, 5 * SIZE(AO1)
  351. LFD a7, 6 * SIZE(AO1)
  352. LFD a8, 7 * SIZE(AO1)
  353. LFD y09, 8 * SIZE(Y1)
  354. LFD y10, 9 * SIZE(Y1)
  355. LFD y11, 10 * SIZE(Y1)
  356. LFD y12, 11 * SIZE(Y1)
  357. LFD y13, 12 * SIZE(Y1)
  358. LFD y14, 13 * SIZE(Y1)
  359. LFD y15, 14 * SIZE(Y1)
  360. LFD y16, 15 * SIZE(Y1)
  361. FMADD y01, alpha1, a1, y01
  362. FMADD y02, alpha1, a2, y02
  363. FMADD y03, alpha1, a3, y03
  364. FMADD y04, alpha1, a4, y04
  365. LFD a1, 8 * SIZE(AO1)
  366. LFD a2, 9 * SIZE(AO1)
  367. LFD a3, 10 * SIZE(AO1)
  368. LFD a4, 11 * SIZE(AO1)
  369. FMADD y05, alpha1, a5, y05
  370. FMADD y06, alpha1, a6, y06
  371. FMADD y07, alpha1, a7, y07
  372. FMADD y08, alpha1, a8, y08
  373. LFD a5, 12 * SIZE(AO1)
  374. LFD a6, 13 * SIZE(AO1)
  375. LFD a7, 14 * SIZE(AO1)
  376. LFD a8, 15 * SIZE(AO1)
  377. addi AO1, AO1, 16 * SIZE
  378. nop
  379. nop
  380. DCBT(AO1, PREA)
  381. FMADD y09, alpha1, a1, y09
  382. FMADD y10, alpha1, a2, y10
  383. FMADD y11, alpha1, a3, y11
  384. FMADD y12, alpha1, a4, y12
  385. LFD a1, 0 * SIZE(AO2)
  386. LFD a2, 1 * SIZE(AO2)
  387. LFD a3, 2 * SIZE(AO2)
  388. LFD a4, 3 * SIZE(AO2)
  389. FMADD y13, alpha1, a5, y13
  390. FMADD y14, alpha1, a6, y14
  391. FMADD y15, alpha1, a7, y15
  392. FMADD y16, alpha1, a8, y16
  393. LFD a5, 4 * SIZE(AO2)
  394. LFD a6, 5 * SIZE(AO2)
  395. LFD a7, 6 * SIZE(AO2)
  396. LFD a8, 7 * SIZE(AO2)
  397. FMADD y01, alpha2, a1, y01
  398. FMADD y02, alpha2, a2, y02
  399. FMADD y03, alpha2, a3, y03
  400. FMADD y04, alpha2, a4, y04
  401. LFD a1, 8 * SIZE(AO2)
  402. LFD a2, 9 * SIZE(AO2)
  403. LFD a3, 10 * SIZE(AO2)
  404. LFD a4, 11 * SIZE(AO2)
  405. FMADD y05, alpha2, a5, y05
  406. FMADD y06, alpha2, a6, y06
  407. FMADD y07, alpha2, a7, y07
  408. FMADD y08, alpha2, a8, y08
  409. LFD a5, 12 * SIZE(AO2)
  410. LFD a6, 13 * SIZE(AO2)
  411. LFD a7, 14 * SIZE(AO2)
  412. LFD a8, 15 * SIZE(AO2)
  413. addi AO2, AO2, 16 * SIZE
  414. nop
  415. nop
  416. DCBT(AO2, PREA)
  417. FMADD y09, alpha2, a1, y09
  418. FMADD y10, alpha2, a2, y10
  419. FMADD y11, alpha2, a3, y11
  420. FMADD y12, alpha2, a4, y12
  421. LFD a1, 0 * SIZE(AO3)
  422. LFD a2, 1 * SIZE(AO3)
  423. LFD a3, 2 * SIZE(AO3)
  424. LFD a4, 3 * SIZE(AO3)
  425. FMADD y13, alpha2, a5, y13
  426. FMADD y14, alpha2, a6, y14
  427. FMADD y15, alpha2, a7, y15
  428. FMADD y16, alpha2, a8, y16
  429. LFD a5, 4 * SIZE(AO3)
  430. LFD a6, 5 * SIZE(AO3)
  431. LFD a7, 6 * SIZE(AO3)
  432. LFD a8, 7 * SIZE(AO3)
  433. FMADD y01, alpha3, a1, y01
  434. FMADD y02, alpha3, a2, y02
  435. FMADD y03, alpha3, a3, y03
  436. FMADD y04, alpha3, a4, y04
  437. LFD a1, 8 * SIZE(AO3)
  438. LFD a2, 9 * SIZE(AO3)
  439. LFD a3, 10 * SIZE(AO3)
  440. LFD a4, 11 * SIZE(AO3)
  441. FMADD y05, alpha3, a5, y05
  442. FMADD y06, alpha3, a6, y06
  443. FMADD y07, alpha3, a7, y07
  444. FMADD y08, alpha3, a8, y08
  445. LFD a5, 12 * SIZE(AO3)
  446. LFD a6, 13 * SIZE(AO3)
  447. LFD a7, 14 * SIZE(AO3)
  448. LFD a8, 15 * SIZE(AO3)
  449. addi AO3, AO3, 16 * SIZE
  450. nop
  451. nop
  452. DCBT(AO3, PREA)
  453. FMADD y09, alpha3, a1, y09
  454. FMADD y10, alpha3, a2, y10
  455. FMADD y11, alpha3, a3, y11
  456. FMADD y12, alpha3, a4, y12
  457. LFD a1, 0 * SIZE(AO4)
  458. LFD a2, 1 * SIZE(AO4)
  459. LFD a3, 2 * SIZE(AO4)
  460. LFD a4, 3 * SIZE(AO4)
  461. FMADD y13, alpha3, a5, y13
  462. FMADD y14, alpha3, a6, y14
  463. FMADD y15, alpha3, a7, y15
  464. FMADD y16, alpha3, a8, y16
  465. LFD a5, 4 * SIZE(AO4)
  466. LFD a6, 5 * SIZE(AO4)
  467. LFD a7, 6 * SIZE(AO4)
  468. LFD a8, 7 * SIZE(AO4)
  469. FMADD y01, alpha4, a1, y01
  470. FMADD y02, alpha4, a2, y02
  471. FMADD y03, alpha4, a3, y03
  472. FMADD y04, alpha4, a4, y04
  473. LFD a1, 8 * SIZE(AO4)
  474. LFD a2, 9 * SIZE(AO4)
  475. LFD a3, 10 * SIZE(AO4)
  476. LFD a4, 11 * SIZE(AO4)
  477. FMADD y05, alpha4, a5, y05
  478. FMADD y06, alpha4, a6, y06
  479. FMADD y07, alpha4, a7, y07
  480. FMADD y08, alpha4, a8, y08
  481. LFD a5, 12 * SIZE(AO4)
  482. LFD a6, 13 * SIZE(AO4)
  483. LFD a7, 14 * SIZE(AO4)
  484. LFD a8, 15 * SIZE(AO4)
  485. addi AO4, AO4, 16 * SIZE
  486. nop
  487. nop
  488. DCBT(AO4, PREA)
  489. FMADD y09, alpha4, a1, y09
  490. FMADD y10, alpha4, a2, y10
  491. FMADD y11, alpha4, a3, y11
  492. FMADD y12, alpha4, a4, y12
  493. LFD a1, 0 * SIZE(AO5)
  494. LFD a2, 1 * SIZE(AO5)
  495. LFD a3, 2 * SIZE(AO5)
  496. LFD a4, 3 * SIZE(AO5)
  497. FMADD y13, alpha4, a5, y13
  498. FMADD y14, alpha4, a6, y14
  499. FMADD y15, alpha4, a7, y15
  500. FMADD y16, alpha4, a8, y16
  501. LFD a5, 4 * SIZE(AO5)
  502. LFD a6, 5 * SIZE(AO5)
  503. LFD a7, 6 * SIZE(AO5)
  504. LFD a8, 7 * SIZE(AO5)
  505. FMADD y01, alpha5, a1, y01
  506. FMADD y02, alpha5, a2, y02
  507. FMADD y03, alpha5, a3, y03
  508. FMADD y04, alpha5, a4, y04
  509. LFD a1, 8 * SIZE(AO5)
  510. LFD a2, 9 * SIZE(AO5)
  511. LFD a3, 10 * SIZE(AO5)
  512. LFD a4, 11 * SIZE(AO5)
  513. FMADD y05, alpha5, a5, y05
  514. FMADD y06, alpha5, a6, y06
  515. FMADD y07, alpha5, a7, y07
  516. FMADD y08, alpha5, a8, y08
  517. LFD a5, 12 * SIZE(AO5)
  518. LFD a6, 13 * SIZE(AO5)
  519. LFD a7, 14 * SIZE(AO5)
  520. LFD a8, 15 * SIZE(AO5)
  521. addi AO5, AO5, 16 * SIZE
  522. nop
  523. nop
  524. DCBT(AO5, PREA)
  525. FMADD y09, alpha5, a1, y09
  526. FMADD y10, alpha5, a2, y10
  527. FMADD y11, alpha5, a3, y11
  528. FMADD y12, alpha5, a4, y12
  529. LFD a1, 0 * SIZE(AO6)
  530. LFD a2, 1 * SIZE(AO6)
  531. LFD a3, 2 * SIZE(AO6)
  532. LFD a4, 3 * SIZE(AO6)
  533. FMADD y13, alpha5, a5, y13
  534. FMADD y14, alpha5, a6, y14
  535. FMADD y15, alpha5, a7, y15
  536. FMADD y16, alpha5, a8, y16
  537. LFD a5, 4 * SIZE(AO6)
  538. LFD a6, 5 * SIZE(AO6)
  539. LFD a7, 6 * SIZE(AO6)
  540. LFD a8, 7 * SIZE(AO6)
  541. FMADD y01, alpha6, a1, y01
  542. FMADD y02, alpha6, a2, y02
  543. FMADD y03, alpha6, a3, y03
  544. FMADD y04, alpha6, a4, y04
  545. LFD a1, 8 * SIZE(AO6)
  546. LFD a2, 9 * SIZE(AO6)
  547. LFD a3, 10 * SIZE(AO6)
  548. LFD a4, 11 * SIZE(AO6)
  549. FMADD y05, alpha6, a5, y05
  550. FMADD y06, alpha6, a6, y06
  551. FMADD y07, alpha6, a7, y07
  552. FMADD y08, alpha6, a8, y08
  553. LFD a5, 12 * SIZE(AO6)
  554. LFD a6, 13 * SIZE(AO6)
  555. LFD a7, 14 * SIZE(AO6)
  556. LFD a8, 15 * SIZE(AO6)
  557. addi AO6, AO6, 16 * SIZE
  558. nop
  559. nop
  560. DCBT(AO6, PREA)
  561. FMADD y09, alpha6, a1, y09
  562. FMADD y10, alpha6, a2, y10
  563. FMADD y11, alpha6, a3, y11
  564. FMADD y12, alpha6, a4, y12
  565. LFD a1, 0 * SIZE(AO7)
  566. LFD a2, 1 * SIZE(AO7)
  567. LFD a3, 2 * SIZE(AO7)
  568. LFD a4, 3 * SIZE(AO7)
  569. FMADD y13, alpha6, a5, y13
  570. FMADD y14, alpha6, a6, y14
  571. FMADD y15, alpha6, a7, y15
  572. FMADD y16, alpha6, a8, y16
  573. LFD a5, 4 * SIZE(AO7)
  574. LFD a6, 5 * SIZE(AO7)
  575. LFD a7, 6 * SIZE(AO7)
  576. LFD a8, 7 * SIZE(AO7)
  577. FMADD y01, alpha7, a1, y01
  578. FMADD y02, alpha7, a2, y02
  579. FMADD y03, alpha7, a3, y03
  580. FMADD y04, alpha7, a4, y04
  581. LFD a1, 8 * SIZE(AO7)
  582. LFD a2, 9 * SIZE(AO7)
  583. LFD a3, 10 * SIZE(AO7)
  584. LFD a4, 11 * SIZE(AO7)
  585. FMADD y05, alpha7, a5, y05
  586. FMADD y06, alpha7, a6, y06
  587. FMADD y07, alpha7, a7, y07
  588. FMADD y08, alpha7, a8, y08
  589. LFD a5, 12 * SIZE(AO7)
  590. LFD a6, 13 * SIZE(AO7)
  591. LFD a7, 14 * SIZE(AO7)
  592. LFD a8, 15 * SIZE(AO7)
  593. addi AO7, AO7, 16 * SIZE
  594. nop
  595. nop
  596. DCBT(AO7, PREA)
  597. FMADD y09, alpha7, a1, y09
  598. FMADD y10, alpha7, a2, y10
  599. FMADD y11, alpha7, a3, y11
  600. FMADD y12, alpha7, a4, y12
  601. LFD a1, 0 * SIZE(AO8)
  602. LFD a2, 1 * SIZE(AO8)
  603. LFD a3, 2 * SIZE(AO8)
  604. LFD a4, 3 * SIZE(AO8)
  605. FMADD y13, alpha7, a5, y13
  606. FMADD y14, alpha7, a6, y14
  607. FMADD y15, alpha7, a7, y15
  608. FMADD y16, alpha7, a8, y16
  609. LFD a5, 4 * SIZE(AO8)
  610. LFD a6, 5 * SIZE(AO8)
  611. LFD a7, 6 * SIZE(AO8)
  612. LFD a8, 7 * SIZE(AO8)
  613. FMADD y01, alpha8, a1, y01
  614. FMADD y02, alpha8, a2, y02
  615. FMADD y03, alpha8, a3, y03
  616. FMADD y04, alpha8, a4, y04
  617. LFD a1, 8 * SIZE(AO8)
  618. LFD a2, 9 * SIZE(AO8)
  619. LFD a3, 10 * SIZE(AO8)
  620. LFD a4, 11 * SIZE(AO8)
  621. FMADD y05, alpha8, a5, y05
  622. FMADD y06, alpha8, a6, y06
  623. FMADD y07, alpha8, a7, y07
  624. FMADD y08, alpha8, a8, y08
  625. LFD a5, 12 * SIZE(AO8)
  626. LFD a6, 13 * SIZE(AO8)
  627. LFD a7, 14 * SIZE(AO8)
  628. LFD a8, 15 * SIZE(AO8)
  629. addi AO8, AO8, 16 * SIZE
  630. nop
  631. nop
  632. DCBT(AO8, PREA)
  633. FMADD y09, alpha8, a1, y09
  634. FMADD y10, alpha8, a2, y10
  635. FMADD y11, alpha8, a3, y11
  636. FMADD y12, alpha8, a4, y12
  637. LFD a1, 0 * SIZE(AO1)
  638. LFD a2, 1 * SIZE(AO1)
  639. LFD a3, 2 * SIZE(AO1)
  640. LFD a4, 3 * SIZE(AO1)
  641. FMADD y13, alpha8, a5, y13
  642. FMADD y14, alpha8, a6, y14
  643. FMADD y15, alpha8, a7, y15
  644. FMADD y16, alpha8, a8, y16
  645. LFD a5, 4 * SIZE(AO1)
  646. LFD a6, 5 * SIZE(AO1)
  647. LFD a7, 6 * SIZE(AO1)
  648. LFD a8, 7 * SIZE(AO1)
  649. STFD y01, 0 * SIZE(Y1)
  650. STFD y02, 1 * SIZE(Y1)
  651. STFD y03, 2 * SIZE(Y1)
  652. STFD y04, 3 * SIZE(Y1)
  653. LFD y01, 16 * SIZE(Y1)
  654. LFD y02, 17 * SIZE(Y1)
  655. LFD y03, 18 * SIZE(Y1)
  656. LFD y04, 19 * SIZE(Y1)
  657. DCBT(Y1, PREC)
  658. bdz LL(13)
  /*
   * LL(12): steady-state loop of the 8-column panel, repeated by `bdnz`
   * until CTR reaches zero.  Each pass accumulates one 16-element block of
   * Y1 (registers y01..y16) with alphaK * AOk[i] for K = 1..8.
   * FMADD/LFD/STFD/DCBT/SIZE are macros defined outside this chunk; the
   * comments assume FMADD d, a, c, b computes d = a * c + b -- TODO confirm.
   *
   * The loop is software-pipelined around Y1:
   *   - y01..y04 already hold Y1[16..19] on entry (loaded just before the
   *     loop, or at the bottom of the previous pass);
   *   - y05..y16 still hold the finished results of the previous block and
   *     are stored to Y1[4..15] before being reloaded from Y1[20..31];
   *   - a1..a8 already hold AO1[0..7] of the block being processed.
   * NOTE(review): on the final pass the lookahead loads (Y1[32..35] before
   * the pointer bump, AO1[0..7] after it) read beyond the block just
   * processed; LL(13) does not store those values -- confirm the buffers
   * tolerate the over-read.
   */
  659. .align 4
  660. LL(12):
  661. FMADD y01, alpha1, a1, y01
  662. FMADD y02, alpha1, a2, y02
  663. FMADD y03, alpha1, a3, y03
  664. FMADD y04, alpha1, a4, y04
  665. LFD a1, 8 * SIZE(AO1)
  666. LFD a2, 9 * SIZE(AO1)
  667. LFD a3, 10 * SIZE(AO1)
  668. LFD a4, 11 * SIZE(AO1)
  /* Retire y05..y08 of the previous block, then fetch this block's values. */
  669. STFD y05, 4 * SIZE(Y1)
  670. STFD y06, 5 * SIZE(Y1)
  671. STFD y07, 6 * SIZE(Y1)
  672. STFD y08, 7 * SIZE(Y1)
  673. LFD y05, 20 * SIZE(Y1)
  674. LFD y06, 21 * SIZE(Y1)
  675. LFD y07, 22 * SIZE(Y1)
  676. LFD y08, 23 * SIZE(Y1)
  677. FMADD y05, alpha1, a5, y05
  678. FMADD y06, alpha1, a6, y06
  679. FMADD y07, alpha1, a7, y07
  680. FMADD y08, alpha1, a8, y08
  681. LFD a5, 12 * SIZE(AO1)
  682. LFD a6, 13 * SIZE(AO1)
  683. LFD a7, 14 * SIZE(AO1)
  684. LFD a8, 15 * SIZE(AO1)
  /* Same retire-then-reload step for y09..y12. */
  685. STFD y09, 8 * SIZE(Y1)
  686. STFD y10, 9 * SIZE(Y1)
  687. STFD y11, 10 * SIZE(Y1)
  688. STFD y12, 11 * SIZE(Y1)
  689. LFD y09, 24 * SIZE(Y1)
  690. LFD y10, 25 * SIZE(Y1)
  691. LFD y11, 26 * SIZE(Y1)
  692. LFD y12, 27 * SIZE(Y1)
  693. FMADD y09, alpha1, a1, y09
  694. FMADD y10, alpha1, a2, y10
  695. FMADD y11, alpha1, a3, y11
  696. FMADD y12, alpha1, a4, y12
  697. LFD a1, 0 * SIZE(AO2)
  698. LFD a2, 1 * SIZE(AO2)
  699. LFD a3, 2 * SIZE(AO2)
  700. LFD a4, 3 * SIZE(AO2)
  /* Same retire-then-reload step for y13..y16. */
  701. STFD y13, 12 * SIZE(Y1)
  702. STFD y14, 13 * SIZE(Y1)
  703. STFD y15, 14 * SIZE(Y1)
  704. STFD y16, 15 * SIZE(Y1)
  705. LFD y13, 28 * SIZE(Y1)
  706. LFD y14, 29 * SIZE(Y1)
  707. LFD y15, 30 * SIZE(Y1)
  708. LFD y16, 31 * SIZE(Y1)
  709. FMADD y13, alpha1, a5, y13
  710. FMADD y14, alpha1, a6, y14
  711. FMADD y15, alpha1, a7, y15
  712. FMADD y16, alpha1, a8, y16
  713. LFD a5, 4 * SIZE(AO2)
  714. LFD a6, 5 * SIZE(AO2)
  715. LFD a7, 6 * SIZE(AO2)
  716. LFD a8, 7 * SIZE(AO2)
  /* alpha2 * AO2 contributions; AO3 loads are interleaved for the next step. */
  717. FMADD y01, alpha2, a1, y01
  718. FMADD y02, alpha2, a2, y02
  719. FMADD y03, alpha2, a3, y03
  720. FMADD y04, alpha2, a4, y04
  721. LFD a1, 8 * SIZE(AO2)
  722. LFD a2, 9 * SIZE(AO2)
  723. LFD a3, 10 * SIZE(AO2)
  724. LFD a4, 11 * SIZE(AO2)
  725. FMADD y05, alpha2, a5, y05
  726. FMADD y06, alpha2, a6, y06
  727. FMADD y07, alpha2, a7, y07
  728. FMADD y08, alpha2, a8, y08
  729. LFD a5, 12 * SIZE(AO2)
  730. LFD a6, 13 * SIZE(AO2)
  731. LFD a7, 14 * SIZE(AO2)
  732. LFD a8, 15 * SIZE(AO2)
  733. FMADD y09, alpha2, a1, y09
  734. FMADD y10, alpha2, a2, y10
  735. FMADD y11, alpha2, a3, y11
  736. FMADD y12, alpha2, a4, y12
  737. LFD a1, 0 * SIZE(AO3)
  738. LFD a2, 1 * SIZE(AO3)
  739. LFD a3, 2 * SIZE(AO3)
  740. LFD a4, 3 * SIZE(AO3)
  741. FMADD y13, alpha2, a5, y13
  742. FMADD y14, alpha2, a6, y14
  743. FMADD y15, alpha2, a7, y15
  744. FMADD y16, alpha2, a8, y16
  745. LFD a5, 4 * SIZE(AO3)
  746. LFD a6, 5 * SIZE(AO3)
  747. LFD a7, 6 * SIZE(AO3)
  748. LFD a8, 7 * SIZE(AO3)
  /* alpha3 * AO3 contributions. */
  749. FMADD y01, alpha3, a1, y01
  750. FMADD y02, alpha3, a2, y02
  751. FMADD y03, alpha3, a3, y03
  752. FMADD y04, alpha3, a4, y04
  753. LFD a1, 8 * SIZE(AO3)
  754. LFD a2, 9 * SIZE(AO3)
  755. LFD a3, 10 * SIZE(AO3)
  756. LFD a4, 11 * SIZE(AO3)
  757. FMADD y05, alpha3, a5, y05
  758. FMADD y06, alpha3, a6, y06
  759. FMADD y07, alpha3, a7, y07
  760. FMADD y08, alpha3, a8, y08
  761. LFD a5, 12 * SIZE(AO3)
  762. LFD a6, 13 * SIZE(AO3)
  763. LFD a7, 14 * SIZE(AO3)
  764. LFD a8, 15 * SIZE(AO3)
  765. FMADD y09, alpha3, a1, y09
  766. FMADD y10, alpha3, a2, y10
  767. FMADD y11, alpha3, a3, y11
  768. FMADD y12, alpha3, a4, y12
  769. LFD a1, 0 * SIZE(AO4)
  770. LFD a2, 1 * SIZE(AO4)
  771. LFD a3, 2 * SIZE(AO4)
  772. LFD a4, 3 * SIZE(AO4)
  773. FMADD y13, alpha3, a5, y13
  774. FMADD y14, alpha3, a6, y14
  775. FMADD y15, alpha3, a7, y15
  776. FMADD y16, alpha3, a8, y16
  777. LFD a5, 4 * SIZE(AO4)
  778. LFD a6, 5 * SIZE(AO4)
  779. LFD a7, 6 * SIZE(AO4)
  780. LFD a8, 7 * SIZE(AO4)
  /* alpha4 * AO4 contributions. */
  781. FMADD y01, alpha4, a1, y01
  782. FMADD y02, alpha4, a2, y02
  783. FMADD y03, alpha4, a3, y03
  784. FMADD y04, alpha4, a4, y04
  785. LFD a1, 8 * SIZE(AO4)
  786. LFD a2, 9 * SIZE(AO4)
  787. LFD a3, 10 * SIZE(AO4)
  788. LFD a4, 11 * SIZE(AO4)
  789. FMADD y05, alpha4, a5, y05
  790. FMADD y06, alpha4, a6, y06
  791. FMADD y07, alpha4, a7, y07
  792. FMADD y08, alpha4, a8, y08
  793. LFD a5, 12 * SIZE(AO4)
  794. LFD a6, 13 * SIZE(AO4)
  795. LFD a7, 14 * SIZE(AO4)
  796. LFD a8, 15 * SIZE(AO4)
  /*
   * All 16 entries of AO1..AO4 for this block are now in registers or
   * already consumed: step those four pointers to the next block and issue
   * DCBT on each (presumably a data-cache prefetch hint at offset PREA).
   */
  797. addi AO1, AO1, 16 * SIZE
  798. addi AO2, AO2, 16 * SIZE
  799. addi AO3, AO3, 16 * SIZE
  800. addi AO4, AO4, 16 * SIZE
  801. DCBT(AO1, PREA)
  802. DCBT(AO2, PREA)
  803. DCBT(AO3, PREA)
  804. DCBT(AO4, PREA)
  805. FMADD y09, alpha4, a1, y09
  806. FMADD y10, alpha4, a2, y10
  807. FMADD y11, alpha4, a3, y11
  808. FMADD y12, alpha4, a4, y12
  809. LFD a1, 0 * SIZE(AO5)
  810. LFD a2, 1 * SIZE(AO5)
  811. LFD a3, 2 * SIZE(AO5)
  812. LFD a4, 3 * SIZE(AO5)
  813. FMADD y13, alpha4, a5, y13
  814. FMADD y14, alpha4, a6, y14
  815. FMADD y15, alpha4, a7, y15
  816. FMADD y16, alpha4, a8, y16
  817. LFD a5, 4 * SIZE(AO5)
  818. LFD a6, 5 * SIZE(AO5)
  819. LFD a7, 6 * SIZE(AO5)
  820. LFD a8, 7 * SIZE(AO5)
  /* alpha5 * AO5 contributions. */
  821. FMADD y01, alpha5, a1, y01
  822. FMADD y02, alpha5, a2, y02
  823. FMADD y03, alpha5, a3, y03
  824. FMADD y04, alpha5, a4, y04
  825. LFD a1, 8 * SIZE(AO5)
  826. LFD a2, 9 * SIZE(AO5)
  827. LFD a3, 10 * SIZE(AO5)
  828. LFD a4, 11 * SIZE(AO5)
  829. FMADD y05, alpha5, a5, y05
  830. FMADD y06, alpha5, a6, y06
  831. FMADD y07, alpha5, a7, y07
  832. FMADD y08, alpha5, a8, y08
  833. LFD a5, 12 * SIZE(AO5)
  834. LFD a6, 13 * SIZE(AO5)
  835. LFD a7, 14 * SIZE(AO5)
  836. LFD a8, 15 * SIZE(AO5)
  837. FMADD y09, alpha5, a1, y09
  838. FMADD y10, alpha5, a2, y10
  839. FMADD y11, alpha5, a3, y11
  840. FMADD y12, alpha5, a4, y12
  841. LFD a1, 0 * SIZE(AO6)
  842. LFD a2, 1 * SIZE(AO6)
  843. LFD a3, 2 * SIZE(AO6)
  844. LFD a4, 3 * SIZE(AO6)
  845. FMADD y13, alpha5, a5, y13
  846. FMADD y14, alpha5, a6, y14
  847. FMADD y15, alpha5, a7, y15
  848. FMADD y16, alpha5, a8, y16
  849. LFD a5, 4 * SIZE(AO6)
  850. LFD a6, 5 * SIZE(AO6)
  851. LFD a7, 6 * SIZE(AO6)
  852. LFD a8, 7 * SIZE(AO6)
  /* alpha6 * AO6 contributions. */
  853. FMADD y01, alpha6, a1, y01
  854. FMADD y02, alpha6, a2, y02
  855. FMADD y03, alpha6, a3, y03
  856. FMADD y04, alpha6, a4, y04
  857. LFD a1, 8 * SIZE(AO6)
  858. LFD a2, 9 * SIZE(AO6)
  859. LFD a3, 10 * SIZE(AO6)
  860. LFD a4, 11 * SIZE(AO6)
  861. FMADD y05, alpha6, a5, y05
  862. FMADD y06, alpha6, a6, y06
  863. FMADD y07, alpha6, a7, y07
  864. FMADD y08, alpha6, a8, y08
  865. LFD a5, 12 * SIZE(AO6)
  866. LFD a6, 13 * SIZE(AO6)
  867. LFD a7, 14 * SIZE(AO6)
  868. LFD a8, 15 * SIZE(AO6)
  869. FMADD y09, alpha6, a1, y09
  870. FMADD y10, alpha6, a2, y10
  871. FMADD y11, alpha6, a3, y11
  872. FMADD y12, alpha6, a4, y12
  873. LFD a1, 0 * SIZE(AO7)
  874. LFD a2, 1 * SIZE(AO7)
  875. LFD a3, 2 * SIZE(AO7)
  876. LFD a4, 3 * SIZE(AO7)
  877. FMADD y13, alpha6, a5, y13
  878. FMADD y14, alpha6, a6, y14
  879. FMADD y15, alpha6, a7, y15
  880. FMADD y16, alpha6, a8, y16
  881. LFD a5, 4 * SIZE(AO7)
  882. LFD a6, 5 * SIZE(AO7)
  883. LFD a7, 6 * SIZE(AO7)
  884. LFD a8, 7 * SIZE(AO7)
  /* alpha7 * AO7 contributions. */
  885. FMADD y01, alpha7, a1, y01
  886. FMADD y02, alpha7, a2, y02
  887. FMADD y03, alpha7, a3, y03
  888. FMADD y04, alpha7, a4, y04
  889. LFD a1, 8 * SIZE(AO7)
  890. LFD a2, 9 * SIZE(AO7)
  891. LFD a3, 10 * SIZE(AO7)
  892. LFD a4, 11 * SIZE(AO7)
  893. FMADD y05, alpha7, a5, y05
  894. FMADD y06, alpha7, a6, y06
  895. FMADD y07, alpha7, a7, y07
  896. FMADD y08, alpha7, a8, y08
  897. LFD a5, 12 * SIZE(AO7)
  898. LFD a6, 13 * SIZE(AO7)
  899. LFD a7, 14 * SIZE(AO7)
  900. LFD a8, 15 * SIZE(AO7)
  901. FMADD y09, alpha7, a1, y09
  902. FMADD y10, alpha7, a2, y10
  903. FMADD y11, alpha7, a3, y11
  904. FMADD y12, alpha7, a4, y12
  905. LFD a1, 0 * SIZE(AO8)
  906. LFD a2, 1 * SIZE(AO8)
  907. LFD a3, 2 * SIZE(AO8)
  908. LFD a4, 3 * SIZE(AO8)
  909. FMADD y13, alpha7, a5, y13
  910. FMADD y14, alpha7, a6, y14
  911. FMADD y15, alpha7, a7, y15
  912. FMADD y16, alpha7, a8, y16
  913. LFD a5, 4 * SIZE(AO8)
  914. LFD a6, 5 * SIZE(AO8)
  915. LFD a7, 6 * SIZE(AO8)
  916. LFD a8, 7 * SIZE(AO8)
  /* alpha8 * AO8 contributions. */
  917. FMADD y01, alpha8, a1, y01
  918. FMADD y02, alpha8, a2, y02
  919. FMADD y03, alpha8, a3, y03
  920. FMADD y04, alpha8, a4, y04
  921. LFD a1, 8 * SIZE(AO8)
  922. LFD a2, 9 * SIZE(AO8)
  923. LFD a3, 10 * SIZE(AO8)
  924. LFD a4, 11 * SIZE(AO8)
  925. FMADD y05, alpha8, a5, y05
  926. FMADD y06, alpha8, a6, y06
  927. FMADD y07, alpha8, a7, y07
  928. FMADD y08, alpha8, a8, y08
  929. LFD a5, 12 * SIZE(AO8)
  930. LFD a6, 13 * SIZE(AO8)
  931. LFD a7, 14 * SIZE(AO8)
  932. LFD a8, 15 * SIZE(AO8)
  /* AO5..AO8 fully loaded for this block: step them and issue DCBT as above. */
  933. addi AO5, AO5, 16 * SIZE
  934. addi AO6, AO6, 16 * SIZE
  935. addi AO7, AO7, 16 * SIZE
  936. addi AO8, AO8, 16 * SIZE
  937. DCBT(AO5, PREA)
  938. DCBT(AO6, PREA)
  939. DCBT(AO7, PREA)
  940. DCBT(AO8, PREA)
  /*
   * Finish the alpha8 pass on y09..y16 while refilling a1..a8 from the
   * already-advanced AO1, i.e. the first eight entries of the next block.
   */
  941. FMADD y09, alpha8, a1, y09
  942. FMADD y10, alpha8, a2, y10
  943. FMADD y11, alpha8, a3, y11
  944. FMADD y12, alpha8, a4, y12
  945. LFD a1, 0 * SIZE(AO1)
  946. LFD a2, 1 * SIZE(AO1)
  947. LFD a3, 2 * SIZE(AO1)
  948. LFD a4, 3 * SIZE(AO1)
  949. FMADD y13, alpha8, a5, y13
  950. FMADD y14, alpha8, a6, y14
  951. FMADD y15, alpha8, a7, y15
  952. FMADD y16, alpha8, a8, y16
  953. LFD a5, 4 * SIZE(AO1)
  954. LFD a6, 5 * SIZE(AO1)
  955. LFD a7, 6 * SIZE(AO1)
  956. LFD a8, 7 * SIZE(AO1)
  /*
   * y01..y04 of this block are complete: store them at Y1[16..19] (Y1 has
   * not been advanced yet) and load the next block's values from
   * Y1[32..35].  y05..y16 stay in registers until the next pass or LL(13).
   */
  957. STFD y01, 16 * SIZE(Y1)
  958. STFD y02, 17 * SIZE(Y1)
  959. STFD y03, 18 * SIZE(Y1)
  960. STFD y04, 19 * SIZE(Y1)
  961. LFD y01, 32 * SIZE(Y1)
  962. LFD y02, 33 * SIZE(Y1)
  963. LFD y03, 34 * SIZE(Y1)
  964. LFD y04, 35 * SIZE(Y1)
  965. DCBT(Y1, PREC)
  /* Advance Y1 by one block; bdnz decrements CTR and loops while it is non-zero. */
  966. addi Y1, Y1, 16 * SIZE
  967. bdnz LL(12)
  /*
   * LL(13): drain step after the pipelined 8-column loop.  Reached either by
   * falling out of LL(12) or via the `bdz LL(13)` that precedes it.  On both
   * paths y01..y04 of the last block have already been stored, so only
   * y05..y16 remain; they go to Y1[4..15] and Y1 then moves past the block.
   */
  968. .align 4
  969. LL(13):
  970. STFD y05, 4 * SIZE(Y1)
  971. STFD y06, 5 * SIZE(Y1)
  972. STFD y07, 6 * SIZE(Y1)
  973. STFD y08, 7 * SIZE(Y1)
  974. STFD y09, 8 * SIZE(Y1)
  975. STFD y10, 9 * SIZE(Y1)
  976. STFD y11, 10 * SIZE(Y1)
  977. STFD y12, 11 * SIZE(Y1)
  978. STFD y13, 12 * SIZE(Y1)
  979. STFD y14, 13 * SIZE(Y1)
  980. STFD y15, 14 * SIZE(Y1)
  981. STFD y16, 15 * SIZE(Y1)
  982. addi Y1, Y1, 16 * SIZE
  /*
   * LL(15): start of the remainder handling (M mod 16) for the 8-column
   * panel.  If no remainder exists, control goes straight to LL(19).
   * Otherwise bit 3 of M selects this 8-element step:
   *   Y1[i] += sum over K = 1..8 of alphaK * AOk[i],  i = 0..7
   * (assuming FMADD d, a, c, b computes d = a * c + b; the macro is defined
   * outside this chunk).  Afterwards AO1..AO8 and Y1 advance by 8 elements.
   * The result of `andi.` is never negative here, so each `ble` acts as a
   * branch-if-zero.
   */
  983. .align 4
  984. LL(15):
  985. andi. r0, M, 15
  986. ble LL(19)
  987. andi. r0, M, 8
  988. ble LL(16)
  /* Load Y1[0..7] and AO1[0..7]; nothing is carried in from the main loop. */
  989. LFD y01, 0 * SIZE(Y1)
  990. LFD y02, 1 * SIZE(Y1)
  991. LFD y03, 2 * SIZE(Y1)
  992. LFD y04, 3 * SIZE(Y1)
  993. LFD a1, 0 * SIZE(AO1)
  994. LFD a2, 1 * SIZE(AO1)
  995. LFD a3, 2 * SIZE(AO1)
  996. LFD a4, 3 * SIZE(AO1)
  997. LFD y05, 4 * SIZE(Y1)
  998. LFD y06, 5 * SIZE(Y1)
  999. LFD y07, 6 * SIZE(Y1)
  1000. LFD y08, 7 * SIZE(Y1)
  1001. LFD a5, 4 * SIZE(AO1)
  1002. LFD a6, 5 * SIZE(AO1)
  1003. LFD a7, 6 * SIZE(AO1)
  1004. LFD a8, 7 * SIZE(AO1)
  /* Each FMADD consumes aN and is followed by the reload of aN from the next column pointer. */
  1005. FMADD y01, alpha1, a1, y01
  1006. LFD a1, 0 * SIZE(AO2)
  1007. FMADD y02, alpha1, a2, y02
  1008. LFD a2, 1 * SIZE(AO2)
  1009. FMADD y03, alpha1, a3, y03
  1010. LFD a3, 2 * SIZE(AO2)
  1011. FMADD y04, alpha1, a4, y04
  1012. LFD a4, 3 * SIZE(AO2)
  1013. FMADD y05, alpha1, a5, y05
  1014. LFD a5, 4 * SIZE(AO2)
  1015. FMADD y06, alpha1, a6, y06
  1016. LFD a6, 5 * SIZE(AO2)
  1017. FMADD y07, alpha1, a7, y07
  1018. LFD a7, 6 * SIZE(AO2)
  1019. FMADD y08, alpha1, a8, y08
  1020. LFD a8, 7 * SIZE(AO2)
  1021. FMADD y01, alpha2, a1, y01
  1022. LFD a1, 0 * SIZE(AO3)
  1023. FMADD y02, alpha2, a2, y02
  1024. LFD a2, 1 * SIZE(AO3)
  1025. FMADD y03, alpha2, a3, y03
  1026. LFD a3, 2 * SIZE(AO3)
  1027. FMADD y04, alpha2, a4, y04
  1028. LFD a4, 3 * SIZE(AO3)
  1029. FMADD y05, alpha2, a5, y05
  1030. LFD a5, 4 * SIZE(AO3)
  1031. FMADD y06, alpha2, a6, y06
  1032. LFD a6, 5 * SIZE(AO3)
  1033. FMADD y07, alpha2, a7, y07
  1034. LFD a7, 6 * SIZE(AO3)
  1035. FMADD y08, alpha2, a8, y08
  1036. LFD a8, 7 * SIZE(AO3)
  1037. FMADD y01, alpha3, a1, y01
  1038. LFD a1, 0 * SIZE(AO4)
  1039. FMADD y02, alpha3, a2, y02
  1040. LFD a2, 1 * SIZE(AO4)
  1041. FMADD y03, alpha3, a3, y03
  1042. LFD a3, 2 * SIZE(AO4)
  1043. FMADD y04, alpha3, a4, y04
  1044. LFD a4, 3 * SIZE(AO4)
  1045. FMADD y05, alpha3, a5, y05
  1046. LFD a5, 4 * SIZE(AO4)
  1047. FMADD y06, alpha3, a6, y06
  1048. LFD a6, 5 * SIZE(AO4)
  1049. FMADD y07, alpha3, a7, y07
  1050. LFD a7, 6 * SIZE(AO4)
  1051. FMADD y08, alpha3, a8, y08
  1052. LFD a8, 7 * SIZE(AO4)
  1053. FMADD y01, alpha4, a1, y01
  1054. LFD a1, 0 * SIZE(AO5)
  1055. FMADD y02, alpha4, a2, y02
  1056. LFD a2, 1 * SIZE(AO5)
  1057. FMADD y03, alpha4, a3, y03
  1058. LFD a3, 2 * SIZE(AO5)
  1059. FMADD y04, alpha4, a4, y04
  1060. LFD a4, 3 * SIZE(AO5)
  1061. FMADD y05, alpha4, a5, y05
  1062. LFD a5, 4 * SIZE(AO5)
  1063. FMADD y06, alpha4, a6, y06
  1064. LFD a6, 5 * SIZE(AO5)
  1065. FMADD y07, alpha4, a7, y07
  1066. LFD a7, 6 * SIZE(AO5)
  1067. FMADD y08, alpha4, a8, y08
  1068. LFD a8, 7 * SIZE(AO5)
  1069. FMADD y01, alpha5, a1, y01
  1070. LFD a1, 0 * SIZE(AO6)
  1071. FMADD y02, alpha5, a2, y02
  1072. LFD a2, 1 * SIZE(AO6)
  1073. FMADD y03, alpha5, a3, y03
  1074. LFD a3, 2 * SIZE(AO6)
  1075. FMADD y04, alpha5, a4, y04
  1076. LFD a4, 3 * SIZE(AO6)
  1077. FMADD y05, alpha5, a5, y05
  1078. LFD a5, 4 * SIZE(AO6)
  1079. FMADD y06, alpha5, a6, y06
  1080. LFD a6, 5 * SIZE(AO6)
  1081. FMADD y07, alpha5, a7, y07
  1082. LFD a7, 6 * SIZE(AO6)
  1083. FMADD y08, alpha5, a8, y08
  1084. LFD a8, 7 * SIZE(AO6)
  1085. FMADD y01, alpha6, a1, y01
  1086. LFD a1, 0 * SIZE(AO7)
  1087. FMADD y02, alpha6, a2, y02
  1088. LFD a2, 1 * SIZE(AO7)
  1089. FMADD y03, alpha6, a3, y03
  1090. LFD a3, 2 * SIZE(AO7)
  1091. FMADD y04, alpha6, a4, y04
  1092. LFD a4, 3 * SIZE(AO7)
  1093. FMADD y05, alpha6, a5, y05
  1094. LFD a5, 4 * SIZE(AO7)
  1095. FMADD y06, alpha6, a6, y06
  1096. LFD a6, 5 * SIZE(AO7)
  1097. FMADD y07, alpha6, a7, y07
  1098. LFD a7, 6 * SIZE(AO7)
  1099. FMADD y08, alpha6, a8, y08
  1100. LFD a8, 7 * SIZE(AO7)
  1101. FMADD y01, alpha7, a1, y01
  1102. LFD a1, 0 * SIZE(AO8)
  1103. FMADD y02, alpha7, a2, y02
  1104. LFD a2, 1 * SIZE(AO8)
  1105. FMADD y03, alpha7, a3, y03
  1106. LFD a3, 2 * SIZE(AO8)
  1107. FMADD y04, alpha7, a4, y04
  1108. LFD a4, 3 * SIZE(AO8)
  1109. FMADD y05, alpha7, a5, y05
  1110. LFD a5, 4 * SIZE(AO8)
  1111. FMADD y06, alpha7, a6, y06
  1112. LFD a6, 5 * SIZE(AO8)
  1113. FMADD y07, alpha7, a7, y07
  1114. LFD a7, 6 * SIZE(AO8)
  1115. FMADD y08, alpha7, a8, y08
  1116. LFD a8, 7 * SIZE(AO8)
  /* Last column: no more loads are needed, so the pointer bumps fill the gaps between FMADDs. */
  1117. FMADD y01, alpha8, a1, y01
  1118. addi AO1, AO1, 8 * SIZE
  1119. FMADD y02, alpha8, a2, y02
  1120. addi AO2, AO2, 8 * SIZE
  1121. FMADD y03, alpha8, a3, y03
  1122. addi AO3, AO3, 8 * SIZE
  1123. FMADD y04, alpha8, a4, y04
  1124. addi AO4, AO4, 8 * SIZE
  1125. STFD y01, 0 * SIZE(Y1)
  1126. STFD y02, 1 * SIZE(Y1)
  1127. STFD y03, 2 * SIZE(Y1)
  1128. STFD y04, 3 * SIZE(Y1)
  1129. FMADD y05, alpha8, a5, y05
  1130. addi AO5, AO5, 8 * SIZE
  1131. FMADD y06, alpha8, a6, y06
  1132. addi AO6, AO6, 8 * SIZE
  1133. FMADD y07, alpha8, a7, y07
  1134. addi AO7, AO7, 8 * SIZE
  1135. FMADD y08, alpha8, a8, y08
  1136. addi AO8, AO8, 8 * SIZE
  1137. STFD y05, 4 * SIZE(Y1)
  1138. STFD y06, 5 * SIZE(Y1)
  1139. STFD y07, 6 * SIZE(Y1)
  1140. STFD y08, 7 * SIZE(Y1)
  1141. addi Y1, Y1, 8 * SIZE
  /*
   * LL(16): 4-element remainder step of the 8-column panel, taken when bit 2
   * of M is set (otherwise control skips to LL(17)).
   *   Y1[i] += sum over K = 1..8 of alphaK * AOk[i],  i = 0..3
   * (assuming FMADD d, a, c, b computes d = a * c + b -- macro defined
   * outside this chunk).  a1..a4 carry the odd-numbered columns and a5..a8
   * the even-numbered ones, so one column can be reloaded while the other is
   * being consumed.  All eight AO pointers and Y1 advance by 4 elements.
   */
  1142. .align 4
  1143. LL(16):
  1144. andi. r0, M, 4
  1145. ble LL(17)
  1146. LFD y01, 0 * SIZE(Y1)
  1147. LFD y02, 1 * SIZE(Y1)
  1148. LFD y03, 2 * SIZE(Y1)
  1149. LFD y04, 3 * SIZE(Y1)
  1150. LFD a1, 0 * SIZE(AO1)
  1151. LFD a2, 1 * SIZE(AO1)
  1152. LFD a3, 2 * SIZE(AO1)
  1153. LFD a4, 3 * SIZE(AO1)
  1154. LFD a5, 0 * SIZE(AO2)
  1155. LFD a6, 1 * SIZE(AO2)
  1156. LFD a7, 2 * SIZE(AO2)
  1157. LFD a8, 3 * SIZE(AO2)
  /* Columns 1 and 2; a1..a4 / a5..a8 are refilled from AO3 / AO4 as they are used. */
  1158. FMADD y01, alpha1, a1, y01
  1159. LFD a1, 0 * SIZE(AO3)
  1160. FMADD y02, alpha1, a2, y02
  1161. LFD a2, 1 * SIZE(AO3)
  1162. FMADD y03, alpha1, a3, y03
  1163. LFD a3, 2 * SIZE(AO3)
  1164. FMADD y04, alpha1, a4, y04
  1165. LFD a4, 3 * SIZE(AO3)
  1166. FMADD y01, alpha2, a5, y01
  1167. LFD a5, 0 * SIZE(AO4)
  1168. FMADD y02, alpha2, a6, y02
  1169. LFD a6, 1 * SIZE(AO4)
  1170. FMADD y03, alpha2, a7, y03
  1171. LFD a7, 2 * SIZE(AO4)
  1172. FMADD y04, alpha2, a8, y04
  1173. LFD a8, 3 * SIZE(AO4)
  /* Columns 3 and 4; refill from AO5 / AO6. */
  1174. FMADD y01, alpha3, a1, y01
  1175. LFD a1, 0 * SIZE(AO5)
  1176. FMADD y02, alpha3, a2, y02
  1177. LFD a2, 1 * SIZE(AO5)
  1178. FMADD y03, alpha3, a3, y03
  1179. LFD a3, 2 * SIZE(AO5)
  1180. FMADD y04, alpha3, a4, y04
  1181. LFD a4, 3 * SIZE(AO5)
  1182. FMADD y01, alpha4, a5, y01
  1183. LFD a5, 0 * SIZE(AO6)
  1184. FMADD y02, alpha4, a6, y02
  1185. LFD a6, 1 * SIZE(AO6)
  1186. FMADD y03, alpha4, a7, y03
  1187. LFD a7, 2 * SIZE(AO6)
  1188. FMADD y04, alpha4, a8, y04
  1189. LFD a8, 3 * SIZE(AO6)
  /* Columns 5 and 6; refill from AO7 / AO8. */
  1190. FMADD y01, alpha5, a1, y01
  1191. LFD a1, 0 * SIZE(AO7)
  1192. FMADD y02, alpha5, a2, y02
  1193. LFD a2, 1 * SIZE(AO7)
  1194. FMADD y03, alpha5, a3, y03
  1195. LFD a3, 2 * SIZE(AO7)
  1196. FMADD y04, alpha5, a4, y04
  1197. LFD a4, 3 * SIZE(AO7)
  1198. FMADD y01, alpha6, a5, y01
  1199. LFD a5, 0 * SIZE(AO8)
  1200. FMADD y02, alpha6, a6, y02
  1201. LFD a6, 1 * SIZE(AO8)
  1202. FMADD y03, alpha6, a7, y03
  1203. LFD a7, 2 * SIZE(AO8)
  1204. FMADD y04, alpha6, a8, y04
  1205. LFD a8, 3 * SIZE(AO8)
  /* Columns 7 and 8; every load is done, so the pointer bumps are interleaved instead. */
  1206. FMADD y01, alpha7, a1, y01
  1207. addi AO1, AO1, 4 * SIZE
  1208. FMADD y02, alpha7, a2, y02
  1209. addi AO2, AO2, 4 * SIZE
  1210. FMADD y03, alpha7, a3, y03
  1211. addi AO3, AO3, 4 * SIZE
  1212. FMADD y04, alpha7, a4, y04
  1213. addi AO4, AO4, 4 * SIZE
  1214. FMADD y01, alpha8, a5, y01
  1215. addi AO5, AO5, 4 * SIZE
  1216. FMADD y02, alpha8, a6, y02
  1217. addi AO6, AO6, 4 * SIZE
  1218. FMADD y03, alpha8, a7, y03
  1219. addi AO7, AO7, 4 * SIZE
  1220. FMADD y04, alpha8, a8, y04
  1221. addi AO8, AO8, 4 * SIZE
  1222. STFD y01, 0 * SIZE(Y1)
  1223. STFD y02, 1 * SIZE(Y1)
  1224. STFD y03, 2 * SIZE(Y1)
  1225. STFD y04, 3 * SIZE(Y1)
  1226. addi Y1, Y1, 4 * SIZE
  /*
   * LL(17): 2-element remainder step of the 8-column panel, taken when bit 1
   * of M is set (otherwise control skips to LL(18)).
   *   Y1[i] += sum over K = 1..8 of alphaK * AOk[i],  i = 0..1
   * (assuming FMADD d, a, c, b computes d = a * c + b -- macro defined
   * outside this chunk).  Register pairs (a1,a2) (a3,a4) (a5,a6) (a7,a8)
   * first hold columns 1..4 and are then reused for columns 5..8.
   * All eight AO pointers and Y1 advance by 2 elements.
   */
  1227. .align 4
  1228. LL(17):
  1229. andi. r0, M, 2
  1230. ble LL(18)
  1231. LFD y01, 0 * SIZE(Y1)
  1232. LFD y02, 1 * SIZE(Y1)
  1233. LFD a1, 0 * SIZE(AO1)
  1234. LFD a2, 1 * SIZE(AO1)
  1235. LFD a3, 0 * SIZE(AO2)
  1236. LFD a4, 1 * SIZE(AO2)
  1237. LFD a5, 0 * SIZE(AO3)
  1238. LFD a6, 1 * SIZE(AO3)
  1239. LFD a7, 0 * SIZE(AO4)
  1240. LFD a8, 1 * SIZE(AO4)
  /* Columns 1..4, each register reloaded from columns 5..8 right after use. */
  1241. FMADD y01, alpha1, a1, y01
  1242. LFD a1, 0 * SIZE(AO5)
  1243. FMADD y02, alpha1, a2, y02
  1244. LFD a2, 1 * SIZE(AO5)
  1245. FMADD y01, alpha2, a3, y01
  1246. LFD a3, 0 * SIZE(AO6)
  1247. FMADD y02, alpha2, a4, y02
  1248. LFD a4, 1 * SIZE(AO6)
  1249. FMADD y01, alpha3, a5, y01
  1250. LFD a5, 0 * SIZE(AO7)
  1251. FMADD y02, alpha3, a6, y02
  1252. LFD a6, 1 * SIZE(AO7)
  1253. FMADD y01, alpha4, a7, y01
  1254. LFD a7, 0 * SIZE(AO8)
  1255. FMADD y02, alpha4, a8, y02
  1256. LFD a8, 1 * SIZE(AO8)
  /* Columns 5..8, interleaved with the pointer bumps. */
  1257. FMADD y01, alpha5, a1, y01
  1258. addi AO1, AO1, 2 * SIZE
  1259. FMADD y02, alpha5, a2, y02
  1260. addi AO2, AO2, 2 * SIZE
  1261. FMADD y01, alpha6, a3, y01
  1262. addi AO3, AO3, 2 * SIZE
  1263. FMADD y02, alpha6, a4, y02
  1264. addi AO4, AO4, 2 * SIZE
  1265. FMADD y01, alpha7, a5, y01
  1266. addi AO5, AO5, 2 * SIZE
  1267. FMADD y02, alpha7, a6, y02
  1268. addi AO6, AO6, 2 * SIZE
  1269. FMADD y01, alpha8, a7, y01
  1270. addi AO7, AO7, 2 * SIZE
  1271. FMADD y02, alpha8, a8, y02
  1272. addi AO8, AO8, 2 * SIZE
  1273. STFD y01, 0 * SIZE(Y1)
  1274. STFD y02, 1 * SIZE(Y1)
  1275. addi Y1, Y1, 2 * SIZE
  /*
   * LL(18): final single-element step of the 8-column panel, taken when bit 0
   * of M is set (otherwise control skips to LL(19)).
   *   Y1[0] += alpha1 * AO1[0] + ... + alpha8 * AO8[0]
   * (assuming FMADD d, a, c, b computes d = a * c + b -- macro defined
   * outside this chunk).  The eight FMADDs form one dependent chain on y01.
   * Neither the AO pointers nor Y1 are advanced afterwards.
   */
  1276. .align 4
  1277. LL(18):
  1278. andi. r0, M, 1
  1279. ble LL(19)
  1280. LFD y01, 0 * SIZE(Y1)
  1281. LFD a1, 0 * SIZE(AO1)
  1282. LFD a2, 0 * SIZE(AO2)
  1283. LFD a3, 0 * SIZE(AO3)
  1284. LFD a4, 0 * SIZE(AO4)
  1285. LFD a5, 0 * SIZE(AO5)
  1286. LFD a6, 0 * SIZE(AO6)
  1287. LFD a7, 0 * SIZE(AO7)
  1288. LFD a8, 0 * SIZE(AO8)
  1289. FMADD y01, alpha1, a1, y01
  1290. FMADD y01, alpha2, a2, y01
  1291. FMADD y01, alpha3, a3, y01
  1292. FMADD y01, alpha4, a4, y01
  1293. FMADD y01, alpha5, a5, y01
  1294. FMADD y01, alpha6, a6, y01
  1295. FMADD y01, alpha7, a7, y01
  1296. FMADD y01, alpha8, a8, y01
  1297. STFD y01, 0 * SIZE(Y1)
  /*
   * LL(19): bottom of the 8-column panel loop.  Decrements J and branches
   * back to LL(11) (outside this chunk) while J is still greater than zero;
   * otherwise control falls through to the code that follows.
   */
  1298. .align 4
  1299. LL(19):
  1300. addi J, J, -1
  /* Reload alpha from ALPHA.  NOTE(review): presumably its register is overwritten inside the panel -- confirm against the register aliases. */
  1301. lfd alpha, ALPHA
  1302. cmpi cr0, 0, J, 0
  1303. bgt LL(11)
  /*
   * LL(20): entry of the 4-column panel, executed only when bit 2 of N is
   * set; otherwise control goes to LL(30) (outside this chunk).
   * AO1 = A and AO2 = A + LDA are assigned before the branch, so they are
   * in effect on both paths.
   *
   * When the panel runs, this block:
   *   - reads four consecutive X entries (X stepped by INCX after each read;
   *     INCX is added directly to the pointer, so presumably a byte stride
   *     -- TODO confirm) and scales each by alpha into alpha1..alpha4
   *     (assuming FMUL d, a, b computes d = a * b);
   *   - derives AO3, AO4 and moves A one LDA step beyond AO4;
   *   - resets Y1 to YY and sets CTR = M >> 4, the number of 16-element
   *     blocks (no full block -> LL(25));
   *   - preloads Y1[0..15] into y01..y16 and AO1[0..7] into a1..a8, then
   *     enters LL(22), or LL(23) directly when there is only one block
   *     (`bdz` decrements CTR and branches when it reaches zero).
   */
  1304. .align 4
  1305. LL(20):
  1306. andi. J, N, 4
  1307. mr AO1, A
  1308. add AO2, A, LDA
  1309. ble LL(30)
  1310. .align 4
  1311. LFD alpha1, 0 * SIZE(X)
  1312. add X, X, INCX
  1313. LFD alpha2, 0 * SIZE(X)
  1314. add X, X, INCX
  1315. LFD alpha3, 0 * SIZE(X)
  1316. add X, X, INCX
  1317. LFD alpha4, 0 * SIZE(X)
  1318. add X, X, INCX
  /* Scale the four X values by alpha; pointer setup is interleaved with the multiplies. */
  1319. FMUL alpha1, alpha, alpha1
  1320. add AO3, AO2, LDA
  1321. FMUL alpha2, alpha, alpha2
  1322. add AO4, AO3, LDA
  1323. FMUL alpha3, alpha, alpha3
  1324. add A, AO4, LDA
  1325. FMUL alpha4, alpha, alpha4
  1326. mr Y1, YY
  1327. srawi. r0, M, 4
  1328. mtspr CTR, r0
  1329. ble LL(25)
  /* Preload the first 16-element block of Y1 and the first half of the AO1 block. */
  1330. LFD y01, 0 * SIZE(Y1)
  1331. LFD y02, 1 * SIZE(Y1)
  1332. LFD y03, 2 * SIZE(Y1)
  1333. LFD y04, 3 * SIZE(Y1)
  1334. LFD y05, 4 * SIZE(Y1)
  1335. LFD y06, 5 * SIZE(Y1)
  1336. LFD y07, 6 * SIZE(Y1)
  1337. LFD y08, 7 * SIZE(Y1)
  1338. LFD y09, 8 * SIZE(Y1)
  1339. LFD y10, 9 * SIZE(Y1)
  1340. LFD y11, 10 * SIZE(Y1)
  1341. LFD y12, 11 * SIZE(Y1)
  1342. LFD y13, 12 * SIZE(Y1)
  1343. LFD y14, 13 * SIZE(Y1)
  1344. LFD y15, 14 * SIZE(Y1)
  1345. LFD y16, 15 * SIZE(Y1)
  1346. LFD a1, 0 * SIZE(AO1)
  1347. LFD a2, 1 * SIZE(AO1)
  1348. LFD a3, 2 * SIZE(AO1)
  1349. LFD a4, 3 * SIZE(AO1)
  1350. LFD a5, 4 * SIZE(AO1)
  1351. LFD a6, 5 * SIZE(AO1)
  1352. LFD a7, 6 * SIZE(AO1)
  1353. LFD a8, 7 * SIZE(AO1)
  1354. bdz LL(23)
  /*
   * LL(22): main loop of the 4-column panel, repeated by `bdnz` while CTR is
   * non-zero.  On entry y01..y16 hold the current 16-element block of Y1 and
   * a1..a8 hold AO1[0..7].  Each pass computes
   *   Y1[i] += sum over K = 1..4 of alphaK * AOk[i],  i = 0..15
   * (assuming FMADD d, a, c, b computes d = a * c + b -- macro defined
   * outside this chunk).  Each group of four results is stored as soon as
   * its alpha4 term is added, and the same registers are immediately
   * reloaded from Y1[16..31] for the next pass.  AO1/AO2 are advanced in
   * the middle of the pass and AO3/AO4 near the end, each once all 16 of
   * their entries have been loaded.
   * NOTE(review): the Y1[16..31] and next-block AO1 loads assume another
   * full block follows; the code before the loop uses `bdz LL(23)` to send
   * a single-block case to LL(23) instead.
   */
  1355. .align 4
  1356. LL(22):
  /* alpha1 * AO1[0..7]; a1..a8 are refilled with AO1[8..15]. */
  1357. FMADD y01, alpha1, a1, y01
  1358. LFD a1, 8 * SIZE(AO1)
  1359. FMADD y02, alpha1, a2, y02
  1360. LFD a2, 9 * SIZE(AO1)
  1361. FMADD y03, alpha1, a3, y03
  1362. LFD a3, 10 * SIZE(AO1)
  1363. FMADD y04, alpha1, a4, y04
  1364. LFD a4, 11 * SIZE(AO1)
  1365. FMADD y05, alpha1, a5, y05
  1366. LFD a5, 12 * SIZE(AO1)
  1367. FMADD y06, alpha1, a6, y06
  1368. LFD a6, 13 * SIZE(AO1)
  1369. FMADD y07, alpha1, a7, y07
  1370. LFD a7, 14 * SIZE(AO1)
  1371. FMADD y08, alpha1, a8, y08
  1372. LFD a8, 15 * SIZE(AO1)
  /* alpha1 * AO1[8..15]; refill with AO2[0..7]. */
  1373. FMADD y09, alpha1, a1, y09
  1374. LFD a1, 0 * SIZE(AO2)
  1375. FMADD y10, alpha1, a2, y10
  1376. LFD a2, 1 * SIZE(AO2)
  1377. FMADD y11, alpha1, a3, y11
  1378. LFD a3, 2 * SIZE(AO2)
  1379. FMADD y12, alpha1, a4, y12
  1380. LFD a4, 3 * SIZE(AO2)
  1381. FMADD y13, alpha1, a5, y13
  1382. LFD a5, 4 * SIZE(AO2)
  1383. FMADD y14, alpha1, a6, y14
  1384. LFD a6, 5 * SIZE(AO2)
  1385. FMADD y15, alpha1, a7, y15
  1386. LFD a7, 6 * SIZE(AO2)
  1387. FMADD y16, alpha1, a8, y16
  1388. LFD a8, 7 * SIZE(AO2)
  /* alpha2 * AO2[0..7]; refill with AO2[8..15]. */
  1389. FMADD y01, alpha2, a1, y01
  1390. LFD a1, 8 * SIZE(AO2)
  1391. FMADD y02, alpha2, a2, y02
  1392. LFD a2, 9 * SIZE(AO2)
  1393. FMADD y03, alpha2, a3, y03
  1394. LFD a3, 10 * SIZE(AO2)
  1395. FMADD y04, alpha2, a4, y04
  1396. LFD a4, 11 * SIZE(AO2)
  1397. FMADD y05, alpha2, a5, y05
  1398. LFD a5, 12 * SIZE(AO2)
  1399. FMADD y06, alpha2, a6, y06
  1400. LFD a6, 13 * SIZE(AO2)
  1401. FMADD y07, alpha2, a7, y07
  1402. LFD a7, 14 * SIZE(AO2)
  1403. FMADD y08, alpha2, a8, y08
  1404. LFD a8, 15 * SIZE(AO2)
  /* AO1 and AO2 are fully loaded for this block: advance them and issue DCBT (presumably a prefetch hint). */
  1405. addi AO1, AO1, 16 * SIZE
  1406. addi AO2, AO2, 16 * SIZE
  1407. DCBT(AO1, PREA)
  1408. DCBT(AO2, PREA)
  /* alpha2 * AO2[8..15]; refill with AO3[0..7]. */
  1409. FMADD y09, alpha2, a1, y09
  1410. LFD a1, 0 * SIZE(AO3)
  1411. FMADD y10, alpha2, a2, y10
  1412. LFD a2, 1 * SIZE(AO3)
  1413. FMADD y11, alpha2, a3, y11
  1414. LFD a3, 2 * SIZE(AO3)
  1415. FMADD y12, alpha2, a4, y12
  1416. LFD a4, 3 * SIZE(AO3)
  1417. FMADD y13, alpha2, a5, y13
  1418. LFD a5, 4 * SIZE(AO3)
  1419. FMADD y14, alpha2, a6, y14
  1420. LFD a6, 5 * SIZE(AO3)
  1421. FMADD y15, alpha2, a7, y15
  1422. LFD a7, 6 * SIZE(AO3)
  1423. FMADD y16, alpha2, a8, y16
  1424. LFD a8, 7 * SIZE(AO3)
  /* alpha3 * AO3[0..7]; refill with AO3[8..15]. */
  1425. FMADD y01, alpha3, a1, y01
  1426. LFD a1, 8 * SIZE(AO3)
  1427. FMADD y02, alpha3, a2, y02
  1428. LFD a2, 9 * SIZE(AO3)
  1429. FMADD y03, alpha3, a3, y03
  1430. LFD a3, 10 * SIZE(AO3)
  1431. FMADD y04, alpha3, a4, y04
  1432. LFD a4, 11 * SIZE(AO3)
  1433. FMADD y05, alpha3, a5, y05
  1434. LFD a5, 12 * SIZE(AO3)
  1435. FMADD y06, alpha3, a6, y06
  1436. LFD a6, 13 * SIZE(AO3)
  1437. FMADD y07, alpha3, a7, y07
  1438. LFD a7, 14 * SIZE(AO3)
  1439. FMADD y08, alpha3, a8, y08
  1440. LFD a8, 15 * SIZE(AO3)
  /* alpha3 * AO3[8..15]; refill with AO4[0..7]. */
  1441. FMADD y09, alpha3, a1, y09
  1442. LFD a1, 0 * SIZE(AO4)
  1443. FMADD y10, alpha3, a2, y10
  1444. LFD a2, 1 * SIZE(AO4)
  1445. FMADD y11, alpha3, a3, y11
  1446. LFD a3, 2 * SIZE(AO4)
  1447. FMADD y12, alpha3, a4, y12
  1448. LFD a4, 3 * SIZE(AO4)
  1449. FMADD y13, alpha3, a5, y13
  1450. LFD a5, 4 * SIZE(AO4)
  1451. FMADD y14, alpha3, a6, y14
  1452. LFD a6, 5 * SIZE(AO4)
  1453. FMADD y15, alpha3, a7, y15
  1454. LFD a7, 6 * SIZE(AO4)
  1455. FMADD y16, alpha3, a8, y16
  1456. LFD a8, 7 * SIZE(AO4)
  /* Last column for y01..y04: finish, store to Y1[0..3], reload from Y1[16..19]. */
  1457. FMADD y01, alpha4, a1, y01
  1458. LFD a1, 8 * SIZE(AO4)
  1459. FMADD y02, alpha4, a2, y02
  1460. LFD a2, 9 * SIZE(AO4)
  1461. FMADD y03, alpha4, a3, y03
  1462. LFD a3, 10 * SIZE(AO4)
  1463. FMADD y04, alpha4, a4, y04
  1464. LFD a4, 11 * SIZE(AO4)
  1465. STFD y01, 0 * SIZE(Y1)
  1466. STFD y02, 1 * SIZE(Y1)
  1467. STFD y03, 2 * SIZE(Y1)
  1468. STFD y04, 3 * SIZE(Y1)
  1469. LFD y01, 16 * SIZE(Y1)
  1470. LFD y02, 17 * SIZE(Y1)
  1471. LFD y03, 18 * SIZE(Y1)
  1472. LFD y04, 19 * SIZE(Y1)
  /* Same for y05..y08: store to Y1[4..7], reload from Y1[20..23]. */
  1473. FMADD y05, alpha4, a5, y05
  1474. LFD a5, 12 * SIZE(AO4)
  1475. FMADD y06, alpha4, a6, y06
  1476. LFD a6, 13 * SIZE(AO4)
  1477. FMADD y07, alpha4, a7, y07
  1478. LFD a7, 14 * SIZE(AO4)
  1479. FMADD y08, alpha4, a8, y08
  1480. LFD a8, 15 * SIZE(AO4)
  1481. STFD y05, 4 * SIZE(Y1)
  1482. STFD y06, 5 * SIZE(Y1)
  1483. STFD y07, 6 * SIZE(Y1)
  1484. STFD y08, 7 * SIZE(Y1)
  1485. LFD y05, 20 * SIZE(Y1)
  1486. LFD y06, 21 * SIZE(Y1)
  1487. LFD y07, 22 * SIZE(Y1)
  1488. LFD y08, 23 * SIZE(Y1)
  /* AO3 and AO4 are fully loaded for this block: advance them and issue DCBT. */
  1489. addi AO3, AO3, 16 * SIZE
  1490. addi AO4, AO4, 16 * SIZE
  1491. DCBT(AO3, PREA)
  1492. DCBT(AO4, PREA)
  /* Finish y09..y12 while refilling a1..a4 from the already-advanced AO1 (next block). */
  1493. FMADD y09, alpha4, a1, y09
  1494. LFD a1, 0 * SIZE(AO1)
  1495. FMADD y10, alpha4, a2, y10
  1496. LFD a2, 1 * SIZE(AO1)
  1497. FMADD y11, alpha4, a3, y11
  1498. LFD a3, 2 * SIZE(AO1)
  1499. FMADD y12, alpha4, a4, y12
  1500. LFD a4, 3 * SIZE(AO1)
  1501. STFD y09, 8 * SIZE(Y1)
  1502. STFD y10, 9 * SIZE(Y1)
  1503. STFD y11, 10 * SIZE(Y1)
  1504. STFD y12, 11 * SIZE(Y1)
  1505. LFD y09, 24 * SIZE(Y1)
  1506. LFD y10, 25 * SIZE(Y1)
  1507. LFD y11, 26 * SIZE(Y1)
  1508. LFD y12, 27 * SIZE(Y1)
  /* Finish y13..y16 while refilling a5..a8 from the next AO1 block. */
  1509. FMADD y13, alpha4, a5, y13
  1510. LFD a5, 4 * SIZE(AO1)
  1511. FMADD y14, alpha4, a6, y14
  1512. LFD a6, 5 * SIZE(AO1)
  1513. FMADD y15, alpha4, a7, y15
  1514. LFD a7, 6 * SIZE(AO1)
  1515. FMADD y16, alpha4, a8, y16
  1516. LFD a8, 7 * SIZE(AO1)
  1517. STFD y13, 12 * SIZE(Y1)
  1518. STFD y14, 13 * SIZE(Y1)
  1519. STFD y15, 14 * SIZE(Y1)
  1520. STFD y16, 15 * SIZE(Y1)
  1521. LFD y13, 28 * SIZE(Y1)
  1522. LFD y14, 29 * SIZE(Y1)
  1523. LFD y15, 30 * SIZE(Y1)
  1524. LFD y16, 31 * SIZE(Y1)
  /* Step Y1 to the block now held in y01..y16; bdnz decrements CTR and loops while it is non-zero. */
  1525. addi Y1, Y1, 16 * SIZE
  1526. DCBT(Y1, PREC)
  1527. bdnz LL(22)
  /*
   * LL(23): last 16-element block of the 4-column panel.  Reached by falling
   * out of LL(22) or via the `bdz LL(23)` before it; in both cases y01..y16
   * hold the block at Y1[0..15] and a1..a8 hold AO1[0..7].
   * Same arithmetic as one LL(22) pass,
   *   Y1[i] += sum over K = 1..4 of alphaK * AOk[i],  i = 0..15
   * (assuming FMADD d, a, c, b computes d = a * c + b -- macro defined
   * outside this chunk), but with no lookahead: nothing is loaded for a
   * following block, all 16 results are stored at the end, and AO1..AO4
   * and Y1 are advanced by 16 elements.
   */
  1528. .align 4
  1529. LL(23):
  /* alpha1 * AO1[0..7]; a1..a8 are refilled with AO1[8..15]. */
  1530. FMADD y01, alpha1, a1, y01
  1531. LFD a1, 8 * SIZE(AO1)
  1532. FMADD y02, alpha1, a2, y02
  1533. LFD a2, 9 * SIZE(AO1)
  1534. FMADD y03, alpha1, a3, y03
  1535. LFD a3, 10 * SIZE(AO1)
  1536. FMADD y04, alpha1, a4, y04
  1537. LFD a4, 11 * SIZE(AO1)
  1538. FMADD y05, alpha1, a5, y05
  1539. LFD a5, 12 * SIZE(AO1)
  1540. FMADD y06, alpha1, a6, y06
  1541. LFD a6, 13 * SIZE(AO1)
  1542. FMADD y07, alpha1, a7, y07
  1543. LFD a7, 14 * SIZE(AO1)
  1544. FMADD y08, alpha1, a8, y08
  1545. LFD a8, 15 * SIZE(AO1)
  /* alpha1 * AO1[8..15]; refill with AO2[0..7]. */
  1546. FMADD y09, alpha1, a1, y09
  1547. LFD a1, 0 * SIZE(AO2)
  1548. FMADD y10, alpha1, a2, y10
  1549. LFD a2, 1 * SIZE(AO2)
  1550. FMADD y11, alpha1, a3, y11
  1551. LFD a3, 2 * SIZE(AO2)
  1552. FMADD y12, alpha1, a4, y12
  1553. LFD a4, 3 * SIZE(AO2)
  1554. FMADD y13, alpha1, a5, y13
  1555. LFD a5, 4 * SIZE(AO2)
  1556. FMADD y14, alpha1, a6, y14
  1557. LFD a6, 5 * SIZE(AO2)
  1558. FMADD y15, alpha1, a7, y15
  1559. LFD a7, 6 * SIZE(AO2)
  1560. FMADD y16, alpha1, a8, y16
  1561. LFD a8, 7 * SIZE(AO2)
  /* alpha2 * AO2[0..7]; refill with AO2[8..15]. */
  1562. FMADD y01, alpha2, a1, y01
  1563. LFD a1, 8 * SIZE(AO2)
  1564. FMADD y02, alpha2, a2, y02
  1565. LFD a2, 9 * SIZE(AO2)
  1566. FMADD y03, alpha2, a3, y03
  1567. LFD a3, 10 * SIZE(AO2)
  1568. FMADD y04, alpha2, a4, y04
  1569. LFD a4, 11 * SIZE(AO2)
  1570. FMADD y05, alpha2, a5, y05
  1571. LFD a5, 12 * SIZE(AO2)
  1572. FMADD y06, alpha2, a6, y06
  1573. LFD a6, 13 * SIZE(AO2)
  1574. FMADD y07, alpha2, a7, y07
  1575. LFD a7, 14 * SIZE(AO2)
  1576. FMADD y08, alpha2, a8, y08
  1577. LFD a8, 15 * SIZE(AO2)
  /* alpha2 * AO2[8..15]; refill with AO3[0..7]. */
  1578. FMADD y09, alpha2, a1, y09
  1579. LFD a1, 0 * SIZE(AO3)
  1580. FMADD y10, alpha2, a2, y10
  1581. LFD a2, 1 * SIZE(AO3)
  1582. FMADD y11, alpha2, a3, y11
  1583. LFD a3, 2 * SIZE(AO3)
  1584. FMADD y12, alpha2, a4, y12
  1585. LFD a4, 3 * SIZE(AO3)
  1586. FMADD y13, alpha2, a5, y13
  1587. LFD a5, 4 * SIZE(AO3)
  1588. FMADD y14, alpha2, a6, y14
  1589. LFD a6, 5 * SIZE(AO3)
  1590. FMADD y15, alpha2, a7, y15
  1591. LFD a7, 6 * SIZE(AO3)
  1592. FMADD y16, alpha2, a8, y16
  1593. LFD a8, 7 * SIZE(AO3)
  /* alpha3 * AO3[0..7]; refill with AO3[8..15]. */
  1594. FMADD y01, alpha3, a1, y01
  1595. LFD a1, 8 * SIZE(AO3)
  1596. FMADD y02, alpha3, a2, y02
  1597. LFD a2, 9 * SIZE(AO3)
  1598. FMADD y03, alpha3, a3, y03
  1599. LFD a3, 10 * SIZE(AO3)
  1600. FMADD y04, alpha3, a4, y04
  1601. LFD a4, 11 * SIZE(AO3)
  1602. FMADD y05, alpha3, a5, y05
  1603. LFD a5, 12 * SIZE(AO3)
  1604. FMADD y06, alpha3, a6, y06
  1605. LFD a6, 13 * SIZE(AO3)
  1606. FMADD y07, alpha3, a7, y07
  1607. LFD a7, 14 * SIZE(AO3)
  1608. FMADD y08, alpha3, a8, y08
  1609. LFD a8, 15 * SIZE(AO3)
  /* alpha3 * AO3[8..15]; refill with AO4[0..7]. */
  1610. FMADD y09, alpha3, a1, y09
  1611. LFD a1, 0 * SIZE(AO4)
  1612. FMADD y10, alpha3, a2, y10
  1613. LFD a2, 1 * SIZE(AO4)
  1614. FMADD y11, alpha3, a3, y11
  1615. LFD a3, 2 * SIZE(AO4)
  1616. FMADD y12, alpha3, a4, y12
  1617. LFD a4, 3 * SIZE(AO4)
  1618. FMADD y13, alpha3, a5, y13
  1619. LFD a5, 4 * SIZE(AO4)
  1620. FMADD y14, alpha3, a6, y14
  1621. LFD a6, 5 * SIZE(AO4)
  1622. FMADD y15, alpha3, a7, y15
  1623. LFD a7, 6 * SIZE(AO4)
  1624. FMADD y16, alpha3, a8, y16
  1625. LFD a8, 7 * SIZE(AO4)
  /* alpha4 * AO4[0..7]; refill with AO4[8..15]. */
  1626. FMADD y01, alpha4, a1, y01
  1627. LFD a1, 8 * SIZE(AO4)
  1628. FMADD y02, alpha4, a2, y02
  1629. LFD a2, 9 * SIZE(AO4)
  1630. FMADD y03, alpha4, a3, y03
  1631. LFD a3, 10 * SIZE(AO4)
  1632. FMADD y04, alpha4, a4, y04
  1633. LFD a4, 11 * SIZE(AO4)
  1634. FMADD y05, alpha4, a5, y05
  1635. LFD a5, 12 * SIZE(AO4)
  1636. FMADD y06, alpha4, a6, y06
  1637. LFD a6, 13 * SIZE(AO4)
  1638. FMADD y07, alpha4, a7, y07
  1639. LFD a7, 14 * SIZE(AO4)
  1640. FMADD y08, alpha4, a8, y08
  1641. LFD a8, 15 * SIZE(AO4)
  /* alpha4 * AO4[8..15]; all loads are done, so the pointer bumps are interleaved instead. */
  1642. FMADD y09, alpha4, a1, y09
  1643. addi AO1, AO1, 16 * SIZE
  1644. FMADD y10, alpha4, a2, y10
  1645. addi AO2, AO2, 16 * SIZE
  1646. FMADD y11, alpha4, a3, y11
  1647. addi AO3, AO3, 16 * SIZE
  1648. FMADD y12, alpha4, a4, y12
  1649. addi AO4, AO4, 16 * SIZE
  1650. FMADD y13, alpha4, a5, y13
  1651. FMADD y14, alpha4, a6, y14
  1652. FMADD y15, alpha4, a7, y15
  1653. FMADD y16, alpha4, a8, y16
  /* Write back the whole block and move Y1 past it. */
  1654. STFD y01, 0 * SIZE(Y1)
  1655. STFD y02, 1 * SIZE(Y1)
  1656. STFD y03, 2 * SIZE(Y1)
  1657. STFD y04, 3 * SIZE(Y1)
  1658. STFD y05, 4 * SIZE(Y1)
  1659. STFD y06, 5 * SIZE(Y1)
  1660. STFD y07, 6 * SIZE(Y1)
  1661. STFD y08, 7 * SIZE(Y1)
  1662. STFD y09, 8 * SIZE(Y1)
  1663. STFD y10, 9 * SIZE(Y1)
  1664. STFD y11, 10 * SIZE(Y1)
  1665. STFD y12, 11 * SIZE(Y1)
  1666. STFD y13, 12 * SIZE(Y1)
  1667. STFD y14, 13 * SIZE(Y1)
  1668. STFD y15, 14 * SIZE(Y1)
  1669. STFD y16, 15 * SIZE(Y1)
  1670. addi Y1, Y1, 16 * SIZE
  /*
   * LL(25): start of the remainder handling (M mod 16) for the 4-column
   * panel.  If no remainder exists, control goes to LL(30) (outside this
   * chunk).  Otherwise bit 3 of M selects this 8-element step:
   *   Y1[i] += sum over K = 1..4 of alphaK * AOk[i],  i = 0..7
   * (assuming FMADD d, a, c, b computes d = a * c + b -- macro defined
   * outside this chunk).  Afterwards AO1..AO4 and Y1 advance by 8 elements.
   * The result of `andi.` is never negative here, so each `ble` acts as a
   * branch-if-zero.
   */
  1671. .align 4
  1672. LL(25):
  1673. andi. r0, M, 15
  1674. ble LL(30)
  1675. andi. r0, M, 8
  1676. ble LL(26)
  1677. LFD y01, 0 * SIZE(Y1)
  1678. LFD y02, 1 * SIZE(Y1)
  1679. LFD y03, 2 * SIZE(Y1)
  1680. LFD y04, 3 * SIZE(Y1)
  1681. LFD y05, 4 * SIZE(Y1)
  1682. LFD y06, 5 * SIZE(Y1)
  1683. LFD y07, 6 * SIZE(Y1)
  1684. LFD y08, 7 * SIZE(Y1)
  1685. LFD a1, 0 * SIZE(AO1)
  1686. LFD a2, 1 * SIZE(AO1)
  1687. LFD a3, 2 * SIZE(AO1)
  1688. LFD a4, 3 * SIZE(AO1)
  1689. LFD a5, 4 * SIZE(AO1)
  1690. LFD a6, 5 * SIZE(AO1)
  1691. LFD a7, 6 * SIZE(AO1)
  1692. LFD a8, 7 * SIZE(AO1)
  /* Each FMADD consumes aN and is followed by the reload of aN from the next column pointer. */
  1693. FMADD y01, alpha1, a1, y01
  1694. LFD a1, 0 * SIZE(AO2)
  1695. FMADD y02, alpha1, a2, y02
  1696. LFD a2, 1 * SIZE(AO2)
  1697. FMADD y03, alpha1, a3, y03
  1698. LFD a3, 2 * SIZE(AO2)
  1699. FMADD y04, alpha1, a4, y04
  1700. LFD a4, 3 * SIZE(AO2)
  1701. FMADD y05, alpha1, a5, y05
  1702. LFD a5, 4 * SIZE(AO2)
  1703. FMADD y06, alpha1, a6, y06
  1704. LFD a6, 5 * SIZE(AO2)
  1705. FMADD y07, alpha1, a7, y07
  1706. LFD a7, 6 * SIZE(AO2)
  1707. FMADD y08, alpha1, a8, y08
  1708. LFD a8, 7 * SIZE(AO2)
  1709. FMADD y01, alpha2, a1, y01
  1710. LFD a1, 0 * SIZE(AO3)
  1711. FMADD y02, alpha2, a2, y02
  1712. LFD a2, 1 * SIZE(AO3)
  1713. FMADD y03, alpha2, a3, y03
  1714. LFD a3, 2 * SIZE(AO3)
  1715. FMADD y04, alpha2, a4, y04
  1716. LFD a4, 3 * SIZE(AO3)
  1717. FMADD y05, alpha2, a5, y05
  1718. LFD a5, 4 * SIZE(AO3)
  1719. FMADD y06, alpha2, a6, y06
  1720. LFD a6, 5 * SIZE(AO3)
  1721. FMADD y07, alpha2, a7, y07
  1722. LFD a7, 6 * SIZE(AO3)
  1723. FMADD y08, alpha2, a8, y08
  1724. LFD a8, 7 * SIZE(AO3)
  1725. FMADD y01, alpha3, a1, y01
  1726. LFD a1, 0 * SIZE(AO4)
  1727. FMADD y02, alpha3, a2, y02
  1728. LFD a2, 1 * SIZE(AO4)
  1729. FMADD y03, alpha3, a3, y03
  1730. LFD a3, 2 * SIZE(AO4)
  1731. FMADD y04, alpha3, a4, y04
  1732. LFD a4, 3 * SIZE(AO4)
  1733. FMADD y05, alpha3, a5, y05
  1734. LFD a5, 4 * SIZE(AO4)
  1735. FMADD y06, alpha3, a6, y06
  1736. LFD a6, 5 * SIZE(AO4)
  1737. FMADD y07, alpha3, a7, y07
  1738. LFD a7, 6 * SIZE(AO4)
  1739. FMADD y08, alpha3, a8, y08
  1740. LFD a8, 7 * SIZE(AO4)
  /* Last column: no more loads are needed, so the pointer bumps fill the gaps between FMADDs. */
  1741. FMADD y01, alpha4, a1, y01
  1742. addi AO1, AO1, 8 * SIZE
  1743. FMADD y02, alpha4, a2, y02
  1744. addi AO2, AO2, 8 * SIZE
  1745. FMADD y03, alpha4, a3, y03
  1746. addi AO3, AO3, 8 * SIZE
  1747. FMADD y04, alpha4, a4, y04
  1748. addi AO4, AO4, 8 * SIZE
  1749. STFD y01, 0 * SIZE(Y1)
  1750. STFD y02, 1 * SIZE(Y1)
  1751. STFD y03, 2 * SIZE(Y1)
  1752. STFD y04, 3 * SIZE(Y1)
  1753. FMADD y05, alpha4, a5, y05
  1754. FMADD y06, alpha4, a6, y06
  1755. FMADD y07, alpha4, a7, y07
  1756. FMADD y08, alpha4, a8, y08
  1757. STFD y05, 4 * SIZE(Y1)
  1758. STFD y06, 5 * SIZE(Y1)
  1759. STFD y07, 6 * SIZE(Y1)
  1760. STFD y08, 7 * SIZE(Y1)
  1761. addi Y1, Y1, 8 * SIZE
  /*
   * LL(26): 4-row piece of the four-stream remainder.  Skipped when bit 2
   * of M is clear.  Four y values at Y1 receive alpha1*AO1 + alpha2*AO2 +
   * alpha3*AO3 + alpha4*AO4 for the same four rows.
   * NOTE(review): FMADD d, a, b, c is presumably d = a * b + c (macro
   * defined outside this chunk).
   */
  1762. .align 4
  1763. LL(26):
  1764. andi. r0, M, 4
  1765. ble LL(27)
  1766. LFD y01, 0 * SIZE(Y1)
  1767. LFD y02, 1 * SIZE(Y1)
  1768. LFD y03, 2 * SIZE(Y1)
  1769. LFD y04, 3 * SIZE(Y1)
  /* a1..a4 hold the AO1 rows and a5..a8 the AO2 rows. */
  1770. LFD a1, 0 * SIZE(AO1)
  1771. LFD a2, 1 * SIZE(AO1)
  1772. LFD a3, 2 * SIZE(AO1)
  1773. LFD a4, 3 * SIZE(AO1)
  1774. LFD a5, 0 * SIZE(AO2)
  1775. LFD a6, 1 * SIZE(AO2)
  1776. LFD a7, 2 * SIZE(AO2)
  1777. LFD a8, 3 * SIZE(AO2)
  /* Each register is reloaded (a1..a4 from AO3, a5..a8 from AO4)
     immediately after the FMADD that consumes its old value. */
  1778. FMADD y01, alpha1, a1, y01
  1779. LFD a1, 0 * SIZE(AO3)
  1780. FMADD y02, alpha1, a2, y02
  1781. LFD a2, 1 * SIZE(AO3)
  1782. FMADD y03, alpha1, a3, y03
  1783. LFD a3, 2 * SIZE(AO3)
  1784. FMADD y04, alpha1, a4, y04
  1785. LFD a4, 3 * SIZE(AO3)
  1786. FMADD y01, alpha2, a5, y01
  1787. LFD a5, 0 * SIZE(AO4)
  1788. FMADD y02, alpha2, a6, y02
  1789. LFD a6, 1 * SIZE(AO4)
  1790. FMADD y03, alpha2, a7, y03
  1791. LFD a7, 2 * SIZE(AO4)
  1792. FMADD y04, alpha2, a8, y04
  1793. LFD a8, 3 * SIZE(AO4)
  /* All loads are issued, so the stream pointers can advance by four rows
     while the AO3 contribution is accumulated. */
  1794. FMADD y01, alpha3, a1, y01
  1795. addi AO1, AO1, 4 * SIZE
  1796. FMADD y02, alpha3, a2, y02
  1797. addi AO2, AO2, 4 * SIZE
  1798. FMADD y03, alpha3, a3, y03
  1799. addi AO3, AO3, 4 * SIZE
  1800. FMADD y04, alpha3, a4, y04
  1801. addi AO4, AO4, 4 * SIZE
  1802. FMADD y01, alpha4, a5, y01
  1803. FMADD y02, alpha4, a6, y02
  1804. FMADD y03, alpha4, a7, y03
  1805. FMADD y04, alpha4, a8, y04
  1806. STFD y01, 0 * SIZE(Y1)
  1807. STFD y02, 1 * SIZE(Y1)
  1808. STFD y03, 2 * SIZE(Y1)
  1809. STFD y04, 3 * SIZE(Y1)
  1810. addi Y1, Y1, 4 * SIZE
  /*
   * LL(27): 2-row piece of the four-stream remainder.  Skipped when bit 1
   * of M is clear.  All eight stream values are loaded first (a1/a2 from
   * AO1, a3/a4 from AO2, a5/a6 from AO3, a7/a8 from AO4), then folded into
   * y01/y02 with alpha1..alpha4.
   */
  1811. .align 4
  1812. LL(27):
  1813. andi. r0, M, 2
  1814. ble LL(28)
  1815. LFD y01, 0 * SIZE(Y1)
  1816. LFD y02, 1 * SIZE(Y1)
  1817. LFD a1, 0 * SIZE(AO1)
  1818. LFD a2, 1 * SIZE(AO1)
  1819. LFD a3, 0 * SIZE(AO2)
  1820. LFD a4, 1 * SIZE(AO2)
  1821. LFD a5, 0 * SIZE(AO3)
  1822. LFD a6, 1 * SIZE(AO3)
  1823. LFD a7, 0 * SIZE(AO4)
  1824. LFD a8, 1 * SIZE(AO4)
  /* Pointer bumps are interleaved with the first four multiply-adds. */
  1825. FMADD y01, alpha1, a1, y01
  1826. addi AO1, AO1, 2 * SIZE
  1827. FMADD y02, alpha1, a2, y02
  1828. addi AO2, AO2, 2 * SIZE
  1829. FMADD y01, alpha2, a3, y01
  1830. addi AO3, AO3, 2 * SIZE
  1831. FMADD y02, alpha2, a4, y02
  1832. addi AO4, AO4, 2 * SIZE
  1833. FMADD y01, alpha3, a5, y01
  1834. FMADD y02, alpha3, a6, y02
  1835. FMADD y01, alpha4, a7, y01
  1836. FMADD y02, alpha4, a8, y02
  1837. STFD y01, 0 * SIZE(Y1)
  1838. STFD y02, 1 * SIZE(Y1)
  1839. addi Y1, Y1, 2 * SIZE
  /*
   * LL(28): final single row of the four-stream remainder.  Skipped when
   * M is even.  No pointer is advanced afterwards: both following passes
   * (LL(30) and LL(40)) reload AO1 from A and Y1 from YY before use.
   */
  1840. .align 4
  1841. LL(28):
  1842. andi. r0, M, 1
  1843. ble LL(30)
  1844. LFD y01, 0 * SIZE(Y1)
  1845. LFD a1, 0 * SIZE(AO1)
  1846. LFD a2, 0 * SIZE(AO2)
  1847. LFD a3, 0 * SIZE(AO3)
  1848. LFD a4, 0 * SIZE(AO4)
  /* Four dependent multiply-adds into the one y value. */
  1849. FMADD y01, alpha1, a1, y01
  1850. FMADD y01, alpha2, a2, y01
  1851. FMADD y01, alpha3, a3, y01
  1852. FMADD y01, alpha4, a4, y01
  1853. STFD y01, 0 * SIZE(Y1)
  /*
   * LL(30): two-column pass, executed when bit 1 of N is set.
   * Sets alpha1/alpha2 from two consecutive x elements (stride INCX) scaled
   * by alpha, points AO1/AO2 at A and A + LDA, advances A by 2 * LDA,
   * restarts Y1 at YY, and preloads the first 16 y values and the first
   * 8 AO1 values for the unrolled loop at LL(32).
   * NOTE(review): FMUL d, a, b is presumably d = a * b (macro defined
   * outside this chunk); ALPHA is a memory operand defined outside it.
   */
  1854. .align 4
  1855. LL(30):
  /* The lfd sits between the test and the branch; it does not touch CR0,
     so ble still sees the result of andi. */
  1856. andi. J, N, 2
  1857. lfd alpha, ALPHA
  1858. ble LL(40)
  1859. .align 4
  /* alpha1 = alpha * x[0], alpha2 = alpha * x[1]; X is left pointing two
     elements further on. */
  1860. LFD alpha1, 0 * SIZE(X)
  1861. add X, X, INCX
  1862. LFD alpha2, 0 * SIZE(X)
  1863. add X, X, INCX
  1864. FMUL alpha1, alpha, alpha1
  1865. FMUL alpha2, alpha, alpha2
  /* AO1 = A, AO2 = A + LDA, A += 2 * LDA. */
  1866. mr AO1, A
  1867. add AO2, A, LDA
  1868. add A, AO2, LDA
  1869. mr Y1, YY
  /* CTR = M >> 4 (number of 16-row groups); none -> remainder at LL(35).
     mtspr does not alter CR0, so ble tests the srawi. result. */
  1870. srawi. r0, M, 4
  1871. mtspr CTR, r0
  1872. ble LL(35)
  1873. LFD y01, 0 * SIZE(Y1)
  1874. LFD y02, 1 * SIZE(Y1)
  1875. LFD y03, 2 * SIZE(Y1)
  1876. LFD y04, 3 * SIZE(Y1)
  1877. LFD y05, 4 * SIZE(Y1)
  1878. LFD y06, 5 * SIZE(Y1)
  1879. LFD y07, 6 * SIZE(Y1)
  1880. LFD y08, 7 * SIZE(Y1)
  1881. LFD y09, 8 * SIZE(Y1)
  1882. LFD y10, 9 * SIZE(Y1)
  1883. LFD y11, 10 * SIZE(Y1)
  1884. LFD y12, 11 * SIZE(Y1)
  1885. LFD y13, 12 * SIZE(Y1)
  1886. LFD y14, 13 * SIZE(Y1)
  1887. LFD y15, 14 * SIZE(Y1)
  1888. LFD y16, 15 * SIZE(Y1)
  1889. LFD a1, 0 * SIZE(AO1)
  1890. LFD a2, 1 * SIZE(AO1)
  1891. LFD a3, 2 * SIZE(AO1)
  1892. LFD a4, 3 * SIZE(AO1)
  1893. LFD a5, 4 * SIZE(AO1)
  1894. LFD a6, 5 * SIZE(AO1)
  1895. LFD a7, 6 * SIZE(AO1)
  1896. LFD a8, 7 * SIZE(AO1)
  /* Decrement CTR; with a single 16-row group go straight to the drain
     code at LL(33), which does not read ahead. */
  1897. bdz LL(33)
  /*
   * LL(32): steady-state body of the two-column pass, 16 rows per trip.
   * On entry y01..y16 hold the current 16 y values and a1..a8 hold rows
   * 0..7 of AO1.  On exit the same holds for the next 16 rows: the loop
   * stores the finished rows and reads 16..31 of Y1 and 16..23 of AO1
   * before advancing the pointers.  The instruction order is a hand-made
   * schedule (each aN is reloaded right after its last use) and must be
   * preserved.
   * NOTE(review): DCBT is a macro defined outside this chunk; presumably a
   * data-cache-touch prefetch at the given offset -- confirm.
   */
  1898. .align 4
  1899. LL(32):
  /* Rows 0..7: += alpha1 * AO1, while fetching AO1 rows 8..15. */
  1900. FMADD y01, alpha1, a1, y01
  1901. LFD a1, 8 * SIZE(AO1)
  1902. FMADD y02, alpha1, a2, y02
  1903. LFD a2, 9 * SIZE(AO1)
  1904. FMADD y03, alpha1, a3, y03
  1905. LFD a3, 10 * SIZE(AO1)
  1906. FMADD y04, alpha1, a4, y04
  1907. LFD a4, 11 * SIZE(AO1)
  1908. FMADD y05, alpha1, a5, y05
  1909. LFD a5, 12 * SIZE(AO1)
  1910. FMADD y06, alpha1, a6, y06
  1911. LFD a6, 13 * SIZE(AO1)
  1912. FMADD y07, alpha1, a7, y07
  1913. LFD a7, 14 * SIZE(AO1)
  1914. FMADD y08, alpha1, a8, y08
  1915. LFD a8, 15 * SIZE(AO1)
  /* Rows 8..15: += alpha1 * AO1, while fetching AO2 rows 0..7. */
  1916. FMADD y09, alpha1, a1, y09
  1917. LFD a1, 0 * SIZE(AO2)
  1918. FMADD y10, alpha1, a2, y10
  1919. LFD a2, 1 * SIZE(AO2)
  1920. FMADD y11, alpha1, a3, y11
  1921. LFD a3, 2 * SIZE(AO2)
  1922. FMADD y12, alpha1, a4, y12
  1923. LFD a4, 3 * SIZE(AO2)
  1924. FMADD y13, alpha1, a5, y13
  1925. LFD a5, 4 * SIZE(AO2)
  1926. FMADD y14, alpha1, a6, y14
  1927. LFD a6, 5 * SIZE(AO2)
  1928. FMADD y15, alpha1, a7, y15
  1929. LFD a7, 6 * SIZE(AO2)
  1930. FMADD y16, alpha1, a8, y16
  1931. LFD a8, 7 * SIZE(AO2)
  /* Rows 0..7: += alpha2 * AO2, while fetching AO2 rows 8..15. */
  1932. FMADD y01, alpha2, a1, y01
  1933. LFD a1, 8 * SIZE(AO2)
  1934. FMADD y02, alpha2, a2, y02
  1935. LFD a2, 9 * SIZE(AO2)
  1936. FMADD y03, alpha2, a3, y03
  1937. LFD a3, 10 * SIZE(AO2)
  1938. FMADD y04, alpha2, a4, y04
  1939. LFD a4, 11 * SIZE(AO2)
  1940. FMADD y05, alpha2, a5, y05
  1941. LFD a5, 12 * SIZE(AO2)
  1942. FMADD y06, alpha2, a6, y06
  1943. LFD a6, 13 * SIZE(AO2)
  1944. FMADD y07, alpha2, a7, y07
  1945. LFD a7, 14 * SIZE(AO2)
  1946. FMADD y08, alpha2, a8, y08
  1947. LFD a8, 15 * SIZE(AO2)
  /* Rows 8..15: += alpha2 * AO2, while fetching the next trip's AO1 rows
     (offsets 16..23 relative to the not-yet-advanced AO1). */
  1948. FMADD y09, alpha2, a1, y09
  1949. LFD a1, 16 * SIZE(AO1)
  1950. FMADD y10, alpha2, a2, y10
  1951. LFD a2, 17 * SIZE(AO1)
  1952. FMADD y11, alpha2, a3, y11
  1953. LFD a3, 18 * SIZE(AO1)
  1954. FMADD y12, alpha2, a4, y12
  1955. LFD a4, 19 * SIZE(AO1)
  1956. FMADD y13, alpha2, a5, y13
  1957. LFD a5, 20 * SIZE(AO1)
  1958. FMADD y14, alpha2, a6, y14
  1959. LFD a6, 21 * SIZE(AO1)
  1960. FMADD y15, alpha2, a7, y15
  1961. LFD a7, 22 * SIZE(AO1)
  1962. FMADD y16, alpha2, a8, y16
  1963. LFD a8, 23 * SIZE(AO1)
  /* Write back finished rows in groups of four and immediately refill the
     same registers with the next trip's y values (offsets 16..31). */
  1964. STFD y01, 0 * SIZE(Y1)
  1965. STFD y02, 1 * SIZE(Y1)
  1966. STFD y03, 2 * SIZE(Y1)
  1967. STFD y04, 3 * SIZE(Y1)
  1968. LFD y01, 16 * SIZE(Y1)
  1969. LFD y02, 17 * SIZE(Y1)
  1970. LFD y03, 18 * SIZE(Y1)
  1971. LFD y04, 19 * SIZE(Y1)
  1972. STFD y05, 4 * SIZE(Y1)
  1973. STFD y06, 5 * SIZE(Y1)
  1974. STFD y07, 6 * SIZE(Y1)
  1975. STFD y08, 7 * SIZE(Y1)
  1976. LFD y05, 20 * SIZE(Y1)
  1977. LFD y06, 21 * SIZE(Y1)
  1978. LFD y07, 22 * SIZE(Y1)
  1979. LFD y08, 23 * SIZE(Y1)
  1980. STFD y09, 8 * SIZE(Y1)
  1981. STFD y10, 9 * SIZE(Y1)
  1982. STFD y11, 10 * SIZE(Y1)
  1983. STFD y12, 11 * SIZE(Y1)
  1984. LFD y09, 24 * SIZE(Y1)
  1985. LFD y10, 25 * SIZE(Y1)
  1986. LFD y11, 26 * SIZE(Y1)
  1987. LFD y12, 27 * SIZE(Y1)
  1988. STFD y13, 12 * SIZE(Y1)
  1989. STFD y14, 13 * SIZE(Y1)
  1990. STFD y15, 14 * SIZE(Y1)
  1991. STFD y16, 15 * SIZE(Y1)
  1992. LFD y13, 28 * SIZE(Y1)
  1993. LFD y14, 29 * SIZE(Y1)
  1994. LFD y15, 30 * SIZE(Y1)
  1995. LFD y16, 31 * SIZE(Y1)
  /* Advance all three pointers by one 16-row group. */
  1996. addi AO1, AO1, 16 * SIZE
  1997. addi AO2, AO2, 16 * SIZE
  1998. addi Y1, Y1, 16 * SIZE
  1999. DCBT(AO1, PREA)
  2000. DCBT(AO2, PREA)
  2001. DCBT(Y1, PREC)
  /* Decrement CTR and repeat while it is non-zero. */
  2002. bdnz LL(32)
  /*
   * LL(33): drain of the two-column unrolled loop.  Processes the last
   * 16-row group using the y01..y16 / a1..a8 values already in registers.
   * Same arithmetic as LL(32), but nothing is read beyond the current
   * group: the final eight FMADDs have no trailing loads and no y values
   * are reloaded.
   */
  2003. .align 4
  2004. LL(33):
  /* Rows 0..7: += alpha1 * AO1, fetching AO1 rows 8..15. */
  2005. FMADD y01, alpha1, a1, y01
  2006. LFD a1, 8 * SIZE(AO1)
  2007. FMADD y02, alpha1, a2, y02
  2008. LFD a2, 9 * SIZE(AO1)
  2009. FMADD y03, alpha1, a3, y03
  2010. LFD a3, 10 * SIZE(AO1)
  2011. FMADD y04, alpha1, a4, y04
  2012. LFD a4, 11 * SIZE(AO1)
  2013. FMADD y05, alpha1, a5, y05
  2014. LFD a5, 12 * SIZE(AO1)
  2015. FMADD y06, alpha1, a6, y06
  2016. LFD a6, 13 * SIZE(AO1)
  2017. FMADD y07, alpha1, a7, y07
  2018. LFD a7, 14 * SIZE(AO1)
  2019. FMADD y08, alpha1, a8, y08
  2020. LFD a8, 15 * SIZE(AO1)
  /* Rows 8..15: += alpha1 * AO1, fetching AO2 rows 0..7. */
  2021. FMADD y09, alpha1, a1, y09
  2022. LFD a1, 0 * SIZE(AO2)
  2023. FMADD y10, alpha1, a2, y10
  2024. LFD a2, 1 * SIZE(AO2)
  2025. FMADD y11, alpha1, a3, y11
  2026. LFD a3, 2 * SIZE(AO2)
  2027. FMADD y12, alpha1, a4, y12
  2028. LFD a4, 3 * SIZE(AO2)
  2029. FMADD y13, alpha1, a5, y13
  2030. LFD a5, 4 * SIZE(AO2)
  2031. FMADD y14, alpha1, a6, y14
  2032. LFD a6, 5 * SIZE(AO2)
  2033. FMADD y15, alpha1, a7, y15
  2034. LFD a7, 6 * SIZE(AO2)
  2035. FMADD y16, alpha1, a8, y16
  2036. LFD a8, 7 * SIZE(AO2)
  /* Rows 0..7: += alpha2 * AO2, fetching AO2 rows 8..15. */
  2037. FMADD y01, alpha2, a1, y01
  2038. LFD a1, 8 * SIZE(AO2)
  2039. FMADD y02, alpha2, a2, y02
  2040. LFD a2, 9 * SIZE(AO2)
  2041. FMADD y03, alpha2, a3, y03
  2042. LFD a3, 10 * SIZE(AO2)
  2043. FMADD y04, alpha2, a4, y04
  2044. LFD a4, 11 * SIZE(AO2)
  2045. FMADD y05, alpha2, a5, y05
  2046. LFD a5, 12 * SIZE(AO2)
  2047. FMADD y06, alpha2, a6, y06
  2048. LFD a6, 13 * SIZE(AO2)
  2049. FMADD y07, alpha2, a7, y07
  2050. LFD a7, 14 * SIZE(AO2)
  2051. FMADD y08, alpha2, a8, y08
  2052. LFD a8, 15 * SIZE(AO2)
  /* Rows 8..15: += alpha2 * AO2; no further loads. */
  2053. FMADD y09, alpha2, a1, y09
  2054. FMADD y10, alpha2, a2, y10
  2055. FMADD y11, alpha2, a3, y11
  2056. FMADD y12, alpha2, a4, y12
  2057. FMADD y13, alpha2, a5, y13
  2058. FMADD y14, alpha2, a6, y14
  2059. FMADD y15, alpha2, a7, y15
  2060. FMADD y16, alpha2, a8, y16
  2061. STFD y01, 0 * SIZE(Y1)
  2062. STFD y02, 1 * SIZE(Y1)
  2063. STFD y03, 2 * SIZE(Y1)
  2064. STFD y04, 3 * SIZE(Y1)
  2065. STFD y05, 4 * SIZE(Y1)
  2066. STFD y06, 5 * SIZE(Y1)
  2067. STFD y07, 6 * SIZE(Y1)
  2068. STFD y08, 7 * SIZE(Y1)
  2069. STFD y09, 8 * SIZE(Y1)
  2070. STFD y10, 9 * SIZE(Y1)
  2071. STFD y11, 10 * SIZE(Y1)
  2072. STFD y12, 11 * SIZE(Y1)
  2073. STFD y13, 12 * SIZE(Y1)
  2074. STFD y14, 13 * SIZE(Y1)
  2075. STFD y15, 14 * SIZE(Y1)
  2076. STFD y16, 15 * SIZE(Y1)
  /* Leave the pointers just past the last full group for LL(35). */
  2077. addi AO1, AO1, 16 * SIZE
  2078. addi AO2, AO2, 16 * SIZE
  2079. addi Y1, Y1, 16 * SIZE
  /*
   * LL(35): leftover rows (M mod 16) of the two-column pass.  If the low
   * four bits of M are clear there is nothing left and control moves on to
   * LL(40); otherwise the 8-row piece is done here (when bit 3 is set) and
   * the 4/2/1-row pieces follow in LL(36)..LL(38).
   */
  2080. .align 4
  2081. LL(35):
  2082. andi. r0, M, 15
  2083. ble LL(40)
  2084. andi. r0, M, 8
  2085. ble LL(36)
  2086. LFD y01, 0 * SIZE(Y1)
  2087. LFD y02, 1 * SIZE(Y1)
  2088. LFD y03, 2 * SIZE(Y1)
  2089. LFD y04, 3 * SIZE(Y1)
  2090. LFD y05, 4 * SIZE(Y1)
  2091. LFD y06, 5 * SIZE(Y1)
  2092. LFD y07, 6 * SIZE(Y1)
  2093. LFD y08, 7 * SIZE(Y1)
  2094. LFD a1, 0 * SIZE(AO1)
  2095. LFD a2, 1 * SIZE(AO1)
  2096. LFD a3, 2 * SIZE(AO1)
  2097. LFD a4, 3 * SIZE(AO1)
  2098. LFD a5, 4 * SIZE(AO1)
  2099. LFD a6, 5 * SIZE(AO1)
  2100. LFD a7, 6 * SIZE(AO1)
  2101. LFD a8, 7 * SIZE(AO1)
  /* += alpha1 * AO1; each aN is reloaded from AO2 right after its use. */
  2102. FMADD y01, alpha1, a1, y01
  2103. LFD a1, 0 * SIZE(AO2)
  2104. FMADD y02, alpha1, a2, y02
  2105. LFD a2, 1 * SIZE(AO2)
  2106. FMADD y03, alpha1, a3, y03
  2107. LFD a3, 2 * SIZE(AO2)
  2108. FMADD y04, alpha1, a4, y04
  2109. LFD a4, 3 * SIZE(AO2)
  2110. FMADD y05, alpha1, a5, y05
  2111. LFD a5, 4 * SIZE(AO2)
  2112. FMADD y06, alpha1, a6, y06
  2113. LFD a6, 5 * SIZE(AO2)
  2114. FMADD y07, alpha1, a7, y07
  2115. LFD a7, 6 * SIZE(AO2)
  2116. FMADD y08, alpha1, a8, y08
  2117. LFD a8, 7 * SIZE(AO2)
  /* += alpha2 * AO2. */
  2118. FMADD y01, alpha2, a1, y01
  2119. FMADD y02, alpha2, a2, y02
  2120. FMADD y03, alpha2, a3, y03
  2121. FMADD y04, alpha2, a4, y04
  2122. FMADD y05, alpha2, a5, y05
  2123. FMADD y06, alpha2, a6, y06
  2124. FMADD y07, alpha2, a7, y07
  2125. FMADD y08, alpha2, a8, y08
  2126. STFD y01, 0 * SIZE(Y1)
  2127. STFD y02, 1 * SIZE(Y1)
  2128. STFD y03, 2 * SIZE(Y1)
  2129. STFD y04, 3 * SIZE(Y1)
  2130. STFD y05, 4 * SIZE(Y1)
  2131. STFD y06, 5 * SIZE(Y1)
  2132. STFD y07, 6 * SIZE(Y1)
  2133. STFD y08, 7 * SIZE(Y1)
  2134. addi AO1, AO1, 8 * SIZE
  2135. addi AO2, AO2, 8 * SIZE
  2136. addi Y1, Y1, 8 * SIZE
  /*
   * LL(36): 4-row piece of the two-column remainder (bit 2 of M).
   * a1..a4 carry the AO1 rows, a5..a8 the AO2 rows; y01..y04 receive
   * alpha1 * AO1 + alpha2 * AO2 and are stored back to Y1.
   */
  2137. .align 4
  2138. LL(36):
  2139. andi. r0, M, 4
  2140. ble LL(37)
  2141. LFD y01, 0 * SIZE(Y1)
  2142. LFD y02, 1 * SIZE(Y1)
  2143. LFD y03, 2 * SIZE(Y1)
  2144. LFD y04, 3 * SIZE(Y1)
  2145. LFD a1, 0 * SIZE(AO1)
  2146. LFD a2, 1 * SIZE(AO1)
  2147. LFD a3, 2 * SIZE(AO1)
  2148. LFD a4, 3 * SIZE(AO1)
  2149. LFD a5, 0 * SIZE(AO2)
  2150. LFD a6, 1 * SIZE(AO2)
  2151. LFD a7, 2 * SIZE(AO2)
  2152. LFD a8, 3 * SIZE(AO2)
  2153. FMADD y01, alpha1, a1, y01
  2154. FMADD y02, alpha1, a2, y02
  2155. FMADD y03, alpha1, a3, y03
  2156. FMADD y04, alpha1, a4, y04
  2157. FMADD y01, alpha2, a5, y01
  2158. FMADD y02, alpha2, a6, y02
  2159. FMADD y03, alpha2, a7, y03
  2160. FMADD y04, alpha2, a8, y04
  2161. STFD y01, 0 * SIZE(Y1)
  2162. STFD y02, 1 * SIZE(Y1)
  2163. STFD y03, 2 * SIZE(Y1)
  2164. STFD y04, 3 * SIZE(Y1)
  2165. addi AO1, AO1, 4 * SIZE
  2166. addi AO2, AO2, 4 * SIZE
  2167. addi Y1, Y1, 4 * SIZE
  /*
   * LL(37): 2-row piece of the two-column remainder (bit 1 of M).
   * a1/a2 come from AO1 and a3/a4 from AO2.
   */
  2168. .align 4
  2169. LL(37):
  2170. andi. r0, M, 2
  2171. ble LL(38)
  2172. LFD y01, 0 * SIZE(Y1)
  2173. LFD y02, 1 * SIZE(Y1)
  2174. LFD a1, 0 * SIZE(AO1)
  2175. LFD a2, 1 * SIZE(AO1)
  2176. LFD a3, 0 * SIZE(AO2)
  2177. LFD a4, 1 * SIZE(AO2)
  2178. FMADD y01, alpha1, a1, y01
  2179. FMADD y02, alpha1, a2, y02
  2180. FMADD y01, alpha2, a3, y01
  2181. FMADD y02, alpha2, a4, y02
  2182. STFD y01, 0 * SIZE(Y1)
  2183. STFD y02, 1 * SIZE(Y1)
  2184. addi AO1, AO1, 2 * SIZE
  2185. addi AO2, AO2, 2 * SIZE
  2186. addi Y1, Y1, 2 * SIZE
  /*
   * LL(38): final single row of the two-column remainder (M odd).
   * Pointers are not advanced; LL(40) reloads AO1 and Y1 before use.
   */
  2187. .align 4
  2188. LL(38):
  2189. andi. r0, M, 1
  2190. ble LL(40)
  2191. LFD y01, 0 * SIZE(Y1)
  2192. LFD a1, 0 * SIZE(AO1)
  2193. LFD a2, 0 * SIZE(AO2)
  2194. FMADD y01, alpha1, a1, y01
  2195. FMADD y01, alpha2, a2, y01
  2196. STFD y01, 0 * SIZE(Y1)
  /*
   * LL(40): single-column pass, executed when N is odd.
   * alpha1 = alpha * x[0] (X is not advanced; this chunk does not read it
   * again), AO1 = A, Y1 = YY.  Preloads the first 16 y values and the
   * first 8 AO1 values for the unrolled loop at LL(42).
   * NOTE(review): FMUL d, a, b is presumably d = a * b (macro defined
   * outside this chunk).
   */
  2197. .align 4
  2198. LL(40):
  /* lfd does not touch CR0, so ble still tests the andi. result. */
  2199. andi. J, N, 1
  2200. lfd alpha, ALPHA
  2201. ble LL(990)
  2202. .align 4
  2203. LFD alpha1, 0 * SIZE(X)
  2204. FMUL alpha1, alpha, alpha1
  2205. mr AO1, A
  2206. mr Y1, YY
  /* CTR = M >> 4 (number of 16-row groups); none -> remainder at LL(45). */
  2207. srawi. r0, M, 4
  2208. mtspr CTR, r0
  2209. ble LL(45)
  2210. LFD y01, 0 * SIZE(Y1)
  2211. LFD y02, 1 * SIZE(Y1)
  2212. LFD y03, 2 * SIZE(Y1)
  2213. LFD y04, 3 * SIZE(Y1)
  2214. LFD y05, 4 * SIZE(Y1)
  2215. LFD y06, 5 * SIZE(Y1)
  2216. LFD y07, 6 * SIZE(Y1)
  2217. LFD y08, 7 * SIZE(Y1)
  2218. LFD a1, 0 * SIZE(AO1)
  2219. LFD a2, 1 * SIZE(AO1)
  2220. LFD a3, 2 * SIZE(AO1)
  2221. LFD a4, 3 * SIZE(AO1)
  2222. LFD a5, 4 * SIZE(AO1)
  2223. LFD a6, 5 * SIZE(AO1)
  2224. LFD a7, 6 * SIZE(AO1)
  2225. LFD a8, 7 * SIZE(AO1)
  2226. LFD y09, 8 * SIZE(Y1)
  2227. LFD y10, 9 * SIZE(Y1)
  2228. LFD y11, 10 * SIZE(Y1)
  2229. LFD y12, 11 * SIZE(Y1)
  2230. LFD y13, 12 * SIZE(Y1)
  2231. LFD y14, 13 * SIZE(Y1)
  2232. LFD y15, 14 * SIZE(Y1)
  2233. LFD y16, 15 * SIZE(Y1)
  /* Decrement CTR; a single 16-row group goes straight to the drain code. */
  2234. bdz LL(43)
  /*
   * LL(42): steady-state body of the single-column pass, 16 rows per trip.
   * Entry/exit invariant: y01..y16 hold the current 16 y values and a1..a8
   * hold rows 0..7 of AO1.  Every aN is reloaded right after its last use
   * and every yNN right after it is stored, so the order is significant.
   * NOTE(review): DCBT is a macro defined outside this chunk; presumably a
   * data-cache-touch prefetch -- confirm.
   */
  2235. .align 4
  2236. LL(42):
  /* Rows 0..7, fetching AO1 rows 8..15. */
  2237. FMADD y01, alpha1, a1, y01
  2238. LFD a1, 8 * SIZE(AO1)
  2239. FMADD y02, alpha1, a2, y02
  2240. LFD a2, 9 * SIZE(AO1)
  2241. FMADD y03, alpha1, a3, y03
  2242. LFD a3, 10 * SIZE(AO1)
  2243. FMADD y04, alpha1, a4, y04
  2244. LFD a4, 11 * SIZE(AO1)
  2245. FMADD y05, alpha1, a5, y05
  2246. LFD a5, 12 * SIZE(AO1)
  2247. FMADD y06, alpha1, a6, y06
  2248. LFD a6, 13 * SIZE(AO1)
  2249. FMADD y07, alpha1, a7, y07
  2250. LFD a7, 14 * SIZE(AO1)
  2251. FMADD y08, alpha1, a8, y08
  2252. LFD a8, 15 * SIZE(AO1)
  /* Rows 8..15, fetching the next trip's AO1 rows (offsets 16..23). */
  2253. FMADD y09, alpha1, a1, y09
  2254. LFD a1, 16 * SIZE(AO1)
  2255. FMADD y10, alpha1, a2, y10
  2256. LFD a2, 17 * SIZE(AO1)
  2257. FMADD y11, alpha1, a3, y11
  2258. LFD a3, 18 * SIZE(AO1)
  2259. FMADD y12, alpha1, a4, y12
  2260. LFD a4, 19 * SIZE(AO1)
  2261. FMADD y13, alpha1, a5, y13
  2262. LFD a5, 20 * SIZE(AO1)
  2263. FMADD y14, alpha1, a6, y14
  2264. LFD a6, 21 * SIZE(AO1)
  2265. FMADD y15, alpha1, a7, y15
  2266. LFD a7, 22 * SIZE(AO1)
  2267. FMADD y16, alpha1, a8, y16
  2268. LFD a8, 23 * SIZE(AO1)
  /* Store each finished row and refill its register with the y value
     16 rows further on. */
  2269. STFD y01, 0 * SIZE(Y1)
  2270. LFD y01, 16 * SIZE(Y1)
  2271. STFD y02, 1 * SIZE(Y1)
  2272. LFD y02, 17 * SIZE(Y1)
  2273. STFD y03, 2 * SIZE(Y1)
  2274. LFD y03, 18 * SIZE(Y1)
  2275. STFD y04, 3 * SIZE(Y1)
  2276. LFD y04, 19 * SIZE(Y1)
  2277. STFD y05, 4 * SIZE(Y1)
  2278. LFD y05, 20 * SIZE(Y1)
  2279. STFD y06, 5 * SIZE(Y1)
  2280. LFD y06, 21 * SIZE(Y1)
  2281. STFD y07, 6 * SIZE(Y1)
  2282. LFD y07, 22 * SIZE(Y1)
  2283. STFD y08, 7 * SIZE(Y1)
  2284. LFD y08, 23 * SIZE(Y1)
  2285. STFD y09, 8 * SIZE(Y1)
  2286. LFD y09, 24 * SIZE(Y1)
  2287. STFD y10, 9 * SIZE(Y1)
  2288. LFD y10, 25 * SIZE(Y1)
  2289. STFD y11, 10 * SIZE(Y1)
  2290. LFD y11, 26 * SIZE(Y1)
  2291. STFD y12, 11 * SIZE(Y1)
  2292. LFD y12, 27 * SIZE(Y1)
  2293. STFD y13, 12 * SIZE(Y1)
  2294. LFD y13, 28 * SIZE(Y1)
  2295. STFD y14, 13 * SIZE(Y1)
  2296. LFD y14, 29 * SIZE(Y1)
  2297. STFD y15, 14 * SIZE(Y1)
  2298. LFD y15, 30 * SIZE(Y1)
  2299. STFD y16, 15 * SIZE(Y1)
  2300. LFD y16, 31 * SIZE(Y1)
  2301. addi AO1, AO1, 16 * SIZE
  2302. addi Y1, Y1, 16 * SIZE
  2303. DCBT(AO1, PREA)
  2304. DCBT(Y1, PREC)
  /* Decrement CTR and repeat while it is non-zero. */
  2305. bdnz LL(42)
  /*
   * LL(43): drain of the single-column unrolled loop.  Finishes the last
   * 16-row group from the values already in y01..y16 / a1..a8 and reads
   * nothing beyond row 15 of the current group.
   */
  2306. .align 4
  2307. LL(43):
  /* Rows 0..7, fetching AO1 rows 8..15. */
  2308. FMADD y01, alpha1, a1, y01
  2309. LFD a1, 8 * SIZE(AO1)
  2310. FMADD y02, alpha1, a2, y02
  2311. LFD a2, 9 * SIZE(AO1)
  2312. FMADD y03, alpha1, a3, y03
  2313. LFD a3, 10 * SIZE(AO1)
  2314. FMADD y04, alpha1, a4, y04
  2315. LFD a4, 11 * SIZE(AO1)
  2316. FMADD y05, alpha1, a5, y05
  2317. LFD a5, 12 * SIZE(AO1)
  2318. FMADD y06, alpha1, a6, y06
  2319. LFD a6, 13 * SIZE(AO1)
  2320. FMADD y07, alpha1, a7, y07
  2321. LFD a7, 14 * SIZE(AO1)
  2322. FMADD y08, alpha1, a8, y08
  2323. LFD a8, 15 * SIZE(AO1)
  /* Rows 8..15; no further loads. */
  2324. FMADD y09, alpha1, a1, y09
  2325. FMADD y10, alpha1, a2, y10
  2326. FMADD y11, alpha1, a3, y11
  2327. FMADD y12, alpha1, a4, y12
  2328. FMADD y13, alpha1, a5, y13
  2329. FMADD y14, alpha1, a6, y14
  2330. FMADD y15, alpha1, a7, y15
  2331. FMADD y16, alpha1, a8, y16
  2332. STFD y01, 0 * SIZE(Y1)
  2333. STFD y02, 1 * SIZE(Y1)
  2334. STFD y03, 2 * SIZE(Y1)
  2335. STFD y04, 3 * SIZE(Y1)
  2336. STFD y05, 4 * SIZE(Y1)
  2337. STFD y06, 5 * SIZE(Y1)
  2338. STFD y07, 6 * SIZE(Y1)
  2339. STFD y08, 7 * SIZE(Y1)
  2340. STFD y09, 8 * SIZE(Y1)
  2341. STFD y10, 9 * SIZE(Y1)
  2342. STFD y11, 10 * SIZE(Y1)
  2343. STFD y12, 11 * SIZE(Y1)
  2344. STFD y13, 12 * SIZE(Y1)
  2345. STFD y14, 13 * SIZE(Y1)
  2346. STFD y15, 14 * SIZE(Y1)
  2347. STFD y16, 15 * SIZE(Y1)
  /* Leave the pointers just past the last full group for LL(45). */
  2348. addi AO1, AO1, 16 * SIZE
  2349. addi Y1, Y1, 16 * SIZE
  /*
   * LL(45): leftover rows (M mod 16) of the single-column pass.  With no
   * leftover rows control goes to LL(990); otherwise the 8-row piece is
   * done here (when bit 3 of M is set) and LL(46)..LL(48) handle 4/2/1.
   */
  2350. .align 4
  2351. LL(45):
  2352. andi. r0, M, 15
  2353. ble LL(990)
  2354. andi. r0, M, 8
  2355. ble LL(46)
  /* Loads are issued in two halves (rows 0..3, then rows 4..7). */
  2356. LFD y01, 0 * SIZE(Y1)
  2357. LFD y02, 1 * SIZE(Y1)
  2358. LFD y03, 2 * SIZE(Y1)
  2359. LFD y04, 3 * SIZE(Y1)
  2360. LFD a1, 0 * SIZE(AO1)
  2361. LFD a2, 1 * SIZE(AO1)
  2362. LFD a3, 2 * SIZE(AO1)
  2363. LFD a4, 3 * SIZE(AO1)
  2364. LFD y05, 4 * SIZE(Y1)
  2365. LFD y06, 5 * SIZE(Y1)
  2366. LFD y07, 6 * SIZE(Y1)
  2367. LFD y08, 7 * SIZE(Y1)
  2368. LFD a5, 4 * SIZE(AO1)
  2369. LFD a6, 5 * SIZE(AO1)
  2370. LFD a7, 6 * SIZE(AO1)
  2371. LFD a8, 7 * SIZE(AO1)
  2372. FMADD y01, alpha1, a1, y01
  2373. FMADD y02, alpha1, a2, y02
  2374. FMADD y03, alpha1, a3, y03
  2375. FMADD y04, alpha1, a4, y04
  2376. FMADD y05, alpha1, a5, y05
  2377. FMADD y06, alpha1, a6, y06
  2378. FMADD y07, alpha1, a7, y07
  2379. FMADD y08, alpha1, a8, y08
  2380. STFD y01, 0 * SIZE(Y1)
  2381. STFD y02, 1 * SIZE(Y1)
  2382. STFD y03, 2 * SIZE(Y1)
  2383. STFD y04, 3 * SIZE(Y1)
  2384. STFD y05, 4 * SIZE(Y1)
  2385. STFD y06, 5 * SIZE(Y1)
  2386. STFD y07, 6 * SIZE(Y1)
  2387. STFD y08, 7 * SIZE(Y1)
  2388. addi AO1, AO1, 8 * SIZE
  2389. addi Y1, Y1, 8 * SIZE
  /*
   * LL(46): 4-row piece of the single-column remainder (bit 2 of M).
   */
  2390. .align 4
  2391. LL(46):
  2392. andi. r0, M, 4
  2393. ble LL(47)
  2394. LFD y01, 0 * SIZE(Y1)
  2395. LFD y02, 1 * SIZE(Y1)
  2396. LFD y03, 2 * SIZE(Y1)
  2397. LFD y04, 3 * SIZE(Y1)
  2398. LFD a1, 0 * SIZE(AO1)
  2399. LFD a2, 1 * SIZE(AO1)
  2400. LFD a3, 2 * SIZE(AO1)
  2401. LFD a4, 3 * SIZE(AO1)
  2402. FMADD y01, alpha1, a1, y01
  2403. FMADD y02, alpha1, a2, y02
  2404. FMADD y03, alpha1, a3, y03
  2405. FMADD y04, alpha1, a4, y04
  2406. STFD y01, 0 * SIZE(Y1)
  2407. STFD y02, 1 * SIZE(Y1)
  2408. STFD y03, 2 * SIZE(Y1)
  2409. STFD y04, 3 * SIZE(Y1)
  2410. addi AO1, AO1, 4 * SIZE
  2411. addi Y1, Y1, 4 * SIZE
  /*
   * LL(47): 2-row piece of the single-column remainder (bit 1 of M).
   */
  2412. .align 4
  2413. LL(47):
  2414. andi. r0, M, 2
  2415. ble LL(48)
  2416. LFD y01, 0 * SIZE(Y1)
  2417. LFD y02, 1 * SIZE(Y1)
  2418. LFD a1, 0 * SIZE(AO1)
  2419. LFD a2, 1 * SIZE(AO1)
  2420. FMADD y01, alpha1, a1, y01
  2421. FMADD y02, alpha1, a2, y02
  2422. STFD y01, 0 * SIZE(Y1)
  2423. STFD y02, 1 * SIZE(Y1)
  2424. addi AO1, AO1, 2 * SIZE
  2425. addi Y1, Y1, 2 * SIZE
  /*
   * LL(48): final single row of the single-column remainder (M odd).
   * Last arithmetic step before the write-back/exit code at LL(990).
   */
  2426. .align 4
  2427. LL(48):
  2428. andi. r0, M, 1
  2429. ble LL(990)
  2430. LFD y01, 0 * SIZE(Y1)
  2431. LFD a1, 0 * SIZE(AO1)
  2432. FMADD y01, alpha1, a1, y01
  2433. STFD y01, 0 * SIZE(Y1)
  /*
   * LL(990): entry to the write-back for strided y.
   * When INCY equals SIZE this whole stage is skipped (jump to LL(999)).
   * Otherwise YY is pointed at BUFFER (read contiguously below), Y1 keeps
   * the original Y as the store cursor, and CTR = M >> 3 drives LL(991).
   * NOTE(review): presumably the column passes accumulated into BUFFER
   * instead of y when y is not unit-stride, and INCY is already a byte
   * stride here; both are set up before this chunk -- confirm.
   */
  2434. .align 4
  2435. LL(990):
  2436. cmpi cr0, 0, INCY, SIZE
  2437. beq LL(999)
  2438. mr YY, BUFFER
  2439. mr Y1, Y
  /* Fewer than eight elements: go straight to the 4/2/1 tail. */
  2440. srawi. r0, M, 3
  2441. mtspr CTR, r0
  2442. ble LL(995)
  /*
   * LL(991): write-back loop, eight elements per trip.
   * Reads eight values through Y (stepping by INCY) into f0..f7, eight
   * contiguous values from YY into f8..f15, adds them pairwise and stores
   * the sums through Y1 (stepping by INCY).  Y is the read cursor and Y1
   * the write cursor over the same strided vector; both started at the
   * original Y.
   * NOTE(review): FADD d, a, b is presumably d = a + b (macro defined
   * outside this chunk).
   */
  2443. .align 4
  2444. LL(991):
  2445. LFD f0, 0 * SIZE(Y)
  2446. add Y, Y, INCY
  2447. LFD f1, 0 * SIZE(Y)
  2448. add Y, Y, INCY
  2449. LFD f2, 0 * SIZE(Y)
  2450. add Y, Y, INCY
  2451. LFD f3, 0 * SIZE(Y)
  2452. add Y, Y, INCY
  2453. LFD f4, 0 * SIZE(Y)
  2454. add Y, Y, INCY
  2455. LFD f5, 0 * SIZE(Y)
  2456. add Y, Y, INCY
  2457. LFD f6, 0 * SIZE(Y)
  2458. add Y, Y, INCY
  2459. LFD f7, 0 * SIZE(Y)
  2460. add Y, Y, INCY
  2461. LFD f8, 0 * SIZE(YY)
  2462. LFD f9, 1 * SIZE(YY)
  2463. LFD f10, 2 * SIZE(YY)
  2464. LFD f11, 3 * SIZE(YY)
  2465. LFD f12, 4 * SIZE(YY)
  2466. LFD f13, 5 * SIZE(YY)
  2467. LFD f14, 6 * SIZE(YY)
  2468. LFD f15, 7 * SIZE(YY)
  2469. addi YY, YY, 8 * SIZE
  2470. FADD f8, f8, f0
  2471. FADD f9, f9, f1
  2472. FADD f10, f10, f2
  2473. FADD f11, f11, f3
  2474. FADD f12, f12, f4
  2475. FADD f13, f13, f5
  2476. FADD f14, f14, f6
  2477. FADD f15, f15, f7
  2478. STFD f8, 0 * SIZE(Y1)
  2479. add Y1, Y1, INCY
  2480. STFD f9, 0 * SIZE(Y1)
  2481. add Y1, Y1, INCY
  2482. STFD f10, 0 * SIZE(Y1)
  2483. add Y1, Y1, INCY
  2484. STFD f11, 0 * SIZE(Y1)
  2485. add Y1, Y1, INCY
  2486. STFD f12, 0 * SIZE(Y1)
  2487. add Y1, Y1, INCY
  2488. STFD f13, 0 * SIZE(Y1)
  2489. add Y1, Y1, INCY
  2490. STFD f14, 0 * SIZE(Y1)
  2491. add Y1, Y1, INCY
  2492. STFD f15, 0 * SIZE(Y1)
  2493. add Y1, Y1, INCY
  /* Decrement CTR and repeat while it is non-zero. */
  2494. bdnz LL(991)
  /*
   * LL(995): write-back tail, four elements (bit 2 of M).  Same scheme as
   * LL(991): strided reads through Y, contiguous reads from YY, strided
   * stores through Y1.  J is only a scratch destination for the bit test.
   */
  2495. .align 4
  2496. LL(995):
  2497. andi. J, M, 4
  2498. ble LL(996)
  2499. LFD f0, 0 * SIZE(Y)
  2500. add Y, Y, INCY
  2501. LFD f1, 0 * SIZE(Y)
  2502. add Y, Y, INCY
  2503. LFD f2, 0 * SIZE(Y)
  2504. add Y, Y, INCY
  2505. LFD f3, 0 * SIZE(Y)
  2506. add Y, Y, INCY
  2507. LFD f8, 0 * SIZE(YY)
  2508. LFD f9, 1 * SIZE(YY)
  2509. LFD f10, 2 * SIZE(YY)
  2510. LFD f11, 3 * SIZE(YY)
  2511. addi YY, YY, 4 * SIZE
  2512. FADD f8, f8, f0
  2513. FADD f9, f9, f1
  2514. FADD f10, f10, f2
  2515. FADD f11, f11, f3
  2516. STFD f8, 0 * SIZE(Y1)
  2517. add Y1, Y1, INCY
  2518. STFD f9, 0 * SIZE(Y1)
  2519. add Y1, Y1, INCY
  2520. STFD f10, 0 * SIZE(Y1)
  2521. add Y1, Y1, INCY
  2522. STFD f11, 0 * SIZE(Y1)
  2523. add Y1, Y1, INCY
  /*
   * LL(996): write-back tail, two elements (bit 1 of M).
   */
  2524. .align 4
  2525. LL(996):
  2526. andi. J, M, 2
  2527. ble LL(997)
  2528. LFD f0, 0 * SIZE(Y)
  2529. add Y, Y, INCY
  2530. LFD f1, 0 * SIZE(Y)
  2531. add Y, Y, INCY
  2532. LFD f8, 0 * SIZE(YY)
  2533. LFD f9, 1 * SIZE(YY)
  2534. addi YY, YY, 2 * SIZE
  2535. FADD f8, f8, f0
  2536. FADD f9, f9, f1
  2537. STFD f8, 0 * SIZE(Y1)
  2538. add Y1, Y1, INCY
  2539. STFD f9, 0 * SIZE(Y1)
  2540. add Y1, Y1, INCY
  /*
   * LL(997): write-back tail, last element (M odd).  No cursor is advanced
   * because nothing is read or written after this.
   */
  2541. .align 4
  2542. LL(997):
  2543. andi. J, M, 1
  2544. ble LL(999)
  2545. LFD f0, 0 * SIZE(Y)
  2546. LFD f8, 0 * SIZE(YY)
  2547. FADD f8, f8, f0
  2548. STFD f8, 0 * SIZE(Y1)
  /*
   * LL(999): common exit.  Sets r3 to 0, reloads f14..f31 from offsets
   * 0..136 of the frame at SP and r14..r27 from offset 144 upward, releases
   * the frame (SP += STACKSIZE) and returns with blr.
   * NOTE(review): r3 = 0 is presumably the routine's return value; the
   * matching register saves and the STACKSIZE definition are before this
   * chunk -- confirm the offsets against the prologue.
   */
  2549. .align 4
  2550. LL(999):
  2551. li r3, 0
  /* Floating-point registers, one 8-byte slot each. */
  2552. lfd f14, 0(SP)
  2553. lfd f15, 8(SP)
  2554. lfd f16, 16(SP)
  2555. lfd f17, 24(SP)
  2556. lfd f18, 32(SP)
  2557. lfd f19, 40(SP)
  2558. lfd f20, 48(SP)
  2559. lfd f21, 56(SP)
  2560. lfd f22, 64(SP)
  2561. lfd f23, 72(SP)
  2562. lfd f24, 80(SP)
  2563. lfd f25, 88(SP)
  2564. lfd f26, 96(SP)
  2565. lfd f27, 104(SP)
  2566. lfd f28, 112(SP)
  2567. lfd f29, 120(SP)
  2568. lfd f30, 128(SP)
  2569. lfd f31, 136(SP)
  /* General-purpose registers: 8-byte slots (ld) when __64BIT__ is
     defined, 4-byte slots (lwz) otherwise. */
  2570. #ifdef __64BIT__
  2571. ld r14, 144(SP)
  2572. ld r15, 152(SP)
  2573. ld r16, 160(SP)
  2574. ld r17, 168(SP)
  2575. ld r18, 176(SP)
  2576. ld r19, 184(SP)
  2577. ld r20, 192(SP)
  2578. ld r21, 200(SP)
  2579. ld r22, 208(SP)
  2580. ld r23, 216(SP)
  2581. ld r24, 224(SP)
  2582. ld r25, 232(SP)
  2583. ld r26, 240(SP)
  2584. ld r27, 248(SP)
  2585. #else
  2586. lwz r14, 144(SP)
  2587. lwz r15, 148(SP)
  2588. lwz r16, 152(SP)
  2589. lwz r17, 156(SP)
  2590. lwz r18, 160(SP)
  2591. lwz r19, 164(SP)
  2592. lwz r20, 168(SP)
  2593. lwz r21, 172(SP)
  2594. lwz r22, 176(SP)
  2595. lwz r23, 180(SP)
  2596. lwz r24, 184(SP)
  2597. lwz r25, 188(SP)
  2598. lwz r26, 192(SP)
  2599. lwz r27, 196(SP)
  2600. #endif
  /* Pop the frame and return to the caller. */
  2601. addi SP, SP, STACKSIZE
  2602. blr
  /* EPILOGUE is a macro defined outside this chunk. */
  2603. EPILOGUE
  2604. #endif