You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

gemv_n.S 63 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /*
   * Argument register aliases for Linux builds.
   * Each name is an alias for the general-purpose register that holds (or,
   * for stack-passed arguments, will be loaded with) that argument.
   * NOTE(review): the names presumably mirror the GEMV arguments
   * (m, n, a, lda, x, incx, y, incy) -- confirm against the C-side caller.
   */
  40. #ifdef linux
  41. #ifndef __64BIT__
  /* 32-bit: INCY (r5) is loaded in the prologue from FRAMESLOT(0) + STACKSIZE(SP). */
  42. #define M r3
  43. #define N r4
  44. #define A r6
  45. #define LDA r7
  46. #define X r8
  47. #define INCX r9
  48. #define Y r10
  49. #define INCY r5
  50. #else
  /* 64-bit: Y (r5) and INCY (r6) are loaded in the prologue from FRAMESLOT(0) and FRAMESLOT(1) + STACKSIZE(SP). */
  51. #define M r3
  52. #define N r4
  53. #define A r7
  54. #define LDA r8
  55. #define X r9
  56. #define INCX r10
  57. #define Y r5
  58. #define INCY r6
  59. #endif
  60. #endif
  /*
   * Argument register aliases for AIX and Apple builds.
   * Same set of names as the Linux mapping, but with a different register
   * assignment when building 32-bit with DOUBLE defined.
   */
  61. #if defined(_AIX) || defined(__APPLE__)
  62. #if !defined(__64BIT__) && defined(DOUBLE)
  /* 32-bit DOUBLE: INCX (r5), Y (r6) and INCY (r7) are loaded in the prologue from FRAMESLOT(0..2) + STACKSIZE(SP). */
  63. #define M r3
  64. #define N r4
  65. #define A r8
  66. #define LDA r9
  67. #define X r10
  68. #define INCX r5
  69. #define Y r6
  70. #define INCY r7
  71. #else
  /* All other AIX/Apple builds: Y (r5) and INCY (r6) are loaded in the prologue from FRAMESLOT(0) and FRAMESLOT(1) + STACKSIZE(SP). */
  72. #define M r3
  73. #define N r4
  74. #define A r7
  75. #define LDA r8
  76. #define X r9
  77. #define INCX r10
  78. #define Y r5
  79. #define INCY r6
  80. #endif
  81. #endif
  /*
   * Working register aliases used by the routine body.
   * r14-r27 and f14-f31 are saved to the stack frame by the prologue before
   * any of the aliases that map onto them are written.
   */
  /* Integer scratch. J is set to N >> 3 just before the LL(11) loop. */
  82. #define I r11
  83. #define J r12
  /* AO1..AO8: pointers into A for eight consecutive columns (AO1 = A, AOk+1 = AOk + LDA). */
  84. #define AO1 r14
  85. #define AO2 r15
  86. #define AO3 r16
  87. #define AO4 r17
  88. #define AO5 r18
  89. #define AO6 r19
  90. #define AO7 r20
  91. #define AO8 r21
  /* LDA8 = LDA << (BASE_SHIFT + 3), computed before LDA itself is shifted by BASE_SHIFT. */
  92. #define LDA8 r22
  /* Y1: moving pointer into the output area; restarted from YY for each group of eight columns. */
  93. #define Y1 r23
  /* PREA / PREC: byte offsets PREFETCHSIZE_A * SIZE and PREFETCHSIZE_C * SIZE passed to DCBT(). */
  94. #define PREA r24
  95. #define PREC r25
  /* YY: base of the output area -- Y itself when INCY == SIZE, otherwise BUFFER. */
  96. #define YY r26
  /* BUFFER: pointer loaded from the stack in the prologue; zero-filled by LL(02) when INCY != SIZE. */
  97. #define BUFFER r27
  /* y01..y16 (f0-f15): sixteen output elements held while the eight columns are accumulated into them. */
  /* f0 (y01) also carries the 0.0 loaded from FZERO for the LL(02) zero-fill loop. */
  98. #define y01 f0
  99. #define y02 f1
  100. #define y03 f2
  101. #define y04 f3
  102. #define y05 f4
  103. #define y06 f5
  104. #define y07 f6
  105. #define y08 f7
  106. #define y09 f8
  107. #define y10 f9
  108. #define y11 f10
  109. #define y12 f11
  110. #define y13 f12
  111. #define y14 f13
  112. #define y15 f14
  113. #define y16 f15
  /* alpha1..alpha8 (f16-f23): eight consecutive X elements, each multiplied by alpha at LL(11). */
  114. #define alpha1 f16
  115. #define alpha2 f17
  116. #define alpha3 f18
  117. #define alpha4 f19
  118. #define alpha5 f20
  119. #define alpha6 f21
  120. #define alpha7 f22
  121. #define alpha8 f23
  /* a1..a8 (f24-f31): values loaded from the current A column, eight at a time. */
  122. #define a1 f24
  123. #define a2 f25
  124. #define a3 f26
  125. #define a4 f27
  126. #define a5 f28
  127. #define a6 f29
  128. #define a7 f30
  129. #define a8 f31
  /*
   * alpha shares f31 with a8: every load into a8 overwrites alpha.
   * NOTE(review): alpha is set once by `fmr alpha, f1` and read by the FMULs
   * at LL(11); confirm it is reloaded from ALPHA before LL(11) runs again,
   * otherwise later column groups are scaled by a stale a8 value.
   */
  130. #define alpha f31
  /*
   * Per-CPU prefetch distances.
   * PREFETCHSIZE_A and PREFETCHSIZE_C are multiplied by SIZE and loaded into
   * PREA and PREC, which are then passed as the offset argument of
   * DCBT(AOn, PREA) and DCBT(Y1, PREC) in the main loop.
   * NOTE(review): DCBT is defined outside this file -- presumably the dcbt
   * cache-touch hint; confirm in common.h.
   * NOTE(review): there is no fallback branch. If none of the CPU macros
   * below is defined, both names stay undefined where PREA/PREC are loaded;
   * confirm the build always defines exactly one of them.
   */
  131. #if defined(PPCG4)
  132. #define PREFETCHSIZE_A 24
  133. #define PREFETCHSIZE_C 16
  134. #endif
  135. #if defined(PPC440) || defined(PPC440FP2)
  136. #define PREFETCHSIZE_A 24
  137. #define PREFETCHSIZE_C 16
  138. #endif
  139. #ifdef PPC970
  140. #define PREFETCHSIZE_A 16
  141. #define PREFETCHSIZE_C 16
  142. #endif
  143. #ifdef CELL
  144. #define PREFETCHSIZE_A 16
  145. #define PREFETCHSIZE_C 16
  146. #endif
  147. #ifdef POWER4
  148. #define PREFETCHSIZE_A 16
  149. #define PREFETCHSIZE_C 16
  150. #endif
  151. #ifdef POWER5
  152. #define PREFETCHSIZE_A 40
  153. #define PREFETCHSIZE_C 24
  154. #endif
  155. #ifdef POWER6
  156. #define PREFETCHSIZE_A 96
  157. #define PREFETCHSIZE_C 40
  158. #endif
  /* POWER8 uses the same distances as POWER6. */
  159. #ifdef POWER8
  160. #define PREFETCHSIZE_A 96
  161. #define PREFETCHSIZE_C 40
  162. #endif
  163. #ifndef NEEDPARAM
  /*
   * Local stack frame layout (offsets from SP after `addi SP, SP, -STACKSIZE`).
   *   0 .. 143       f14-f31 save area (18 x 8 bytes)
   *   144 .. 199     r14-r27 save area, 32-bit (14 x 4 bytes)
   *   144 .. 255     r14-r27 save area, 64-bit (14 x 8 bytes)
   *   ALPHA          8-byte slot written by `stfd f1, ALPHA`
   *   FZERO          8-byte slot filled with zero from r0, later read with
   *                  `lfd f0, FZERO` to obtain 0.0
   */
  164. #ifndef __64BIT__
  /* 32-bit frame: ALPHA and FZERO sit directly after the 4-byte GPR saves. */
  165. #define STACKSIZE 224
  166. #define ALPHA 200(SP)
  167. #define FZERO 208(SP)
  168. #else
  /* 64-bit frame: ALPHA and FZERO sit directly after the 8-byte GPR saves. */
  169. #define STACKSIZE 280
  170. #define ALPHA 256(SP)
  171. #define FZERO 264(SP)
  172. #endif
  173. PROLOGUE
  174. PROFCODE
  175. addi SP, SP, -STACKSIZE
  176. li r0, 0
  177. stfd f14, 0(SP)
  178. stfd f15, 8(SP)
  179. stfd f16, 16(SP)
  180. stfd f17, 24(SP)
  181. stfd f18, 32(SP)
  182. stfd f19, 40(SP)
  183. stfd f20, 48(SP)
  184. stfd f21, 56(SP)
  185. stfd f22, 64(SP)
  186. stfd f23, 72(SP)
  187. stfd f24, 80(SP)
  188. stfd f25, 88(SP)
  189. stfd f26, 96(SP)
  190. stfd f27, 104(SP)
  191. stfd f28, 112(SP)
  192. stfd f29, 120(SP)
  193. stfd f30, 128(SP)
  194. stfd f31, 136(SP)
  195. #ifdef __64BIT__
  196. std r0, FZERO
  197. std r14, 144(SP)
  198. std r15, 152(SP)
  199. std r16, 160(SP)
  200. std r17, 168(SP)
  201. std r18, 176(SP)
  202. std r19, 184(SP)
  203. std r20, 192(SP)
  204. std r21, 200(SP)
  205. std r22, 208(SP)
  206. std r23, 216(SP)
  207. std r24, 224(SP)
  208. std r25, 232(SP)
  209. std r26, 240(SP)
  210. std r27, 248(SP)
  211. #else
  212. stw r0, 0 + FZERO
  213. stw r0, 4 + FZERO
  214. stw r14, 144(SP)
  215. stw r15, 148(SP)
  216. stw r16, 152(SP)
  217. stw r17, 156(SP)
  218. stw r18, 160(SP)
  219. stw r19, 164(SP)
  220. stw r20, 168(SP)
  221. stw r21, 172(SP)
  222. stw r22, 176(SP)
  223. stw r23, 180(SP)
  224. stw r24, 184(SP)
  225. stw r25, 188(SP)
  226. stw r26, 192(SP)
  227. stw r27, 196(SP)
  228. #endif
  229. #ifdef linux
  230. #ifndef __64BIT__
  231. lwz INCY, FRAMESLOT(0) + STACKSIZE(SP)
  232. lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP)
  233. #else
  234. ld Y, FRAMESLOT(0) + STACKSIZE(SP)
  235. ld INCY, FRAMESLOT(1) + STACKSIZE(SP)
  236. ld BUFFER, FRAMESLOT(2) + STACKSIZE(SP)
  237. #endif
  238. #endif
  239. #if defined(_AIX) || defined(__APPLE__)
  240. #ifndef __64BIT__
  241. #ifdef DOUBLE
  242. lwz INCX, FRAMESLOT(0) + STACKSIZE(SP)
  243. lwz Y, FRAMESLOT(1) + STACKSIZE(SP)
  244. lwz INCY, FRAMESLOT(2) + STACKSIZE(SP)
  245. lwz BUFFER, FRAMESLOT(3) + STACKSIZE(SP)
  246. #else
  247. lwz Y, FRAMESLOT(0) + STACKSIZE(SP)
  248. lwz INCY, FRAMESLOT(1) + STACKSIZE(SP)
  249. lwz BUFFER, FRAMESLOT(2) + STACKSIZE(SP)
  250. #endif
  251. #else
  252. ld Y, FRAMESLOT(0) + STACKSIZE(SP)
  253. ld INCY, FRAMESLOT(1) + STACKSIZE(SP)
  254. ld BUFFER, FRAMESLOT(2) + STACKSIZE(SP)
  255. #endif
  256. #endif
  257. stfd f1, ALPHA
  258. fmr alpha, f1
  259. slwi LDA8, LDA, BASE_SHIFT + 3
  260. slwi LDA, LDA, BASE_SHIFT
  261. slwi INCX, INCX, BASE_SHIFT
  262. slwi INCY, INCY, BASE_SHIFT
  263. li PREA, PREFETCHSIZE_A * SIZE
  264. li PREC, PREFETCHSIZE_C * SIZE
  265. cmpwi cr0, M, 0
  266. ble- LL(999)
  267. cmpwi cr0, N, 0
  268. ble- LL(999)
  269. mr YY, Y
  270. lfd f0, FZERO
  271. cmpi cr0, 0, INCY, SIZE
  272. beq LL(10)
  273. mr YY, BUFFER
  274. mr Y1, BUFFER
  275. addi r0, M, 7
  276. srawi. r0, r0, 3
  277. mtspr CTR, r0
  278. .align 4
  279. LL(02):
  280. STFD f0, 0 * SIZE(Y1)
  281. STFD f0, 1 * SIZE(Y1)
  282. STFD f0, 2 * SIZE(Y1)
  283. STFD f0, 3 * SIZE(Y1)
  284. STFD f0, 4 * SIZE(Y1)
  285. STFD f0, 5 * SIZE(Y1)
  286. STFD f0, 6 * SIZE(Y1)
  287. STFD f0, 7 * SIZE(Y1)
  288. addi Y1, Y1, 8 * SIZE
  289. bdnz LL(02)
  290. .align 4
  291. LL(10):
  292. srawi. J, N, 3
  293. ble LL(20)
  294. .align 4
  295. LL(11):
  296. LFD alpha1, 0 * SIZE(X)
  297. add X, X, INCX
  298. LFD alpha2, 0 * SIZE(X)
  299. add X, X, INCX
  300. LFD alpha3, 0 * SIZE(X)
  301. add X, X, INCX
  302. LFD alpha4, 0 * SIZE(X)
  303. add X, X, INCX
  304. LFD alpha5, 0 * SIZE(X)
  305. add X, X, INCX
  306. LFD alpha6, 0 * SIZE(X)
  307. add X, X, INCX
  308. LFD alpha7, 0 * SIZE(X)
  309. add X, X, INCX
  310. LFD alpha8, 0 * SIZE(X)
  311. add X, X, INCX
  312. FMUL alpha1, alpha, alpha1
  313. FMUL alpha2, alpha, alpha2
  314. FMUL alpha3, alpha, alpha3
  315. FMUL alpha4, alpha, alpha4
  316. FMUL alpha5, alpha, alpha5
  317. FMUL alpha6, alpha, alpha6
  318. FMUL alpha7, alpha, alpha7
  319. FMUL alpha8, alpha, alpha8
  320. mr AO1, A
  321. add AO2, A, LDA
  322. add AO3, AO2, LDA
  323. add AO4, AO3, LDA
  324. add AO5, AO4, LDA
  325. add AO6, AO5, LDA
  326. add AO7, AO6, LDA
  327. add AO8, AO7, LDA
  328. add A, AO8, LDA
  329. mr Y1, YY
  330. srawi. r0, M, 4
  331. mtspr CTR, r0
  332. ble LL(15)
  333. LFD y01, 0 * SIZE(Y1)
  334. LFD y02, 1 * SIZE(Y1)
  335. LFD y03, 2 * SIZE(Y1)
  336. LFD y04, 3 * SIZE(Y1)
  337. LFD y05, 4 * SIZE(Y1)
  338. LFD y06, 5 * SIZE(Y1)
  339. LFD y07, 6 * SIZE(Y1)
  340. LFD y08, 7 * SIZE(Y1)
  341. LFD a1, 0 * SIZE(AO1)
  342. LFD a2, 1 * SIZE(AO1)
  343. LFD a3, 2 * SIZE(AO1)
  344. LFD a4, 3 * SIZE(AO1)
  345. LFD a5, 4 * SIZE(AO1)
  346. LFD a6, 5 * SIZE(AO1)
  347. LFD a7, 6 * SIZE(AO1)
  348. LFD a8, 7 * SIZE(AO1)
  349. LFD y09, 8 * SIZE(Y1)
  350. LFD y10, 9 * SIZE(Y1)
  351. LFD y11, 10 * SIZE(Y1)
  352. LFD y12, 11 * SIZE(Y1)
  353. LFD y13, 12 * SIZE(Y1)
  354. LFD y14, 13 * SIZE(Y1)
  355. LFD y15, 14 * SIZE(Y1)
  356. LFD y16, 15 * SIZE(Y1)
  357. FMADD y01, alpha1, a1, y01
  358. FMADD y02, alpha1, a2, y02
  359. FMADD y03, alpha1, a3, y03
  360. FMADD y04, alpha1, a4, y04
  361. LFD a1, 8 * SIZE(AO1)
  362. LFD a2, 9 * SIZE(AO1)
  363. LFD a3, 10 * SIZE(AO1)
  364. LFD a4, 11 * SIZE(AO1)
  365. FMADD y05, alpha1, a5, y05
  366. FMADD y06, alpha1, a6, y06
  367. FMADD y07, alpha1, a7, y07
  368. FMADD y08, alpha1, a8, y08
  369. LFD a5, 12 * SIZE(AO1)
  370. LFD a6, 13 * SIZE(AO1)
  371. LFD a7, 14 * SIZE(AO1)
  372. LFD a8, 15 * SIZE(AO1)
  373. addi AO1, AO1, 16 * SIZE
  374. nop
  375. nop
  376. DCBT(AO1, PREA)
  377. FMADD y09, alpha1, a1, y09
  378. FMADD y10, alpha1, a2, y10
  379. FMADD y11, alpha1, a3, y11
  380. FMADD y12, alpha1, a4, y12
  381. LFD a1, 0 * SIZE(AO2)
  382. LFD a2, 1 * SIZE(AO2)
  383. LFD a3, 2 * SIZE(AO2)
  384. LFD a4, 3 * SIZE(AO2)
  385. FMADD y13, alpha1, a5, y13
  386. FMADD y14, alpha1, a6, y14
  387. FMADD y15, alpha1, a7, y15
  388. FMADD y16, alpha1, a8, y16
  389. LFD a5, 4 * SIZE(AO2)
  390. LFD a6, 5 * SIZE(AO2)
  391. LFD a7, 6 * SIZE(AO2)
  392. LFD a8, 7 * SIZE(AO2)
  393. FMADD y01, alpha2, a1, y01
  394. FMADD y02, alpha2, a2, y02
  395. FMADD y03, alpha2, a3, y03
  396. FMADD y04, alpha2, a4, y04
  397. LFD a1, 8 * SIZE(AO2)
  398. LFD a2, 9 * SIZE(AO2)
  399. LFD a3, 10 * SIZE(AO2)
  400. LFD a4, 11 * SIZE(AO2)
  401. FMADD y05, alpha2, a5, y05
  402. FMADD y06, alpha2, a6, y06
  403. FMADD y07, alpha2, a7, y07
  404. FMADD y08, alpha2, a8, y08
  405. LFD a5, 12 * SIZE(AO2)
  406. LFD a6, 13 * SIZE(AO2)
  407. LFD a7, 14 * SIZE(AO2)
  408. LFD a8, 15 * SIZE(AO2)
  409. addi AO2, AO2, 16 * SIZE
  410. nop
  411. nop
  412. DCBT(AO2, PREA)
  413. FMADD y09, alpha2, a1, y09
  414. FMADD y10, alpha2, a2, y10
  415. FMADD y11, alpha2, a3, y11
  416. FMADD y12, alpha2, a4, y12
  417. LFD a1, 0 * SIZE(AO3)
  418. LFD a2, 1 * SIZE(AO3)
  419. LFD a3, 2 * SIZE(AO3)
  420. LFD a4, 3 * SIZE(AO3)
  421. FMADD y13, alpha2, a5, y13
  422. FMADD y14, alpha2, a6, y14
  423. FMADD y15, alpha2, a7, y15
  424. FMADD y16, alpha2, a8, y16
  425. LFD a5, 4 * SIZE(AO3)
  426. LFD a6, 5 * SIZE(AO3)
  427. LFD a7, 6 * SIZE(AO3)
  428. LFD a8, 7 * SIZE(AO3)
  429. FMADD y01, alpha3, a1, y01
  430. FMADD y02, alpha3, a2, y02
  431. FMADD y03, alpha3, a3, y03
  432. FMADD y04, alpha3, a4, y04
  433. LFD a1, 8 * SIZE(AO3)
  434. LFD a2, 9 * SIZE(AO3)
  435. LFD a3, 10 * SIZE(AO3)
  436. LFD a4, 11 * SIZE(AO3)
  437. FMADD y05, alpha3, a5, y05
  438. FMADD y06, alpha3, a6, y06
  439. FMADD y07, alpha3, a7, y07
  440. FMADD y08, alpha3, a8, y08
  441. LFD a5, 12 * SIZE(AO3)
  442. LFD a6, 13 * SIZE(AO3)
  443. LFD a7, 14 * SIZE(AO3)
  444. LFD a8, 15 * SIZE(AO3)
  445. addi AO3, AO3, 16 * SIZE
  446. nop
  447. nop
  448. DCBT(AO3, PREA)
  449. FMADD y09, alpha3, a1, y09
  450. FMADD y10, alpha3, a2, y10
  451. FMADD y11, alpha3, a3, y11
  452. FMADD y12, alpha3, a4, y12
  453. LFD a1, 0 * SIZE(AO4)
  454. LFD a2, 1 * SIZE(AO4)
  455. LFD a3, 2 * SIZE(AO4)
  456. LFD a4, 3 * SIZE(AO4)
  457. FMADD y13, alpha3, a5, y13
  458. FMADD y14, alpha3, a6, y14
  459. FMADD y15, alpha3, a7, y15
  460. FMADD y16, alpha3, a8, y16
  461. LFD a5, 4 * SIZE(AO4)
  462. LFD a6, 5 * SIZE(AO4)
  463. LFD a7, 6 * SIZE(AO4)
  464. LFD a8, 7 * SIZE(AO4)
  465. FMADD y01, alpha4, a1, y01
  466. FMADD y02, alpha4, a2, y02
  467. FMADD y03, alpha4, a3, y03
  468. FMADD y04, alpha4, a4, y04
  469. LFD a1, 8 * SIZE(AO4)
  470. LFD a2, 9 * SIZE(AO4)
  471. LFD a3, 10 * SIZE(AO4)
  472. LFD a4, 11 * SIZE(AO4)
  473. FMADD y05, alpha4, a5, y05
  474. FMADD y06, alpha4, a6, y06
  475. FMADD y07, alpha4, a7, y07
  476. FMADD y08, alpha4, a8, y08
  477. LFD a5, 12 * SIZE(AO4)
  478. LFD a6, 13 * SIZE(AO4)
  479. LFD a7, 14 * SIZE(AO4)
  480. LFD a8, 15 * SIZE(AO4)
  481. addi AO4, AO4, 16 * SIZE
  482. nop
  483. nop
  484. DCBT(AO4, PREA)
  485. FMADD y09, alpha4, a1, y09
  486. FMADD y10, alpha4, a2, y10
  487. FMADD y11, alpha4, a3, y11
  488. FMADD y12, alpha4, a4, y12
  489. LFD a1, 0 * SIZE(AO5)
  490. LFD a2, 1 * SIZE(AO5)
  491. LFD a3, 2 * SIZE(AO5)
  492. LFD a4, 3 * SIZE(AO5)
  493. FMADD y13, alpha4, a5, y13
  494. FMADD y14, alpha4, a6, y14
  495. FMADD y15, alpha4, a7, y15
  496. FMADD y16, alpha4, a8, y16
  497. LFD a5, 4 * SIZE(AO5)
  498. LFD a6, 5 * SIZE(AO5)
  499. LFD a7, 6 * SIZE(AO5)
  500. LFD a8, 7 * SIZE(AO5)
  501. FMADD y01, alpha5, a1, y01
  502. FMADD y02, alpha5, a2, y02
  503. FMADD y03, alpha5, a3, y03
  504. FMADD y04, alpha5, a4, y04
  505. LFD a1, 8 * SIZE(AO5)
  506. LFD a2, 9 * SIZE(AO5)
  507. LFD a3, 10 * SIZE(AO5)
  508. LFD a4, 11 * SIZE(AO5)
  509. FMADD y05, alpha5, a5, y05
  510. FMADD y06, alpha5, a6, y06
  511. FMADD y07, alpha5, a7, y07
  512. FMADD y08, alpha5, a8, y08
  513. LFD a5, 12 * SIZE(AO5)
  514. LFD a6, 13 * SIZE(AO5)
  515. LFD a7, 14 * SIZE(AO5)
  516. LFD a8, 15 * SIZE(AO5)
  517. addi AO5, AO5, 16 * SIZE
  518. nop
  519. nop
  520. DCBT(AO5, PREA)
  521. FMADD y09, alpha5, a1, y09
  522. FMADD y10, alpha5, a2, y10
  523. FMADD y11, alpha5, a3, y11
  524. FMADD y12, alpha5, a4, y12
  525. LFD a1, 0 * SIZE(AO6)
  526. LFD a2, 1 * SIZE(AO6)
  527. LFD a3, 2 * SIZE(AO6)
  528. LFD a4, 3 * SIZE(AO6)
  529. FMADD y13, alpha5, a5, y13
  530. FMADD y14, alpha5, a6, y14
  531. FMADD y15, alpha5, a7, y15
  532. FMADD y16, alpha5, a8, y16
  533. LFD a5, 4 * SIZE(AO6)
  534. LFD a6, 5 * SIZE(AO6)
  535. LFD a7, 6 * SIZE(AO6)
  536. LFD a8, 7 * SIZE(AO6)
  537. FMADD y01, alpha6, a1, y01
  538. FMADD y02, alpha6, a2, y02
  539. FMADD y03, alpha6, a3, y03
  540. FMADD y04, alpha6, a4, y04
  541. LFD a1, 8 * SIZE(AO6)
  542. LFD a2, 9 * SIZE(AO6)
  543. LFD a3, 10 * SIZE(AO6)
  544. LFD a4, 11 * SIZE(AO6)
  545. FMADD y05, alpha6, a5, y05
  546. FMADD y06, alpha6, a6, y06
  547. FMADD y07, alpha6, a7, y07
  548. FMADD y08, alpha6, a8, y08
  549. LFD a5, 12 * SIZE(AO6)
  550. LFD a6, 13 * SIZE(AO6)
  551. LFD a7, 14 * SIZE(AO6)
  552. LFD a8, 15 * SIZE(AO6)
  553. addi AO6, AO6, 16 * SIZE
  554. nop
  555. nop
  556. DCBT(AO6, PREA)
  557. FMADD y09, alpha6, a1, y09
  558. FMADD y10, alpha6, a2, y10
  559. FMADD y11, alpha6, a3, y11
  560. FMADD y12, alpha6, a4, y12
  561. LFD a1, 0 * SIZE(AO7)
  562. LFD a2, 1 * SIZE(AO7)
  563. LFD a3, 2 * SIZE(AO7)
  564. LFD a4, 3 * SIZE(AO7)
  565. FMADD y13, alpha6, a5, y13
  566. FMADD y14, alpha6, a6, y14
  567. FMADD y15, alpha6, a7, y15
  568. FMADD y16, alpha6, a8, y16
  569. LFD a5, 4 * SIZE(AO7)
  570. LFD a6, 5 * SIZE(AO7)
  571. LFD a7, 6 * SIZE(AO7)
  572. LFD a8, 7 * SIZE(AO7)
  573. FMADD y01, alpha7, a1, y01
  574. FMADD y02, alpha7, a2, y02
  575. FMADD y03, alpha7, a3, y03
  576. FMADD y04, alpha7, a4, y04
  577. LFD a1, 8 * SIZE(AO7)
  578. LFD a2, 9 * SIZE(AO7)
  579. LFD a3, 10 * SIZE(AO7)
  580. LFD a4, 11 * SIZE(AO7)
  581. FMADD y05, alpha7, a5, y05
  582. FMADD y06, alpha7, a6, y06
  583. FMADD y07, alpha7, a7, y07
  584. FMADD y08, alpha7, a8, y08
  585. LFD a5, 12 * SIZE(AO7)
  586. LFD a6, 13 * SIZE(AO7)
  587. LFD a7, 14 * SIZE(AO7)
  588. LFD a8, 15 * SIZE(AO7)
  589. addi AO7, AO7, 16 * SIZE
  590. nop
  591. nop
  592. DCBT(AO7, PREA)
  593. FMADD y09, alpha7, a1, y09
  594. FMADD y10, alpha7, a2, y10
  595. FMADD y11, alpha7, a3, y11
  596. FMADD y12, alpha7, a4, y12
  597. LFD a1, 0 * SIZE(AO8)
  598. LFD a2, 1 * SIZE(AO8)
  599. LFD a3, 2 * SIZE(AO8)
  600. LFD a4, 3 * SIZE(AO8)
  601. FMADD y13, alpha7, a5, y13
  602. FMADD y14, alpha7, a6, y14
  603. FMADD y15, alpha7, a7, y15
  604. FMADD y16, alpha7, a8, y16
  605. LFD a5, 4 * SIZE(AO8)
  606. LFD a6, 5 * SIZE(AO8)
  607. LFD a7, 6 * SIZE(AO8)
  608. LFD a8, 7 * SIZE(AO8)
  609. FMADD y01, alpha8, a1, y01
  610. FMADD y02, alpha8, a2, y02
  611. FMADD y03, alpha8, a3, y03
  612. FMADD y04, alpha8, a4, y04
  613. LFD a1, 8 * SIZE(AO8)
  614. LFD a2, 9 * SIZE(AO8)
  615. LFD a3, 10 * SIZE(AO8)
  616. LFD a4, 11 * SIZE(AO8)
  617. FMADD y05, alpha8, a5, y05
  618. FMADD y06, alpha8, a6, y06
  619. FMADD y07, alpha8, a7, y07
  620. FMADD y08, alpha8, a8, y08
  621. LFD a5, 12 * SIZE(AO8)
  622. LFD a6, 13 * SIZE(AO8)
  623. LFD a7, 14 * SIZE(AO8)
  624. LFD a8, 15 * SIZE(AO8)
  625. addi AO8, AO8, 16 * SIZE
  626. nop
  627. nop
  628. DCBT(AO8, PREA)
  629. FMADD y09, alpha8, a1, y09
  630. FMADD y10, alpha8, a2, y10
  631. FMADD y11, alpha8, a3, y11
  632. FMADD y12, alpha8, a4, y12
  633. LFD a1, 0 * SIZE(AO1)
  634. LFD a2, 1 * SIZE(AO1)
  635. LFD a3, 2 * SIZE(AO1)
  636. LFD a4, 3 * SIZE(AO1)
  637. FMADD y13, alpha8, a5, y13
  638. FMADD y14, alpha8, a6, y14
  639. FMADD y15, alpha8, a7, y15
  640. FMADD y16, alpha8, a8, y16
  641. LFD a5, 4 * SIZE(AO1)
  642. LFD a6, 5 * SIZE(AO1)
  643. LFD a7, 6 * SIZE(AO1)
  644. LFD a8, 7 * SIZE(AO1)
  645. STFD y01, 0 * SIZE(Y1)
  646. STFD y02, 1 * SIZE(Y1)
  647. STFD y03, 2 * SIZE(Y1)
  648. STFD y04, 3 * SIZE(Y1)
  649. LFD y01, 16 * SIZE(Y1)
  650. LFD y02, 17 * SIZE(Y1)
  651. LFD y03, 18 * SIZE(Y1)
  652. LFD y04, 19 * SIZE(Y1)
  653. DCBT(Y1, PREC)
  654. bdz LL(13)
  655. .align 4
  /*
   * LL(12): steady-state loop of the eight-column pass.
   *
   * Every trip combines sixteen consecutive entries of each of the eight
   * columns addressed by AO1..AO8 (scaled by alpha1..alpha8) into the sixteen
   * accumulators y01..y16, then moves AO1..AO8 and Y1 on by 16 elements.
   * LFD/STFD/FMADD/DCBT are macros defined outside this chunk; FMADD d, x, a, d
   * is presumably d = x * a + d, and DCBT presumably a cache-touch hint --
   * confirm against the macro definitions.
   *
   * The loop is software pipelined:
   *   - on entry y05..y16 still hold values computed earlier; they are stored
   *     to Y1[4..15] below before being reloaded from Y1[20..31];
   *   - y01..y04 were loaded from Y1[16..19] and a1..a8 from AO1[0..7] by the
   *     code that enters the loop (the previous trip or the code above);
   *   - consequently AO?[0..15] pair with Y1[16..31] inside this loop.
   * The exact instruction order is part of the schedule; do not reorder.
   */
  656. LL(12):
  /* Column 1 (alpha1 / AO1), interleaved with flushing the previous group. */
  657. FMADD y01, alpha1, a1, y01
  658. FMADD y02, alpha1, a2, y02
  659. FMADD y03, alpha1, a3, y03
  660. FMADD y04, alpha1, a4, y04
  661. LFD a1, 8 * SIZE(AO1)
  662. LFD a2, 9 * SIZE(AO1)
  663. LFD a3, 10 * SIZE(AO1)
  664. LFD a4, 11 * SIZE(AO1)
  /* Store earlier results to Y1[4..7], then fetch the new accumulators. */
  665. STFD y05, 4 * SIZE(Y1)
  666. STFD y06, 5 * SIZE(Y1)
  667. STFD y07, 6 * SIZE(Y1)
  668. STFD y08, 7 * SIZE(Y1)
  669. LFD y05, 20 * SIZE(Y1)
  670. LFD y06, 21 * SIZE(Y1)
  671. LFD y07, 22 * SIZE(Y1)
  672. LFD y08, 23 * SIZE(Y1)
  673. FMADD y05, alpha1, a5, y05
  674. FMADD y06, alpha1, a6, y06
  675. FMADD y07, alpha1, a7, y07
  676. FMADD y08, alpha1, a8, y08
  677. LFD a5, 12 * SIZE(AO1)
  678. LFD a6, 13 * SIZE(AO1)
  679. LFD a7, 14 * SIZE(AO1)
  680. LFD a8, 15 * SIZE(AO1)
  /* Same store-then-reload step for Y1[8..11] / Y1[24..27]. */
  681. STFD y09, 8 * SIZE(Y1)
  682. STFD y10, 9 * SIZE(Y1)
  683. STFD y11, 10 * SIZE(Y1)
  684. STFD y12, 11 * SIZE(Y1)
  685. LFD y09, 24 * SIZE(Y1)
  686. LFD y10, 25 * SIZE(Y1)
  687. LFD y11, 26 * SIZE(Y1)
  688. LFD y12, 27 * SIZE(Y1)
  689. FMADD y09, alpha1, a1, y09
  690. FMADD y10, alpha1, a2, y10
  691. FMADD y11, alpha1, a3, y11
  692. FMADD y12, alpha1, a4, y12
  /* a1..a4 are free again: start fetching column 2. */
  693. LFD a1, 0 * SIZE(AO2)
  694. LFD a2, 1 * SIZE(AO2)
  695. LFD a3, 2 * SIZE(AO2)
  696. LFD a4, 3 * SIZE(AO2)
  /* Same store-then-reload step for Y1[12..15] / Y1[28..31]. */
  697. STFD y13, 12 * SIZE(Y1)
  698. STFD y14, 13 * SIZE(Y1)
  699. STFD y15, 14 * SIZE(Y1)
  700. STFD y16, 15 * SIZE(Y1)
  701. LFD y13, 28 * SIZE(Y1)
  702. LFD y14, 29 * SIZE(Y1)
  703. LFD y15, 30 * SIZE(Y1)
  704. LFD y16, 31 * SIZE(Y1)
  705. FMADD y13, alpha1, a5, y13
  706. FMADD y14, alpha1, a6, y14
  707. FMADD y15, alpha1, a7, y15
  708. FMADD y16, alpha1, a8, y16
  709. LFD a5, 4 * SIZE(AO2)
  710. LFD a6, 5 * SIZE(AO2)
  711. LFD a7, 6 * SIZE(AO2)
  712. LFD a8, 7 * SIZE(AO2)
  /* Column 2 (alpha2 / AO2). Each FMADD quad is followed by the loads that
     refill the a-registers it has just consumed. */
  713. FMADD y01, alpha2, a1, y01
  714. FMADD y02, alpha2, a2, y02
  715. FMADD y03, alpha2, a3, y03
  716. FMADD y04, alpha2, a4, y04
  717. LFD a1, 8 * SIZE(AO2)
  718. LFD a2, 9 * SIZE(AO2)
  719. LFD a3, 10 * SIZE(AO2)
  720. LFD a4, 11 * SIZE(AO2)
  721. FMADD y05, alpha2, a5, y05
  722. FMADD y06, alpha2, a6, y06
  723. FMADD y07, alpha2, a7, y07
  724. FMADD y08, alpha2, a8, y08
  725. LFD a5, 12 * SIZE(AO2)
  726. LFD a6, 13 * SIZE(AO2)
  727. LFD a7, 14 * SIZE(AO2)
  728. LFD a8, 15 * SIZE(AO2)
  729. FMADD y09, alpha2, a1, y09
  730. FMADD y10, alpha2, a2, y10
  731. FMADD y11, alpha2, a3, y11
  732. FMADD y12, alpha2, a4, y12
  733. LFD a1, 0 * SIZE(AO3)
  734. LFD a2, 1 * SIZE(AO3)
  735. LFD a3, 2 * SIZE(AO3)
  736. LFD a4, 3 * SIZE(AO3)
  737. FMADD y13, alpha2, a5, y13
  738. FMADD y14, alpha2, a6, y14
  739. FMADD y15, alpha2, a7, y15
  740. FMADD y16, alpha2, a8, y16
  741. LFD a5, 4 * SIZE(AO3)
  742. LFD a6, 5 * SIZE(AO3)
  743. LFD a7, 6 * SIZE(AO3)
  744. LFD a8, 7 * SIZE(AO3)
  /* Column 3 (alpha3 / AO3). */
  745. FMADD y01, alpha3, a1, y01
  746. FMADD y02, alpha3, a2, y02
  747. FMADD y03, alpha3, a3, y03
  748. FMADD y04, alpha3, a4, y04
  749. LFD a1, 8 * SIZE(AO3)
  750. LFD a2, 9 * SIZE(AO3)
  751. LFD a3, 10 * SIZE(AO3)
  752. LFD a4, 11 * SIZE(AO3)
  753. FMADD y05, alpha3, a5, y05
  754. FMADD y06, alpha3, a6, y06
  755. FMADD y07, alpha3, a7, y07
  756. FMADD y08, alpha3, a8, y08
  757. LFD a5, 12 * SIZE(AO3)
  758. LFD a6, 13 * SIZE(AO3)
  759. LFD a7, 14 * SIZE(AO3)
  760. LFD a8, 15 * SIZE(AO3)
  761. FMADD y09, alpha3, a1, y09
  762. FMADD y10, alpha3, a2, y10
  763. FMADD y11, alpha3, a3, y11
  764. FMADD y12, alpha3, a4, y12
  765. LFD a1, 0 * SIZE(AO4)
  766. LFD a2, 1 * SIZE(AO4)
  767. LFD a3, 2 * SIZE(AO4)
  768. LFD a4, 3 * SIZE(AO4)
  769. FMADD y13, alpha3, a5, y13
  770. FMADD y14, alpha3, a6, y14
  771. FMADD y15, alpha3, a7, y15
  772. FMADD y16, alpha3, a8, y16
  773. LFD a5, 4 * SIZE(AO4)
  774. LFD a6, 5 * SIZE(AO4)
  775. LFD a7, 6 * SIZE(AO4)
  776. LFD a8, 7 * SIZE(AO4)
  /* Column 4 (alpha4 / AO4). */
  777. FMADD y01, alpha4, a1, y01
  778. FMADD y02, alpha4, a2, y02
  779. FMADD y03, alpha4, a3, y03
  780. FMADD y04, alpha4, a4, y04
  781. LFD a1, 8 * SIZE(AO4)
  782. LFD a2, 9 * SIZE(AO4)
  783. LFD a3, 10 * SIZE(AO4)
  784. LFD a4, 11 * SIZE(AO4)
  785. FMADD y05, alpha4, a5, y05
  786. FMADD y06, alpha4, a6, y06
  787. FMADD y07, alpha4, a7, y07
  788. FMADD y08, alpha4, a8, y08
  789. LFD a5, 12 * SIZE(AO4)
  790. LFD a6, 13 * SIZE(AO4)
  791. LFD a7, 14 * SIZE(AO4)
  792. LFD a8, 15 * SIZE(AO4)
  /* All loads from columns 1..4 for this trip are issued: advance the four
     pointers by 16 elements and touch the data at offset PREA from each. */
  793. addi AO1, AO1, 16 * SIZE
  794. addi AO2, AO2, 16 * SIZE
  795. addi AO3, AO3, 16 * SIZE
  796. addi AO4, AO4, 16 * SIZE
  797. DCBT(AO1, PREA)
  798. DCBT(AO2, PREA)
  799. DCBT(AO3, PREA)
  800. DCBT(AO4, PREA)
  801. FMADD y09, alpha4, a1, y09
  802. FMADD y10, alpha4, a2, y10
  803. FMADD y11, alpha4, a3, y11
  804. FMADD y12, alpha4, a4, y12
  805. LFD a1, 0 * SIZE(AO5)
  806. LFD a2, 1 * SIZE(AO5)
  807. LFD a3, 2 * SIZE(AO5)
  808. LFD a4, 3 * SIZE(AO5)
  809. FMADD y13, alpha4, a5, y13
  810. FMADD y14, alpha4, a6, y14
  811. FMADD y15, alpha4, a7, y15
  812. FMADD y16, alpha4, a8, y16
  813. LFD a5, 4 * SIZE(AO5)
  814. LFD a6, 5 * SIZE(AO5)
  815. LFD a7, 6 * SIZE(AO5)
  816. LFD a8, 7 * SIZE(AO5)
  /* Column 5 (alpha5 / AO5). */
  817. FMADD y01, alpha5, a1, y01
  818. FMADD y02, alpha5, a2, y02
  819. FMADD y03, alpha5, a3, y03
  820. FMADD y04, alpha5, a4, y04
  821. LFD a1, 8 * SIZE(AO5)
  822. LFD a2, 9 * SIZE(AO5)
  823. LFD a3, 10 * SIZE(AO5)
  824. LFD a4, 11 * SIZE(AO5)
  825. FMADD y05, alpha5, a5, y05
  826. FMADD y06, alpha5, a6, y06
  827. FMADD y07, alpha5, a7, y07
  828. FMADD y08, alpha5, a8, y08
  829. LFD a5, 12 * SIZE(AO5)
  830. LFD a6, 13 * SIZE(AO5)
  831. LFD a7, 14 * SIZE(AO5)
  832. LFD a8, 15 * SIZE(AO5)
  833. FMADD y09, alpha5, a1, y09
  834. FMADD y10, alpha5, a2, y10
  835. FMADD y11, alpha5, a3, y11
  836. FMADD y12, alpha5, a4, y12
  837. LFD a1, 0 * SIZE(AO6)
  838. LFD a2, 1 * SIZE(AO6)
  839. LFD a3, 2 * SIZE(AO6)
  840. LFD a4, 3 * SIZE(AO6)
  841. FMADD y13, alpha5, a5, y13
  842. FMADD y14, alpha5, a6, y14
  843. FMADD y15, alpha5, a7, y15
  844. FMADD y16, alpha5, a8, y16
  845. LFD a5, 4 * SIZE(AO6)
  846. LFD a6, 5 * SIZE(AO6)
  847. LFD a7, 6 * SIZE(AO6)
  848. LFD a8, 7 * SIZE(AO6)
  /* Column 6 (alpha6 / AO6). */
  849. FMADD y01, alpha6, a1, y01
  850. FMADD y02, alpha6, a2, y02
  851. FMADD y03, alpha6, a3, y03
  852. FMADD y04, alpha6, a4, y04
  853. LFD a1, 8 * SIZE(AO6)
  854. LFD a2, 9 * SIZE(AO6)
  855. LFD a3, 10 * SIZE(AO6)
  856. LFD a4, 11 * SIZE(AO6)
  857. FMADD y05, alpha6, a5, y05
  858. FMADD y06, alpha6, a6, y06
  859. FMADD y07, alpha6, a7, y07
  860. FMADD y08, alpha6, a8, y08
  861. LFD a5, 12 * SIZE(AO6)
  862. LFD a6, 13 * SIZE(AO6)
  863. LFD a7, 14 * SIZE(AO6)
  864. LFD a8, 15 * SIZE(AO6)
  865. FMADD y09, alpha6, a1, y09
  866. FMADD y10, alpha6, a2, y10
  867. FMADD y11, alpha6, a3, y11
  868. FMADD y12, alpha6, a4, y12
  869. LFD a1, 0 * SIZE(AO7)
  870. LFD a2, 1 * SIZE(AO7)
  871. LFD a3, 2 * SIZE(AO7)
  872. LFD a4, 3 * SIZE(AO7)
  873. FMADD y13, alpha6, a5, y13
  874. FMADD y14, alpha6, a6, y14
  875. FMADD y15, alpha6, a7, y15
  876. FMADD y16, alpha6, a8, y16
  877. LFD a5, 4 * SIZE(AO7)
  878. LFD a6, 5 * SIZE(AO7)
  879. LFD a7, 6 * SIZE(AO7)
  880. LFD a8, 7 * SIZE(AO7)
  /* Column 7 (alpha7 / AO7). */
  881. FMADD y01, alpha7, a1, y01
  882. FMADD y02, alpha7, a2, y02
  883. FMADD y03, alpha7, a3, y03
  884. FMADD y04, alpha7, a4, y04
  885. LFD a1, 8 * SIZE(AO7)
  886. LFD a2, 9 * SIZE(AO7)
  887. LFD a3, 10 * SIZE(AO7)
  888. LFD a4, 11 * SIZE(AO7)
  889. FMADD y05, alpha7, a5, y05
  890. FMADD y06, alpha7, a6, y06
  891. FMADD y07, alpha7, a7, y07
  892. FMADD y08, alpha7, a8, y08
  893. LFD a5, 12 * SIZE(AO7)
  894. LFD a6, 13 * SIZE(AO7)
  895. LFD a7, 14 * SIZE(AO7)
  896. LFD a8, 15 * SIZE(AO7)
  897. FMADD y09, alpha7, a1, y09
  898. FMADD y10, alpha7, a2, y10
  899. FMADD y11, alpha7, a3, y11
  900. FMADD y12, alpha7, a4, y12
  901. LFD a1, 0 * SIZE(AO8)
  902. LFD a2, 1 * SIZE(AO8)
  903. LFD a3, 2 * SIZE(AO8)
  904. LFD a4, 3 * SIZE(AO8)
  905. FMADD y13, alpha7, a5, y13
  906. FMADD y14, alpha7, a6, y14
  907. FMADD y15, alpha7, a7, y15
  908. FMADD y16, alpha7, a8, y16
  909. LFD a5, 4 * SIZE(AO8)
  910. LFD a6, 5 * SIZE(AO8)
  911. LFD a7, 6 * SIZE(AO8)
  912. LFD a8, 7 * SIZE(AO8)
  /* Column 8 (alpha8 / AO8). */
  913. FMADD y01, alpha8, a1, y01
  914. FMADD y02, alpha8, a2, y02
  915. FMADD y03, alpha8, a3, y03
  916. FMADD y04, alpha8, a4, y04
  917. LFD a1, 8 * SIZE(AO8)
  918. LFD a2, 9 * SIZE(AO8)
  919. LFD a3, 10 * SIZE(AO8)
  920. LFD a4, 11 * SIZE(AO8)
  921. FMADD y05, alpha8, a5, y05
  922. FMADD y06, alpha8, a6, y06
  923. FMADD y07, alpha8, a7, y07
  924. FMADD y08, alpha8, a8, y08
  925. LFD a5, 12 * SIZE(AO8)
  926. LFD a6, 13 * SIZE(AO8)
  927. LFD a7, 14 * SIZE(AO8)
  928. LFD a8, 15 * SIZE(AO8)
  /* All loads from columns 5..8 for this trip are issued: advance and touch. */
  929. addi AO5, AO5, 16 * SIZE
  930. addi AO6, AO6, 16 * SIZE
  931. addi AO7, AO7, 16 * SIZE
  932. addi AO8, AO8, 16 * SIZE
  933. DCBT(AO5, PREA)
  934. DCBT(AO6, PREA)
  935. DCBT(AO7, PREA)
  936. DCBT(AO8, PREA)
  937. FMADD y09, alpha8, a1, y09
  938. FMADD y10, alpha8, a2, y10
  939. FMADD y11, alpha8, a3, y11
  940. FMADD y12, alpha8, a4, y12
  /* AO1 was already advanced above, so these loads fetch the first eight
     entries of column 1 for the NEXT trip. They are issued on the last trip
     as well, where nothing in LL(13) consumes them. */
  941. LFD a1, 0 * SIZE(AO1)
  942. LFD a2, 1 * SIZE(AO1)
  943. LFD a3, 2 * SIZE(AO1)
  944. LFD a4, 3 * SIZE(AO1)
  945. FMADD y13, alpha8, a5, y13
  946. FMADD y14, alpha8, a6, y14
  947. FMADD y15, alpha8, a7, y15
  948. FMADD y16, alpha8, a8, y16
  949. LFD a5, 4 * SIZE(AO1)
  950. LFD a6, 5 * SIZE(AO1)
  951. LFD a7, 6 * SIZE(AO1)
  952. LFD a8, 7 * SIZE(AO1)
  /* y01..y04 have now received all eight columns: store them to Y1[16..19]
     and load the following group's first four accumulators from Y1[32..35]. */
  953. STFD y01, 16 * SIZE(Y1)
  954. STFD y02, 17 * SIZE(Y1)
  955. STFD y03, 18 * SIZE(Y1)
  956. STFD y04, 19 * SIZE(Y1)
  957. LFD y01, 32 * SIZE(Y1)
  958. LFD y02, 33 * SIZE(Y1)
  959. LFD y03, 34 * SIZE(Y1)
  960. LFD y04, 35 * SIZE(Y1)
  /* Touch Y1 + PREC using the not-yet-advanced Y1, then step Y1 by 16. */
  961. DCBT(Y1, PREC)
  962. addi Y1, Y1, 16 * SIZE
  /* bdnz: decrement CTR and repeat while it is non-zero. */
  963. bdnz LL(12)
  964. .align 4
  /*
   * LL(13): flush after the eight-column 16-row loop.
   * Reached by falling out of LL(12) or through the "bdz LL(13)" just above
   * that loop. y05..y16 hold results that have not been written back yet;
   * y01..y04 are not stored here because the code reaching this label has
   * already written them (the STFD quad immediately preceding the branch).
   * Afterwards Y1 is advanced past the 16 rows just completed.
   */
  965. LL(13):
  966. STFD y05, 4 * SIZE(Y1)
  967. STFD y06, 5 * SIZE(Y1)
  968. STFD y07, 6 * SIZE(Y1)
  969. STFD y08, 7 * SIZE(Y1)
  970. STFD y09, 8 * SIZE(Y1)
  971. STFD y10, 9 * SIZE(Y1)
  972. STFD y11, 10 * SIZE(Y1)
  973. STFD y12, 11 * SIZE(Y1)
  974. STFD y13, 12 * SIZE(Y1)
  975. STFD y14, 13 * SIZE(Y1)
  976. STFD y15, 14 * SIZE(Y1)
  977. STFD y16, 15 * SIZE(Y1)
  978. addi Y1, Y1, 16 * SIZE
  979. .align 4
  /*
   * LL(15): row remainder of the eight-column pass (M mod 16).
   * "andi." records the masked value in CR0; the masked value can never be
   * negative, so the following "ble" is taken exactly when the tested bits
   * are all zero.
   *   - no leftover rows (M & 15 == 0): skip to LL(19);
   *   - bit 3 clear (M & 8 == 0): skip the 8-row step and go to LL(16).
   * The 8-row step below is not pipelined across trips: it loads Y1[0..7]
   * and AO1[0..7], walks through columns 1..8, stores Y1[0..7] and advances
   * AO1..AO8 and Y1 by 8 elements.
   */
  980. LL(15):
  981. andi. r0, M, 15
  982. ble LL(19)
  983. andi. r0, M, 8
  984. ble LL(16)
  985. LFD y01, 0 * SIZE(Y1)
  986. LFD y02, 1 * SIZE(Y1)
  987. LFD y03, 2 * SIZE(Y1)
  988. LFD y04, 3 * SIZE(Y1)
  989. LFD a1, 0 * SIZE(AO1)
  990. LFD a2, 1 * SIZE(AO1)
  991. LFD a3, 2 * SIZE(AO1)
  992. LFD a4, 3 * SIZE(AO1)
  993. LFD y05, 4 * SIZE(Y1)
  994. LFD y06, 5 * SIZE(Y1)
  995. LFD y07, 6 * SIZE(Y1)
  996. LFD y08, 7 * SIZE(Y1)
  997. LFD a5, 4 * SIZE(AO1)
  998. LFD a6, 5 * SIZE(AO1)
  999. LFD a7, 6 * SIZE(AO1)
  1000. LFD a8, 7 * SIZE(AO1)
  /* Column 1; every FMADD is paired with the load that refills the same
     a-register from the next column. */
  1001. FMADD y01, alpha1, a1, y01
  1002. LFD a1, 0 * SIZE(AO2)
  1003. FMADD y02, alpha1, a2, y02
  1004. LFD a2, 1 * SIZE(AO2)
  1005. FMADD y03, alpha1, a3, y03
  1006. LFD a3, 2 * SIZE(AO2)
  1007. FMADD y04, alpha1, a4, y04
  1008. LFD a4, 3 * SIZE(AO2)
  1009. FMADD y05, alpha1, a5, y05
  1010. LFD a5, 4 * SIZE(AO2)
  1011. FMADD y06, alpha1, a6, y06
  1012. LFD a6, 5 * SIZE(AO2)
  1013. FMADD y07, alpha1, a7, y07
  1014. LFD a7, 6 * SIZE(AO2)
  1015. FMADD y08, alpha1, a8, y08
  1016. LFD a8, 7 * SIZE(AO2)
  /* Column 2. */
  1017. FMADD y01, alpha2, a1, y01
  1018. LFD a1, 0 * SIZE(AO3)
  1019. FMADD y02, alpha2, a2, y02
  1020. LFD a2, 1 * SIZE(AO3)
  1021. FMADD y03, alpha2, a3, y03
  1022. LFD a3, 2 * SIZE(AO3)
  1023. FMADD y04, alpha2, a4, y04
  1024. LFD a4, 3 * SIZE(AO3)
  1025. FMADD y05, alpha2, a5, y05
  1026. LFD a5, 4 * SIZE(AO3)
  1027. FMADD y06, alpha2, a6, y06
  1028. LFD a6, 5 * SIZE(AO3)
  1029. FMADD y07, alpha2, a7, y07
  1030. LFD a7, 6 * SIZE(AO3)
  1031. FMADD y08, alpha2, a8, y08
  1032. LFD a8, 7 * SIZE(AO3)
  /* Column 3. */
  1033. FMADD y01, alpha3, a1, y01
  1034. LFD a1, 0 * SIZE(AO4)
  1035. FMADD y02, alpha3, a2, y02
  1036. LFD a2, 1 * SIZE(AO4)
  1037. FMADD y03, alpha3, a3, y03
  1038. LFD a3, 2 * SIZE(AO4)
  1039. FMADD y04, alpha3, a4, y04
  1040. LFD a4, 3 * SIZE(AO4)
  1041. FMADD y05, alpha3, a5, y05
  1042. LFD a5, 4 * SIZE(AO4)
  1043. FMADD y06, alpha3, a6, y06
  1044. LFD a6, 5 * SIZE(AO4)
  1045. FMADD y07, alpha3, a7, y07
  1046. LFD a7, 6 * SIZE(AO4)
  1047. FMADD y08, alpha3, a8, y08
  1048. LFD a8, 7 * SIZE(AO4)
  /* Column 4. */
  1049. FMADD y01, alpha4, a1, y01
  1050. LFD a1, 0 * SIZE(AO5)
  1051. FMADD y02, alpha4, a2, y02
  1052. LFD a2, 1 * SIZE(AO5)
  1053. FMADD y03, alpha4, a3, y03
  1054. LFD a3, 2 * SIZE(AO5)
  1055. FMADD y04, alpha4, a4, y04
  1056. LFD a4, 3 * SIZE(AO5)
  1057. FMADD y05, alpha4, a5, y05
  1058. LFD a5, 4 * SIZE(AO5)
  1059. FMADD y06, alpha4, a6, y06
  1060. LFD a6, 5 * SIZE(AO5)
  1061. FMADD y07, alpha4, a7, y07
  1062. LFD a7, 6 * SIZE(AO5)
  1063. FMADD y08, alpha4, a8, y08
  1064. LFD a8, 7 * SIZE(AO5)
  /* Column 5. */
  1065. FMADD y01, alpha5, a1, y01
  1066. LFD a1, 0 * SIZE(AO6)
  1067. FMADD y02, alpha5, a2, y02
  1068. LFD a2, 1 * SIZE(AO6)
  1069. FMADD y03, alpha5, a3, y03
  1070. LFD a3, 2 * SIZE(AO6)
  1071. FMADD y04, alpha5, a4, y04
  1072. LFD a4, 3 * SIZE(AO6)
  1073. FMADD y05, alpha5, a5, y05
  1074. LFD a5, 4 * SIZE(AO6)
  1075. FMADD y06, alpha5, a6, y06
  1076. LFD a6, 5 * SIZE(AO6)
  1077. FMADD y07, alpha5, a7, y07
  1078. LFD a7, 6 * SIZE(AO6)
  1079. FMADD y08, alpha5, a8, y08
  1080. LFD a8, 7 * SIZE(AO6)
  /* Column 6. */
  1081. FMADD y01, alpha6, a1, y01
  1082. LFD a1, 0 * SIZE(AO7)
  1083. FMADD y02, alpha6, a2, y02
  1084. LFD a2, 1 * SIZE(AO7)
  1085. FMADD y03, alpha6, a3, y03
  1086. LFD a3, 2 * SIZE(AO7)
  1087. FMADD y04, alpha6, a4, y04
  1088. LFD a4, 3 * SIZE(AO7)
  1089. FMADD y05, alpha6, a5, y05
  1090. LFD a5, 4 * SIZE(AO7)
  1091. FMADD y06, alpha6, a6, y06
  1092. LFD a6, 5 * SIZE(AO7)
  1093. FMADD y07, alpha6, a7, y07
  1094. LFD a7, 6 * SIZE(AO7)
  1095. FMADD y08, alpha6, a8, y08
  1096. LFD a8, 7 * SIZE(AO7)
  /* Column 7. */
  1097. FMADD y01, alpha7, a1, y01
  1098. LFD a1, 0 * SIZE(AO8)
  1099. FMADD y02, alpha7, a2, y02
  1100. LFD a2, 1 * SIZE(AO8)
  1101. FMADD y03, alpha7, a3, y03
  1102. LFD a3, 2 * SIZE(AO8)
  1103. FMADD y04, alpha7, a4, y04
  1104. LFD a4, 3 * SIZE(AO8)
  1105. FMADD y05, alpha7, a5, y05
  1106. LFD a5, 4 * SIZE(AO8)
  1107. FMADD y06, alpha7, a6, y06
  1108. LFD a6, 5 * SIZE(AO8)
  1109. FMADD y07, alpha7, a7, y07
  1110. LFD a7, 6 * SIZE(AO8)
  1111. FMADD y08, alpha7, a8, y08
  1112. LFD a8, 7 * SIZE(AO8)
  /* Column 8; no further loads are needed, so the slots between the FMADDs
     carry the pointer updates (AO1..AO8 += 8 elements) instead. */
  1113. FMADD y01, alpha8, a1, y01
  1114. addi AO1, AO1, 8 * SIZE
  1115. FMADD y02, alpha8, a2, y02
  1116. addi AO2, AO2, 8 * SIZE
  1117. FMADD y03, alpha8, a3, y03
  1118. addi AO3, AO3, 8 * SIZE
  1119. FMADD y04, alpha8, a4, y04
  1120. addi AO4, AO4, 8 * SIZE
  1121. STFD y01, 0 * SIZE(Y1)
  1122. STFD y02, 1 * SIZE(Y1)
  1123. STFD y03, 2 * SIZE(Y1)
  1124. STFD y04, 3 * SIZE(Y1)
  1125. FMADD y05, alpha8, a5, y05
  1126. addi AO5, AO5, 8 * SIZE
  1127. FMADD y06, alpha8, a6, y06
  1128. addi AO6, AO6, 8 * SIZE
  1129. FMADD y07, alpha8, a7, y07
  1130. addi AO7, AO7, 8 * SIZE
  1131. FMADD y08, alpha8, a8, y08
  1132. addi AO8, AO8, 8 * SIZE
  1133. STFD y05, 4 * SIZE(Y1)
  1134. STFD y06, 5 * SIZE(Y1)
  1135. STFD y07, 6 * SIZE(Y1)
  1136. STFD y08, 7 * SIZE(Y1)
  1137. addi Y1, Y1, 8 * SIZE
  1138. .align 4
  /*
   * LL(16): 4-row step of the eight-column remainder, taken when M & 4 is set
   * (otherwise control goes to LL(17)).
   * y01..y04 accumulate Y1[0..3]. Two register quads alternate as column
   * buffers: a1..a4 carry columns 1, 3, 5, 7 and a5..a8 carry columns
   * 2, 4, 6, 8, each refilled right after it is consumed.
   * AO1..AO8 and Y1 are advanced by 4 elements.
   */
  1139. LL(16):
  1140. andi. r0, M, 4
  1141. ble LL(17)
  1142. LFD y01, 0 * SIZE(Y1)
  1143. LFD y02, 1 * SIZE(Y1)
  1144. LFD y03, 2 * SIZE(Y1)
  1145. LFD y04, 3 * SIZE(Y1)
  1146. LFD a1, 0 * SIZE(AO1)
  1147. LFD a2, 1 * SIZE(AO1)
  1148. LFD a3, 2 * SIZE(AO1)
  1149. LFD a4, 3 * SIZE(AO1)
  1150. LFD a5, 0 * SIZE(AO2)
  1151. LFD a6, 1 * SIZE(AO2)
  1152. LFD a7, 2 * SIZE(AO2)
  1153. LFD a8, 3 * SIZE(AO2)
  /* Column 1 (a1..a4), refilling a1..a4 from column 3. */
  1154. FMADD y01, alpha1, a1, y01
  1155. LFD a1, 0 * SIZE(AO3)
  1156. FMADD y02, alpha1, a2, y02
  1157. LFD a2, 1 * SIZE(AO3)
  1158. FMADD y03, alpha1, a3, y03
  1159. LFD a3, 2 * SIZE(AO3)
  1160. FMADD y04, alpha1, a4, y04
  1161. LFD a4, 3 * SIZE(AO3)
  /* Column 2 (a5..a8), refilling a5..a8 from column 4. */
  1162. FMADD y01, alpha2, a5, y01
  1163. LFD a5, 0 * SIZE(AO4)
  1164. FMADD y02, alpha2, a6, y02
  1165. LFD a6, 1 * SIZE(AO4)
  1166. FMADD y03, alpha2, a7, y03
  1167. LFD a7, 2 * SIZE(AO4)
  1168. FMADD y04, alpha2, a8, y04
  1169. LFD a8, 3 * SIZE(AO4)
  /* Column 3, refilling from column 5. */
  1170. FMADD y01, alpha3, a1, y01
  1171. LFD a1, 0 * SIZE(AO5)
  1172. FMADD y02, alpha3, a2, y02
  1173. LFD a2, 1 * SIZE(AO5)
  1174. FMADD y03, alpha3, a3, y03
  1175. LFD a3, 2 * SIZE(AO5)
  1176. FMADD y04, alpha3, a4, y04
  1177. LFD a4, 3 * SIZE(AO5)
  /* Column 4, refilling from column 6. */
  1178. FMADD y01, alpha4, a5, y01
  1179. LFD a5, 0 * SIZE(AO6)
  1180. FMADD y02, alpha4, a6, y02
  1181. LFD a6, 1 * SIZE(AO6)
  1182. FMADD y03, alpha4, a7, y03
  1183. LFD a7, 2 * SIZE(AO6)
  1184. FMADD y04, alpha4, a8, y04
  1185. LFD a8, 3 * SIZE(AO6)
  /* Column 5, refilling from column 7. */
  1186. FMADD y01, alpha5, a1, y01
  1187. LFD a1, 0 * SIZE(AO7)
  1188. FMADD y02, alpha5, a2, y02
  1189. LFD a2, 1 * SIZE(AO7)
  1190. FMADD y03, alpha5, a3, y03
  1191. LFD a3, 2 * SIZE(AO7)
  1192. FMADD y04, alpha5, a4, y04
  1193. LFD a4, 3 * SIZE(AO7)
  /* Column 6, refilling from column 8. */
  1194. FMADD y01, alpha6, a5, y01
  1195. LFD a5, 0 * SIZE(AO8)
  1196. FMADD y02, alpha6, a6, y02
  1197. LFD a6, 1 * SIZE(AO8)
  1198. FMADD y03, alpha6, a7, y03
  1199. LFD a7, 2 * SIZE(AO8)
  1200. FMADD y04, alpha6, a8, y04
  1201. LFD a8, 3 * SIZE(AO8)
  /* Columns 7 and 8; the pointer updates fill the slots between FMADDs. */
  1202. FMADD y01, alpha7, a1, y01
  1203. addi AO1, AO1, 4 * SIZE
  1204. FMADD y02, alpha7, a2, y02
  1205. addi AO2, AO2, 4 * SIZE
  1206. FMADD y03, alpha7, a3, y03
  1207. addi AO3, AO3, 4 * SIZE
  1208. FMADD y04, alpha7, a4, y04
  1209. addi AO4, AO4, 4 * SIZE
  1210. FMADD y01, alpha8, a5, y01
  1211. addi AO5, AO5, 4 * SIZE
  1212. FMADD y02, alpha8, a6, y02
  1213. addi AO6, AO6, 4 * SIZE
  1214. FMADD y03, alpha8, a7, y03
  1215. addi AO7, AO7, 4 * SIZE
  1216. FMADD y04, alpha8, a8, y04
  1217. addi AO8, AO8, 4 * SIZE
  1218. STFD y01, 0 * SIZE(Y1)
  1219. STFD y02, 1 * SIZE(Y1)
  1220. STFD y03, 2 * SIZE(Y1)
  1221. STFD y04, 3 * SIZE(Y1)
  1222. addi Y1, Y1, 4 * SIZE
  1223. .align 4
  /*
   * LL(17): 2-row step of the eight-column remainder, taken when M & 2 is set
   * (otherwise control goes to LL(18)).
   * y01/y02 accumulate Y1[0..1]. Register pairs serve two columns each:
   * a1/a2 -> columns 1 then 5, a3/a4 -> 2 then 6, a5/a6 -> 3 then 7,
   * a7/a8 -> 4 then 8. AO1..AO8 and Y1 are advanced by 2 elements.
   */
  1224. LL(17):
  1225. andi. r0, M, 2
  1226. ble LL(18)
  1227. LFD y01, 0 * SIZE(Y1)
  1228. LFD y02, 1 * SIZE(Y1)
  1229. LFD a1, 0 * SIZE(AO1)
  1230. LFD a2, 1 * SIZE(AO1)
  1231. LFD a3, 0 * SIZE(AO2)
  1232. LFD a4, 1 * SIZE(AO2)
  1233. LFD a5, 0 * SIZE(AO3)
  1234. LFD a6, 1 * SIZE(AO3)
  1235. LFD a7, 0 * SIZE(AO4)
  1236. LFD a8, 1 * SIZE(AO4)
  /* Columns 1..4, each FMADD followed by the reload for columns 5..8. */
  1237. FMADD y01, alpha1, a1, y01
  1238. LFD a1, 0 * SIZE(AO5)
  1239. FMADD y02, alpha1, a2, y02
  1240. LFD a2, 1 * SIZE(AO5)
  1241. FMADD y01, alpha2, a3, y01
  1242. LFD a3, 0 * SIZE(AO6)
  1243. FMADD y02, alpha2, a4, y02
  1244. LFD a4, 1 * SIZE(AO6)
  1245. FMADD y01, alpha3, a5, y01
  1246. LFD a5, 0 * SIZE(AO7)
  1247. FMADD y02, alpha3, a6, y02
  1248. LFD a6, 1 * SIZE(AO7)
  1249. FMADD y01, alpha4, a7, y01
  1250. LFD a7, 0 * SIZE(AO8)
  1251. FMADD y02, alpha4, a8, y02
  1252. LFD a8, 1 * SIZE(AO8)
  /* Columns 5..8, interleaved with the pointer updates. */
  1253. FMADD y01, alpha5, a1, y01
  1254. addi AO1, AO1, 2 * SIZE
  1255. FMADD y02, alpha5, a2, y02
  1256. addi AO2, AO2, 2 * SIZE
  1257. FMADD y01, alpha6, a3, y01
  1258. addi AO3, AO3, 2 * SIZE
  1259. FMADD y02, alpha6, a4, y02
  1260. addi AO4, AO4, 2 * SIZE
  1261. FMADD y01, alpha7, a5, y01
  1262. addi AO5, AO5, 2 * SIZE
  1263. FMADD y02, alpha7, a6, y02
  1264. addi AO6, AO6, 2 * SIZE
  1265. FMADD y01, alpha8, a7, y01
  1266. addi AO7, AO7, 2 * SIZE
  1267. FMADD y02, alpha8, a8, y02
  1268. addi AO8, AO8, 2 * SIZE
  1269. STFD y01, 0 * SIZE(Y1)
  1270. STFD y02, 1 * SIZE(Y1)
  1271. addi Y1, Y1, 2 * SIZE
  1272. .align 4
  /*
   * LL(18): final single-row step of the eight-column remainder, taken when
   * M & 1 is set (otherwise control goes to LL(19)).
   * Loads Y1[0] and one element from each of AO1..AO8, chains eight FMADDs
   * into y01 and stores it back. No pointer is advanced in this block: it is
   * the last row step of the pass.
   */
  1273. LL(18):
  1274. andi. r0, M, 1
  1275. ble LL(19)
  1276. LFD y01, 0 * SIZE(Y1)
  1277. LFD a1, 0 * SIZE(AO1)
  1278. LFD a2, 0 * SIZE(AO2)
  1279. LFD a3, 0 * SIZE(AO3)
  1280. LFD a4, 0 * SIZE(AO4)
  1281. LFD a5, 0 * SIZE(AO5)
  1282. LFD a6, 0 * SIZE(AO6)
  1283. LFD a7, 0 * SIZE(AO7)
  1284. LFD a8, 0 * SIZE(AO8)
  1285. FMADD y01, alpha1, a1, y01
  1286. FMADD y01, alpha2, a2, y01
  1287. FMADD y01, alpha3, a3, y01
  1288. FMADD y01, alpha4, a4, y01
  1289. FMADD y01, alpha5, a5, y01
  1290. FMADD y01, alpha6, a6, y01
  1291. FMADD y01, alpha7, a7, y01
  1292. FMADD y01, alpha8, a8, y01
  1293. STFD y01, 0 * SIZE(Y1)
  1294. .align 4
  /*
   * LL(19): latch of the eight-column pass.
   * Decrements the pass counter J and branches back to LL(11) (outside this
   * chunk) while J is still greater than zero.
   * alpha is reloaded from ALPHA on every pass -- NOTE(review): presumably
   * because its register is reused inside the pass; confirm against the
   * register #defines at the top of the file.
   */
  1295. LL(19):
  1296. addi J, J, -1
  1297. lfd alpha, ALPHA
  1298. cmpi cr0, 0, J, 0
  1299. bgt LL(11)
  1300. .align 4
  /*
   * LL(20): set-up of the four-column pass.
   * J = N & 4; when that bit is clear the pass is skipped (branch to LL(30),
   * outside this chunk). The "mr"/"add" between "andi." and "ble" do not
   * alter CR0, so the branch still tests the andi. result.
   *
   * When the pass runs:
   *   - four elements are read from X, stepping X by INCX after each, and
   *     multiplied by alpha to give alpha1..alpha4;
   *   - AO1..AO4 are set to A, A+LDA, A+2*LDA, A+3*LDA and A itself is moved
   *     on to AO4+LDA;
   *   - Y1 restarts from YY;
   *   - CTR = M >> 4 (number of 16-row trips); if that is <= 0 control goes
   *     straight to the remainder code at LL(25).
   */
  1301. LL(20):
  1302. andi. J, N, 4
  1303. mr AO1, A
  1304. add AO2, A, LDA
  1305. ble LL(30)
  1306. .align 4
  1307. LFD alpha1, 0 * SIZE(X)
  1308. add X, X, INCX
  1309. LFD alpha2, 0 * SIZE(X)
  1310. add X, X, INCX
  1311. LFD alpha3, 0 * SIZE(X)
  1312. add X, X, INCX
  1313. LFD alpha4, 0 * SIZE(X)
  1314. add X, X, INCX
  1315. FMUL alpha1, alpha, alpha1
  1316. add AO3, AO2, LDA
  1317. FMUL alpha2, alpha, alpha2
  1318. add AO4, AO3, LDA
  1319. FMUL alpha3, alpha, alpha3
  1320. add A, AO4, LDA
  1321. FMUL alpha4, alpha, alpha4
  1322. mr Y1, YY
  /* srawi. records the sign/zero of M >> 4 in CR0; mtspr leaves CR0 alone. */
  1323. srawi. r0, M, 4
  1324. mtspr CTR, r0
  1325. ble LL(25)
  /* Preload the first 16 accumulators and the first 8 entries of column 1. */
  1326. LFD y01, 0 * SIZE(Y1)
  1327. LFD y02, 1 * SIZE(Y1)
  1328. LFD y03, 2 * SIZE(Y1)
  1329. LFD y04, 3 * SIZE(Y1)
  1330. LFD y05, 4 * SIZE(Y1)
  1331. LFD y06, 5 * SIZE(Y1)
  1332. LFD y07, 6 * SIZE(Y1)
  1333. LFD y08, 7 * SIZE(Y1)
  1334. LFD y09, 8 * SIZE(Y1)
  1335. LFD y10, 9 * SIZE(Y1)
  1336. LFD y11, 10 * SIZE(Y1)
  1337. LFD y12, 11 * SIZE(Y1)
  1338. LFD y13, 12 * SIZE(Y1)
  1339. LFD y14, 13 * SIZE(Y1)
  1340. LFD y15, 14 * SIZE(Y1)
  1341. LFD y16, 15 * SIZE(Y1)
  1342. LFD a1, 0 * SIZE(AO1)
  1343. LFD a2, 1 * SIZE(AO1)
  1344. LFD a3, 2 * SIZE(AO1)
  1345. LFD a4, 3 * SIZE(AO1)
  1346. LFD a5, 4 * SIZE(AO1)
  1347. LFD a6, 5 * SIZE(AO1)
  1348. LFD a7, 6 * SIZE(AO1)
  1349. LFD a8, 7 * SIZE(AO1)
  /* bdz: decrement CTR; with a single 16-row trip go directly to the final
     trip at LL(23) and skip the pipelined loop. */
  1350. bdz LL(23)
  1351. .align 4
  /*
   * LL(22): steady-state loop of the four-column pass, 16 rows per trip.
   * On entry y01..y16 hold Y1[0..15] and a1..a8 hold AO1[0..7] (loaded by the
   * code above or by the previous trip). Each FMADD is paired with the load
   * that refills the a-register it has just read.
   * During the last column (alpha4) every finished quad of accumulators is
   * stored to Y1[0..15] and immediately reloaded from Y1[16..31] for the next
   * trip; a1..a8 are likewise reloaded from the already-advanced AO1.
   * The instruction order is part of the schedule; do not reorder.
   */
  1352. LL(22):
  /* Column 1, rows 0..7 (refill with rows 8..15 of column 1). */
  1353. FMADD y01, alpha1, a1, y01
  1354. LFD a1, 8 * SIZE(AO1)
  1355. FMADD y02, alpha1, a2, y02
  1356. LFD a2, 9 * SIZE(AO1)
  1357. FMADD y03, alpha1, a3, y03
  1358. LFD a3, 10 * SIZE(AO1)
  1359. FMADD y04, alpha1, a4, y04
  1360. LFD a4, 11 * SIZE(AO1)
  1361. FMADD y05, alpha1, a5, y05
  1362. LFD a5, 12 * SIZE(AO1)
  1363. FMADD y06, alpha1, a6, y06
  1364. LFD a6, 13 * SIZE(AO1)
  1365. FMADD y07, alpha1, a7, y07
  1366. LFD a7, 14 * SIZE(AO1)
  1367. FMADD y08, alpha1, a8, y08
  1368. LFD a8, 15 * SIZE(AO1)
  /* Column 1, rows 8..15 (refill with rows 0..7 of column 2). */
  1369. FMADD y09, alpha1, a1, y09
  1370. LFD a1, 0 * SIZE(AO2)
  1371. FMADD y10, alpha1, a2, y10
  1372. LFD a2, 1 * SIZE(AO2)
  1373. FMADD y11, alpha1, a3, y11
  1374. LFD a3, 2 * SIZE(AO2)
  1375. FMADD y12, alpha1, a4, y12
  1376. LFD a4, 3 * SIZE(AO2)
  1377. FMADD y13, alpha1, a5, y13
  1378. LFD a5, 4 * SIZE(AO2)
  1379. FMADD y14, alpha1, a6, y14
  1380. LFD a6, 5 * SIZE(AO2)
  1381. FMADD y15, alpha1, a7, y15
  1382. LFD a7, 6 * SIZE(AO2)
  1383. FMADD y16, alpha1, a8, y16
  1384. LFD a8, 7 * SIZE(AO2)
  /* Column 2, rows 0..7. */
  1385. FMADD y01, alpha2, a1, y01
  1386. LFD a1, 8 * SIZE(AO2)
  1387. FMADD y02, alpha2, a2, y02
  1388. LFD a2, 9 * SIZE(AO2)
  1389. FMADD y03, alpha2, a3, y03
  1390. LFD a3, 10 * SIZE(AO2)
  1391. FMADD y04, alpha2, a4, y04
  1392. LFD a4, 11 * SIZE(AO2)
  1393. FMADD y05, alpha2, a5, y05
  1394. LFD a5, 12 * SIZE(AO2)
  1395. FMADD y06, alpha2, a6, y06
  1396. LFD a6, 13 * SIZE(AO2)
  1397. FMADD y07, alpha2, a7, y07
  1398. LFD a7, 14 * SIZE(AO2)
  1399. FMADD y08, alpha2, a8, y08
  1400. LFD a8, 15 * SIZE(AO2)
  /* Columns 1 and 2 are fully loaded for this trip: advance and touch. */
  1401. addi AO1, AO1, 16 * SIZE
  1402. addi AO2, AO2, 16 * SIZE
  1403. DCBT(AO1, PREA)
  1404. DCBT(AO2, PREA)
  /* Column 2, rows 8..15. */
  1405. FMADD y09, alpha2, a1, y09
  1406. LFD a1, 0 * SIZE(AO3)
  1407. FMADD y10, alpha2, a2, y10
  1408. LFD a2, 1 * SIZE(AO3)
  1409. FMADD y11, alpha2, a3, y11
  1410. LFD a3, 2 * SIZE(AO3)
  1411. FMADD y12, alpha2, a4, y12
  1412. LFD a4, 3 * SIZE(AO3)
  1413. FMADD y13, alpha2, a5, y13
  1414. LFD a5, 4 * SIZE(AO3)
  1415. FMADD y14, alpha2, a6, y14
  1416. LFD a6, 5 * SIZE(AO3)
  1417. FMADD y15, alpha2, a7, y15
  1418. LFD a7, 6 * SIZE(AO3)
  1419. FMADD y16, alpha2, a8, y16
  1420. LFD a8, 7 * SIZE(AO3)
  /* Column 3, rows 0..7. */
  1421. FMADD y01, alpha3, a1, y01
  1422. LFD a1, 8 * SIZE(AO3)
  1423. FMADD y02, alpha3, a2, y02
  1424. LFD a2, 9 * SIZE(AO3)
  1425. FMADD y03, alpha3, a3, y03
  1426. LFD a3, 10 * SIZE(AO3)
  1427. FMADD y04, alpha3, a4, y04
  1428. LFD a4, 11 * SIZE(AO3)
  1429. FMADD y05, alpha3, a5, y05
  1430. LFD a5, 12 * SIZE(AO3)
  1431. FMADD y06, alpha3, a6, y06
  1432. LFD a6, 13 * SIZE(AO3)
  1433. FMADD y07, alpha3, a7, y07
  1434. LFD a7, 14 * SIZE(AO3)
  1435. FMADD y08, alpha3, a8, y08
  1436. LFD a8, 15 * SIZE(AO3)
  /* Column 3, rows 8..15. */
  1437. FMADD y09, alpha3, a1, y09
  1438. LFD a1, 0 * SIZE(AO4)
  1439. FMADD y10, alpha3, a2, y10
  1440. LFD a2, 1 * SIZE(AO4)
  1441. FMADD y11, alpha3, a3, y11
  1442. LFD a3, 2 * SIZE(AO4)
  1443. FMADD y12, alpha3, a4, y12
  1444. LFD a4, 3 * SIZE(AO4)
  1445. FMADD y13, alpha3, a5, y13
  1446. LFD a5, 4 * SIZE(AO4)
  1447. FMADD y14, alpha3, a6, y14
  1448. LFD a6, 5 * SIZE(AO4)
  1449. FMADD y15, alpha3, a7, y15
  1450. LFD a7, 6 * SIZE(AO4)
  1451. FMADD y16, alpha3, a8, y16
  1452. LFD a8, 7 * SIZE(AO4)
  /* Column 4, rows 0..3: last contribution, so store and reload y01..y04. */
  1453. FMADD y01, alpha4, a1, y01
  1454. LFD a1, 8 * SIZE(AO4)
  1455. FMADD y02, alpha4, a2, y02
  1456. LFD a2, 9 * SIZE(AO4)
  1457. FMADD y03, alpha4, a3, y03
  1458. LFD a3, 10 * SIZE(AO4)
  1459. FMADD y04, alpha4, a4, y04
  1460. LFD a4, 11 * SIZE(AO4)
  1461. STFD y01, 0 * SIZE(Y1)
  1462. STFD y02, 1 * SIZE(Y1)
  1463. STFD y03, 2 * SIZE(Y1)
  1464. STFD y04, 3 * SIZE(Y1)
  1465. LFD y01, 16 * SIZE(Y1)
  1466. LFD y02, 17 * SIZE(Y1)
  1467. LFD y03, 18 * SIZE(Y1)
  1468. LFD y04, 19 * SIZE(Y1)
  /* Column 4, rows 4..7. */
  1469. FMADD y05, alpha4, a5, y05
  1470. LFD a5, 12 * SIZE(AO4)
  1471. FMADD y06, alpha4, a6, y06
  1472. LFD a6, 13 * SIZE(AO4)
  1473. FMADD y07, alpha4, a7, y07
  1474. LFD a7, 14 * SIZE(AO4)
  1475. FMADD y08, alpha4, a8, y08
  1476. LFD a8, 15 * SIZE(AO4)
  1477. STFD y05, 4 * SIZE(Y1)
  1478. STFD y06, 5 * SIZE(Y1)
  1479. STFD y07, 6 * SIZE(Y1)
  1480. STFD y08, 7 * SIZE(Y1)
  1481. LFD y05, 20 * SIZE(Y1)
  1482. LFD y06, 21 * SIZE(Y1)
  1483. LFD y07, 22 * SIZE(Y1)
  1484. LFD y08, 23 * SIZE(Y1)
  /* Columns 3 and 4 are fully loaded for this trip: advance and touch. */
  1485. addi AO3, AO3, 16 * SIZE
  1486. addi AO4, AO4, 16 * SIZE
  1487. DCBT(AO3, PREA)
  1488. DCBT(AO4, PREA)
  /* Column 4, rows 8..11; a1..a4 are refilled from the advanced AO1, i.e.
     with the first entries of column 1 for the next trip. */
  1489. FMADD y09, alpha4, a1, y09
  1490. LFD a1, 0 * SIZE(AO1)
  1491. FMADD y10, alpha4, a2, y10
  1492. LFD a2, 1 * SIZE(AO1)
  1493. FMADD y11, alpha4, a3, y11
  1494. LFD a3, 2 * SIZE(AO1)
  1495. FMADD y12, alpha4, a4, y12
  1496. LFD a4, 3 * SIZE(AO1)
  1497. STFD y09, 8 * SIZE(Y1)
  1498. STFD y10, 9 * SIZE(Y1)
  1499. STFD y11, 10 * SIZE(Y1)
  1500. STFD y12, 11 * SIZE(Y1)
  1501. LFD y09, 24 * SIZE(Y1)
  1502. LFD y10, 25 * SIZE(Y1)
  1503. LFD y11, 26 * SIZE(Y1)
  1504. LFD y12, 27 * SIZE(Y1)
  /* Column 4, rows 12..15. */
  1505. FMADD y13, alpha4, a5, y13
  1506. LFD a5, 4 * SIZE(AO1)
  1507. FMADD y14, alpha4, a6, y14
  1508. LFD a6, 5 * SIZE(AO1)
  1509. FMADD y15, alpha4, a7, y15
  1510. LFD a7, 6 * SIZE(AO1)
  1511. FMADD y16, alpha4, a8, y16
  1512. LFD a8, 7 * SIZE(AO1)
  1513. STFD y13, 12 * SIZE(Y1)
  1514. STFD y14, 13 * SIZE(Y1)
  1515. STFD y15, 14 * SIZE(Y1)
  1516. STFD y16, 15 * SIZE(Y1)
  1517. LFD y13, 28 * SIZE(Y1)
  1518. LFD y14, 29 * SIZE(Y1)
  1519. LFD y15, 30 * SIZE(Y1)
  1520. LFD y16, 31 * SIZE(Y1)
  /* Step Y1 to the group just loaded, touch ahead, loop while CTR != 0. */
  1521. addi Y1, Y1, 16 * SIZE
  1522. DCBT(Y1, PREC)
  1523. bdnz LL(22)
  1524. .align 4
  /*
   * LL(23): final 16-row trip of the four-column pass.
   * Reached by falling out of LL(22) or through the "bdz LL(23)" before that
   * loop; either way y01..y16 hold the current Y1[0..15] and a1..a8 hold
   * AO1[0..7]. The arithmetic matches one trip of LL(22), but nothing is
   * loaded ahead for a following trip: all sixteen results are stored at the
   * end, and AO1..AO4 and Y1 are advanced by 16 elements for the remainder
   * code.
   */
  1525. LL(23):
  /* Column 1, rows 0..7. */
  1526. FMADD y01, alpha1, a1, y01
  1527. LFD a1, 8 * SIZE(AO1)
  1528. FMADD y02, alpha1, a2, y02
  1529. LFD a2, 9 * SIZE(AO1)
  1530. FMADD y03, alpha1, a3, y03
  1531. LFD a3, 10 * SIZE(AO1)
  1532. FMADD y04, alpha1, a4, y04
  1533. LFD a4, 11 * SIZE(AO1)
  1534. FMADD y05, alpha1, a5, y05
  1535. LFD a5, 12 * SIZE(AO1)
  1536. FMADD y06, alpha1, a6, y06
  1537. LFD a6, 13 * SIZE(AO1)
  1538. FMADD y07, alpha1, a7, y07
  1539. LFD a7, 14 * SIZE(AO1)
  1540. FMADD y08, alpha1, a8, y08
  1541. LFD a8, 15 * SIZE(AO1)
  /* Column 1, rows 8..15. */
  1542. FMADD y09, alpha1, a1, y09
  1543. LFD a1, 0 * SIZE(AO2)
  1544. FMADD y10, alpha1, a2, y10
  1545. LFD a2, 1 * SIZE(AO2)
  1546. FMADD y11, alpha1, a3, y11
  1547. LFD a3, 2 * SIZE(AO2)
  1548. FMADD y12, alpha1, a4, y12
  1549. LFD a4, 3 * SIZE(AO2)
  1550. FMADD y13, alpha1, a5, y13
  1551. LFD a5, 4 * SIZE(AO2)
  1552. FMADD y14, alpha1, a6, y14
  1553. LFD a6, 5 * SIZE(AO2)
  1554. FMADD y15, alpha1, a7, y15
  1555. LFD a7, 6 * SIZE(AO2)
  1556. FMADD y16, alpha1, a8, y16
  1557. LFD a8, 7 * SIZE(AO2)
  /* Column 2, rows 0..7. */
  1558. FMADD y01, alpha2, a1, y01
  1559. LFD a1, 8 * SIZE(AO2)
  1560. FMADD y02, alpha2, a2, y02
  1561. LFD a2, 9 * SIZE(AO2)
  1562. FMADD y03, alpha2, a3, y03
  1563. LFD a3, 10 * SIZE(AO2)
  1564. FMADD y04, alpha2, a4, y04
  1565. LFD a4, 11 * SIZE(AO2)
  1566. FMADD y05, alpha2, a5, y05
  1567. LFD a5, 12 * SIZE(AO2)
  1568. FMADD y06, alpha2, a6, y06
  1569. LFD a6, 13 * SIZE(AO2)
  1570. FMADD y07, alpha2, a7, y07
  1571. LFD a7, 14 * SIZE(AO2)
  1572. FMADD y08, alpha2, a8, y08
  1573. LFD a8, 15 * SIZE(AO2)
  /* Column 2, rows 8..15. */
  1574. FMADD y09, alpha2, a1, y09
  1575. LFD a1, 0 * SIZE(AO3)
  1576. FMADD y10, alpha2, a2, y10
  1577. LFD a2, 1 * SIZE(AO3)
  1578. FMADD y11, alpha2, a3, y11
  1579. LFD a3, 2 * SIZE(AO3)
  1580. FMADD y12, alpha2, a4, y12
  1581. LFD a4, 3 * SIZE(AO3)
  1582. FMADD y13, alpha2, a5, y13
  1583. LFD a5, 4 * SIZE(AO3)
  1584. FMADD y14, alpha2, a6, y14
  1585. LFD a6, 5 * SIZE(AO3)
  1586. FMADD y15, alpha2, a7, y15
  1587. LFD a7, 6 * SIZE(AO3)
  1588. FMADD y16, alpha2, a8, y16
  1589. LFD a8, 7 * SIZE(AO3)
  /* Column 3, rows 0..7. */
  1590. FMADD y01, alpha3, a1, y01
  1591. LFD a1, 8 * SIZE(AO3)
  1592. FMADD y02, alpha3, a2, y02
  1593. LFD a2, 9 * SIZE(AO3)
  1594. FMADD y03, alpha3, a3, y03
  1595. LFD a3, 10 * SIZE(AO3)
  1596. FMADD y04, alpha3, a4, y04
  1597. LFD a4, 11 * SIZE(AO3)
  1598. FMADD y05, alpha3, a5, y05
  1599. LFD a5, 12 * SIZE(AO3)
  1600. FMADD y06, alpha3, a6, y06
  1601. LFD a6, 13 * SIZE(AO3)
  1602. FMADD y07, alpha3, a7, y07
  1603. LFD a7, 14 * SIZE(AO3)
  1604. FMADD y08, alpha3, a8, y08
  1605. LFD a8, 15 * SIZE(AO3)
  /* Column 3, rows 8..15. */
  1606. FMADD y09, alpha3, a1, y09
  1607. LFD a1, 0 * SIZE(AO4)
  1608. FMADD y10, alpha3, a2, y10
  1609. LFD a2, 1 * SIZE(AO4)
  1610. FMADD y11, alpha3, a3, y11
  1611. LFD a3, 2 * SIZE(AO4)
  1612. FMADD y12, alpha3, a4, y12
  1613. LFD a4, 3 * SIZE(AO4)
  1614. FMADD y13, alpha3, a5, y13
  1615. LFD a5, 4 * SIZE(AO4)
  1616. FMADD y14, alpha3, a6, y14
  1617. LFD a6, 5 * SIZE(AO4)
  1618. FMADD y15, alpha3, a7, y15
  1619. LFD a7, 6 * SIZE(AO4)
  1620. FMADD y16, alpha3, a8, y16
  1621. LFD a8, 7 * SIZE(AO4)
  /* Column 4, rows 0..7. */
  1622. FMADD y01, alpha4, a1, y01
  1623. LFD a1, 8 * SIZE(AO4)
  1624. FMADD y02, alpha4, a2, y02
  1625. LFD a2, 9 * SIZE(AO4)
  1626. FMADD y03, alpha4, a3, y03
  1627. LFD a3, 10 * SIZE(AO4)
  1628. FMADD y04, alpha4, a4, y04
  1629. LFD a4, 11 * SIZE(AO4)
  1630. FMADD y05, alpha4, a5, y05
  1631. LFD a5, 12 * SIZE(AO4)
  1632. FMADD y06, alpha4, a6, y06
  1633. LFD a6, 13 * SIZE(AO4)
  1634. FMADD y07, alpha4, a7, y07
  1635. LFD a7, 14 * SIZE(AO4)
  1636. FMADD y08, alpha4, a8, y08
  1637. LFD a8, 15 * SIZE(AO4)
  /* Column 4, rows 8..15; no more loads, so the pointer updates fill the
     slots between the FMADDs. */
  1638. FMADD y09, alpha4, a1, y09
  1639. addi AO1, AO1, 16 * SIZE
  1640. FMADD y10, alpha4, a2, y10
  1641. addi AO2, AO2, 16 * SIZE
  1642. FMADD y11, alpha4, a3, y11
  1643. addi AO3, AO3, 16 * SIZE
  1644. FMADD y12, alpha4, a4, y12
  1645. addi AO4, AO4, 16 * SIZE
  1646. FMADD y13, alpha4, a5, y13
  1647. FMADD y14, alpha4, a6, y14
  1648. FMADD y15, alpha4, a7, y15
  1649. FMADD y16, alpha4, a8, y16
  /* Write back all sixteen results and step Y1 past them. */
  1650. STFD y01, 0 * SIZE(Y1)
  1651. STFD y02, 1 * SIZE(Y1)
  1652. STFD y03, 2 * SIZE(Y1)
  1653. STFD y04, 3 * SIZE(Y1)
  1654. STFD y05, 4 * SIZE(Y1)
  1655. STFD y06, 5 * SIZE(Y1)
  1656. STFD y07, 6 * SIZE(Y1)
  1657. STFD y08, 7 * SIZE(Y1)
  1658. STFD y09, 8 * SIZE(Y1)
  1659. STFD y10, 9 * SIZE(Y1)
  1660. STFD y11, 10 * SIZE(Y1)
  1661. STFD y12, 11 * SIZE(Y1)
  1662. STFD y13, 12 * SIZE(Y1)
  1663. STFD y14, 13 * SIZE(Y1)
  1664. STFD y15, 14 * SIZE(Y1)
  1665. STFD y16, 15 * SIZE(Y1)
  1666. addi Y1, Y1, 16 * SIZE
  1667. .align 4
  /*
   * LL(25): row remainder of the four-column pass (M mod 16).
   * "andi." records the masked value in CR0; that value is never negative,
   * so "ble" is taken exactly when the tested bits are zero.
   *   - no leftover rows (M & 15 == 0): go to LL(30) (outside this chunk);
   *   - bit 3 clear (M & 8 == 0): skip the 8-row step and go to LL(26).
   * The 8-row step loads Y1[0..7] and AO1[0..7], applies columns 1..4,
   * stores Y1[0..7] and advances AO1..AO4 and Y1 by 8 elements.
   */
  1668. LL(25):
  1669. andi. r0, M, 15
  1670. ble LL(30)
  1671. andi. r0, M, 8
  1672. ble LL(26)
  1673. LFD y01, 0 * SIZE(Y1)
  1674. LFD y02, 1 * SIZE(Y1)
  1675. LFD y03, 2 * SIZE(Y1)
  1676. LFD y04, 3 * SIZE(Y1)
  1677. LFD y05, 4 * SIZE(Y1)
  1678. LFD y06, 5 * SIZE(Y1)
  1679. LFD y07, 6 * SIZE(Y1)
  1680. LFD y08, 7 * SIZE(Y1)
  1681. LFD a1, 0 * SIZE(AO1)
  1682. LFD a2, 1 * SIZE(AO1)
  1683. LFD a3, 2 * SIZE(AO1)
  1684. LFD a4, 3 * SIZE(AO1)
  1685. LFD a5, 4 * SIZE(AO1)
  1686. LFD a6, 5 * SIZE(AO1)
  1687. LFD a7, 6 * SIZE(AO1)
  1688. LFD a8, 7 * SIZE(AO1)
  /* Column 1; each FMADD is paired with the refill from column 2. */
  1689. FMADD y01, alpha1, a1, y01
  1690. LFD a1, 0 * SIZE(AO2)
  1691. FMADD y02, alpha1, a2, y02
  1692. LFD a2, 1 * SIZE(AO2)
  1693. FMADD y03, alpha1, a3, y03
  1694. LFD a3, 2 * SIZE(AO2)
  1695. FMADD y04, alpha1, a4, y04
  1696. LFD a4, 3 * SIZE(AO2)
  1697. FMADD y05, alpha1, a5, y05
  1698. LFD a5, 4 * SIZE(AO2)
  1699. FMADD y06, alpha1, a6, y06
  1700. LFD a6, 5 * SIZE(AO2)
  1701. FMADD y07, alpha1, a7, y07
  1702. LFD a7, 6 * SIZE(AO2)
  1703. FMADD y08, alpha1, a8, y08
  1704. LFD a8, 7 * SIZE(AO2)
  /* Column 2. */
  1705. FMADD y01, alpha2, a1, y01
  1706. LFD a1, 0 * SIZE(AO3)
  1707. FMADD y02, alpha2, a2, y02
  1708. LFD a2, 1 * SIZE(AO3)
  1709. FMADD y03, alpha2, a3, y03
  1710. LFD a3, 2 * SIZE(AO3)
  1711. FMADD y04, alpha2, a4, y04
  1712. LFD a4, 3 * SIZE(AO3)
  1713. FMADD y05, alpha2, a5, y05
  1714. LFD a5, 4 * SIZE(AO3)
  1715. FMADD y06, alpha2, a6, y06
  1716. LFD a6, 5 * SIZE(AO3)
  1717. FMADD y07, alpha2, a7, y07
  1718. LFD a7, 6 * SIZE(AO3)
  1719. FMADD y08, alpha2, a8, y08
  1720. LFD a8, 7 * SIZE(AO3)
  /* Column 3. */
  1721. FMADD y01, alpha3, a1, y01
  1722. LFD a1, 0 * SIZE(AO4)
  1723. FMADD y02, alpha3, a2, y02
  1724. LFD a2, 1 * SIZE(AO4)
  1725. FMADD y03, alpha3, a3, y03
  1726. LFD a3, 2 * SIZE(AO4)
  1727. FMADD y04, alpha3, a4, y04
  1728. LFD a4, 3 * SIZE(AO4)
  1729. FMADD y05, alpha3, a5, y05
  1730. LFD a5, 4 * SIZE(AO4)
  1731. FMADD y06, alpha3, a6, y06
  1732. LFD a6, 5 * SIZE(AO4)
  1733. FMADD y07, alpha3, a7, y07
  1734. LFD a7, 6 * SIZE(AO4)
  1735. FMADD y08, alpha3, a8, y08
  1736. LFD a8, 7 * SIZE(AO4)
  /* Column 4, interleaved with the pointer updates, then write-back. */
  1737. FMADD y01, alpha4, a1, y01
  1738. addi AO1, AO1, 8 * SIZE
  1739. FMADD y02, alpha4, a2, y02
  1740. addi AO2, AO2, 8 * SIZE
  1741. FMADD y03, alpha4, a3, y03
  1742. addi AO3, AO3, 8 * SIZE
  1743. FMADD y04, alpha4, a4, y04
  1744. addi AO4, AO4, 8 * SIZE
  1745. STFD y01, 0 * SIZE(Y1)
  1746. STFD y02, 1 * SIZE(Y1)
  1747. STFD y03, 2 * SIZE(Y1)
  1748. STFD y04, 3 * SIZE(Y1)
  1749. FMADD y05, alpha4, a5, y05
  1750. FMADD y06, alpha4, a6, y06
  1751. FMADD y07, alpha4, a7, y07
  1752. FMADD y08, alpha4, a8, y08
  1753. STFD y05, 4 * SIZE(Y1)
  1754. STFD y06, 5 * SIZE(Y1)
  1755. STFD y07, 6 * SIZE(Y1)
  1756. STFD y08, 7 * SIZE(Y1)
  1757. addi Y1, Y1, 8 * SIZE
  1758. .align 4
  /* LL(26): four-column pass, 4-row tail.  Skipped (to LL(27)) unless
   * bit 2 of M is set.  Updates four elements at Y1 with the alpha1..alpha4
   * weighted values from AO1..AO4 (FMADD assumed to be d = a*b + c), then
   * advances AO1..AO4 and Y1 by 4 elements.
   * a1..a4 first hold AO1 and are refilled from AO3; a5..a8 first hold AO2
   * and are refilled from AO4. */
  1759. LL(26):
  1760. andi. r0, M, 4
  1761. ble LL(27)
  1762. LFD y01, 0 * SIZE(Y1)
  1763. LFD y02, 1 * SIZE(Y1)
  1764. LFD y03, 2 * SIZE(Y1)
  1765. LFD y04, 3 * SIZE(Y1)
  1766. LFD a1, 0 * SIZE(AO1)
  1767. LFD a2, 1 * SIZE(AO1)
  1768. LFD a3, 2 * SIZE(AO1)
  1769. LFD a4, 3 * SIZE(AO1)
  1770. LFD a5, 0 * SIZE(AO2)
  1771. LFD a6, 1 * SIZE(AO2)
  1772. LFD a7, 2 * SIZE(AO2)
  1773. LFD a8, 3 * SIZE(AO2)
  1774. FMADD y01, alpha1, a1, y01
  1775. LFD a1, 0 * SIZE(AO3)
  1776. FMADD y02, alpha1, a2, y02
  1777. LFD a2, 1 * SIZE(AO3)
  1778. FMADD y03, alpha1, a3, y03
  1779. LFD a3, 2 * SIZE(AO3)
  1780. FMADD y04, alpha1, a4, y04
  1781. LFD a4, 3 * SIZE(AO3)
  1782. FMADD y01, alpha2, a5, y01
  1783. LFD a5, 0 * SIZE(AO4)
  1784. FMADD y02, alpha2, a6, y02
  1785. LFD a6, 1 * SIZE(AO4)
  1786. FMADD y03, alpha2, a7, y03
  1787. LFD a7, 2 * SIZE(AO4)
  1788. FMADD y04, alpha2, a8, y04
  1789. LFD a8, 3 * SIZE(AO4)
  1790. FMADD y01, alpha3, a1, y01
  1791. addi AO1, AO1, 4 * SIZE
  1792. FMADD y02, alpha3, a2, y02
  1793. addi AO2, AO2, 4 * SIZE
  1794. FMADD y03, alpha3, a3, y03
  1795. addi AO3, AO3, 4 * SIZE
  1796. FMADD y04, alpha3, a4, y04
  1797. addi AO4, AO4, 4 * SIZE
  1798. FMADD y01, alpha4, a5, y01
  1799. FMADD y02, alpha4, a6, y02
  1800. FMADD y03, alpha4, a7, y03
  1801. FMADD y04, alpha4, a8, y04
  1802. STFD y01, 0 * SIZE(Y1)
  1803. STFD y02, 1 * SIZE(Y1)
  1804. STFD y03, 2 * SIZE(Y1)
  1805. STFD y04, 3 * SIZE(Y1)
  1806. addi Y1, Y1, 4 * SIZE
  1807. .align 4
  /* LL(27): four-column pass, 2-row tail.  Skipped (to LL(28)) unless
   * bit 1 of M is set.  Register use: a1,a2 = AO1; a3,a4 = AO2;
   * a5,a6 = AO3; a7,a8 = AO4.  Both y elements receive the four
   * alpha-weighted column values, then AO1..AO4 and Y1 advance by 2. */
  1808. LL(27):
  1809. andi. r0, M, 2
  1810. ble LL(28)
  1811. LFD y01, 0 * SIZE(Y1)
  1812. LFD y02, 1 * SIZE(Y1)
  1813. LFD a1, 0 * SIZE(AO1)
  1814. LFD a2, 1 * SIZE(AO1)
  1815. LFD a3, 0 * SIZE(AO2)
  1816. LFD a4, 1 * SIZE(AO2)
  1817. LFD a5, 0 * SIZE(AO3)
  1818. LFD a6, 1 * SIZE(AO3)
  1819. LFD a7, 0 * SIZE(AO4)
  1820. LFD a8, 1 * SIZE(AO4)
  1821. FMADD y01, alpha1, a1, y01
  1822. addi AO1, AO1, 2 * SIZE
  1823. FMADD y02, alpha1, a2, y02
  1824. addi AO2, AO2, 2 * SIZE
  1825. FMADD y01, alpha2, a3, y01
  1826. addi AO3, AO3, 2 * SIZE
  1827. FMADD y02, alpha2, a4, y02
  1828. addi AO4, AO4, 2 * SIZE
  1829. FMADD y01, alpha3, a5, y01
  1830. FMADD y02, alpha3, a6, y02
  1831. FMADD y01, alpha4, a7, y01
  1832. FMADD y02, alpha4, a8, y02
  1833. STFD y01, 0 * SIZE(Y1)
  1834. STFD y02, 1 * SIZE(Y1)
  1835. addi Y1, Y1, 2 * SIZE
  1836. .align 4
  /* LL(28): four-column pass, last single row.  Skipped (to LL(30)) unless
   * bit 0 of M is set.  One y element receives the four alpha-weighted
   * column values.  No pointer is advanced afterwards; the code that
   * follows at LL(30)/LL(40) reloads AO1 and Y1 before using them. */
  1838. LL(28):
  1839. andi. r0, M, 1
  1840. ble LL(30)
  1841. LFD y01, 0 * SIZE(Y1)
  1842. LFD a1, 0 * SIZE(AO1)
  1843. LFD a2, 0 * SIZE(AO2)
  1844. LFD a3, 0 * SIZE(AO3)
  1845. LFD a4, 0 * SIZE(AO4)
  1846. FMADD y01, alpha1, a1, y01
  1847. FMADD y01, alpha2, a2, y01
  1848. FMADD y01, alpha3, a3, y01
  1849. FMADD y01, alpha4, a4, y01
  1850. STFD y01, 0 * SIZE(Y1)
  1851. .align 4
  /* LL(30): set-up for the two-column pass, taken only when bit 1 of N is
   * set (otherwise branch to LL(40)).
   *   alpha1, alpha2 = alpha * two consecutive x values (X stepped by INCX)
   *   AO1 = A, AO2 = A + LDA, and A is moved on to AO2 + LDA
   *   Y1  = YY
   *   CTR = M >> 4 (number of 16-row groups); none -> LL(35)
   * The first 16 y values and the first 8 AO1 values are then preloaded
   * for the loop at LL(32). */
  1852. LL(30):
  1853. andi. J, N, 2
  /* lfd does not touch CR0, so the ble below still tests the andi. result. */
  1854. lfd alpha, ALPHA
  1855. ble LL(40)
  1856. .align 4
  1857. LFD alpha1, 0 * SIZE(X)
  1858. add X, X, INCX
  1859. LFD alpha2, 0 * SIZE(X)
  1860. add X, X, INCX
  1861. FMUL alpha1, alpha, alpha1
  1862. FMUL alpha2, alpha, alpha2
  1863. mr AO1, A
  1864. add AO2, A, LDA
  1865. add A, AO2, LDA
  1866. mr Y1, YY
  1867. srawi. r0, M, 4
  1868. mtspr CTR, r0
  1869. ble LL(35)
  1870. LFD y01, 0 * SIZE(Y1)
  1871. LFD y02, 1 * SIZE(Y1)
  1872. LFD y03, 2 * SIZE(Y1)
  1873. LFD y04, 3 * SIZE(Y1)
  1874. LFD y05, 4 * SIZE(Y1)
  1875. LFD y06, 5 * SIZE(Y1)
  1876. LFD y07, 6 * SIZE(Y1)
  1877. LFD y08, 7 * SIZE(Y1)
  1878. LFD y09, 8 * SIZE(Y1)
  1879. LFD y10, 9 * SIZE(Y1)
  1880. LFD y11, 10 * SIZE(Y1)
  1881. LFD y12, 11 * SIZE(Y1)
  1882. LFD y13, 12 * SIZE(Y1)
  1883. LFD y14, 13 * SIZE(Y1)
  1884. LFD y15, 14 * SIZE(Y1)
  1885. LFD y16, 15 * SIZE(Y1)
  1886. LFD a1, 0 * SIZE(AO1)
  1887. LFD a2, 1 * SIZE(AO1)
  1888. LFD a3, 2 * SIZE(AO1)
  1889. LFD a4, 3 * SIZE(AO1)
  1890. LFD a5, 4 * SIZE(AO1)
  1891. LFD a6, 5 * SIZE(AO1)
  1892. LFD a7, 6 * SIZE(AO1)
  1893. LFD a8, 7 * SIZE(AO1)
  /* bdz decrements CTR first: with a single 16-row group the loop body is
   * bypassed and only the final-group code at LL(33) runs. */
  1894. bdz LL(33)
  1895. .align 4
  /* LL(32): two-column pass, one 16-row group per iteration.
   * On entry y01..y16 hold the 16 y values at Y1 and a1..a8 hold
   * AO1[0..7].  Each y receives alpha1*AO1[i] + alpha2*AO2[i] (FMADD
   * assumed to be d = a*b + c).  a1..a8 are refilled right after use, in
   * the order AO1[8..15], AO2[0..7], AO2[8..15], then AO1[16..23], which is
   * the first half-column of the NEXT group.  Likewise each y register is
   * stored and immediately reloaded from 16 elements further on.
   * Because of that read-ahead the loop must not run for the last group;
   * the last group is handled at LL(33). */
  1896. LL(32):
  1897. FMADD y01, alpha1, a1, y01
  1898. LFD a1, 8 * SIZE(AO1)
  1899. FMADD y02, alpha1, a2, y02
  1900. LFD a2, 9 * SIZE(AO1)
  1901. FMADD y03, alpha1, a3, y03
  1902. LFD a3, 10 * SIZE(AO1)
  1903. FMADD y04, alpha1, a4, y04
  1904. LFD a4, 11 * SIZE(AO1)
  1905. FMADD y05, alpha1, a5, y05
  1906. LFD a5, 12 * SIZE(AO1)
  1907. FMADD y06, alpha1, a6, y06
  1908. LFD a6, 13 * SIZE(AO1)
  1909. FMADD y07, alpha1, a7, y07
  1910. LFD a7, 14 * SIZE(AO1)
  1911. FMADD y08, alpha1, a8, y08
  1912. LFD a8, 15 * SIZE(AO1)
  1913. FMADD y09, alpha1, a1, y09
  1914. LFD a1, 0 * SIZE(AO2)
  1915. FMADD y10, alpha1, a2, y10
  1916. LFD a2, 1 * SIZE(AO2)
  1917. FMADD y11, alpha1, a3, y11
  1918. LFD a3, 2 * SIZE(AO2)
  1919. FMADD y12, alpha1, a4, y12
  1920. LFD a4, 3 * SIZE(AO2)
  1921. FMADD y13, alpha1, a5, y13
  1922. LFD a5, 4 * SIZE(AO2)
  1923. FMADD y14, alpha1, a6, y14
  1924. LFD a6, 5 * SIZE(AO2)
  1925. FMADD y15, alpha1, a7, y15
  1926. LFD a7, 6 * SIZE(AO2)
  1927. FMADD y16, alpha1, a8, y16
  1928. LFD a8, 7 * SIZE(AO2)
  1929. FMADD y01, alpha2, a1, y01
  1930. LFD a1, 8 * SIZE(AO2)
  1931. FMADD y02, alpha2, a2, y02
  1932. LFD a2, 9 * SIZE(AO2)
  1933. FMADD y03, alpha2, a3, y03
  1934. LFD a3, 10 * SIZE(AO2)
  1935. FMADD y04, alpha2, a4, y04
  1936. LFD a4, 11 * SIZE(AO2)
  1937. FMADD y05, alpha2, a5, y05
  1938. LFD a5, 12 * SIZE(AO2)
  1939. FMADD y06, alpha2, a6, y06
  1940. LFD a6, 13 * SIZE(AO2)
  1941. FMADD y07, alpha2, a7, y07
  1942. LFD a7, 14 * SIZE(AO2)
  1943. FMADD y08, alpha2, a8, y08
  1944. LFD a8, 15 * SIZE(AO2)
  /* From here the a-register refills come from the next group (AO1[16..23]). */
  1945. FMADD y09, alpha2, a1, y09
  1946. LFD a1, 16 * SIZE(AO1)
  1947. FMADD y10, alpha2, a2, y10
  1948. LFD a2, 17 * SIZE(AO1)
  1949. FMADD y11, alpha2, a3, y11
  1950. LFD a3, 18 * SIZE(AO1)
  1951. FMADD y12, alpha2, a4, y12
  1952. LFD a4, 19 * SIZE(AO1)
  1953. FMADD y13, alpha2, a5, y13
  1954. LFD a5, 20 * SIZE(AO1)
  1955. FMADD y14, alpha2, a6, y14
  1956. LFD a6, 21 * SIZE(AO1)
  1957. FMADD y15, alpha2, a7, y15
  1958. LFD a7, 22 * SIZE(AO1)
  1959. FMADD y16, alpha2, a8, y16
  1960. LFD a8, 23 * SIZE(AO1)
  /* Store the finished group, four values at a time, each store batch
   * followed by the loads of the matching values of the next group. */
  1961. STFD y01, 0 * SIZE(Y1)
  1962. STFD y02, 1 * SIZE(Y1)
  1963. STFD y03, 2 * SIZE(Y1)
  1964. STFD y04, 3 * SIZE(Y1)
  1965. LFD y01, 16 * SIZE(Y1)
  1966. LFD y02, 17 * SIZE(Y1)
  1967. LFD y03, 18 * SIZE(Y1)
  1968. LFD y04, 19 * SIZE(Y1)
  1969. STFD y05, 4 * SIZE(Y1)
  1970. STFD y06, 5 * SIZE(Y1)
  1971. STFD y07, 6 * SIZE(Y1)
  1972. STFD y08, 7 * SIZE(Y1)
  1973. LFD y05, 20 * SIZE(Y1)
  1974. LFD y06, 21 * SIZE(Y1)
  1975. LFD y07, 22 * SIZE(Y1)
  1976. LFD y08, 23 * SIZE(Y1)
  1977. STFD y09, 8 * SIZE(Y1)
  1978. STFD y10, 9 * SIZE(Y1)
  1979. STFD y11, 10 * SIZE(Y1)
  1980. STFD y12, 11 * SIZE(Y1)
  1981. LFD y09, 24 * SIZE(Y1)
  1982. LFD y10, 25 * SIZE(Y1)
  1983. LFD y11, 26 * SIZE(Y1)
  1984. LFD y12, 27 * SIZE(Y1)
  1985. STFD y13, 12 * SIZE(Y1)
  1986. STFD y14, 13 * SIZE(Y1)
  1987. STFD y15, 14 * SIZE(Y1)
  1988. STFD y16, 15 * SIZE(Y1)
  1989. LFD y13, 28 * SIZE(Y1)
  1990. LFD y14, 29 * SIZE(Y1)
  1991. LFD y15, 30 * SIZE(Y1)
  1992. LFD y16, 31 * SIZE(Y1)
  1993. addi AO1, AO1, 16 * SIZE
  1994. addi AO2, AO2, 16 * SIZE
  1995. addi Y1, Y1, 16 * SIZE
  /* DCBT is a macro defined outside this chunk; presumably a cache-touch
   * prefetch at pointer + PREA / PREC -- TODO confirm. */
  1996. DCBT(AO1, PREA)
  1997. DCBT(AO2, PREA)
  1998. DCBT(Y1, PREC)
  1999. bdnz LL(32)
  2000. .align 4
  /* LL(33): two-column pass, final 16-row group.  Same arithmetic as the
   * loop at LL(32) (y[i] += alpha1*AO1[i] + alpha2*AO2[i]) starting from
   * the already preloaded y01..y16 and a1..a8 = AO1[0..7], but nothing is
   * loaded beyond the current group.  Ends by advancing AO1, AO2 and Y1
   * by 16 elements for the tail code at LL(35). */
  2001. LL(33):
  2002. FMADD y01, alpha1, a1, y01
  2003. LFD a1, 8 * SIZE(AO1)
  2004. FMADD y02, alpha1, a2, y02
  2005. LFD a2, 9 * SIZE(AO1)
  2006. FMADD y03, alpha1, a3, y03
  2007. LFD a3, 10 * SIZE(AO1)
  2008. FMADD y04, alpha1, a4, y04
  2009. LFD a4, 11 * SIZE(AO1)
  2010. FMADD y05, alpha1, a5, y05
  2011. LFD a5, 12 * SIZE(AO1)
  2012. FMADD y06, alpha1, a6, y06
  2013. LFD a6, 13 * SIZE(AO1)
  2014. FMADD y07, alpha1, a7, y07
  2015. LFD a7, 14 * SIZE(AO1)
  2016. FMADD y08, alpha1, a8, y08
  2017. LFD a8, 15 * SIZE(AO1)
  2018. FMADD y09, alpha1, a1, y09
  2019. LFD a1, 0 * SIZE(AO2)
  2020. FMADD y10, alpha1, a2, y10
  2021. LFD a2, 1 * SIZE(AO2)
  2022. FMADD y11, alpha1, a3, y11
  2023. LFD a3, 2 * SIZE(AO2)
  2024. FMADD y12, alpha1, a4, y12
  2025. LFD a4, 3 * SIZE(AO2)
  2026. FMADD y13, alpha1, a5, y13
  2027. LFD a5, 4 * SIZE(AO2)
  2028. FMADD y14, alpha1, a6, y14
  2029. LFD a6, 5 * SIZE(AO2)
  2030. FMADD y15, alpha1, a7, y15
  2031. LFD a7, 6 * SIZE(AO2)
  2032. FMADD y16, alpha1, a8, y16
  2033. LFD a8, 7 * SIZE(AO2)
  2034. FMADD y01, alpha2, a1, y01
  2035. LFD a1, 8 * SIZE(AO2)
  2036. FMADD y02, alpha2, a2, y02
  2037. LFD a2, 9 * SIZE(AO2)
  2038. FMADD y03, alpha2, a3, y03
  2039. LFD a3, 10 * SIZE(AO2)
  2040. FMADD y04, alpha2, a4, y04
  2041. LFD a4, 11 * SIZE(AO2)
  2042. FMADD y05, alpha2, a5, y05
  2043. LFD a5, 12 * SIZE(AO2)
  2044. FMADD y06, alpha2, a6, y06
  2045. LFD a6, 13 * SIZE(AO2)
  2046. FMADD y07, alpha2, a7, y07
  2047. LFD a7, 14 * SIZE(AO2)
  2048. FMADD y08, alpha2, a8, y08
  2049. LFD a8, 15 * SIZE(AO2)
  /* Last eight FMADDs: no further refills, this is the end of the group. */
  2050. FMADD y09, alpha2, a1, y09
  2051. FMADD y10, alpha2, a2, y10
  2052. FMADD y11, alpha2, a3, y11
  2053. FMADD y12, alpha2, a4, y12
  2054. FMADD y13, alpha2, a5, y13
  2055. FMADD y14, alpha2, a6, y14
  2056. FMADD y15, alpha2, a7, y15
  2057. FMADD y16, alpha2, a8, y16
  2058. STFD y01, 0 * SIZE(Y1)
  2059. STFD y02, 1 * SIZE(Y1)
  2060. STFD y03, 2 * SIZE(Y1)
  2061. STFD y04, 3 * SIZE(Y1)
  2062. STFD y05, 4 * SIZE(Y1)
  2063. STFD y06, 5 * SIZE(Y1)
  2064. STFD y07, 6 * SIZE(Y1)
  2065. STFD y08, 7 * SIZE(Y1)
  2066. STFD y09, 8 * SIZE(Y1)
  2067. STFD y10, 9 * SIZE(Y1)
  2068. STFD y11, 10 * SIZE(Y1)
  2069. STFD y12, 11 * SIZE(Y1)
  2070. STFD y13, 12 * SIZE(Y1)
  2071. STFD y14, 13 * SIZE(Y1)
  2072. STFD y15, 14 * SIZE(Y1)
  2073. STFD y16, 15 * SIZE(Y1)
  2074. addi AO1, AO1, 16 * SIZE
  2075. addi AO2, AO2, 16 * SIZE
  2076. addi Y1, Y1, 16 * SIZE
  2077. .align 4
  /* LL(35): two-column pass, leftover rows.  Leaves for LL(40) when
   * M & 15 == 0.  When bit 3 of M is set, eight elements at Y1 receive
   * alpha1*AO1[i] + alpha2*AO2[i] (FMADD assumed to be d = a*b + c);
   * a1..a8 hold AO1 first and are refilled from AO2 as they are consumed.
   * AO1, AO2 and Y1 then advance by 8 elements. */
  2078. LL(35):
  2079. andi. r0, M, 15
  2080. ble LL(40)
  2081. andi. r0, M, 8
  2082. ble LL(36)
  2083. LFD y01, 0 * SIZE(Y1)
  2084. LFD y02, 1 * SIZE(Y1)
  2085. LFD y03, 2 * SIZE(Y1)
  2086. LFD y04, 3 * SIZE(Y1)
  2087. LFD y05, 4 * SIZE(Y1)
  2088. LFD y06, 5 * SIZE(Y1)
  2089. LFD y07, 6 * SIZE(Y1)
  2090. LFD y08, 7 * SIZE(Y1)
  2091. LFD a1, 0 * SIZE(AO1)
  2092. LFD a2, 1 * SIZE(AO1)
  2093. LFD a3, 2 * SIZE(AO1)
  2094. LFD a4, 3 * SIZE(AO1)
  2095. LFD a5, 4 * SIZE(AO1)
  2096. LFD a6, 5 * SIZE(AO1)
  2097. LFD a7, 6 * SIZE(AO1)
  2098. LFD a8, 7 * SIZE(AO1)
  2099. FMADD y01, alpha1, a1, y01
  2100. LFD a1, 0 * SIZE(AO2)
  2101. FMADD y02, alpha1, a2, y02
  2102. LFD a2, 1 * SIZE(AO2)
  2103. FMADD y03, alpha1, a3, y03
  2104. LFD a3, 2 * SIZE(AO2)
  2105. FMADD y04, alpha1, a4, y04
  2106. LFD a4, 3 * SIZE(AO2)
  2107. FMADD y05, alpha1, a5, y05
  2108. LFD a5, 4 * SIZE(AO2)
  2109. FMADD y06, alpha1, a6, y06
  2110. LFD a6, 5 * SIZE(AO2)
  2111. FMADD y07, alpha1, a7, y07
  2112. LFD a7, 6 * SIZE(AO2)
  2113. FMADD y08, alpha1, a8, y08
  2114. LFD a8, 7 * SIZE(AO2)
  2115. FMADD y01, alpha2, a1, y01
  2116. FMADD y02, alpha2, a2, y02
  2117. FMADD y03, alpha2, a3, y03
  2118. FMADD y04, alpha2, a4, y04
  2119. FMADD y05, alpha2, a5, y05
  2120. FMADD y06, alpha2, a6, y06
  2121. FMADD y07, alpha2, a7, y07
  2122. FMADD y08, alpha2, a8, y08
  2123. STFD y01, 0 * SIZE(Y1)
  2124. STFD y02, 1 * SIZE(Y1)
  2125. STFD y03, 2 * SIZE(Y1)
  2126. STFD y04, 3 * SIZE(Y1)
  2127. STFD y05, 4 * SIZE(Y1)
  2128. STFD y06, 5 * SIZE(Y1)
  2129. STFD y07, 6 * SIZE(Y1)
  2130. STFD y08, 7 * SIZE(Y1)
  2131. addi AO1, AO1, 8 * SIZE
  2132. addi AO2, AO2, 8 * SIZE
  2133. addi Y1, Y1, 8 * SIZE
  2134. .align 4
  /* LL(36): two-column pass, 4-row tail.  Skipped (to LL(37)) unless
   * bit 2 of M is set.  a1..a4 = AO1[0..3], a5..a8 = AO2[0..3]; each of
   * the four y values receives alpha1*AO1[i] + alpha2*AO2[i].  AO1, AO2
   * and Y1 then advance by 4 elements. */
  2135. LL(36):
  2136. andi. r0, M, 4
  2137. ble LL(37)
  2138. LFD y01, 0 * SIZE(Y1)
  2139. LFD y02, 1 * SIZE(Y1)
  2140. LFD y03, 2 * SIZE(Y1)
  2141. LFD y04, 3 * SIZE(Y1)
  2142. LFD a1, 0 * SIZE(AO1)
  2143. LFD a2, 1 * SIZE(AO1)
  2144. LFD a3, 2 * SIZE(AO1)
  2145. LFD a4, 3 * SIZE(AO1)
  2146. LFD a5, 0 * SIZE(AO2)
  2147. LFD a6, 1 * SIZE(AO2)
  2148. LFD a7, 2 * SIZE(AO2)
  2149. LFD a8, 3 * SIZE(AO2)
  2150. FMADD y01, alpha1, a1, y01
  2151. FMADD y02, alpha1, a2, y02
  2152. FMADD y03, alpha1, a3, y03
  2153. FMADD y04, alpha1, a4, y04
  2154. FMADD y01, alpha2, a5, y01
  2155. FMADD y02, alpha2, a6, y02
  2156. FMADD y03, alpha2, a7, y03
  2157. FMADD y04, alpha2, a8, y04
  2158. STFD y01, 0 * SIZE(Y1)
  2159. STFD y02, 1 * SIZE(Y1)
  2160. STFD y03, 2 * SIZE(Y1)
  2161. STFD y04, 3 * SIZE(Y1)
  2162. addi AO1, AO1, 4 * SIZE
  2163. addi AO2, AO2, 4 * SIZE
  2164. addi Y1, Y1, 4 * SIZE
  2165. .align 4
  /* LL(37): two-column pass, 2-row tail.  Skipped (to LL(38)) unless
   * bit 1 of M is set.  a1,a2 = AO1[0..1], a3,a4 = AO2[0..1]; both y
   * values receive alpha1*AO1[i] + alpha2*AO2[i], then AO1, AO2 and Y1
   * advance by 2 elements. */
  2166. LL(37):
  2167. andi. r0, M, 2
  2168. ble LL(38)
  2169. LFD y01, 0 * SIZE(Y1)
  2170. LFD y02, 1 * SIZE(Y1)
  2171. LFD a1, 0 * SIZE(AO1)
  2172. LFD a2, 1 * SIZE(AO1)
  2173. LFD a3, 0 * SIZE(AO2)
  2174. LFD a4, 1 * SIZE(AO2)
  2175. FMADD y01, alpha1, a1, y01
  2176. FMADD y02, alpha1, a2, y02
  2177. FMADD y01, alpha2, a3, y01
  2178. FMADD y02, alpha2, a4, y02
  2179. STFD y01, 0 * SIZE(Y1)
  2180. STFD y02, 1 * SIZE(Y1)
  2181. addi AO1, AO1, 2 * SIZE
  2182. addi AO2, AO2, 2 * SIZE
  2183. addi Y1, Y1, 2 * SIZE
  2184. .align 4
  /* LL(38): two-column pass, last single row.  Skipped (to LL(40)) unless
   * bit 0 of M is set.  One y value receives alpha1*AO1[0] + alpha2*AO2[0].
   * No pointer is advanced afterwards. */
  2185. LL(38):
  2186. andi. r0, M, 1
  2187. ble LL(40)
  2188. LFD y01, 0 * SIZE(Y1)
  2189. LFD a1, 0 * SIZE(AO1)
  2190. LFD a2, 0 * SIZE(AO2)
  2191. FMADD y01, alpha1, a1, y01
  2192. FMADD y01, alpha2, a2, y01
  2193. STFD y01, 0 * SIZE(Y1)
  2194. .align 4
  /* LL(40): set-up for the single-column pass, taken only when bit 0 of N
   * is set (otherwise branch to LL(990)).
   *   alpha1 = alpha * x value at X
   *   AO1 = A, Y1 = YY
   *   CTR = M >> 4 (number of 16-row groups); none -> LL(45)
   * y01..y16 and a1..a8 = AO1[0..7] are then preloaded for LL(42). */
  2195. LL(40):
  2196. andi. J, N, 1
  /* lfd does not touch CR0, so the ble below still tests the andi. result. */
  2197. lfd alpha, ALPHA
  2198. ble LL(990)
  2199. .align 4
  2200. LFD alpha1, 0 * SIZE(X)
  2201. FMUL alpha1, alpha, alpha1
  2202. mr AO1, A
  2203. mr Y1, YY
  2204. srawi. r0, M, 4
  2205. mtspr CTR, r0
  2206. ble LL(45)
  2207. LFD y01, 0 * SIZE(Y1)
  2208. LFD y02, 1 * SIZE(Y1)
  2209. LFD y03, 2 * SIZE(Y1)
  2210. LFD y04, 3 * SIZE(Y1)
  2211. LFD y05, 4 * SIZE(Y1)
  2212. LFD y06, 5 * SIZE(Y1)
  2213. LFD y07, 6 * SIZE(Y1)
  2214. LFD y08, 7 * SIZE(Y1)
  2215. LFD a1, 0 * SIZE(AO1)
  2216. LFD a2, 1 * SIZE(AO1)
  2217. LFD a3, 2 * SIZE(AO1)
  2218. LFD a4, 3 * SIZE(AO1)
  2219. LFD a5, 4 * SIZE(AO1)
  2220. LFD a6, 5 * SIZE(AO1)
  2221. LFD a7, 6 * SIZE(AO1)
  2222. LFD a8, 7 * SIZE(AO1)
  2223. LFD y09, 8 * SIZE(Y1)
  2224. LFD y10, 9 * SIZE(Y1)
  2225. LFD y11, 10 * SIZE(Y1)
  2226. LFD y12, 11 * SIZE(Y1)
  2227. LFD y13, 12 * SIZE(Y1)
  2228. LFD y14, 13 * SIZE(Y1)
  2229. LFD y15, 14 * SIZE(Y1)
  2230. LFD y16, 15 * SIZE(Y1)
  /* bdz decrements CTR first: with a single 16-row group the loop body is
   * bypassed and only the final-group code at LL(43) runs. */
  2231. bdz LL(43)
  2232. .align 4
  /* LL(42): single-column pass, one 16-row group per iteration.
   * On entry y01..y16 hold the 16 y values at Y1 and a1..a8 hold
   * AO1[0..7].  Each y receives alpha1*AO1[i] (FMADD assumed to be
   * d = a*b + c).  a1..a8 are refilled right after use, first with
   * AO1[8..15] and then with AO1[16..23] (the next group); every y is
   * stored and immediately reloaded from 16 elements further on.
   * Because of that read-ahead the last group is handled at LL(43). */
  2233. LL(42):
  2234. FMADD y01, alpha1, a1, y01
  2235. LFD a1, 8 * SIZE(AO1)
  2236. FMADD y02, alpha1, a2, y02
  2237. LFD a2, 9 * SIZE(AO1)
  2238. FMADD y03, alpha1, a3, y03
  2239. LFD a3, 10 * SIZE(AO1)
  2240. FMADD y04, alpha1, a4, y04
  2241. LFD a4, 11 * SIZE(AO1)
  2242. FMADD y05, alpha1, a5, y05
  2243. LFD a5, 12 * SIZE(AO1)
  2244. FMADD y06, alpha1, a6, y06
  2245. LFD a6, 13 * SIZE(AO1)
  2246. FMADD y07, alpha1, a7, y07
  2247. LFD a7, 14 * SIZE(AO1)
  2248. FMADD y08, alpha1, a8, y08
  2249. LFD a8, 15 * SIZE(AO1)
  2250. FMADD y09, alpha1, a1, y09
  2251. LFD a1, 16 * SIZE(AO1)
  2252. FMADD y10, alpha1, a2, y10
  2253. LFD a2, 17 * SIZE(AO1)
  2254. FMADD y11, alpha1, a3, y11
  2255. LFD a3, 18 * SIZE(AO1)
  2256. FMADD y12, alpha1, a4, y12
  2257. LFD a4, 19 * SIZE(AO1)
  2258. FMADD y13, alpha1, a5, y13
  2259. LFD a5, 20 * SIZE(AO1)
  2260. FMADD y14, alpha1, a6, y14
  2261. LFD a6, 21 * SIZE(AO1)
  2262. FMADD y15, alpha1, a7, y15
  2263. LFD a7, 22 * SIZE(AO1)
  2264. FMADD y16, alpha1, a8, y16
  2265. LFD a8, 23 * SIZE(AO1)
  /* Store each finished y and reload the same register for the next group. */
  2266. STFD y01, 0 * SIZE(Y1)
  2267. LFD y01, 16 * SIZE(Y1)
  2268. STFD y02, 1 * SIZE(Y1)
  2269. LFD y02, 17 * SIZE(Y1)
  2270. STFD y03, 2 * SIZE(Y1)
  2271. LFD y03, 18 * SIZE(Y1)
  2272. STFD y04, 3 * SIZE(Y1)
  2273. LFD y04, 19 * SIZE(Y1)
  2274. STFD y05, 4 * SIZE(Y1)
  2275. LFD y05, 20 * SIZE(Y1)
  2276. STFD y06, 5 * SIZE(Y1)
  2277. LFD y06, 21 * SIZE(Y1)
  2278. STFD y07, 6 * SIZE(Y1)
  2279. LFD y07, 22 * SIZE(Y1)
  2280. STFD y08, 7 * SIZE(Y1)
  2281. LFD y08, 23 * SIZE(Y1)
  2282. STFD y09, 8 * SIZE(Y1)
  2283. LFD y09, 24 * SIZE(Y1)
  2284. STFD y10, 9 * SIZE(Y1)
  2285. LFD y10, 25 * SIZE(Y1)
  2286. STFD y11, 10 * SIZE(Y1)
  2287. LFD y11, 26 * SIZE(Y1)
  2288. STFD y12, 11 * SIZE(Y1)
  2289. LFD y12, 27 * SIZE(Y1)
  2290. STFD y13, 12 * SIZE(Y1)
  2291. LFD y13, 28 * SIZE(Y1)
  2292. STFD y14, 13 * SIZE(Y1)
  2293. LFD y14, 29 * SIZE(Y1)
  2294. STFD y15, 14 * SIZE(Y1)
  2295. LFD y15, 30 * SIZE(Y1)
  2296. STFD y16, 15 * SIZE(Y1)
  2297. LFD y16, 31 * SIZE(Y1)
  2298. addi AO1, AO1, 16 * SIZE
  2299. addi Y1, Y1, 16 * SIZE
  /* DCBT is a macro defined outside this chunk; presumably a cache-touch
   * prefetch at pointer + PREA / PREC -- TODO confirm. */
  2300. DCBT(AO1, PREA)
  2301. DCBT(Y1, PREC)
  2302. bdnz LL(42)
  2303. .align 4
  /* LL(43): single-column pass, final 16-row group.  Same arithmetic as
   * the loop at LL(42) (y[i] += alpha1*AO1[i]) starting from the preloaded
   * y01..y16 and a1..a8 = AO1[0..7]; only AO1[8..15] is still loaded, and
   * nothing beyond the current group.  Ends by advancing AO1 and Y1 by 16
   * elements for the tail code at LL(45). */
  2304. LL(43):
  2305. FMADD y01, alpha1, a1, y01
  2306. LFD a1, 8 * SIZE(AO1)
  2307. FMADD y02, alpha1, a2, y02
  2308. LFD a2, 9 * SIZE(AO1)
  2309. FMADD y03, alpha1, a3, y03
  2310. LFD a3, 10 * SIZE(AO1)
  2311. FMADD y04, alpha1, a4, y04
  2312. LFD a4, 11 * SIZE(AO1)
  2313. FMADD y05, alpha1, a5, y05
  2314. LFD a5, 12 * SIZE(AO1)
  2315. FMADD y06, alpha1, a6, y06
  2316. LFD a6, 13 * SIZE(AO1)
  2317. FMADD y07, alpha1, a7, y07
  2318. LFD a7, 14 * SIZE(AO1)
  2319. FMADD y08, alpha1, a8, y08
  2320. LFD a8, 15 * SIZE(AO1)
  2321. FMADD y09, alpha1, a1, y09
  2322. FMADD y10, alpha1, a2, y10
  2323. FMADD y11, alpha1, a3, y11
  2324. FMADD y12, alpha1, a4, y12
  2325. FMADD y13, alpha1, a5, y13
  2326. FMADD y14, alpha1, a6, y14
  2327. FMADD y15, alpha1, a7, y15
  2328. FMADD y16, alpha1, a8, y16
  2329. STFD y01, 0 * SIZE(Y1)
  2330. STFD y02, 1 * SIZE(Y1)
  2331. STFD y03, 2 * SIZE(Y1)
  2332. STFD y04, 3 * SIZE(Y1)
  2333. STFD y05, 4 * SIZE(Y1)
  2334. STFD y06, 5 * SIZE(Y1)
  2335. STFD y07, 6 * SIZE(Y1)
  2336. STFD y08, 7 * SIZE(Y1)
  2337. STFD y09, 8 * SIZE(Y1)
  2338. STFD y10, 9 * SIZE(Y1)
  2339. STFD y11, 10 * SIZE(Y1)
  2340. STFD y12, 11 * SIZE(Y1)
  2341. STFD y13, 12 * SIZE(Y1)
  2342. STFD y14, 13 * SIZE(Y1)
  2343. STFD y15, 14 * SIZE(Y1)
  2344. STFD y16, 15 * SIZE(Y1)
  2345. addi AO1, AO1, 16 * SIZE
  2346. addi Y1, Y1, 16 * SIZE
  2347. .align 4
  /* LL(45): single-column pass, leftover rows.  Leaves for LL(990) when
   * M & 15 == 0.  When bit 3 of M is set, eight elements at Y1 receive
   * alpha1*AO1[i] (FMADD assumed to be d = a*b + c), then AO1 and Y1
   * advance by 8 elements. */
  2348. LL(45):
  2349. andi. r0, M, 15
  2350. ble LL(990)
  2351. andi. r0, M, 8
  2352. ble LL(46)
  2353. LFD y01, 0 * SIZE(Y1)
  2354. LFD y02, 1 * SIZE(Y1)
  2355. LFD y03, 2 * SIZE(Y1)
  2356. LFD y04, 3 * SIZE(Y1)
  2357. LFD a1, 0 * SIZE(AO1)
  2358. LFD a2, 1 * SIZE(AO1)
  2359. LFD a3, 2 * SIZE(AO1)
  2360. LFD a4, 3 * SIZE(AO1)
  2361. LFD y05, 4 * SIZE(Y1)
  2362. LFD y06, 5 * SIZE(Y1)
  2363. LFD y07, 6 * SIZE(Y1)
  2364. LFD y08, 7 * SIZE(Y1)
  2365. LFD a5, 4 * SIZE(AO1)
  2366. LFD a6, 5 * SIZE(AO1)
  2367. LFD a7, 6 * SIZE(AO1)
  2368. LFD a8, 7 * SIZE(AO1)
  2369. FMADD y01, alpha1, a1, y01
  2370. FMADD y02, alpha1, a2, y02
  2371. FMADD y03, alpha1, a3, y03
  2372. FMADD y04, alpha1, a4, y04
  2373. FMADD y05, alpha1, a5, y05
  2374. FMADD y06, alpha1, a6, y06
  2375. FMADD y07, alpha1, a7, y07
  2376. FMADD y08, alpha1, a8, y08
  2377. STFD y01, 0 * SIZE(Y1)
  2378. STFD y02, 1 * SIZE(Y1)
  2379. STFD y03, 2 * SIZE(Y1)
  2380. STFD y04, 3 * SIZE(Y1)
  2381. STFD y05, 4 * SIZE(Y1)
  2382. STFD y06, 5 * SIZE(Y1)
  2383. STFD y07, 6 * SIZE(Y1)
  2384. STFD y08, 7 * SIZE(Y1)
  2385. addi AO1, AO1, 8 * SIZE
  2386. addi Y1, Y1, 8 * SIZE
  2387. .align 4
  /* LL(46): single-column pass, 4-row tail.  Skipped (to LL(47)) unless
   * bit 2 of M is set.  Four elements at Y1 receive alpha1*AO1[i], then
   * AO1 and Y1 advance by 4 elements. */
  2388. LL(46):
  2389. andi. r0, M, 4
  2390. ble LL(47)
  2391. LFD y01, 0 * SIZE(Y1)
  2392. LFD y02, 1 * SIZE(Y1)
  2393. LFD y03, 2 * SIZE(Y1)
  2394. LFD y04, 3 * SIZE(Y1)
  2395. LFD a1, 0 * SIZE(AO1)
  2396. LFD a2, 1 * SIZE(AO1)
  2397. LFD a3, 2 * SIZE(AO1)
  2398. LFD a4, 3 * SIZE(AO1)
  2399. FMADD y01, alpha1, a1, y01
  2400. FMADD y02, alpha1, a2, y02
  2401. FMADD y03, alpha1, a3, y03
  2402. FMADD y04, alpha1, a4, y04
  2403. STFD y01, 0 * SIZE(Y1)
  2404. STFD y02, 1 * SIZE(Y1)
  2405. STFD y03, 2 * SIZE(Y1)
  2406. STFD y04, 3 * SIZE(Y1)
  2407. addi AO1, AO1, 4 * SIZE
  2408. addi Y1, Y1, 4 * SIZE
  2409. .align 4
  /* LL(47): single-column pass, 2-row tail.  Skipped (to LL(48)) unless
   * bit 1 of M is set.  Two elements at Y1 receive alpha1*AO1[i], then
   * AO1 and Y1 advance by 2 elements. */
  2410. LL(47):
  2411. andi. r0, M, 2
  2412. ble LL(48)
  2413. LFD y01, 0 * SIZE(Y1)
  2414. LFD y02, 1 * SIZE(Y1)
  2415. LFD a1, 0 * SIZE(AO1)
  2416. LFD a2, 1 * SIZE(AO1)
  2417. FMADD y01, alpha1, a1, y01
  2418. FMADD y02, alpha1, a2, y02
  2419. STFD y01, 0 * SIZE(Y1)
  2420. STFD y02, 1 * SIZE(Y1)
  2421. addi AO1, AO1, 2 * SIZE
  2422. addi Y1, Y1, 2 * SIZE
  2423. .align 4
  /* LL(48): single-column pass, last single row.  Skipped (to LL(990))
   * unless bit 0 of M is set.  One element at Y1 receives alpha1*AO1[0]. */
  2424. LL(48):
  2425. andi. r0, M, 1
  2426. ble LL(990)
  2427. LFD y01, 0 * SIZE(Y1)
  2428. LFD a1, 0 * SIZE(AO1)
  2429. FMADD y01, alpha1, a1, y01
  2430. STFD y01, 0 * SIZE(Y1)
  2431. .align 4
  /* LL(990): decide whether results must be merged back into a strided y.
   * When INCY equals SIZE nothing is left to do (branch to LL(999)).
   * Otherwise YY is reset to BUFFER, Y1 is set to Y (Y itself is used as
   * the read cursor, Y1 as the write cursor), and CTR = M >> 3 for the
   * 8-element loop at LL(991); no full group -> LL(995).
   * NOTE(review): presumably BUFFER holds the contiguous results produced
   * by the column passes in this case -- confirm against the set-up code
   * outside this chunk. */
  2432. LL(990):
  2433. cmpi cr0, 0, INCY, SIZE
  2434. beq LL(999)
  2435. mr YY, BUFFER
  2436. mr Y1, Y
  2437. srawi. r0, M, 3
  2438. mtspr CTR, r0
  2439. ble LL(995)
  2440. .align 4
  /* LL(991): strided merge, eight elements per iteration.
   * Reads eight values through Y (stepping by INCY), reads eight
   * contiguous values at YY, adds them pairwise (FADD is a macro defined
   * outside this chunk, presumably a floating-point add), and writes the
   * sums through Y1 (stepping by INCY).  Y1 trails Y over the same
   * sequence of addresses, so each element is read before it is rewritten.
   * f14 and f15 are used as temporaries here; LL(999) reloads both from
   * the stack before returning. */
  2441. LL(991):
  2442. LFD f0, 0 * SIZE(Y)
  2443. add Y, Y, INCY
  2444. LFD f1, 0 * SIZE(Y)
  2445. add Y, Y, INCY
  2446. LFD f2, 0 * SIZE(Y)
  2447. add Y, Y, INCY
  2448. LFD f3, 0 * SIZE(Y)
  2449. add Y, Y, INCY
  2450. LFD f4, 0 * SIZE(Y)
  2451. add Y, Y, INCY
  2452. LFD f5, 0 * SIZE(Y)
  2453. add Y, Y, INCY
  2454. LFD f6, 0 * SIZE(Y)
  2455. add Y, Y, INCY
  2456. LFD f7, 0 * SIZE(Y)
  2457. add Y, Y, INCY
  2458. LFD f8, 0 * SIZE(YY)
  2459. LFD f9, 1 * SIZE(YY)
  2460. LFD f10, 2 * SIZE(YY)
  2461. LFD f11, 3 * SIZE(YY)
  2462. LFD f12, 4 * SIZE(YY)
  2463. LFD f13, 5 * SIZE(YY)
  2464. LFD f14, 6 * SIZE(YY)
  2465. LFD f15, 7 * SIZE(YY)
  2466. addi YY, YY, 8 * SIZE
  2467. FADD f8, f8, f0
  2468. FADD f9, f9, f1
  2469. FADD f10, f10, f2
  2470. FADD f11, f11, f3
  2471. FADD f12, f12, f4
  2472. FADD f13, f13, f5
  2473. FADD f14, f14, f6
  2474. FADD f15, f15, f7
  2475. STFD f8, 0 * SIZE(Y1)
  2476. add Y1, Y1, INCY
  2477. STFD f9, 0 * SIZE(Y1)
  2478. add Y1, Y1, INCY
  2479. STFD f10, 0 * SIZE(Y1)
  2480. add Y1, Y1, INCY
  2481. STFD f11, 0 * SIZE(Y1)
  2482. add Y1, Y1, INCY
  2483. STFD f12, 0 * SIZE(Y1)
  2484. add Y1, Y1, INCY
  2485. STFD f13, 0 * SIZE(Y1)
  2486. add Y1, Y1, INCY
  2487. STFD f14, 0 * SIZE(Y1)
  2488. add Y1, Y1, INCY
  2489. STFD f15, 0 * SIZE(Y1)
  2490. add Y1, Y1, INCY
  2491. bdnz LL(991)
  2492. .align 4
  /* LL(995): strided merge, 4-element tail.  Skipped (to LL(996)) unless
   * bit 2 of M is set.  Four values read through Y (stride INCY) are added
   * to four contiguous values at YY and written through Y1 (stride INCY);
   * YY advances by 4 elements. */
  2493. LL(995):
  2494. andi. J, M, 4
  2495. ble LL(996)
  2496. LFD f0, 0 * SIZE(Y)
  2497. add Y, Y, INCY
  2498. LFD f1, 0 * SIZE(Y)
  2499. add Y, Y, INCY
  2500. LFD f2, 0 * SIZE(Y)
  2501. add Y, Y, INCY
  2502. LFD f3, 0 * SIZE(Y)
  2503. add Y, Y, INCY
  2504. LFD f8, 0 * SIZE(YY)
  2505. LFD f9, 1 * SIZE(YY)
  2506. LFD f10, 2 * SIZE(YY)
  2507. LFD f11, 3 * SIZE(YY)
  2508. addi YY, YY, 4 * SIZE
  2509. FADD f8, f8, f0
  2510. FADD f9, f9, f1
  2511. FADD f10, f10, f2
  2512. FADD f11, f11, f3
  2513. STFD f8, 0 * SIZE(Y1)
  2514. add Y1, Y1, INCY
  2515. STFD f9, 0 * SIZE(Y1)
  2516. add Y1, Y1, INCY
  2517. STFD f10, 0 * SIZE(Y1)
  2518. add Y1, Y1, INCY
  2519. STFD f11, 0 * SIZE(Y1)
  2520. add Y1, Y1, INCY
  2521. .align 4
  /* LL(996): strided merge, 2-element tail.  Skipped (to LL(997)) unless
   * bit 1 of M is set.  Two values read through Y (stride INCY) are added
   * to two contiguous values at YY and written through Y1 (stride INCY);
   * YY advances by 2 elements. */
  2522. LL(996):
  2523. andi. J, M, 2
  2524. ble LL(997)
  2525. LFD f0, 0 * SIZE(Y)
  2526. add Y, Y, INCY
  2527. LFD f1, 0 * SIZE(Y)
  2528. add Y, Y, INCY
  2529. LFD f8, 0 * SIZE(YY)
  2530. LFD f9, 1 * SIZE(YY)
  2531. addi YY, YY, 2 * SIZE
  2532. FADD f8, f8, f0
  2533. FADD f9, f9, f1
  2534. STFD f8, 0 * SIZE(Y1)
  2535. add Y1, Y1, INCY
  2536. STFD f9, 0 * SIZE(Y1)
  2537. add Y1, Y1, INCY
  2538. .align 4
  /* LL(997): strided merge, last single element.  Skipped (to LL(999))
   * unless bit 0 of M is set.  The value at Y is added to the value at YY
   * and written at Y1; no pointer is advanced afterwards. */
  2539. LL(997):
  2540. andi. J, M, 1
  2541. ble LL(999)
  2542. LFD f0, 0 * SIZE(Y)
  2543. LFD f8, 0 * SIZE(YY)
  2544. FADD f8, f8, f0
  2545. STFD f8, 0 * SIZE(Y1)
  2546. .align 4
  /* LL(999): common exit.  Sets r3 to 0 (presumably the routine's return
   * value), reloads f14..f31 from offsets 0..136 of SP and r14..r27 from
   * offset 144 onwards (8-byte slots when __64BIT__ is defined, 4-byte
   * slots otherwise), releases STACKSIZE bytes of stack and returns.
   * NOTE(review): the matching saves are outside this chunk; the offsets
   * here must stay in step with them. */
  2547. LL(999):
  2548. li r3, 0
  2549. lfd f14, 0(SP)
  2550. lfd f15, 8(SP)
  2551. lfd f16, 16(SP)
  2552. lfd f17, 24(SP)
  2553. lfd f18, 32(SP)
  2554. lfd f19, 40(SP)
  2555. lfd f20, 48(SP)
  2556. lfd f21, 56(SP)
  2557. lfd f22, 64(SP)
  2558. lfd f23, 72(SP)
  2559. lfd f24, 80(SP)
  2560. lfd f25, 88(SP)
  2561. lfd f26, 96(SP)
  2562. lfd f27, 104(SP)
  2563. lfd f28, 112(SP)
  2564. lfd f29, 120(SP)
  2565. lfd f30, 128(SP)
  2566. lfd f31, 136(SP)
  2567. #ifdef __64BIT__
  2568. ld r14, 144(SP)
  2569. ld r15, 152(SP)
  2570. ld r16, 160(SP)
  2571. ld r17, 168(SP)
  2572. ld r18, 176(SP)
  2573. ld r19, 184(SP)
  2574. ld r20, 192(SP)
  2575. ld r21, 200(SP)
  2576. ld r22, 208(SP)
  2577. ld r23, 216(SP)
  2578. ld r24, 224(SP)
  2579. ld r25, 232(SP)
  2580. ld r26, 240(SP)
  2581. ld r27, 248(SP)
  2582. #else
  2583. lwz r14, 144(SP)
  2584. lwz r15, 148(SP)
  2585. lwz r16, 152(SP)
  2586. lwz r17, 156(SP)
  2587. lwz r18, 160(SP)
  2588. lwz r19, 164(SP)
  2589. lwz r20, 168(SP)
  2590. lwz r21, 172(SP)
  2591. lwz r22, 176(SP)
  2592. lwz r23, 180(SP)
  2593. lwz r24, 184(SP)
  2594. lwz r25, 188(SP)
  2595. lwz r26, 192(SP)
  2596. lwz r27, 196(SP)
  2597. #endif
  /* Pop the frame, then return to the caller. */
  2598. addi SP, SP, STACKSIZE
  2599. blr
  2600. EPILOGUE
  2600. #endif