You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

gemv_t.S 57 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Linux builds: symbolic names for the general-purpose registers that carry
     the routine's integer and pointer arguments (M, N, A, LDA, X, INCX, Y,
     INCY).  The mapping differs between the 32-bit build and the 64-bit
     (__64BIT__) build. */
  40. #ifdef linux
  41. #ifndef __64BIT__
  /* 32-bit mapping.  The prologue loads INCY (r5) from the caller's frame at
     8 + STACKSIZE(SP); the other names here are used as they arrive.
     NOTE(review): presumably INCY does not fit in the argument registers on
     this ABI -- confirm against the C prototype. */
  42. #define M r3
  43. #define N r4
  44. #define A r6
  45. #define LDA r7
  46. #define X r8
  47. #define INCX r9
  48. #define Y r10
  49. #define INCY r5
  50. #else
  /* 64-bit mapping.  The prologue loads Y (r5) and INCY (r6) from the caller's
     frame at 112 + STACKSIZE(SP) and 120 + STACKSIZE(SP) respectively. */
  51. #define M r3
  52. #define N r4
  53. #define A r7
  54. #define LDA r8
  55. #define X r9
  56. #define INCX r10
  57. #define Y r5
  58. #define INCY r6
  59. #endif
  60. #endif
  /* AIX and Apple builds: symbolic names for the general-purpose registers
     that carry the routine's integer and pointer arguments. */
  61. #if defined(_AIX) || defined(__APPLE__)
  62. #if !defined(__64BIT__) && defined(DOUBLE)
  /* 32-bit double-precision mapping.  The prologue loads INCX (r5), Y (r6) and
     INCY (r7) from the caller's frame at 56, 60 and 64 + STACKSIZE(SP). */
  63. #define M r3
  64. #define N r4
  65. #define A r8
  66. #define LDA r9
  67. #define X r10
  68. #define INCX r5
  69. #define Y r6
  70. #define INCY r7
  71. #else
  /* All other AIX/Apple configurations (64-bit, or 32-bit without DOUBLE).
     The prologue loads Y (r5) and INCY (r6) from the caller's frame: at
     56/60 + STACKSIZE(SP) in the 32-bit case and 112/120 + STACKSIZE(SP) in
     the 64-bit case. */
  72. #define M r3
  73. #define N r4
  74. #define A r7
  75. #define LDA r8
  76. #define X r9
  77. #define INCX r10
  78. #define Y r5
  79. #define INCY r6
  80. #endif
  81. #endif
  /* Names for the working general-purpose registers.  r14..r29 are stored to
     the stack frame by the prologue before they are used. */
  /* BUFFER: loaded from the caller's frame in the prologue; X is copied here
     when its byte stride (INCX after scaling) is not equal to SIZE. */
  82. #define BUFFER r11
  /* XP: briefly holds the constant P in the prologue, then points at the x
     data for the current block (X + IS * SIZE, or BUFFER after a copy). */
  83. #define XP r12
  /* AO1..AO8: eight pointers into A spaced LDA bytes apart
     (A, A + LDA, ..., A + 7 * LDA), set up at the top of LL(11). */
  84. #define AO1 r14
  85. #define AO2 r15
  86. #define AO3 r16
  87. #define AO4 r17
  88. #define AO5 r18
  89. #define AO6 r19
  90. #define AO7 r20
  91. #define AO8 r21
  /* MIN_N: size of the current block, min(M - IS, P). */
  92. #define MIN_N r22
  /* J: set to N >> 3 before the LL(11) section. */
  93. #define J r23
  /* CO: store cursor while copying X into BUFFER; afterwards set to Y. */
  94. #define CO r24
  /* PREA / PREC: byte offsets PREFETCHSIZE_A * SIZE and PREFETCHSIZE_C * SIZE
     passed to DCBT. */
  95. #define PREA r25
  96. #define PREC r26
  /* BO: copy of XP taken at the top of LL(11); base address for the b1..b8
     loads. */
  97. #define BO r27
  /* PLDA_M: (LDA * N - P) scaled by BASE_SHIFT, computed in the prologue. */
  98. #define PLDA_M r28
  /* IS: start index of the current block; initialised to 0. */
  99. #define IS r29
  /* Y1 is an alias for CO (same register). */
  100. #define Y1 CO
  /* Per-CPU prefetch distances, in elements.  The prologue multiplies them by
     SIZE to form the byte offsets PREA (used with the AO1..AO8 pointers) and
     PREC (used with Y1) that are passed to DCBT.
     NOTE(review): no fallback is defined in this file when none of the CPU
     macros below is set; in that case PREFETCHSIZE_A / PREFETCHSIZE_C would be
     undefined where the prologue uses them, unless common.h provides them --
     confirm. */
  101. #if defined(PPCG4)
  102. #define PREFETCHSIZE_A 42
  103. #define PREFETCHSIZE_C 16
  104. #endif
  105. #if defined(PPC440) || defined(PPC440FP2)
  106. #define PREFETCHSIZE_A 42
  107. #define PREFETCHSIZE_C 16
  108. #endif
  109. #ifdef PPC970
  110. #define PREFETCHSIZE_A 42
  111. #define PREFETCHSIZE_C 16
  112. #endif
  113. #ifdef CELL
  114. #define PREFETCHSIZE_A 42
  115. #define PREFETCHSIZE_C 16
  116. #endif
  117. #ifdef POWER4
  118. #define PREFETCHSIZE_A 48
  119. #define PREFETCHSIZE_C 16
  120. #endif
  121. #ifdef POWER5
  122. #define PREFETCHSIZE_A 40
  123. #define PREFETCHSIZE_C 8
  124. #endif
  125. #ifdef POWER6
  126. #define PREFETCHSIZE_A 96
  127. #define PREFETCHSIZE_C 8
  128. #endif
  /* Names for the floating-point registers.  f14..f31 are stored to the stack
     frame by the prologue before they are used. */
  /* y01..y16 (f0..f15): accumulators.  They are cleared from FZERO at the top
     of LL(11) and are the destination/addend of every FMADD in the kernel.
     y01..y08 and y09..y16 are used alternately for successive x elements. */
  129. #define y01 f0
  130. #define y02 f1
  131. #define y03 f2
  132. #define y04 f3
  133. #define y05 f4
  134. #define y06 f5
  135. #define y07 f6
  136. #define y08 f7
  137. #define y09 f8
  138. #define y10 f9
  139. #define y11 f10
  140. #define y12 f11
  141. #define y13 f12
  142. #define y14 f13
  143. #define y15 f14
  144. #define y16 f15
  /* a1..a8 (f16..f23): elements loaded through AO1..AO8 respectively. */
  145. #define a1 f16
  146. #define a2 f17
  147. #define a3 f18
  148. #define a4 f19
  149. #define a5 f20
  150. #define a6 f21
  151. #define a7 f22
  152. #define a8 f23
  /* b1..b8 (f24..f31): eight consecutive x elements loaded through BO. */
  153. #define b1 f24
  154. #define b2 f25
  155. #define b3 f26
  156. #define b4 f27
  157. #define b5 f28
  158. #define b6 f29
  159. #define b7 f30
  160. #define b8 f31
  /* alpha deliberately names the same register as b8 (f31), so the two must
     not be live at the same time. */
  161. #define alpha f31
  162. #ifndef NEEDPARAM
  /* P: upper bound on the block size MIN_N processed per ISLoop pass; also
     used in the prologue when computing PLDA_M. */
  163. #define P 2048
  /* STACKSIZE: bytes subtracted from SP by the prologue.  Frame layout as
     written by the prologue:
       0..143    f14..f31 (18 x 8 bytes)
       144..151  FZERO
       152..159  ALPHA
       160..     r14..r29 (16 x 4 bytes = 224 total on 32-bit,
                           16 x 8 bytes = 288 total on 64-bit) */
  164. #ifndef __64BIT__
  165. #define STACKSIZE 224
  166. #else
  167. #define STACKSIZE 288
  168. #endif
  /* FZERO: 8-byte slot the prologue fills with zero from r0; reloaded with lfd
     to clear the accumulators. */
  169. #define FZERO 144(SP)
  /* ALPHA: slot where the prologue stores the incoming f1. */
  170. #define ALPHA 152(SP)
  171. PROLOGUE
  172. PROFCODE
  173. addi SP, SP, -STACKSIZE
  174. li r0, 0
  175. stfd f14, 0(SP)
  176. stfd f15, 8(SP)
  177. stfd f16, 16(SP)
  178. stfd f17, 24(SP)
  179. stfd f18, 32(SP)
  180. stfd f19, 40(SP)
  181. stfd f20, 48(SP)
  182. stfd f21, 56(SP)
  183. stfd f22, 64(SP)
  184. stfd f23, 72(SP)
  185. stfd f24, 80(SP)
  186. stfd f25, 88(SP)
  187. stfd f26, 96(SP)
  188. stfd f27, 104(SP)
  189. stfd f28, 112(SP)
  190. stfd f29, 120(SP)
  191. stfd f30, 128(SP)
  192. stfd f31, 136(SP)
  193. #ifdef __64BIT__
  194. std r0, FZERO
  195. stfd f1, ALPHA
  196. std r14, 160(SP)
  197. std r15, 168(SP)
  198. std r16, 176(SP)
  199. std r17, 184(SP)
  200. std r18, 192(SP)
  201. std r19, 200(SP)
  202. std r20, 208(SP)
  203. std r21, 216(SP)
  204. std r22, 224(SP)
  205. std r23, 232(SP)
  206. std r24, 240(SP)
  207. std r25, 248(SP)
  208. std r26, 256(SP)
  209. std r27, 264(SP)
  210. std r28, 272(SP)
  211. std r29, 280(SP)
  212. #else
  213. stw r0, 0 + FZERO
  214. stw r0, 4 + FZERO
  215. stfd f1, ALPHA
  216. stw r14, 160(SP)
  217. stw r15, 164(SP)
  218. stw r16, 168(SP)
  219. stw r17, 172(SP)
  220. stw r18, 176(SP)
  221. stw r19, 180(SP)
  222. stw r20, 184(SP)
  223. stw r21, 188(SP)
  224. stw r22, 192(SP)
  225. stw r23, 196(SP)
  226. stw r24, 200(SP)
  227. stw r25, 204(SP)
  228. stw r26, 208(SP)
  229. stw r27, 212(SP)
  230. stw r28, 216(SP)
  231. stw r29, 220(SP)
  232. #endif
  233. #ifdef linux
  234. #ifndef __64BIT__
  235. lwz INCY, 8 + STACKSIZE(SP)
  236. lwz BUFFER, 12 + STACKSIZE(SP)
  237. #else
  238. ld Y, 112 + STACKSIZE(SP)
  239. ld INCY, 120 + STACKSIZE(SP)
  240. ld BUFFER, 128 + STACKSIZE(SP)
  241. #endif
  242. #endif
  243. #if defined(_AIX) || defined(__APPLE__)
  244. #ifndef __64BIT__
  245. #ifdef DOUBLE
  246. lwz INCX, 56 + STACKSIZE(SP)
  247. lwz Y, 60 + STACKSIZE(SP)
  248. lwz INCY, 64 + STACKSIZE(SP)
  249. lwz BUFFER, 68 + STACKSIZE(SP)
  250. #else
  251. lwz Y, 56 + STACKSIZE(SP)
  252. lwz INCY, 60 + STACKSIZE(SP)
  253. lwz BUFFER, 64 + STACKSIZE(SP)
  254. #endif
  255. #else
  256. ld Y, 112 + STACKSIZE(SP)
  257. ld INCY, 120 + STACKSIZE(SP)
  258. ld BUFFER, 128 + STACKSIZE(SP)
  259. #endif
  260. #endif
  261. mullw PLDA_M, LDA, N
  262. li XP, P
  263. subf PLDA_M, XP, PLDA_M
  264. slwi PLDA_M, PLDA_M, BASE_SHIFT
  265. slwi LDA, LDA, BASE_SHIFT
  266. slwi INCX, INCX, BASE_SHIFT
  267. slwi INCY, INCY, BASE_SHIFT
  268. subf Y, INCY, Y
  269. li IS, 0
  270. addi A, A, -SIZE
  271. li PREA, PREFETCHSIZE_A * SIZE
  272. li PREC, PREFETCHSIZE_C * SIZE
  273. cmpi cr0, 0, M, 0
  274. ble LL(999)
  275. cmpi cr0, 0, N, 0
  276. ble LL(999)
  277. .align 4
  278. LL(ISLoop):
  279. subf MIN_N, IS, M
  280. slwi r0, IS, BASE_SHIFT
  281. cmpi cr0, 0, MIN_N, P
  282. ble+ LL(min_nP)
  283. li MIN_N, P
  284. LL(min_nP):
  285. add XP, X, r0
  286. cmpi cr0, 0, INCX, SIZE
  287. beq LL(10)
  288. mr XP, BUFFER
  289. addi CO, BUFFER, -SIZE
  290. srawi. r0, MIN_N, 3
  291. mtspr CTR, r0
  292. ble LL(CopyRemain)
  293. .align 4
  294. LL(CopyKernel):
  295. LFD f0, 0 * SIZE(X)
  296. add X, X, INCX
  297. LFD f1, 0 * SIZE(X)
  298. add X, X, INCX
  299. LFD f2, 0 * SIZE(X)
  300. add X, X, INCX
  301. LFD f3, 0 * SIZE(X)
  302. add X, X, INCX
  303. LFD f4, 0 * SIZE(X)
  304. add X, X, INCX
  305. LFD f5, 0 * SIZE(X)
  306. add X, X, INCX
  307. LFD f6, 0 * SIZE(X)
  308. add X, X, INCX
  309. LFD f7, 0 * SIZE(X)
  310. add X, X, INCX
  311. STFD f0, 1 * SIZE(CO)
  312. STFD f1, 2 * SIZE(CO)
  313. STFD f2, 3 * SIZE(CO)
  314. STFD f3, 4 * SIZE(CO)
  315. STFD f4, 5 * SIZE(CO)
  316. STFD f5, 6 * SIZE(CO)
  317. STFD f6, 7 * SIZE(CO)
  318. STFDU f7, 8 * SIZE(CO)
  319. bdnz LL(CopyKernel)
  320. .align 4
  321. LL(CopyRemain):
  322. andi. r0, MIN_N, 7
  323. mtspr CTR, r0
  324. ble LL(10)
  325. .align 4
  326. LL(CopySub):
  327. LFD f0, 0 * SIZE(X)
  328. add X, X, INCX
  329. STFDU f0, 1 * SIZE(CO)
  330. bdnz LL(CopySub)
  331. .align 4
  332. LL(10):
  333. mr CO, Y
  334. addi XP, XP, -SIZE
  335. srawi. J, N, 3
  336. ble LL(20)
  337. .align 4
  338. LL(11):
  339. mr AO1, A
  340. add AO2, A, LDA
  341. add AO3, AO2, LDA
  342. add AO4, AO3, LDA
  343. add AO5, AO4, LDA
  344. add AO6, AO5, LDA
  345. add AO7, AO6, LDA
  346. add AO8, AO7, LDA
  347. add A, AO8, LDA
  348. mr BO, XP
  349. lfd y01, FZERO
  350. fmr y02, y01
  351. fmr y03, y01
  352. fmr y04, y01
  353. fmr y05, y01
  354. fmr y06, y01
  355. fmr y07, y01
  356. fmr y08, y01
  357. fmr y09, y01
  358. fmr y10, y01
  359. fmr y11, y01
  360. fmr y12, y01
  361. fmr y13, y01
  362. fmr y14, y01
  363. fmr y15, y01
  364. fmr y16, y01
  365. DCBT(Y1, PREC)
  366. srawi. r0, MIN_N, 4
  367. mtspr CTR, r0
  368. ble LL(14)
  369. LFD a1, 1 * SIZE(AO1)
  370. LFD a2, 1 * SIZE(AO2)
  371. LFD a3, 1 * SIZE(AO3)
  372. LFD a4, 1 * SIZE(AO4)
  373. LFD a5, 1 * SIZE(AO5)
  374. LFD a6, 1 * SIZE(AO6)
  375. LFD a7, 1 * SIZE(AO7)
  376. LFD a8, 1 * SIZE(AO8)
  377. LFD b1, 1 * SIZE(BO)
  378. LFD b2, 2 * SIZE(BO)
  379. LFD b3, 3 * SIZE(BO)
  380. LFD b4, 4 * SIZE(BO)
  381. LFD b5, 5 * SIZE(BO)
  382. LFD b6, 6 * SIZE(BO)
  383. LFD b7, 7 * SIZE(BO)
  384. LFD b8, 8 * SIZE(BO)
  385. bdz LL(13)
  386. .align 4
  387. LL(12):
  388. FMADD y01, a1, b1, y01
  389. LFD a1, 2 * SIZE(AO1)
  390. FMADD y02, a2, b1, y02
  391. LFD a2, 2 * SIZE(AO2)
  392. FMADD y03, a3, b1, y03
  393. LFD a3, 2 * SIZE(AO3)
  394. FMADD y04, a4, b1, y04
  395. LFD a4, 2 * SIZE(AO4)
  396. FMADD y05, a5, b1, y05
  397. LFD a5, 2 * SIZE(AO5)
  398. FMADD y06, a6, b1, y06
  399. LFD a6, 2 * SIZE(AO6)
  400. FMADD y07, a7, b1, y07
  401. LFD a7, 2 * SIZE(AO7)
  402. FMADD y08, a8, b1, y08
  403. LFD a8, 2 * SIZE(AO8)
  404. FMADD y09, a1, b2, y09
  405. LFD a1, 3 * SIZE(AO1)
  406. FMADD y10, a2, b2, y10
  407. LFD a2, 3 * SIZE(AO2)
  408. FMADD y11, a3, b2, y11
  409. LFD a3, 3 * SIZE(AO3)
  410. FMADD y12, a4, b2, y12
  411. LFD a4, 3 * SIZE(AO4)
  412. FMADD y13, a5, b2, y13
  413. LFD a5, 3 * SIZE(AO5)
  414. FMADD y14, a6, b2, y14
  415. LFD a6, 3 * SIZE(AO6)
  416. FMADD y15, a7, b2, y15
  417. LFD a7, 3 * SIZE(AO7)
  418. FMADD y16, a8, b2, y16
  419. LFD a8, 3 * SIZE(AO8)
  420. FMADD y01, a1, b3, y01
  421. LFD a1, 4 * SIZE(AO1)
  422. FMADD y02, a2, b3, y02
  423. LFD a2, 4 * SIZE(AO2)
  424. FMADD y03, a3, b3, y03
  425. LFD a3, 4 * SIZE(AO3)
  426. FMADD y04, a4, b3, y04
  427. LFD a4, 4 * SIZE(AO4)
  428. FMADD y05, a5, b3, y05
  429. LFD a5, 4 * SIZE(AO5)
  430. FMADD y06, a6, b3, y06
  431. LFD a6, 4 * SIZE(AO6)
  432. FMADD y07, a7, b3, y07
  433. LFD a7, 4 * SIZE(AO7)
  434. FMADD y08, a8, b3, y08
  435. LFD a8, 4 * SIZE(AO8)
  436. FMADD y09, a1, b4, y09
  437. LFD a1, 5 * SIZE(AO1)
  438. FMADD y10, a2, b4, y10
  439. LFD a2, 5 * SIZE(AO2)
  440. FMADD y11, a3, b4, y11
  441. LFD a3, 5 * SIZE(AO3)
  442. FMADD y12, a4, b4, y12
  443. LFD a4, 5 * SIZE(AO4)
  444. FMADD y13, a5, b4, y13
  445. LFD a5, 5 * SIZE(AO5)
  446. FMADD y14, a6, b4, y14
  447. LFD a6, 5 * SIZE(AO6)
  448. FMADD y15, a7, b4, y15
  449. LFD a7, 5 * SIZE(AO7)
  450. FMADD y16, a8, b4, y16
  451. LFD a8, 5 * SIZE(AO8)
  452. LFD b1, 9 * SIZE(BO)
  453. LFD b2, 10 * SIZE(BO)
  454. LFD b3, 11 * SIZE(BO)
  455. LFD b4, 12 * SIZE(BO)
  456. FMADD y01, a1, b5, y01
  457. LFD a1, 6 * SIZE(AO1)
  458. FMADD y02, a2, b5, y02
  459. LFD a2, 6 * SIZE(AO2)
  460. FMADD y03, a3, b5, y03
  461. LFD a3, 6 * SIZE(AO3)
  462. FMADD y04, a4, b5, y04
  463. LFD a4, 6 * SIZE(AO4)
  464. FMADD y05, a5, b5, y05
  465. LFD a5, 6 * SIZE(AO5)
  466. FMADD y06, a6, b5, y06
  467. LFD a6, 6 * SIZE(AO6)
  468. FMADD y07, a7, b5, y07
  469. LFD a7, 6 * SIZE(AO7)
  470. FMADD y08, a8, b5, y08
  471. LFD a8, 6 * SIZE(AO8)
  472. FMADD y09, a1, b6, y09
  473. LFD a1, 7 * SIZE(AO1)
  474. FMADD y10, a2, b6, y10
  475. LFD a2, 7 * SIZE(AO2)
  476. FMADD y11, a3, b6, y11
  477. LFD a3, 7 * SIZE(AO3)
  478. FMADD y12, a4, b6, y12
  479. LFD a4, 7 * SIZE(AO4)
  480. FMADD y13, a5, b6, y13
  481. LFD a5, 7 * SIZE(AO5)
  482. FMADD y14, a6, b6, y14
  483. LFD a6, 7 * SIZE(AO6)
  484. FMADD y15, a7, b6, y15
  485. LFD a7, 7 * SIZE(AO7)
  486. FMADD y16, a8, b6, y16
  487. LFD a8, 7 * SIZE(AO8)
  488. FMADD y01, a1, b7, y01
  489. LFD a1, 8 * SIZE(AO1)
  490. FMADD y02, a2, b7, y02
  491. LFD a2, 8 * SIZE(AO2)
  492. FMADD y03, a3, b7, y03
  493. LFD a3, 8 * SIZE(AO3)
  494. FMADD y04, a4, b7, y04
  495. LFD a4, 8 * SIZE(AO4)
  496. FMADD y05, a5, b7, y05
  497. LFD a5, 8 * SIZE(AO5)
  498. FMADD y06, a6, b7, y06
  499. LFD a6, 8 * SIZE(AO6)
  500. FMADD y07, a7, b7, y07
  501. LFD a7, 8 * SIZE(AO7)
  502. FMADD y08, a8, b7, y08
  503. LFD a8, 8 * SIZE(AO8)
  504. FMADD y09, a1, b8, y09
  505. LFD a1, 9 * SIZE(AO1)
  506. FMADD y10, a2, b8, y10
  507. LFD a2, 9 * SIZE(AO2)
  508. FMADD y11, a3, b8, y11
  509. LFD a3, 9 * SIZE(AO3)
  510. FMADD y12, a4, b8, y12
  511. LFD a4, 9 * SIZE(AO4)
  512. FMADD y13, a5, b8, y13
  513. LFD a5, 9 * SIZE(AO5)
  514. FMADD y14, a6, b8, y14
  515. LFD a6, 9 * SIZE(AO6)
  516. FMADD y15, a7, b8, y15
  517. LFD a7, 9 * SIZE(AO7)
  518. FMADD y16, a8, b8, y16
  519. LFD a8, 9 * SIZE(AO8)
  520. LFD b5, 13 * SIZE(BO)
  521. LFD b6, 14 * SIZE(BO)
  522. LFD b7, 15 * SIZE(BO)
  523. LFD b8, 16 * SIZE(BO)
  524. DCBT(AO1, PREA)
  525. DCBT(AO2, PREA)
  526. DCBT(AO3, PREA)
  527. DCBT(AO4, PREA)
  528. FMADD y01, a1, b1, y01
  529. LFD a1, 10 * SIZE(AO1)
  530. FMADD y02, a2, b1, y02
  531. LFD a2, 10 * SIZE(AO2)
  532. FMADD y03, a3, b1, y03
  533. LFD a3, 10 * SIZE(AO3)
  534. FMADD y04, a4, b1, y04
  535. LFD a4, 10 * SIZE(AO4)
  536. FMADD y05, a5, b1, y05
  537. LFD a5, 10 * SIZE(AO5)
  538. FMADD y06, a6, b1, y06
  539. LFD a6, 10 * SIZE(AO6)
  540. FMADD y07, a7, b1, y07
  541. LFD a7, 10 * SIZE(AO7)
  542. FMADD y08, a8, b1, y08
  543. LFD a8, 10 * SIZE(AO8)
  544. FMADD y09, a1, b2, y09
  545. LFD a1, 11 * SIZE(AO1)
  546. FMADD y10, a2, b2, y10
  547. LFD a2, 11 * SIZE(AO2)
  548. FMADD y11, a3, b2, y11
  549. LFD a3, 11 * SIZE(AO3)
  550. FMADD y12, a4, b2, y12
  551. LFD a4, 11 * SIZE(AO4)
  552. FMADD y13, a5, b2, y13
  553. LFD a5, 11 * SIZE(AO5)
  554. FMADD y14, a6, b2, y14
  555. LFD a6, 11 * SIZE(AO6)
  556. FMADD y15, a7, b2, y15
  557. LFD a7, 11 * SIZE(AO7)
  558. FMADD y16, a8, b2, y16
  559. LFD a8, 11 * SIZE(AO8)
  560. FMADD y01, a1, b3, y01
  561. LFD a1, 12 * SIZE(AO1)
  562. FMADD y02, a2, b3, y02
  563. LFD a2, 12 * SIZE(AO2)
  564. FMADD y03, a3, b3, y03
  565. LFD a3, 12 * SIZE(AO3)
  566. FMADD y04, a4, b3, y04
  567. LFD a4, 12 * SIZE(AO4)
  568. FMADD y05, a5, b3, y05
  569. LFD a5, 12 * SIZE(AO5)
  570. FMADD y06, a6, b3, y06
  571. LFD a6, 12 * SIZE(AO6)
  572. FMADD y07, a7, b3, y07
  573. LFD a7, 12 * SIZE(AO7)
  574. FMADD y08, a8, b3, y08
  575. LFD a8, 12 * SIZE(AO8)
  576. FMADD y09, a1, b4, y09
  577. LFD a1, 13 * SIZE(AO1)
  578. FMADD y10, a2, b4, y10
  579. LFD a2, 13 * SIZE(AO2)
  580. FMADD y11, a3, b4, y11
  581. LFD a3, 13 * SIZE(AO3)
  582. FMADD y12, a4, b4, y12
  583. LFD a4, 13 * SIZE(AO4)
  584. FMADD y13, a5, b4, y13
  585. LFD a5, 13 * SIZE(AO5)
  586. FMADD y14, a6, b4, y14
  587. LFD a6, 13 * SIZE(AO6)
  588. FMADD y15, a7, b4, y15
  589. LFD a7, 13 * SIZE(AO7)
  590. FMADD y16, a8, b4, y16
  591. LFD a8, 13 * SIZE(AO8)
  592. LFD b1, 17 * SIZE(BO)
  593. LFD b2, 18 * SIZE(BO)
  594. LFD b3, 19 * SIZE(BO)
  595. LFD b4, 20 * SIZE(BO)
  596. FMADD y01, a1, b5, y01
  597. LFD a1, 14 * SIZE(AO1)
  598. FMADD y02, a2, b5, y02
  599. LFD a2, 14 * SIZE(AO2)
  600. FMADD y03, a3, b5, y03
  601. LFD a3, 14 * SIZE(AO3)
  602. FMADD y04, a4, b5, y04
  603. LFD a4, 14 * SIZE(AO4)
  604. FMADD y05, a5, b5, y05
  605. LFD a5, 14 * SIZE(AO5)
  606. FMADD y06, a6, b5, y06
  607. LFD a6, 14 * SIZE(AO6)
  608. FMADD y07, a7, b5, y07
  609. LFD a7, 14 * SIZE(AO7)
  610. FMADD y08, a8, b5, y08
  611. LFD a8, 14 * SIZE(AO8)
  612. FMADD y09, a1, b6, y09
  613. LFD a1, 15 * SIZE(AO1)
  614. FMADD y10, a2, b6, y10
  615. LFD a2, 15 * SIZE(AO2)
  616. FMADD y11, a3, b6, y11
  617. LFD a3, 15 * SIZE(AO3)
  618. FMADD y12, a4, b6, y12
  619. LFD a4, 15 * SIZE(AO4)
  620. FMADD y13, a5, b6, y13
  621. LFD a5, 15 * SIZE(AO5)
  622. FMADD y14, a6, b6, y14
  623. LFD a6, 15 * SIZE(AO6)
  624. FMADD y15, a7, b6, y15
  625. LFD a7, 15 * SIZE(AO7)
  626. FMADD y16, a8, b6, y16
  627. LFD a8, 15 * SIZE(AO8)
  628. FMADD y01, a1, b7, y01
  629. LFD a1, 16 * SIZE(AO1)
  630. FMADD y02, a2, b7, y02
  631. LFD a2, 16 * SIZE(AO2)
  632. FMADD y03, a3, b7, y03
  633. LFD a3, 16 * SIZE(AO3)
  634. FMADD y04, a4, b7, y04
  635. LFD a4, 16 * SIZE(AO4)
  636. FMADD y05, a5, b7, y05
  637. LFD a5, 16 * SIZE(AO5)
  638. FMADD y06, a6, b7, y06
  639. LFD a6, 16 * SIZE(AO6)
  640. FMADD y07, a7, b7, y07
  641. LFD a7, 16 * SIZE(AO7)
  642. FMADD y08, a8, b7, y08
  643. LFD a8, 16 * SIZE(AO8)
  644. FMADD y09, a1, b8, y09
  645. LFD a1, 17 * SIZE(AO1)
  646. FMADD y10, a2, b8, y10
  647. LFD a2, 17 * SIZE(AO2)
  648. FMADD y11, a3, b8, y11
  649. LFD a3, 17 * SIZE(AO3)
  650. FMADD y12, a4, b8, y12
  651. LFD a4, 17 * SIZE(AO4)
  652. addi AO1, AO1, 16 * SIZE
  653. addi AO2, AO2, 16 * SIZE
  654. addi AO3, AO3, 16 * SIZE
  655. addi AO4, AO4, 16 * SIZE
  656. FMADD y13, a5, b8, y13
  657. LFD a5, 17 * SIZE(AO5)
  658. FMADD y14, a6, b8, y14
  659. LFD a6, 17 * SIZE(AO6)
  660. FMADD y15, a7, b8, y15
  661. LFD a7, 17 * SIZE(AO7)
  662. FMADD y16, a8, b8, y16
  663. LFD a8, 17 * SIZE(AO8)
  664. LFD b5, 21 * SIZE(BO)
  665. LFD b6, 22 * SIZE(BO)
  666. LFD b7, 23 * SIZE(BO)
  667. LFD b8, 24 * SIZE(BO)
  668. addi AO5, AO5, 16 * SIZE
  669. addi AO6, AO6, 16 * SIZE
  670. DCBT(AO5, PREA)
  671. DCBT(AO6, PREA)
  672. addi AO7, AO7, 16 * SIZE
  673. addi AO8, AO8, 16 * SIZE
  674. DCBT(AO7, PREA)
  675. DCBT(AO8, PREA)
  676. addi BO, BO, 16 * SIZE
  677. bdnz LL(12)
  678. .align 4
  679. LL(13):
  680. FMADD y01, a1, b1, y01
  681. LFD a1, 2 * SIZE(AO1)
  682. FMADD y02, a2, b1, y02
  683. LFD a2, 2 * SIZE(AO2)
  684. FMADD y03, a3, b1, y03
  685. LFD a3, 2 * SIZE(AO3)
  686. FMADD y04, a4, b1, y04
  687. LFD a4, 2 * SIZE(AO4)
  688. FMADD y05, a5, b1, y05
  689. LFD a5, 2 * SIZE(AO5)
  690. FMADD y06, a6, b1, y06
  691. LFD a6, 2 * SIZE(AO6)
  692. FMADD y07, a7, b1, y07
  693. LFD a7, 2 * SIZE(AO7)
  694. FMADD y08, a8, b1, y08
  695. LFD a8, 2 * SIZE(AO8)
  696. FMADD y09, a1, b2, y09
  697. LFD a1, 3 * SIZE(AO1)
  698. FMADD y10, a2, b2, y10
  699. LFD a2, 3 * SIZE(AO2)
  700. FMADD y11, a3, b2, y11
  701. LFD a3, 3 * SIZE(AO3)
  702. FMADD y12, a4, b2, y12
  703. LFD a4, 3 * SIZE(AO4)
  704. FMADD y13, a5, b2, y13
  705. LFD a5, 3 * SIZE(AO5)
  706. FMADD y14, a6, b2, y14
  707. LFD a6, 3 * SIZE(AO6)
  708. FMADD y15, a7, b2, y15
  709. LFD a7, 3 * SIZE(AO7)
  710. FMADD y16, a8, b2, y16
  711. LFD a8, 3 * SIZE(AO8)
  712. FMADD y01, a1, b3, y01
  713. LFD a1, 4 * SIZE(AO1)
  714. FMADD y02, a2, b3, y02
  715. LFD a2, 4 * SIZE(AO2)
  716. FMADD y03, a3, b3, y03
  717. LFD a3, 4 * SIZE(AO3)
  718. FMADD y04, a4, b3, y04
  719. LFD a4, 4 * SIZE(AO4)
  720. FMADD y05, a5, b3, y05
  721. LFD a5, 4 * SIZE(AO5)
  722. FMADD y06, a6, b3, y06
  723. LFD a6, 4 * SIZE(AO6)
  724. FMADD y07, a7, b3, y07
  725. LFD a7, 4 * SIZE(AO7)
  726. FMADD y08, a8, b3, y08
  727. LFD a8, 4 * SIZE(AO8)
  728. FMADD y09, a1, b4, y09
  729. LFD a1, 5 * SIZE(AO1)
  730. FMADD y10, a2, b4, y10
  731. LFD a2, 5 * SIZE(AO2)
  732. FMADD y11, a3, b4, y11
  733. LFD a3, 5 * SIZE(AO3)
  734. FMADD y12, a4, b4, y12
  735. LFD a4, 5 * SIZE(AO4)
  736. FMADD y13, a5, b4, y13
  737. LFD a5, 5 * SIZE(AO5)
  738. FMADD y14, a6, b4, y14
  739. LFD a6, 5 * SIZE(AO6)
  740. FMADD y15, a7, b4, y15
  741. LFD a7, 5 * SIZE(AO7)
  742. FMADD y16, a8, b4, y16
  743. LFD a8, 5 * SIZE(AO8)
  744. LFD b1, 9 * SIZE(BO)
  745. LFD b2, 10 * SIZE(BO)
  746. LFD b3, 11 * SIZE(BO)
  747. LFD b4, 12 * SIZE(BO)
  748. FMADD y01, a1, b5, y01
  749. LFD a1, 6 * SIZE(AO1)
  750. FMADD y02, a2, b5, y02
  751. LFD a2, 6 * SIZE(AO2)
  752. FMADD y03, a3, b5, y03
  753. LFD a3, 6 * SIZE(AO3)
  754. FMADD y04, a4, b5, y04
  755. LFD a4, 6 * SIZE(AO4)
  756. FMADD y05, a5, b5, y05
  757. LFD a5, 6 * SIZE(AO5)
  758. FMADD y06, a6, b5, y06
  759. LFD a6, 6 * SIZE(AO6)
  760. FMADD y07, a7, b5, y07
  761. LFD a7, 6 * SIZE(AO7)
  762. FMADD y08, a8, b5, y08
  763. LFD a8, 6 * SIZE(AO8)
  764. FMADD y09, a1, b6, y09
  765. LFD a1, 7 * SIZE(AO1)
  766. FMADD y10, a2, b6, y10
  767. LFD a2, 7 * SIZE(AO2)
  768. FMADD y11, a3, b6, y11
  769. LFD a3, 7 * SIZE(AO3)
  770. FMADD y12, a4, b6, y12
  771. LFD a4, 7 * SIZE(AO4)
  772. FMADD y13, a5, b6, y13
  773. LFD a5, 7 * SIZE(AO5)
  774. FMADD y14, a6, b6, y14
  775. LFD a6, 7 * SIZE(AO6)
  776. FMADD y15, a7, b6, y15
  777. LFD a7, 7 * SIZE(AO7)
  778. FMADD y16, a8, b6, y16
  779. LFD a8, 7 * SIZE(AO8)
  780. FMADD y01, a1, b7, y01
  781. LFD a1, 8 * SIZE(AO1)
  782. FMADD y02, a2, b7, y02
  783. LFD a2, 8 * SIZE(AO2)
  784. FMADD y03, a3, b7, y03
  785. LFD a3, 8 * SIZE(AO3)
  786. FMADD y04, a4, b7, y04
  787. LFD a4, 8 * SIZE(AO4)
  788. FMADD y05, a5, b7, y05
  789. LFD a5, 8 * SIZE(AO5)
  790. FMADD y06, a6, b7, y06
  791. LFD a6, 8 * SIZE(AO6)
  792. FMADD y07, a7, b7, y07
  793. LFD a7, 8 * SIZE(AO7)
  794. FMADD y08, a8, b7, y08
  795. LFD a8, 8 * SIZE(AO8)
  796. FMADD y09, a1, b8, y09
  797. LFD a1, 9 * SIZE(AO1)
  798. FMADD y10, a2, b8, y10
  799. LFD a2, 9 * SIZE(AO2)
  800. FMADD y11, a3, b8, y11
  801. LFD a3, 9 * SIZE(AO3)
  802. FMADD y12, a4, b8, y12
  803. LFD a4, 9 * SIZE(AO4)
  804. FMADD y13, a5, b8, y13
  805. LFD a5, 9 * SIZE(AO5)
  806. FMADD y14, a6, b8, y14
  807. LFD a6, 9 * SIZE(AO6)
  808. FMADD y15, a7, b8, y15
  809. LFD a7, 9 * SIZE(AO7)
  810. FMADD y16, a8, b8, y16
  811. LFD a8, 9 * SIZE(AO8)
  812. LFD b5, 13 * SIZE(BO)
  813. LFD b6, 14 * SIZE(BO)
  814. LFD b7, 15 * SIZE(BO)
  815. LFD b8, 16 * SIZE(BO)
  816. FMADD y01, a1, b1, y01
  817. LFD a1, 10 * SIZE(AO1)
  818. FMADD y02, a2, b1, y02
  819. LFD a2, 10 * SIZE(AO2)
  820. FMADD y03, a3, b1, y03
  821. LFD a3, 10 * SIZE(AO3)
  822. FMADD y04, a4, b1, y04
  823. LFD a4, 10 * SIZE(AO4)
  824. FMADD y05, a5, b1, y05
  825. LFD a5, 10 * SIZE(AO5)
  826. FMADD y06, a6, b1, y06
  827. LFD a6, 10 * SIZE(AO6)
  828. FMADD y07, a7, b1, y07
  829. LFD a7, 10 * SIZE(AO7)
  830. FMADD y08, a8, b1, y08
  831. LFD a8, 10 * SIZE(AO8)
  832. FMADD y09, a1, b2, y09
  833. LFD a1, 11 * SIZE(AO1)
  834. FMADD y10, a2, b2, y10
  835. LFD a2, 11 * SIZE(AO2)
  836. FMADD y11, a3, b2, y11
  837. LFD a3, 11 * SIZE(AO3)
  838. FMADD y12, a4, b2, y12
  839. LFD a4, 11 * SIZE(AO4)
  840. FMADD y13, a5, b2, y13
  841. LFD a5, 11 * SIZE(AO5)
  842. FMADD y14, a6, b2, y14
  843. LFD a6, 11 * SIZE(AO6)
  844. FMADD y15, a7, b2, y15
  845. LFD a7, 11 * SIZE(AO7)
  846. FMADD y16, a8, b2, y16
  847. LFD a8, 11 * SIZE(AO8)
  848. FMADD y01, a1, b3, y01
  849. LFD a1, 12 * SIZE(AO1)
  850. FMADD y02, a2, b3, y02
  851. LFD a2, 12 * SIZE(AO2)
  852. FMADD y03, a3, b3, y03
  853. LFD a3, 12 * SIZE(AO3)
  854. FMADD y04, a4, b3, y04
  855. LFD a4, 12 * SIZE(AO4)
  856. FMADD y05, a5, b3, y05
  857. LFD a5, 12 * SIZE(AO5)
  858. FMADD y06, a6, b3, y06
  859. LFD a6, 12 * SIZE(AO6)
  860. FMADD y07, a7, b3, y07
  861. LFD a7, 12 * SIZE(AO7)
  862. FMADD y08, a8, b3, y08
  863. LFD a8, 12 * SIZE(AO8)
  864. FMADD y09, a1, b4, y09
  865. LFD a1, 13 * SIZE(AO1)
  866. FMADD y10, a2, b4, y10
  867. LFD a2, 13 * SIZE(AO2)
  868. FMADD y11, a3, b4, y11
  869. LFD a3, 13 * SIZE(AO3)
  870. FMADD y12, a4, b4, y12
  871. LFD a4, 13 * SIZE(AO4)
  872. FMADD y13, a5, b4, y13
  873. LFD a5, 13 * SIZE(AO5)
  874. FMADD y14, a6, b4, y14
  875. LFD a6, 13 * SIZE(AO6)
  876. FMADD y15, a7, b4, y15
  877. LFD a7, 13 * SIZE(AO7)
  878. FMADD y16, a8, b4, y16
  879. LFD a8, 13 * SIZE(AO8)
  880. FMADD y01, a1, b5, y01
  881. LFD a1, 14 * SIZE(AO1)
  882. FMADD y02, a2, b5, y02
  883. LFD a2, 14 * SIZE(AO2)
  884. FMADD y03, a3, b5, y03
  885. LFD a3, 14 * SIZE(AO3)
  886. FMADD y04, a4, b5, y04
  887. LFD a4, 14 * SIZE(AO4)
  888. FMADD y05, a5, b5, y05
  889. LFD a5, 14 * SIZE(AO5)
  890. FMADD y06, a6, b5, y06
  891. LFD a6, 14 * SIZE(AO6)
  892. FMADD y07, a7, b5, y07
  893. LFD a7, 14 * SIZE(AO7)
  894. FMADD y08, a8, b5, y08
  895. LFD a8, 14 * SIZE(AO8)
  896. FMADD y09, a1, b6, y09
  897. LFD a1, 15 * SIZE(AO1)
  898. FMADD y10, a2, b6, y10
  899. LFD a2, 15 * SIZE(AO2)
  900. FMADD y11, a3, b6, y11
  901. LFD a3, 15 * SIZE(AO3)
  902. FMADD y12, a4, b6, y12
  903. LFD a4, 15 * SIZE(AO4)
  904. FMADD y13, a5, b6, y13
  905. LFD a5, 15 * SIZE(AO5)
  906. FMADD y14, a6, b6, y14
  907. LFD a6, 15 * SIZE(AO6)
  908. FMADD y15, a7, b6, y15
  909. LFD a7, 15 * SIZE(AO7)
  910. FMADD y16, a8, b6, y16
  911. LFD a8, 15 * SIZE(AO8)
  912. FMADD y01, a1, b7, y01
  913. LFD a1, 16 * SIZE(AO1)
  914. FMADD y02, a2, b7, y02
  915. LFD a2, 16 * SIZE(AO2)
  916. FMADD y03, a3, b7, y03
  917. LFD a3, 16 * SIZE(AO3)
  918. FMADD y04, a4, b7, y04
  919. LFD a4, 16 * SIZE(AO4)
  920. FMADD y05, a5, b7, y05
  921. LFD a5, 16 * SIZE(AO5)
  922. FMADD y06, a6, b7, y06
  923. LFD a6, 16 * SIZE(AO6)
  924. FMADD y07, a7, b7, y07
  925. LFD a7, 16 * SIZE(AO7)
  926. FMADD y08, a8, b7, y08
  927. LFD a8, 16 * SIZE(AO8)
  928. FMADD y09, a1, b8, y09
  929. FMADD y10, a2, b8, y10
  930. FMADD y11, a3, b8, y11
  931. FMADD y12, a4, b8, y12
  932. addi AO1, AO1, 16 * SIZE
  933. addi AO2, AO2, 16 * SIZE
  934. addi AO3, AO3, 16 * SIZE
  935. addi AO4, AO4, 16 * SIZE
  936. FMADD y13, a5, b8, y13
  937. FMADD y14, a6, b8, y14
  938. FMADD y15, a7, b8, y15
  939. FMADD y16, a8, b8, y16
  940. addi AO5, AO5, 16 * SIZE
  941. addi AO6, AO6, 16 * SIZE
  942. addi AO7, AO7, 16 * SIZE
  943. addi AO8, AO8, 16 * SIZE
  944. addi BO, BO, 16 * SIZE
  /*
   * LL(14): leftover handling for the eight-pointer (AO1..AO8) kernel.
   * Branches to LL(18) when none of the low four bits of MIN_N is set;
   * otherwise processes one group of 8 elements here when bit value 8 of
   * MIN_N is set, and falls through to LL(15) afterwards.
   * andi. with a mask cannot produce a negative value, so each `ble` below
   * is taken exactly when the masked result is zero.
   * NOTE(review): operand offsets start at 1 * SIZE, which suggests the
   * AOn/BO pointers are biased one element low -- confirm against the setup
   * code outside this section.
   */
  945. .align 4
  946. LL(14):
  947. andi. r0, MIN_N, 15
  948. ble LL(18)
  949. andi. r0, MIN_N, 8
  950. ble LL(15)
  /* Preload element 1 from every AOn stream and the first four BO values. */
  951. LFD a1, 1 * SIZE(AO1)
  952. LFD b1, 1 * SIZE(BO)
  953. LFD a2, 1 * SIZE(AO2)
  954. LFD a3, 1 * SIZE(AO3)
  955. LFD a4, 1 * SIZE(AO4)
  956. LFD a5, 1 * SIZE(AO5)
  957. LFD a6, 1 * SIZE(AO6)
  958. LFD a7, 1 * SIZE(AO7)
  959. LFD a8, 1 * SIZE(AO8)
  960. LFD b2, 2 * SIZE(BO)
  961. LFD b3, 3 * SIZE(BO)
  962. LFD b4, 4 * SIZE(BO)
  /*
   * Steps using b1/b3/b5/b7 accumulate into y01..y08, steps using
   * b2/b4/b6/b8 accumulate into y09..y16.  Each LFD refills the a-register
   * that the preceding FMADD just consumed with the next element.
   */
  963. FMADD y01, a1, b1, y01
  964. LFD a1, 2 * SIZE(AO1)
  965. FMADD y02, a2, b1, y02
  966. LFD a2, 2 * SIZE(AO2)
  967. FMADD y03, a3, b1, y03
  968. LFD a3, 2 * SIZE(AO3)
  969. FMADD y04, a4, b1, y04
  970. LFD a4, 2 * SIZE(AO4)
  971. FMADD y05, a5, b1, y05
  972. LFD a5, 2 * SIZE(AO5)
  973. FMADD y06, a6, b1, y06
  974. LFD a6, 2 * SIZE(AO6)
  975. FMADD y07, a7, b1, y07
  976. LFD a7, 2 * SIZE(AO7)
  977. FMADD y08, a8, b1, y08
  978. LFD a8, 2 * SIZE(AO8)
  979. FMADD y09, a1, b2, y09
  980. LFD a1, 3 * SIZE(AO1)
  981. FMADD y10, a2, b2, y10
  982. LFD a2, 3 * SIZE(AO2)
  983. FMADD y11, a3, b2, y11
  984. LFD a3, 3 * SIZE(AO3)
  985. FMADD y12, a4, b2, y12
  986. LFD a4, 3 * SIZE(AO4)
  987. FMADD y13, a5, b2, y13
  988. LFD a5, 3 * SIZE(AO5)
  989. FMADD y14, a6, b2, y14
  990. LFD a6, 3 * SIZE(AO6)
  991. FMADD y15, a7, b2, y15
  992. LFD a7, 3 * SIZE(AO7)
  993. FMADD y16, a8, b2, y16
  994. LFD a8, 3 * SIZE(AO8)
  /* Fetch BO elements 5..8 for the second half of this 8-element group. */
  995. LFD b5, 5 * SIZE(BO)
  996. LFD b6, 6 * SIZE(BO)
  997. LFD b7, 7 * SIZE(BO)
  998. LFD b8, 8 * SIZE(BO)
  999. FMADD y01, a1, b3, y01
  1000. LFD a1, 4 * SIZE(AO1)
  1001. FMADD y02, a2, b3, y02
  1002. LFD a2, 4 * SIZE(AO2)
  1003. FMADD y03, a3, b3, y03
  1004. LFD a3, 4 * SIZE(AO3)
  1005. FMADD y04, a4, b3, y04
  1006. LFD a4, 4 * SIZE(AO4)
  1007. FMADD y05, a5, b3, y05
  1008. LFD a5, 4 * SIZE(AO5)
  1009. FMADD y06, a6, b3, y06
  1010. LFD a6, 4 * SIZE(AO6)
  1011. FMADD y07, a7, b3, y07
  1012. LFD a7, 4 * SIZE(AO7)
  1013. FMADD y08, a8, b3, y08
  1014. LFD a8, 4 * SIZE(AO8)
  1015. FMADD y09, a1, b4, y09
  1016. LFD a1, 5 * SIZE(AO1)
  1017. FMADD y10, a2, b4, y10
  1018. LFD a2, 5 * SIZE(AO2)
  1019. FMADD y11, a3, b4, y11
  1020. LFD a3, 5 * SIZE(AO3)
  1021. FMADD y12, a4, b4, y12
  1022. LFD a4, 5 * SIZE(AO4)
  1023. FMADD y13, a5, b4, y13
  1024. LFD a5, 5 * SIZE(AO5)
  1025. FMADD y14, a6, b4, y14
  1026. LFD a6, 5 * SIZE(AO6)
  1027. FMADD y15, a7, b4, y15
  1028. LFD a7, 5 * SIZE(AO7)
  1029. FMADD y16, a8, b4, y16
  1030. LFD a8, 5 * SIZE(AO8)
  1031. FMADD y01, a1, b5, y01
  1032. LFD a1, 6 * SIZE(AO1)
  1033. FMADD y02, a2, b5, y02
  1034. LFD a2, 6 * SIZE(AO2)
  1035. FMADD y03, a3, b5, y03
  1036. LFD a3, 6 * SIZE(AO3)
  1037. FMADD y04, a4, b5, y04
  1038. LFD a4, 6 * SIZE(AO4)
  1039. FMADD y05, a5, b5, y05
  1040. LFD a5, 6 * SIZE(AO5)
  1041. FMADD y06, a6, b5, y06
  1042. LFD a6, 6 * SIZE(AO6)
  1043. FMADD y07, a7, b5, y07
  1044. LFD a7, 6 * SIZE(AO7)
  1045. FMADD y08, a8, b5, y08
  1046. LFD a8, 6 * SIZE(AO8)
  1047. FMADD y09, a1, b6, y09
  1048. LFD a1, 7 * SIZE(AO1)
  1049. FMADD y10, a2, b6, y10
  1050. LFD a2, 7 * SIZE(AO2)
  1051. FMADD y11, a3, b6, y11
  1052. LFD a3, 7 * SIZE(AO3)
  1053. FMADD y12, a4, b6, y12
  1054. LFD a4, 7 * SIZE(AO4)
  1055. FMADD y13, a5, b6, y13
  1056. LFD a5, 7 * SIZE(AO5)
  1057. FMADD y14, a6, b6, y14
  1058. LFD a6, 7 * SIZE(AO6)
  1059. FMADD y15, a7, b6, y15
  1060. LFD a7, 7 * SIZE(AO7)
  1061. FMADD y16, a8, b6, y16
  1062. LFD a8, 7 * SIZE(AO8)
  1063. FMADD y01, a1, b7, y01
  1064. LFD a1, 8 * SIZE(AO1)
  1065. FMADD y02, a2, b7, y02
  1066. LFD a2, 8 * SIZE(AO2)
  1067. FMADD y03, a3, b7, y03
  1068. LFD a3, 8 * SIZE(AO3)
  1069. FMADD y04, a4, b7, y04
  1070. LFD a4, 8 * SIZE(AO4)
  1071. FMADD y05, a5, b7, y05
  1072. LFD a5, 8 * SIZE(AO5)
  1073. FMADD y06, a6, b7, y06
  1074. LFD a6, 8 * SIZE(AO6)
  1075. FMADD y07, a7, b7, y07
  1076. LFD a7, 8 * SIZE(AO7)
  1077. FMADD y08, a8, b7, y08
  1078. LFD a8, 8 * SIZE(AO8)
  /* Final step: nothing left to load, so the pointer bumps are interleaved
     with the last FMADDs; every stream moves forward by 8 elements. */
  1079. FMADD y09, a1, b8, y09
  1080. addi AO1, AO1, 8 * SIZE
  1081. FMADD y10, a2, b8, y10
  1082. addi AO2, AO2, 8 * SIZE
  1083. FMADD y11, a3, b8, y11
  1084. addi AO3, AO3, 8 * SIZE
  1085. FMADD y12, a4, b8, y12
  1086. addi AO4, AO4, 8 * SIZE
  1087. FMADD y13, a5, b8, y13
  1088. addi AO5, AO5, 8 * SIZE
  1089. FMADD y14, a6, b8, y14
  1090. addi AO6, AO6, 8 * SIZE
  1091. FMADD y15, a7, b8, y15
  1092. addi AO7, AO7, 8 * SIZE
  1093. FMADD y16, a8, b8, y16
  1094. addi AO8, AO8, 8 * SIZE
  1095. addi BO, BO, 8 * SIZE
  /*
   * LL(15): eight-pointer kernel, 4-element leftover.
   * Skipped (branch to LL(16)) when bit value 4 of MIN_N is clear.
   * Otherwise multiplies elements 1..4 of AO1..AO8 with BO elements 1..4:
   * b1/b3 steps accumulate into y01..y08, b2/b4 steps into y09..y16.
   * All nine pointers are advanced by 4 elements at the end.
   */
  1096. .align 4
  1097. LL(15):
  1098. andi. r0, MIN_N, 4
  1099. ble LL(16)
  1100. LFD a1, 1 * SIZE(AO1)
  1101. LFD b1, 1 * SIZE(BO)
  1102. LFD a2, 1 * SIZE(AO2)
  1103. LFD a3, 1 * SIZE(AO3)
  1104. LFD a4, 1 * SIZE(AO4)
  1105. LFD a5, 1 * SIZE(AO5)
  1106. LFD a6, 1 * SIZE(AO6)
  1107. LFD a7, 1 * SIZE(AO7)
  1108. LFD a8, 1 * SIZE(AO8)
  1109. LFD b2, 2 * SIZE(BO)
  1110. LFD b3, 3 * SIZE(BO)
  1111. LFD b4, 4 * SIZE(BO)
  /* Each LFD reloads the a-register consumed by the FMADD right before it. */
  1112. FMADD y01, a1, b1, y01
  1113. LFD a1, 2 * SIZE(AO1)
  1114. FMADD y02, a2, b1, y02
  1115. LFD a2, 2 * SIZE(AO2)
  1116. FMADD y03, a3, b1, y03
  1117. LFD a3, 2 * SIZE(AO3)
  1118. FMADD y04, a4, b1, y04
  1119. LFD a4, 2 * SIZE(AO4)
  1120. FMADD y05, a5, b1, y05
  1121. LFD a5, 2 * SIZE(AO5)
  1122. FMADD y06, a6, b1, y06
  1123. LFD a6, 2 * SIZE(AO6)
  1124. FMADD y07, a7, b1, y07
  1125. LFD a7, 2 * SIZE(AO7)
  1126. FMADD y08, a8, b1, y08
  1127. LFD a8, 2 * SIZE(AO8)
  1128. FMADD y09, a1, b2, y09
  1129. LFD a1, 3 * SIZE(AO1)
  1130. FMADD y10, a2, b2, y10
  1131. LFD a2, 3 * SIZE(AO2)
  1132. FMADD y11, a3, b2, y11
  1133. LFD a3, 3 * SIZE(AO3)
  1134. FMADD y12, a4, b2, y12
  1135. LFD a4, 3 * SIZE(AO4)
  1136. FMADD y13, a5, b2, y13
  1137. LFD a5, 3 * SIZE(AO5)
  1138. FMADD y14, a6, b2, y14
  1139. LFD a6, 3 * SIZE(AO6)
  1140. FMADD y15, a7, b2, y15
  1141. LFD a7, 3 * SIZE(AO7)
  1142. FMADD y16, a8, b2, y16
  1143. LFD a8, 3 * SIZE(AO8)
  1144. FMADD y01, a1, b3, y01
  1145. LFD a1, 4 * SIZE(AO1)
  1146. FMADD y02, a2, b3, y02
  1147. LFD a2, 4 * SIZE(AO2)
  1148. FMADD y03, a3, b3, y03
  1149. LFD a3, 4 * SIZE(AO3)
  1150. FMADD y04, a4, b3, y04
  1151. LFD a4, 4 * SIZE(AO4)
  1152. FMADD y05, a5, b3, y05
  1153. LFD a5, 4 * SIZE(AO5)
  1154. FMADD y06, a6, b3, y06
  1155. LFD a6, 4 * SIZE(AO6)
  1156. FMADD y07, a7, b3, y07
  1157. LFD a7, 4 * SIZE(AO7)
  1158. FMADD y08, a8, b3, y08
  1159. LFD a8, 4 * SIZE(AO8)
  /* Last step: pointer advances replace the loads. */
  1160. FMADD y09, a1, b4, y09
  1161. addi AO1, AO1, 4 * SIZE
  1162. FMADD y10, a2, b4, y10
  1163. addi AO2, AO2, 4 * SIZE
  1164. FMADD y11, a3, b4, y11
  1165. addi AO3, AO3, 4 * SIZE
  1166. FMADD y12, a4, b4, y12
  1167. addi AO4, AO4, 4 * SIZE
  1168. FMADD y13, a5, b4, y13
  1169. addi AO5, AO5, 4 * SIZE
  1170. FMADD y14, a6, b4, y14
  1171. addi AO6, AO6, 4 * SIZE
  1172. FMADD y15, a7, b4, y15
  1173. addi AO7, AO7, 4 * SIZE
  1174. FMADD y16, a8, b4, y16
  1175. addi AO8, AO8, 4 * SIZE
  1176. addi BO, BO, 4 * SIZE
  /*
   * LL(16): eight-pointer kernel, 2-element leftover.
   * Skipped (branch to LL(17)) when bit value 2 of MIN_N is clear.
   * Element 1 of each AOn times b1 goes into y01..y08; element 2 times b2
   * goes into y09..y16.  All pointers then advance by 2 elements.
   */
  1177. .align 4
  1178. LL(16):
  1179. andi. r0, MIN_N, 2
  1180. ble LL(17)
  1181. LFD a1, 1 * SIZE(AO1)
  1182. LFD b1, 1 * SIZE(BO)
  1183. LFD a2, 1 * SIZE(AO2)
  1184. LFD a3, 1 * SIZE(AO3)
  1185. LFD a4, 1 * SIZE(AO4)
  1186. LFD a5, 1 * SIZE(AO5)
  1187. LFD a6, 1 * SIZE(AO6)
  1188. LFD a7, 1 * SIZE(AO7)
  1189. LFD a8, 1 * SIZE(AO8)
  1190. LFD b2, 2 * SIZE(BO)
  1191. FMADD y01, a1, b1, y01
  1192. LFD a1, 2 * SIZE(AO1)
  1193. FMADD y02, a2, b1, y02
  1194. LFD a2, 2 * SIZE(AO2)
  1195. FMADD y03, a3, b1, y03
  1196. LFD a3, 2 * SIZE(AO3)
  1197. FMADD y04, a4, b1, y04
  1198. LFD a4, 2 * SIZE(AO4)
  1199. FMADD y05, a5, b1, y05
  1200. LFD a5, 2 * SIZE(AO5)
  1201. FMADD y06, a6, b1, y06
  1202. LFD a6, 2 * SIZE(AO6)
  1203. FMADD y07, a7, b1, y07
  1204. LFD a7, 2 * SIZE(AO7)
  1205. FMADD y08, a8, b1, y08
  1206. LFD a8, 2 * SIZE(AO8)
  /* Second element: the pointer bumps are interleaved with the FMADDs. */
  1207. FMADD y09, a1, b2, y09
  1208. addi AO1, AO1, 2 * SIZE
  1209. addi AO2, AO2, 2 * SIZE
  1210. FMADD y10, a2, b2, y10
  1211. addi AO3, AO3, 2 * SIZE
  1212. addi AO4, AO4, 2 * SIZE
  1213. FMADD y11, a3, b2, y11
  1214. FMADD y12, a4, b2, y12
  1215. addi AO5, AO5, 2 * SIZE
  1216. addi AO6, AO6, 2 * SIZE
  1217. FMADD y13, a5, b2, y13
  1218. FMADD y14, a6, b2, y14
  1219. addi AO7, AO7, 2 * SIZE
  1220. addi AO8, AO8, 2 * SIZE
  1221. FMADD y15, a7, b2, y15
  1222. FMADD y16, a8, b2, y16
  1223. addi BO, BO, 2 * SIZE
  /*
   * LL(17): eight-pointer kernel, final single element.
   * Skipped (branch to LL(18)) when bit value 1 of MIN_N is clear.
   * Only y01..y08 are updated, and no pointer is advanced here.
   */
  1224. .align 4
  1225. LL(17):
  1226. andi. r0, MIN_N, 1
  1227. ble LL(18)
  1228. LFD a1, 1 * SIZE(AO1)
  1229. LFD b1, 1 * SIZE(BO)
  1230. LFD a2, 1 * SIZE(AO2)
  1231. LFD a3, 1 * SIZE(AO3)
  1232. LFD a4, 1 * SIZE(AO4)
  1233. LFD a5, 1 * SIZE(AO5)
  1234. LFD a6, 1 * SIZE(AO6)
  1235. LFD a7, 1 * SIZE(AO7)
  1236. LFD a8, 1 * SIZE(AO8)
  1237. FMADD y01, a1, b1, y01
  1238. FMADD y02, a2, b1, y02
  1239. FMADD y03, a3, b1, y03
  1240. FMADD y04, a4, b1, y04
  1241. FMADD y05, a5, b1, y05
  1242. FMADD y06, a6, b1, y06
  1243. FMADD y07, a7, b1, y07
  1244. FMADD y08, a8, b1, y08
  /*
   * LL(18): write-back of the eight accumulated results.
   * Copies CO into BO, loads alpha from ALPHA, then compares INCY with SIZE:
   * when they differ control goes to LL(19); when equal the code below
   * handles the eight outputs with fixed displacements from CO.
   * NOTE(review): INCY == SIZE presumably means a unit-stride output vector
   * with INCY expressed in bytes -- confirm against the prologue.
   */
  1245. .align 4
  1246. LL(18):
  1247. mr BO, CO
  1248. lfd alpha, ALPHA
  1249. cmpi cr0, 0, INCY, SIZE
  1250. bne LL(19)
  /* Fetch the eight current output values. */
  1251. LFD a1, 1 * SIZE(CO)
  1252. LFD a2, 2 * SIZE(CO)
  1253. LFD a3, 3 * SIZE(CO)
  1254. LFD a4, 4 * SIZE(CO)
  1255. LFD a5, 5 * SIZE(CO)
  1256. LFD a6, 6 * SIZE(CO)
  1257. LFD a7, 7 * SIZE(CO)
  1258. LFD a8, 8 * SIZE(CO)
  /* Fold the second accumulator bank (y09..y16) into the first (y01..y08). */
  1259. FADD y01, y09, y01
  1260. FADD y02, y10, y02
  1261. FADD y03, y11, y03
  1262. FADD y04, y12, y04
  1263. FADD y05, y13, y05
  1264. FADD y06, y14, y06
  1265. FADD y07, y15, y07
  1266. FADD y08, y16, y08
  /* out = alpha * sum + out */
  1267. FMADD a1, alpha, y01, a1
  1268. FMADD a2, alpha, y02, a2
  1269. FMADD a3, alpha, y03, a3
  1270. FMADD a4, alpha, y04, a4
  1271. FMADD a5, alpha, y05, a5
  1272. FMADD a6, alpha, y06, a6
  1273. FMADD a7, alpha, y07, a7
  1274. FMADD a8, alpha, y08, a8
  1275. STFD a1, 1 * SIZE(CO)
  1276. STFD a2, 2 * SIZE(CO)
  1277. STFD a3, 3 * SIZE(CO)
  1278. STFD a4, 4 * SIZE(CO)
  1279. STFD a5, 5 * SIZE(CO)
  1280. STFD a6, 6 * SIZE(CO)
  1281. STFD a7, 7 * SIZE(CO)
  1282. STFD a8, 8 * SIZE(CO)
  /* Count down J, step CO past the eight outputs, and go back to LL(11)
     while J is still positive; otherwise continue at LL(20). */
  1283. addi J, J, -1
  1284. addi CO, CO, 8 * SIZE
  1285. cmpi cr0, 0, J, 0
  1286. bgt LL(11)
  1287. b LL(20)
  /*
   * LL(19): write-back of the eight accumulated results when INCY differs
   * from SIZE (reached from the `bne` in LL(18)).
   * Reads walk CO forward with LFDUX (CO += INCY before each load); the
   * stores walk BO, which LL(18) set equal to the original CO, with STFDUX,
   * so every result lands on the element it was read from.
   * Loops back to LL(11) while J is still positive, otherwise falls through
   * into LL(20).
   */
  1288. .align 4
  1289. LL(19):
  1290. LFDUX a1, CO, INCY
  1291. LFDUX a2, CO, INCY
  1292. LFDUX a3, CO, INCY
  1293. LFDUX a4, CO, INCY
  1294. LFDUX a5, CO, INCY
  1295. LFDUX a6, CO, INCY
  1296. LFDUX a7, CO, INCY
  1297. LFDUX a8, CO, INCY
  /* Fold the second accumulator bank (y09..y16) into the first (y01..y08). */
  1298. FADD y01, y09, y01
  1299. FADD y02, y10, y02
  1300. FADD y03, y11, y03
  1301. FADD y04, y12, y04
  1302. FADD y05, y13, y05
  1303. FADD y06, y14, y06
  1304. FADD y07, y15, y07
  1305. FADD y08, y16, y08
  /*
   * out = alpha * sum + out.
   * The sums are named y01..y08, matching the FADDs above and the
   * equal-stride path in LL(18), instead of the raw f0..f7 register names.
   * NOTE(review): this relies on y01..y08 being the aliases of f0..f7, which
   * is what the data flow from the FADDs implies -- confirm against the
   * register #defines at the top of the file.
   */
  1306. FMADD a1, alpha, y01, a1
  1307. FMADD a2, alpha, y02, a2
  1308. FMADD a3, alpha, y03, a3
  1309. FMADD a4, alpha, y04, a4
  1310. FMADD a5, alpha, y05, a5
  1311. FMADD a6, alpha, y06, a6
  1312. FMADD a7, alpha, y07, a7
  1313. FMADD a8, alpha, y08, a8
  1314. STFDUX a1, BO, INCY
  1315. STFDUX a2, BO, INCY
  1316. STFDUX a3, BO, INCY
  1317. STFDUX a4, BO, INCY
  1318. STFDUX a5, BO, INCY
  1319. STFDUX a6, BO, INCY
  1320. STFDUX a7, BO, INCY
  1321. STFDUX a8, BO, INCY
  1322. addi J, J, -1
  1323. cmpi cr0, 0, J, 0
  1324. bgt LL(11)
  /*
   * LL(20): entry for the leftover outputs not covered by the groups of 8.
   * - N & 7 == 0: nothing is left, branch to LL(99).
   * - N & 4 == 0: no group of four, branch to LL(30).
   * Otherwise sets up a four-pointer pass: AO1..AO4 are A, A+LDA, A+2*LDA,
   * A+3*LDA and A itself is moved to A+4*LDA; BO restarts at XP.
   * Accumulators y01..y04 and y09..y12 are cleared from FZERO.
   */
  1325. .align 4
  1326. LL(20):
  1327. andi. J, N, 7
  1328. ble LL(99)
  1329. andi. J, N, 4
  1330. ble LL(30)
  1331. mr AO1, A
  1332. add AO2, A, LDA
  1333. add AO3, AO2, LDA
  1334. add AO4, AO3, LDA
  1335. add A, AO4, LDA
  1336. mr BO, XP
  1337. lfd y01, FZERO
  1338. fmr y02, y01
  1339. fmr y03, y01
  1340. fmr y04, y01
  1341. fmr y09, y01
  1342. fmr y10, y01
  1343. fmr y11, y01
  1344. fmr y12, y01
  /* NOTE(review): DCBT is a macro defined elsewhere; presumably a cache
     touch (prefetch) hint on Y1 + PREC -- confirm. */
  1345. DCBT(Y1, PREC)
  /* CTR = MIN_N / 16 (arithmetic shift).  srawi. sets CR0, which the `ble`
     after mtspr still tests: no full 16-element block -> LL(24). */
  1346. srawi. r0, MIN_N, 4
  1347. mtspr CTR, r0
  1348. ble LL(24)
  /* Prime the pipeline: elements 1 and 2 of each of the four AOn streams
     and the first eight BO values. */
  1349. LFD a1, 1 * SIZE(AO1)
  1350. LFD a2, 1 * SIZE(AO2)
  1351. LFD a3, 1 * SIZE(AO3)
  1352. LFD a4, 1 * SIZE(AO4)
  1353. LFD a5, 2 * SIZE(AO1)
  1354. LFD a6, 2 * SIZE(AO2)
  1355. LFD a7, 2 * SIZE(AO3)
  1356. LFD a8, 2 * SIZE(AO4)
  1357. LFD b1, 1 * SIZE(BO)
  1358. LFD b2, 2 * SIZE(BO)
  1359. LFD b3, 3 * SIZE(BO)
  1360. LFD b4, 4 * SIZE(BO)
  1361. LFD b5, 5 * SIZE(BO)
  1362. LFD b6, 6 * SIZE(BO)
  1363. LFD b7, 7 * SIZE(BO)
  1364. LFD b8, 8 * SIZE(BO)
  /* bdz decrements CTR first: with exactly one 16-element block the loop at
     LL(22) is skipped and only the drain code at LL(23) runs. */
  1365. bdz LL(23)
  /*
   * LL(22): steady-state loop of the four-pointer pass, 16 elements per
   * iteration, repeated via CTR (bdnz).
   * Odd elements (a1..a4) accumulate into y01..y04 and even elements
   * (a5..a8) into y09..y12.  Every LFD refills the register consumed by the
   * FMADD just before it, so operands are always loaded two elements ahead.
   * The b-registers are refilled four at a time, right after their last use.
   * At the bottom the loads reach into the next block (AOn offsets 17/18,
   * BO offsets 17..24) so the next iteration, or the drain code at LL(23),
   * starts with its first operands already in registers.
   */
  1366. .align 4
  1367. LL(22):
  1368. FMADD y01, a1, b1, y01
  1369. LFD a1, 3 * SIZE(AO1)
  1370. FMADD y02, a2, b1, y02
  1371. LFD a2, 3 * SIZE(AO2)
  1372. FMADD y03, a3, b1, y03
  1373. LFD a3, 3 * SIZE(AO3)
  1374. FMADD y04, a4, b1, y04
  1375. LFD a4, 3 * SIZE(AO4)
  1376. FMADD y09, a5, b2, y09
  1377. LFD a5, 4 * SIZE(AO1)
  1378. FMADD y10, a6, b2, y10
  1379. LFD a6, 4 * SIZE(AO2)
  1380. FMADD y11, a7, b2, y11
  1381. LFD a7, 4 * SIZE(AO3)
  1382. FMADD y12, a8, b2, y12
  1383. LFD a8, 4 * SIZE(AO4)
  1384. FMADD y01, a1, b3, y01
  1385. LFD a1, 5 * SIZE(AO1)
  1386. FMADD y02, a2, b3, y02
  1387. LFD a2, 5 * SIZE(AO2)
  1388. FMADD y03, a3, b3, y03
  1389. LFD a3, 5 * SIZE(AO3)
  1390. FMADD y04, a4, b3, y04
  1391. LFD a4, 5 * SIZE(AO4)
  1392. FMADD y09, a5, b4, y09
  1393. LFD a5, 6 * SIZE(AO1)
  1394. FMADD y10, a6, b4, y10
  1395. LFD a6, 6 * SIZE(AO2)
  1396. FMADD y11, a7, b4, y11
  1397. LFD a7, 6 * SIZE(AO3)
  1398. FMADD y12, a8, b4, y12
  1399. LFD a8, 6 * SIZE(AO4)
  /* b1..b4 are free now: refill with BO elements 9..12. */
  1400. LFD b1, 9 * SIZE(BO)
  1401. LFD b2, 10 * SIZE(BO)
  1402. LFD b3, 11 * SIZE(BO)
  1403. LFD b4, 12 * SIZE(BO)
  1404. FMADD y01, a1, b5, y01
  1405. LFD a1, 7 * SIZE(AO1)
  1406. FMADD y02, a2, b5, y02
  1407. LFD a2, 7 * SIZE(AO2)
  1408. FMADD y03, a3, b5, y03
  1409. LFD a3, 7 * SIZE(AO3)
  1410. FMADD y04, a4, b5, y04
  1411. LFD a4, 7 * SIZE(AO4)
  1412. FMADD y09, a5, b6, y09
  1413. LFD a5, 8 * SIZE(AO1)
  1414. FMADD y10, a6, b6, y10
  1415. LFD a6, 8 * SIZE(AO2)
  1416. FMADD y11, a7, b6, y11
  1417. LFD a7, 8 * SIZE(AO3)
  1418. FMADD y12, a8, b6, y12
  1419. LFD a8, 8 * SIZE(AO4)
  1420. FMADD y01, a1, b7, y01
  1421. LFD a1, 9 * SIZE(AO1)
  1422. FMADD y02, a2, b7, y02
  1423. LFD a2, 9 * SIZE(AO2)
  1424. FMADD y03, a3, b7, y03
  1425. LFD a3, 9 * SIZE(AO3)
  1426. FMADD y04, a4, b7, y04
  1427. LFD a4, 9 * SIZE(AO4)
  1428. FMADD y09, a5, b8, y09
  1429. LFD a5, 10 * SIZE(AO1)
  1430. FMADD y10, a6, b8, y10
  1431. LFD a6, 10 * SIZE(AO2)
  1432. FMADD y11, a7, b8, y11
  1433. LFD a7, 10 * SIZE(AO3)
  1434. FMADD y12, a8, b8, y12
  1435. LFD a8, 10 * SIZE(AO4)
  /* b5..b8 are free now: refill with BO elements 13..16. */
  1436. LFD b5, 13 * SIZE(BO)
  1437. LFD b6, 14 * SIZE(BO)
  1438. LFD b7, 15 * SIZE(BO)
  1439. LFD b8, 16 * SIZE(BO)
  1440. FMADD y01, a1, b1, y01
  1441. LFD a1, 11 * SIZE(AO1)
  1442. FMADD y02, a2, b1, y02
  1443. LFD a2, 11 * SIZE(AO2)
  1444. FMADD y03, a3, b1, y03
  1445. LFD a3, 11 * SIZE(AO3)
  1446. FMADD y04, a4, b1, y04
  1447. LFD a4, 11 * SIZE(AO4)
  1448. FMADD y09, a5, b2, y09
  1449. LFD a5, 12 * SIZE(AO1)
  1450. FMADD y10, a6, b2, y10
  1451. LFD a6, 12 * SIZE(AO2)
  1452. FMADD y11, a7, b2, y11
  1453. LFD a7, 12 * SIZE(AO3)
  1454. FMADD y12, a8, b2, y12
  1455. LFD a8, 12 * SIZE(AO4)
  1456. FMADD y01, a1, b3, y01
  1457. LFD a1, 13 * SIZE(AO1)
  1458. FMADD y02, a2, b3, y02
  1459. LFD a2, 13 * SIZE(AO2)
  1460. FMADD y03, a3, b3, y03
  1461. LFD a3, 13 * SIZE(AO3)
  1462. FMADD y04, a4, b3, y04
  1463. LFD a4, 13 * SIZE(AO4)
  1464. FMADD y09, a5, b4, y09
  1465. LFD a5, 14 * SIZE(AO1)
  1466. FMADD y10, a6, b4, y10
  1467. LFD a6, 14 * SIZE(AO2)
  1468. FMADD y11, a7, b4, y11
  1469. LFD a7, 14 * SIZE(AO3)
  1470. FMADD y12, a8, b4, y12
  1471. LFD a8, 14 * SIZE(AO4)
  /* Look-ahead: BO elements 17..20 belong to the next 16-element block. */
  1472. LFD b1, 17 * SIZE(BO)
  1473. LFD b2, 18 * SIZE(BO)
  1474. LFD b3, 19 * SIZE(BO)
  1475. LFD b4, 20 * SIZE(BO)
  1476. FMADD y01, a1, b5, y01
  1477. LFD a1, 15 * SIZE(AO1)
  1478. FMADD y02, a2, b5, y02
  1479. LFD a2, 15 * SIZE(AO2)
  1480. FMADD y03, a3, b5, y03
  1481. LFD a3, 15 * SIZE(AO3)
  1482. FMADD y04, a4, b5, y04
  1483. LFD a4, 15 * SIZE(AO4)
  1484. FMADD y09, a5, b6, y09
  1485. LFD a5, 16 * SIZE(AO1)
  1486. FMADD y10, a6, b6, y10
  1487. LFD a6, 16 * SIZE(AO2)
  1488. FMADD y11, a7, b6, y11
  1489. LFD a7, 16 * SIZE(AO3)
  1490. FMADD y12, a8, b6, y12
  1491. LFD a8, 16 * SIZE(AO4)
  /* The loads from offsets 17 and 18 prime a1..a8 for the next block. */
  1492. FMADD y01, a1, b7, y01
  1493. LFD a1, 17 * SIZE(AO1)
  1494. FMADD y02, a2, b7, y02
  1495. LFD a2, 17 * SIZE(AO2)
  1496. FMADD y03, a3, b7, y03
  1497. LFD a3, 17 * SIZE(AO3)
  1498. FMADD y04, a4, b7, y04
  1499. LFD a4, 17 * SIZE(AO4)
  1500. FMADD y09, a5, b8, y09
  1501. LFD a5, 18 * SIZE(AO1)
  1502. FMADD y10, a6, b8, y10
  1503. LFD a6, 18 * SIZE(AO2)
  1504. FMADD y11, a7, b8, y11
  1505. LFD a7, 18 * SIZE(AO3)
  1506. FMADD y12, a8, b8, y12
  1507. LFD a8, 18 * SIZE(AO4)
  1508. LFD b5, 21 * SIZE(BO)
  1509. LFD b6, 22 * SIZE(BO)
  1510. LFD b7, 23 * SIZE(BO)
  1511. LFD b8, 24 * SIZE(BO)
  /* Advance all streams by one 16-element block.
     NOTE(review): DCBT/PREA are defined elsewhere; presumably prefetch
     hints ahead of each AOn stream -- confirm. */
  1512. addi AO1, AO1, 16 * SIZE
  1513. addi AO2, AO2, 16 * SIZE
  1514. DCBT(AO1, PREA)
  1515. DCBT(AO2, PREA)
  1516. addi AO3, AO3, 16 * SIZE
  1517. addi AO4, AO4, 16 * SIZE
  1518. DCBT(AO3, PREA)
  1519. DCBT(AO4, PREA)
  1520. addi BO, BO, 16 * SIZE
  1521. bdnz LL(22)
  /*
   * LL(23): drain of the four-pointer pipeline -- the last full 16-element
   * block.  Same FMADD/LFD pattern as LL(22), except that nothing beyond
   * element 16 of the AOn streams or of BO is loaded, so the final two
   * steps are plain FMADDs.  Entered either by falling out of LL(22) or
   * directly from the `bdz` in LL(20); in both cases a1..a8 and b1..b8
   * already hold the first operands.
   */
  1522. .align 4
  1523. LL(23):
  1524. FMADD y01, a1, b1, y01
  1525. LFD a1, 3 * SIZE(AO1)
  1526. FMADD y02, a2, b1, y02
  1527. LFD a2, 3 * SIZE(AO2)
  1528. FMADD y03, a3, b1, y03
  1529. LFD a3, 3 * SIZE(AO3)
  1530. FMADD y04, a4, b1, y04
  1531. LFD a4, 3 * SIZE(AO4)
  1532. FMADD y09, a5, b2, y09
  1533. LFD a5, 4 * SIZE(AO1)
  1534. FMADD y10, a6, b2, y10
  1535. LFD a6, 4 * SIZE(AO2)
  1536. FMADD y11, a7, b2, y11
  1537. LFD a7, 4 * SIZE(AO3)
  1538. FMADD y12, a8, b2, y12
  1539. LFD a8, 4 * SIZE(AO4)
  1540. FMADD y01, a1, b3, y01
  1541. LFD a1, 5 * SIZE(AO1)
  1542. FMADD y02, a2, b3, y02
  1543. LFD a2, 5 * SIZE(AO2)
  1544. FMADD y03, a3, b3, y03
  1545. LFD a3, 5 * SIZE(AO3)
  1546. FMADD y04, a4, b3, y04
  1547. LFD a4, 5 * SIZE(AO4)
  1548. FMADD y09, a5, b4, y09
  1549. LFD a5, 6 * SIZE(AO1)
  1550. FMADD y10, a6, b4, y10
  1551. LFD a6, 6 * SIZE(AO2)
  1552. FMADD y11, a7, b4, y11
  1553. LFD a7, 6 * SIZE(AO3)
  1554. FMADD y12, a8, b4, y12
  1555. LFD a8, 6 * SIZE(AO4)
  /* Refill b1..b4 with BO elements 9..12. */
  1556. LFD b1, 9 * SIZE(BO)
  1557. LFD b2, 10 * SIZE(BO)
  1558. LFD b3, 11 * SIZE(BO)
  1559. LFD b4, 12 * SIZE(BO)
  1560. FMADD y01, a1, b5, y01
  1561. LFD a1, 7 * SIZE(AO1)
  1562. FMADD y02, a2, b5, y02
  1563. LFD a2, 7 * SIZE(AO2)
  1564. FMADD y03, a3, b5, y03
  1565. LFD a3, 7 * SIZE(AO3)
  1566. FMADD y04, a4, b5, y04
  1567. LFD a4, 7 * SIZE(AO4)
  1568. FMADD y09, a5, b6, y09
  1569. LFD a5, 8 * SIZE(AO1)
  1570. FMADD y10, a6, b6, y10
  1571. LFD a6, 8 * SIZE(AO2)
  1572. FMADD y11, a7, b6, y11
  1573. LFD a7, 8 * SIZE(AO3)
  1574. FMADD y12, a8, b6, y12
  1575. LFD a8, 8 * SIZE(AO4)
  1576. FMADD y01, a1, b7, y01
  1577. LFD a1, 9 * SIZE(AO1)
  1578. FMADD y02, a2, b7, y02
  1579. LFD a2, 9 * SIZE(AO2)
  1580. FMADD y03, a3, b7, y03
  1581. LFD a3, 9 * SIZE(AO3)
  1582. FMADD y04, a4, b7, y04
  1583. LFD a4, 9 * SIZE(AO4)
  1584. FMADD y09, a5, b8, y09
  1585. LFD a5, 10 * SIZE(AO1)
  1586. FMADD y10, a6, b8, y10
  1587. LFD a6, 10 * SIZE(AO2)
  1588. FMADD y11, a7, b8, y11
  1589. LFD a7, 10 * SIZE(AO3)
  1590. FMADD y12, a8, b8, y12
  1591. LFD a8, 10 * SIZE(AO4)
  /* Refill b5..b8 with BO elements 13..16 (the last ones of this block). */
  1592. LFD b5, 13 * SIZE(BO)
  1593. LFD b6, 14 * SIZE(BO)
  1594. LFD b7, 15 * SIZE(BO)
  1595. LFD b8, 16 * SIZE(BO)
  1596. FMADD y01, a1, b1, y01
  1597. LFD a1, 11 * SIZE(AO1)
  1598. FMADD y02, a2, b1, y02
  1599. LFD a2, 11 * SIZE(AO2)
  1600. FMADD y03, a3, b1, y03
  1601. LFD a3, 11 * SIZE(AO3)
  1602. FMADD y04, a4, b1, y04
  1603. LFD a4, 11 * SIZE(AO4)
  1604. FMADD y09, a5, b2, y09
  1605. LFD a5, 12 * SIZE(AO1)
  1606. FMADD y10, a6, b2, y10
  1607. LFD a6, 12 * SIZE(AO2)
  1608. FMADD y11, a7, b2, y11
  1609. LFD a7, 12 * SIZE(AO3)
  1610. FMADD y12, a8, b2, y12
  1611. LFD a8, 12 * SIZE(AO4)
  1612. FMADD y01, a1, b3, y01
  1613. LFD a1, 13 * SIZE(AO1)
  1614. FMADD y02, a2, b3, y02
  1615. LFD a2, 13 * SIZE(AO2)
  1616. FMADD y03, a3, b3, y03
  1617. LFD a3, 13 * SIZE(AO3)
  1618. FMADD y04, a4, b3, y04
  1619. LFD a4, 13 * SIZE(AO4)
  1620. FMADD y09, a5, b4, y09
  1621. LFD a5, 14 * SIZE(AO1)
  1622. FMADD y10, a6, b4, y10
  1623. LFD a6, 14 * SIZE(AO2)
  1624. FMADD y11, a7, b4, y11
  1625. LFD a7, 14 * SIZE(AO3)
  1626. FMADD y12, a8, b4, y12
  1627. LFD a8, 14 * SIZE(AO4)
  1628. FMADD y01, a1, b5, y01
  1629. LFD a1, 15 * SIZE(AO1)
  1630. FMADD y02, a2, b5, y02
  1631. LFD a2, 15 * SIZE(AO2)
  1632. FMADD y03, a3, b5, y03
  1633. LFD a3, 15 * SIZE(AO3)
  1634. FMADD y04, a4, b5, y04
  1635. LFD a4, 15 * SIZE(AO4)
  1636. FMADD y09, a5, b6, y09
  1637. LFD a5, 16 * SIZE(AO1)
  1638. FMADD y10, a6, b6, y10
  1639. LFD a6, 16 * SIZE(AO2)
  1640. FMADD y11, a7, b6, y11
  1641. LFD a7, 16 * SIZE(AO3)
  1642. FMADD y12, a8, b6, y12
  1643. LFD a8, 16 * SIZE(AO4)
  /* Elements 15 and 16: all operands are in registers, no more loads. */
  1644. FMADD y01, a1, b7, y01
  1645. FMADD y02, a2, b7, y02
  1646. FMADD y03, a3, b7, y03
  1647. FMADD y04, a4, b7, y04
  1648. FMADD y09, a5, b8, y09
  1649. FMADD y10, a6, b8, y10
  1650. FMADD y11, a7, b8, y11
  1651. FMADD y12, a8, b8, y12
  1652. addi AO1, AO1, 16 * SIZE
  1653. addi AO2, AO2, 16 * SIZE
  1654. addi AO3, AO3, 16 * SIZE
  1655. addi AO4, AO4, 16 * SIZE
  1656. addi BO, BO, 16 * SIZE
  /*
   * LL(24): leftover handling for the four-pointer pass.
   * Branches to LL(28) when none of the low four bits of MIN_N is set;
   * otherwise processes a group of 8 elements here when bit value 8 is set
   * (else continues at LL(25)).
   * Odd elements accumulate into y01..y04, even elements into y09..y12.
   * Only b1..b4 are used: they first hold BO elements 1..4 and are reloaded
   * with elements 5..8 half way through.
   */
  1657. .align 4
  1658. LL(24):
  1659. andi. r0, MIN_N, 15
  1660. ble LL(28)
  1661. andi. r0, MIN_N, 8
  1662. ble LL(25)
  1663. LFD a1, 1 * SIZE(AO1)
  1664. LFD a2, 1 * SIZE(AO2)
  1665. LFD a3, 1 * SIZE(AO3)
  1666. LFD a4, 1 * SIZE(AO4)
  1667. LFD b1, 1 * SIZE(BO)
  1668. LFD b2, 2 * SIZE(BO)
  1669. LFD b3, 3 * SIZE(BO)
  1670. LFD b4, 4 * SIZE(BO)
  1671. LFD a5, 2 * SIZE(AO1)
  1672. LFD a6, 2 * SIZE(AO2)
  1673. LFD a7, 2 * SIZE(AO3)
  1674. LFD a8, 2 * SIZE(AO4)
  1675. FMADD y01, a1, b1, y01
  1676. LFD a1, 3 * SIZE(AO1)
  1677. FMADD y02, a2, b1, y02
  1678. LFD a2, 3 * SIZE(AO2)
  1679. FMADD y03, a3, b1, y03
  1680. LFD a3, 3 * SIZE(AO3)
  1681. FMADD y04, a4, b1, y04
  1682. LFD a4, 3 * SIZE(AO4)
  1683. FMADD y09, a5, b2, y09
  1684. LFD a5, 4 * SIZE(AO1)
  1685. FMADD y10, a6, b2, y10
  1686. LFD a6, 4 * SIZE(AO2)
  1687. FMADD y11, a7, b2, y11
  1688. LFD a7, 4 * SIZE(AO3)
  1689. FMADD y12, a8, b2, y12
  1690. LFD a8, 4 * SIZE(AO4)
  1691. FMADD y01, a1, b3, y01
  1692. LFD a1, 5 * SIZE(AO1)
  1693. FMADD y02, a2, b3, y02
  1694. LFD a2, 5 * SIZE(AO2)
  1695. FMADD y03, a3, b3, y03
  1696. LFD a3, 5 * SIZE(AO3)
  1697. FMADD y04, a4, b3, y04
  1698. LFD a4, 5 * SIZE(AO4)
  1699. FMADD y09, a5, b4, y09
  1700. LFD a5, 6 * SIZE(AO1)
  1701. FMADD y10, a6, b4, y10
  1702. LFD a6, 6 * SIZE(AO2)
  1703. FMADD y11, a7, b4, y11
  1704. LFD a7, 6 * SIZE(AO3)
  1705. FMADD y12, a8, b4, y12
  1706. LFD a8, 6 * SIZE(AO4)
  /* Second half of the group: b1..b4 now take BO elements 5..8. */
  1707. LFD b1, 5 * SIZE(BO)
  1708. LFD b2, 6 * SIZE(BO)
  1709. LFD b3, 7 * SIZE(BO)
  1710. LFD b4, 8 * SIZE(BO)
  1711. FMADD y01, a1, b1, y01
  1712. LFD a1, 7 * SIZE(AO1)
  1713. FMADD y02, a2, b1, y02
  1714. LFD a2, 7 * SIZE(AO2)
  1715. FMADD y03, a3, b1, y03
  1716. LFD a3, 7 * SIZE(AO3)
  1717. FMADD y04, a4, b1, y04
  1718. LFD a4, 7 * SIZE(AO4)
  1719. FMADD y09, a5, b2, y09
  1720. LFD a5, 8 * SIZE(AO1)
  1721. FMADD y10, a6, b2, y10
  1722. LFD a6, 8 * SIZE(AO2)
  1723. FMADD y11, a7, b2, y11
  1724. LFD a7, 8 * SIZE(AO3)
  1725. FMADD y12, a8, b2, y12
  1726. LFD a8, 8 * SIZE(AO4)
  /* Elements 7 and 8: no further loads; advance the pointers by 8. */
  1727. FMADD y01, a1, b3, y01
  1728. FMADD y02, a2, b3, y02
  1729. FMADD y03, a3, b3, y03
  1730. FMADD y04, a4, b3, y04
  1731. FMADD y09, a5, b4, y09
  1732. addi AO1, AO1, 8 * SIZE
  1733. FMADD y10, a6, b4, y10
  1734. addi AO2, AO2, 8 * SIZE
  1735. FMADD y11, a7, b4, y11
  1736. addi AO3, AO3, 8 * SIZE
  1737. FMADD y12, a8, b4, y12
  1738. addi AO4, AO4, 8 * SIZE
  1739. addi BO, BO, 8 * SIZE
  /*
   * LL(25): four-pointer pass, 4-element leftover.
   * Skipped (branch to LL(26)) when bit value 4 of MIN_N is clear.
   * Elements 1 and 3 accumulate into y01..y04, elements 2 and 4 into
   * y09..y12; AO1..AO4 and BO then advance by 4 elements.
   */
  1740. .align 4
  1741. LL(25):
  1742. andi. r0, MIN_N, 4
  1743. ble LL(26)
  1744. LFD a1, 1 * SIZE(AO1)
  1745. LFD a2, 1 * SIZE(AO2)
  1746. LFD a3, 1 * SIZE(AO3)
  1747. LFD a4, 1 * SIZE(AO4)
  1748. LFD b1, 1 * SIZE(BO)
  1749. LFD b2, 2 * SIZE(BO)
  1750. LFD b3, 3 * SIZE(BO)
  1751. LFD b4, 4 * SIZE(BO)
  1752. LFD a5, 2 * SIZE(AO1)
  1753. LFD a6, 2 * SIZE(AO2)
  1754. LFD a7, 2 * SIZE(AO3)
  1755. LFD a8, 2 * SIZE(AO4)
  1756. FMADD y01, a1, b1, y01
  1757. LFD a1, 3 * SIZE(AO1)
  1758. FMADD y02, a2, b1, y02
  1759. LFD a2, 3 * SIZE(AO2)
  1760. FMADD y03, a3, b1, y03
  1761. LFD a3, 3 * SIZE(AO3)
  1762. FMADD y04, a4, b1, y04
  1763. LFD a4, 3 * SIZE(AO4)
  1764. FMADD y09, a5, b2, y09
  1765. LFD a5, 4 * SIZE(AO1)
  1766. FMADD y10, a6, b2, y10
  1767. LFD a6, 4 * SIZE(AO2)
  1768. FMADD y11, a7, b2, y11
  1769. LFD a7, 4 * SIZE(AO3)
  1770. FMADD y12, a8, b2, y12
  1771. LFD a8, 4 * SIZE(AO4)
  /* Elements 3 and 4: operands already loaded; advance the pointers. */
  1772. FMADD y01, a1, b3, y01
  1773. FMADD y02, a2, b3, y02
  1774. FMADD y03, a3, b3, y03
  1775. FMADD y04, a4, b3, y04
  1776. FMADD y09, a5, b4, y09
  1777. addi AO1, AO1, 4 * SIZE
  1778. FMADD y10, a6, b4, y10
  1779. addi AO2, AO2, 4 * SIZE
  1780. FMADD y11, a7, b4, y11
  1781. addi AO3, AO3, 4 * SIZE
  1782. FMADD y12, a8, b4, y12
  1783. addi AO4, AO4, 4 * SIZE
  1784. addi BO, BO, 4 * SIZE
  /*
   * LL(26): four-pointer pass, 2-element leftover.
   * Skipped (branch to LL(27)) when bit value 2 of MIN_N is clear.
   * All eight operands are loaded up front: element 1 of each stream times
   * b1 goes into y01..y04, element 2 times b2 into y09..y12.
   */
  1785. .align 4
  1786. LL(26):
  1787. andi. r0, MIN_N, 2
  1788. ble LL(27)
  1789. LFD a1, 1 * SIZE(AO1)
  1790. LFD a2, 1 * SIZE(AO2)
  1791. LFD b1, 1 * SIZE(BO)
  1792. LFD b2, 2 * SIZE(BO)
  1793. LFD a3, 1 * SIZE(AO3)
  1794. LFD a4, 1 * SIZE(AO4)
  1795. LFD a5, 2 * SIZE(AO1)
  1796. LFD a6, 2 * SIZE(AO2)
  1797. LFD a7, 2 * SIZE(AO3)
  1798. LFD a8, 2 * SIZE(AO4)
  1799. FMADD y01, a1, b1, y01
  1800. FMADD y02, a2, b1, y02
  1801. FMADD y03, a3, b1, y03
  1802. FMADD y04, a4, b1, y04
  1803. FMADD y09, a5, b2, y09
  1804. addi AO1, AO1, 2 * SIZE
  1805. FMADD y10, a6, b2, y10
  1806. addi AO2, AO2, 2 * SIZE
  1807. FMADD y11, a7, b2, y11
  1808. addi AO3, AO3, 2 * SIZE
  1809. FMADD y12, a8, b2, y12
  1810. addi AO4, AO4, 2 * SIZE
  1811. addi BO, BO, 2 * SIZE
  /*
   * LL(27): four-pointer pass, final single element.
   * Skipped (branch to LL(28)) when bit value 1 of MIN_N is clear.
   * Only y01..y04 are updated, and no pointer is advanced here.
   */
  1812. .align 4
  1813. LL(27):
  1814. andi. r0, MIN_N, 1
  1815. ble LL(28)
  1816. LFD a1, 1 * SIZE(AO1)
  1817. LFD b1, 1 * SIZE(BO)
  1818. LFD a2, 1 * SIZE(AO2)
  1819. LFD a3, 1 * SIZE(AO3)
  1820. LFD a4, 1 * SIZE(AO4)
  1821. FMADD y01, a1, b1, y01
  1822. FMADD y02, a2, b1, y02
  1823. FMADD y03, a3, b1, y03
  1824. FMADD y04, a4, b1, y04
  /*
   * LL(28): write-back of the four accumulated results.
   * Copies CO into BO, loads alpha from ALPHA and compares INCY with SIZE:
   * when they differ control goes to LL(29); when equal the four outputs
   * are updated below with fixed displacements from CO.
   * NOTE(review): INCY == SIZE presumably means a unit-stride output vector
   * with INCY expressed in bytes -- confirm against the prologue.
   */
  1825. .align 4
  1826. LL(28):
  1827. mr BO, CO
  1828. lfd alpha, ALPHA
  1829. cmpi cr0, 0, INCY, SIZE
  1830. bne LL(29)
  1831. LFD a1, 1 * SIZE(CO)
  1832. LFD a2, 2 * SIZE(CO)
  1833. LFD a3, 3 * SIZE(CO)
  1834. LFD a4, 4 * SIZE(CO)
  /* Fold the second accumulator bank (y09..y12) into the first (y01..y04). */
  1835. FADD y01, y09, y01
  1836. FADD y02, y10, y02
  1837. FADD y03, y11, y03
  1838. FADD y04, y12, y04
  /* out = alpha * sum + out */
  1839. FMADD a1, alpha, y01, a1
  1840. FMADD a2, alpha, y02, a2
  1841. FMADD a3, alpha, y03, a3
  1842. FMADD a4, alpha, y04, a4
  1843. STFD a1, 1 * SIZE(CO)
  1844. STFD a2, 2 * SIZE(CO)
  1845. STFD a3, 3 * SIZE(CO)
  1846. STFD a4, 4 * SIZE(CO)
  /* Step CO past the four outputs and continue with the N & 2 case. */
  1847. addi CO, CO, 4 * SIZE
  1848. b LL(30)
  1849. .align 4
  /*
   * LL(29): write-back of the four-column results when INCY != SIZE.
   * Reached from LL(28) with BO == CO and alpha already loaded.
   * The LFDUX loads walk CO forward by INCY per element (update-form
   * indexed load, assuming the macro maps to lfdux/lfsux); the STFDUX
   * stores retrace the same addresses through BO.
   * The scaled operands are spelled y01..y04, the registers the FADDs
   * just wrote and the same names the unit-stride path in LL(28) uses.
   * Spelling them as raw f0..f3 is only correct while y01..y04 alias
   * exactly those registers.
   */
  1850. LL(29):
  1851. LFDUX a1, CO, INCY
  1852. LFDUX a2, CO, INCY
  1853. LFDUX a3, CO, INCY
  1854. LFDUX a4, CO, INCY
  /* fold the second accumulator set into the first */
  1855. FADD y01, y09, y01
  1856. FADD y02, y10, y02
  1857. FADD y03, y11, y03
  1858. FADD y04, y12, y04
  /* a_k = alpha * y0k + a_k */
  1859. FMADD a1, alpha, y01, a1
  1860. FMADD a2, alpha, y02, a2
  1861. FMADD a3, alpha, y03, a3
  1862. FMADD a4, alpha, y04, a4
  1863. STFDUX a1, BO, INCY
  1864. STFDUX a2, BO, INCY
  1865. STFDUX a3, BO, INCY
  1866. STFDUX a4, BO, INCY
  1867. .align 4
  /*
   * LL(30): start of the two-column case, taken when bit 1 of N is set
   * (otherwise skip to LL(40)).
   * Setup: AO1 = A, AO2 = A + LDA, and A is moved on by two LDA steps;
   * BO restarts at XP; accumulators y01..y04 and y09..y12 are cleared
   * from FZERO.  CTR = MIN_N >> 4 is the number of 16-element groups;
   * with none, go straight to the tail at LL(34).
   * Otherwise the first four elements of both columns (a1..a8) and eight
   * x values (b1..b8) are preloaded for the pipelined loop.
   */
  1868. LL(30):
  1869. andi. J, N, 2
  1870. ble LL(40)
  1871. mr AO1, A
  1872. add AO2, A, LDA
  1873. add A, AO2, LDA
  1874. mr BO, XP
  1875. lfd y01, FZERO
  1876. fmr y02, y01
  1877. fmr y03, y01
  1878. fmr y04, y01
  1879. fmr y09, y01
  1880. fmr y10, y01
  1881. fmr y11, y01
  1882. fmr y12, y01
  /* presumably a cache-touch hint on the output vector; DCBT is a macro defined outside this view */
  1883. DCBT(Y1, PREC)
  /* the ble below tests the CR0 result of srawi.; mtspr leaves CR0 alone */
  1884. srawi. r0, MIN_N, 4
  1885. mtspr CTR, r0
  1886. ble LL(34)
  /* odd a-registers come from AO1, even ones from AO2 */
  1887. LFD a1, 1 * SIZE(AO1)
  1888. LFD a2, 1 * SIZE(AO2)
  1889. LFD a3, 2 * SIZE(AO1)
  1890. LFD a4, 2 * SIZE(AO2)
  1891. LFD a5, 3 * SIZE(AO1)
  1892. LFD a6, 3 * SIZE(AO2)
  1893. LFD a7, 4 * SIZE(AO1)
  1894. LFD a8, 4 * SIZE(AO2)
  1895. LFD b1, 1 * SIZE(BO)
  1896. LFD b2, 2 * SIZE(BO)
  1897. LFD b3, 3 * SIZE(BO)
  1898. LFD b4, 4 * SIZE(BO)
  1899. LFD b5, 5 * SIZE(BO)
  1900. LFD b6, 6 * SIZE(BO)
  1901. LFD b7, 7 * SIZE(BO)
  1902. LFD b8, 8 * SIZE(BO)
  /* CTR--; if that was the only group, skip the steady-state loop and drain in LL(33) */
  1903. bdz LL(33)
  1904. .align 4
  /*
   * LL(32): steady-state loop of the two-column case, 16 elements per
   * column per iteration, software-pipelined.
   * On entry a1..a8 hold elements 1..4 of AO1/AO2 and b1..b8 hold x
   * elements 1..8 (relative to the current pointers).  Each FMADD
   * consumes a register that the following LFD immediately refills with
   * the element four positions ahead, so the loads for the next step
   * overlap the arithmetic of the current one.
   * Accumulators: y01/y03/y09/y11 collect AO1 products, y02/y04/y10/y12
   * collect AO2 products.
   * On exit of an iteration the registers already hold elements 17..20
   * and x 17..24, which become 1..4 and 1..8 after the pointers advance
   * by 16.
   */
  1905. LL(32):
  /* step 1: elements 1..4 with b1..b4; refill a1..a8 with elements 5..8 */
  1906. FMADD y01, a1, b1, y01
  1907. LFD a1, 5 * SIZE(AO1)
  1908. FMADD y02, a2, b1, y02
  1909. LFD a2, 5 * SIZE(AO2)
  1910. FMADD y03, a3, b2, y03
  1911. LFD a3, 6 * SIZE(AO1)
  1912. FMADD y04, a4, b2, y04
  1913. LFD a4, 6 * SIZE(AO2)
  1914. FMADD y09, a5, b3, y09
  1915. LFD a5, 7 * SIZE(AO1)
  1916. FMADD y10, a6, b3, y10
  1917. LFD a6, 7 * SIZE(AO2)
  1918. FMADD y11, a7, b4, y11
  1919. LFD a7, 8 * SIZE(AO1)
  1920. FMADD y12, a8, b4, y12
  1921. LFD a8, 8 * SIZE(AO2)
  1922. LFD b1, 9 * SIZE(BO)
  1923. LFD b2, 10 * SIZE(BO)
  1924. LFD b3, 11 * SIZE(BO)
  1925. LFD b4, 12 * SIZE(BO)
  /* step 2: elements 5..8 with b5..b8; refill with elements 9..12 */
  1926. FMADD y01, a1, b5, y01
  1927. LFD a1, 9 * SIZE(AO1)
  1928. FMADD y02, a2, b5, y02
  1929. LFD a2, 9 * SIZE(AO2)
  1930. FMADD y03, a3, b6, y03
  1931. LFD a3, 10 * SIZE(AO1)
  1932. FMADD y04, a4, b6, y04
  1933. LFD a4, 10 * SIZE(AO2)
  1934. FMADD y09, a5, b7, y09
  1935. LFD a5, 11 * SIZE(AO1)
  1936. FMADD y10, a6, b7, y10
  1937. LFD a6, 11 * SIZE(AO2)
  1938. FMADD y11, a7, b8, y11
  1939. LFD a7, 12 * SIZE(AO1)
  1940. FMADD y12, a8, b8, y12
  1941. LFD a8, 12 * SIZE(AO2)
  1942. LFD b5, 13 * SIZE(BO)
  1943. LFD b6, 14 * SIZE(BO)
  1944. LFD b7, 15 * SIZE(BO)
  1945. LFD b8, 16 * SIZE(BO)
  /* step 3: elements 9..12 with b1..b4; refill with elements 13..16 */
  1946. FMADD y01, a1, b1, y01
  1947. LFD a1, 13 * SIZE(AO1)
  1948. FMADD y02, a2, b1, y02
  1949. LFD a2, 13 * SIZE(AO2)
  1950. FMADD y03, a3, b2, y03
  1951. LFD a3, 14 * SIZE(AO1)
  1952. FMADD y04, a4, b2, y04
  1953. LFD a4, 14 * SIZE(AO2)
  1954. FMADD y09, a5, b3, y09
  1955. LFD a5, 15 * SIZE(AO1)
  1956. FMADD y10, a6, b3, y10
  1957. LFD a6, 15 * SIZE(AO2)
  1958. FMADD y11, a7, b4, y11
  1959. LFD a7, 16 * SIZE(AO1)
  1960. FMADD y12, a8, b4, y12
  1961. LFD a8, 16 * SIZE(AO2)
  1962. LFD b1, 17 * SIZE(BO)
  1963. LFD b2, 18 * SIZE(BO)
  1964. LFD b3, 19 * SIZE(BO)
  1965. LFD b4, 20 * SIZE(BO)
  /* step 4: elements 13..16 with b5..b8; refill with the next group's 1..4 */
  1966. FMADD y01, a1, b5, y01
  1967. LFD a1, 17 * SIZE(AO1)
  1968. FMADD y02, a2, b5, y02
  1969. LFD a2, 17 * SIZE(AO2)
  1970. FMADD y03, a3, b6, y03
  1971. LFD a3, 18 * SIZE(AO1)
  1972. FMADD y04, a4, b6, y04
  1973. LFD a4, 18 * SIZE(AO2)
  1974. FMADD y09, a5, b7, y09
  1975. LFD a5, 19 * SIZE(AO1)
  1976. FMADD y10, a6, b7, y10
  1977. LFD a6, 19 * SIZE(AO2)
  1978. FMADD y11, a7, b8, y11
  1979. LFD a7, 20 * SIZE(AO1)
  1980. FMADD y12, a8, b8, y12
  1981. LFD a8, 20 * SIZE(AO2)
  1982. LFD b5, 21 * SIZE(BO)
  1983. LFD b6, 22 * SIZE(BO)
  1984. LFD b7, 23 * SIZE(BO)
  1985. LFD b8, 24 * SIZE(BO)
  /* advance all three streams by one 16-element group */
  1986. addi AO1, AO1, 16 * SIZE
  1987. addi AO2, AO2, 16 * SIZE
  /* presumably prefetch hints ahead of both column streams (DCBT/PREA defined elsewhere) */
  1988. DCBT(AO1, PREA)
  1989. DCBT(AO2, PREA)
  1990. addi BO, BO, 16 * SIZE
  /* CTR--; repeat while groups remain, else fall through to the drain at LL(33) */
  1991. bdnz LL(32)
  1992. .align 4
  /*
   * LL(33): drain of the two-column pipelined loop (last 16-element
   * group).  Same schedule as LL(32), except that nothing beyond element
   * 16 of either column or of x is loaded: the final four FMADD pairs run
   * without refills.  Afterwards AO1, AO2 and BO are advanced by 16
   * elements and control falls into the tail handling at LL(34).
   */
  1993. LL(33):
  /* step 1: elements 1..4; refill with 5..8 */
  1994. FMADD y01, a1, b1, y01
  1995. LFD a1, 5 * SIZE(AO1)
  1996. FMADD y02, a2, b1, y02
  1997. LFD a2, 5 * SIZE(AO2)
  1998. FMADD y03, a3, b2, y03
  1999. LFD a3, 6 * SIZE(AO1)
  2000. FMADD y04, a4, b2, y04
  2001. LFD a4, 6 * SIZE(AO2)
  2002. FMADD y09, a5, b3, y09
  2003. LFD a5, 7 * SIZE(AO1)
  2004. FMADD y10, a6, b3, y10
  2005. LFD a6, 7 * SIZE(AO2)
  2006. FMADD y11, a7, b4, y11
  2007. LFD a7, 8 * SIZE(AO1)
  2008. FMADD y12, a8, b4, y12
  2009. LFD a8, 8 * SIZE(AO2)
  2010. LFD b1, 9 * SIZE(BO)
  2011. LFD b2, 10 * SIZE(BO)
  2012. LFD b3, 11 * SIZE(BO)
  2013. LFD b4, 12 * SIZE(BO)
  /* step 2: elements 5..8; refill with 9..12 */
  2014. FMADD y01, a1, b5, y01
  2015. LFD a1, 9 * SIZE(AO1)
  2016. FMADD y02, a2, b5, y02
  2017. LFD a2, 9 * SIZE(AO2)
  2018. FMADD y03, a3, b6, y03
  2019. LFD a3, 10 * SIZE(AO1)
  2020. FMADD y04, a4, b6, y04
  2021. LFD a4, 10 * SIZE(AO2)
  2022. FMADD y09, a5, b7, y09
  2023. LFD a5, 11 * SIZE(AO1)
  2024. FMADD y10, a6, b7, y10
  2025. LFD a6, 11 * SIZE(AO2)
  2026. FMADD y11, a7, b8, y11
  2027. LFD a7, 12 * SIZE(AO1)
  2028. FMADD y12, a8, b8, y12
  2029. LFD a8, 12 * SIZE(AO2)
  2030. LFD b5, 13 * SIZE(BO)
  2031. LFD b6, 14 * SIZE(BO)
  2032. LFD b7, 15 * SIZE(BO)
  2033. LFD b8, 16 * SIZE(BO)
  /* step 3: elements 9..12; refill with 13..16 (the last loads of this group) */
  2034. FMADD y01, a1, b1, y01
  2035. LFD a1, 13 * SIZE(AO1)
  2036. FMADD y02, a2, b1, y02
  2037. LFD a2, 13 * SIZE(AO2)
  2038. FMADD y03, a3, b2, y03
  2039. LFD a3, 14 * SIZE(AO1)
  2040. FMADD y04, a4, b2, y04
  2041. LFD a4, 14 * SIZE(AO2)
  2042. FMADD y09, a5, b3, y09
  2043. LFD a5, 15 * SIZE(AO1)
  2044. FMADD y10, a6, b3, y10
  2045. LFD a6, 15 * SIZE(AO2)
  2046. FMADD y11, a7, b4, y11
  2047. LFD a7, 16 * SIZE(AO1)
  2048. FMADD y12, a8, b4, y12
  2049. LFD a8, 16 * SIZE(AO2)
  /* step 4: elements 13..16, arithmetic only */
  2050. FMADD y01, a1, b5, y01
  2051. FMADD y02, a2, b5, y02
  2052. FMADD y03, a3, b6, y03
  2053. FMADD y04, a4, b6, y04
  2054. FMADD y09, a5, b7, y09
  2055. FMADD y10, a6, b7, y10
  2056. FMADD y11, a7, b8, y11
  2057. FMADD y12, a8, b8, y12
  2058. addi AO1, AO1, 16 * SIZE
  2059. addi AO2, AO2, 16 * SIZE
  2060. addi BO, BO, 16 * SIZE
  2061. .align 4
  /*
   * LL(34): tail dispatch for the two-column case.
   * If the low four bits of MIN_N are all clear there is no remainder and
   * control goes to the write-back at LL(38).  Otherwise, when bit 3 is
   * set, eight elements of each column are processed here; smaller
   * remainders are handled by LL(35)..LL(37).
   * In the tail only two accumulators per column are used: y01/y09 for
   * AO1 and y02/y10 for AO2.
   */
  2062. LL(34):
  2063. andi. r0, MIN_N, 15
  2064. ble LL(38)
  2065. andi. r0, MIN_N, 8
  2066. ble LL(35)
  2067. LFD a1, 1 * SIZE(AO1)
  2068. LFD a2, 1 * SIZE(AO2)
  2069. LFD a3, 2 * SIZE(AO1)
  2070. LFD a4, 2 * SIZE(AO2)
  2071. LFD b1, 1 * SIZE(BO)
  2072. LFD b2, 2 * SIZE(BO)
  2073. LFD b3, 3 * SIZE(BO)
  2074. LFD b4, 4 * SIZE(BO)
  2075. LFD a5, 3 * SIZE(AO1)
  2076. LFD a6, 3 * SIZE(AO2)
  2077. LFD a7, 4 * SIZE(AO1)
  2078. LFD a8, 4 * SIZE(AO2)
  2079. LFD b5, 5 * SIZE(BO)
  2080. LFD b6, 6 * SIZE(BO)
  2081. LFD b7, 7 * SIZE(BO)
  2082. LFD b8, 8 * SIZE(BO)
  /* elements 1..4; each a-register is refilled with the element four ahead */
  2083. FMADD y01, a1, b1, y01
  2084. LFD a1, 5 * SIZE(AO1)
  2085. FMADD y02, a2, b1, y02
  2086. LFD a2, 5 * SIZE(AO2)
  2087. FMADD y09, a3, b2, y09
  2088. LFD a3, 6 * SIZE(AO1)
  2089. FMADD y10, a4, b2, y10
  2090. LFD a4, 6 * SIZE(AO2)
  2091. FMADD y01, a5, b3, y01
  2092. LFD a5, 7 * SIZE(AO1)
  2093. FMADD y02, a6, b3, y02
  2094. LFD a6, 7 * SIZE(AO2)
  2095. FMADD y09, a7, b4, y09
  2096. LFD a7, 8 * SIZE(AO1)
  2097. FMADD y10, a8, b4, y10
  2098. LFD a8, 8 * SIZE(AO2)
  /* elements 5..8, with the pointer bumps interleaved */
  2099. FMADD y01, a1, b5, y01
  2100. FMADD y02, a2, b5, y02
  2101. FMADD y09, a3, b6, y09
  2102. FMADD y10, a4, b6, y10
  2103. FMADD y01, a5, b7, y01
  2104. addi AO1, AO1, 8 * SIZE
  2105. FMADD y02, a6, b7, y02
  2106. addi AO2, AO2, 8 * SIZE
  2107. FMADD y09, a7, b8, y09
  2108. addi BO, BO, 8 * SIZE
  2109. FMADD y10, a8, b8, y10
  2110. nop
  2111. .align 4
  /*
   * LL(35): two-column tail step for bit 2 of MIN_N (four elements per
   * column).  Elements 1..4 of AO1/AO2 are multiplied with four x values
   * from BO; AO1 products alternate between y01 and y09, AO2 products
   * between y02 and y10.  AO1, AO2 and BO advance by four elements.
   */
  2112. LL(35):
  2113. andi. r0, MIN_N, 4
  2114. ble LL(36)
  2115. LFD a1, 1 * SIZE(AO1)
  2116. LFD a2, 1 * SIZE(AO2)
  2117. LFD a3, 2 * SIZE(AO1)
  2118. LFD a4, 2 * SIZE(AO2)
  2119. LFD a5, 3 * SIZE(AO1)
  2120. LFD a6, 3 * SIZE(AO2)
  2121. LFD a7, 4 * SIZE(AO1)
  2122. LFD a8, 4 * SIZE(AO2)
  2123. LFD b1, 1 * SIZE(BO)
  2124. LFD b2, 2 * SIZE(BO)
  2125. LFD b3, 3 * SIZE(BO)
  2126. LFD b4, 4 * SIZE(BO)
  2127. FMADD y01, a1, b1, y01
  2128. FMADD y02, a2, b1, y02
  2129. FMADD y09, a3, b2, y09
  2130. FMADD y10, a4, b2, y10
  2131. FMADD y01, a5, b3, y01
  2132. addi AO1, AO1, 4 * SIZE
  2133. FMADD y02, a6, b3, y02
  2134. addi AO2, AO2, 4 * SIZE
  2135. FMADD y09, a7, b4, y09
  2136. addi BO, BO, 4 * SIZE
  2137. FMADD y10, a8, b4, y10
  2138. .align 4
  /*
   * LL(36): two-column tail step for bit 1 of MIN_N (two elements per
   * column).  Element-1 products go to y01 (AO1) and y02 (AO2),
   * element-2 products to y09 and y10; AO1, AO2 and BO advance by two.
   */
  2139. LL(36):
  2140. andi. r0, MIN_N, 2
  2141. ble LL(37)
  2142. LFD a1, 1 * SIZE(AO1)
  2143. LFD a2, 1 * SIZE(AO2)
  2144. LFD b1, 1 * SIZE(BO)
  2145. LFD b2, 2 * SIZE(BO)
  2146. LFD a3, 2 * SIZE(AO1)
  2147. LFD a4, 2 * SIZE(AO2)
  2148. FMADD y01, a1, b1, y01
  2149. FMADD y02, a2, b1, y02
  2150. FMADD y09, a3, b2, y09
  2151. FMADD y10, a4, b2, y10
  2152. addi AO1, AO1, 2 * SIZE
  2153. addi AO2, AO2, 2 * SIZE
  2154. addi BO, BO, 2 * SIZE
  2155. .align 4
  /*
   * LL(37): two-column tail step for bit 0 of MIN_N (one last element).
   * One x value times the next element of AO1 and AO2 is added to y01 and
   * y02.  Pointers are left as they are; LL(38) follows.
   */
  2156. LL(37):
  2157. andi. r0, MIN_N, 1
  2158. ble LL(38)
  2159. LFD a1, 1 * SIZE(AO1)
  2160. LFD b1, 1 * SIZE(BO)
  2161. LFD a2, 1 * SIZE(AO2)
  2162. FMADD y01, a1, b1, y01
  2163. FMADD y02, a2, b1, y02
  2164. .align 4
  /*
   * LL(38): write-back of the two-column results.
   * BO receives a copy of CO (store pointer for the strided path), alpha
   * is loaded, and INCY != SIZE diverts to LL(39).
   * Path below (INCY == SIZE): the four partial sums of each column are
   * reduced pairwise - y01 = (y01 + y03) + (y09 + y11) for the AO1
   * column, y02 = (y02 + y04) + (y10 + y12) for the AO2 column - then the
   * two output elements at 1..2 * SIZE(CO) are updated with alpha * sum.
   */
  2165. LL(38):
  2166. mr BO, CO
  2167. lfd alpha, ALPHA
  2168. cmpi cr0, 0, INCY, SIZE
  2169. bne LL(39)
  2170. LFD a1, 1 * SIZE(CO)
  2171. LFD a2, 2 * SIZE(CO)
  2172. FADD y01, y03, y01
  2173. FADD y02, y04, y02
  2174. FADD y09, y11, y09
  2175. FADD y10, y12, y10
  2176. FADD y01, y09, y01
  2177. FADD y02, y10, y02
  /* a_k = alpha * y0k + a_k */
  2178. FMADD a1, alpha, y01, a1
  2179. FMADD a2, alpha, y02, a2
  2180. STFD a1, 1 * SIZE(CO)
  2181. STFD a2, 2 * SIZE(CO)
  /* step the output pointer past the two elements just written */
  2182. addi CO, CO, 2 * SIZE
  2183. b LL(40)
  2184. .align 4
  /*
   * LL(39): write-back of the two-column results when INCY != SIZE.
   * Reached from LL(38) with BO == CO and alpha already loaded.
   * The LFDUX loads walk CO forward by INCY per element (update-form
   * indexed load, assuming the macro maps to lfdux/lfsux); the STFDUX
   * stores retrace the same addresses through BO.
   * The scaled operands are spelled y01/y02, the registers the reduction
   * just wrote and the same names the unit-stride path in LL(38) uses.
   * Spelling them as raw f0/f1 is only correct while y01/y02 alias
   * exactly those registers.
   */
  2185. LL(39):
  2186. LFDUX a1, CO, INCY
  2187. LFDUX a2, CO, INCY
  /* pairwise reduction of the four partial sums per column */
  2188. FADD y01, y03, y01
  2189. FADD y02, y04, y02
  2190. FADD y09, y11, y09
  2191. FADD y10, y12, y10
  2192. FADD y01, y09, y01
  2193. FADD y02, y10, y02
  /* a_k = alpha * y0k + a_k */
  2194. FMADD a1, alpha, y01, a1
  2195. FMADD a2, alpha, y02, a2
  2196. STFDUX a1, BO, INCY
  2197. STFDUX a2, BO, INCY
  2198. .align 4
  /*
   * LL(40): start of the single-column case, taken when bit 0 of N is
   * set (otherwise go to LL(99)).
   * Setup: AO1 = A and A moves on by one LDA step; BO restarts at XP;
   * y01..y04 and y09..y12 are cleared from FZERO.  CTR = MIN_N >> 4 is
   * the number of 16-element groups; with none, go to the tail at LL(44).
   * Otherwise elements 1..8 of the column (a1..a8) and of x (b1..b8) are
   * preloaded for the pipelined loop.
   */
  2199. LL(40):
  2200. andi. J, N, 1
  2201. ble LL(99)
  2202. mr AO1, A
  2203. add A, A, LDA
  2204. mr BO, XP
  2205. lfd y01, FZERO
  2206. fmr y02, y01
  2207. fmr y03, y01
  2208. fmr y04, y01
  2209. fmr y09, y01
  2210. fmr y10, y01
  2211. fmr y11, y01
  2212. fmr y12, y01
  /* presumably a cache-touch hint on the output vector; DCBT is a macro defined outside this view */
  2213. DCBT(Y1, PREC)
  /* the ble below tests the CR0 result of srawi.; mtspr leaves CR0 alone */
  2214. srawi. r0, MIN_N, 4
  2215. mtspr CTR, r0
  2216. ble LL(44)
  2217. LFD a1, 1 * SIZE(AO1)
  2218. LFD a2, 2 * SIZE(AO1)
  2219. LFD a3, 3 * SIZE(AO1)
  2220. LFD a4, 4 * SIZE(AO1)
  2221. LFD a5, 5 * SIZE(AO1)
  2222. LFD a6, 6 * SIZE(AO1)
  2223. LFD a7, 7 * SIZE(AO1)
  2224. LFD a8, 8 * SIZE(AO1)
  2225. LFD b1, 1 * SIZE(BO)
  2226. LFD b2, 2 * SIZE(BO)
  2227. LFD b3, 3 * SIZE(BO)
  2228. LFD b4, 4 * SIZE(BO)
  2229. LFD b5, 5 * SIZE(BO)
  2230. LFD b6, 6 * SIZE(BO)
  2231. LFD b7, 7 * SIZE(BO)
  2232. LFD b8, 8 * SIZE(BO)
  /* CTR--; if that was the only group, skip the steady-state loop and drain in LL(43) */
  2233. bdz LL(43)
  2234. .align 4
  /*
   * LL(42): steady-state loop of the single-column case, 16 elements per
   * iteration, software-pipelined.
   * On entry a1..a8 / b1..b8 hold elements 1..8 of the column and of x.
   * Each FMADD consumes one (a, b) pair, and the two LFDs after it refill
   * that pair with the element eight positions ahead.  Products rotate
   * through the four accumulators y01..y04.  The nop after every FMADD is
   * kept as in the original schedule (presumably issue-slot padding).
   * After the second pass the registers hold elements 17..24, which
   * become 1..8 once AO1 and BO advance by 16.
   */
  2235. LL(42):
  /* pass 1: elements 1..8; refill with 9..16 */
  2236. FMADD y01, a1, b1, y01
  2237. nop
  2238. LFD a1, 9 * SIZE(AO1)
  2239. LFD b1, 9 * SIZE(BO)
  2240. FMADD y02, a2, b2, y02
  2241. nop
  2242. LFD a2, 10 * SIZE(AO1)
  2243. LFD b2, 10 * SIZE(BO)
  2244. FMADD y03, a3, b3, y03
  2245. nop
  2246. LFD a3, 11 * SIZE(AO1)
  2247. LFD b3, 11 * SIZE(BO)
  2248. FMADD y04, a4, b4, y04
  2249. nop
  2250. LFD a4, 12 * SIZE(AO1)
  2251. LFD b4, 12 * SIZE(BO)
  2252. FMADD y01, a5, b5, y01
  2253. nop
  2254. LFD a5, 13 * SIZE(AO1)
  2255. LFD b5, 13 * SIZE(BO)
  2256. FMADD y02, a6, b6, y02
  2257. nop
  2258. LFD a6, 14 * SIZE(AO1)
  2259. LFD b6, 14 * SIZE(BO)
  2260. FMADD y03, a7, b7, y03
  2261. nop
  2262. LFD a7, 15 * SIZE(AO1)
  2263. LFD b7, 15 * SIZE(BO)
  2264. FMADD y04, a8, b8, y04
  2265. nop
  2266. LFD a8, 16 * SIZE(AO1)
  2267. LFD b8, 16 * SIZE(BO)
  /* pass 2: elements 9..16; refill with the next group's 1..8 */
  2268. FMADD y01, a1, b1, y01
  2269. nop
  2270. LFD a1, 17 * SIZE(AO1)
  2271. LFD b1, 17 * SIZE(BO)
  2272. FMADD y02, a2, b2, y02
  2273. nop
  2274. LFD a2, 18 * SIZE(AO1)
  2275. LFD b2, 18 * SIZE(BO)
  2276. FMADD y03, a3, b3, y03
  2277. nop
  2278. LFD a3, 19 * SIZE(AO1)
  2279. LFD b3, 19 * SIZE(BO)
  2280. FMADD y04, a4, b4, y04
  2281. nop
  2282. LFD a4, 20 * SIZE(AO1)
  2283. LFD b4, 20 * SIZE(BO)
  2284. FMADD y01, a5, b5, y01
  2285. nop
  2286. LFD a5, 21 * SIZE(AO1)
  2287. LFD b5, 21 * SIZE(BO)
  2288. FMADD y02, a6, b6, y02
  2289. nop
  2290. LFD a6, 22 * SIZE(AO1)
  2291. LFD b6, 22 * SIZE(BO)
  2292. FMADD y03, a7, b7, y03
  2293. nop
  2294. LFD a7, 23 * SIZE(AO1)
  2295. LFD b7, 23 * SIZE(BO)
  2296. FMADD y04, a8, b8, y04
  2297. nop
  2298. LFD a8, 24 * SIZE(AO1)
  2299. LFD b8, 24 * SIZE(BO)
  2300. addi AO1, AO1, 16 * SIZE
  2301. addi BO, BO, 16 * SIZE
  /* presumably a prefetch hint ahead of the column stream (DCBT/PREA defined elsewhere) */
  2302. DCBT(AO1, PREA)
  /* CTR--; repeat while groups remain, else fall through to the drain at LL(43) */
  2303. bdnz LL(42)
  2304. .align 4
  /*
   * LL(43): drain of the single-column pipelined loop (last 16-element
   * group).  Pass 1 matches LL(42) (elements 1..8, refill with 9..16);
   * pass 2 consumes elements 9..16 without loading anything further.
   * AO1 and BO advance by 16 elements, interleaved with the last FMADDs,
   * and control falls into the tail handling at LL(44).
   */
  2305. LL(43):
  2306. FMADD y01, a1, b1, y01
  2307. nop
  2308. LFD a1, 9 * SIZE(AO1)
  2309. LFD b1, 9 * SIZE(BO)
  2310. FMADD y02, a2, b2, y02
  2311. nop
  2312. LFD a2, 10 * SIZE(AO1)
  2313. LFD b2, 10 * SIZE(BO)
  2314. FMADD y03, a3, b3, y03
  2315. nop
  2316. LFD a3, 11 * SIZE(AO1)
  2317. LFD b3, 11 * SIZE(BO)
  2318. FMADD y04, a4, b4, y04
  2319. nop
  2320. LFD a4, 12 * SIZE(AO1)
  2321. LFD b4, 12 * SIZE(BO)
  2322. FMADD y01, a5, b5, y01
  2323. nop
  2324. LFD a5, 13 * SIZE(AO1)
  2325. LFD b5, 13 * SIZE(BO)
  2326. FMADD y02, a6, b6, y02
  2327. nop
  2328. LFD a6, 14 * SIZE(AO1)
  2329. LFD b6, 14 * SIZE(BO)
  2330. FMADD y03, a7, b7, y03
  2331. nop
  2332. LFD a7, 15 * SIZE(AO1)
  2333. LFD b7, 15 * SIZE(BO)
  2334. FMADD y04, a8, b8, y04
  2335. nop
  2336. LFD a8, 16 * SIZE(AO1)
  2337. LFD b8, 16 * SIZE(BO)
  /* pass 2: elements 9..16, arithmetic only */
  2338. FMADD y01, a1, b1, y01
  2339. FMADD y02, a2, b2, y02
  2340. FMADD y03, a3, b3, y03
  2341. FMADD y04, a4, b4, y04
  2342. FMADD y01, a5, b5, y01
  2343. addi AO1, AO1, 16 * SIZE
  2344. FMADD y02, a6, b6, y02
  2345. addi BO, BO, 16 * SIZE
  2346. FMADD y03, a7, b7, y03
  2347. nop
  2348. FMADD y04, a8, b8, y04
  2349. nop
  2350. .align 4
  /*
   * LL(44): tail dispatch for the single-column case.
   * If the low four bits of MIN_N are all clear, go to the write-back at
   * LL(48).  Otherwise, when bit 3 is set, eight elements are processed
   * here: a1..a8 and b1..b8 are loaded and the eight products rotate
   * through y01..y04.  AO1 and BO advance by eight elements.
   */
  2351. LL(44):
  2352. andi. r0, MIN_N, 15
  2353. ble LL(48)
  2354. andi. r0, MIN_N, 8
  2355. ble LL(45)
  2356. LFD a1, 1 * SIZE(AO1)
  2357. LFD a2, 2 * SIZE(AO1)
  2358. LFD a3, 3 * SIZE(AO1)
  2359. LFD a4, 4 * SIZE(AO1)
  2360. LFD b1, 1 * SIZE(BO)
  2361. LFD b2, 2 * SIZE(BO)
  2362. LFD b3, 3 * SIZE(BO)
  2363. LFD b4, 4 * SIZE(BO)
  2364. LFD a5, 5 * SIZE(AO1)
  2365. LFD a6, 6 * SIZE(AO1)
  2366. LFD a7, 7 * SIZE(AO1)
  2367. LFD a8, 8 * SIZE(AO1)
  2368. LFD b5, 5 * SIZE(BO)
  2369. LFD b6, 6 * SIZE(BO)
  2370. LFD b7, 7 * SIZE(BO)
  2371. LFD b8, 8 * SIZE(BO)
  2372. FMADD y01, a1, b1, y01
  2373. FMADD y02, a2, b2, y02
  2374. FMADD y03, a3, b3, y03
  2375. FMADD y04, a4, b4, y04
  2376. FMADD y01, a5, b5, y01
  2377. addi AO1, AO1, 8 * SIZE
  2378. FMADD y02, a6, b6, y02
  2379. addi BO, BO, 8 * SIZE
  2380. FMADD y03, a7, b7, y03
  2381. nop
  2382. FMADD y04, a8, b8, y04
  2383. nop
  2384. .align 4
  /*
   * LL(45): single-column tail step for bit 2 of MIN_N (four elements).
   * Elements 1..4 of AO1 are multiplied with four x values from BO and
   * accumulated into y01..y04; AO1 and BO advance by four elements.
   * Only AO1 and BO are live in the single-column path (the setup at
   * LL(40) never initialises AO2), so the slot that used to bump AO2 is
   * a nop: the instruction count and interleaving stay the same, and no
   * unrelated pointer is modified.
   */
  2385. LL(45):
  2386. andi. r0, MIN_N, 4
  2387. ble LL(46)
  2388. LFD a1, 1 * SIZE(AO1)
  2389. LFD b1, 1 * SIZE(BO)
  2390. LFD a2, 2 * SIZE(AO1)
  2391. LFD b2, 2 * SIZE(BO)
  2392. LFD a3, 3 * SIZE(AO1)
  2393. LFD b3, 3 * SIZE(BO)
  2394. LFD a4, 4 * SIZE(AO1)
  2395. LFD b4, 4 * SIZE(BO)
  2396. FMADD y01, a1, b1, y01
  2397. addi AO1, AO1, 4 * SIZE
  2398. FMADD y02, a2, b2, y02
  2399. nop
  2400. FMADD y03, a3, b3, y03
  2401. addi BO, BO, 4 * SIZE
  2402. FMADD y04, a4, b4, y04
  2403. nop
  2404. .align 4
  /*
   * LL(46): single-column tail step for bit 1 of MIN_N (two elements).
   * Two (column, x) pairs are accumulated into y01 and y02; AO1 and BO
   * advance by two elements.
   */
  2405. LL(46):
  2406. andi. r0, MIN_N, 2
  2407. ble LL(47)
  2408. LFD a1, 1 * SIZE(AO1)
  2409. LFD b1, 1 * SIZE(BO)
  2410. LFD a2, 2 * SIZE(AO1)
  2411. LFD b2, 2 * SIZE(BO)
  2412. FMADD y01, a1, b1, y01
  2413. addi AO1, AO1, 2 * SIZE
  2414. FMADD y02, a2, b2, y02
  2415. addi BO, BO, 2 * SIZE
  2416. .align 4
  /*
   * LL(47): single-column tail step for bit 0 of MIN_N (one last
   * element).  The final (column, x) product is added to y01; no pointer
   * is advanced.
   */
  2417. LL(47):
  2418. andi. r0, MIN_N, 1
  2419. ble LL(48)
  2420. LFD a1, 1 * SIZE(AO1)
  2421. LFD b1, 1 * SIZE(BO)
  2422. FMADD y01, a1, b1, y01
  2423. .align 4
  /*
   * LL(48): write-back of the single-column result.
   * BO receives a copy of CO, alpha is loaded, and INCY != SIZE diverts
   * to LL(49).  Path below (INCY == SIZE): the four partial sums are
   * reduced as y01 = (y01 + y02) + (y03 + y04) and the output element at
   * 1 * SIZE(CO) becomes elem + alpha * y01.  CO itself is not advanced
   * on this path before branching to LL(99).
   */
  2424. LL(48):
  2425. mr BO, CO
  2426. lfd alpha, ALPHA
  2427. cmpi cr0, 0, INCY, SIZE
  2428. bne LL(49)
  2429. LFD a1, 1 * SIZE(CO)
  2430. FADD y01, y02, y01
  2431. FADD y03, y04, y03
  2432. FADD y01, y03, y01
  2433. FMADD a1, alpha, y01, a1
  2434. STFD a1, 1 * SIZE(CO)
  2435. b LL(99)
  2436. .align 4
  /*
   * LL(49): write-back of the single-column result when INCY != SIZE.
   * Reached from LL(48) with BO == CO and alpha already loaded.
   * LFDUX steps CO forward by INCY and loads the output element
   * (update-form indexed load, assuming the macro maps to lfdux/lfsux);
   * STFDUX stores to the same address through BO.
   * The scaled operand is spelled y01, the register the reduction just
   * wrote and the same name the unit-stride path in LL(48) uses.
   * Spelling it as raw f0 is only correct while y01 aliases that
   * register.
   */
  2437. LL(49):
  2438. LFDUX a1, CO, INCY
  /* y01 = (y01 + y02) + (y03 + y04) */
  2439. FADD y01, y02, y01
  2440. FADD y03, y04, y03
  2441. FADD y01, y03, y01
  /* a1 = alpha * y01 + a1 */
  2442. FMADD a1, alpha, y01, a1
  2443. STFDUX a1, BO, INCY
  2444. .align 4
  /*
   * LL(99): end of one IS pass.  A is reduced by PLDA_M (subf computes
   * A = A - PLDA_M), IS is advanced by P, and the outer loop at
   * LL(ISLoop) is re-entered while IS < M (signed compare).
   * NOTE(review): PLDA_M and LL(ISLoop) are defined outside this view;
   * the subtraction presumably rewinds the column walk and steps A to the
   * next row panel - confirm against the loop head.
   */
  2445. LL(99):
  2446. subf A, PLDA_M, A
  2447. addi IS, IS, P
  2448. cmp cr0, 0, IS, M
  2449. blt LL(ISLoop)
  2450. .align 4
  /*
   * LL(999): common exit.  Sets r3 to 0 (the integer return-value
   * register in the PowerPC ABIs), reloads f14..f31 and r14..r29 from the
   * stack frame, releases the frame by adding STACKSIZE to SP, and
   * returns with blr.
   * Frame layout as read back here: f14..f31 at SP + 0 .. SP + 136 in
   * 8-byte slots; r14..r29 from SP + 160 upward, in 8-byte slots when
   * __64BIT__ is defined and 4-byte slots otherwise.  These offsets are
   * presumably mirrored by the stores in the prologue, which is outside
   * this view.
   */
  2451. LL(999):
  2452. li r3, 0
  2453. lfd f14, 0(SP)
  2454. lfd f15, 8(SP)
  2455. lfd f16, 16(SP)
  2456. lfd f17, 24(SP)
  2457. lfd f18, 32(SP)
  2458. lfd f19, 40(SP)
  2459. lfd f20, 48(SP)
  2460. lfd f21, 56(SP)
  2461. lfd f22, 64(SP)
  2462. lfd f23, 72(SP)
  2463. lfd f24, 80(SP)
  2464. lfd f25, 88(SP)
  2465. lfd f26, 96(SP)
  2466. lfd f27, 104(SP)
  2467. lfd f28, 112(SP)
  2468. lfd f29, 120(SP)
  2469. lfd f30, 128(SP)
  2470. lfd f31, 136(SP)
  /* general-purpose registers: doubleword loads on 64-bit builds */
  2471. #ifdef __64BIT__
  2472. ld r14, 160(SP)
  2473. ld r15, 168(SP)
  2474. ld r16, 176(SP)
  2475. ld r17, 184(SP)
  2476. ld r18, 192(SP)
  2477. ld r19, 200(SP)
  2478. ld r20, 208(SP)
  2479. ld r21, 216(SP)
  2480. ld r22, 224(SP)
  2481. ld r23, 232(SP)
  2482. ld r24, 240(SP)
  2483. ld r25, 248(SP)
  2484. ld r26, 256(SP)
  2485. ld r27, 264(SP)
  2486. ld r28, 272(SP)
  2487. ld r29, 280(SP)
  /* word loads on 32-bit builds */
  2488. #else
  2489. lwz r14, 160(SP)
  2490. lwz r15, 164(SP)
  2491. lwz r16, 168(SP)
  2492. lwz r17, 172(SP)
  2493. lwz r18, 176(SP)
  2494. lwz r19, 180(SP)
  2495. lwz r20, 184(SP)
  2496. lwz r21, 188(SP)
  2497. lwz r22, 192(SP)
  2498. lwz r23, 196(SP)
  2499. lwz r24, 200(SP)
  2500. lwz r25, 204(SP)
  2501. lwz r26, 208(SP)
  2502. lwz r27, 212(SP)
  2503. lwz r28, 216(SP)
  2504. lwz r29, 220(SP)
  2505. #endif
  /* pop the frame and return */
  2506. addi SP, SP, STACKSIZE
  2507. blr
  2508. EPILOGUE
  2509. #endif