You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

gemv_t.S 57 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
727782779278027812782278327842785278627872788278927902791279227932794279527962797279827992800280128022803280428052806280728082809281028112812281328142815281628172818281928202821282228232824282528262827282828292830283128322833283428352836283728382839284028412842284328442845284628472848284928502851285228532854285528562857285828592860286128622863286428652866286728682869287028712872287328742875287628772878287928802881288228832884288528862887288828892890289128922893289428952896289728982899290029012902290329042905290629072908290929102911291229132914291529162917291829192920292129222923292429252926292729282929293029312932293329342935293629372938293929402941294229432944294529462947294829492950295129522953295429552956295729582959296029612962296329642965296629672968296929702971297229732974
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #if defined(linux) || defined(__FreeBSD__) /* Argument-to-register map used when building for Linux or FreeBSD. */
  41. #ifndef __64BIT__ /* 32-bit layout */
  42. #define M r3 /* extent that LL(ISLoop) walks in chunks of at most P */
  43. #define N r4 /* count handled eight at a time (J = N >> 3 at LL(10)) */
  44. #define A r6 /* matrix cursor; AO1..AO8 are derived from it at LL(11) */
  45. #define LDA r7 /* spacing between the AO1..AO8 streams; shifted left by BASE_SHIFT in the prologue */
  46. #define X r8 /* vector read by the copy loops and used as the base of XP */
  47. #define INCX r9 /* stride of X; shifted left by BASE_SHIFT in the prologue */
  48. #define Y r10 /* copied into CO at LL(10); presumably the output vector - confirm against the unseen tail */
  49. #define INCY r5 /* not an incoming register here: loaded from FRAMESLOT(0) in the prologue */
  50. #else /* 64-bit layout */
  51. #define M r3
  52. #define N r4
  53. #define A r7
  54. #define LDA r8
  55. #define X r9
  56. #define INCX r10
  57. #define Y r5 /* loaded from FRAMESLOT(0) in the prologue */
  58. #define INCY r6 /* loaded from FRAMESLOT(1) in the prologue */
  59. #endif
  60. #endif
  61. #if defined(_AIX) || defined(__APPLE__) /* Argument-to-register map used when building for AIX or Apple targets. */
  62. #if !defined(__64BIT__) && defined(DOUBLE) /* 32-bit build with DOUBLE defined: A, LDA and X sit one register later than in the other layout */
  63. #define M r3
  64. #define N r4
  65. #define A r8
  66. #define LDA r9
  67. #define X r10
  68. #define INCX r5 /* loaded from FRAMESLOT(0) in the prologue */
  69. #define Y r6 /* loaded from FRAMESLOT(1) in the prologue */
  70. #define INCY r7 /* loaded from FRAMESLOT(2) in the prologue */
  71. #else /* every other AIX / Apple configuration */
  72. #define M r3
  73. #define N r4
  74. #define A r7
  75. #define LDA r8
  76. #define X r9
  77. #define INCX r10
  78. #define Y r5 /* loaded from FRAMESLOT(0) in the prologue */
  79. #define INCY r6 /* loaded from FRAMESLOT(1) in the prologue */
  80. #endif
  81. #endif
  82. #define BUFFER r11 /* Working integer registers. BUFFER is loaded from the caller's frame and receives the packed copy of X when INCX != SIZE. */
  83. #define XP r12 /* base of the current X chunk (X + IS * SIZE, or BUFFER); also briefly holds P in the prologue */
  84. #define AO1 r14 /* AO1..AO8: eight matrix read cursors, each LDA beyond the previous one; r14-r29 are saved by the prologue */
  85. #define AO2 r15
  86. #define AO3 r16
  87. #define AO4 r17
  88. #define AO5 r18
  89. #define AO6 r19
  90. #define AO7 r20
  91. #define AO8 r21
  92. #define MIN_N r22 /* length of the current chunk: M - IS, capped at P */
  93. #define J r23 /* N >> 3, set at LL(10) */
  94. #define CO r24 /* store cursor of the X copy loops; reset to Y at LL(10) */
  95. #define PREA r25 /* PREFETCHSIZE_A * SIZE, the offset passed to DCBT on the AO streams */
  96. #define PREC r26 /* PREFETCHSIZE_C * SIZE, the offset passed to DCBT on Y1 */
  97. #define BO r27 /* read cursor into the X chunk; starts at XP and advances 16 * SIZE per LL(12) pass */
  98. #define PLDA_M r28 /* (LDA * N - P) shifted left by BASE_SHIFT; its consumer lies outside the visible portion */
  99. #define IS r29 /* chunk start offset: initialised to 0 and subtracted from M in LL(ISLoop) */
  100. #define Y1 CO /* alias of CO, used as the DCBT target at LL(11) */
  101. #if defined(PPCG4) /* Per-CPU prefetch distances; both values are multiplied by SIZE before being loaded into PREA / PREC. */
  102. #define PREFETCHSIZE_A 42
  103. #define PREFETCHSIZE_C 16
  104. #endif
  105. #if defined(PPC440) || defined(PPC440FP2)
  106. #define PREFETCHSIZE_A 42
  107. #define PREFETCHSIZE_C 16
  108. #endif
  109. #ifdef PPC970
  110. #define PREFETCHSIZE_A 42
  111. #define PREFETCHSIZE_C 16
  112. #endif
  113. #ifdef CELL
  114. #define PREFETCHSIZE_A 42
  115. #define PREFETCHSIZE_C 16
  116. #endif
  117. #ifdef POWER3
  118. #define PREFETCHSIZE_A 16
  119. #define PREFETCHSIZE_C 16
  120. #endif
  121. #ifdef POWER4
  122. #define PREFETCHSIZE_A 48
  123. #define PREFETCHSIZE_C 16
  124. #endif
  125. #ifdef POWER5
  126. #define PREFETCHSIZE_A 40
  127. #define PREFETCHSIZE_C 8
  128. #endif
  129. #ifdef POWER6
  130. #define PREFETCHSIZE_A 96
  131. #define PREFETCHSIZE_C 8
  132. #endif
  133. #ifdef POWER8
  134. #define PREFETCHSIZE_A 96
  135. #define PREFETCHSIZE_C 8
  136. #endif /* NOTE(review): no fallback is visible here when none of the CPU macros above is defined - confirm the build always selects one. */
  137. #define y01 f0 /* y01..y16 (f0-f15): sixteen accumulators, cleared from FZERO at LL(11) and updated by the FMADD chains */
  138. #define y02 f1
  139. #define y03 f2
  140. #define y04 f3
  141. #define y05 f4
  142. #define y06 f5
  143. #define y07 f6
  144. #define y08 f7
  145. #define y09 f8
  146. #define y10 f9
  147. #define y11 f10
  148. #define y12 f11
  149. #define y13 f12
  150. #define y14 f13
  151. #define y15 f14 /* f14-f31 are saved on the stack by the prologue */
  152. #define y16 f15
  153. #define a1 f16 /* a1..a8 (f16-f23): values loaded from the AO1..AO8 streams */
  154. #define a2 f17
  155. #define a3 f18
  156. #define a4 f19
  157. #define a5 f20
  158. #define a6 f21
  159. #define a7 f22
  160. #define a8 f23
  161. #define b1 f24 /* b1..b8 (f24-f31): values loaded from the X chunk through BO */
  162. #define b2 f25
  163. #define b3 f26
  164. #define b4 f27
  165. #define b5 f28
  166. #define b6 f29
  167. #define b7 f30
  168. #define b8 f31
  169. #define alpha f31 /* shares f31 with b8; not referenced in the visible portion, so the two must not be live together - confirm in the unseen tail */
  170. #ifndef NEEDPARAM
  171. #define P 2048
  172. #ifndef __64BIT__
  173. #define STACKSIZE 224
  174. #else
  175. #define STACKSIZE 288
  176. #endif
  177. #define FZERO 144(SP)
  178. #define ALPHA 152(SP)
  179. PROLOGUE
  180. PROFCODE
  181. addi SP, SP, -STACKSIZE
  182. li r0, 0
  183. stfd f14, 0(SP)
  184. stfd f15, 8(SP)
  185. stfd f16, 16(SP)
  186. stfd f17, 24(SP)
  187. stfd f18, 32(SP)
  188. stfd f19, 40(SP)
  189. stfd f20, 48(SP)
  190. stfd f21, 56(SP)
  191. stfd f22, 64(SP)
  192. stfd f23, 72(SP)
  193. stfd f24, 80(SP)
  194. stfd f25, 88(SP)
  195. stfd f26, 96(SP)
  196. stfd f27, 104(SP)
  197. stfd f28, 112(SP)
  198. stfd f29, 120(SP)
  199. stfd f30, 128(SP)
  200. stfd f31, 136(SP)
  201. #ifdef __64BIT__
  202. std r0, FZERO
  203. stfd f1, ALPHA
  204. std r14, 160(SP)
  205. std r15, 168(SP)
  206. std r16, 176(SP)
  207. std r17, 184(SP)
  208. std r18, 192(SP)
  209. std r19, 200(SP)
  210. std r20, 208(SP)
  211. std r21, 216(SP)
  212. std r22, 224(SP)
  213. std r23, 232(SP)
  214. std r24, 240(SP)
  215. std r25, 248(SP)
  216. std r26, 256(SP)
  217. std r27, 264(SP)
  218. std r28, 272(SP)
  219. std r29, 280(SP)
  220. #else
  221. stw r0, 0 + FZERO
  222. stw r0, 4 + FZERO
  223. stfd f1, ALPHA
  224. stw r14, 160(SP)
  225. stw r15, 164(SP)
  226. stw r16, 168(SP)
  227. stw r17, 172(SP)
  228. stw r18, 176(SP)
  229. stw r19, 180(SP)
  230. stw r20, 184(SP)
  231. stw r21, 188(SP)
  232. stw r22, 192(SP)
  233. stw r23, 196(SP)
  234. stw r24, 200(SP)
  235. stw r25, 204(SP)
  236. stw r26, 208(SP)
  237. stw r27, 212(SP)
  238. stw r28, 216(SP)
  239. stw r29, 220(SP)
  240. #endif
  241. #if defined(linux) || defined(__FreeBSD__)
  242. #ifndef __64BIT__
  243. lwz INCY, FRAMESLOT(0) + STACKSIZE(SP)
  244. lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP)
  245. #else
  246. ld Y, FRAMESLOT(0) + STACKSIZE(SP)
  247. ld INCY, FRAMESLOT(1) + STACKSIZE(SP)
  248. ld BUFFER, FRAMESLOT(2) + STACKSIZE(SP)
  249. #endif
  250. #endif
  251. #if defined(_AIX) || defined(__APPLE__)
  252. #ifndef __64BIT__
  253. #ifdef DOUBLE
  254. lwz INCX, FRAMESLOT(0) + STACKSIZE(SP)
  255. lwz Y, FRAMESLOT(1) + STACKSIZE(SP)
  256. lwz INCY, FRAMESLOT(2) + STACKSIZE(SP)
  257. lwz BUFFER, FRAMESLOT(3) + STACKSIZE(SP)
  258. #else
  259. lwz Y, FRAMESLOT(0) + STACKSIZE(SP)
  260. lwz INCY, FRAMESLOT(1) + STACKSIZE(SP)
  261. lwz BUFFER, FRAMESLOT(2) + STACKSIZE(SP)
  262. #endif
  263. #else
  264. ld Y, FRAMESLOT(0) + STACKSIZE(SP)
  265. ld INCY, FRAMESLOT(1) + STACKSIZE(SP)
  266. ld BUFFER, FRAMESLOT(2) + STACKSIZE(SP)
  267. #endif
  268. #endif
  269. mullw PLDA_M, LDA, N
  270. li XP, P
  271. subf PLDA_M, XP, PLDA_M
  272. slwi PLDA_M, PLDA_M, BASE_SHIFT
  273. slwi LDA, LDA, BASE_SHIFT
  274. slwi INCX, INCX, BASE_SHIFT
  275. slwi INCY, INCY, BASE_SHIFT
  276. subf Y, INCY, Y
  277. li IS, 0
  278. addi A, A, -SIZE
  279. li PREA, PREFETCHSIZE_A * SIZE
  280. li PREC, PREFETCHSIZE_C * SIZE
  281. cmpi cr0, 0, M, 0
  282. ble LL(999)
  283. cmpi cr0, 0, N, 0
  284. ble LL(999)
  285. .align 4
  286. LL(ISLoop):
  287. subf MIN_N, IS, M
  288. slwi r0, IS, BASE_SHIFT
  289. cmpi cr0, 0, MIN_N, P
  290. ble+ LL(min_nP)
  291. li MIN_N, P
  292. LL(min_nP):
  293. add XP, X, r0
  294. cmpi cr0, 0, INCX, SIZE
  295. beq LL(10)
  296. mr XP, BUFFER
  297. addi CO, BUFFER, -SIZE
  298. srawi. r0, MIN_N, 3
  299. mtspr CTR, r0
  300. ble LL(CopyRemain)
  301. .align 4
  302. LL(CopyKernel):
  303. LFD f0, 0 * SIZE(X)
  304. add X, X, INCX
  305. LFD f1, 0 * SIZE(X)
  306. add X, X, INCX
  307. LFD f2, 0 * SIZE(X)
  308. add X, X, INCX
  309. LFD f3, 0 * SIZE(X)
  310. add X, X, INCX
  311. LFD f4, 0 * SIZE(X)
  312. add X, X, INCX
  313. LFD f5, 0 * SIZE(X)
  314. add X, X, INCX
  315. LFD f6, 0 * SIZE(X)
  316. add X, X, INCX
  317. LFD f7, 0 * SIZE(X)
  318. add X, X, INCX
  319. STFD f0, 1 * SIZE(CO)
  320. STFD f1, 2 * SIZE(CO)
  321. STFD f2, 3 * SIZE(CO)
  322. STFD f3, 4 * SIZE(CO)
  323. STFD f4, 5 * SIZE(CO)
  324. STFD f5, 6 * SIZE(CO)
  325. STFD f6, 7 * SIZE(CO)
  326. STFDU f7, 8 * SIZE(CO)
  327. bdnz LL(CopyKernel)
  328. .align 4
  329. LL(CopyRemain):
  330. andi. r0, MIN_N, 7
  331. mtspr CTR, r0
  332. ble LL(10)
  333. .align 4
  334. LL(CopySub):
  335. LFD f0, 0 * SIZE(X)
  336. add X, X, INCX
  337. STFDU f0, 1 * SIZE(CO)
  338. bdnz LL(CopySub)
  339. .align 4
  340. LL(10):
  341. mr CO, Y
  342. addi XP, XP, -SIZE
  343. srawi. J, N, 3
  344. ble LL(20)
  345. .align 4
  346. LL(11):
  347. mr AO1, A
  348. add AO2, A, LDA
  349. add AO3, AO2, LDA
  350. add AO4, AO3, LDA
  351. add AO5, AO4, LDA
  352. add AO6, AO5, LDA
  353. add AO7, AO6, LDA
  354. add AO8, AO7, LDA
  355. add A, AO8, LDA
  356. mr BO, XP
  357. lfd y01, FZERO
  358. fmr y02, y01
  359. fmr y03, y01
  360. fmr y04, y01
  361. fmr y05, y01
  362. fmr y06, y01
  363. fmr y07, y01
  364. fmr y08, y01
  365. fmr y09, y01
  366. fmr y10, y01
  367. fmr y11, y01
  368. fmr y12, y01
  369. fmr y13, y01
  370. fmr y14, y01
  371. fmr y15, y01
  372. fmr y16, y01
  373. DCBT(Y1, PREC)
  374. srawi. r0, MIN_N, 4
  375. mtspr CTR, r0
  376. ble LL(14)
  377. LFD a1, 1 * SIZE(AO1)
  378. LFD a2, 1 * SIZE(AO2)
  379. LFD a3, 1 * SIZE(AO3)
  380. LFD a4, 1 * SIZE(AO4)
  381. LFD a5, 1 * SIZE(AO5)
  382. LFD a6, 1 * SIZE(AO6)
  383. LFD a7, 1 * SIZE(AO7)
  384. LFD a8, 1 * SIZE(AO8)
  385. LFD b1, 1 * SIZE(BO)
  386. LFD b2, 2 * SIZE(BO)
  387. LFD b3, 3 * SIZE(BO)
  388. LFD b4, 4 * SIZE(BO)
  389. LFD b5, 5 * SIZE(BO)
  390. LFD b6, 6 * SIZE(BO)
  391. LFD b7, 7 * SIZE(BO)
  392. LFD b8, 8 * SIZE(BO)
  393. bdz LL(13)
  394. .align 4
  395. LL(12):
  396. FMADD y01, a1, b1, y01
  397. LFD a1, 2 * SIZE(AO1)
  398. FMADD y02, a2, b1, y02
  399. LFD a2, 2 * SIZE(AO2)
  400. FMADD y03, a3, b1, y03
  401. LFD a3, 2 * SIZE(AO3)
  402. FMADD y04, a4, b1, y04
  403. LFD a4, 2 * SIZE(AO4)
  404. FMADD y05, a5, b1, y05
  405. LFD a5, 2 * SIZE(AO5)
  406. FMADD y06, a6, b1, y06
  407. LFD a6, 2 * SIZE(AO6)
  408. FMADD y07, a7, b1, y07
  409. LFD a7, 2 * SIZE(AO7)
  410. FMADD y08, a8, b1, y08
  411. LFD a8, 2 * SIZE(AO8)
  412. FMADD y09, a1, b2, y09
  413. LFD a1, 3 * SIZE(AO1)
  414. FMADD y10, a2, b2, y10
  415. LFD a2, 3 * SIZE(AO2)
  416. FMADD y11, a3, b2, y11
  417. LFD a3, 3 * SIZE(AO3)
  418. FMADD y12, a4, b2, y12
  419. LFD a4, 3 * SIZE(AO4)
  420. FMADD y13, a5, b2, y13
  421. LFD a5, 3 * SIZE(AO5)
  422. FMADD y14, a6, b2, y14
  423. LFD a6, 3 * SIZE(AO6)
  424. FMADD y15, a7, b2, y15
  425. LFD a7, 3 * SIZE(AO7)
  426. FMADD y16, a8, b2, y16
  427. LFD a8, 3 * SIZE(AO8)
  428. FMADD y01, a1, b3, y01
  429. LFD a1, 4 * SIZE(AO1)
  430. FMADD y02, a2, b3, y02
  431. LFD a2, 4 * SIZE(AO2)
  432. FMADD y03, a3, b3, y03
  433. LFD a3, 4 * SIZE(AO3)
  434. FMADD y04, a4, b3, y04
  435. LFD a4, 4 * SIZE(AO4)
  436. FMADD y05, a5, b3, y05
  437. LFD a5, 4 * SIZE(AO5)
  438. FMADD y06, a6, b3, y06
  439. LFD a6, 4 * SIZE(AO6)
  440. FMADD y07, a7, b3, y07
  441. LFD a7, 4 * SIZE(AO7)
  442. FMADD y08, a8, b3, y08
  443. LFD a8, 4 * SIZE(AO8)
  444. FMADD y09, a1, b4, y09
  445. LFD a1, 5 * SIZE(AO1)
  446. FMADD y10, a2, b4, y10
  447. LFD a2, 5 * SIZE(AO2)
  448. FMADD y11, a3, b4, y11
  449. LFD a3, 5 * SIZE(AO3)
  450. FMADD y12, a4, b4, y12
  451. LFD a4, 5 * SIZE(AO4)
  452. FMADD y13, a5, b4, y13
  453. LFD a5, 5 * SIZE(AO5)
  454. FMADD y14, a6, b4, y14
  455. LFD a6, 5 * SIZE(AO6)
  456. FMADD y15, a7, b4, y15
  457. LFD a7, 5 * SIZE(AO7)
  458. FMADD y16, a8, b4, y16
  459. LFD a8, 5 * SIZE(AO8)
  460. LFD b1, 9 * SIZE(BO)
  461. LFD b2, 10 * SIZE(BO)
  462. LFD b3, 11 * SIZE(BO)
  463. LFD b4, 12 * SIZE(BO)
  464. FMADD y01, a1, b5, y01
  465. LFD a1, 6 * SIZE(AO1)
  466. FMADD y02, a2, b5, y02
  467. LFD a2, 6 * SIZE(AO2)
  468. FMADD y03, a3, b5, y03
  469. LFD a3, 6 * SIZE(AO3)
  470. FMADD y04, a4, b5, y04
  471. LFD a4, 6 * SIZE(AO4)
  472. FMADD y05, a5, b5, y05
  473. LFD a5, 6 * SIZE(AO5)
  474. FMADD y06, a6, b5, y06
  475. LFD a6, 6 * SIZE(AO6)
  476. FMADD y07, a7, b5, y07
  477. LFD a7, 6 * SIZE(AO7)
  478. FMADD y08, a8, b5, y08
  479. LFD a8, 6 * SIZE(AO8)
  480. FMADD y09, a1, b6, y09
  481. LFD a1, 7 * SIZE(AO1)
  482. FMADD y10, a2, b6, y10
  483. LFD a2, 7 * SIZE(AO2)
  484. FMADD y11, a3, b6, y11
  485. LFD a3, 7 * SIZE(AO3)
  486. FMADD y12, a4, b6, y12
  487. LFD a4, 7 * SIZE(AO4)
  488. FMADD y13, a5, b6, y13
  489. LFD a5, 7 * SIZE(AO5)
  490. FMADD y14, a6, b6, y14
  491. LFD a6, 7 * SIZE(AO6)
  492. FMADD y15, a7, b6, y15
  493. LFD a7, 7 * SIZE(AO7)
  494. FMADD y16, a8, b6, y16
  495. LFD a8, 7 * SIZE(AO8)
  496. FMADD y01, a1, b7, y01
  497. LFD a1, 8 * SIZE(AO1)
  498. FMADD y02, a2, b7, y02
  499. LFD a2, 8 * SIZE(AO2)
  500. FMADD y03, a3, b7, y03
  501. LFD a3, 8 * SIZE(AO3)
  502. FMADD y04, a4, b7, y04
  503. LFD a4, 8 * SIZE(AO4)
  504. FMADD y05, a5, b7, y05
  505. LFD a5, 8 * SIZE(AO5)
  506. FMADD y06, a6, b7, y06
  507. LFD a6, 8 * SIZE(AO6)
  508. FMADD y07, a7, b7, y07
  509. LFD a7, 8 * SIZE(AO7)
  510. FMADD y08, a8, b7, y08
  511. LFD a8, 8 * SIZE(AO8)
  512. FMADD y09, a1, b8, y09
  513. LFD a1, 9 * SIZE(AO1)
  514. FMADD y10, a2, b8, y10
  515. LFD a2, 9 * SIZE(AO2)
  516. FMADD y11, a3, b8, y11
  517. LFD a3, 9 * SIZE(AO3)
  518. FMADD y12, a4, b8, y12
  519. LFD a4, 9 * SIZE(AO4)
  520. FMADD y13, a5, b8, y13
  521. LFD a5, 9 * SIZE(AO5)
  522. FMADD y14, a6, b8, y14
  523. LFD a6, 9 * SIZE(AO6)
  524. FMADD y15, a7, b8, y15
  525. LFD a7, 9 * SIZE(AO7)
  526. FMADD y16, a8, b8, y16
  527. LFD a8, 9 * SIZE(AO8)
  528. LFD b5, 13 * SIZE(BO)
  529. LFD b6, 14 * SIZE(BO)
  530. LFD b7, 15 * SIZE(BO)
  531. LFD b8, 16 * SIZE(BO)
  532. DCBT(AO1, PREA)
  533. DCBT(AO2, PREA)
  534. DCBT(AO3, PREA)
  535. DCBT(AO4, PREA)
  536. FMADD y01, a1, b1, y01
  537. LFD a1, 10 * SIZE(AO1)
  538. FMADD y02, a2, b1, y02
  539. LFD a2, 10 * SIZE(AO2)
  540. FMADD y03, a3, b1, y03
  541. LFD a3, 10 * SIZE(AO3)
  542. FMADD y04, a4, b1, y04
  543. LFD a4, 10 * SIZE(AO4)
  544. FMADD y05, a5, b1, y05
  545. LFD a5, 10 * SIZE(AO5)
  546. FMADD y06, a6, b1, y06
  547. LFD a6, 10 * SIZE(AO6)
  548. FMADD y07, a7, b1, y07
  549. LFD a7, 10 * SIZE(AO7)
  550. FMADD y08, a8, b1, y08
  551. LFD a8, 10 * SIZE(AO8)
  552. FMADD y09, a1, b2, y09
  553. LFD a1, 11 * SIZE(AO1)
  554. FMADD y10, a2, b2, y10
  555. LFD a2, 11 * SIZE(AO2)
  556. FMADD y11, a3, b2, y11
  557. LFD a3, 11 * SIZE(AO3)
  558. FMADD y12, a4, b2, y12
  559. LFD a4, 11 * SIZE(AO4)
  560. FMADD y13, a5, b2, y13
  561. LFD a5, 11 * SIZE(AO5)
  562. FMADD y14, a6, b2, y14
  563. LFD a6, 11 * SIZE(AO6)
  564. FMADD y15, a7, b2, y15
  565. LFD a7, 11 * SIZE(AO7)
  566. FMADD y16, a8, b2, y16
  567. LFD a8, 11 * SIZE(AO8)
  568. FMADD y01, a1, b3, y01
  569. LFD a1, 12 * SIZE(AO1)
  570. FMADD y02, a2, b3, y02
  571. LFD a2, 12 * SIZE(AO2)
  572. FMADD y03, a3, b3, y03
  573. LFD a3, 12 * SIZE(AO3)
  574. FMADD y04, a4, b3, y04
  575. LFD a4, 12 * SIZE(AO4)
  576. FMADD y05, a5, b3, y05
  577. LFD a5, 12 * SIZE(AO5)
  578. FMADD y06, a6, b3, y06
  579. LFD a6, 12 * SIZE(AO6)
  580. FMADD y07, a7, b3, y07
  581. LFD a7, 12 * SIZE(AO7)
  582. FMADD y08, a8, b3, y08
  583. LFD a8, 12 * SIZE(AO8)
  584. FMADD y09, a1, b4, y09
  585. LFD a1, 13 * SIZE(AO1)
  586. FMADD y10, a2, b4, y10
  587. LFD a2, 13 * SIZE(AO2)
  588. FMADD y11, a3, b4, y11
  589. LFD a3, 13 * SIZE(AO3)
  590. FMADD y12, a4, b4, y12
  591. LFD a4, 13 * SIZE(AO4)
  592. FMADD y13, a5, b4, y13
  593. LFD a5, 13 * SIZE(AO5)
  594. FMADD y14, a6, b4, y14
  595. LFD a6, 13 * SIZE(AO6)
  596. FMADD y15, a7, b4, y15
  597. LFD a7, 13 * SIZE(AO7)
  598. FMADD y16, a8, b4, y16
  599. LFD a8, 13 * SIZE(AO8)
  600. LFD b1, 17 * SIZE(BO)
  601. LFD b2, 18 * SIZE(BO)
  602. LFD b3, 19 * SIZE(BO)
  603. LFD b4, 20 * SIZE(BO)
  604. FMADD y01, a1, b5, y01
  605. LFD a1, 14 * SIZE(AO1)
  606. FMADD y02, a2, b5, y02
  607. LFD a2, 14 * SIZE(AO2)
  608. FMADD y03, a3, b5, y03
  609. LFD a3, 14 * SIZE(AO3)
  610. FMADD y04, a4, b5, y04
  611. LFD a4, 14 * SIZE(AO4)
  612. FMADD y05, a5, b5, y05
  613. LFD a5, 14 * SIZE(AO5)
  614. FMADD y06, a6, b5, y06
  615. LFD a6, 14 * SIZE(AO6)
  616. FMADD y07, a7, b5, y07
  617. LFD a7, 14 * SIZE(AO7)
  618. FMADD y08, a8, b5, y08
  619. LFD a8, 14 * SIZE(AO8)
  620. FMADD y09, a1, b6, y09
  621. LFD a1, 15 * SIZE(AO1)
  622. FMADD y10, a2, b6, y10
  623. LFD a2, 15 * SIZE(AO2)
  624. FMADD y11, a3, b6, y11
  625. LFD a3, 15 * SIZE(AO3)
  626. FMADD y12, a4, b6, y12
  627. LFD a4, 15 * SIZE(AO4)
  628. FMADD y13, a5, b6, y13
  629. LFD a5, 15 * SIZE(AO5)
  630. FMADD y14, a6, b6, y14
  631. LFD a6, 15 * SIZE(AO6)
  632. FMADD y15, a7, b6, y15
  633. LFD a7, 15 * SIZE(AO7)
  634. FMADD y16, a8, b6, y16
  635. LFD a8, 15 * SIZE(AO8)
  636. FMADD y01, a1, b7, y01
  637. LFD a1, 16 * SIZE(AO1)
  638. FMADD y02, a2, b7, y02
  639. LFD a2, 16 * SIZE(AO2)
  640. FMADD y03, a3, b7, y03
  641. LFD a3, 16 * SIZE(AO3)
  642. FMADD y04, a4, b7, y04
  643. LFD a4, 16 * SIZE(AO4)
  644. FMADD y05, a5, b7, y05
  645. LFD a5, 16 * SIZE(AO5)
  646. FMADD y06, a6, b7, y06
  647. LFD a6, 16 * SIZE(AO6)
  648. FMADD y07, a7, b7, y07
  649. LFD a7, 16 * SIZE(AO7)
  650. FMADD y08, a8, b7, y08
  651. LFD a8, 16 * SIZE(AO8)
  652. FMADD y09, a1, b8, y09
  653. LFD a1, 17 * SIZE(AO1)
  654. FMADD y10, a2, b8, y10
  655. LFD a2, 17 * SIZE(AO2)
  656. FMADD y11, a3, b8, y11
  657. LFD a3, 17 * SIZE(AO3)
  658. FMADD y12, a4, b8, y12
  659. LFD a4, 17 * SIZE(AO4)
  660. addi AO1, AO1, 16 * SIZE
  661. addi AO2, AO2, 16 * SIZE
  662. addi AO3, AO3, 16 * SIZE
  663. addi AO4, AO4, 16 * SIZE
  664. FMADD y13, a5, b8, y13
  665. LFD a5, 17 * SIZE(AO5)
  666. FMADD y14, a6, b8, y14
  667. LFD a6, 17 * SIZE(AO6)
  668. FMADD y15, a7, b8, y15
  669. LFD a7, 17 * SIZE(AO7)
  670. FMADD y16, a8, b8, y16
  671. LFD a8, 17 * SIZE(AO8)
  672. LFD b5, 21 * SIZE(BO)
  673. LFD b6, 22 * SIZE(BO)
  674. LFD b7, 23 * SIZE(BO)
  675. LFD b8, 24 * SIZE(BO)
  676. addi AO5, AO5, 16 * SIZE
  677. addi AO6, AO6, 16 * SIZE
  678. DCBT(AO5, PREA)
  679. DCBT(AO6, PREA)
  680. addi AO7, AO7, 16 * SIZE
  681. addi AO8, AO8, 16 * SIZE
  682. DCBT(AO7, PREA)
  683. DCBT(AO8, PREA)
  684. addi BO, BO, 16 * SIZE
  685. bdnz LL(12)
  686. .align 4
  /*
   * LL(13): straight-line block reached by falling out of the bdnz loop
   * back to LL(12).  It performs sixteen FMADD groups over the eight
   * pointers AO1..AO8: groups using b1, b3, b5, b7 accumulate into
   * y01..y08, groups using b2, b4, b6, b8 accumulate into y09..y16.
   * Every FMADD is followed by the LFD that refills its a-register from
   * the next offset of the same pointer, so the instruction order is
   * part of the schedule.  No offset above 16 * SIZE is read here.
   * NOTE(review): the a1..a8 / b1..b8 values consumed by the first groups
   * are loaded ahead of this label -- confirm against LL(11)/LL(12).
   */
  687. LL(13):
  688. FMADD y01, a1, b1, y01
  689. LFD a1, 2 * SIZE(AO1)
  690. FMADD y02, a2, b1, y02
  691. LFD a2, 2 * SIZE(AO2)
  692. FMADD y03, a3, b1, y03
  693. LFD a3, 2 * SIZE(AO3)
  694. FMADD y04, a4, b1, y04
  695. LFD a4, 2 * SIZE(AO4)
  696. FMADD y05, a5, b1, y05
  697. LFD a5, 2 * SIZE(AO5)
  698. FMADD y06, a6, b1, y06
  699. LFD a6, 2 * SIZE(AO6)
  700. FMADD y07, a7, b1, y07
  701. LFD a7, 2 * SIZE(AO7)
  702. FMADD y08, a8, b1, y08
  703. LFD a8, 2 * SIZE(AO8)
  704. FMADD y09, a1, b2, y09
  705. LFD a1, 3 * SIZE(AO1)
  706. FMADD y10, a2, b2, y10
  707. LFD a2, 3 * SIZE(AO2)
  708. FMADD y11, a3, b2, y11
  709. LFD a3, 3 * SIZE(AO3)
  710. FMADD y12, a4, b2, y12
  711. LFD a4, 3 * SIZE(AO4)
  712. FMADD y13, a5, b2, y13
  713. LFD a5, 3 * SIZE(AO5)
  714. FMADD y14, a6, b2, y14
  715. LFD a6, 3 * SIZE(AO6)
  716. FMADD y15, a7, b2, y15
  717. LFD a7, 3 * SIZE(AO7)
  718. FMADD y16, a8, b2, y16
  719. LFD a8, 3 * SIZE(AO8)
  720. FMADD y01, a1, b3, y01
  721. LFD a1, 4 * SIZE(AO1)
  722. FMADD y02, a2, b3, y02
  723. LFD a2, 4 * SIZE(AO2)
  724. FMADD y03, a3, b3, y03
  725. LFD a3, 4 * SIZE(AO3)
  726. FMADD y04, a4, b3, y04
  727. LFD a4, 4 * SIZE(AO4)
  728. FMADD y05, a5, b3, y05
  729. LFD a5, 4 * SIZE(AO5)
  730. FMADD y06, a6, b3, y06
  731. LFD a6, 4 * SIZE(AO6)
  732. FMADD y07, a7, b3, y07
  733. LFD a7, 4 * SIZE(AO7)
  734. FMADD y08, a8, b3, y08
  735. LFD a8, 4 * SIZE(AO8)
  736. FMADD y09, a1, b4, y09
  737. LFD a1, 5 * SIZE(AO1)
  738. FMADD y10, a2, b4, y10
  739. LFD a2, 5 * SIZE(AO2)
  740. FMADD y11, a3, b4, y11
  741. LFD a3, 5 * SIZE(AO3)
  742. FMADD y12, a4, b4, y12
  743. LFD a4, 5 * SIZE(AO4)
  744. FMADD y13, a5, b4, y13
  745. LFD a5, 5 * SIZE(AO5)
  746. FMADD y14, a6, b4, y14
  747. LFD a6, 5 * SIZE(AO6)
  748. FMADD y15, a7, b4, y15
  749. LFD a7, 5 * SIZE(AO7)
  750. FMADD y16, a8, b4, y16
  751. LFD a8, 5 * SIZE(AO8)
  /* b1..b4 have been consumed: refill them from offsets 9..12 of BO. */
  752. LFD b1, 9 * SIZE(BO)
  753. LFD b2, 10 * SIZE(BO)
  754. LFD b3, 11 * SIZE(BO)
  755. LFD b4, 12 * SIZE(BO)
  756. FMADD y01, a1, b5, y01
  757. LFD a1, 6 * SIZE(AO1)
  758. FMADD y02, a2, b5, y02
  759. LFD a2, 6 * SIZE(AO2)
  760. FMADD y03, a3, b5, y03
  761. LFD a3, 6 * SIZE(AO3)
  762. FMADD y04, a4, b5, y04
  763. LFD a4, 6 * SIZE(AO4)
  764. FMADD y05, a5, b5, y05
  765. LFD a5, 6 * SIZE(AO5)
  766. FMADD y06, a6, b5, y06
  767. LFD a6, 6 * SIZE(AO6)
  768. FMADD y07, a7, b5, y07
  769. LFD a7, 6 * SIZE(AO7)
  770. FMADD y08, a8, b5, y08
  771. LFD a8, 6 * SIZE(AO8)
  772. FMADD y09, a1, b6, y09
  773. LFD a1, 7 * SIZE(AO1)
  774. FMADD y10, a2, b6, y10
  775. LFD a2, 7 * SIZE(AO2)
  776. FMADD y11, a3, b6, y11
  777. LFD a3, 7 * SIZE(AO3)
  778. FMADD y12, a4, b6, y12
  779. LFD a4, 7 * SIZE(AO4)
  780. FMADD y13, a5, b6, y13
  781. LFD a5, 7 * SIZE(AO5)
  782. FMADD y14, a6, b6, y14
  783. LFD a6, 7 * SIZE(AO6)
  784. FMADD y15, a7, b6, y15
  785. LFD a7, 7 * SIZE(AO7)
  786. FMADD y16, a8, b6, y16
  787. LFD a8, 7 * SIZE(AO8)
  788. FMADD y01, a1, b7, y01
  789. LFD a1, 8 * SIZE(AO1)
  790. FMADD y02, a2, b7, y02
  791. LFD a2, 8 * SIZE(AO2)
  792. FMADD y03, a3, b7, y03
  793. LFD a3, 8 * SIZE(AO3)
  794. FMADD y04, a4, b7, y04
  795. LFD a4, 8 * SIZE(AO4)
  796. FMADD y05, a5, b7, y05
  797. LFD a5, 8 * SIZE(AO5)
  798. FMADD y06, a6, b7, y06
  799. LFD a6, 8 * SIZE(AO6)
  800. FMADD y07, a7, b7, y07
  801. LFD a7, 8 * SIZE(AO7)
  802. FMADD y08, a8, b7, y08
  803. LFD a8, 8 * SIZE(AO8)
  804. FMADD y09, a1, b8, y09
  805. LFD a1, 9 * SIZE(AO1)
  806. FMADD y10, a2, b8, y10
  807. LFD a2, 9 * SIZE(AO2)
  808. FMADD y11, a3, b8, y11
  809. LFD a3, 9 * SIZE(AO3)
  810. FMADD y12, a4, b8, y12
  811. LFD a4, 9 * SIZE(AO4)
  812. FMADD y13, a5, b8, y13
  813. LFD a5, 9 * SIZE(AO5)
  814. FMADD y14, a6, b8, y14
  815. LFD a6, 9 * SIZE(AO6)
  816. FMADD y15, a7, b8, y15
  817. LFD a7, 9 * SIZE(AO7)
  818. FMADD y16, a8, b8, y16
  819. LFD a8, 9 * SIZE(AO8)
  /* b5..b8 have been consumed: refill them from offsets 13..16 of BO. */
  820. LFD b5, 13 * SIZE(BO)
  821. LFD b6, 14 * SIZE(BO)
  822. LFD b7, 15 * SIZE(BO)
  823. LFD b8, 16 * SIZE(BO)
  824. FMADD y01, a1, b1, y01
  825. LFD a1, 10 * SIZE(AO1)
  826. FMADD y02, a2, b1, y02
  827. LFD a2, 10 * SIZE(AO2)
  828. FMADD y03, a3, b1, y03
  829. LFD a3, 10 * SIZE(AO3)
  830. FMADD y04, a4, b1, y04
  831. LFD a4, 10 * SIZE(AO4)
  832. FMADD y05, a5, b1, y05
  833. LFD a5, 10 * SIZE(AO5)
  834. FMADD y06, a6, b1, y06
  835. LFD a6, 10 * SIZE(AO6)
  836. FMADD y07, a7, b1, y07
  837. LFD a7, 10 * SIZE(AO7)
  838. FMADD y08, a8, b1, y08
  839. LFD a8, 10 * SIZE(AO8)
  840. FMADD y09, a1, b2, y09
  841. LFD a1, 11 * SIZE(AO1)
  842. FMADD y10, a2, b2, y10
  843. LFD a2, 11 * SIZE(AO2)
  844. FMADD y11, a3, b2, y11
  845. LFD a3, 11 * SIZE(AO3)
  846. FMADD y12, a4, b2, y12
  847. LFD a4, 11 * SIZE(AO4)
  848. FMADD y13, a5, b2, y13
  849. LFD a5, 11 * SIZE(AO5)
  850. FMADD y14, a6, b2, y14
  851. LFD a6, 11 * SIZE(AO6)
  852. FMADD y15, a7, b2, y15
  853. LFD a7, 11 * SIZE(AO7)
  854. FMADD y16, a8, b2, y16
  855. LFD a8, 11 * SIZE(AO8)
  856. FMADD y01, a1, b3, y01
  857. LFD a1, 12 * SIZE(AO1)
  858. FMADD y02, a2, b3, y02
  859. LFD a2, 12 * SIZE(AO2)
  860. FMADD y03, a3, b3, y03
  861. LFD a3, 12 * SIZE(AO3)
  862. FMADD y04, a4, b3, y04
  863. LFD a4, 12 * SIZE(AO4)
  864. FMADD y05, a5, b3, y05
  865. LFD a5, 12 * SIZE(AO5)
  866. FMADD y06, a6, b3, y06
  867. LFD a6, 12 * SIZE(AO6)
  868. FMADD y07, a7, b3, y07
  869. LFD a7, 12 * SIZE(AO7)
  870. FMADD y08, a8, b3, y08
  871. LFD a8, 12 * SIZE(AO8)
  872. FMADD y09, a1, b4, y09
  873. LFD a1, 13 * SIZE(AO1)
  874. FMADD y10, a2, b4, y10
  875. LFD a2, 13 * SIZE(AO2)
  876. FMADD y11, a3, b4, y11
  877. LFD a3, 13 * SIZE(AO3)
  878. FMADD y12, a4, b4, y12
  879. LFD a4, 13 * SIZE(AO4)
  880. FMADD y13, a5, b4, y13
  881. LFD a5, 13 * SIZE(AO5)
  882. FMADD y14, a6, b4, y14
  883. LFD a6, 13 * SIZE(AO6)
  884. FMADD y15, a7, b4, y15
  885. LFD a7, 13 * SIZE(AO7)
  886. FMADD y16, a8, b4, y16
  887. LFD a8, 13 * SIZE(AO8)
  888. FMADD y01, a1, b5, y01
  889. LFD a1, 14 * SIZE(AO1)
  890. FMADD y02, a2, b5, y02
  891. LFD a2, 14 * SIZE(AO2)
  892. FMADD y03, a3, b5, y03
  893. LFD a3, 14 * SIZE(AO3)
  894. FMADD y04, a4, b5, y04
  895. LFD a4, 14 * SIZE(AO4)
  896. FMADD y05, a5, b5, y05
  897. LFD a5, 14 * SIZE(AO5)
  898. FMADD y06, a6, b5, y06
  899. LFD a6, 14 * SIZE(AO6)
  900. FMADD y07, a7, b5, y07
  901. LFD a7, 14 * SIZE(AO7)
  902. FMADD y08, a8, b5, y08
  903. LFD a8, 14 * SIZE(AO8)
  904. FMADD y09, a1, b6, y09
  905. LFD a1, 15 * SIZE(AO1)
  906. FMADD y10, a2, b6, y10
  907. LFD a2, 15 * SIZE(AO2)
  908. FMADD y11, a3, b6, y11
  909. LFD a3, 15 * SIZE(AO3)
  910. FMADD y12, a4, b6, y12
  911. LFD a4, 15 * SIZE(AO4)
  912. FMADD y13, a5, b6, y13
  913. LFD a5, 15 * SIZE(AO5)
  914. FMADD y14, a6, b6, y14
  915. LFD a6, 15 * SIZE(AO6)
  916. FMADD y15, a7, b6, y15
  917. LFD a7, 15 * SIZE(AO7)
  918. FMADD y16, a8, b6, y16
  919. LFD a8, 15 * SIZE(AO8)
  920. FMADD y01, a1, b7, y01
  921. LFD a1, 16 * SIZE(AO1)
  922. FMADD y02, a2, b7, y02
  923. LFD a2, 16 * SIZE(AO2)
  924. FMADD y03, a3, b7, y03
  925. LFD a3, 16 * SIZE(AO3)
  926. FMADD y04, a4, b7, y04
  927. LFD a4, 16 * SIZE(AO4)
  928. FMADD y05, a5, b7, y05
  929. LFD a5, 16 * SIZE(AO5)
  930. FMADD y06, a6, b7, y06
  931. LFD a6, 16 * SIZE(AO6)
  932. FMADD y07, a7, b7, y07
  933. LFD a7, 16 * SIZE(AO7)
  934. FMADD y08, a8, b7, y08
  935. LFD a8, 16 * SIZE(AO8)
  /* Final group (b8): no refills; pointer bumps of 16 * SIZE are interleaved. */
  936. FMADD y09, a1, b8, y09
  937. FMADD y10, a2, b8, y10
  938. FMADD y11, a3, b8, y11
  939. FMADD y12, a4, b8, y12
  940. addi AO1, AO1, 16 * SIZE
  941. addi AO2, AO2, 16 * SIZE
  942. addi AO3, AO3, 16 * SIZE
  943. addi AO4, AO4, 16 * SIZE
  944. FMADD y13, a5, b8, y13
  945. FMADD y14, a6, b8, y14
  946. FMADD y15, a7, b8, y15
  947. FMADD y16, a8, b8, y16
  948. addi AO5, AO5, 16 * SIZE
  949. addi AO6, AO6, 16 * SIZE
  950. addi AO7, AO7, 16 * SIZE
  951. addi AO8, AO8, 16 * SIZE
  952. addi BO, BO, 16 * SIZE
  953. .align 4
  /*
   * LL(14): remainder handling for the eight-pointer case.
   * Branches to LL(18) when MIN_N & 15 is zero, and to LL(15) when the
   * bit of value 8 in MIN_N is clear.  The andi. result cannot be negative,
   * so each following ble is taken exactly when the masked value is zero.
   * Otherwise one 8-group block runs: b1, b3, b5, b7 feed y01..y08 and
   * b2, b4, b6, b8 feed y09..y16, with each FMADD followed by the LFD
   * that refills its a-register from the next offset.
   */
  954. LL(14):
  955. andi. r0, MIN_N, 15
  956. ble LL(18)
  957. andi. r0, MIN_N, 8
  958. ble LL(15)
  /* Preload offset 1 from every pointer, plus b1..b4. */
  959. LFD a1, 1 * SIZE(AO1)
  960. LFD b1, 1 * SIZE(BO)
  961. LFD a2, 1 * SIZE(AO2)
  962. LFD a3, 1 * SIZE(AO3)
  963. LFD a4, 1 * SIZE(AO4)
  964. LFD a5, 1 * SIZE(AO5)
  965. LFD a6, 1 * SIZE(AO6)
  966. LFD a7, 1 * SIZE(AO7)
  967. LFD a8, 1 * SIZE(AO8)
  968. LFD b2, 2 * SIZE(BO)
  969. LFD b3, 3 * SIZE(BO)
  970. LFD b4, 4 * SIZE(BO)
  971. FMADD y01, a1, b1, y01
  972. LFD a1, 2 * SIZE(AO1)
  973. FMADD y02, a2, b1, y02
  974. LFD a2, 2 * SIZE(AO2)
  975. FMADD y03, a3, b1, y03
  976. LFD a3, 2 * SIZE(AO3)
  977. FMADD y04, a4, b1, y04
  978. LFD a4, 2 * SIZE(AO4)
  979. FMADD y05, a5, b1, y05
  980. LFD a5, 2 * SIZE(AO5)
  981. FMADD y06, a6, b1, y06
  982. LFD a6, 2 * SIZE(AO6)
  983. FMADD y07, a7, b1, y07
  984. LFD a7, 2 * SIZE(AO7)
  985. FMADD y08, a8, b1, y08
  986. LFD a8, 2 * SIZE(AO8)
  987. FMADD y09, a1, b2, y09
  988. LFD a1, 3 * SIZE(AO1)
  989. FMADD y10, a2, b2, y10
  990. LFD a2, 3 * SIZE(AO2)
  991. FMADD y11, a3, b2, y11
  992. LFD a3, 3 * SIZE(AO3)
  993. FMADD y12, a4, b2, y12
  994. LFD a4, 3 * SIZE(AO4)
  995. FMADD y13, a5, b2, y13
  996. LFD a5, 3 * SIZE(AO5)
  997. FMADD y14, a6, b2, y14
  998. LFD a6, 3 * SIZE(AO6)
  999. FMADD y15, a7, b2, y15
  1000. LFD a7, 3 * SIZE(AO7)
  1001. FMADD y16, a8, b2, y16
  1002. LFD a8, 3 * SIZE(AO8)
  /* Fetch b5..b8 (offsets 5..8 of BO) for the second half of the block. */
  1003. LFD b5, 5 * SIZE(BO)
  1004. LFD b6, 6 * SIZE(BO)
  1005. LFD b7, 7 * SIZE(BO)
  1006. LFD b8, 8 * SIZE(BO)
  1007. FMADD y01, a1, b3, y01
  1008. LFD a1, 4 * SIZE(AO1)
  1009. FMADD y02, a2, b3, y02
  1010. LFD a2, 4 * SIZE(AO2)
  1011. FMADD y03, a3, b3, y03
  1012. LFD a3, 4 * SIZE(AO3)
  1013. FMADD y04, a4, b3, y04
  1014. LFD a4, 4 * SIZE(AO4)
  1015. FMADD y05, a5, b3, y05
  1016. LFD a5, 4 * SIZE(AO5)
  1017. FMADD y06, a6, b3, y06
  1018. LFD a6, 4 * SIZE(AO6)
  1019. FMADD y07, a7, b3, y07
  1020. LFD a7, 4 * SIZE(AO7)
  1021. FMADD y08, a8, b3, y08
  1022. LFD a8, 4 * SIZE(AO8)
  1023. FMADD y09, a1, b4, y09
  1024. LFD a1, 5 * SIZE(AO1)
  1025. FMADD y10, a2, b4, y10
  1026. LFD a2, 5 * SIZE(AO2)
  1027. FMADD y11, a3, b4, y11
  1028. LFD a3, 5 * SIZE(AO3)
  1029. FMADD y12, a4, b4, y12
  1030. LFD a4, 5 * SIZE(AO4)
  1031. FMADD y13, a5, b4, y13
  1032. LFD a5, 5 * SIZE(AO5)
  1033. FMADD y14, a6, b4, y14
  1034. LFD a6, 5 * SIZE(AO6)
  1035. FMADD y15, a7, b4, y15
  1036. LFD a7, 5 * SIZE(AO7)
  1037. FMADD y16, a8, b4, y16
  1038. LFD a8, 5 * SIZE(AO8)
  1039. FMADD y01, a1, b5, y01
  1040. LFD a1, 6 * SIZE(AO1)
  1041. FMADD y02, a2, b5, y02
  1042. LFD a2, 6 * SIZE(AO2)
  1043. FMADD y03, a3, b5, y03
  1044. LFD a3, 6 * SIZE(AO3)
  1045. FMADD y04, a4, b5, y04
  1046. LFD a4, 6 * SIZE(AO4)
  1047. FMADD y05, a5, b5, y05
  1048. LFD a5, 6 * SIZE(AO5)
  1049. FMADD y06, a6, b5, y06
  1050. LFD a6, 6 * SIZE(AO6)
  1051. FMADD y07, a7, b5, y07
  1052. LFD a7, 6 * SIZE(AO7)
  1053. FMADD y08, a8, b5, y08
  1054. LFD a8, 6 * SIZE(AO8)
  1055. FMADD y09, a1, b6, y09
  1056. LFD a1, 7 * SIZE(AO1)
  1057. FMADD y10, a2, b6, y10
  1058. LFD a2, 7 * SIZE(AO2)
  1059. FMADD y11, a3, b6, y11
  1060. LFD a3, 7 * SIZE(AO3)
  1061. FMADD y12, a4, b6, y12
  1062. LFD a4, 7 * SIZE(AO4)
  1063. FMADD y13, a5, b6, y13
  1064. LFD a5, 7 * SIZE(AO5)
  1065. FMADD y14, a6, b6, y14
  1066. LFD a6, 7 * SIZE(AO6)
  1067. FMADD y15, a7, b6, y15
  1068. LFD a7, 7 * SIZE(AO7)
  1069. FMADD y16, a8, b6, y16
  1070. LFD a8, 7 * SIZE(AO8)
  1071. FMADD y01, a1, b7, y01
  1072. LFD a1, 8 * SIZE(AO1)
  1073. FMADD y02, a2, b7, y02
  1074. LFD a2, 8 * SIZE(AO2)
  1075. FMADD y03, a3, b7, y03
  1076. LFD a3, 8 * SIZE(AO3)
  1077. FMADD y04, a4, b7, y04
  1078. LFD a4, 8 * SIZE(AO4)
  1079. FMADD y05, a5, b7, y05
  1080. LFD a5, 8 * SIZE(AO5)
  1081. FMADD y06, a6, b7, y06
  1082. LFD a6, 8 * SIZE(AO6)
  1083. FMADD y07, a7, b7, y07
  1084. LFD a7, 8 * SIZE(AO7)
  1085. FMADD y08, a8, b7, y08
  1086. LFD a8, 8 * SIZE(AO8)
  /* Last group (b8); each pointer is bumped by 8 * SIZE between the FMADDs. */
  1087. FMADD y09, a1, b8, y09
  1088. addi AO1, AO1, 8 * SIZE
  1089. FMADD y10, a2, b8, y10
  1090. addi AO2, AO2, 8 * SIZE
  1091. FMADD y11, a3, b8, y11
  1092. addi AO3, AO3, 8 * SIZE
  1093. FMADD y12, a4, b8, y12
  1094. addi AO4, AO4, 8 * SIZE
  1095. FMADD y13, a5, b8, y13
  1096. addi AO5, AO5, 8 * SIZE
  1097. FMADD y14, a6, b8, y14
  1098. addi AO6, AO6, 8 * SIZE
  1099. FMADD y15, a7, b8, y15
  1100. addi AO7, AO7, 8 * SIZE
  1101. FMADD y16, a8, b8, y16
  1102. addi AO8, AO8, 8 * SIZE
  1103. addi BO, BO, 8 * SIZE
  1104. .align 4
  /*
   * LL(15): eight-pointer remainder, 4-group block.
   * Skipped (branch to LL(16)) when the bit of value 4 in MIN_N is clear.
   * Otherwise offsets 1..4 of AO1..AO8 are multiplied by b1..b4 (offsets
   * 1..4 of BO): b1 and b3 accumulate into y01..y08, b2 and b4 into
   * y09..y16.  All nine pointers then advance by 4 * SIZE.
   */
  1105. LL(15):
  1106. andi. r0, MIN_N, 4
  1107. ble LL(16)
  1108. LFD a1, 1 * SIZE(AO1)
  1109. LFD b1, 1 * SIZE(BO)
  1110. LFD a2, 1 * SIZE(AO2)
  1111. LFD a3, 1 * SIZE(AO3)
  1112. LFD a4, 1 * SIZE(AO4)
  1113. LFD a5, 1 * SIZE(AO5)
  1114. LFD a6, 1 * SIZE(AO6)
  1115. LFD a7, 1 * SIZE(AO7)
  1116. LFD a8, 1 * SIZE(AO8)
  1117. LFD b2, 2 * SIZE(BO)
  1118. LFD b3, 3 * SIZE(BO)
  1119. LFD b4, 4 * SIZE(BO)
  1120. FMADD y01, a1, b1, y01
  1121. LFD a1, 2 * SIZE(AO1)
  1122. FMADD y02, a2, b1, y02
  1123. LFD a2, 2 * SIZE(AO2)
  1124. FMADD y03, a3, b1, y03
  1125. LFD a3, 2 * SIZE(AO3)
  1126. FMADD y04, a4, b1, y04
  1127. LFD a4, 2 * SIZE(AO4)
  1128. FMADD y05, a5, b1, y05
  1129. LFD a5, 2 * SIZE(AO5)
  1130. FMADD y06, a6, b1, y06
  1131. LFD a6, 2 * SIZE(AO6)
  1132. FMADD y07, a7, b1, y07
  1133. LFD a7, 2 * SIZE(AO7)
  1134. FMADD y08, a8, b1, y08
  1135. LFD a8, 2 * SIZE(AO8)
  1136. FMADD y09, a1, b2, y09
  1137. LFD a1, 3 * SIZE(AO1)
  1138. FMADD y10, a2, b2, y10
  1139. LFD a2, 3 * SIZE(AO2)
  1140. FMADD y11, a3, b2, y11
  1141. LFD a3, 3 * SIZE(AO3)
  1142. FMADD y12, a4, b2, y12
  1143. LFD a4, 3 * SIZE(AO4)
  1144. FMADD y13, a5, b2, y13
  1145. LFD a5, 3 * SIZE(AO5)
  1146. FMADD y14, a6, b2, y14
  1147. LFD a6, 3 * SIZE(AO6)
  1148. FMADD y15, a7, b2, y15
  1149. LFD a7, 3 * SIZE(AO7)
  1150. FMADD y16, a8, b2, y16
  1151. LFD a8, 3 * SIZE(AO8)
  1152. FMADD y01, a1, b3, y01
  1153. LFD a1, 4 * SIZE(AO1)
  1154. FMADD y02, a2, b3, y02
  1155. LFD a2, 4 * SIZE(AO2)
  1156. FMADD y03, a3, b3, y03
  1157. LFD a3, 4 * SIZE(AO3)
  1158. FMADD y04, a4, b3, y04
  1159. LFD a4, 4 * SIZE(AO4)
  1160. FMADD y05, a5, b3, y05
  1161. LFD a5, 4 * SIZE(AO5)
  1162. FMADD y06, a6, b3, y06
  1163. LFD a6, 4 * SIZE(AO6)
  1164. FMADD y07, a7, b3, y07
  1165. LFD a7, 4 * SIZE(AO7)
  1166. FMADD y08, a8, b3, y08
  1167. LFD a8, 4 * SIZE(AO8)
  /* Last group (b4); pointer bumps are interleaved with the FMADDs. */
  1168. FMADD y09, a1, b4, y09
  1169. addi AO1, AO1, 4 * SIZE
  1170. FMADD y10, a2, b4, y10
  1171. addi AO2, AO2, 4 * SIZE
  1172. FMADD y11, a3, b4, y11
  1173. addi AO3, AO3, 4 * SIZE
  1174. FMADD y12, a4, b4, y12
  1175. addi AO4, AO4, 4 * SIZE
  1176. FMADD y13, a5, b4, y13
  1177. addi AO5, AO5, 4 * SIZE
  1178. FMADD y14, a6, b4, y14
  1179. addi AO6, AO6, 4 * SIZE
  1180. FMADD y15, a7, b4, y15
  1181. addi AO7, AO7, 4 * SIZE
  1182. FMADD y16, a8, b4, y16
  1183. addi AO8, AO8, 4 * SIZE
  1184. addi BO, BO, 4 * SIZE
  1185. .align 4
  /*
   * LL(16): eight-pointer remainder, 2-group block.
   * Skipped (branch to LL(17)) when the bit of value 2 in MIN_N is clear.
   * Otherwise offset 1 of each pointer times b1 goes into y01..y08 and
   * offset 2 times b2 goes into y09..y16; all pointers advance by 2 * SIZE.
   */
  1186. LL(16):
  1187. andi. r0, MIN_N, 2
  1188. ble LL(17)
  1189. LFD a1, 1 * SIZE(AO1)
  1190. LFD b1, 1 * SIZE(BO)
  1191. LFD a2, 1 * SIZE(AO2)
  1192. LFD a3, 1 * SIZE(AO3)
  1193. LFD a4, 1 * SIZE(AO4)
  1194. LFD a5, 1 * SIZE(AO5)
  1195. LFD a6, 1 * SIZE(AO6)
  1196. LFD a7, 1 * SIZE(AO7)
  1197. LFD a8, 1 * SIZE(AO8)
  1198. LFD b2, 2 * SIZE(BO)
  1199. FMADD y01, a1, b1, y01
  1200. LFD a1, 2 * SIZE(AO1)
  1201. FMADD y02, a2, b1, y02
  1202. LFD a2, 2 * SIZE(AO2)
  1203. FMADD y03, a3, b1, y03
  1204. LFD a3, 2 * SIZE(AO3)
  1205. FMADD y04, a4, b1, y04
  1206. LFD a4, 2 * SIZE(AO4)
  1207. FMADD y05, a5, b1, y05
  1208. LFD a5, 2 * SIZE(AO5)
  1209. FMADD y06, a6, b1, y06
  1210. LFD a6, 2 * SIZE(AO6)
  1211. FMADD y07, a7, b1, y07
  1212. LFD a7, 2 * SIZE(AO7)
  1213. FMADD y08, a8, b1, y08
  1214. LFD a8, 2 * SIZE(AO8)
  /* Second group (b2); pointer bumps are interleaved with the FMADDs. */
  1215. FMADD y09, a1, b2, y09
  1216. addi AO1, AO1, 2 * SIZE
  1217. addi AO2, AO2, 2 * SIZE
  1218. FMADD y10, a2, b2, y10
  1219. addi AO3, AO3, 2 * SIZE
  1220. addi AO4, AO4, 2 * SIZE
  1221. FMADD y11, a3, b2, y11
  1222. FMADD y12, a4, b2, y12
  1223. addi AO5, AO5, 2 * SIZE
  1224. addi AO6, AO6, 2 * SIZE
  1225. FMADD y13, a5, b2, y13
  1226. FMADD y14, a6, b2, y14
  1227. addi AO7, AO7, 2 * SIZE
  1228. addi AO8, AO8, 2 * SIZE
  1229. FMADD y15, a7, b2, y15
  1230. FMADD y16, a8, b2, y16
  1231. addi BO, BO, 2 * SIZE
  1232. .align 4
  /*
   * LL(17): eight-pointer remainder, final single group.
   * Skipped (branch to LL(18)) when the lowest bit of MIN_N is clear.
   * Otherwise offset 1 of each of AO1..AO8 is multiplied by b1 and added
   * into y01..y08 only.  The pointers are not advanced here; BO is
   * overwritten at LL(18) right after.
   */
  1233. LL(17):
  1234. andi. r0, MIN_N, 1
  1235. ble LL(18)
  1236. LFD a1, 1 * SIZE(AO1)
  1237. LFD b1, 1 * SIZE(BO)
  1238. LFD a2, 1 * SIZE(AO2)
  1239. LFD a3, 1 * SIZE(AO3)
  1240. LFD a4, 1 * SIZE(AO4)
  1241. LFD a5, 1 * SIZE(AO5)
  1242. LFD a6, 1 * SIZE(AO6)
  1243. LFD a7, 1 * SIZE(AO7)
  1244. LFD a8, 1 * SIZE(AO8)
  1245. FMADD y01, a1, b1, y01
  1246. FMADD y02, a2, b1, y02
  1247. FMADD y03, a3, b1, y03
  1248. FMADD y04, a4, b1, y04
  1249. FMADD y05, a5, b1, y05
  1250. FMADD y06, a6, b1, y06
  1251. FMADD y07, a7, b1, y07
  1252. FMADD y08, a8, b1, y08
  1253. .align 4
  /*
   * LL(18): write-back of the eight results.
   * BO receives a copy of CO and alpha is loaded from ALPHA.  If INCY
   * differs from SIZE control goes to LL(19).  Otherwise the path below
   * reads eight values at 1..8 * SIZE(CO), folds the second accumulator
   * set into the first (y01 += y09, ..., y08 += y16), computes
   * value + alpha * y and stores each result back to the slot it came from.
   * Then J is decremented and CO advanced by 8 * SIZE; the code loops
   * to LL(11) while J > 0 and otherwise jumps to LL(20).
   */
  1254. LL(18):
  1255. mr BO, CO
  1256. lfd alpha, ALPHA
  1257. cmpi cr0, 0, INCY, SIZE
  1258. bne LL(19)
  1259. LFD a1, 1 * SIZE(CO)
  1260. LFD a2, 2 * SIZE(CO)
  1261. LFD a3, 3 * SIZE(CO)
  1262. LFD a4, 4 * SIZE(CO)
  1263. LFD a5, 5 * SIZE(CO)
  1264. LFD a6, 6 * SIZE(CO)
  1265. LFD a7, 7 * SIZE(CO)
  1266. LFD a8, 8 * SIZE(CO)
  /* Combine the two partial sums kept per result. */
  1267. FADD y01, y09, y01
  1268. FADD y02, y10, y02
  1269. FADD y03, y11, y03
  1270. FADD y04, y12, y04
  1271. FADD y05, y13, y05
  1272. FADD y06, y14, y06
  1273. FADD y07, y15, y07
  1274. FADD y08, y16, y08
  1275. FMADD a1, alpha, y01, a1
  1276. FMADD a2, alpha, y02, a2
  1277. FMADD a3, alpha, y03, a3
  1278. FMADD a4, alpha, y04, a4
  1279. FMADD a5, alpha, y05, a5
  1280. FMADD a6, alpha, y06, a6
  1281. FMADD a7, alpha, y07, a7
  1282. FMADD a8, alpha, y08, a8
  1283. STFD a1, 1 * SIZE(CO)
  1284. STFD a2, 2 * SIZE(CO)
  1285. STFD a3, 3 * SIZE(CO)
  1286. STFD a4, 4 * SIZE(CO)
  1287. STFD a5, 5 * SIZE(CO)
  1288. STFD a6, 6 * SIZE(CO)
  1289. STFD a7, 7 * SIZE(CO)
  1290. STFD a8, 8 * SIZE(CO)
  1291. addi J, J, -1
  1292. addi CO, CO, 8 * SIZE
  1293. cmpi cr0, 0, J, 0
  1294. bgt LL(11)
  1295. b LL(20)
  1296. .align 4
  /*
   * LL(19): write-back path taken from LL(18) when INCY != SIZE.
   * Eight values are read through CO with LFDUX and written through BO
   * (the copy of CO made at LL(18)) with STFDUX, each using INCY as the
   * index register (presumably the update-indexed forms, so the pointer
   * steps by INCY per access -- confirm against the macro definitions).
   * J is then decremented and the code loops to LL(11) while J > 0,
   * otherwise it falls through to LL(20).
   * NOTE(review): the FMADDs below name f0..f7 directly, while the LL(18)
   * path uses y01..y08 for the same step.  This matches only if y01..y08
   * are aliases of f0..f7 -- confirm against the register #defines.
   */
  1297. LL(19):
  1298. LFDUX a1, CO, INCY
  1299. LFDUX a2, CO, INCY
  1300. LFDUX a3, CO, INCY
  1301. LFDUX a4, CO, INCY
  1302. LFDUX a5, CO, INCY
  1303. LFDUX a6, CO, INCY
  1304. LFDUX a7, CO, INCY
  1305. LFDUX a8, CO, INCY
  /* Combine the two partial sums kept per result. */
  1306. FADD y01, y09, y01
  1307. FADD y02, y10, y02
  1308. FADD y03, y11, y03
  1309. FADD y04, y12, y04
  1310. FADD y05, y13, y05
  1311. FADD y06, y14, y06
  1312. FADD y07, y15, y07
  1313. FADD y08, y16, y08
  1314. FMADD a1, alpha, f0, a1
  1315. FMADD a2, alpha, f1, a2
  1316. FMADD a3, alpha, f2, a3
  1317. FMADD a4, alpha, f3, a4
  1318. FMADD a5, alpha, f4, a5
  1319. FMADD a6, alpha, f5, a6
  1320. FMADD a7, alpha, f6, a7
  1321. FMADD a8, alpha, f7, a8
  1322. STFDUX a1, BO, INCY
  1323. STFDUX a2, BO, INCY
  1324. STFDUX a3, BO, INCY
  1325. STFDUX a4, BO, INCY
  1326. STFDUX a5, BO, INCY
  1327. STFDUX a6, BO, INCY
  1328. STFDUX a7, BO, INCY
  1329. STFDUX a8, BO, INCY
  1330. addi J, J, -1
  1331. cmpi cr0, 0, J, 0
  1332. bgt LL(11)
  1333. .align 4
  /*
   * LL(20): reached once the J loop above is finished.
   * Branches to LL(99) when N & 7 is zero, and to LL(30) when the bit of
   * value 4 in N is clear.  Otherwise a four-pointer pass is set up:
   *   AO1 = A, AO2 = A + LDA, AO3 = A + 2*LDA, AO4 = A + 3*LDA,
   *   A advanced by 4*LDA, BO = XP,
   *   y01..y04 and y09..y12 initialised from FZERO (presumably 0.0).
   * CTR is loaded with MIN_N >> 4; when that is not positive control goes
   * to LL(24).  Otherwise two offsets per pointer (a1..a4 from offset 1,
   * a5..a8 from offset 2) and b1..b8 are preloaded, and bdz sends a
   * count of exactly one straight to LL(23), skipping the LL(22) loop.
   */
  1334. LL(20):
  1335. andi. J, N, 7
  1336. ble LL(99)
  1337. andi. J, N, 4
  1338. ble LL(30)
  1339. mr AO1, A
  1340. add AO2, A, LDA
  1341. add AO3, AO2, LDA
  1342. add AO4, AO3, LDA
  1343. add A, AO4, LDA
  1344. mr BO, XP
  1345. lfd y01, FZERO
  1346. fmr y02, y01
  1347. fmr y03, y01
  1348. fmr y04, y01
  1349. fmr y09, y01
  1350. fmr y10, y01
  1351. fmr y11, y01
  1352. fmr y12, y01
  1353. DCBT(Y1, PREC)
  1354. srawi. r0, MIN_N, 4
  1355. mtspr CTR, r0
  1356. ble LL(24)
  1357. LFD a1, 1 * SIZE(AO1)
  1358. LFD a2, 1 * SIZE(AO2)
  1359. LFD a3, 1 * SIZE(AO3)
  1360. LFD a4, 1 * SIZE(AO4)
  1361. LFD a5, 2 * SIZE(AO1)
  1362. LFD a6, 2 * SIZE(AO2)
  1363. LFD a7, 2 * SIZE(AO3)
  1364. LFD a8, 2 * SIZE(AO4)
  1365. LFD b1, 1 * SIZE(BO)
  1366. LFD b2, 2 * SIZE(BO)
  1367. LFD b3, 3 * SIZE(BO)
  1368. LFD b4, 4 * SIZE(BO)
  1369. LFD b5, 5 * SIZE(BO)
  1370. LFD b6, 6 * SIZE(BO)
  1371. LFD b7, 7 * SIZE(BO)
  1372. LFD b8, 8 * SIZE(BO)
  1373. bdz LL(23)
  1374. .align 4
  /*
   * LL(22): loop body of the four-pointer pass, repeated via bdnz on CTR.
   * One iteration covers offsets 1..16 of AO1..AO4 and of BO.
   * a1..a4 carry the odd offsets and, multiplied by b1/b3/b5/b7,
   * accumulate into y01..y04; a5..a8 carry the even offsets and,
   * multiplied by b2/b4/b6/b8, accumulate into y09..y12.
   * Each FMADD is followed by the LFD that refills its a-register, and
   * the b-registers are refilled in groups of four once consumed.  The
   * loads run ahead to offsets 17/18 of AOx and 17..24 of BO, so the
   * registers are already primed when the pointers advance by 16 * SIZE.
   */
  1375. LL(22):
  1376. FMADD y01, a1, b1, y01
  1377. LFD a1, 3 * SIZE(AO1)
  1378. FMADD y02, a2, b1, y02
  1379. LFD a2, 3 * SIZE(AO2)
  1380. FMADD y03, a3, b1, y03
  1381. LFD a3, 3 * SIZE(AO3)
  1382. FMADD y04, a4, b1, y04
  1383. LFD a4, 3 * SIZE(AO4)
  1384. FMADD y09, a5, b2, y09
  1385. LFD a5, 4 * SIZE(AO1)
  1386. FMADD y10, a6, b2, y10
  1387. LFD a6, 4 * SIZE(AO2)
  1388. FMADD y11, a7, b2, y11
  1389. LFD a7, 4 * SIZE(AO3)
  1390. FMADD y12, a8, b2, y12
  1391. LFD a8, 4 * SIZE(AO4)
  1392. FMADD y01, a1, b3, y01
  1393. LFD a1, 5 * SIZE(AO1)
  1394. FMADD y02, a2, b3, y02
  1395. LFD a2, 5 * SIZE(AO2)
  1396. FMADD y03, a3, b3, y03
  1397. LFD a3, 5 * SIZE(AO3)
  1398. FMADD y04, a4, b3, y04
  1399. LFD a4, 5 * SIZE(AO4)
  1400. FMADD y09, a5, b4, y09
  1401. LFD a5, 6 * SIZE(AO1)
  1402. FMADD y10, a6, b4, y10
  1403. LFD a6, 6 * SIZE(AO2)
  1404. FMADD y11, a7, b4, y11
  1405. LFD a7, 6 * SIZE(AO3)
  1406. FMADD y12, a8, b4, y12
  1407. LFD a8, 6 * SIZE(AO4)
  /* b1..b4 consumed: refill from offsets 9..12 of BO. */
  1408. LFD b1, 9 * SIZE(BO)
  1409. LFD b2, 10 * SIZE(BO)
  1410. LFD b3, 11 * SIZE(BO)
  1411. LFD b4, 12 * SIZE(BO)
  1412. FMADD y01, a1, b5, y01
  1413. LFD a1, 7 * SIZE(AO1)
  1414. FMADD y02, a2, b5, y02
  1415. LFD a2, 7 * SIZE(AO2)
  1416. FMADD y03, a3, b5, y03
  1417. LFD a3, 7 * SIZE(AO3)
  1418. FMADD y04, a4, b5, y04
  1419. LFD a4, 7 * SIZE(AO4)
  1420. FMADD y09, a5, b6, y09
  1421. LFD a5, 8 * SIZE(AO1)
  1422. FMADD y10, a6, b6, y10
  1423. LFD a6, 8 * SIZE(AO2)
  1424. FMADD y11, a7, b6, y11
  1425. LFD a7, 8 * SIZE(AO3)
  1426. FMADD y12, a8, b6, y12
  1427. LFD a8, 8 * SIZE(AO4)
  1428. FMADD y01, a1, b7, y01
  1429. LFD a1, 9 * SIZE(AO1)
  1430. FMADD y02, a2, b7, y02
  1431. LFD a2, 9 * SIZE(AO2)
  1432. FMADD y03, a3, b7, y03
  1433. LFD a3, 9 * SIZE(AO3)
  1434. FMADD y04, a4, b7, y04
  1435. LFD a4, 9 * SIZE(AO4)
  1436. FMADD y09, a5, b8, y09
  1437. LFD a5, 10 * SIZE(AO1)
  1438. FMADD y10, a6, b8, y10
  1439. LFD a6, 10 * SIZE(AO2)
  1440. FMADD y11, a7, b8, y11
  1441. LFD a7, 10 * SIZE(AO3)
  1442. FMADD y12, a8, b8, y12
  1443. LFD a8, 10 * SIZE(AO4)
  /* b5..b8 consumed: refill from offsets 13..16 of BO. */
  1444. LFD b5, 13 * SIZE(BO)
  1445. LFD b6, 14 * SIZE(BO)
  1446. LFD b7, 15 * SIZE(BO)
  1447. LFD b8, 16 * SIZE(BO)
  1448. FMADD y01, a1, b1, y01
  1449. LFD a1, 11 * SIZE(AO1)
  1450. FMADD y02, a2, b1, y02
  1451. LFD a2, 11 * SIZE(AO2)
  1452. FMADD y03, a3, b1, y03
  1453. LFD a3, 11 * SIZE(AO3)
  1454. FMADD y04, a4, b1, y04
  1455. LFD a4, 11 * SIZE(AO4)
  1456. FMADD y09, a5, b2, y09
  1457. LFD a5, 12 * SIZE(AO1)
  1458. FMADD y10, a6, b2, y10
  1459. LFD a6, 12 * SIZE(AO2)
  1460. FMADD y11, a7, b2, y11
  1461. LFD a7, 12 * SIZE(AO3)
  1462. FMADD y12, a8, b2, y12
  1463. LFD a8, 12 * SIZE(AO4)
  1464. FMADD y01, a1, b3, y01
  1465. LFD a1, 13 * SIZE(AO1)
  1466. FMADD y02, a2, b3, y02
  1467. LFD a2, 13 * SIZE(AO2)
  1468. FMADD y03, a3, b3, y03
  1469. LFD a3, 13 * SIZE(AO3)
  1470. FMADD y04, a4, b3, y04
  1471. LFD a4, 13 * SIZE(AO4)
  1472. FMADD y09, a5, b4, y09
  1473. LFD a5, 14 * SIZE(AO1)
  1474. FMADD y10, a6, b4, y10
  1475. LFD a6, 14 * SIZE(AO2)
  1476. FMADD y11, a7, b4, y11
  1477. LFD a7, 14 * SIZE(AO3)
  1478. FMADD y12, a8, b4, y12
  1479. LFD a8, 14 * SIZE(AO4)
  /* Run ahead: b1..b4 for the next iteration (offsets 17..20 of BO). */
  1480. LFD b1, 17 * SIZE(BO)
  1481. LFD b2, 18 * SIZE(BO)
  1482. LFD b3, 19 * SIZE(BO)
  1483. LFD b4, 20 * SIZE(BO)
  1484. FMADD y01, a1, b5, y01
  1485. LFD a1, 15 * SIZE(AO1)
  1486. FMADD y02, a2, b5, y02
  1487. LFD a2, 15 * SIZE(AO2)
  1488. FMADD y03, a3, b5, y03
  1489. LFD a3, 15 * SIZE(AO3)
  1490. FMADD y04, a4, b5, y04
  1491. LFD a4, 15 * SIZE(AO4)
  1492. FMADD y09, a5, b6, y09
  1493. LFD a5, 16 * SIZE(AO1)
  1494. FMADD y10, a6, b6, y10
  1495. LFD a6, 16 * SIZE(AO2)
  1496. FMADD y11, a7, b6, y11
  1497. LFD a7, 16 * SIZE(AO3)
  1498. FMADD y12, a8, b6, y12
  1499. LFD a8, 16 * SIZE(AO4)
  1500. FMADD y01, a1, b7, y01
  1501. LFD a1, 17 * SIZE(AO1)
  1502. FMADD y02, a2, b7, y02
  1503. LFD a2, 17 * SIZE(AO2)
  1504. FMADD y03, a3, b7, y03
  1505. LFD a3, 17 * SIZE(AO3)
  1506. FMADD y04, a4, b7, y04
  1507. LFD a4, 17 * SIZE(AO4)
  1508. FMADD y09, a5, b8, y09
  1509. LFD a5, 18 * SIZE(AO1)
  1510. FMADD y10, a6, b8, y10
  1511. LFD a6, 18 * SIZE(AO2)
  1512. FMADD y11, a7, b8, y11
  1513. LFD a7, 18 * SIZE(AO3)
  1514. FMADD y12, a8, b8, y12
  1515. LFD a8, 18 * SIZE(AO4)
  /* Run ahead: b5..b8 for the next iteration (offsets 21..24 of BO). */
  1516. LFD b5, 21 * SIZE(BO)
  1517. LFD b6, 22 * SIZE(BO)
  1518. LFD b7, 23 * SIZE(BO)
  1519. LFD b8, 24 * SIZE(BO)
  /* Advance by 16 * SIZE and issue DCBT (presumably a prefetch hint) at PREA. */
  1520. addi AO1, AO1, 16 * SIZE
  1521. addi AO2, AO2, 16 * SIZE
  1522. DCBT(AO1, PREA)
  1523. DCBT(AO2, PREA)
  1524. addi AO3, AO3, 16 * SIZE
  1525. addi AO4, AO4, 16 * SIZE
  1526. DCBT(AO3, PREA)
  1527. DCBT(AO4, PREA)
  1528. addi BO, BO, 16 * SIZE
  1529. bdnz LL(22)
  1530. .align 4
  /*
   * LL(23): final 16-offset block of the four-pointer pass, entered by
   * falling out of the LL(22) loop or directly via bdz from the setup code.
   * Same accumulation pattern as LL(22) (a1..a4 with b1/b3/b5/b7 into
   * y01..y04, a5..a8 with b2/b4/b6/b8 into y09..y12), but nothing beyond
   * offset 16 of AO1..AO4 or BO is loaded.  Ends by advancing the four
   * AO pointers and BO by 16 * SIZE.
   */
  1531. LL(23):
  1532. FMADD y01, a1, b1, y01
  1533. LFD a1, 3 * SIZE(AO1)
  1534. FMADD y02, a2, b1, y02
  1535. LFD a2, 3 * SIZE(AO2)
  1536. FMADD y03, a3, b1, y03
  1537. LFD a3, 3 * SIZE(AO3)
  1538. FMADD y04, a4, b1, y04
  1539. LFD a4, 3 * SIZE(AO4)
  1540. FMADD y09, a5, b2, y09
  1541. LFD a5, 4 * SIZE(AO1)
  1542. FMADD y10, a6, b2, y10
  1543. LFD a6, 4 * SIZE(AO2)
  1544. FMADD y11, a7, b2, y11
  1545. LFD a7, 4 * SIZE(AO3)
  1546. FMADD y12, a8, b2, y12
  1547. LFD a8, 4 * SIZE(AO4)
  1548. FMADD y01, a1, b3, y01
  1549. LFD a1, 5 * SIZE(AO1)
  1550. FMADD y02, a2, b3, y02
  1551. LFD a2, 5 * SIZE(AO2)
  1552. FMADD y03, a3, b3, y03
  1553. LFD a3, 5 * SIZE(AO3)
  1554. FMADD y04, a4, b3, y04
  1555. LFD a4, 5 * SIZE(AO4)
  1556. FMADD y09, a5, b4, y09
  1557. LFD a5, 6 * SIZE(AO1)
  1558. FMADD y10, a6, b4, y10
  1559. LFD a6, 6 * SIZE(AO2)
  1560. FMADD y11, a7, b4, y11
  1561. LFD a7, 6 * SIZE(AO3)
  1562. FMADD y12, a8, b4, y12
  1563. LFD a8, 6 * SIZE(AO4)
  /* b1..b4 consumed: refill from offsets 9..12 of BO. */
  1564. LFD b1, 9 * SIZE(BO)
  1565. LFD b2, 10 * SIZE(BO)
  1566. LFD b3, 11 * SIZE(BO)
  1567. LFD b4, 12 * SIZE(BO)
  1568. FMADD y01, a1, b5, y01
  1569. LFD a1, 7 * SIZE(AO1)
  1570. FMADD y02, a2, b5, y02
  1571. LFD a2, 7 * SIZE(AO2)
  1572. FMADD y03, a3, b5, y03
  1573. LFD a3, 7 * SIZE(AO3)
  1574. FMADD y04, a4, b5, y04
  1575. LFD a4, 7 * SIZE(AO4)
  1576. FMADD y09, a5, b6, y09
  1577. LFD a5, 8 * SIZE(AO1)
  1578. FMADD y10, a6, b6, y10
  1579. LFD a6, 8 * SIZE(AO2)
  1580. FMADD y11, a7, b6, y11
  1581. LFD a7, 8 * SIZE(AO3)
  1582. FMADD y12, a8, b6, y12
  1583. LFD a8, 8 * SIZE(AO4)
  1584. FMADD y01, a1, b7, y01
  1585. LFD a1, 9 * SIZE(AO1)
  1586. FMADD y02, a2, b7, y02
  1587. LFD a2, 9 * SIZE(AO2)
  1588. FMADD y03, a3, b7, y03
  1589. LFD a3, 9 * SIZE(AO3)
  1590. FMADD y04, a4, b7, y04
  1591. LFD a4, 9 * SIZE(AO4)
  1592. FMADD y09, a5, b8, y09
  1593. LFD a5, 10 * SIZE(AO1)
  1594. FMADD y10, a6, b8, y10
  1595. LFD a6, 10 * SIZE(AO2)
  1596. FMADD y11, a7, b8, y11
  1597. LFD a7, 10 * SIZE(AO3)
  1598. FMADD y12, a8, b8, y12
  1599. LFD a8, 10 * SIZE(AO4)
  /* b5..b8 consumed: refill from offsets 13..16 of BO. */
  1600. LFD b5, 13 * SIZE(BO)
  1601. LFD b6, 14 * SIZE(BO)
  1602. LFD b7, 15 * SIZE(BO)
  1603. LFD b8, 16 * SIZE(BO)
  1604. FMADD y01, a1, b1, y01
  1605. LFD a1, 11 * SIZE(AO1)
  1606. FMADD y02, a2, b1, y02
  1607. LFD a2, 11 * SIZE(AO2)
  1608. FMADD y03, a3, b1, y03
  1609. LFD a3, 11 * SIZE(AO3)
  1610. FMADD y04, a4, b1, y04
  1611. LFD a4, 11 * SIZE(AO4)
  1612. FMADD y09, a5, b2, y09
  1613. LFD a5, 12 * SIZE(AO1)
  1614. FMADD y10, a6, b2, y10
  1615. LFD a6, 12 * SIZE(AO2)
  1616. FMADD y11, a7, b2, y11
  1617. LFD a7, 12 * SIZE(AO3)
  1618. FMADD y12, a8, b2, y12
  1619. LFD a8, 12 * SIZE(AO4)
  1620. FMADD y01, a1, b3, y01
  1621. LFD a1, 13 * SIZE(AO1)
  1622. FMADD y02, a2, b3, y02
  1623. LFD a2, 13 * SIZE(AO2)
  1624. FMADD y03, a3, b3, y03
  1625. LFD a3, 13 * SIZE(AO3)
  1626. FMADD y04, a4, b3, y04
  1627. LFD a4, 13 * SIZE(AO4)
  1628. FMADD y09, a5, b4, y09
  1629. LFD a5, 14 * SIZE(AO1)
  1630. FMADD y10, a6, b4, y10
  1631. LFD a6, 14 * SIZE(AO2)
  1632. FMADD y11, a7, b4, y11
  1633. LFD a7, 14 * SIZE(AO3)
  1634. FMADD y12, a8, b4, y12
  1635. LFD a8, 14 * SIZE(AO4)
  1636. FMADD y01, a1, b5, y01
  1637. LFD a1, 15 * SIZE(AO1)
  1638. FMADD y02, a2, b5, y02
  1639. LFD a2, 15 * SIZE(AO2)
  1640. FMADD y03, a3, b5, y03
  1641. LFD a3, 15 * SIZE(AO3)
  1642. FMADD y04, a4, b5, y04
  1643. LFD a4, 15 * SIZE(AO4)
  1644. FMADD y09, a5, b6, y09
  1645. LFD a5, 16 * SIZE(AO1)
  1646. FMADD y10, a6, b6, y10
  1647. LFD a6, 16 * SIZE(AO2)
  1648. FMADD y11, a7, b6, y11
  1649. LFD a7, 16 * SIZE(AO3)
  1650. FMADD y12, a8, b6, y12
  1651. LFD a8, 16 * SIZE(AO4)
  /* Offsets 15 and 16: all operands already in registers, no more loads. */
  1652. FMADD y01, a1, b7, y01
  1653. FMADD y02, a2, b7, y02
  1654. FMADD y03, a3, b7, y03
  1655. FMADD y04, a4, b7, y04
  1656. FMADD y09, a5, b8, y09
  1657. FMADD y10, a6, b8, y10
  1658. FMADD y11, a7, b8, y11
  1659. FMADD y12, a8, b8, y12
  1660. addi AO1, AO1, 16 * SIZE
  1661. addi AO2, AO2, 16 * SIZE
  1662. addi AO3, AO3, 16 * SIZE
  1663. addi AO4, AO4, 16 * SIZE
  1664. addi BO, BO, 16 * SIZE
  1665. .align 4
  /*
   * LL(24): remainder handling for the four-pointer pass.
   * Branches to LL(28) when MIN_N & 15 is zero, and to LL(25) when the
   * bit of value 8 in MIN_N is clear (the andi. result is never negative,
   * so ble means "masked value is zero").  Otherwise one 8-offset block
   * runs using only b1..b4, which are loaded with offsets 1..4 of BO and
   * reloaded half-way with offsets 5..8.  Odd offsets accumulate into
   * y01..y04 via a1..a4, even offsets into y09..y12 via a5..a8.
   */
  1666. LL(24):
  1667. andi. r0, MIN_N, 15
  1668. ble LL(28)
  1669. andi. r0, MIN_N, 8
  1670. ble LL(25)
  1671. LFD a1, 1 * SIZE(AO1)
  1672. LFD a2, 1 * SIZE(AO2)
  1673. LFD a3, 1 * SIZE(AO3)
  1674. LFD a4, 1 * SIZE(AO4)
  1675. LFD b1, 1 * SIZE(BO)
  1676. LFD b2, 2 * SIZE(BO)
  1677. LFD b3, 3 * SIZE(BO)
  1678. LFD b4, 4 * SIZE(BO)
  1679. LFD a5, 2 * SIZE(AO1)
  1680. LFD a6, 2 * SIZE(AO2)
  1681. LFD a7, 2 * SIZE(AO3)
  1682. LFD a8, 2 * SIZE(AO4)
  1683. FMADD y01, a1, b1, y01
  1684. LFD a1, 3 * SIZE(AO1)
  1685. FMADD y02, a2, b1, y02
  1686. LFD a2, 3 * SIZE(AO2)
  1687. FMADD y03, a3, b1, y03
  1688. LFD a3, 3 * SIZE(AO3)
  1689. FMADD y04, a4, b1, y04
  1690. LFD a4, 3 * SIZE(AO4)
  1691. FMADD y09, a5, b2, y09
  1692. LFD a5, 4 * SIZE(AO1)
  1693. FMADD y10, a6, b2, y10
  1694. LFD a6, 4 * SIZE(AO2)
  1695. FMADD y11, a7, b2, y11
  1696. LFD a7, 4 * SIZE(AO3)
  1697. FMADD y12, a8, b2, y12
  1698. LFD a8, 4 * SIZE(AO4)
  1699. FMADD y01, a1, b3, y01
  1700. LFD a1, 5 * SIZE(AO1)
  1701. FMADD y02, a2, b3, y02
  1702. LFD a2, 5 * SIZE(AO2)
  1703. FMADD y03, a3, b3, y03
  1704. LFD a3, 5 * SIZE(AO3)
  1705. FMADD y04, a4, b3, y04
  1706. LFD a4, 5 * SIZE(AO4)
  1707. FMADD y09, a5, b4, y09
  1708. LFD a5, 6 * SIZE(AO1)
  1709. FMADD y10, a6, b4, y10
  1710. LFD a6, 6 * SIZE(AO2)
  1711. FMADD y11, a7, b4, y11
  1712. LFD a7, 6 * SIZE(AO3)
  1713. FMADD y12, a8, b4, y12
  1714. LFD a8, 6 * SIZE(AO4)
  /* Reuse b1..b4 for offsets 5..8 of BO. */
  1715. LFD b1, 5 * SIZE(BO)
  1716. LFD b2, 6 * SIZE(BO)
  1717. LFD b3, 7 * SIZE(BO)
  1718. LFD b4, 8 * SIZE(BO)
  1719. FMADD y01, a1, b1, y01
  1720. LFD a1, 7 * SIZE(AO1)
  1721. FMADD y02, a2, b1, y02
  1722. LFD a2, 7 * SIZE(AO2)
  1723. FMADD y03, a3, b1, y03
  1724. LFD a3, 7 * SIZE(AO3)
  1725. FMADD y04, a4, b1, y04
  1726. LFD a4, 7 * SIZE(AO4)
  1727. FMADD y09, a5, b2, y09
  1728. LFD a5, 8 * SIZE(AO1)
  1729. FMADD y10, a6, b2, y10
  1730. LFD a6, 8 * SIZE(AO2)
  1731. FMADD y11, a7, b2, y11
  1732. LFD a7, 8 * SIZE(AO3)
  1733. FMADD y12, a8, b2, y12
  1734. LFD a8, 8 * SIZE(AO4)
  1735. FMADD y01, a1, b3, y01
  1736. FMADD y02, a2, b3, y02
  1737. FMADD y03, a3, b3, y03
  1738. FMADD y04, a4, b3, y04
  /* Last group; pointer bumps of 8 * SIZE are interleaved with the FMADDs. */
  1739. FMADD y09, a5, b4, y09
  1740. addi AO1, AO1, 8 * SIZE
  1741. FMADD y10, a6, b4, y10
  1742. addi AO2, AO2, 8 * SIZE
  1743. FMADD y11, a7, b4, y11
  1744. addi AO3, AO3, 8 * SIZE
  1745. FMADD y12, a8, b4, y12
  1746. addi AO4, AO4, 8 * SIZE
  1747. addi BO, BO, 8 * SIZE
  1748. .align 4
  /*
   * LL(25): four-pointer remainder, 4-offset block.
   * Skipped (branch to LL(26)) when the bit of value 4 in MIN_N is clear.
   * Otherwise offsets 1 and 3 of AO1..AO4 (times b1, b3) accumulate into
   * y01..y04 and offsets 2 and 4 (times b2, b4) into y09..y12; the four
   * AO pointers and BO then advance by 4 * SIZE.
   */
  1749. LL(25):
  1750. andi. r0, MIN_N, 4
  1751. ble LL(26)
  1752. LFD a1, 1 * SIZE(AO1)
  1753. LFD a2, 1 * SIZE(AO2)
  1754. LFD a3, 1 * SIZE(AO3)
  1755. LFD a4, 1 * SIZE(AO4)
  1756. LFD b1, 1 * SIZE(BO)
  1757. LFD b2, 2 * SIZE(BO)
  1758. LFD b3, 3 * SIZE(BO)
  1759. LFD b4, 4 * SIZE(BO)
  1760. LFD a5, 2 * SIZE(AO1)
  1761. LFD a6, 2 * SIZE(AO2)
  1762. LFD a7, 2 * SIZE(AO3)
  1763. LFD a8, 2 * SIZE(AO4)
  1764. FMADD y01, a1, b1, y01
  1765. LFD a1, 3 * SIZE(AO1)
  1766. FMADD y02, a2, b1, y02
  1767. LFD a2, 3 * SIZE(AO2)
  1768. FMADD y03, a3, b1, y03
  1769. LFD a3, 3 * SIZE(AO3)
  1770. FMADD y04, a4, b1, y04
  1771. LFD a4, 3 * SIZE(AO4)
  1772. FMADD y09, a5, b2, y09
  1773. LFD a5, 4 * SIZE(AO1)
  1774. FMADD y10, a6, b2, y10
  1775. LFD a6, 4 * SIZE(AO2)
  1776. FMADD y11, a7, b2, y11
  1777. LFD a7, 4 * SIZE(AO3)
  1778. FMADD y12, a8, b2, y12
  1779. LFD a8, 4 * SIZE(AO4)
  1780. FMADD y01, a1, b3, y01
  1781. FMADD y02, a2, b3, y02
  1782. FMADD y03, a3, b3, y03
  1783. FMADD y04, a4, b3, y04
  1784. FMADD y09, a5, b4, y09
  1785. addi AO1, AO1, 4 * SIZE
  1786. FMADD y10, a6, b4, y10
  1787. addi AO2, AO2, 4 * SIZE
  1788. FMADD y11, a7, b4, y11
  1789. addi AO3, AO3, 4 * SIZE
  1790. FMADD y12, a8, b4, y12
  1791. addi AO4, AO4, 4 * SIZE
  1792. addi BO, BO, 4 * SIZE
  1793. .align 4
  /*
   * LL(26): four-pointer remainder, 2-offset block.
   * Skipped (branch to LL(27)) when the bit of value 2 in MIN_N is clear.
   * Otherwise offset 1 of AO1..AO4 times b1 goes into y01..y04 and
   * offset 2 times b2 goes into y09..y12; the pointers advance by 2 * SIZE.
   */
  1794. LL(26):
  1795. andi. r0, MIN_N, 2
  1796. ble LL(27)
  1797. LFD a1, 1 * SIZE(AO1)
  1798. LFD a2, 1 * SIZE(AO2)
  1799. LFD b1, 1 * SIZE(BO)
  1800. LFD b2, 2 * SIZE(BO)
  1801. LFD a3, 1 * SIZE(AO3)
  1802. LFD a4, 1 * SIZE(AO4)
  1803. LFD a5, 2 * SIZE(AO1)
  1804. LFD a6, 2 * SIZE(AO2)
  1805. LFD a7, 2 * SIZE(AO3)
  1806. LFD a8, 2 * SIZE(AO4)
  1807. FMADD y01, a1, b1, y01
  1808. FMADD y02, a2, b1, y02
  1809. FMADD y03, a3, b1, y03
  1810. FMADD y04, a4, b1, y04
  1811. FMADD y09, a5, b2, y09
  1812. addi AO1, AO1, 2 * SIZE
  1813. FMADD y10, a6, b2, y10
  1814. addi AO2, AO2, 2 * SIZE
  1815. FMADD y11, a7, b2, y11
  1816. addi AO3, AO3, 2 * SIZE
  1817. FMADD y12, a8, b2, y12
  1818. addi AO4, AO4, 2 * SIZE
  1819. addi BO, BO, 2 * SIZE
  1820. .align 4
  /*
   * LL(27): four-pointer remainder, final single offset.
   * Skipped (branch to LL(28)) when the lowest bit of MIN_N is clear.
   * Otherwise offset 1 of AO1..AO4 times b1 is added into y01..y04.
   * No pointer is advanced here.
   */
  1821. LL(27):
  1822. andi. r0, MIN_N, 1
  1823. ble LL(28)
  1824. LFD a1, 1 * SIZE(AO1)
  1825. LFD b1, 1 * SIZE(BO)
  1826. LFD a2, 1 * SIZE(AO2)
  1827. LFD a3, 1 * SIZE(AO3)
  1828. LFD a4, 1 * SIZE(AO4)
  1829. FMADD y01, a1, b1, y01
  1830. FMADD y02, a2, b1, y02
  1831. FMADD y03, a3, b1, y03
  1832. FMADD y04, a4, b1, y04
  1833. .align 4
  /*
   * LL(28): write-back of the four-stream pass.
   * BO is re-purposed as a copy of CO (it is the store cursor of the strided
   * path at LL(29)), alpha is loaded from ALPHA, and INCY is compared with
   * SIZE: anything other than SIZE goes to the strided path.
   * Contiguous path: load four values at 1..4*SIZE(CO), fold the y09..y12
   * partial sums into y01..y04, compute value += alpha * sum, store the four
   * results back to the same slots, advance CO by four elements and continue
   * at LL(30).
   */
  1834. LL(28):
  1835. mr BO, CO
  1836. lfd alpha, ALPHA
  1837. cmpi cr0, 0, INCY, SIZE
  1838. bne LL(29)
  1839. LFD a1, 1 * SIZE(CO)
  1840. LFD a2, 2 * SIZE(CO)
  1841. LFD a3, 3 * SIZE(CO)
  1842. LFD a4, 4 * SIZE(CO)
  /* Merge the two partial accumulators of each stream. */
  1843. FADD y01, y09, y01
  1844. FADD y02, y10, y02
  1845. FADD y03, y11, y03
  1846. FADD y04, y12, y04
  /* a_i = alpha * y0i + a_i */
  1847. FMADD a1, alpha, y01, a1
  1848. FMADD a2, alpha, y02, a2
  1849. FMADD a3, alpha, y03, a3
  1850. FMADD a4, alpha, y04, a4
  1851. STFD a1, 1 * SIZE(CO)
  1852. STFD a2, 2 * SIZE(CO)
  1853. STFD a3, 3 * SIZE(CO)
  1854. STFD a4, 4 * SIZE(CO)
  1855. addi CO, CO, 4 * SIZE
  1856. b LL(30)
  1857. .align 4
  /*
   * LL(29): strided write-back of the four-stream pass (INCY != SIZE).
   * The four destination values are fetched with update-form loads that step
   * CO by INCY each time; BO still holds the value CO had on entry to LL(28),
   * so the update-form stores below revisit exactly the same four slots.
   * The partial sums y09..y12 are folded into y01..y04 and each destination
   * becomes value + alpha * sum.
   *
   * The multiplicands are named y01..y04, matching both the FADD results
   * directly above and the contiguous path in LL(28).  The previous spelling
   * used the raw registers f0..f3, which is only correct if the y01..y04
   * macros (defined outside this view) map to exactly those registers.
   */
  1858. LL(29):
  1859. LFDUX a1, CO, INCY
  1860. LFDUX a2, CO, INCY
  1861. LFDUX a3, CO, INCY
  1862. LFDUX a4, CO, INCY
  1863. FADD y01, y09, y01
  1864. FADD y02, y10, y02
  1865. FADD y03, y11, y03
  1866. FADD y04, y12, y04
  1867. FMADD a1, alpha, y01, a1
  1868. FMADD a2, alpha, y02, a2
  1869. FMADD a3, alpha, y03, a3
  1870. FMADD a4, alpha, y04, a4
  1871. STFDUX a1, BO, INCY
  1872. STFDUX a2, BO, INCY
  1873. STFDUX a3, BO, INCY
  1874. STFDUX a4, BO, INCY
  1875. .align 4
  /*
   * LL(30): entry of the two-stream pass, executed when bit 1 of N is set.
   * Setup:
   *   AO1 = A, AO2 = A + LDA, then A is moved past both (A = AO2 + LDA);
   *   BO  = XP (start of the multiplier vector for this pass);
   *   y01..y04 and y09..y12 are cleared from the FZERO slot.
   * CTR is loaded with MIN_N >> 4, the number of 16-element chunks.  mtspr
   * does not touch cr0, so the following ble still tests the srawi. result
   * and skips to the remainder code at LL(34) when there is no full chunk.
   * Otherwise the first four elements of each stream (a1..a8, interleaved
   * AO1/AO2) and the first eight BO values (b1..b8) are preloaded for the
   * software-pipelined loop.  bdz decrements CTR and jumps straight to the
   * drain code at LL(33) when only one chunk exists.
   * NOTE(review): DCBT is a macro defined elsewhere; presumably a cache-touch
   * hint on Y1 -- confirm.
   */
  1876. LL(30):
  1877. andi. J, N, 2
  1878. ble LL(40)
  1879. mr AO1, A
  1880. add AO2, A, LDA
  1881. add A, AO2, LDA
  1882. mr BO, XP
  1883. lfd y01, FZERO
  1884. fmr y02, y01
  1885. fmr y03, y01
  1886. fmr y04, y01
  1887. fmr y09, y01
  1888. fmr y10, y01
  1889. fmr y11, y01
  1890. fmr y12, y01
  1891. DCBT(Y1, PREC)
  1892. srawi. r0, MIN_N, 4
  1893. mtspr CTR, r0
  1894. ble LL(34)
  1895. LFD a1, 1 * SIZE(AO1)
  1896. LFD a2, 1 * SIZE(AO2)
  1897. LFD a3, 2 * SIZE(AO1)
  1898. LFD a4, 2 * SIZE(AO2)
  1899. LFD a5, 3 * SIZE(AO1)
  1900. LFD a6, 3 * SIZE(AO2)
  1901. LFD a7, 4 * SIZE(AO1)
  1902. LFD a8, 4 * SIZE(AO2)
  1903. LFD b1, 1 * SIZE(BO)
  1904. LFD b2, 2 * SIZE(BO)
  1905. LFD b3, 3 * SIZE(BO)
  1906. LFD b4, 4 * SIZE(BO)
  1907. LFD b5, 5 * SIZE(BO)
  1908. LFD b6, 6 * SIZE(BO)
  1909. LFD b7, 7 * SIZE(BO)
  1910. LFD b8, 8 * SIZE(BO)
  1911. bdz LL(33)
  1912. .align 4
  /*
   * LL(32): steady-state loop of the two-stream pass; one iteration consumes
   * 16 elements of AO1, AO2 and BO.
   * Register roles inside the loop:
   *   a1,a3,a5,a7 hold four consecutive AO1 elements,
   *   a2,a4,a6,a8 hold the matching AO2 elements,
   *   b1..b4 / b5..b8 alternate as the four BO values of the current quarter.
   *   y01,y03,y09,y11 accumulate AO1 products; y02,y04,y10,y12 accumulate
   *   AO2 products.
   * The body is software-pipelined: every FMADD consumes a value loaded one
   * quarter earlier and is immediately followed by the load that refills the
   * same register for the next quarter.  On exit from an iteration a1..a8 and
   * b1..b8 already hold the first operands of the next chunk (offsets 17..24
   * before the pointer bump).
   */
  1913. LL(32):
  /* Quarter 1: elements 1..4 with b1..b4; refill a1..a8 with elements 5..8. */
  1914. FMADD y01, a1, b1, y01
  1915. LFD a1, 5 * SIZE(AO1)
  1916. FMADD y02, a2, b1, y02
  1917. LFD a2, 5 * SIZE(AO2)
  1918. FMADD y03, a3, b2, y03
  1919. LFD a3, 6 * SIZE(AO1)
  1920. FMADD y04, a4, b2, y04
  1921. LFD a4, 6 * SIZE(AO2)
  1922. FMADD y09, a5, b3, y09
  1923. LFD a5, 7 * SIZE(AO1)
  1924. FMADD y10, a6, b3, y10
  1925. LFD a6, 7 * SIZE(AO2)
  1926. FMADD y11, a7, b4, y11
  1927. LFD a7, 8 * SIZE(AO1)
  1928. FMADD y12, a8, b4, y12
  1929. LFD a8, 8 * SIZE(AO2)
  1930. LFD b1, 9 * SIZE(BO)
  1931. LFD b2, 10 * SIZE(BO)
  1932. LFD b3, 11 * SIZE(BO)
  1933. LFD b4, 12 * SIZE(BO)
  /* Quarter 2: elements 5..8 with b5..b8; refill with elements 9..12. */
  1934. FMADD y01, a1, b5, y01
  1935. LFD a1, 9 * SIZE(AO1)
  1936. FMADD y02, a2, b5, y02
  1937. LFD a2, 9 * SIZE(AO2)
  1938. FMADD y03, a3, b6, y03
  1939. LFD a3, 10 * SIZE(AO1)
  1940. FMADD y04, a4, b6, y04
  1941. LFD a4, 10 * SIZE(AO2)
  1942. FMADD y09, a5, b7, y09
  1943. LFD a5, 11 * SIZE(AO1)
  1944. FMADD y10, a6, b7, y10
  1945. LFD a6, 11 * SIZE(AO2)
  1946. FMADD y11, a7, b8, y11
  1947. LFD a7, 12 * SIZE(AO1)
  1948. FMADD y12, a8, b8, y12
  1949. LFD a8, 12 * SIZE(AO2)
  1950. LFD b5, 13 * SIZE(BO)
  1951. LFD b6, 14 * SIZE(BO)
  1952. LFD b7, 15 * SIZE(BO)
  1953. LFD b8, 16 * SIZE(BO)
  /* Quarter 3: elements 9..12 with b1..b4; refill with elements 13..16. */
  1954. FMADD y01, a1, b1, y01
  1955. LFD a1, 13 * SIZE(AO1)
  1956. FMADD y02, a2, b1, y02
  1957. LFD a2, 13 * SIZE(AO2)
  1958. FMADD y03, a3, b2, y03
  1959. LFD a3, 14 * SIZE(AO1)
  1960. FMADD y04, a4, b2, y04
  1961. LFD a4, 14 * SIZE(AO2)
  1962. FMADD y09, a5, b3, y09
  1963. LFD a5, 15 * SIZE(AO1)
  1964. FMADD y10, a6, b3, y10
  1965. LFD a6, 15 * SIZE(AO2)
  1966. FMADD y11, a7, b4, y11
  1967. LFD a7, 16 * SIZE(AO1)
  1968. FMADD y12, a8, b4, y12
  1969. LFD a8, 16 * SIZE(AO2)
  1970. LFD b1, 17 * SIZE(BO)
  1971. LFD b2, 18 * SIZE(BO)
  1972. LFD b3, 19 * SIZE(BO)
  1973. LFD b4, 20 * SIZE(BO)
  /* Quarter 4: elements 13..16 with b5..b8; preload the next chunk (17..20). */
  1974. FMADD y01, a1, b5, y01
  1975. LFD a1, 17 * SIZE(AO1)
  1976. FMADD y02, a2, b5, y02
  1977. LFD a2, 17 * SIZE(AO2)
  1978. FMADD y03, a3, b6, y03
  1979. LFD a3, 18 * SIZE(AO1)
  1980. FMADD y04, a4, b6, y04
  1981. LFD a4, 18 * SIZE(AO2)
  1982. FMADD y09, a5, b7, y09
  1983. LFD a5, 19 * SIZE(AO1)
  1984. FMADD y10, a6, b7, y10
  1985. LFD a6, 19 * SIZE(AO2)
  1986. FMADD y11, a7, b8, y11
  1987. LFD a7, 20 * SIZE(AO1)
  1988. FMADD y12, a8, b8, y12
  1989. LFD a8, 20 * SIZE(AO2)
  1990. LFD b5, 21 * SIZE(BO)
  1991. LFD b6, 22 * SIZE(BO)
  1992. LFD b7, 23 * SIZE(BO)
  1993. LFD b8, 24 * SIZE(BO)
  /* Step all three cursors by one chunk and loop while CTR is non-zero. */
  /* NOTE(review): DCBT(.., PREA) is presumably a prefetch hint -- confirm. */
  1994. addi AO1, AO1, 16 * SIZE
  1995. addi AO2, AO2, 16 * SIZE
  1996. DCBT(AO1, PREA)
  1997. DCBT(AO2, PREA)
  1998. addi BO, BO, 16 * SIZE
  1999. bdnz LL(32)
  2000. .align 4
  /*
   * LL(33): drain of the two-stream pipelined loop -- processes the last
   * full 16-element chunk.  Same schedule as LL(32), except that nothing is
   * loaded beyond offset 16*SIZE: quarter 3 does not reload b1..b4 and
   * quarter 4 consists of FMADDs only.  Afterwards AO1, AO2 and BO are
   * advanced by 16 elements so that the remainder code at LL(34) continues
   * from the right position.
   */
  2001. LL(33):
  /* Quarter 1: elements 1..4 with b1..b4. */
  2002. FMADD y01, a1, b1, y01
  2003. LFD a1, 5 * SIZE(AO1)
  2004. FMADD y02, a2, b1, y02
  2005. LFD a2, 5 * SIZE(AO2)
  2006. FMADD y03, a3, b2, y03
  2007. LFD a3, 6 * SIZE(AO1)
  2008. FMADD y04, a4, b2, y04
  2009. LFD a4, 6 * SIZE(AO2)
  2010. FMADD y09, a5, b3, y09
  2011. LFD a5, 7 * SIZE(AO1)
  2012. FMADD y10, a6, b3, y10
  2013. LFD a6, 7 * SIZE(AO2)
  2014. FMADD y11, a7, b4, y11
  2015. LFD a7, 8 * SIZE(AO1)
  2016. FMADD y12, a8, b4, y12
  2017. LFD a8, 8 * SIZE(AO2)
  2018. LFD b1, 9 * SIZE(BO)
  2019. LFD b2, 10 * SIZE(BO)
  2020. LFD b3, 11 * SIZE(BO)
  2021. LFD b4, 12 * SIZE(BO)
  /* Quarter 2: elements 5..8 with b5..b8. */
  2022. FMADD y01, a1, b5, y01
  2023. LFD a1, 9 * SIZE(AO1)
  2024. FMADD y02, a2, b5, y02
  2025. LFD a2, 9 * SIZE(AO2)
  2026. FMADD y03, a3, b6, y03
  2027. LFD a3, 10 * SIZE(AO1)
  2028. FMADD y04, a4, b6, y04
  2029. LFD a4, 10 * SIZE(AO2)
  2030. FMADD y09, a5, b7, y09
  2031. LFD a5, 11 * SIZE(AO1)
  2032. FMADD y10, a6, b7, y10
  2033. LFD a6, 11 * SIZE(AO2)
  2034. FMADD y11, a7, b8, y11
  2035. LFD a7, 12 * SIZE(AO1)
  2036. FMADD y12, a8, b8, y12
  2037. LFD a8, 12 * SIZE(AO2)
  2038. LFD b5, 13 * SIZE(BO)
  2039. LFD b6, 14 * SIZE(BO)
  2040. LFD b7, 15 * SIZE(BO)
  2041. LFD b8, 16 * SIZE(BO)
  /* Quarter 3: elements 9..12 with b1..b4; last loads of the chunk. */
  2042. FMADD y01, a1, b1, y01
  2043. LFD a1, 13 * SIZE(AO1)
  2044. FMADD y02, a2, b1, y02
  2045. LFD a2, 13 * SIZE(AO2)
  2046. FMADD y03, a3, b2, y03
  2047. LFD a3, 14 * SIZE(AO1)
  2048. FMADD y04, a4, b2, y04
  2049. LFD a4, 14 * SIZE(AO2)
  2050. FMADD y09, a5, b3, y09
  2051. LFD a5, 15 * SIZE(AO1)
  2052. FMADD y10, a6, b3, y10
  2053. LFD a6, 15 * SIZE(AO2)
  2054. FMADD y11, a7, b4, y11
  2055. LFD a7, 16 * SIZE(AO1)
  2056. FMADD y12, a8, b4, y12
  2057. LFD a8, 16 * SIZE(AO2)
  /* Quarter 4: elements 13..16 with b5..b8; nothing left to load. */
  2058. FMADD y01, a1, b5, y01
  2059. FMADD y02, a2, b5, y02
  2060. FMADD y03, a3, b6, y03
  2061. FMADD y04, a4, b6, y04
  2062. FMADD y09, a5, b7, y09
  2063. FMADD y10, a6, b7, y10
  2064. FMADD y11, a7, b8, y11
  2065. FMADD y12, a8, b8, y12
  2066. addi AO1, AO1, 16 * SIZE
  2067. addi AO2, AO2, 16 * SIZE
  2068. addi BO, BO, 16 * SIZE
  2069. .align 4
  /*
   * LL(34): start of the remainder handling for the two-stream pass.
   * If MIN_N has no low four bits set there is nothing left and control goes
   * to the reduction at LL(38).  Otherwise, when bit 3 is set, eight more
   * elements per stream are processed here.  Only four accumulators are used
   * in the remainder steps: y01/y09 take AO1 products and y02/y10 take AO2
   * products (y03, y04, y11, y12 are left untouched).
   */
  2070. LL(34):
  2071. andi. r0, MIN_N, 15
  2072. ble LL(38)
  2073. andi. r0, MIN_N, 8
  2074. ble LL(35)
  2075. LFD a1, 1 * SIZE(AO1)
  2076. LFD a2, 1 * SIZE(AO2)
  2077. LFD a3, 2 * SIZE(AO1)
  2078. LFD a4, 2 * SIZE(AO2)
  2079. LFD b1, 1 * SIZE(BO)
  2080. LFD b2, 2 * SIZE(BO)
  2081. LFD b3, 3 * SIZE(BO)
  2082. LFD b4, 4 * SIZE(BO)
  2083. LFD a5, 3 * SIZE(AO1)
  2084. LFD a6, 3 * SIZE(AO2)
  2085. LFD a7, 4 * SIZE(AO1)
  2086. LFD a8, 4 * SIZE(AO2)
  2087. LFD b5, 5 * SIZE(BO)
  2088. LFD b6, 6 * SIZE(BO)
  2089. LFD b7, 7 * SIZE(BO)
  2090. LFD b8, 8 * SIZE(BO)
  /* Elements 1..4 with b1..b4, reloading a1..a8 with elements 5..8. */
  2091. FMADD y01, a1, b1, y01
  2092. LFD a1, 5 * SIZE(AO1)
  2093. FMADD y02, a2, b1, y02
  2094. LFD a2, 5 * SIZE(AO2)
  2095. FMADD y09, a3, b2, y09
  2096. LFD a3, 6 * SIZE(AO1)
  2097. FMADD y10, a4, b2, y10
  2098. LFD a4, 6 * SIZE(AO2)
  2099. FMADD y01, a5, b3, y01
  2100. LFD a5, 7 * SIZE(AO1)
  2101. FMADD y02, a6, b3, y02
  2102. LFD a6, 7 * SIZE(AO2)
  2103. FMADD y09, a7, b4, y09
  2104. LFD a7, 8 * SIZE(AO1)
  2105. FMADD y10, a8, b4, y10
  2106. LFD a8, 8 * SIZE(AO2)
  /* Elements 5..8 with b5..b8, interleaved with the pointer bumps. */
  2107. FMADD y01, a1, b5, y01
  2108. FMADD y02, a2, b5, y02
  2109. FMADD y09, a3, b6, y09
  2110. FMADD y10, a4, b6, y10
  2111. FMADD y01, a5, b7, y01
  2112. addi AO1, AO1, 8 * SIZE
  2113. FMADD y02, a6, b7, y02
  2114. addi AO2, AO2, 8 * SIZE
  2115. FMADD y09, a7, b8, y09
  2116. addi BO, BO, 8 * SIZE
  2117. FMADD y10, a8, b8, y10
  2118. nop
  2119. .align 4
  /*
   * LL(35): four-element remainder step of the two-stream pass, executed
   * when bit 2 of MIN_N is set.  Four elements of AO1 and AO2 are multiplied
   * by four BO values; AO1 products alternate between y01 and y09, AO2
   * products between y02 and y10.  AO1, AO2 and BO advance by four elements.
   */
  2120. LL(35):
  2121. andi. r0, MIN_N, 4
  2122. ble LL(36)
  2123. LFD a1, 1 * SIZE(AO1)
  2124. LFD a2, 1 * SIZE(AO2)
  2125. LFD a3, 2 * SIZE(AO1)
  2126. LFD a4, 2 * SIZE(AO2)
  2127. LFD a5, 3 * SIZE(AO1)
  2128. LFD a6, 3 * SIZE(AO2)
  2129. LFD a7, 4 * SIZE(AO1)
  2130. LFD a8, 4 * SIZE(AO2)
  2131. LFD b1, 1 * SIZE(BO)
  2132. LFD b2, 2 * SIZE(BO)
  2133. LFD b3, 3 * SIZE(BO)
  2134. LFD b4, 4 * SIZE(BO)
  2135. FMADD y01, a1, b1, y01
  2136. FMADD y02, a2, b1, y02
  2137. FMADD y09, a3, b2, y09
  2138. FMADD y10, a4, b2, y10
  2139. FMADD y01, a5, b3, y01
  2140. addi AO1, AO1, 4 * SIZE
  2141. FMADD y02, a6, b3, y02
  2142. addi AO2, AO2, 4 * SIZE
  2143. FMADD y09, a7, b4, y09
  2144. addi BO, BO, 4 * SIZE
  2145. FMADD y10, a8, b4, y10
  2146. .align 4
  /*
   * LL(36): two-element remainder step of the two-stream pass, executed when
   * bit 1 of MIN_N is set.  Element 1 of each stream goes to y01/y02 and
   * element 2 to y09/y10; AO1, AO2 and BO then advance by two elements.
   */
  2147. LL(36):
  2148. andi. r0, MIN_N, 2
  2149. ble LL(37)
  2150. LFD a1, 1 * SIZE(AO1)
  2151. LFD a2, 1 * SIZE(AO2)
  2152. LFD b1, 1 * SIZE(BO)
  2153. LFD b2, 2 * SIZE(BO)
  2154. LFD a3, 2 * SIZE(AO1)
  2155. LFD a4, 2 * SIZE(AO2)
  2156. FMADD y01, a1, b1, y01
  2157. FMADD y02, a2, b1, y02
  2158. FMADD y09, a3, b2, y09
  2159. FMADD y10, a4, b2, y10
  2160. addi AO1, AO1, 2 * SIZE
  2161. addi AO2, AO2, 2 * SIZE
  2162. addi BO, BO, 2 * SIZE
  2163. .align 4
  /*
   * LL(37): last single-element remainder of the two-stream pass, executed
   * when bit 0 of MIN_N is set.  One element of AO1 and AO2 is multiplied by
   * one BO value into y01 and y02; no pointer is advanced afterwards.
   */
  2164. LL(37):
  2165. andi. r0, MIN_N, 1
  2166. ble LL(38)
  2167. LFD a1, 1 * SIZE(AO1)
  2168. LFD b1, 1 * SIZE(BO)
  2169. LFD a2, 1 * SIZE(AO2)
  2170. FMADD y01, a1, b1, y01
  2171. FMADD y02, a2, b1, y02
  2172. .align 4
  /*
   * LL(38): write-back of the two-stream pass.
   * BO becomes a copy of CO (store cursor of the strided path at LL(39)),
   * alpha is loaded from ALPHA, and INCY != SIZE selects the strided path.
   * Contiguous path: the four partial sums of each stream are reduced
   *   y01 = (y01 + y03) + (y09 + y11)   -- AO1 stream
   *   y02 = (y02 + y04) + (y10 + y12)   -- AO2 stream
   * then the two values at 1*SIZE(CO) and 2*SIZE(CO) are updated with
   * value += alpha * sum, CO advances by two elements and control continues
   * at LL(40).
   */
  2173. LL(38):
  2174. mr BO, CO
  2175. lfd alpha, ALPHA
  2176. cmpi cr0, 0, INCY, SIZE
  2177. bne LL(39)
  2178. LFD a1, 1 * SIZE(CO)
  2179. LFD a2, 2 * SIZE(CO)
  2180. FADD y01, y03, y01
  2181. FADD y02, y04, y02
  2182. FADD y09, y11, y09
  2183. FADD y10, y12, y10
  2184. FADD y01, y09, y01
  2185. FADD y02, y10, y02
  2186. FMADD a1, alpha, y01, a1
  2187. FMADD a2, alpha, y02, a2
  2188. STFD a1, 1 * SIZE(CO)
  2189. STFD a2, 2 * SIZE(CO)
  2190. addi CO, CO, 2 * SIZE
  2191. b LL(40)
  2192. .align 4
  /*
   * LL(39): strided write-back of the two-stream pass (INCY != SIZE).
   * Two destination values are fetched with update-form loads that step CO
   * by INCY; BO still holds the value CO had on entry to LL(38), so the
   * update-form stores revisit the same two slots.  The partial sums are
   * reduced exactly as in the contiguous path:
   *   y01 = (y01 + y03) + (y09 + y11),  y02 = (y02 + y04) + (y10 + y12)
   * and each destination becomes value + alpha * sum.
   *
   * The multiplicands are named y01/y02, matching the FADD results directly
   * above and the contiguous path in LL(38).  The previous spelling used the
   * raw registers f0/f1, which is only correct if the y01/y02 macros (defined
   * outside this view) map to exactly those registers.
   */
  2193. LL(39):
  2194. LFDUX a1, CO, INCY
  2195. LFDUX a2, CO, INCY
  2196. FADD y01, y03, y01
  2197. FADD y02, y04, y02
  2198. FADD y09, y11, y09
  2199. FADD y10, y12, y10
  2200. FADD y01, y09, y01
  2201. FADD y02, y10, y02
  2202. FMADD a1, alpha, y01, a1
  2203. FMADD a2, alpha, y02, a2
  2204. STFDUX a1, BO, INCY
  2205. STFDUX a2, BO, INCY
  2206. .align 4
  /*
   * LL(40): entry of the single-stream pass, executed when bit 0 of N is set
   * (otherwise control goes to LL(99)).
   * Setup: AO1 = A, A advances by LDA, BO = XP, and y01..y04 / y09..y12 are
   * cleared from the FZERO slot (only y01..y04 are used by this pass).
   * CTR = MIN_N >> 4 chunks of 16 elements; with no full chunk the code
   * skips to the remainder handling at LL(44).  Otherwise the first eight
   * AO1 elements (a1..a8) and the first eight BO values (b1..b8) are
   * preloaded, and bdz jumps to the drain at LL(43) when only one chunk
   * exists.
   * NOTE(review): DCBT is a macro defined elsewhere; presumably a cache-touch
   * hint on Y1 -- confirm.
   */
  2207. LL(40):
  2208. andi. J, N, 1
  2209. ble LL(99)
  2210. mr AO1, A
  2211. add A, A, LDA
  2212. mr BO, XP
  2213. lfd y01, FZERO
  2214. fmr y02, y01
  2215. fmr y03, y01
  2216. fmr y04, y01
  2217. fmr y09, y01
  2218. fmr y10, y01
  2219. fmr y11, y01
  2220. fmr y12, y01
  2221. DCBT(Y1, PREC)
  2222. srawi. r0, MIN_N, 4
  2223. mtspr CTR, r0
  2224. ble LL(44)
  2225. LFD a1, 1 * SIZE(AO1)
  2226. LFD a2, 2 * SIZE(AO1)
  2227. LFD a3, 3 * SIZE(AO1)
  2228. LFD a4, 4 * SIZE(AO1)
  2229. LFD a5, 5 * SIZE(AO1)
  2230. LFD a6, 6 * SIZE(AO1)
  2231. LFD a7, 7 * SIZE(AO1)
  2232. LFD a8, 8 * SIZE(AO1)
  2233. LFD b1, 1 * SIZE(BO)
  2234. LFD b2, 2 * SIZE(BO)
  2235. LFD b3, 3 * SIZE(BO)
  2236. LFD b4, 4 * SIZE(BO)
  2237. LFD b5, 5 * SIZE(BO)
  2238. LFD b6, 6 * SIZE(BO)
  2239. LFD b7, 7 * SIZE(BO)
  2240. LFD b8, 8 * SIZE(BO)
  2241. bdz LL(43)
  2242. .align 4
  /*
   * LL(42): steady-state loop of the single-stream pass; one iteration
   * consumes 16 elements of AO1 and BO.
   * Each group of four instructions multiplies one preloaded pair (aK, bK)
   * into one of the four rotating accumulators y01..y04 and then reloads
   * that same pair with the element eight positions further on.  At the end
   * of an iteration a1..a8 / b1..b8 already hold the first eight operands of
   * the next chunk (offsets 17..24 before the pointer bump).
   * NOTE(review): the nop after every FMADD looks like issue-slot padding
   * for the target core -- confirm before removing.
   */
  2243. LL(42):
  /* Elements 1..8; refill with elements 9..16. */
  2244. FMADD y01, a1, b1, y01
  2245. nop
  2246. LFD a1, 9 * SIZE(AO1)
  2247. LFD b1, 9 * SIZE(BO)
  2248. FMADD y02, a2, b2, y02
  2249. nop
  2250. LFD a2, 10 * SIZE(AO1)
  2251. LFD b2, 10 * SIZE(BO)
  2252. FMADD y03, a3, b3, y03
  2253. nop
  2254. LFD a3, 11 * SIZE(AO1)
  2255. LFD b3, 11 * SIZE(BO)
  2256. FMADD y04, a4, b4, y04
  2257. nop
  2258. LFD a4, 12 * SIZE(AO1)
  2259. LFD b4, 12 * SIZE(BO)
  2260. FMADD y01, a5, b5, y01
  2261. nop
  2262. LFD a5, 13 * SIZE(AO1)
  2263. LFD b5, 13 * SIZE(BO)
  2264. FMADD y02, a6, b6, y02
  2265. nop
  2266. LFD a6, 14 * SIZE(AO1)
  2267. LFD b6, 14 * SIZE(BO)
  2268. FMADD y03, a7, b7, y03
  2269. nop
  2270. LFD a7, 15 * SIZE(AO1)
  2271. LFD b7, 15 * SIZE(BO)
  2272. FMADD y04, a8, b8, y04
  2273. nop
  2274. LFD a8, 16 * SIZE(AO1)
  2275. LFD b8, 16 * SIZE(BO)
  /* Elements 9..16; preload the next chunk (17..24). */
  2276. FMADD y01, a1, b1, y01
  2277. nop
  2278. LFD a1, 17 * SIZE(AO1)
  2279. LFD b1, 17 * SIZE(BO)
  2280. FMADD y02, a2, b2, y02
  2281. nop
  2282. LFD a2, 18 * SIZE(AO1)
  2283. LFD b2, 18 * SIZE(BO)
  2284. FMADD y03, a3, b3, y03
  2285. nop
  2286. LFD a3, 19 * SIZE(AO1)
  2287. LFD b3, 19 * SIZE(BO)
  2288. FMADD y04, a4, b4, y04
  2289. nop
  2290. LFD a4, 20 * SIZE(AO1)
  2291. LFD b4, 20 * SIZE(BO)
  2292. FMADD y01, a5, b5, y01
  2293. nop
  2294. LFD a5, 21 * SIZE(AO1)
  2295. LFD b5, 21 * SIZE(BO)
  2296. FMADD y02, a6, b6, y02
  2297. nop
  2298. LFD a6, 22 * SIZE(AO1)
  2299. LFD b6, 22 * SIZE(BO)
  2300. FMADD y03, a7, b7, y03
  2301. nop
  2302. LFD a7, 23 * SIZE(AO1)
  2303. LFD b7, 23 * SIZE(BO)
  2304. FMADD y04, a8, b8, y04
  2305. nop
  2306. LFD a8, 24 * SIZE(AO1)
  2307. LFD b8, 24 * SIZE(BO)
  /* Step both cursors by one chunk and loop while CTR is non-zero. */
  2308. addi AO1, AO1, 16 * SIZE
  2309. addi BO, BO, 16 * SIZE
  2310. DCBT(AO1, PREA)
  2311. bdnz LL(42)
  2312. .align 4
  /*
   * LL(43): drain of the single-stream pipelined loop -- processes the last
   * full 16-element chunk.  The first half matches LL(42) (consume elements
   * 1..8 while reloading 9..16); the second half only consumes, so nothing
   * past offset 16*SIZE is read.  AO1 and BO are advanced by 16 elements in
   * between the final FMADDs.
   */
  2313. LL(43):
  2314. FMADD y01, a1, b1, y01
  2315. nop
  2316. LFD a1, 9 * SIZE(AO1)
  2317. LFD b1, 9 * SIZE(BO)
  2318. FMADD y02, a2, b2, y02
  2319. nop
  2320. LFD a2, 10 * SIZE(AO1)
  2321. LFD b2, 10 * SIZE(BO)
  2322. FMADD y03, a3, b3, y03
  2323. nop
  2324. LFD a3, 11 * SIZE(AO1)
  2325. LFD b3, 11 * SIZE(BO)
  2326. FMADD y04, a4, b4, y04
  2327. nop
  2328. LFD a4, 12 * SIZE(AO1)
  2329. LFD b4, 12 * SIZE(BO)
  2330. FMADD y01, a5, b5, y01
  2331. nop
  2332. LFD a5, 13 * SIZE(AO1)
  2333. LFD b5, 13 * SIZE(BO)
  2334. FMADD y02, a6, b6, y02
  2335. nop
  2336. LFD a6, 14 * SIZE(AO1)
  2337. LFD b6, 14 * SIZE(BO)
  2338. FMADD y03, a7, b7, y03
  2339. nop
  2340. LFD a7, 15 * SIZE(AO1)
  2341. LFD b7, 15 * SIZE(BO)
  2342. FMADD y04, a8, b8, y04
  2343. nop
  2344. LFD a8, 16 * SIZE(AO1)
  2345. LFD b8, 16 * SIZE(BO)
  /* Elements 9..16: consume only, nothing further to load. */
  2346. FMADD y01, a1, b1, y01
  2347. FMADD y02, a2, b2, y02
  2348. FMADD y03, a3, b3, y03
  2349. FMADD y04, a4, b4, y04
  2350. FMADD y01, a5, b5, y01
  2351. addi AO1, AO1, 16 * SIZE
  2352. FMADD y02, a6, b6, y02
  2353. addi BO, BO, 16 * SIZE
  2354. FMADD y03, a7, b7, y03
  2355. nop
  2356. FMADD y04, a8, b8, y04
  2357. nop
  2358. .align 4
  /*
   * LL(44): start of the remainder handling for the single-stream pass.
   * With no low four bits set in MIN_N control goes straight to the
   * reduction at LL(48).  Otherwise, when bit 3 is set, eight elements of
   * AO1 are multiplied by eight BO values and spread over y01..y04 (two
   * products per accumulator); AO1 and BO then advance by eight elements.
   */
  2359. LL(44):
  2360. andi. r0, MIN_N, 15
  2361. ble LL(48)
  2362. andi. r0, MIN_N, 8
  2363. ble LL(45)
  2364. LFD a1, 1 * SIZE(AO1)
  2365. LFD a2, 2 * SIZE(AO1)
  2366. LFD a3, 3 * SIZE(AO1)
  2367. LFD a4, 4 * SIZE(AO1)
  2368. LFD b1, 1 * SIZE(BO)
  2369. LFD b2, 2 * SIZE(BO)
  2370. LFD b3, 3 * SIZE(BO)
  2371. LFD b4, 4 * SIZE(BO)
  2372. LFD a5, 5 * SIZE(AO1)
  2373. LFD a6, 6 * SIZE(AO1)
  2374. LFD a7, 7 * SIZE(AO1)
  2375. LFD a8, 8 * SIZE(AO1)
  2376. LFD b5, 5 * SIZE(BO)
  2377. LFD b6, 6 * SIZE(BO)
  2378. LFD b7, 7 * SIZE(BO)
  2379. LFD b8, 8 * SIZE(BO)
  2380. FMADD y01, a1, b1, y01
  2381. FMADD y02, a2, b2, y02
  2382. FMADD y03, a3, b3, y03
  2383. FMADD y04, a4, b4, y04
  2384. FMADD y01, a5, b5, y01
  2385. addi AO1, AO1, 8 * SIZE
  2386. FMADD y02, a6, b6, y02
  2387. addi BO, BO, 8 * SIZE
  2388. FMADD y03, a7, b7, y03
  2389. nop
  2390. FMADD y04, a8, b8, y04
  2391. nop
  2392. .align 4
  /*
   * LL(45): four-element remainder step of the single-stream pass, executed
   * when bit 2 of MIN_N is set.  Four elements of AO1 are multiplied by four
   * BO values, one product per accumulator y01..y04, and AO1 and BO advance
   * by four elements.
   *
   * This step used to bump AO2 by four elements as well.  The single-stream
   * pass never initialises or reads AO2, and none of its sibling steps
   * (16-, 8-, 2- and 1-element) touch it, so that increment had no consumer.
   * It is replaced by a nop so the FMADD / integer-op pairing and the
   * instruction count of the step stay exactly as before.
   */
  2393. LL(45):
  2394. andi. r0, MIN_N, 4
  2395. ble LL(46)
  2396. LFD a1, 1 * SIZE(AO1)
  2397. LFD b1, 1 * SIZE(BO)
  2398. LFD a2, 2 * SIZE(AO1)
  2399. LFD b2, 2 * SIZE(BO)
  2400. LFD a3, 3 * SIZE(AO1)
  2401. LFD b3, 3 * SIZE(BO)
  2402. LFD a4, 4 * SIZE(AO1)
  2403. LFD b4, 4 * SIZE(BO)
  2404. FMADD y01, a1, b1, y01
  2405. addi AO1, AO1, 4 * SIZE
  2406. FMADD y02, a2, b2, y02
  2407. nop
  2408. FMADD y03, a3, b3, y03
  2409. addi BO, BO, 4 * SIZE
  2410. FMADD y04, a4, b4, y04
  2411. nop
  2412. .align 4
  /*
   * LL(46): two-element remainder step of the single-stream pass, executed
   * when bit 1 of MIN_N is set.  Two AO1 elements times two BO values go to
   * y01 and y02; AO1 and BO advance by two elements.
   */
  2413. LL(46):
  2414. andi. r0, MIN_N, 2
  2415. ble LL(47)
  2416. LFD a1, 1 * SIZE(AO1)
  2417. LFD b1, 1 * SIZE(BO)
  2418. LFD a2, 2 * SIZE(AO1)
  2419. LFD b2, 2 * SIZE(BO)
  2420. FMADD y01, a1, b1, y01
  2421. addi AO1, AO1, 2 * SIZE
  2422. FMADD y02, a2, b2, y02
  2423. addi BO, BO, 2 * SIZE
  2424. .align 4
  /*
   * LL(47): last single-element remainder of the single-stream pass,
   * executed when bit 0 of MIN_N is set.  One AO1 element times one BO value
   * is accumulated into y01; no pointer is advanced afterwards.
   */
  2425. LL(47):
  2426. andi. r0, MIN_N, 1
  2427. ble LL(48)
  2428. LFD a1, 1 * SIZE(AO1)
  2429. LFD b1, 1 * SIZE(BO)
  2430. FMADD y01, a1, b1, y01
  2431. .align 4
  /*
   * LL(48): write-back of the single-stream pass.
   * BO becomes a copy of CO (store cursor of the strided path at LL(49)),
   * alpha is loaded from ALPHA, and INCY != SIZE selects the strided path.
   * Contiguous path: the four accumulators are reduced as
   *   y01 = (y01 + y02) + (y03 + y04)
   * and the value at 1*SIZE(CO) is updated with value += alpha * y01.
   * CO itself is not advanced on this path before jumping to LL(99).
   */
  2432. LL(48):
  2433. mr BO, CO
  2434. lfd alpha, ALPHA
  2435. cmpi cr0, 0, INCY, SIZE
  2436. bne LL(49)
  2437. LFD a1, 1 * SIZE(CO)
  2438. FADD y01, y02, y01
  2439. FADD y03, y04, y03
  2440. FADD y01, y03, y01
  2441. FMADD a1, alpha, y01, a1
  2442. STFD a1, 1 * SIZE(CO)
  2443. b LL(99)
  2444. .align 4
  /*
   * LL(49): strided write-back of the single-stream pass (INCY != SIZE).
   * The destination value is fetched with an update-form load that steps CO
   * by INCY; BO still holds the value CO had on entry to LL(48), so the
   * update-form store hits the same slot.  The accumulators are reduced as
   *   y01 = (y01 + y02) + (y03 + y04)
   * and the destination becomes value + alpha * y01.
   *
   * The multiplicand is named y01, matching the FADD result directly above
   * and the contiguous path in LL(48).  The previous spelling used the raw
   * register f0, which is only correct if the y01 macro (defined outside
   * this view) maps to exactly that register.
   */
  2445. LL(49):
  2446. LFDUX a1, CO, INCY
  2447. FADD y01, y02, y01
  2448. FADD y03, y04, y03
  2449. FADD y01, y03, y01
  2450. FMADD a1, alpha, y01, a1
  2451. STFDUX a1, BO, INCY
  2452. .align 4
  /*
   * LL(99): bottom of the outer IS loop.
   * A is reduced by PLDA_M (subf computes A - PLDA_M), IS is advanced by P,
   * and the loop at LL(ISLoop) is re-entered while IS < M (signed compare).
   * NOTE(review): PLDA_M is computed outside this view; presumably it
   * rewinds A across the streams walked above and moves it on to the next
   * block of P elements -- confirm against the prologue.
   */
  2453. LL(99):
  2454. subf A, PLDA_M, A
  2455. addi IS, IS, P
  2456. cmp cr0, 0, IS, M
  2457. blt LL(ISLoop)
  2458. .align 4
  /*
   * LL(999): common exit.
   * Sets the return value r3 to 0, reloads f14..f31 from 0..136(SP) and
   * r14..r29 from the area starting at 160(SP) (8-byte slots when __64BIT__
   * is defined, 4-byte slots otherwise), releases the STACKSIZE-byte frame
   * and returns with blr.
   * NOTE(review): the matching register saves and frame allocation are
   * presumably in the prologue, outside this view; the two layouts must be
   * kept in sync.
   */
  2459. LL(999):
  2460. li r3, 0
  /* Restore the floating-point registers, 8 bytes apart. */
  2461. lfd f14, 0(SP)
  2462. lfd f15, 8(SP)
  2463. lfd f16, 16(SP)
  2464. lfd f17, 24(SP)
  2465. lfd f18, 32(SP)
  2466. lfd f19, 40(SP)
  2467. lfd f20, 48(SP)
  2468. lfd f21, 56(SP)
  2469. lfd f22, 64(SP)
  2470. lfd f23, 72(SP)
  2471. lfd f24, 80(SP)
  2472. lfd f25, 88(SP)
  2473. lfd f26, 96(SP)
  2474. lfd f27, 104(SP)
  2475. lfd f28, 112(SP)
  2476. lfd f29, 120(SP)
  2477. lfd f30, 128(SP)
  2478. lfd f31, 136(SP)
  /* Restore the general-purpose registers; slot width depends on the ABI. */
  2479. #ifdef __64BIT__
  2480. ld r14, 160(SP)
  2481. ld r15, 168(SP)
  2482. ld r16, 176(SP)
  2483. ld r17, 184(SP)
  2484. ld r18, 192(SP)
  2485. ld r19, 200(SP)
  2486. ld r20, 208(SP)
  2487. ld r21, 216(SP)
  2488. ld r22, 224(SP)
  2489. ld r23, 232(SP)
  2490. ld r24, 240(SP)
  2491. ld r25, 248(SP)
  2492. ld r26, 256(SP)
  2493. ld r27, 264(SP)
  2494. ld r28, 272(SP)
  2495. ld r29, 280(SP)
  2496. #else
  2497. lwz r14, 160(SP)
  2498. lwz r15, 164(SP)
  2499. lwz r16, 168(SP)
  2500. lwz r17, 172(SP)
  2501. lwz r18, 176(SP)
  2502. lwz r19, 180(SP)
  2503. lwz r20, 184(SP)
  2504. lwz r21, 188(SP)
  2505. lwz r22, 192(SP)
  2506. lwz r23, 196(SP)
  2507. lwz r24, 200(SP)
  2508. lwz r25, 204(SP)
  2509. lwz r26, 208(SP)
  2510. lwz r27, 212(SP)
  2511. lwz r28, 216(SP)
  2512. lwz r29, 220(SP)
  2513. #endif
  /* Pop the frame and return to the caller. */
  2514. addi SP, SP, STACKSIZE
  2515. blr
  2516. EPILOGUE
  2517. #endif