You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or number, can include dashes ('-') and can be up to 35 characters long.

zgemv_n.S 87 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
742784279428042814282428342844285428642874288428942904291429242934294429542964297429842994300
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Argument register aliases for linux / FreeBSD builds. */
  /* M and N are r3/r4 in both variants; the remaining names differ */
  /* between the 32-bit and 64-bit (__64BIT__) layouts. */
  40. #if defined(linux) || defined(__FreeBSD__)
  41. #ifndef __64BIT__
  /* 32-bit: INCY is named r5 here but its value is loaded from */
  /* FRAMESLOT(0) of the caller's frame by the prologue. */
  42. #define M r3
  43. #define N r4
  44. #define A r6
  45. #define LDA r7
  46. #define X r8
  47. #define INCX r9
  48. #define Y r10
  49. #define INCY r5
  50. #else
  /* 64-bit: INCX, Y and INCY are loaded by the prologue from */
  /* FRAMESLOT(0), FRAMESLOT(1) and FRAMESLOT(2) respectively. */
  51. #define M r3
  52. #define N r4
  53. #define A r8
  54. #define LDA r9
  55. #define X r10
  56. #define INCX r5
  57. #define Y r6
  58. #define INCY r7
  59. #endif
  60. #endif
  /* Argument register aliases for AIX / Apple builds. */
  /* NOTE(review): if both this guard and the linux/FreeBSD guard were */
  /* true the names would be defined twice - presumably the platform */
  /* macros are mutually exclusive; confirm against the build setup. */
  61. #if defined(_AIX) || defined(__APPLE__)
  62. #if !defined(__64BIT__) && defined(DOUBLE)
  /* 32-bit DOUBLE: only M, N and A keep an incoming register; the */
  /* prologue loads LDA, X, INCX, Y and INCY from FRAMESLOT(0..4). */
  63. #define M r3
  64. #define N r4
  65. #define A r10
  66. #define LDA r5
  67. #define X r6
  68. #define INCX r7
  69. #define Y r8
  70. #define INCY r9
  71. #else
  /* All other AIX / Apple variants: the prologue loads INCX, Y and */
  /* INCY from FRAMESLOT(0..2). */
  72. #define M r3
  73. #define N r4
  74. #define A r8
  75. #define LDA r9
  76. #define X r10
  77. #define INCX r5
  78. #define Y r6
  79. #define INCY r7
  80. #endif
  81. #endif
  /* Working-register aliases used by the kernel body. */
  /* J receives N >> 2 at the start of the INCY == 2 * SIZE path. */
  /* NOTE(review): I is not referenced in the portion of the file */
  /* visible here - presumably an inner counter; confirm. */
  82. #define I r11
  83. #define J r12
  /* AO1..AO4: pointers to four consecutive columns of A (A, A + LDA, */
  /* A + 2 * LDA, A + 3 * LDA after LDA has been scaled to bytes). */
  84. #define AO1 r14
  85. #define AO2 r15
  86. #define AO3 r16
  87. #define AO4 r17
  /* LDA4: LDA shifted left by ZBASE_SHIFT + 2 in the prologue. */
  88. #define LDA4 r18
  /* Y1 is the pointer y is loaded through, Y2 the pointer results */
  /* are stored through; both start at Y. */
  89. #define Y1 r19
  90. #define Y2 r20
  /* PREA / PREC hold PREFETCHSIZE_A * SIZE and PREFETCHSIZE_C * SIZE; */
  /* PREA is the offset operand of the DCBT prefetches on AO1..AO4. */
  91. #define PREA r21
  92. #define PREC r22
  /* y01..y16: sixteen values of y (f0-f15) held in registers while */
  /* the four columns are accumulated into them. */
  93. #define y01 f0
  94. #define y02 f1
  95. #define y03 f2
  96. #define y04 f3
  97. #define y05 f4
  98. #define y06 f5
  99. #define y07 f6
  100. #define y08 f7
  101. #define y09 f8
  102. #define y10 f9
  103. #define y11 f10
  104. #define y12 f11
  105. #define y13 f12
  106. #define y14 f13
  107. #define y15 f14
  108. #define y16 f15
  /* alphaNr / alphaNi: real and imaginary parts of alpha times the */
  /* N-th of the four x elements loaded at LL(11). */
  109. #define alpha1r f16
  110. #define alpha1i f17
  111. #define alpha2r f18
  112. #define alpha2i f19
  113. #define alpha3r f20
  114. #define alpha3i f21
  115. #define alpha4r f22
  116. #define alpha4i f23
  /* a1..a8: temporaries; they first hold the x elements, then the */
  /* elements of A streamed in from AO1..AO4. */
  117. #define a1 f24
  118. #define a2 f25
  119. #define a3 f26
  120. #define a4 f27
  121. #define a5 f28
  122. #define a6 f29
  123. #define a7 f30
  124. #define a8 f31
  /* alpha_r / alpha_i share f14 / f15 with y15 / y16. At LL(11) they */
  /* are reloaded from ALPHA_R / ALPHA_I and fully consumed into */
  /* alpha1r..alpha4i before y15 / y16 are loaded. */
  125. #define alpha_r f14
  126. #define alpha_i f15
  /* Per-CPU prefetch distances. Each value is multiplied by SIZE and */
  /* loaded into PREA (PREFETCHSIZE_A) or PREC (PREFETCHSIZE_C) by the */
  /* prologue. */
  /* NOTE(review): there is no fallback branch here, so when none of */
  /* the CPU macros below is defined both names stay undefined unless */
  /* common.h provides them - confirm. */
  127. #if defined(PPCG4)
  128. #define PREFETCHSIZE_A 34
  129. #define PREFETCHSIZE_C 16
  130. #endif
  131. #if defined(PPC440) || defined(PPC440FP2)
  132. #define PREFETCHSIZE_A 34
  133. #define PREFETCHSIZE_C 16
  134. #endif
  135. #ifdef PPC970
  136. #define PREFETCHSIZE_A 56
  137. #define PREFETCHSIZE_C 16
  138. #endif
  139. #ifdef CELL
  140. #define PREFETCHSIZE_A 56
  141. #define PREFETCHSIZE_C 16
  142. #endif
  143. #ifdef POWER3
  144. #define PREFETCHSIZE_A 34
  145. #define PREFETCHSIZE_C 16
  146. #endif
  147. #ifdef POWER4
  148. #define PREFETCHSIZE_A 34
  149. #define PREFETCHSIZE_C 16
  150. #endif
  151. #ifdef POWER5
  152. #define PREFETCHSIZE_A 40
  153. #define PREFETCHSIZE_C 24
  154. #endif
  155. #ifdef POWER6
  156. #define PREFETCHSIZE_A 24
  157. #define PREFETCHSIZE_C 24
  158. #endif
  159. #ifdef POWER8
  160. #define PREFETCHSIZE_A 24
  161. #define PREFETCHSIZE_C 24
  162. #endif
  /* Sign selection for the cross terms of the complex multiplies. */
  /* FMADDR / FMSUBR are used when forming alpha * x (alpha1r.. */
  /* alpha4i); defining XCONJ swaps FMADD and FNMSUB for them. */
  /* NOTE(review): presumably XCONJ conjugates x and CONJ conjugates */
  /* A - confirm against the build rules that define them. */
  163. #ifndef XCONJ
  164. #define FMADDR FMADD
  165. #define FMSUBR FNMSUB
  166. #else
  167. #define FMADDR FNMSUB
  168. #define FMSUBR FMADD
  169. #endif
  /* FMADDX / FMSUBX are used when accumulating the A terms into */
  /* y01..y16; defining CONJ swaps FMADD and FNMSUB for them. */
  170. #ifndef CONJ
  171. #define FMADDX FMADD
  172. #define FMSUBX FNMSUB
  173. #else
  174. #define FMADDX FNMSUB
  175. #define FMSUBX FMADD
  176. #endif
  177. #ifndef NEEDPARAM
  /* Stack frame layout. The prologue lowers SP by STACKSIZE, saves */
  /* f14-f31 at offsets 0..136 and r14-r22 from offset 144 upward, */
  /* and stores the incoming f1 / f2 into ALPHA_R / ALPHA_I. */
  178. #ifndef __64BIT__
  /* 32-bit: GPRs are saved as 4-byte words (144..176). */
  179. #define STACKSIZE 224
  180. #define ALPHA_R 208(SP)
  181. #define ALPHA_I 216(SP)
  182. #else
  /* 64-bit: GPRs are saved as 8-byte doublewords (144..208). */
  183. #define STACKSIZE 280
  184. #define ALPHA_R 256(SP)
  185. #define ALPHA_I 264(SP)
  186. #endif
  187. PROLOGUE
  188. PROFCODE
  189. addi SP, SP, -STACKSIZE
  190. li r0, 0
  191. stfd f14, 0(SP)
  192. stfd f15, 8(SP)
  193. stfd f16, 16(SP)
  194. stfd f17, 24(SP)
  195. stfd f18, 32(SP)
  196. stfd f19, 40(SP)
  197. stfd f20, 48(SP)
  198. stfd f21, 56(SP)
  199. stfd f22, 64(SP)
  200. stfd f23, 72(SP)
  201. stfd f24, 80(SP)
  202. stfd f25, 88(SP)
  203. stfd f26, 96(SP)
  204. stfd f27, 104(SP)
  205. stfd f28, 112(SP)
  206. stfd f29, 120(SP)
  207. stfd f30, 128(SP)
  208. stfd f31, 136(SP)
  209. #ifdef __64BIT__
  210. std r14, 144(SP)
  211. std r15, 152(SP)
  212. std r16, 160(SP)
  213. std r17, 168(SP)
  214. std r18, 176(SP)
  215. std r19, 184(SP)
  216. std r20, 192(SP)
  217. std r21, 200(SP)
  218. std r22, 208(SP)
  219. #else
  220. stw r14, 144(SP)
  221. stw r15, 148(SP)
  222. stw r16, 152(SP)
  223. stw r17, 156(SP)
  224. stw r18, 160(SP)
  225. stw r19, 164(SP)
  226. stw r20, 168(SP)
  227. stw r21, 172(SP)
  228. stw r22, 176(SP)
  229. #endif
  230. #if defined(linux) || defined(__FreeBSD__)
  231. #ifndef __64BIT__
  232. lwz INCY, FRAMESLOT(0) + STACKSIZE(SP)
  233. #else
  234. ld INCX, FRAMESLOT(0) + STACKSIZE(SP)
  235. ld Y, FRAMESLOT(1) + STACKSIZE(SP)
  236. ld INCY, FRAMESLOT(2) + STACKSIZE(SP)
  237. #endif
  238. #endif
  239. #if defined(_AIX) || defined(__APPLE__)
  240. #ifndef __64BIT__
  241. #ifdef DOUBLE
  242. lwz LDA, FRAMESLOT(0) + STACKSIZE(SP)
  243. lwz X, FRAMESLOT(1) + STACKSIZE(SP)
  244. lwz INCX, FRAMESLOT(2) + STACKSIZE(SP)
  245. lwz Y, FRAMESLOT(3) + STACKSIZE(SP)
  246. lwz INCY, FRAMESLOT(4) + STACKSIZE(SP)
  247. #else
  248. lwz INCX, FRAMESLOT(0) + STACKSIZE(SP)
  249. lwz Y, FRAMESLOT(1) + STACKSIZE(SP)
  250. lwz INCY, FRAMESLOT(2) + STACKSIZE(SP)
  251. #endif
  252. #else
  253. ld INCX, FRAMESLOT(0) + STACKSIZE(SP)
  254. ld Y, FRAMESLOT(1) + STACKSIZE(SP)
  255. ld INCY, FRAMESLOT(2) + STACKSIZE(SP)
  256. #endif
  257. #endif
  258. stfd f1, ALPHA_R
  259. stfd f2, ALPHA_I
  260. slwi LDA4, LDA, ZBASE_SHIFT + 2
  261. slwi LDA, LDA, ZBASE_SHIFT
  262. slwi INCX, INCX, ZBASE_SHIFT
  263. slwi INCY, INCY, ZBASE_SHIFT
  264. li PREA, PREFETCHSIZE_A * SIZE
  265. li PREC, PREFETCHSIZE_C * SIZE
  266. cmpwi cr0, M, 0
  267. ble- LL(999)
  268. cmpwi cr0, N, 0
  269. ble- LL(999)
  270. cmpi cr0, 0, INCY, 2 * SIZE
  271. bne LL(100)
  272. srawi. J, N, 2
  273. ble LL(20)
  274. .align 4
  275. LL(11):
  276. lfd alpha_r, ALPHA_R
  277. lfd alpha_i, ALPHA_I
  278. LFD a1, 0 * SIZE(X)
  279. LFD a2, 1 * SIZE(X)
  280. add X, X, INCX
  281. LFD a3, 0 * SIZE(X)
  282. LFD a4, 1 * SIZE(X)
  283. add X, X, INCX
  284. LFD a5, 0 * SIZE(X)
  285. LFD a6, 1 * SIZE(X)
  286. add X, X, INCX
  287. LFD a7, 0 * SIZE(X)
  288. LFD a8, 1 * SIZE(X)
  289. add X, X, INCX
  290. FMUL alpha1r, alpha_r, a1
  291. FMUL alpha1i, alpha_i, a1
  292. FMUL alpha2r, alpha_r, a3
  293. FMUL alpha2i, alpha_i, a3
  294. FMUL alpha3r, alpha_r, a5
  295. FMUL alpha3i, alpha_i, a5
  296. FMUL alpha4r, alpha_r, a7
  297. FMUL alpha4i, alpha_i, a7
  298. FMSUBR alpha1r, alpha_i, a2, alpha1r
  299. FMADDR alpha1i, alpha_r, a2, alpha1i
  300. FMSUBR alpha2r, alpha_i, a4, alpha2r
  301. FMADDR alpha2i, alpha_r, a4, alpha2i
  302. FMSUBR alpha3r, alpha_i, a6, alpha3r
  303. FMADDR alpha3i, alpha_r, a6, alpha3i
  304. FMSUBR alpha4r, alpha_i, a8, alpha4r
  305. FMADDR alpha4i, alpha_r, a8, alpha4i
  306. mr AO1, A
  307. add AO2, A, LDA
  308. add AO3, AO2, LDA
  309. add AO4, AO3, LDA
  310. add A, AO4, LDA
  311. mr Y1, Y
  312. mr Y2, Y
  313. srawi. r0, M, 3
  314. mtspr CTR, r0
  315. ble LL(15)
  316. .align 4
  317. LFD a1, 0 * SIZE(AO1)
  318. LFD a2, 1 * SIZE(AO1)
  319. LFD a3, 2 * SIZE(AO1)
  320. LFD a4, 3 * SIZE(AO1)
  321. LFD y01, 0 * SIZE(Y1)
  322. LFD y02, 1 * SIZE(Y1)
  323. LFD y03, 2 * SIZE(Y1)
  324. LFD y04, 3 * SIZE(Y1)
  325. LFD a5, 4 * SIZE(AO1)
  326. LFD a6, 5 * SIZE(AO1)
  327. LFD a7, 6 * SIZE(AO1)
  328. LFD a8, 7 * SIZE(AO1)
  329. LFD y05, 4 * SIZE(Y1)
  330. LFD y06, 5 * SIZE(Y1)
  331. LFD y07, 6 * SIZE(Y1)
  332. LFD y08, 7 * SIZE(Y1)
  333. LFD y09, 8 * SIZE(Y1)
  334. LFD y10, 9 * SIZE(Y1)
  335. LFD y11, 10 * SIZE(Y1)
  336. LFD y12, 11 * SIZE(Y1)
  337. LFD y13, 12 * SIZE(Y1)
  338. LFD y14, 13 * SIZE(Y1)
  339. LFD y15, 14 * SIZE(Y1)
  340. LFD y16, 15 * SIZE(Y1)
  341. addi Y1, Y1, 16 * SIZE
  342. bdz LL(13)
  343. .align 4
  344. LL(12):
  345. FMADD y01, alpha1r, a1, y01
  346. FMADD y02, alpha1i, a1, y02
  347. FMADD y03, alpha1r, a3, y03
  348. FMADD y04, alpha1i, a3, y04
  349. FMADD y05, alpha1r, a5, y05
  350. FMADD y06, alpha1i, a5, y06
  351. FMADD y07, alpha1r, a7, y07
  352. FMADD y08, alpha1i, a7, y08
  353. LFD a1, 8 * SIZE(AO1)
  354. LFD a3, 10 * SIZE(AO1)
  355. LFD a5, 12 * SIZE(AO1)
  356. LFD a7, 14 * SIZE(AO1)
  357. FMSUBX y01, alpha1i, a2, y01
  358. FMADDX y02, alpha1r, a2, y02
  359. FMSUBX y03, alpha1i, a4, y03
  360. FMADDX y04, alpha1r, a4, y04
  361. FMSUBX y05, alpha1i, a6, y05
  362. FMADDX y06, alpha1r, a6, y06
  363. FMSUBX y07, alpha1i, a8, y07
  364. FMADDX y08, alpha1r, a8, y08
  365. LFD a2, 9 * SIZE(AO1)
  366. LFD a4, 11 * SIZE(AO1)
  367. LFD a6, 13 * SIZE(AO1)
  368. LFD a8, 15 * SIZE(AO1)
  369. addi AO1, AO1, 16 * SIZE
  370. nop
  371. DCBT(AO1, PREA)
  372. nop
  373. FMADD y09, alpha1r, a1, y09
  374. FMADD y10, alpha1i, a1, y10
  375. FMADD y11, alpha1r, a3, y11
  376. FMADD y12, alpha1i, a3, y12
  377. FMADD y13, alpha1r, a5, y13
  378. FMADD y14, alpha1i, a5, y14
  379. FMADD y15, alpha1r, a7, y15
  380. FMADD y16, alpha1i, a7, y16
  381. LFD a1, 0 * SIZE(AO2)
  382. LFD a3, 2 * SIZE(AO2)
  383. LFD a5, 4 * SIZE(AO2)
  384. LFD a7, 6 * SIZE(AO2)
  385. FMSUBX y09, alpha1i, a2, y09
  386. FMADDX y10, alpha1r, a2, y10
  387. FMSUBX y11, alpha1i, a4, y11
  388. FMADDX y12, alpha1r, a4, y12
  389. FMSUBX y13, alpha1i, a6, y13
  390. FMADDX y14, alpha1r, a6, y14
  391. FMSUBX y15, alpha1i, a8, y15
  392. FMADDX y16, alpha1r, a8, y16
  393. LFD a2, 1 * SIZE(AO2)
  394. LFD a4, 3 * SIZE(AO2)
  395. LFD a6, 5 * SIZE(AO2)
  396. LFD a8, 7 * SIZE(AO2)
  397. FMADD y01, alpha2r, a1, y01
  398. FMADD y02, alpha2i, a1, y02
  399. FMADD y03, alpha2r, a3, y03
  400. FMADD y04, alpha2i, a3, y04
  401. FMADD y05, alpha2r, a5, y05
  402. FMADD y06, alpha2i, a5, y06
  403. FMADD y07, alpha2r, a7, y07
  404. FMADD y08, alpha2i, a7, y08
  405. LFD a1, 8 * SIZE(AO2)
  406. LFD a3, 10 * SIZE(AO2)
  407. LFD a5, 12 * SIZE(AO2)
  408. LFD a7, 14 * SIZE(AO2)
  409. FMSUBX y01, alpha2i, a2, y01
  410. FMADDX y02, alpha2r, a2, y02
  411. FMSUBX y03, alpha2i, a4, y03
  412. FMADDX y04, alpha2r, a4, y04
  413. FMSUBX y05, alpha2i, a6, y05
  414. FMADDX y06, alpha2r, a6, y06
  415. FMSUBX y07, alpha2i, a8, y07
  416. FMADDX y08, alpha2r, a8, y08
  417. LFD a2, 9 * SIZE(AO2)
  418. LFD a4, 11 * SIZE(AO2)
  419. LFD a6, 13 * SIZE(AO2)
  420. LFD a8, 15 * SIZE(AO2)
  421. addi AO2, AO2, 16 * SIZE
  422. nop
  423. DCBT(AO2, PREA)
  424. nop
  425. FMADD y09, alpha2r, a1, y09
  426. FMADD y10, alpha2i, a1, y10
  427. FMADD y11, alpha2r, a3, y11
  428. FMADD y12, alpha2i, a3, y12
  429. FMADD y13, alpha2r, a5, y13
  430. FMADD y14, alpha2i, a5, y14
  431. FMADD y15, alpha2r, a7, y15
  432. FMADD y16, alpha2i, a7, y16
  433. LFD a1, 0 * SIZE(AO3)
  434. LFD a3, 2 * SIZE(AO3)
  435. LFD a5, 4 * SIZE(AO3)
  436. LFD a7, 6 * SIZE(AO3)
  437. FMSUBX y09, alpha2i, a2, y09
  438. FMADDX y10, alpha2r, a2, y10
  439. FMSUBX y11, alpha2i, a4, y11
  440. FMADDX y12, alpha2r, a4, y12
  441. FMSUBX y13, alpha2i, a6, y13
  442. FMADDX y14, alpha2r, a6, y14
  443. FMSUBX y15, alpha2i, a8, y15
  444. FMADDX y16, alpha2r, a8, y16
  445. LFD a2, 1 * SIZE(AO3)
  446. LFD a4, 3 * SIZE(AO3)
  447. LFD a6, 5 * SIZE(AO3)
  448. LFD a8, 7 * SIZE(AO3)
  449. FMADD y01, alpha3r, a1, y01
  450. FMADD y02, alpha3i, a1, y02
  451. FMADD y03, alpha3r, a3, y03
  452. FMADD y04, alpha3i, a3, y04
  453. FMADD y05, alpha3r, a5, y05
  454. FMADD y06, alpha3i, a5, y06
  455. FMADD y07, alpha3r, a7, y07
  456. FMADD y08, alpha3i, a7, y08
  457. LFD a1, 8 * SIZE(AO3)
  458. LFD a3, 10 * SIZE(AO3)
  459. LFD a5, 12 * SIZE(AO3)
  460. LFD a7, 14 * SIZE(AO3)
  461. FMSUBX y01, alpha3i, a2, y01
  462. FMADDX y02, alpha3r, a2, y02
  463. FMSUBX y03, alpha3i, a4, y03
  464. FMADDX y04, alpha3r, a4, y04
  465. FMSUBX y05, alpha3i, a6, y05
  466. FMADDX y06, alpha3r, a6, y06
  467. FMSUBX y07, alpha3i, a8, y07
  468. FMADDX y08, alpha3r, a8, y08
  469. LFD a2, 9 * SIZE(AO3)
  470. LFD a4, 11 * SIZE(AO3)
  471. LFD a6, 13 * SIZE(AO3)
  472. LFD a8, 15 * SIZE(AO3)
  473. addi AO3, AO3, 16 * SIZE
  474. nop
  475. DCBT(AO3, PREA)
  476. nop
  477. FMADD y09, alpha3r, a1, y09
  478. FMADD y10, alpha3i, a1, y10
  479. FMADD y11, alpha3r, a3, y11
  480. FMADD y12, alpha3i, a3, y12
  481. FMADD y13, alpha3r, a5, y13
  482. FMADD y14, alpha3i, a5, y14
  483. FMADD y15, alpha3r, a7, y15
  484. FMADD y16, alpha3i, a7, y16
  485. LFD a1, 0 * SIZE(AO4)
  486. LFD a3, 2 * SIZE(AO4)
  487. LFD a5, 4 * SIZE(AO4)
  488. LFD a7, 6 * SIZE(AO4)
  489. FMSUBX y09, alpha3i, a2, y09
  490. FMADDX y10, alpha3r, a2, y10
  491. FMSUBX y11, alpha3i, a4, y11
  492. FMADDX y12, alpha3r, a4, y12
  493. FMSUBX y13, alpha3i, a6, y13
  494. FMADDX y14, alpha3r, a6, y14
  495. FMSUBX y15, alpha3i, a8, y15
  496. FMADDX y16, alpha3r, a8, y16
  497. LFD a2, 1 * SIZE(AO4)
  498. LFD a4, 3 * SIZE(AO4)
  499. LFD a6, 5 * SIZE(AO4)
  500. LFD a8, 7 * SIZE(AO4)
  501. FMADD y01, alpha4r, a1, y01
  502. FMADD y02, alpha4i, a1, y02
  503. FMADD y03, alpha4r, a3, y03
  504. FMADD y04, alpha4i, a3, y04
  505. FMADD y05, alpha4r, a5, y05
  506. FMADD y06, alpha4i, a5, y06
  507. FMADD y07, alpha4r, a7, y07
  508. FMADD y08, alpha4i, a7, y08
  509. LFD a1, 8 * SIZE(AO4)
  510. LFD a3, 10 * SIZE(AO4)
  511. LFD a5, 12 * SIZE(AO4)
  512. LFD a7, 14 * SIZE(AO4)
  513. FMSUBX y01, alpha4i, a2, y01
  514. FMADDX y02, alpha4r, a2, y02
  515. FMSUBX y03, alpha4i, a4, y03
  516. FMADDX y04, alpha4r, a4, y04
  517. STFD y01, 0 * SIZE(Y2)
  518. STFD y02, 1 * SIZE(Y2)
  519. STFD y03, 2 * SIZE(Y2)
  520. STFD y04, 3 * SIZE(Y2)
  521. LFD y01, 0 * SIZE(Y1)
  522. LFD y02, 1 * SIZE(Y1)
  523. LFD y03, 2 * SIZE(Y1)
  524. LFD y04, 3 * SIZE(Y1)
  525. FMSUBX y05, alpha4i, a6, y05
  526. FMADDX y06, alpha4r, a6, y06
  527. FMSUBX y07, alpha4i, a8, y07
  528. FMADDX y08, alpha4r, a8, y08
  529. LFD a2, 9 * SIZE(AO4)
  530. LFD a4, 11 * SIZE(AO4)
  531. LFD a6, 13 * SIZE(AO4)
  532. LFD a8, 15 * SIZE(AO4)
  533. addi AO4, AO4, 16 * SIZE
  534. nop
  535. DCBT(AO4, PREA)
  536. nop
  537. STFD y05, 4 * SIZE(Y2)
  538. STFD y06, 5 * SIZE(Y2)
  539. STFD y07, 6 * SIZE(Y2)
  540. STFD y08, 7 * SIZE(Y2)
  541. LFD y05, 4 * SIZE(Y1)
  542. LFD y06, 5 * SIZE(Y1)
  543. LFD y07, 6 * SIZE(Y1)
  544. LFD y08, 7 * SIZE(Y1)
  545. FMADD y09, alpha4r, a1, y09
  546. FMADD y10, alpha4i, a1, y10
  547. FMADD y11, alpha4r, a3, y11
  548. FMADD y12, alpha4i, a3, y12
  549. FMADD y13, alpha4r, a5, y13
  550. FMADD y14, alpha4i, a5, y14
  551. FMADD y15, alpha4r, a7, y15
  552. FMADD y16, alpha4i, a7, y16
  553. LFD a1, 0 * SIZE(AO1)
  554. LFD a3, 2 * SIZE(AO1)
  555. LFD a5, 4 * SIZE(AO1)
  556. LFD a7, 6 * SIZE(AO1)
  557. FMSUBX y09, alpha4i, a2, y09
  558. FMADDX y10, alpha4r, a2, y10
  559. FMSUBX y11, alpha4i, a4, y11
  560. FMADDX y12, alpha4r, a4, y12
  561. STFD y09, 8 * SIZE(Y2)
  562. STFD y10, 9 * SIZE(Y2)
  563. STFD y11, 10 * SIZE(Y2)
  564. STFD y12, 11 * SIZE(Y2)
  565. LFD y09, 8 * SIZE(Y1)
  566. LFD y10, 9 * SIZE(Y1)
  567. LFD y11, 10 * SIZE(Y1)
  568. LFD y12, 11 * SIZE(Y1)
  569. FMSUBX y13, alpha4i, a6, y13
  570. FMADDX y14, alpha4r, a6, y14
  571. FMSUBX y15, alpha4i, a8, y15
  572. FMADDX y16, alpha4r, a8, y16
  573. LFD a2, 1 * SIZE(AO1)
  574. LFD a4, 3 * SIZE(AO1)
  575. LFD a6, 5 * SIZE(AO1)
  576. LFD a8, 7 * SIZE(AO1)
  577. STFD y13, 12 * SIZE(Y2)
  578. STFD y14, 13 * SIZE(Y2)
  579. STFD y15, 14 * SIZE(Y2)
  580. STFD y16, 15 * SIZE(Y2)
  581. LFD y13, 12 * SIZE(Y1)
  582. LFD y14, 13 * SIZE(Y1)
  583. LFD y15, 14 * SIZE(Y1)
  584. LFD y16, 15 * SIZE(Y1)
  585. addi Y2, Y2, 16 * SIZE
  586. addi Y1, Y1, 16 * SIZE
  587. DCBT(Y1, PREC)
  588. bdnz LL(12)
  589. .align 4
  /*
   * LL(13): final 8-element group of the four-column pass.
   * Reached by fall-through when "bdnz LL(12)" stops looping. It uses the
   * a1-a8 and y01-y16 values that are already in registers on entry,
   * accumulates columns AO1..AO4 scaled by alpha1..alpha4 into y01-y16 and
   * stores the 16 results to Y2. Unlike LL(12) it issues no DCBT and does
   * not reload y01-y16 from Y1.
   * Register pattern: a1/a3/a5/a7 hold the even-offset values of a column,
   * a2/a4/a6/a8 the odd-offset ones; y(odd index) / y(even index) likewise.
   * NOTE(review): FMADDX/FMSUBX are macros defined outside this chunk;
   * presumably they select add or subtract per conjugation variant - confirm.
   */
  590. LL(13):
  /* alpha1 * column AO1 -> y01-y08 (offsets 0-7), next loads interleaved. */
  591. FMADD y01, alpha1r, a1, y01
  592. FMADD y02, alpha1i, a1, y02
  593. FMADD y03, alpha1r, a3, y03
  594. FMADD y04, alpha1i, a3, y04
  595. FMADD y05, alpha1r, a5, y05
  596. FMADD y06, alpha1i, a5, y06
  597. FMADD y07, alpha1r, a7, y07
  598. FMADD y08, alpha1i, a7, y08
  599. LFD a1, 8 * SIZE(AO1)
  600. LFD a3, 10 * SIZE(AO1)
  601. LFD a5, 12 * SIZE(AO1)
  602. LFD a7, 14 * SIZE(AO1)
  603. FMSUBX y01, alpha1i, a2, y01
  604. FMADDX y02, alpha1r, a2, y02
  605. FMSUBX y03, alpha1i, a4, y03
  606. FMADDX y04, alpha1r, a4, y04
  607. FMSUBX y05, alpha1i, a6, y05
  608. FMADDX y06, alpha1r, a6, y06
  609. FMSUBX y07, alpha1i, a8, y07
  610. FMADDX y08, alpha1r, a8, y08
  611. LFD a2, 9 * SIZE(AO1)
  612. LFD a4, 11 * SIZE(AO1)
  613. LFD a6, 13 * SIZE(AO1)
  614. LFD a8, 15 * SIZE(AO1)
  /* alpha1 * column AO1 -> y09-y16 (offsets 8-15). */
  615. FMADD y09, alpha1r, a1, y09
  616. FMADD y10, alpha1i, a1, y10
  617. FMADD y11, alpha1r, a3, y11
  618. FMADD y12, alpha1i, a3, y12
  619. FMADD y13, alpha1r, a5, y13
  620. FMADD y14, alpha1i, a5, y14
  621. FMADD y15, alpha1r, a7, y15
  622. FMADD y16, alpha1i, a7, y16
  623. LFD a1, 0 * SIZE(AO2)
  624. LFD a3, 2 * SIZE(AO2)
  625. LFD a5, 4 * SIZE(AO2)
  626. LFD a7, 6 * SIZE(AO2)
  627. FMSUBX y09, alpha1i, a2, y09
  628. FMADDX y10, alpha1r, a2, y10
  629. FMSUBX y11, alpha1i, a4, y11
  630. FMADDX y12, alpha1r, a4, y12
  631. FMSUBX y13, alpha1i, a6, y13
  632. FMADDX y14, alpha1r, a6, y14
  633. FMSUBX y15, alpha1i, a8, y15
  634. FMADDX y16, alpha1r, a8, y16
  635. LFD a2, 1 * SIZE(AO2)
  636. LFD a4, 3 * SIZE(AO2)
  637. LFD a6, 5 * SIZE(AO2)
  638. LFD a8, 7 * SIZE(AO2)
  /* alpha2 * column AO2 -> y01-y08. */
  639. FMADD y01, alpha2r, a1, y01
  640. FMADD y02, alpha2i, a1, y02
  641. FMADD y03, alpha2r, a3, y03
  642. FMADD y04, alpha2i, a3, y04
  643. FMADD y05, alpha2r, a5, y05
  644. FMADD y06, alpha2i, a5, y06
  645. FMADD y07, alpha2r, a7, y07
  646. FMADD y08, alpha2i, a7, y08
  647. LFD a1, 8 * SIZE(AO2)
  648. LFD a3, 10 * SIZE(AO2)
  649. LFD a5, 12 * SIZE(AO2)
  650. LFD a7, 14 * SIZE(AO2)
  651. FMSUBX y01, alpha2i, a2, y01
  652. FMADDX y02, alpha2r, a2, y02
  653. FMSUBX y03, alpha2i, a4, y03
  654. FMADDX y04, alpha2r, a4, y04
  655. FMSUBX y05, alpha2i, a6, y05
  656. FMADDX y06, alpha2r, a6, y06
  657. FMSUBX y07, alpha2i, a8, y07
  658. FMADDX y08, alpha2r, a8, y08
  659. LFD a2, 9 * SIZE(AO2)
  660. LFD a4, 11 * SIZE(AO2)
  661. LFD a6, 13 * SIZE(AO2)
  662. LFD a8, 15 * SIZE(AO2)
  /* alpha2 * column AO2 -> y09-y16. */
  663. FMADD y09, alpha2r, a1, y09
  664. FMADD y10, alpha2i, a1, y10
  665. FMADD y11, alpha2r, a3, y11
  666. FMADD y12, alpha2i, a3, y12
  667. FMADD y13, alpha2r, a5, y13
  668. FMADD y14, alpha2i, a5, y14
  669. FMADD y15, alpha2r, a7, y15
  670. FMADD y16, alpha2i, a7, y16
  671. LFD a1, 0 * SIZE(AO3)
  672. LFD a3, 2 * SIZE(AO3)
  673. LFD a5, 4 * SIZE(AO3)
  674. LFD a7, 6 * SIZE(AO3)
  675. FMSUBX y09, alpha2i, a2, y09
  676. FMADDX y10, alpha2r, a2, y10
  677. FMSUBX y11, alpha2i, a4, y11
  678. FMADDX y12, alpha2r, a4, y12
  679. FMSUBX y13, alpha2i, a6, y13
  680. FMADDX y14, alpha2r, a6, y14
  681. FMSUBX y15, alpha2i, a8, y15
  682. FMADDX y16, alpha2r, a8, y16
  683. LFD a2, 1 * SIZE(AO3)
  684. LFD a4, 3 * SIZE(AO3)
  685. LFD a6, 5 * SIZE(AO3)
  686. LFD a8, 7 * SIZE(AO3)
  /* alpha3 * column AO3 -> y01-y08. */
  687. FMADD y01, alpha3r, a1, y01
  688. FMADD y02, alpha3i, a1, y02
  689. FMADD y03, alpha3r, a3, y03
  690. FMADD y04, alpha3i, a3, y04
  691. FMADD y05, alpha3r, a5, y05
  692. FMADD y06, alpha3i, a5, y06
  693. FMADD y07, alpha3r, a7, y07
  694. FMADD y08, alpha3i, a7, y08
  695. LFD a1, 8 * SIZE(AO3)
  696. LFD a3, 10 * SIZE(AO3)
  697. LFD a5, 12 * SIZE(AO3)
  698. LFD a7, 14 * SIZE(AO3)
  699. FMSUBX y01, alpha3i, a2, y01
  700. FMADDX y02, alpha3r, a2, y02
  701. FMSUBX y03, alpha3i, a4, y03
  702. FMADDX y04, alpha3r, a4, y04
  703. FMSUBX y05, alpha3i, a6, y05
  704. FMADDX y06, alpha3r, a6, y06
  705. FMSUBX y07, alpha3i, a8, y07
  706. FMADDX y08, alpha3r, a8, y08
  707. LFD a2, 9 * SIZE(AO3)
  708. LFD a4, 11 * SIZE(AO3)
  709. LFD a6, 13 * SIZE(AO3)
  710. LFD a8, 15 * SIZE(AO3)
  /* alpha3 * column AO3 -> y09-y16. */
  711. FMADD y09, alpha3r, a1, y09
  712. FMADD y10, alpha3i, a1, y10
  713. FMADD y11, alpha3r, a3, y11
  714. FMADD y12, alpha3i, a3, y12
  715. FMADD y13, alpha3r, a5, y13
  716. FMADD y14, alpha3i, a5, y14
  717. FMADD y15, alpha3r, a7, y15
  718. FMADD y16, alpha3i, a7, y16
  719. LFD a1, 0 * SIZE(AO4)
  720. LFD a3, 2 * SIZE(AO4)
  721. LFD a5, 4 * SIZE(AO4)
  722. LFD a7, 6 * SIZE(AO4)
  723. FMSUBX y09, alpha3i, a2, y09
  724. FMADDX y10, alpha3r, a2, y10
  725. FMSUBX y11, alpha3i, a4, y11
  726. FMADDX y12, alpha3r, a4, y12
  727. FMSUBX y13, alpha3i, a6, y13
  728. FMADDX y14, alpha3r, a6, y14
  729. FMSUBX y15, alpha3i, a8, y15
  730. FMADDX y16, alpha3r, a8, y16
  731. LFD a2, 1 * SIZE(AO4)
  732. LFD a4, 3 * SIZE(AO4)
  733. LFD a6, 5 * SIZE(AO4)
  734. LFD a8, 7 * SIZE(AO4)
  /* alpha4 * column AO4 -> y01-y08. */
  735. FMADD y01, alpha4r, a1, y01
  736. FMADD y02, alpha4i, a1, y02
  737. FMADD y03, alpha4r, a3, y03
  738. FMADD y04, alpha4i, a3, y04
  739. FMADD y05, alpha4r, a5, y05
  740. FMADD y06, alpha4i, a5, y06
  741. FMADD y07, alpha4r, a7, y07
  742. FMADD y08, alpha4i, a7, y08
  743. LFD a1, 8 * SIZE(AO4)
  744. LFD a3, 10 * SIZE(AO4)
  745. LFD a5, 12 * SIZE(AO4)
  746. LFD a7, 14 * SIZE(AO4)
  747. FMSUBX y01, alpha4i, a2, y01
  748. FMADDX y02, alpha4r, a2, y02
  749. FMSUBX y03, alpha4i, a4, y03
  750. FMADDX y04, alpha4r, a4, y04
  751. FMSUBX y05, alpha4i, a6, y05
  752. FMADDX y06, alpha4r, a6, y06
  753. FMSUBX y07, alpha4i, a8, y07
  754. FMADDX y08, alpha4r, a8, y08
  755. LFD a2, 9 * SIZE(AO4)
  756. LFD a4, 11 * SIZE(AO4)
  757. LFD a6, 13 * SIZE(AO4)
  758. LFD a8, 15 * SIZE(AO4)
  /* alpha4 * column AO4 -> y09-y16. */
  759. FMADD y09, alpha4r, a1, y09
  760. FMADD y10, alpha4i, a1, y10
  761. FMADD y11, alpha4r, a3, y11
  762. FMADD y12, alpha4i, a3, y12
  763. FMADD y13, alpha4r, a5, y13
  764. FMADD y14, alpha4i, a5, y14
  765. FMADD y15, alpha4r, a7, y15
  766. FMADD y16, alpha4i, a7, y16
  /*
   * Look-ahead loads from offsets 16-23 of AO1, i.e. just past the group
   * being finished. NOTE(review): every path visible in this chunk reloads
   * a1-a8 before reading them, so these look unused here - confirm against
   * LL(11), which is outside this view.
   */
  767. LFD a1, 16 * SIZE(AO1)
  768. LFD a3, 18 * SIZE(AO1)
  769. LFD a5, 20 * SIZE(AO1)
  770. LFD a7, 22 * SIZE(AO1)
  771. FMSUBX y09, alpha4i, a2, y09
  772. FMADDX y10, alpha4r, a2, y10
  773. FMSUBX y11, alpha4i, a4, y11
  774. FMADDX y12, alpha4r, a4, y12
  775. FMSUBX y13, alpha4i, a6, y13
  776. FMADDX y14, alpha4r, a6, y14
  777. FMSUBX y15, alpha4i, a8, y15
  778. FMADDX y16, alpha4r, a8, y16
  779. LFD a2, 17 * SIZE(AO1)
  780. LFD a4, 19 * SIZE(AO1)
  781. LFD a6, 21 * SIZE(AO1)
  782. LFD a8, 23 * SIZE(AO1)
  /* Step all four column pointers past the 16 values just consumed. */
  783. addi AO1, AO1, 16 * SIZE
  784. addi AO2, AO2, 16 * SIZE
  785. addi AO3, AO3, 16 * SIZE
  786. addi AO4, AO4, 16 * SIZE
  /* Write the finished group to Y2 and advance Y2 (Y1 is left untouched). */
  787. STFD y01, 0 * SIZE(Y2)
  788. STFD y02, 1 * SIZE(Y2)
  789. STFD y03, 2 * SIZE(Y2)
  790. STFD y04, 3 * SIZE(Y2)
  791. STFD y05, 4 * SIZE(Y2)
  792. STFD y06, 5 * SIZE(Y2)
  793. STFD y07, 6 * SIZE(Y2)
  794. STFD y08, 7 * SIZE(Y2)
  795. STFD y09, 8 * SIZE(Y2)
  796. STFD y10, 9 * SIZE(Y2)
  797. STFD y11, 10 * SIZE(Y2)
  798. STFD y12, 11 * SIZE(Y2)
  799. STFD y13, 12 * SIZE(Y2)
  800. STFD y14, 13 * SIZE(Y2)
  801. STFD y15, 14 * SIZE(Y2)
  802. STFD y16, 15 * SIZE(Y2)
  803. addi Y2, Y2, 16 * SIZE
  804. .align 4
  /*
   * LL(15): remainder dispatch for the four-column pass.
   * If the low three bits of M are all clear there is nothing left and
   * control goes to LL(19). Otherwise, when bit value 4 of M is set, one
   * group of 8 SIZE-sized values (offsets 0-7) is loaded from Y1, updated
   * with columns AO1..AO4 scaled by alpha1..alpha4 and stored to Y2.
   * "andi." never yields a negative result, so each "ble" below is taken
   * exactly when the masked value is zero.
   * NOTE(review): FMADDX/FMSUBX are macros defined outside this chunk;
   * presumably they select add or subtract per conjugation variant - confirm.
   */
  805. LL(15):
  806. andi. r0, M, 7
  807. ble LL(19)
  808. andi. r0, M, 4
  809. ble LL(16)
  /* Load y01-y08 from Y1 and offsets 0-7 of column AO1. */
  810. LFD y01, 0 * SIZE(Y1)
  811. LFD y02, 1 * SIZE(Y1)
  812. LFD y03, 2 * SIZE(Y1)
  813. LFD y04, 3 * SIZE(Y1)
  814. LFD a1, 0 * SIZE(AO1)
  815. LFD a3, 2 * SIZE(AO1)
  816. LFD a5, 4 * SIZE(AO1)
  817. LFD a7, 6 * SIZE(AO1)
  818. LFD y05, 4 * SIZE(Y1)
  819. LFD y06, 5 * SIZE(Y1)
  820. LFD y07, 6 * SIZE(Y1)
  821. LFD y08, 7 * SIZE(Y1)
  822. LFD a2, 1 * SIZE(AO1)
  823. LFD a4, 3 * SIZE(AO1)
  824. LFD a6, 5 * SIZE(AO1)
  825. LFD a8, 7 * SIZE(AO1)
  /* alpha1 * column AO1; loads for column AO2 are interleaved. */
  826. FMADD y01, alpha1r, a1, y01
  827. FMADD y02, alpha1i, a1, y02
  828. FMADD y03, alpha1r, a3, y03
  829. FMADD y04, alpha1i, a3, y04
  830. FMADD y05, alpha1r, a5, y05
  831. FMADD y06, alpha1i, a5, y06
  832. FMADD y07, alpha1r, a7, y07
  833. FMADD y08, alpha1i, a7, y08
  834. LFD a1, 0 * SIZE(AO2)
  835. LFD a3, 2 * SIZE(AO2)
  836. LFD a5, 4 * SIZE(AO2)
  837. LFD a7, 6 * SIZE(AO2)
  838. FMSUBX y01, alpha1i, a2, y01
  839. FMADDX y02, alpha1r, a2, y02
  840. FMSUBX y03, alpha1i, a4, y03
  841. FMADDX y04, alpha1r, a4, y04
  842. FMSUBX y05, alpha1i, a6, y05
  843. FMADDX y06, alpha1r, a6, y06
  844. FMSUBX y07, alpha1i, a8, y07
  845. FMADDX y08, alpha1r, a8, y08
  846. LFD a2, 1 * SIZE(AO2)
  847. LFD a4, 3 * SIZE(AO2)
  848. LFD a6, 5 * SIZE(AO2)
  849. LFD a8, 7 * SIZE(AO2)
  /* alpha2 * column AO2; loads for column AO3 are interleaved. */
  850. FMADD y01, alpha2r, a1, y01
  851. FMADD y02, alpha2i, a1, y02
  852. FMADD y03, alpha2r, a3, y03
  853. FMADD y04, alpha2i, a3, y04
  854. FMADD y05, alpha2r, a5, y05
  855. FMADD y06, alpha2i, a5, y06
  856. FMADD y07, alpha2r, a7, y07
  857. FMADD y08, alpha2i, a7, y08
  858. LFD a1, 0 * SIZE(AO3)
  859. LFD a3, 2 * SIZE(AO3)
  860. LFD a5, 4 * SIZE(AO3)
  861. LFD a7, 6 * SIZE(AO3)
  862. FMSUBX y01, alpha2i, a2, y01
  863. FMADDX y02, alpha2r, a2, y02
  864. FMSUBX y03, alpha2i, a4, y03
  865. FMADDX y04, alpha2r, a4, y04
  866. FMSUBX y05, alpha2i, a6, y05
  867. FMADDX y06, alpha2r, a6, y06
  868. FMSUBX y07, alpha2i, a8, y07
  869. FMADDX y08, alpha2r, a8, y08
  870. LFD a2, 1 * SIZE(AO3)
  871. LFD a4, 3 * SIZE(AO3)
  872. LFD a6, 5 * SIZE(AO3)
  873. LFD a8, 7 * SIZE(AO3)
  /* alpha3 * column AO3; loads for column AO4 are interleaved. */
  874. FMADD y01, alpha3r, a1, y01
  875. FMADD y02, alpha3i, a1, y02
  876. FMADD y03, alpha3r, a3, y03
  877. FMADD y04, alpha3i, a3, y04
  878. FMADD y05, alpha3r, a5, y05
  879. FMADD y06, alpha3i, a5, y06
  880. FMADD y07, alpha3r, a7, y07
  881. FMADD y08, alpha3i, a7, y08
  882. LFD a1, 0 * SIZE(AO4)
  883. LFD a3, 2 * SIZE(AO4)
  884. LFD a5, 4 * SIZE(AO4)
  885. LFD a7, 6 * SIZE(AO4)
  886. FMSUBX y01, alpha3i, a2, y01
  887. FMADDX y02, alpha3r, a2, y02
  888. FMSUBX y03, alpha3i, a4, y03
  889. FMADDX y04, alpha3r, a4, y04
  890. FMSUBX y05, alpha3i, a6, y05
  891. FMADDX y06, alpha3r, a6, y06
  892. FMSUBX y07, alpha3i, a8, y07
  893. FMADDX y08, alpha3r, a8, y08
  894. LFD a2, 1 * SIZE(AO4)
  895. LFD a4, 3 * SIZE(AO4)
  896. LFD a6, 5 * SIZE(AO4)
  897. LFD a8, 7 * SIZE(AO4)
  /* alpha4 * column AO4; results are stored to Y2 as they complete. */
  898. FMADD y01, alpha4r, a1, y01
  899. FMADD y02, alpha4i, a1, y02
  900. FMADD y03, alpha4r, a3, y03
  901. FMADD y04, alpha4i, a3, y04
  902. FMADD y05, alpha4r, a5, y05
  903. FMADD y06, alpha4i, a5, y06
  904. FMADD y07, alpha4r, a7, y07
  905. FMADD y08, alpha4i, a7, y08
  906. FMSUBX y01, alpha4i, a2, y01
  907. FMADDX y02, alpha4r, a2, y02
  908. FMSUBX y03, alpha4i, a4, y03
  909. FMADDX y04, alpha4r, a4, y04
  910. STFD y01, 0 * SIZE(Y2)
  911. STFD y02, 1 * SIZE(Y2)
  912. STFD y03, 2 * SIZE(Y2)
  913. STFD y04, 3 * SIZE(Y2)
  914. FMSUBX y05, alpha4i, a6, y05
  915. FMADDX y06, alpha4r, a6, y06
  916. FMSUBX y07, alpha4i, a8, y07
  917. FMADDX y08, alpha4r, a8, y08
  918. STFD y05, 4 * SIZE(Y2)
  919. STFD y06, 5 * SIZE(Y2)
  920. STFD y07, 6 * SIZE(Y2)
  921. STFD y08, 7 * SIZE(Y2)
  /* Advance every column pointer and both Y pointers by the 8 values used. */
  922. addi AO1, AO1, 8 * SIZE
  923. addi AO2, AO2, 8 * SIZE
  924. addi AO3, AO3, 8 * SIZE
  925. addi AO4, AO4, 8 * SIZE
  926. addi Y1, Y1, 8 * SIZE
  927. addi Y2, Y2, 8 * SIZE
  928. .align 4
  /*
   * LL(16): four-column remainder for bit value 2 of M.
   * When the bit is set, 4 SIZE-sized values (offsets 0-3) are loaded from
   * Y1, updated with columns AO1..AO4 scaled by alpha1..alpha4 and stored
   * to Y2; otherwise control goes to LL(17).
   * a1-a4 carry columns AO1 then AO3, a5-a8 carry columns AO2 then AO4, so
   * one column's loads can be issued while the other's values are in use.
   * NOTE(review): FMADDX/FMSUBX are macros defined outside this chunk;
   * presumably they select add or subtract per conjugation variant - confirm.
   */
  929. LL(16):
  930. andi. r0, M, 2
  /* NOTE(review): purpose of the two nops is not evident from this chunk;
     presumably instruction-group padding - confirm. */
  931. nop
  932. nop
  /* andi. cannot produce a negative value, so this branches when the bit is 0. */
  933. ble LL(17)
  934. LFD a1, 0 * SIZE(AO1)
  935. LFD a2, 1 * SIZE(AO1)
  936. LFD a3, 2 * SIZE(AO1)
  937. LFD a4, 3 * SIZE(AO1)
  938. LFD y01, 0 * SIZE(Y1)
  939. LFD y02, 1 * SIZE(Y1)
  940. LFD y03, 2 * SIZE(Y1)
  941. LFD y04, 3 * SIZE(Y1)
  942. LFD a5, 0 * SIZE(AO2)
  943. LFD a6, 1 * SIZE(AO2)
  944. LFD a7, 2 * SIZE(AO2)
  945. LFD a8, 3 * SIZE(AO2)
  /* alpha1 * column AO1 (a1-a4). */
  946. FMADD y01, alpha1r, a1, y01
  947. FMADD y02, alpha1i, a1, y02
  948. FMADD y03, alpha1r, a3, y03
  949. FMADD y04, alpha1i, a3, y04
  950. FMSUBX y01, alpha1i, a2, y01
  951. FMADDX y02, alpha1r, a2, y02
  952. FMSUBX y03, alpha1i, a4, y03
  953. FMADDX y04, alpha1r, a4, y04
  /* a1-a4 are free again: reuse them for column AO3. */
  954. LFD a1, 0 * SIZE(AO3)
  955. LFD a2, 1 * SIZE(AO3)
  956. LFD a3, 2 * SIZE(AO3)
  957. LFD a4, 3 * SIZE(AO3)
  /* alpha2 * column AO2 (a5-a8). */
  958. FMADD y01, alpha2r, a5, y01
  959. FMADD y02, alpha2i, a5, y02
  960. FMADD y03, alpha2r, a7, y03
  961. FMADD y04, alpha2i, a7, y04
  962. FMSUBX y01, alpha2i, a6, y01
  963. FMADDX y02, alpha2r, a6, y02
  964. FMSUBX y03, alpha2i, a8, y03
  965. FMADDX y04, alpha2r, a8, y04
  /* a5-a8 are free again: reuse them for column AO4. */
  966. LFD a5, 0 * SIZE(AO4)
  967. LFD a6, 1 * SIZE(AO4)
  968. LFD a7, 2 * SIZE(AO4)
  969. LFD a8, 3 * SIZE(AO4)
  /* alpha3 * column AO3 (a1-a4). */
  970. FMADD y01, alpha3r, a1, y01
  971. FMADD y02, alpha3i, a1, y02
  972. FMADD y03, alpha3r, a3, y03
  973. FMADD y04, alpha3i, a3, y04
  974. FMSUBX y01, alpha3i, a2, y01
  975. FMADDX y02, alpha3r, a2, y02
  976. FMSUBX y03, alpha3i, a4, y03
  977. FMADDX y04, alpha3r, a4, y04
  /* alpha4 * column AO4 (a5-a8). */
  978. FMADD y01, alpha4r, a5, y01
  979. FMADD y02, alpha4i, a5, y02
  980. FMADD y03, alpha4r, a7, y03
  981. FMADD y04, alpha4i, a7, y04
  982. FMSUBX y01, alpha4i, a6, y01
  983. FMADDX y02, alpha4r, a6, y02
  984. FMSUBX y03, alpha4i, a8, y03
  985. FMADDX y04, alpha4r, a8, y04
  986. STFD y01, 0 * SIZE(Y2)
  987. STFD y02, 1 * SIZE(Y2)
  988. STFD y03, 2 * SIZE(Y2)
  989. STFD y04, 3 * SIZE(Y2)
  /* Advance every column pointer and both Y pointers by the 4 values used. */
  990. addi AO1, AO1, 4 * SIZE
  991. addi AO2, AO2, 4 * SIZE
  992. addi AO3, AO3, 4 * SIZE
  993. addi AO4, AO4, 4 * SIZE
  994. addi Y1, Y1, 4 * SIZE
  995. addi Y2, Y2, 4 * SIZE
  996. .align 4
  /*
   * LL(17): four-column remainder for bit value 1 of M.
   * When the bit is set, one pair of values (offsets 0-1) is loaded from
   * Y1, updated with the matching pair of each of AO1..AO4 scaled by
   * alpha1..alpha4 and stored to Y2; otherwise control goes to LL(19).
   * Register use: (a1,a2)=AO1, (a3,a4)=AO2, (a5,a6)=AO3, (a7,a8)=AO4.
   * NOTE(review): FMADDX/FMSUBX are macros defined outside this chunk;
   * presumably they select add or subtract per conjugation variant - confirm.
   */
  997. LL(17):
  998. andi. r0, M, 1
  /* andi. cannot produce a negative value, so this branches when the bit is 0. */
  999. ble LL(19)
  1000. LFD y01, 0 * SIZE(Y1)
  1001. LFD y02, 1 * SIZE(Y1)
  1002. LFD a1, 0 * SIZE(AO1)
  1003. LFD a2, 1 * SIZE(AO1)
  1004. LFD a3, 0 * SIZE(AO2)
  1005. LFD a4, 1 * SIZE(AO2)
  1006. LFD a5, 0 * SIZE(AO3)
  1007. LFD a6, 1 * SIZE(AO3)
  1008. LFD a7, 0 * SIZE(AO4)
  1009. LFD a8, 1 * SIZE(AO4)
  /* Accumulate the four columns one after another into (y01, y02). */
  1010. FMADD y01, alpha1r, a1, y01
  1011. FMADD y02, alpha1i, a1, y02
  1012. FMSUBX y01, alpha1i, a2, y01
  1013. FMADDX y02, alpha1r, a2, y02
  1014. FMADD y01, alpha2r, a3, y01
  1015. FMADD y02, alpha2i, a3, y02
  1016. FMSUBX y01, alpha2i, a4, y01
  1017. FMADDX y02, alpha2r, a4, y02
  1018. FMADD y01, alpha3r, a5, y01
  1019. FMADD y02, alpha3i, a5, y02
  1020. FMSUBX y01, alpha3i, a6, y01
  1021. FMADDX y02, alpha3r, a6, y02
  1022. FMADD y01, alpha4r, a7, y01
  1023. FMADD y02, alpha4i, a7, y02
  1024. FMSUBX y01, alpha4i, a8, y01
  1025. FMADDX y02, alpha4r, a8, y02
  1026. STFD y01, 0 * SIZE(Y2)
  1027. STFD y02, 1 * SIZE(Y2)
  /*
   * Y1/Y2 step by INCY here, whereas the wider tails above step by a fixed
   * multiple of SIZE; the column pointers are not advanced at all.
   * NOTE(review): LL(21) and LL(31) reassign Y1/Y2 from Y before use;
   * whether LL(11) does the same is outside this view - confirm.
   */
  1028. add Y1, Y1, INCY
  1029. add Y2, Y2, INCY
  1030. .align 4
  /*
   * LL(19): close of the four-column pass. Decrements the counter J and
   * branches back to LL(11) (defined before this chunk) while J is still
   * greater than zero; otherwise falls through to LL(20).
   */
  1031. LL(19):
  1032. addi J, J, -1
  1033. cmpi cr0, 0, J, 0
  1034. bgt LL(11)
  1035. .align 4
  /*
   * LL(20): decide whether a two-column pass is needed. J = N & 2; since
   * andi. never yields a negative value, "ble" skips to LL(30) exactly when
   * that bit of N is clear.
   */
  1036. LL(20):
  1037. andi. J, N, 2
  1038. ble LL(30)
  1039. .align 4
  /*
   * LL(21): setup for the two-column pass.
   *  - reloads alpha_r/alpha_i from ALPHA_R/ALPHA_I (defined outside this
   *    chunk),
   *  - reads two pairs from X, stepping X by INCX after each pair,
   *  - combines them with alpha into (alpha1r, alpha1i), (alpha2r, alpha2i),
   *  - points AO1/AO2 at two consecutive columns and advances A past them,
   *  - preloads the first group of 8 pairs before entering the LL(22) loop.
   * NOTE(review): FMSUBR/FMADDR are macros defined outside this chunk;
   * presumably they complete the complex product alpha * x for the
   * conjugation variant being built - confirm.
   */
  1040. LL(21):
  1041. lfd alpha_r, ALPHA_R
  1042. lfd alpha_i, ALPHA_I
  /* (a1,a2) = first X pair, (a3,a4) = second X pair. */
  1043. LFD a1, 0 * SIZE(X)
  1044. LFD a2, 1 * SIZE(X)
  1045. add X, X, INCX
  1046. LFD a3, 0 * SIZE(X)
  1047. LFD a4, 1 * SIZE(X)
  1048. add X, X, INCX
  /* Products with the first value of each pair ... */
  1049. FMUL alpha1r, alpha_r, a1
  1050. FMUL alpha1i, alpha_i, a1
  1051. FMUL alpha2r, alpha_r, a3
  1052. FMUL alpha2i, alpha_i, a3
  /* ... then fold in the second value of each pair. */
  1053. FMSUBR alpha1r, alpha_i, a2, alpha1r
  1054. FMADDR alpha1i, alpha_r, a2, alpha1i
  1055. FMSUBR alpha2r, alpha_i, a4, alpha2r
  1056. FMADDR alpha2i, alpha_r, a4, alpha2i
  /* AO1 = A, AO2 = A + LDA, and A moves on by 2 * LDA for the next pass. */
  1057. mr AO1, A
  1058. add AO2, A, LDA
  1059. add A, AO2, LDA
  /* Read pointer (Y1) and write pointer (Y2) both start at Y. */
  1060. mr Y1, Y
  1061. mr Y2, Y
  /* CTR = M >> 3 = number of 8-pair groups; go to the remainder code at
     LL(25) when that count is not positive (mtspr leaves CR0 untouched). */
  1062. srawi. r0, M, 3
  1063. mtspr CTR, r0
  1064. ble LL(25)
  1065. .align 4
  /* Preload offsets 0-7 of column AO1 and the first 16 values of Y1. */
  1066. LFD a1, 0 * SIZE(AO1)
  1067. LFD a2, 1 * SIZE(AO1)
  1068. LFD a3, 2 * SIZE(AO1)
  1069. LFD a4, 3 * SIZE(AO1)
  1070. LFD y01, 0 * SIZE(Y1)
  1071. LFD y02, 1 * SIZE(Y1)
  1072. LFD y03, 2 * SIZE(Y1)
  1073. LFD y04, 3 * SIZE(Y1)
  1074. LFD a5, 4 * SIZE(AO1)
  1075. LFD a6, 5 * SIZE(AO1)
  1076. LFD a7, 6 * SIZE(AO1)
  1077. LFD a8, 7 * SIZE(AO1)
  1078. LFD y05, 4 * SIZE(Y1)
  1079. LFD y06, 5 * SIZE(Y1)
  1080. LFD y07, 6 * SIZE(Y1)
  1081. LFD y08, 7 * SIZE(Y1)
  1082. LFD y09, 8 * SIZE(Y1)
  1083. LFD y10, 9 * SIZE(Y1)
  1084. LFD y11, 10 * SIZE(Y1)
  1085. LFD y12, 11 * SIZE(Y1)
  1086. LFD y13, 12 * SIZE(Y1)
  1087. LFD y14, 13 * SIZE(Y1)
  1088. LFD y15, 14 * SIZE(Y1)
  1089. LFD y16, 15 * SIZE(Y1)
  /* Y1 now runs one group ahead of Y2. */
  1090. addi Y1, Y1, 16 * SIZE
  /* Decrement CTR; with only one group, skip the loop and drain at LL(23). */
  1091. bdz LL(23)
  1092. .align 4
  /*
   * LL(22): main loop of the two-column pass, one 8-pair group (16 values)
   * per iteration, repeated by "bdnz" while CTR is non-zero.
   * On entry a1-a8 hold offsets 0-7 of column AO1 and y01-y16 hold the
   * current group from Y. Each iteration adds alpha1 * AO1 and alpha2 * AO2
   * to y01-y16, stores the results to Y2, and - interleaved with that -
   * reloads y01-y16 from Y1 and a1-a8 from AO1 for the next iteration.
   * AO1 and AO2 are advanced in the middle of the body, so the offsets that
   * follow each "addi" already refer to the next group.
   * NOTE(review): FMADDX/FMSUBX and DCBT are macros defined outside this
   * chunk; DCBT is presumably a cache-touch prefetch hint with PREA/PREC as
   * its distance - confirm.
   */
  1093. LL(22):
  /* alpha1 * column AO1 -> y01-y08. */
  1094. FMADD y01, alpha1r, a1, y01
  1095. FMADD y02, alpha1i, a1, y02
  1096. FMADD y03, alpha1r, a3, y03
  1097. FMADD y04, alpha1i, a3, y04
  1098. FMADD y05, alpha1r, a5, y05
  1099. FMADD y06, alpha1i, a5, y06
  1100. FMADD y07, alpha1r, a7, y07
  1101. FMADD y08, alpha1i, a7, y08
  1102. LFD a1, 8 * SIZE(AO1)
  1103. LFD a3, 10 * SIZE(AO1)
  1104. LFD a5, 12 * SIZE(AO1)
  1105. LFD a7, 14 * SIZE(AO1)
  1106. FMSUBX y01, alpha1i, a2, y01
  1107. FMADDX y02, alpha1r, a2, y02
  1108. FMSUBX y03, alpha1i, a4, y03
  1109. FMADDX y04, alpha1r, a4, y04
  1110. FMSUBX y05, alpha1i, a6, y05
  1111. FMADDX y06, alpha1r, a6, y06
  1112. FMSUBX y07, alpha1i, a8, y07
  1113. FMADDX y08, alpha1r, a8, y08
  1114. LFD a2, 9 * SIZE(AO1)
  1115. LFD a4, 11 * SIZE(AO1)
  1116. LFD a6, 13 * SIZE(AO1)
  1117. LFD a8, 15 * SIZE(AO1)
  /* All 16 values of this AO1 group are in registers: advance AO1. */
  1118. addi AO1, AO1, 16 * SIZE
  1119. nop
  1120. DCBT(AO1, PREA)
  1121. nop
  /* alpha1 * column AO1 -> y09-y16; loads for column AO2 are interleaved. */
  1122. FMADD y09, alpha1r, a1, y09
  1123. FMADD y10, alpha1i, a1, y10
  1124. FMADD y11, alpha1r, a3, y11
  1125. FMADD y12, alpha1i, a3, y12
  1126. FMADD y13, alpha1r, a5, y13
  1127. FMADD y14, alpha1i, a5, y14
  1128. FMADD y15, alpha1r, a7, y15
  1129. FMADD y16, alpha1i, a7, y16
  1130. LFD a1, 0 * SIZE(AO2)
  1131. LFD a3, 2 * SIZE(AO2)
  1132. LFD a5, 4 * SIZE(AO2)
  1133. LFD a7, 6 * SIZE(AO2)
  1134. FMSUBX y09, alpha1i, a2, y09
  1135. FMADDX y10, alpha1r, a2, y10
  1136. FMSUBX y11, alpha1i, a4, y11
  1137. FMADDX y12, alpha1r, a4, y12
  1138. FMSUBX y13, alpha1i, a6, y13
  1139. FMADDX y14, alpha1r, a6, y14
  1140. FMSUBX y15, alpha1i, a8, y15
  1141. FMADDX y16, alpha1r, a8, y16
  1142. LFD a2, 1 * SIZE(AO2)
  1143. LFD a4, 3 * SIZE(AO2)
  1144. LFD a6, 5 * SIZE(AO2)
  1145. LFD a8, 7 * SIZE(AO2)
  /* alpha2 * column AO2 -> y01-y08. */
  1146. FMADD y01, alpha2r, a1, y01
  1147. FMADD y02, alpha2i, a1, y02
  1148. FMADD y03, alpha2r, a3, y03
  1149. FMADD y04, alpha2i, a3, y04
  1150. FMADD y05, alpha2r, a5, y05
  1151. FMADD y06, alpha2i, a5, y06
  1152. FMADD y07, alpha2r, a7, y07
  1153. FMADD y08, alpha2i, a7, y08
  1154. LFD a1, 8 * SIZE(AO2)
  1155. LFD a3, 10 * SIZE(AO2)
  1156. LFD a5, 12 * SIZE(AO2)
  1157. LFD a7, 14 * SIZE(AO2)
  1158. FMSUBX y01, alpha2i, a2, y01
  1159. FMADDX y02, alpha2r, a2, y02
  1160. FMSUBX y03, alpha2i, a4, y03
  1161. FMADDX y04, alpha2r, a4, y04
  /* y01-y04 are final: store to Y2, then refill from the next group in Y1. */
  1162. STFD y01, 0 * SIZE(Y2)
  1163. STFD y02, 1 * SIZE(Y2)
  1164. STFD y03, 2 * SIZE(Y2)
  1165. STFD y04, 3 * SIZE(Y2)
  1166. LFD y01, 0 * SIZE(Y1)
  1167. LFD y02, 1 * SIZE(Y1)
  1168. LFD y03, 2 * SIZE(Y1)
  1169. LFD y04, 3 * SIZE(Y1)
  1170. FMSUBX y05, alpha2i, a6, y05
  1171. FMADDX y06, alpha2r, a6, y06
  1172. FMSUBX y07, alpha2i, a8, y07
  1173. FMADDX y08, alpha2r, a8, y08
  1174. LFD a2, 9 * SIZE(AO2)
  1175. LFD a4, 11 * SIZE(AO2)
  1176. LFD a6, 13 * SIZE(AO2)
  1177. LFD a8, 15 * SIZE(AO2)
  /* y05-y08 are final: store, then refill from Y1. */
  1178. STFD y05, 4 * SIZE(Y2)
  1179. STFD y06, 5 * SIZE(Y2)
  1180. STFD y07, 6 * SIZE(Y2)
  1181. STFD y08, 7 * SIZE(Y2)
  1182. LFD y05, 4 * SIZE(Y1)
  1183. LFD y06, 5 * SIZE(Y1)
  1184. LFD y07, 6 * SIZE(Y1)
  1185. LFD y08, 7 * SIZE(Y1)
  /* All 16 values of this AO2 group are in registers: advance AO2. */
  1186. addi AO2, AO2, 16 * SIZE
  1187. nop
  1188. DCBT(AO2, PREA)
  1189. nop
  /* alpha2 * column AO2 -> y09-y16. */
  1190. FMADD y09, alpha2r, a1, y09
  1191. FMADD y10, alpha2i, a1, y10
  1192. FMADD y11, alpha2r, a3, y11
  1193. FMADD y12, alpha2i, a3, y12
  1194. FMADD y13, alpha2r, a5, y13
  1195. FMADD y14, alpha2i, a5, y14
  1196. FMADD y15, alpha2r, a7, y15
  1197. FMADD y16, alpha2i, a7, y16
  /* AO1 was already advanced above, so these fetch the NEXT group. */
  1198. LFD a1, 0 * SIZE(AO1)
  1199. LFD a3, 2 * SIZE(AO1)
  1200. LFD a5, 4 * SIZE(AO1)
  1201. LFD a7, 6 * SIZE(AO1)
  1202. FMSUBX y09, alpha2i, a2, y09
  1203. FMADDX y10, alpha2r, a2, y10
  1204. FMSUBX y11, alpha2i, a4, y11
  1205. FMADDX y12, alpha2r, a4, y12
  1206. STFD y09, 8 * SIZE(Y2)
  1207. STFD y10, 9 * SIZE(Y2)
  1208. STFD y11, 10 * SIZE(Y2)
  1209. STFD y12, 11 * SIZE(Y2)
  1210. LFD y09, 8 * SIZE(Y1)
  1211. LFD y10, 9 * SIZE(Y1)
  1212. LFD y11, 10 * SIZE(Y1)
  1213. LFD y12, 11 * SIZE(Y1)
  1214. FMSUBX y13, alpha2i, a6, y13
  1215. FMADDX y14, alpha2r, a6, y14
  1216. FMSUBX y15, alpha2i, a8, y15
  1217. FMADDX y16, alpha2r, a8, y16
  1218. LFD a2, 1 * SIZE(AO1)
  1219. LFD a4, 3 * SIZE(AO1)
  1220. LFD a6, 5 * SIZE(AO1)
  1221. LFD a8, 7 * SIZE(AO1)
  1222. STFD y13, 12 * SIZE(Y2)
  1223. STFD y14, 13 * SIZE(Y2)
  1224. STFD y15, 14 * SIZE(Y2)
  1225. STFD y16, 15 * SIZE(Y2)
  1226. LFD y13, 12 * SIZE(Y1)
  1227. LFD y14, 13 * SIZE(Y1)
  1228. LFD y15, 14 * SIZE(Y1)
  1229. LFD y16, 15 * SIZE(Y1)
  /* Advance the write and read pointers by one group and loop on CTR. */
  1230. addi Y2, Y2, 16 * SIZE
  1231. addi Y1, Y1, 16 * SIZE
  1232. DCBT(Y1, PREC)
  1233. bdnz LL(22)
  1234. .align 4
  /*
   * LL(23): final 8-pair group of the two-column pass.
   * Reached from "bdz LL(23)" in the setup code or by fall-through from the
   * LL(22) loop. Same arithmetic as one LL(22) iteration (alpha1 * AO1 plus
   * alpha2 * AO2 added to y01-y16, results stored to Y2) but with no DCBT,
   * no reload of y01-y16 from Y1 and no preload of a following group;
   * AO1/AO2 are advanced only at the end.
   * NOTE(review): FMADDX/FMSUBX are macros defined outside this chunk;
   * presumably they select add or subtract per conjugation variant - confirm.
   */
  1235. LL(23):
  /* alpha1 * column AO1 -> y01-y08. */
  1236. FMADD y01, alpha1r, a1, y01
  1237. FMADD y02, alpha1i, a1, y02
  1238. FMADD y03, alpha1r, a3, y03
  1239. FMADD y04, alpha1i, a3, y04
  1240. FMADD y05, alpha1r, a5, y05
  1241. FMADD y06, alpha1i, a5, y06
  1242. FMADD y07, alpha1r, a7, y07
  1243. FMADD y08, alpha1i, a7, y08
  1244. LFD a1, 8 * SIZE(AO1)
  1245. LFD a3, 10 * SIZE(AO1)
  1246. LFD a5, 12 * SIZE(AO1)
  1247. LFD a7, 14 * SIZE(AO1)
  1248. FMSUBX y01, alpha1i, a2, y01
  1249. FMADDX y02, alpha1r, a2, y02
  1250. FMSUBX y03, alpha1i, a4, y03
  1251. FMADDX y04, alpha1r, a4, y04
  1252. FMSUBX y05, alpha1i, a6, y05
  1253. FMADDX y06, alpha1r, a6, y06
  1254. FMSUBX y07, alpha1i, a8, y07
  1255. FMADDX y08, alpha1r, a8, y08
  1256. LFD a2, 9 * SIZE(AO1)
  1257. LFD a4, 11 * SIZE(AO1)
  1258. LFD a6, 13 * SIZE(AO1)
  1259. LFD a8, 15 * SIZE(AO1)
  /* alpha1 * column AO1 -> y09-y16; loads for column AO2 are interleaved. */
  1260. FMADD y09, alpha1r, a1, y09
  1261. FMADD y10, alpha1i, a1, y10
  1262. FMADD y11, alpha1r, a3, y11
  1263. FMADD y12, alpha1i, a3, y12
  1264. FMADD y13, alpha1r, a5, y13
  1265. FMADD y14, alpha1i, a5, y14
  1266. FMADD y15, alpha1r, a7, y15
  1267. FMADD y16, alpha1i, a7, y16
  1268. LFD a1, 0 * SIZE(AO2)
  1269. LFD a3, 2 * SIZE(AO2)
  1270. LFD a5, 4 * SIZE(AO2)
  1271. LFD a7, 6 * SIZE(AO2)
  1272. FMSUBX y09, alpha1i, a2, y09
  1273. FMADDX y10, alpha1r, a2, y10
  1274. FMSUBX y11, alpha1i, a4, y11
  1275. FMADDX y12, alpha1r, a4, y12
  1276. FMSUBX y13, alpha1i, a6, y13
  1277. FMADDX y14, alpha1r, a6, y14
  1278. FMSUBX y15, alpha1i, a8, y15
  1279. FMADDX y16, alpha1r, a8, y16
  1280. LFD a2, 1 * SIZE(AO2)
  1281. LFD a4, 3 * SIZE(AO2)
  1282. LFD a6, 5 * SIZE(AO2)
  1283. LFD a8, 7 * SIZE(AO2)
  /* alpha2 * column AO2 -> y01-y08, stored to Y2 as they complete. */
  1284. FMADD y01, alpha2r, a1, y01
  1285. FMADD y02, alpha2i, a1, y02
  1286. FMADD y03, alpha2r, a3, y03
  1287. FMADD y04, alpha2i, a3, y04
  1288. FMADD y05, alpha2r, a5, y05
  1289. FMADD y06, alpha2i, a5, y06
  1290. FMADD y07, alpha2r, a7, y07
  1291. FMADD y08, alpha2i, a7, y08
  1292. LFD a1, 8 * SIZE(AO2)
  1293. LFD a3, 10 * SIZE(AO2)
  1294. LFD a5, 12 * SIZE(AO2)
  1295. LFD a7, 14 * SIZE(AO2)
  1296. FMSUBX y01, alpha2i, a2, y01
  1297. FMADDX y02, alpha2r, a2, y02
  1298. FMSUBX y03, alpha2i, a4, y03
  1299. FMADDX y04, alpha2r, a4, y04
  1300. STFD y01, 0 * SIZE(Y2)
  1301. STFD y02, 1 * SIZE(Y2)
  1302. STFD y03, 2 * SIZE(Y2)
  1303. STFD y04, 3 * SIZE(Y2)
  1304. FMSUBX y05, alpha2i, a6, y05
  1305. FMADDX y06, alpha2r, a6, y06
  1306. FMSUBX y07, alpha2i, a8, y07
  1307. FMADDX y08, alpha2r, a8, y08
  1308. LFD a2, 9 * SIZE(AO2)
  1309. LFD a4, 11 * SIZE(AO2)
  1310. LFD a6, 13 * SIZE(AO2)
  1311. LFD a8, 15 * SIZE(AO2)
  1312. STFD y05, 4 * SIZE(Y2)
  1313. STFD y06, 5 * SIZE(Y2)
  1314. STFD y07, 6 * SIZE(Y2)
  1315. STFD y08, 7 * SIZE(Y2)
  /* alpha2 * column AO2 -> y09-y16. */
  1316. FMADD y09, alpha2r, a1, y09
  1317. FMADD y10, alpha2i, a1, y10
  1318. FMADD y11, alpha2r, a3, y11
  1319. FMADD y12, alpha2i, a3, y12
  1320. FMADD y13, alpha2r, a5, y13
  1321. FMADD y14, alpha2i, a5, y14
  1322. FMADD y15, alpha2r, a7, y15
  1323. FMADD y16, alpha2i, a7, y16
  1324. FMSUBX y09, alpha2i, a2, y09
  1325. FMADDX y10, alpha2r, a2, y10
  1326. FMSUBX y11, alpha2i, a4, y11
  1327. FMADDX y12, alpha2r, a4, y12
  1328. FMSUBX y13, alpha2i, a6, y13
  1329. FMADDX y14, alpha2r, a6, y14
  1330. FMSUBX y15, alpha2i, a8, y15
  1331. FMADDX y16, alpha2r, a8, y16
  1332. STFD y09, 8 * SIZE(Y2)
  1333. STFD y10, 9 * SIZE(Y2)
  1334. STFD y11, 10 * SIZE(Y2)
  1335. STFD y12, 11 * SIZE(Y2)
  1336. STFD y13, 12 * SIZE(Y2)
  1337. STFD y14, 13 * SIZE(Y2)
  1338. STFD y15, 14 * SIZE(Y2)
  1339. STFD y16, 15 * SIZE(Y2)
  /* Step both column pointers and Y2 past the group (Y1 is left untouched). */
  1340. addi AO1, AO1, 16 * SIZE
  1341. addi AO2, AO2, 16 * SIZE
  1342. addi Y2, Y2, 16 * SIZE
  1343. .align 4
  /*
   * LL(25): remainder dispatch for the two-column pass.
   * If the low three bits of M are all clear, control goes to LL(30).
   * Otherwise, when bit value 4 of M is set, 8 values (offsets 0-7) are
   * loaded from Y1, updated with alpha1 * AO1 and alpha2 * AO2 and stored
   * to Y2. "andi." never yields a negative result, so each "ble" is taken
   * exactly when the masked value is zero.
   * NOTE(review): FMADDX/FMSUBX are macros defined outside this chunk;
   * presumably they select add or subtract per conjugation variant - confirm.
   */
  1344. LL(25):
  1345. andi. r0, M, 7
  1346. ble LL(30)
  1347. andi. r0, M, 4
  1348. ble LL(26)
  /* Load y01-y08 from Y1 and offsets 0-7 of column AO1. */
  1349. LFD y01, 0 * SIZE(Y1)
  1350. LFD y02, 1 * SIZE(Y1)
  1351. LFD y03, 2 * SIZE(Y1)
  1352. LFD y04, 3 * SIZE(Y1)
  1353. LFD a1, 0 * SIZE(AO1)
  1354. LFD a3, 2 * SIZE(AO1)
  1355. LFD a5, 4 * SIZE(AO1)
  1356. LFD a7, 6 * SIZE(AO1)
  1357. LFD y05, 4 * SIZE(Y1)
  1358. LFD y06, 5 * SIZE(Y1)
  1359. LFD y07, 6 * SIZE(Y1)
  1360. LFD y08, 7 * SIZE(Y1)
  1361. LFD a2, 1 * SIZE(AO1)
  1362. LFD a4, 3 * SIZE(AO1)
  1363. LFD a6, 5 * SIZE(AO1)
  1364. LFD a8, 7 * SIZE(AO1)
  /* alpha1 * column AO1; loads for column AO2 are interleaved. */
  1365. FMADD y01, alpha1r, a1, y01
  1366. FMADD y02, alpha1i, a1, y02
  1367. FMADD y03, alpha1r, a3, y03
  1368. FMADD y04, alpha1i, a3, y04
  1369. FMADD y05, alpha1r, a5, y05
  1370. FMADD y06, alpha1i, a5, y06
  1371. FMADD y07, alpha1r, a7, y07
  1372. FMADD y08, alpha1i, a7, y08
  1373. LFD a1, 0 * SIZE(AO2)
  1374. LFD a3, 2 * SIZE(AO2)
  1375. LFD a5, 4 * SIZE(AO2)
  1376. LFD a7, 6 * SIZE(AO2)
  1377. FMSUBX y01, alpha1i, a2, y01
  1378. FMADDX y02, alpha1r, a2, y02
  1379. FMSUBX y03, alpha1i, a4, y03
  1380. FMADDX y04, alpha1r, a4, y04
  1381. FMSUBX y05, alpha1i, a6, y05
  1382. FMADDX y06, alpha1r, a6, y06
  1383. FMSUBX y07, alpha1i, a8, y07
  1384. FMADDX y08, alpha1r, a8, y08
  1385. LFD a2, 1 * SIZE(AO2)
  1386. LFD a4, 3 * SIZE(AO2)
  1387. LFD a6, 5 * SIZE(AO2)
  1388. LFD a8, 7 * SIZE(AO2)
  /* alpha2 * column AO2; results are stored to Y2 as they complete. */
  1389. FMADD y01, alpha2r, a1, y01
  1390. FMADD y02, alpha2i, a1, y02
  1391. FMADD y03, alpha2r, a3, y03
  1392. FMADD y04, alpha2i, a3, y04
  1393. FMADD y05, alpha2r, a5, y05
  1394. FMADD y06, alpha2i, a5, y06
  1395. FMADD y07, alpha2r, a7, y07
  1396. FMADD y08, alpha2i, a7, y08
  1397. FMSUBX y01, alpha2i, a2, y01
  1398. FMADDX y02, alpha2r, a2, y02
  1399. FMSUBX y03, alpha2i, a4, y03
  1400. FMADDX y04, alpha2r, a4, y04
  1401. STFD y01, 0 * SIZE(Y2)
  1402. STFD y02, 1 * SIZE(Y2)
  1403. STFD y03, 2 * SIZE(Y2)
  1404. STFD y04, 3 * SIZE(Y2)
  1405. FMSUBX y05, alpha2i, a6, y05
  1406. FMADDX y06, alpha2r, a6, y06
  1407. FMSUBX y07, alpha2i, a8, y07
  1408. FMADDX y08, alpha2r, a8, y08
  1409. STFD y05, 4 * SIZE(Y2)
  1410. STFD y06, 5 * SIZE(Y2)
  1411. STFD y07, 6 * SIZE(Y2)
  1412. STFD y08, 7 * SIZE(Y2)
  /* Advance both column pointers and both Y pointers by the 8 values used. */
  1413. addi AO1, AO1, 8 * SIZE
  1414. addi AO2, AO2, 8 * SIZE
  1415. addi Y1, Y1, 8 * SIZE
  1416. addi Y2, Y2, 8 * SIZE
  1417. .align 4
  /*
   * LL(26): two-column remainder for bit value 2 of M.
   * When the bit is set, 4 values (offsets 0-3) are loaded from Y1, updated
   * with alpha1 * AO1 (held in a1-a4) and alpha2 * AO2 (held in a5-a8) and
   * stored to Y2; otherwise control goes to LL(27).
   * NOTE(review): FMADDX/FMSUBX are macros defined outside this chunk;
   * presumably they select add or subtract per conjugation variant - confirm.
   */
  1418. LL(26):
  1419. andi. r0, M, 2
  /* andi. cannot produce a negative value, so this branches when the bit is 0. */
  1420. ble LL(27)
  1421. LFD a1, 0 * SIZE(AO1)
  1422. LFD a3, 2 * SIZE(AO1)
  1423. LFD a5, 0 * SIZE(AO2)
  1424. LFD a7, 2 * SIZE(AO2)
  1425. LFD y01, 0 * SIZE(Y1)
  1426. LFD y02, 1 * SIZE(Y1)
  1427. LFD y03, 2 * SIZE(Y1)
  1428. LFD y04, 3 * SIZE(Y1)
  1429. LFD a2, 1 * SIZE(AO1)
  1430. LFD a4, 3 * SIZE(AO1)
  1431. LFD a6, 1 * SIZE(AO2)
  1432. LFD a8, 3 * SIZE(AO2)
  /* alpha1 * column AO1. */
  1433. FMADD y01, alpha1r, a1, y01
  1434. FMADD y02, alpha1i, a1, y02
  1435. FMADD y03, alpha1r, a3, y03
  1436. FMADD y04, alpha1i, a3, y04
  1437. FMSUBX y01, alpha1i, a2, y01
  1438. FMADDX y02, alpha1r, a2, y02
  1439. FMSUBX y03, alpha1i, a4, y03
  1440. FMADDX y04, alpha1r, a4, y04
  /* alpha2 * column AO2. */
  1441. FMADD y01, alpha2r, a5, y01
  1442. FMADD y02, alpha2i, a5, y02
  1443. FMADD y03, alpha2r, a7, y03
  1444. FMADD y04, alpha2i, a7, y04
  1445. FMSUBX y01, alpha2i, a6, y01
  1446. FMADDX y02, alpha2r, a6, y02
  1447. FMSUBX y03, alpha2i, a8, y03
  1448. FMADDX y04, alpha2r, a8, y04
  1449. STFD y01, 0 * SIZE(Y2)
  1450. STFD y02, 1 * SIZE(Y2)
  1451. STFD y03, 2 * SIZE(Y2)
  1452. STFD y04, 3 * SIZE(Y2)
  /* Advance both column pointers and both Y pointers by the 4 values used. */
  1453. addi AO1, AO1, 4 * SIZE
  1454. addi AO2, AO2, 4 * SIZE
  1455. addi Y1, Y1, 4 * SIZE
  1456. addi Y2, Y2, 4 * SIZE
  1457. .align 4
  /*
   * LL(27): two-column remainder for bit value 1 of M.
   * When the bit is set, one pair of values is loaded from Y1, updated with
   * the matching pair of AO1 (a1,a2) scaled by alpha1 and of AO2 (a3,a4)
   * scaled by alpha2, and stored to Y2; otherwise control goes to LL(30).
   * NOTE(review): FMADDX/FMSUBX are macros defined outside this chunk;
   * presumably they select add or subtract per conjugation variant - confirm.
   */
  1458. LL(27):
  1459. andi. r0, M, 1
  /* andi. cannot produce a negative value, so this branches when the bit is 0. */
  1460. ble LL(30)
  1461. LFD y01, 0 * SIZE(Y1)
  1462. LFD y02, 1 * SIZE(Y1)
  1463. LFD a1, 0 * SIZE(AO1)
  1464. LFD a2, 1 * SIZE(AO1)
  1465. LFD a3, 0 * SIZE(AO2)
  1466. LFD a4, 1 * SIZE(AO2)
  1467. FMADD y01, alpha1r, a1, y01
  1468. FMADD y02, alpha1i, a1, y02
  1469. FMSUBX y01, alpha1i, a2, y01
  1470. FMADDX y02, alpha1r, a2, y02
  1471. FMADD y01, alpha2r, a3, y01
  1472. FMADD y02, alpha2i, a3, y02
  1473. FMSUBX y01, alpha2i, a4, y01
  1474. FMADDX y02, alpha2r, a4, y02
  1475. STFD y01, 0 * SIZE(Y2)
  1476. STFD y02, 1 * SIZE(Y2)
  /*
   * Y1/Y2 step by INCY here, whereas the wider tails step by a fixed
   * multiple of SIZE; AO1/AO2 are not advanced. The next visible user,
   * LL(31), reassigns Y1/Y2 from Y before reading them.
   */
  1477. add Y1, Y1, INCY
  1478. add Y2, Y2, INCY
  1479. .align 4
  /*
   * LL(30): decide whether a final single-column pass is needed.
   * J = N & 1; since andi. never yields a negative value, "ble" jumps to
   * LL(999) (defined after this chunk) exactly when N is even.
   */
  1480. LL(30):
  1481. andi. J, N, 1
  1482. ble LL(999)
  1483. .align 4
  /*
   * LL(31): setup for the single-column pass.
   *  - reloads alpha_r/alpha_i from ALPHA_R/ALPHA_I (defined outside this
   *    chunk),
   *  - reads one pair from X and steps X by INCX,
   *  - combines it with alpha into (alpha1r, alpha1i),
   *  - points AO1 at the current column and advances A by LDA,
   *  - preloads the first group of 8 pairs before entering the LL(32) loop.
   * NOTE(review): FMSUBR/FMADDR are macros defined outside this chunk;
   * presumably they complete the complex product alpha * x for the
   * conjugation variant being built - confirm.
   */
  1484. LL(31):
  1485. lfd alpha_r, ALPHA_R
  1486. lfd alpha_i, ALPHA_I
  1487. LFD a1, 0 * SIZE(X)
  1488. LFD a2, 1 * SIZE(X)
  1489. add X, X, INCX
  /* Products with the first X value, then fold in the second one. */
  1490. FMUL alpha1r, alpha_r, a1
  1491. FMUL alpha1i, alpha_i, a1
  1492. FMSUBR alpha1r, alpha_i, a2, alpha1r
  1493. FMADDR alpha1i, alpha_r, a2, alpha1i
  /* AO1 = A, and A moves on by LDA. */
  1494. mr AO1, A
  1495. add A, AO1, LDA
  /* Read pointer (Y1) and write pointer (Y2) both start at Y. */
  1496. mr Y1, Y
  1497. mr Y2, Y
  /* CTR = M >> 3 = number of 8-pair groups; go to LL(35) (after this chunk)
     when that count is not positive (mtspr leaves CR0 untouched). */
  1498. srawi. r0, M, 3
  1499. mtspr CTR, r0
  1500. ble LL(35)
  1501. .align 4
  /* Preload the first 16 values of Y1 and offsets 0-7 of column AO1. */
  1502. LFD y01, 0 * SIZE(Y1)
  1503. LFD y02, 1 * SIZE(Y1)
  1504. LFD y03, 2 * SIZE(Y1)
  1505. LFD y04, 3 * SIZE(Y1)
  1506. LFD y05, 4 * SIZE(Y1)
  1507. LFD y06, 5 * SIZE(Y1)
  1508. LFD y07, 6 * SIZE(Y1)
  1509. LFD y08, 7 * SIZE(Y1)
  1510. LFD y09, 8 * SIZE(Y1)
  1511. LFD y10, 9 * SIZE(Y1)
  1512. LFD y11, 10 * SIZE(Y1)
  1513. LFD y12, 11 * SIZE(Y1)
  1514. LFD y13, 12 * SIZE(Y1)
  1515. LFD y14, 13 * SIZE(Y1)
  1516. LFD y15, 14 * SIZE(Y1)
  1517. LFD y16, 15 * SIZE(Y1)
  1518. LFD a1, 0 * SIZE(AO1)
  1519. LFD a2, 1 * SIZE(AO1)
  1520. LFD a3, 2 * SIZE(AO1)
  1521. LFD a4, 3 * SIZE(AO1)
  1522. LFD a5, 4 * SIZE(AO1)
  1523. LFD a6, 5 * SIZE(AO1)
  1524. LFD a7, 6 * SIZE(AO1)
  1525. LFD a8, 7 * SIZE(AO1)
  /* Y1 now runs one group ahead of Y2. */
  1526. addi Y1, Y1, 16 * SIZE
  /* Decrement CTR; with only one group, skip the loop and go to LL(33). */
  1527. bdz LL(33)
  1528. .align 4
  /* LL(32): steady-state loop for one column.  Each pass updates the 16 y
     scalars already held in y01..y16 with alpha1r/alpha1i times the A values
     in a1..a8, stores them through Y2, and reloads y and A registers for the
     next pass.  Y1 points one 16-scalar group ahead of Y2.
     The instruction order interleaves loads, arithmetic and stores, so
     statements must not be reordered casually.
     FMADD/FMSUBX/FMADDX/LFD/STFD/DCBT are not defined in this chunk. */
  1529. LL(32):
  /* y01..y08 += alpha1{r,i} * A scalars at even offsets 0,2,4,6. */
  1530. FMADD y01, alpha1r, a1, y01
  1531. FMADD y02, alpha1i, a1, y02
  1532. FMADD y03, alpha1r, a3, y03
  1533. FMADD y04, alpha1i, a3, y04
  1534. FMADD y05, alpha1r, a5, y05
  1535. FMADD y06, alpha1i, a5, y06
  1536. FMADD y07, alpha1r, a7, y07
  1537. FMADD y08, alpha1i, a7, y08
  /* a1,a3,a5,a7 are free now: fetch even offsets 8..14 for the second half. */
  1538. LFD a1, 8 * SIZE(AO1)
  1539. LFD a3, 10 * SIZE(AO1)
  1540. LFD a5, 12 * SIZE(AO1)
  1541. LFD a7, 14 * SIZE(AO1)
  /* Cross terms with the A scalars at odd offsets (a2, a4). */
  1542. FMSUBX y01, alpha1i, a2, y01
  1543. FMADDX y02, alpha1r, a2, y02
  1544. FMSUBX y03, alpha1i, a4, y03
  1545. FMADDX y04, alpha1r, a4, y04
  /* Store finished y01..y04 and refill them from the next group via Y1. */
  1546. STFD y01, 0 * SIZE(Y2)
  1547. STFD y02, 1 * SIZE(Y2)
  1548. STFD y03, 2 * SIZE(Y2)
  1549. STFD y04, 3 * SIZE(Y2)
  1550. LFD y01, 0 * SIZE(Y1)
  1551. LFD y02, 1 * SIZE(Y1)
  1552. LFD y03, 2 * SIZE(Y1)
  1553. LFD y04, 3 * SIZE(Y1)
  1554. FMSUBX y05, alpha1i, a6, y05
  1555. FMADDX y06, alpha1r, a6, y06
  1556. FMSUBX y07, alpha1i, a8, y07
  1557. FMADDX y08, alpha1r, a8, y08
  /* Odd offsets 9..15 for the second half, then advance AO1 by one group. */
  1558. LFD a2, 9 * SIZE(AO1)
  1559. LFD a4, 11 * SIZE(AO1)
  1560. LFD a6, 13 * SIZE(AO1)
  1561. LFD a8, 15 * SIZE(AO1)
  1562. addi AO1, AO1, 16 * SIZE
  /* DCBT with PREA is presumably a prefetch hint on the A stream; the nops
     look like scheduling padding (TODO confirm). */
  1563. nop
  1564. DCBT(AO1, PREA)
  1565. nop
  1566. STFD y05, 4 * SIZE(Y2)
  1567. STFD y06, 5 * SIZE(Y2)
  1568. STFD y07, 6 * SIZE(Y2)
  1569. STFD y08, 7 * SIZE(Y2)
  1570. LFD y05, 4 * SIZE(Y1)
  1571. LFD y06, 5 * SIZE(Y1)
  1572. LFD y07, 6 * SIZE(Y1)
  1573. LFD y08, 7 * SIZE(Y1)
  /* Second half of the group: y09..y16. */
  1574. FMADD y09, alpha1r, a1, y09
  1575. FMADD y10, alpha1i, a1, y10
  1576. FMADD y11, alpha1r, a3, y11
  1577. FMADD y12, alpha1i, a3, y12
  1578. FMADD y13, alpha1r, a5, y13
  1579. FMADD y14, alpha1i, a5, y14
  1580. FMADD y15, alpha1r, a7, y15
  1581. FMADD y16, alpha1i, a7, y16
  /* AO1 already points at the next group: preload its even offsets. */
  1582. LFD a1, 0 * SIZE(AO1)
  1583. LFD a3, 2 * SIZE(AO1)
  1584. LFD a5, 4 * SIZE(AO1)
  1585. LFD a7, 6 * SIZE(AO1)
  1586. FMSUBX y09, alpha1i, a2, y09
  1587. FMADDX y10, alpha1r, a2, y10
  1588. FMSUBX y11, alpha1i, a4, y11
  1589. FMADDX y12, alpha1r, a4, y12
  1590. STFD y09, 8 * SIZE(Y2)
  1591. STFD y10, 9 * SIZE(Y2)
  1592. STFD y11, 10 * SIZE(Y2)
  1593. STFD y12, 11 * SIZE(Y2)
  1594. LFD y09, 8 * SIZE(Y1)
  1595. LFD y10, 9 * SIZE(Y1)
  1596. LFD y11, 10 * SIZE(Y1)
  1597. LFD y12, 11 * SIZE(Y1)
  1598. FMSUBX y13, alpha1i, a6, y13
  1599. FMADDX y14, alpha1r, a6, y14
  1600. FMSUBX y15, alpha1i, a8, y15
  1601. FMADDX y16, alpha1r, a8, y16
  /* Preload the odd offsets of the next group. */
  1602. LFD a2, 1 * SIZE(AO1)
  1603. LFD a4, 3 * SIZE(AO1)
  1604. LFD a6, 5 * SIZE(AO1)
  1605. LFD a8, 7 * SIZE(AO1)
  1606. STFD y13, 12 * SIZE(Y2)
  1607. STFD y14, 13 * SIZE(Y2)
  1608. STFD y15, 14 * SIZE(Y2)
  1609. STFD y16, 15 * SIZE(Y2)
  1610. LFD y13, 12 * SIZE(Y1)
  1611. LFD y14, 13 * SIZE(Y1)
  1612. LFD y15, 14 * SIZE(Y1)
  1613. LFD y16, 15 * SIZE(Y1)
  /* Advance both y cursors by one group and loop while CTR != 0. */
  1614. addi Y1, Y1, 16 * SIZE
  1615. addi Y2, Y2, 16 * SIZE
  1616. DCBT(Y1, PREC)
  1617. bdnz LL(32)
  1618. .align 4
  /* LL(33): drain for the one-column loop.  Performs the same update as one
     pass of LL(32) on the group already held in y01..y16 / a1..a8, but does
     not reload y or preload a following group.  AO1 and Y2 are advanced by
     one group at the end; Y1 is left as is (it is already one group ahead). */
  1619. LL(33):
  1620. FMADD y01, alpha1r, a1, y01
  1621. FMADD y02, alpha1i, a1, y02
  1622. FMADD y03, alpha1r, a3, y03
  1623. FMADD y04, alpha1i, a3, y04
  1624. FMADD y05, alpha1r, a5, y05
  1625. FMADD y06, alpha1i, a5, y06
  1626. FMADD y07, alpha1r, a7, y07
  1627. FMADD y08, alpha1i, a7, y08
  /* Even offsets 8..14 for the second half of this group. */
  1628. LFD a1, 8 * SIZE(AO1)
  1629. LFD a3, 10 * SIZE(AO1)
  1630. LFD a5, 12 * SIZE(AO1)
  1631. LFD a7, 14 * SIZE(AO1)
  1632. FMSUBX y01, alpha1i, a2, y01
  1633. FMADDX y02, alpha1r, a2, y02
  1634. FMSUBX y03, alpha1i, a4, y03
  1635. FMADDX y04, alpha1r, a4, y04
  1636. STFD y01, 0 * SIZE(Y2)
  1637. STFD y02, 1 * SIZE(Y2)
  1638. STFD y03, 2 * SIZE(Y2)
  1639. STFD y04, 3 * SIZE(Y2)
  1640. FMSUBX y05, alpha1i, a6, y05
  1641. FMADDX y06, alpha1r, a6, y06
  1642. FMSUBX y07, alpha1i, a8, y07
  1643. FMADDX y08, alpha1r, a8, y08
  /* Odd offsets 9..15 for the second half of this group. */
  1644. LFD a2, 9 * SIZE(AO1)
  1645. LFD a4, 11 * SIZE(AO1)
  1646. LFD a6, 13 * SIZE(AO1)
  1647. LFD a8, 15 * SIZE(AO1)
  1648. STFD y05, 4 * SIZE(Y2)
  1649. STFD y06, 5 * SIZE(Y2)
  1650. STFD y07, 6 * SIZE(Y2)
  1651. STFD y08, 7 * SIZE(Y2)
  /* Second half: y09..y16. */
  1652. FMADD y09, alpha1r, a1, y09
  1653. FMADD y10, alpha1i, a1, y10
  1654. FMADD y11, alpha1r, a3, y11
  1655. FMADD y12, alpha1i, a3, y12
  1656. FMADD y13, alpha1r, a5, y13
  1657. FMADD y14, alpha1i, a5, y14
  1658. FMADD y15, alpha1r, a7, y15
  1659. FMADD y16, alpha1i, a7, y16
  1660. FMSUBX y09, alpha1i, a2, y09
  1661. FMADDX y10, alpha1r, a2, y10
  1662. FMSUBX y11, alpha1i, a4, y11
  1663. FMADDX y12, alpha1r, a4, y12
  1664. STFD y09, 8 * SIZE(Y2)
  1665. STFD y10, 9 * SIZE(Y2)
  1666. STFD y11, 10 * SIZE(Y2)
  1667. STFD y12, 11 * SIZE(Y2)
  1668. FMSUBX y13, alpha1i, a6, y13
  1669. FMADDX y14, alpha1r, a6, y14
  1670. FMSUBX y15, alpha1i, a8, y15
  1671. FMADDX y16, alpha1r, a8, y16
  1672. STFD y13, 12 * SIZE(Y2)
  1673. STFD y14, 13 * SIZE(Y2)
  1674. STFD y15, 14 * SIZE(Y2)
  1675. STFD y16, 15 * SIZE(Y2)
  /* Step past the group just finished. */
  1676. addi AO1, AO1, 16 * SIZE
  1677. addi Y2, Y2, 16 * SIZE
  1678. .align 4
  /* LL(35): remainder handling for one column.  If M & 7 is zero the column
     is done and control goes to LL(999) (outside this chunk).  If bit 2 of M
     is set, 8 y scalars are updated from 8 A scalars here; otherwise this
     part is skipped via LL(36). */
  1679. LL(35):
  1680. andi. r0, M, 7
  1681. ble LL(999)
  1682. andi. r0, M, 4
  1683. ble LL(36)
  1684. LFD y01, 0 * SIZE(Y1)
  1685. LFD y02, 1 * SIZE(Y1)
  1686. LFD y03, 2 * SIZE(Y1)
  1687. LFD y04, 3 * SIZE(Y1)
  1688. LFD a1, 0 * SIZE(AO1)
  1689. LFD a3, 2 * SIZE(AO1)
  1690. LFD a5, 4 * SIZE(AO1)
  1691. LFD a7, 6 * SIZE(AO1)
  1692. LFD y05, 4 * SIZE(Y1)
  1693. LFD y06, 5 * SIZE(Y1)
  1694. LFD y07, 6 * SIZE(Y1)
  1695. LFD y08, 7 * SIZE(Y1)
  1696. LFD a2, 1 * SIZE(AO1)
  1697. LFD a4, 3 * SIZE(AO1)
  1698. LFD a6, 5 * SIZE(AO1)
  1699. LFD a8, 7 * SIZE(AO1)
  /* Terms using the A scalars at even offsets. */
  1700. FMADD y01, alpha1r, a1, y01
  1701. FMADD y02, alpha1i, a1, y02
  1702. FMADD y03, alpha1r, a3, y03
  1703. FMADD y04, alpha1i, a3, y04
  1704. FMADD y05, alpha1r, a5, y05
  1705. FMADD y06, alpha1i, a5, y06
  1706. FMADD y07, alpha1r, a7, y07
  1707. FMADD y08, alpha1i, a7, y08
  /* Terms using the A scalars at odd offsets (sign set by FMSUBX/FMADDX,
     which are defined outside this chunk). */
  1708. FMSUBX y01, alpha1i, a2, y01
  1709. FMADDX y02, alpha1r, a2, y02
  1710. FMSUBX y03, alpha1i, a4, y03
  1711. FMADDX y04, alpha1r, a4, y04
  1712. FMSUBX y05, alpha1i, a6, y05
  1713. FMADDX y06, alpha1r, a6, y06
  1714. FMSUBX y07, alpha1i, a8, y07
  1715. FMADDX y08, alpha1r, a8, y08
  1716. STFD y01, 0 * SIZE(Y2)
  1717. STFD y02, 1 * SIZE(Y2)
  1718. STFD y03, 2 * SIZE(Y2)
  1719. STFD y04, 3 * SIZE(Y2)
  1720. STFD y05, 4 * SIZE(Y2)
  1721. STFD y06, 5 * SIZE(Y2)
  1722. STFD y07, 6 * SIZE(Y2)
  1723. STFD y08, 7 * SIZE(Y2)
  /* Advance all three cursors past the 8 scalars just handled. */
  1724. addi AO1, AO1, 8 * SIZE
  1725. addi Y1, Y1, 8 * SIZE
  1726. addi Y2, Y2, 8 * SIZE
  1727. .align 4
  /* LL(36): remainder step for bit 1 of M on the one-column path.  When the
     bit is set, 4 y scalars are updated from 4 A scalars; otherwise control
     skips to LL(37). */
  1728. LL(36):
  1729. andi. r0, M, 2
  1730. ble LL(37)
  1731. LFD a1, 0 * SIZE(AO1)
  1732. LFD a2, 1 * SIZE(AO1)
  1733. LFD a3, 2 * SIZE(AO1)
  1734. LFD a4, 3 * SIZE(AO1)
  1735. LFD y01, 0 * SIZE(Y1)
  1736. LFD y02, 1 * SIZE(Y1)
  1737. LFD y03, 2 * SIZE(Y1)
  1738. LFD y04, 3 * SIZE(Y1)
  /* Even-offset A scalars (a1, a3) first, then odd-offset ones (a2, a4). */
  1739. FMADD y01, alpha1r, a1, y01
  1740. FMADD y02, alpha1i, a1, y02
  1741. FMADD y03, alpha1r, a3, y03
  1742. FMADD y04, alpha1i, a3, y04
  1743. FMSUBX y01, alpha1i, a2, y01
  1744. FMADDX y02, alpha1r, a2, y02
  1745. FMSUBX y03, alpha1i, a4, y03
  1746. FMADDX y04, alpha1r, a4, y04
  1747. STFD y01, 0 * SIZE(Y2)
  1748. STFD y02, 1 * SIZE(Y2)
  1749. STFD y03, 2 * SIZE(Y2)
  1750. STFD y04, 3 * SIZE(Y2)
  /* Advance past the 4 scalars just handled. */
  1751. addi AO1, AO1, 4 * SIZE
  1752. addi Y1, Y1, 4 * SIZE
  1753. addi Y2, Y2, 4 * SIZE
  1754. .align 4
  /* LL(37): remainder step for bit 0 of M on the one-column path.  When the
     bit is set, one pair of y scalars is updated from one pair of A scalars.
     Either way control ends up at LL(999) (outside this chunk). */
  1755. LL(37):
  1756. andi. r0, M, 1
  1757. ble LL(999)
  1758. LFD y01, 0 * SIZE(Y1)
  1759. LFD y02, 1 * SIZE(Y1)
  1760. LFD a1, 0 * SIZE(AO1)
  1761. LFD a2, 1 * SIZE(AO1)
  1762. FMADD y01, alpha1r, a1, y01
  1763. FMADD y02, alpha1i, a1, y02
  1764. FMSUBX y01, alpha1i, a2, y01
  1765. FMADDX y02, alpha1r, a2, y02
  1766. STFD y01, 0 * SIZE(Y2)
  1767. STFD y02, 1 * SIZE(Y2)
  /* NOTE(review): Y1/Y2 are stepped by INCY here, whereas the rest of this
     path uses fixed SIZE increments.  Nothing between here and the branch
     reads them; confirm LL(999) does not depend on these values. */
  1768. add Y1, Y1, INCY
  1769. add Y2, Y2, INCY
  1770. b LL(999)
  1771. .align 4
  /* LL(100): entry of the path that steps through Y by INCY.  J = N >> 2 is
     the number of four-column passes; if it is not positive, skip to the
     two-column handling at LL(120). */
  1772. LL(100):
  1773. srawi. J, N, 2
  1774. ble LL(120)
  1775. .align 4
  /* LL(111): set-up for one pass over four columns of A, with Y addressed by
     stepping INCY per pair.  Loads alpha and four x elements, combines them
     into alpha1..alpha4 (r/i), sets AO1..AO4 to four consecutive columns, and
     preloads 16 y scalars plus the first 8 scalars of AO1 for LL(112).
     LFD, FMUL, FMSUBR and FMADDR are not defined in this chunk; the sign
     convention of FMSUBR/FMADDR presumably depends on the build variant
     (TODO confirm). */
  1776. LL(111):
  1777. lfd alpha_r, ALPHA_R
  1778. lfd alpha_i, ALPHA_I
  /* (a1,a2), (a3,a4), (a5,a6), (a7,a8) = four x elements, INCX apart. */
  1779. LFD a1, 0 * SIZE(X)
  1780. LFD a2, 1 * SIZE(X)
  1781. add X, X, INCX
  1782. LFD a3, 0 * SIZE(X)
  1783. LFD a4, 1 * SIZE(X)
  1784. add X, X, INCX
  1785. LFD a5, 0 * SIZE(X)
  1786. LFD a6, 1 * SIZE(X)
  1787. add X, X, INCX
  1788. LFD a7, 0 * SIZE(X)
  1789. LFD a8, 1 * SIZE(X)
  1790. add X, X, INCX
  /* alphaNr/alphaNi: alpha combined with the N-th x element. */
  1791. FMUL alpha1r, alpha_r, a1
  1792. FMUL alpha1i, alpha_i, a1
  1793. FMUL alpha2r, alpha_r, a3
  1794. FMUL alpha2i, alpha_i, a3
  1795. FMUL alpha3r, alpha_r, a5
  1796. FMUL alpha3i, alpha_i, a5
  1797. FMUL alpha4r, alpha_r, a7
  1798. FMUL alpha4i, alpha_i, a7
  1799. FMSUBR alpha1r, alpha_i, a2, alpha1r
  1800. FMADDR alpha1i, alpha_r, a2, alpha1i
  1801. FMSUBR alpha2r, alpha_i, a4, alpha2r
  1802. FMADDR alpha2i, alpha_r, a4, alpha2i
  1803. FMSUBR alpha3r, alpha_i, a6, alpha3r
  1804. FMADDR alpha3i, alpha_r, a6, alpha3i
  1805. FMSUBR alpha4r, alpha_i, a8, alpha4r
  1806. FMADDR alpha4i, alpha_r, a8, alpha4i
  /* AO1..AO4 = four columns LDA apart; A moves on to the column after AO4. */
  1807. mr AO1, A
  1808. add AO2, A, LDA
  1809. add AO3, AO2, LDA
  1810. add AO4, AO3, LDA
  1811. add A, AO4, LDA
  /* Y1 is used for loads and Y2 for stores; both start at Y. */
  1812. mr Y1, Y
  1813. mr Y2, Y
  /* CTR = M >> 3; if it is not positive, go to the remainder code. */
  1814. srawi. r0, M, 3
  1815. mtspr CTR, r0
  1816. ble LL(115)
  1817. .align 4
  /* Preload 8 pairs of y, advancing Y1 by INCY after each pair. */
  1818. LFD y01, 0 * SIZE(Y1)
  1819. LFD y02, 1 * SIZE(Y1)
  1820. add Y1, Y1, INCY
  1821. LFD y03, 0 * SIZE(Y1)
  1822. LFD y04, 1 * SIZE(Y1)
  1823. add Y1, Y1, INCY
  1824. LFD y05, 0 * SIZE(Y1)
  1825. LFD y06, 1 * SIZE(Y1)
  1826. add Y1, Y1, INCY
  1827. LFD y07, 0 * SIZE(Y1)
  1828. LFD y08, 1 * SIZE(Y1)
  1829. add Y1, Y1, INCY
  1830. LFD y09, 0 * SIZE(Y1)
  1831. LFD y10, 1 * SIZE(Y1)
  1832. add Y1, Y1, INCY
  1833. LFD y11, 0 * SIZE(Y1)
  1834. LFD y12, 1 * SIZE(Y1)
  1835. add Y1, Y1, INCY
  1836. LFD y13, 0 * SIZE(Y1)
  1837. LFD y14, 1 * SIZE(Y1)
  1838. add Y1, Y1, INCY
  1839. LFD y15, 0 * SIZE(Y1)
  1840. LFD y16, 1 * SIZE(Y1)
  1841. add Y1, Y1, INCY
  /* First 8 scalars of column AO1. */
  1842. LFD a1, 0 * SIZE(AO1)
  1843. LFD a2, 1 * SIZE(AO1)
  1844. LFD a3, 2 * SIZE(AO1)
  1845. LFD a4, 3 * SIZE(AO1)
  1846. LFD a5, 4 * SIZE(AO1)
  1847. LFD a6, 5 * SIZE(AO1)
  1848. LFD a7, 6 * SIZE(AO1)
  1849. LFD a8, 7 * SIZE(AO1)
  /* bdz decrements CTR; with a single group the loop is skipped and the
     preloaded data is consumed by LL(113). */
  1850. bdz LL(113)
  1851. .align 4
  /* LL(112): steady-state loop for four columns.  Each pass accumulates the
     contributions of AO1, AO2, AO3 and AO4 (scaled by alpha1..alpha4) into
     the 16 y scalars held in y01..y16.  During the AO4 phase the results are
     stored through Y2 and the registers are refilled from Y1, both stepping
     by INCY per pair.  Loads for the next phase are issued while the current
     one is still being computed, so the instruction order matters.
     FMADD/FMSUBX/FMADDX/LFD/STFD/DCBT are not defined in this chunk; DCBT is
     presumably a prefetch hint and the nops scheduling padding (TODO
     confirm). */
  1852. LL(112):
  /* ---- column AO1, y01..y08 ---- */
  1853. FMADD y01, alpha1r, a1, y01
  1854. FMADD y02, alpha1i, a1, y02
  1855. FMADD y03, alpha1r, a3, y03
  1856. FMADD y04, alpha1i, a3, y04
  1857. FMADD y05, alpha1r, a5, y05
  1858. FMADD y06, alpha1i, a5, y06
  1859. FMADD y07, alpha1r, a7, y07
  1860. FMADD y08, alpha1i, a7, y08
  1861. LFD a1, 8 * SIZE(AO1)
  1862. LFD a3, 10 * SIZE(AO1)
  1863. LFD a5, 12 * SIZE(AO1)
  1864. LFD a7, 14 * SIZE(AO1)
  1865. FMSUBX y01, alpha1i, a2, y01
  1866. FMADDX y02, alpha1r, a2, y02
  1867. FMSUBX y03, alpha1i, a4, y03
  1868. FMADDX y04, alpha1r, a4, y04
  1869. FMSUBX y05, alpha1i, a6, y05
  1870. FMADDX y06, alpha1r, a6, y06
  1871. FMSUBX y07, alpha1i, a8, y07
  1872. FMADDX y08, alpha1r, a8, y08
  1873. LFD a2, 9 * SIZE(AO1)
  1874. LFD a4, 11 * SIZE(AO1)
  1875. LFD a6, 13 * SIZE(AO1)
  1876. LFD a8, 15 * SIZE(AO1)
  /* AO1 moves to its next group once offsets 8..15 are in registers. */
  1877. addi AO1, AO1, 16 * SIZE
  1878. nop
  1879. DCBT(AO1, PREA)
  1880. nop
  /* ---- column AO1, y09..y16 ---- */
  1881. FMADD y09, alpha1r, a1, y09
  1882. FMADD y10, alpha1i, a1, y10
  1883. FMADD y11, alpha1r, a3, y11
  1884. FMADD y12, alpha1i, a3, y12
  1885. FMADD y13, alpha1r, a5, y13
  1886. FMADD y14, alpha1i, a5, y14
  1887. FMADD y15, alpha1r, a7, y15
  1888. FMADD y16, alpha1i, a7, y16
  1889. LFD a1, 0 * SIZE(AO2)
  1890. LFD a3, 2 * SIZE(AO2)
  1891. LFD a5, 4 * SIZE(AO2)
  1892. LFD a7, 6 * SIZE(AO2)
  1893. FMSUBX y09, alpha1i, a2, y09
  1894. FMADDX y10, alpha1r, a2, y10
  1895. FMSUBX y11, alpha1i, a4, y11
  1896. FMADDX y12, alpha1r, a4, y12
  1897. FMSUBX y13, alpha1i, a6, y13
  1898. FMADDX y14, alpha1r, a6, y14
  1899. FMSUBX y15, alpha1i, a8, y15
  1900. FMADDX y16, alpha1r, a8, y16
  1901. LFD a2, 1 * SIZE(AO2)
  1902. LFD a4, 3 * SIZE(AO2)
  1903. LFD a6, 5 * SIZE(AO2)
  1904. LFD a8, 7 * SIZE(AO2)
  /* ---- column AO2, y01..y08 ---- */
  1905. FMADD y01, alpha2r, a1, y01
  1906. FMADD y02, alpha2i, a1, y02
  1907. FMADD y03, alpha2r, a3, y03
  1908. FMADD y04, alpha2i, a3, y04
  1909. FMADD y05, alpha2r, a5, y05
  1910. FMADD y06, alpha2i, a5, y06
  1911. FMADD y07, alpha2r, a7, y07
  1912. FMADD y08, alpha2i, a7, y08
  1913. LFD a1, 8 * SIZE(AO2)
  1914. LFD a3, 10 * SIZE(AO2)
  1915. LFD a5, 12 * SIZE(AO2)
  1916. LFD a7, 14 * SIZE(AO2)
  1917. FMSUBX y01, alpha2i, a2, y01
  1918. FMADDX y02, alpha2r, a2, y02
  1919. FMSUBX y03, alpha2i, a4, y03
  1920. FMADDX y04, alpha2r, a4, y04
  1921. FMSUBX y05, alpha2i, a6, y05
  1922. FMADDX y06, alpha2r, a6, y06
  1923. FMSUBX y07, alpha2i, a8, y07
  1924. FMADDX y08, alpha2r, a8, y08
  1925. LFD a2, 9 * SIZE(AO2)
  1926. LFD a4, 11 * SIZE(AO2)
  1927. LFD a6, 13 * SIZE(AO2)
  1928. LFD a8, 15 * SIZE(AO2)
  1929. addi AO2, AO2, 16 * SIZE
  1930. nop
  1931. DCBT(AO2, PREA)
  1932. nop
  /* ---- column AO2, y09..y16 ---- */
  1933. FMADD y09, alpha2r, a1, y09
  1934. FMADD y10, alpha2i, a1, y10
  1935. FMADD y11, alpha2r, a3, y11
  1936. FMADD y12, alpha2i, a3, y12
  1937. FMADD y13, alpha2r, a5, y13
  1938. FMADD y14, alpha2i, a5, y14
  1939. FMADD y15, alpha2r, a7, y15
  1940. FMADD y16, alpha2i, a7, y16
  1941. LFD a1, 0 * SIZE(AO3)
  1942. LFD a3, 2 * SIZE(AO3)
  1943. LFD a5, 4 * SIZE(AO3)
  1944. LFD a7, 6 * SIZE(AO3)
  1945. FMSUBX y09, alpha2i, a2, y09
  1946. FMADDX y10, alpha2r, a2, y10
  1947. FMSUBX y11, alpha2i, a4, y11
  1948. FMADDX y12, alpha2r, a4, y12
  1949. FMSUBX y13, alpha2i, a6, y13
  1950. FMADDX y14, alpha2r, a6, y14
  1951. FMSUBX y15, alpha2i, a8, y15
  1952. FMADDX y16, alpha2r, a8, y16
  1953. LFD a2, 1 * SIZE(AO3)
  1954. LFD a4, 3 * SIZE(AO3)
  1955. LFD a6, 5 * SIZE(AO3)
  1956. LFD a8, 7 * SIZE(AO3)
  /* ---- column AO3, y01..y08 ---- */
  1957. FMADD y01, alpha3r, a1, y01
  1958. FMADD y02, alpha3i, a1, y02
  1959. FMADD y03, alpha3r, a3, y03
  1960. FMADD y04, alpha3i, a3, y04
  1961. FMADD y05, alpha3r, a5, y05
  1962. FMADD y06, alpha3i, a5, y06
  1963. FMADD y07, alpha3r, a7, y07
  1964. FMADD y08, alpha3i, a7, y08
  1965. LFD a1, 8 * SIZE(AO3)
  1966. LFD a3, 10 * SIZE(AO3)
  1967. LFD a5, 12 * SIZE(AO3)
  1968. LFD a7, 14 * SIZE(AO3)
  1969. FMSUBX y01, alpha3i, a2, y01
  1970. FMADDX y02, alpha3r, a2, y02
  1971. FMSUBX y03, alpha3i, a4, y03
  1972. FMADDX y04, alpha3r, a4, y04
  1973. FMSUBX y05, alpha3i, a6, y05
  1974. FMADDX y06, alpha3r, a6, y06
  1975. FMSUBX y07, alpha3i, a8, y07
  1976. FMADDX y08, alpha3r, a8, y08
  1977. LFD a2, 9 * SIZE(AO3)
  1978. LFD a4, 11 * SIZE(AO3)
  1979. LFD a6, 13 * SIZE(AO3)
  1980. LFD a8, 15 * SIZE(AO3)
  1981. addi AO3, AO3, 16 * SIZE
  1982. nop
  1983. DCBT(AO3, PREA)
  1984. nop
  /* ---- column AO3, y09..y16 ---- */
  1985. FMADD y09, alpha3r, a1, y09
  1986. FMADD y10, alpha3i, a1, y10
  1987. FMADD y11, alpha3r, a3, y11
  1988. FMADD y12, alpha3i, a3, y12
  1989. FMADD y13, alpha3r, a5, y13
  1990. FMADD y14, alpha3i, a5, y14
  1991. FMADD y15, alpha3r, a7, y15
  1992. FMADD y16, alpha3i, a7, y16
  1993. LFD a1, 0 * SIZE(AO4)
  1994. LFD a3, 2 * SIZE(AO4)
  1995. LFD a5, 4 * SIZE(AO4)
  1996. LFD a7, 6 * SIZE(AO4)
  1997. FMSUBX y09, alpha3i, a2, y09
  1998. FMADDX y10, alpha3r, a2, y10
  1999. FMSUBX y11, alpha3i, a4, y11
  2000. FMADDX y12, alpha3r, a4, y12
  2001. FMSUBX y13, alpha3i, a6, y13
  2002. FMADDX y14, alpha3r, a6, y14
  2003. FMSUBX y15, alpha3i, a8, y15
  2004. FMADDX y16, alpha3r, a8, y16
  2005. LFD a2, 1 * SIZE(AO4)
  2006. LFD a4, 3 * SIZE(AO4)
  2007. LFD a6, 5 * SIZE(AO4)
  2008. LFD a8, 7 * SIZE(AO4)
  /* ---- column AO4, y01..y08; finished pairs are stored and refilled ---- */
  2009. FMADD y01, alpha4r, a1, y01
  2010. FMADD y02, alpha4i, a1, y02
  2011. FMADD y03, alpha4r, a3, y03
  2012. FMADD y04, alpha4i, a3, y04
  2013. FMADD y05, alpha4r, a5, y05
  2014. FMADD y06, alpha4i, a5, y06
  2015. FMADD y07, alpha4r, a7, y07
  2016. FMADD y08, alpha4i, a7, y08
  2017. LFD a1, 8 * SIZE(AO4)
  2018. LFD a3, 10 * SIZE(AO4)
  2019. LFD a5, 12 * SIZE(AO4)
  2020. LFD a7, 14 * SIZE(AO4)
  2021. FMSUBX y01, alpha4i, a2, y01
  2022. FMADDX y02, alpha4r, a2, y02
  2023. FMSUBX y03, alpha4i, a4, y03
  2024. FMADDX y04, alpha4r, a4, y04
  /* Store a finished pair via Y2, then reload the same registers with the
     corresponding pair of the next group via Y1. */
  2025. STFD y01, 0 * SIZE(Y2)
  2026. nop
  2027. STFD y02, 1 * SIZE(Y2)
  2028. add Y2, Y2, INCY
  2029. LFD y01, 0 * SIZE(Y1)
  2030. nop
  2031. LFD y02, 1 * SIZE(Y1)
  2032. add Y1, Y1, INCY
  2033. STFD y03, 0 * SIZE(Y2)
  2034. nop
  2035. STFD y04, 1 * SIZE(Y2)
  2036. add Y2, Y2, INCY
  2037. LFD y03, 0 * SIZE(Y1)
  2038. nop
  2039. LFD y04, 1 * SIZE(Y1)
  2040. add Y1, Y1, INCY
  2041. FMSUBX y05, alpha4i, a6, y05
  2042. FMADDX y06, alpha4r, a6, y06
  2043. FMSUBX y07, alpha4i, a8, y07
  2044. FMADDX y08, alpha4r, a8, y08
  2045. LFD a2, 9 * SIZE(AO4)
  2046. LFD a4, 11 * SIZE(AO4)
  2047. LFD a6, 13 * SIZE(AO4)
  2048. LFD a8, 15 * SIZE(AO4)
  2049. addi AO4, AO4, 16 * SIZE
  2050. nop
  2051. DCBT(AO4, PREA)
  2052. nop
  2053. STFD y05, 0 * SIZE(Y2)
  2054. nop
  2055. STFD y06, 1 * SIZE(Y2)
  2056. add Y2, Y2, INCY
  2057. LFD y05, 0 * SIZE(Y1)
  2058. nop
  2059. LFD y06, 1 * SIZE(Y1)
  2060. add Y1, Y1, INCY
  2061. STFD y07, 0 * SIZE(Y2)
  2062. nop
  2063. STFD y08, 1 * SIZE(Y2)
  2064. add Y2, Y2, INCY
  2065. LFD y07, 0 * SIZE(Y1)
  2066. nop
  2067. LFD y08, 1 * SIZE(Y1)
  2068. add Y1, Y1, INCY
  /* ---- column AO4, y09..y16 ---- */
  2069. FMADD y09, alpha4r, a1, y09
  2070. FMADD y10, alpha4i, a1, y10
  2071. FMADD y11, alpha4r, a3, y11
  2072. FMADD y12, alpha4i, a3, y12
  2073. FMADD y13, alpha4r, a5, y13
  2074. FMADD y14, alpha4i, a5, y14
  2075. FMADD y15, alpha4r, a7, y15
  2076. FMADD y16, alpha4i, a7, y16
  /* AO1 was already advanced above: preload its next group (even offsets). */
  2077. LFD a1, 0 * SIZE(AO1)
  2078. LFD a3, 2 * SIZE(AO1)
  2079. LFD a5, 4 * SIZE(AO1)
  2080. LFD a7, 6 * SIZE(AO1)
  2081. FMSUBX y09, alpha4i, a2, y09
  2082. FMADDX y10, alpha4r, a2, y10
  2083. FMSUBX y11, alpha4i, a4, y11
  2084. FMADDX y12, alpha4r, a4, y12
  2085. STFD y09, 0 * SIZE(Y2)
  2086. nop
  2087. STFD y10, 1 * SIZE(Y2)
  2088. add Y2, Y2, INCY
  2089. LFD y09, 0 * SIZE(Y1)
  2090. nop
  2091. LFD y10, 1 * SIZE(Y1)
  2092. add Y1, Y1, INCY
  2093. STFD y11, 0 * SIZE(Y2)
  2094. nop
  2095. STFD y12, 1 * SIZE(Y2)
  2096. add Y2, Y2, INCY
  2097. LFD y11, 0 * SIZE(Y1)
  2098. nop
  2099. LFD y12, 1 * SIZE(Y1)
  2100. add Y1, Y1, INCY
  2101. FMSUBX y13, alpha4i, a6, y13
  2102. FMADDX y14, alpha4r, a6, y14
  2103. FMSUBX y15, alpha4i, a8, y15
  2104. FMADDX y16, alpha4r, a8, y16
  /* Odd offsets of the next AO1 group. */
  2105. LFD a2, 1 * SIZE(AO1)
  2106. LFD a4, 3 * SIZE(AO1)
  2107. LFD a6, 5 * SIZE(AO1)
  2108. LFD a8, 7 * SIZE(AO1)
  2109. STFD y13, 0 * SIZE(Y2)
  2110. nop
  2111. STFD y14, 1 * SIZE(Y2)
  2112. add Y2, Y2, INCY
  2113. LFD y13, 0 * SIZE(Y1)
  2114. nop
  2115. LFD y14, 1 * SIZE(Y1)
  2116. add Y1, Y1, INCY
  2117. STFD y15, 0 * SIZE(Y2)
  2118. nop
  2119. STFD y16, 1 * SIZE(Y2)
  2120. add Y2, Y2, INCY
  2121. LFD y15, 0 * SIZE(Y1)
  2122. nop
  2123. LFD y16, 1 * SIZE(Y1)
  2124. add Y1, Y1, INCY
  /* Loop while CTR != 0 after the decrement. */
  2125. DCBT(Y1, PREC)
  2126. bdnz LL(112)
  2127. .align 4
  /* LL(113): drain for the four-column loop.  Applies the AO1..AO4
     contributions to the group already held in y01..y16 exactly as one pass
     of LL(112) does, stores the results through Y2 (stepping INCY per pair),
     but reloads no y values and preloads no following group.  AO1..AO4 are
     not advanced mid-way here; all four move forward by 16 * SIZE at the end.
     FMADD/FMSUBX/FMADDX/LFD/STFD are not defined in this chunk. */
  2128. LL(113):
  /* ---- column AO1, y01..y08 ---- */
  2129. FMADD y01, alpha1r, a1, y01
  2130. FMADD y02, alpha1i, a1, y02
  2131. FMADD y03, alpha1r, a3, y03
  2132. FMADD y04, alpha1i, a3, y04
  2133. FMADD y05, alpha1r, a5, y05
  2134. FMADD y06, alpha1i, a5, y06
  2135. FMADD y07, alpha1r, a7, y07
  2136. FMADD y08, alpha1i, a7, y08
  2137. LFD a1, 8 * SIZE(AO1)
  2138. LFD a3, 10 * SIZE(AO1)
  2139. LFD a5, 12 * SIZE(AO1)
  2140. LFD a7, 14 * SIZE(AO1)
  2141. FMSUBX y01, alpha1i, a2, y01
  2142. FMADDX y02, alpha1r, a2, y02
  2143. FMSUBX y03, alpha1i, a4, y03
  2144. FMADDX y04, alpha1r, a4, y04
  2145. FMSUBX y05, alpha1i, a6, y05
  2146. FMADDX y06, alpha1r, a6, y06
  2147. FMSUBX y07, alpha1i, a8, y07
  2148. FMADDX y08, alpha1r, a8, y08
  2149. LFD a2, 9 * SIZE(AO1)
  2150. LFD a4, 11 * SIZE(AO1)
  2151. LFD a6, 13 * SIZE(AO1)
  2152. LFD a8, 15 * SIZE(AO1)
  /* ---- column AO1, y09..y16 ---- */
  2153. FMADD y09, alpha1r, a1, y09
  2154. FMADD y10, alpha1i, a1, y10
  2155. FMADD y11, alpha1r, a3, y11
  2156. FMADD y12, alpha1i, a3, y12
  2157. FMADD y13, alpha1r, a5, y13
  2158. FMADD y14, alpha1i, a5, y14
  2159. FMADD y15, alpha1r, a7, y15
  2160. FMADD y16, alpha1i, a7, y16
  2161. LFD a1, 0 * SIZE(AO2)
  2162. LFD a3, 2 * SIZE(AO2)
  2163. LFD a5, 4 * SIZE(AO2)
  2164. LFD a7, 6 * SIZE(AO2)
  2165. FMSUBX y09, alpha1i, a2, y09
  2166. FMADDX y10, alpha1r, a2, y10
  2167. FMSUBX y11, alpha1i, a4, y11
  2168. FMADDX y12, alpha1r, a4, y12
  2169. FMSUBX y13, alpha1i, a6, y13
  2170. FMADDX y14, alpha1r, a6, y14
  2171. FMSUBX y15, alpha1i, a8, y15
  2172. FMADDX y16, alpha1r, a8, y16
  2173. LFD a2, 1 * SIZE(AO2)
  2174. LFD a4, 3 * SIZE(AO2)
  2175. LFD a6, 5 * SIZE(AO2)
  2176. LFD a8, 7 * SIZE(AO2)
  /* ---- column AO2, y01..y08 ---- */
  2177. FMADD y01, alpha2r, a1, y01
  2178. FMADD y02, alpha2i, a1, y02
  2179. FMADD y03, alpha2r, a3, y03
  2180. FMADD y04, alpha2i, a3, y04
  2181. FMADD y05, alpha2r, a5, y05
  2182. FMADD y06, alpha2i, a5, y06
  2183. FMADD y07, alpha2r, a7, y07
  2184. FMADD y08, alpha2i, a7, y08
  2185. LFD a1, 8 * SIZE(AO2)
  2186. LFD a3, 10 * SIZE(AO2)
  2187. LFD a5, 12 * SIZE(AO2)
  2188. LFD a7, 14 * SIZE(AO2)
  2189. FMSUBX y01, alpha2i, a2, y01
  2190. FMADDX y02, alpha2r, a2, y02
  2191. FMSUBX y03, alpha2i, a4, y03
  2192. FMADDX y04, alpha2r, a4, y04
  2193. FMSUBX y05, alpha2i, a6, y05
  2194. FMADDX y06, alpha2r, a6, y06
  2195. FMSUBX y07, alpha2i, a8, y07
  2196. FMADDX y08, alpha2r, a8, y08
  2197. LFD a2, 9 * SIZE(AO2)
  2198. LFD a4, 11 * SIZE(AO2)
  2199. LFD a6, 13 * SIZE(AO2)
  2200. LFD a8, 15 * SIZE(AO2)
  /* ---- column AO2, y09..y16 ---- */
  2201. FMADD y09, alpha2r, a1, y09
  2202. FMADD y10, alpha2i, a1, y10
  2203. FMADD y11, alpha2r, a3, y11
  2204. FMADD y12, alpha2i, a3, y12
  2205. FMADD y13, alpha2r, a5, y13
  2206. FMADD y14, alpha2i, a5, y14
  2207. FMADD y15, alpha2r, a7, y15
  2208. FMADD y16, alpha2i, a7, y16
  2209. LFD a1, 0 * SIZE(AO3)
  2210. LFD a3, 2 * SIZE(AO3)
  2211. LFD a5, 4 * SIZE(AO3)
  2212. LFD a7, 6 * SIZE(AO3)
  2213. FMSUBX y09, alpha2i, a2, y09
  2214. FMADDX y10, alpha2r, a2, y10
  2215. FMSUBX y11, alpha2i, a4, y11
  2216. FMADDX y12, alpha2r, a4, y12
  2217. FMSUBX y13, alpha2i, a6, y13
  2218. FMADDX y14, alpha2r, a6, y14
  2219. FMSUBX y15, alpha2i, a8, y15
  2220. FMADDX y16, alpha2r, a8, y16
  2221. LFD a2, 1 * SIZE(AO3)
  2222. LFD a4, 3 * SIZE(AO3)
  2223. LFD a6, 5 * SIZE(AO3)
  2224. LFD a8, 7 * SIZE(AO3)
  /* ---- column AO3, y01..y08 ---- */
  2225. FMADD y01, alpha3r, a1, y01
  2226. FMADD y02, alpha3i, a1, y02
  2227. FMADD y03, alpha3r, a3, y03
  2228. FMADD y04, alpha3i, a3, y04
  2229. FMADD y05, alpha3r, a5, y05
  2230. FMADD y06, alpha3i, a5, y06
  2231. FMADD y07, alpha3r, a7, y07
  2232. FMADD y08, alpha3i, a7, y08
  2233. LFD a1, 8 * SIZE(AO3)
  2234. LFD a3, 10 * SIZE(AO3)
  2235. LFD a5, 12 * SIZE(AO3)
  2236. LFD a7, 14 * SIZE(AO3)
  2237. FMSUBX y01, alpha3i, a2, y01
  2238. FMADDX y02, alpha3r, a2, y02
  2239. FMSUBX y03, alpha3i, a4, y03
  2240. FMADDX y04, alpha3r, a4, y04
  2241. FMSUBX y05, alpha3i, a6, y05
  2242. FMADDX y06, alpha3r, a6, y06
  2243. FMSUBX y07, alpha3i, a8, y07
  2244. FMADDX y08, alpha3r, a8, y08
  2245. LFD a2, 9 * SIZE(AO3)
  2246. LFD a4, 11 * SIZE(AO3)
  2247. LFD a6, 13 * SIZE(AO3)
  2248. LFD a8, 15 * SIZE(AO3)
  /* ---- column AO3, y09..y16 ---- */
  2249. FMADD y09, alpha3r, a1, y09
  2250. FMADD y10, alpha3i, a1, y10
  2251. FMADD y11, alpha3r, a3, y11
  2252. FMADD y12, alpha3i, a3, y12
  2253. FMADD y13, alpha3r, a5, y13
  2254. FMADD y14, alpha3i, a5, y14
  2255. FMADD y15, alpha3r, a7, y15
  2256. FMADD y16, alpha3i, a7, y16
  2257. LFD a1, 0 * SIZE(AO4)
  2258. LFD a3, 2 * SIZE(AO4)
  2259. LFD a5, 4 * SIZE(AO4)
  2260. LFD a7, 6 * SIZE(AO4)
  2261. FMSUBX y09, alpha3i, a2, y09
  2262. FMADDX y10, alpha3r, a2, y10
  2263. FMSUBX y11, alpha3i, a4, y11
  2264. FMADDX y12, alpha3r, a4, y12
  2265. FMSUBX y13, alpha3i, a6, y13
  2266. FMADDX y14, alpha3r, a6, y14
  2267. FMSUBX y15, alpha3i, a8, y15
  2268. FMADDX y16, alpha3r, a8, y16
  2269. LFD a2, 1 * SIZE(AO4)
  2270. LFD a4, 3 * SIZE(AO4)
  2271. LFD a6, 5 * SIZE(AO4)
  2272. LFD a8, 7 * SIZE(AO4)
  /* ---- column AO4, y01..y08; finished pairs are stored via Y2 ---- */
  2273. FMADD y01, alpha4r, a1, y01
  2274. FMADD y02, alpha4i, a1, y02
  2275. FMADD y03, alpha4r, a3, y03
  2276. FMADD y04, alpha4i, a3, y04
  2277. FMADD y05, alpha4r, a5, y05
  2278. FMADD y06, alpha4i, a5, y06
  2279. FMADD y07, alpha4r, a7, y07
  2280. FMADD y08, alpha4i, a7, y08
  2281. LFD a1, 8 * SIZE(AO4)
  2282. LFD a3, 10 * SIZE(AO4)
  2283. LFD a5, 12 * SIZE(AO4)
  2284. LFD a7, 14 * SIZE(AO4)
  2285. FMSUBX y01, alpha4i, a2, y01
  2286. FMADDX y02, alpha4r, a2, y02
  2287. FMSUBX y03, alpha4i, a4, y03
  2288. FMADDX y04, alpha4r, a4, y04
  2289. STFD y01, 0 * SIZE(Y2)
  2290. nop
  2291. STFD y02, 1 * SIZE(Y2)
  2292. add Y2, Y2, INCY
  2293. STFD y03, 0 * SIZE(Y2)
  2294. nop
  2295. STFD y04, 1 * SIZE(Y2)
  2296. add Y2, Y2, INCY
  2297. FMSUBX y05, alpha4i, a6, y05
  2298. FMADDX y06, alpha4r, a6, y06
  2299. FMSUBX y07, alpha4i, a8, y07
  2300. FMADDX y08, alpha4r, a8, y08
  2301. LFD a2, 9 * SIZE(AO4)
  2302. LFD a4, 11 * SIZE(AO4)
  2303. LFD a6, 13 * SIZE(AO4)
  2304. LFD a8, 15 * SIZE(AO4)
  2305. STFD y05, 0 * SIZE(Y2)
  2306. nop
  2307. STFD y06, 1 * SIZE(Y2)
  2308. add Y2, Y2, INCY
  2309. STFD y07, 0 * SIZE(Y2)
  2310. nop
  2311. STFD y08, 1 * SIZE(Y2)
  2312. add Y2, Y2, INCY
  /* ---- column AO4, y09..y16 ---- */
  2313. FMADD y09, alpha4r, a1, y09
  2314. FMADD y10, alpha4i, a1, y10
  2315. FMADD y11, alpha4r, a3, y11
  2316. FMADD y12, alpha4i, a3, y12
  2317. FMADD y13, alpha4r, a5, y13
  2318. FMADD y14, alpha4i, a5, y14
  2319. FMADD y15, alpha4r, a7, y15
  2320. FMADD y16, alpha4i, a7, y16
  2321. FMSUBX y09, alpha4i, a2, y09
  2322. FMADDX y10, alpha4r, a2, y10
  2323. FMSUBX y11, alpha4i, a4, y11
  2324. FMADDX y12, alpha4r, a4, y12
  2325. STFD y09, 0 * SIZE(Y2)
  2326. nop
  2327. STFD y10, 1 * SIZE(Y2)
  2328. add Y2, Y2, INCY
  2329. STFD y11, 0 * SIZE(Y2)
  2330. nop
  2331. STFD y12, 1 * SIZE(Y2)
  2332. add Y2, Y2, INCY
  2333. FMSUBX y13, alpha4i, a6, y13
  2334. FMADDX y14, alpha4r, a6, y14
  2335. FMSUBX y15, alpha4i, a8, y15
  2336. FMADDX y16, alpha4r, a8, y16
  2337. STFD y13, 0 * SIZE(Y2)
  2338. nop
  2339. STFD y14, 1 * SIZE(Y2)
  2340. add Y2, Y2, INCY
  2341. STFD y15, 0 * SIZE(Y2)
  2342. nop
  2343. STFD y16, 1 * SIZE(Y2)
  2344. add Y2, Y2, INCY
  /* Step all four column cursors past the group just finished. */
  2345. addi AO1, AO1, 16 * SIZE
  2346. addi AO2, AO2, 16 * SIZE
  2347. addi AO3, AO3, 16 * SIZE
  2348. addi AO4, AO4, 16 * SIZE
  2349. .align 4
  /* LL(115): remainder handling for the four-column pass.  If M & 7 is zero
     the pass is finished (LL(119)).  If bit 2 of M is set, four pairs of y
     (read via Y1, INCY apart) receive the contributions of AO1..AO4 and are
     written back via Y2; otherwise this part is skipped via LL(116).
     FMADD/FMSUBX/FMADDX/LFD/STFD are not defined in this chunk. */
  2350. LL(115):
  2351. andi. r0, M, 7
  2352. ble LL(119)
  2353. andi. r0, M, 4
  2354. ble LL(116)
  2355. LFD y01, 0 * SIZE(Y1)
  2356. LFD y02, 1 * SIZE(Y1)
  2357. add Y1, Y1, INCY
  2358. LFD y03, 0 * SIZE(Y1)
  2359. LFD y04, 1 * SIZE(Y1)
  2360. add Y1, Y1, INCY
  2361. LFD a1, 0 * SIZE(AO1)
  2362. LFD a3, 2 * SIZE(AO1)
  2363. LFD a5, 4 * SIZE(AO1)
  2364. LFD a7, 6 * SIZE(AO1)
  2365. LFD y05, 0 * SIZE(Y1)
  2366. LFD y06, 1 * SIZE(Y1)
  2367. add Y1, Y1, INCY
  2368. LFD y07, 0 * SIZE(Y1)
  2369. LFD y08, 1 * SIZE(Y1)
  2370. add Y1, Y1, INCY
  2371. LFD a2, 1 * SIZE(AO1)
  2372. LFD a4, 3 * SIZE(AO1)
  2373. LFD a6, 5 * SIZE(AO1)
  2374. LFD a8, 7 * SIZE(AO1)
  /* ---- column AO1 (next column's values are loaded as registers free up) ---- */
  2375. FMADD y01, alpha1r, a1, y01
  2376. FMADD y02, alpha1i, a1, y02
  2377. FMADD y03, alpha1r, a3, y03
  2378. FMADD y04, alpha1i, a3, y04
  2379. FMADD y05, alpha1r, a5, y05
  2380. FMADD y06, alpha1i, a5, y06
  2381. FMADD y07, alpha1r, a7, y07
  2382. FMADD y08, alpha1i, a7, y08
  2383. LFD a1, 0 * SIZE(AO2)
  2384. LFD a3, 2 * SIZE(AO2)
  2385. LFD a5, 4 * SIZE(AO2)
  2386. LFD a7, 6 * SIZE(AO2)
  2387. FMSUBX y01, alpha1i, a2, y01
  2388. FMADDX y02, alpha1r, a2, y02
  2389. FMSUBX y03, alpha1i, a4, y03
  2390. FMADDX y04, alpha1r, a4, y04
  2391. FMSUBX y05, alpha1i, a6, y05
  2392. FMADDX y06, alpha1r, a6, y06
  2393. FMSUBX y07, alpha1i, a8, y07
  2394. FMADDX y08, alpha1r, a8, y08
  2395. LFD a2, 1 * SIZE(AO2)
  2396. LFD a4, 3 * SIZE(AO2)
  2397. LFD a6, 5 * SIZE(AO2)
  2398. LFD a8, 7 * SIZE(AO2)
  /* ---- column AO2 ---- */
  2399. FMADD y01, alpha2r, a1, y01
  2400. FMADD y02, alpha2i, a1, y02
  2401. FMADD y03, alpha2r, a3, y03
  2402. FMADD y04, alpha2i, a3, y04
  2403. FMADD y05, alpha2r, a5, y05
  2404. FMADD y06, alpha2i, a5, y06
  2405. FMADD y07, alpha2r, a7, y07
  2406. FMADD y08, alpha2i, a7, y08
  2407. LFD a1, 0 * SIZE(AO3)
  2408. LFD a3, 2 * SIZE(AO3)
  2409. LFD a5, 4 * SIZE(AO3)
  2410. LFD a7, 6 * SIZE(AO3)
  2411. FMSUBX y01, alpha2i, a2, y01
  2412. FMADDX y02, alpha2r, a2, y02
  2413. FMSUBX y03, alpha2i, a4, y03
  2414. FMADDX y04, alpha2r, a4, y04
  2415. FMSUBX y05, alpha2i, a6, y05
  2416. FMADDX y06, alpha2r, a6, y06
  2417. FMSUBX y07, alpha2i, a8, y07
  2418. FMADDX y08, alpha2r, a8, y08
  2419. LFD a2, 1 * SIZE(AO3)
  2420. LFD a4, 3 * SIZE(AO3)
  2421. LFD a6, 5 * SIZE(AO3)
  2422. LFD a8, 7 * SIZE(AO3)
  /* ---- column AO3 ---- */
  2423. FMADD y01, alpha3r, a1, y01
  2424. FMADD y02, alpha3i, a1, y02
  2425. FMADD y03, alpha3r, a3, y03
  2426. FMADD y04, alpha3i, a3, y04
  2427. FMADD y05, alpha3r, a5, y05
  2428. FMADD y06, alpha3i, a5, y06
  2429. FMADD y07, alpha3r, a7, y07
  2430. FMADD y08, alpha3i, a7, y08
  2431. LFD a1, 0 * SIZE(AO4)
  2432. LFD a3, 2 * SIZE(AO4)
  2433. LFD a5, 4 * SIZE(AO4)
  2434. LFD a7, 6 * SIZE(AO4)
  2435. FMSUBX y01, alpha3i, a2, y01
  2436. FMADDX y02, alpha3r, a2, y02
  2437. FMSUBX y03, alpha3i, a4, y03
  2438. FMADDX y04, alpha3r, a4, y04
  2439. FMSUBX y05, alpha3i, a6, y05
  2440. FMADDX y06, alpha3r, a6, y06
  2441. FMSUBX y07, alpha3i, a8, y07
  2442. FMADDX y08, alpha3r, a8, y08
  2443. LFD a2, 1 * SIZE(AO4)
  2444. LFD a4, 3 * SIZE(AO4)
  2445. LFD a6, 5 * SIZE(AO4)
  2446. LFD a8, 7 * SIZE(AO4)
  /* ---- column AO4 ---- */
  2447. FMADD y01, alpha4r, a1, y01
  2448. FMADD y02, alpha4i, a1, y02
  2449. FMADD y03, alpha4r, a3, y03
  2450. FMADD y04, alpha4i, a3, y04
  2451. FMADD y05, alpha4r, a5, y05
  2452. FMADD y06, alpha4i, a5, y06
  2453. FMADD y07, alpha4r, a7, y07
  2454. FMADD y08, alpha4i, a7, y08
  2455. FMSUBX y01, alpha4i, a2, y01
  2456. FMADDX y02, alpha4r, a2, y02
  2457. FMSUBX y03, alpha4i, a4, y03
  2458. FMADDX y04, alpha4r, a4, y04
  2459. FMSUBX y05, alpha4i, a6, y05
  2460. FMADDX y06, alpha4r, a6, y06
  2461. FMSUBX y07, alpha4i, a8, y07
  2462. FMADDX y08, alpha4r, a8, y08
  /* Write back the four pairs (Y2 steps by INCY) and, interleaved with the
     stores, advance each column cursor by 8 * SIZE. */
  2463. STFD y01, 0 * SIZE(Y2)
  2464. addi AO1, AO1, 8 * SIZE
  2465. STFD y02, 1 * SIZE(Y2)
  2466. add Y2, Y2, INCY
  2467. STFD y03, 0 * SIZE(Y2)
  2468. addi AO2, AO2, 8 * SIZE
  2469. STFD y04, 1 * SIZE(Y2)
  2470. add Y2, Y2, INCY
  2471. STFD y05, 0 * SIZE(Y2)
  2472. addi AO3, AO3, 8 * SIZE
  2473. STFD y06, 1 * SIZE(Y2)
  2474. add Y2, Y2, INCY
  2475. STFD y07, 0 * SIZE(Y2)
  2476. addi AO4, AO4, 8 * SIZE
  2477. STFD y08, 1 * SIZE(Y2)
  2478. add Y2, Y2, INCY
  2479. .align 4
  2480. LL(116):
  2481. andi. r0, M, 2
  2482. ble LL(117)
  2483. LFD a1, 0 * SIZE(AO1)
  2484. LFD a2, 1 * SIZE(AO1)
  2485. LFD a3, 2 * SIZE(AO1)
  2486. LFD a4, 3 * SIZE(AO1)
  2487. LFD y01, 0 * SIZE(Y1)
  2488. LFD y02, 1 * SIZE(Y1)
  2489. add Y1, Y1, INCY
  2490. LFD y03, 0 * SIZE(Y1)
  2491. LFD y04, 1 * SIZE(Y1)
  2492. add Y1, Y1, INCY
  2493. LFD a5, 0 * SIZE(AO2)
  2494. LFD a6, 1 * SIZE(AO2)
  2495. LFD a7, 2 * SIZE(AO2)
  2496. LFD a8, 3 * SIZE(AO2)
  2497. FMADD y01, alpha1r, a1, y01
  2498. FMADD y02, alpha1i, a1, y02
  2499. FMADD y03, alpha1r, a3, y03
  2500. FMADD y04, alpha1i, a3, y04
  2501. FMSUBX y01, alpha1i, a2, y01
  2502. FMADDX y02, alpha1r, a2, y02
  2503. FMSUBX y03, alpha1i, a4, y03
  2504. FMADDX y04, alpha1r, a4, y04
  2505. LFD a1, 0 * SIZE(AO3)
  2506. LFD a2, 1 * SIZE(AO3)
  2507. LFD a3, 2 * SIZE(AO3)
  2508. LFD a4, 3 * SIZE(AO3)
  2509. FMADD y01, alpha2r, a5, y01
  2510. FMADD y02, alpha2i, a5, y02
  2511. FMADD y03, alpha2r, a7, y03
  2512. FMADD y04, alpha2i, a7, y04
  2513. FMSUBX y01, alpha2i, a6, y01
  2514. FMADDX y02, alpha2r, a6, y02
  2515. FMSUBX y03, alpha2i, a8, y03
  2516. FMADDX y04, alpha2r, a8, y04
  2517. LFD a5, 0 * SIZE(AO4)
  2518. LFD a6, 1 * SIZE(AO4)
  2519. LFD a7, 2 * SIZE(AO4)
  2520. LFD a8, 3 * SIZE(AO4)
  2521. FMADD y01, alpha3r, a1, y01
  2522. FMADD y02, alpha3i, a1, y02
  2523. FMADD y03, alpha3r, a3, y03
  2524. FMADD y04, alpha3i, a3, y04
  2525. FMSUBX y01, alpha3i, a2, y01
  2526. FMADDX y02, alpha3r, a2, y02
  2527. FMSUBX y03, alpha3i, a4, y03
  2528. FMADDX y04, alpha3r, a4, y04
  2529. FMADD y01, alpha4r, a5, y01
  2530. FMADD y02, alpha4i, a5, y02
  2531. FMADD y03, alpha4r, a7, y03
  2532. FMADD y04, alpha4i, a7, y04
  2533. FMSUBX y01, alpha4i, a6, y01
  2534. FMADDX y02, alpha4r, a6, y02
  2535. FMSUBX y03, alpha4i, a8, y03
  2536. FMADDX y04, alpha4r, a8, y04
  2537. STFD y01, 0 * SIZE(Y2)
  2538. addi AO1, AO1, 4 * SIZE
  2539. STFD y02, 1 * SIZE(Y2)
  2540. add Y2, Y2, INCY
  2541. STFD y03, 0 * SIZE(Y2)
  2542. addi AO2, AO2, 4 * SIZE
  2543. STFD y04, 1 * SIZE(Y2)
  2544. add Y2, Y2, INCY
  2545. addi AO3, AO3, 4 * SIZE
  2546. addi AO4, AO4, 4 * SIZE
  /*
   * LL(117): four-column pass, final remainder step taken when bit 0 of M
   * is set.  Updates a single y entry (pair y01/y02) with one entry from
   * each of the four columns AO1..AO4.  The column pointers are not
   * advanced here; nothing in this block reads them afterwards.
   */
  2547. .align 4
  2548. LL(117):
  /* andi. result is non-negative, so ble means "bit 0 of M is clear". */
  2549. andi. r0, M, 1
  2550. ble LL(119)
  2551. LFD y01, 0 * SIZE(Y1)
  2552. LFD y02, 1 * SIZE(Y1)
  2553. add Y1, Y1, INCY
  /* One entry (two slots) from each column. */
  2554. LFD a1, 0 * SIZE(AO1)
  2555. LFD a2, 1 * SIZE(AO1)
  2556. LFD a3, 0 * SIZE(AO2)
  2557. LFD a4, 1 * SIZE(AO2)
  2558. LFD a5, 0 * SIZE(AO3)
  2559. LFD a6, 1 * SIZE(AO3)
  2560. LFD a7, 0 * SIZE(AO4)
  2561. LFD a8, 1 * SIZE(AO4)
  /* Column 1. */
  2562. FMADD y01, alpha1r, a1, y01
  2563. FMADD y02, alpha1i, a1, y02
  2564. FMSUBX y01, alpha1i, a2, y01
  2565. FMADDX y02, alpha1r, a2, y02
  /* Column 2. */
  2566. FMADD y01, alpha2r, a3, y01
  2567. FMADD y02, alpha2i, a3, y02
  2568. FMSUBX y01, alpha2i, a4, y01
  2569. FMADDX y02, alpha2r, a4, y02
  /* Column 3. */
  2570. FMADD y01, alpha3r, a5, y01
  2571. FMADD y02, alpha3i, a5, y02
  2572. FMSUBX y01, alpha3i, a6, y01
  2573. FMADDX y02, alpha3r, a6, y02
  /* Column 4. */
  2574. FMADD y01, alpha4r, a7, y01
  2575. FMADD y02, alpha4i, a7, y02
  2576. FMSUBX y01, alpha4i, a8, y01
  2577. FMADDX y02, alpha4r, a8, y02
  2578. STFD y01, 0 * SIZE(Y2)
  2579. STFD y02, 1 * SIZE(Y2)
  2580. add Y2, Y2, INCY
  /*
   * LL(119): bottom of the four-column loop.  Decrements the counter J and
   * branches back to LL(111) (defined outside this chunk) while J is still
   * greater than zero.
   */
  2581. .align 4
  2582. LL(119):
  2583. addi J, J, -1
  2584. cmpi cr0, 0, J, 0
  2585. bgt LL(111)
  /*
   * LL(120): decide whether a two-column pass is needed.  J = N & 2; the
   * result is never negative, so ble skips to LL(130) exactly when bit 1
   * of N is clear.
   */
  2586. .align 4
  2587. LL(120):
  2588. andi. J, N, 2
  2589. ble LL(130)
  /*
   * LL(121): set-up for the two-column pass.
   *   - reloads alpha_r / alpha_i from ALPHA_R / ALPHA_I (operands defined
   *     outside this chunk),
   *   - reads two x entries (stride INCX) and combines them with alpha into
   *     alpha1r/alpha1i and alpha2r/alpha2i -- presumably alpha * x[j]; the
   *     exact signs depend on the FMSUBR/FMADDR macros defined elsewhere,
   *   - points AO1/AO2 at two consecutive columns and advances A past them,
   *   - resets the y read pointer (Y1) and y write pointer (Y2) to Y,
   *   - preloads the first group of eight y entries and the first four
   *     entries of column 1 for the pipelined loop at LL(122).
   */
  2590. .align 4
  2591. LL(121):
  2592. lfd alpha_r, ALPHA_R
  2593. lfd alpha_i, ALPHA_I
  /* Two x entries, two slots each. */
  2594. LFD a1, 0 * SIZE(X)
  2595. LFD a2, 1 * SIZE(X)
  2596. add X, X, INCX
  2597. LFD a3, 0 * SIZE(X)
  2598. LFD a4, 1 * SIZE(X)
  2599. add X, X, INCX
  /* Per-column scalars built from alpha and the x entries just loaded. */
  2600. FMUL alpha1r, alpha_r, a1
  2601. FMUL alpha1i, alpha_i, a1
  2602. FMUL alpha2r, alpha_r, a3
  2603. FMUL alpha2i, alpha_i, a3
  2604. FMSUBR alpha1r, alpha_i, a2, alpha1r
  2605. FMADDR alpha1i, alpha_r, a2, alpha1i
  2606. FMSUBR alpha2r, alpha_i, a4, alpha2r
  2607. FMADDR alpha2i, alpha_r, a4, alpha2i
  /* AO1 = A, AO2 = A + LDA, A = A + 2 * LDA. */
  2608. mr AO1, A
  2609. add AO2, A, LDA
  2610. add A, AO2, LDA
  2611. mr Y1, Y
  2612. mr Y2, Y
  /* CTR = M >> 3 (groups of eight entries).  mtspr leaves CR0 alone, so ble
     tests the srawi. result and skips the unrolled loop when there is no full group. */
  2613. srawi. r0, M, 3
  2614. mtspr CTR, r0
  2615. ble LL(125)
  2616. .align 4
  /* Preload: y entries 0..7 into y01..y16, column-1 entries 0..3 into a1..a8
     (even slots in a1/a3/a5/a7, odd slots in a2/a4/a6/a8). */
  2617. LFD y01, 0 * SIZE(Y1)
  2618. LFD y02, 1 * SIZE(Y1)
  2619. add Y1, Y1, INCY
  2620. LFD y03, 0 * SIZE(Y1)
  2621. LFD y04, 1 * SIZE(Y1)
  2622. add Y1, Y1, INCY
  2623. LFD a1, 0 * SIZE(AO1)
  2624. LFD a3, 2 * SIZE(AO1)
  2625. LFD a5, 4 * SIZE(AO1)
  2626. LFD a7, 6 * SIZE(AO1)
  2627. LFD y05, 0 * SIZE(Y1)
  2628. LFD y06, 1 * SIZE(Y1)
  2629. add Y1, Y1, INCY
  2630. LFD y07, 0 * SIZE(Y1)
  2631. LFD y08, 1 * SIZE(Y1)
  2632. add Y1, Y1, INCY
  2633. LFD a2, 1 * SIZE(AO1)
  2634. LFD a4, 3 * SIZE(AO1)
  2635. LFD a6, 5 * SIZE(AO1)
  2636. LFD a8, 7 * SIZE(AO1)
  2637. LFD y09, 0 * SIZE(Y1)
  2638. LFD y10, 1 * SIZE(Y1)
  2639. add Y1, Y1, INCY
  2640. LFD y11, 0 * SIZE(Y1)
  2641. LFD y12, 1 * SIZE(Y1)
  2642. add Y1, Y1, INCY
  2643. LFD y13, 0 * SIZE(Y1)
  2644. LFD y14, 1 * SIZE(Y1)
  2645. add Y1, Y1, INCY
  2646. LFD y15, 0 * SIZE(Y1)
  2647. LFD y16, 1 * SIZE(Y1)
  2648. add Y1, Y1, INCY
  /* bdz decrements CTR first: with exactly one group, go straight to the tail at LL(123). */
  2649. bdz LL(123)
  /*
   * LL(122): software-pipelined main loop of the two-column pass.  Each
   * iteration finishes one group of eight y entries (y01..y16) using
   * column 1 (AO1, alpha1) and column 2 (AO2, alpha2), stores them through
   * Y2, and at the same time reloads y01..y16 and a1..a8 for the next
   * group.  On entry y01..y16 hold the current group and a1..a8 hold
   * entries 0..3 of column 1.  The instruction order (including the nops)
   * is hand-scheduled; do not reorder without re-checking every register's
   * live range.  DCBT is a macro defined outside this chunk -- presumably a
   * data-cache prefetch hint (TODO confirm).
   */
  2650. .align 4
  2651. LL(122):
  /* Column 1, entries 0..3: even-slot products. */
  2652. FMADD y01, alpha1r, a1, y01
  2653. FMADD y02, alpha1i, a1, y02
  2654. FMADD y03, alpha1r, a3, y03
  2655. FMADD y04, alpha1i, a3, y04
  2656. FMADD y05, alpha1r, a5, y05
  2657. FMADD y06, alpha1i, a5, y06
  2658. FMADD y07, alpha1r, a7, y07
  2659. FMADD y08, alpha1i, a7, y08
  /* Even slots of column-1 entries 4..7. */
  2660. LFD a1, 8 * SIZE(AO1)
  2661. LFD a3, 10 * SIZE(AO1)
  2662. LFD a5, 12 * SIZE(AO1)
  2663. LFD a7, 14 * SIZE(AO1)
  /* Column 1, entries 0..3: odd-slot products. */
  2664. FMSUBX y01, alpha1i, a2, y01
  2665. FMADDX y02, alpha1r, a2, y02
  2666. FMSUBX y03, alpha1i, a4, y03
  2667. FMADDX y04, alpha1r, a4, y04
  2668. FMSUBX y05, alpha1i, a6, y05
  2669. FMADDX y06, alpha1r, a6, y06
  2670. FMSUBX y07, alpha1i, a8, y07
  2671. FMADDX y08, alpha1r, a8, y08
  /* Odd slots of column-1 entries 4..7, then step AO1 to the next group. */
  2672. LFD a2, 9 * SIZE(AO1)
  2673. LFD a4, 11 * SIZE(AO1)
  2674. LFD a6, 13 * SIZE(AO1)
  2675. LFD a8, 15 * SIZE(AO1)
  2676. addi AO1, AO1, 16 * SIZE
  2677. nop
  2678. DCBT(AO1, PREA)
  2679. nop
  /* Column 1, entries 4..7: even-slot products. */
  2680. FMADD y09, alpha1r, a1, y09
  2681. FMADD y10, alpha1i, a1, y10
  2682. FMADD y11, alpha1r, a3, y11
  2683. FMADD y12, alpha1i, a3, y12
  2684. FMADD y13, alpha1r, a5, y13
  2685. FMADD y14, alpha1i, a5, y14
  2686. FMADD y15, alpha1r, a7, y15
  2687. FMADD y16, alpha1i, a7, y16
  /* Even slots of column-2 entries 0..3. */
  2688. LFD a1, 0 * SIZE(AO2)
  2689. LFD a3, 2 * SIZE(AO2)
  2690. LFD a5, 4 * SIZE(AO2)
  2691. LFD a7, 6 * SIZE(AO2)
  /* Column 1, entries 4..7: odd-slot products. */
  2692. FMSUBX y09, alpha1i, a2, y09
  2693. FMADDX y10, alpha1r, a2, y10
  2694. FMSUBX y11, alpha1i, a4, y11
  2695. FMADDX y12, alpha1r, a4, y12
  2696. FMSUBX y13, alpha1i, a6, y13
  2697. FMADDX y14, alpha1r, a6, y14
  2698. FMSUBX y15, alpha1i, a8, y15
  2699. FMADDX y16, alpha1r, a8, y16
  /* Odd slots of column-2 entries 0..3. */
  2700. LFD a2, 1 * SIZE(AO2)
  2701. LFD a4, 3 * SIZE(AO2)
  2702. LFD a6, 5 * SIZE(AO2)
  2703. LFD a8, 7 * SIZE(AO2)
  /* Column 2, entries 0..3: even-slot products. */
  2704. FMADD y01, alpha2r, a1, y01
  2705. FMADD y02, alpha2i, a1, y02
  2706. FMADD y03, alpha2r, a3, y03
  2707. FMADD y04, alpha2i, a3, y04
  2708. FMADD y05, alpha2r, a5, y05
  2709. FMADD y06, alpha2i, a5, y06
  2710. FMADD y07, alpha2r, a7, y07
  2711. FMADD y08, alpha2i, a7, y08
  /* Even slots of column-2 entries 4..7. */
  2712. LFD a1, 8 * SIZE(AO2)
  2713. LFD a3, 10 * SIZE(AO2)
  2714. LFD a5, 12 * SIZE(AO2)
  2715. LFD a7, 14 * SIZE(AO2)
  /* Entries 0 and 1 are complete after these odd-slot products. */
  2716. FMSUBX y01, alpha2i, a2, y01
  2717. FMADDX y02, alpha2r, a2, y02
  2718. FMSUBX y03, alpha2i, a4, y03
  2719. FMADDX y04, alpha2r, a4, y04
  /* Store finished entries 0/1 through Y2 and immediately reload the same
     registers with the next group's entries through Y1. */
  2720. STFD y01, 0 * SIZE(Y2)
  2721. nop
  2722. STFD y02, 1 * SIZE(Y2)
  2723. add Y2, Y2, INCY
  2724. LFD y01, 0 * SIZE(Y1)
  2725. nop
  2726. LFD y02, 1 * SIZE(Y1)
  2727. add Y1, Y1, INCY
  2728. STFD y03, 0 * SIZE(Y2)
  2729. nop
  2730. STFD y04, 1 * SIZE(Y2)
  2731. add Y2, Y2, INCY
  2732. LFD y03, 0 * SIZE(Y1)
  2733. nop
  2734. LFD y04, 1 * SIZE(Y1)
  2735. add Y1, Y1, INCY
  /* Entries 2 and 3 are complete after these odd-slot products. */
  2736. FMSUBX y05, alpha2i, a6, y05
  2737. FMADDX y06, alpha2r, a6, y06
  2738. FMSUBX y07, alpha2i, a8, y07
  2739. FMADDX y08, alpha2r, a8, y08
  /* Odd slots of column-2 entries 4..7, then step AO2 to the next group. */
  2740. LFD a2, 9 * SIZE(AO2)
  2741. LFD a4, 11 * SIZE(AO2)
  2742. LFD a6, 13 * SIZE(AO2)
  2743. LFD a8, 15 * SIZE(AO2)
  2744. addi AO2, AO2, 16 * SIZE
  2745. nop
  2746. DCBT(AO2, PREA)
  2747. nop
  /* Store finished entries 2/3 and reload y05..y08 for the next group. */
  2748. STFD y05, 0 * SIZE(Y2)
  2749. nop
  2750. STFD y06, 1 * SIZE(Y2)
  2751. add Y2, Y2, INCY
  2752. LFD y05, 0 * SIZE(Y1)
  2753. nop
  2754. LFD y06, 1 * SIZE(Y1)
  2755. add Y1, Y1, INCY
  2756. STFD y07, 0 * SIZE(Y2)
  2757. nop
  2758. STFD y08, 1 * SIZE(Y2)
  2759. add Y2, Y2, INCY
  2760. LFD y07, 0 * SIZE(Y1)
  2761. nop
  2762. LFD y08, 1 * SIZE(Y1)
  2763. add Y1, Y1, INCY
  /* Column 2, entries 4..7: even-slot products. */
  2764. FMADD y09, alpha2r, a1, y09
  2765. FMADD y10, alpha2i, a1, y10
  2766. FMADD y11, alpha2r, a3, y11
  2767. FMADD y12, alpha2i, a3, y12
  2768. FMADD y13, alpha2r, a5, y13
  2769. FMADD y14, alpha2i, a5, y14
  2770. FMADD y15, alpha2r, a7, y15
  2771. FMADD y16, alpha2i, a7, y16
  /* AO1 already points at the next group: preload its even slots for the next iteration. */
  2772. LFD a1, 0 * SIZE(AO1)
  2773. LFD a3, 2 * SIZE(AO1)
  2774. LFD a5, 4 * SIZE(AO1)
  2775. LFD a7, 6 * SIZE(AO1)
  /* Entries 4 and 5 are complete after these odd-slot products. */
  2776. FMSUBX y09, alpha2i, a2, y09
  2777. FMADDX y10, alpha2r, a2, y10
  2778. FMSUBX y11, alpha2i, a4, y11
  2779. FMADDX y12, alpha2r, a4, y12
  /* Store finished entries 4/5 and reload y09..y12 for the next group. */
  2780. STFD y09, 0 * SIZE(Y2)
  2781. nop
  2782. STFD y10, 1 * SIZE(Y2)
  2783. add Y2, Y2, INCY
  2784. LFD y09, 0 * SIZE(Y1)
  2785. nop
  2786. LFD y10, 1 * SIZE(Y1)
  2787. add Y1, Y1, INCY
  2788. STFD y11, 0 * SIZE(Y2)
  2789. nop
  2790. STFD y12, 1 * SIZE(Y2)
  2791. add Y2, Y2, INCY
  2792. LFD y11, 0 * SIZE(Y1)
  2793. nop
  2794. LFD y12, 1 * SIZE(Y1)
  2795. add Y1, Y1, INCY
  /* Entries 6 and 7 are complete after these odd-slot products. */
  2796. FMSUBX y13, alpha2i, a6, y13
  2797. FMADDX y14, alpha2r, a6, y14
  2798. FMSUBX y15, alpha2i, a8, y15
  2799. FMADDX y16, alpha2r, a8, y16
  /* Preload the odd slots of the next group's column-1 entries 0..3. */
  2800. LFD a2, 1 * SIZE(AO1)
  2801. LFD a4, 3 * SIZE(AO1)
  2802. LFD a6, 5 * SIZE(AO1)
  2803. LFD a8, 7 * SIZE(AO1)
  /* Store finished entries 6/7, then reload y13..y16 for the next group. */
  2804. STFD y13, 0 * SIZE(Y2)
  2805. nop
  2806. STFD y14, 1 * SIZE(Y2)
  2807. add Y2, Y2, INCY
  2808. STFD y15, 0 * SIZE(Y2)
  2809. nop
  2810. STFD y16, 1 * SIZE(Y2)
  2811. add Y2, Y2, INCY
  2812. LFD y13, 0 * SIZE(Y1)
  2813. nop
  2814. LFD y14, 1 * SIZE(Y1)
  2815. add Y1, Y1, INCY
  2816. LFD y15, 0 * SIZE(Y1)
  2817. nop
  2818. LFD y16, 1 * SIZE(Y1)
  2819. add Y1, Y1, INCY
  2820. DCBT(Y1, PREC)
  /* Decrement CTR and repeat while it is non-zero; the final group falls through to LL(123). */
  2821. bdnz LL(122)
  /*
   * LL(123): tail of the two-column unrolled loop.  Processes the last
   * group of eight y entries, which is already held in y01..y16, with
   * column-1 entries 0..3 already in a1..a8.  Same arithmetic as one
   * iteration of LL(122), but nothing is preloaded for a following group:
   * y is only stored, and AO1/AO2 are simply advanced past the sixteen
   * slots consumed.
   */
  2822. .align 4
  2823. LL(123):
  /* Column 1, entries 0..3. */
  2824. FMADD y01, alpha1r, a1, y01
  2825. FMADD y02, alpha1i, a1, y02
  2826. FMADD y03, alpha1r, a3, y03
  2827. FMADD y04, alpha1i, a3, y04
  2828. FMADD y05, alpha1r, a5, y05
  2829. FMADD y06, alpha1i, a5, y06
  2830. FMADD y07, alpha1r, a7, y07
  2831. FMADD y08, alpha1i, a7, y08
  2832. LFD a1, 8 * SIZE(AO1)
  2833. LFD a3, 10 * SIZE(AO1)
  2834. LFD a5, 12 * SIZE(AO1)
  2835. LFD a7, 14 * SIZE(AO1)
  2836. FMSUBX y01, alpha1i, a2, y01
  2837. FMADDX y02, alpha1r, a2, y02
  2838. FMSUBX y03, alpha1i, a4, y03
  2839. FMADDX y04, alpha1r, a4, y04
  2840. FMSUBX y05, alpha1i, a6, y05
  2841. FMADDX y06, alpha1r, a6, y06
  2842. FMSUBX y07, alpha1i, a8, y07
  2843. FMADDX y08, alpha1r, a8, y08
  2844. LFD a2, 9 * SIZE(AO1)
  2845. LFD a4, 11 * SIZE(AO1)
  2846. LFD a6, 13 * SIZE(AO1)
  2847. LFD a8, 15 * SIZE(AO1)
  /* Column 1, entries 4..7. */
  2848. FMADD y09, alpha1r, a1, y09
  2849. FMADD y10, alpha1i, a1, y10
  2850. FMADD y11, alpha1r, a3, y11
  2851. FMADD y12, alpha1i, a3, y12
  2852. FMADD y13, alpha1r, a5, y13
  2853. FMADD y14, alpha1i, a5, y14
  2854. FMADD y15, alpha1r, a7, y15
  2855. FMADD y16, alpha1i, a7, y16
  2856. LFD a1, 0 * SIZE(AO2)
  2857. LFD a3, 2 * SIZE(AO2)
  2858. LFD a5, 4 * SIZE(AO2)
  2859. LFD a7, 6 * SIZE(AO2)
  2860. FMSUBX y09, alpha1i, a2, y09
  2861. FMADDX y10, alpha1r, a2, y10
  2862. FMSUBX y11, alpha1i, a4, y11
  2863. FMADDX y12, alpha1r, a4, y12
  2864. FMSUBX y13, alpha1i, a6, y13
  2865. FMADDX y14, alpha1r, a6, y14
  2866. FMSUBX y15, alpha1i, a8, y15
  2867. FMADDX y16, alpha1r, a8, y16
  2868. LFD a2, 1 * SIZE(AO2)
  2869. LFD a4, 3 * SIZE(AO2)
  2870. LFD a6, 5 * SIZE(AO2)
  2871. LFD a8, 7 * SIZE(AO2)
  /* Column 2, entries 0..3. */
  2872. FMADD y01, alpha2r, a1, y01
  2873. FMADD y02, alpha2i, a1, y02
  2874. FMADD y03, alpha2r, a3, y03
  2875. FMADD y04, alpha2i, a3, y04
  2876. FMADD y05, alpha2r, a5, y05
  2877. FMADD y06, alpha2i, a5, y06
  2878. FMADD y07, alpha2r, a7, y07
  2879. FMADD y08, alpha2i, a7, y08
  2880. LFD a1, 8 * SIZE(AO2)
  2881. LFD a3, 10 * SIZE(AO2)
  2882. LFD a5, 12 * SIZE(AO2)
  2883. LFD a7, 14 * SIZE(AO2)
  2884. FMSUBX y01, alpha2i, a2, y01
  2885. FMADDX y02, alpha2r, a2, y02
  2886. FMSUBX y03, alpha2i, a4, y03
  2887. FMADDX y04, alpha2r, a4, y04
  /* Entries 0/1 done: store them; AO1 is no longer read in this group, so advance it here. */
  2888. STFD y01, 0 * SIZE(Y2)
  2889. addi AO1, AO1, 16 * SIZE
  2890. STFD y02, 1 * SIZE(Y2)
  2891. add Y2, Y2, INCY
  2892. STFD y03, 0 * SIZE(Y2)
  2893. nop
  2894. STFD y04, 1 * SIZE(Y2)
  2895. add Y2, Y2, INCY
  2896. FMSUBX y05, alpha2i, a6, y05
  2897. FMADDX y06, alpha2r, a6, y06
  2898. FMSUBX y07, alpha2i, a8, y07
  2899. FMADDX y08, alpha2r, a8, y08
  /* Last reads from AO2 (odd slots of entries 4..7); AO2 is advanced right after. */
  2900. LFD a2, 9 * SIZE(AO2)
  2901. LFD a4, 11 * SIZE(AO2)
  2902. LFD a6, 13 * SIZE(AO2)
  2903. LFD a8, 15 * SIZE(AO2)
  2904. STFD y05, 0 * SIZE(Y2)
  2905. addi AO2, AO2, 16 * SIZE
  2906. STFD y06, 1 * SIZE(Y2)
  2907. add Y2, Y2, INCY
  2908. STFD y07, 0 * SIZE(Y2)
  2909. nop
  2910. STFD y08, 1 * SIZE(Y2)
  2911. add Y2, Y2, INCY
  /* Column 2, entries 4..7. */
  2912. FMADD y09, alpha2r, a1, y09
  2913. FMADD y10, alpha2i, a1, y10
  2914. FMADD y11, alpha2r, a3, y11
  2915. FMADD y12, alpha2i, a3, y12
  2916. FMADD y13, alpha2r, a5, y13
  2917. FMADD y14, alpha2i, a5, y14
  2918. FMADD y15, alpha2r, a7, y15
  2919. FMADD y16, alpha2i, a7, y16
  2920. FMSUBX y09, alpha2i, a2, y09
  2921. FMADDX y10, alpha2r, a2, y10
  2922. FMSUBX y11, alpha2i, a4, y11
  2923. FMADDX y12, alpha2r, a4, y12
  2924. STFD y09, 0 * SIZE(Y2)
  2925. nop
  2926. STFD y10, 1 * SIZE(Y2)
  2927. add Y2, Y2, INCY
  2928. STFD y11, 0 * SIZE(Y2)
  2929. nop
  2930. STFD y12, 1 * SIZE(Y2)
  2931. add Y2, Y2, INCY
  2932. FMSUBX y13, alpha2i, a6, y13
  2933. FMADDX y14, alpha2r, a6, y14
  2934. FMSUBX y15, alpha2i, a8, y15
  2935. FMADDX y16, alpha2r, a8, y16
  2936. STFD y13, 0 * SIZE(Y2)
  2937. nop
  2938. STFD y14, 1 * SIZE(Y2)
  2939. add Y2, Y2, INCY
  2940. STFD y15, 0 * SIZE(Y2)
  2941. nop
  2942. STFD y16, 1 * SIZE(Y2)
  2943. add Y2, Y2, INCY
  /*
   * LL(125): two-column pass, remainder handling for M mod 8.  If the low
   * three bits of M are all clear the pass is finished (go to LL(130)).
   * Otherwise, when bit 2 of M is set, update four y entries (y01..y08)
   * with four entries from each of the two columns and advance AO1/AO2 by
   * eight slots.  In both tests andi. yields a non-negative value, so ble
   * is taken exactly when the masked value is zero.
   */
  2944. .align 4
  2945. LL(125):
  2946. andi. r0, M, 7
  2947. ble LL(130)
  2948. andi. r0, M, 4
  2949. ble LL(126)
  /* Load four y entries and four column-1 entries (even slots, then odd slots). */
  2950. LFD y01, 0 * SIZE(Y1)
  2951. LFD y02, 1 * SIZE(Y1)
  2952. add Y1, Y1, INCY
  2953. LFD y03, 0 * SIZE(Y1)
  2954. LFD y04, 1 * SIZE(Y1)
  2955. add Y1, Y1, INCY
  2956. LFD a1, 0 * SIZE(AO1)
  2957. LFD a3, 2 * SIZE(AO1)
  2958. LFD a5, 4 * SIZE(AO1)
  2959. LFD a7, 6 * SIZE(AO1)
  2960. LFD y05, 0 * SIZE(Y1)
  2961. LFD y06, 1 * SIZE(Y1)
  2962. add Y1, Y1, INCY
  2963. LFD y07, 0 * SIZE(Y1)
  2964. LFD y08, 1 * SIZE(Y1)
  2965. add Y1, Y1, INCY
  2966. LFD a2, 1 * SIZE(AO1)
  2967. LFD a4, 3 * SIZE(AO1)
  2968. LFD a6, 5 * SIZE(AO1)
  2969. LFD a8, 7 * SIZE(AO1)
  /* Column 1 contribution; column-2 loads are interleaved as a1..a8 free up. */
  2970. FMADD y01, alpha1r, a1, y01
  2971. FMADD y02, alpha1i, a1, y02
  2972. FMADD y03, alpha1r, a3, y03
  2973. FMADD y04, alpha1i, a3, y04
  2974. FMADD y05, alpha1r, a5, y05
  2975. FMADD y06, alpha1i, a5, y06
  2976. FMADD y07, alpha1r, a7, y07
  2977. FMADD y08, alpha1i, a7, y08
  2978. LFD a1, 0 * SIZE(AO2)
  2979. LFD a3, 2 * SIZE(AO2)
  2980. LFD a5, 4 * SIZE(AO2)
  2981. LFD a7, 6 * SIZE(AO2)
  2982. FMSUBX y01, alpha1i, a2, y01
  2983. FMADDX y02, alpha1r, a2, y02
  2984. FMSUBX y03, alpha1i, a4, y03
  2985. FMADDX y04, alpha1r, a4, y04
  2986. FMSUBX y05, alpha1i, a6, y05
  2987. FMADDX y06, alpha1r, a6, y06
  2988. FMSUBX y07, alpha1i, a8, y07
  2989. FMADDX y08, alpha1r, a8, y08
  2990. LFD a2, 1 * SIZE(AO2)
  2991. LFD a4, 3 * SIZE(AO2)
  2992. LFD a6, 5 * SIZE(AO2)
  2993. LFD a8, 7 * SIZE(AO2)
  /* Column 2 contribution. */
  2994. FMADD y01, alpha2r, a1, y01
  2995. FMADD y02, alpha2i, a1, y02
  2996. FMADD y03, alpha2r, a3, y03
  2997. FMADD y04, alpha2i, a3, y04
  2998. FMADD y05, alpha2r, a5, y05
  2999. FMADD y06, alpha2i, a5, y06
  3000. FMADD y07, alpha2r, a7, y07
  3001. FMADD y08, alpha2i, a7, y08
  3002. FMSUBX y01, alpha2i, a2, y01
  3003. FMADDX y02, alpha2r, a2, y02
  3004. FMSUBX y03, alpha2i, a4, y03
  3005. FMADDX y04, alpha2r, a4, y04
  /* Store entries 0/1 and advance both column pointers past the four entries consumed. */
  3006. STFD y01, 0 * SIZE(Y2)
  3007. addi AO1, AO1, 8 * SIZE
  3008. STFD y02, 1 * SIZE(Y2)
  3009. add Y2, Y2, INCY
  3010. STFD y03, 0 * SIZE(Y2)
  3011. addi AO2, AO2, 8 * SIZE
  3012. STFD y04, 1 * SIZE(Y2)
  3013. add Y2, Y2, INCY
  3014. FMSUBX y05, alpha2i, a6, y05
  3015. FMADDX y06, alpha2r, a6, y06
  3016. FMSUBX y07, alpha2i, a8, y07
  3017. FMADDX y08, alpha2r, a8, y08
  /* Store entries 2/3. */
  3018. STFD y05, 0 * SIZE(Y2)
  3019. nop
  3020. STFD y06, 1 * SIZE(Y2)
  3021. add Y2, Y2, INCY
  3022. STFD y07, 0 * SIZE(Y2)
  3023. nop
  3024. STFD y08, 1 * SIZE(Y2)
  3025. add Y2, Y2, INCY
  /*
   * LL(126): two-column pass, remainder step taken when bit 1 of M is set.
   * Updates two y entries (y01..y04) with two entries from column 1
   * (a1..a4, alpha1) and two from column 2 (a5..a8, alpha2), then advances
   * AO1/AO2 by four slots.
   */
  3026. .align 4
  3027. LL(126):
  /* Non-negative andi. result: ble is taken exactly when bit 1 of M is clear. */
  3028. andi. r0, M, 2
  3029. ble LL(127)
  3030. LFD a1, 0 * SIZE(AO1)
  3031. LFD a2, 1 * SIZE(AO1)
  3032. LFD a3, 2 * SIZE(AO1)
  3033. LFD a4, 3 * SIZE(AO1)
  3034. LFD y01, 0 * SIZE(Y1)
  3035. LFD y02, 1 * SIZE(Y1)
  3036. add Y1, Y1, INCY
  3037. LFD y03, 0 * SIZE(Y1)
  3038. LFD y04, 1 * SIZE(Y1)
  3039. add Y1, Y1, INCY
  3040. LFD a5, 0 * SIZE(AO2)
  3041. LFD a6, 1 * SIZE(AO2)
  3042. LFD a7, 2 * SIZE(AO2)
  3043. LFD a8, 3 * SIZE(AO2)
  /* Column 1 contribution. */
  3044. FMADD y01, alpha1r, a1, y01
  3045. FMADD y02, alpha1i, a1, y02
  3046. FMADD y03, alpha1r, a3, y03
  3047. FMADD y04, alpha1i, a3, y04
  3048. FMSUBX y01, alpha1i, a2, y01
  3049. FMADDX y02, alpha1r, a2, y02
  3050. FMSUBX y03, alpha1i, a4, y03
  3051. FMADDX y04, alpha1r, a4, y04
  /* Column 2 contribution. */
  3052. FMADD y01, alpha2r, a5, y01
  3053. FMADD y02, alpha2i, a5, y02
  3054. FMADD y03, alpha2r, a7, y03
  3055. FMADD y04, alpha2i, a7, y04
  3056. FMSUBX y01, alpha2i, a6, y01
  3057. FMADDX y02, alpha2r, a6, y02
  3058. FMSUBX y03, alpha2i, a8, y03
  3059. FMADDX y04, alpha2r, a8, y04
  /* Store both entries and step the column pointers. */
  3060. STFD y01, 0 * SIZE(Y2)
  3061. addi AO1, AO1, 4 * SIZE
  3062. STFD y02, 1 * SIZE(Y2)
  3063. add Y2, Y2, INCY
  3064. STFD y03, 0 * SIZE(Y2)
  3065. addi AO2, AO2, 4 * SIZE
  3066. STFD y04, 1 * SIZE(Y2)
  3067. add Y2, Y2, INCY
  /*
   * LL(127): two-column pass, final remainder step taken when bit 0 of M
   * is set.  Updates one y entry (y01/y02) with one entry from column 1
   * (a1/a2, alpha1) and one from column 2 (a3/a4, alpha2).
   */
  3068. .align 4
  3069. LL(127):
  /* Non-negative andi. result: ble is taken exactly when bit 0 of M is clear. */
  3070. andi. r0, M, 1
  3071. ble LL(130)
  3072. LFD y01, 0 * SIZE(Y1)
  3073. LFD y02, 1 * SIZE(Y1)
  3074. add Y1, Y1, INCY
  3075. LFD a1, 0 * SIZE(AO1)
  3076. LFD a2, 1 * SIZE(AO1)
  3077. LFD a3, 0 * SIZE(AO2)
  3078. LFD a4, 1 * SIZE(AO2)
  /* Column 1 contribution. */
  3079. FMADD y01, alpha1r, a1, y01
  3080. FMADD y02, alpha1i, a1, y02
  3081. FMSUBX y01, alpha1i, a2, y01
  3082. FMADDX y02, alpha1r, a2, y02
  /* Column 2 contribution. */
  3083. FMADD y01, alpha2r, a3, y01
  3084. FMADD y02, alpha2i, a3, y02
  3085. FMSUBX y01, alpha2i, a4, y01
  3086. FMADDX y02, alpha2r, a4, y02
  3087. STFD y01, 0 * SIZE(Y2)
  3088. STFD y02, 1 * SIZE(Y2)
  3089. add Y2, Y2, INCY
  /*
   * LL(130): decide whether a final single-column pass is needed.
   * J = N & 1; the result is never negative, so ble jumps to the epilogue
   * at LL(999) exactly when bit 0 of N is clear.
   */
  3090. .align 4
  3091. LL(130):
  3092. andi. J, N, 1
  3093. ble LL(999)
  /*
   * LL(131): set-up for the single-column pass.
   *   - reloads alpha_r / alpha_i from ALPHA_R / ALPHA_I (operands defined
   *     outside this chunk),
   *   - reads one x entry and combines it with alpha into alpha1r/alpha1i
   *     -- presumably alpha * x[j]; the exact signs depend on the
   *     FMSUBR/FMADDR macros defined elsewhere,
   *   - points AO1 at the column and advances A by LDA,
   *   - resets the y read pointer (Y1) and y write pointer (Y2) to Y,
   *   - preloads eight y entries and the first four column entries for the
   *     pipelined loop at LL(132).
   */
  3094. .align 4
  3095. LL(131):
  3096. lfd alpha_r, ALPHA_R
  3097. lfd alpha_i, ALPHA_I
  3098. LFD a1, 0 * SIZE(X)
  3099. LFD a2, 1 * SIZE(X)
  3100. add X, X, INCX
  3101. FMUL alpha1r, alpha_r, a1
  3102. FMUL alpha1i, alpha_i, a1
  3103. FMSUBR alpha1r, alpha_i, a2, alpha1r
  3104. FMADDR alpha1i, alpha_r, a2, alpha1i
  3105. mr AO1, A
  3106. add A, AO1, LDA
  3107. mr Y1, Y
  3108. mr Y2, Y
  /* CTR = M >> 3.  mtspr leaves CR0 alone, so ble tests the srawi. result and
     skips the unrolled loop when there is no full group of eight. */
  3109. srawi. r0, M, 3
  3110. mtspr CTR, r0
  3111. ble LL(135)
  3112. .align 4
  /* Preload: y entries 0..7 into y01..y16, column entries 0..3 into a1..a8. */
  3113. LFD y01, 0 * SIZE(Y1)
  3114. LFD y02, 1 * SIZE(Y1)
  3115. add Y1, Y1, INCY
  3116. LFD y03, 0 * SIZE(Y1)
  3117. LFD y04, 1 * SIZE(Y1)
  3118. add Y1, Y1, INCY
  3119. LFD a1, 0 * SIZE(AO1)
  3120. LFD a3, 2 * SIZE(AO1)
  3121. LFD a5, 4 * SIZE(AO1)
  3122. LFD a7, 6 * SIZE(AO1)
  3123. LFD y05, 0 * SIZE(Y1)
  3124. LFD y06, 1 * SIZE(Y1)
  3125. add Y1, Y1, INCY
  3126. LFD y07, 0 * SIZE(Y1)
  3127. LFD y08, 1 * SIZE(Y1)
  3128. add Y1, Y1, INCY
  3129. LFD a2, 1 * SIZE(AO1)
  3130. LFD a4, 3 * SIZE(AO1)
  3131. LFD a6, 5 * SIZE(AO1)
  3132. LFD a8, 7 * SIZE(AO1)
  3133. LFD y09, 0 * SIZE(Y1)
  3134. LFD y10, 1 * SIZE(Y1)
  3135. add Y1, Y1, INCY
  3136. LFD y11, 0 * SIZE(Y1)
  3137. LFD y12, 1 * SIZE(Y1)
  3138. add Y1, Y1, INCY
  3139. LFD y13, 0 * SIZE(Y1)
  3140. LFD y14, 1 * SIZE(Y1)
  3141. add Y1, Y1, INCY
  3142. LFD y15, 0 * SIZE(Y1)
  3143. LFD y16, 1 * SIZE(Y1)
  3144. add Y1, Y1, INCY
  /* bdz decrements CTR first: with exactly one group, go straight to the tail at LL(133). */
  3145. bdz LL(133)
  /*
   * LL(132): software-pipelined main loop of the single-column pass.  Each
   * iteration finishes one group of eight y entries (y01..y16) using the
   * column at AO1 and alpha1, stores them through Y2, and reloads
   * y01..y16 and a1..a8 for the next group as registers become free.
   * The instruction order (including the nops) is hand-scheduled; do not
   * reorder without re-checking every register's live range.  DCBT is a
   * macro defined outside this chunk -- presumably a data-cache prefetch
   * hint (TODO confirm).
   */
  3146. .align 4
  3147. LL(132):
  /* Entries 0..3: even-slot products. */
  3148. FMADD y01, alpha1r, a1, y01
  3149. FMADD y02, alpha1i, a1, y02
  3150. FMADD y03, alpha1r, a3, y03
  3151. FMADD y04, alpha1i, a3, y04
  3152. FMADD y05, alpha1r, a5, y05
  3153. FMADD y06, alpha1i, a5, y06
  3154. FMADD y07, alpha1r, a7, y07
  3155. FMADD y08, alpha1i, a7, y08
  /* Even slots of entries 4..7. */
  3156. LFD a1, 8 * SIZE(AO1)
  3157. LFD a3, 10 * SIZE(AO1)
  3158. LFD a5, 12 * SIZE(AO1)
  3159. LFD a7, 14 * SIZE(AO1)
  /* Entries 0 and 1 are complete after these odd-slot products. */
  3160. FMSUBX y01, alpha1i, a2, y01
  3161. FMADDX y02, alpha1r, a2, y02
  3162. FMSUBX y03, alpha1i, a4, y03
  3163. FMADDX y04, alpha1r, a4, y04
  /* Store finished entries 0/1 and reload y01..y04 for the next group. */
  3164. STFD y01, 0 * SIZE(Y2)
  3165. nop
  3166. STFD y02, 1 * SIZE(Y2)
  3167. add Y2, Y2, INCY
  3168. LFD y01, 0 * SIZE(Y1)
  3169. nop
  3170. LFD y02, 1 * SIZE(Y1)
  3171. add Y1, Y1, INCY
  3172. STFD y03, 0 * SIZE(Y2)
  3173. nop
  3174. STFD y04, 1 * SIZE(Y2)
  3175. add Y2, Y2, INCY
  3176. LFD y03, 0 * SIZE(Y1)
  3177. nop
  3178. LFD y04, 1 * SIZE(Y1)
  3179. add Y1, Y1, INCY
  /* Entries 2 and 3 are complete after these odd-slot products. */
  3180. FMSUBX y05, alpha1i, a6, y05
  3181. FMADDX y06, alpha1r, a6, y06
  3182. FMSUBX y07, alpha1i, a8, y07
  3183. FMADDX y08, alpha1r, a8, y08
  /* Odd slots of entries 4..7, then step AO1 to the next group. */
  3184. LFD a2, 9 * SIZE(AO1)
  3185. LFD a4, 11 * SIZE(AO1)
  3186. LFD a6, 13 * SIZE(AO1)
  3187. LFD a8, 15 * SIZE(AO1)
  3188. addi AO1, AO1, 16 * SIZE
  3189. nop
  3190. DCBT(AO1, PREA)
  3191. nop
  /* Store finished entries 2/3 and reload y05..y08 for the next group. */
  3192. STFD y05, 0 * SIZE(Y2)
  3193. nop
  3194. STFD y06, 1 * SIZE(Y2)
  3195. add Y2, Y2, INCY
  3196. LFD y05, 0 * SIZE(Y1)
  3197. nop
  3198. LFD y06, 1 * SIZE(Y1)
  3199. add Y1, Y1, INCY
  3200. STFD y07, 0 * SIZE(Y2)
  3201. nop
  3202. STFD y08, 1 * SIZE(Y2)
  3203. add Y2, Y2, INCY
  3204. LFD y07, 0 * SIZE(Y1)
  3205. nop
  3206. LFD y08, 1 * SIZE(Y1)
  3207. add Y1, Y1, INCY
  /* Entries 4..7: even-slot products. */
  3208. FMADD y09, alpha1r, a1, y09
  3209. FMADD y10, alpha1i, a1, y10
  3210. FMADD y11, alpha1r, a3, y11
  3211. FMADD y12, alpha1i, a3, y12
  3212. FMADD y13, alpha1r, a5, y13
  3213. FMADD y14, alpha1i, a5, y14
  3214. FMADD y15, alpha1r, a7, y15
  3215. FMADD y16, alpha1i, a7, y16
  /* AO1 already points at the next group: preload its even slots. */
  3216. LFD a1, 0 * SIZE(AO1)
  3217. LFD a3, 2 * SIZE(AO1)
  3218. LFD a5, 4 * SIZE(AO1)
  3219. LFD a7, 6 * SIZE(AO1)
  /* Entries 4 and 5 are complete after these odd-slot products. */
  3220. FMSUBX y09, alpha1i, a2, y09
  3221. FMADDX y10, alpha1r, a2, y10
  3222. FMSUBX y11, alpha1i, a4, y11
  3223. FMADDX y12, alpha1r, a4, y12
  /* Store finished entries 4/5 and reload y09..y12 for the next group. */
  3224. STFD y09, 0 * SIZE(Y2)
  3225. nop
  3226. STFD y10, 1 * SIZE(Y2)
  3227. add Y2, Y2, INCY
  3228. LFD y09, 0 * SIZE(Y1)
  3229. nop
  3230. LFD y10, 1 * SIZE(Y1)
  3231. add Y1, Y1, INCY
  3232. STFD y11, 0 * SIZE(Y2)
  3233. nop
  3234. STFD y12, 1 * SIZE(Y2)
  3235. add Y2, Y2, INCY
  3236. LFD y11, 0 * SIZE(Y1)
  3237. nop
  3238. LFD y12, 1 * SIZE(Y1)
  3239. add Y1, Y1, INCY
  /* Entries 6 and 7 are complete after these odd-slot products. */
  3240. FMSUBX y13, alpha1i, a6, y13
  3241. FMADDX y14, alpha1r, a6, y14
  3242. FMSUBX y15, alpha1i, a8, y15
  3243. FMADDX y16, alpha1r, a8, y16
  /* Preload the odd slots of the next group's entries 0..3. */
  3244. LFD a2, 1 * SIZE(AO1)
  3245. LFD a4, 3 * SIZE(AO1)
  3246. LFD a6, 5 * SIZE(AO1)
  3247. LFD a8, 7 * SIZE(AO1)
  /* Store finished entries 6/7, then reload y13..y16 for the next group. */
  3248. STFD y13, 0 * SIZE(Y2)
  3249. nop
  3250. STFD y14, 1 * SIZE(Y2)
  3251. add Y2, Y2, INCY
  3252. STFD y15, 0 * SIZE(Y2)
  3253. nop
  3254. STFD y16, 1 * SIZE(Y2)
  3255. add Y2, Y2, INCY
  3256. LFD y13, 0 * SIZE(Y1)
  3257. nop
  3258. LFD y14, 1 * SIZE(Y1)
  3259. add Y1, Y1, INCY
  3260. LFD y15, 0 * SIZE(Y1)
  3261. nop
  3262. LFD y16, 1 * SIZE(Y1)
  3263. add Y1, Y1, INCY
  3264. DCBT(Y1, PREC)
  /* Decrement CTR and repeat while it is non-zero; the final group falls through to LL(133). */
  3265. bdnz LL(132)
  /*
   * LL(133): tail of the single-column unrolled loop.  Processes the last
   * group of eight y entries, already held in y01..y16, with column
   * entries 0..3 already in a1..a8.  All arithmetic is done first, then
   * the eight entries are stored through Y2; nothing is preloaded for a
   * following group.
   */
  3266. .align 4
  3267. LL(133):
  /* Entries 0..3. */
  3268. FMADD y01, alpha1r, a1, y01
  3269. FMADD y02, alpha1i, a1, y02
  3270. FMADD y03, alpha1r, a3, y03
  3271. FMADD y04, alpha1i, a3, y04
  3272. FMADD y05, alpha1r, a5, y05
  3273. FMADD y06, alpha1i, a5, y06
  3274. FMADD y07, alpha1r, a7, y07
  3275. FMADD y08, alpha1i, a7, y08
  3276. LFD a1, 8 * SIZE(AO1)
  3277. LFD a3, 10 * SIZE(AO1)
  3278. LFD a5, 12 * SIZE(AO1)
  3279. LFD a7, 14 * SIZE(AO1)
  3280. FMSUBX y01, alpha1i, a2, y01
  3281. FMADDX y02, alpha1r, a2, y02
  3282. FMSUBX y03, alpha1i, a4, y03
  3283. FMADDX y04, alpha1r, a4, y04
  3284. FMSUBX y05, alpha1i, a6, y05
  3285. FMADDX y06, alpha1r, a6, y06
  3286. FMSUBX y07, alpha1i, a8, y07
  3287. FMADDX y08, alpha1r, a8, y08
  3288. LFD a2, 9 * SIZE(AO1)
  3289. LFD a4, 11 * SIZE(AO1)
  3290. LFD a6, 13 * SIZE(AO1)
  3291. LFD a8, 15 * SIZE(AO1)
  /* Entries 4..7. */
  3292. FMADD y09, alpha1r, a1, y09
  3293. FMADD y10, alpha1i, a1, y10
  3294. FMADD y11, alpha1r, a3, y11
  3295. FMADD y12, alpha1i, a3, y12
  3296. FMADD y13, alpha1r, a5, y13
  3297. FMADD y14, alpha1i, a5, y14
  3298. FMADD y15, alpha1r, a7, y15
  3299. FMADD y16, alpha1i, a7, y16
  3300. FMSUBX y09, alpha1i, a2, y09
  3301. FMADDX y10, alpha1r, a2, y10
  3302. FMSUBX y11, alpha1i, a4, y11
  3303. FMADDX y12, alpha1r, a4, y12
  3304. FMSUBX y13, alpha1i, a6, y13
  3305. FMADDX y14, alpha1r, a6, y14
  3306. FMSUBX y15, alpha1i, a8, y15
  3307. FMADDX y16, alpha1r, a8, y16
  /* Store all eight entries; AO1 is advanced past the sixteen slots consumed. */
  3308. STFD y01, 0 * SIZE(Y2)
  3309. addi AO1, AO1, 16 * SIZE
  3310. STFD y02, 1 * SIZE(Y2)
  3311. add Y2, Y2, INCY
  3312. STFD y03, 0 * SIZE(Y2)
  3313. nop
  3314. STFD y04, 1 * SIZE(Y2)
  3315. add Y2, Y2, INCY
  3316. STFD y05, 0 * SIZE(Y2)
  3317. nop
  3318. STFD y06, 1 * SIZE(Y2)
  3319. add Y2, Y2, INCY
  3320. STFD y07, 0 * SIZE(Y2)
  3321. nop
  3322. STFD y08, 1 * SIZE(Y2)
  3323. add Y2, Y2, INCY
  3324. STFD y09, 0 * SIZE(Y2)
  3325. nop
  3326. STFD y10, 1 * SIZE(Y2)
  3327. add Y2, Y2, INCY
  3328. STFD y11, 0 * SIZE(Y2)
  3329. nop
  3330. STFD y12, 1 * SIZE(Y2)
  3331. add Y2, Y2, INCY
  3332. STFD y13, 0 * SIZE(Y2)
  3333. nop
  3334. STFD y14, 1 * SIZE(Y2)
  3335. add Y2, Y2, INCY
  3336. STFD y15, 0 * SIZE(Y2)
  3337. nop
  3338. STFD y16, 1 * SIZE(Y2)
  3339. add Y2, Y2, INCY
  /*
   * LL(135): single-column pass, remainder handling for M mod 8.  If the
   * low three bits of M are all clear, go to the epilogue at LL(999).
   * Otherwise, when bit 2 of M is set, update four y entries (y01..y08)
   * with four column entries and advance AO1 by eight slots.  In both
   * tests andi. yields a non-negative value, so ble is taken exactly when
   * the masked value is zero.
   */
  3340. .align 4
  3341. LL(135):
  3342. andi. r0, M, 7
  3343. ble LL(999)
  3344. andi. r0, M, 4
  3345. ble LL(136)
  /* Four y entries through Y1. */
  3346. LFD y01, 0 * SIZE(Y1)
  3347. nop
  3348. LFD y02, 1 * SIZE(Y1)
  3349. add Y1, Y1, INCY
  3350. LFD y03, 0 * SIZE(Y1)
  3351. nop
  3352. LFD y04, 1 * SIZE(Y1)
  3353. add Y1, Y1, INCY
  3354. LFD y05, 0 * SIZE(Y1)
  3355. nop
  3356. LFD y06, 1 * SIZE(Y1)
  3357. add Y1, Y1, INCY
  3358. LFD y07, 0 * SIZE(Y1)
  3359. nop
  3360. LFD y08, 1 * SIZE(Y1)
  3361. add Y1, Y1, INCY
  /* Four column entries: even slots, then odd slots. */
  3362. LFD a1, 0 * SIZE(AO1)
  3363. LFD a3, 2 * SIZE(AO1)
  3364. LFD a5, 4 * SIZE(AO1)
  3365. LFD a7, 6 * SIZE(AO1)
  3366. LFD a2, 1 * SIZE(AO1)
  3367. LFD a4, 3 * SIZE(AO1)
  3368. LFD a6, 5 * SIZE(AO1)
  3369. LFD a8, 7 * SIZE(AO1)
  3370. FMADD y01, alpha1r, a1, y01
  3371. FMADD y02, alpha1i, a1, y02
  3372. FMADD y03, alpha1r, a3, y03
  3373. FMADD y04, alpha1i, a3, y04
  3374. FMADD y05, alpha1r, a5, y05
  3375. FMADD y06, alpha1i, a5, y06
  3376. FMADD y07, alpha1r, a7, y07
  3377. FMADD y08, alpha1i, a7, y08
  3378. FMSUBX y01, alpha1i, a2, y01
  3379. FMADDX y02, alpha1r, a2, y02
  3380. FMSUBX y03, alpha1i, a4, y03
  3381. FMADDX y04, alpha1r, a4, y04
  3382. FMSUBX y05, alpha1i, a6, y05
  3383. FMADDX y06, alpha1r, a6, y06
  3384. FMSUBX y07, alpha1i, a8, y07
  3385. FMADDX y08, alpha1r, a8, y08
  /* Store the four entries and step AO1 past them. */
  3386. STFD y01, 0 * SIZE(Y2)
  3387. addi AO1, AO1, 8 * SIZE
  3388. STFD y02, 1 * SIZE(Y2)
  3389. add Y2, Y2, INCY
  3390. STFD y03, 0 * SIZE(Y2)
  3391. nop
  3392. STFD y04, 1 * SIZE(Y2)
  3393. add Y2, Y2, INCY
  3394. STFD y05, 0 * SIZE(Y2)
  3395. nop
  3396. STFD y06, 1 * SIZE(Y2)
  3397. add Y2, Y2, INCY
  3398. STFD y07, 0 * SIZE(Y2)
  3399. nop
  3400. STFD y08, 1 * SIZE(Y2)
  3401. add Y2, Y2, INCY
  /*
   * LL(136): single-column pass, remainder step taken when bit 1 of M is
   * set.  Updates two y entries (y01..y04) with two column entries
   * (a1..a4) and advances AO1 by four slots.
   */
  3402. .align 4
  3403. LL(136):
  /* Non-negative andi. result: ble is taken exactly when bit 1 of M is clear. */
  3404. andi. r0, M, 2
  3405. ble LL(137)
  3406. LFD a1, 0 * SIZE(AO1)
  3407. LFD a2, 1 * SIZE(AO1)
  3408. LFD a3, 2 * SIZE(AO1)
  3409. LFD a4, 3 * SIZE(AO1)
  3410. LFD y01, 0 * SIZE(Y1)
  3411. nop
  3412. LFD y02, 1 * SIZE(Y1)
  3413. add Y1, Y1, INCY
  3414. LFD y03, 0 * SIZE(Y1)
  3415. nop
  3416. LFD y04, 1 * SIZE(Y1)
  3417. add Y1, Y1, INCY
  3418. FMADD y01, alpha1r, a1, y01
  3419. FMADD y02, alpha1i, a1, y02
  3420. FMADD y03, alpha1r, a3, y03
  3421. FMADD y04, alpha1i, a3, y04
  3422. FMSUBX y01, alpha1i, a2, y01
  3423. FMADDX y02, alpha1r, a2, y02
  3424. FMSUBX y03, alpha1i, a4, y03
  3425. FMADDX y04, alpha1r, a4, y04
  3426. STFD y01, 0 * SIZE(Y2)
  3427. addi AO1, AO1, 4 * SIZE
  3428. STFD y02, 1 * SIZE(Y2)
  3429. add Y2, Y2, INCY
  3430. STFD y03, 0 * SIZE(Y2)
  3431. nop
  3432. STFD y04, 1 * SIZE(Y2)
  3433. add Y2, Y2, INCY
  /*
   * LL(137): single-column pass, final remainder step taken when bit 0 of
   * M is set.  Updates one y entry (y01/y02) with one column entry
   * (a1/a2) and then falls through to the epilogue.
   */
  3434. .align 4
  3435. LL(137):
  /* Non-negative andi. result: ble is taken exactly when bit 0 of M is clear. */
  3436. andi. r0, M, 1
  3437. ble LL(999)
  3438. LFD y01, 0 * SIZE(Y1)
  3439. nop
  3440. LFD y02, 1 * SIZE(Y1)
  3441. add Y1, Y1, INCY
  3442. LFD a1, 0 * SIZE(AO1)
  3443. LFD a2, 1 * SIZE(AO1)
  3444. FMADD y01, alpha1r, a1, y01
  3445. FMADD y02, alpha1i, a1, y02
  3446. FMSUBX y01, alpha1i, a2, y01
  3447. FMADDX y02, alpha1r, a2, y02
  3448. STFD y01, 0 * SIZE(Y2)
  3449. nop
  3450. STFD y02, 1 * SIZE(Y2)
  3451. add Y2, Y2, INCY
  /*
   * LL(999): common exit.  Loads 0 into r3 (the integer return-value
   * register under the PowerPC ELF ABIs), reloads f14..f31 and r14..r22
   * from the stack frame, releases the frame (STACKSIZE bytes) and
   * returns.  The matching saves are presumably in the prologue, which is
   * outside this chunk (TODO confirm the offsets agree).
   */
  3452. .align 4
  3453. LL(999):
  3454. li r3, 0
  /* Floating-point registers: eighteen 8-byte slots at 0..136(SP). */
  3455. lfd f14, 0(SP)
  3456. lfd f15, 8(SP)
  3457. lfd f16, 16(SP)
  3458. lfd f17, 24(SP)
  3459. lfd f18, 32(SP)
  3460. lfd f19, 40(SP)
  3461. lfd f20, 48(SP)
  3462. lfd f21, 56(SP)
  3463. lfd f22, 64(SP)
  3464. lfd f23, 72(SP)
  3465. lfd f24, 80(SP)
  3466. lfd f25, 88(SP)
  3467. lfd f26, 96(SP)
  3468. lfd f27, 104(SP)
  3469. lfd f28, 112(SP)
  3470. lfd f29, 120(SP)
  3471. lfd f30, 128(SP)
  3472. lfd f31, 136(SP)
  /* General-purpose registers start at 144(SP): 8-byte slots (ld) when
     __64BIT__ is defined, 4-byte slots (lwz) otherwise. */
  3473. #ifdef __64BIT__
  3474. ld r14, 144(SP)
  3475. ld r15, 152(SP)
  3476. ld r16, 160(SP)
  3477. ld r17, 168(SP)
  3478. ld r18, 176(SP)
  3479. ld r19, 184(SP)
  3480. ld r20, 192(SP)
  3481. ld r21, 200(SP)
  3482. ld r22, 208(SP)
  3483. #else
  3484. lwz r14, 144(SP)
  3485. lwz r15, 148(SP)
  3486. lwz r16, 152(SP)
  3487. lwz r17, 156(SP)
  3488. lwz r18, 160(SP)
  3489. lwz r19, 164(SP)
  3490. lwz r20, 168(SP)
  3491. lwz r21, 172(SP)
  3492. lwz r22, 176(SP)
  3493. #endif
  /* Pop the frame and return to the caller. */
  3494. addi SP, SP, STACKSIZE
  3495. blr
  /* EPILOGUE is a macro defined outside this chunk. */
  3496. EPILOGUE
  3497. #endif