You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

zgemv_n.S 87 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
7427842794280428142824283428442854286428742884289429042914292429342944295
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Symbolic names for the routine's operands, mapped onto general-purpose
     registers.  M is r3 and N is r4 in every variant; the registers used for
     A, LDA, X, INCX, Y and INCY depend on the OS macro and on __64BIT__
     (and, for AIX / Apple, on DOUBLE). */
  /* Linux / FreeBSD.  The prologue reloads INCY (32-bit build) or INCX, Y
     and INCY (64-bit build) from FRAMESLOT(n) + STACKSIZE(SP). */
  40. #if defined(linux) || defined(__FreeBSD__)
  41. #ifndef __64BIT__
  42. #define M r3
  43. #define N r4
  44. #define A r6
  45. #define LDA r7
  46. #define X r8
  47. #define INCX r9
  48. #define Y r10
  49. #define INCY r5
  50. #else
  51. #define M r3
  52. #define N r4
  53. #define A r8
  54. #define LDA r9
  55. #define X r10
  56. #define INCX r5
  57. #define Y r6
  58. #define INCY r7
  59. #endif
  60. #endif
  /* AIX / Apple.  For a 32-bit DOUBLE build the prologue reloads LDA, X,
     INCX, Y and INCY from FRAMESLOT(0..4) + STACKSIZE(SP); in the other
     builds it reloads only INCX, Y and INCY from FRAMESLOT(0..2). */
  61. #if defined(_AIX) || defined(__APPLE__)
  62. #if !defined(__64BIT__) && defined(DOUBLE)
  63. #define M r3
  64. #define N r4
  65. #define A r10
  66. #define LDA r5
  67. #define X r6
  68. #define INCX r7
  69. #define Y r8
  70. #define INCY r9
  71. #else
  72. #define M r3
  73. #define N r4
  74. #define A r8
  75. #define LDA r9
  76. #define X r10
  77. #define INCX r5
  78. #define Y r6
  79. #define INCY r7
  80. #endif
  81. #endif
  /* Working-register aliases.  r14-r22 and f14-f31 are spilled to the stack
     by the prologue before any of these names is written. */
  /* J is set to N >> 2 right after the argument checks.
     NOTE(review): I is not referenced in the part of the routine reviewed
     here - confirm its use further down the file. */
  82. #define I r11
  83. #define J r12
  /* AO1..AO4 walk four slices of A that start LDA bytes apart
     (AO1 = A, AO2 = A + LDA, ...); presumably four consecutive columns. */
  84. #define AO1 r14
  85. #define AO2 r15
  86. #define AO3 r16
  87. #define AO4 r17
  /* LDA4 = LDA << (ZBASE_SHIFT + 2), computed in the prologue. */
  88. #define LDA4 r18
  /* Y1 is the pointer y is loaded from, Y2 the pointer results are stored
     through; both start at Y for each pass of LL(11). */
  89. #define Y1 r19
  90. #define Y2 r20
  /* Byte offsets handed to DCBT: PREA with the AO pointers, PREC with Y1. */
  91. #define PREA r21
  92. #define PREC r22
  /* y01..y16 hold 16 consecutive SIZE-sized elements of y while they are
     being accumulated. */
  93. #define y01 f0
  94. #define y02 f1
  95. #define y03 f2
  96. #define y04 f3
  97. #define y05 f4
  98. #define y06 f5
  99. #define y07 f6
  100. #define y08 f7
  101. #define y09 f8
  102. #define y10 f9
  103. #define y11 f10
  104. #define y12 f11
  105. #define y13 f12
  106. #define y14 f13
  107. #define y15 f14
  108. #define y16 f15
  /* alphaNr / alphaNi: r and i parts of alpha combined with the N-th of the
     four x elements read at the top of LL(11). */
  109. #define alpha1r f16
  110. #define alpha1i f17
  111. #define alpha2r f18
  112. #define alpha2i f19
  113. #define alpha3r f20
  114. #define alpha3i f21
  115. #define alpha4r f22
  116. #define alpha4i f23
  /* a1..a8: operand registers; they first receive the x elements and are
     then reused for the values streamed from AO1..AO4. */
  117. #define a1 f24
  118. #define a2 f25
  119. #define a3 f26
  120. #define a4 f27
  121. #define a5 f28
  122. #define a6 f29
  123. #define a7 f30
  124. #define a8 f31
  /* alpha_r / alpha_i deliberately share f14 / f15 with y15 / y16: they are
     reloaded from ALPHA_R / ALPHA_I at LL(11) and fully consumed by the
     alphaN computation before y15 / y16 are loaded. */
  125. #define alpha_r f14
  126. #define alpha_i f15
  /* Software-prefetch distances, counted in elements.  The prologue scales
     them by SIZE and loads them into PREA / PREC, which are then passed to
     DCBT together with the A pointers (PREA) and with Y1 (PREC).  One pair
     of values is selected per CPU target macro. */
  127. #if defined(PPCG4)
  128. #define PREFETCHSIZE_A 34
  129. #define PREFETCHSIZE_C 16
  130. #endif
  131. #if defined(PPC440) || defined(PPC440FP2)
  132. #define PREFETCHSIZE_A 34
  133. #define PREFETCHSIZE_C 16
  134. #endif
  135. #ifdef PPC970
  136. #define PREFETCHSIZE_A 56
  137. #define PREFETCHSIZE_C 16
  138. #endif
  139. #ifdef CELL
  140. #define PREFETCHSIZE_A 56
  141. #define PREFETCHSIZE_C 16
  142. #endif
  143. #ifdef POWER4
  144. #define PREFETCHSIZE_A 34
  145. #define PREFETCHSIZE_C 16
  146. #endif
  147. #ifdef POWER5
  148. #define PREFETCHSIZE_A 40
  149. #define PREFETCHSIZE_C 24
  150. #endif
  151. #ifdef POWER6
  152. #define PREFETCHSIZE_A 24
  153. #define PREFETCHSIZE_C 24
  154. #endif
  155. #ifdef POWER8
  156. #define PREFETCHSIZE_A 24
  157. #define PREFETCHSIZE_C 24
  158. #endif
  /* Fallback for CPU targets that are not listed above.  Without it the
     names stay undefined and "li PREA, PREFETCHSIZE_A * SIZE" in the
     prologue cannot be assembled.  The distance only feeds a prefetch hint,
     so it influences performance, never the computed result; the values of
     the most recent tuned target (POWER8) are reused. */
  #ifndef PREFETCHSIZE_A
  #define PREFETCHSIZE_A 24
  #endif
  #ifndef PREFETCHSIZE_C
  #define PREFETCHSIZE_C 24
  #endif
  /* Sign selection for the second half of each complex multiply.
     FMADD and FNMSUB are macros defined outside this file; presumably they
     map to the PowerPC fmadd / fnmsub instructions - confirm in common.h. */
  /* FMADDR / FMSUBR are used where alpha is combined with the x elements;
     defining XCONJ swaps the two operations. */
  159. #ifndef XCONJ
  160. #define FMADDR FMADD
  161. #define FMSUBR FNMSUB
  162. #else
  163. #define FMADDR FNMSUB
  164. #define FMSUBR FMADD
  165. #endif
  /* FMADDX / FMSUBX are used where the elements of A are accumulated into
     y01..y16; defining CONJ swaps the two operations. */
  166. #ifndef CONJ
  167. #define FMADDX FMADD
  168. #define FMSUBX FNMSUB
  169. #else
  170. #define FMADDX FNMSUB
  171. #define FMSUBX FMADD
  172. #endif
  173. #ifndef NEEDPARAM
  /* Stack frame layout.  The prologue lowers SP by STACKSIZE, saves f14-f31
     at 0..136(SP) and r14-r22 from 144(SP) upward (4-byte slots in the
     32-bit build, 8-byte slots in the 64-bit build), and stores the incoming
     f1 / f2 into the ALPHA_R / ALPHA_I slots so they can be reloaded at the
     top of LL(11). */
  174. #ifndef __64BIT__
  175. #define STACKSIZE 224
  176. #define ALPHA_R 208(SP)
  177. #define ALPHA_I 216(SP)
  178. #else
  179. #define STACKSIZE 280
  180. #define ALPHA_R 256(SP)
  181. #define ALPHA_I 264(SP)
  182. #endif
  183. PROLOGUE
  184. PROFCODE
  185. addi SP, SP, -STACKSIZE
  186. li r0, 0
  187. stfd f14, 0(SP)
  188. stfd f15, 8(SP)
  189. stfd f16, 16(SP)
  190. stfd f17, 24(SP)
  191. stfd f18, 32(SP)
  192. stfd f19, 40(SP)
  193. stfd f20, 48(SP)
  194. stfd f21, 56(SP)
  195. stfd f22, 64(SP)
  196. stfd f23, 72(SP)
  197. stfd f24, 80(SP)
  198. stfd f25, 88(SP)
  199. stfd f26, 96(SP)
  200. stfd f27, 104(SP)
  201. stfd f28, 112(SP)
  202. stfd f29, 120(SP)
  203. stfd f30, 128(SP)
  204. stfd f31, 136(SP)
  205. #ifdef __64BIT__
  206. std r14, 144(SP)
  207. std r15, 152(SP)
  208. std r16, 160(SP)
  209. std r17, 168(SP)
  210. std r18, 176(SP)
  211. std r19, 184(SP)
  212. std r20, 192(SP)
  213. std r21, 200(SP)
  214. std r22, 208(SP)
  215. #else
  216. stw r14, 144(SP)
  217. stw r15, 148(SP)
  218. stw r16, 152(SP)
  219. stw r17, 156(SP)
  220. stw r18, 160(SP)
  221. stw r19, 164(SP)
  222. stw r20, 168(SP)
  223. stw r21, 172(SP)
  224. stw r22, 176(SP)
  225. #endif
  226. #if defined(linux) || defined(__FreeBSD__)
  227. #ifndef __64BIT__
  228. lwz INCY, FRAMESLOT(0) + STACKSIZE(SP)
  229. #else
  230. ld INCX, FRAMESLOT(0) + STACKSIZE(SP)
  231. ld Y, FRAMESLOT(1) + STACKSIZE(SP)
  232. ld INCY, FRAMESLOT(2) + STACKSIZE(SP)
  233. #endif
  234. #endif
  235. #if defined(_AIX) || defined(__APPLE__)
  236. #ifndef __64BIT__
  237. #ifdef DOUBLE
  238. lwz LDA, FRAMESLOT(0) + STACKSIZE(SP)
  239. lwz X, FRAMESLOT(1) + STACKSIZE(SP)
  240. lwz INCX, FRAMESLOT(2) + STACKSIZE(SP)
  241. lwz Y, FRAMESLOT(3) + STACKSIZE(SP)
  242. lwz INCY, FRAMESLOT(4) + STACKSIZE(SP)
  243. #else
  244. lwz INCX, FRAMESLOT(0) + STACKSIZE(SP)
  245. lwz Y, FRAMESLOT(1) + STACKSIZE(SP)
  246. lwz INCY, FRAMESLOT(2) + STACKSIZE(SP)
  247. #endif
  248. #else
  249. ld INCX, FRAMESLOT(0) + STACKSIZE(SP)
  250. ld Y, FRAMESLOT(1) + STACKSIZE(SP)
  251. ld INCY, FRAMESLOT(2) + STACKSIZE(SP)
  252. #endif
  253. #endif
  254. stfd f1, ALPHA_R
  255. stfd f2, ALPHA_I
  256. slwi LDA4, LDA, ZBASE_SHIFT + 2
  257. slwi LDA, LDA, ZBASE_SHIFT
  258. slwi INCX, INCX, ZBASE_SHIFT
  259. slwi INCY, INCY, ZBASE_SHIFT
  260. li PREA, PREFETCHSIZE_A * SIZE
  261. li PREC, PREFETCHSIZE_C * SIZE
  262. cmpwi cr0, M, 0
  263. ble- LL(999)
  264. cmpwi cr0, N, 0
  265. ble- LL(999)
  266. cmpi cr0, 0, INCY, 2 * SIZE
  267. bne LL(100)
  268. srawi. J, N, 2
  269. ble LL(20)
  270. .align 4
  271. LL(11):
  272. lfd alpha_r, ALPHA_R
  273. lfd alpha_i, ALPHA_I
  274. LFD a1, 0 * SIZE(X)
  275. LFD a2, 1 * SIZE(X)
  276. add X, X, INCX
  277. LFD a3, 0 * SIZE(X)
  278. LFD a4, 1 * SIZE(X)
  279. add X, X, INCX
  280. LFD a5, 0 * SIZE(X)
  281. LFD a6, 1 * SIZE(X)
  282. add X, X, INCX
  283. LFD a7, 0 * SIZE(X)
  284. LFD a8, 1 * SIZE(X)
  285. add X, X, INCX
  286. FMUL alpha1r, alpha_r, a1
  287. FMUL alpha1i, alpha_i, a1
  288. FMUL alpha2r, alpha_r, a3
  289. FMUL alpha2i, alpha_i, a3
  290. FMUL alpha3r, alpha_r, a5
  291. FMUL alpha3i, alpha_i, a5
  292. FMUL alpha4r, alpha_r, a7
  293. FMUL alpha4i, alpha_i, a7
  294. FMSUBR alpha1r, alpha_i, a2, alpha1r
  295. FMADDR alpha1i, alpha_r, a2, alpha1i
  296. FMSUBR alpha2r, alpha_i, a4, alpha2r
  297. FMADDR alpha2i, alpha_r, a4, alpha2i
  298. FMSUBR alpha3r, alpha_i, a6, alpha3r
  299. FMADDR alpha3i, alpha_r, a6, alpha3i
  300. FMSUBR alpha4r, alpha_i, a8, alpha4r
  301. FMADDR alpha4i, alpha_r, a8, alpha4i
  302. mr AO1, A
  303. add AO2, A, LDA
  304. add AO3, AO2, LDA
  305. add AO4, AO3, LDA
  306. add A, AO4, LDA
  307. mr Y1, Y
  308. mr Y2, Y
  309. srawi. r0, M, 3
  310. mtspr CTR, r0
  311. ble LL(15)
  312. .align 4
  313. LFD a1, 0 * SIZE(AO1)
  314. LFD a2, 1 * SIZE(AO1)
  315. LFD a3, 2 * SIZE(AO1)
  316. LFD a4, 3 * SIZE(AO1)
  317. LFD y01, 0 * SIZE(Y1)
  318. LFD y02, 1 * SIZE(Y1)
  319. LFD y03, 2 * SIZE(Y1)
  320. LFD y04, 3 * SIZE(Y1)
  321. LFD a5, 4 * SIZE(AO1)
  322. LFD a6, 5 * SIZE(AO1)
  323. LFD a7, 6 * SIZE(AO1)
  324. LFD a8, 7 * SIZE(AO1)
  325. LFD y05, 4 * SIZE(Y1)
  326. LFD y06, 5 * SIZE(Y1)
  327. LFD y07, 6 * SIZE(Y1)
  328. LFD y08, 7 * SIZE(Y1)
  329. LFD y09, 8 * SIZE(Y1)
  330. LFD y10, 9 * SIZE(Y1)
  331. LFD y11, 10 * SIZE(Y1)
  332. LFD y12, 11 * SIZE(Y1)
  333. LFD y13, 12 * SIZE(Y1)
  334. LFD y14, 13 * SIZE(Y1)
  335. LFD y15, 14 * SIZE(Y1)
  336. LFD y16, 15 * SIZE(Y1)
  337. addi Y1, Y1, 16 * SIZE
  338. bdz LL(13)
  339. .align 4
  340. LL(12):
  341. FMADD y01, alpha1r, a1, y01
  342. FMADD y02, alpha1i, a1, y02
  343. FMADD y03, alpha1r, a3, y03
  344. FMADD y04, alpha1i, a3, y04
  345. FMADD y05, alpha1r, a5, y05
  346. FMADD y06, alpha1i, a5, y06
  347. FMADD y07, alpha1r, a7, y07
  348. FMADD y08, alpha1i, a7, y08
  349. LFD a1, 8 * SIZE(AO1)
  350. LFD a3, 10 * SIZE(AO1)
  351. LFD a5, 12 * SIZE(AO1)
  352. LFD a7, 14 * SIZE(AO1)
  353. FMSUBX y01, alpha1i, a2, y01
  354. FMADDX y02, alpha1r, a2, y02
  355. FMSUBX y03, alpha1i, a4, y03
  356. FMADDX y04, alpha1r, a4, y04
  357. FMSUBX y05, alpha1i, a6, y05
  358. FMADDX y06, alpha1r, a6, y06
  359. FMSUBX y07, alpha1i, a8, y07
  360. FMADDX y08, alpha1r, a8, y08
  361. LFD a2, 9 * SIZE(AO1)
  362. LFD a4, 11 * SIZE(AO1)
  363. LFD a6, 13 * SIZE(AO1)
  364. LFD a8, 15 * SIZE(AO1)
  365. addi AO1, AO1, 16 * SIZE
  366. nop
  367. DCBT(AO1, PREA)
  368. nop
  369. FMADD y09, alpha1r, a1, y09
  370. FMADD y10, alpha1i, a1, y10
  371. FMADD y11, alpha1r, a3, y11
  372. FMADD y12, alpha1i, a3, y12
  373. FMADD y13, alpha1r, a5, y13
  374. FMADD y14, alpha1i, a5, y14
  375. FMADD y15, alpha1r, a7, y15
  376. FMADD y16, alpha1i, a7, y16
  377. LFD a1, 0 * SIZE(AO2)
  378. LFD a3, 2 * SIZE(AO2)
  379. LFD a5, 4 * SIZE(AO2)
  380. LFD a7, 6 * SIZE(AO2)
  381. FMSUBX y09, alpha1i, a2, y09
  382. FMADDX y10, alpha1r, a2, y10
  383. FMSUBX y11, alpha1i, a4, y11
  384. FMADDX y12, alpha1r, a4, y12
  385. FMSUBX y13, alpha1i, a6, y13
  386. FMADDX y14, alpha1r, a6, y14
  387. FMSUBX y15, alpha1i, a8, y15
  388. FMADDX y16, alpha1r, a8, y16
  389. LFD a2, 1 * SIZE(AO2)
  390. LFD a4, 3 * SIZE(AO2)
  391. LFD a6, 5 * SIZE(AO2)
  392. LFD a8, 7 * SIZE(AO2)
  393. FMADD y01, alpha2r, a1, y01
  394. FMADD y02, alpha2i, a1, y02
  395. FMADD y03, alpha2r, a3, y03
  396. FMADD y04, alpha2i, a3, y04
  397. FMADD y05, alpha2r, a5, y05
  398. FMADD y06, alpha2i, a5, y06
  399. FMADD y07, alpha2r, a7, y07
  400. FMADD y08, alpha2i, a7, y08
  401. LFD a1, 8 * SIZE(AO2)
  402. LFD a3, 10 * SIZE(AO2)
  403. LFD a5, 12 * SIZE(AO2)
  404. LFD a7, 14 * SIZE(AO2)
  405. FMSUBX y01, alpha2i, a2, y01
  406. FMADDX y02, alpha2r, a2, y02
  407. FMSUBX y03, alpha2i, a4, y03
  408. FMADDX y04, alpha2r, a4, y04
  409. FMSUBX y05, alpha2i, a6, y05
  410. FMADDX y06, alpha2r, a6, y06
  411. FMSUBX y07, alpha2i, a8, y07
  412. FMADDX y08, alpha2r, a8, y08
  413. LFD a2, 9 * SIZE(AO2)
  414. LFD a4, 11 * SIZE(AO2)
  415. LFD a6, 13 * SIZE(AO2)
  416. LFD a8, 15 * SIZE(AO2)
  417. addi AO2, AO2, 16 * SIZE
  418. nop
  419. DCBT(AO2, PREA)
  420. nop
  421. FMADD y09, alpha2r, a1, y09
  422. FMADD y10, alpha2i, a1, y10
  423. FMADD y11, alpha2r, a3, y11
  424. FMADD y12, alpha2i, a3, y12
  425. FMADD y13, alpha2r, a5, y13
  426. FMADD y14, alpha2i, a5, y14
  427. FMADD y15, alpha2r, a7, y15
  428. FMADD y16, alpha2i, a7, y16
  429. LFD a1, 0 * SIZE(AO3)
  430. LFD a3, 2 * SIZE(AO3)
  431. LFD a5, 4 * SIZE(AO3)
  432. LFD a7, 6 * SIZE(AO3)
  433. FMSUBX y09, alpha2i, a2, y09
  434. FMADDX y10, alpha2r, a2, y10
  435. FMSUBX y11, alpha2i, a4, y11
  436. FMADDX y12, alpha2r, a4, y12
  437. FMSUBX y13, alpha2i, a6, y13
  438. FMADDX y14, alpha2r, a6, y14
  439. FMSUBX y15, alpha2i, a8, y15
  440. FMADDX y16, alpha2r, a8, y16
  441. LFD a2, 1 * SIZE(AO3)
  442. LFD a4, 3 * SIZE(AO3)
  443. LFD a6, 5 * SIZE(AO3)
  444. LFD a8, 7 * SIZE(AO3)
  445. FMADD y01, alpha3r, a1, y01
  446. FMADD y02, alpha3i, a1, y02
  447. FMADD y03, alpha3r, a3, y03
  448. FMADD y04, alpha3i, a3, y04
  449. FMADD y05, alpha3r, a5, y05
  450. FMADD y06, alpha3i, a5, y06
  451. FMADD y07, alpha3r, a7, y07
  452. FMADD y08, alpha3i, a7, y08
  453. LFD a1, 8 * SIZE(AO3)
  454. LFD a3, 10 * SIZE(AO3)
  455. LFD a5, 12 * SIZE(AO3)
  456. LFD a7, 14 * SIZE(AO3)
  457. FMSUBX y01, alpha3i, a2, y01
  458. FMADDX y02, alpha3r, a2, y02
  459. FMSUBX y03, alpha3i, a4, y03
  460. FMADDX y04, alpha3r, a4, y04
  461. FMSUBX y05, alpha3i, a6, y05
  462. FMADDX y06, alpha3r, a6, y06
  463. FMSUBX y07, alpha3i, a8, y07
  464. FMADDX y08, alpha3r, a8, y08
  465. LFD a2, 9 * SIZE(AO3)
  466. LFD a4, 11 * SIZE(AO3)
  467. LFD a6, 13 * SIZE(AO3)
  468. LFD a8, 15 * SIZE(AO3)
  469. addi AO3, AO3, 16 * SIZE
  470. nop
  471. DCBT(AO3, PREA)
  472. nop
  473. FMADD y09, alpha3r, a1, y09
  474. FMADD y10, alpha3i, a1, y10
  475. FMADD y11, alpha3r, a3, y11
  476. FMADD y12, alpha3i, a3, y12
  477. FMADD y13, alpha3r, a5, y13
  478. FMADD y14, alpha3i, a5, y14
  479. FMADD y15, alpha3r, a7, y15
  480. FMADD y16, alpha3i, a7, y16
  481. LFD a1, 0 * SIZE(AO4)
  482. LFD a3, 2 * SIZE(AO4)
  483. LFD a5, 4 * SIZE(AO4)
  484. LFD a7, 6 * SIZE(AO4)
  485. FMSUBX y09, alpha3i, a2, y09
  486. FMADDX y10, alpha3r, a2, y10
  487. FMSUBX y11, alpha3i, a4, y11
  488. FMADDX y12, alpha3r, a4, y12
  489. FMSUBX y13, alpha3i, a6, y13
  490. FMADDX y14, alpha3r, a6, y14
  491. FMSUBX y15, alpha3i, a8, y15
  492. FMADDX y16, alpha3r, a8, y16
  493. LFD a2, 1 * SIZE(AO4)
  494. LFD a4, 3 * SIZE(AO4)
  495. LFD a6, 5 * SIZE(AO4)
  496. LFD a8, 7 * SIZE(AO4)
  497. FMADD y01, alpha4r, a1, y01
  498. FMADD y02, alpha4i, a1, y02
  499. FMADD y03, alpha4r, a3, y03
  500. FMADD y04, alpha4i, a3, y04
  501. FMADD y05, alpha4r, a5, y05
  502. FMADD y06, alpha4i, a5, y06
  503. FMADD y07, alpha4r, a7, y07
  504. FMADD y08, alpha4i, a7, y08
  505. LFD a1, 8 * SIZE(AO4)
  506. LFD a3, 10 * SIZE(AO4)
  507. LFD a5, 12 * SIZE(AO4)
  508. LFD a7, 14 * SIZE(AO4)
  509. FMSUBX y01, alpha4i, a2, y01
  510. FMADDX y02, alpha4r, a2, y02
  511. FMSUBX y03, alpha4i, a4, y03
  512. FMADDX y04, alpha4r, a4, y04
  513. STFD y01, 0 * SIZE(Y2)
  514. STFD y02, 1 * SIZE(Y2)
  515. STFD y03, 2 * SIZE(Y2)
  516. STFD y04, 3 * SIZE(Y2)
  517. LFD y01, 0 * SIZE(Y1)
  518. LFD y02, 1 * SIZE(Y1)
  519. LFD y03, 2 * SIZE(Y1)
  520. LFD y04, 3 * SIZE(Y1)
  521. FMSUBX y05, alpha4i, a6, y05
  522. FMADDX y06, alpha4r, a6, y06
  523. FMSUBX y07, alpha4i, a8, y07
  524. FMADDX y08, alpha4r, a8, y08
  525. LFD a2, 9 * SIZE(AO4)
  526. LFD a4, 11 * SIZE(AO4)
  527. LFD a6, 13 * SIZE(AO4)
  528. LFD a8, 15 * SIZE(AO4)
  529. addi AO4, AO4, 16 * SIZE
  530. nop
  531. DCBT(AO4, PREA)
  532. nop
  533. STFD y05, 4 * SIZE(Y2)
  534. STFD y06, 5 * SIZE(Y2)
  535. STFD y07, 6 * SIZE(Y2)
  536. STFD y08, 7 * SIZE(Y2)
  537. LFD y05, 4 * SIZE(Y1)
  538. LFD y06, 5 * SIZE(Y1)
  539. LFD y07, 6 * SIZE(Y1)
  540. LFD y08, 7 * SIZE(Y1)
  541. FMADD y09, alpha4r, a1, y09
  542. FMADD y10, alpha4i, a1, y10
  543. FMADD y11, alpha4r, a3, y11
  544. FMADD y12, alpha4i, a3, y12
  545. FMADD y13, alpha4r, a5, y13
  546. FMADD y14, alpha4i, a5, y14
  547. FMADD y15, alpha4r, a7, y15
  548. FMADD y16, alpha4i, a7, y16
  549. LFD a1, 0 * SIZE(AO1)
  550. LFD a3, 2 * SIZE(AO1)
  551. LFD a5, 4 * SIZE(AO1)
  552. LFD a7, 6 * SIZE(AO1)
  553. FMSUBX y09, alpha4i, a2, y09
  554. FMADDX y10, alpha4r, a2, y10
  555. FMSUBX y11, alpha4i, a4, y11
  556. FMADDX y12, alpha4r, a4, y12
  557. STFD y09, 8 * SIZE(Y2)
  558. STFD y10, 9 * SIZE(Y2)
  559. STFD y11, 10 * SIZE(Y2)
  560. STFD y12, 11 * SIZE(Y2)
  561. LFD y09, 8 * SIZE(Y1)
  562. LFD y10, 9 * SIZE(Y1)
  563. LFD y11, 10 * SIZE(Y1)
  564. LFD y12, 11 * SIZE(Y1)
  565. FMSUBX y13, alpha4i, a6, y13
  566. FMADDX y14, alpha4r, a6, y14
  567. FMSUBX y15, alpha4i, a8, y15
  568. FMADDX y16, alpha4r, a8, y16
  569. LFD a2, 1 * SIZE(AO1)
  570. LFD a4, 3 * SIZE(AO1)
  571. LFD a6, 5 * SIZE(AO1)
  572. LFD a8, 7 * SIZE(AO1)
  573. STFD y13, 12 * SIZE(Y2)
  574. STFD y14, 13 * SIZE(Y2)
  575. STFD y15, 14 * SIZE(Y2)
  576. STFD y16, 15 * SIZE(Y2)
  577. LFD y13, 12 * SIZE(Y1)
  578. LFD y14, 13 * SIZE(Y1)
  579. LFD y15, 14 * SIZE(Y1)
  580. LFD y16, 15 * SIZE(Y1)
  581. addi Y2, Y2, 16 * SIZE
  582. addi Y1, Y1, 16 * SIZE
  583. DCBT(Y1, PREC)
  584. bdnz LL(12)
  /*
   * LL(13): drain stage of the 4-column pass (column pointers AO1..AO4).
   * Accumulates the alpha1..alpha4 terms for one 16 * SIZE block into
   * y01..y16 and stores the block to 0..15 * SIZE(Y2).  No further data
   * is loaded from Y1 here.
   * On the fall-through path from LL(12), y01..y16 and a1..a8
   * (0..7 * SIZE(AO1)) have already been loaded by the loop body.
   * Operands at even SIZE offsets go through FMADD, operands at odd
   * offsets through FMSUBX / FMADDX -- presumably the (re, im) halves of
   * complex elements, given the file name zgemv_n.S.  Those macros are
   * defined outside this view, so the sign of each term is decided there.
   */
  585. .align 4
  586. LL(13):
  /* alpha1 with AO1 offsets 0..7 -> y01..y08 */
  587. FMADD y01, alpha1r, a1, y01
  588. FMADD y02, alpha1i, a1, y02
  589. FMADD y03, alpha1r, a3, y03
  590. FMADD y04, alpha1i, a3, y04
  591. FMADD y05, alpha1r, a5, y05
  592. FMADD y06, alpha1i, a5, y06
  593. FMADD y07, alpha1r, a7, y07
  594. FMADD y08, alpha1i, a7, y08
  595. LFD a1, 8 * SIZE(AO1)
  596. LFD a3, 10 * SIZE(AO1)
  597. LFD a5, 12 * SIZE(AO1)
  598. LFD a7, 14 * SIZE(AO1)
  599. FMSUBX y01, alpha1i, a2, y01
  600. FMADDX y02, alpha1r, a2, y02
  601. FMSUBX y03, alpha1i, a4, y03
  602. FMADDX y04, alpha1r, a4, y04
  603. FMSUBX y05, alpha1i, a6, y05
  604. FMADDX y06, alpha1r, a6, y06
  605. FMSUBX y07, alpha1i, a8, y07
  606. FMADDX y08, alpha1r, a8, y08
  607. LFD a2, 9 * SIZE(AO1)
  608. LFD a4, 11 * SIZE(AO1)
  609. LFD a6, 13 * SIZE(AO1)
  610. LFD a8, 15 * SIZE(AO1)
  /* alpha1 with AO1 offsets 8..15 -> y09..y16 */
  611. FMADD y09, alpha1r, a1, y09
  612. FMADD y10, alpha1i, a1, y10
  613. FMADD y11, alpha1r, a3, y11
  614. FMADD y12, alpha1i, a3, y12
  615. FMADD y13, alpha1r, a5, y13
  616. FMADD y14, alpha1i, a5, y14
  617. FMADD y15, alpha1r, a7, y15
  618. FMADD y16, alpha1i, a7, y16
  619. LFD a1, 0 * SIZE(AO2)
  620. LFD a3, 2 * SIZE(AO2)
  621. LFD a5, 4 * SIZE(AO2)
  622. LFD a7, 6 * SIZE(AO2)
  623. FMSUBX y09, alpha1i, a2, y09
  624. FMADDX y10, alpha1r, a2, y10
  625. FMSUBX y11, alpha1i, a4, y11
  626. FMADDX y12, alpha1r, a4, y12
  627. FMSUBX y13, alpha1i, a6, y13
  628. FMADDX y14, alpha1r, a6, y14
  629. FMSUBX y15, alpha1i, a8, y15
  630. FMADDX y16, alpha1r, a8, y16
  631. LFD a2, 1 * SIZE(AO2)
  632. LFD a4, 3 * SIZE(AO2)
  633. LFD a6, 5 * SIZE(AO2)
  634. LFD a8, 7 * SIZE(AO2)
  /* alpha2 with AO2 offsets 0..7 -> y01..y08 */
  635. FMADD y01, alpha2r, a1, y01
  636. FMADD y02, alpha2i, a1, y02
  637. FMADD y03, alpha2r, a3, y03
  638. FMADD y04, alpha2i, a3, y04
  639. FMADD y05, alpha2r, a5, y05
  640. FMADD y06, alpha2i, a5, y06
  641. FMADD y07, alpha2r, a7, y07
  642. FMADD y08, alpha2i, a7, y08
  643. LFD a1, 8 * SIZE(AO2)
  644. LFD a3, 10 * SIZE(AO2)
  645. LFD a5, 12 * SIZE(AO2)
  646. LFD a7, 14 * SIZE(AO2)
  647. FMSUBX y01, alpha2i, a2, y01
  648. FMADDX y02, alpha2r, a2, y02
  649. FMSUBX y03, alpha2i, a4, y03
  650. FMADDX y04, alpha2r, a4, y04
  651. FMSUBX y05, alpha2i, a6, y05
  652. FMADDX y06, alpha2r, a6, y06
  653. FMSUBX y07, alpha2i, a8, y07
  654. FMADDX y08, alpha2r, a8, y08
  655. LFD a2, 9 * SIZE(AO2)
  656. LFD a4, 11 * SIZE(AO2)
  657. LFD a6, 13 * SIZE(AO2)
  658. LFD a8, 15 * SIZE(AO2)
  /* alpha2 with AO2 offsets 8..15 -> y09..y16 */
  659. FMADD y09, alpha2r, a1, y09
  660. FMADD y10, alpha2i, a1, y10
  661. FMADD y11, alpha2r, a3, y11
  662. FMADD y12, alpha2i, a3, y12
  663. FMADD y13, alpha2r, a5, y13
  664. FMADD y14, alpha2i, a5, y14
  665. FMADD y15, alpha2r, a7, y15
  666. FMADD y16, alpha2i, a7, y16
  667. LFD a1, 0 * SIZE(AO3)
  668. LFD a3, 2 * SIZE(AO3)
  669. LFD a5, 4 * SIZE(AO3)
  670. LFD a7, 6 * SIZE(AO3)
  671. FMSUBX y09, alpha2i, a2, y09
  672. FMADDX y10, alpha2r, a2, y10
  673. FMSUBX y11, alpha2i, a4, y11
  674. FMADDX y12, alpha2r, a4, y12
  675. FMSUBX y13, alpha2i, a6, y13
  676. FMADDX y14, alpha2r, a6, y14
  677. FMSUBX y15, alpha2i, a8, y15
  678. FMADDX y16, alpha2r, a8, y16
  679. LFD a2, 1 * SIZE(AO3)
  680. LFD a4, 3 * SIZE(AO3)
  681. LFD a6, 5 * SIZE(AO3)
  682. LFD a8, 7 * SIZE(AO3)
  /* alpha3 with AO3 offsets 0..7 -> y01..y08 */
  683. FMADD y01, alpha3r, a1, y01
  684. FMADD y02, alpha3i, a1, y02
  685. FMADD y03, alpha3r, a3, y03
  686. FMADD y04, alpha3i, a3, y04
  687. FMADD y05, alpha3r, a5, y05
  688. FMADD y06, alpha3i, a5, y06
  689. FMADD y07, alpha3r, a7, y07
  690. FMADD y08, alpha3i, a7, y08
  691. LFD a1, 8 * SIZE(AO3)
  692. LFD a3, 10 * SIZE(AO3)
  693. LFD a5, 12 * SIZE(AO3)
  694. LFD a7, 14 * SIZE(AO3)
  695. FMSUBX y01, alpha3i, a2, y01
  696. FMADDX y02, alpha3r, a2, y02
  697. FMSUBX y03, alpha3i, a4, y03
  698. FMADDX y04, alpha3r, a4, y04
  699. FMSUBX y05, alpha3i, a6, y05
  700. FMADDX y06, alpha3r, a6, y06
  701. FMSUBX y07, alpha3i, a8, y07
  702. FMADDX y08, alpha3r, a8, y08
  703. LFD a2, 9 * SIZE(AO3)
  704. LFD a4, 11 * SIZE(AO3)
  705. LFD a6, 13 * SIZE(AO3)
  706. LFD a8, 15 * SIZE(AO3)
  /* alpha3 with AO3 offsets 8..15 -> y09..y16 */
  707. FMADD y09, alpha3r, a1, y09
  708. FMADD y10, alpha3i, a1, y10
  709. FMADD y11, alpha3r, a3, y11
  710. FMADD y12, alpha3i, a3, y12
  711. FMADD y13, alpha3r, a5, y13
  712. FMADD y14, alpha3i, a5, y14
  713. FMADD y15, alpha3r, a7, y15
  714. FMADD y16, alpha3i, a7, y16
  715. LFD a1, 0 * SIZE(AO4)
  716. LFD a3, 2 * SIZE(AO4)
  717. LFD a5, 4 * SIZE(AO4)
  718. LFD a7, 6 * SIZE(AO4)
  719. FMSUBX y09, alpha3i, a2, y09
  720. FMADDX y10, alpha3r, a2, y10
  721. FMSUBX y11, alpha3i, a4, y11
  722. FMADDX y12, alpha3r, a4, y12
  723. FMSUBX y13, alpha3i, a6, y13
  724. FMADDX y14, alpha3r, a6, y14
  725. FMSUBX y15, alpha3i, a8, y15
  726. FMADDX y16, alpha3r, a8, y16
  727. LFD a2, 1 * SIZE(AO4)
  728. LFD a4, 3 * SIZE(AO4)
  729. LFD a6, 5 * SIZE(AO4)
  730. LFD a8, 7 * SIZE(AO4)
  /* alpha4 with AO4 offsets 0..7 -> y01..y08 */
  731. FMADD y01, alpha4r, a1, y01
  732. FMADD y02, alpha4i, a1, y02
  733. FMADD y03, alpha4r, a3, y03
  734. FMADD y04, alpha4i, a3, y04
  735. FMADD y05, alpha4r, a5, y05
  736. FMADD y06, alpha4i, a5, y06
  737. FMADD y07, alpha4r, a7, y07
  738. FMADD y08, alpha4i, a7, y08
  739. LFD a1, 8 * SIZE(AO4)
  740. LFD a3, 10 * SIZE(AO4)
  741. LFD a5, 12 * SIZE(AO4)
  742. LFD a7, 14 * SIZE(AO4)
  743. FMSUBX y01, alpha4i, a2, y01
  744. FMADDX y02, alpha4r, a2, y02
  745. FMSUBX y03, alpha4i, a4, y03
  746. FMADDX y04, alpha4r, a4, y04
  747. FMSUBX y05, alpha4i, a6, y05
  748. FMADDX y06, alpha4r, a6, y06
  749. FMSUBX y07, alpha4i, a8, y07
  750. FMADDX y08, alpha4r, a8, y08
  751. LFD a2, 9 * SIZE(AO4)
  752. LFD a4, 11 * SIZE(AO4)
  753. LFD a6, 13 * SIZE(AO4)
  754. LFD a8, 15 * SIZE(AO4)
  /* alpha4 with AO4 offsets 8..15 -> y09..y16 */
  755. FMADD y09, alpha4r, a1, y09
  756. FMADD y10, alpha4i, a1, y10
  757. FMADD y11, alpha4r, a3, y11
  758. FMADD y12, alpha4i, a3, y12
  759. FMADD y13, alpha4r, a5, y13
  760. FMADD y14, alpha4i, a5, y14
  761. FMADD y15, alpha4r, a7, y15
  762. FMADD y16, alpha4i, a7, y16
  /*
   * NOTE(review): the loads from 16..23 * SIZE(AO1) (this group and the
   * a2/a4/a6/a8 group below) are not consumed anywhere in LL(13), and
   * LL(15), LL(16) and LL(17) each reload a1..a8 before using them.
   * They look like a leftover next-block preload from the pipelined loop;
   * confirm that LL(11) (outside this view) does not rely on them before
   * removing.
   */
  763. LFD a1, 16 * SIZE(AO1)
  764. LFD a3, 18 * SIZE(AO1)
  765. LFD a5, 20 * SIZE(AO1)
  766. LFD a7, 22 * SIZE(AO1)
  767. FMSUBX y09, alpha4i, a2, y09
  768. FMADDX y10, alpha4r, a2, y10
  769. FMSUBX y11, alpha4i, a4, y11
  770. FMADDX y12, alpha4r, a4, y12
  771. FMSUBX y13, alpha4i, a6, y13
  772. FMADDX y14, alpha4r, a6, y14
  773. FMSUBX y15, alpha4i, a8, y15
  774. FMADDX y16, alpha4r, a8, y16
  775. LFD a2, 17 * SIZE(AO1)
  776. LFD a4, 19 * SIZE(AO1)
  777. LFD a6, 21 * SIZE(AO1)
  778. LFD a8, 23 * SIZE(AO1)
  /* step all four column pointers past the block just consumed */
  779. addi AO1, AO1, 16 * SIZE
  780. addi AO2, AO2, 16 * SIZE
  781. addi AO3, AO3, 16 * SIZE
  782. addi AO4, AO4, 16 * SIZE
  /* write the finished block back through Y2 and advance Y2 */
  783. STFD y01, 0 * SIZE(Y2)
  784. STFD y02, 1 * SIZE(Y2)
  785. STFD y03, 2 * SIZE(Y2)
  786. STFD y04, 3 * SIZE(Y2)
  787. STFD y05, 4 * SIZE(Y2)
  788. STFD y06, 5 * SIZE(Y2)
  789. STFD y07, 6 * SIZE(Y2)
  790. STFD y08, 7 * SIZE(Y2)
  791. STFD y09, 8 * SIZE(Y2)
  792. STFD y10, 9 * SIZE(Y2)
  793. STFD y11, 10 * SIZE(Y2)
  794. STFD y12, 11 * SIZE(Y2)
  795. STFD y13, 12 * SIZE(Y2)
  796. STFD y14, 13 * SIZE(Y2)
  797. STFD y15, 14 * SIZE(Y2)
  798. STFD y16, 15 * SIZE(Y2)
  799. addi Y2, Y2, 16 * SIZE
  /*
   * LL(15): row remainder of the 4-column pass.
   * If M & 7 is zero nothing is left and control goes to LL(19).
   * Otherwise the M & 4 part is handled here: one 8 * SIZE block of Y
   * (y01..y08) is loaded from Y1, updated with the alpha1..alpha4 terms
   * from AO1..AO4, and stored through Y2.  If M & 4 is zero the block is
   * skipped via LL(16).
   * The masks make the andi. result non-negative, so "ble" here only
   * fires on a zero result.
   */
  800. .align 4
  801. LL(15):
  802. andi. r0, M, 7
  803. ble LL(19)
  804. andi. r0, M, 4
  805. ble LL(16)
  /* load y01..y08 and offsets 0..7 of AO1 */
  806. LFD y01, 0 * SIZE(Y1)
  807. LFD y02, 1 * SIZE(Y1)
  808. LFD y03, 2 * SIZE(Y1)
  809. LFD y04, 3 * SIZE(Y1)
  810. LFD a1, 0 * SIZE(AO1)
  811. LFD a3, 2 * SIZE(AO1)
  812. LFD a5, 4 * SIZE(AO1)
  813. LFD a7, 6 * SIZE(AO1)
  814. LFD y05, 4 * SIZE(Y1)
  815. LFD y06, 5 * SIZE(Y1)
  816. LFD y07, 6 * SIZE(Y1)
  817. LFD y08, 7 * SIZE(Y1)
  818. LFD a2, 1 * SIZE(AO1)
  819. LFD a4, 3 * SIZE(AO1)
  820. LFD a6, 5 * SIZE(AO1)
  821. LFD a8, 7 * SIZE(AO1)
  /* alpha1 with AO1; AO2 operands are fetched while these run */
  822. FMADD y01, alpha1r, a1, y01
  823. FMADD y02, alpha1i, a1, y02
  824. FMADD y03, alpha1r, a3, y03
  825. FMADD y04, alpha1i, a3, y04
  826. FMADD y05, alpha1r, a5, y05
  827. FMADD y06, alpha1i, a5, y06
  828. FMADD y07, alpha1r, a7, y07
  829. FMADD y08, alpha1i, a7, y08
  830. LFD a1, 0 * SIZE(AO2)
  831. LFD a3, 2 * SIZE(AO2)
  832. LFD a5, 4 * SIZE(AO2)
  833. LFD a7, 6 * SIZE(AO2)
  834. FMSUBX y01, alpha1i, a2, y01
  835. FMADDX y02, alpha1r, a2, y02
  836. FMSUBX y03, alpha1i, a4, y03
  837. FMADDX y04, alpha1r, a4, y04
  838. FMSUBX y05, alpha1i, a6, y05
  839. FMADDX y06, alpha1r, a6, y06
  840. FMSUBX y07, alpha1i, a8, y07
  841. FMADDX y08, alpha1r, a8, y08
  842. LFD a2, 1 * SIZE(AO2)
  843. LFD a4, 3 * SIZE(AO2)
  844. LFD a6, 5 * SIZE(AO2)
  845. LFD a8, 7 * SIZE(AO2)
  /* alpha2 with AO2; AO3 operands are fetched while these run */
  846. FMADD y01, alpha2r, a1, y01
  847. FMADD y02, alpha2i, a1, y02
  848. FMADD y03, alpha2r, a3, y03
  849. FMADD y04, alpha2i, a3, y04
  850. FMADD y05, alpha2r, a5, y05
  851. FMADD y06, alpha2i, a5, y06
  852. FMADD y07, alpha2r, a7, y07
  853. FMADD y08, alpha2i, a7, y08
  854. LFD a1, 0 * SIZE(AO3)
  855. LFD a3, 2 * SIZE(AO3)
  856. LFD a5, 4 * SIZE(AO3)
  857. LFD a7, 6 * SIZE(AO3)
  858. FMSUBX y01, alpha2i, a2, y01
  859. FMADDX y02, alpha2r, a2, y02
  860. FMSUBX y03, alpha2i, a4, y03
  861. FMADDX y04, alpha2r, a4, y04
  862. FMSUBX y05, alpha2i, a6, y05
  863. FMADDX y06, alpha2r, a6, y06
  864. FMSUBX y07, alpha2i, a8, y07
  865. FMADDX y08, alpha2r, a8, y08
  866. LFD a2, 1 * SIZE(AO3)
  867. LFD a4, 3 * SIZE(AO3)
  868. LFD a6, 5 * SIZE(AO3)
  869. LFD a8, 7 * SIZE(AO3)
  /* alpha3 with AO3; AO4 operands are fetched while these run */
  870. FMADD y01, alpha3r, a1, y01
  871. FMADD y02, alpha3i, a1, y02
  872. FMADD y03, alpha3r, a3, y03
  873. FMADD y04, alpha3i, a3, y04
  874. FMADD y05, alpha3r, a5, y05
  875. FMADD y06, alpha3i, a5, y06
  876. FMADD y07, alpha3r, a7, y07
  877. FMADD y08, alpha3i, a7, y08
  878. LFD a1, 0 * SIZE(AO4)
  879. LFD a3, 2 * SIZE(AO4)
  880. LFD a5, 4 * SIZE(AO4)
  881. LFD a7, 6 * SIZE(AO4)
  882. FMSUBX y01, alpha3i, a2, y01
  883. FMADDX y02, alpha3r, a2, y02
  884. FMSUBX y03, alpha3i, a4, y03
  885. FMADDX y04, alpha3r, a4, y04
  886. FMSUBX y05, alpha3i, a6, y05
  887. FMADDX y06, alpha3r, a6, y06
  888. FMSUBX y07, alpha3i, a8, y07
  889. FMADDX y08, alpha3r, a8, y08
  890. LFD a2, 1 * SIZE(AO4)
  891. LFD a4, 3 * SIZE(AO4)
  892. LFD a6, 5 * SIZE(AO4)
  893. LFD a8, 7 * SIZE(AO4)
  /* alpha4 with AO4; results are stored as soon as each group of four is final */
  894. FMADD y01, alpha4r, a1, y01
  895. FMADD y02, alpha4i, a1, y02
  896. FMADD y03, alpha4r, a3, y03
  897. FMADD y04, alpha4i, a3, y04
  898. FMADD y05, alpha4r, a5, y05
  899. FMADD y06, alpha4i, a5, y06
  900. FMADD y07, alpha4r, a7, y07
  901. FMADD y08, alpha4i, a7, y08
  902. FMSUBX y01, alpha4i, a2, y01
  903. FMADDX y02, alpha4r, a2, y02
  904. FMSUBX y03, alpha4i, a4, y03
  905. FMADDX y04, alpha4r, a4, y04
  906. STFD y01, 0 * SIZE(Y2)
  907. STFD y02, 1 * SIZE(Y2)
  908. STFD y03, 2 * SIZE(Y2)
  909. STFD y04, 3 * SIZE(Y2)
  910. FMSUBX y05, alpha4i, a6, y05
  911. FMADDX y06, alpha4r, a6, y06
  912. FMSUBX y07, alpha4i, a8, y07
  913. FMADDX y08, alpha4r, a8, y08
  914. STFD y05, 4 * SIZE(Y2)
  915. STFD y06, 5 * SIZE(Y2)
  916. STFD y07, 6 * SIZE(Y2)
  917. STFD y08, 7 * SIZE(Y2)
  /* advance every column pointer and both Y pointers by the 8 * SIZE just handled */
  918. addi AO1, AO1, 8 * SIZE
  919. addi AO2, AO2, 8 * SIZE
  920. addi AO3, AO3, 8 * SIZE
  921. addi AO4, AO4, 8 * SIZE
  922. addi Y1, Y1, 8 * SIZE
  923. addi Y2, Y2, 8 * SIZE
  /*
   * LL(16): M & 2 remainder of the 4-column pass.
   * Updates one 4 * SIZE block of Y (y01..y04) with the alpha1..alpha4
   * terms from AO1..AO4.  a1..a4 and a5..a8 are used as two alternating
   * operand sets so that the next column's data can be loaded while the
   * current column's arithmetic is issued.
   * Skipped via LL(17) when M & 2 is zero.
   */
  924. .align 4
  925. LL(16):
  926. andi. r0, M, 2
  /* the two nops have no architectural effect; presumably scheduling/alignment filler */
  927. nop
  928. nop
  929. ble LL(17)
  930. LFD a1, 0 * SIZE(AO1)
  931. LFD a2, 1 * SIZE(AO1)
  932. LFD a3, 2 * SIZE(AO1)
  933. LFD a4, 3 * SIZE(AO1)
  934. LFD y01, 0 * SIZE(Y1)
  935. LFD y02, 1 * SIZE(Y1)
  936. LFD y03, 2 * SIZE(Y1)
  937. LFD y04, 3 * SIZE(Y1)
  938. LFD a5, 0 * SIZE(AO2)
  939. LFD a6, 1 * SIZE(AO2)
  940. LFD a7, 2 * SIZE(AO2)
  941. LFD a8, 3 * SIZE(AO2)
  /* alpha1 with AO1 (a1..a4) */
  942. FMADD y01, alpha1r, a1, y01
  943. FMADD y02, alpha1i, a1, y02
  944. FMADD y03, alpha1r, a3, y03
  945. FMADD y04, alpha1i, a3, y04
  946. FMSUBX y01, alpha1i, a2, y01
  947. FMADDX y02, alpha1r, a2, y02
  948. FMSUBX y03, alpha1i, a4, y03
  949. FMADDX y04, alpha1r, a4, y04
  950. LFD a1, 0 * SIZE(AO3)
  951. LFD a2, 1 * SIZE(AO3)
  952. LFD a3, 2 * SIZE(AO3)
  953. LFD a4, 3 * SIZE(AO3)
  /* alpha2 with AO2 (a5..a8) */
  954. FMADD y01, alpha2r, a5, y01
  955. FMADD y02, alpha2i, a5, y02
  956. FMADD y03, alpha2r, a7, y03
  957. FMADD y04, alpha2i, a7, y04
  958. FMSUBX y01, alpha2i, a6, y01
  959. FMADDX y02, alpha2r, a6, y02
  960. FMSUBX y03, alpha2i, a8, y03
  961. FMADDX y04, alpha2r, a8, y04
  962. LFD a5, 0 * SIZE(AO4)
  963. LFD a6, 1 * SIZE(AO4)
  964. LFD a7, 2 * SIZE(AO4)
  965. LFD a8, 3 * SIZE(AO4)
  /* alpha3 with AO3 (a1..a4) */
  966. FMADD y01, alpha3r, a1, y01
  967. FMADD y02, alpha3i, a1, y02
  968. FMADD y03, alpha3r, a3, y03
  969. FMADD y04, alpha3i, a3, y04
  970. FMSUBX y01, alpha3i, a2, y01
  971. FMADDX y02, alpha3r, a2, y02
  972. FMSUBX y03, alpha3i, a4, y03
  973. FMADDX y04, alpha3r, a4, y04
  /* alpha4 with AO4 (a5..a8) */
  974. FMADD y01, alpha4r, a5, y01
  975. FMADD y02, alpha4i, a5, y02
  976. FMADD y03, alpha4r, a7, y03
  977. FMADD y04, alpha4i, a7, y04
  978. FMSUBX y01, alpha4i, a6, y01
  979. FMADDX y02, alpha4r, a6, y02
  980. FMSUBX y03, alpha4i, a8, y03
  981. FMADDX y04, alpha4r, a8, y04
  982. STFD y01, 0 * SIZE(Y2)
  983. STFD y02, 1 * SIZE(Y2)
  984. STFD y03, 2 * SIZE(Y2)
  985. STFD y04, 3 * SIZE(Y2)
  /* advance all pointers by the 4 * SIZE just handled */
  986. addi AO1, AO1, 4 * SIZE
  987. addi AO2, AO2, 4 * SIZE
  988. addi AO3, AO3, 4 * SIZE
  989. addi AO4, AO4, 4 * SIZE
  990. addi Y1, Y1, 4 * SIZE
  991. addi Y2, Y2, 4 * SIZE
  /*
   * LL(17): M & 1 remainder of the 4-column pass.
   * Updates the last pair (y01, y02) with one pair from each of AO1..AO4
   * and stores it through Y2.  Skipped via LL(19) when M & 1 is zero.
   */
  992. .align 4
  993. LL(17):
  994. andi. r0, M, 1
  995. ble LL(19)
  /* one pair from Y and one pair from each of the four columns */
  996. LFD y01, 0 * SIZE(Y1)
  997. LFD y02, 1 * SIZE(Y1)
  998. LFD a1, 0 * SIZE(AO1)
  999. LFD a2, 1 * SIZE(AO1)
  1000. LFD a3, 0 * SIZE(AO2)
  1001. LFD a4, 1 * SIZE(AO2)
  1002. LFD a5, 0 * SIZE(AO3)
  1003. LFD a6, 1 * SIZE(AO3)
  1004. LFD a7, 0 * SIZE(AO4)
  1005. LFD a8, 1 * SIZE(AO4)
  /* the four column terms are accumulated strictly in column order */
  1006. FMADD y01, alpha1r, a1, y01
  1007. FMADD y02, alpha1i, a1, y02
  1008. FMSUBX y01, alpha1i, a2, y01
  1009. FMADDX y02, alpha1r, a2, y02
  1010. FMADD y01, alpha2r, a3, y01
  1011. FMADD y02, alpha2i, a3, y02
  1012. FMSUBX y01, alpha2i, a4, y01
  1013. FMADDX y02, alpha2r, a4, y02
  1014. FMADD y01, alpha3r, a5, y01
  1015. FMADD y02, alpha3i, a5, y02
  1016. FMSUBX y01, alpha3i, a6, y01
  1017. FMADDX y02, alpha3r, a6, y02
  1018. FMADD y01, alpha4r, a7, y01
  1019. FMADD y02, alpha4i, a7, y02
  1020. FMSUBX y01, alpha4i, a8, y01
  1021. FMADDX y02, alpha4r, a8, y02
  1022. STFD y01, 0 * SIZE(Y2)
  1023. STFD y02, 1 * SIZE(Y2)
  /*
   * NOTE(review): every other advance of Y1/Y2 in this part of the file
   * is a multiple of SIZE; here INCY is added instead.  LL(21) and LL(31)
   * re-initialise Y1/Y2 from Y before their next use, so the result looks
   * unused -- confirm against LL(11), which is outside this view.
   */
  1024. add Y1, Y1, INCY
  1025. add Y2, Y2, INCY
  /*
   * LL(19): end of one 4-column pass.  Decrements the pass counter J and
   * branches back to LL(11) while J is still greater than zero.
   * In "cmpi cr0, 0, J, 0" the second operand is the L field (0 selects
   * the 32-bit compare form) and the last operand is the immediate 0.
   */
  1026. .align 4
  1027. LL(19):
  1028. addi J, J, -1
  1029. cmpi cr0, 0, J, 0
  1030. bgt LL(11)
  /*
   * LL(20): dispatch for the 2-column pass.  J = N & 2; when that bit is
   * clear the 2-column code is skipped and control goes to LL(30).
   * The andi. result cannot be negative, so "ble" fires only on zero.
   */
  1031. .align 4
  1032. LL(20):
  1033. andi. J, N, 2
  1034. ble LL(30)
  /*
   * LL(21): setup and pipeline prologue for the 2-column pass.
   *  - Loads alpha_r / alpha_i and two consecutive pairs from X (stepping
   *    X by INCX after each pair) and forms alpha1r/alpha1i and
   *    alpha2r/alpha2i from them.  The operand pattern is that of a
   *    complex product alpha * x; the exact signs come from the FMSUBR /
   *    FMADDR macros, which are defined outside this view.
   *  - Sets AO1 = A, AO2 = A + LDA and advances A by 2 * LDA.
   *  - Points both Y1 (read pointer) and Y2 (write pointer) at Y.
   *  - CTR = M >> 3.  If that is not positive, goes to the remainder code
   *    at LL(25).  Otherwise preloads offsets 0..7 of AO1 and a full
   *    16 * SIZE block of Y, moves Y1 one block ahead of Y2, and uses bdz
   *    to jump straight to the drain stage LL(23) when only one block
   *    exists.
   */
  1035. .align 4
  1036. LL(21):
  1037. lfd alpha_r, ALPHA_R
  1038. lfd alpha_i, ALPHA_I
  /* a1/a2 and a3/a4 temporarily hold the two X pairs */
  1039. LFD a1, 0 * SIZE(X)
  1040. LFD a2, 1 * SIZE(X)
  1041. add X, X, INCX
  1042. LFD a3, 0 * SIZE(X)
  1043. LFD a4, 1 * SIZE(X)
  1044. add X, X, INCX
  1045. FMUL alpha1r, alpha_r, a1
  1046. FMUL alpha1i, alpha_i, a1
  1047. FMUL alpha2r, alpha_r, a3
  1048. FMUL alpha2i, alpha_i, a3
  1049. FMSUBR alpha1r, alpha_i, a2, alpha1r
  1050. FMADDR alpha1i, alpha_r, a2, alpha1i
  1051. FMSUBR alpha2r, alpha_i, a4, alpha2r
  1052. FMADDR alpha2i, alpha_r, a4, alpha2i
  /* column pointers for this pass; A is left pointing past both columns */
  1053. mr AO1, A
  1054. add AO2, A, LDA
  1055. add A, AO2, LDA
  1056. mr Y1, Y
  1057. mr Y2, Y
  /* srawi. sets cr0 from the shifted value, which the ble below tests */
  1058. srawi. r0, M, 3
  1059. mtspr CTR, r0
  1060. ble LL(25)
  1061. .align 4
  /* prologue: first half-block of AO1 and the first full block of Y */
  1062. LFD a1, 0 * SIZE(AO1)
  1063. LFD a2, 1 * SIZE(AO1)
  1064. LFD a3, 2 * SIZE(AO1)
  1065. LFD a4, 3 * SIZE(AO1)
  1066. LFD y01, 0 * SIZE(Y1)
  1067. LFD y02, 1 * SIZE(Y1)
  1068. LFD y03, 2 * SIZE(Y1)
  1069. LFD y04, 3 * SIZE(Y1)
  1070. LFD a5, 4 * SIZE(AO1)
  1071. LFD a6, 5 * SIZE(AO1)
  1072. LFD a7, 6 * SIZE(AO1)
  1073. LFD a8, 7 * SIZE(AO1)
  1074. LFD y05, 4 * SIZE(Y1)
  1075. LFD y06, 5 * SIZE(Y1)
  1076. LFD y07, 6 * SIZE(Y1)
  1077. LFD y08, 7 * SIZE(Y1)
  1078. LFD y09, 8 * SIZE(Y1)
  1079. LFD y10, 9 * SIZE(Y1)
  1080. LFD y11, 10 * SIZE(Y1)
  1081. LFD y12, 11 * SIZE(Y1)
  1082. LFD y13, 12 * SIZE(Y1)
  1083. LFD y14, 13 * SIZE(Y1)
  1084. LFD y15, 14 * SIZE(Y1)
  1085. LFD y16, 15 * SIZE(Y1)
  /* Y1 now runs one block ahead of Y2; bdz consumes one CTR count */
  1086. addi Y1, Y1, 16 * SIZE
  1087. bdz LL(23)
  /*
   * LL(22): steady-state loop of the 2-column pass, one 16 * SIZE block
   * of Y per iteration.
   * On entry y01..y16 hold the current Y block and a1..a8 hold offsets
   * 0..7 of AO1.  Each iteration:
   *  - accumulates the alpha1 terms from AO1 into y01..y16, advancing AO1
   *    by one block part-way through;
   *  - accumulates the alpha2 terms from AO2, storing each group of four
   *    results through Y2 as soon as it is final and immediately
   *    reloading the same registers from Y1 (the next block);
   *  - preloads offsets 0..7 of the already-advanced AO1 for the next
   *    iteration.
   * DCBT, PREA and PREC are defined outside this view; presumably DCBT
   * issues a cache-touch prefetch at the given distance -- confirm.
   * The interleaving of loads, stores and arithmetic is deliberate; do
   * not reorder without re-checking register lifetimes.
   */
  1088. .align 4
  1089. LL(22):
  /* alpha1 with AO1 offsets 0..7 -> y01..y08 */
  1090. FMADD y01, alpha1r, a1, y01
  1091. FMADD y02, alpha1i, a1, y02
  1092. FMADD y03, alpha1r, a3, y03
  1093. FMADD y04, alpha1i, a3, y04
  1094. FMADD y05, alpha1r, a5, y05
  1095. FMADD y06, alpha1i, a5, y06
  1096. FMADD y07, alpha1r, a7, y07
  1097. FMADD y08, alpha1i, a7, y08
  1098. LFD a1, 8 * SIZE(AO1)
  1099. LFD a3, 10 * SIZE(AO1)
  1100. LFD a5, 12 * SIZE(AO1)
  1101. LFD a7, 14 * SIZE(AO1)
  1102. FMSUBX y01, alpha1i, a2, y01
  1103. FMADDX y02, alpha1r, a2, y02
  1104. FMSUBX y03, alpha1i, a4, y03
  1105. FMADDX y04, alpha1r, a4, y04
  1106. FMSUBX y05, alpha1i, a6, y05
  1107. FMADDX y06, alpha1r, a6, y06
  1108. FMSUBX y07, alpha1i, a8, y07
  1109. FMADDX y08, alpha1r, a8, y08
  1110. LFD a2, 9 * SIZE(AO1)
  1111. LFD a4, 11 * SIZE(AO1)
  1112. LFD a6, 13 * SIZE(AO1)
  1113. LFD a8, 15 * SIZE(AO1)
  /* all 16 values of this AO1 block are now in flight; step AO1 to the next block */
  1114. addi AO1, AO1, 16 * SIZE
  1115. nop
  1116. DCBT(AO1, PREA)
  1117. nop
  /* alpha1 with AO1 offsets 8..15 -> y09..y16 */
  1118. FMADD y09, alpha1r, a1, y09
  1119. FMADD y10, alpha1i, a1, y10
  1120. FMADD y11, alpha1r, a3, y11
  1121. FMADD y12, alpha1i, a3, y12
  1122. FMADD y13, alpha1r, a5, y13
  1123. FMADD y14, alpha1i, a5, y14
  1124. FMADD y15, alpha1r, a7, y15
  1125. FMADD y16, alpha1i, a7, y16
  1126. LFD a1, 0 * SIZE(AO2)
  1127. LFD a3, 2 * SIZE(AO2)
  1128. LFD a5, 4 * SIZE(AO2)
  1129. LFD a7, 6 * SIZE(AO2)
  1130. FMSUBX y09, alpha1i, a2, y09
  1131. FMADDX y10, alpha1r, a2, y10
  1132. FMSUBX y11, alpha1i, a4, y11
  1133. FMADDX y12, alpha1r, a4, y12
  1134. FMSUBX y13, alpha1i, a6, y13
  1135. FMADDX y14, alpha1r, a6, y14
  1136. FMSUBX y15, alpha1i, a8, y15
  1137. FMADDX y16, alpha1r, a8, y16
  1138. LFD a2, 1 * SIZE(AO2)
  1139. LFD a4, 3 * SIZE(AO2)
  1140. LFD a6, 5 * SIZE(AO2)
  1141. LFD a8, 7 * SIZE(AO2)
  /* alpha2 with AO2 offsets 0..7 -> y01..y08 (last column, so results become final) */
  1142. FMADD y01, alpha2r, a1, y01
  1143. FMADD y02, alpha2i, a1, y02
  1144. FMADD y03, alpha2r, a3, y03
  1145. FMADD y04, alpha2i, a3, y04
  1146. FMADD y05, alpha2r, a5, y05
  1147. FMADD y06, alpha2i, a5, y06
  1148. FMADD y07, alpha2r, a7, y07
  1149. FMADD y08, alpha2i, a7, y08
  1150. LFD a1, 8 * SIZE(AO2)
  1151. LFD a3, 10 * SIZE(AO2)
  1152. LFD a5, 12 * SIZE(AO2)
  1153. LFD a7, 14 * SIZE(AO2)
  1154. FMSUBX y01, alpha2i, a2, y01
  1155. FMADDX y02, alpha2r, a2, y02
  1156. FMSUBX y03, alpha2i, a4, y03
  1157. FMADDX y04, alpha2r, a4, y04
  /* y01..y04 final: store through Y2, refill from the next block via Y1 */
  1158. STFD y01, 0 * SIZE(Y2)
  1159. STFD y02, 1 * SIZE(Y2)
  1160. STFD y03, 2 * SIZE(Y2)
  1161. STFD y04, 3 * SIZE(Y2)
  1162. LFD y01, 0 * SIZE(Y1)
  1163. LFD y02, 1 * SIZE(Y1)
  1164. LFD y03, 2 * SIZE(Y1)
  1165. LFD y04, 3 * SIZE(Y1)
  1166. FMSUBX y05, alpha2i, a6, y05
  1167. FMADDX y06, alpha2r, a6, y06
  1168. FMSUBX y07, alpha2i, a8, y07
  1169. FMADDX y08, alpha2r, a8, y08
  1170. LFD a2, 9 * SIZE(AO2)
  1171. LFD a4, 11 * SIZE(AO2)
  1172. LFD a6, 13 * SIZE(AO2)
  1173. LFD a8, 15 * SIZE(AO2)
  /* y05..y08 final: store and refill */
  1174. STFD y05, 4 * SIZE(Y2)
  1175. STFD y06, 5 * SIZE(Y2)
  1176. STFD y07, 6 * SIZE(Y2)
  1177. STFD y08, 7 * SIZE(Y2)
  1178. LFD y05, 4 * SIZE(Y1)
  1179. LFD y06, 5 * SIZE(Y1)
  1180. LFD y07, 6 * SIZE(Y1)
  1181. LFD y08, 7 * SIZE(Y1)
  1182. addi AO2, AO2, 16 * SIZE
  1183. nop
  1184. DCBT(AO2, PREA)
  1185. nop
  /* alpha2 with AO2 offsets 8..15 -> y09..y16 */
  1186. FMADD y09, alpha2r, a1, y09
  1187. FMADD y10, alpha2i, a1, y10
  1188. FMADD y11, alpha2r, a3, y11
  1189. FMADD y12, alpha2i, a3, y12
  1190. FMADD y13, alpha2r, a5, y13
  1191. FMADD y14, alpha2i, a5, y14
  1192. FMADD y15, alpha2r, a7, y15
  1193. FMADD y16, alpha2i, a7, y16
  /* AO1 was already advanced above, so these are offsets 0..7 of the NEXT block */
  1194. LFD a1, 0 * SIZE(AO1)
  1195. LFD a3, 2 * SIZE(AO1)
  1196. LFD a5, 4 * SIZE(AO1)
  1197. LFD a7, 6 * SIZE(AO1)
  1198. FMSUBX y09, alpha2i, a2, y09
  1199. FMADDX y10, alpha2r, a2, y10
  1200. FMSUBX y11, alpha2i, a4, y11
  1201. FMADDX y12, alpha2r, a4, y12
  /* y09..y12 final: store and refill */
  1202. STFD y09, 8 * SIZE(Y2)
  1203. STFD y10, 9 * SIZE(Y2)
  1204. STFD y11, 10 * SIZE(Y2)
  1205. STFD y12, 11 * SIZE(Y2)
  1206. LFD y09, 8 * SIZE(Y1)
  1207. LFD y10, 9 * SIZE(Y1)
  1208. LFD y11, 10 * SIZE(Y1)
  1209. LFD y12, 11 * SIZE(Y1)
  1210. FMSUBX y13, alpha2i, a6, y13
  1211. FMADDX y14, alpha2r, a6, y14
  1212. FMSUBX y15, alpha2i, a8, y15
  1213. FMADDX y16, alpha2r, a8, y16
  1214. LFD a2, 1 * SIZE(AO1)
  1215. LFD a4, 3 * SIZE(AO1)
  1216. LFD a6, 5 * SIZE(AO1)
  1217. LFD a8, 7 * SIZE(AO1)
  /* y13..y16 final: store and refill */
  1218. STFD y13, 12 * SIZE(Y2)
  1219. STFD y14, 13 * SIZE(Y2)
  1220. STFD y15, 14 * SIZE(Y2)
  1221. STFD y16, 15 * SIZE(Y2)
  1222. LFD y13, 12 * SIZE(Y1)
  1223. LFD y14, 13 * SIZE(Y1)
  1224. LFD y15, 14 * SIZE(Y1)
  1225. LFD y16, 15 * SIZE(Y1)
  /* both Y pointers move one block; Y1 stays one block ahead of Y2 */
  1226. addi Y2, Y2, 16 * SIZE
  1227. addi Y1, Y1, 16 * SIZE
  1228. DCBT(Y1, PREC)
  /* decrement CTR and repeat while it is non-zero; otherwise fall into LL(23) */
  1229. bdnz LL(22)
  /*
   * LL(23): drain stage of the 2-column pass.
   * Processes the last full 16 * SIZE block, whose y01..y16 and AO1
   * offsets 0..7 (a1..a8) are already in registers, and stores it through
   * Y2.  Nothing further is loaded from Y1 and no next-block AO1 data is
   * preloaded.
   * AO1 and AO2 are both advanced by one block at the end; Y2 is advanced
   * by one block, which brings it level with Y1 again (Y1 was moved one
   * block ahead in the LL(21) prologue).
   */
  1230. .align 4
  1231. LL(23):
  /* alpha1 with AO1 offsets 0..7 -> y01..y08 */
  1232. FMADD y01, alpha1r, a1, y01
  1233. FMADD y02, alpha1i, a1, y02
  1234. FMADD y03, alpha1r, a3, y03
  1235. FMADD y04, alpha1i, a3, y04
  1236. FMADD y05, alpha1r, a5, y05
  1237. FMADD y06, alpha1i, a5, y06
  1238. FMADD y07, alpha1r, a7, y07
  1239. FMADD y08, alpha1i, a7, y08
  1240. LFD a1, 8 * SIZE(AO1)
  1241. LFD a3, 10 * SIZE(AO1)
  1242. LFD a5, 12 * SIZE(AO1)
  1243. LFD a7, 14 * SIZE(AO1)
  1244. FMSUBX y01, alpha1i, a2, y01
  1245. FMADDX y02, alpha1r, a2, y02
  1246. FMSUBX y03, alpha1i, a4, y03
  1247. FMADDX y04, alpha1r, a4, y04
  1248. FMSUBX y05, alpha1i, a6, y05
  1249. FMADDX y06, alpha1r, a6, y06
  1250. FMSUBX y07, alpha1i, a8, y07
  1251. FMADDX y08, alpha1r, a8, y08
  1252. LFD a2, 9 * SIZE(AO1)
  1253. LFD a4, 11 * SIZE(AO1)
  1254. LFD a6, 13 * SIZE(AO1)
  1255. LFD a8, 15 * SIZE(AO1)
  /* alpha1 with AO1 offsets 8..15 -> y09..y16 */
  1256. FMADD y09, alpha1r, a1, y09
  1257. FMADD y10, alpha1i, a1, y10
  1258. FMADD y11, alpha1r, a3, y11
  1259. FMADD y12, alpha1i, a3, y12
  1260. FMADD y13, alpha1r, a5, y13
  1261. FMADD y14, alpha1i, a5, y14
  1262. FMADD y15, alpha1r, a7, y15
  1263. FMADD y16, alpha1i, a7, y16
  1264. LFD a1, 0 * SIZE(AO2)
  1265. LFD a3, 2 * SIZE(AO2)
  1266. LFD a5, 4 * SIZE(AO2)
  1267. LFD a7, 6 * SIZE(AO2)
  1268. FMSUBX y09, alpha1i, a2, y09
  1269. FMADDX y10, alpha1r, a2, y10
  1270. FMSUBX y11, alpha1i, a4, y11
  1271. FMADDX y12, alpha1r, a4, y12
  1272. FMSUBX y13, alpha1i, a6, y13
  1273. FMADDX y14, alpha1r, a6, y14
  1274. FMSUBX y15, alpha1i, a8, y15
  1275. FMADDX y16, alpha1r, a8, y16
  1276. LFD a2, 1 * SIZE(AO2)
  1277. LFD a4, 3 * SIZE(AO2)
  1278. LFD a6, 5 * SIZE(AO2)
  1279. LFD a8, 7 * SIZE(AO2)
  /* alpha2 with AO2 offsets 0..7 -> y01..y08, stored as each group of four is final */
  1280. FMADD y01, alpha2r, a1, y01
  1281. FMADD y02, alpha2i, a1, y02
  1282. FMADD y03, alpha2r, a3, y03
  1283. FMADD y04, alpha2i, a3, y04
  1284. FMADD y05, alpha2r, a5, y05
  1285. FMADD y06, alpha2i, a5, y06
  1286. FMADD y07, alpha2r, a7, y07
  1287. FMADD y08, alpha2i, a7, y08
  1288. LFD a1, 8 * SIZE(AO2)
  1289. LFD a3, 10 * SIZE(AO2)
  1290. LFD a5, 12 * SIZE(AO2)
  1291. LFD a7, 14 * SIZE(AO2)
  1292. FMSUBX y01, alpha2i, a2, y01
  1293. FMADDX y02, alpha2r, a2, y02
  1294. FMSUBX y03, alpha2i, a4, y03
  1295. FMADDX y04, alpha2r, a4, y04
  1296. STFD y01, 0 * SIZE(Y2)
  1297. STFD y02, 1 * SIZE(Y2)
  1298. STFD y03, 2 * SIZE(Y2)
  1299. STFD y04, 3 * SIZE(Y2)
  1300. FMSUBX y05, alpha2i, a6, y05
  1301. FMADDX y06, alpha2r, a6, y06
  1302. FMSUBX y07, alpha2i, a8, y07
  1303. FMADDX y08, alpha2r, a8, y08
  1304. LFD a2, 9 * SIZE(AO2)
  1305. LFD a4, 11 * SIZE(AO2)
  1306. LFD a6, 13 * SIZE(AO2)
  1307. LFD a8, 15 * SIZE(AO2)
  1308. STFD y05, 4 * SIZE(Y2)
  1309. STFD y06, 5 * SIZE(Y2)
  1310. STFD y07, 6 * SIZE(Y2)
  1311. STFD y08, 7 * SIZE(Y2)
  /* alpha2 with AO2 offsets 8..15 -> y09..y16 */
  1312. FMADD y09, alpha2r, a1, y09
  1313. FMADD y10, alpha2i, a1, y10
  1314. FMADD y11, alpha2r, a3, y11
  1315. FMADD y12, alpha2i, a3, y12
  1316. FMADD y13, alpha2r, a5, y13
  1317. FMADD y14, alpha2i, a5, y14
  1318. FMADD y15, alpha2r, a7, y15
  1319. FMADD y16, alpha2i, a7, y16
  1320. FMSUBX y09, alpha2i, a2, y09
  1321. FMADDX y10, alpha2r, a2, y10
  1322. FMSUBX y11, alpha2i, a4, y11
  1323. FMADDX y12, alpha2r, a4, y12
  1324. FMSUBX y13, alpha2i, a6, y13
  1325. FMADDX y14, alpha2r, a6, y14
  1326. FMSUBX y15, alpha2i, a8, y15
  1327. FMADDX y16, alpha2r, a8, y16
  1328. STFD y09, 8 * SIZE(Y2)
  1329. STFD y10, 9 * SIZE(Y2)
  1330. STFD y11, 10 * SIZE(Y2)
  1331. STFD y12, 11 * SIZE(Y2)
  1332. STFD y13, 12 * SIZE(Y2)
  1333. STFD y14, 13 * SIZE(Y2)
  1334. STFD y15, 14 * SIZE(Y2)
  1335. STFD y16, 15 * SIZE(Y2)
  /* step past the block just finished; Y1 is already there */
  1336. addi AO1, AO1, 16 * SIZE
  1337. addi AO2, AO2, 16 * SIZE
  1338. addi Y2, Y2, 16 * SIZE
  /*
   * LL(25): row remainder of the 2-column pass.
   * If M & 7 is zero nothing is left and control goes to LL(30).
   * Otherwise the M & 4 part is handled here: one 8 * SIZE block of Y
   * (y01..y08) is updated with the alpha1 term from AO1 and the alpha2
   * term from AO2, then stored through Y2.  If M & 4 is zero the block is
   * skipped via LL(26).
   */
  1339. .align 4
  1340. LL(25):
  1341. andi. r0, M, 7
  1342. ble LL(30)
  1343. andi. r0, M, 4
  1344. ble LL(26)
  /* load y01..y08 and offsets 0..7 of AO1 */
  1345. LFD y01, 0 * SIZE(Y1)
  1346. LFD y02, 1 * SIZE(Y1)
  1347. LFD y03, 2 * SIZE(Y1)
  1348. LFD y04, 3 * SIZE(Y1)
  1349. LFD a1, 0 * SIZE(AO1)
  1350. LFD a3, 2 * SIZE(AO1)
  1351. LFD a5, 4 * SIZE(AO1)
  1352. LFD a7, 6 * SIZE(AO1)
  1353. LFD y05, 4 * SIZE(Y1)
  1354. LFD y06, 5 * SIZE(Y1)
  1355. LFD y07, 6 * SIZE(Y1)
  1356. LFD y08, 7 * SIZE(Y1)
  1357. LFD a2, 1 * SIZE(AO1)
  1358. LFD a4, 3 * SIZE(AO1)
  1359. LFD a6, 5 * SIZE(AO1)
  1360. LFD a8, 7 * SIZE(AO1)
  /* alpha1 with AO1; AO2 operands are fetched while these run */
  1361. FMADD y01, alpha1r, a1, y01
  1362. FMADD y02, alpha1i, a1, y02
  1363. FMADD y03, alpha1r, a3, y03
  1364. FMADD y04, alpha1i, a3, y04
  1365. FMADD y05, alpha1r, a5, y05
  1366. FMADD y06, alpha1i, a5, y06
  1367. FMADD y07, alpha1r, a7, y07
  1368. FMADD y08, alpha1i, a7, y08
  1369. LFD a1, 0 * SIZE(AO2)
  1370. LFD a3, 2 * SIZE(AO2)
  1371. LFD a5, 4 * SIZE(AO2)
  1372. LFD a7, 6 * SIZE(AO2)
  1373. FMSUBX y01, alpha1i, a2, y01
  1374. FMADDX y02, alpha1r, a2, y02
  1375. FMSUBX y03, alpha1i, a4, y03
  1376. FMADDX y04, alpha1r, a4, y04
  1377. FMSUBX y05, alpha1i, a6, y05
  1378. FMADDX y06, alpha1r, a6, y06
  1379. FMSUBX y07, alpha1i, a8, y07
  1380. FMADDX y08, alpha1r, a8, y08
  1381. LFD a2, 1 * SIZE(AO2)
  1382. LFD a4, 3 * SIZE(AO2)
  1383. LFD a6, 5 * SIZE(AO2)
  1384. LFD a8, 7 * SIZE(AO2)
  /* alpha2 with AO2; results are stored as each group of four is final */
  1385. FMADD y01, alpha2r, a1, y01
  1386. FMADD y02, alpha2i, a1, y02
  1387. FMADD y03, alpha2r, a3, y03
  1388. FMADD y04, alpha2i, a3, y04
  1389. FMADD y05, alpha2r, a5, y05
  1390. FMADD y06, alpha2i, a5, y06
  1391. FMADD y07, alpha2r, a7, y07
  1392. FMADD y08, alpha2i, a7, y08
  1393. FMSUBX y01, alpha2i, a2, y01
  1394. FMADDX y02, alpha2r, a2, y02
  1395. FMSUBX y03, alpha2i, a4, y03
  1396. FMADDX y04, alpha2r, a4, y04
  1397. STFD y01, 0 * SIZE(Y2)
  1398. STFD y02, 1 * SIZE(Y2)
  1399. STFD y03, 2 * SIZE(Y2)
  1400. STFD y04, 3 * SIZE(Y2)
  1401. FMSUBX y05, alpha2i, a6, y05
  1402. FMADDX y06, alpha2r, a6, y06
  1403. FMSUBX y07, alpha2i, a8, y07
  1404. FMADDX y08, alpha2r, a8, y08
  1405. STFD y05, 4 * SIZE(Y2)
  1406. STFD y06, 5 * SIZE(Y2)
  1407. STFD y07, 6 * SIZE(Y2)
  1408. STFD y08, 7 * SIZE(Y2)
  /* advance all pointers by the 8 * SIZE just handled */
  1409. addi AO1, AO1, 8 * SIZE
  1410. addi AO2, AO2, 8 * SIZE
  1411. addi Y1, Y1, 8 * SIZE
  1412. addi Y2, Y2, 8 * SIZE
  /*
   * LL(26): M & 2 remainder of the 2-column pass.
   * Updates one 4 * SIZE block of Y (y01..y04) with the alpha1 term from
   * AO1 (a1..a4) and the alpha2 term from AO2 (a5..a8), then stores it
   * through Y2.  Skipped via LL(27) when M & 2 is zero.
   */
  1413. .align 4
  1414. LL(26):
  1415. andi. r0, M, 2
  1416. ble LL(27)
  /* even-offset operands of both columns first, then Y, then odd-offset operands */
  1417. LFD a1, 0 * SIZE(AO1)
  1418. LFD a3, 2 * SIZE(AO1)
  1419. LFD a5, 0 * SIZE(AO2)
  1420. LFD a7, 2 * SIZE(AO2)
  1421. LFD y01, 0 * SIZE(Y1)
  1422. LFD y02, 1 * SIZE(Y1)
  1423. LFD y03, 2 * SIZE(Y1)
  1424. LFD y04, 3 * SIZE(Y1)
  1425. LFD a2, 1 * SIZE(AO1)
  1426. LFD a4, 3 * SIZE(AO1)
  1427. LFD a6, 1 * SIZE(AO2)
  1428. LFD a8, 3 * SIZE(AO2)
  /* alpha1 with AO1 */
  1429. FMADD y01, alpha1r, a1, y01
  1430. FMADD y02, alpha1i, a1, y02
  1431. FMADD y03, alpha1r, a3, y03
  1432. FMADD y04, alpha1i, a3, y04
  1433. FMSUBX y01, alpha1i, a2, y01
  1434. FMADDX y02, alpha1r, a2, y02
  1435. FMSUBX y03, alpha1i, a4, y03
  1436. FMADDX y04, alpha1r, a4, y04
  /* alpha2 with AO2 */
  1437. FMADD y01, alpha2r, a5, y01
  1438. FMADD y02, alpha2i, a5, y02
  1439. FMADD y03, alpha2r, a7, y03
  1440. FMADD y04, alpha2i, a7, y04
  1441. FMSUBX y01, alpha2i, a6, y01
  1442. FMADDX y02, alpha2r, a6, y02
  1443. FMSUBX y03, alpha2i, a8, y03
  1444. FMADDX y04, alpha2r, a8, y04
  1445. STFD y01, 0 * SIZE(Y2)
  1446. STFD y02, 1 * SIZE(Y2)
  1447. STFD y03, 2 * SIZE(Y2)
  1448. STFD y04, 3 * SIZE(Y2)
  /* advance all pointers by the 4 * SIZE just handled */
  1449. addi AO1, AO1, 4 * SIZE
  1450. addi AO2, AO2, 4 * SIZE
  1451. addi Y1, Y1, 4 * SIZE
  1452. addi Y2, Y2, 4 * SIZE
  /*
   * LL(27): M & 1 remainder of the 2-column pass.
   * Updates the last pair (y01, y02) with one pair from AO1 and one pair
   * from AO2 and stores it through Y2.  Skipped via LL(30) when M & 1 is
   * zero.
   */
  1453. .align 4
  1454. LL(27):
  1455. andi. r0, M, 1
  1456. ble LL(30)
  1457. LFD y01, 0 * SIZE(Y1)
  1458. LFD y02, 1 * SIZE(Y1)
  1459. LFD a1, 0 * SIZE(AO1)
  1460. LFD a2, 1 * SIZE(AO1)
  1461. LFD a3, 0 * SIZE(AO2)
  1462. LFD a4, 1 * SIZE(AO2)
  /* alpha1 term first, then alpha2 term */
  1463. FMADD y01, alpha1r, a1, y01
  1464. FMADD y02, alpha1i, a1, y02
  1465. FMSUBX y01, alpha1i, a2, y01
  1466. FMADDX y02, alpha1r, a2, y02
  1467. FMADD y01, alpha2r, a3, y01
  1468. FMADD y02, alpha2i, a3, y02
  1469. FMSUBX y01, alpha2i, a4, y01
  1470. FMADDX y02, alpha2r, a4, y02
  1471. STFD y01, 0 * SIZE(Y2)
  1472. STFD y02, 1 * SIZE(Y2)
  /*
   * NOTE(review): Y1/Y2 are advanced by INCY here, whereas every other
   * advance in the 2-column code is a multiple of SIZE.  LL(31)
   * re-initialises both pointers from Y before their next use, so the
   * result looks unused -- confirm against LL(999), which is outside this
   * view.
   */
  1473. add Y1, Y1, INCY
  1474. add Y2, Y2, INCY
  /*
   * LL(30): dispatch for the single-column pass.  J = N & 1; when that
   * bit is clear control goes to LL(999) (defined outside this view).
   * The andi. result cannot be negative, so "ble" fires only on zero.
   */
  1475. .align 4
  1476. LL(30):
  1477. andi. J, N, 1
  1478. ble LL(999)
  /*
   * LL(31): setup and pipeline prologue for the single-column pass.
   *  - Loads alpha_r / alpha_i and one pair from X (stepping X by INCX)
   *    and forms alpha1r/alpha1i from them.  The operand pattern is that
   *    of a complex product alpha * x; the exact signs come from the
   *    FMSUBR / FMADDR macros, which are defined outside this view.
   *  - Sets AO1 = A and advances A by LDA.
   *  - Points both Y1 (read pointer) and Y2 (write pointer) at Y.
   *  - CTR = M >> 3.  If that is not positive, goes to LL(35).  Otherwise
   *    preloads a full 16 * SIZE block of Y and offsets 0..7 of AO1,
   *    moves Y1 one block ahead of Y2, and uses bdz to jump straight to
   *    LL(33) when only one block exists.
   */
  1479. .align 4
  1480. LL(31):
  1481. lfd alpha_r, ALPHA_R
  1482. lfd alpha_i, ALPHA_I
  /* a1/a2 temporarily hold the X pair */
  1483. LFD a1, 0 * SIZE(X)
  1484. LFD a2, 1 * SIZE(X)
  1485. add X, X, INCX
  1486. FMUL alpha1r, alpha_r, a1
  1487. FMUL alpha1i, alpha_i, a1
  1488. FMSUBR alpha1r, alpha_i, a2, alpha1r
  1489. FMADDR alpha1i, alpha_r, a2, alpha1i
  1490. mr AO1, A
  1491. add A, AO1, LDA
  1492. mr Y1, Y
  1493. mr Y2, Y
  /* srawi. sets cr0 from the shifted value, which the ble below tests */
  1494. srawi. r0, M, 3
  1495. mtspr CTR, r0
  1496. ble LL(35)
  1497. .align 4
  /* prologue: first full block of Y, then the first half-block of AO1 */
  1498. LFD y01, 0 * SIZE(Y1)
  1499. LFD y02, 1 * SIZE(Y1)
  1500. LFD y03, 2 * SIZE(Y1)
  1501. LFD y04, 3 * SIZE(Y1)
  1502. LFD y05, 4 * SIZE(Y1)
  1503. LFD y06, 5 * SIZE(Y1)
  1504. LFD y07, 6 * SIZE(Y1)
  1505. LFD y08, 7 * SIZE(Y1)
  1506. LFD y09, 8 * SIZE(Y1)
  1507. LFD y10, 9 * SIZE(Y1)
  1508. LFD y11, 10 * SIZE(Y1)
  1509. LFD y12, 11 * SIZE(Y1)
  1510. LFD y13, 12 * SIZE(Y1)
  1511. LFD y14, 13 * SIZE(Y1)
  1512. LFD y15, 14 * SIZE(Y1)
  1513. LFD y16, 15 * SIZE(Y1)
  1514. LFD a1, 0 * SIZE(AO1)
  1515. LFD a2, 1 * SIZE(AO1)
  1516. LFD a3, 2 * SIZE(AO1)
  1517. LFD a4, 3 * SIZE(AO1)
  1518. LFD a5, 4 * SIZE(AO1)
  1519. LFD a6, 5 * SIZE(AO1)
  1520. LFD a7, 6 * SIZE(AO1)
  1521. LFD a8, 7 * SIZE(AO1)
  /* Y1 now runs one block ahead of Y2; bdz consumes one CTR count */
  1522. addi Y1, Y1, 16 * SIZE
  1523. bdz LL(33)
  /*
   * LL(32): steady-state loop of the one-column case.
   * Each iteration updates the 16 y values held in y01..y16 with the
   * 16 column values at AO1, writes them through Y2, and reloads
   * y01..y16 / a1..a8 for the next iteration through Y1 / AO1.
   * Loads, arithmetic and stores are interleaved by hand; the order of
   * instructions is part of the design and should not be rearranged.
   * FMSUBX/FMADDX are macros defined outside this chunk; presumably
   * they select the sign for conjugated variants - confirm there.
   */
  1524. .align 4
  1525. LL(32):
  /* Pairs 0-3: contribution of the first value of each column pair. */
  1526. FMADD y01, alpha1r, a1, y01
  1527. FMADD y02, alpha1i, a1, y02
  1528. FMADD y03, alpha1r, a3, y03
  1529. FMADD y04, alpha1i, a3, y04
  1530. FMADD y05, alpha1r, a5, y05
  1531. FMADD y06, alpha1i, a5, y06
  1532. FMADD y07, alpha1r, a7, y07
  1533. FMADD y08, alpha1i, a7, y08
  /* a1/a3/a5/a7 are free again: fetch the first values of pairs 4-7. */
  1534. LFD a1, 8 * SIZE(AO1)
  1535. LFD a3, 10 * SIZE(AO1)
  1536. LFD a5, 12 * SIZE(AO1)
  1537. LFD a7, 14 * SIZE(AO1)
  /* Pairs 0-1: contribution of the second value; results are final. */
  1538. FMSUBX y01, alpha1i, a2, y01
  1539. FMADDX y02, alpha1r, a2, y02
  1540. FMSUBX y03, alpha1i, a4, y03
  1541. FMADDX y04, alpha1r, a4, y04
  1542. STFD y01, 0 * SIZE(Y2)
  1543. STFD y02, 1 * SIZE(Y2)
  1544. STFD y03, 2 * SIZE(Y2)
  1545. STFD y04, 3 * SIZE(Y2)
  /* Reload the same registers with the next block's y values
     (Y1 already points one block ahead of Y2). */
  1546. LFD y01, 0 * SIZE(Y1)
  1547. LFD y02, 1 * SIZE(Y1)
  1548. LFD y03, 2 * SIZE(Y1)
  1549. LFD y04, 3 * SIZE(Y1)
  /* Pairs 2-3: contribution of the second value. */
  1550. FMSUBX y05, alpha1i, a6, y05
  1551. FMADDX y06, alpha1r, a6, y06
  1552. FMSUBX y07, alpha1i, a8, y07
  1553. FMADDX y08, alpha1r, a8, y08
  /* Second values of pairs 4-7, then advance AO1 by one block. */
  1554. LFD a2, 9 * SIZE(AO1)
  1555. LFD a4, 11 * SIZE(AO1)
  1556. LFD a6, 13 * SIZE(AO1)
  1557. LFD a8, 15 * SIZE(AO1)
  1558. addi AO1, AO1, 16 * SIZE
  /* DCBT is a macro defined outside this chunk, presumably a cache
     prefetch hint at AO1 + PREA; the nops are presumably scheduling
     padding - leave them in place. */
  1559. nop
  1560. DCBT(AO1, PREA)
  1561. nop
  1562. STFD y05, 4 * SIZE(Y2)
  1563. STFD y06, 5 * SIZE(Y2)
  1564. STFD y07, 6 * SIZE(Y2)
  1565. STFD y08, 7 * SIZE(Y2)
  1566. LFD y05, 4 * SIZE(Y1)
  1567. LFD y06, 5 * SIZE(Y1)
  1568. LFD y07, 6 * SIZE(Y1)
  1569. LFD y08, 7 * SIZE(Y1)
  /* Pairs 4-7: contribution of the first value. */
  1570. FMADD y09, alpha1r, a1, y09
  1571. FMADD y10, alpha1i, a1, y10
  1572. FMADD y11, alpha1r, a3, y11
  1573. FMADD y12, alpha1i, a3, y12
  1574. FMADD y13, alpha1r, a5, y13
  1575. FMADD y14, alpha1i, a5, y14
  1576. FMADD y15, alpha1r, a7, y15
  1577. FMADD y16, alpha1i, a7, y16
  /* AO1 was advanced above, so offsets 0..7 now address the next
     block: start loading it for the following iteration. */
  1578. LFD a1, 0 * SIZE(AO1)
  1579. LFD a3, 2 * SIZE(AO1)
  1580. LFD a5, 4 * SIZE(AO1)
  1581. LFD a7, 6 * SIZE(AO1)
  /* Pairs 4-5: contribution of the second value, store, reload. */
  1582. FMSUBX y09, alpha1i, a2, y09
  1583. FMADDX y10, alpha1r, a2, y10
  1584. FMSUBX y11, alpha1i, a4, y11
  1585. FMADDX y12, alpha1r, a4, y12
  1586. STFD y09, 8 * SIZE(Y2)
  1587. STFD y10, 9 * SIZE(Y2)
  1588. STFD y11, 10 * SIZE(Y2)
  1589. STFD y12, 11 * SIZE(Y2)
  1590. LFD y09, 8 * SIZE(Y1)
  1591. LFD y10, 9 * SIZE(Y1)
  1592. LFD y11, 10 * SIZE(Y1)
  1593. LFD y12, 11 * SIZE(Y1)
  /* Pairs 6-7: contribution of the second value, store, reload. */
  1594. FMSUBX y13, alpha1i, a6, y13
  1595. FMADDX y14, alpha1r, a6, y14
  1596. FMSUBX y15, alpha1i, a8, y15
  1597. FMADDX y16, alpha1r, a8, y16
  1598. LFD a2, 1 * SIZE(AO1)
  1599. LFD a4, 3 * SIZE(AO1)
  1600. LFD a6, 5 * SIZE(AO1)
  1601. LFD a8, 7 * SIZE(AO1)
  1602. STFD y13, 12 * SIZE(Y2)
  1603. STFD y14, 13 * SIZE(Y2)
  1604. STFD y15, 14 * SIZE(Y2)
  1605. STFD y16, 15 * SIZE(Y2)
  1606. LFD y13, 12 * SIZE(Y1)
  1607. LFD y14, 13 * SIZE(Y1)
  1608. LFD y15, 14 * SIZE(Y1)
  1609. LFD y16, 15 * SIZE(Y1)
  /* Move both y cursors on by one block and repeat while CTR != 0. */
  1610. addi Y1, Y1, 16 * SIZE
  1611. addi Y2, Y2, 16 * SIZE
  1612. DCBT(Y1, PREC)
  1613. bdnz LL(32)
  /*
   * LL(33): pipeline drain of the one-column case.
   * Performs the same arithmetic and stores as one LL(32) iteration on
   * the block already held in y01..y16 / a1..a8, but does not load any
   * y or column data for a further block.
   */
  1614. .align 4
  1615. LL(33):
  /* Pairs 0-3: contribution of the first value of each column pair. */
  1616. FMADD y01, alpha1r, a1, y01
  1617. FMADD y02, alpha1i, a1, y02
  1618. FMADD y03, alpha1r, a3, y03
  1619. FMADD y04, alpha1i, a3, y04
  1620. FMADD y05, alpha1r, a5, y05
  1621. FMADD y06, alpha1i, a5, y06
  1622. FMADD y07, alpha1r, a7, y07
  1623. FMADD y08, alpha1i, a7, y08
  /* Second half of the current block (AO1 has not been advanced yet). */
  1624. LFD a1, 8 * SIZE(AO1)
  1625. LFD a3, 10 * SIZE(AO1)
  1626. LFD a5, 12 * SIZE(AO1)
  1627. LFD a7, 14 * SIZE(AO1)
  1628. FMSUBX y01, alpha1i, a2, y01
  1629. FMADDX y02, alpha1r, a2, y02
  1630. FMSUBX y03, alpha1i, a4, y03
  1631. FMADDX y04, alpha1r, a4, y04
  1632. STFD y01, 0 * SIZE(Y2)
  1633. STFD y02, 1 * SIZE(Y2)
  1634. STFD y03, 2 * SIZE(Y2)
  1635. STFD y04, 3 * SIZE(Y2)
  1636. FMSUBX y05, alpha1i, a6, y05
  1637. FMADDX y06, alpha1r, a6, y06
  1638. FMSUBX y07, alpha1i, a8, y07
  1639. FMADDX y08, alpha1r, a8, y08
  1640. LFD a2, 9 * SIZE(AO1)
  1641. LFD a4, 11 * SIZE(AO1)
  1642. LFD a6, 13 * SIZE(AO1)
  1643. LFD a8, 15 * SIZE(AO1)
  1644. STFD y05, 4 * SIZE(Y2)
  1645. STFD y06, 5 * SIZE(Y2)
  1646. STFD y07, 6 * SIZE(Y2)
  1647. STFD y08, 7 * SIZE(Y2)
  /* Pairs 4-7. */
  1648. FMADD y09, alpha1r, a1, y09
  1649. FMADD y10, alpha1i, a1, y10
  1650. FMADD y11, alpha1r, a3, y11
  1651. FMADD y12, alpha1i, a3, y12
  1652. FMADD y13, alpha1r, a5, y13
  1653. FMADD y14, alpha1i, a5, y14
  1654. FMADD y15, alpha1r, a7, y15
  1655. FMADD y16, alpha1i, a7, y16
  1656. FMSUBX y09, alpha1i, a2, y09
  1657. FMADDX y10, alpha1r, a2, y10
  1658. FMSUBX y11, alpha1i, a4, y11
  1659. FMADDX y12, alpha1r, a4, y12
  1660. STFD y09, 8 * SIZE(Y2)
  1661. STFD y10, 9 * SIZE(Y2)
  1662. STFD y11, 10 * SIZE(Y2)
  1663. STFD y12, 11 * SIZE(Y2)
  1664. FMSUBX y13, alpha1i, a6, y13
  1665. FMADDX y14, alpha1r, a6, y14
  1666. FMSUBX y15, alpha1i, a8, y15
  1667. FMADDX y16, alpha1r, a8, y16
  1668. STFD y13, 12 * SIZE(Y2)
  1669. STFD y14, 13 * SIZE(Y2)
  1670. STFD y15, 14 * SIZE(Y2)
  1671. STFD y16, 15 * SIZE(Y2)
  /* Only AO1 and Y2 are stepped: Y1 was moved past this block when its
     y values were loaded, so all three cursors now agree. */
  1672. addi AO1, AO1, 16 * SIZE
  1673. addi Y2, Y2, 16 * SIZE
  /*
   * LL(35): row remainder of the one-column case.
   * Leaves for LL(999) when M & 7 is zero; otherwise, when bit 2 of M
   * is set, updates 4 pairs (8 values) of y without any pipelining.
   */
  1674. .align 4
  1675. LL(35):
  /* The masked results are never negative, so each ble is taken
     exactly when the tested bits are all zero. */
  1676. andi. r0, M, 7
  1677. ble LL(999)
  1678. andi. r0, M, 4
  1679. ble LL(36)
  1680. LFD y01, 0 * SIZE(Y1)
  1681. LFD y02, 1 * SIZE(Y1)
  1682. LFD y03, 2 * SIZE(Y1)
  1683. LFD y04, 3 * SIZE(Y1)
  1684. LFD a1, 0 * SIZE(AO1)
  1685. LFD a3, 2 * SIZE(AO1)
  1686. LFD a5, 4 * SIZE(AO1)
  1687. LFD a7, 6 * SIZE(AO1)
  1688. LFD y05, 4 * SIZE(Y1)
  1689. LFD y06, 5 * SIZE(Y1)
  1690. LFD y07, 6 * SIZE(Y1)
  1691. LFD y08, 7 * SIZE(Y1)
  1692. LFD a2, 1 * SIZE(AO1)
  1693. LFD a4, 3 * SIZE(AO1)
  1694. LFD a6, 5 * SIZE(AO1)
  1695. LFD a8, 7 * SIZE(AO1)
  /* Contribution of the first value of each column pair. */
  1696. FMADD y01, alpha1r, a1, y01
  1697. FMADD y02, alpha1i, a1, y02
  1698. FMADD y03, alpha1r, a3, y03
  1699. FMADD y04, alpha1i, a3, y04
  1700. FMADD y05, alpha1r, a5, y05
  1701. FMADD y06, alpha1i, a5, y06
  1702. FMADD y07, alpha1r, a7, y07
  1703. FMADD y08, alpha1i, a7, y08
  /* Contribution of the second value (FMSUBX/FMADDX are macros defined
     outside this chunk). */
  1704. FMSUBX y01, alpha1i, a2, y01
  1705. FMADDX y02, alpha1r, a2, y02
  1706. FMSUBX y03, alpha1i, a4, y03
  1707. FMADDX y04, alpha1r, a4, y04
  1708. FMSUBX y05, alpha1i, a6, y05
  1709. FMADDX y06, alpha1r, a6, y06
  1710. FMSUBX y07, alpha1i, a8, y07
  1711. FMADDX y08, alpha1r, a8, y08
  1712. STFD y01, 0 * SIZE(Y2)
  1713. STFD y02, 1 * SIZE(Y2)
  1714. STFD y03, 2 * SIZE(Y2)
  1715. STFD y04, 3 * SIZE(Y2)
  1716. STFD y05, 4 * SIZE(Y2)
  1717. STFD y06, 5 * SIZE(Y2)
  1718. STFD y07, 6 * SIZE(Y2)
  1719. STFD y08, 7 * SIZE(Y2)
  /* Step all three cursors past the 4 pairs just handled. */
  1720. addi AO1, AO1, 8 * SIZE
  1721. addi Y1, Y1, 8 * SIZE
  1722. addi Y2, Y2, 8 * SIZE
  /*
   * LL(36): one-column row remainder, bit 1 of M.
   * When the bit is set, updates 2 pairs (4 values) of y; otherwise
   * falls through to LL(37) via the branch.
   */
  1723. .align 4
  1724. LL(36):
  1725. andi. r0, M, 2
  1726. ble LL(37)
  1727. LFD a1, 0 * SIZE(AO1)
  1728. LFD a2, 1 * SIZE(AO1)
  1729. LFD a3, 2 * SIZE(AO1)
  1730. LFD a4, 3 * SIZE(AO1)
  1731. LFD y01, 0 * SIZE(Y1)
  1732. LFD y02, 1 * SIZE(Y1)
  1733. LFD y03, 2 * SIZE(Y1)
  1734. LFD y04, 3 * SIZE(Y1)
  /* First-value contribution, then second-value contribution. */
  1735. FMADD y01, alpha1r, a1, y01
  1736. FMADD y02, alpha1i, a1, y02
  1737. FMADD y03, alpha1r, a3, y03
  1738. FMADD y04, alpha1i, a3, y04
  1739. FMSUBX y01, alpha1i, a2, y01
  1740. FMADDX y02, alpha1r, a2, y02
  1741. FMSUBX y03, alpha1i, a4, y03
  1742. FMADDX y04, alpha1r, a4, y04
  1743. STFD y01, 0 * SIZE(Y2)
  1744. STFD y02, 1 * SIZE(Y2)
  1745. STFD y03, 2 * SIZE(Y2)
  1746. STFD y04, 3 * SIZE(Y2)
  /* Step all three cursors past the 2 pairs just handled. */
  1747. addi AO1, AO1, 4 * SIZE
  1748. addi Y1, Y1, 4 * SIZE
  1749. addi Y2, Y2, 4 * SIZE
  /*
   * LL(37): one-column row remainder, bit 0 of M.
   * When the bit is set, updates the final pair of y.  Either way
   * control ends up at LL(999), which is defined outside this chunk
   * (presumably the common exit - confirm).
   */
  1750. .align 4
  1751. LL(37):
  1752. andi. r0, M, 1
  1753. ble LL(999)
  1754. LFD y01, 0 * SIZE(Y1)
  1755. LFD y02, 1 * SIZE(Y1)
  1756. LFD a1, 0 * SIZE(AO1)
  1757. LFD a2, 1 * SIZE(AO1)
  1758. FMADD y01, alpha1r, a1, y01
  1759. FMADD y02, alpha1i, a1, y02
  1760. FMSUBX y01, alpha1i, a2, y01
  1761. FMADDX y02, alpha1r, a2, y02
  1762. STFD y01, 0 * SIZE(Y2)
  1763. STFD y02, 1 * SIZE(Y2)
  /* Unlike the blocks above, the cursors are stepped by INCY here. */
  1764. add Y1, Y1, INCY
  1765. add Y2, Y2, INCY
  1766. b LL(999)
  /*
   * LL(100)/LL(111): code path that steps through y by INCY after every
   * pair instead of using fixed offsets.
   * NOTE(review): presumably the general-stride path; the dispatch to
   * LL(100) is outside this chunk - confirm.
   * LL(100) sets J = N >> 2 (number of four-column groups) and skips to
   * LL(120) when there is none.  LL(111) is the top of the per-group
   * loop that is closed at LL(119).
   */
  1767. .align 4
  1768. LL(100):
  1769. srawi. J, N, 2
  1770. ble LL(120)
  1771. .align 4
  1772. LL(111):
  1773. lfd alpha_r, ALPHA_R
  1774. lfd alpha_i, ALPHA_I
  /* Read four pairs from X, stepping X by INCX after each one. */
  1775. LFD a1, 0 * SIZE(X)
  1776. LFD a2, 1 * SIZE(X)
  1777. add X, X, INCX
  1778. LFD a3, 0 * SIZE(X)
  1779. LFD a4, 1 * SIZE(X)
  1780. add X, X, INCX
  1781. LFD a5, 0 * SIZE(X)
  1782. LFD a6, 1 * SIZE(X)
  1783. add X, X, INCX
  1784. LFD a7, 0 * SIZE(X)
  1785. LFD a8, 1 * SIZE(X)
  1786. add X, X, INCX
  /* Fold alpha and each x pair into one multiplier pair per column
     (alpha1..alpha4).  FMSUBR/FMADDR are macros defined outside this
     chunk; presumably they pick the sign needed by conjugated
     variants - confirm at their definition. */
  1787. FMUL alpha1r, alpha_r, a1
  1788. FMUL alpha1i, alpha_i, a1
  1789. FMUL alpha2r, alpha_r, a3
  1790. FMUL alpha2i, alpha_i, a3
  1791. FMUL alpha3r, alpha_r, a5
  1792. FMUL alpha3i, alpha_i, a5
  1793. FMUL alpha4r, alpha_r, a7
  1794. FMUL alpha4i, alpha_i, a7
  1795. FMSUBR alpha1r, alpha_i, a2, alpha1r
  1796. FMADDR alpha1i, alpha_r, a2, alpha1i
  1797. FMSUBR alpha2r, alpha_i, a4, alpha2r
  1798. FMADDR alpha2i, alpha_r, a4, alpha2i
  1799. FMSUBR alpha3r, alpha_i, a6, alpha3r
  1800. FMADDR alpha3i, alpha_r, a6, alpha3i
  1801. FMSUBR alpha4r, alpha_i, a8, alpha4r
  1802. FMADDR alpha4i, alpha_r, a8, alpha4i
  /* AO1..AO4 address four consecutive columns, LDA apart; A is left
     pointing at the column after them. */
  1803. mr AO1, A
  1804. add AO2, A, LDA
  1805. add AO3, AO2, LDA
  1806. add AO4, AO3, LDA
  1807. add A, AO4, LDA
  /* Y1 is the read cursor and Y2 the write cursor; both start at Y. */
  1808. mr Y1, Y
  1809. mr Y2, Y
  /* CTR = M >> 3: number of full blocks of 8 pairs.  mtspr does not
     touch CR0, so ble still tests the srawi. result. */
  1810. srawi. r0, M, 3
  1811. mtspr CTR, r0
  1812. ble LL(115)
  1813. .align 4
  /* Pipeline preload: 8 pairs of y, stepping Y1 by INCY per pair, so
     Y1 ends up one block ahead of Y2. */
  1814. LFD y01, 0 * SIZE(Y1)
  1815. LFD y02, 1 * SIZE(Y1)
  1816. add Y1, Y1, INCY
  1817. LFD y03, 0 * SIZE(Y1)
  1818. LFD y04, 1 * SIZE(Y1)
  1819. add Y1, Y1, INCY
  1820. LFD y05, 0 * SIZE(Y1)
  1821. LFD y06, 1 * SIZE(Y1)
  1822. add Y1, Y1, INCY
  1823. LFD y07, 0 * SIZE(Y1)
  1824. LFD y08, 1 * SIZE(Y1)
  1825. add Y1, Y1, INCY
  1826. LFD y09, 0 * SIZE(Y1)
  1827. LFD y10, 1 * SIZE(Y1)
  1828. add Y1, Y1, INCY
  1829. LFD y11, 0 * SIZE(Y1)
  1830. LFD y12, 1 * SIZE(Y1)
  1831. add Y1, Y1, INCY
  1832. LFD y13, 0 * SIZE(Y1)
  1833. LFD y14, 1 * SIZE(Y1)
  1834. add Y1, Y1, INCY
  1835. LFD y15, 0 * SIZE(Y1)
  1836. LFD y16, 1 * SIZE(Y1)
  1837. add Y1, Y1, INCY
  /* First 4 pairs of the first column. */
  1838. LFD a1, 0 * SIZE(AO1)
  1839. LFD a2, 1 * SIZE(AO1)
  1840. LFD a3, 2 * SIZE(AO1)
  1841. LFD a4, 3 * SIZE(AO1)
  1842. LFD a5, 4 * SIZE(AO1)
  1843. LFD a6, 5 * SIZE(AO1)
  1844. LFD a7, 6 * SIZE(AO1)
  1845. LFD a8, 7 * SIZE(AO1)
  /* bdz decrements CTR; with a single full block, skip the loop and
     go directly to the drain code. */
  1846. bdz LL(113)
  /*
   * LL(112): steady-state loop of the four-column, INCY-stepped case.
   * Each iteration adds the contributions of the columns at AO1..AO4
   * (scaled by alpha1..alpha4) to the 8 pairs held in y01..y16, stores
   * them through Y2 and reloads the next 8 pairs through Y1.
   * a1..a8 are recycled continuously: as soon as one half of them has
   * been consumed it is reloaded with the data needed next.  The order
   * of instructions is part of the design and should not be rearranged.
   * FMSUBX/FMADDX are macros defined outside this chunk; presumably
   * they select the sign for conjugated variants - confirm there.
   */
  1847. .align 4
  1848. LL(112):
  /* ---- column 1 (AO1, alpha1), pairs 0-3 ---- */
  1849. FMADD y01, alpha1r, a1, y01
  1850. FMADD y02, alpha1i, a1, y02
  1851. FMADD y03, alpha1r, a3, y03
  1852. FMADD y04, alpha1i, a3, y04
  1853. FMADD y05, alpha1r, a5, y05
  1854. FMADD y06, alpha1i, a5, y06
  1855. FMADD y07, alpha1r, a7, y07
  1856. FMADD y08, alpha1i, a7, y08
  1857. LFD a1, 8 * SIZE(AO1)
  1858. LFD a3, 10 * SIZE(AO1)
  1859. LFD a5, 12 * SIZE(AO1)
  1860. LFD a7, 14 * SIZE(AO1)
  1861. FMSUBX y01, alpha1i, a2, y01
  1862. FMADDX y02, alpha1r, a2, y02
  1863. FMSUBX y03, alpha1i, a4, y03
  1864. FMADDX y04, alpha1r, a4, y04
  1865. FMSUBX y05, alpha1i, a6, y05
  1866. FMADDX y06, alpha1r, a6, y06
  1867. FMSUBX y07, alpha1i, a8, y07
  1868. FMADDX y08, alpha1r, a8, y08
  1869. LFD a2, 9 * SIZE(AO1)
  1870. LFD a4, 11 * SIZE(AO1)
  1871. LFD a6, 13 * SIZE(AO1)
  1872. LFD a8, 15 * SIZE(AO1)
  /* All 16 values of this block of column 1 are in flight: advance AO1.
     DCBT is a macro defined outside this chunk, presumably a cache
     prefetch hint; the nops are presumably scheduling padding. */
  1873. addi AO1, AO1, 16 * SIZE
  1874. nop
  1875. DCBT(AO1, PREA)
  1876. nop
  /* ---- column 1, pairs 4-7 ---- */
  1877. FMADD y09, alpha1r, a1, y09
  1878. FMADD y10, alpha1i, a1, y10
  1879. FMADD y11, alpha1r, a3, y11
  1880. FMADD y12, alpha1i, a3, y12
  1881. FMADD y13, alpha1r, a5, y13
  1882. FMADD y14, alpha1i, a5, y14
  1883. FMADD y15, alpha1r, a7, y15
  1884. FMADD y16, alpha1i, a7, y16
  1885. LFD a1, 0 * SIZE(AO2)
  1886. LFD a3, 2 * SIZE(AO2)
  1887. LFD a5, 4 * SIZE(AO2)
  1888. LFD a7, 6 * SIZE(AO2)
  1889. FMSUBX y09, alpha1i, a2, y09
  1890. FMADDX y10, alpha1r, a2, y10
  1891. FMSUBX y11, alpha1i, a4, y11
  1892. FMADDX y12, alpha1r, a4, y12
  1893. FMSUBX y13, alpha1i, a6, y13
  1894. FMADDX y14, alpha1r, a6, y14
  1895. FMSUBX y15, alpha1i, a8, y15
  1896. FMADDX y16, alpha1r, a8, y16
  1897. LFD a2, 1 * SIZE(AO2)
  1898. LFD a4, 3 * SIZE(AO2)
  1899. LFD a6, 5 * SIZE(AO2)
  1900. LFD a8, 7 * SIZE(AO2)
  /* ---- column 2 (AO2, alpha2), pairs 0-3 ---- */
  1901. FMADD y01, alpha2r, a1, y01
  1902. FMADD y02, alpha2i, a1, y02
  1903. FMADD y03, alpha2r, a3, y03
  1904. FMADD y04, alpha2i, a3, y04
  1905. FMADD y05, alpha2r, a5, y05
  1906. FMADD y06, alpha2i, a5, y06
  1907. FMADD y07, alpha2r, a7, y07
  1908. FMADD y08, alpha2i, a7, y08
  1909. LFD a1, 8 * SIZE(AO2)
  1910. LFD a3, 10 * SIZE(AO2)
  1911. LFD a5, 12 * SIZE(AO2)
  1912. LFD a7, 14 * SIZE(AO2)
  1913. FMSUBX y01, alpha2i, a2, y01
  1914. FMADDX y02, alpha2r, a2, y02
  1915. FMSUBX y03, alpha2i, a4, y03
  1916. FMADDX y04, alpha2r, a4, y04
  1917. FMSUBX y05, alpha2i, a6, y05
  1918. FMADDX y06, alpha2r, a6, y06
  1919. FMSUBX y07, alpha2i, a8, y07
  1920. FMADDX y08, alpha2r, a8, y08
  1921. LFD a2, 9 * SIZE(AO2)
  1922. LFD a4, 11 * SIZE(AO2)
  1923. LFD a6, 13 * SIZE(AO2)
  1924. LFD a8, 15 * SIZE(AO2)
  1925. addi AO2, AO2, 16 * SIZE
  1926. nop
  1927. DCBT(AO2, PREA)
  1928. nop
  /* ---- column 2, pairs 4-7 ---- */
  1929. FMADD y09, alpha2r, a1, y09
  1930. FMADD y10, alpha2i, a1, y10
  1931. FMADD y11, alpha2r, a3, y11
  1932. FMADD y12, alpha2i, a3, y12
  1933. FMADD y13, alpha2r, a5, y13
  1934. FMADD y14, alpha2i, a5, y14
  1935. FMADD y15, alpha2r, a7, y15
  1936. FMADD y16, alpha2i, a7, y16
  1937. LFD a1, 0 * SIZE(AO3)
  1938. LFD a3, 2 * SIZE(AO3)
  1939. LFD a5, 4 * SIZE(AO3)
  1940. LFD a7, 6 * SIZE(AO3)
  1941. FMSUBX y09, alpha2i, a2, y09
  1942. FMADDX y10, alpha2r, a2, y10
  1943. FMSUBX y11, alpha2i, a4, y11
  1944. FMADDX y12, alpha2r, a4, y12
  1945. FMSUBX y13, alpha2i, a6, y13
  1946. FMADDX y14, alpha2r, a6, y14
  1947. FMSUBX y15, alpha2i, a8, y15
  1948. FMADDX y16, alpha2r, a8, y16
  1949. LFD a2, 1 * SIZE(AO3)
  1950. LFD a4, 3 * SIZE(AO3)
  1951. LFD a6, 5 * SIZE(AO3)
  1952. LFD a8, 7 * SIZE(AO3)
  /* ---- column 3 (AO3, alpha3), pairs 0-3 ---- */
  1953. FMADD y01, alpha3r, a1, y01
  1954. FMADD y02, alpha3i, a1, y02
  1955. FMADD y03, alpha3r, a3, y03
  1956. FMADD y04, alpha3i, a3, y04
  1957. FMADD y05, alpha3r, a5, y05
  1958. FMADD y06, alpha3i, a5, y06
  1959. FMADD y07, alpha3r, a7, y07
  1960. FMADD y08, alpha3i, a7, y08
  1961. LFD a1, 8 * SIZE(AO3)
  1962. LFD a3, 10 * SIZE(AO3)
  1963. LFD a5, 12 * SIZE(AO3)
  1964. LFD a7, 14 * SIZE(AO3)
  1965. FMSUBX y01, alpha3i, a2, y01
  1966. FMADDX y02, alpha3r, a2, y02
  1967. FMSUBX y03, alpha3i, a4, y03
  1968. FMADDX y04, alpha3r, a4, y04
  1969. FMSUBX y05, alpha3i, a6, y05
  1970. FMADDX y06, alpha3r, a6, y06
  1971. FMSUBX y07, alpha3i, a8, y07
  1972. FMADDX y08, alpha3r, a8, y08
  1973. LFD a2, 9 * SIZE(AO3)
  1974. LFD a4, 11 * SIZE(AO3)
  1975. LFD a6, 13 * SIZE(AO3)
  1976. LFD a8, 15 * SIZE(AO3)
  1977. addi AO3, AO3, 16 * SIZE
  1978. nop
  1979. DCBT(AO3, PREA)
  1980. nop
  /* ---- column 3, pairs 4-7 ---- */
  1981. FMADD y09, alpha3r, a1, y09
  1982. FMADD y10, alpha3i, a1, y10
  1983. FMADD y11, alpha3r, a3, y11
  1984. FMADD y12, alpha3i, a3, y12
  1985. FMADD y13, alpha3r, a5, y13
  1986. FMADD y14, alpha3i, a5, y14
  1987. FMADD y15, alpha3r, a7, y15
  1988. FMADD y16, alpha3i, a7, y16
  1989. LFD a1, 0 * SIZE(AO4)
  1990. LFD a3, 2 * SIZE(AO4)
  1991. LFD a5, 4 * SIZE(AO4)
  1992. LFD a7, 6 * SIZE(AO4)
  1993. FMSUBX y09, alpha3i, a2, y09
  1994. FMADDX y10, alpha3r, a2, y10
  1995. FMSUBX y11, alpha3i, a4, y11
  1996. FMADDX y12, alpha3r, a4, y12
  1997. FMSUBX y13, alpha3i, a6, y13
  1998. FMADDX y14, alpha3r, a6, y14
  1999. FMSUBX y15, alpha3i, a8, y15
  2000. FMADDX y16, alpha3r, a8, y16
  2001. LFD a2, 1 * SIZE(AO4)
  2002. LFD a4, 3 * SIZE(AO4)
  2003. LFD a6, 5 * SIZE(AO4)
  2004. LFD a8, 7 * SIZE(AO4)
  /* ---- column 4 (AO4, alpha4), pairs 0-3; results become final and
     are written back as they complete ---- */
  2005. FMADD y01, alpha4r, a1, y01
  2006. FMADD y02, alpha4i, a1, y02
  2007. FMADD y03, alpha4r, a3, y03
  2008. FMADD y04, alpha4i, a3, y04
  2009. FMADD y05, alpha4r, a5, y05
  2010. FMADD y06, alpha4i, a5, y06
  2011. FMADD y07, alpha4r, a7, y07
  2012. FMADD y08, alpha4i, a7, y08
  2013. LFD a1, 8 * SIZE(AO4)
  2014. LFD a3, 10 * SIZE(AO4)
  2015. LFD a5, 12 * SIZE(AO4)
  2016. LFD a7, 14 * SIZE(AO4)
  2017. FMSUBX y01, alpha4i, a2, y01
  2018. FMADDX y02, alpha4r, a2, y02
  2019. FMSUBX y03, alpha4i, a4, y03
  2020. FMADDX y04, alpha4r, a4, y04
  /* Store a finished pair through Y2, then refill the same registers
     with the matching pair of the next block through Y1 (which runs
     one block ahead).  Each cursor is stepped by INCY per pair. */
  2021. STFD y01, 0 * SIZE(Y2)
  2022. nop
  2023. STFD y02, 1 * SIZE(Y2)
  2024. add Y2, Y2, INCY
  2025. LFD y01, 0 * SIZE(Y1)
  2026. nop
  2027. LFD y02, 1 * SIZE(Y1)
  2028. add Y1, Y1, INCY
  2029. STFD y03, 0 * SIZE(Y2)
  2030. nop
  2031. STFD y04, 1 * SIZE(Y2)
  2032. add Y2, Y2, INCY
  2033. LFD y03, 0 * SIZE(Y1)
  2034. nop
  2035. LFD y04, 1 * SIZE(Y1)
  2036. add Y1, Y1, INCY
  2037. FMSUBX y05, alpha4i, a6, y05
  2038. FMADDX y06, alpha4r, a6, y06
  2039. FMSUBX y07, alpha4i, a8, y07
  2040. FMADDX y08, alpha4r, a8, y08
  2041. LFD a2, 9 * SIZE(AO4)
  2042. LFD a4, 11 * SIZE(AO4)
  2043. LFD a6, 13 * SIZE(AO4)
  2044. LFD a8, 15 * SIZE(AO4)
  2045. addi AO4, AO4, 16 * SIZE
  2046. nop
  2047. DCBT(AO4, PREA)
  2048. nop
  2049. STFD y05, 0 * SIZE(Y2)
  2050. nop
  2051. STFD y06, 1 * SIZE(Y2)
  2052. add Y2, Y2, INCY
  2053. LFD y05, 0 * SIZE(Y1)
  2054. nop
  2055. LFD y06, 1 * SIZE(Y1)
  2056. add Y1, Y1, INCY
  2057. STFD y07, 0 * SIZE(Y2)
  2058. nop
  2059. STFD y08, 1 * SIZE(Y2)
  2060. add Y2, Y2, INCY
  2061. LFD y07, 0 * SIZE(Y1)
  2062. nop
  2063. LFD y08, 1 * SIZE(Y1)
  2064. add Y1, Y1, INCY
  /* ---- column 4, pairs 4-7 ---- */
  2065. FMADD y09, alpha4r, a1, y09
  2066. FMADD y10, alpha4i, a1, y10
  2067. FMADD y11, alpha4r, a3, y11
  2068. FMADD y12, alpha4i, a3, y12
  2069. FMADD y13, alpha4r, a5, y13
  2070. FMADD y14, alpha4i, a5, y14
  2071. FMADD y15, alpha4r, a7, y15
  2072. FMADD y16, alpha4i, a7, y16
  /* AO1 was advanced earlier in this iteration, so offsets 0..7 now
     address column 1 of the next block: start loading it. */
  2073. LFD a1, 0 * SIZE(AO1)
  2074. LFD a3, 2 * SIZE(AO1)
  2075. LFD a5, 4 * SIZE(AO1)
  2076. LFD a7, 6 * SIZE(AO1)
  2077. FMSUBX y09, alpha4i, a2, y09
  2078. FMADDX y10, alpha4r, a2, y10
  2079. FMSUBX y11, alpha4i, a4, y11
  2080. FMADDX y12, alpha4r, a4, y12
  2081. STFD y09, 0 * SIZE(Y2)
  2082. nop
  2083. STFD y10, 1 * SIZE(Y2)
  2084. add Y2, Y2, INCY
  2085. LFD y09, 0 * SIZE(Y1)
  2086. nop
  2087. LFD y10, 1 * SIZE(Y1)
  2088. add Y1, Y1, INCY
  2089. STFD y11, 0 * SIZE(Y2)
  2090. nop
  2091. STFD y12, 1 * SIZE(Y2)
  2092. add Y2, Y2, INCY
  2093. LFD y11, 0 * SIZE(Y1)
  2094. nop
  2095. LFD y12, 1 * SIZE(Y1)
  2096. add Y1, Y1, INCY
  2097. FMSUBX y13, alpha4i, a6, y13
  2098. FMADDX y14, alpha4r, a6, y14
  2099. FMSUBX y15, alpha4i, a8, y15
  2100. FMADDX y16, alpha4r, a8, y16
  2101. LFD a2, 1 * SIZE(AO1)
  2102. LFD a4, 3 * SIZE(AO1)
  2103. LFD a6, 5 * SIZE(AO1)
  2104. LFD a8, 7 * SIZE(AO1)
  2105. STFD y13, 0 * SIZE(Y2)
  2106. nop
  2107. STFD y14, 1 * SIZE(Y2)
  2108. add Y2, Y2, INCY
  2109. LFD y13, 0 * SIZE(Y1)
  2110. nop
  2111. LFD y14, 1 * SIZE(Y1)
  2112. add Y1, Y1, INCY
  2113. STFD y15, 0 * SIZE(Y2)
  2114. nop
  2115. STFD y16, 1 * SIZE(Y2)
  2116. add Y2, Y2, INCY
  2117. LFD y15, 0 * SIZE(Y1)
  2118. nop
  2119. LFD y16, 1 * SIZE(Y1)
  2120. add Y1, Y1, INCY
  /* Repeat while CTR != 0 (bdnz decrements CTR first). */
  2121. DCBT(Y1, PREC)
  2122. bdnz LL(112)
  /*
   * LL(113): pipeline drain of the four-column, INCY-stepped case.
   * Same arithmetic and stores as one LL(112) iteration on the block
   * already held in y01..y16 / a1..a8, but no y values and no column
   * data are fetched for a further block.  AO1..AO4 are not moved while
   * the block is processed; they are all advanced once at the end.
   */
  2123. .align 4
  2124. LL(113):
  /* ---- column 1 (AO1, alpha1), pairs 0-3 ---- */
  2125. FMADD y01, alpha1r, a1, y01
  2126. FMADD y02, alpha1i, a1, y02
  2127. FMADD y03, alpha1r, a3, y03
  2128. FMADD y04, alpha1i, a3, y04
  2129. FMADD y05, alpha1r, a5, y05
  2130. FMADD y06, alpha1i, a5, y06
  2131. FMADD y07, alpha1r, a7, y07
  2132. FMADD y08, alpha1i, a7, y08
  2133. LFD a1, 8 * SIZE(AO1)
  2134. LFD a3, 10 * SIZE(AO1)
  2135. LFD a5, 12 * SIZE(AO1)
  2136. LFD a7, 14 * SIZE(AO1)
  2137. FMSUBX y01, alpha1i, a2, y01
  2138. FMADDX y02, alpha1r, a2, y02
  2139. FMSUBX y03, alpha1i, a4, y03
  2140. FMADDX y04, alpha1r, a4, y04
  2141. FMSUBX y05, alpha1i, a6, y05
  2142. FMADDX y06, alpha1r, a6, y06
  2143. FMSUBX y07, alpha1i, a8, y07
  2144. FMADDX y08, alpha1r, a8, y08
  2145. LFD a2, 9 * SIZE(AO1)
  2146. LFD a4, 11 * SIZE(AO1)
  2147. LFD a6, 13 * SIZE(AO1)
  2148. LFD a8, 15 * SIZE(AO1)
  /* ---- column 1, pairs 4-7 ---- */
  2149. FMADD y09, alpha1r, a1, y09
  2150. FMADD y10, alpha1i, a1, y10
  2151. FMADD y11, alpha1r, a3, y11
  2152. FMADD y12, alpha1i, a3, y12
  2153. FMADD y13, alpha1r, a5, y13
  2154. FMADD y14, alpha1i, a5, y14
  2155. FMADD y15, alpha1r, a7, y15
  2156. FMADD y16, alpha1i, a7, y16
  2157. LFD a1, 0 * SIZE(AO2)
  2158. LFD a3, 2 * SIZE(AO2)
  2159. LFD a5, 4 * SIZE(AO2)
  2160. LFD a7, 6 * SIZE(AO2)
  2161. FMSUBX y09, alpha1i, a2, y09
  2162. FMADDX y10, alpha1r, a2, y10
  2163. FMSUBX y11, alpha1i, a4, y11
  2164. FMADDX y12, alpha1r, a4, y12
  2165. FMSUBX y13, alpha1i, a6, y13
  2166. FMADDX y14, alpha1r, a6, y14
  2167. FMSUBX y15, alpha1i, a8, y15
  2168. FMADDX y16, alpha1r, a8, y16
  2169. LFD a2, 1 * SIZE(AO2)
  2170. LFD a4, 3 * SIZE(AO2)
  2171. LFD a6, 5 * SIZE(AO2)
  2172. LFD a8, 7 * SIZE(AO2)
  /* ---- column 2 (AO2, alpha2), pairs 0-3 ---- */
  2173. FMADD y01, alpha2r, a1, y01
  2174. FMADD y02, alpha2i, a1, y02
  2175. FMADD y03, alpha2r, a3, y03
  2176. FMADD y04, alpha2i, a3, y04
  2177. FMADD y05, alpha2r, a5, y05
  2178. FMADD y06, alpha2i, a5, y06
  2179. FMADD y07, alpha2r, a7, y07
  2180. FMADD y08, alpha2i, a7, y08
  2181. LFD a1, 8 * SIZE(AO2)
  2182. LFD a3, 10 * SIZE(AO2)
  2183. LFD a5, 12 * SIZE(AO2)
  2184. LFD a7, 14 * SIZE(AO2)
  2185. FMSUBX y01, alpha2i, a2, y01
  2186. FMADDX y02, alpha2r, a2, y02
  2187. FMSUBX y03, alpha2i, a4, y03
  2188. FMADDX y04, alpha2r, a4, y04
  2189. FMSUBX y05, alpha2i, a6, y05
  2190. FMADDX y06, alpha2r, a6, y06
  2191. FMSUBX y07, alpha2i, a8, y07
  2192. FMADDX y08, alpha2r, a8, y08
  2193. LFD a2, 9 * SIZE(AO2)
  2194. LFD a4, 11 * SIZE(AO2)
  2195. LFD a6, 13 * SIZE(AO2)
  2196. LFD a8, 15 * SIZE(AO2)
  /* ---- column 2, pairs 4-7 ---- */
  2197. FMADD y09, alpha2r, a1, y09
  2198. FMADD y10, alpha2i, a1, y10
  2199. FMADD y11, alpha2r, a3, y11
  2200. FMADD y12, alpha2i, a3, y12
  2201. FMADD y13, alpha2r, a5, y13
  2202. FMADD y14, alpha2i, a5, y14
  2203. FMADD y15, alpha2r, a7, y15
  2204. FMADD y16, alpha2i, a7, y16
  2205. LFD a1, 0 * SIZE(AO3)
  2206. LFD a3, 2 * SIZE(AO3)
  2207. LFD a5, 4 * SIZE(AO3)
  2208. LFD a7, 6 * SIZE(AO3)
  2209. FMSUBX y09, alpha2i, a2, y09
  2210. FMADDX y10, alpha2r, a2, y10
  2211. FMSUBX y11, alpha2i, a4, y11
  2212. FMADDX y12, alpha2r, a4, y12
  2213. FMSUBX y13, alpha2i, a6, y13
  2214. FMADDX y14, alpha2r, a6, y14
  2215. FMSUBX y15, alpha2i, a8, y15
  2216. FMADDX y16, alpha2r, a8, y16
  2217. LFD a2, 1 * SIZE(AO3)
  2218. LFD a4, 3 * SIZE(AO3)
  2219. LFD a6, 5 * SIZE(AO3)
  2220. LFD a8, 7 * SIZE(AO3)
  /* ---- column 3 (AO3, alpha3), pairs 0-3 ---- */
  2221. FMADD y01, alpha3r, a1, y01
  2222. FMADD y02, alpha3i, a1, y02
  2223. FMADD y03, alpha3r, a3, y03
  2224. FMADD y04, alpha3i, a3, y04
  2225. FMADD y05, alpha3r, a5, y05
  2226. FMADD y06, alpha3i, a5, y06
  2227. FMADD y07, alpha3r, a7, y07
  2228. FMADD y08, alpha3i, a7, y08
  2229. LFD a1, 8 * SIZE(AO3)
  2230. LFD a3, 10 * SIZE(AO3)
  2231. LFD a5, 12 * SIZE(AO3)
  2232. LFD a7, 14 * SIZE(AO3)
  2233. FMSUBX y01, alpha3i, a2, y01
  2234. FMADDX y02, alpha3r, a2, y02
  2235. FMSUBX y03, alpha3i, a4, y03
  2236. FMADDX y04, alpha3r, a4, y04
  2237. FMSUBX y05, alpha3i, a6, y05
  2238. FMADDX y06, alpha3r, a6, y06
  2239. FMSUBX y07, alpha3i, a8, y07
  2240. FMADDX y08, alpha3r, a8, y08
  2241. LFD a2, 9 * SIZE(AO3)
  2242. LFD a4, 11 * SIZE(AO3)
  2243. LFD a6, 13 * SIZE(AO3)
  2244. LFD a8, 15 * SIZE(AO3)
  /* ---- column 3, pairs 4-7 ---- */
  2245. FMADD y09, alpha3r, a1, y09
  2246. FMADD y10, alpha3i, a1, y10
  2247. FMADD y11, alpha3r, a3, y11
  2248. FMADD y12, alpha3i, a3, y12
  2249. FMADD y13, alpha3r, a5, y13
  2250. FMADD y14, alpha3i, a5, y14
  2251. FMADD y15, alpha3r, a7, y15
  2252. FMADD y16, alpha3i, a7, y16
  2253. LFD a1, 0 * SIZE(AO4)
  2254. LFD a3, 2 * SIZE(AO4)
  2255. LFD a5, 4 * SIZE(AO4)
  2256. LFD a7, 6 * SIZE(AO4)
  2257. FMSUBX y09, alpha3i, a2, y09
  2258. FMADDX y10, alpha3r, a2, y10
  2259. FMSUBX y11, alpha3i, a4, y11
  2260. FMADDX y12, alpha3r, a4, y12
  2261. FMSUBX y13, alpha3i, a6, y13
  2262. FMADDX y14, alpha3r, a6, y14
  2263. FMSUBX y15, alpha3i, a8, y15
  2264. FMADDX y16, alpha3r, a8, y16
  2265. LFD a2, 1 * SIZE(AO4)
  2266. LFD a4, 3 * SIZE(AO4)
  2267. LFD a6, 5 * SIZE(AO4)
  2268. LFD a8, 7 * SIZE(AO4)
  /* ---- column 4 (AO4, alpha4), pairs 0-3; results become final and
     are written through Y2, one INCY step per pair ---- */
  2269. FMADD y01, alpha4r, a1, y01
  2270. FMADD y02, alpha4i, a1, y02
  2271. FMADD y03, alpha4r, a3, y03
  2272. FMADD y04, alpha4i, a3, y04
  2273. FMADD y05, alpha4r, a5, y05
  2274. FMADD y06, alpha4i, a5, y06
  2275. FMADD y07, alpha4r, a7, y07
  2276. FMADD y08, alpha4i, a7, y08
  2277. LFD a1, 8 * SIZE(AO4)
  2278. LFD a3, 10 * SIZE(AO4)
  2279. LFD a5, 12 * SIZE(AO4)
  2280. LFD a7, 14 * SIZE(AO4)
  2281. FMSUBX y01, alpha4i, a2, y01
  2282. FMADDX y02, alpha4r, a2, y02
  2283. FMSUBX y03, alpha4i, a4, y03
  2284. FMADDX y04, alpha4r, a4, y04
  2285. STFD y01, 0 * SIZE(Y2)
  2286. nop
  2287. STFD y02, 1 * SIZE(Y2)
  2288. add Y2, Y2, INCY
  2289. STFD y03, 0 * SIZE(Y2)
  2290. nop
  2291. STFD y04, 1 * SIZE(Y2)
  2292. add Y2, Y2, INCY
  2293. FMSUBX y05, alpha4i, a6, y05
  2294. FMADDX y06, alpha4r, a6, y06
  2295. FMSUBX y07, alpha4i, a8, y07
  2296. FMADDX y08, alpha4r, a8, y08
  2297. LFD a2, 9 * SIZE(AO4)
  2298. LFD a4, 11 * SIZE(AO4)
  2299. LFD a6, 13 * SIZE(AO4)
  2300. LFD a8, 15 * SIZE(AO4)
  2301. STFD y05, 0 * SIZE(Y2)
  2302. nop
  2303. STFD y06, 1 * SIZE(Y2)
  2304. add Y2, Y2, INCY
  2305. STFD y07, 0 * SIZE(Y2)
  2306. nop
  2307. STFD y08, 1 * SIZE(Y2)
  2308. add Y2, Y2, INCY
  /* ---- column 4, pairs 4-7 ---- */
  2309. FMADD y09, alpha4r, a1, y09
  2310. FMADD y10, alpha4i, a1, y10
  2311. FMADD y11, alpha4r, a3, y11
  2312. FMADD y12, alpha4i, a3, y12
  2313. FMADD y13, alpha4r, a5, y13
  2314. FMADD y14, alpha4i, a5, y14
  2315. FMADD y15, alpha4r, a7, y15
  2316. FMADD y16, alpha4i, a7, y16
  2317. FMSUBX y09, alpha4i, a2, y09
  2318. FMADDX y10, alpha4r, a2, y10
  2319. FMSUBX y11, alpha4i, a4, y11
  2320. FMADDX y12, alpha4r, a4, y12
  2321. STFD y09, 0 * SIZE(Y2)
  2322. nop
  2323. STFD y10, 1 * SIZE(Y2)
  2324. add Y2, Y2, INCY
  2325. STFD y11, 0 * SIZE(Y2)
  2326. nop
  2327. STFD y12, 1 * SIZE(Y2)
  2328. add Y2, Y2, INCY
  2329. FMSUBX y13, alpha4i, a6, y13
  2330. FMADDX y14, alpha4r, a6, y14
  2331. FMSUBX y15, alpha4i, a8, y15
  2332. FMADDX y16, alpha4r, a8, y16
  2333. STFD y13, 0 * SIZE(Y2)
  2334. nop
  2335. STFD y14, 1 * SIZE(Y2)
  2336. add Y2, Y2, INCY
  2337. STFD y15, 0 * SIZE(Y2)
  2338. nop
  2339. STFD y16, 1 * SIZE(Y2)
  2340. add Y2, Y2, INCY
  /* Step all four column cursors past the block just finished.  Y1
     already points past it, because its y values were loaded ahead. */
  2341. addi AO1, AO1, 16 * SIZE
  2342. addi AO2, AO2, 16 * SIZE
  2343. addi AO3, AO3, 16 * SIZE
  2344. addi AO4, AO4, 16 * SIZE
  /*
   * LL(115): row remainder of the four-column, INCY-stepped case.
   * Goes to LL(119) when M & 7 is zero; otherwise, when bit 2 of M is
   * set, updates 4 pairs of y with the contributions of all four
   * columns.  a1..a8 are reloaded from the next column while the
   * current column is still being applied.
   */
  2345. .align 4
  2346. LL(115):
  /* The masked results are never negative, so each ble is taken
     exactly when the tested bits are all zero. */
  2347. andi. r0, M, 7
  2348. ble LL(119)
  2349. andi. r0, M, 4
  2350. ble LL(116)
  /* Load 4 pairs of y (one INCY step per pair), interleaved with the
     loads of the first 4 pairs of column 1. */
  2351. LFD y01, 0 * SIZE(Y1)
  2352. LFD y02, 1 * SIZE(Y1)
  2353. add Y1, Y1, INCY
  2354. LFD y03, 0 * SIZE(Y1)
  2355. LFD y04, 1 * SIZE(Y1)
  2356. add Y1, Y1, INCY
  2357. LFD a1, 0 * SIZE(AO1)
  2358. LFD a3, 2 * SIZE(AO1)
  2359. LFD a5, 4 * SIZE(AO1)
  2360. LFD a7, 6 * SIZE(AO1)
  2361. LFD y05, 0 * SIZE(Y1)
  2362. LFD y06, 1 * SIZE(Y1)
  2363. add Y1, Y1, INCY
  2364. LFD y07, 0 * SIZE(Y1)
  2365. LFD y08, 1 * SIZE(Y1)
  2366. add Y1, Y1, INCY
  2367. LFD a2, 1 * SIZE(AO1)
  2368. LFD a4, 3 * SIZE(AO1)
  2369. LFD a6, 5 * SIZE(AO1)
  2370. LFD a8, 7 * SIZE(AO1)
  /* ---- column 1 (AO1, alpha1) ---- */
  2371. FMADD y01, alpha1r, a1, y01
  2372. FMADD y02, alpha1i, a1, y02
  2373. FMADD y03, alpha1r, a3, y03
  2374. FMADD y04, alpha1i, a3, y04
  2375. FMADD y05, alpha1r, a5, y05
  2376. FMADD y06, alpha1i, a5, y06
  2377. FMADD y07, alpha1r, a7, y07
  2378. FMADD y08, alpha1i, a7, y08
  2379. LFD a1, 0 * SIZE(AO2)
  2380. LFD a3, 2 * SIZE(AO2)
  2381. LFD a5, 4 * SIZE(AO2)
  2382. LFD a7, 6 * SIZE(AO2)
  2383. FMSUBX y01, alpha1i, a2, y01
  2384. FMADDX y02, alpha1r, a2, y02
  2385. FMSUBX y03, alpha1i, a4, y03
  2386. FMADDX y04, alpha1r, a4, y04
  2387. FMSUBX y05, alpha1i, a6, y05
  2388. FMADDX y06, alpha1r, a6, y06
  2389. FMSUBX y07, alpha1i, a8, y07
  2390. FMADDX y08, alpha1r, a8, y08
  2391. LFD a2, 1 * SIZE(AO2)
  2392. LFD a4, 3 * SIZE(AO2)
  2393. LFD a6, 5 * SIZE(AO2)
  2394. LFD a8, 7 * SIZE(AO2)
  /* ---- column 2 (AO2, alpha2) ---- */
  2395. FMADD y01, alpha2r, a1, y01
  2396. FMADD y02, alpha2i, a1, y02
  2397. FMADD y03, alpha2r, a3, y03
  2398. FMADD y04, alpha2i, a3, y04
  2399. FMADD y05, alpha2r, a5, y05
  2400. FMADD y06, alpha2i, a5, y06
  2401. FMADD y07, alpha2r, a7, y07
  2402. FMADD y08, alpha2i, a7, y08
  2403. LFD a1, 0 * SIZE(AO3)
  2404. LFD a3, 2 * SIZE(AO3)
  2405. LFD a5, 4 * SIZE(AO3)
  2406. LFD a7, 6 * SIZE(AO3)
  2407. FMSUBX y01, alpha2i, a2, y01
  2408. FMADDX y02, alpha2r, a2, y02
  2409. FMSUBX y03, alpha2i, a4, y03
  2410. FMADDX y04, alpha2r, a4, y04
  2411. FMSUBX y05, alpha2i, a6, y05
  2412. FMADDX y06, alpha2r, a6, y06
  2413. FMSUBX y07, alpha2i, a8, y07
  2414. FMADDX y08, alpha2r, a8, y08
  2415. LFD a2, 1 * SIZE(AO3)
  2416. LFD a4, 3 * SIZE(AO3)
  2417. LFD a6, 5 * SIZE(AO3)
  2418. LFD a8, 7 * SIZE(AO3)
  /* ---- column 3 (AO3, alpha3) ---- */
  2419. FMADD y01, alpha3r, a1, y01
  2420. FMADD y02, alpha3i, a1, y02
  2421. FMADD y03, alpha3r, a3, y03
  2422. FMADD y04, alpha3i, a3, y04
  2423. FMADD y05, alpha3r, a5, y05
  2424. FMADD y06, alpha3i, a5, y06
  2425. FMADD y07, alpha3r, a7, y07
  2426. FMADD y08, alpha3i, a7, y08
  2427. LFD a1, 0 * SIZE(AO4)
  2428. LFD a3, 2 * SIZE(AO4)
  2429. LFD a5, 4 * SIZE(AO4)
  2430. LFD a7, 6 * SIZE(AO4)
  2431. FMSUBX y01, alpha3i, a2, y01
  2432. FMADDX y02, alpha3r, a2, y02
  2433. FMSUBX y03, alpha3i, a4, y03
  2434. FMADDX y04, alpha3r, a4, y04
  2435. FMSUBX y05, alpha3i, a6, y05
  2436. FMADDX y06, alpha3r, a6, y06
  2437. FMSUBX y07, alpha3i, a8, y07
  2438. FMADDX y08, alpha3r, a8, y08
  2439. LFD a2, 1 * SIZE(AO4)
  2440. LFD a4, 3 * SIZE(AO4)
  2441. LFD a6, 5 * SIZE(AO4)
  2442. LFD a8, 7 * SIZE(AO4)
  /* ---- column 4 (AO4, alpha4) ---- */
  2443. FMADD y01, alpha4r, a1, y01
  2444. FMADD y02, alpha4i, a1, y02
  2445. FMADD y03, alpha4r, a3, y03
  2446. FMADD y04, alpha4i, a3, y04
  2447. FMADD y05, alpha4r, a5, y05
  2448. FMADD y06, alpha4i, a5, y06
  2449. FMADD y07, alpha4r, a7, y07
  2450. FMADD y08, alpha4i, a7, y08
  2451. FMSUBX y01, alpha4i, a2, y01
  2452. FMADDX y02, alpha4r, a2, y02
  2453. FMSUBX y03, alpha4i, a4, y03
  2454. FMADDX y04, alpha4r, a4, y04
  2455. FMSUBX y05, alpha4i, a6, y05
  2456. FMADDX y06, alpha4r, a6, y06
  2457. FMSUBX y07, alpha4i, a8, y07
  2458. FMADDX y08, alpha4r, a8, y08
  /* Write the 4 pairs through Y2 (one INCY step per pair); the column
     cursor updates are slotted between the stores. */
  2459. STFD y01, 0 * SIZE(Y2)
  2460. addi AO1, AO1, 8 * SIZE
  2461. STFD y02, 1 * SIZE(Y2)
  2462. add Y2, Y2, INCY
  2463. STFD y03, 0 * SIZE(Y2)
  2464. addi AO2, AO2, 8 * SIZE
  2465. STFD y04, 1 * SIZE(Y2)
  2466. add Y2, Y2, INCY
  2467. STFD y05, 0 * SIZE(Y2)
  2468. addi AO3, AO3, 8 * SIZE
  2469. STFD y06, 1 * SIZE(Y2)
  2470. add Y2, Y2, INCY
  2471. STFD y07, 0 * SIZE(Y2)
  2472. addi AO4, AO4, 8 * SIZE
  2473. STFD y08, 1 * SIZE(Y2)
  2474. add Y2, Y2, INCY
  /*
   * LL(116): four-column row remainder, bit 1 of M.
   * When the bit is set, updates 2 pairs of y with all four columns.
   * Two register sets alternate: a1..a4 hold columns 1 and 3,
   * a5..a8 hold columns 2 and 4.
   */
  2475. .align 4
  2476. LL(116):
  2477. andi. r0, M, 2
  2478. ble LL(117)
  2479. LFD a1, 0 * SIZE(AO1)
  2480. LFD a2, 1 * SIZE(AO1)
  2481. LFD a3, 2 * SIZE(AO1)
  2482. LFD a4, 3 * SIZE(AO1)
  2483. LFD y01, 0 * SIZE(Y1)
  2484. LFD y02, 1 * SIZE(Y1)
  2485. add Y1, Y1, INCY
  2486. LFD y03, 0 * SIZE(Y1)
  2487. LFD y04, 1 * SIZE(Y1)
  2488. add Y1, Y1, INCY
  2489. LFD a5, 0 * SIZE(AO2)
  2490. LFD a6, 1 * SIZE(AO2)
  2491. LFD a7, 2 * SIZE(AO2)
  2492. LFD a8, 3 * SIZE(AO2)
  /* ---- column 1 (alpha1, a1..a4) ---- */
  2493. FMADD y01, alpha1r, a1, y01
  2494. FMADD y02, alpha1i, a1, y02
  2495. FMADD y03, alpha1r, a3, y03
  2496. FMADD y04, alpha1i, a3, y04
  2497. FMSUBX y01, alpha1i, a2, y01
  2498. FMADDX y02, alpha1r, a2, y02
  2499. FMSUBX y03, alpha1i, a4, y03
  2500. FMADDX y04, alpha1r, a4, y04
  /* a1..a4 are free: refill them from column 3. */
  2501. LFD a1, 0 * SIZE(AO3)
  2502. LFD a2, 1 * SIZE(AO3)
  2503. LFD a3, 2 * SIZE(AO3)
  2504. LFD a4, 3 * SIZE(AO3)
  /* ---- column 2 (alpha2, a5..a8) ---- */
  2505. FMADD y01, alpha2r, a5, y01
  2506. FMADD y02, alpha2i, a5, y02
  2507. FMADD y03, alpha2r, a7, y03
  2508. FMADD y04, alpha2i, a7, y04
  2509. FMSUBX y01, alpha2i, a6, y01
  2510. FMADDX y02, alpha2r, a6, y02
  2511. FMSUBX y03, alpha2i, a8, y03
  2512. FMADDX y04, alpha2r, a8, y04
  /* a5..a8 are free: refill them from column 4. */
  2513. LFD a5, 0 * SIZE(AO4)
  2514. LFD a6, 1 * SIZE(AO4)
  2515. LFD a7, 2 * SIZE(AO4)
  2516. LFD a8, 3 * SIZE(AO4)
  /* ---- column 3 (alpha3, a1..a4) ---- */
  2517. FMADD y01, alpha3r, a1, y01
  2518. FMADD y02, alpha3i, a1, y02
  2519. FMADD y03, alpha3r, a3, y03
  2520. FMADD y04, alpha3i, a3, y04
  2521. FMSUBX y01, alpha3i, a2, y01
  2522. FMADDX y02, alpha3r, a2, y02
  2523. FMSUBX y03, alpha3i, a4, y03
  2524. FMADDX y04, alpha3r, a4, y04
  /* ---- column 4 (alpha4, a5..a8) ---- */
  2525. FMADD y01, alpha4r, a5, y01
  2526. FMADD y02, alpha4i, a5, y02
  2527. FMADD y03, alpha4r, a7, y03
  2528. FMADD y04, alpha4i, a7, y04
  2529. FMSUBX y01, alpha4i, a6, y01
  2530. FMADDX y02, alpha4r, a6, y02
  2531. FMSUBX y03, alpha4i, a8, y03
  2532. FMADDX y04, alpha4r, a8, y04
  /* Write both pairs through Y2 and step every column cursor past the
     2 pairs consumed. */
  2533. STFD y01, 0 * SIZE(Y2)
  2534. addi AO1, AO1, 4 * SIZE
  2535. STFD y02, 1 * SIZE(Y2)
  2536. add Y2, Y2, INCY
  2537. STFD y03, 0 * SIZE(Y2)
  2538. addi AO2, AO2, 4 * SIZE
  2539. STFD y04, 1 * SIZE(Y2)
  2540. add Y2, Y2, INCY
  2541. addi AO3, AO3, 4 * SIZE
  2542. addi AO4, AO4, 4 * SIZE
  2543. .align 4
  2544. LL(117):
  /* Four-slice pass, final remainder: handles one leftover Y element when
     bit 0 of M is set; otherwise falls through to the loop bottom at LL(119).
     One element here is two SIZE-sized values at offsets 0 and 1 (presumably
     the real and imaginary parts -- confirm against the register macros). */
  2545. andi. r0, M, 1
  2546. ble LL(119)
  /* Read the Y element through the read cursor Y1, then step it by INCY. */
  2547. LFD y01, 0 * SIZE(Y1)
  2548. LFD y02, 1 * SIZE(Y1)
  2549. add Y1, Y1, INCY
  /* One element from each of the four A pointers AO1..AO4. */
  2550. LFD a1, 0 * SIZE(AO1)
  2551. LFD a2, 1 * SIZE(AO1)
  2552. LFD a3, 0 * SIZE(AO2)
  2553. LFD a4, 1 * SIZE(AO2)
  2554. LFD a5, 0 * SIZE(AO3)
  2555. LFD a6, 1 * SIZE(AO3)
  2556. LFD a7, 0 * SIZE(AO4)
  2557. LFD a8, 1 * SIZE(AO4)
  /* Accumulate the four scaled contributions into (y01, y02), one
     alphaN r/i pair per A pointer. FMADD / FMSUBX / FMADDX are macros
     defined outside this chunk; NOTE(review): presumably they select the
     add/subtract form for the conjugation variant -- confirm. */
  2558. FMADD y01, alpha1r, a1, y01
  2559. FMADD y02, alpha1i, a1, y02
  2560. FMSUBX y01, alpha1i, a2, y01
  2561. FMADDX y02, alpha1r, a2, y02
  2562. FMADD y01, alpha2r, a3, y01
  2563. FMADD y02, alpha2i, a3, y02
  2564. FMSUBX y01, alpha2i, a4, y01
  2565. FMADDX y02, alpha2r, a4, y02
  2566. FMADD y01, alpha3r, a5, y01
  2567. FMADD y02, alpha3i, a5, y02
  2568. FMSUBX y01, alpha3i, a6, y01
  2569. FMADDX y02, alpha3r, a6, y02
  2570. FMADD y01, alpha4r, a7, y01
  2571. FMADD y02, alpha4i, a7, y02
  2572. FMSUBX y01, alpha4i, a8, y01
  2573. FMADDX y02, alpha4r, a8, y02
  /* Write the updated element back through the write cursor Y2. */
  2574. STFD y01, 0 * SIZE(Y2)
  2575. STFD y02, 1 * SIZE(Y2)
  2576. add Y2, Y2, INCY
  2577. .align 4
  2578. LL(119):
  /* Bottom of the loop counted by J: decrement J and branch back to
     LL(111) (defined before this chunk) while J is still greater than 0. */
  2579. addi J, J, -1
  2580. cmpi cr0, 0, J, 0
  2581. bgt LL(111)
  2582. .align 4
  2583. LL(120):
  /* Gate for the two-slice pass: J = N & 2. When that bit is clear the
     result is zero and control skips to LL(130). */
  2584. andi. J, N, 2
  2585. ble LL(130)
  2586. .align 4
  2587. LL(121):
  /* Two-slice pass setup. Reload alpha, read two X elements (stepping X by
     INCX after each), and form the per-slice scale factors alpha1r/alpha1i
     and alpha2r/alpha2i. */
  2588. lfd alpha_r, ALPHA_R
  2589. lfd alpha_i, ALPHA_I
  2590. LFD a1, 0 * SIZE(X)
  2591. LFD a2, 1 * SIZE(X)
  2592. add X, X, INCX
  2593. LFD a3, 0 * SIZE(X)
  2594. LFD a4, 1 * SIZE(X)
  2595. add X, X, INCX
  /* First half of each product uses the offset-0 value of the X element. */
  2596. FMUL alpha1r, alpha_r, a1
  2597. FMUL alpha1i, alpha_i, a1
  2598. FMUL alpha2r, alpha_r, a3
  2599. FMUL alpha2i, alpha_i, a3
  /* Second half folds in the offset-1 value. FMSUBR / FMADDR are macros
     defined outside this chunk; NOTE(review): presumably sign-selecting
     fused multiply-add forms -- confirm against their definitions. */
  2600. FMSUBR alpha1r, alpha_i, a2, alpha1r
  2601. FMADDR alpha1i, alpha_r, a2, alpha1i
  2602. FMSUBR alpha2r, alpha_i, a4, alpha2r
  2603. FMADDR alpha2i, alpha_r, a4, alpha2i
  /* AO1 = A and AO2 = A + LDA address the two slices handled in this pass;
     A itself is advanced past both (A = AO2 + LDA). */
  2604. mr AO1, A
  2605. add AO2, A, LDA
  2606. add A, AO2, LDA
  /* Y1 is the read cursor and Y2 the write cursor; both restart at Y. */
  2607. mr Y1, Y
  2608. mr Y2, Y
  /* CTR = M >> 3, the number of 8-element groups. With no full group,
     go to the remainder code at LL(125). */
  2609. srawi. r0, M, 3
  2610. mtspr CTR, r0
  2611. ble LL(125)
  2612. .align 4
  /* Prime the software pipeline: load eight Y elements into y01..y16
     (stepping Y1 by INCY after each) and the first four A elements from
     AO1 into a1..a8. */
  2613. LFD y01, 0 * SIZE(Y1)
  2614. LFD y02, 1 * SIZE(Y1)
  2615. add Y1, Y1, INCY
  2616. LFD y03, 0 * SIZE(Y1)
  2617. LFD y04, 1 * SIZE(Y1)
  2618. add Y1, Y1, INCY
  2619. LFD a1, 0 * SIZE(AO1)
  2620. LFD a3, 2 * SIZE(AO1)
  2621. LFD a5, 4 * SIZE(AO1)
  2622. LFD a7, 6 * SIZE(AO1)
  2623. LFD y05, 0 * SIZE(Y1)
  2624. LFD y06, 1 * SIZE(Y1)
  2625. add Y1, Y1, INCY
  2626. LFD y07, 0 * SIZE(Y1)
  2627. LFD y08, 1 * SIZE(Y1)
  2628. add Y1, Y1, INCY
  2629. LFD a2, 1 * SIZE(AO1)
  2630. LFD a4, 3 * SIZE(AO1)
  2631. LFD a6, 5 * SIZE(AO1)
  2632. LFD a8, 7 * SIZE(AO1)
  2633. LFD y09, 0 * SIZE(Y1)
  2634. LFD y10, 1 * SIZE(Y1)
  2635. add Y1, Y1, INCY
  2636. LFD y11, 0 * SIZE(Y1)
  2637. LFD y12, 1 * SIZE(Y1)
  2638. add Y1, Y1, INCY
  2639. LFD y13, 0 * SIZE(Y1)
  2640. LFD y14, 1 * SIZE(Y1)
  2641. add Y1, Y1, INCY
  2642. LFD y15, 0 * SIZE(Y1)
  2643. LFD y16, 1 * SIZE(Y1)
  2644. add Y1, Y1, INCY
  /* bdz decrements CTR first: with exactly one group the steady-state loop
     is skipped and the already-loaded group is finished at LL(123). */
  2645. bdz LL(123)
  2646. .align 4
  2647. LL(122):
  /* Steady-state loop of the two-slice pass. Each iteration finishes the
     eight Y elements already held in y01..y16 (adding the AO1 slice scaled
     by alpha1 and the AO2 slice scaled by alpha2), stores them through Y2,
     and reloads y01..y16 and a1..a8 for the next group. Loads, stores and
     arithmetic are deliberately interleaved; the instruction order is part
     of the schedule. FMADD / FMSUBX / FMADDX and DCBT are macros defined
     outside this chunk. */

  /* alpha1 * AO1 elements 0..3, first half (offset-0 values). */
  2648. FMADD y01, alpha1r, a1, y01
  2649. FMADD y02, alpha1i, a1, y02
  2650. FMADD y03, alpha1r, a3, y03
  2651. FMADD y04, alpha1i, a3, y04
  2652. FMADD y05, alpha1r, a5, y05
  2653. FMADD y06, alpha1i, a5, y06
  2654. FMADD y07, alpha1r, a7, y07
  2655. FMADD y08, alpha1i, a7, y08
  /* Fetch the offset-0 values of AO1 elements 4..7. */
  2656. LFD a1, 8 * SIZE(AO1)
  2657. LFD a3, 10 * SIZE(AO1)
  2658. LFD a5, 12 * SIZE(AO1)
  2659. LFD a7, 14 * SIZE(AO1)
  /* alpha1 * AO1 elements 0..3, second half (offset-1 values). */
  2660. FMSUBX y01, alpha1i, a2, y01
  2661. FMADDX y02, alpha1r, a2, y02
  2662. FMSUBX y03, alpha1i, a4, y03
  2663. FMADDX y04, alpha1r, a4, y04
  2664. FMSUBX y05, alpha1i, a6, y05
  2665. FMADDX y06, alpha1r, a6, y06
  2666. FMSUBX y07, alpha1i, a8, y07
  2667. FMADDX y08, alpha1r, a8, y08
  /* Offset-1 values of AO1 elements 4..7, then advance AO1 by one group
     (16 * SIZE) and issue DCBT on it (presumably a prefetch hint -- confirm
     against the DCBT / PREA definitions). */
  2668. LFD a2, 9 * SIZE(AO1)
  2669. LFD a4, 11 * SIZE(AO1)
  2670. LFD a6, 13 * SIZE(AO1)
  2671. LFD a8, 15 * SIZE(AO1)
  2672. addi AO1, AO1, 16 * SIZE
  2673. nop
  2674. DCBT(AO1, PREA)
  2675. nop
  /* alpha1 * AO1 elements 4..7 into y09..y16, first half. */
  2676. FMADD y09, alpha1r, a1, y09
  2677. FMADD y10, alpha1i, a1, y10
  2678. FMADD y11, alpha1r, a3, y11
  2679. FMADD y12, alpha1i, a3, y12
  2680. FMADD y13, alpha1r, a5, y13
  2681. FMADD y14, alpha1i, a5, y14
  2682. FMADD y15, alpha1r, a7, y15
  2683. FMADD y16, alpha1i, a7, y16
  /* Start on the second slice: offset-0 values of AO2 elements 0..3. */
  2684. LFD a1, 0 * SIZE(AO2)
  2685. LFD a3, 2 * SIZE(AO2)
  2686. LFD a5, 4 * SIZE(AO2)
  2687. LFD a7, 6 * SIZE(AO2)
  /* alpha1 * AO1 elements 4..7, second half. */
  2688. FMSUBX y09, alpha1i, a2, y09
  2689. FMADDX y10, alpha1r, a2, y10
  2690. FMSUBX y11, alpha1i, a4, y11
  2691. FMADDX y12, alpha1r, a4, y12
  2692. FMSUBX y13, alpha1i, a6, y13
  2693. FMADDX y14, alpha1r, a6, y14
  2694. FMSUBX y15, alpha1i, a8, y15
  2695. FMADDX y16, alpha1r, a8, y16
  2696. LFD a2, 1 * SIZE(AO2)
  2697. LFD a4, 3 * SIZE(AO2)
  2698. LFD a6, 5 * SIZE(AO2)
  2699. LFD a8, 7 * SIZE(AO2)
  /* alpha2 * AO2 elements 0..3 into y01..y08, first half. */
  2700. FMADD y01, alpha2r, a1, y01
  2701. FMADD y02, alpha2i, a1, y02
  2702. FMADD y03, alpha2r, a3, y03
  2703. FMADD y04, alpha2i, a3, y04
  2704. FMADD y05, alpha2r, a5, y05
  2705. FMADD y06, alpha2i, a5, y06
  2706. FMADD y07, alpha2r, a7, y07
  2707. FMADD y08, alpha2i, a7, y08
  2708. LFD a1, 8 * SIZE(AO2)
  2709. LFD a3, 10 * SIZE(AO2)
  2710. LFD a5, 12 * SIZE(AO2)
  2711. LFD a7, 14 * SIZE(AO2)
  /* Finish Y elements 0 and 1, store them via Y2, and immediately reload
     the same registers with the next group's elements via Y1. */
  2712. FMSUBX y01, alpha2i, a2, y01
  2713. FMADDX y02, alpha2r, a2, y02
  2714. FMSUBX y03, alpha2i, a4, y03
  2715. FMADDX y04, alpha2r, a4, y04
  2716. STFD y01, 0 * SIZE(Y2)
  2717. nop
  2718. STFD y02, 1 * SIZE(Y2)
  2719. add Y2, Y2, INCY
  2720. LFD y01, 0 * SIZE(Y1)
  2721. nop
  2722. LFD y02, 1 * SIZE(Y1)
  2723. add Y1, Y1, INCY
  2724. STFD y03, 0 * SIZE(Y2)
  2725. nop
  2726. STFD y04, 1 * SIZE(Y2)
  2727. add Y2, Y2, INCY
  2728. LFD y03, 0 * SIZE(Y1)
  2729. nop
  2730. LFD y04, 1 * SIZE(Y1)
  2731. add Y1, Y1, INCY
  /* Finish Y elements 2 and 3. */
  2732. FMSUBX y05, alpha2i, a6, y05
  2733. FMADDX y06, alpha2r, a6, y06
  2734. FMSUBX y07, alpha2i, a8, y07
  2735. FMADDX y08, alpha2r, a8, y08
  /* Offset-1 values of AO2 elements 4..7, then advance AO2 by one group
     and issue DCBT on it. */
  2736. LFD a2, 9 * SIZE(AO2)
  2737. LFD a4, 11 * SIZE(AO2)
  2738. LFD a6, 13 * SIZE(AO2)
  2739. LFD a8, 15 * SIZE(AO2)
  2740. addi AO2, AO2, 16 * SIZE
  2741. nop
  2742. DCBT(AO2, PREA)
  2743. nop
  /* Store Y elements 2 and 3, reload their registers for the next group. */
  2744. STFD y05, 0 * SIZE(Y2)
  2745. nop
  2746. STFD y06, 1 * SIZE(Y2)
  2747. add Y2, Y2, INCY
  2748. LFD y05, 0 * SIZE(Y1)
  2749. nop
  2750. LFD y06, 1 * SIZE(Y1)
  2751. add Y1, Y1, INCY
  2752. STFD y07, 0 * SIZE(Y2)
  2753. nop
  2754. STFD y08, 1 * SIZE(Y2)
  2755. add Y2, Y2, INCY
  2756. LFD y07, 0 * SIZE(Y1)
  2757. nop
  2758. LFD y08, 1 * SIZE(Y1)
  2759. add Y1, Y1, INCY
  /* alpha2 * AO2 elements 4..7 into y09..y16, first half. */
  2760. FMADD y09, alpha2r, a1, y09
  2761. FMADD y10, alpha2i, a1, y10
  2762. FMADD y11, alpha2r, a3, y11
  2763. FMADD y12, alpha2i, a3, y12
  2764. FMADD y13, alpha2r, a5, y13
  2765. FMADD y14, alpha2i, a5, y14
  2766. FMADD y15, alpha2r, a7, y15
  2767. FMADD y16, alpha2i, a7, y16
  /* AO1 was already advanced, so these are the next group's AO1 values. */
  2768. LFD a1, 0 * SIZE(AO1)
  2769. LFD a3, 2 * SIZE(AO1)
  2770. LFD a5, 4 * SIZE(AO1)
  2771. LFD a7, 6 * SIZE(AO1)
  /* Finish, store and reload Y elements 4 and 5. */
  2772. FMSUBX y09, alpha2i, a2, y09
  2773. FMADDX y10, alpha2r, a2, y10
  2774. FMSUBX y11, alpha2i, a4, y11
  2775. FMADDX y12, alpha2r, a4, y12
  2776. STFD y09, 0 * SIZE(Y2)
  2777. nop
  2778. STFD y10, 1 * SIZE(Y2)
  2779. add Y2, Y2, INCY
  2780. LFD y09, 0 * SIZE(Y1)
  2781. nop
  2782. LFD y10, 1 * SIZE(Y1)
  2783. add Y1, Y1, INCY
  2784. STFD y11, 0 * SIZE(Y2)
  2785. nop
  2786. STFD y12, 1 * SIZE(Y2)
  2787. add Y2, Y2, INCY
  2788. LFD y11, 0 * SIZE(Y1)
  2789. nop
  2790. LFD y12, 1 * SIZE(Y1)
  2791. add Y1, Y1, INCY
  /* Finish Y elements 6 and 7. */
  2792. FMSUBX y13, alpha2i, a6, y13
  2793. FMADDX y14, alpha2r, a6, y14
  2794. FMSUBX y15, alpha2i, a8, y15
  2795. FMADDX y16, alpha2r, a8, y16
  /* Remaining next-group AO1 values (offset-1 halves). */
  2796. LFD a2, 1 * SIZE(AO1)
  2797. LFD a4, 3 * SIZE(AO1)
  2798. LFD a6, 5 * SIZE(AO1)
  2799. LFD a8, 7 * SIZE(AO1)
  /* Store Y elements 6 and 7, then reload their registers. */
  2800. STFD y13, 0 * SIZE(Y2)
  2801. nop
  2802. STFD y14, 1 * SIZE(Y2)
  2803. add Y2, Y2, INCY
  2804. STFD y15, 0 * SIZE(Y2)
  2805. nop
  2806. STFD y16, 1 * SIZE(Y2)
  2807. add Y2, Y2, INCY
  2808. LFD y13, 0 * SIZE(Y1)
  2809. nop
  2810. LFD y14, 1 * SIZE(Y1)
  2811. add Y1, Y1, INCY
  2812. LFD y15, 0 * SIZE(Y1)
  2813. nop
  2814. LFD y16, 1 * SIZE(Y1)
  2815. add Y1, Y1, INCY
  /* DCBT on the Y read cursor, then decrement CTR and repeat while it is
     non-zero; on exit the last loaded group is finished at LL(123). */
  2816. DCBT(Y1, PREC)
  2817. bdnz LL(122)
  2818. .align 4
  2819. LL(123):
  /* Pipeline drain for the two-slice pass: finishes the last 8-element
     group, whose Y values (y01..y16) and first AO1 values (a1..a8) are
     already in registers. Same arithmetic as the LL(122) loop body, but
     nothing is reloaded from Y1 and no DCBT is issued. AO1 and AO2 are
     each advanced by one group (16 * SIZE) so the remainder code at
     LL(125) continues from the right place. */

  /* alpha1 * AO1 elements 0..3 into y01..y08. */
  2820. FMADD y01, alpha1r, a1, y01
  2821. FMADD y02, alpha1i, a1, y02
  2822. FMADD y03, alpha1r, a3, y03
  2823. FMADD y04, alpha1i, a3, y04
  2824. FMADD y05, alpha1r, a5, y05
  2825. FMADD y06, alpha1i, a5, y06
  2826. FMADD y07, alpha1r, a7, y07
  2827. FMADD y08, alpha1i, a7, y08
  2828. LFD a1, 8 * SIZE(AO1)
  2829. LFD a3, 10 * SIZE(AO1)
  2830. LFD a5, 12 * SIZE(AO1)
  2831. LFD a7, 14 * SIZE(AO1)
  2832. FMSUBX y01, alpha1i, a2, y01
  2833. FMADDX y02, alpha1r, a2, y02
  2834. FMSUBX y03, alpha1i, a4, y03
  2835. FMADDX y04, alpha1r, a4, y04
  2836. FMSUBX y05, alpha1i, a6, y05
  2837. FMADDX y06, alpha1r, a6, y06
  2838. FMSUBX y07, alpha1i, a8, y07
  2839. FMADDX y08, alpha1r, a8, y08
  2840. LFD a2, 9 * SIZE(AO1)
  2841. LFD a4, 11 * SIZE(AO1)
  2842. LFD a6, 13 * SIZE(AO1)
  2843. LFD a8, 15 * SIZE(AO1)
  /* alpha1 * AO1 elements 4..7 into y09..y16. */
  2844. FMADD y09, alpha1r, a1, y09
  2845. FMADD y10, alpha1i, a1, y10
  2846. FMADD y11, alpha1r, a3, y11
  2847. FMADD y12, alpha1i, a3, y12
  2848. FMADD y13, alpha1r, a5, y13
  2849. FMADD y14, alpha1i, a5, y14
  2850. FMADD y15, alpha1r, a7, y15
  2851. FMADD y16, alpha1i, a7, y16
  2852. LFD a1, 0 * SIZE(AO2)
  2853. LFD a3, 2 * SIZE(AO2)
  2854. LFD a5, 4 * SIZE(AO2)
  2855. LFD a7, 6 * SIZE(AO2)
  2856. FMSUBX y09, alpha1i, a2, y09
  2857. FMADDX y10, alpha1r, a2, y10
  2858. FMSUBX y11, alpha1i, a4, y11
  2859. FMADDX y12, alpha1r, a4, y12
  2860. FMSUBX y13, alpha1i, a6, y13
  2861. FMADDX y14, alpha1r, a6, y14
  2862. FMSUBX y15, alpha1i, a8, y15
  2863. FMADDX y16, alpha1r, a8, y16
  2864. LFD a2, 1 * SIZE(AO2)
  2865. LFD a4, 3 * SIZE(AO2)
  2866. LFD a6, 5 * SIZE(AO2)
  2867. LFD a8, 7 * SIZE(AO2)
  /* alpha2 * AO2 elements 0..3 into y01..y08. */
  2868. FMADD y01, alpha2r, a1, y01
  2869. FMADD y02, alpha2i, a1, y02
  2870. FMADD y03, alpha2r, a3, y03
  2871. FMADD y04, alpha2i, a3, y04
  2872. FMADD y05, alpha2r, a5, y05
  2873. FMADD y06, alpha2i, a5, y06
  2874. FMADD y07, alpha2r, a7, y07
  2875. FMADD y08, alpha2i, a7, y08
  2876. LFD a1, 8 * SIZE(AO2)
  2877. LFD a3, 10 * SIZE(AO2)
  2878. LFD a5, 12 * SIZE(AO2)
  2879. LFD a7, 14 * SIZE(AO2)
  2880. FMSUBX y01, alpha2i, a2, y01
  2881. FMADDX y02, alpha2r, a2, y02
  2882. FMSUBX y03, alpha2i, a4, y03
  2883. FMADDX y04, alpha2r, a4, y04
  /* Store Y elements 0 and 1; the AO1 advance is slotted between stores. */
  2884. STFD y01, 0 * SIZE(Y2)
  2885. addi AO1, AO1, 16 * SIZE
  2886. STFD y02, 1 * SIZE(Y2)
  2887. add Y2, Y2, INCY
  2888. STFD y03, 0 * SIZE(Y2)
  2889. nop
  2890. STFD y04, 1 * SIZE(Y2)
  2891. add Y2, Y2, INCY
  2892. FMSUBX y05, alpha2i, a6, y05
  2893. FMADDX y06, alpha2r, a6, y06
  2894. FMSUBX y07, alpha2i, a8, y07
  2895. FMADDX y08, alpha2r, a8, y08
  2896. LFD a2, 9 * SIZE(AO2)
  2897. LFD a4, 11 * SIZE(AO2)
  2898. LFD a6, 13 * SIZE(AO2)
  2899. LFD a8, 15 * SIZE(AO2)
  /* Store Y elements 2 and 3; the AO2 advance is slotted between stores. */
  2900. STFD y05, 0 * SIZE(Y2)
  2901. addi AO2, AO2, 16 * SIZE
  2902. STFD y06, 1 * SIZE(Y2)
  2903. add Y2, Y2, INCY
  2904. STFD y07, 0 * SIZE(Y2)
  2905. nop
  2906. STFD y08, 1 * SIZE(Y2)
  2907. add Y2, Y2, INCY
  /* alpha2 * AO2 elements 4..7 into y09..y16, then store them. */
  2908. FMADD y09, alpha2r, a1, y09
  2909. FMADD y10, alpha2i, a1, y10
  2910. FMADD y11, alpha2r, a3, y11
  2911. FMADD y12, alpha2i, a3, y12
  2912. FMADD y13, alpha2r, a5, y13
  2913. FMADD y14, alpha2i, a5, y14
  2914. FMADD y15, alpha2r, a7, y15
  2915. FMADD y16, alpha2i, a7, y16
  2916. FMSUBX y09, alpha2i, a2, y09
  2917. FMADDX y10, alpha2r, a2, y10
  2918. FMSUBX y11, alpha2i, a4, y11
  2919. FMADDX y12, alpha2r, a4, y12
  2920. STFD y09, 0 * SIZE(Y2)
  2921. nop
  2922. STFD y10, 1 * SIZE(Y2)
  2923. add Y2, Y2, INCY
  2924. STFD y11, 0 * SIZE(Y2)
  2925. nop
  2926. STFD y12, 1 * SIZE(Y2)
  2927. add Y2, Y2, INCY
  2928. FMSUBX y13, alpha2i, a6, y13
  2929. FMADDX y14, alpha2r, a6, y14
  2930. FMSUBX y15, alpha2i, a8, y15
  2931. FMADDX y16, alpha2r, a8, y16
  2932. STFD y13, 0 * SIZE(Y2)
  2933. nop
  2934. STFD y14, 1 * SIZE(Y2)
  2935. add Y2, Y2, INCY
  2936. STFD y15, 0 * SIZE(Y2)
  2937. nop
  2938. STFD y16, 1 * SIZE(Y2)
  2939. add Y2, Y2, INCY
  2940. .align 4
  2941. LL(125):
  /* Remainder dispatch for the two-slice pass. If M has no low three bits
     set there is nothing left and control goes to LL(130). Otherwise the
     leftover is handled as 4 + 2 + 1 elements: this block does the
     4-element part when bit 2 of M is set, else skips to LL(126). */
  2942. andi. r0, M, 7
  2943. ble LL(130)
  2944. andi. r0, M, 4
  2945. ble LL(126)
  /* Load four Y elements (y01..y08) and four AO1 elements (a1..a8). */
  2946. LFD y01, 0 * SIZE(Y1)
  2947. LFD y02, 1 * SIZE(Y1)
  2948. add Y1, Y1, INCY
  2949. LFD y03, 0 * SIZE(Y1)
  2950. LFD y04, 1 * SIZE(Y1)
  2951. add Y1, Y1, INCY
  2952. LFD a1, 0 * SIZE(AO1)
  2953. LFD a3, 2 * SIZE(AO1)
  2954. LFD a5, 4 * SIZE(AO1)
  2955. LFD a7, 6 * SIZE(AO1)
  2956. LFD y05, 0 * SIZE(Y1)
  2957. LFD y06, 1 * SIZE(Y1)
  2958. add Y1, Y1, INCY
  2959. LFD y07, 0 * SIZE(Y1)
  2960. LFD y08, 1 * SIZE(Y1)
  2961. add Y1, Y1, INCY
  2962. LFD a2, 1 * SIZE(AO1)
  2963. LFD a4, 3 * SIZE(AO1)
  2964. LFD a6, 5 * SIZE(AO1)
  2965. LFD a8, 7 * SIZE(AO1)
  /* alpha1 * AO1 contribution; the AO2 loads are interleaved so a1..a8 can
     be reused for the second slice. FMADD / FMSUBX / FMADDX are macros
     defined outside this chunk. */
  2966. FMADD y01, alpha1r, a1, y01
  2967. FMADD y02, alpha1i, a1, y02
  2968. FMADD y03, alpha1r, a3, y03
  2969. FMADD y04, alpha1i, a3, y04
  2970. FMADD y05, alpha1r, a5, y05
  2971. FMADD y06, alpha1i, a5, y06
  2972. FMADD y07, alpha1r, a7, y07
  2973. FMADD y08, alpha1i, a7, y08
  2974. LFD a1, 0 * SIZE(AO2)
  2975. LFD a3, 2 * SIZE(AO2)
  2976. LFD a5, 4 * SIZE(AO2)
  2977. LFD a7, 6 * SIZE(AO2)
  2978. FMSUBX y01, alpha1i, a2, y01
  2979. FMADDX y02, alpha1r, a2, y02
  2980. FMSUBX y03, alpha1i, a4, y03
  2981. FMADDX y04, alpha1r, a4, y04
  2982. FMSUBX y05, alpha1i, a6, y05
  2983. FMADDX y06, alpha1r, a6, y06
  2984. FMSUBX y07, alpha1i, a8, y07
  2985. FMADDX y08, alpha1r, a8, y08
  2986. LFD a2, 1 * SIZE(AO2)
  2987. LFD a4, 3 * SIZE(AO2)
  2988. LFD a6, 5 * SIZE(AO2)
  2989. LFD a8, 7 * SIZE(AO2)
  /* alpha2 * AO2 contribution. */
  2990. FMADD y01, alpha2r, a1, y01
  2991. FMADD y02, alpha2i, a1, y02
  2992. FMADD y03, alpha2r, a3, y03
  2993. FMADD y04, alpha2i, a3, y04
  2994. FMADD y05, alpha2r, a5, y05
  2995. FMADD y06, alpha2i, a5, y06
  2996. FMADD y07, alpha2r, a7, y07
  2997. FMADD y08, alpha2i, a7, y08
  2998. FMSUBX y01, alpha2i, a2, y01
  2999. FMADDX y02, alpha2r, a2, y02
  3000. FMSUBX y03, alpha2i, a4, y03
  3001. FMADDX y04, alpha2r, a4, y04
  /* Store the four results via Y2; AO1 and AO2 are each advanced by four
     elements (8 * SIZE) in between the stores. */
  3002. STFD y01, 0 * SIZE(Y2)
  3003. addi AO1, AO1, 8 * SIZE
  3004. STFD y02, 1 * SIZE(Y2)
  3005. add Y2, Y2, INCY
  3006. STFD y03, 0 * SIZE(Y2)
  3007. addi AO2, AO2, 8 * SIZE
  3008. STFD y04, 1 * SIZE(Y2)
  3009. add Y2, Y2, INCY
  3010. FMSUBX y05, alpha2i, a6, y05
  3011. FMADDX y06, alpha2r, a6, y06
  3012. FMSUBX y07, alpha2i, a8, y07
  3013. FMADDX y08, alpha2r, a8, y08
  3014. STFD y05, 0 * SIZE(Y2)
  3015. nop
  3016. STFD y06, 1 * SIZE(Y2)
  3017. add Y2, Y2, INCY
  3018. STFD y07, 0 * SIZE(Y2)
  3019. nop
  3020. STFD y08, 1 * SIZE(Y2)
  3021. add Y2, Y2, INCY
  3022. .align 4
  3023. LL(126):
  /* Two-slice pass, 2-element remainder: runs when bit 1 of M is set,
     otherwise skips to LL(127). */
  3024. andi. r0, M, 2
  3025. ble LL(127)
  /* Two elements from AO1, two Y elements, two elements from AO2. */
  3026. LFD a1, 0 * SIZE(AO1)
  3027. LFD a2, 1 * SIZE(AO1)
  3028. LFD a3, 2 * SIZE(AO1)
  3029. LFD a4, 3 * SIZE(AO1)
  3030. LFD y01, 0 * SIZE(Y1)
  3031. LFD y02, 1 * SIZE(Y1)
  3032. add Y1, Y1, INCY
  3033. LFD y03, 0 * SIZE(Y1)
  3034. LFD y04, 1 * SIZE(Y1)
  3035. add Y1, Y1, INCY
  3036. LFD a5, 0 * SIZE(AO2)
  3037. LFD a6, 1 * SIZE(AO2)
  3038. LFD a7, 2 * SIZE(AO2)
  3039. LFD a8, 3 * SIZE(AO2)
  /* alpha1 * AO1 contribution (FMADD / FMSUBX / FMADDX are macros defined
     outside this chunk). */
  3040. FMADD y01, alpha1r, a1, y01
  3041. FMADD y02, alpha1i, a1, y02
  3042. FMADD y03, alpha1r, a3, y03
  3043. FMADD y04, alpha1i, a3, y04
  3044. FMSUBX y01, alpha1i, a2, y01
  3045. FMADDX y02, alpha1r, a2, y02
  3046. FMSUBX y03, alpha1i, a4, y03
  3047. FMADDX y04, alpha1r, a4, y04
  /* alpha2 * AO2 contribution. */
  3048. FMADD y01, alpha2r, a5, y01
  3049. FMADD y02, alpha2i, a5, y02
  3050. FMADD y03, alpha2r, a7, y03
  3051. FMADD y04, alpha2i, a7, y04
  3052. FMSUBX y01, alpha2i, a6, y01
  3053. FMADDX y02, alpha2r, a6, y02
  3054. FMSUBX y03, alpha2i, a8, y03
  3055. FMADDX y04, alpha2r, a8, y04
  /* Store both results via Y2 and advance AO1 / AO2 by two elements
     (4 * SIZE) each. */
  3056. STFD y01, 0 * SIZE(Y2)
  3057. addi AO1, AO1, 4 * SIZE
  3058. STFD y02, 1 * SIZE(Y2)
  3059. add Y2, Y2, INCY
  3060. STFD y03, 0 * SIZE(Y2)
  3061. addi AO2, AO2, 4 * SIZE
  3062. STFD y04, 1 * SIZE(Y2)
  3063. add Y2, Y2, INCY
  3064. .align 4
  3065. LL(127):
  /* Two-slice pass, final 1-element remainder: runs when bit 0 of M is
     set, otherwise goes straight to LL(130). */
  3066. andi. r0, M, 1
  3067. ble LL(130)
  3068. LFD y01, 0 * SIZE(Y1)
  3069. LFD y02, 1 * SIZE(Y1)
  3070. add Y1, Y1, INCY
  /* One element from each of AO1 and AO2. */
  3071. LFD a1, 0 * SIZE(AO1)
  3072. LFD a2, 1 * SIZE(AO1)
  3073. LFD a3, 0 * SIZE(AO2)
  3074. LFD a4, 1 * SIZE(AO2)
  /* Add the alpha1 * AO1 and alpha2 * AO2 contributions (FMADD / FMSUBX /
     FMADDX are macros defined outside this chunk). */
  3075. FMADD y01, alpha1r, a1, y01
  3076. FMADD y02, alpha1i, a1, y02
  3077. FMSUBX y01, alpha1i, a2, y01
  3078. FMADDX y02, alpha1r, a2, y02
  3079. FMADD y01, alpha2r, a3, y01
  3080. FMADD y02, alpha2i, a3, y02
  3081. FMSUBX y01, alpha2i, a4, y01
  3082. FMADDX y02, alpha2r, a4, y02
  /* Store the result via Y2, then fall through to LL(130). */
  3083. STFD y01, 0 * SIZE(Y2)
  3084. STFD y02, 1 * SIZE(Y2)
  3085. add Y2, Y2, INCY
  3086. .align 4
  3087. LL(130):
  /* Gate for the single-slice pass: J = N & 1. When that bit is clear
     control goes to the exit sequence at LL(999). */
  3088. andi. J, N, 1
  3089. ble LL(999)
  3090. .align 4
  3091. LL(131):
  /* Single-slice pass setup. Reload alpha, read one X element, and form the
     scale factor alpha1r/alpha1i. FMSUBR / FMADDR are macros defined
     outside this chunk; NOTE(review): presumably sign-selecting fused
     multiply-add forms -- confirm against their definitions. */
  3092. lfd alpha_r, ALPHA_R
  3093. lfd alpha_i, ALPHA_I
  3094. LFD a1, 0 * SIZE(X)
  3095. LFD a2, 1 * SIZE(X)
  3096. add X, X, INCX
  3097. FMUL alpha1r, alpha_r, a1
  3098. FMUL alpha1i, alpha_i, a1
  3099. FMSUBR alpha1r, alpha_i, a2, alpha1r
  3100. FMADDR alpha1i, alpha_r, a2, alpha1i
  /* AO1 = A addresses the slice handled here; A is advanced by LDA. */
  3101. mr AO1, A
  3102. add A, AO1, LDA
  /* Y1 is the read cursor and Y2 the write cursor; both restart at Y. */
  3103. mr Y1, Y
  3104. mr Y2, Y
  /* CTR = M >> 3, the number of 8-element groups. With no full group,
     go to the remainder code at LL(135). */
  3105. srawi. r0, M, 3
  3106. mtspr CTR, r0
  3107. ble LL(135)
  3108. .align 4
  /* Prime the software pipeline: load eight Y elements into y01..y16
     (stepping Y1 by INCY after each) and the first four AO1 elements into
     a1..a8. */
  3109. LFD y01, 0 * SIZE(Y1)
  3110. LFD y02, 1 * SIZE(Y1)
  3111. add Y1, Y1, INCY
  3112. LFD y03, 0 * SIZE(Y1)
  3113. LFD y04, 1 * SIZE(Y1)
  3114. add Y1, Y1, INCY
  3115. LFD a1, 0 * SIZE(AO1)
  3116. LFD a3, 2 * SIZE(AO1)
  3117. LFD a5, 4 * SIZE(AO1)
  3118. LFD a7, 6 * SIZE(AO1)
  3119. LFD y05, 0 * SIZE(Y1)
  3120. LFD y06, 1 * SIZE(Y1)
  3121. add Y1, Y1, INCY
  3122. LFD y07, 0 * SIZE(Y1)
  3123. LFD y08, 1 * SIZE(Y1)
  3124. add Y1, Y1, INCY
  3125. LFD a2, 1 * SIZE(AO1)
  3126. LFD a4, 3 * SIZE(AO1)
  3127. LFD a6, 5 * SIZE(AO1)
  3128. LFD a8, 7 * SIZE(AO1)
  3129. LFD y09, 0 * SIZE(Y1)
  3130. LFD y10, 1 * SIZE(Y1)
  3131. add Y1, Y1, INCY
  3132. LFD y11, 0 * SIZE(Y1)
  3133. LFD y12, 1 * SIZE(Y1)
  3134. add Y1, Y1, INCY
  3135. LFD y13, 0 * SIZE(Y1)
  3136. LFD y14, 1 * SIZE(Y1)
  3137. add Y1, Y1, INCY
  3138. LFD y15, 0 * SIZE(Y1)
  3139. LFD y16, 1 * SIZE(Y1)
  3140. add Y1, Y1, INCY
  /* bdz decrements CTR first: with exactly one group the steady-state loop
     is skipped and the already-loaded group is finished at LL(133). */
  3141. bdz LL(133)
  3142. .align 4
  3143. LL(132):
  /* Steady-state loop of the single-slice pass. Each iteration finishes the
     eight Y elements held in y01..y16 (adding the AO1 slice scaled by
     alpha1), stores them through Y2, and reloads y01..y16 and a1..a8 for
     the next group. Loads, stores and arithmetic are deliberately
     interleaved; the instruction order is part of the schedule. FMADD /
     FMSUBX / FMADDX and DCBT are macros defined outside this chunk. */

  /* Elements 0..3, first half (offset-0 values of AO1). */
  3144. FMADD y01, alpha1r, a1, y01
  3145. FMADD y02, alpha1i, a1, y02
  3146. FMADD y03, alpha1r, a3, y03
  3147. FMADD y04, alpha1i, a3, y04
  3148. FMADD y05, alpha1r, a5, y05
  3149. FMADD y06, alpha1i, a5, y06
  3150. FMADD y07, alpha1r, a7, y07
  3151. FMADD y08, alpha1i, a7, y08
  /* Fetch the offset-0 values of AO1 elements 4..7. */
  3152. LFD a1, 8 * SIZE(AO1)
  3153. LFD a3, 10 * SIZE(AO1)
  3154. LFD a5, 12 * SIZE(AO1)
  3155. LFD a7, 14 * SIZE(AO1)
  /* Finish Y elements 0 and 1, store them, and reload their registers with
     the next group's values. */
  3156. FMSUBX y01, alpha1i, a2, y01
  3157. FMADDX y02, alpha1r, a2, y02
  3158. FMSUBX y03, alpha1i, a4, y03
  3159. FMADDX y04, alpha1r, a4, y04
  3160. STFD y01, 0 * SIZE(Y2)
  3161. nop
  3162. STFD y02, 1 * SIZE(Y2)
  3163. add Y2, Y2, INCY
  3164. LFD y01, 0 * SIZE(Y1)
  3165. nop
  3166. LFD y02, 1 * SIZE(Y1)
  3167. add Y1, Y1, INCY
  3168. STFD y03, 0 * SIZE(Y2)
  3169. nop
  3170. STFD y04, 1 * SIZE(Y2)
  3171. add Y2, Y2, INCY
  3172. LFD y03, 0 * SIZE(Y1)
  3173. nop
  3174. LFD y04, 1 * SIZE(Y1)
  3175. add Y1, Y1, INCY
  /* Finish Y elements 2 and 3. */
  3176. FMSUBX y05, alpha1i, a6, y05
  3177. FMADDX y06, alpha1r, a6, y06
  3178. FMSUBX y07, alpha1i, a8, y07
  3179. FMADDX y08, alpha1r, a8, y08
  /* Offset-1 values of AO1 elements 4..7, then advance AO1 by one group
     (16 * SIZE) and issue DCBT on it (presumably a prefetch hint -- confirm
     against the DCBT / PREA definitions). */
  3180. LFD a2, 9 * SIZE(AO1)
  3181. LFD a4, 11 * SIZE(AO1)
  3182. LFD a6, 13 * SIZE(AO1)
  3183. LFD a8, 15 * SIZE(AO1)
  3184. addi AO1, AO1, 16 * SIZE
  3185. nop
  3186. DCBT(AO1, PREA)
  3187. nop
  /* Store Y elements 2 and 3, reload their registers. */
  3188. STFD y05, 0 * SIZE(Y2)
  3189. nop
  3190. STFD y06, 1 * SIZE(Y2)
  3191. add Y2, Y2, INCY
  3192. LFD y05, 0 * SIZE(Y1)
  3193. nop
  3194. LFD y06, 1 * SIZE(Y1)
  3195. add Y1, Y1, INCY
  3196. STFD y07, 0 * SIZE(Y2)
  3197. nop
  3198. STFD y08, 1 * SIZE(Y2)
  3199. add Y2, Y2, INCY
  3200. LFD y07, 0 * SIZE(Y1)
  3201. nop
  3202. LFD y08, 1 * SIZE(Y1)
  3203. add Y1, Y1, INCY
  /* Elements 4..7 into y09..y16, first half. */
  3204. FMADD y09, alpha1r, a1, y09
  3205. FMADD y10, alpha1i, a1, y10
  3206. FMADD y11, alpha1r, a3, y11
  3207. FMADD y12, alpha1i, a3, y12
  3208. FMADD y13, alpha1r, a5, y13
  3209. FMADD y14, alpha1i, a5, y14
  3210. FMADD y15, alpha1r, a7, y15
  3211. FMADD y16, alpha1i, a7, y16
  /* AO1 was already advanced, so these are the next group's values. */
  3212. LFD a1, 0 * SIZE(AO1)
  3213. LFD a3, 2 * SIZE(AO1)
  3214. LFD a5, 4 * SIZE(AO1)
  3215. LFD a7, 6 * SIZE(AO1)
  /* Finish, store and reload Y elements 4 and 5. */
  3216. FMSUBX y09, alpha1i, a2, y09
  3217. FMADDX y10, alpha1r, a2, y10
  3218. FMSUBX y11, alpha1i, a4, y11
  3219. FMADDX y12, alpha1r, a4, y12
  3220. STFD y09, 0 * SIZE(Y2)
  3221. nop
  3222. STFD y10, 1 * SIZE(Y2)
  3223. add Y2, Y2, INCY
  3224. LFD y09, 0 * SIZE(Y1)
  3225. nop
  3226. LFD y10, 1 * SIZE(Y1)
  3227. add Y1, Y1, INCY
  3228. STFD y11, 0 * SIZE(Y2)
  3229. nop
  3230. STFD y12, 1 * SIZE(Y2)
  3231. add Y2, Y2, INCY
  3232. LFD y11, 0 * SIZE(Y1)
  3233. nop
  3234. LFD y12, 1 * SIZE(Y1)
  3235. add Y1, Y1, INCY
  /* Finish Y elements 6 and 7. */
  3236. FMSUBX y13, alpha1i, a6, y13
  3237. FMADDX y14, alpha1r, a6, y14
  3238. FMSUBX y15, alpha1i, a8, y15
  3239. FMADDX y16, alpha1r, a8, y16
  /* Remaining next-group AO1 values (offset-1 halves). */
  3240. LFD a2, 1 * SIZE(AO1)
  3241. LFD a4, 3 * SIZE(AO1)
  3242. LFD a6, 5 * SIZE(AO1)
  3243. LFD a8, 7 * SIZE(AO1)
  /* Store Y elements 6 and 7, then reload their registers. */
  3244. STFD y13, 0 * SIZE(Y2)
  3245. nop
  3246. STFD y14, 1 * SIZE(Y2)
  3247. add Y2, Y2, INCY
  3248. STFD y15, 0 * SIZE(Y2)
  3249. nop
  3250. STFD y16, 1 * SIZE(Y2)
  3251. add Y2, Y2, INCY
  3252. LFD y13, 0 * SIZE(Y1)
  3253. nop
  3254. LFD y14, 1 * SIZE(Y1)
  3255. add Y1, Y1, INCY
  3256. LFD y15, 0 * SIZE(Y1)
  3257. nop
  3258. LFD y16, 1 * SIZE(Y1)
  3259. add Y1, Y1, INCY
  /* DCBT on the Y read cursor, then decrement CTR and repeat while it is
     non-zero; on exit the last loaded group is finished at LL(133). */
  3260. DCBT(Y1, PREC)
  3261. bdnz LL(132)
  3262. .align 4
  3263. LL(133):
  /* Pipeline drain for the single-slice pass: finishes the last 8-element
     group, whose Y values (y01..y16) and first AO1 values (a1..a8) are
     already in registers. All arithmetic is done first, then the eight
     results are stored through Y2; nothing is reloaded from Y1. AO1 is
     advanced by one group (16 * SIZE) for the remainder code at LL(135). */

  /* Elements 0..3 into y01..y08. */
  3264. FMADD y01, alpha1r, a1, y01
  3265. FMADD y02, alpha1i, a1, y02
  3266. FMADD y03, alpha1r, a3, y03
  3267. FMADD y04, alpha1i, a3, y04
  3268. FMADD y05, alpha1r, a5, y05
  3269. FMADD y06, alpha1i, a5, y06
  3270. FMADD y07, alpha1r, a7, y07
  3271. FMADD y08, alpha1i, a7, y08
  3272. LFD a1, 8 * SIZE(AO1)
  3273. LFD a3, 10 * SIZE(AO1)
  3274. LFD a5, 12 * SIZE(AO1)
  3275. LFD a7, 14 * SIZE(AO1)
  3276. FMSUBX y01, alpha1i, a2, y01
  3277. FMADDX y02, alpha1r, a2, y02
  3278. FMSUBX y03, alpha1i, a4, y03
  3279. FMADDX y04, alpha1r, a4, y04
  3280. FMSUBX y05, alpha1i, a6, y05
  3281. FMADDX y06, alpha1r, a6, y06
  3282. FMSUBX y07, alpha1i, a8, y07
  3283. FMADDX y08, alpha1r, a8, y08
  3284. LFD a2, 9 * SIZE(AO1)
  3285. LFD a4, 11 * SIZE(AO1)
  3286. LFD a6, 13 * SIZE(AO1)
  3287. LFD a8, 15 * SIZE(AO1)
  /* Elements 4..7 into y09..y16. */
  3288. FMADD y09, alpha1r, a1, y09
  3289. FMADD y10, alpha1i, a1, y10
  3290. FMADD y11, alpha1r, a3, y11
  3291. FMADD y12, alpha1i, a3, y12
  3292. FMADD y13, alpha1r, a5, y13
  3293. FMADD y14, alpha1i, a5, y14
  3294. FMADD y15, alpha1r, a7, y15
  3295. FMADD y16, alpha1i, a7, y16
  3296. FMSUBX y09, alpha1i, a2, y09
  3297. FMADDX y10, alpha1r, a2, y10
  3298. FMSUBX y11, alpha1i, a4, y11
  3299. FMADDX y12, alpha1r, a4, y12
  3300. FMSUBX y13, alpha1i, a6, y13
  3301. FMADDX y14, alpha1r, a6, y14
  3302. FMSUBX y15, alpha1i, a8, y15
  3303. FMADDX y16, alpha1r, a8, y16
  /* Store all eight results; the AO1 advance is slotted between stores. */
  3304. STFD y01, 0 * SIZE(Y2)
  3305. addi AO1, AO1, 16 * SIZE
  3306. STFD y02, 1 * SIZE(Y2)
  3307. add Y2, Y2, INCY
  3308. STFD y03, 0 * SIZE(Y2)
  3309. nop
  3310. STFD y04, 1 * SIZE(Y2)
  3311. add Y2, Y2, INCY
  3312. STFD y05, 0 * SIZE(Y2)
  3313. nop
  3314. STFD y06, 1 * SIZE(Y2)
  3315. add Y2, Y2, INCY
  3316. STFD y07, 0 * SIZE(Y2)
  3317. nop
  3318. STFD y08, 1 * SIZE(Y2)
  3319. add Y2, Y2, INCY
  3320. STFD y09, 0 * SIZE(Y2)
  3321. nop
  3322. STFD y10, 1 * SIZE(Y2)
  3323. add Y2, Y2, INCY
  3324. STFD y11, 0 * SIZE(Y2)
  3325. nop
  3326. STFD y12, 1 * SIZE(Y2)
  3327. add Y2, Y2, INCY
  3328. STFD y13, 0 * SIZE(Y2)
  3329. nop
  3330. STFD y14, 1 * SIZE(Y2)
  3331. add Y2, Y2, INCY
  3332. STFD y15, 0 * SIZE(Y2)
  3333. nop
  3334. STFD y16, 1 * SIZE(Y2)
  3335. add Y2, Y2, INCY
  3336. .align 4
  3337. LL(135):
  /* Remainder dispatch for the single-slice pass. If M has no low three
     bits set there is nothing left and control goes to the exit at LL(999).
     Otherwise the leftover is handled as 4 + 2 + 1 elements: this block
     does the 4-element part when bit 2 of M is set, else skips to LL(136). */
  3338. andi. r0, M, 7
  3339. ble LL(999)
  3340. andi. r0, M, 4
  3341. ble LL(136)
  /* Load four Y elements (y01..y08) through Y1. */
  3342. LFD y01, 0 * SIZE(Y1)
  3343. nop
  3344. LFD y02, 1 * SIZE(Y1)
  3345. add Y1, Y1, INCY
  3346. LFD y03, 0 * SIZE(Y1)
  3347. nop
  3348. LFD y04, 1 * SIZE(Y1)
  3349. add Y1, Y1, INCY
  3350. LFD y05, 0 * SIZE(Y1)
  3351. nop
  3352. LFD y06, 1 * SIZE(Y1)
  3353. add Y1, Y1, INCY
  3354. LFD y07, 0 * SIZE(Y1)
  3355. nop
  3356. LFD y08, 1 * SIZE(Y1)
  3357. add Y1, Y1, INCY
  /* Load four AO1 elements (a1..a8). */
  3358. LFD a1, 0 * SIZE(AO1)
  3359. LFD a3, 2 * SIZE(AO1)
  3360. LFD a5, 4 * SIZE(AO1)
  3361. LFD a7, 6 * SIZE(AO1)
  3362. LFD a2, 1 * SIZE(AO1)
  3363. LFD a4, 3 * SIZE(AO1)
  3364. LFD a6, 5 * SIZE(AO1)
  3365. LFD a8, 7 * SIZE(AO1)
  /* alpha1 * AO1 contribution (FMADD / FMSUBX / FMADDX are macros defined
     outside this chunk). */
  3366. FMADD y01, alpha1r, a1, y01
  3367. FMADD y02, alpha1i, a1, y02
  3368. FMADD y03, alpha1r, a3, y03
  3369. FMADD y04, alpha1i, a3, y04
  3370. FMADD y05, alpha1r, a5, y05
  3371. FMADD y06, alpha1i, a5, y06
  3372. FMADD y07, alpha1r, a7, y07
  3373. FMADD y08, alpha1i, a7, y08
  3374. FMSUBX y01, alpha1i, a2, y01
  3375. FMADDX y02, alpha1r, a2, y02
  3376. FMSUBX y03, alpha1i, a4, y03
  3377. FMADDX y04, alpha1r, a4, y04
  3378. FMSUBX y05, alpha1i, a6, y05
  3379. FMADDX y06, alpha1r, a6, y06
  3380. FMSUBX y07, alpha1i, a8, y07
  3381. FMADDX y08, alpha1r, a8, y08
  /* Store the four results via Y2; AO1 is advanced by four elements
     (8 * SIZE) in between the stores. */
  3382. STFD y01, 0 * SIZE(Y2)
  3383. addi AO1, AO1, 8 * SIZE
  3384. STFD y02, 1 * SIZE(Y2)
  3385. add Y2, Y2, INCY
  3386. STFD y03, 0 * SIZE(Y2)
  3387. nop
  3388. STFD y04, 1 * SIZE(Y2)
  3389. add Y2, Y2, INCY
  3390. STFD y05, 0 * SIZE(Y2)
  3391. nop
  3392. STFD y06, 1 * SIZE(Y2)
  3393. add Y2, Y2, INCY
  3394. STFD y07, 0 * SIZE(Y2)
  3395. nop
  3396. STFD y08, 1 * SIZE(Y2)
  3397. add Y2, Y2, INCY
  3398. .align 4
  3399. LL(136):
  /* Single-slice pass, 2-element remainder: runs when bit 1 of M is set,
     otherwise skips to LL(137). */
  3400. andi. r0, M, 2
  3401. ble LL(137)
  /* Two elements from AO1 and two Y elements. */
  3402. LFD a1, 0 * SIZE(AO1)
  3403. LFD a2, 1 * SIZE(AO1)
  3404. LFD a3, 2 * SIZE(AO1)
  3405. LFD a4, 3 * SIZE(AO1)
  3406. LFD y01, 0 * SIZE(Y1)
  3407. nop
  3408. LFD y02, 1 * SIZE(Y1)
  3409. add Y1, Y1, INCY
  3410. LFD y03, 0 * SIZE(Y1)
  3411. nop
  3412. LFD y04, 1 * SIZE(Y1)
  3413. add Y1, Y1, INCY
  /* alpha1 * AO1 contribution (FMADD / FMSUBX / FMADDX are macros defined
     outside this chunk). */
  3414. FMADD y01, alpha1r, a1, y01
  3415. FMADD y02, alpha1i, a1, y02
  3416. FMADD y03, alpha1r, a3, y03
  3417. FMADD y04, alpha1i, a3, y04
  3418. FMSUBX y01, alpha1i, a2, y01
  3419. FMADDX y02, alpha1r, a2, y02
  3420. FMSUBX y03, alpha1i, a4, y03
  3421. FMADDX y04, alpha1r, a4, y04
  /* Store both results via Y2 and advance AO1 by two elements (4 * SIZE). */
  3422. STFD y01, 0 * SIZE(Y2)
  3423. addi AO1, AO1, 4 * SIZE
  3424. STFD y02, 1 * SIZE(Y2)
  3425. add Y2, Y2, INCY
  3426. STFD y03, 0 * SIZE(Y2)
  3427. nop
  3428. STFD y04, 1 * SIZE(Y2)
  3429. add Y2, Y2, INCY
  3430. .align 4
  3431. LL(137):
  /* Single-slice pass, final 1-element remainder: runs when bit 0 of M is
     set, otherwise goes straight to the exit at LL(999). */
  3432. andi. r0, M, 1
  3433. ble LL(999)
  3434. LFD y01, 0 * SIZE(Y1)
  3435. nop
  3436. LFD y02, 1 * SIZE(Y1)
  3437. add Y1, Y1, INCY
  3438. LFD a1, 0 * SIZE(AO1)
  3439. LFD a2, 1 * SIZE(AO1)
  /* alpha1 * AO1 contribution (FMADD / FMSUBX / FMADDX are macros defined
     outside this chunk). */
  3440. FMADD y01, alpha1r, a1, y01
  3441. FMADD y02, alpha1i, a1, y02
  3442. FMSUBX y01, alpha1i, a2, y01
  3443. FMADDX y02, alpha1r, a2, y02
  /* Store the result via Y2, then fall through to LL(999). */
  3444. STFD y01, 0 * SIZE(Y2)
  3445. nop
  3446. STFD y02, 1 * SIZE(Y2)
  3447. add Y2, Y2, INCY
  3448. .align 4
  3449. LL(999):
  /* Common exit: set the return value in r3 to 0, reload the saved
     floating-point and integer registers from the frame at SP, release the
     frame and return. The matching saves are outside this chunk
     (presumably in the routine's prologue -- confirm offsets against it). */
  3450. li r3, 0
  /* f14..f31 from eight-byte slots at SP + 0 .. SP + 136. */
  3451. lfd f14, 0(SP)
  3452. lfd f15, 8(SP)
  3453. lfd f16, 16(SP)
  3454. lfd f17, 24(SP)
  3455. lfd f18, 32(SP)
  3456. lfd f19, 40(SP)
  3457. lfd f20, 48(SP)
  3458. lfd f21, 56(SP)
  3459. lfd f22, 64(SP)
  3460. lfd f23, 72(SP)
  3461. lfd f24, 80(SP)
  3462. lfd f25, 88(SP)
  3463. lfd f26, 96(SP)
  3464. lfd f27, 104(SP)
  3465. lfd f28, 112(SP)
  3466. lfd f29, 120(SP)
  3467. lfd f30, 128(SP)
  3468. lfd f31, 136(SP)
  /* r14..r22 follow from SP + 144: eight-byte slots loaded with ld in
     64-bit builds, four-byte slots loaded with lwz otherwise. */
  3469. #ifdef __64BIT__
  3470. ld r14, 144(SP)
  3471. ld r15, 152(SP)
  3472. ld r16, 160(SP)
  3473. ld r17, 168(SP)
  3474. ld r18, 176(SP)
  3475. ld r19, 184(SP)
  3476. ld r20, 192(SP)
  3477. ld r21, 200(SP)
  3478. ld r22, 208(SP)
  3479. #else
  3480. lwz r14, 144(SP)
  3481. lwz r15, 148(SP)
  3482. lwz r16, 152(SP)
  3483. lwz r17, 156(SP)
  3484. lwz r18, 160(SP)
  3485. lwz r19, 164(SP)
  3486. lwz r20, 168(SP)
  3487. lwz r21, 172(SP)
  3488. lwz r22, 176(SP)
  3489. #endif
  /* Pop the STACKSIZE-byte frame and return to the caller. */
  3490. addi SP, SP, STACKSIZE
  3491. blr
  /* EPILOGUE is a macro defined outside this chunk; presumably it emits the
     end-of-function directives -- confirm. */
  3492. EPILOGUE
  3493. #endif