dtrmm_macros_16x4_power8.S
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/05 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
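/**************************************************************************************
* Note: every macro in this file is defined twice. The GNU assembler form
* uses .macro/.endm; on AIX, where the system assembler does not provide
* that directive, the same body is wrapped in an m4 define instead
* (presumably the file is run through m4 there). The #if defined(_AIX)
* guards only switch the wrapper; the instruction sequences are identical.
**************************************************************************************/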
/*********************************************************************
* Macros for N=4, M=16 *
*********************************************************************/
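/*********************************************************************
* Working-set conventions in the 16x4 kernels below (and, scaled
* down, in the smaller tiles): AO and BO walk the packed A and B
* panels; o8/o16/o24/o32/o48 are index registers, presumably
* preloaded with those byte offsets. lxvd2x pulls two doubles of A
* into vs0-vs7 (alternate set vs8-vs15), lxvdsx loads one double of
* B and splats it into vs24-vs27 (alternate set vs28-vs31), and
* vs32-vs63 hold the 16x4 block of accumulators. The macro suffixes
* follow the usual software-pipelined shape: _I1 seeds the
* accumulators with xvmuldp, _1 and _2 ping-pong between the two
* working sets while loading the other ahead of time, _E2 drains
* the second set without further loads, and _SUBI1/_SUB1 are
* self-contained single iterations for loop remainders.
*********************************************************************/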
#if defined(_AIX)
define(`LOAD4x16_1', `
#else
.macro LOAD4x16_1
#endif
lxvd2x vs0, 0, AO
lxvd2x vs1, o16, AO
lxvd2x vs2, o32, AO
lxvd2x vs3, o48, AO
lxvdsx vs24, 0, BO
lxvdsx vs25, o8, BO
addi AO, AO, 64
lxvd2x vs4, 0, AO
lxvd2x vs5, o16, AO
lxvd2x vs6, o32, AO
lxvd2x vs7, o48, AO
lxvdsx vs26, o16, BO
lxvdsx vs27, o24, BO
addi AO, AO, 64
addi BO, BO, 32
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL4x16_I1', `
#else
.macro KERNEL4x16_I1
#endif
xvmuldp vs32, vs0, vs24
xvmuldp vs33, vs1, vs24
xvmuldp vs34, vs2, vs24
xvmuldp vs35, vs3, vs24
lxvd2x vs8, 0, AO
lxvd2x vs9, o16, AO
xvmuldp vs36, vs4, vs24
xvmuldp vs37, vs5, vs24
xvmuldp vs38, vs6, vs24
xvmuldp vs39, vs7, vs24
lxvdsx vs28, 0, BO
lxvdsx vs29, o8, BO
xvmuldp vs40, vs0, vs25
xvmuldp vs41, vs1, vs25
xvmuldp vs42, vs2, vs25
xvmuldp vs43, vs3, vs25
lxvd2x vs10, o32, AO
lxvd2x vs11, o48, AO
xvmuldp vs44, vs4, vs25
xvmuldp vs45, vs5, vs25
xvmuldp vs46, vs6, vs25
xvmuldp vs47, vs7, vs25
addi AO, AO, 64
xvmuldp vs48, vs0, vs26
xvmuldp vs49, vs1, vs26
xvmuldp vs50, vs2, vs26
xvmuldp vs51, vs3, vs26
lxvd2x vs12, 0, AO
lxvd2x vs13, o16, AO
xvmuldp vs52, vs4, vs26
xvmuldp vs53, vs5, vs26
xvmuldp vs54, vs6, vs26
xvmuldp vs55, vs7, vs26
lxvd2x vs14, o32, AO
lxvd2x vs15, o48, AO
xvmuldp vs56, vs0, vs27
xvmuldp vs57, vs1, vs27
xvmuldp vs58, vs2, vs27
xvmuldp vs59, vs3, vs27
lxvdsx vs30, o16, BO
lxvdsx vs31, o24, BO
xvmuldp vs60, vs4, vs27
xvmuldp vs61, vs5, vs27
xvmuldp vs62, vs6, vs27
xvmuldp vs63, vs7, vs27
addi AO, AO, 64
addi BO, BO, 32
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL4x16_1', `
#else
.macro KERNEL4x16_1
#endif
xvmaddadp vs32, vs0, vs24
xvmaddadp vs33, vs1, vs24
xvmaddadp vs34, vs2, vs24
xvmaddadp vs35, vs3, vs24
lxvd2x vs8, 0, AO
lxvd2x vs9, o16, AO
xvmaddadp vs36, vs4, vs24
xvmaddadp vs37, vs5, vs24
xvmaddadp vs38, vs6, vs24
xvmaddadp vs39, vs7, vs24
lxvdsx vs28, 0, BO
lxvdsx vs29, o8, BO
xvmaddadp vs40, vs0, vs25
xvmaddadp vs41, vs1, vs25
xvmaddadp vs42, vs2, vs25
xvmaddadp vs43, vs3, vs25
lxvd2x vs10, o32, AO
lxvd2x vs11, o48, AO
xvmaddadp vs44, vs4, vs25
xvmaddadp vs45, vs5, vs25
xvmaddadp vs46, vs6, vs25
xvmaddadp vs47, vs7, vs25
addi AO, AO, 64
xvmaddadp vs48, vs0, vs26
xvmaddadp vs49, vs1, vs26
xvmaddadp vs50, vs2, vs26
xvmaddadp vs51, vs3, vs26
lxvd2x vs12, 0, AO
lxvd2x vs13, o16, AO
xvmaddadp vs52, vs4, vs26
xvmaddadp vs53, vs5, vs26
xvmaddadp vs54, vs6, vs26
xvmaddadp vs55, vs7, vs26
lxvd2x vs14, o32, AO
lxvd2x vs15, o48, AO
xvmaddadp vs56, vs0, vs27
xvmaddadp vs57, vs1, vs27
xvmaddadp vs58, vs2, vs27
xvmaddadp vs59, vs3, vs27
lxvdsx vs30, o16, BO
lxvdsx vs31, o24, BO
xvmaddadp vs60, vs4, vs27
xvmaddadp vs61, vs5, vs27
xvmaddadp vs62, vs6, vs27
xvmaddadp vs63, vs7, vs27
addi AO, AO, 64
addi BO, BO, 32
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL4x16_2', `
#else
.macro KERNEL4x16_2
#endif
xvmaddadp vs32, vs8, vs28
xvmaddadp vs33, vs9, vs28
xvmaddadp vs34, vs10, vs28
xvmaddadp vs35, vs11, vs28
lxvd2x vs0, 0, AO
lxvd2x vs1, o16, AO
xvmaddadp vs36, vs12, vs28
xvmaddadp vs37, vs13, vs28
xvmaddadp vs38, vs14, vs28
xvmaddadp vs39, vs15, vs28
lxvdsx vs24, 0, BO
lxvdsx vs25, o8, BO
xvmaddadp vs40, vs8, vs29
xvmaddadp vs41, vs9, vs29
xvmaddadp vs42, vs10, vs29
xvmaddadp vs43, vs11, vs29
lxvd2x vs2, o32, AO
lxvd2x vs3, o48, AO
xvmaddadp vs44, vs12, vs29
xvmaddadp vs45, vs13, vs29
xvmaddadp vs46, vs14, vs29
xvmaddadp vs47, vs15, vs29
addi AO, AO, 64
xvmaddadp vs48, vs8, vs30
xvmaddadp vs49, vs9, vs30
xvmaddadp vs50, vs10, vs30
xvmaddadp vs51, vs11, vs30
lxvd2x vs4, 0, AO
lxvd2x vs5, o16, AO
xvmaddadp vs52, vs12, vs30
xvmaddadp vs53, vs13, vs30
xvmaddadp vs54, vs14, vs30
xvmaddadp vs55, vs15, vs30
lxvd2x vs6, o32, AO
lxvd2x vs7, o48, AO
xvmaddadp vs56, vs8, vs31
xvmaddadp vs57, vs9, vs31
xvmaddadp vs58, vs10, vs31
xvmaddadp vs59, vs11, vs31
lxvdsx vs26, o16, BO
lxvdsx vs27, o24, BO
xvmaddadp vs60, vs12, vs31
xvmaddadp vs61, vs13, vs31
xvmaddadp vs62, vs14, vs31
xvmaddadp vs63, vs15, vs31
addi AO, AO, 64
addi BO, BO, 32
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL4x16_E2', `
#else
.macro KERNEL4x16_E2
#endif
xvmaddadp vs32, vs8, vs28
xvmaddadp vs33, vs9, vs28
xvmaddadp vs34, vs10, vs28
xvmaddadp vs35, vs11, vs28
xvmaddadp vs36, vs12, vs28
xvmaddadp vs37, vs13, vs28
xvmaddadp vs38, vs14, vs28
xvmaddadp vs39, vs15, vs28
xvmaddadp vs40, vs8, vs29
xvmaddadp vs41, vs9, vs29
xvmaddadp vs42, vs10, vs29
xvmaddadp vs43, vs11, vs29
xvmaddadp vs44, vs12, vs29
xvmaddadp vs45, vs13, vs29
xvmaddadp vs46, vs14, vs29
xvmaddadp vs47, vs15, vs29
xvmaddadp vs48, vs8, vs30
xvmaddadp vs49, vs9, vs30
xvmaddadp vs50, vs10, vs30
xvmaddadp vs51, vs11, vs30
xvmaddadp vs52, vs12, vs30
xvmaddadp vs53, vs13, vs30
xvmaddadp vs54, vs14, vs30
xvmaddadp vs55, vs15, vs30
xvmaddadp vs56, vs8, vs31
xvmaddadp vs57, vs9, vs31
xvmaddadp vs58, vs10, vs31
xvmaddadp vs59, vs11, vs31
xvmaddadp vs60, vs12, vs31
xvmaddadp vs61, vs13, vs31
xvmaddadp vs62, vs14, vs31
xvmaddadp vs63, vs15, vs31
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL4x16_SUBI1', `
#else
.macro KERNEL4x16_SUBI1
#endif
lxvd2x vs0, 0, AO
lxvd2x vs1, o16, AO
lxvd2x vs2, o32, AO
lxvd2x vs3, o48, AO
lxvdsx vs24, 0, BO
lxvdsx vs25, o8, BO
lxvdsx vs26, o16, BO
lxvdsx vs27, o24, BO
addi AO, AO, 64
addi BO, BO, 32
lxvd2x vs4, 0, AO
lxvd2x vs5, o16, AO
lxvd2x vs6, o32, AO
lxvd2x vs7, o48, AO
addi AO, AO, 64
xvmuldp vs32, vs0, vs24
xvmuldp vs33, vs1, vs24
xvmuldp vs34, vs2, vs24
xvmuldp vs35, vs3, vs24
xvmuldp vs36, vs4, vs24
xvmuldp vs37, vs5, vs24
xvmuldp vs38, vs6, vs24
xvmuldp vs39, vs7, vs24
xvmuldp vs40, vs0, vs25
xvmuldp vs41, vs1, vs25
xvmuldp vs42, vs2, vs25
xvmuldp vs43, vs3, vs25
xvmuldp vs44, vs4, vs25
xvmuldp vs45, vs5, vs25
xvmuldp vs46, vs6, vs25
xvmuldp vs47, vs7, vs25
xvmuldp vs48, vs0, vs26
xvmuldp vs49, vs1, vs26
xvmuldp vs50, vs2, vs26
xvmuldp vs51, vs3, vs26
xvmuldp vs52, vs4, vs26
xvmuldp vs53, vs5, vs26
xvmuldp vs54, vs6, vs26
xvmuldp vs55, vs7, vs26
xvmuldp vs56, vs0, vs27
xvmuldp vs57, vs1, vs27
xvmuldp vs58, vs2, vs27
xvmuldp vs59, vs3, vs27
xvmuldp vs60, vs4, vs27
xvmuldp vs61, vs5, vs27
xvmuldp vs62, vs6, vs27
xvmuldp vs63, vs7, vs27
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL4x16_SUB1', `
#else
.macro KERNEL4x16_SUB1
#endif
lxvd2x vs0, 0, AO
lxvd2x vs1, o16, AO
lxvd2x vs2, o32, AO
lxvd2x vs3, o48, AO
lxvdsx vs24, 0, BO
lxvdsx vs25, o8, BO
lxvdsx vs26, o16, BO
lxvdsx vs27, o24, BO
addi AO, AO, 64
addi BO, BO, 32
lxvd2x vs4, 0, AO
lxvd2x vs5, o16, AO
lxvd2x vs6, o32, AO
lxvd2x vs7, o48, AO
addi AO, AO, 64
xvmaddadp vs32, vs0, vs24
xvmaddadp vs33, vs1, vs24
xvmaddadp vs34, vs2, vs24
xvmaddadp vs35, vs3, vs24
xvmaddadp vs36, vs4, vs24
xvmaddadp vs37, vs5, vs24
xvmaddadp vs38, vs6, vs24
xvmaddadp vs39, vs7, vs24
xvmaddadp vs40, vs0, vs25
xvmaddadp vs41, vs1, vs25
xvmaddadp vs42, vs2, vs25
xvmaddadp vs43, vs3, vs25
xvmaddadp vs44, vs4, vs25
xvmaddadp vs45, vs5, vs25
xvmaddadp vs46, vs6, vs25
xvmaddadp vs47, vs7, vs25
xvmaddadp vs48, vs0, vs26
xvmaddadp vs49, vs1, vs26
xvmaddadp vs50, vs2, vs26
xvmaddadp vs51, vs3, vs26
xvmaddadp vs52, vs4, vs26
xvmaddadp vs53, vs5, vs26
xvmaddadp vs54, vs6, vs26
xvmaddadp vs55, vs7, vs26
xvmaddadp vs56, vs0, vs27
xvmaddadp vs57, vs1, vs27
xvmaddadp vs58, vs2, vs27
xvmaddadp vs59, vs3, vs27
xvmaddadp vs60, vs4, vs27
xvmaddadp vs61, vs5, vs27
xvmaddadp vs62, vs6, vs27
xvmaddadp vs63, vs7, vs27
#if defined(_AIX)
')
#else
.endm
#endif
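/*********************************************************************
* SAVE4x16 writes one 16x4 tile of C at CO, with T1 and T2 covering
* the low and high eight doubles of a column and both advancing by
* LDC between the four columns. Without TRMMKERNEL the tile is read
* back and updated as C += alpha*acc (xvmaddadp); with TRMMKERNEL
* defined, C is not read and the stores are plain alpha*acc
* (xvmuldp), matching the overwrite semantics of TRMM.
*********************************************************************/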
#if defined(_AIX)
define(`SAVE4x16', `
#else
.macro SAVE4x16
#endif
mr T1, CO
addi T2, T1, 64
#ifndef TRMMKERNEL
lxvd2x vs0, 0, T1
lxvd2x vs1, o16, T1
lxvd2x vs2, o32, T1
lxvd2x vs3, o48, T1
lxvd2x vs4, 0, T2
lxvd2x vs5, o16, T2
lxvd2x vs6, o32, T2
lxvd2x vs7, o48, T2
#endif
#ifndef TRMMKERNEL
xvmaddadp vs0, vs32, alpha_r
xvmaddadp vs1, vs33, alpha_r
xvmaddadp vs2, vs34, alpha_r
xvmaddadp vs3, vs35, alpha_r
xvmaddadp vs4, vs36, alpha_r
xvmaddadp vs5, vs37, alpha_r
xvmaddadp vs6, vs38, alpha_r
xvmaddadp vs7, vs39, alpha_r
#else
xvmuldp vs0, vs32, alpha_r
xvmuldp vs1, vs33, alpha_r
xvmuldp vs2, vs34, alpha_r
xvmuldp vs3, vs35, alpha_r
xvmuldp vs4, vs36, alpha_r
xvmuldp vs5, vs37, alpha_r
xvmuldp vs6, vs38, alpha_r
xvmuldp vs7, vs39, alpha_r
#endif
stxvd2x vs0, 0, T1
stxvd2x vs1, o16, T1
stxvd2x vs2, o32, T1
stxvd2x vs3, o48, T1
stxvd2x vs4, 0, T2
stxvd2x vs5, o16, T2
stxvd2x vs6, o32, T2
stxvd2x vs7, o48, T2
add T1, T1, LDC
add T2, T2, LDC
#ifndef TRMMKERNEL
lxvd2x vs8, 0, T1
lxvd2x vs9, o16, T1
lxvd2x vs10, o32, T1
lxvd2x vs11, o48, T1
lxvd2x vs12, 0, T2
lxvd2x vs13, o16, T2
lxvd2x vs14, o32, T2
lxvd2x vs15, o48, T2
#endif
#ifndef TRMMKERNEL
xvmaddadp vs8, vs40, alpha_r
xvmaddadp vs9, vs41, alpha_r
xvmaddadp vs10, vs42, alpha_r
xvmaddadp vs11, vs43, alpha_r
xvmaddadp vs12, vs44, alpha_r
xvmaddadp vs13, vs45, alpha_r
xvmaddadp vs14, vs46, alpha_r
xvmaddadp vs15, vs47, alpha_r
#else
xvmuldp vs8, vs40, alpha_r
xvmuldp vs9, vs41, alpha_r
xvmuldp vs10, vs42, alpha_r
xvmuldp vs11, vs43, alpha_r
xvmuldp vs12, vs44, alpha_r
xvmuldp vs13, vs45, alpha_r
xvmuldp vs14, vs46, alpha_r
xvmuldp vs15, vs47, alpha_r
#endif
stxvd2x vs8, 0, T1
stxvd2x vs9, o16, T1
stxvd2x vs10, o32, T1
stxvd2x vs11, o48, T1
stxvd2x vs12, 0, T2
stxvd2x vs13, o16, T2
stxvd2x vs14, o32, T2
stxvd2x vs15, o48, T2
add T1, T1, LDC
add T2, T2, LDC
#ifndef TRMMKERNEL
lxvd2x vs0, 0, T1
lxvd2x vs1, o16, T1
lxvd2x vs2, o32, T1
lxvd2x vs3, o48, T1
lxvd2x vs4, 0, T2
lxvd2x vs5, o16, T2
lxvd2x vs6, o32, T2
lxvd2x vs7, o48, T2
#endif
#ifndef TRMMKERNEL
xvmaddadp vs0, vs48, alpha_r
xvmaddadp vs1, vs49, alpha_r
xvmaddadp vs2, vs50, alpha_r
xvmaddadp vs3, vs51, alpha_r
xvmaddadp vs4, vs52, alpha_r
xvmaddadp vs5, vs53, alpha_r
xvmaddadp vs6, vs54, alpha_r
xvmaddadp vs7, vs55, alpha_r
#else
xvmuldp vs0, vs48, alpha_r
xvmuldp vs1, vs49, alpha_r
xvmuldp vs2, vs50, alpha_r
xvmuldp vs3, vs51, alpha_r
xvmuldp vs4, vs52, alpha_r
xvmuldp vs5, vs53, alpha_r
xvmuldp vs6, vs54, alpha_r
xvmuldp vs7, vs55, alpha_r
#endif
stxvd2x vs0, 0, T1
stxvd2x vs1, o16, T1
stxvd2x vs2, o32, T1
stxvd2x vs3, o48, T1
stxvd2x vs4, 0, T2
stxvd2x vs5, o16, T2
stxvd2x vs6, o32, T2
stxvd2x vs7, o48, T2
add T1, T1, LDC
add T2, T2, LDC
#ifndef TRMMKERNEL
lxvd2x vs8, 0, T1
lxvd2x vs9, o16, T1
lxvd2x vs10, o32, T1
lxvd2x vs11, o48, T1
lxvd2x vs12, 0, T2
lxvd2x vs13, o16, T2
lxvd2x vs14, o32, T2
lxvd2x vs15, o48, T2
#endif
#ifndef TRMMKERNEL
xvmaddadp vs8, vs56, alpha_r
xvmaddadp vs9, vs57, alpha_r
xvmaddadp vs10, vs58, alpha_r
xvmaddadp vs11, vs59, alpha_r
xvmaddadp vs12, vs60, alpha_r
xvmaddadp vs13, vs61, alpha_r
xvmaddadp vs14, vs62, alpha_r
xvmaddadp vs15, vs63, alpha_r
#else
xvmuldp vs8, vs56, alpha_r
xvmuldp vs9, vs57, alpha_r
xvmuldp vs10, vs58, alpha_r
xvmuldp vs11, vs59, alpha_r
xvmuldp vs12, vs60, alpha_r
xvmuldp vs13, vs61, alpha_r
xvmuldp vs14, vs62, alpha_r
xvmuldp vs15, vs63, alpha_r
#endif
stxvd2x vs8, 0, T1
stxvd2x vs9, o16, T1
stxvd2x vs10, o32, T1
stxvd2x vs11, o48, T1
stxvd2x vs12, 0, T2
stxvd2x vs13, o16, T2
stxvd2x vs14, o32, T2
stxvd2x vs15, o48, T2
addi CO, CO, 128
#if defined(_AIX)
')
#else
.endm
#endif
/*********************************************************************
* Macros for N=4, M=8 *
*********************************************************************/
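/*********************************************************************
* N=4, M=8: same pipelined structure as the 16x4 macros above, halved
* to one 64-byte A fetch per iteration (vs0-vs3 / vs8-vs11) and the
* accumulators vs32-35, vs40-43, vs48-51 and vs56-59.
*********************************************************************/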
#if defined(_AIX)
define(`LOAD4x8_1', `
#else
.macro LOAD4x8_1
#endif
lxvd2x vs0, 0, AO
lxvd2x vs1, o16, AO
lxvdsx vs24, 0, BO
lxvdsx vs25, o8, BO
lxvd2x vs2, o32, AO
lxvd2x vs3, o48, AO
lxvdsx vs26, o16, BO
lxvdsx vs27, o24, BO
addi AO, AO, 64
addi BO, BO, 32
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL4x8_I1', `
#else
.macro KERNEL4x8_I1
#endif
xvmuldp vs32, vs0, vs24
xvmuldp vs33, vs1, vs24
xvmuldp vs34, vs2, vs24
xvmuldp vs35, vs3, vs24
lxvd2x vs8, 0, AO
lxvd2x vs9, o16, AO
xvmuldp vs40, vs0, vs25
xvmuldp vs41, vs1, vs25
lxvdsx vs28, 0, BO
lxvdsx vs29, o8, BO
xvmuldp vs42, vs2, vs25
xvmuldp vs43, vs3, vs25
xvmuldp vs48, vs0, vs26
xvmuldp vs49, vs1, vs26
lxvd2x vs10, o32, AO
lxvd2x vs11, o48, AO
xvmuldp vs50, vs2, vs26
xvmuldp vs51, vs3, vs26
lxvdsx vs30, o16, BO
lxvdsx vs31, o24, BO
xvmuldp vs56, vs0, vs27
xvmuldp vs57, vs1, vs27
xvmuldp vs58, vs2, vs27
xvmuldp vs59, vs3, vs27
addi AO, AO, 64
addi BO, BO, 32
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL4x8_1', `
#else
.macro KERNEL4x8_1
#endif
xvmaddadp vs32, vs0, vs24
xvmaddadp vs33, vs1, vs24
xvmaddadp vs34, vs2, vs24
xvmaddadp vs35, vs3, vs24
lxvd2x vs8, 0, AO
lxvd2x vs9, o16, AO
xvmaddadp vs40, vs0, vs25
xvmaddadp vs41, vs1, vs25
xvmaddadp vs42, vs2, vs25
xvmaddadp vs43, vs3, vs25
lxvdsx vs28, 0, BO
lxvdsx vs29, o8, BO
xvmaddadp vs48, vs0, vs26
xvmaddadp vs49, vs1, vs26
lxvd2x vs10, o32, AO
lxvd2x vs11, o48, AO
xvmaddadp vs50, vs2, vs26
xvmaddadp vs51, vs3, vs26
lxvdsx vs30, o16, BO
lxvdsx vs31, o24, BO
xvmaddadp vs56, vs0, vs27
xvmaddadp vs57, vs1, vs27
xvmaddadp vs58, vs2, vs27
xvmaddadp vs59, vs3, vs27
addi AO, AO, 64
addi BO, BO, 32
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL4x8_2', `
#else
.macro KERNEL4x8_2
#endif
xvmaddadp vs32, vs8, vs28
xvmaddadp vs33, vs9, vs28
xvmaddadp vs34, vs10, vs28
xvmaddadp vs35, vs11, vs28
lxvd2x vs0, 0, AO
lxvd2x vs1, o16, AO
xvmaddadp vs40, vs8, vs29
xvmaddadp vs41, vs9, vs29
xvmaddadp vs42, vs10, vs29
xvmaddadp vs43, vs11, vs29
lxvdsx vs24, 0, BO
lxvdsx vs25, o8, BO
xvmaddadp vs48, vs8, vs30
xvmaddadp vs49, vs9, vs30
lxvd2x vs2, o32, AO
lxvd2x vs3, o48, AO
xvmaddadp vs50, vs10, vs30
xvmaddadp vs51, vs11, vs30
lxvdsx vs26, o16, BO
lxvdsx vs27, o24, BO
xvmaddadp vs56, vs8, vs31
xvmaddadp vs57, vs9, vs31
xvmaddadp vs58, vs10, vs31
xvmaddadp vs59, vs11, vs31
addi AO, AO, 64
addi BO, BO, 32
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL4x8_E2', `
#else
.macro KERNEL4x8_E2
#endif
xvmaddadp vs32, vs8, vs28
xvmaddadp vs33, vs9, vs28
xvmaddadp vs34, vs10, vs28
xvmaddadp vs35, vs11, vs28
xvmaddadp vs40, vs8, vs29
xvmaddadp vs41, vs9, vs29
xvmaddadp vs42, vs10, vs29
xvmaddadp vs43, vs11, vs29
xvmaddadp vs48, vs8, vs30
xvmaddadp vs49, vs9, vs30
xvmaddadp vs50, vs10, vs30
xvmaddadp vs51, vs11, vs30
xvmaddadp vs56, vs8, vs31
xvmaddadp vs57, vs9, vs31
xvmaddadp vs58, vs10, vs31
xvmaddadp vs59, vs11, vs31
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL4x8_SUBI1', `
#else
.macro KERNEL4x8_SUBI1
#endif
lxvd2x vs0, 0, AO
lxvd2x vs1, o16, AO
lxvd2x vs2, o32, AO
lxvd2x vs3, o48, AO
lxvdsx vs24, 0, BO
lxvdsx vs25, o8, BO
lxvdsx vs26, o16, BO
lxvdsx vs27, o24, BO
addi AO, AO, 64
addi BO, BO, 32
xvmuldp vs32, vs0, vs24
xvmuldp vs33, vs1, vs24
xvmuldp vs34, vs2, vs24
xvmuldp vs35, vs3, vs24
xvmuldp vs40, vs0, vs25
xvmuldp vs41, vs1, vs25
xvmuldp vs42, vs2, vs25
xvmuldp vs43, vs3, vs25
xvmuldp vs48, vs0, vs26
xvmuldp vs49, vs1, vs26
xvmuldp vs50, vs2, vs26
xvmuldp vs51, vs3, vs26
xvmuldp vs56, vs0, vs27
xvmuldp vs57, vs1, vs27
xvmuldp vs58, vs2, vs27
xvmuldp vs59, vs3, vs27
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL4x8_SUB1', `
#else
.macro KERNEL4x8_SUB1
#endif
lxvd2x vs0, 0, AO
lxvd2x vs1, o16, AO
lxvd2x vs2, o32, AO
lxvd2x vs3, o48, AO
lxvdsx vs24, 0, BO
lxvdsx vs25, o8, BO
lxvdsx vs26, o16, BO
lxvdsx vs27, o24, BO
addi AO, AO, 64
addi BO, BO, 32
xvmaddadp vs32, vs0, vs24
xvmaddadp vs33, vs1, vs24
xvmaddadp vs34, vs2, vs24
xvmaddadp vs35, vs3, vs24
xvmaddadp vs40, vs0, vs25
xvmaddadp vs41, vs1, vs25
xvmaddadp vs42, vs2, vs25
xvmaddadp vs43, vs3, vs25
xvmaddadp vs48, vs0, vs26
xvmaddadp vs49, vs1, vs26
xvmaddadp vs50, vs2, vs26
xvmaddadp vs51, vs3, vs26
xvmaddadp vs56, vs0, vs27
xvmaddadp vs57, vs1, vs27
xvmaddadp vs58, vs2, vs27
xvmaddadp vs59, vs3, vs27
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`SAVE4x8', `
#else
.macro SAVE4x8
#endif
mr T1, CO
#ifndef TRMMKERNEL
lxvd2x vs0, 0, T1
lxvd2x vs1, o16, T1
lxvd2x vs2, o32, T1
lxvd2x vs3, o48, T1
#endif
#ifndef TRMMKERNEL
xvmaddadp vs0, vs32, alpha_r
xvmaddadp vs1, vs33, alpha_r
xvmaddadp vs2, vs34, alpha_r
xvmaddadp vs3, vs35, alpha_r
#else
xvmuldp vs0, vs32, alpha_r
xvmuldp vs1, vs33, alpha_r
xvmuldp vs2, vs34, alpha_r
xvmuldp vs3, vs35, alpha_r
#endif
stxvd2x vs0, 0, T1
stxvd2x vs1, o16, T1
stxvd2x vs2, o32, T1
stxvd2x vs3, o48, T1
add T1, T1, LDC
#ifndef TRMMKERNEL
lxvd2x vs8, 0, T1
lxvd2x vs9, o16, T1
lxvd2x vs10, o32, T1
lxvd2x vs11, o48, T1
#endif
#ifndef TRMMKERNEL
xvmaddadp vs8, vs40, alpha_r
xvmaddadp vs9, vs41, alpha_r
xvmaddadp vs10, vs42, alpha_r
xvmaddadp vs11, vs43, alpha_r
#else
xvmuldp vs8, vs40, alpha_r
xvmuldp vs9, vs41, alpha_r
xvmuldp vs10, vs42, alpha_r
xvmuldp vs11, vs43, alpha_r
#endif
stxvd2x vs8, 0, T1
stxvd2x vs9, o16, T1
stxvd2x vs10, o32, T1
stxvd2x vs11, o48, T1
add T1, T1, LDC
#ifndef TRMMKERNEL
lxvd2x vs0, 0, T1
lxvd2x vs1, o16, T1
lxvd2x vs2, o32, T1
lxvd2x vs3, o48, T1
#endif
#ifndef TRMMKERNEL
xvmaddadp vs0, vs48, alpha_r
xvmaddadp vs1, vs49, alpha_r
xvmaddadp vs2, vs50, alpha_r
xvmaddadp vs3, vs51, alpha_r
#else
xvmuldp vs0, vs48, alpha_r
xvmuldp vs1, vs49, alpha_r
xvmuldp vs2, vs50, alpha_r
xvmuldp vs3, vs51, alpha_r
#endif
stxvd2x vs0, 0, T1
stxvd2x vs1, o16, T1
stxvd2x vs2, o32, T1
stxvd2x vs3, o48, T1
add T1, T1, LDC
#ifndef TRMMKERNEL
lxvd2x vs8, 0, T1
lxvd2x vs9, o16, T1
lxvd2x vs10, o32, T1
lxvd2x vs11, o48, T1
#endif
#ifndef TRMMKERNEL
xvmaddadp vs8, vs56, alpha_r
xvmaddadp vs9, vs57, alpha_r
xvmaddadp vs10, vs58, alpha_r
xvmaddadp vs11, vs59, alpha_r
#else
xvmuldp vs8, vs56, alpha_r
xvmuldp vs9, vs57, alpha_r
xvmuldp vs10, vs58, alpha_r
xvmuldp vs11, vs59, alpha_r
#endif
stxvd2x vs8, 0, T1
stxvd2x vs9, o16, T1
stxvd2x vs10, o32, T1
stxvd2x vs11, o48, T1
addi CO, CO, 64
#if defined(_AIX)
')
#else
.endm
#endif
/*********************************************************************
* Macros for N=4, M=4 *
*********************************************************************/
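/*********************************************************************
* N=4, M=4: two vector registers of A per iteration (vs0-vs1, with
* vs8-vs9 as the alternate set) against the same four B splats;
* accumulators vs32-33, vs40-41, vs48-49 and vs56-57.
*********************************************************************/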
#if defined(_AIX)
define(`LOAD4x4_1', `
#else
.macro LOAD4x4_1
#endif
lxvd2x vs0, 0, AO
lxvd2x vs1, o16, AO
lxvdsx vs24, 0, BO
lxvdsx vs25, o8, BO
lxvdsx vs26, o16, BO
lxvdsx vs27, o24, BO
addi AO, AO, 32
addi BO, BO, 32
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL4x4_I1', `
#else
.macro KERNEL4x4_I1
#endif
lxvd2x vs8, 0, AO
lxvd2x vs9, o16, AO
lxvdsx vs28, 0, BO
lxvdsx vs29, o8, BO
lxvdsx vs30, o16, BO
lxvdsx vs31, o24, BO
addi AO, AO, 32
addi BO, BO, 32
xvmuldp vs32, vs0, vs24
xvmuldp vs33, vs1, vs24
xvmuldp vs40, vs0, vs25
xvmuldp vs41, vs1, vs25
xvmuldp vs48, vs0, vs26
xvmuldp vs49, vs1, vs26
xvmuldp vs56, vs0, vs27
xvmuldp vs57, vs1, vs27
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL4x4_1', `
#else
.macro KERNEL4x4_1
#endif
lxvd2x vs8, 0, AO
lxvd2x vs9, o16, AO
lxvdsx vs28, 0, BO
lxvdsx vs29, o8, BO
lxvdsx vs30, o16, BO
lxvdsx vs31, o24, BO
addi AO, AO, 32
addi BO, BO, 32
xvmaddadp vs32, vs0, vs24
xvmaddadp vs33, vs1, vs24
xvmaddadp vs40, vs0, vs25
xvmaddadp vs41, vs1, vs25
xvmaddadp vs48, vs0, vs26
xvmaddadp vs49, vs1, vs26
xvmaddadp vs56, vs0, vs27
xvmaddadp vs57, vs1, vs27
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL4x4_2', `
#else
.macro KERNEL4x4_2
#endif
lxvd2x vs0, 0, AO
lxvd2x vs1, o16, AO
lxvdsx vs24, 0, BO
lxvdsx vs25, o8, BO
lxvdsx vs26, o16, BO
lxvdsx vs27, o24, BO
addi AO, AO, 32
addi BO, BO, 32
xvmaddadp vs32, vs8, vs28
xvmaddadp vs33, vs9, vs28
xvmaddadp vs40, vs8, vs29
xvmaddadp vs41, vs9, vs29
xvmaddadp vs48, vs8, vs30
xvmaddadp vs49, vs9, vs30
xvmaddadp vs56, vs8, vs31
xvmaddadp vs57, vs9, vs31
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL4x4_E2', `
#else
.macro KERNEL4x4_E2
#endif
xvmaddadp vs32, vs8, vs28
xvmaddadp vs33, vs9, vs28
xvmaddadp vs40, vs8, vs29
xvmaddadp vs41, vs9, vs29
xvmaddadp vs48, vs8, vs30
xvmaddadp vs49, vs9, vs30
xvmaddadp vs56, vs8, vs31
xvmaddadp vs57, vs9, vs31
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL4x4_SUBI1', `
#else
.macro KERNEL4x4_SUBI1
#endif
lxvd2x vs0, 0, AO
lxvd2x vs1, o16, AO
lxvdsx vs24, 0, BO
lxvdsx vs25, o8, BO
lxvdsx vs26, o16, BO
lxvdsx vs27, o24, BO
addi AO, AO, 32
addi BO, BO, 32
xvmuldp vs32, vs0, vs24
xvmuldp vs33, vs1, vs24
xvmuldp vs40, vs0, vs25
xvmuldp vs41, vs1, vs25
xvmuldp vs48, vs0, vs26
xvmuldp vs49, vs1, vs26
xvmuldp vs56, vs0, vs27
xvmuldp vs57, vs1, vs27
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL4x4_SUB1', `
#else
.macro KERNEL4x4_SUB1
#endif
lxvd2x vs0, 0, AO
lxvd2x vs1, o16, AO
lxvdsx vs24, 0, BO
lxvdsx vs25, o8, BO
lxvdsx vs26, o16, BO
lxvdsx vs27, o24, BO
addi AO, AO, 32
addi BO, BO, 32
xvmaddadp vs32, vs0, vs24
xvmaddadp vs33, vs1, vs24
xvmaddadp vs40, vs0, vs25
xvmaddadp vs41, vs1, vs25
xvmaddadp vs48, vs0, vs26
xvmaddadp vs49, vs1, vs26
xvmaddadp vs56, vs0, vs27
xvmaddadp vs57, vs1, vs27
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`SAVE4x4', `
#else
.macro SAVE4x4
#endif
mr T1, CO
#ifndef TRMMKERNEL
lxvd2x vs0, 0, T1
lxvd2x vs1, o16, T1
#endif
#ifndef TRMMKERNEL
xvmaddadp vs0, vs32, alpha_r
xvmaddadp vs1, vs33, alpha_r
#else
xvmuldp vs0, vs32, alpha_r
xvmuldp vs1, vs33, alpha_r
#endif
stxvd2x vs0, 0, T1
stxvd2x vs1, o16, T1
add T1, T1, LDC
#ifndef TRMMKERNEL
lxvd2x vs8, 0, T1
lxvd2x vs9, o16, T1
#endif
#ifndef TRMMKERNEL
xvmaddadp vs8, vs40, alpha_r
xvmaddadp vs9, vs41, alpha_r
#else
xvmuldp vs8, vs40, alpha_r
xvmuldp vs9, vs41, alpha_r
#endif
stxvd2x vs8, 0, T1
stxvd2x vs9, o16, T1
add T1, T1, LDC
#ifndef TRMMKERNEL
lxvd2x vs0, 0, T1
lxvd2x vs1, o16, T1
#endif
#ifndef TRMMKERNEL
xvmaddadp vs0, vs48, alpha_r
xvmaddadp vs1, vs49, alpha_r
#else
xvmuldp vs0, vs48, alpha_r
xvmuldp vs1, vs49, alpha_r
#endif
stxvd2x vs0, 0, T1
stxvd2x vs1, o16, T1
add T1, T1, LDC
#ifndef TRMMKERNEL
lxvd2x vs8, 0, T1
lxvd2x vs9, o16, T1
#endif
#ifndef TRMMKERNEL
xvmaddadp vs8, vs56, alpha_r
xvmaddadp vs9, vs57, alpha_r
#else
xvmuldp vs8, vs56, alpha_r
xvmuldp vs9, vs57, alpha_r
#endif
stxvd2x vs8, 0, T1
stxvd2x vs9, o16, T1
addi CO, CO, 32
#if defined(_AIX)
')
#else
.endm
#endif
/*********************************************************************
* Macros for N=4, M=2 *
*********************************************************************/
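/*********************************************************************
* N=4, M=2: a single lxvd2x covers both A elements, so one
* accumulator per B value suffices (vs32, vs40, vs48, vs56).
*********************************************************************/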
#if defined(_AIX)
define(`LOAD4x2_1', `
#else
.macro LOAD4x2_1
#endif
lxvd2x vs0, 0, AO
lxvdsx vs24, 0, BO
lxvdsx vs25, o8, BO
lxvdsx vs26, o16, BO
lxvdsx vs27, o24, BO
addi AO, AO, 16
addi BO, BO, 32
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL4x2_I1', `
#else
.macro KERNEL4x2_I1
#endif
lxvd2x vs8, 0, AO
lxvdsx vs28, 0, BO
lxvdsx vs29, o8, BO
lxvdsx vs30, o16, BO
lxvdsx vs31, o24, BO
addi AO, AO, 16
addi BO, BO, 32
xvmuldp vs32, vs0, vs24
xvmuldp vs40, vs0, vs25
xvmuldp vs48, vs0, vs26
xvmuldp vs56, vs0, vs27
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL4x2_1', `
#else
.macro KERNEL4x2_1
#endif
lxvd2x vs8, 0, AO
lxvdsx vs28, 0, BO
lxvdsx vs29, o8, BO
lxvdsx vs30, o16, BO
lxvdsx vs31, o24, BO
addi AO, AO, 16
addi BO, BO, 32
xvmaddadp vs32, vs0, vs24
xvmaddadp vs40, vs0, vs25
xvmaddadp vs48, vs0, vs26
xvmaddadp vs56, vs0, vs27
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL4x2_2', `
#else
.macro KERNEL4x2_2
#endif
lxvd2x vs0, 0, AO
lxvdsx vs24, 0, BO
lxvdsx vs25, o8, BO
lxvdsx vs26, o16, BO
lxvdsx vs27, o24, BO
addi AO, AO, 16
addi BO, BO, 32
xvmaddadp vs32, vs8, vs28
xvmaddadp vs40, vs8, vs29
xvmaddadp vs48, vs8, vs30
xvmaddadp vs56, vs8, vs31
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL4x2_E2', `
#else
.macro KERNEL4x2_E2
#endif
xvmaddadp vs32, vs8, vs28
xvmaddadp vs40, vs8, vs29
xvmaddadp vs48, vs8, vs30
xvmaddadp vs56, vs8, vs31
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL4x2_SUBI1', `
#else
.macro KERNEL4x2_SUBI1
#endif
lxvd2x vs0, 0, AO
lxvdsx vs24, 0, BO
lxvdsx vs25, o8, BO
lxvdsx vs26, o16, BO
lxvdsx vs27, o24, BO
addi AO, AO, 16
addi BO, BO, 32
xvmuldp vs32, vs0, vs24
xvmuldp vs40, vs0, vs25
xvmuldp vs48, vs0, vs26
xvmuldp vs56, vs0, vs27
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL4x2_SUB1', `
#else
.macro KERNEL4x2_SUB1
#endif
lxvd2x vs0, 0, AO
lxvdsx vs24, 0, BO
lxvdsx vs25, o8, BO
lxvdsx vs26, o16, BO
lxvdsx vs27, o24, BO
addi AO, AO, 16
addi BO, BO, 32
xvmaddadp vs32, vs0, vs24
xvmaddadp vs40, vs0, vs25
xvmaddadp vs48, vs0, vs26
xvmaddadp vs56, vs0, vs27
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`SAVE4x2', `
#else
.macro SAVE4x2
#endif
mr T1, CO
#ifndef TRMMKERNEL
lxvd2x vs0, 0, T1
#endif
#ifndef TRMMKERNEL
xvmaddadp vs0, vs32, alpha_r
#else
xvmuldp vs0, vs32, alpha_r
#endif
stxvd2x vs0, 0, T1
add T1, T1, LDC
#ifndef TRMMKERNEL
lxvd2x vs8, 0, T1
#endif
#ifndef TRMMKERNEL
xvmaddadp vs8, vs40, alpha_r
#else
xvmuldp vs8, vs40, alpha_r
#endif
stxvd2x vs8, 0, T1
add T1, T1, LDC
#ifndef TRMMKERNEL
lxvd2x vs0, 0, T1
#endif
#ifndef TRMMKERNEL
xvmaddadp vs0, vs48, alpha_r
#else
xvmuldp vs0, vs48, alpha_r
#endif
stxvd2x vs0, 0, T1
add T1, T1, LDC
#ifndef TRMMKERNEL
lxvd2x vs8, 0, T1
#endif
#ifndef TRMMKERNEL
xvmaddadp vs8, vs56, alpha_r
#else
xvmuldp vs8, vs56, alpha_r
#endif
stxvd2x vs8, 0, T1
addi CO, CO, 16
#if defined(_AIX)
')
#else
.endm
#endif
/*********************************************************************
* Macros for N=4, M=1 *
*********************************************************************/
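/*********************************************************************
* N=4, M=1: scalar VSX. lxsdx loads a single double (for B as well,
* so no splat is needed) and xsmuldp/xsmaddadp are the scalar
* multiply and multiply-add counterparts of the vector forms above.
*********************************************************************/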
#if defined(_AIX)
define(`LOAD4x1_1', `
#else
.macro LOAD4x1_1
#endif
lxsdx vs0, 0, AO
lxsdx vs24, 0, BO
lxsdx vs25, o8, BO
lxsdx vs26, o16, BO
lxsdx vs27, o24, BO
addi AO, AO, 8
addi BO, BO, 32
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL4x1_I1', `
#else
.macro KERNEL4x1_I1
#endif
lxsdx vs8, 0, AO
lxsdx vs28, 0, BO
lxsdx vs29, o8, BO
lxsdx vs30, o16, BO
lxsdx vs31, o24, BO
addi AO, AO, 8
addi BO, BO, 32
xsmuldp vs32, vs0, vs24
xsmuldp vs40, vs0, vs25
xsmuldp vs48, vs0, vs26
xsmuldp vs56, vs0, vs27
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL4x1_1', `
#else
.macro KERNEL4x1_1
#endif
lxsdx vs8, 0, AO
lxsdx vs28, 0, BO
lxsdx vs29, o8, BO
lxsdx vs30, o16, BO
lxsdx vs31, o24, BO
addi AO, AO, 8
addi BO, BO, 32
xsmaddadp vs32, vs0, vs24
xsmaddadp vs40, vs0, vs25
xsmaddadp vs48, vs0, vs26
xsmaddadp vs56, vs0, vs27
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL4x1_2', `
#else
.macro KERNEL4x1_2
#endif
lxsdx vs0, 0, AO
lxsdx vs24, 0, BO
lxsdx vs25, o8, BO
lxsdx vs26, o16, BO
lxsdx vs27, o24, BO
addi AO, AO, 8
addi BO, BO, 32
xsmaddadp vs32, vs8, vs28
xsmaddadp vs40, vs8, vs29
xsmaddadp vs48, vs8, vs30
xsmaddadp vs56, vs8, vs31
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL4x1_E2', `
#else
.macro KERNEL4x1_E2
#endif
xsmaddadp vs32, vs8, vs28
xsmaddadp vs40, vs8, vs29
xsmaddadp vs48, vs8, vs30
xsmaddadp vs56, vs8, vs31
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL4x1_SUBI1', `
#else
.macro KERNEL4x1_SUBI1
#endif
lxsdx vs0, 0, AO
lxsdx vs24, 0, BO
lxsdx vs25, o8, BO
lxsdx vs26, o16, BO
lxsdx vs27, o24, BO
addi AO, AO, 8
addi BO, BO, 32
xsmuldp vs32, vs0, vs24
xsmuldp vs40, vs0, vs25
xsmuldp vs48, vs0, vs26
xsmuldp vs56, vs0, vs27
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL4x1_SUB1', `
#else
.macro KERNEL4x1_SUB1
#endif
lxsdx vs0, 0, AO
lxsdx vs24, 0, BO
lxsdx vs25, o8, BO
lxsdx vs26, o16, BO
lxsdx vs27, o24, BO
addi AO, AO, 8
addi BO, BO, 32
xsmaddadp vs32, vs0, vs24
xsmaddadp vs40, vs0, vs25
xsmaddadp vs48, vs0, vs26
xsmaddadp vs56, vs0, vs27
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`SAVE4x1', `
#else
.macro SAVE4x1
#endif
mr T1, CO
#ifndef TRMMKERNEL
lxsdx vs0, 0, T1
#endif
#ifndef TRMMKERNEL
xsmaddadp vs0, vs32, alpha_r
#else
xsmuldp vs0, vs32, alpha_r
#endif
stxsdx vs0, 0, T1
add T1, T1, LDC
#ifndef TRMMKERNEL
lxsdx vs8, 0, T1
#endif
#ifndef TRMMKERNEL
xsmaddadp vs8, vs40, alpha_r
#else
xsmuldp vs8, vs40, alpha_r
#endif
stxsdx vs8, 0, T1
add T1, T1, LDC
#ifndef TRMMKERNEL
lxsdx vs0, 0, T1
#endif
#ifndef TRMMKERNEL
xsmaddadp vs0, vs48, alpha_r
#else
xsmuldp vs0, vs48, alpha_r
#endif
stxsdx vs0, 0, T1
add T1, T1, LDC
#ifndef TRMMKERNEL
lxsdx vs8, 0, T1
#endif
#ifndef TRMMKERNEL
xsmaddadp vs8, vs56, alpha_r
#else
xsmuldp vs8, vs56, alpha_r
#endif
stxsdx vs8, 0, T1
addi CO, CO, 8
#if defined(_AIX)
')
#else
.endm
#endif
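/*********************************************************************
* Naming convention, common to every tile size in this file:
*   LOADmxn_1       preload the first A/B operands (the even regs)
*   KERNELmxn_I1    load the odd regs and start the accumulators
*                   with plain multiplies
*   KERNELmxn_1/_2  pipelined FMA steps: _1 consumes the even regs
*                   while refilling the odd ones, _2 the reverse
*   KERNELmxn_E2    drain step: FMAs on the odd regs, no loads
*   KERNELmxn_SUBI1/_SUB1  non-pipelined steps for the K remainder
*   SAVEmxn         scale by alpha_r and write the C tile; with
*                   TRMMKERNEL defined, C is written, not read
*
* Illustrative driver loop (a sketch of the intended use, not code
* from this file; the label is hypothetical):
*
*   LOAD4x1_1          // fetch k = 0 operands
*   KERNEL4x1_I1       // start accumulators, fetch k = 1
* .Lk_loop:
*   KERNEL4x1_2
*   KERNEL4x1_1
*   bdnz .Lk_loop
*   KERNEL4x1_E2       // drain the last fetched operands
*   SAVE4x1
*********************************************************************/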
/*********************************************************************
* Macros for N=2, M=16 *
*********************************************************************/
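/* 2x16 tile: each K step streams 16 doubles of A (vs0-vs7 or
   vs8-vs15, loaded as two 64-byte batches) against two B values
   splatted by lxvdsx (vs24/vs25 or vs28/vs29); accumulators are
   vs32-vs39 for b[0] and vs40-vs47 for b[1]. In C terms, one K
   step does (illustrative):
     for (i = 0; i < 16; i++) {
       c0[i] += a[i] * b[0];
       c1[i] += a[i] * b[1];
     }
*/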
#if defined(_AIX)
define(`LOAD2x16_1', `
#else
.macro LOAD2x16_1
#endif
lxvd2x vs0, 0, AO
lxvd2x vs1, o16, AO
lxvd2x vs2, o32, AO
lxvd2x vs3, o48, AO
lxvdsx vs24, 0, BO
lxvdsx vs25, o8, BO
addi AO, AO, 64
addi BO, BO, 16
lxvd2x vs4, 0, AO
lxvd2x vs5, o16, AO
lxvd2x vs6, o32, AO
lxvd2x vs7, o48, AO
addi AO, AO, 64
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x16_I1', `
#else
.macro KERNEL2x16_I1
#endif
lxvd2x vs8, 0, AO
lxvd2x vs9, o16, AO
lxvd2x vs10, o32, AO
lxvd2x vs11, o48, AO
lxvdsx vs28, 0, BO
lxvdsx vs29, o8, BO
addi AO, AO, 64
addi BO, BO, 16
lxvd2x vs12, 0, AO
lxvd2x vs13, o16, AO
lxvd2x vs14, o32, AO
lxvd2x vs15, o48, AO
addi AO, AO, 64
xvmuldp vs32, vs0, vs24
xvmuldp vs33, vs1, vs24
xvmuldp vs34, vs2, vs24
xvmuldp vs35, vs3, vs24
xvmuldp vs36, vs4, vs24
xvmuldp vs37, vs5, vs24
xvmuldp vs38, vs6, vs24
xvmuldp vs39, vs7, vs24
xvmuldp vs40, vs0, vs25
xvmuldp vs41, vs1, vs25
xvmuldp vs42, vs2, vs25
xvmuldp vs43, vs3, vs25
xvmuldp vs44, vs4, vs25
xvmuldp vs45, vs5, vs25
xvmuldp vs46, vs6, vs25
xvmuldp vs47, vs7, vs25
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x16_1', `
#else
.macro KERNEL2x16_1
#endif
lxvd2x vs8, 0, AO
lxvd2x vs9, o16, AO
lxvd2x vs10, o32, AO
lxvd2x vs11, o48, AO
lxvdsx vs28, 0, BO
lxvdsx vs29, o8, BO
addi AO, AO, 64
addi BO, BO, 16
lxvd2x vs12, 0, AO
lxvd2x vs13, o16, AO
lxvd2x vs14, o32, AO
lxvd2x vs15, o48, AO
addi AO, AO, 64
xvmaddadp vs32, vs0, vs24
xvmaddadp vs33, vs1, vs24
xvmaddadp vs34, vs2, vs24
xvmaddadp vs35, vs3, vs24
xvmaddadp vs36, vs4, vs24
xvmaddadp vs37, vs5, vs24
xvmaddadp vs38, vs6, vs24
xvmaddadp vs39, vs7, vs24
xvmaddadp vs40, vs0, vs25
xvmaddadp vs41, vs1, vs25
xvmaddadp vs42, vs2, vs25
xvmaddadp vs43, vs3, vs25
xvmaddadp vs44, vs4, vs25
xvmaddadp vs45, vs5, vs25
xvmaddadp vs46, vs6, vs25
xvmaddadp vs47, vs7, vs25
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x16_2', `
#else
.macro KERNEL2x16_2
#endif
lxvd2x vs0, 0, AO
lxvd2x vs1, o16, AO
lxvd2x vs2, o32, AO
lxvd2x vs3, o48, AO
lxvdsx vs24, 0, BO
lxvdsx vs25, o8, BO
addi AO, AO, 64
addi BO, BO, 16
lxvd2x vs4, 0, AO
lxvd2x vs5, o16, AO
lxvd2x vs6, o32, AO
lxvd2x vs7, o48, AO
addi AO, AO, 64
xvmaddadp vs32, vs8, vs28
xvmaddadp vs33, vs9, vs28
xvmaddadp vs34, vs10, vs28
xvmaddadp vs35, vs11, vs28
xvmaddadp vs36, vs12, vs28
xvmaddadp vs37, vs13, vs28
xvmaddadp vs38, vs14, vs28
xvmaddadp vs39, vs15, vs28
xvmaddadp vs40, vs8, vs29
xvmaddadp vs41, vs9, vs29
xvmaddadp vs42, vs10, vs29
xvmaddadp vs43, vs11, vs29
xvmaddadp vs44, vs12, vs29
xvmaddadp vs45, vs13, vs29
xvmaddadp vs46, vs14, vs29
xvmaddadp vs47, vs15, vs29
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x16_E2', `
#else
.macro KERNEL2x16_E2
#endif
xvmaddadp vs32, vs8, vs28
xvmaddadp vs33, vs9, vs28
xvmaddadp vs34, vs10, vs28
xvmaddadp vs35, vs11, vs28
xvmaddadp vs36, vs12, vs28
xvmaddadp vs37, vs13, vs28
xvmaddadp vs38, vs14, vs28
xvmaddadp vs39, vs15, vs28
xvmaddadp vs40, vs8, vs29
xvmaddadp vs41, vs9, vs29
xvmaddadp vs42, vs10, vs29
xvmaddadp vs43, vs11, vs29
xvmaddadp vs44, vs12, vs29
xvmaddadp vs45, vs13, vs29
xvmaddadp vs46, vs14, vs29
xvmaddadp vs47, vs15, vs29
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x16_SUBI1', `
#else
.macro KERNEL2x16_SUBI1
#endif
lxvd2x vs0, 0, AO
lxvd2x vs1, o16, AO
lxvd2x vs2, o32, AO
lxvd2x vs3, o48, AO
lxvdsx vs24, 0, BO
lxvdsx vs25, o8, BO
addi AO, AO, 64
addi BO, BO, 16
lxvd2x vs4, 0, AO
lxvd2x vs5, o16, AO
lxvd2x vs6, o32, AO
lxvd2x vs7, o48, AO
addi AO, AO, 64
xvmuldp vs32, vs0, vs24
xvmuldp vs33, vs1, vs24
xvmuldp vs34, vs2, vs24
xvmuldp vs35, vs3, vs24
xvmuldp vs36, vs4, vs24
xvmuldp vs37, vs5, vs24
xvmuldp vs38, vs6, vs24
xvmuldp vs39, vs7, vs24
xvmuldp vs40, vs0, vs25
xvmuldp vs41, vs1, vs25
xvmuldp vs42, vs2, vs25
xvmuldp vs43, vs3, vs25
xvmuldp vs44, vs4, vs25
xvmuldp vs45, vs5, vs25
xvmuldp vs46, vs6, vs25
xvmuldp vs47, vs7, vs25
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x16_SUB1', `
#else
.macro KERNEL2x16_SUB1
#endif
lxvd2x vs0, 0, AO
lxvd2x vs1, o16, AO
lxvd2x vs2, o32, AO
lxvd2x vs3, o48, AO
lxvdsx vs24, 0, BO
lxvdsx vs25, o8, BO
addi AO, AO, 64
addi BO, BO, 16
lxvd2x vs4, 0, AO
lxvd2x vs5, o16, AO
lxvd2x vs6, o32, AO
lxvd2x vs7, o48, AO
addi AO, AO, 64
xvmaddadp vs32, vs0, vs24
xvmaddadp vs33, vs1, vs24
xvmaddadp vs34, vs2, vs24
xvmaddadp vs35, vs3, vs24
xvmaddadp vs36, vs4, vs24
xvmaddadp vs37, vs5, vs24
xvmaddadp vs38, vs6, vs24
xvmaddadp vs39, vs7, vs24
xvmaddadp vs40, vs0, vs25
xvmaddadp vs41, vs1, vs25
xvmaddadp vs42, vs2, vs25
xvmaddadp vs43, vs3, vs25
xvmaddadp vs44, vs4, vs25
xvmaddadp vs45, vs5, vs25
xvmaddadp vs46, vs6, vs25
xvmaddadp vs47, vs7, vs25
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`SAVE2x16', `
#else
.macro SAVE2x16
#endif
mr T1, CO
addi T2, T1, 64
#ifndef TRMMKERNEL
lxvd2x vs0, 0, T1
lxvd2x vs1, o16, T1
lxvd2x vs2, o32, T1
lxvd2x vs3, o48, T1
lxvd2x vs4, 0, T2
lxvd2x vs5, o16, T2
lxvd2x vs6, o32, T2
lxvd2x vs7, o48, T2
#endif
#ifndef TRMMKERNEL
xvmaddadp vs0, vs32, alpha_r
xvmaddadp vs1, vs33, alpha_r
xvmaddadp vs2, vs34, alpha_r
xvmaddadp vs3, vs35, alpha_r
xvmaddadp vs4, vs36, alpha_r
xvmaddadp vs5, vs37, alpha_r
xvmaddadp vs6, vs38, alpha_r
xvmaddadp vs7, vs39, alpha_r
#else
xvmuldp vs0, vs32, alpha_r
xvmuldp vs1, vs33, alpha_r
xvmuldp vs2, vs34, alpha_r
xvmuldp vs3, vs35, alpha_r
xvmuldp vs4, vs36, alpha_r
xvmuldp vs5, vs37, alpha_r
xvmuldp vs6, vs38, alpha_r
xvmuldp vs7, vs39, alpha_r
#endif
stxvd2x vs0, 0, T1
stxvd2x vs1, o16, T1
stxvd2x vs2, o32, T1
stxvd2x vs3, o48, T1
stxvd2x vs4, 0, T2
stxvd2x vs5, o16, T2
stxvd2x vs6, o32, T2
stxvd2x vs7, o48, T2
add T1, T1, LDC
add T2, T2, LDC
#ifndef TRMMKERNEL
lxvd2x vs8, 0, T1
lxvd2x vs9, o16, T1
lxvd2x vs10, o32, T1
lxvd2x vs11, o48, T1
lxvd2x vs12, 0, T2
lxvd2x vs13, o16, T2
lxvd2x vs14, o32, T2
lxvd2x vs15, o48, T2
#endif
#ifndef TRMMKERNEL
xvmaddadp vs8, vs40, alpha_r
xvmaddadp vs9, vs41, alpha_r
xvmaddadp vs10, vs42, alpha_r
xvmaddadp vs11, vs43, alpha_r
xvmaddadp vs12, vs44, alpha_r
xvmaddadp vs13, vs45, alpha_r
xvmaddadp vs14, vs46, alpha_r
xvmaddadp vs15, vs47, alpha_r
#else
xvmuldp vs8, vs40, alpha_r
xvmuldp vs9, vs41, alpha_r
xvmuldp vs10, vs42, alpha_r
xvmuldp vs11, vs43, alpha_r
xvmuldp vs12, vs44, alpha_r
xvmuldp vs13, vs45, alpha_r
xvmuldp vs14, vs46, alpha_r
xvmuldp vs15, vs47, alpha_r
#endif
stxvd2x vs8, 0, T1
stxvd2x vs9, o16, T1
stxvd2x vs10, o32, T1
stxvd2x vs11, o48, T1
stxvd2x vs12, 0, T2
stxvd2x vs13, o16, T2
stxvd2x vs14, o32, T2
stxvd2x vs15, o48, T2
addi CO, CO, 128
#if defined(_AIX)
')
#else
.endm
#endif
/*********************************************************************
* Macros for N=2, M=8 *
*********************************************************************/
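/* 2x8 tile: one 64-byte batch of A per K step (vs0-vs3 or vs8-vs11)
   against two splatted B values; accumulators vs32-vs35 and
   vs40-vs43. */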
#if defined(_AIX)
define(`LOAD2x8_1', `
#else
.macro LOAD2x8_1
#endif
lxvd2x vs0, 0, AO
lxvd2x vs1, o16, AO
lxvd2x vs2, o32, AO
lxvd2x vs3, o48, AO
lxvdsx vs24, 0, BO
lxvdsx vs25, o8, BO
addi AO, AO, 64
addi BO, BO, 16
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x8_I1', `
#else
.macro KERNEL2x8_I1
#endif
lxvd2x vs8, 0, AO
lxvd2x vs9, o16, AO
lxvd2x vs10, o32, AO
lxvd2x vs11, o48, AO
lxvdsx vs28, 0, BO
lxvdsx vs29, o8, BO
addi AO, AO, 64
addi BO, BO, 16
xvmuldp vs32, vs0, vs24
xvmuldp vs33, vs1, vs24
xvmuldp vs34, vs2, vs24
xvmuldp vs35, vs3, vs24
xvmuldp vs40, vs0, vs25
xvmuldp vs41, vs1, vs25
xvmuldp vs42, vs2, vs25
xvmuldp vs43, vs3, vs25
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x8_1', `
#else
.macro KERNEL2x8_1
#endif
lxvd2x vs8, 0, AO
lxvd2x vs9, o16, AO
lxvd2x vs10, o32, AO
lxvd2x vs11, o48, AO
lxvdsx vs28, 0, BO
lxvdsx vs29, o8, BO
addi AO, AO, 64
addi BO, BO, 16
xvmaddadp vs32, vs0, vs24
xvmaddadp vs33, vs1, vs24
xvmaddadp vs34, vs2, vs24
xvmaddadp vs35, vs3, vs24
xvmaddadp vs40, vs0, vs25
xvmaddadp vs41, vs1, vs25
xvmaddadp vs42, vs2, vs25
xvmaddadp vs43, vs3, vs25
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x8_2', `
#else
.macro KERNEL2x8_2
#endif
lxvd2x vs0, 0, AO
lxvd2x vs1, o16, AO
lxvd2x vs2, o32, AO
lxvd2x vs3, o48, AO
lxvdsx vs24, 0, BO
lxvdsx vs25, o8, BO
addi AO, AO, 64
addi BO, BO, 16
xvmaddadp vs32, vs8, vs28
xvmaddadp vs33, vs9, vs28
xvmaddadp vs34, vs10, vs28
xvmaddadp vs35, vs11, vs28
xvmaddadp vs40, vs8, vs29
xvmaddadp vs41, vs9, vs29
xvmaddadp vs42, vs10, vs29
xvmaddadp vs43, vs11, vs29
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x8_E2', `
#else
.macro KERNEL2x8_E2
#endif
xvmaddadp vs32, vs8, vs28
xvmaddadp vs33, vs9, vs28
xvmaddadp vs34, vs10, vs28
xvmaddadp vs35, vs11, vs28
xvmaddadp vs40, vs8, vs29
xvmaddadp vs41, vs9, vs29
xvmaddadp vs42, vs10, vs29
xvmaddadp vs43, vs11, vs29
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x8_SUBI1', `
#else
.macro KERNEL2x8_SUBI1
#endif
lxvd2x vs0, 0, AO
lxvd2x vs1, o16, AO
lxvd2x vs2, o32, AO
lxvd2x vs3, o48, AO
lxvdsx vs24, 0, BO
lxvdsx vs25, o8, BO
addi AO, AO, 64
addi BO, BO, 16
xvmuldp vs32, vs0, vs24
xvmuldp vs33, vs1, vs24
xvmuldp vs34, vs2, vs24
xvmuldp vs35, vs3, vs24
xvmuldp vs40, vs0, vs25
xvmuldp vs41, vs1, vs25
xvmuldp vs42, vs2, vs25
xvmuldp vs43, vs3, vs25
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x8_SUB1', `
#else
.macro KERNEL2x8_SUB1
#endif
lxvd2x vs0, 0, AO
lxvd2x vs1, o16, AO
lxvd2x vs2, o32, AO
lxvd2x vs3, o48, AO
lxvdsx vs24, 0, BO
lxvdsx vs25, o8, BO
addi AO, AO, 64
addi BO, BO, 16
xvmaddadp vs32, vs0, vs24
xvmaddadp vs33, vs1, vs24
xvmaddadp vs34, vs2, vs24
xvmaddadp vs35, vs3, vs24
xvmaddadp vs40, vs0, vs25
xvmaddadp vs41, vs1, vs25
xvmaddadp vs42, vs2, vs25
xvmaddadp vs43, vs3, vs25
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`SAVE2x8', `
#else
.macro SAVE2x8
#endif
mr T1, CO
#ifndef TRMMKERNEL
lxvd2x vs0, 0, T1
lxvd2x vs1, o16, T1
lxvd2x vs2, o32, T1
lxvd2x vs3, o48, T1
#endif
#ifndef TRMMKERNEL
xvmaddadp vs0, vs32, alpha_r
xvmaddadp vs1, vs33, alpha_r
xvmaddadp vs2, vs34, alpha_r
xvmaddadp vs3, vs35, alpha_r
#else
xvmuldp vs0, vs32, alpha_r
xvmuldp vs1, vs33, alpha_r
xvmuldp vs2, vs34, alpha_r
xvmuldp vs3, vs35, alpha_r
#endif
stxvd2x vs0, 0, T1
stxvd2x vs1, o16, T1
stxvd2x vs2, o32, T1
stxvd2x vs3, o48, T1
add T1, T1, LDC
#ifndef TRMMKERNEL
lxvd2x vs8, 0, T1
lxvd2x vs9, o16, T1
lxvd2x vs10, o32, T1
lxvd2x vs11, o48, T1
#endif
#ifndef TRMMKERNEL
xvmaddadp vs8, vs40, alpha_r
xvmaddadp vs9, vs41, alpha_r
xvmaddadp vs10, vs42, alpha_r
xvmaddadp vs11, vs43, alpha_r
#else
xvmuldp vs8, vs40, alpha_r
xvmuldp vs9, vs41, alpha_r
xvmuldp vs10, vs42, alpha_r
xvmuldp vs11, vs43, alpha_r
#endif
stxvd2x vs8, 0, T1
stxvd2x vs9, o16, T1
stxvd2x vs10, o32, T1
stxvd2x vs11, o48, T1
addi CO, CO, 64
#if defined(_AIX)
')
#else
.endm
#endif
/*********************************************************************
* Macros for N=2, M=4 *
*********************************************************************/
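/* 2x4 tile: 32 bytes of A per K step (vs0-vs1 or vs8-vs9) against
   two splatted B values; accumulators vs32-vs33 and vs40-vs41. */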
#if defined(_AIX)
define(`LOAD2x4_1', `
#else
.macro LOAD2x4_1
#endif
lxvd2x vs0, 0, AO
lxvd2x vs1, o16, AO
lxvdsx vs24, 0, BO
lxvdsx vs25, o8, BO
addi AO, AO, 32
addi BO, BO, 16
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x4_I1', `
#else
.macro KERNEL2x4_I1
#endif
lxvd2x vs8, 0, AO
lxvd2x vs9, o16, AO
lxvdsx vs28, 0, BO
lxvdsx vs29, o8, BO
addi AO, AO, 32
addi BO, BO, 16
xvmuldp vs32, vs0, vs24
xvmuldp vs33, vs1, vs24
xvmuldp vs40, vs0, vs25
xvmuldp vs41, vs1, vs25
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x4_1', `
#else
.macro KERNEL2x4_1
#endif
lxvd2x vs8, 0, AO
lxvd2x vs9, o16, AO
lxvdsx vs28, 0, BO
lxvdsx vs29, o8, BO
addi AO, AO, 32
addi BO, BO, 16
xvmaddadp vs32, vs0, vs24
xvmaddadp vs33, vs1, vs24
xvmaddadp vs40, vs0, vs25
xvmaddadp vs41, vs1, vs25
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x4_2', `
#else
.macro KERNEL2x4_2
#endif
lxvd2x vs0, 0, AO
lxvd2x vs1, o16, AO
lxvdsx vs24, 0, BO
lxvdsx vs25, o8, BO
addi AO, AO, 32
addi BO, BO, 16
xvmaddadp vs32, vs8, vs28
xvmaddadp vs33, vs9, vs28
xvmaddadp vs40, vs8, vs29
xvmaddadp vs41, vs9, vs29
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x4_E2', `
#else
.macro KERNEL2x4_E2
#endif
xvmaddadp vs32, vs8, vs28
xvmaddadp vs33, vs9, vs28
xvmaddadp vs40, vs8, vs29
xvmaddadp vs41, vs9, vs29
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x4_SUBI1', `
#else
.macro KERNEL2x4_SUBI1
#endif
lxvd2x vs0, 0, AO
lxvd2x vs1, o16, AO
lxvdsx vs24, 0, BO
lxvdsx vs25, o8, BO
addi AO, AO, 32
addi BO, BO, 16
xvmuldp vs32, vs0, vs24
xvmuldp vs33, vs1, vs24
xvmuldp vs40, vs0, vs25
xvmuldp vs41, vs1, vs25
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x4_SUB1', `
#else
.macro KERNEL2x4_SUB1
#endif
lxvd2x vs0, 0, AO
lxvd2x vs1, o16, AO
lxvdsx vs24, 0, BO
lxvdsx vs25, o8, BO
addi AO, AO, 32
addi BO, BO, 16
xvmaddadp vs32, vs0, vs24
xvmaddadp vs33, vs1, vs24
xvmaddadp vs40, vs0, vs25
xvmaddadp vs41, vs1, vs25
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`SAVE2x4', `
#else
.macro SAVE2x4
#endif
mr T1, CO
#ifndef TRMMKERNEL
lxvd2x vs0, 0, T1
lxvd2x vs1, o16, T1
#endif
#ifndef TRMMKERNEL
xvmaddadp vs0, vs32, alpha_r
xvmaddadp vs1, vs33, alpha_r
#else
xvmuldp vs0, vs32, alpha_r
xvmuldp vs1, vs33, alpha_r
#endif
stxvd2x vs0, 0, T1
stxvd2x vs1, o16, T1
add T1, T1, LDC
#ifndef TRMMKERNEL
lxvd2x vs8, 0, T1
lxvd2x vs9, o16, T1
#endif
#ifndef TRMMKERNEL
xvmaddadp vs8, vs40, alpha_r
xvmaddadp vs9, vs41, alpha_r
#else
xvmuldp vs8, vs40, alpha_r
xvmuldp vs9, vs41, alpha_r
#endif
stxvd2x vs8, 0, T1
stxvd2x vs9, o16, T1
addi CO, CO, 32
#if defined(_AIX)
')
#else
.endm
#endif
/*********************************************************************
* Macros for N=2, M=2 *
*********************************************************************/
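/* 2x2 tile: a single vector of A (vs0 or vs8) against two splatted
   B values; accumulators vs32 and vs40. */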
#if defined(_AIX)
define(`LOAD2x2_1', `
#else
.macro LOAD2x2_1
#endif
lxvd2x vs0, 0, AO
lxvdsx vs24, 0, BO
lxvdsx vs25, o8, BO
addi AO, AO, 16
addi BO, BO, 16
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x2_I1', `
#else
.macro KERNEL2x2_I1
#endif
lxvd2x vs8, 0, AO
lxvdsx vs28, 0, BO
lxvdsx vs29, o8, BO
addi AO, AO, 16
addi BO, BO, 16
xvmuldp vs32, vs0, vs24
xvmuldp vs40, vs0, vs25
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x2_1', `
#else
.macro KERNEL2x2_1
#endif
lxvd2x vs8, 0, AO
lxvdsx vs28, 0, BO
lxvdsx vs29, o8, BO
addi AO, AO, 16
addi BO, BO, 16
xvmaddadp vs32, vs0, vs24
xvmaddadp vs40, vs0, vs25
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x2_2', `
#else
.macro KERNEL2x2_2
#endif
lxvd2x vs0, 0, AO
lxvdsx vs24, 0, BO
lxvdsx vs25, o8, BO
addi AO, AO, 16
addi BO, BO, 16
xvmaddadp vs32, vs8, vs28
xvmaddadp vs40, vs8, vs29
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x2_E2', `
#else
.macro KERNEL2x2_E2
#endif
xvmaddadp vs32, vs8, vs28
xvmaddadp vs40, vs8, vs29
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x2_SUBI1', `
#else
.macro KERNEL2x2_SUBI1
#endif
lxvd2x vs0, 0, AO
lxvdsx vs24, 0, BO
lxvdsx vs25, o8, BO
addi AO, AO, 16
addi BO, BO, 16
xvmuldp vs32, vs0, vs24
xvmuldp vs40, vs0, vs25
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x2_SUB1', `
#else
.macro KERNEL2x2_SUB1
#endif
lxvd2x vs0, 0, AO
lxvdsx vs24, 0, BO
lxvdsx vs25, o8, BO
addi AO, AO, 16
addi BO, BO, 16
xvmaddadp vs32, vs0, vs24
xvmaddadp vs40, vs0, vs25
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`SAVE2x2', `
#else
.macro SAVE2x2
#endif
mr T1, CO
#ifndef TRMMKERNEL
lxvd2x vs0, 0, T1
#endif
#ifndef TRMMKERNEL
xvmaddadp vs0, vs32, alpha_r
#else
xvmuldp vs0, vs32, alpha_r
#endif
stxvd2x vs0, 0, T1
add T1, T1, LDC
#ifndef TRMMKERNEL
lxvd2x vs8, 0, T1
#endif
#ifndef TRMMKERNEL
xvmaddadp vs8, vs40, alpha_r
#else
xvmuldp vs8, vs40, alpha_r
#endif
stxvd2x vs8, 0, T1
addi CO, CO, 16
#if defined(_AIX)
')
#else
.endm
#endif
/*********************************************************************
* Macros for N=2, M=1 *
*********************************************************************/
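/* 2x1 tile: scalar path; one A element and two B elements per K
   step via lxsdx, with scalar FMAs into vs32 and vs40. */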
#if defined(_AIX)
define(`LOAD2x1_1', `
#else
.macro LOAD2x1_1
#endif
lxsdx vs0, 0, AO
lxsdx vs24, 0, BO
lxsdx vs25, o8, BO
addi AO, AO, 8
addi BO, BO, 16
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x1_I1', `
#else
.macro KERNEL2x1_I1
#endif
lxsdx vs8, 0, AO
lxsdx vs28, 0, BO
lxsdx vs29, o8, BO
addi AO, AO, 8
addi BO, BO, 16
xsmuldp vs32, vs0, vs24
xsmuldp vs40, vs0, vs25
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x1_1', `
#else
.macro KERNEL2x1_1
#endif
lxsdx vs8, 0, AO
lxsdx vs28, 0, BO
lxsdx vs29, o8, BO
addi AO, AO, 8
addi BO, BO, 16
xsmaddadp vs32, vs0, vs24
xsmaddadp vs40, vs0, vs25
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x1_2', `
#else
.macro KERNEL2x1_2
#endif
lxsdx vs0, 0, AO
lxsdx vs24, 0, BO
lxsdx vs25, o8, BO
addi AO, AO, 8
addi BO, BO, 16
xsmaddadp vs32, vs8, vs28
xsmaddadp vs40, vs8, vs29
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x1_E2', `
#else
.macro KERNEL2x1_E2
#endif
xsmaddadp vs32, vs8, vs28
xsmaddadp vs40, vs8, vs29
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x1_SUBI1', `
#else
.macro KERNEL2x1_SUBI1
#endif
lxsdx vs0, 0, AO
lxsdx vs24, 0, BO
lxsdx vs25, o8, BO
addi AO, AO, 8
addi BO, BO, 16
xsmuldp vs32, vs0, vs24
xsmuldp vs40, vs0, vs25
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x1_SUB1', `
#else
.macro KERNEL2x1_SUB1
#endif
lxsdx vs0, 0, AO
lxsdx vs24, 0, BO
lxsdx vs25, o8, BO
addi AO, AO, 8
addi BO, BO, 16
xsmaddadp vs32, vs0, vs24
xsmaddadp vs40, vs0, vs25
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`SAVE2x1', `
#else
.macro SAVE2x1
#endif
mr T1, CO
#ifndef TRMMKERNEL
lxsdx vs0, 0, T1
#endif
#ifndef TRMMKERNEL
xsmaddadp vs0, vs32, alpha_r
#else
xsmuldp vs0, vs32, alpha_r
#endif
stxsdx vs0, 0, T1
add T1, T1, LDC
#ifndef TRMMKERNEL
lxsdx vs8, 0, T1
#endif
#ifndef TRMMKERNEL
xsmaddadp vs8, vs40, alpha_r
#else
xsmuldp vs8, vs40, alpha_r
#endif
stxsdx vs8, 0, T1
addi CO, CO, 8
#if defined(_AIX)
')
#else
.endm
#endif
/*********************************************************************
* Macros for N=1, M=16 *
*********************************************************************/
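/* 1x16 tile: same A traffic as the 2x16 case, but only one B value
   is splatted (vs24 or vs28) and BO advances by 8 bytes per K step;
   accumulators vs32-vs39. */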
#if defined(_AIX)
define(`LOAD1x16_1', `
#else
.macro LOAD1x16_1
#endif
lxvd2x vs0, 0, AO
lxvd2x vs1, o16, AO
lxvd2x vs2, o32, AO
lxvd2x vs3, o48, AO
lxvdsx vs24, 0, BO
addi AO, AO, 64
addi BO, BO, 8
lxvd2x vs4, 0, AO
lxvd2x vs5, o16, AO
lxvd2x vs6, o32, AO
lxvd2x vs7, o48, AO
addi AO, AO, 64
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x16_I1', `
#else
.macro KERNEL1x16_I1
#endif
lxvd2x vs8, 0, AO
lxvd2x vs9, o16, AO
lxvd2x vs10, o32, AO
lxvd2x vs11, o48, AO
lxvdsx vs28, 0, BO
addi AO, AO, 64
addi BO, BO, 8
lxvd2x vs12, 0, AO
lxvd2x vs13, o16, AO
lxvd2x vs14, o32, AO
lxvd2x vs15, o48, AO
addi AO, AO, 64
xvmuldp vs32, vs0, vs24
xvmuldp vs33, vs1, vs24
xvmuldp vs34, vs2, vs24
xvmuldp vs35, vs3, vs24
xvmuldp vs36, vs4, vs24
xvmuldp vs37, vs5, vs24
xvmuldp vs38, vs6, vs24
xvmuldp vs39, vs7, vs24
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x16_1', `
#else
.macro KERNEL1x16_1
#endif
lxvd2x vs8, 0, AO
lxvd2x vs9, o16, AO
lxvd2x vs10, o32, AO
lxvd2x vs11, o48, AO
lxvdsx vs28, 0, BO
addi AO, AO, 64
addi BO, BO, 8
lxvd2x vs12, 0, AO
lxvd2x vs13, o16, AO
lxvd2x vs14, o32, AO
lxvd2x vs15, o48, AO
addi AO, AO, 64
xvmaddadp vs32, vs0, vs24
xvmaddadp vs33, vs1, vs24
xvmaddadp vs34, vs2, vs24
xvmaddadp vs35, vs3, vs24
xvmaddadp vs36, vs4, vs24
xvmaddadp vs37, vs5, vs24
xvmaddadp vs38, vs6, vs24
xvmaddadp vs39, vs7, vs24
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x16_2', `
#else
.macro KERNEL1x16_2
#endif
lxvd2x vs0, 0, AO
lxvd2x vs1, o16, AO
lxvd2x vs2, o32, AO
lxvd2x vs3, o48, AO
lxvdsx vs24, 0, BO
addi AO, AO, 64
addi BO, BO, 8
lxvd2x vs4, 0, AO
lxvd2x vs5, o16, AO
lxvd2x vs6, o32, AO
lxvd2x vs7, o48, AO
addi AO, AO, 64
xvmaddadp vs32, vs8, vs28
xvmaddadp vs33, vs9, vs28
xvmaddadp vs34, vs10, vs28
xvmaddadp vs35, vs11, vs28
xvmaddadp vs36, vs12, vs28
xvmaddadp vs37, vs13, vs28
xvmaddadp vs38, vs14, vs28
xvmaddadp vs39, vs15, vs28
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x16_E2', `
#else
.macro KERNEL1x16_E2
#endif
xvmaddadp vs32, vs8, vs28
xvmaddadp vs33, vs9, vs28
xvmaddadp vs34, vs10, vs28
xvmaddadp vs35, vs11, vs28
xvmaddadp vs36, vs12, vs28
xvmaddadp vs37, vs13, vs28
xvmaddadp vs38, vs14, vs28
xvmaddadp vs39, vs15, vs28
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x16_SUBI1', `
#else
.macro KERNEL1x16_SUBI1
#endif
lxvd2x vs0, 0, AO
lxvd2x vs1, o16, AO
lxvd2x vs2, o32, AO
lxvd2x vs3, o48, AO
lxvdsx vs24, 0, BO
addi AO, AO, 64
addi BO, BO, 8
lxvd2x vs4, 0, AO
lxvd2x vs5, o16, AO
lxvd2x vs6, o32, AO
lxvd2x vs7, o48, AO
addi AO, AO, 64
xvmuldp vs32, vs0, vs24
xvmuldp vs33, vs1, vs24
xvmuldp vs34, vs2, vs24
xvmuldp vs35, vs3, vs24
xvmuldp vs36, vs4, vs24
xvmuldp vs37, vs5, vs24
xvmuldp vs38, vs6, vs24
xvmuldp vs39, vs7, vs24
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x16_SUB1', `
#else
.macro KERNEL1x16_SUB1
#endif
lxvd2x vs0, 0, AO
lxvd2x vs1, o16, AO
lxvd2x vs2, o32, AO
lxvd2x vs3, o48, AO
lxvdsx vs24, 0, BO
addi AO, AO, 64
addi BO, BO, 8
lxvd2x vs4, 0, AO
lxvd2x vs5, o16, AO
lxvd2x vs6, o32, AO
lxvd2x vs7, o48, AO
addi AO, AO, 64
xvmaddadp vs32, vs0, vs24
xvmaddadp vs33, vs1, vs24
xvmaddadp vs34, vs2, vs24
xvmaddadp vs35, vs3, vs24
xvmaddadp vs36, vs4, vs24
xvmaddadp vs37, vs5, vs24
xvmaddadp vs38, vs6, vs24
xvmaddadp vs39, vs7, vs24
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`SAVE1x16', `
#else
.macro SAVE1x16
#endif
mr T1, CO
addi T2, T1, 64
#ifndef TRMMKERNEL
lxvd2x vs0, 0, T1
lxvd2x vs1, o16, T1
lxvd2x vs2, o32, T1
lxvd2x vs3, o48, T1
lxvd2x vs4, 0, T2
lxvd2x vs5, o16, T2
lxvd2x vs6, o32, T2
lxvd2x vs7, o48, T2
#endif
#ifndef TRMMKERNEL
xvmaddadp vs0, vs32, alpha_r
xvmaddadp vs1, vs33, alpha_r
xvmaddadp vs2, vs34, alpha_r
xvmaddadp vs3, vs35, alpha_r
xvmaddadp vs4, vs36, alpha_r
xvmaddadp vs5, vs37, alpha_r
xvmaddadp vs6, vs38, alpha_r
xvmaddadp vs7, vs39, alpha_r
#else
xvmuldp vs0, vs32, alpha_r
xvmuldp vs1, vs33, alpha_r
xvmuldp vs2, vs34, alpha_r
xvmuldp vs3, vs35, alpha_r
xvmuldp vs4, vs36, alpha_r
xvmuldp vs5, vs37, alpha_r
xvmuldp vs6, vs38, alpha_r
xvmuldp vs7, vs39, alpha_r
#endif
stxvd2x vs0, 0, T1
stxvd2x vs1, o16, T1
stxvd2x vs2, o32, T1
stxvd2x vs3, o48, T1
stxvd2x vs4, 0, T2
stxvd2x vs5, o16, T2
stxvd2x vs6, o32, T2
stxvd2x vs7, o48, T2
addi CO, CO, 128
#if defined(_AIX)
')
#else
.endm
#endif
/*********************************************************************
* Macros for N=1, M=8 *
*********************************************************************/
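/* 1x8 tile: vs0-vs3 or vs8-vs11 of A against one splatted B value;
   accumulators vs32-vs35. */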
#if defined(_AIX)
define(`LOAD1x8_1', `
#else
.macro LOAD1x8_1
#endif
lxvd2x vs0, 0, AO
lxvd2x vs1, o16, AO
lxvd2x vs2, o32, AO
lxvd2x vs3, o48, AO
lxvdsx vs24, 0, BO
addi AO, AO, 64
addi BO, BO, 8
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x8_I1', `
#else
.macro KERNEL1x8_I1
#endif
lxvd2x vs8, 0, AO
lxvd2x vs9, o16, AO
lxvd2x vs10, o32, AO
lxvd2x vs11, o48, AO
lxvdsx vs28, 0, BO
addi AO, AO, 64
addi BO, BO, 8
xvmuldp vs32, vs0, vs24
xvmuldp vs33, vs1, vs24
xvmuldp vs34, vs2, vs24
xvmuldp vs35, vs3, vs24
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x8_1', `
#else
.macro KERNEL1x8_1
#endif
lxvd2x vs8, 0, AO
lxvd2x vs9, o16, AO
lxvd2x vs10, o32, AO
lxvd2x vs11, o48, AO
lxvdsx vs28, 0, BO
addi AO, AO, 64
addi BO, BO, 8
xvmaddadp vs32, vs0, vs24
xvmaddadp vs33, vs1, vs24
xvmaddadp vs34, vs2, vs24
xvmaddadp vs35, vs3, vs24
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x8_2', `
#else
.macro KERNEL1x8_2
#endif
lxvd2x vs0, 0, AO
lxvd2x vs1, o16, AO
lxvd2x vs2, o32, AO
lxvd2x vs3, o48, AO
lxvdsx vs24, 0, BO
addi AO, AO, 64
addi BO, BO, 8
xvmaddadp vs32, vs8, vs28
xvmaddadp vs33, vs9, vs28
xvmaddadp vs34, vs10, vs28
xvmaddadp vs35, vs11, vs28
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x8_E2', `
#else
.macro KERNEL1x8_E2
#endif
xvmaddadp vs32, vs8, vs28
xvmaddadp vs33, vs9, vs28
xvmaddadp vs34, vs10, vs28
xvmaddadp vs35, vs11, vs28
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x8_SUBI1', `
#else
.macro KERNEL1x8_SUBI1
#endif
lxvd2x vs0, 0, AO
lxvd2x vs1, o16, AO
lxvd2x vs2, o32, AO
lxvd2x vs3, o48, AO
lxvdsx vs24, 0, BO
addi AO, AO, 64
addi BO, BO, 8
xvmuldp vs32, vs0, vs24
xvmuldp vs33, vs1, vs24
xvmuldp vs34, vs2, vs24
xvmuldp vs35, vs3, vs24
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x8_SUB1', `
#else
.macro KERNEL1x8_SUB1
#endif
lxvd2x vs0, 0, AO
lxvd2x vs1, o16, AO
lxvd2x vs2, o32, AO
lxvd2x vs3, o48, AO
lxvdsx vs24, 0, BO
addi AO, AO, 64
addi BO, BO, 8
xvmaddadp vs32, vs0, vs24
xvmaddadp vs33, vs1, vs24
xvmaddadp vs34, vs2, vs24
xvmaddadp vs35, vs3, vs24
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`SAVE1x8', `
#else
.macro SAVE1x8
#endif
mr T1, CO
#ifndef TRMMKERNEL
lxvd2x vs0, 0, T1
lxvd2x vs1, o16, T1
lxvd2x vs2, o32, T1
lxvd2x vs3, o48, T1
#endif
#ifndef TRMMKERNEL
xvmaddadp vs0, vs32, alpha_r
xvmaddadp vs1, vs33, alpha_r
xvmaddadp vs2, vs34, alpha_r
xvmaddadp vs3, vs35, alpha_r
#else
xvmuldp vs0, vs32, alpha_r
xvmuldp vs1, vs33, alpha_r
xvmuldp vs2, vs34, alpha_r
xvmuldp vs3, vs35, alpha_r
#endif
stxvd2x vs0, 0, T1
stxvd2x vs1, o16, T1
stxvd2x vs2, o32, T1
stxvd2x vs3, o48, T1
addi CO, CO, 64
#if defined(_AIX)
')
#else
.endm
#endif
/*********************************************************************
* Macros for N=1, M=4 *
*********************************************************************/
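/* 1x4 tile: vs0-vs1 or vs8-vs9 of A against one splatted B value;
   accumulators vs32-vs33. */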
#if defined(_AIX)
define(`LOAD1x4_1', `
#else
.macro LOAD1x4_1
#endif
lxvd2x vs0, 0, AO
lxvd2x vs1, o16, AO
lxvdsx vs24, 0, BO
addi AO, AO, 32
addi BO, BO, 8
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x4_I1', `
#else
.macro KERNEL1x4_I1
#endif
lxvd2x vs8, 0, AO
lxvd2x vs9, o16, AO
lxvdsx vs28, 0, BO
addi AO, AO, 32
addi BO, BO, 8
xvmuldp vs32, vs0, vs24
xvmuldp vs33, vs1, vs24
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x4_1', `
#else
.macro KERNEL1x4_1
#endif
lxvd2x vs8, 0, AO
lxvd2x vs9, o16, AO
lxvdsx vs28, 0, BO
addi AO, AO, 32
addi BO, BO, 8
xvmaddadp vs32, vs0, vs24
xvmaddadp vs33, vs1, vs24
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x4_2', `
#else
.macro KERNEL1x4_2
#endif
lxvd2x vs0, 0, AO
lxvd2x vs1, o16, AO
lxvdsx vs24, 0, BO
addi AO, AO, 32
addi BO, BO, 8
xvmaddadp vs32, vs8, vs28
xvmaddadp vs33, vs9, vs28
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x4_E2', `
#else
.macro KERNEL1x4_E2
#endif
xvmaddadp vs32, vs8, vs28
xvmaddadp vs33, vs9, vs28
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x4_SUBI1', `
#else
.macro KERNEL1x4_SUBI1
#endif
lxvd2x vs0, 0, AO
lxvd2x vs1, o16, AO
lxvdsx vs24, 0, BO
addi AO, AO, 32
addi BO, BO, 8
xvmuldp vs32, vs0, vs24
xvmuldp vs33, vs1, vs24
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x4_SUB1', `
#else
.macro KERNEL1x4_SUB1
#endif
lxvd2x vs0, 0, AO
lxvd2x vs1, o16, AO
lxvdsx vs24, 0, BO
addi AO, AO, 32
addi BO, BO, 8
xvmaddadp vs32, vs0, vs24
xvmaddadp vs33, vs1, vs24
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`SAVE1x4', `
#else
.macro SAVE1x4
#endif
mr T1, CO
#ifndef TRMMKERNEL
lxvd2x vs0, 0, T1
lxvd2x vs1, o16, T1
#endif
#ifndef TRMMKERNEL
xvmaddadp vs0, vs32, alpha_r
xvmaddadp vs1, vs33, alpha_r
#else
xvmuldp vs0, vs32, alpha_r
xvmuldp vs1, vs33, alpha_r
#endif
stxvd2x vs0, 0, T1
stxvd2x vs1, o16, T1
addi CO, CO, 32
#if defined(_AIX)
')
#else
.endm
#endif
/*********************************************************************
* Macros for N=1, M=2 *
*********************************************************************/
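/* 1x2 tile: one vector of A against one splatted B value; a single
   accumulator, vs32. */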
#if defined(_AIX)
define(`LOAD1x2_1', `
#else
.macro LOAD1x2_1
#endif
lxvd2x vs0, 0, AO
lxvdsx vs24, 0, BO
addi AO, AO, 16
addi BO, BO, 8
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x2_I1', `
#else
.macro KERNEL1x2_I1
#endif
lxvd2x vs8, 0, AO
lxvdsx vs28, 0, BO
addi AO, AO, 16
addi BO, BO, 8
xvmuldp vs32, vs0, vs24
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x2_1', `
#else
.macro KERNEL1x2_1
#endif
lxvd2x vs8, 0, AO
lxvdsx vs28, 0, BO
addi AO, AO, 16
addi BO, BO, 8
xvmaddadp vs32, vs0, vs24
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x2_2', `
#else
.macro KERNEL1x2_2
#endif
lxvd2x vs0, 0, AO
lxvdsx vs24, 0, BO
addi AO, AO, 16
addi BO, BO, 8
xvmaddadp vs32, vs8, vs28
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x2_E2', `
#else
.macro KERNEL1x2_E2
#endif
xvmaddadp vs32, vs8, vs28
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x2_SUBI1', `
#else
.macro KERNEL1x2_SUBI1
#endif
lxvd2x vs0, 0, AO
lxvdsx vs24, 0, BO
addi AO, AO, 16
addi BO, BO, 8
xvmuldp vs32, vs0, vs24
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x2_SUB1', `
#else
.macro KERNEL1x2_SUB1
#endif
lxvd2x vs0, 0, AO
lxvdsx vs24, 0, BO
addi AO, AO, 16
addi BO, BO, 8
xvmaddadp vs32, vs0, vs24
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`SAVE1x2', `
#else
.macro SAVE1x2
#endif
mr T1, CO
#ifndef TRMMKERNEL
lxvd2x vs0, 0, T1
#endif
#ifndef TRMMKERNEL
xvmaddadp vs0, vs32, alpha_r
#else
xvmuldp vs0, vs32, alpha_r
#endif
stxvd2x vs0, 0, T1
addi CO, CO, 16
#if defined(_AIX)
')
#else
.endm
#endif
/*********************************************************************
* Macros for N=1, M=1 *
*********************************************************************/
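/* 1x1 tile: fully scalar; one lxsdx from A and one from B per K
   step, one xsmaddadp into vs32. */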
#if defined(_AIX)
define(`LOAD1x1_1', `
#else
.macro LOAD1x1_1
#endif
lxsdx vs0, 0, AO
lxsdx vs24, 0, BO
addi AO, AO, 8
addi BO, BO, 8
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x1_I1', `
#else
.macro KERNEL1x1_I1
#endif
lxsdx vs8, 0, AO
lxsdx vs28, 0, BO
addi AO, AO, 8
addi BO, BO, 8
xsmuldp vs32, vs0, vs24
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x1_1', `
#else
.macro KERNEL1x1_1
#endif
lxsdx vs8, 0, AO
lxsdx vs28, 0, BO
addi AO, AO, 8
addi BO, BO, 8
xsmaddadp vs32, vs0, vs24
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x1_2', `
#else
.macro KERNEL1x1_2
#endif
lxsdx vs0, 0, AO
lxsdx vs24, 0, BO
addi AO, AO, 8
addi BO, BO, 8
xsmaddadp vs32, vs8, vs28
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x1_E2', `
#else
.macro KERNEL1x1_E2
#endif
xsmaddadp vs32, vs8, vs28
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x1_SUBI1', `
#else
.macro KERNEL1x1_SUBI1
#endif
lxsdx vs0, 0, AO
lxsdx vs24, 0, BO
addi AO, AO, 8
addi BO, BO, 8
xsmuldp vs32, vs0, vs24
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x1_SUB1', `
#else
.macro KERNEL1x1_SUB1
#endif
lxsdx vs0, 0, AO
lxsdx vs24, 0, BO
addi AO, AO, 8
addi BO, BO, 8
xsmaddadp vs32, vs0, vs24
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`SAVE1x1', `
#else
.macro SAVE1x1
#endif
mr T1, CO
#ifndef TRMMKERNEL
lxsdx vs0, 0, T1
#endif
#ifndef TRMMKERNEL
xsmaddadp vs0, vs32, alpha_r
#else
xsmuldp vs0, vs32, alpha_r
#endif
stxsdx vs0, 0, T1
addi CO, CO, 8
#if defined(_AIX)
')
#else
.endm
#endif