You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

dgemm_macros_power9.S 63 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511
661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542
055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762277227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529253025312532253325342535253625372538253925402541254225432544254525462547254825492550255125522553255425552556255725582559256025612562256325642565256625672568256925702571257225732574257525762577257825792580258125822583258425852586258725882589259025912592259325942595259625972598259926002601260226032604260526062607260826092610261126122613261426152616261726182619262026212622262326242625262626272628262926302631263226332634263526362637263826392640264126422643264426452646264726482649265026512652265326542655265626572658265926602661266226632664266526662667266826692670267126722673267426752676267726782679268026812682268326842685268626872688268926902691269226932694269526962697269826992700270127022703270427052706270727082709271027112712271327142715271627172718271927202721272227232724272527262727272827292730273127322733273427352736273727382739274027412742274327442745274627472748274927502751275227532754275527562757275827592760276127622763276427652766276727682769277027712772277327742775277627772778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943
29442945294629472948294929502951295229532954295529562957295829592960296129622963296429652966296729682969297029712972297329742975297629772978297929802981298229832984298529862987298829892990299129922993299429952996299729982999300030013002300330043005300630073008300930103011301230133014301530163017301830193020302130223023302430253026302730283029303030313032303330343035303630373038303930403041304230433044304530463047304830493050305130523053305430553056305730583059306030613062306330643065306630673068306930703071307230733074307530763077307830793080308130823083308430853086308730883089309030913092309330943095309630973098309931003101310231033104310531063107310831093110311131123113311431153116311731183119312031213122312331243125312631273128312931303131313231333134313531363137313831393140314131423143314431453146314731483149315031513152315331543155315631573158315931603161316231633164316531663167316831693170317131723173317431753176317731783179318031813182318331843185318631873188318931903191319231933194319531963197319831993200320132023203320432053206320732083209321032113212321332143215321632173218321932203221322232233224322532263227322832293230323132323233323432353236323732383239324032413242324332443245324632473248324932503251325232533254325532563257325832593260326132623263326432653266326732683269327032713272327332743275327632773278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623
  1. /***************************************************************************
  2. Copyright (c) 2013-2019, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. /**************************************************************************************
  28. * Abdelrauf(quickwritereader@googlemail.com)
  29. * BLASTEST : OK
  30. * CTEST : OK
  31. * TEST : OK
  32. * LAPACK-TEST : OK
  33. **************************************************************************************/
  34. /*********************************************************************
  35. * Macros for N=4, M=16 *
  36. *********************************************************************/
  37. .macro LOAD4x16_1
  38. LOAD4x16 1
  39. .endm
  40. .macro LOAD4x16_0
  41. LOAD4x16 0
  42. .endm
  43. .macro LOAD4x16 Zero
  44. lxv vs24, 0(BO)
  45. lxv vs26, 16(BO)
  46. xxpermdi vs25, vs24, vs24,2
  47. xxpermdi vs27, vs26, vs26,2
  48. lxv vs0, 0(AO)
  49. lxv vs1, 16(AO)
  50. lxv vs2, 32(AO)
  51. lxv vs3, 48(AO)
  52. lxv vs4, 64(AO)
  53. lxv vs5, 80(AO)
  54. lxv vs6, 96(AO)
  55. lxv vs7, 112(AO)
  56. .if \Zero==1
  57. xxlxor vs32,vs32,vs32
  58. xxlxor vs33,vs33,vs33
  59. xxlxor vs34,vs34,vs34
  60. xxlxor vs35,vs35,vs35
  61. xxlxor vs36,vs36,vs36
  62. xxlxor vs37,vs37,vs37
  63. xxlxor vs38,vs38,vs38
  64. xxlxor vs39,vs39,vs39
  65. xxlxor vs40, vs40, vs40
  66. xxlxor vs41, vs41, vs41
  67. xxlxor vs42, vs42, vs42
  68. xxlxor vs43, vs43, vs43
  69. xxlxor vs44, vs44, vs44
  70. xxlxor vs45, vs45, vs45
  71. xxlxor vs46, vs46, vs46
  72. xxlxor vs47, vs47, vs47
  73. xxlxor vs48, vs48, vs48
  74. xxlxor vs49, vs49, vs49
  75. xxlxor vs50, vs50, vs50
  76. xxlxor vs51, vs51, vs51
  77. xxlxor vs52, vs52, vs52
  78. xxlxor vs53, vs53, vs53
  79. xxlxor vs54, vs54, vs54
  80. xxlxor vs55, vs55, vs55
  81. xxlxor vs56, vs56, vs56
  82. xxlxor vs57, vs57, vs57
  83. xxlxor vs58, vs58, vs58
  84. xxlxor vs59, vs59, vs59
  85. xxlxor vs60, vs60, vs60
  86. xxlxor vs61, vs61, vs61
  87. xxlxor vs62, vs62, vs62
  88. xxlxor vs63, vs63, vs63
  89. .endif
  90. .endm
  91. #define unit_size 8
  92. #define DISP32(ind,disp) (ind*unit_size*32+disp)
  93. #define DISP16(ind,disp) (ind*unit_size*16+disp)
  94. #define DISP8(ind,disp) (ind*unit_size*8+disp)
  95. #define DISP4(ind,disp) (ind*unit_size*4+disp)
  96. #define DISP2(ind,disp) (ind*unit_size*2+disp)
  97. #define DISP1(ind,disp) (ind*unit_size+disp)
  98. .macro KERNEL4x16_L1_L2 Index,IsLast
  99. KERNEL4x16_L1_L2_I AO,BO, 0,0,0, \Index,\IsLast,0
  100. .endm
  101. .macro KERNEL4x16_I1_L2 OffsetA,OffsetB, Index,IsLast
  102. KERNEL4x16_L1_L2_I AO,BO,1,\OffsetA,\OffsetB,\Index,\IsLast,0
  103. .endm
  104. .macro KERNEL4x16_I1_L2_2 OffsetA,OffsetB, Index,IsLast
  105. KERNEL4x16_L1_L2_I AO,BO, 0,\OffsetA,\OffsetB,\Index,\IsLast,0
  106. .endm
  107. .macro KERNEL4x16_I1_L2_3 OffsetA,OffsetB, Index,IsLast
  108. KERNEL4x16_L1_L2_I AO,BO, 0,\OffsetA,\OffsetB,\Index,\IsLast,1
  109. .endm
  110. .macro KERNEL4x16_I2_L2 AREG,BREG,OffsetA,OffsetB, Index,IsLast
  111. KERNEL4x16_L1_L2_I \AREG,\BREG,1,\OffsetA,\OffsetB,\Index,\IsLast,0
  112. .endm
  113. .macro KERNEL4x16_I2_L2_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast
  114. KERNEL4x16_L1_L2_I \AREG,\BREG, 0,\OffsetA,\OffsetB,\Index,\IsLast,0
  115. .endm
  116. .macro KERNEL4x16_I2_L2_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast
  117. KERNEL4x16_L1_L2_I \AREG,\BREG, 0,\OffsetA,\OffsetB,\Index,\IsLast,1
  118. .endm
  119. .macro KERNEL4x16_L1_L2_I AREG,BREG, First, OffsetA,OffsetB, Index,IsLast ,Complete
  120. .if \First ==1
  121. xvmuldp vs32, vs0, vs24
  122. xvmuldp vs33, vs1, vs24
  123. xvmuldp vs34, vs2, vs24
  124. xvmuldp vs35, vs3, vs24
  125. .else
  126. xvmaddadp vs32, vs0, vs24
  127. xvmaddadp vs33, vs1, vs24
  128. xvmaddadp vs34, vs2, vs24
  129. xvmaddadp vs35, vs3, vs24
  130. .endif
  131. lxv vs8, DISP32(\Index,0+\OffsetA)(\AREG)
  132. lxv vs9, DISP32(\Index,16+\OffsetA)(\AREG)
  133. lxv vs10, DISP32(\Index,32+\OffsetA)(\AREG)
  134. lxv vs11, DISP32(\Index,48+\OffsetA)(\AREG)
  135. .if \First ==1
  136. xvmuldp vs36, vs4, vs24
  137. xvmuldp vs37, vs5, vs24
  138. xvmuldp vs38, vs6, vs24
  139. xvmuldp vs39, vs7, vs24
  140. .else
  141. xvmaddadp vs36, vs4, vs24
  142. xvmaddadp vs37, vs5, vs24
  143. xvmaddadp vs38, vs6, vs24
  144. xvmaddadp vs39, vs7, vs24
  145. .endif
  146. lxv vs28, DISP8(\Index,0 +\OffsetB)(\BREG)
  147. lxv vs30, DISP8(\Index,16 +\OffsetB)(\BREG)
  148. xxpermdi vs29, vs28, vs28,2
  149. xxpermdi vs31, vs30, vs30,2
  150. .if \First ==1
  151. xvmuldp vs40, vs0, vs25
  152. xvmuldp vs41, vs1, vs25
  153. xvmuldp vs42, vs2, vs25
  154. xvmuldp vs43, vs3, vs25
  155. xvmuldp vs44, vs4, vs25
  156. xvmuldp vs45, vs5, vs25
  157. xvmuldp vs46, vs6, vs25
  158. xvmuldp vs47, vs7, vs25
  159. xvmuldp vs48, vs0, vs26
  160. xvmuldp vs49, vs1, vs26
  161. xvmuldp vs50, vs2, vs26
  162. xvmuldp vs51, vs3, vs26
  163. .else
  164. xvmaddadp vs40, vs0, vs25
  165. xvmaddadp vs41, vs1, vs25
  166. xvmaddadp vs42, vs2, vs25
  167. xvmaddadp vs43, vs3, vs25
  168. xvmaddadp vs44, vs4, vs25
  169. xvmaddadp vs45, vs5, vs25
  170. xvmaddadp vs46, vs6, vs25
  171. xvmaddadp vs47, vs7, vs25
  172. xvmaddadp vs48, vs0, vs26
  173. xvmaddadp vs49, vs1, vs26
  174. xvmaddadp vs50, vs2, vs26
  175. xvmaddadp vs51, vs3, vs26
  176. .endif
  177. lxv vs12, DISP32(\Index,64+\OffsetA)(\AREG)
  178. lxv vs13, DISP32(\Index,80+\OffsetA)(\AREG)
  179. .if \First ==1
  180. xvmuldp vs52, vs4, vs26
  181. xvmuldp vs53, vs5, vs26
  182. xvmuldp vs54, vs6, vs26
  183. xvmuldp vs55, vs7, vs26
  184. .else
  185. xvmaddadp vs52, vs4, vs26
  186. xvmaddadp vs53, vs5, vs26
  187. xvmaddadp vs54, vs6, vs26
  188. xvmaddadp vs55, vs7, vs26
  189. .endif
  190. lxv vs14, DISP32(\Index,96+\OffsetA)(\AREG)
  191. lxv vs15, DISP32(\Index,112+\OffsetA)(\AREG)
  192. .if \First ==1
  193. xvmuldp vs56, vs0, vs27
  194. xvmuldp vs57, vs1, vs27
  195. xvmuldp vs58, vs2, vs27
  196. xvmuldp vs59, vs3, vs27
  197. xvmuldp vs60, vs4, vs27
  198. xvmuldp vs61, vs5, vs27
  199. xvmuldp vs62, vs6, vs27
  200. xvmuldp vs63, vs7, vs27
  201. .else
  202. xvmaddadp vs56, vs0, vs27
  203. xvmaddadp vs57, vs1, vs27
  204. xvmaddadp vs58, vs2, vs27
  205. xvmaddadp vs59, vs3, vs27
  206. xvmaddadp vs60, vs4, vs27
  207. xvmaddadp vs61, vs5, vs27
  208. xvmaddadp vs62, vs6, vs27
  209. xvmaddadp vs63, vs7, vs27
  210. .endif
  211. xvmaddadp vs32, vs8, vs28
  212. xvmaddadp vs33, vs9, vs28
  213. xvmaddadp vs34, vs10, vs28
  214. xvmaddadp vs35, vs11, vs28
  215. .if \Complete==0
  216. lxv vs0, DISP32(\Index,128+\OffsetA)(\AREG)
  217. lxv vs1, DISP32(\Index,144+\OffsetA)(\AREG)
  218. .endif
  219. xvmaddadp vs36, vs12, vs28
  220. xvmaddadp vs37, vs13, vs28
  221. xvmaddadp vs38, vs14, vs28
  222. xvmaddadp vs39, vs15, vs28
  223. .if \Complete==0
  224. lxv vs24, DISP8(\Index,32 +\OffsetB)(\BREG)
  225. lxv vs26, DISP8(\Index,48 +\OffsetB)(\BREG)
  226. xxpermdi vs25, vs24, vs24,2
  227. xxpermdi vs27, vs26, vs26,2
  228. .endif
  229. xvmaddadp vs40, vs8, vs29
  230. xvmaddadp vs41, vs9, vs29
  231. xvmaddadp vs42, vs10, vs29
  232. xvmaddadp vs43, vs11, vs29
  233. .if \Complete==0
  234. lxv vs2, DISP32(\Index,160+\OffsetA)(\AREG)
  235. lxv vs3, DISP32(\Index,176+\OffsetA)(\AREG)
  236. .endif
  237. xvmaddadp vs44, vs12, vs29
  238. xvmaddadp vs45, vs13, vs29
  239. xvmaddadp vs46, vs14, vs29
  240. xvmaddadp vs47, vs15, vs29
  241. xvmaddadp vs48, vs8, vs30
  242. xvmaddadp vs49, vs9, vs30
  243. xvmaddadp vs50, vs10, vs30
  244. xvmaddadp vs51, vs11, vs30
  245. .if \Complete==0
  246. lxv vs4, DISP32(\Index,192+\OffsetA)(\AREG)
  247. lxv vs5, DISP32(\Index,208+\OffsetA)(\AREG)
  248. .endif
  249. xvmaddadp vs52, vs12, vs30
  250. xvmaddadp vs53, vs13, vs30
  251. xvmaddadp vs54, vs14, vs30
  252. xvmaddadp vs55, vs15, vs30
  253. .if \Complete==0
  254. lxv vs6, DISP32(\Index,224+\OffsetA)(\AREG)
  255. lxv vs7, DISP32(\Index,240+\OffsetA)(\AREG)
  256. .endif
  257. xvmaddadp vs56, vs8, vs31
  258. xvmaddadp vs57, vs9, vs31
  259. xvmaddadp vs58, vs10, vs31
  260. xvmaddadp vs59, vs11, vs31
  261. xvmaddadp vs60, vs12, vs31
  262. xvmaddadp vs61, vs13, vs31
  263. xvmaddadp vs62, vs14, vs31
  264. xvmaddadp vs63, vs15, vs31
  265. .if \IsLast==1
  266. .if \Complete==1
  267. addi \AREG, \AREG, DISP32(\Index,128+\OffsetA)
  268. addi \BREG, \BREG, DISP8(\Index,32+\OffsetB)
  269. .else
  270. addi \AREG, \AREG, DISP32(\Index,256)
  271. addi \BREG, \BREG, DISP8(\Index,64)
  272. .endif
  273. .endif
  274. .endm
  275. .macro KERNEL4x16 First
  276. lxv vs24, 0(BO)
  277. lxv vs26, 16(BO)
  278. xxpermdi vs25, vs24, vs24,2
  279. xxpermdi vs27, vs26, vs26,2
  280. lxv vs0, 0(AO)
  281. lxv vs1, 16(AO)
  282. lxv vs2, 32(AO)
  283. lxv vs3, 48(AO)
  284. lxv vs4, 64(AO)
  285. lxv vs5, 80(AO)
  286. lxv vs6, 96(AO)
  287. lxv vs7, 112(AO)
  288. addi BO, BO, 32
  289. addi AO, AO, 128
  290. .if \First==1
  291. xvmuldp vs32, vs0, vs24
  292. xvmuldp vs33, vs1, vs24
  293. xvmuldp vs34, vs2, vs24
  294. xvmuldp vs35, vs3, vs24
  295. xvmuldp vs36, vs4, vs24
  296. xvmuldp vs37, vs5, vs24
  297. xvmuldp vs38, vs6, vs24
  298. xvmuldp vs39, vs7, vs24
  299. xvmuldp vs40, vs0, vs25
  300. xvmuldp vs41, vs1, vs25
  301. xvmuldp vs42, vs2, vs25
  302. xvmuldp vs43, vs3, vs25
  303. xvmuldp vs44, vs4, vs25
  304. xvmuldp vs45, vs5, vs25
  305. xvmuldp vs46, vs6, vs25
  306. xvmuldp vs47, vs7, vs25
  307. xvmuldp vs48, vs0, vs26
  308. xvmuldp vs49, vs1, vs26
  309. xvmuldp vs50, vs2, vs26
  310. xvmuldp vs51, vs3, vs26
  311. xvmuldp vs52, vs4, vs26
  312. xvmuldp vs53, vs5, vs26
  313. xvmuldp vs54, vs6, vs26
  314. xvmuldp vs55, vs7, vs26
  315. xvmuldp vs56, vs0, vs27
  316. xvmuldp vs57, vs1, vs27
  317. xvmuldp vs58, vs2, vs27
  318. xvmuldp vs59, vs3, vs27
  319. xvmuldp vs60, vs4, vs27
  320. xvmuldp vs61, vs5, vs27
  321. xvmuldp vs62, vs6, vs27
  322. xvmuldp vs63, vs7, vs27
  323. .else
  324. xvmaddadp vs32, vs0, vs24
  325. xvmaddadp vs33, vs1, vs24
  326. xvmaddadp vs34, vs2, vs24
  327. xvmaddadp vs35, vs3, vs24
  328. xvmaddadp vs36, vs4, vs24
  329. xvmaddadp vs37, vs5, vs24
  330. xvmaddadp vs38, vs6, vs24
  331. xvmaddadp vs39, vs7, vs24
  332. xvmaddadp vs40, vs0, vs25
  333. xvmaddadp vs41, vs1, vs25
  334. xvmaddadp vs42, vs2, vs25
  335. xvmaddadp vs43, vs3, vs25
  336. xvmaddadp vs44, vs4, vs25
  337. xvmaddadp vs45, vs5, vs25
  338. xvmaddadp vs46, vs6, vs25
  339. xvmaddadp vs47, vs7, vs25
  340. xvmaddadp vs48, vs0, vs26
  341. xvmaddadp vs49, vs1, vs26
  342. xvmaddadp vs50, vs2, vs26
  343. xvmaddadp vs51, vs3, vs26
  344. xvmaddadp vs52, vs4, vs26
  345. xvmaddadp vs53, vs5, vs26
  346. xvmaddadp vs54, vs6, vs26
  347. xvmaddadp vs55, vs7, vs26
  348. xvmaddadp vs56, vs0, vs27
  349. xvmaddadp vs57, vs1, vs27
  350. xvmaddadp vs58, vs2, vs27
  351. xvmaddadp vs59, vs3, vs27
  352. xvmaddadp vs60, vs4, vs27
  353. xvmaddadp vs61, vs5, vs27
  354. xvmaddadp vs62, vs6, vs27
  355. xvmaddadp vs63, vs7, vs27
  356. .endif
  357. .endm
  358. .macro SAVE4x16_REGS
  359. add C2, CO, LDC
  360. add C3, C2, LDC
  361. add C4, C3, LDC
  362. .endm
  363. .macro SAVE4x16
  364. #ifndef TRMMKERNEL
  365. lxv vs0, 0(CO)
  366. lxv vs2, 16(CO)
  367. lxv vs4, 32(CO)
  368. lxv vs6, 48(CO)
  369. #endif
  370. xxpermdi vs8, vs40,vs32,1
  371. xxpermdi vs9 ,vs32,vs40,1
  372. #ifndef TRMMKERNEL
  373. lxv vs24, 64(CO)
  374. lxv vs26, 80(CO)
  375. lxv vs28, 96(CO)
  376. lxv vs30, 112(CO)
  377. #endif
  378. xxpermdi vs10, vs41,vs33,1
  379. xxpermdi vs11 ,vs33,vs41,1
  380. #ifndef TRMMKERNEL
  381. lxv vs1, 0(C2)
  382. lxv vs3, 16(C2)
  383. lxv vs5, 32(C2)
  384. lxv vs7, 48(C2)
  385. #endif
  386. xxpermdi vs12, vs42,vs34,1
  387. xxpermdi vs13 ,vs34,vs42,1
  388. #ifndef TRMMKERNEL
  389. lxv vs25, 64(C2)
  390. lxv vs27, 80(C2)
  391. #endif
  392. xxpermdi vs14, vs43,vs35,1
  393. xxpermdi vs15 ,vs35,vs43,1
  394. #ifndef TRMMKERNEL
  395. lxv vs29, 96(C2)
  396. lxv vs31, 112(C2)
  397. #endif
  398. #ifndef TRMMKERNEL
  399. xvmaddadp vs0, vs8, alpha_r
  400. xvmaddadp vs1, vs9, alpha_r
  401. xvmaddadp vs2, vs10, alpha_r
  402. xvmaddadp vs3, vs11, alpha_r
  403. #else
  404. xvmuldp vs0, vs8, alpha_r
  405. xvmuldp vs1, vs9, alpha_r
  406. xvmuldp vs2, vs10, alpha_r
  407. xvmuldp vs3, vs11, alpha_r
  408. #endif
  409. xxpermdi vs8, vs44,vs36,1
  410. xxpermdi vs9 ,vs36,vs44,1
  411. xxpermdi vs10, vs45,vs37,1
  412. xxpermdi vs11 ,vs37,vs45,1
  413. #ifndef TRMMKERNEL
  414. xvmaddadp vs4, vs12, alpha_r
  415. xvmaddadp vs5, vs13, alpha_r
  416. xvmaddadp vs6, vs14, alpha_r
  417. xvmaddadp vs7, vs15, alpha_r
  418. #else
  419. xvmuldp vs4, vs12, alpha_r
  420. xvmuldp vs5, vs13, alpha_r
  421. xvmuldp vs6, vs14, alpha_r
  422. xvmuldp vs7, vs15, alpha_r
  423. #endif
  424. xxpermdi vs12, vs46,vs38,1
  425. xxpermdi vs13 ,vs38,vs46,1
  426. xxpermdi vs14, vs47,vs39,1
  427. xxpermdi vs15 ,vs39,vs47,1
  428. #ifndef TRMMKERNEL
  429. xvmaddadp vs24, vs8, alpha_r
  430. xvmaddadp vs25, vs9, alpha_r
  431. xvmaddadp vs26, vs10, alpha_r
  432. xvmaddadp vs27, vs11, alpha_r
  433. xvmaddadp vs28, vs12, alpha_r
  434. xvmaddadp vs29, vs13, alpha_r
  435. xvmaddadp vs30, vs14, alpha_r
  436. xvmaddadp vs31, vs15, alpha_r
  437. #else
  438. xvmuldp vs24, vs8, alpha_r
  439. xvmuldp vs25, vs9, alpha_r
  440. xvmuldp vs26, vs10, alpha_r
  441. xvmuldp vs27, vs11, alpha_r
  442. xvmuldp vs28, vs12, alpha_r
  443. xvmuldp vs29, vs13, alpha_r
  444. xvmuldp vs30, vs14, alpha_r
  445. xvmuldp vs31, vs15, alpha_r
  446. #endif
  447. stxv vs0, 0(CO)
  448. stxv vs2, 16(CO)
  449. stxv vs4, 32(CO)
  450. stxv vs6, 48(CO)
  451. stxv vs24, 64(CO)
  452. stxv vs26, 80(CO)
  453. stxv vs28, 96(CO)
  454. stxv vs30, 112(CO)
  455. stxv vs1, 0(C2)
  456. stxv vs3, 16(C2)
  457. stxv vs5, 32(C2)
  458. stxv vs7, 48(C2)
  459. stxv vs25, 64(C2)
  460. stxv vs27, 80(C2)
  461. stxv vs29, 96(C2)
  462. stxv vs31, 112(C2)
  463. #ifndef TRMMKERNEL
  464. lxv vs0, 0(C3)
  465. lxv vs2, 16(C3)
  466. lxv vs4, 32(C3)
  467. lxv vs6, 48(C3)
  468. #endif
  469. xxpermdi vs8, vs56,vs48,1
  470. xxpermdi vs9 ,vs48,vs56,1
  471. #ifndef TRMMKERNEL
  472. lxv vs24, 64(C3)
  473. lxv vs26, 80(C3)
  474. #endif
  475. xxpermdi vs10, vs57,vs49,1
  476. xxpermdi vs11 ,vs49,vs57,1
  477. #ifndef TRMMKERNEL
  478. lxv vs28, 96(C3)
  479. lxv vs30, 112(C3)
  480. #endif
  481. xxpermdi vs12, vs58,vs50,1
  482. xxpermdi vs13 ,vs50,vs58,1
  483. #ifndef TRMMKERNEL
  484. lxv vs1, 0(C4)
  485. lxv vs3, 16(C4)
  486. #endif
  487. xxpermdi vs14, vs59,vs51,1
  488. xxpermdi vs15 ,vs51,vs59,1
  489. #ifndef TRMMKERNEL
  490. lxv vs5, 32(C4)
  491. lxv vs7, 48(C4)
  492. lxv vs25, 64(C4)
  493. lxv vs27, 80(C4)
  494. lxv vs29, 96(C4)
  495. lxv vs31, 112(C4)
  496. #endif
  497. #ifndef TRMMKERNEL
  498. xvmaddadp vs0, vs8, alpha_r
  499. xvmaddadp vs1, vs9, alpha_r
  500. xvmaddadp vs2, vs10, alpha_r
  501. xvmaddadp vs3, vs11, alpha_r
  502. #else
  503. xvmuldp vs0, vs8, alpha_r
  504. xvmuldp vs1, vs9, alpha_r
  505. xvmuldp vs2, vs10, alpha_r
  506. xvmuldp vs3, vs11, alpha_r
  507. #endif
  508. xxpermdi vs8, vs60,vs52,1
  509. xxpermdi vs9 ,vs52,vs60,1
  510. xxpermdi vs10, vs61,vs53,1
  511. xxpermdi vs11 ,vs53,vs61,1
  512. #ifndef TRMMKERNEL
  513. xvmaddadp vs4, vs12, alpha_r
  514. xvmaddadp vs5, vs13, alpha_r
  515. xvmaddadp vs6, vs14, alpha_r
  516. xvmaddadp vs7, vs15, alpha_r
  517. #else
  518. xvmuldp vs4, vs12, alpha_r
  519. xvmuldp vs5, vs13, alpha_r
  520. xvmuldp vs6, vs14, alpha_r
  521. xvmuldp vs7, vs15, alpha_r
  522. #endif
  523. xxpermdi vs12, vs62,vs54,1
  524. xxpermdi vs13 ,vs54,vs62,1
  525. xxpermdi vs14, vs63,vs55,1
  526. xxpermdi vs15 ,vs55,vs63,1
  527. #ifndef TRMMKERNEL
  528. xvmaddadp vs24, vs8, alpha_r
  529. xvmaddadp vs25, vs9, alpha_r
  530. xvmaddadp vs26, vs10, alpha_r
  531. xvmaddadp vs27, vs11, alpha_r
  532. xvmaddadp vs28, vs12, alpha_r
  533. xvmaddadp vs29, vs13, alpha_r
  534. xvmaddadp vs30, vs14, alpha_r
  535. xvmaddadp vs31, vs15, alpha_r
  536. #else
  537. xvmuldp vs24, vs8, alpha_r
  538. xvmuldp vs25, vs9, alpha_r
  539. xvmuldp vs26, vs10, alpha_r
  540. xvmuldp vs27, vs11, alpha_r
  541. xvmuldp vs28, vs12, alpha_r
  542. xvmuldp vs29, vs13, alpha_r
  543. xvmuldp vs30, vs14, alpha_r
  544. xvmuldp vs31, vs15, alpha_r
  545. #endif
  546. stxv vs0, 0(C3)
  547. stxv vs2, 16(C3)
  548. stxv vs4, 32(C3)
  549. stxv vs6, 48(C3)
  550. stxv vs24, 64(C3)
  551. stxv vs26, 80(C3)
  552. stxv vs28, 96(C3)
  553. stxv vs30, 112(C3)
  554. stxv vs1, 0(C4)
  555. stxv vs3, 16(C4)
  556. stxv vs5, 32(C4)
  557. stxv vs7, 48(C4)
  558. stxv vs25, 64(C4)
  559. stxv vs27, 80(C4)
  560. stxv vs29, 96(C4)
  561. stxv vs31, 112(C4)
  562. addi CO, CO, 128
  563. .endm
  564. /*********************************************************************
  565. * Macros for N=4, M=8 *
  566. *********************************************************************/
  567. .macro LOAD4x8_1
  568. LOAD4x8 1
  569. .endm
  570. .macro LOAD4x8_0
  571. LOAD4x8 0
  572. .endm
  573. .macro LOAD4x8 Zero
  574. lxv vs24, 0(BO)
  575. lxv vs26, 16(BO)
  576. xxpermdi vs25, vs24, vs24,2
  577. xxpermdi vs27, vs26, vs26,2
  578. lxv vs0, 0(AO)
  579. lxv vs1, 16(AO)
  580. lxv vs2, 32(AO)
  581. lxv vs3, 48(AO)
  582. .if \Zero==1
  583. xxlxor vs32,vs32,vs32
  584. xxlxor vs33,vs33,vs33
  585. xxlxor vs34,vs34,vs34
  586. xxlxor vs35,vs35,vs35
  587. xxlxor vs40, vs40, vs40
  588. xxlxor vs41, vs41, vs41
  589. xxlxor vs42, vs42, vs42
  590. xxlxor vs43, vs43, vs43
  591. xxlxor vs48, vs48, vs48
  592. xxlxor vs49, vs49, vs49
  593. xxlxor vs50, vs50, vs50
  594. xxlxor vs51, vs51, vs51
  595. xxlxor vs56, vs56, vs56
  596. xxlxor vs57, vs57, vs57
  597. xxlxor vs58, vs58, vs58
  598. xxlxor vs59, vs59, vs59
  599. .endif
  600. .endm
  601. .macro KERNEL4x8_L1_L2 Index,IsLast
  602. KERNEL4x8_L1_L2_I 0,0,0, \Index,\IsLast,0
  603. .endm
  604. .macro KERNEL4x8_I1_L2 OffsetA,OffsetB, Index,IsLast
  605. KERNEL4x8_L1_L2_I 1,\OffsetA,\OffsetB,\Index,\IsLast,0
  606. .endm
  607. .macro KERNEL4x8_I1_L2_2 OffsetA,OffsetB, Index,IsLast
  608. KERNEL4x8_L1_L2_I 0,\OffsetA,\OffsetB,\Index,\IsLast,0
  609. .endm
  610. .macro KERNEL4x8_I1_L2_3 OffsetA,OffsetB, Index,IsLast
  611. KERNEL4x8_L1_L2_I 0,\OffsetA,\OffsetB,\Index,\IsLast,1
  612. .endm
  613. .macro KERNEL4x8_L1_L2_I First, OffsetA,OffsetB, Index,IsLast ,Complete
  614. lxv vs8, DISP16(\Index,0+\OffsetA)(AO)
  615. lxv vs9, DISP16(\Index,16+\OffsetA)(AO)
  616. .if \First ==1
  617. xvmuldp vs32, vs0, vs24
  618. xvmuldp vs33, vs1, vs24
  619. xvmuldp vs34, vs2, vs24
  620. xvmuldp vs35, vs3, vs24
  621. .else
  622. xvmaddadp vs32, vs0, vs24
  623. xvmaddadp vs33, vs1, vs24
  624. xvmaddadp vs34, vs2, vs24
  625. xvmaddadp vs35, vs3, vs24
  626. .endif
  627. lxv vs10, DISP16(\Index,32+\OffsetA)(AO)
  628. lxv vs11, DISP16(\Index,48+\OffsetA)(AO)
  629. .if \First ==1
  630. xvmuldp vs40, vs0, vs25
  631. xvmuldp vs41, vs1, vs25
  632. xvmuldp vs42, vs2, vs25
  633. xvmuldp vs43, vs3, vs25
  634. xvmuldp vs48, vs0, vs26
  635. xvmuldp vs49, vs1, vs26
  636. xvmuldp vs50, vs2, vs26
  637. xvmuldp vs51, vs3, vs26
  638. .else
  639. lxv vs28, DISP8(\Index,0 +\OffsetB)(BO)
  640. lxv vs30, DISP8(\Index,16 +\OffsetB)(BO)
  641. xvmaddadp vs40, vs0, vs25
  642. xvmaddadp vs41, vs1, vs25
  643. xvmaddadp vs42, vs2, vs25
  644. xvmaddadp vs43, vs3, vs25
  645. xvmaddadp vs48, vs0, vs26
  646. xvmaddadp vs49, vs1, vs26
  647. xvmaddadp vs50, vs2, vs26
  648. xvmaddadp vs51, vs3, vs26
  649. .endif
  650. xxpermdi vs29, vs28, vs28,2
  651. xxpermdi vs31, vs30, vs30,2
  652. .if \First ==1
  653. xvmuldp vs56, vs0, vs27
  654. xvmuldp vs57, vs1, vs27
  655. xvmuldp vs58, vs2, vs27
  656. xvmuldp vs59, vs3, vs27
  657. .else
  658. xvmaddadp vs56, vs0, vs27
  659. xvmaddadp vs57, vs1, vs27
  660. xvmaddadp vs58, vs2, vs27
  661. xvmaddadp vs59, vs3, vs27
  662. .endif
  663. xvmaddadp vs32, vs8, vs28
  664. xvmaddadp vs33, vs9, vs28
  665. xvmaddadp vs34, vs10, vs28
  666. xvmaddadp vs35, vs11, vs28
  667. .if \Complete==0
  668. lxv vs0, DISP16(\Index,64+\OffsetA)(AO)
  669. lxv vs1, DISP16(\Index,80+\OffsetA)(AO)
  670. .endif
  671. xvmaddadp vs40, vs8, vs29
  672. xvmaddadp vs41, vs9, vs29
  673. xvmaddadp vs42, vs10, vs29
  674. xvmaddadp vs43, vs11, vs29
  675. .if \Complete==0
  676. lxv vs2, DISP16(\Index,96+\OffsetA)(AO)
  677. lxv vs3, DISP16(\Index,112+\OffsetA)(AO)
  678. .endif
  679. xvmaddadp vs48, vs8, vs30
  680. xvmaddadp vs49, vs9, vs30
  681. xvmaddadp vs50, vs10, vs30
  682. xvmaddadp vs51, vs11, vs30
  683. .if \Complete==0
  684. lxv vs24, DISP8(\Index,32 +\OffsetB)(BO)
  685. lxv vs26, DISP8(\Index,48 +\OffsetB)(BO)
  686. .endif
  687. xvmaddadp vs56, vs8, vs31
  688. xvmaddadp vs57, vs9, vs31
  689. xvmaddadp vs58, vs10, vs31
  690. xvmaddadp vs59, vs11, vs31
  691. .if \Complete==0
  692. xxpermdi vs25, vs24, vs24,2
  693. xxpermdi vs27, vs26, vs26,2
  694. .endif
  695. .if \IsLast==1
  696. .if \Complete==1
  697. addi AO, AO, DISP16(\Index,64+\OffsetA)
  698. addi BO, BO, DISP8(\Index,32+\OffsetB)
  699. .else
  700. addi AO, AO, DISP16(\Index,128)
  701. addi BO, BO, DISP8(\Index,64)
  702. .endif
  703. .endif
  704. .endm
  705. .macro KERNEL4x8 First
  706. lxv vs24, 0(BO)
  707. lxv vs26, 16(BO)
  708. xxpermdi vs25, vs24, vs24,2
  709. xxpermdi vs27, vs26, vs26,2
  710. lxv vs0, 0(AO)
  711. lxv vs1, 16(AO)
  712. lxv vs2, 32(AO)
  713. lxv vs3, 48(AO)
  714. addi BO, BO, 32
  715. addi AO, AO, 64
  716. .if \First==1
  717. xvmuldp vs32, vs0, vs24
  718. xvmuldp vs33, vs1, vs24
  719. xvmuldp vs34, vs2, vs24
  720. xvmuldp vs35, vs3, vs24
  721. xvmuldp vs40, vs0, vs25
  722. xvmuldp vs41, vs1, vs25
  723. xvmuldp vs42, vs2, vs25
  724. xvmuldp vs43, vs3, vs25
  725. xvmuldp vs48, vs0, vs26
  726. xvmuldp vs49, vs1, vs26
  727. xvmuldp vs50, vs2, vs26
  728. xvmuldp vs51, vs3, vs26
  729. xvmuldp vs56, vs0, vs27
  730. xvmuldp vs57, vs1, vs27
  731. xvmuldp vs58, vs2, vs27
  732. xvmuldp vs59, vs3, vs27
  733. .else
  734. xvmaddadp vs32, vs0, vs24
  735. xvmaddadp vs33, vs1, vs24
  736. xvmaddadp vs34, vs2, vs24
  737. xvmaddadp vs35, vs3, vs24
  738. xvmaddadp vs40, vs0, vs25
  739. xvmaddadp vs41, vs1, vs25
  740. xvmaddadp vs42, vs2, vs25
  741. xvmaddadp vs43, vs3, vs25
  742. xvmaddadp vs48, vs0, vs26
  743. xvmaddadp vs49, vs1, vs26
  744. xvmaddadp vs50, vs2, vs26
  745. xvmaddadp vs51, vs3, vs26
  746. xvmaddadp vs56, vs0, vs27
  747. xvmaddadp vs57, vs1, vs27
  748. xvmaddadp vs58, vs2, vs27
  749. xvmaddadp vs59, vs3, vs27
  750. .endif
  751. .endm
  752. .macro SAVE4x8
  753. add T2, CO, LDC
  754. add T3, T2, LDC
  755. add T4, T3, LDC
  756. #ifndef TRMMKERNEL
  757. lxv vs0, 0(CO)
  758. lxv vs2, 16(CO)
  759. #endif
  760. xxpermdi vs8, vs40,vs32,1
  761. xxpermdi vs9 ,vs32,vs40,1
  762. #ifndef TRMMKERNEL
  763. lxv vs4, 32(CO)
  764. lxv vs6, 48(CO)
  765. #endif
  766. xxpermdi vs10, vs41,vs33,1
  767. xxpermdi vs11 ,vs33,vs41,1
  768. #ifndef TRMMKERNEL
  769. lxv vs1, 0(T2)
  770. lxv vs3, 16(T2)
  771. #endif
  772. xxpermdi vs12, vs42,vs34,1
  773. xxpermdi vs13 ,vs34,vs42,1
  774. #ifndef TRMMKERNEL
  775. lxv vs5, 32(T2)
  776. lxv vs7, 48(T2)
  777. #endif
  778. xxpermdi vs14, vs43,vs35,1
  779. xxpermdi vs15 ,vs35,vs43,1
  780. #ifndef TRMMKERNEL
  781. xvmaddadp vs0, vs8, alpha_r
  782. xvmaddadp vs1, vs9, alpha_r
  783. xvmaddadp vs2, vs10, alpha_r
  784. xvmaddadp vs3, vs11, alpha_r
  785. xvmaddadp vs4, vs12, alpha_r
  786. xvmaddadp vs5, vs13, alpha_r
  787. xvmaddadp vs6, vs14, alpha_r
  788. xvmaddadp vs7, vs15, alpha_r
  789. #else
  790. xvmuldp vs0, vs8, alpha_r
  791. xvmuldp vs1, vs9, alpha_r
  792. xvmuldp vs2, vs10, alpha_r
  793. xvmuldp vs3, vs11, alpha_r
  794. xvmuldp vs4, vs12, alpha_r
  795. xvmuldp vs5, vs13, alpha_r
  796. xvmuldp vs6, vs14, alpha_r
  797. xvmuldp vs7, vs15, alpha_r
  798. #endif
  799. stxv vs0, 0(CO)
  800. stxv vs2, 16(CO)
  801. stxv vs4, 32(CO)
  802. stxv vs6, 48(CO)
  803. stxv vs1, 0(T2)
  804. stxv vs3, 16(T2)
  805. stxv vs5, 32(T2)
  806. stxv vs7, 48(T2)
  807. xxpermdi vs8, vs56,vs48,1
  808. xxpermdi vs9 ,vs48,vs56,1
  809. #ifndef TRMMKERNEL
  810. lxv vs0, 0(T3)
  811. lxv vs2, 16(T3)
  812. #endif
  813. xxpermdi vs10, vs57,vs49,1
  814. xxpermdi vs11 ,vs49,vs57,1
  815. #ifndef TRMMKERNEL
  816. lxv vs4, 32(T3)
  817. lxv vs6, 48(T3)
  818. #endif
  819. xxpermdi vs12, vs58,vs50,1
  820. xxpermdi vs13 ,vs50,vs58,1
  821. #ifndef TRMMKERNEL
  822. lxv vs1, 0(T4)
  823. lxv vs3, 16(T4)
  824. #endif
  825. xxpermdi vs14, vs59,vs51,1
  826. xxpermdi vs15 ,vs51,vs59,1
  827. #ifndef TRMMKERNEL
  828. lxv vs5, 32(T4)
  829. lxv vs7, 48(T4)
  830. xvmaddadp vs0, vs8, alpha_r
  831. xvmaddadp vs1, vs9, alpha_r
  832. xvmaddadp vs2, vs10, alpha_r
  833. xvmaddadp vs3, vs11, alpha_r
  834. xvmaddadp vs4, vs12, alpha_r
  835. xvmaddadp vs5, vs13, alpha_r
  836. xvmaddadp vs6, vs14, alpha_r
  837. xvmaddadp vs7, vs15, alpha_r
  838. #else
  839. xvmuldp vs0, vs8, alpha_r
  840. xvmuldp vs1, vs9, alpha_r
  841. xvmuldp vs2, vs10, alpha_r
  842. xvmuldp vs3, vs11, alpha_r
  843. xvmuldp vs4, vs12, alpha_r
  844. xvmuldp vs5, vs13, alpha_r
  845. xvmuldp vs6, vs14, alpha_r
  846. xvmuldp vs7, vs15, alpha_r
  847. #endif
  848. stxv vs0, 0(T3)
  849. stxv vs2, 16(T3)
  850. stxv vs4, 32(T3)
  851. stxv vs6, 48(T3)
  852. stxv vs1, 0(T4)
  853. stxv vs3, 16(T4)
  854. stxv vs5, 32(T4)
  855. stxv vs7, 48(T4)
  856. addi CO, CO, 64
  857. .endm
  858. /*********************************************************************
  859. * Macros for N=4, M=4 *
  860. *********************************************************************/
  861. .macro LOAD4x4_1
  862. lxvd2x vs0, 0, AO
  863. lxvd2x vs1, o16, AO
  864. lxvdsx vs24, 0, BO
  865. lxvdsx vs25, o8, BO
  866. lxvdsx vs26, o16, BO
  867. lxvdsx vs27, o24, BO
  868. addi AO, AO, 32
  869. addi BO, BO, 32
  870. .endm
  871. .macro KERNEL4x4_I1
  872. lxvd2x vs8, 0, AO
  873. lxvd2x vs9, o16, AO
  874. lxvdsx vs28, 0, BO
  875. lxvdsx vs29, o8, BO
  876. lxvdsx vs30, o16, BO
  877. lxvdsx vs31, o24, BO
  878. addi AO, AO, 32
  879. addi BO, BO, 32
  880. xvmuldp vs32, vs0, vs24
  881. xvmuldp vs33, vs1, vs24
  882. xvmuldp vs40, vs0, vs25
  883. xvmuldp vs41, vs1, vs25
  884. xvmuldp vs48, vs0, vs26
  885. xvmuldp vs49, vs1, vs26
  886. xvmuldp vs56, vs0, vs27
  887. xvmuldp vs57, vs1, vs27
  888. .endm
  889. .macro KERNEL4x4_1
  890. lxvd2x vs8, 0, AO
  891. lxvd2x vs9, o16, AO
  892. lxvdsx vs28, 0, BO
  893. lxvdsx vs29, o8, BO
  894. lxvdsx vs30, o16, BO
  895. lxvdsx vs31, o24, BO
  896. addi AO, AO, 32
  897. addi BO, BO, 32
  898. xvmaddadp vs32, vs0, vs24
  899. xvmaddadp vs33, vs1, vs24
  900. xvmaddadp vs40, vs0, vs25
  901. xvmaddadp vs41, vs1, vs25
  902. xvmaddadp vs48, vs0, vs26
  903. xvmaddadp vs49, vs1, vs26
  904. xvmaddadp vs56, vs0, vs27
  905. xvmaddadp vs57, vs1, vs27
  906. .endm
  907. .macro KERNEL4x4_2
  908. lxvd2x vs0, 0, AO
  909. lxvd2x vs1, o16, AO
  910. lxvdsx vs24, 0, BO
  911. lxvdsx vs25, o8, BO
  912. lxvdsx vs26, o16, BO
  913. lxvdsx vs27, o24, BO
  914. addi AO, AO, 32
  915. addi BO, BO, 32
  916. xvmaddadp vs32, vs8, vs28
  917. xvmaddadp vs33, vs9, vs28
  918. xvmaddadp vs40, vs8, vs29
  919. xvmaddadp vs41, vs9, vs29
  920. xvmaddadp vs48, vs8, vs30
  921. xvmaddadp vs49, vs9, vs30
  922. xvmaddadp vs56, vs8, vs31
  923. xvmaddadp vs57, vs9, vs31
  924. .endm
  925. .macro KERNEL4x4_E2
  926. xvmaddadp vs32, vs8, vs28
  927. xvmaddadp vs33, vs9, vs28
  928. xvmaddadp vs40, vs8, vs29
  929. xvmaddadp vs41, vs9, vs29
  930. xvmaddadp vs48, vs8, vs30
  931. xvmaddadp vs49, vs9, vs30
  932. xvmaddadp vs56, vs8, vs31
  933. xvmaddadp vs57, vs9, vs31
  934. .endm
  935. .macro KERNEL4x4_SUBI1
  936. lxvd2x vs0, 0, AO
  937. lxvd2x vs1, o16, AO
  938. lxvdsx vs24, 0, BO
  939. lxvdsx vs25, o8, BO
  940. lxvdsx vs26, o16, BO
  941. lxvdsx vs27, o24, BO
  942. addi AO, AO, 32
  943. addi BO, BO, 32
  944. xvmuldp vs32, vs0, vs24
  945. xvmuldp vs33, vs1, vs24
  946. xvmuldp vs40, vs0, vs25
  947. xvmuldp vs41, vs1, vs25
  948. xvmuldp vs48, vs0, vs26
  949. xvmuldp vs49, vs1, vs26
  950. xvmuldp vs56, vs0, vs27
  951. xvmuldp vs57, vs1, vs27
  952. .endm
  953. .macro KERNEL4x4_SUB1
  954. lxvd2x vs0, 0, AO
  955. lxvd2x vs1, o16, AO
  956. lxvdsx vs24, 0, BO
  957. lxvdsx vs25, o8, BO
  958. lxvdsx vs26, o16, BO
  959. lxvdsx vs27, o24, BO
  960. addi AO, AO, 32
  961. addi BO, BO, 32
  962. xvmaddadp vs32, vs0, vs24
  963. xvmaddadp vs33, vs1, vs24
  964. xvmaddadp vs40, vs0, vs25
  965. xvmaddadp vs41, vs1, vs25
  966. xvmaddadp vs48, vs0, vs26
  967. xvmaddadp vs49, vs1, vs26
  968. xvmaddadp vs56, vs0, vs27
  969. xvmaddadp vs57, vs1, vs27
  970. .endm
  971. .macro SAVE4x4
  972. mr T1, CO
  973. #ifndef TRMMKERNEL
  974. lxvd2x vs0, 0, T1
  975. lxvd2x vs1, o16, T1
  976. #endif
  977. #ifndef TRMMKERNEL
  978. xvmaddadp vs0, vs32, alpha_r
  979. xvmaddadp vs1, vs33, alpha_r
  980. #else
  981. xvmuldp vs0, vs32, alpha_r
  982. xvmuldp vs1, vs33, alpha_r
  983. #endif
  984. stxvd2x vs0, 0, T1
  985. stxvd2x vs1, o16, T1
  986. add T1, T1, LDC
  987. #ifndef TRMMKERNEL
  988. lxvd2x vs8, 0, T1
  989. lxvd2x vs9, o16, T1
  990. #endif
  991. #ifndef TRMMKERNEL
  992. xvmaddadp vs8, vs40, alpha_r
  993. xvmaddadp vs9, vs41, alpha_r
  994. #else
  995. xvmuldp vs8, vs40, alpha_r
  996. xvmuldp vs9, vs41, alpha_r
  997. #endif
  998. stxvd2x vs8, 0, T1
  999. stxvd2x vs9, o16, T1
  1000. add T1, T1, LDC
  1001. #ifndef TRMMKERNEL
  1002. lxvd2x vs0, 0, T1
  1003. lxvd2x vs1, o16, T1
  1004. #endif
  1005. #ifndef TRMMKERNEL
  1006. xvmaddadp vs0, vs48, alpha_r
  1007. xvmaddadp vs1, vs49, alpha_r
  1008. #else
  1009. xvmuldp vs0, vs48, alpha_r
  1010. xvmuldp vs1, vs49, alpha_r
  1011. #endif
  1012. stxvd2x vs0, 0, T1
  1013. stxvd2x vs1, o16, T1
  1014. add T1, T1, LDC
  1015. #ifndef TRMMKERNEL
  1016. lxvd2x vs8, 0, T1
  1017. lxvd2x vs9, o16, T1
  1018. #endif
  1019. #ifndef TRMMKERNEL
  1020. xvmaddadp vs8, vs56, alpha_r
  1021. xvmaddadp vs9, vs57, alpha_r
  1022. #else
  1023. xvmuldp vs8, vs56, alpha_r
  1024. xvmuldp vs9, vs57, alpha_r
  1025. #endif
  1026. stxvd2x vs8, 0, T1
  1027. stxvd2x vs9, o16, T1
  1028. addi CO, CO, 32
  1029. .endm
  1030. /*********************************************************************
  1031. * Macros for N=4, M=2 *
  1032. *********************************************************************/
  1033. .macro LOAD4x2_1
  1034. lxvd2x vs0, 0, AO
  1035. lxvdsx vs24, 0, BO
  1036. lxvdsx vs25, o8, BO
  1037. lxvdsx vs26, o16, BO
  1038. lxvdsx vs27, o24, BO
  1039. addi AO, AO, 16
  1040. addi BO, BO, 32
  1041. .endm
  1042. .macro KERNEL4x2_I1
  1043. lxvd2x vs8, 0, AO
  1044. lxvdsx vs28, 0, BO
  1045. lxvdsx vs29, o8, BO
  1046. lxvdsx vs30, o16, BO
  1047. lxvdsx vs31, o24, BO
  1048. addi AO, AO, 16
  1049. addi BO, BO, 32
  1050. xvmuldp vs32, vs0, vs24
  1051. xvmuldp vs40, vs0, vs25
  1052. xvmuldp vs48, vs0, vs26
  1053. xvmuldp vs56, vs0, vs27
  1054. .endm
  1055. .macro KERNEL4x2_1
  1056. lxvd2x vs8, 0, AO
  1057. lxvdsx vs28, 0, BO
  1058. lxvdsx vs29, o8, BO
  1059. lxvdsx vs30, o16, BO
  1060. lxvdsx vs31, o24, BO
  1061. addi AO, AO, 16
  1062. addi BO, BO, 32
  1063. xvmaddadp vs32, vs0, vs24
  1064. xvmaddadp vs40, vs0, vs25
  1065. xvmaddadp vs48, vs0, vs26
  1066. xvmaddadp vs56, vs0, vs27
  1067. .endm
  1068. .macro KERNEL4x2_2
  1069. lxvd2x vs0, 0, AO
  1070. lxvdsx vs24, 0, BO
  1071. lxvdsx vs25, o8, BO
  1072. lxvdsx vs26, o16, BO
  1073. lxvdsx vs27, o24, BO
  1074. addi AO, AO, 16
  1075. addi BO, BO, 32
  1076. xvmaddadp vs32, vs8, vs28
  1077. xvmaddadp vs40, vs8, vs29
  1078. xvmaddadp vs48, vs8, vs30
  1079. xvmaddadp vs56, vs8, vs31
  1080. .endm
  1081. .macro KERNEL4x2_E2
  1082. xvmaddadp vs32, vs8, vs28
  1083. xvmaddadp vs40, vs8, vs29
  1084. xvmaddadp vs48, vs8, vs30
  1085. xvmaddadp vs56, vs8, vs31
  1086. .endm
  1087. .macro KERNEL4x2_SUBI1
  1088. lxvd2x vs0, 0, AO
  1089. lxvdsx vs24, 0, BO
  1090. lxvdsx vs25, o8, BO
  1091. lxvdsx vs26, o16, BO
  1092. lxvdsx vs27, o24, BO
  1093. addi AO, AO, 16
  1094. addi BO, BO, 32
  1095. xvmuldp vs32, vs0, vs24
  1096. xvmuldp vs40, vs0, vs25
  1097. xvmuldp vs48, vs0, vs26
  1098. xvmuldp vs56, vs0, vs27
  1099. .endm
  1100. .macro KERNEL4x2_SUB1
  1101. lxvd2x vs0, 0, AO
  1102. lxvdsx vs24, 0, BO
  1103. lxvdsx vs25, o8, BO
  1104. lxvdsx vs26, o16, BO
  1105. lxvdsx vs27, o24, BO
  1106. addi AO, AO, 16
  1107. addi BO, BO, 32
  1108. xvmaddadp vs32, vs0, vs24
  1109. xvmaddadp vs40, vs0, vs25
  1110. xvmaddadp vs48, vs0, vs26
  1111. xvmaddadp vs56, vs0, vs27
  1112. .endm
  1113. .macro SAVE4x2
  1114. mr T1, CO
  1115. #ifndef TRMMKERNEL
  1116. lxvd2x vs0, 0, T1
  1117. #endif
  1118. #ifndef TRMMKERNEL
  1119. xvmaddadp vs0, vs32, alpha_r
  1120. #else
  1121. xvmuldp vs0, vs32, alpha_r
  1122. #endif
  1123. stxvd2x vs0, 0, T1
  1124. add T1, T1, LDC
  1125. #ifndef TRMMKERNEL
  1126. lxvd2x vs8, 0, T1
  1127. #endif
  1128. #ifndef TRMMKERNEL
  1129. xvmaddadp vs8, vs40, alpha_r
  1130. #else
  1131. xvmuldp vs8, vs40, alpha_r
  1132. #endif
  1133. stxvd2x vs8, 0, T1
  1134. add T1, T1, LDC
  1135. #ifndef TRMMKERNEL
  1136. lxvd2x vs0, 0, T1
  1137. #endif
  1138. #ifndef TRMMKERNEL
  1139. xvmaddadp vs0, vs48, alpha_r
  1140. #else
  1141. xvmuldp vs0, vs48, alpha_r
  1142. #endif
  1143. stxvd2x vs0, 0, T1
  1144. add T1, T1, LDC
  1145. #ifndef TRMMKERNEL
  1146. lxvd2x vs8, 0, T1
  1147. #endif
  1148. #ifndef TRMMKERNEL
  1149. xvmaddadp vs8, vs56, alpha_r
  1150. #else
  1151. xvmuldp vs8, vs56, alpha_r
  1152. #endif
  1153. stxvd2x vs8, 0, T1
  1154. addi CO, CO, 16
  1155. .endm
  1156. /*********************************************************************
  1157. * Macros for N=4, M=1 *
  1158. *********************************************************************/
  1159. .macro LOAD4x1_1
  1160. lxsdx vs0, 0, AO
  1161. lxsdx vs24, 0, BO
  1162. lxsdx vs25, o8, BO
  1163. lxsdx vs26, o16, BO
  1164. lxsdx vs27, o24, BO
  1165. addi AO, AO, 8
  1166. addi BO, BO, 32
  1167. .endm
  1168. .macro KERNEL4x1_I1
  1169. lxsdx vs8, 0, AO
  1170. lxsdx vs28, 0, BO
  1171. lxsdx vs29, o8, BO
  1172. lxsdx vs30, o16, BO
  1173. lxsdx vs31, o24, BO
  1174. addi AO, AO, 8
  1175. addi BO, BO, 32
  1176. xsmuldp vs32, vs0, vs24
  1177. xsmuldp vs40, vs0, vs25
  1178. xsmuldp vs48, vs0, vs26
  1179. xsmuldp vs56, vs0, vs27
  1180. .endm
  1181. .macro KERNEL4x1_1
  1182. lxsdx vs8, 0, AO
  1183. lxsdx vs28, 0, BO
  1184. lxsdx vs29, o8, BO
  1185. lxsdx vs30, o16, BO
  1186. lxsdx vs31, o24, BO
  1187. addi AO, AO, 8
  1188. addi BO, BO, 32
  1189. xsmaddadp vs32, vs0, vs24
  1190. xsmaddadp vs40, vs0, vs25
  1191. xsmaddadp vs48, vs0, vs26
  1192. xsmaddadp vs56, vs0, vs27
  1193. .endm
  1194. .macro KERNEL4x1_2
  1195. lxsdx vs0, 0, AO
  1196. lxsdx vs24, 0, BO
  1197. lxsdx vs25, o8, BO
  1198. lxsdx vs26, o16, BO
  1199. lxsdx vs27, o24, BO
  1200. addi AO, AO, 8
  1201. addi BO, BO, 32
  1202. xsmaddadp vs32, vs8, vs28
  1203. xsmaddadp vs40, vs8, vs29
  1204. xsmaddadp vs48, vs8, vs30
  1205. xsmaddadp vs56, vs8, vs31
  1206. .endm
  1207. .macro KERNEL4x1_E2
  1208. xsmaddadp vs32, vs8, vs28
  1209. xsmaddadp vs40, vs8, vs29
  1210. xsmaddadp vs48, vs8, vs30
  1211. xsmaddadp vs56, vs8, vs31
  1212. .endm
  1213. .macro KERNEL4x1_SUBI1
  1214. lxsdx vs0, 0, AO
  1215. lxsdx vs24, 0, BO
  1216. lxsdx vs25, o8, BO
  1217. lxsdx vs26, o16, BO
  1218. lxsdx vs27, o24, BO
  1219. addi AO, AO, 8
  1220. addi BO, BO, 32
  1221. xsmuldp vs32, vs0, vs24
  1222. xsmuldp vs40, vs0, vs25
  1223. xsmuldp vs48, vs0, vs26
  1224. xsmuldp vs56, vs0, vs27
  1225. .endm
  1226. .macro KERNEL4x1_SUB1
  1227. lxsdx vs0, 0, AO
  1228. lxsdx vs24, 0, BO
  1229. lxsdx vs25, o8, BO
  1230. lxsdx vs26, o16, BO
  1231. lxsdx vs27, o24, BO
  1232. addi AO, AO, 8
  1233. addi BO, BO, 32
  1234. xsmaddadp vs32, vs0, vs24
  1235. xsmaddadp vs40, vs0, vs25
  1236. xsmaddadp vs48, vs0, vs26
  1237. xsmaddadp vs56, vs0, vs27
  1238. .endm
  1239. .macro SAVE4x1
  1240. mr T1, CO
  1241. #ifndef TRMMKERNEL
  1242. lxsdx vs0, 0, T1
  1243. #endif
  1244. #ifndef TRMMKERNEL
  1245. xsmaddadp vs0, vs32, alpha_r
  1246. #else
  1247. xsmuldp vs0, vs32, alpha_r
  1248. #endif
  1249. stxsdx vs0, 0, T1
  1250. add T1, T1, LDC
  1251. #ifndef TRMMKERNEL
  1252. lxsdx vs8, 0, T1
  1253. #endif
  1254. #ifndef TRMMKERNEL
  1255. xsmaddadp vs8, vs40, alpha_r
  1256. #else
  1257. xsmuldp vs8, vs40, alpha_r
  1258. #endif
  1259. stxsdx vs8, 0, T1
  1260. add T1, T1, LDC
  1261. #ifndef TRMMKERNEL
  1262. lxsdx vs0, 0, T1
  1263. #endif
  1264. #ifndef TRMMKERNEL
  1265. xsmaddadp vs0, vs48, alpha_r
  1266. #else
  1267. xsmuldp vs0, vs48, alpha_r
  1268. #endif
  1269. stxsdx vs0, 0, T1
  1270. add T1, T1, LDC
  1271. #ifndef TRMMKERNEL
  1272. lxsdx vs8, 0, T1
  1273. #endif
  1274. #ifndef TRMMKERNEL
  1275. xsmaddadp vs8, vs56, alpha_r
  1276. #else
  1277. xsmuldp vs8, vs56, alpha_r
  1278. #endif
  1279. stxsdx vs8, 0, T1
  1280. addi CO, CO, 8
  1281. .endm
  1282. /*********************************************************************
  1283. * Macros for N=2, M=16 *
  1284. *********************************************************************/
  1285. .macro LOAD2x16_1
  1286. lxvd2x vs0, 0, AO
  1287. lxvd2x vs1, o16, AO
  1288. lxvd2x vs2, o32, AO
  1289. lxvd2x vs3, o48, AO
  1290. lxvdsx vs24, 0, BO
  1291. lxvdsx vs25, o8, BO
  1292. addi AO, AO, 64
  1293. addi BO, BO, 16
  1294. lxvd2x vs4, 0, AO
  1295. lxvd2x vs5, o16, AO
  1296. lxvd2x vs6, o32, AO
  1297. lxvd2x vs7, o48, AO
  1298. addi AO, AO, 64
  1299. .endm
  1300. .macro KERNEL2x16_I1
  1301. lxvd2x vs8, 0, AO
  1302. lxvd2x vs9, o16, AO
  1303. lxvd2x vs10, o32, AO
  1304. lxvd2x vs11, o48, AO
  1305. lxvdsx vs28, 0, BO
  1306. lxvdsx vs29, o8, BO
  1307. addi AO, AO, 64
  1308. addi BO, BO, 16
  1309. lxvd2x vs12, 0, AO
  1310. lxvd2x vs13, o16, AO
  1311. lxvd2x vs14, o32, AO
  1312. lxvd2x vs15, o48, AO
  1313. addi AO, AO, 64
  1314. xvmuldp vs32, vs0, vs24
  1315. xvmuldp vs33, vs1, vs24
  1316. xvmuldp vs34, vs2, vs24
  1317. xvmuldp vs35, vs3, vs24
  1318. xvmuldp vs36, vs4, vs24
  1319. xvmuldp vs37, vs5, vs24
  1320. xvmuldp vs38, vs6, vs24
  1321. xvmuldp vs39, vs7, vs24
  1322. xvmuldp vs40, vs0, vs25
  1323. xvmuldp vs41, vs1, vs25
  1324. xvmuldp vs42, vs2, vs25
  1325. xvmuldp vs43, vs3, vs25
  1326. xvmuldp vs44, vs4, vs25
  1327. xvmuldp vs45, vs5, vs25
  1328. xvmuldp vs46, vs6, vs25
  1329. xvmuldp vs47, vs7, vs25
  1330. .endm
  1331. .macro KERNEL2x16_1
  1332. lxvd2x vs8, 0, AO
  1333. lxvd2x vs9, o16, AO
  1334. lxvd2x vs10, o32, AO
  1335. lxvd2x vs11, o48, AO
  1336. lxvdsx vs28, 0, BO
  1337. lxvdsx vs29, o8, BO
  1338. addi AO, AO, 64
  1339. addi BO, BO, 16
  1340. lxvd2x vs12, 0, AO
  1341. lxvd2x vs13, o16, AO
  1342. lxvd2x vs14, o32, AO
  1343. lxvd2x vs15, o48, AO
  1344. addi AO, AO, 64
  1345. xvmaddadp vs32, vs0, vs24
  1346. xvmaddadp vs33, vs1, vs24
  1347. xvmaddadp vs34, vs2, vs24
  1348. xvmaddadp vs35, vs3, vs24
  1349. xvmaddadp vs36, vs4, vs24
  1350. xvmaddadp vs37, vs5, vs24
  1351. xvmaddadp vs38, vs6, vs24
  1352. xvmaddadp vs39, vs7, vs24
  1353. xvmaddadp vs40, vs0, vs25
  1354. xvmaddadp vs41, vs1, vs25
  1355. xvmaddadp vs42, vs2, vs25
  1356. xvmaddadp vs43, vs3, vs25
  1357. xvmaddadp vs44, vs4, vs25
  1358. xvmaddadp vs45, vs5, vs25
  1359. xvmaddadp vs46, vs6, vs25
  1360. xvmaddadp vs47, vs7, vs25
  1361. .endm
  1362. .macro KERNEL2x16_2
  1363. lxvd2x vs0, 0, AO
  1364. lxvd2x vs1, o16, AO
  1365. lxvd2x vs2, o32, AO
  1366. lxvd2x vs3, o48, AO
  1367. lxvdsx vs24, 0, BO
  1368. lxvdsx vs25, o8, BO
  1369. addi AO, AO, 64
  1370. addi BO, BO, 16
  1371. lxvd2x vs4, 0, AO
  1372. lxvd2x vs5, o16, AO
  1373. lxvd2x vs6, o32, AO
  1374. lxvd2x vs7, o48, AO
  1375. addi AO, AO, 64
  1376. xvmaddadp vs32, vs8, vs28
  1377. xvmaddadp vs33, vs9, vs28
  1378. xvmaddadp vs34, vs10, vs28
  1379. xvmaddadp vs35, vs11, vs28
  1380. xvmaddadp vs36, vs12, vs28
  1381. xvmaddadp vs37, vs13, vs28
  1382. xvmaddadp vs38, vs14, vs28
  1383. xvmaddadp vs39, vs15, vs28
  1384. xvmaddadp vs40, vs8, vs29
  1385. xvmaddadp vs41, vs9, vs29
  1386. xvmaddadp vs42, vs10, vs29
  1387. xvmaddadp vs43, vs11, vs29
  1388. xvmaddadp vs44, vs12, vs29
  1389. xvmaddadp vs45, vs13, vs29
  1390. xvmaddadp vs46, vs14, vs29
  1391. xvmaddadp vs47, vs15, vs29
  1392. .endm
.macro KERNEL2x16_E2
xvmaddadp vs32, vs8, vs28
xvmaddadp vs33, vs9, vs28
xvmaddadp vs34, vs10, vs28
xvmaddadp vs35, vs11, vs28
xvmaddadp vs36, vs12, vs28
xvmaddadp vs37, vs13, vs28
xvmaddadp vs38, vs14, vs28
xvmaddadp vs39, vs15, vs28
xvmaddadp vs40, vs8, vs29
xvmaddadp vs41, vs9, vs29
xvmaddadp vs42, vs10, vs29
xvmaddadp vs43, vs11, vs29
xvmaddadp vs44, vs12, vs29
xvmaddadp vs45, vs13, vs29
xvmaddadp vs46, vs14, vs29
xvmaddadp vs47, vs15, vs29
.endm
.macro KERNEL2x16_SUBI1
lxvd2x vs0, 0, AO
lxvd2x vs1, o16, AO
lxvd2x vs2, o32, AO
lxvd2x vs3, o48, AO
lxvdsx vs24, 0, BO
lxvdsx vs25, o8, BO
addi AO, AO, 64
addi BO, BO, 16
lxvd2x vs4, 0, AO
lxvd2x vs5, o16, AO
lxvd2x vs6, o32, AO
lxvd2x vs7, o48, AO
addi AO, AO, 64
xvmuldp vs32, vs0, vs24
xvmuldp vs33, vs1, vs24
xvmuldp vs34, vs2, vs24
xvmuldp vs35, vs3, vs24
xvmuldp vs36, vs4, vs24
xvmuldp vs37, vs5, vs24
xvmuldp vs38, vs6, vs24
xvmuldp vs39, vs7, vs24
xvmuldp vs40, vs0, vs25
xvmuldp vs41, vs1, vs25
xvmuldp vs42, vs2, vs25
xvmuldp vs43, vs3, vs25
xvmuldp vs44, vs4, vs25
xvmuldp vs45, vs5, vs25
xvmuldp vs46, vs6, vs25
xvmuldp vs47, vs7, vs25
.endm
.macro KERNEL2x16_SUB1
lxvd2x vs0, 0, AO
lxvd2x vs1, o16, AO
lxvd2x vs2, o32, AO
lxvd2x vs3, o48, AO
lxvdsx vs24, 0, BO
lxvdsx vs25, o8, BO
addi AO, AO, 64
addi BO, BO, 16
lxvd2x vs4, 0, AO
lxvd2x vs5, o16, AO
lxvd2x vs6, o32, AO
lxvd2x vs7, o48, AO
addi AO, AO, 64
xvmaddadp vs32, vs0, vs24
xvmaddadp vs33, vs1, vs24
xvmaddadp vs34, vs2, vs24
xvmaddadp vs35, vs3, vs24
xvmaddadp vs36, vs4, vs24
xvmaddadp vs37, vs5, vs24
xvmaddadp vs38, vs6, vs24
xvmaddadp vs39, vs7, vs24
xvmaddadp vs40, vs0, vs25
xvmaddadp vs41, vs1, vs25
xvmaddadp vs42, vs2, vs25
xvmaddadp vs43, vs3, vs25
xvmaddadp vs44, vs4, vs25
xvmaddadp vs45, vs5, vs25
xvmaddadp vs46, vs6, vs25
xvmaddadp vs47, vs7, vs25
.endm
.macro SAVE2x16
mr T1, CO
addi T2, T1, 64
#ifndef TRMMKERNEL
lxvd2x vs0, 0, T1
lxvd2x vs1, o16, T1
lxvd2x vs2, o32, T1
lxvd2x vs3, o48, T1
lxvd2x vs4, 0, T2
lxvd2x vs5, o16, T2
lxvd2x vs6, o32, T2
lxvd2x vs7, o48, T2
#endif
#ifndef TRMMKERNEL
xvmaddadp vs0, vs32, alpha_r
xvmaddadp vs1, vs33, alpha_r
xvmaddadp vs2, vs34, alpha_r
xvmaddadp vs3, vs35, alpha_r
xvmaddadp vs4, vs36, alpha_r
xvmaddadp vs5, vs37, alpha_r
xvmaddadp vs6, vs38, alpha_r
xvmaddadp vs7, vs39, alpha_r
#else
xvmuldp vs0, vs32, alpha_r
xvmuldp vs1, vs33, alpha_r
xvmuldp vs2, vs34, alpha_r
xvmuldp vs3, vs35, alpha_r
xvmuldp vs4, vs36, alpha_r
xvmuldp vs5, vs37, alpha_r
xvmuldp vs6, vs38, alpha_r
xvmuldp vs7, vs39, alpha_r
#endif
stxvd2x vs0, 0, T1
stxvd2x vs1, o16, T1
stxvd2x vs2, o32, T1
stxvd2x vs3, o48, T1
stxvd2x vs4, 0, T2
stxvd2x vs5, o16, T2
stxvd2x vs6, o32, T2
stxvd2x vs7, o48, T2
add T1, T1, LDC
add T2, T2, LDC
#ifndef TRMMKERNEL
lxvd2x vs8, 0, T1
lxvd2x vs9, o16, T1
lxvd2x vs10, o32, T1
lxvd2x vs11, o48, T1
lxvd2x vs12, 0, T2
lxvd2x vs13, o16, T2
lxvd2x vs14, o32, T2
lxvd2x vs15, o48, T2
#endif
#ifndef TRMMKERNEL
xvmaddadp vs8, vs40, alpha_r
xvmaddadp vs9, vs41, alpha_r
xvmaddadp vs10, vs42, alpha_r
xvmaddadp vs11, vs43, alpha_r
xvmaddadp vs12, vs44, alpha_r
xvmaddadp vs13, vs45, alpha_r
xvmaddadp vs14, vs46, alpha_r
xvmaddadp vs15, vs47, alpha_r
#else
xvmuldp vs8, vs40, alpha_r
xvmuldp vs9, vs41, alpha_r
xvmuldp vs10, vs42, alpha_r
xvmuldp vs11, vs43, alpha_r
xvmuldp vs12, vs44, alpha_r
xvmuldp vs13, vs45, alpha_r
xvmuldp vs14, vs46, alpha_r
xvmuldp vs15, vs47, alpha_r
#endif
stxvd2x vs8, 0, T1
stxvd2x vs9, o16, T1
stxvd2x vs10, o32, T1
stxvd2x vs11, o48, T1
stxvd2x vs12, 0, T2
stxvd2x vs13, o16, T2
stxvd2x vs14, o32, T2
stxvd2x vs15, o48, T2
addi CO, CO, 128
.endm
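/* Reference sketch (illustrative only, not part of the kernel): per K
   iteration the N=2, M=16 macros above accumulate

       for (j = 0; j < 2; j++)
           for (i = 0; i < 16; i++)
               acc[j][i] += a[i] * b[j];

   and SAVE2x16 then writes C[j*ldc + i] = alpha * acc[j][i], adding the
   previous C value unless TRMMKERNEL is defined. */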
/*********************************************************************
* Macros for N=2, M=8 *
*********************************************************************/
.macro LOAD2x8_1
lxvd2x vs0, 0, AO
lxvd2x vs1, o16, AO
lxvd2x vs2, o32, AO
lxvd2x vs3, o48, AO
lxvdsx vs24, 0, BO
lxvdsx vs25, o8, BO
addi AO, AO, 64
addi BO, BO, 16
.endm
.macro KERNEL2x8_I1
lxvd2x vs8, 0, AO
lxvd2x vs9, o16, AO
lxvd2x vs10, o32, AO
lxvd2x vs11, o48, AO
lxvdsx vs28, 0, BO
lxvdsx vs29, o8, BO
addi AO, AO, 64
addi BO, BO, 16
xvmuldp vs32, vs0, vs24
xvmuldp vs33, vs1, vs24
xvmuldp vs34, vs2, vs24
xvmuldp vs35, vs3, vs24
xvmuldp vs40, vs0, vs25
xvmuldp vs41, vs1, vs25
xvmuldp vs42, vs2, vs25
xvmuldp vs43, vs3, vs25
.endm
.macro KERNEL2x8_1
lxvd2x vs8, 0, AO
lxvd2x vs9, o16, AO
lxvd2x vs10, o32, AO
lxvd2x vs11, o48, AO
lxvdsx vs28, 0, BO
lxvdsx vs29, o8, BO
addi AO, AO, 64
addi BO, BO, 16
xvmaddadp vs32, vs0, vs24
xvmaddadp vs33, vs1, vs24
xvmaddadp vs34, vs2, vs24
xvmaddadp vs35, vs3, vs24
xvmaddadp vs40, vs0, vs25
xvmaddadp vs41, vs1, vs25
xvmaddadp vs42, vs2, vs25
xvmaddadp vs43, vs3, vs25
.endm
.macro KERNEL2x8_2
lxvd2x vs0, 0, AO
lxvd2x vs1, o16, AO
lxvd2x vs2, o32, AO
lxvd2x vs3, o48, AO
lxvdsx vs24, 0, BO
lxvdsx vs25, o8, BO
addi AO, AO, 64
addi BO, BO, 16
xvmaddadp vs32, vs8, vs28
xvmaddadp vs33, vs9, vs28
xvmaddadp vs34, vs10, vs28
xvmaddadp vs35, vs11, vs28
xvmaddadp vs40, vs8, vs29
xvmaddadp vs41, vs9, vs29
xvmaddadp vs42, vs10, vs29
xvmaddadp vs43, vs11, vs29
.endm
.macro KERNEL2x8_E2
xvmaddadp vs32, vs8, vs28
xvmaddadp vs33, vs9, vs28
xvmaddadp vs34, vs10, vs28
xvmaddadp vs35, vs11, vs28
xvmaddadp vs40, vs8, vs29
xvmaddadp vs41, vs9, vs29
xvmaddadp vs42, vs10, vs29
xvmaddadp vs43, vs11, vs29
.endm
.macro KERNEL2x8_SUBI1
lxvd2x vs0, 0, AO
lxvd2x vs1, o16, AO
lxvd2x vs2, o32, AO
lxvd2x vs3, o48, AO
lxvdsx vs24, 0, BO
lxvdsx vs25, o8, BO
addi AO, AO, 64
addi BO, BO, 16
xvmuldp vs32, vs0, vs24
xvmuldp vs33, vs1, vs24
xvmuldp vs34, vs2, vs24
xvmuldp vs35, vs3, vs24
xvmuldp vs40, vs0, vs25
xvmuldp vs41, vs1, vs25
xvmuldp vs42, vs2, vs25
xvmuldp vs43, vs3, vs25
.endm
.macro KERNEL2x8_SUB1
lxvd2x vs0, 0, AO
lxvd2x vs1, o16, AO
lxvd2x vs2, o32, AO
lxvd2x vs3, o48, AO
lxvdsx vs24, 0, BO
lxvdsx vs25, o8, BO
addi AO, AO, 64
addi BO, BO, 16
xvmaddadp vs32, vs0, vs24
xvmaddadp vs33, vs1, vs24
xvmaddadp vs34, vs2, vs24
xvmaddadp vs35, vs3, vs24
xvmaddadp vs40, vs0, vs25
xvmaddadp vs41, vs1, vs25
xvmaddadp vs42, vs2, vs25
xvmaddadp vs43, vs3, vs25
.endm
.macro SAVE2x8
mr T1, CO
#ifndef TRMMKERNEL
lxvd2x vs0, 0, T1
lxvd2x vs1, o16, T1
lxvd2x vs2, o32, T1
lxvd2x vs3, o48, T1
#endif
#ifndef TRMMKERNEL
xvmaddadp vs0, vs32, alpha_r
xvmaddadp vs1, vs33, alpha_r
xvmaddadp vs2, vs34, alpha_r
xvmaddadp vs3, vs35, alpha_r
#else
xvmuldp vs0, vs32, alpha_r
xvmuldp vs1, vs33, alpha_r
xvmuldp vs2, vs34, alpha_r
xvmuldp vs3, vs35, alpha_r
#endif
stxvd2x vs0, 0, T1
stxvd2x vs1, o16, T1
stxvd2x vs2, o32, T1
stxvd2x vs3, o48, T1
add T1, T1, LDC
#ifndef TRMMKERNEL
lxvd2x vs8, 0, T1
lxvd2x vs9, o16, T1
lxvd2x vs10, o32, T1
lxvd2x vs11, o48, T1
#endif
#ifndef TRMMKERNEL
xvmaddadp vs8, vs40, alpha_r
xvmaddadp vs9, vs41, alpha_r
xvmaddadp vs10, vs42, alpha_r
xvmaddadp vs11, vs43, alpha_r
#else
xvmuldp vs8, vs40, alpha_r
xvmuldp vs9, vs41, alpha_r
xvmuldp vs10, vs42, alpha_r
xvmuldp vs11, vs43, alpha_r
#endif
stxvd2x vs8, 0, T1
stxvd2x vs9, o16, T1
stxvd2x vs10, o32, T1
stxvd2x vs11, o48, T1
addi CO, CO, 64
.endm
/*********************************************************************
* Macros for N=2, M=4 *
*********************************************************************/
.macro LOAD2x4_1
lxvd2x vs0, 0, AO
lxvd2x vs1, o16, AO
lxvdsx vs24, 0, BO
lxvdsx vs25, o8, BO
addi AO, AO, 32
addi BO, BO, 16
.endm
.macro KERNEL2x4_I1
lxvd2x vs8, 0, AO
lxvd2x vs9, o16, AO
lxvdsx vs28, 0, BO
lxvdsx vs29, o8, BO
addi AO, AO, 32
addi BO, BO, 16
xvmuldp vs32, vs0, vs24
xvmuldp vs33, vs1, vs24
xvmuldp vs40, vs0, vs25
xvmuldp vs41, vs1, vs25
.endm
.macro KERNEL2x4_1
lxvd2x vs8, 0, AO
lxvd2x vs9, o16, AO
lxvdsx vs28, 0, BO
lxvdsx vs29, o8, BO
addi AO, AO, 32
addi BO, BO, 16
xvmaddadp vs32, vs0, vs24
xvmaddadp vs33, vs1, vs24
xvmaddadp vs40, vs0, vs25
xvmaddadp vs41, vs1, vs25
.endm
.macro KERNEL2x4_2
lxvd2x vs0, 0, AO
lxvd2x vs1, o16, AO
lxvdsx vs24, 0, BO
lxvdsx vs25, o8, BO
addi AO, AO, 32
addi BO, BO, 16
xvmaddadp vs32, vs8, vs28
xvmaddadp vs33, vs9, vs28
xvmaddadp vs40, vs8, vs29
xvmaddadp vs41, vs9, vs29
.endm
.macro KERNEL2x4_E2
xvmaddadp vs32, vs8, vs28
xvmaddadp vs33, vs9, vs28
xvmaddadp vs40, vs8, vs29
xvmaddadp vs41, vs9, vs29
.endm
.macro KERNEL2x4_SUBI1
lxvd2x vs0, 0, AO
lxvd2x vs1, o16, AO
lxvdsx vs24, 0, BO
lxvdsx vs25, o8, BO
addi AO, AO, 32
addi BO, BO, 16
xvmuldp vs32, vs0, vs24
xvmuldp vs33, vs1, vs24
xvmuldp vs40, vs0, vs25
xvmuldp vs41, vs1, vs25
.endm
.macro KERNEL2x4_SUB1
lxvd2x vs0, 0, AO
lxvd2x vs1, o16, AO
lxvdsx vs24, 0, BO
lxvdsx vs25, o8, BO
addi AO, AO, 32
addi BO, BO, 16
xvmaddadp vs32, vs0, vs24
xvmaddadp vs33, vs1, vs24
xvmaddadp vs40, vs0, vs25
xvmaddadp vs41, vs1, vs25
.endm
.macro SAVE2x4
mr T1, CO
#ifndef TRMMKERNEL
lxvd2x vs0, 0, T1
lxvd2x vs1, o16, T1
#endif
#ifndef TRMMKERNEL
xvmaddadp vs0, vs32, alpha_r
xvmaddadp vs1, vs33, alpha_r
#else
xvmuldp vs0, vs32, alpha_r
xvmuldp vs1, vs33, alpha_r
#endif
stxvd2x vs0, 0, T1
stxvd2x vs1, o16, T1
add T1, T1, LDC
#ifndef TRMMKERNEL
lxvd2x vs8, 0, T1
lxvd2x vs9, o16, T1
#endif
#ifndef TRMMKERNEL
xvmaddadp vs8, vs40, alpha_r
xvmaddadp vs9, vs41, alpha_r
#else
xvmuldp vs8, vs40, alpha_r
xvmuldp vs9, vs41, alpha_r
#endif
stxvd2x vs8, 0, T1
stxvd2x vs9, o16, T1
addi CO, CO, 32
.endm
/*********************************************************************
* Macros for N=2, M=2 *
*********************************************************************/
.macro LOAD2x2_1
lxvd2x vs0, 0, AO
lxvdsx vs24, 0, BO
lxvdsx vs25, o8, BO
addi AO, AO, 16
addi BO, BO, 16
.endm
.macro KERNEL2x2_I1
lxvd2x vs8, 0, AO
lxvdsx vs28, 0, BO
lxvdsx vs29, o8, BO
addi AO, AO, 16
addi BO, BO, 16
xvmuldp vs32, vs0, vs24
xvmuldp vs40, vs0, vs25
.endm
.macro KERNEL2x2_1
lxvd2x vs8, 0, AO
lxvdsx vs28, 0, BO
lxvdsx vs29, o8, BO
addi AO, AO, 16
addi BO, BO, 16
xvmaddadp vs32, vs0, vs24
xvmaddadp vs40, vs0, vs25
.endm
.macro KERNEL2x2_2
lxvd2x vs0, 0, AO
lxvdsx vs24, 0, BO
lxvdsx vs25, o8, BO
addi AO, AO, 16
addi BO, BO, 16
xvmaddadp vs32, vs8, vs28
xvmaddadp vs40, vs8, vs29
.endm
.macro KERNEL2x2_E2
xvmaddadp vs32, vs8, vs28
xvmaddadp vs40, vs8, vs29
.endm
.macro KERNEL2x2_SUBI1
lxvd2x vs0, 0, AO
lxvdsx vs24, 0, BO
lxvdsx vs25, o8, BO
addi AO, AO, 16
addi BO, BO, 16
xvmuldp vs32, vs0, vs24
xvmuldp vs40, vs0, vs25
.endm
.macro KERNEL2x2_SUB1
lxvd2x vs0, 0, AO
lxvdsx vs24, 0, BO
lxvdsx vs25, o8, BO
addi AO, AO, 16
addi BO, BO, 16
xvmaddadp vs32, vs0, vs24
xvmaddadp vs40, vs0, vs25
.endm
.macro SAVE2x2
mr T1, CO
#ifndef TRMMKERNEL
lxvd2x vs0, 0, T1
#endif
#ifndef TRMMKERNEL
xvmaddadp vs0, vs32, alpha_r
#else
xvmuldp vs0, vs32, alpha_r
#endif
stxvd2x vs0, 0, T1
add T1, T1, LDC
#ifndef TRMMKERNEL
lxvd2x vs8, 0, T1
#endif
#ifndef TRMMKERNEL
xvmaddadp vs8, vs40, alpha_r
#else
xvmuldp vs8, vs40, alpha_r
#endif
stxvd2x vs8, 0, T1
addi CO, CO, 16
.endm
/*********************************************************************
* Macros for N=2, M=1 *
*********************************************************************/
.macro LOAD2x1_1
lxsdx vs0, 0, AO
lxsdx vs24, 0, BO
lxsdx vs25, o8, BO
addi AO, AO, 8
addi BO, BO, 16
.endm
.macro KERNEL2x1_I1
lxsdx vs8, 0, AO
lxsdx vs28, 0, BO
lxsdx vs29, o8, BO
addi AO, AO, 8
addi BO, BO, 16
xsmuldp vs32, vs0, vs24
xsmuldp vs40, vs0, vs25
.endm
.macro KERNEL2x1_1
lxsdx vs8, 0, AO
lxsdx vs28, 0, BO
lxsdx vs29, o8, BO
addi AO, AO, 8
addi BO, BO, 16
xsmaddadp vs32, vs0, vs24
xsmaddadp vs40, vs0, vs25
.endm
.macro KERNEL2x1_2
lxsdx vs0, 0, AO
lxsdx vs24, 0, BO
lxsdx vs25, o8, BO
addi AO, AO, 8
addi BO, BO, 16
xsmaddadp vs32, vs8, vs28
xsmaddadp vs40, vs8, vs29
.endm
.macro KERNEL2x1_E2
xsmaddadp vs32, vs8, vs28
xsmaddadp vs40, vs8, vs29
.endm
.macro KERNEL2x1_SUBI1
lxsdx vs0, 0, AO
lxsdx vs24, 0, BO
lxsdx vs25, o8, BO
addi AO, AO, 8
addi BO, BO, 16
xsmuldp vs32, vs0, vs24
xsmuldp vs40, vs0, vs25
.endm
.macro KERNEL2x1_SUB1
lxsdx vs0, 0, AO
lxsdx vs24, 0, BO
lxsdx vs25, o8, BO
addi AO, AO, 8
addi BO, BO, 16
xsmaddadp vs32, vs0, vs24
xsmaddadp vs40, vs0, vs25
.endm
.macro SAVE2x1
mr T1, CO
#ifndef TRMMKERNEL
lxsdx vs0, 0, T1
#endif
#ifndef TRMMKERNEL
xsmaddadp vs0, vs32, alpha_r
#else
xsmuldp vs0, vs32, alpha_r
#endif
stxsdx vs0, 0, T1
add T1, T1, LDC
#ifndef TRMMKERNEL
lxsdx vs8, 0, T1
#endif
#ifndef TRMMKERNEL
xsmaddadp vs8, vs40, alpha_r
#else
xsmuldp vs8, vs40, alpha_r
#endif
stxsdx vs8, 0, T1
addi CO, CO, 8
.endm
/*********************************************************************
* Macros for N=1, M=16 *
*********************************************************************/
.macro LOAD1x16_1
lxvd2x vs0, 0, AO
lxvd2x vs1, o16, AO
lxvd2x vs2, o32, AO
lxvd2x vs3, o48, AO
lxvdsx vs24, 0, BO
addi AO, AO, 64
addi BO, BO, 8
lxvd2x vs4, 0, AO
lxvd2x vs5, o16, AO
lxvd2x vs6, o32, AO
lxvd2x vs7, o48, AO
addi AO, AO, 64
.endm
.macro KERNEL1x16_I1
lxvd2x vs8, 0, AO
lxvd2x vs9, o16, AO
lxvd2x vs10, o32, AO
lxvd2x vs11, o48, AO
lxvdsx vs28, 0, BO
addi AO, AO, 64
addi BO, BO, 8
lxvd2x vs12, 0, AO
lxvd2x vs13, o16, AO
lxvd2x vs14, o32, AO
lxvd2x vs15, o48, AO
addi AO, AO, 64
xvmuldp vs32, vs0, vs24
xvmuldp vs33, vs1, vs24
xvmuldp vs34, vs2, vs24
xvmuldp vs35, vs3, vs24
xvmuldp vs36, vs4, vs24
xvmuldp vs37, vs5, vs24
xvmuldp vs38, vs6, vs24
xvmuldp vs39, vs7, vs24
.endm
.macro KERNEL1x16_1
lxvd2x vs8, 0, AO
lxvd2x vs9, o16, AO
lxvd2x vs10, o32, AO
lxvd2x vs11, o48, AO
lxvdsx vs28, 0, BO
addi AO, AO, 64
addi BO, BO, 8
lxvd2x vs12, 0, AO
lxvd2x vs13, o16, AO
lxvd2x vs14, o32, AO
lxvd2x vs15, o48, AO
addi AO, AO, 64
xvmaddadp vs32, vs0, vs24
xvmaddadp vs33, vs1, vs24
xvmaddadp vs34, vs2, vs24
xvmaddadp vs35, vs3, vs24
xvmaddadp vs36, vs4, vs24
xvmaddadp vs37, vs5, vs24
xvmaddadp vs38, vs6, vs24
xvmaddadp vs39, vs7, vs24
.endm
.macro KERNEL1x16_2
lxvd2x vs0, 0, AO
lxvd2x vs1, o16, AO
lxvd2x vs2, o32, AO
lxvd2x vs3, o48, AO
lxvdsx vs24, 0, BO
addi AO, AO, 64
addi BO, BO, 8
lxvd2x vs4, 0, AO
lxvd2x vs5, o16, AO
lxvd2x vs6, o32, AO
lxvd2x vs7, o48, AO
addi AO, AO, 64
xvmaddadp vs32, vs8, vs28
xvmaddadp vs33, vs9, vs28
xvmaddadp vs34, vs10, vs28
xvmaddadp vs35, vs11, vs28
xvmaddadp vs36, vs12, vs28
xvmaddadp vs37, vs13, vs28
xvmaddadp vs38, vs14, vs28
xvmaddadp vs39, vs15, vs28
.endm
.macro KERNEL1x16_E2
xvmaddadp vs32, vs8, vs28
xvmaddadp vs33, vs9, vs28
xvmaddadp vs34, vs10, vs28
xvmaddadp vs35, vs11, vs28
xvmaddadp vs36, vs12, vs28
xvmaddadp vs37, vs13, vs28
xvmaddadp vs38, vs14, vs28
xvmaddadp vs39, vs15, vs28
.endm
.macro KERNEL1x16_SUBI1
lxvd2x vs0, 0, AO
lxvd2x vs1, o16, AO
lxvd2x vs2, o32, AO
lxvd2x vs3, o48, AO
lxvdsx vs24, 0, BO
addi AO, AO, 64
addi BO, BO, 8
lxvd2x vs4, 0, AO
lxvd2x vs5, o16, AO
lxvd2x vs6, o32, AO
lxvd2x vs7, o48, AO
addi AO, AO, 64
xvmuldp vs32, vs0, vs24
xvmuldp vs33, vs1, vs24
xvmuldp vs34, vs2, vs24
xvmuldp vs35, vs3, vs24
xvmuldp vs36, vs4, vs24
xvmuldp vs37, vs5, vs24
xvmuldp vs38, vs6, vs24
xvmuldp vs39, vs7, vs24
.endm
.macro KERNEL1x16_SUB1
lxvd2x vs0, 0, AO
lxvd2x vs1, o16, AO
lxvd2x vs2, o32, AO
lxvd2x vs3, o48, AO
lxvdsx vs24, 0, BO
addi AO, AO, 64
addi BO, BO, 8
lxvd2x vs4, 0, AO
lxvd2x vs5, o16, AO
lxvd2x vs6, o32, AO
lxvd2x vs7, o48, AO
addi AO, AO, 64
xvmaddadp vs32, vs0, vs24
xvmaddadp vs33, vs1, vs24
xvmaddadp vs34, vs2, vs24
xvmaddadp vs35, vs3, vs24
xvmaddadp vs36, vs4, vs24
xvmaddadp vs37, vs5, vs24
xvmaddadp vs38, vs6, vs24
xvmaddadp vs39, vs7, vs24
.endm
.macro SAVE1x16
mr T1, CO
addi T2, T1, 64
#ifndef TRMMKERNEL
lxvd2x vs0, 0, T1
lxvd2x vs1, o16, T1
lxvd2x vs2, o32, T1
lxvd2x vs3, o48, T1
lxvd2x vs4, 0, T2
lxvd2x vs5, o16, T2
lxvd2x vs6, o32, T2
lxvd2x vs7, o48, T2
#endif
#ifndef TRMMKERNEL
xvmaddadp vs0, vs32, alpha_r
xvmaddadp vs1, vs33, alpha_r
xvmaddadp vs2, vs34, alpha_r
xvmaddadp vs3, vs35, alpha_r
xvmaddadp vs4, vs36, alpha_r
xvmaddadp vs5, vs37, alpha_r
xvmaddadp vs6, vs38, alpha_r
xvmaddadp vs7, vs39, alpha_r
#else
xvmuldp vs0, vs32, alpha_r
xvmuldp vs1, vs33, alpha_r
xvmuldp vs2, vs34, alpha_r
xvmuldp vs3, vs35, alpha_r
xvmuldp vs4, vs36, alpha_r
xvmuldp vs5, vs37, alpha_r
xvmuldp vs6, vs38, alpha_r
xvmuldp vs7, vs39, alpha_r
#endif
stxvd2x vs0, 0, T1
stxvd2x vs1, o16, T1
stxvd2x vs2, o32, T1
stxvd2x vs3, o48, T1
stxvd2x vs4, 0, T2
stxvd2x vs5, o16, T2
stxvd2x vs6, o32, T2
stxvd2x vs7, o48, T2
addi CO, CO, 128
.endm
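/* Reference sketch (illustrative only): the N=1 families, such as the M=16
   macros above, follow the same pattern with a single splatted B value,

       for (i = 0; i < M; i++)
           acc[i] += a[i] * b[0];

   and the SAVE1xM macros apply alpha in the same way as SAVE2x16. */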
/*********************************************************************
* Macros for N=1, M=8 *
*********************************************************************/
.macro LOAD1x8_1
lxvd2x vs0, 0, AO
lxvd2x vs1, o16, AO
lxvd2x vs2, o32, AO
lxvd2x vs3, o48, AO
lxvdsx vs24, 0, BO
addi AO, AO, 64
addi BO, BO, 8
.endm
.macro KERNEL1x8_I1
lxvd2x vs8, 0, AO
lxvd2x vs9, o16, AO
lxvd2x vs10, o32, AO
lxvd2x vs11, o48, AO
lxvdsx vs28, 0, BO
addi AO, AO, 64
addi BO, BO, 8
xvmuldp vs32, vs0, vs24
xvmuldp vs33, vs1, vs24
xvmuldp vs34, vs2, vs24
xvmuldp vs35, vs3, vs24
.endm
.macro KERNEL1x8_1
lxvd2x vs8, 0, AO
lxvd2x vs9, o16, AO
lxvd2x vs10, o32, AO
lxvd2x vs11, o48, AO
lxvdsx vs28, 0, BO
addi AO, AO, 64
addi BO, BO, 8
xvmaddadp vs32, vs0, vs24
xvmaddadp vs33, vs1, vs24
xvmaddadp vs34, vs2, vs24
xvmaddadp vs35, vs3, vs24
.endm
.macro KERNEL1x8_2
lxvd2x vs0, 0, AO
lxvd2x vs1, o16, AO
lxvd2x vs2, o32, AO
lxvd2x vs3, o48, AO
lxvdsx vs24, 0, BO
addi AO, AO, 64
addi BO, BO, 8
xvmaddadp vs32, vs8, vs28
xvmaddadp vs33, vs9, vs28
xvmaddadp vs34, vs10, vs28
xvmaddadp vs35, vs11, vs28
.endm
.macro KERNEL1x8_E2
xvmaddadp vs32, vs8, vs28
xvmaddadp vs33, vs9, vs28
xvmaddadp vs34, vs10, vs28
xvmaddadp vs35, vs11, vs28
.endm
.macro KERNEL1x8_SUBI1
lxvd2x vs0, 0, AO
lxvd2x vs1, o16, AO
lxvd2x vs2, o32, AO
lxvd2x vs3, o48, AO
lxvdsx vs24, 0, BO
addi AO, AO, 64
addi BO, BO, 8
xvmuldp vs32, vs0, vs24
xvmuldp vs33, vs1, vs24
xvmuldp vs34, vs2, vs24
xvmuldp vs35, vs3, vs24
.endm
.macro KERNEL1x8_SUB1
lxvd2x vs0, 0, AO
lxvd2x vs1, o16, AO
lxvd2x vs2, o32, AO
lxvd2x vs3, o48, AO
lxvdsx vs24, 0, BO
addi AO, AO, 64
addi BO, BO, 8
xvmaddadp vs32, vs0, vs24
xvmaddadp vs33, vs1, vs24
xvmaddadp vs34, vs2, vs24
xvmaddadp vs35, vs3, vs24
.endm
.macro SAVE1x8
mr T1, CO
#ifndef TRMMKERNEL
lxvd2x vs0, 0, T1
lxvd2x vs1, o16, T1
lxvd2x vs2, o32, T1
lxvd2x vs3, o48, T1
#endif
#ifndef TRMMKERNEL
xvmaddadp vs0, vs32, alpha_r
xvmaddadp vs1, vs33, alpha_r
xvmaddadp vs2, vs34, alpha_r
xvmaddadp vs3, vs35, alpha_r
#else
xvmuldp vs0, vs32, alpha_r
xvmuldp vs1, vs33, alpha_r
xvmuldp vs2, vs34, alpha_r
xvmuldp vs3, vs35, alpha_r
#endif
stxvd2x vs0, 0, T1
stxvd2x vs1, o16, T1
stxvd2x vs2, o32, T1
stxvd2x vs3, o48, T1
addi CO, CO, 64
.endm
/*********************************************************************
* Macros for N=1, M=4 *
*********************************************************************/
.macro LOAD1x4_1
lxvd2x vs0, 0, AO
lxvd2x vs1, o16, AO
lxvdsx vs24, 0, BO
addi AO, AO, 32
addi BO, BO, 8
.endm
.macro KERNEL1x4_I1
lxvd2x vs8, 0, AO
lxvd2x vs9, o16, AO
lxvdsx vs28, 0, BO
addi AO, AO, 32
addi BO, BO, 8
xvmuldp vs32, vs0, vs24
xvmuldp vs33, vs1, vs24
.endm
.macro KERNEL1x4_1
lxvd2x vs8, 0, AO
lxvd2x vs9, o16, AO
lxvdsx vs28, 0, BO
addi AO, AO, 32
addi BO, BO, 8
xvmaddadp vs32, vs0, vs24
xvmaddadp vs33, vs1, vs24
.endm
.macro KERNEL1x4_2
lxvd2x vs0, 0, AO
lxvd2x vs1, o16, AO
lxvdsx vs24, 0, BO
addi AO, AO, 32
addi BO, BO, 8
xvmaddadp vs32, vs8, vs28
xvmaddadp vs33, vs9, vs28
.endm
.macro KERNEL1x4_E2
xvmaddadp vs32, vs8, vs28
xvmaddadp vs33, vs9, vs28
.endm
.macro KERNEL1x4_SUBI1
lxvd2x vs0, 0, AO
lxvd2x vs1, o16, AO
lxvdsx vs24, 0, BO
addi AO, AO, 32
addi BO, BO, 8
xvmuldp vs32, vs0, vs24
xvmuldp vs33, vs1, vs24
.endm
.macro KERNEL1x4_SUB1
lxvd2x vs0, 0, AO
lxvd2x vs1, o16, AO
lxvdsx vs24, 0, BO
addi AO, AO, 32
addi BO, BO, 8
xvmaddadp vs32, vs0, vs24
xvmaddadp vs33, vs1, vs24
.endm
.macro SAVE1x4
mr T1, CO
#ifndef TRMMKERNEL
lxvd2x vs0, 0, T1
lxvd2x vs1, o16, T1
#endif
#ifndef TRMMKERNEL
xvmaddadp vs0, vs32, alpha_r
xvmaddadp vs1, vs33, alpha_r
#else
xvmuldp vs0, vs32, alpha_r
xvmuldp vs1, vs33, alpha_r
#endif
stxvd2x vs0, 0, T1
stxvd2x vs1, o16, T1
addi CO, CO, 32
.endm
/*********************************************************************
* Macros for N=1, M=2 *
*********************************************************************/
.macro LOAD1x2_1
lxvd2x vs0, 0, AO
lxvdsx vs24, 0, BO
addi AO, AO, 16
addi BO, BO, 8
.endm
.macro KERNEL1x2_I1
lxvd2x vs8, 0, AO
lxvdsx vs28, 0, BO
addi AO, AO, 16
addi BO, BO, 8
xvmuldp vs32, vs0, vs24
.endm
.macro KERNEL1x2_1
lxvd2x vs8, 0, AO
lxvdsx vs28, 0, BO
addi AO, AO, 16
addi BO, BO, 8
xvmaddadp vs32, vs0, vs24
.endm
.macro KERNEL1x2_2
lxvd2x vs0, 0, AO
lxvdsx vs24, 0, BO
addi AO, AO, 16
addi BO, BO, 8
xvmaddadp vs32, vs8, vs28
.endm
.macro KERNEL1x2_E2
xvmaddadp vs32, vs8, vs28
.endm
.macro KERNEL1x2_SUBI1
lxvd2x vs0, 0, AO
lxvdsx vs24, 0, BO
addi AO, AO, 16
addi BO, BO, 8
xvmuldp vs32, vs0, vs24
.endm
.macro KERNEL1x2_SUB1
lxvd2x vs0, 0, AO
lxvdsx vs24, 0, BO
addi AO, AO, 16
addi BO, BO, 8
xvmaddadp vs32, vs0, vs24
.endm
.macro SAVE1x2
mr T1, CO
#ifndef TRMMKERNEL
lxvd2x vs0, 0, T1
#endif
#ifndef TRMMKERNEL
xvmaddadp vs0, vs32, alpha_r
#else
xvmuldp vs0, vs32, alpha_r
#endif
stxvd2x vs0, 0, T1
addi CO, CO, 16
.endm
/*********************************************************************
* Macros for N=1, M=1 *
*********************************************************************/
.macro LOAD1x1_1
lxsdx vs0, 0, AO
lxsdx vs24, 0, BO
addi AO, AO, 8
addi BO, BO, 8
.endm
.macro KERNEL1x1_I1
lxsdx vs8, 0, AO
lxsdx vs28, 0, BO
addi AO, AO, 8
addi BO, BO, 8
xsmuldp vs32, vs0, vs24
.endm
.macro KERNEL1x1_1
lxsdx vs8, 0, AO
lxsdx vs28, 0, BO
addi AO, AO, 8
addi BO, BO, 8
xsmaddadp vs32, vs0, vs24
.endm
.macro KERNEL1x1_2
lxsdx vs0, 0, AO
lxsdx vs24, 0, BO
addi AO, AO, 8
addi BO, BO, 8
xsmaddadp vs32, vs8, vs28
.endm
.macro KERNEL1x1_E2
xsmaddadp vs32, vs8, vs28
.endm
.macro KERNEL1x1_SUBI1
lxsdx vs0, 0, AO
lxsdx vs24, 0, BO
addi AO, AO, 8
addi BO, BO, 8
xsmuldp vs32, vs0, vs24
.endm
.macro KERNEL1x1_SUB1
lxsdx vs0, 0, AO
lxsdx vs24, 0, BO
addi AO, AO, 8
addi BO, BO, 8
xsmaddadp vs32, vs0, vs24
.endm
.macro SAVE1x1
mr T1, CO
#ifndef TRMMKERNEL
lxsdx vs0, 0, T1
#endif
#ifndef TRMMKERNEL
xsmaddadp vs0, vs32, alpha_r
#else
xsmuldp vs0, vs32, alpha_r
#endif
stxsdx vs0, 0, T1
addi CO, CO, 8
.endm
/****************************TRMM POINTER REFRESH MACROS*************************/
.macro SHIFT_REG REG1,REG2,SHIFT_VAL
.if \SHIFT_VAL==16
slwi \REG1, \REG2, 7
.elseif \SHIFT_VAL==8
slwi \REG1, \REG2, 6
.elseif \SHIFT_VAL==4
slwi \REG1, \REG2, 5
.elseif \SHIFT_VAL==2
slwi \REG1, \REG2, 4
.elseif \SHIFT_VAL==1
slwi \REG1, \REG2, 3
.endif
.endm
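/* SHIFT_REG converts an element count into a byte offset for double
   precision data: REG1 = REG2 * SHIFT_VAL * sizeof(double). For example,
   SHIFT_VAL==16 uses a shift of 7 because 16 * 8 = 128 = 1 << 7, and
   SHIFT_VAL==1 uses a shift of 3 because 1 * 8 = 1 << 3. A rough C
   equivalent (illustrative only): bytes = count << (3 + log2(SHIFT_VAL)). */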
/*
//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
// ptrbb = bb;
// #else
// ptrba += off*16;
// ptrbb = bb + off*2;
// #endif
*/
.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
/* ptrbb = bb; */
mr \PTR_B,\B_VAL /* refresh BPOINT */
#else
/*
// ptrba = ptrba + off*C_A;
// ptrbb = bb + off*C_B;
*/
SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */
SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */
add \PTR_B, \B_VAL , T4 /* Add values to BO */
add \PTR_A, \PTR_A, T2 /* Add values to AO */
#endif
.endm
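/* Illustrative use (register names are placeholders, not the actual call
   site in the kernel that includes this file): for a 16x2 tile,

       REFRESH_POINTERS AO, BO, OFF, B, 16, 2

   either resets BO to B, or advances AO by off*16 doubles and sets
   BO = B + off*2 doubles, matching the commented C logic above. */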
/*
// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
// temp = bk-off;
// #elif defined(LEFT)
// temp = off+16; // number of values in A
// #else
// temp = off+2; // number of values in B
// #endif
*/
.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
/* temp = bk-off; */
sub \TEMP_BK,\BK_VAL,\OFF_VAL
#elif defined(LEFT)
/* temp = off+INCR_A; // number of values in A */
addi \TEMP_BK, \OFF_VAL, \INCR_A
#else
/* temp = off+INCR_B; // number of values in B */
addi \TEMP_BK,\OFF_VAL, \INCR_B
#endif
.endm
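/* Illustrative use (register names are placeholders): for a 16x2 tile,

       REFRESH_TEMP_BK TEMP, K, OFF, 16, 2

   leaves the inner-loop trip count in TEMP: bk - off, off + 16 (LEFT) or
   off + 2, per the commented C logic above. */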
/*
// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
// temp = bk - off;
// #ifdef LEFT
// temp -= 16; // number of values in A
// #else
// temp -= 2; // number of values in B
// #endif
// ptrba += temp*16;
// ptrbb += temp*2;
// #endif
// #ifdef LEFT
// off += 16; // number of values in A
// #endif
*/
.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
/* temp = bk - off; */
sub \TEMP_BK,\BK_VAL,\OFF_VAL
#ifdef LEFT
/* temp -= C_A; // number of values in A */
addi \TEMP_BK,\TEMP_BK,-\C_A
#else
/* temp -= C_B; // number of values in B */
addi \TEMP_BK,\TEMP_BK,-\C_B
#endif
/* ptrba += temp*C_A;
ptrbb += temp*C_B; */
SHIFT_REG T4,\TEMP_BK,\C_A
SHIFT_REG T2,\TEMP_BK,\C_B
add \PTR_A, \PTR_A,T4 /* ptrba += temp*C_A */
add \PTR_B, \PTR_B,T2 /* ptrbb += temp*C_B */
#endif
#ifdef LEFT
/* off += C_A; // number of values in A */
addi \OFF_VAL,\OFF_VAL,\C_A
#endif
.endm
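/* Putting the three macros together, a TRMM tile of size C_A x C_B is
   driven roughly as in the following sketch (illustrative only; the actual
   loop lives in the kernel file that includes these macros):

       REFRESH_POINTERS   ptrba, ptrbb, off, bb, C_A, C_B
       REFRESH_TEMP_BK    temp, bk, off, C_A, C_B
       // run temp iterations of the KERNELnxm macros, then SAVEnxm
       REFRESH_AFTER_SAVE temp, bk, off, ptrbb, ptrba, C_A, C_B
*/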