
zgemm_macros_8x2_power8.S

/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/04/22 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define XSFADD_R1 xsadddp
#define XSFADD_R2 xssubdp
#define XSFADD_I1 xsadddp
#define XSFADD_I2 xsadddp
#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
#define XSFADD_R1 xsadddp
#define XSFADD_R2 xsadddp
#define XSFADD_I1 xssubdp
#define XSFADD_I2 xsadddp
#elif defined(NC) || defined(TC) || defined(NR) || defined(TR)
#define XSFADD_R1 xsadddp
#define XSFADD_R2 xsadddp
#define XSFADD_I1 xsadddp
#define XSFADD_I2 xssubdp
#else // CC || CR || RC || RR
#define XSFADD_R1 xsadddp
#define XSFADD_R2 xssubdp
#define XSFADD_I1 xssubdp
#define XSFADD_I2 xssubdp
#endif
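/* Note on the XSFADD_* selection above: in the SAVE macros below, XSFADD_R1
 * accumulates realA*realB and XSFADD_R2 accumulates imagA*imagB into the real
 * result, while XSFADD_I1 accumulates realA*imagB and XSFADD_I2 accumulates
 * imagA*realB into the imaginary result. Choosing xsadddp or xssubdp in the
 * four branches above is what flips the sign of the conjugated partial
 * products for each transpose / conjugation combination. */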
/**********************************************************************************************
* Macros for N=2 and M=8
**********************************************************************************************/
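/* Register usage in the 2x8 macros below: vs0-vs7 and vs8-vs15 hold two
 * alternating sets of eight complex elements of A, vs16-vs19 and vs20-vs23
 * hold the real and imaginary parts of the two B values, and vs32-vs63 are
 * the 32 accumulators. Each even/odd accumulator pair keeps the
 * (real*real, imag*real) and (real*imag, imag*imag) halves of one C element.
 * KERNEL2x8_I1 starts the software pipeline with xvmuldp, KERNEL2x8_1 and
 * KERNEL2x8_2 alternate between the two register sets, KERNEL2x8_E2 drains
 * the pipeline, and KERNEL2x8_SUBI1 / KERNEL2x8_SUB1 are the unpipelined
 * single-iteration variants. */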
#if defined(_AIX)
define(`LOAD2x8_1', `
#else
.macro LOAD2x8_1
#endif
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
lxvd2x vs18, o32, BO // load real part from B
lxvd2x vs19, o48, BO // load imag part from B
addi BO, BO, 64
lxvd2x vs0, o0, AO // load real,imag from A
lxvd2x vs1, o16, AO // load real,imag from A
lxvd2x vs2, o32, AO // load real,imag from A
lxvd2x vs3, o48, AO // load real,imag from A
addi AO, AO, 64
lxvd2x vs4, o0, AO // load real,imag from A
lxvd2x vs5, o16, AO // load real,imag from A
lxvd2x vs6, o32, AO // load real,imag from A
lxvd2x vs7, o48, AO // load real,imag from A
addi AO, AO, 64
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x8_I1', `
#else
.macro KERNEL2x8_I1
#endif
lxvd2x vs8, o0, AO // load real,imag from A
lxvd2x vs9, o16, AO // load real,imag from A
lxvd2x vs10, o32, AO // load real,imag from A
lxvd2x vs11, o48, AO // load real,imag from A
addi AO, AO, 64
lxvd2x vs12, o0, AO // load real,imag from A
lxvd2x vs13, o16, AO // load real,imag from A
lxvd2x vs14, o32, AO // load real,imag from A
lxvd2x vs15, o48, AO // load real,imag from A
addi AO, AO, 64
lxvd2x vs20, o0, BO // load real part from B
lxvd2x vs21, o16, BO // load imag part from B
lxvd2x vs22, o32, BO // load real part from B
lxvd2x vs23, o48, BO // load imag part from B
addi BO, BO, 64
xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
xvmuldp vs34, vs1, vs16 // real*real, imag*real
xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
xvmuldp vs36, vs2, vs16 // real*real, imag*real
xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
xvmuldp vs38, vs3, vs16 // real*real, imag*real
xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
xvmuldp vs40, vs4, vs16 // real*real, imag*real
xvmuldp vs41, vs4, vs17 // real*imag, imag*imag
xvmuldp vs42, vs5, vs16 // real*real, imag*real
xvmuldp vs43, vs5, vs17 // real*imag, imag*imag
xvmuldp vs44, vs6, vs16 // real*real, imag*real
xvmuldp vs45, vs6, vs17 // real*imag, imag*imag
xvmuldp vs46, vs7, vs16 // real*real, imag*real
xvmuldp vs47, vs7, vs17 // real*imag, imag*imag
xvmuldp vs48, vs0, vs18 // real*real, imag*real
xvmuldp vs49, vs0, vs19 // real*imag, imag*imag
xvmuldp vs50, vs1, vs18 // real*real, imag*real
xvmuldp vs51, vs1, vs19 // real*imag, imag*imag
xvmuldp vs52, vs2, vs18 // real*real, imag*real
xvmuldp vs53, vs2, vs19 // real*imag, imag*imag
xvmuldp vs54, vs3, vs18 // real*real, imag*real
xvmuldp vs55, vs3, vs19 // real*imag, imag*imag
xvmuldp vs56, vs4, vs18 // real*real, imag*real
xvmuldp vs57, vs4, vs19 // real*imag, imag*imag
xvmuldp vs58, vs5, vs18 // real*real, imag*real
xvmuldp vs59, vs5, vs19 // real*imag, imag*imag
xvmuldp vs60, vs6, vs18 // real*real, imag*real
xvmuldp vs61, vs6, vs19 // real*imag, imag*imag
xvmuldp vs62, vs7, vs18 // real*real, imag*real
xvmuldp vs63, vs7, vs19 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x8_1', `
#else
.macro KERNEL2x8_1
#endif
lxvd2x vs8, o0, AO // load real,imag from A
lxvd2x vs9, o16, AO // load real,imag from A
lxvd2x vs10, o32, AO // load real,imag from A
lxvd2x vs11, o48, AO // load real,imag from A
addi AO, AO, 64
lxvd2x vs12, o0, AO // load real,imag from A
lxvd2x vs13, o16, AO // load real,imag from A
lxvd2x vs14, o32, AO // load real,imag from A
lxvd2x vs15, o48, AO // load real,imag from A
addi AO, AO, 64
lxvd2x vs20, o0, BO // load real part from B
lxvd2x vs21, o16, BO // load imag part from B
lxvd2x vs22, o32, BO // load real part from B
lxvd2x vs23, o48, BO // load imag part from B
addi BO, BO, 64
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
xvmaddadp vs34, vs1, vs16 // real*real, imag*real
xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
xvmaddadp vs36, vs2, vs16 // real*real, imag*real
xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
xvmaddadp vs38, vs3, vs16 // real*real, imag*real
xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
xvmaddadp vs40, vs4, vs16 // real*real, imag*real
xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag
xvmaddadp vs42, vs5, vs16 // real*real, imag*real
xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag
xvmaddadp vs44, vs6, vs16 // real*real, imag*real
xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag
xvmaddadp vs46, vs7, vs16 // real*real, imag*real
xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag
xvmaddadp vs48, vs0, vs18 // real*real, imag*real
xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag
xvmaddadp vs50, vs1, vs18 // real*real, imag*real
xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag
xvmaddadp vs52, vs2, vs18 // real*real, imag*real
xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag
xvmaddadp vs54, vs3, vs18 // real*real, imag*real
xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag
xvmaddadp vs56, vs4, vs18 // real*real, imag*real
xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag
xvmaddadp vs58, vs5, vs18 // real*real, imag*real
xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag
xvmaddadp vs60, vs6, vs18 // real*real, imag*real
xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag
xvmaddadp vs62, vs7, vs18 // real*real, imag*real
xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x8_2', `
#else
.macro KERNEL2x8_2
#endif
lxvd2x vs0, o0, AO // load real,imag from A
lxvd2x vs1, o16, AO // load real,imag from A
lxvd2x vs2, o32, AO // load real,imag from A
lxvd2x vs3, o48, AO // load real,imag from A
addi AO, AO, 64
lxvd2x vs4, o0, AO // load real,imag from A
lxvd2x vs5, o16, AO // load real,imag from A
lxvd2x vs6, o32, AO // load real,imag from A
lxvd2x vs7, o48, AO // load real,imag from A
addi AO, AO, 64
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
lxvd2x vs18, o32, BO // load real part from B
lxvd2x vs19, o48, BO // load imag part from B
addi BO, BO, 64
xvmaddadp vs32, vs8, vs20 // real*real, imag*real
xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
xvmaddadp vs34, vs9, vs20 // real*real, imag*real
xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
xvmaddadp vs36, vs10, vs20 // real*real, imag*real
xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
xvmaddadp vs38, vs11, vs20 // real*real, imag*real
xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
xvmaddadp vs40, vs12, vs20 // real*real, imag*real
xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag
xvmaddadp vs42, vs13, vs20 // real*real, imag*real
xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag
xvmaddadp vs44, vs14, vs20 // real*real, imag*real
xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag
xvmaddadp vs46, vs15, vs20 // real*real, imag*real
xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag
xvmaddadp vs48, vs8, vs22 // real*real, imag*real
xvmaddadp vs49, vs8, vs23 // real*imag, imag*imag
xvmaddadp vs50, vs9, vs22 // real*real, imag*real
xvmaddadp vs51, vs9, vs23 // real*imag, imag*imag
xvmaddadp vs52, vs10, vs22 // real*real, imag*real
xvmaddadp vs53, vs10, vs23 // real*imag, imag*imag
xvmaddadp vs54, vs11, vs22 // real*real, imag*real
xvmaddadp vs55, vs11, vs23 // real*imag, imag*imag
xvmaddadp vs56, vs12, vs22 // real*real, imag*real
xvmaddadp vs57, vs12, vs23 // real*imag, imag*imag
xvmaddadp vs58, vs13, vs22 // real*real, imag*real
xvmaddadp vs59, vs13, vs23 // real*imag, imag*imag
xvmaddadp vs60, vs14, vs22 // real*real, imag*real
xvmaddadp vs61, vs14, vs23 // real*imag, imag*imag
xvmaddadp vs62, vs15, vs22 // real*real, imag*real
xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x8_E2', `
#else
.macro KERNEL2x8_E2
#endif
xvmaddadp vs32, vs8, vs20 // real*real, imag*real
xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
xvmaddadp vs34, vs9, vs20 // real*real, imag*real
xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
xvmaddadp vs36, vs10, vs20 // real*real, imag*real
xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
xvmaddadp vs38, vs11, vs20 // real*real, imag*real
xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
xvmaddadp vs40, vs12, vs20 // real*real, imag*real
xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag
xvmaddadp vs42, vs13, vs20 // real*real, imag*real
xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag
xvmaddadp vs44, vs14, vs20 // real*real, imag*real
xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag
xvmaddadp vs46, vs15, vs20 // real*real, imag*real
xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag
xvmaddadp vs48, vs8, vs22 // real*real, imag*real
xvmaddadp vs49, vs8, vs23 // real*imag, imag*imag
xvmaddadp vs50, vs9, vs22 // real*real, imag*real
xvmaddadp vs51, vs9, vs23 // real*imag, imag*imag
xvmaddadp vs52, vs10, vs22 // real*real, imag*real
xvmaddadp vs53, vs10, vs23 // real*imag, imag*imag
xvmaddadp vs54, vs11, vs22 // real*real, imag*real
xvmaddadp vs55, vs11, vs23 // real*imag, imag*imag
xvmaddadp vs56, vs12, vs22 // real*real, imag*real
xvmaddadp vs57, vs12, vs23 // real*imag, imag*imag
xvmaddadp vs58, vs13, vs22 // real*real, imag*real
xvmaddadp vs59, vs13, vs23 // real*imag, imag*imag
xvmaddadp vs60, vs14, vs22 // real*real, imag*real
xvmaddadp vs61, vs14, vs23 // real*imag, imag*imag
xvmaddadp vs62, vs15, vs22 // real*real, imag*real
xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x8_SUBI1', `
#else
.macro KERNEL2x8_SUBI1
#endif
lxvd2x vs0, o0, AO // load real,imag from A
lxvd2x vs1, o16, AO // load real,imag from A
lxvd2x vs2, o32, AO // load real,imag from A
lxvd2x vs3, o48, AO // load real,imag from A
addi AO, AO, 64
lxvd2x vs4, o0, AO // load real,imag from A
lxvd2x vs5, o16, AO // load real,imag from A
lxvd2x vs6, o32, AO // load real,imag from A
lxvd2x vs7, o48, AO // load real,imag from A
addi AO, AO, 64
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
lxvd2x vs18, o32, BO // load real part from B
lxvd2x vs19, o48, BO // load imag part from B
addi BO, BO, 64
xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
xvmuldp vs34, vs1, vs16 // real*real, imag*real
xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
xvmuldp vs36, vs2, vs16 // real*real, imag*real
xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
xvmuldp vs38, vs3, vs16 // real*real, imag*real
xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
xvmuldp vs40, vs4, vs16 // real*real, imag*real
xvmuldp vs41, vs4, vs17 // real*imag, imag*imag
xvmuldp vs42, vs5, vs16 // real*real, imag*real
xvmuldp vs43, vs5, vs17 // real*imag, imag*imag
xvmuldp vs44, vs6, vs16 // real*real, imag*real
xvmuldp vs45, vs6, vs17 // real*imag, imag*imag
xvmuldp vs46, vs7, vs16 // real*real, imag*real
xvmuldp vs47, vs7, vs17 // real*imag, imag*imag
xvmuldp vs48, vs0, vs18 // real*real, imag*real
xvmuldp vs49, vs0, vs19 // real*imag, imag*imag
xvmuldp vs50, vs1, vs18 // real*real, imag*real
xvmuldp vs51, vs1, vs19 // real*imag, imag*imag
xvmuldp vs52, vs2, vs18 // real*real, imag*real
xvmuldp vs53, vs2, vs19 // real*imag, imag*imag
xvmuldp vs54, vs3, vs18 // real*real, imag*real
xvmuldp vs55, vs3, vs19 // real*imag, imag*imag
xvmuldp vs56, vs4, vs18 // real*real, imag*real
xvmuldp vs57, vs4, vs19 // real*imag, imag*imag
xvmuldp vs58, vs5, vs18 // real*real, imag*real
xvmuldp vs59, vs5, vs19 // real*imag, imag*imag
xvmuldp vs60, vs6, vs18 // real*real, imag*real
xvmuldp vs61, vs6, vs19 // real*imag, imag*imag
xvmuldp vs62, vs7, vs18 // real*real, imag*real
xvmuldp vs63, vs7, vs19 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x8_SUB1', `
#else
.macro KERNEL2x8_SUB1
#endif
lxvd2x vs0, o0, AO // load real,imag from A
lxvd2x vs1, o16, AO // load real,imag from A
lxvd2x vs2, o32, AO // load real,imag from A
lxvd2x vs3, o48, AO // load real,imag from A
addi AO, AO, 64
lxvd2x vs4, o0, AO // load real,imag from A
lxvd2x vs5, o16, AO // load real,imag from A
lxvd2x vs6, o32, AO // load real,imag from A
lxvd2x vs7, o48, AO // load real,imag from A
addi AO, AO, 64
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
lxvd2x vs18, o32, BO // load real part from B
lxvd2x vs19, o48, BO // load imag part from B
addi BO, BO, 64
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
xvmaddadp vs34, vs1, vs16 // real*real, imag*real
xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
xvmaddadp vs36, vs2, vs16 // real*real, imag*real
xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
xvmaddadp vs38, vs3, vs16 // real*real, imag*real
xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
xvmaddadp vs40, vs4, vs16 // real*real, imag*real
xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag
xvmaddadp vs42, vs5, vs16 // real*real, imag*real
xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag
xvmaddadp vs44, vs6, vs16 // real*real, imag*real
xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag
xvmaddadp vs46, vs7, vs16 // real*real, imag*real
xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag
xvmaddadp vs48, vs0, vs18 // real*real, imag*real
xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag
xvmaddadp vs50, vs1, vs18 // real*real, imag*real
xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag
xvmaddadp vs52, vs2, vs18 // real*real, imag*real
xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag
xvmaddadp vs54, vs3, vs18 // real*real, imag*real
xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag
xvmaddadp vs56, vs4, vs18 // real*real, imag*real
xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag
xvmaddadp vs58, vs5, vs18 // real*real, imag*real
xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag
xvmaddadp vs60, vs6, vs18 // real*real, imag*real
xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag
xvmaddadp vs62, vs7, vs18 // real*real, imag*real
xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
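/* SAVE2x8: for each of the 16 C elements, the accumulator pair is reduced to
 * a scalar real part (XSFADD_R1/XSFADD_R2) and a scalar imaginary part
 * (XSFADD_I1/XSFADD_I2, after XXSWAPD), scaled by alpha_r/alpha_i, merged
 * back into a vector with xxpermdi, added to the existing C values unless
 * TRMMKERNEL is defined, and stored. T1 and T2 address the two 64-byte
 * halves of one column of the 8x2 tile of C and advance by LDC to the
 * second column; CO is advanced by 128 bytes for the next tile. */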
#if defined(_AIX)
define(`SAVE2x8', `
#else
.macro SAVE2x8
#endif
mr T1, CO
addi T2, T1, 64
#ifndef TRMMKERNEL
lxvd2x vs16, o0, T1
lxvd2x vs17, o16, T1
lxvd2x vs18, o32, T1
lxvd2x vs19, o48, T1
lxvd2x vs20, o0, T2
lxvd2x vs21, o16, T2
lxvd2x vs22, o32, T2
lxvd2x vs23, o48, T2
#endif
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs32 // realA*realB
XSFADD_R2 vs0, vs0, vs33 // imagA*imagB
XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs32 // realA*imagB
XSFADD_I2 vs1, vs1, vs33 // imagA*realB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs35,vs35) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs34 // realA*realB
XSFADD_R2 vs0, vs0, vs35 // imagA*imagB
XXSWAPD(vs34,vs34) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs35,vs35) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs34 // realA*imagB
XSFADD_I2 vs1, vs1, vs35 // imagA*realB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs37,vs37) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs36 // realA*realB
XSFADD_R2 vs0, vs0, vs37 // imagA*imagB
XXSWAPD(vs36,vs36) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs37,vs37) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs36 // realA*imagB
XSFADD_I2 vs1, vs1, vs37 // imagA*realB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs10, vs2, vs3, 0 // merge real and imag part
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs39,vs39) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs38 // realA*realB
XSFADD_R2 vs0, vs0, vs39 // imagA*imagB
XXSWAPD(vs38,vs38) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs39,vs39) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs38 // realA*imagB
XSFADD_I2 vs1, vs1, vs39 // imagA*realB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs11, vs2, vs3, 0 // merge real and imag part
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs41,vs41) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs40 // realA*realB
XSFADD_R2 vs0, vs0, vs41 // imagA*imagB
XXSWAPD(vs40,vs40) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs41,vs41) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs40 // realA*imagB
XSFADD_I2 vs1, vs1, vs41 // imagA*realB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs12, vs2, vs3, 0 // merge real and imag part
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs43,vs43) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs42 // realA*realB
XSFADD_R2 vs0, vs0, vs43 // imagA*imagB
XXSWAPD(vs42,vs42) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs43,vs43) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs42 // realA*imagB
XSFADD_I2 vs1, vs1, vs43 // imagA*realB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs13, vs2, vs3, 0 // merge real and imag part
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs45,vs45) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs44 // realA*realB
XSFADD_R2 vs0, vs0, vs45 // imagA*imagB
XXSWAPD(vs44,vs44) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs45,vs45) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs44 // realA*imagB
XSFADD_I2 vs1, vs1, vs45 // imagA*realB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs14, vs2, vs3, 0 // merge real and imag part
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs47,vs47) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs46 // realA*realB
XSFADD_R2 vs0, vs0, vs47 // imagA*imagB
XXSWAPD(vs46,vs46) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs47,vs47) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs46 // realA*imagB
XSFADD_I2 vs1, vs1, vs47 // imagA*realB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs15, vs2, vs3, 0 // merge real and imag part
#ifndef TRMMKERNEL
xvadddp vs8, vs8, vs16
xvadddp vs9, vs9, vs17
xvadddp vs10, vs10, vs18
xvadddp vs11, vs11, vs19
xvadddp vs12, vs12, vs20
xvadddp vs13, vs13, vs21
xvadddp vs14, vs14, vs22
xvadddp vs15, vs15, vs23
#endif
stxvd2x vs8, o0, T1
stxvd2x vs9, o16, T1
stxvd2x vs10, o32, T1
stxvd2x vs11, o48, T1
stxvd2x vs12, o0, T2
stxvd2x vs13, o16, T2
stxvd2x vs14, o32, T2
stxvd2x vs15, o48, T2
add T1, T1, LDC
add T2, T2, LDC
#ifndef TRMMKERNEL
lxvd2x vs16, o0, T1
lxvd2x vs17, o16, T1
lxvd2x vs18, o32, T1
lxvd2x vs19, o48, T1
lxvd2x vs20, o0, T2
lxvd2x vs21, o16, T2
lxvd2x vs22, o32, T2
lxvd2x vs23, o48, T2
#endif
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs49,vs49) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs48 // realA*realB
XSFADD_R2 vs0, vs0, vs49 // imagA*imagB
XXSWAPD(vs48,vs48) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs49,vs49) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs48 // realA*imagB
XSFADD_I2 vs1, vs1, vs49 // imagA*realB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs51,vs51) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs50 // realA*realB
XSFADD_R2 vs0, vs0, vs51 // imagA*imagB
XXSWAPD(vs50,vs50) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs51,vs51) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs50 // realA*imagB
XSFADD_I2 vs1, vs1, vs51 // imagA*realB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs53,vs53) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs52 // realA*realB
XSFADD_R2 vs0, vs0, vs53 // imagA*imagB
XXSWAPD(vs52,vs52) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs53,vs53) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs52 // realA*imagB
XSFADD_I2 vs1, vs1, vs53 // imagA*realB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs10, vs2, vs3, 0 // merge real and imag part
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs55,vs55) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs54 // realA*realB
XSFADD_R2 vs0, vs0, vs55 // imagA*imagB
XXSWAPD(vs54,vs54) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs55,vs55) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs54 // realA*imagB
XSFADD_I2 vs1, vs1, vs55 // imagA*realB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs11, vs2, vs3, 0 // merge real and imag part
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs57,vs57) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs56 // realA*realB
XSFADD_R2 vs0, vs0, vs57 // imagA*imagB
XXSWAPD(vs56,vs56) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs57,vs57) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs56 // realA*imagB
XSFADD_I2 vs1, vs1, vs57 // imagA*realB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs12, vs2, vs3, 0 // merge real and imag part
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs59,vs59) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs58 // realA*realB
XSFADD_R2 vs0, vs0, vs59 // imagA*imagB
XXSWAPD(vs58,vs58) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs59,vs59) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs58 // realA*imagB
XSFADD_I2 vs1, vs1, vs59 // imagA*realB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs13, vs2, vs3, 0 // merge real and imag part
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs61,vs61) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs60 // realA*realB
XSFADD_R2 vs0, vs0, vs61 // imagA*imagB
XXSWAPD(vs60,vs60) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs61,vs61) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs60 // realA*imagB
XSFADD_I2 vs1, vs1, vs61 // imagA*realB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs14, vs2, vs3, 0 // merge real and imag part
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs63,vs63) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs62 // realA*realB
XSFADD_R2 vs0, vs0, vs63 // imagA*imagB
XXSWAPD(vs62,vs62) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs63,vs63) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs62 // realA*imagB
XSFADD_I2 vs1, vs1, vs63 // imagA*realB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs15, vs2, vs3, 0 // merge real and imag part
#ifndef TRMMKERNEL
xvadddp vs8, vs8, vs16
xvadddp vs9, vs9, vs17
xvadddp vs10, vs10, vs18
xvadddp vs11, vs11, vs19
xvadddp vs12, vs12, vs20
xvadddp vs13, vs13, vs21
xvadddp vs14, vs14, vs22
xvadddp vs15, vs15, vs23
#endif
stxvd2x vs8, o0, T1
stxvd2x vs9, o16, T1
stxvd2x vs10, o32, T1
stxvd2x vs11, o48, T1
stxvd2x vs12, o0, T2
stxvd2x vs13, o16, T2
stxvd2x vs14, o32, T2
stxvd2x vs15, o48, T2
add T1, T1, LDC
add T2, T2, LDC
addi CO, CO, 128
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=2 and M=4
**********************************************************************************************/
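/* The 2x4 macros follow the same pattern as the 2x8 ones with half the A
 * registers: vs0-vs3 and vs8-vs11 hold four complex elements of A and
 * vs32-vs47 are the 16 accumulators. */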
  742. #if defined(_AIX)
  743. define(`LOAD2x4_1', `
  744. #else
  745. .macro LOAD2x4_1
  746. #endif
  747. lxvd2x vs16, o0, BO // load real part from B
  748. lxvd2x vs17, o16, BO // load imag part from B
  749. lxvd2x vs18, o32, BO // load real part from B
  750. lxvd2x vs19, o48, BO // load imag part from B
  751. addi BO, BO, 64
  752. lxvd2x vs0, o0, AO // load real,imag from A
  753. lxvd2x vs1, o16, AO // load real,imag from A
  754. lxvd2x vs2, o32, AO // load real,imag from A
  755. lxvd2x vs3, o48, AO // load real,imag from A
  756. addi AO, AO, 64
  757. #if defined(_AIX)
  758. ')
  759. #else
  760. .endm
  761. #endif
  762. #if defined(_AIX)
  763. define(`KERNEL2x4_I1', `
  764. #else
  765. .macro KERNEL2x4_I1
  766. #endif
  767. lxvd2x vs8, o0, AO // load real,imag from A
  768. lxvd2x vs9, o16, AO // load real,imag from A
  769. lxvd2x vs10, o32, AO // load real,imag from A
  770. lxvd2x vs11, o48, AO // load real,imag from A
  771. addi AO, AO, 64
  772. lxvd2x vs20, o0, BO // load real part from B
  773. lxvd2x vs21, o16, BO // load imag part from B
  774. lxvd2x vs22, o32, BO // load real part from B
  775. lxvd2x vs23, o48, BO // load imag part from B
  776. addi BO, BO, 64
  777. xvmuldp vs32, vs0, vs16 // real*real, imag*real
  778. xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
  779. xvmuldp vs34, vs1, vs16 // real*real, imag*real
  780. xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
  781. xvmuldp vs36, vs2, vs16 // real*real, imag*real
  782. xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
  783. xvmuldp vs38, vs3, vs16 // real*real, imag*real
  784. xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
  785. xvmuldp vs40, vs0, vs18 // real*real, imag*real
  786. xvmuldp vs41, vs0, vs19 // real*imag, imag*imag
  787. xvmuldp vs42, vs1, vs18 // real*real, imag*real
  788. xvmuldp vs43, vs1, vs19 // real*imag, imag*imag
  789. xvmuldp vs44, vs2, vs18 // real*real, imag*real
  790. xvmuldp vs45, vs2, vs19 // real*imag, imag*imag
  791. xvmuldp vs46, vs3, vs18 // real*real, imag*real
  792. xvmuldp vs47, vs3, vs19 // real*imag, imag*imag
  793. #if defined(_AIX)
  794. ')
  795. #else
  796. .endm
  797. #endif
  798. #if defined(_AIX)
  799. define(`KERNEL2x4_1', `
  800. #else
  801. .macro KERNEL2x4_1
  802. #endif
  803. lxvd2x vs8, o0, AO // load real,imag from A
  804. lxvd2x vs9, o16, AO // load real,imag from A
  805. lxvd2x vs10, o32, AO // load real,imag from A
  806. lxvd2x vs11, o48, AO // load real,imag from A
  807. addi AO, AO, 64
  808. lxvd2x vs20, o0, BO // load real part from B
  809. lxvd2x vs21, o16, BO // load imag part from B
  810. lxvd2x vs22, o32, BO // load real part from B
  811. lxvd2x vs23, o48, BO // load imag part from B
  812. addi BO, BO, 64
  813. xvmaddadp vs32, vs0, vs16 // real*real, imag*real
  814. xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
  815. xvmaddadp vs34, vs1, vs16 // real*real, imag*real
  816. xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
  817. xvmaddadp vs36, vs2, vs16 // real*real, imag*real
  818. xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
  819. xvmaddadp vs38, vs3, vs16 // real*real, imag*real
  820. xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
  821. xvmaddadp vs40, vs0, vs18 // real*real, imag*real
  822. xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag
  823. xvmaddadp vs42, vs1, vs18 // real*real, imag*real
  824. xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag
  825. xvmaddadp vs44, vs2, vs18 // real*real, imag*real
  826. xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag
  827. xvmaddadp vs46, vs3, vs18 // real*real, imag*real
  828. xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag
  829. #if defined(_AIX)
  830. ')
  831. #else
  832. .endm
  833. #endif
  834. #if defined(_AIX)
  835. define(`KERNEL2x4_2', `
  836. #else
  837. .macro KERNEL2x4_2
  838. #endif
  839. lxvd2x vs0, o0, AO // load real,imag from A
  840. lxvd2x vs1, o16, AO // load real,imag from A
  841. lxvd2x vs2, o32, AO // load real,imag from A
  842. lxvd2x vs3, o48, AO // load real,imag from A
  843. addi AO, AO, 64
  844. lxvd2x vs16, o0, BO // load real part from B
  845. lxvd2x vs17, o16, BO // load imag part from B
  846. lxvd2x vs18, o32, BO // load real part from B
  847. lxvd2x vs19, o48, BO // load imag part from B
  848. addi BO, BO, 64
  849. xvmaddadp vs32, vs8, vs20 // real*real, imag*real
  850. xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
  851. xvmaddadp vs34, vs9, vs20 // real*real, imag*real
  852. xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
  853. xvmaddadp vs36, vs10, vs20 // real*real, imag*real
  854. xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
  855. xvmaddadp vs38, vs11, vs20 // real*real, imag*real
  856. xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
  857. xvmaddadp vs40, vs8, vs22 // real*real, imag*real
  858. xvmaddadp vs41, vs8, vs23 // real*imag, imag*imag
  859. xvmaddadp vs42, vs9, vs22 // real*real, imag*real
  860. xvmaddadp vs43, vs9, vs23 // real*imag, imag*imag
  861. xvmaddadp vs44, vs10, vs22 // real*real, imag*real
  862. xvmaddadp vs45, vs10, vs23 // real*imag, imag*imag
  863. xvmaddadp vs46, vs11, vs22 // real*real, imag*real
  864. xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag
  865. #if defined(_AIX)
  866. ')
  867. #else
  868. .endm
  869. #endif
  870. #if defined(_AIX)
  871. define(`KERNEL2x4_E2', `
  872. #else
  873. .macro KERNEL2x4_E2
  874. #endif
  875. xvmaddadp vs32, vs8, vs20 // real*real, imag*real
  876. xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
  877. xvmaddadp vs34, vs9, vs20 // real*real, imag*real
  878. xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
  879. xvmaddadp vs36, vs10, vs20 // real*real, imag*real
  880. xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
  881. xvmaddadp vs38, vs11, vs20 // real*real, imag*real
  882. xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
  883. xvmaddadp vs40, vs8, vs22 // real*real, imag*real
  884. xvmaddadp vs41, vs8, vs23 // real*imag, imag*imag
  885. xvmaddadp vs42, vs9, vs22 // real*real, imag*real
  886. xvmaddadp vs43, vs9, vs23 // real*imag, imag*imag
  887. xvmaddadp vs44, vs10, vs22 // real*real, imag*real
  888. xvmaddadp vs45, vs10, vs23 // real*imag, imag*imag
  889. xvmaddadp vs46, vs11, vs22 // real*real, imag*real
  890. xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag
  891. #if defined(_AIX)
  892. ')
  893. #else
  894. .endm
  895. #endif
  896. #if defined(_AIX)
  897. define(`KERNEL2x4_SUBI1', `
  898. #else
  899. .macro KERNEL2x4_SUBI1
  900. #endif
  901. lxvd2x vs0, o0, AO // load real,imag from A
  902. lxvd2x vs1, o16, AO // load real,imag from A
  903. lxvd2x vs2, o32, AO // load real,imag from A
  904. lxvd2x vs3, o48, AO // load real,imag from A
  905. addi AO, AO, 64
  906. lxvd2x vs16, o0, BO // load real part from B
  907. lxvd2x vs17, o16, BO // load imag part from B
  908. lxvd2x vs18, o32, BO // load real part from B
  909. lxvd2x vs19, o48, BO // load imag part from B
  910. addi BO, BO, 64
  911. xvmuldp vs32, vs0, vs16 // real*real, imag*real
  912. xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
  913. xvmuldp vs34, vs1, vs16 // real*real, imag*real
  914. xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
  915. xvmuldp vs36, vs2, vs16 // real*real, imag*real
  916. xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
  917. xvmuldp vs38, vs3, vs16 // real*real, imag*real
  918. xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
  919. xvmuldp vs40, vs0, vs18 // real*real, imag*real
  920. xvmuldp vs41, vs0, vs19 // real*imag, imag*imag
  921. xvmuldp vs42, vs1, vs18 // real*real, imag*real
  922. xvmuldp vs43, vs1, vs19 // real*imag, imag*imag
  923. xvmuldp vs44, vs2, vs18 // real*real, imag*real
  924. xvmuldp vs45, vs2, vs19 // real*imag, imag*imag
  925. xvmuldp vs46, vs3, vs18 // real*real, imag*real
  926. xvmuldp vs47, vs3, vs19 // real*imag, imag*imag
  927. #if defined(_AIX)
  928. ')
  929. #else
  930. .endm
  931. #endif
  932. #if defined(_AIX)
  933. define(`KERNEL2x4_SUB1', `
  934. #else
  935. .macro KERNEL2x4_SUB1
  936. #endif
  937. lxvd2x vs0, o0, AO // load real,imag from A
  938. lxvd2x vs1, o16, AO // load real,imag from A
  939. lxvd2x vs2, o32, AO // load real,imag from A
  940. lxvd2x vs3, o48, AO // load real,imag from A
  941. addi AO, AO, 64
  942. lxvd2x vs16, o0, BO // load real part from B
  943. lxvd2x vs17, o16, BO // load imag part from B
  944. lxvd2x vs18, o32, BO // load real part from B
  945. lxvd2x vs19, o48, BO // load imag part from B
  946. addi BO, BO, 64
  947. xvmaddadp vs32, vs0, vs16 // real*real, imag*real
  948. xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
  949. xvmaddadp vs34, vs1, vs16 // real*real, imag*real
  950. xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
  951. xvmaddadp vs36, vs2, vs16 // real*real, imag*real
  952. xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
  953. xvmaddadp vs38, vs3, vs16 // real*real, imag*real
  954. xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
  955. xvmaddadp vs40, vs0, vs18 // real*real, imag*real
  956. xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag
  957. xvmaddadp vs42, vs1, vs18 // real*real, imag*real
  958. xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag
  959. xvmaddadp vs44, vs2, vs18 // real*real, imag*real
  960. xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag
  961. xvmaddadp vs46, vs3, vs18 // real*real, imag*real
  962. xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag
  963. #if defined(_AIX)
  964. ')
  965. #else
  966. .endm
  967. #endif
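// SAVE2x4 below reduces the accumulators and writes one 4x2 block of C. Each pair
// such as vs32/vs33 holds {rA*rB, iA*rB} and {rA*iB, iA*iB}; the XXSWAPD and
// XSFADD_R1/R2/I1/I2 sequence folds them into scalar sums tmp_r and tmp_i,
// with the signs of the cross terms fixed by the XSFADD definitions selected
// for the conjugation variant. The scalars are then scaled by alpha and,
// unless TRMMKERNEL is defined, added to the value already in C:
//   c_r = tmp_r*alpha_r - tmp_i*alpha_i
//   c_i = tmp_r*alpha_i + tmp_i*alpha_r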
  968. #if defined(_AIX)
  969. define(`SAVE2x4', `
  970. #else
  971. .macro SAVE2x4
  972. #endif
  973. mr T1, CO
  974. #ifndef TRMMKERNEL
  975. lxvd2x vs16, o0, T1
  976. lxvd2x vs17, o16, T1
  977. lxvd2x vs18, o32, T1
  978. lxvd2x vs19, o48, T1
  979. #endif
  980. xxlxor vs0, vs0, vs0
  981. xxlxor vs1, vs1, vs1
  982. XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
  983. XSFADD_R1 vs0, vs0, vs32 // realA*realB
  984. XSFADD_R2 vs0, vs0, vs33 // imagA*imagB
  985. XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB
  986. XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
  987. XSFADD_I1 vs1, vs1, vs32 // realA*imagB
  988. XSFADD_I2 vs1, vs1, vs33 // imagA*realB
  989. xsmuldp vs4, vs0, alpha_r // real*alpha_r
  990. xsmuldp vs5, vs1, alpha_i // imag*alpha_i
  991. xsmuldp vs6, vs0, alpha_i // real*alpha_i
  992. xsmuldp vs7, vs1, alpha_r // imag*alpha_r
  993. xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
  994. xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
  995. xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
  996. xxlxor vs0, vs0, vs0
  997. xxlxor vs1, vs1, vs1
  998. XXSWAPD(vs35,vs35) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
  999. XSFADD_R1 vs0, vs0, vs34 // realA*realB
  1000. XSFADD_R2 vs0, vs0, vs35 // imagA*imagB
  1001. XXSWAPD(vs34,vs34) // realA*realB, imagA*realB -> imagA*realB, realA*realB
  1002. XXSWAPD(vs35,vs35) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
  1003. XSFADD_I1 vs1, vs1, vs34 // realA*imagB
  1004. XSFADD_I2 vs1, vs1, vs35 // imagA*realB
  1005. xsmuldp vs4, vs0, alpha_r // real*alpha_r
  1006. xsmuldp vs5, vs1, alpha_i // imag*alpha_i
  1007. xsmuldp vs6, vs0, alpha_i // real*alpha_i
  1008. xsmuldp vs7, vs1, alpha_r // imag*alpha_r
  1009. xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
  1010. xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
  1011. xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
  1012. xxlxor vs0, vs0, vs0
  1013. xxlxor vs1, vs1, vs1
  1014. XXSWAPD(vs37,vs37) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
  1015. XSFADD_R1 vs0, vs0, vs36 // realA*realB
  1016. XSFADD_R2 vs0, vs0, vs37 // imagA*imagB
  1017. XXSWAPD(vs36,vs36) // realA*realB, imagA*realB -> imagA*realB, realA*realB
  1018. XXSWAPD(vs37,vs37) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
  1019. XSFADD_I1 vs1, vs1, vs36 // realA*imagB
  1020. XSFADD_I2 vs1, vs1, vs37 // imagA*realB
  1021. xsmuldp vs4, vs0, alpha_r // real*alpha_r
  1022. xsmuldp vs5, vs1, alpha_i // imag*alpha_i
  1023. xsmuldp vs6, vs0, alpha_i // real*alpha_i
  1024. xsmuldp vs7, vs1, alpha_r // imag*alpha_r
  1025. xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
  1026. xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
  1027. xxpermdi vs10, vs2, vs3, 0 // merge real and imag part
  1028. xxlxor vs0, vs0, vs0
  1029. xxlxor vs1, vs1, vs1
  1030. XXSWAPD(vs39,vs39) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
  1031. XSFADD_R1 vs0, vs0, vs38 // realA*realB
  1032. XSFADD_R2 vs0, vs0, vs39 // imagA*imagB
  1033. XXSWAPD(vs38,vs38) // realA*realB, imagA*realB -> imagA*realB, realA*realB
  1034. XXSWAPD(vs39,vs39) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
  1035. XSFADD_I1 vs1, vs1, vs38 // realA*imagB
  1036. XSFADD_I2 vs1, vs1, vs39 // imagA*realB
  1037. xsmuldp vs4, vs0, alpha_r // real*alpha_r
  1038. xsmuldp vs5, vs1, alpha_i // imag*alpha_i
  1039. xsmuldp vs6, vs0, alpha_i // real*alpha_i
  1040. xsmuldp vs7, vs1, alpha_r // imag*alpha_r
  1041. xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
  1042. xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
  1043. xxpermdi vs11, vs2, vs3, 0 // merge real and imag part
  1044. #ifndef TRMMKERNEL
  1045. xvadddp vs8, vs8, vs16
  1046. xvadddp vs9, vs9, vs17
  1047. xvadddp vs10, vs10, vs18
  1048. xvadddp vs11, vs11, vs19
  1049. #endif
  1050. stxvd2x vs8, o0, T1
  1051. stxvd2x vs9, o16, T1
  1052. stxvd2x vs10, o32, T1
  1053. stxvd2x vs11, o48, T1
  1054. add T1, T1, LDC
  1055. #ifndef TRMMKERNEL
  1056. lxvd2x vs16, o0, T1
  1057. lxvd2x vs17, o16, T1
  1058. lxvd2x vs18, o32, T1
  1059. lxvd2x vs19, o48, T1
  1060. #endif
  1061. xxlxor vs0, vs0, vs0
  1062. xxlxor vs1, vs1, vs1
  1063. XXSWAPD(vs41,vs41) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
  1064. XSFADD_R1 vs0, vs0, vs40 // realA*realB
  1065. XSFADD_R2 vs0, vs0, vs41 // imagA*imagB
  1066. XXSWAPD(vs40,vs40) // realA*realB, imagA*realB -> imagA*realB, realA*realB
  1067. XXSWAPD(vs41,vs41) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
  1068. XSFADD_I1 vs1, vs1, vs40 // realA*imagB
  1069. XSFADD_I2 vs1, vs1, vs41 // imagA*realB
  1070. xsmuldp vs4, vs0, alpha_r // real*alpha_r
  1071. xsmuldp vs5, vs1, alpha_i // imag*alpha_i
  1072. xsmuldp vs6, vs0, alpha_i // real*alpha_i
  1073. xsmuldp vs7, vs1, alpha_r // imag*alpha_r
  1074. xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
  1075. xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
  1076. xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
  1077. xxlxor vs0, vs0, vs0
  1078. xxlxor vs1, vs1, vs1
  1079. XXSWAPD(vs43,vs43) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
  1080. XSFADD_R1 vs0, vs0, vs42 // realA*realB
  1081. XSFADD_R2 vs0, vs0, vs43 // imagA*imagB
  1082. XXSWAPD(vs42,vs42) // realA*realB, imagA*realB -> imagA*realB, realA*realB
  1083. XXSWAPD(vs43,vs43) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
  1084. XSFADD_I1 vs1, vs1, vs42 // realA*imagB
  1085. XSFADD_I2 vs1, vs1, vs43 // imagA*realB
  1086. xsmuldp vs4, vs0, alpha_r // real*alpha_r
  1087. xsmuldp vs5, vs1, alpha_i // imag*alpha_i
  1088. xsmuldp vs6, vs0, alpha_i // real*alpha_i
  1089. xsmuldp vs7, vs1, alpha_r // imag*alpha_r
  1090. xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
  1091. xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
  1092. xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
  1093. xxlxor vs0, vs0, vs0
  1094. xxlxor vs1, vs1, vs1
  1095. XXSWAPD(vs45,vs45) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
  1096. XSFADD_R1 vs0, vs0, vs44 // realA*realB
  1097. XSFADD_R2 vs0, vs0, vs45 // imagA*imagB
  1098. XXSWAPD(vs44,vs44) // realA*realB, imagA*realB -> imagA*realB, realA*realB
  1099. XXSWAPD(vs45,vs45) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
  1100. XSFADD_I1 vs1, vs1, vs44 // realA*imagB
  1101. XSFADD_I2 vs1, vs1, vs45 // imagA*realB
  1102. xsmuldp vs4, vs0, alpha_r // real*alpha_r
  1103. xsmuldp vs5, vs1, alpha_i // imag*alpha_i
  1104. xsmuldp vs6, vs0, alpha_i // real*alpha_i
  1105. xsmuldp vs7, vs1, alpha_r // imag*alpha_r
  1106. xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
  1107. xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
  1108. xxpermdi vs10, vs2, vs3, 0 // merge real and imag part
  1109. xxlxor vs0, vs0, vs0
  1110. xxlxor vs1, vs1, vs1
  1111. XXSWAPD(vs47,vs47) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
  1112. XSFADD_R1 vs0, vs0, vs46 // realA*realB
  1113. XSFADD_R2 vs0, vs0, vs47 // imagA*imagB
  1114. XXSWAPD(vs46,vs46) // realA*realB, imagA*realB -> imagA*realB, realA*realB
  1115. XXSWAPD(vs47,vs47) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
  1116. XSFADD_I1 vs1, vs1, vs46 // realA*imagB
  1117. XSFADD_I2 vs1, vs1, vs47 // imagA*realB
  1118. xsmuldp vs4, vs0, alpha_r // real*alpha_r
  1119. xsmuldp vs5, vs1, alpha_i // imag*alpha_i
  1120. xsmuldp vs6, vs0, alpha_i // real*alpha_i
  1121. xsmuldp vs7, vs1, alpha_r // imag*alpha_r
  1122. xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
  1123. xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
  1124. xxpermdi vs11, vs2, vs3, 0 // merge real and imag part
  1125. #ifndef TRMMKERNEL
  1126. xvadddp vs8, vs8, vs16
  1127. xvadddp vs9, vs9, vs17
  1128. xvadddp vs10, vs10, vs18
  1129. xvadddp vs11, vs11, vs19
  1130. #endif
  1131. stxvd2x vs8, o0, T1
  1132. stxvd2x vs9, o16, T1
  1133. stxvd2x vs10, o32, T1
  1134. stxvd2x vs11, o48, T1
  1135. add T1, T1, LDC
  1136. addi CO, CO, 64
  1137. #if defined(_AIX)
  1138. ')
  1139. #else
  1140. .endm
  1141. #endif
  1142. /**********************************************************************************************
  1143. * Macros for N=2 and M=2
  1144. **********************************************************************************************/
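// Same pipelined structure as the 2x4 macros above, but each iteration uses
// two complex elements of A (32 bytes) and only the accumulators vs32-vs39.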
  1145. #if defined(_AIX)
  1146. define(`LOAD2x2_1', `
  1147. #else
  1148. .macro LOAD2x2_1
  1149. #endif
  1150. lxvd2x vs16, o0, BO // load real part from B
  1151. lxvd2x vs17, o16, BO // load imag part from B
  1152. lxvd2x vs18, o32, BO // load real part from B
  1153. lxvd2x vs19, o48, BO // load imag part from B
  1154. addi BO, BO, 64
  1155. lxvd2x vs0, o0, AO // load real,imag from A
  1156. lxvd2x vs1, o16, AO // load real,imag from A
  1157. addi AO, AO, 32
  1158. #if defined(_AIX)
  1159. ')
  1160. #else
  1161. .endm
  1162. #endif
  1163. #if defined(_AIX)
  1164. define(`KERNEL2x2_I1', `
  1165. #else
  1166. .macro KERNEL2x2_I1
  1167. #endif
  1168. lxvd2x vs8, o0, AO // load real,imag from A
  1169. lxvd2x vs9, o16, AO // load real,imag from A
  1170. addi AO, AO, 32
  1171. lxvd2x vs20, o0, BO // load real part from B
  1172. lxvd2x vs21, o16, BO // load imag part from B
  1173. lxvd2x vs22, o32, BO // load real part from B
  1174. lxvd2x vs23, o48, BO // load imag part from B
  1175. addi BO, BO, 64
  1176. xvmuldp vs32, vs0, vs16 // real*real, imag*real
  1177. xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
  1178. xvmuldp vs34, vs1, vs16 // real*real, imag*real
  1179. xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
  1180. xvmuldp vs36, vs0, vs18 // real*real, imag*real
  1181. xvmuldp vs37, vs0, vs19 // real*imag, imag*imag
  1182. xvmuldp vs38, vs1, vs18 // real*real, imag*real
  1183. xvmuldp vs39, vs1, vs19 // real*imag, imag*imag
  1184. #if defined(_AIX)
  1185. ')
  1186. #else
  1187. .endm
  1188. #endif
  1189. #if defined(_AIX)
  1190. define(`KERNEL2x2_1', `
  1191. #else
  1192. .macro KERNEL2x2_1
  1193. #endif
  1194. lxvd2x vs8, o0, AO // load real,imag from A
  1195. lxvd2x vs9, o16, AO // load real,imag from A
  1196. addi AO, AO, 32
  1197. lxvd2x vs20, o0, BO // load real part from B
  1198. lxvd2x vs21, o16, BO // load imag part from B
  1199. lxvd2x vs22, o32, BO // load real part from B
  1200. lxvd2x vs23, o48, BO // load imag part from B
  1201. addi BO, BO, 64
  1202. xvmaddadp vs32, vs0, vs16 // real*real, imag*real
  1203. xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
  1204. xvmaddadp vs34, vs1, vs16 // real*real, imag*real
  1205. xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
  1206. xvmaddadp vs36, vs0, vs18 // real*real, imag*real
  1207. xvmaddadp vs37, vs0, vs19 // real*imag, imag*imag
  1208. xvmaddadp vs38, vs1, vs18 // real*real, imag*real
  1209. xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag
  1210. #if defined(_AIX)
  1211. ')
  1212. #else
  1213. .endm
  1214. #endif
  1215. #if defined(_AIX)
  1216. define(`KERNEL2x2_2', `
  1217. #else
  1218. .macro KERNEL2x2_2
  1219. #endif
  1220. lxvd2x vs0, o0, AO // load real,imag from A
  1221. lxvd2x vs1, o16, AO // load real,imag from A
  1222. addi AO, AO, 32
  1223. lxvd2x vs16, o0, BO // load real part from B
  1224. lxvd2x vs17, o16, BO // load imag part from B
  1225. lxvd2x vs18, o32, BO // load real part from B
  1226. lxvd2x vs19, o48, BO // load imag part from B
  1227. addi BO, BO, 64
  1228. xvmaddadp vs32, vs8, vs20 // real*real, imag*real
  1229. xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
  1230. xvmaddadp vs34, vs9, vs20 // real*real, imag*real
  1231. xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
  1232. xvmaddadp vs36, vs8, vs22 // real*real, imag*real
  1233. xvmaddadp vs37, vs8, vs23 // real*imag, imag*imag
  1234. xvmaddadp vs38, vs9, vs22 // real*real, imag*real
  1235. xvmaddadp vs39, vs9, vs23 // real*imag, imag*imag
  1236. #if defined(_AIX)
  1237. ')
  1238. #else
  1239. .endm
  1240. #endif
  1241. #if defined(_AIX)
  1242. define(`KERNEL2x2_E2', `
  1243. #else
  1244. .macro KERNEL2x2_E2
  1245. #endif
  1246. xvmaddadp vs32, vs8, vs20 // real*real, imag*real
  1247. xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
  1248. xvmaddadp vs34, vs9, vs20 // real*real, imag*real
  1249. xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
  1250. xvmaddadp vs36, vs8, vs22 // real*real, imag*real
  1251. xvmaddadp vs37, vs8, vs23 // real*imag, imag*imag
  1252. xvmaddadp vs38, vs9, vs22 // real*real, imag*real
  1253. xvmaddadp vs39, vs9, vs23 // real*imag, imag*imag
  1254. #if defined(_AIX)
  1255. ')
  1256. #else
  1257. .endm
  1258. #endif
  1259. #if defined(_AIX)
  1260. define(`KERNEL2x2_SUBI1', `
  1261. #else
  1262. .macro KERNEL2x2_SUBI1
  1263. #endif
  1264. lxvd2x vs0, o0, AO // load real,imag from A
  1265. lxvd2x vs1, o16, AO // load real,imag from A
  1266. addi AO, AO, 32
  1267. lxvd2x vs16, o0, BO // load real part from B
  1268. lxvd2x vs17, o16, BO // load imag part from B
  1269. lxvd2x vs18, o32, BO // load real part from B
  1270. lxvd2x vs19, o48, BO // load imag part from B
  1271. addi BO, BO, 64
  1272. xvmuldp vs32, vs0, vs16 // real*real, imag*real
  1273. xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
  1274. xvmuldp vs34, vs1, vs16 // real*real, imag*real
  1275. xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
  1276. xvmuldp vs36, vs0, vs18 // real*real, imag*real
  1277. xvmuldp vs37, vs0, vs19 // real*imag, imag*imag
  1278. xvmuldp vs38, vs1, vs18 // real*real, imag*real
  1279. xvmuldp vs39, vs1, vs19 // real*imag, imag*imag
  1280. #if defined(_AIX)
  1281. ')
  1282. #else
  1283. .endm
  1284. #endif
  1285. #if defined(_AIX)
  1286. define(`KERNEL2x2_SUB1', `
  1287. #else
  1288. .macro KERNEL2x2_SUB1
  1289. #endif
  1290. lxvd2x vs0, o0, AO // load real,imag from A
  1291. lxvd2x vs1, o16, AO // load real,imag from A
  1292. addi AO, AO, 32
  1293. lxvd2x vs16, o0, BO // load real part from B
  1294. lxvd2x vs17, o16, BO // load imag part from B
  1295. lxvd2x vs18, o32, BO // load real part from B
  1296. lxvd2x vs19, o48, BO // load imag part from B
  1297. addi BO, BO, 64
  1298. xvmaddadp vs32, vs0, vs16 // real*real, imag*real
  1299. xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
  1300. xvmaddadp vs34, vs1, vs16 // real*real, imag*real
  1301. xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
  1302. xvmaddadp vs36, vs0, vs18 // real*real, imag*real
  1303. xvmaddadp vs37, vs0, vs19 // real*imag, imag*imag
  1304. xvmaddadp vs38, vs1, vs18 // real*real, imag*real
  1305. xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag
  1306. #if defined(_AIX)
  1307. ')
  1308. #else
  1309. .endm
  1310. #endif
  1311. #if defined(_AIX)
  1312. define(`SAVE2x2', `
  1313. #else
  1314. .macro SAVE2x2
  1315. #endif
  1316. mr T1, CO
  1317. #ifndef TRMMKERNEL
  1318. lxvd2x vs16, o0, T1
  1319. lxvd2x vs17, o16, T1
  1320. #endif
  1321. xxlxor vs0, vs0, vs0
  1322. xxlxor vs1, vs1, vs1
  1323. XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
  1324. XSFADD_R1 vs0, vs0, vs32 // realA*realB
  1325. XSFADD_R2 vs0, vs0, vs33 // imagA*imagB
  1326. XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB
  1327. XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
  1328. XSFADD_I1 vs1, vs1, vs32 // realA*imagB
  1329. XSFADD_I2 vs1, vs1, vs33 // imagA*realB
  1330. xsmuldp vs4, vs0, alpha_r // real*alpha_r
  1331. xsmuldp vs5, vs1, alpha_i // imag*alpha_i
  1332. xsmuldp vs6, vs0, alpha_i // real*alpha_i
  1333. xsmuldp vs7, vs1, alpha_r // imag*alpha_r
  1334. xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
  1335. xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
  1336. xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
  1337. xxlxor vs0, vs0, vs0
  1338. xxlxor vs1, vs1, vs1
  1339. XXSWAPD(vs35,vs35) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
  1340. XSFADD_R1 vs0, vs0, vs34 // realA*realB
  1341. XSFADD_R2 vs0, vs0, vs35 // imagA*imagB
  1342. XXSWAPD(vs34,vs34) // realA*realB, imagA*realB -> imagA*realB, realA*realB
  1343. XXSWAPD(vs35,vs35) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
  1344. XSFADD_I1 vs1, vs1, vs34 // realA*imagB
  1345. XSFADD_I2 vs1, vs1, vs35 // imagA*realB
  1346. xsmuldp vs4, vs0, alpha_r // real*alpha_r
  1347. xsmuldp vs5, vs1, alpha_i // imag*alpha_i
  1348. xsmuldp vs6, vs0, alpha_i // real*alpha_i
  1349. xsmuldp vs7, vs1, alpha_r // imag*alpha_r
  1350. xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
  1351. xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
  1352. xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
  1353. #ifndef TRMMKERNEL
  1354. xvadddp vs8, vs8, vs16
  1355. xvadddp vs9, vs9, vs17
  1356. #endif
  1357. stxvd2x vs8, o0, T1
  1358. stxvd2x vs9, o16, T1
  1359. add T1, T1, LDC
  1360. #ifndef TRMMKERNEL
  1361. lxvd2x vs16, o0, T1
  1362. lxvd2x vs17, o16, T1
  1363. #endif
  1364. xxlxor vs0, vs0, vs0
  1365. xxlxor vs1, vs1, vs1
  1366. XXSWAPD(vs37,vs37) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
  1367. XSFADD_R1 vs0, vs0, vs36 // realA*realB
  1368. XSFADD_R2 vs0, vs0, vs37 // imagA*imagB
  1369. XXSWAPD(vs36,vs36) // realA*realB, imagA*realB -> imagA*realB, realA*realB
  1370. XXSWAPD(vs37,vs37) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
  1371. XSFADD_I1 vs1, vs1, vs36 // realA*imagB
  1372. XSFADD_I2 vs1, vs1, vs37 // imagA*realB
  1373. xsmuldp vs4, vs0, alpha_r // real*alpha_r
  1374. xsmuldp vs5, vs1, alpha_i // imag*alpha_i
  1375. xsmuldp vs6, vs0, alpha_i // real*alpha_i
  1376. xsmuldp vs7, vs1, alpha_r // imag*alpha_r
  1377. xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
  1378. xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
  1379. xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
  1380. xxlxor vs0, vs0, vs0
  1381. xxlxor vs1, vs1, vs1
  1382. XXSWAPD(vs39,vs39) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
  1383. XSFADD_R1 vs0, vs0, vs38 // realA*realB
  1384. XSFADD_R2 vs0, vs0, vs39 // imagA*imagB
  1385. XXSWAPD(vs38,vs38) // realA*realB, imagA*realB -> imagA*realB, realA*realB
  1386. XXSWAPD(vs39,vs39) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
  1387. XSFADD_I1 vs1, vs1, vs38 // realA*imagB
  1388. XSFADD_I2 vs1, vs1, vs39 // imagA*realB
  1389. xsmuldp vs4, vs0, alpha_r // real*alpha_r
  1390. xsmuldp vs5, vs1, alpha_i // imag*alpha_i
  1391. xsmuldp vs6, vs0, alpha_i // real*alpha_i
  1392. xsmuldp vs7, vs1, alpha_r // imag*alpha_r
  1393. xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
  1394. xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
  1395. xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
  1396. #ifndef TRMMKERNEL
  1397. xvadddp vs8, vs8, vs16
  1398. xvadddp vs9, vs9, vs17
  1399. #endif
  1400. stxvd2x vs8, o0, T1
  1401. stxvd2x vs9, o16, T1
  1402. add T1, T1, LDC
  1403. addi CO, CO, 32
  1404. #if defined(_AIX)
  1405. ')
  1406. #else
  1407. .endm
  1408. #endif
  1409. /**********************************************************************************************
  1410. * Macros for N=2 and M=1
  1411. **********************************************************************************************/
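// One complex element of A (16 bytes) per iteration; vs32/vs33 accumulate the
// first column of C and vs34/vs35 the second.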
  1412. #if defined(_AIX)
  1413. define(`LOAD2x1_1', `
  1414. #else
  1415. .macro LOAD2x1_1
  1416. #endif
  1417. lxvd2x vs16, o0, BO // load real part from B
  1418. lxvd2x vs17, o16, BO // load imag part from B
  1419. lxvd2x vs18, o32, BO // load real part from B
  1420. lxvd2x vs19, o48, BO // load imag part from B
  1421. addi BO, BO, 64
  1422. lxvd2x vs0, o0, AO // load real,imag from A
  1423. addi AO, AO, 16
  1424. #if defined(_AIX)
  1425. ')
  1426. #else
  1427. .endm
  1428. #endif
  1429. #if defined(_AIX)
  1430. define(`KERNEL2x1_I1', `
  1431. #else
  1432. .macro KERNEL2x1_I1
  1433. #endif
  1434. lxvd2x vs8, o0, AO // load real,imag from A
  1435. addi AO, AO, 16
  1436. lxvd2x vs20, o0, BO // load real part from B
  1437. lxvd2x vs21, o16, BO // load imag part from B
  1438. lxvd2x vs22, o32, BO // load real part from B
  1439. lxvd2x vs23, o48, BO // load imag part from B
  1440. addi BO, BO, 64
  1441. xvmuldp vs32, vs0, vs16 // real*real, imag*real
  1442. xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
  1443. xvmuldp vs34, vs0, vs18 // real*real, imag*real
  1444. xvmuldp vs35, vs0, vs19 // real*imag, imag*imag
  1445. #if defined(_AIX)
  1446. ')
  1447. #else
  1448. .endm
  1449. #endif
  1450. #if defined(_AIX)
  1451. define(`KERNEL2x1_1', `
  1452. #else
  1453. .macro KERNEL2x1_1
  1454. #endif
  1455. lxvd2x vs8, o0, AO // load real,imag from A
  1456. addi AO, AO, 16
  1457. lxvd2x vs20, o0, BO // load real part from B
  1458. lxvd2x vs21, o16, BO // load imag part from B
  1459. lxvd2x vs22, o32, BO // load real part from B
  1460. lxvd2x vs23, o48, BO // load imag part from B
  1461. addi BO, BO, 64
  1462. xvmaddadp vs32, vs0, vs16 // real*real, imag*real
  1463. xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
  1464. xvmaddadp vs34, vs0, vs18 // real*real, imag*real
  1465. xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag
  1466. #if defined(_AIX)
  1467. ')
  1468. #else
  1469. .endm
  1470. #endif
  1471. #if defined(_AIX)
  1472. define(`KERNEL2x1_2', `
  1473. #else
  1474. .macro KERNEL2x1_2
  1475. #endif
  1476. lxvd2x vs0, o0, AO // load real,imag from A
  1477. addi AO, AO, 16
  1478. lxvd2x vs16, o0, BO // load real part from B
  1479. lxvd2x vs17, o16, BO // load imag part from B
  1480. lxvd2x vs18, o32, BO // load real part from B
  1481. lxvd2x vs19, o48, BO // load imag part from B
  1482. addi BO, BO, 64
  1483. xvmaddadp vs32, vs8, vs20 // real*real, imag*real
  1484. xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
  1485. xvmaddadp vs34, vs8, vs22 // real*real, imag*real
  1486. xvmaddadp vs35, vs8, vs23 // real*imag, imag*imag
  1487. #if defined(_AIX)
  1488. ')
  1489. #else
  1490. .endm
  1491. #endif
  1492. #if defined(_AIX)
  1493. define(`KERNEL2x1_E2', `
  1494. #else
  1495. .macro KERNEL2x1_E2
  1496. #endif
  1497. xvmaddadp vs32, vs8, vs20 // real*real, imag*real
  1498. xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
  1499. xvmaddadp vs34, vs8, vs22 // real*real, imag*real
  1500. xvmaddadp vs35, vs8, vs23 // real*imag, imag*imag
  1501. #if defined(_AIX)
  1502. ')
  1503. #else
  1504. .endm
  1505. #endif
  1506. #if defined(_AIX)
  1507. define(`KERNEL2x1_SUBI1', `
  1508. #else
  1509. .macro KERNEL2x1_SUBI1
  1510. #endif
  1511. lxvd2x vs0, o0, AO // load real,imag from A
  1512. addi AO, AO, 16
  1513. lxvd2x vs16, o0, BO // load real part from B
  1514. lxvd2x vs17, o16, BO // load imag part from B
  1515. lxvd2x vs18, o32, BO // load real part from B
  1516. lxvd2x vs19, o48, BO // load imag part from B
  1517. addi BO, BO, 64
  1518. xvmuldp vs32, vs0, vs16 // real*real, imag*real
  1519. xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
  1520. xvmuldp vs34, vs0, vs18 // real*real, imag*real
  1521. xvmuldp vs35, vs0, vs19 // real*imag, imag*imag
  1522. #if defined(_AIX)
  1523. ')
  1524. #else
  1525. .endm
  1526. #endif
  1527. #if defined(_AIX)
  1528. define(`KERNEL2x1_SUB1', `
  1529. #else
  1530. .macro KERNEL2x1_SUB1
  1531. #endif
  1532. lxvd2x vs0, o0, AO // load real,imag from A
  1533. addi AO, AO, 16
  1534. lxvd2x vs16, o0, BO // load real part from B
  1535. lxvd2x vs17, o16, BO // load imag part from B
  1536. lxvd2x vs18, o32, BO // load real part from B
  1537. lxvd2x vs19, o48, BO // load imag part from B
  1538. addi BO, BO, 64
  1539. xvmaddadp vs32, vs0, vs16 // real*real, imag*real
  1540. xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
  1541. xvmaddadp vs34, vs0, vs18 // real*real, imag*real
  1542. xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag
  1543. #if defined(_AIX)
  1544. ')
  1545. #else
  1546. .endm
  1547. #endif
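// SAVE2x1 below runs the scalar reduction once per column, stepping T1 by LDC
// between the two stores.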
  1548. #if defined(_AIX)
  1549. define(`SAVE2x1', `
  1550. #else
  1551. .macro SAVE2x1
  1552. #endif
  1553. mr T1, CO
  1554. #ifndef TRMMKERNEL
  1555. lxvd2x vs16, o0, T1
  1556. #endif
  1557. xxlxor vs0, vs0, vs0
  1558. xxlxor vs1, vs1, vs1
  1559. XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
  1560. XSFADD_R1 vs0, vs0, vs32 // realA*realB
  1561. XSFADD_R2 vs0, vs0, vs33 // imagA*imagB
  1562. XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB
  1563. XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
  1564. XSFADD_I1 vs1, vs1, vs32 // realA*imagB
  1565. XSFADD_I2 vs1, vs1, vs33 // imagA*realB
  1566. xsmuldp vs4, vs0, alpha_r // real*alpha_r
  1567. xsmuldp vs5, vs1, alpha_i // imag*alpha_i
  1568. xsmuldp vs6, vs0, alpha_i // real*alpha_i
  1569. xsmuldp vs7, vs1, alpha_r // imag*alpha_r
  1570. xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
  1571. xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
  1572. xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
  1573. #ifndef TRMMKERNEL
  1574. xvadddp vs8, vs8, vs16
  1575. #endif
  1576. stxvd2x vs8, o0, T1
  1577. add T1, T1, LDC
  1578. #ifndef TRMMKERNEL
  1579. lxvd2x vs16, o0, T1
  1580. #endif
  1581. xxlxor vs0, vs0, vs0
  1582. xxlxor vs1, vs1, vs1
  1583. XXSWAPD(vs35,vs35) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
  1584. XSFADD_R1 vs0, vs0, vs34 // realA*realB
  1585. XSFADD_R2 vs0, vs0, vs35 // imagA*imagB
  1586. XXSWAPD(vs34,vs34) // realA*realB, imagA*realB -> imagA*realB, realA*realB
  1587. XXSWAPD(vs35,vs35) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
  1588. XSFADD_I1 vs1, vs1, vs34 // realA*imagB
  1589. XSFADD_I2 vs1, vs1, vs35 // imagA*realB
  1590. xsmuldp vs4, vs0, alpha_r // real*alpha_r
  1591. xsmuldp vs5, vs1, alpha_i // imag*alpha_i
  1592. xsmuldp vs6, vs0, alpha_i // real*alpha_i
  1593. xsmuldp vs7, vs1, alpha_r // imag*alpha_r
  1594. xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
  1595. xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
  1596. xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
  1597. #ifndef TRMMKERNEL
  1598. xvadddp vs8, vs8, vs16
  1599. #endif
  1600. stxvd2x vs8, o0, T1
  1601. add T1, T1, LDC
  1602. addi CO, CO, 16
  1603. #if defined(_AIX)
  1604. ')
  1605. #else
  1606. .endm
  1607. #endif
  1608. /**********************************************************************************************
  1609. * Macros for N=1 and M=8
  1610. **********************************************************************************************/
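// Single column of B: each iteration loads the real-part and imag-part
// vectors of B (32 bytes) and eight complex elements of A (128 bytes),
// accumulating into vs32-vs47.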
  1611. #if defined(_AIX)
  1612. define(`LOAD1x8_1', `
  1613. #else
  1614. .macro LOAD1x8_1
  1615. #endif
  1616. lxvd2x vs16, o0, BO // load real part from B
  1617. lxvd2x vs17, o16, BO // load imag part from B
  1618. addi BO, BO, 32
  1619. lxvd2x vs0, o0, AO // load real,imag from A
  1620. lxvd2x vs1, o16, AO // load real,imag from A
  1621. lxvd2x vs2, o32, AO // load real,imag from A
  1622. lxvd2x vs3, o48, AO // load real,imag from A
  1623. addi AO, AO, 64
  1624. lxvd2x vs4, o0, AO // load real,imag from A
  1625. lxvd2x vs5, o16, AO // load real,imag from A
  1626. lxvd2x vs6, o32, AO // load real,imag from A
  1627. lxvd2x vs7, o48, AO // load real,imag from A
  1628. addi AO, AO, 64
  1629. #if defined(_AIX)
  1630. ')
  1631. #else
  1632. .endm
  1633. #endif
  1634. #if defined(_AIX)
  1635. define(`KERNEL1x8_I1', `
  1636. #else
  1637. .macro KERNEL1x8_I1
  1638. #endif
  1639. lxvd2x vs8, o0, AO // load real,imag from A
  1640. lxvd2x vs9, o16, AO // load real,imag from A
  1641. lxvd2x vs10, o32, AO // load real,imag from A
  1642. lxvd2x vs11, o48, AO // load real,imag from A
  1643. addi AO, AO, 64
  1644. lxvd2x vs12, o0, AO // load real,imag from A
  1645. lxvd2x vs13, o16, AO // load real,imag from A
  1646. lxvd2x vs14, o32, AO // load real,imag from A
  1647. lxvd2x vs15, o48, AO // load real,imag from A
  1648. addi AO, AO, 64
  1649. lxvd2x vs20, o0, BO // load real part from B
  1650. lxvd2x vs21, o16, BO // load imag part from B
  1651. addi BO, BO, 32
  1652. xvmuldp vs32, vs0, vs16 // real*real, imag*real
  1653. xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
  1654. xvmuldp vs34, vs1, vs16 // real*real, imag*real
  1655. xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
  1656. xvmuldp vs36, vs2, vs16 // real*real, imag*real
  1657. xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
  1658. xvmuldp vs38, vs3, vs16 // real*real, imag*real
  1659. xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
  1660. xvmuldp vs40, vs4, vs16 // real*real, imag*real
  1661. xvmuldp vs41, vs4, vs17 // real*imag, imag*imag
  1662. xvmuldp vs42, vs5, vs16 // real*real, imag*real
  1663. xvmuldp vs43, vs5, vs17 // real*imag, imag*imag
  1664. xvmuldp vs44, vs6, vs16 // real*real, imag*real
  1665. xvmuldp vs45, vs6, vs17 // real*imag, imag*imag
  1666. xvmuldp vs46, vs7, vs16 // real*real, imag*real
  1667. xvmuldp vs47, vs7, vs17 // real*imag, imag*imag
  1668. #if defined(_AIX)
  1669. ')
  1670. #else
  1671. .endm
  1672. #endif
  1673. #if defined(_AIX)
  1674. define(`KERNEL1x8_1', `
  1675. #else
  1676. .macro KERNEL1x8_1
  1677. #endif
  1678. lxvd2x vs8, o0, AO // load real,imag from A
  1679. lxvd2x vs9, o16, AO // load real,imag from A
  1680. lxvd2x vs10, o32, AO // load real,imag from A
  1681. lxvd2x vs11, o48, AO // load real,imag from A
  1682. addi AO, AO, 64
  1683. lxvd2x vs12, o0, AO // load real,imag from A
  1684. lxvd2x vs13, o16, AO // load real,imag from A
  1685. lxvd2x vs14, o32, AO // load real,imag from A
  1686. lxvd2x vs15, o48, AO // load real,imag from A
  1687. addi AO, AO, 64
  1688. lxvd2x vs20, o0, BO // load real part from B
  1689. lxvd2x vs21, o16, BO // load imag part from B
  1690. addi BO, BO, 32
  1691. xvmaddadp vs32, vs0, vs16 // real*real, imag*real
  1692. xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
  1693. xvmaddadp vs34, vs1, vs16 // real*real, imag*real
  1694. xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
  1695. xvmaddadp vs36, vs2, vs16 // real*real, imag*real
  1696. xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
  1697. xvmaddadp vs38, vs3, vs16 // real*real, imag*real
  1698. xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
  1699. xvmaddadp vs40, vs4, vs16 // real*real, imag*real
  1700. xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag
  1701. xvmaddadp vs42, vs5, vs16 // real*real, imag*real
  1702. xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag
  1703. xvmaddadp vs44, vs6, vs16 // real*real, imag*real
  1704. xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag
  1705. xvmaddadp vs46, vs7, vs16 // real*real, imag*real
  1706. xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag
  1707. #if defined(_AIX)
  1708. ')
  1709. #else
  1710. .endm
  1711. #endif
  1712. #if defined(_AIX)
  1713. define(`KERNEL1x8_2', `
  1714. #else
  1715. .macro KERNEL1x8_2
  1716. #endif
  1717. lxvd2x vs0, o0, AO // load real,imag from A
  1718. lxvd2x vs1, o16, AO // load real,imag from A
  1719. lxvd2x vs2, o32, AO // load real,imag from A
  1720. lxvd2x vs3, o48, AO // load real,imag from A
  1721. addi AO, AO, 64
  1722. lxvd2x vs4, o0, AO // load real,imag from A
  1723. lxvd2x vs5, o16, AO // load real,imag from A
  1724. lxvd2x vs6, o32, AO // load real,imag from A
  1725. lxvd2x vs7, o48, AO // load real,imag from A
  1726. addi AO, AO, 64
  1727. lxvd2x vs16, o0, BO // load real part from B
  1728. lxvd2x vs17, o16, BO // load imag part from B
  1729. addi BO, BO, 32
  1730. xvmaddadp vs32, vs8, vs20 // real*real, imag*real
  1731. xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
  1732. xvmaddadp vs34, vs9, vs20 // real*real, imag*real
  1733. xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
  1734. xvmaddadp vs36, vs10, vs20 // real*real, imag*real
  1735. xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
  1736. xvmaddadp vs38, vs11, vs20 // real*real, imag*real
  1737. xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
  1738. xvmaddadp vs40, vs12, vs20 // real*real, imag*real
  1739. xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag
  1740. xvmaddadp vs42, vs13, vs20 // real*real, imag*real
  1741. xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag
  1742. xvmaddadp vs44, vs14, vs20 // real*real, imag*real
  1743. xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag
  1744. xvmaddadp vs46, vs15, vs20 // real*real, imag*real
  1745. xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag
  1746. #if defined(_AIX)
  1747. ')
  1748. #else
  1749. .endm
  1750. #endif
  1751. #if defined(_AIX)
  1752. define(`KERNEL1x8_E2', `
  1753. #else
  1754. .macro KERNEL1x8_E2
  1755. #endif
  1756. xvmaddadp vs32, vs8, vs20 // real*real, imag*real
  1757. xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
  1758. xvmaddadp vs34, vs9, vs20 // real*real, imag*real
  1759. xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
  1760. xvmaddadp vs36, vs10, vs20 // real*real, imag*real
  1761. xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
  1762. xvmaddadp vs38, vs11, vs20 // real*real, imag*real
  1763. xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
  1764. xvmaddadp vs40, vs12, vs20 // real*real, imag*real
  1765. xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag
  1766. xvmaddadp vs42, vs13, vs20 // real*real, imag*real
  1767. xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag
  1768. xvmaddadp vs44, vs14, vs20 // real*real, imag*real
  1769. xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag
  1770. xvmaddadp vs46, vs15, vs20 // real*real, imag*real
  1771. xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag
  1772. #if defined(_AIX)
  1773. ')
  1774. #else
  1775. .endm
  1776. #endif
  1777. #if defined(_AIX)
  1778. define(`KERNEL1x8_SUBI1', `
  1779. #else
  1780. .macro KERNEL1x8_SUBI1
  1781. #endif
  1782. lxvd2x vs0, o0, AO // load real,imag from A
  1783. lxvd2x vs1, o16, AO // load real,imag from A
  1784. lxvd2x vs2, o32, AO // load real,imag from A
  1785. lxvd2x vs3, o48, AO // load real,imag from A
  1786. addi AO, AO, 64
  1787. lxvd2x vs4, o0, AO // load real,imag from A
  1788. lxvd2x vs5, o16, AO // load real,imag from A
  1789. lxvd2x vs6, o32, AO // load real,imag from A
  1790. lxvd2x vs7, o48, AO // load real,imag from A
  1791. addi AO, AO, 64
  1792. lxvd2x vs16, o0, BO // load real part from B
  1793. lxvd2x vs17, o16, BO // load imag part from B
  1794. addi BO, BO, 32
  1795. xvmuldp vs32, vs0, vs16 // real*real, imag*real
  1796. xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
  1797. xvmuldp vs34, vs1, vs16 // real*real, imag*real
  1798. xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
  1799. xvmuldp vs36, vs2, vs16 // real*real, imag*real
  1800. xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
  1801. xvmuldp vs38, vs3, vs16 // real*real, imag*real
  1802. xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
  1803. xvmuldp vs40, vs4, vs16 // real*real, imag*real
  1804. xvmuldp vs41, vs4, vs17 // real*imag, imag*imag
  1805. xvmuldp vs42, vs5, vs16 // real*real, imag*real
  1806. xvmuldp vs43, vs5, vs17 // real*imag, imag*imag
  1807. xvmuldp vs44, vs6, vs16 // real*real, imag*real
  1808. xvmuldp vs45, vs6, vs17 // real*imag, imag*imag
  1809. xvmuldp vs46, vs7, vs16 // real*real, imag*real
  1810. xvmuldp vs47, vs7, vs17 // real*imag, imag*imag
  1811. #if defined(_AIX)
  1812. ')
  1813. #else
  1814. .endm
  1815. #endif
  1816. #if defined(_AIX)
  1817. define(`KERNEL1x8_SUB1', `
  1818. #else
  1819. .macro KERNEL1x8_SUB1
  1820. #endif
  1821. lxvd2x vs0, o0, AO // load real,imag from A
  1822. lxvd2x vs1, o16, AO // load real,imag from A
  1823. lxvd2x vs2, o32, AO // load real,imag from A
  1824. lxvd2x vs3, o48, AO // load real,imag from A
  1825. addi AO, AO, 64
  1826. lxvd2x vs4, o0, AO // load real,imag from A
  1827. lxvd2x vs5, o16, AO // load real,imag from A
  1828. lxvd2x vs6, o32, AO // load real,imag from A
  1829. lxvd2x vs7, o48, AO // load real,imag from A
  1830. addi AO, AO, 64
  1831. lxvd2x vs16, o0, BO // load real part from B
  1832. lxvd2x vs17, o16, BO // load imag part from B
  1833. addi BO, BO, 32
  1834. xvmaddadp vs32, vs0, vs16 // real*real, imag*real
  1835. xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
  1836. xvmaddadp vs34, vs1, vs16 // real*real, imag*real
  1837. xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
  1838. xvmaddadp vs36, vs2, vs16 // real*real, imag*real
  1839. xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
  1840. xvmaddadp vs38, vs3, vs16 // real*real, imag*real
  1841. xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
  1842. xvmaddadp vs40, vs4, vs16 // real*real, imag*real
  1843. xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag
  1844. xvmaddadp vs42, vs5, vs16 // real*real, imag*real
  1845. xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag
  1846. xvmaddadp vs44, vs6, vs16 // real*real, imag*real
  1847. xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag
  1848. xvmaddadp vs46, vs7, vs16 // real*real, imag*real
  1849. xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag
  1850. #if defined(_AIX)
  1851. ')
  1852. #else
  1853. .endm
  1854. #endif
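// SAVE1x8 below writes a full 128 byte row of C; T1 addresses the first four
// complex elements and T2 = T1 + 64 the remaining four.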
  1855. #if defined(_AIX)
  1856. define(`SAVE1x8', `
  1857. #else
  1858. .macro SAVE1x8
  1859. #endif
  1860. mr T1, CO
  1861. addi T2, T1, 64
  1862. #ifndef TRMMKERNEL
  1863. lxvd2x vs16, o0, T1
  1864. lxvd2x vs17, o16, T1
  1865. lxvd2x vs18, o32, T1
  1866. lxvd2x vs19, o48, T1
  1867. lxvd2x vs20, o0, T2
  1868. lxvd2x vs21, o16, T2
  1869. lxvd2x vs22, o32, T2
  1870. lxvd2x vs23, o48, T2
  1871. #endif
  1872. xxlxor vs0, vs0, vs0
  1873. xxlxor vs1, vs1, vs1
  1874. XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
  1875. XSFADD_R1 vs0, vs0, vs32 // realA*realB
  1876. XSFADD_R2 vs0, vs0, vs33 // imagA*imagB
  1877. XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB
  1878. XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
  1879. XSFADD_I1 vs1, vs1, vs32 // realA*imagB
  1880. XSFADD_I2 vs1, vs1, vs33 // imagA*realB
  1881. xsmuldp vs4, vs0, alpha_r // real*alpha_r
  1882. xsmuldp vs5, vs1, alpha_i // imag*alpha_i
  1883. xsmuldp vs6, vs0, alpha_i // real*alpha_i
  1884. xsmuldp vs7, vs1, alpha_r // imag*alpha_r
  1885. xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
  1886. xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
  1887. xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
  1888. xxlxor vs0, vs0, vs0
  1889. xxlxor vs1, vs1, vs1
  1890. XXSWAPD(vs35,vs35) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
  1891. XSFADD_R1 vs0, vs0, vs34 // realA*realB
  1892. XSFADD_R2 vs0, vs0, vs35 // imagA*imagB
  1893. XXSWAPD(vs34,vs34) // realA*realB, imagA*realB -> imagA*realB, realA*realB
  1894. XXSWAPD(vs35,vs35) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
  1895. XSFADD_I1 vs1, vs1, vs34 // realA*imagB
  1896. XSFADD_I2 vs1, vs1, vs35 // imagA*realB
  1897. xsmuldp vs4, vs0, alpha_r // real*alpha_r
  1898. xsmuldp vs5, vs1, alpha_i // imag*alpha_i
  1899. xsmuldp vs6, vs0, alpha_i // real*alpha_i
  1900. xsmuldp vs7, vs1, alpha_r // imag*alpha_r
  1901. xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
  1902. xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
  1903. xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
  1904. xxlxor vs0, vs0, vs0
  1905. xxlxor vs1, vs1, vs1
  1906. XXSWAPD(vs37,vs37) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
  1907. XSFADD_R1 vs0, vs0, vs36 // realA*realB
  1908. XSFADD_R2 vs0, vs0, vs37 // imagA*imagB
  1909. XXSWAPD(vs36,vs36) // realA*realB, imagA*realB -> imagA*realB, realA*realB
  1910. XXSWAPD(vs37,vs37) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
  1911. XSFADD_I1 vs1, vs1, vs36 // realA*imagB
  1912. XSFADD_I2 vs1, vs1, vs37 // imagA*realB
  1913. xsmuldp vs4, vs0, alpha_r // real*alpha_r
  1914. xsmuldp vs5, vs1, alpha_i // imag*alpha_i
  1915. xsmuldp vs6, vs0, alpha_i // real*alpha_i
  1916. xsmuldp vs7, vs1, alpha_r // imag*alpha_r
  1917. xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
  1918. xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
  1919. xxpermdi vs10, vs2, vs3, 0 // merge real and imag part
  1920. xxlxor vs0, vs0, vs0
  1921. xxlxor vs1, vs1, vs1
  1922. XXSWAPD(vs39,vs39) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
  1923. XSFADD_R1 vs0, vs0, vs38 // realA*realB
  1924. XSFADD_R2 vs0, vs0, vs39 // imagA*imagB
  1925. XXSWAPD(vs38,vs38) // realA*realB, imagA*realB -> imagA*realB, realA*realB
  1926. XXSWAPD(vs39,vs39) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
  1927. XSFADD_I1 vs1, vs1, vs38 // realA*imagB
  1928. XSFADD_I2 vs1, vs1, vs39 // imagA*realB
  1929. xsmuldp vs4, vs0, alpha_r // real*alpha_r
  1930. xsmuldp vs5, vs1, alpha_i // imag*alpha_i
  1931. xsmuldp vs6, vs0, alpha_i // real*alpha_i
  1932. xsmuldp vs7, vs1, alpha_r // imag*alpha_r
  1933. xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
  1934. xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
  1935. xxpermdi vs11, vs2, vs3, 0 // merge real and imag part
  1936. xxlxor vs0, vs0, vs0
  1937. xxlxor vs1, vs1, vs1
  1938. XXSWAPD(vs41,vs41) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
  1939. XSFADD_R1 vs0, vs0, vs40 // realA*realB
  1940. XSFADD_R2 vs0, vs0, vs41 // imagA*imagB
  1941. XXSWAPD(vs40,vs40) // realA*realB, imagA*realB -> imagA*realB, realA*realB
  1942. XXSWAPD(vs41,vs41) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
  1943. XSFADD_I1 vs1, vs1, vs40 // realA*imagB
  1944. XSFADD_I2 vs1, vs1, vs41 // imagA*realB
  1945. xsmuldp vs4, vs0, alpha_r // real*alpha_r
  1946. xsmuldp vs5, vs1, alpha_i // imag*alpha_i
  1947. xsmuldp vs6, vs0, alpha_i // real*alpha_i
  1948. xsmuldp vs7, vs1, alpha_r // imag*alpha_r
  1949. xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
  1950. xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
  1951. xxpermdi vs12, vs2, vs3, 0 // merge real and imag part
  1952. xxlxor vs0, vs0, vs0
  1953. xxlxor vs1, vs1, vs1
  1954. XXSWAPD(vs43,vs43) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
  1955. XSFADD_R1 vs0, vs0, vs42 // realA*realB
  1956. XSFADD_R2 vs0, vs0, vs43 // imagA*imagB
  1957. XXSWAPD(vs42,vs42) // realA*realB, imagA*realB -> imagA*realB, realA*realB
  1958. XXSWAPD(vs43,vs43) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
  1959. XSFADD_I1 vs1, vs1, vs42 // realA*imagB
  1960. XSFADD_I2 vs1, vs1, vs43 // imagA*realB
  1961. xsmuldp vs4, vs0, alpha_r // real*alpha_r
  1962. xsmuldp vs5, vs1, alpha_i // imag*alpha_i
  1963. xsmuldp vs6, vs0, alpha_i // real*alpha_i
  1964. xsmuldp vs7, vs1, alpha_r // imag*alpha_r
  1965. xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
  1966. xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
  1967. xxpermdi vs13, vs2, vs3, 0 // merge real and imag part
  1968. xxlxor vs0, vs0, vs0
  1969. xxlxor vs1, vs1, vs1
  1970. XXSWAPD(vs45,vs45) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
  1971. XSFADD_R1 vs0, vs0, vs44 // realA*realB
  1972. XSFADD_R2 vs0, vs0, vs45 // imagA*imagB
  1973. XXSWAPD(vs44,vs44) // realA*realB, imagA*realB -> imagA*realB, realA*realB
  1974. XXSWAPD(vs45,vs45) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
  1975. XSFADD_I1 vs1, vs1, vs44 // realA*imagB
  1976. XSFADD_I2 vs1, vs1, vs45 // imagA*realB
  1977. xsmuldp vs4, vs0, alpha_r // real*alpha_r
  1978. xsmuldp vs5, vs1, alpha_i // imag*alpha_i
  1979. xsmuldp vs6, vs0, alpha_i // real*alpha_i
  1980. xsmuldp vs7, vs1, alpha_r // imag*alpha_r
  1981. xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
  1982. xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
  1983. xxpermdi vs14, vs2, vs3, 0 // merge real and imag part
  1984. xxlxor vs0, vs0, vs0
  1985. xxlxor vs1, vs1, vs1
  1986. XXSWAPD(vs47,vs47) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
  1987. XSFADD_R1 vs0, vs0, vs46 // realA*realB
  1988. XSFADD_R2 vs0, vs0, vs47 // imagA*imagB
  1989. XXSWAPD(vs46,vs46) // realA*realB, imagA*realB -> imagA*realB, realA*realB
  1990. XXSWAPD(vs47,vs47) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
  1991. XSFADD_I1 vs1, vs1, vs46 // realA*imagB
  1992. XSFADD_I2 vs1, vs1, vs47 // imagA*realB
  1993. xsmuldp vs4, vs0, alpha_r // real*alpha_r
  1994. xsmuldp vs5, vs1, alpha_i // imag*alpha_i
  1995. xsmuldp vs6, vs0, alpha_i // real*alpha_i
  1996. xsmuldp vs7, vs1, alpha_r // imag*alpha_r
  1997. xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
  1998. xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
  1999. xxpermdi vs15, vs2, vs3, 0 // merge real and imag part
  2000. #ifndef TRMMKERNEL
  2001. xvadddp vs8, vs8, vs16
  2002. xvadddp vs9, vs9, vs17
  2003. xvadddp vs10, vs10, vs18
  2004. xvadddp vs11, vs11, vs19
  2005. xvadddp vs12, vs12, vs20
  2006. xvadddp vs13, vs13, vs21
  2007. xvadddp vs14, vs14, vs22
  2008. xvadddp vs15, vs15, vs23
  2009. #endif
  2010. stxvd2x vs8, o0, T1
  2011. stxvd2x vs9, o16, T1
  2012. stxvd2x vs10, o32, T1
  2013. stxvd2x vs11, o48, T1
  2014. stxvd2x vs12, o0, T2
  2015. stxvd2x vs13, o16, T2
  2016. stxvd2x vs14, o32, T2
  2017. stxvd2x vs15, o48, T2
  2018. add T1, T1, LDC
  2019. add T2, T2, LDC
  2020. addi CO, CO, 128
  2021. #if defined(_AIX)
  2022. ')
  2023. #else
  2024. .endm
  2025. #endif
  2026. /**********************************************************************************************
  2027. * Macros for N=1 and M=4
  2028. **********************************************************************************************/
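// Four complex elements of A (64 bytes) per iteration; accumulators vs32-vs39.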
  2029. #if defined(_AIX)
  2030. define(`LOAD1x4_1', `
  2031. #else
  2032. .macro LOAD1x4_1
  2033. #endif
  2034. lxvd2x vs16, o0, BO // load real part from B
  2035. lxvd2x vs17, o16, BO // load imag part from B
  2036. addi BO, BO, 32
  2037. lxvd2x vs0, o0, AO // load real,imag from A
  2038. lxvd2x vs1, o16, AO // load real,imag from A
  2039. lxvd2x vs2, o32, AO // load real,imag from A
  2040. lxvd2x vs3, o48, AO // load real,imag from A
  2041. addi AO, AO, 64
  2042. #if defined(_AIX)
  2043. ')
  2044. #else
  2045. .endm
  2046. #endif
  2047. #if defined(_AIX)
  2048. define(`KERNEL1x4_I1', `
  2049. #else
  2050. .macro KERNEL1x4_I1
  2051. #endif
  2052. lxvd2x vs8, o0, AO // load real,imag from A
  2053. lxvd2x vs9, o16, AO // load real,imag from A
  2054. lxvd2x vs10, o32, AO // load real,imag from A
  2055. lxvd2x vs11, o48, AO // load real,imag from A
  2056. addi AO, AO, 64
  2057. lxvd2x vs20, o0, BO // load real part from B
  2058. lxvd2x vs21, o16, BO // load imag part from B
  2059. addi BO, BO, 32
  2060. xvmuldp vs32, vs0, vs16 // real*real, imag*real
  2061. xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
  2062. xvmuldp vs34, vs1, vs16 // real*real, imag*real
  2063. xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
  2064. xvmuldp vs36, vs2, vs16 // real*real, imag*real
  2065. xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
  2066. xvmuldp vs38, vs3, vs16 // real*real, imag*real
  2067. xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
  2068. #if defined(_AIX)
  2069. ')
  2070. #else
  2071. .endm
  2072. #endif
  2073. #if defined(_AIX)
  2074. define(`KERNEL1x4_1', `
  2075. #else
  2076. .macro KERNEL1x4_1
  2077. #endif
  2078. lxvd2x vs8, o0, AO // load real,imag from A
  2079. lxvd2x vs9, o16, AO // load real,imag from A
  2080. lxvd2x vs10, o32, AO // load real,imag from A
  2081. lxvd2x vs11, o48, AO // load real,imag from A
  2082. addi AO, AO, 64
  2083. lxvd2x vs20, o0, BO // load real part from B
  2084. lxvd2x vs21, o16, BO // load imag part from B
  2085. addi BO, BO, 32
  2086. xvmaddadp vs32, vs0, vs16 // real*real, imag*real
  2087. xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
  2088. xvmaddadp vs34, vs1, vs16 // real*real, imag*real
  2089. xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
  2090. xvmaddadp vs36, vs2, vs16 // real*real, imag*real
  2091. xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
  2092. xvmaddadp vs38, vs3, vs16 // real*real, imag*real
  2093. xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
  2094. #if defined(_AIX)
  2095. ')
  2096. #else
  2097. .endm
  2098. #endif
  2099. #if defined(_AIX)
  2100. define(`KERNEL1x4_2', `
  2101. #else
  2102. .macro KERNEL1x4_2
  2103. #endif
  2104. lxvd2x vs0, o0, AO // load real,imag from A
  2105. lxvd2x vs1, o16, AO // load real,imag from A
  2106. lxvd2x vs2, o32, AO // load real,imag from A
  2107. lxvd2x vs3, o48, AO // load real,imag from A
  2108. addi AO, AO, 64
  2109. lxvd2x vs16, o0, BO // load real part from B
  2110. lxvd2x vs17, o16, BO // load imag part from B
  2111. addi BO, BO, 32
  2112. xvmaddadp vs32, vs8, vs20 // real*real, imag*real
  2113. xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
  2114. xvmaddadp vs34, vs9, vs20 // real*real, imag*real
  2115. xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
  2116. xvmaddadp vs36, vs10, vs20 // real*real, imag*real
  2117. xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
  2118. xvmaddadp vs38, vs11, vs20 // real*real, imag*real
  2119. xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
  2120. #if defined(_AIX)
  2121. ')
  2122. #else
  2123. .endm
  2124. #endif
  2125. #if defined(_AIX)
  2126. define(`KERNEL1x4_E2', `
  2127. #else
  2128. .macro KERNEL1x4_E2
  2129. #endif
  2130. xvmaddadp vs32, vs8, vs20 // real*real, imag*real
  2131. xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
  2132. xvmaddadp vs34, vs9, vs20 // real*real, imag*real
  2133. xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
  2134. xvmaddadp vs36, vs10, vs20 // real*real, imag*real
  2135. xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
  2136. xvmaddadp vs38, vs11, vs20 // real*real, imag*real
  2137. xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
  2138. #if defined(_AIX)
  2139. ')
  2140. #else
  2141. .endm
  2142. #endif
  2143. #if defined(_AIX)
  2144. define(`KERNEL1x4_SUBI1', `
  2145. #else
  2146. .macro KERNEL1x4_SUBI1
  2147. #endif
  2148. lxvd2x vs0, o0, AO // load real,imag from A
  2149. lxvd2x vs1, o16, AO // load real,imag from A
  2150. lxvd2x vs2, o32, AO // load real,imag from A
  2151. lxvd2x vs3, o48, AO // load real,imag from A
  2152. addi AO, AO, 64
  2153. lxvd2x vs16, o0, BO // load real part from B
  2154. lxvd2x vs17, o16, BO // load imag part from B
  2155. addi BO, BO, 32
  2156. xvmuldp vs32, vs0, vs16 // real*real, imag*real
  2157. xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
  2158. xvmuldp vs34, vs1, vs16 // real*real, imag*real
  2159. xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
  2160. xvmuldp vs36, vs2, vs16 // real*real, imag*real
  2161. xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
  2162. xvmuldp vs38, vs3, vs16 // real*real, imag*real
  2163. xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
  2164. #if defined(_AIX)
  2165. ')
  2166. #else
  2167. .endm
  2168. #endif
  2169. #if defined(_AIX)
  2170. define(`KERNEL1x4_SUB1', `
  2171. #else
  2172. .macro KERNEL1x4_SUB1
  2173. #endif
  2174. lxvd2x vs0, o0, AO // load real,imag from A
  2175. lxvd2x vs1, o16, AO // load real,imag from A
  2176. lxvd2x vs2, o32, AO // load real,imag from A
  2177. lxvd2x vs3, o48, AO // load real,imag from A
  2178. addi AO, AO, 64
  2179. lxvd2x vs16, o0, BO // load real part from B
  2180. lxvd2x vs17, o16, BO // load imag part from B
  2181. addi BO, BO, 32
  2182. xvmaddadp vs32, vs0, vs16 // real*real, imag*real
  2183. xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
  2184. xvmaddadp vs34, vs1, vs16 // real*real, imag*real
  2185. xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
  2186. xvmaddadp vs36, vs2, vs16 // real*real, imag*real
  2187. xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
  2188. xvmaddadp vs38, vs3, vs16 // real*real, imag*real
  2189. xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
  2190. #if defined(_AIX)
  2191. ')
  2192. #else
  2193. .endm
  2194. #endif
#if defined(_AIX)
define(`SAVE1x4', `
#else
.macro SAVE1x4
#endif
mr T1, CO
#ifndef TRMMKERNEL
lxvd2x vs16, o0, T1
lxvd2x vs17, o16, T1
lxvd2x vs18, o32, T1
lxvd2x vs19, o48, T1
#endif
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs32 // realA*realB
XSFADD_R2 vs0, vs0, vs33 // imagA*imagB
XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs32 // realA*imagB
XSFADD_I2 vs1, vs1, vs33 // imagA*realB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs35,vs35) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs34 // realA*realB
XSFADD_R2 vs0, vs0, vs35 // imagA*imagB
XXSWAPD(vs34,vs34) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs35,vs35) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs34 // realA*imagB
XSFADD_I2 vs1, vs1, vs35 // imagA*realB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs37,vs37) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs36 // realA*realB
XSFADD_R2 vs0, vs0, vs37 // imagA*imagB
XXSWAPD(vs36,vs36) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs37,vs37) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs36 // realA*imagB
XSFADD_I2 vs1, vs1, vs37 // imagA*realB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs10, vs2, vs3, 0 // merge real and imag part
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs39,vs39) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs38 // realA*realB
XSFADD_R2 vs0, vs0, vs39 // imagA*imagB
XXSWAPD(vs38,vs38) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs39,vs39) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs38 // realA*imagB
XSFADD_I2 vs1, vs1, vs39 // imagA*realB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs11, vs2, vs3, 0 // merge real and imag part
#ifndef TRMMKERNEL
xvadddp vs8, vs8, vs16
xvadddp vs9, vs9, vs17
xvadddp vs10, vs10, vs18
xvadddp vs11, vs11, vs19
#endif
stxvd2x vs8, o0, T1
stxvd2x vs9, o16, T1
stxvd2x vs10, o32, T1
stxvd2x vs11, o48, T1
add T1, T1, LDC
addi CO, CO, 64
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=1 and M=2
**********************************************************************************************/
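/* The LOAD1x2_1 / KERNEL1x2_* macros below process one B column against two
 * complex A elements.  _I1 starts the accumulation with xvmuldp, _1 and _2
 * alternate between the vs0/vs16 and vs8/vs20 register sets so the loads of
 * one set overlap the multiply-adds of the other, _E2 drains the second set,
 * and _SUBI1/_SUB1 handle leftover iterations one step at a time.
 */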
#if defined(_AIX)
define(`LOAD1x2_1', `
#else
.macro LOAD1x2_1
#endif
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
addi BO, BO, 32
lxvd2x vs0, o0, AO // load real,imag from A
lxvd2x vs1, o16, AO // load real,imag from A
addi AO, AO, 32
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x2_I1', `
#else
.macro KERNEL1x2_I1
#endif
lxvd2x vs8, o0, AO // load real,imag from A
lxvd2x vs9, o16, AO // load real,imag from A
addi AO, AO, 32
lxvd2x vs20, o0, BO // load real part from B
lxvd2x vs21, o16, BO // load imag part from B
addi BO, BO, 32
xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
xvmuldp vs34, vs1, vs16 // real*real, imag*real
xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x2_1', `
#else
.macro KERNEL1x2_1
#endif
lxvd2x vs8, o0, AO // load real,imag from A
lxvd2x vs9, o16, AO // load real,imag from A
addi AO, AO, 32
lxvd2x vs20, o0, BO // load real part from B
lxvd2x vs21, o16, BO // load imag part from B
addi BO, BO, 32
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
xvmaddadp vs34, vs1, vs16 // real*real, imag*real
xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x2_2', `
#else
.macro KERNEL1x2_2
#endif
lxvd2x vs0, o0, AO // load real,imag from A
lxvd2x vs1, o16, AO // load real,imag from A
addi AO, AO, 32
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
addi BO, BO, 32
xvmaddadp vs32, vs8, vs20 // real*real, imag*real
xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
xvmaddadp vs34, vs9, vs20 // real*real, imag*real
xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x2_E2', `
#else
.macro KERNEL1x2_E2
#endif
xvmaddadp vs32, vs8, vs20 // real*real, imag*real
xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
xvmaddadp vs34, vs9, vs20 // real*real, imag*real
xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x2_SUBI1', `
#else
.macro KERNEL1x2_SUBI1
#endif
lxvd2x vs0, o0, AO // load real,imag from A
lxvd2x vs1, o16, AO // load real,imag from A
addi AO, AO, 32
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
addi BO, BO, 32
xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
xvmuldp vs34, vs1, vs16 // real*real, imag*real
xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x2_SUB1', `
#else
.macro KERNEL1x2_SUB1
#endif
lxvd2x vs0, o0, AO // load real,imag from A
lxvd2x vs1, o16, AO // load real,imag from A
addi AO, AO, 32
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
addi BO, BO, 32
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
xvmaddadp vs34, vs1, vs16 // real*real, imag*real
xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
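/* SAVE1x2: reduce vs32-vs35 to two complex results, apply the complex
 * alpha scaling, add the existing C values unless TRMMKERNEL is defined,
 * and store 32 bytes at CO.
 */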
#if defined(_AIX)
define(`SAVE1x2', `
#else
.macro SAVE1x2
#endif
mr T1, CO
#ifndef TRMMKERNEL
lxvd2x vs16, o0, T1
lxvd2x vs17, o16, T1
#endif
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs32 // realA*realB
XSFADD_R2 vs0, vs0, vs33 // imagA*imagB
XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs32 // realA*imagB
XSFADD_I2 vs1, vs1, vs33 // imagA*realB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs35,vs35) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs34 // realA*realB
XSFADD_R2 vs0, vs0, vs35 // imagA*imagB
XXSWAPD(vs34,vs34) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs35,vs35) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs34 // realA*imagB
XSFADD_I2 vs1, vs1, vs35 // imagA*realB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
#ifndef TRMMKERNEL
xvadddp vs8, vs8, vs16
xvadddp vs9, vs9, vs17
#endif
stxvd2x vs8, o0, T1
stxvd2x vs9, o16, T1
add T1, T1, LDC
addi CO, CO, 32
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=1 and M=1
**********************************************************************************************/
#if defined(_AIX)
define(`LOAD1x1_1', `
#else
.macro LOAD1x1_1
#endif
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
addi BO, BO, 32
lxvd2x vs0, o0, AO // load real,imag from A
addi AO, AO, 16
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x1_I1', `
#else
.macro KERNEL1x1_I1
#endif
lxvd2x vs8, o0, AO // load real,imag from A
addi AO, AO, 16
lxvd2x vs20, o0, BO // load real part from B
lxvd2x vs21, o16, BO // load imag part from B
addi BO, BO, 32
xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x1_1', `
#else
.macro KERNEL1x1_1
#endif
lxvd2x vs8, o0, AO // load real,imag from A
addi AO, AO, 16
lxvd2x vs20, o0, BO // load real part from B
lxvd2x vs21, o16, BO // load imag part from B
addi BO, BO, 32
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x1_2', `
#else
.macro KERNEL1x1_2
#endif
lxvd2x vs0, o0, AO // load real,imag from A
addi AO, AO, 16
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
addi BO, BO, 32
xvmaddadp vs32, vs8, vs20 // real*real, imag*real
xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x1_E2', `
#else
.macro KERNEL1x1_E2
#endif
xvmaddadp vs32, vs8, vs20 // real*real, imag*real
xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x1_SUBI1', `
#else
.macro KERNEL1x1_SUBI1
#endif
lxvd2x vs0, o0, AO // load real,imag from A
addi AO, AO, 16
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
addi BO, BO, 32
xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x1_SUB1', `
#else
.macro KERNEL1x1_SUB1
#endif
lxvd2x vs0, o0, AO // load real,imag from A
addi AO, AO, 16
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
addi BO, BO, 32
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
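/* SAVE1x1: reduce vs32/vs33 to one complex result and scale it by alpha.
 * The scalar sequence is the usual complex multiply; illustratively,
 *   c_r = acc_r*alpha_r - acc_i*alpha_i
 *   c_i = acc_r*alpha_i + acc_i*alpha_r
 * where acc_r/acc_i are the reduced real/imag accumulators.  Without
 * TRMMKERNEL the old C value is added back before the store.
 */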
#if defined(_AIX)
define(`SAVE1x1', `
#else
.macro SAVE1x1
#endif
mr T1, CO
#ifndef TRMMKERNEL
lxvd2x vs16, o0, T1
#endif
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs32 // realA*realB
XSFADD_R2 vs0, vs0, vs33 // imagA*imagB
XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs32 // realA*imagB
XSFADD_I2 vs1, vs1, vs33 // imagA*realB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
#ifndef TRMMKERNEL
xvadddp vs8, vs8, vs16
#endif
stxvd2x vs8, o0, T1
add T1, T1, LDC
addi CO, CO, 16
#if defined(_AIX)
')
#else
.endm
#endif
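/* ZCOPYB_1x1: copy one complex element of B, broadcasting its real and
 * imaginary parts into separate vectors in the packed buffer at BBO.
 */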
#if defined(_AIX)
define(`ZCOPYB_1x1', `
#else
.macro ZCOPYB_1x1
#endif
lxvdsx vs4, o0, BO // b0_r
lxvdsx vs5, o8, BO // b0_i
addi BO, BO, 16
stxvd2x vs4, o0, BBO
stxvd2x vs5, o16, BBO
addi BBO, BBO, 32
#if defined(_AIX)
')
#else
.endm
#endif
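/* ZCOPYB_8x1: copy eight complex elements of B.  Each element is split with
 * XXSPLTD so its real part and its imaginary part each fill a whole vector,
 * doubling the data: 128 bytes are read from BO and 256 bytes written to BBO.
 */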
#if defined(_AIX)
define(`ZCOPYB_8x1', `
#else
.macro ZCOPYB_8x1
#endif
lxvd2x vs32, o0, BO
lxvd2x vs33, o16, BO
lxvd2x vs34, o32, BO
lxvd2x vs35, o48, BO
addi BO, BO, 64
lxvd2x vs36, o0, BO
lxvd2x vs37, o16, BO
lxvd2x vs38, o32, BO
lxvd2x vs39, o48, BO
addi BO, BO, 64
XXSPLTD(vs40,vs32,0)
XXSPLTD(vs41,vs32,1)
XXSPLTD(vs42,vs33,0)
XXSPLTD(vs43,vs33,1)
XXSPLTD(vs44,vs34,0)
XXSPLTD(vs45,vs34,1)
XXSPLTD(vs46,vs35,0)
XXSPLTD(vs47,vs35,1)
XXSPLTD(vs48,vs36,0)
XXSPLTD(vs49,vs36,1)
XXSPLTD(vs50,vs37,0)
XXSPLTD(vs51,vs37,1)
XXSPLTD(vs52,vs38,0)
XXSPLTD(vs53,vs38,1)
XXSPLTD(vs54,vs39,0)
XXSPLTD(vs55,vs39,1)
stxvd2x vs40, o0, BBO
stxvd2x vs41, o16, BBO
stxvd2x vs42, o32, BBO
stxvd2x vs43, o48, BBO
addi BBO, BBO, 64
stxvd2x vs44, o0, BBO
stxvd2x vs45, o16, BBO
stxvd2x vs46, o32, BBO
stxvd2x vs47, o48, BBO
addi BBO, BBO, 64
stxvd2x vs48, o0, BBO
stxvd2x vs49, o16, BBO
stxvd2x vs50, o32, BBO
stxvd2x vs51, o48, BBO
addi BBO, BBO, 64
stxvd2x vs52, o0, BBO
stxvd2x vs53, o16, BBO
stxvd2x vs54, o32, BBO
stxvd2x vs55, o48, BBO
addi BBO, BBO, 64
#if defined(_AIX)
')
#else
.endm
#endif