
sgemm_kernel_8x8_sandy.S 86 kB
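This is the OpenBLAS single-precision GEMM/TRMM micro-kernel for Sandy Bridge (AVX, no FMA): it computes an 8x8 tile of C from packed panels of A and B, scales the products by alpha, and accumulates into C with leading dimension ldc. As a reading aid, the plain-C sketch below models only the arithmetic of the 8x8 case; the function name, the packed-panel indexing, and the smoke test are illustrative assumptions, not part of the OpenBLAS sources.

/* Reference sketch (assumptions, not OpenBLAS code): the math done per 8x8 tile.
 * The real kernel works on packed A/B panels in AVX registers and has a TRMM
 * variant that skips the "+= C" accumulation; none of that is modeled here.  */
#include <stddef.h>
#include <stdio.h>

static void ref_sgemm_kernel_8x8(size_t k, float alpha,
                                 const float *A,      /* packed: 8 floats per k step */
                                 const float *B,      /* packed: 8 floats per k step */
                                 float *C, size_t ldc /* column-major 8x8 C tile     */)
{
    float acc[8][8] = {{0.0f}};           /* plays the role of ymm8..ymm15 */
    for (size_t p = 0; p < k; ++p)        /* the (4x unrolled) k loop      */
        for (size_t j = 0; j < 8; ++j)
            for (size_t i = 0; i < 8; ++i)
                acc[j][i] += A[8*p + i] * B[8*p + j];

    for (size_t j = 0; j < 8; ++j)        /* alpha scaling and C update    */
        for (size_t i = 0; i < 8; ++i)
            C[j*ldc + i] += alpha * acc[j][i];
}

int main(void)
{
    /* tiny smoke test: k = 1, alpha = 2, A = B = 1..8, C starts at zero */
    float A[8], B[8], C[8 * 8] = {0.0f};
    for (int i = 0; i < 8; ++i) { A[i] = (float)(i + 1); B[i] = (float)(i + 1); }
    ref_sgemm_kernel_8x8(1, 2.0f, A, B, C, 8);
    printf("C[0][0] = %g, C[7][7] = %g\n", C[0], C[7 * 8 + 7]);  /* 2 and 128 */
    return 0;
}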

  1. /*****************************************************************************
  2. Copyright (c) 2011-2014, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written
  16. permission.
  17. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  18. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  19. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  20. ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  21. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  22. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  23. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  24. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  25. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  26. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  27. **********************************************************************************/
  28. #define ASSEMBLER
  29. #include "common.h"
  30. #define old_bm %rdi
  31. #define old_bn %rsi
  32. #define old_bk %rdx
  33. #define bm %r13
  34. #define bn %r14
  35. #define bk %r15
  36. #define ALPHA %xmm0
  37. #define ba %rcx
  38. #define bb %r8
  39. #define C %r9
  40. #define ldc %r10
  41. #define i %r11
  42. #define k %rax
  43. #define ptrba %rdi
  44. #define ptrbb %rsi
  45. #define C0 %rbx
  46. #define C1 %rbp
  47. #define prebb %r12
  48. #ifndef WINDOWS_ABI
  49. #define STACKSIZE 128
  50. #define old_ldc 8+STACKSIZE(%rsp)
  51. #define old_offset 16+STACKSIZE(%rsp)
  52. #define MEMALPHA 48(%rsp)
  53. #define j 56(%rsp)
  54. #define OFFSET 64(%rsp)
  55. #define kk 72(%rsp)
  56. #define kkk 80(%rsp)
  57. #else
  58. #define STACKSIZE 512
  59. #define OLD_A 40 + STACKSIZE(%rsp)
  60. #define OLD_B 48 + STACKSIZE(%rsp)
  61. #define OLD_C 56 + STACKSIZE(%rsp)
  62. #define old_ldc 64 + STACKSIZE(%rsp)
  63. #define old_offset 72 + STACKSIZE(%rsp)
  64. #define MEMALPHA 224(%rsp)
  65. #define j 232(%rsp)
  66. #define OFFSET 240(%rsp)
  67. #define kk 248(%rsp)
  68. #define kkk 256(%rsp)
  69. #endif
  70. #define PREFETCH0 prefetcht0
  71. #define PREFETCH1 prefetcht0
  72. #define PREFETCH2 prefetcht2
  73. #define PRESIZE 80
  74. #define xvec0 %xmm0
  75. #define xvec1 %xmm1
  76. #define xvec2 %xmm2
  77. #define xvec3 %xmm3
  78. #define xvec4 %xmm4
  79. #define xvec5 %xmm5
  80. #define xvec6 %xmm6
  81. #define xvec7 %xmm7
  82. #define xvec8 %xmm8
  83. #define xvec9 %xmm9
  84. #define xvec10 %xmm10
  85. #define xvec11 %xmm11
  86. #define xvec12 %xmm12
  87. #define xvec13 %xmm13
  88. #define xvec14 %xmm14
  89. #define xvec15 %xmm15
  90. #define yvec0 %ymm0
  91. #define yvec1 %ymm1
  92. #define yvec2 %ymm2
  93. #define yvec3 %ymm3
  94. #define yvec4 %ymm4
  95. #define yvec5 %ymm5
  96. #define yvec6 %ymm6
  97. #define yvec7 %ymm7
  98. #define yvec8 %ymm8
  99. #define yvec9 %ymm9
  100. #define yvec10 %ymm10
  101. #define yvec11 %ymm11
  102. #define yvec12 %ymm12
  103. #define yvec13 %ymm13
  104. #define yvec14 %ymm14
  105. #define yvec15 %ymm15
  106. #define LEAQ leaq
  107. #define ADDQ addq
  108. #define MULQ imulq
  109. #define SARQ sarq
  110. #define SALQ salq
  111. #define ANDQ andq
  112. #define SUBQ subq
  113. #define DECQ decq
  114. #define JG jg
  115. #define JLE jle
  116. #define TEST testq
  117. #define OR orq
  118. #define JNE jne
  119. #define JMP jmp
  120. #define NOP
  121. #define XOR xorpd
  122. #undef MOVQ
  123. #define MOVQ movq
  124. #define XOR_SY vxorps
  125. #define XOR_SX vxorps
  126. #define LD_SY vmovaps
  127. #define LD_SX vmovaps
  128. #define LDL_SX vmovlps
  129. #define LDL_SY vmovlps
  130. #define LDH_SX vmovhps
  131. #define LDH_SY vmovhps
  132. #define ST_SY vmovaps
  133. #define ST_SX vmovaps
  134. #define STL_SX vmovlps
  135. #define STL_SY vmovlps
  136. #define STH_SX vmovhps
  137. #define STH_SY vmovhps
  138. #define EDUP_SY vmovsldup
  139. #define ODUP_SY vmovshdup
  140. #define EDUP_SX vmovsldup
  141. #define ODUP_SX vmovshdup
  142. #define ADD_SY vaddps
  143. #define ADD_SX vaddps
  144. #define ADD1_DY vaddpd
  145. #define ADDSUB_SY vaddsubps
  146. #define MUL_SY vmulps
  147. #define MUL_SX vmulps
  148. #define SHUF_SY vperm2f128
  149. #define SHUF_DY vperm2f128
  150. #define SHUF_SX vpshufd
  151. #define VPERMILP_SY vpermilps
  152. #define VPERMILP_SX vpermilps
  153. #define BROAD_SY vbroadcastss
  154. #define BROAD_SX vbroadcastss
  155. #define MOV_SY vmovaps
  156. #define MOV_SX vmovaps
  157. #define REVS_SY vshufps
  158. #define REVS_SX vshufps
  159. #define EXTRA_SY vextractf128
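# Reading note (annotation, not from the original source): the *_SY macros
# above are the single-precision ops used on the 256-bit ymm registers and the
# *_SX macros the same ops on 128-bit xmm registers; EDUP/ODUP duplicate the
# even/odd packed lanes of B (vmovsldup/vmovshdup). In the 8x8 block below,
# yvec8..yvec15 hold the C accumulators, yvec0/yvec1 hold packed A, and
# yvec2..yvec7 hold B duplicates/shuffles and intermediate products.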
  160. PROLOGUE
  161. subq $STACKSIZE, %rsp;
  162. movq %rbx, 0(%rsp);
  163. movq %rbp, 8(%rsp);
  164. movq %r12, 16(%rsp);
  165. movq %r13, 24(%rsp);
  166. movq %r14, 32(%rsp);
  167. movq %r15, 40(%rsp);
  168. #ifdef WINDOWS_ABI
  169. movq %rdi, 48(%rsp)
  170. movq %rsi, 56(%rsp)
  171. movups %xmm6, 64(%rsp)
  172. movups %xmm7, 80(%rsp)
  173. movups %xmm8, 96(%rsp)
  174. movups %xmm9, 112(%rsp)
  175. movups %xmm10, 128(%rsp)
  176. movups %xmm11, 144(%rsp)
  177. movups %xmm12, 160(%rsp)
  178. movups %xmm13, 176(%rsp)
  179. movups %xmm14, 192(%rsp)
  180. movups %xmm15, 208(%rsp)
  181. movq ARG1, old_bm
  182. movq ARG2, old_bn
  183. movq ARG3, old_bk
  184. movq OLD_A, ba
  185. movq OLD_B, bb
  186. movq OLD_C, C
  187. movq old_ldc, ldc
  188. #ifdef TRMMKERNEL
  189. movq old_offset, %r11
  190. #endif
  191. movaps %xmm3, %xmm0
  192. #else
  193. movq old_ldc, ldc
  194. #ifdef TRMMKERNEL
  195. movq old_offset, %r11
  196. #endif
  197. #endif
  198. vzeroupper
  199. vmovlps ALPHA, MEMALPHA
  200. movq old_bm, bm
  201. movq old_bn, bn
  202. movq old_bk, bk
  203. leaq (, ldc, SIZE), ldc
  204. #ifdef TRMMKERNEL
  205. movq %r11, OFFSET
  206. #ifndef LEFT
  207. negq %r11;
  208. #endif
  209. movq %r11, kk
  210. #endif
  211. MOVQ bn,j;
  212. SARQ $3,j;
  213. JLE .L0_loopE;
  214. ALIGN_4;
  215. .L0_bodyB:;
  216. #if defined(TRMMKERNEL) && defined(LEFT)
  217. MOVQ OFFSET, %rax;
  218. MOVQ %rax, kk;
  219. #endif
  220. MOVQ C,C0;
  221. LEAQ (C,ldc,4),C1;
  222. MOVQ bk, k;
  223. SALQ $5, k;
  224. LEAQ (bb, k, 1), prebb;
  225. MOVQ ba,ptrba;
  226. MOVQ bm,i;
  227. SARQ $3,i;
  228. JLE .L1_loopE;
  229. ALIGN_4;
  230. .L1_bodyB:;
  231. #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  232. MOVQ bb, ptrbb;
  233. #else
  234. MOVQ bb, ptrbb;
  235. MOVQ kk, %rax;
  236. LEAQ (, %rax, SIZE), %rax;
  237. LEAQ (ptrba, %rax, 8), ptrba;
  238. LEAQ (ptrbb, %rax, 8), ptrbb;
  239. #endif
  240. #### Initial Results Register ####
  241. XOR_SY yvec15, yvec15, yvec15;
  242. PREFETCH0 0*SIZE(prebb);
  243. XOR_SY yvec14, yvec14, yvec14;
  244. PREFETCH0 16*SIZE(prebb);
  245. XOR_SY yvec13, yvec13, yvec13;
  246. PREFETCH0 32*SIZE(prebb);
  247. XOR_SY yvec12, yvec12, yvec12;
  248. ADDQ $48*SIZE, prebb;
  249. EDUP_SY 0*SIZE(ptrbb), yvec2;
  250. LEAQ (ldc, ldc, 2), %rax;
  251. PREFETCH2 7*SIZE(C0);
  252. PREFETCH2 7*SIZE(C1);
  253. XOR_SY yvec11, yvec11, yvec11;
  254. XOR_SY yvec10, yvec10, yvec10;
  255. PREFETCH2 7*SIZE(C0, ldc, 1);
  256. PREFETCH2 7*SIZE(C1, ldc, 1);
  257. LD_SY 0*SIZE(ptrba), yvec0;
  258. XOR_SY yvec9, yvec9, yvec9;
  259. PREFETCH2 7*SIZE(C0, ldc, 2);
  260. PREFETCH2 7*SIZE(C1, ldc, 2);
  261. XOR_SY yvec8, yvec8, yvec8;
  262. VPERMILP_SY $0x4e, yvec2, yvec3;
  263. PREFETCH2 7*SIZE(C0, %rax, 1);
  264. PREFETCH2 7*SIZE(C1, %rax, 1);
  265. #ifndef TRMMKERNEL
  266. MOVQ bk,k;
  267. #elif (defined(LEFT) && !defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
  268. MOVQ bk, %rax;
  269. SUBQ kk, %rax;
  270. MOVQ %rax, kkk;
  271. #else
  272. MOVQ kk, %rax;
  273. #ifdef LEFT
  274. ADDQ $8, %rax;
  275. #else
  276. ADDQ $8, %rax;
  277. #endif
  278. MOVQ %rax, kkk;
  279. #endif
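# Reading note: k (aliased to %rax above) now holds the number of K iterations
# for this block: the full bk in the plain GEMM build, or a TRMM-adjusted count
# that is also kept in kkk, which is why the k-remainder tests below check kkk
# instead of bk when TRMMKERNEL is defined.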
  280. SARQ $2,k;
  281. JLE .L2_loopE;
  282. ALIGN_4;
  283. .L2_bodyB:;
  284. # Computing kernel
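# Reading note: each "Unroll times N" block below is one k step of the 8x8
# tile. yvec0/yvec1 carry 8 packed floats of A; EDUP/ODUP plus the
# vperm2f128/vpermilps shuffles produce the needed arrangements of 8 packed
# floats of B; eight vmulps/vaddps pairs then update the accumulators
# yvec8..yvec15 (no FMA on Sandy Bridge). The main loop does 4 such steps per
# iteration, and the .L3/.L4 blocks mop up the remaining k mod 4 steps.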
  285. #### Unroll times 1 ####
  286. PREFETCH0 PRESIZE*SIZE(ptrba);
  287. MUL_SY yvec0, yvec2, yvec6;
  288. SHUF_SY $0x03, yvec2, yvec2, yvec4;
  289. ODUP_SY 0*SIZE(ptrbb), yvec2
  290. MUL_SY yvec0, yvec3, yvec7;
  291. SHUF_SY $0x03, yvec3, yvec3, yvec5
  292. ADD_SY yvec15, yvec6, yvec15
  293. ADD_SY yvec13, yvec7, yvec13;
  294. LD_SY 8*SIZE(ptrba), yvec1;
  295. VPERMILP_SY $0x4e, yvec2, yvec3;
  296. MUL_SY yvec0, yvec4, yvec6;
  297. MUL_SY yvec0, yvec5, yvec7;
  298. ADD_SY yvec11, yvec6, yvec11;
  299. ADD_SY yvec9, yvec7, yvec9;
  300. MUL_SY yvec0, yvec2, yvec6;
  301. SHUF_SY $0x03, yvec2, yvec2, yvec4;
  302. EDUP_SY 8*SIZE(ptrbb), yvec2;
  303. MUL_SY yvec0, yvec3, yvec7;
  304. SHUF_SY $0x03, yvec3, yvec3, yvec5;
  305. ADD_SY yvec14, yvec6, yvec14;
  306. ADD_SY yvec12, yvec7, yvec12;
  307. VPERMILP_SY $0x4e, yvec2, yvec3;
  308. MUL_SY yvec0, yvec4, yvec6;
  309. MUL_SY yvec0, yvec5, yvec7;
  310. ADD_SY yvec10, yvec6, yvec10;
  311. ADD_SY yvec8, yvec7, yvec8;
  312. #### Unroll times 2 ####
  313. MUL_SY yvec1, yvec2, yvec6;
  314. SHUF_SY $0x03, yvec2, yvec2, yvec4;
  315. ODUP_SY 8*SIZE(ptrbb), yvec2
  316. MUL_SY yvec1, yvec3, yvec7;
  317. SHUF_SY $0x03, yvec3, yvec3, yvec5
  318. ADD_SY yvec15, yvec6, yvec15
  319. ADD_SY yvec13, yvec7, yvec13;
  320. LD_SY 16*SIZE(ptrba), yvec0;
  321. VPERMILP_SY $0x4e, yvec2, yvec3;
  322. MUL_SY yvec1, yvec4, yvec6;
  323. MUL_SY yvec1, yvec5, yvec7;
  324. ADD_SY yvec11, yvec6, yvec11;
  325. ADD_SY yvec9, yvec7, yvec9;
  326. MUL_SY yvec1, yvec2, yvec6;
  327. SHUF_SY $0x03, yvec2, yvec2, yvec4;
  328. EDUP_SY 16*SIZE(ptrbb), yvec2;
  329. MUL_SY yvec1, yvec3, yvec7;
  330. SHUF_SY $0x03, yvec3, yvec3, yvec5;
  331. ADD_SY yvec14, yvec6, yvec14;
  332. ADD_SY yvec12, yvec7, yvec12;
  333. VPERMILP_SY $0x4e, yvec2, yvec3;
  334. MUL_SY yvec1, yvec4, yvec6;
  335. MUL_SY yvec1, yvec5, yvec7;
  336. ADD_SY yvec10, yvec6, yvec10;
  337. ADD_SY yvec8, yvec7, yvec8;
  338. #### Unroll times 3 ####
  339. PREFETCH0 (PRESIZE+16)*SIZE(ptrba);
  340. MUL_SY yvec0, yvec2, yvec6;
  341. SHUF_SY $0x03, yvec2, yvec2, yvec4;
  342. ODUP_SY 16*SIZE(ptrbb), yvec2
  343. MUL_SY yvec0, yvec3, yvec7;
  344. SHUF_SY $0x03, yvec3, yvec3, yvec5
  345. ADD_SY yvec15, yvec6, yvec15
  346. ADD_SY yvec13, yvec7, yvec13;
  347. LD_SY 24*SIZE(ptrba), yvec1;
  348. VPERMILP_SY $0x4e, yvec2, yvec3;
  349. MUL_SY yvec0, yvec4, yvec6;
  350. MUL_SY yvec0, yvec5, yvec7;
  351. ADD_SY yvec11, yvec6, yvec11;
  352. ADD_SY yvec9, yvec7, yvec9;
  353. ADDQ $32*SIZE, ptrba;
  354. MUL_SY yvec0, yvec2, yvec6;
  355. SHUF_SY $0x03, yvec2, yvec2, yvec4;
  356. MUL_SY yvec0, yvec3, yvec7;
  357. SHUF_SY $0x03, yvec3, yvec3, yvec5;
  358. EDUP_SY 24*SIZE(ptrbb), yvec2;
  359. ADD_SY yvec14, yvec6, yvec14;
  360. ADD_SY yvec12, yvec7, yvec12;
  361. VPERMILP_SY $0x4e, yvec2, yvec3;
  362. MUL_SY yvec0, yvec4, yvec6;
  363. MUL_SY yvec0, yvec5, yvec7;
  364. ADD_SY yvec10, yvec6, yvec10;
  365. ADD_SY yvec8, yvec7, yvec8;
  366. #### Unroll times 4 ####
  367. MUL_SY yvec1, yvec2, yvec6;
  368. SHUF_SY $0x03, yvec2, yvec2, yvec4;
  369. ODUP_SY 24*SIZE(ptrbb), yvec2
  370. MUL_SY yvec1, yvec3, yvec7;
  371. SHUF_SY $0x03, yvec3, yvec3, yvec5
  372. ADDQ $32*SIZE, ptrbb;
  373. ADD_SY yvec15, yvec6, yvec15
  374. ADD_SY yvec13, yvec7, yvec13;
  375. LD_SY 0*SIZE(ptrba), yvec0;
  376. VPERMILP_SY $0x4e, yvec2, yvec3;
  377. MUL_SY yvec1, yvec4, yvec6;
  378. MUL_SY yvec1, yvec5, yvec7;
  379. ADD_SY yvec11, yvec6, yvec11;
  380. ADD_SY yvec9, yvec7, yvec9;
  381. MUL_SY yvec1, yvec2, yvec6;
  382. SHUF_SY $0x03, yvec2, yvec2, yvec4;
  383. EDUP_SY 0*SIZE(ptrbb), yvec2;
  384. MUL_SY yvec1, yvec3, yvec7;
  385. SHUF_SY $0x03, yvec3, yvec3, yvec5;
  386. ADD_SY yvec14, yvec6, yvec14;
  387. ADD_SY yvec12, yvec7, yvec12;
  388. VPERMILP_SY $0x4e, yvec2, yvec3;
  389. MUL_SY yvec1, yvec4, yvec6;
  390. MUL_SY yvec1, yvec5, yvec7;
  391. ADD_SY yvec10, yvec6, yvec10;
  392. ADD_SY yvec8, yvec7, yvec8;
  393. .L2_bodyE:;
  394. DECQ k;
  395. JG .L2_bodyB;
  396. ALIGN_4
  397. .L2_loopE:;
  398. #ifndef TRMMKERNEL
  399. TEST $2, bk;
  400. #else
  401. TEST $2, kkk;
  402. #endif
  403. JLE .L3_loopE;
  404. ALIGN_4
  405. .L3_loopB:
  406. #### Unroll times 1 ####
  407. MUL_SY yvec0, yvec2, yvec6;
  408. SHUF_SY $0x03, yvec2, yvec2, yvec4;
  409. ODUP_SY 0*SIZE(ptrbb), yvec2
  410. MUL_SY yvec0, yvec3, yvec7;
  411. SHUF_SY $0x03, yvec3, yvec3, yvec5
  412. ADD_SY yvec15, yvec6, yvec15
  413. ADD_SY yvec13, yvec7, yvec13;
  414. LD_SY 8*SIZE(ptrba), yvec1;
  415. VPERMILP_SY $0x4e, yvec2, yvec3;
  416. MUL_SY yvec0, yvec4, yvec6;
  417. MUL_SY yvec0, yvec5, yvec7;
  418. ADDQ $16*SIZE, ptrba;
  419. ADD_SY yvec11, yvec6, yvec11;
  420. ADD_SY yvec9, yvec7, yvec9;
  421. MUL_SY yvec0, yvec2, yvec6;
  422. SHUF_SY $0x03, yvec2, yvec2, yvec4;
  423. EDUP_SY 8*SIZE(ptrbb), yvec2;
  424. MUL_SY yvec0, yvec3, yvec7;
  425. SHUF_SY $0x03, yvec3, yvec3, yvec5;
  426. ADD_SY yvec14, yvec6, yvec14;
  427. ADD_SY yvec12, yvec7, yvec12;
  428. VPERMILP_SY $0x4e, yvec2, yvec3;
  429. MUL_SY yvec0, yvec4, yvec6;
  430. MUL_SY yvec0, yvec5, yvec7;
  431. ADD_SY yvec10, yvec6, yvec10;
  432. ADD_SY yvec8, yvec7, yvec8;
  433. #### Unroll times 2 ####
  434. MUL_SY yvec1, yvec2, yvec6;
  435. SHUF_SY $0x03, yvec2, yvec2, yvec4;
  436. ODUP_SY 8*SIZE(ptrbb), yvec2
  437. MUL_SY yvec1, yvec3, yvec7;
  438. SHUF_SY $0x03, yvec3, yvec3, yvec5
  439. ADDQ $16*SIZE, ptrbb
  440. ADD_SY yvec15, yvec6, yvec15
  441. ADD_SY yvec13, yvec7, yvec13;
  442. LD_SY 0*SIZE(ptrba), yvec0;
  443. VPERMILP_SY $0x4e, yvec2, yvec3;
  444. MUL_SY yvec1, yvec4, yvec6;
  445. MUL_SY yvec1, yvec5, yvec7;
  446. ADD_SY yvec11, yvec6, yvec11;
  447. ADD_SY yvec9, yvec7, yvec9;
  448. MUL_SY yvec1, yvec2, yvec6;
  449. SHUF_SY $0x03, yvec2, yvec2, yvec4;
  450. EDUP_SY 0*SIZE(ptrbb), yvec2;
  451. MUL_SY yvec1, yvec3, yvec7;
  452. SHUF_SY $0x03, yvec3, yvec3, yvec5;
  453. ADD_SY yvec14, yvec6, yvec14;
  454. ADD_SY yvec12, yvec7, yvec12;
  455. VPERMILP_SY $0x4e, yvec2, yvec3;
  456. MUL_SY yvec1, yvec4, yvec6;
  457. MUL_SY yvec1, yvec5, yvec7;
  458. ADD_SY yvec10, yvec6, yvec10;
  459. ADD_SY yvec8, yvec7, yvec8;
  460. .L3_loopE:
  461. #ifndef TRMMKERNEL
  462. TEST $1, bk;
  463. #else
  464. TEST $1, kkk;
  465. #endif
  466. JLE .L4_loopE;
  467. ALIGN_4
  468. .L4_loopB:;
  469. #### Unroll times 1 ####
  470. MUL_SY yvec0, yvec2, yvec6;
  471. SHUF_SY $0x03, yvec2, yvec2, yvec4;
  472. ODUP_SY 0*SIZE(ptrbb), yvec2
  473. MUL_SY yvec0, yvec3, yvec7;
  474. SHUF_SY $0x03, yvec3, yvec3, yvec5
  475. ADDQ $8*SIZE, ptrba;
  476. ADD_SY yvec15, yvec6, yvec15
  477. ADD_SY yvec13, yvec7, yvec13;
  478. VPERMILP_SY $0x4e, yvec2, yvec3;
  479. MUL_SY yvec0, yvec4, yvec6;
  480. MUL_SY yvec0, yvec5, yvec7;
  481. ADDQ $8*SIZE, ptrbb;
  482. ADD_SY yvec11, yvec6, yvec11;
  483. ADD_SY yvec9, yvec7, yvec9;
  484. MUL_SY yvec0, yvec2, yvec6;
  485. SHUF_SY $0x03, yvec2, yvec2, yvec4;
  486. MUL_SY yvec0, yvec3, yvec7;
  487. SHUF_SY $0x03, yvec3, yvec3, yvec5;
  488. ADD_SY yvec14, yvec6, yvec14;
  489. ADD_SY yvec12, yvec7, yvec12;
  490. MUL_SY yvec0, yvec4, yvec6;
  491. MUL_SY yvec0, yvec5, yvec7;
  492. ADD_SY yvec10, yvec6, yvec10;
  493. ADD_SY yvec8, yvec7, yvec8;
  494. .L4_loopE:;
  495. #### Load Alpha ####
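# Reading note: alpha was spilled to MEMALPHA in the prologue; it is broadcast
# into yvec7 here and all eight accumulators are scaled by it before being
# merged into C.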
  496. BROAD_SY MEMALPHA,yvec7;
  497. MUL_SY yvec7,yvec15,yvec15;
  498. MUL_SY yvec7,yvec14,yvec14;
  499. MUL_SY yvec7,yvec13,yvec13;
  500. MUL_SY yvec7,yvec12,yvec12;
  501. MUL_SY yvec7,yvec11,yvec11;
  502. MUL_SY yvec7,yvec10,yvec10;
  503. MUL_SY yvec7,yvec9,yvec9;
  504. MUL_SY yvec7,yvec8,yvec8;
  505. MOV_SY yvec15,yvec7;
  506. REVS_SY $0xe4,yvec13,yvec15,yvec15;
  507. REVS_SY $0xe4,yvec7,yvec13,yvec13;
  508. MOV_SY yvec14,yvec7;
  509. REVS_SY $0xe4,yvec12,yvec14,yvec14;
  510. REVS_SY $0xe4,yvec7,yvec12,yvec12;
  511. MOV_SY yvec11,yvec7;
  512. REVS_SY $0xe4,yvec9,yvec11,yvec11;
  513. REVS_SY $0xe4,yvec7,yvec9,yvec9;
  514. MOV_SY yvec10,yvec7;
  515. REVS_SY $0xe4,yvec8,yvec10,yvec10;
  516. REVS_SY $0xe4,yvec7,yvec8,yvec8;
  517. ##### Testing alignment #####
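# Reading note: C0 | ldc is tested against 15 to check that every column of
# the C tile is 16-byte aligned. If it is, the aligned vmovaps store path just
# below is taken; otherwise control goes to .L4_loopEx, which uses
# vmovlps/vmovhps half-register loads and stores instead.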
  518. MOVQ C0, %rax;
  519. OR ldc, %rax;
  520. TEST $15, %rax;
  521. JNE .L4_loopEx;
  522. ALIGN_4
  523. LEAQ (ldc,ldc,2),%rax;
  524. EXTRA_SY $1,yvec15,xvec7;
  525. EXTRA_SY $1,yvec14,xvec6;
  526. EXTRA_SY $1,yvec13,xvec5;
  527. EXTRA_SY $1,yvec12,xvec4;
  528. EXTRA_SY $1,yvec11,xvec3;
  529. EXTRA_SY $1,yvec10,xvec2;
  530. EXTRA_SY $1,yvec9,xvec1;
  531. EXTRA_SY $1,yvec8,xvec0;
  532. #ifndef TRMMKERNEL
  533. ADD_SY 0*SIZE(C0), xvec15, xvec15;
  534. ADD_SY 4*SIZE(C1), xvec7, xvec7;
  535. ADD_SY 0*SIZE(C0,ldc,1), xvec14, xvec14;
  536. ADD_SY 4*SIZE(C1,ldc,1), xvec6, xvec6;
  537. ADD_SY 0*SIZE(C0,ldc,2), xvec13, xvec13;
  538. ADD_SY 4*SIZE(C1,ldc,2), xvec5, xvec5;
  539. ADD_SY 0*SIZE(C0,%rax,1), xvec12, xvec12;
  540. ADD_SY 4*SIZE(C1,%rax,1), xvec4, xvec4;
  541. ADD_SY 0*SIZE(C1), xvec11, xvec11;
  542. ADD_SY 4*SIZE(C0), xvec3, xvec3;
  543. ADD_SY 0*SIZE(C1,ldc,1), xvec10, xvec10;
  544. ADD_SY 4*SIZE(C0,ldc,1), xvec2, xvec2;
  545. ADD_SY 0*SIZE(C1,ldc,2), xvec9, xvec9;
  546. ADD_SY 4*SIZE(C0,ldc,2), xvec1, xvec1;
  547. ADD_SY 0*SIZE(C1,%rax,1), xvec8, xvec8;
  548. ADD_SY 4*SIZE(C0,%rax,1), xvec0, xvec0;
  549. #endif
  550. ST_SY xvec15,0*SIZE(C0);
  551. ST_SY xvec7,4*SIZE(C1);
  552. ST_SY xvec14,0*SIZE(C0,ldc,1);
  553. ST_SY xvec6,4*SIZE(C1,ldc,1);
  554. ST_SY xvec13,0*SIZE(C0,ldc,2);
  555. ST_SY xvec5,4*SIZE(C1,ldc,2);
  556. ST_SY xvec12,0*SIZE(C0,%rax,1);
  557. ST_SY xvec4,4*SIZE(C1,%rax,1);
  558. ST_SY xvec11,0*SIZE(C1);
  559. ST_SY xvec3,4*SIZE(C0);
  560. ST_SY xvec10,0*SIZE(C1,ldc,1);
  561. ST_SY xvec2,4*SIZE(C0,ldc,1);
  562. ST_SY xvec9,0*SIZE(C1,ldc,2);
  563. ST_SY xvec1,4*SIZE(C0,ldc,2);
  564. ST_SY xvec8,0*SIZE(C1,%rax,1);
  565. ST_SY xvec0,4*SIZE(C0,%rax,1);
  566. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) ||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  567. MOVQ bk, %rax;
  568. SUBQ kkk, %rax;
  569. LEAQ (, %rax, SIZE), %rax;
  570. LEAQ (ptrba, %rax, 8), ptrba;
  571. LEAQ (ptrbb, %rax, 8), ptrbb;
  572. #endif
  573. #if defined(TRMMKERNEL) && defined(LEFT)
  574. ADDQ $8, kk
  575. #endif
  576. ADDQ $8*SIZE,C0;
  577. ADDQ $8*SIZE,C1;
  578. .L1_bodyE:;
  579. DECQ i;
  580. JG .L1_bodyB;
  581. JMP .L1_loopE;
  582. ALIGN_4;
  583. .L4_loopEx:
  584. LEAQ (ldc,ldc,2),%rax;
  585. EXTRA_SY $1, yvec15, xvec7;
  586. #ifndef TRMMKERNEL
  587. LDL_SY 0*SIZE(C0), xvec6, xvec6;
  588. LDH_SY 2*SIZE(C0), xvec6, xvec6;
  589. ADD_SY xvec6, xvec15, xvec15;
  590. #endif
  591. STL_SY xvec15, 0*SIZE(C0);
  592. STH_SY xvec15, 2*SIZE(C0);
  593. #ifndef TRMMKERNEL
  594. LDL_SY 4*SIZE(C1), xvec5, xvec5;
  595. LDH_SY 6*SIZE(C1), xvec5, xvec5;
  596. ADD_SY xvec5, xvec7, xvec7;
  597. #endif
  598. STL_SY xvec7, 4*SIZE(C1);
  599. STH_SY xvec7, 6*SIZE(C1);
  600. EXTRA_SY $1, yvec14, xvec6;
  601. #ifndef TRMMKERNEL
  602. LDL_SY 0*SIZE(C0, ldc, 1), xvec5, xvec5;
  603. LDH_SY 2*SIZE(C0, ldc, 1), xvec5, xvec5;
  604. ADD_SY xvec5, xvec14, xvec14;
  605. #endif
  606. STL_SY xvec14, 0*SIZE(C0, ldc, 1);
  607. STH_SY xvec14, 2*SIZE(C0, ldc, 1);
  608. #ifndef TRMMKERNEL
  609. LDL_SY 4*SIZE(C1, ldc, 1), xvec4, xvec4;
  610. LDH_SY 6*SIZE(C1, ldc, 1), xvec4, xvec4;
  611. ADD_SY xvec4, xvec6, xvec6;
  612. #endif
  613. STL_SY xvec6, 4*SIZE(C1, ldc, 1);
  614. STH_SY xvec6, 6*SIZE(C1, ldc, 1);
  615. EXTRA_SY $1, yvec13, xvec5;
  616. #ifndef TRMMKERNEL
  617. LDL_SY 0*SIZE(C0, ldc, 2), xvec4, xvec4;
  618. LDH_SY 2*SIZE(C0, ldc, 2), xvec4, xvec4;
  619. ADD_SY xvec4, xvec13, xvec13;
  620. #endif
  621. STL_SY xvec13, 0*SIZE(C0, ldc, 2);
  622. STH_SY xvec13, 2*SIZE(C0, ldc, 2);
  623. #ifndef TRMMKERNEL
  624. LDL_SY 4*SIZE(C1, ldc, 2), xvec3, xvec3;
  625. LDH_SY 6*SIZE(C1, ldc, 2), xvec3, xvec3;
  626. ADD_SY xvec3, xvec5, xvec5;
  627. #endif
  628. STL_SY xvec5, 4*SIZE(C1, ldc, 2);
  629. STH_SY xvec5, 6*SIZE(C1, ldc, 2);
  630. EXTRA_SY $1, yvec12, xvec4;
  631. #ifndef TRMMKERNEL
  632. LDL_SY 0*SIZE(C0, %rax, 1), xvec3, xvec3;
  633. LDH_SY 2*SIZE(C0, %rax, 1), xvec3, xvec3;
  634. ADD_SY xvec3, xvec12, xvec12;
  635. #endif
  636. STL_SY xvec12, 0*SIZE(C0, %rax, 1);
  637. STH_SY xvec12, 2*SIZE(C0, %rax, 1);
  638. #ifndef TRMMKERNEL
  639. LDL_SY 4*SIZE(C1, %rax, 1), xvec2, xvec2;
  640. LDH_SY 6*SIZE(C1, %rax, 1), xvec2, xvec2;
  641. ADD_SY xvec2, xvec4, xvec4;
  642. #endif
  643. STL_SY xvec4, 4*SIZE(C1, %rax, 1);
  644. STH_SY xvec4, 6*SIZE(C1, %rax, 1);
  645. EXTRA_SY $1, yvec11, xvec3;
  646. #ifndef TRMMKERNEL
  647. LDL_SY 0*SIZE(C1), xvec2, xvec2;
  648. LDH_SY 2*SIZE(C1), xvec2, xvec2;
  649. ADD_SY xvec2, xvec11, xvec11;
  650. #endif
  651. STL_SY xvec11, 0*SIZE(C1);
  652. STH_SY xvec11, 2*SIZE(C1);
  653. #ifndef TRMMKERNEL
  654. LDL_SY 4*SIZE(C0), xvec1, xvec1;
  655. LDH_SY 6*SIZE(C0), xvec1, xvec1;
  656. ADD_SY xvec1, xvec3, xvec3;
  657. #endif
  658. STL_SY xvec3, 4*SIZE(C0);
  659. STH_SY xvec3, 6*SIZE(C0);
  660. EXTRA_SY $1, yvec10, xvec2;
  661. #ifndef TRMMKERNEL
  662. LDL_SY 0*SIZE(C1, ldc, 1), xvec1, xvec1;
  663. LDH_SY 2*SIZE(C1, ldc, 1), xvec1, xvec1;
  664. ADD_SY xvec1, xvec10, xvec10;
  665. #endif
  666. STL_SY xvec10, 0*SIZE(C1, ldc, 1);
  667. STH_SY xvec10, 2*SIZE(C1, ldc, 1);
  668. #ifndef TRMMKERNEL
  669. LDL_SY 4*SIZE(C0, ldc, 1), xvec0, xvec0;
  670. LDH_SY 6*SIZE(C0, ldc, 1), xvec0, xvec0;
  671. ADD_SY xvec0, xvec2, xvec2;
  672. #endif
  673. STL_SY xvec2, 4*SIZE(C0, ldc, 1);
  674. STH_SY xvec2, 6*SIZE(C0, ldc, 1);
  675. EXTRA_SY $1, yvec9, xvec1;
  676. #ifndef TRMMKERNEL
  677. LDL_SY 0*SIZE(C1, ldc, 2), xvec0, xvec0;
  678. LDH_SY 2*SIZE(C1, ldc, 2), xvec0, xvec0;
  679. ADD_SY xvec0, xvec9, xvec9;
  680. #endif
  681. STL_SY xvec9, 0*SIZE(C1, ldc, 2);
  682. STH_SY xvec9, 2*SIZE(C1, ldc, 2);
  683. #ifndef TRMMKERNEL
  684. LDL_SY 4*SIZE(C0, ldc, 2), xvec7, xvec7;
  685. LDH_SY 6*SIZE(C0, ldc, 2), xvec7, xvec7;
  686. ADD_SY xvec7, xvec1, xvec1;
  687. #endif
  688. STL_SY xvec1, 4*SIZE(C0, ldc, 2);
  689. STH_SY xvec1, 6*SIZE(C0, ldc, 2);
  690. EXTRA_SY $1, yvec8, xvec0;
  691. #ifndef TRMMKERNEL
  692. LDL_SY 0*SIZE(C1, %rax, 1), xvec6, xvec6;
  693. LDH_SY 2*SIZE(C1, %rax, 1), xvec6, xvec6;
  694. ADD_SY xvec6, xvec8, xvec8;
  695. #endif
  696. STL_SY xvec8, 0*SIZE(C1, %rax, 1);
  697. STH_SY xvec8, 2*SIZE(C1, %rax, 1);
  698. #ifndef TRMMKERNEL
  699. LDL_SY 4*SIZE(C0, %rax, 1), xvec5, xvec5;
  700. LDH_SY 6*SIZE(C0, %rax, 1), xvec5, xvec5;
  701. ADD_SY xvec5, xvec0, xvec0;
  702. #endif
  703. STL_SY xvec0, 4*SIZE(C0, %rax, 1);
  704. STH_SY xvec0, 6*SIZE(C0, %rax, 1);
  705. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) ||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  706. MOVQ bk, %rax;
  707. SUBQ kkk, %rax;
  708. LEAQ (, %rax, SIZE), %rax;
  709. LEAQ (ptrba, %rax, 8), ptrba;
  710. LEAQ (ptrbb, %rax, 8), ptrbb;
  711. #endif
  712. #if defined(TRMMKERNEL) && defined(LEFT)
  713. ADDQ $8, kk
  714. #endif
  715. ADDQ $8*SIZE, C0;
  716. ADDQ $8*SIZE, C1;
  717. DECQ i;
  718. JG .L1_bodyB;
  719. ALIGN_4
  720. .L1_loopE:;
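# Reading note: the 8-row blocks of A are finished at this point. The code
# below handles the leftover rows of the current 8-column strip in smaller
# pieces (4 rows, then 2, then 1), selected by testing the low bits of bm.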
  721. TEST $4, bm;
  722. JLE .L5_loopE;
  723. ALIGN_4
  724. .L5_bodyB:
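# Reading note: this block handles a 4-row strip of A against the same 8
# columns. It works in 128-bit xmm registers via the *_SX macros; xvec8..xvec15
# are the eight 4-float accumulators of the 4x8 tile, fed by EDUP_SX/ODUP_SX
# duplicates of B and the $0x4e shuffles.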
  725. #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  726. MOVQ bb, ptrbb;
  727. #else
  728. MOVQ bb, ptrbb;
  729. MOVQ kk, %rax;
  730. LEAQ (, %rax, SIZE), %rax;
  731. LEAQ (ptrba, %rax, 4), ptrba;
  732. LEAQ (ptrbb, %rax, 8), ptrbb;
  733. #endif
  734. #### Initial Results Register ####
  735. XOR_SY yvec15, yvec15, yvec15;
  736. XOR_SY yvec14, yvec14, yvec14;
  737. XOR_SY yvec13, yvec13, yvec13;
  738. XOR_SY yvec12, yvec12, yvec12;
  739. LD_SX 0*SIZE(ptrba), xvec0;
  740. XOR_SY yvec11, yvec11, yvec11;
  741. XOR_SY yvec10, yvec10, yvec10;
  742. EDUP_SX 0*SIZE(ptrbb), xvec2;
  743. XOR_SY yvec9, yvec9, yvec9;
  744. XOR_SY yvec8, yvec8, yvec8;
  745. ODUP_SX 0*SIZE(ptrbb), xvec3;
  746. #ifndef TRMMKERNEL
  747. MOVQ bk, k;
  748. #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
  749. MOVQ bk, %rax;
  750. SUBQ kk, %rax;
  751. MOVQ %rax, kkk;
  752. #else
  753. MOVQ kk, %rax;
  754. #ifdef LEFT
  755. ADDQ $4, %rax;
  756. #else
  757. ADDQ $8, %rax;
  758. #endif
  759. MOVQ %rax, kkk;
  760. #endif
  761. SARQ $2, k;
  762. JLE .L8_loopE;
  763. ALIGN_4
  764. .L8_bodyB:
  765. #### Unroll time 1 ####
  766. SHUF_SX $0x4e, xvec2, xvec4;
  767. MUL_SX xvec0, xvec2, xvec2;
  768. ADD_SX xvec2, xvec15, xvec15;
  769. SHUF_SX $0x4e, xvec3, xvec5;
  770. MUL_SX xvec0, xvec3, xvec3;
  771. ADD_SX xvec3, xvec14, xvec14;
  772. EDUP_SX 4*SIZE(ptrbb), xvec2;
  773. MUL_SX xvec0, xvec4, xvec4;
  774. ADD_SX xvec4, xvec13, xvec13;
  775. ODUP_SX 4*SIZE(ptrbb), xvec3;
  776. MUL_SX xvec0, xvec5, xvec5;
  777. ADD_SX xvec5, xvec12, xvec12;
  778. LD_SX 4*SIZE(ptrba), xvec1;
  779. SHUF_SX $0x4e, xvec2, xvec4;
  780. MUL_SX xvec0, xvec2, xvec2;
  781. ADD_SX xvec2, xvec11, xvec11;
  782. SHUF_SX $0x4e, xvec3, xvec5;
  783. MUL_SX xvec0, xvec3, xvec3;
  784. ADD_SX xvec3, xvec10, xvec10;
  785. EDUP_SX 8*SIZE(ptrbb), xvec2;
  786. MUL_SX xvec0, xvec4, xvec4;
  787. ADD_SX xvec4, xvec9, xvec9;
  788. ODUP_SX 8*SIZE(ptrbb), xvec3;
  789. MUL_SX xvec0, xvec5, xvec5;
  790. ADD_SX xvec5, xvec8, xvec8;
  791. #### Unroll time 2 ####
  792. SHUF_SX $0x4e, xvec2, xvec4;
  793. MUL_SX xvec1, xvec2, xvec2;
  794. ADD_SX xvec2, xvec15, xvec15;
  795. SHUF_SX $0x4e, xvec3, xvec5;
  796. MUL_SX xvec1, xvec3, xvec3;
  797. ADD_SX xvec3, xvec14, xvec14;
  798. EDUP_SX 12*SIZE(ptrbb), xvec2;
  799. MUL_SX xvec1, xvec4, xvec4;
  800. ADD_SX xvec4, xvec13, xvec13;
  801. ODUP_SX 12*SIZE(ptrbb), xvec3;
  802. MUL_SX xvec1, xvec5, xvec5;
  803. ADD_SX xvec5, xvec12, xvec12;
  804. LD_SX 8*SIZE(ptrba), xvec0;
  805. SHUF_SX $0x4e, xvec2, xvec4;
  806. MUL_SX xvec1, xvec2, xvec2;
  807. ADD_SX xvec2, xvec11, xvec11;
  808. SHUF_SX $0x4e, xvec3, xvec5;
  809. MUL_SX xvec1, xvec3, xvec3;
  810. ADD_SX xvec3, xvec10, xvec10;
  811. EDUP_SX 16*SIZE(ptrbb), xvec2;
  812. MUL_SX xvec1, xvec4, xvec4;
  813. ADD_SX xvec4, xvec9, xvec9;
  814. ODUP_SX 16*SIZE(ptrbb), xvec3;
  815. MUL_SX xvec1, xvec5, xvec5;
  816. ADD_SX xvec5, xvec8, xvec8;
  817. #### Unroll time 3 ####
  818. SHUF_SX $0x4e, xvec2, xvec4;
  819. MUL_SX xvec0, xvec2, xvec2;
  820. ADD_SX xvec2, xvec15, xvec15;
  821. SHUF_SX $0x4e, xvec3, xvec5;
  822. MUL_SX xvec0, xvec3, xvec3;
  823. ADD_SX xvec3, xvec14, xvec14;
  824. EDUP_SX 20*SIZE(ptrbb), xvec2;
  825. MUL_SX xvec0, xvec4, xvec4;
  826. ADD_SX xvec4, xvec13, xvec13;
  827. ODUP_SX 20*SIZE(ptrbb), xvec3;
  828. MUL_SX xvec0, xvec5, xvec5;
  829. ADD_SX xvec5, xvec12, xvec12;
  830. LD_SX 12*SIZE(ptrba), xvec1;
  831. SHUF_SX $0x4e, xvec2, xvec4;
  832. MUL_SX xvec0, xvec2, xvec2;
  833. ADD_SX xvec2, xvec11, xvec11;
  834. SHUF_SX $0x4e, xvec3, xvec5;
  835. MUL_SX xvec0, xvec3, xvec3;
  836. ADD_SX xvec3, xvec10, xvec10;
  837. EDUP_SX 24*SIZE(ptrbb), xvec2;
  838. MUL_SX xvec0, xvec4, xvec4;
  839. ADD_SX xvec4, xvec9, xvec9;
  840. ODUP_SX 24*SIZE(ptrbb), xvec3;
  841. MUL_SX xvec0, xvec5, xvec5;
  842. ADD_SX xvec5, xvec8, xvec8;
  843. ADDQ $16*SIZE, ptrba;
  844. #### Unroll time 4 ####
  845. SHUF_SX $0x4e, xvec2, xvec4;
  846. MUL_SX xvec1, xvec2, xvec2;
  847. ADD_SX xvec2, xvec15, xvec15;
  848. SHUF_SX $0x4e, xvec3, xvec5;
  849. MUL_SX xvec1, xvec3, xvec3;
  850. ADD_SX xvec3, xvec14, xvec14;
  851. EDUP_SX 28*SIZE(ptrbb), xvec2;
  852. MUL_SX xvec1, xvec4, xvec4;
  853. ADD_SX xvec4, xvec13, xvec13;
  854. ODUP_SX 28*SIZE(ptrbb), xvec3;
  855. MUL_SX xvec1, xvec5, xvec5;
  856. ADD_SX xvec5, xvec12, xvec12;
  857. ADDQ $32*SIZE, ptrbb;
  858. LD_SX 0*SIZE(ptrba), xvec0;
  859. SHUF_SX $0x4e, xvec2, xvec4;
  860. MUL_SX xvec1, xvec2, xvec2;
  861. ADD_SX xvec2, xvec11, xvec11;
  862. SHUF_SX $0x4e, xvec3, xvec5;
  863. MUL_SX xvec1, xvec3, xvec3;
  864. ADD_SX xvec3, xvec10, xvec10;
  865. EDUP_SX 0*SIZE(ptrbb), xvec2;
  866. MUL_SX xvec1, xvec4, xvec4;
  867. ADD_SX xvec4, xvec9, xvec9;
  868. ODUP_SX 0*SIZE(ptrbb), xvec3;
  869. MUL_SX xvec1, xvec5, xvec5;
  870. ADD_SX xvec5, xvec8, xvec8;
  871. DECQ k;
  872. JG .L8_bodyB;
  873. ALIGN_4
  874. .L8_loopE:
  875. #ifndef TRMMKERNEL
  876. TEST $2, bk;
  877. #else
  878. TEST $2, kkk;
  879. #endif
  880. JLE .L9_loopE;
  881. ALIGN_4
  882. .L9_bodyB:
  883. #### Unroll time 1 ####
  884. SHUF_SX $0x4e, xvec2, xvec4;
  885. MUL_SX xvec0, xvec2, xvec2;
  886. ADD_SX xvec2, xvec15, xvec15;
  887. SHUF_SX $0x4e, xvec3, xvec5;
  888. MUL_SX xvec0, xvec3, xvec3;
  889. ADD_SX xvec3, xvec14, xvec14;
  890. EDUP_SX 4*SIZE(ptrbb), xvec2;
  891. MUL_SX xvec0, xvec4, xvec4;
  892. ADD_SX xvec4, xvec13, xvec13;
  893. ODUP_SX 4*SIZE(ptrbb), xvec3;
  894. MUL_SX xvec0, xvec5, xvec5;
  895. ADD_SX xvec5, xvec12, xvec12;
  896. LD_SX 4*SIZE(ptrba), xvec1;
  897. SHUF_SX $0x4e, xvec2, xvec4;
  898. MUL_SX xvec0, xvec2, xvec2;
  899. ADD_SX xvec2, xvec11, xvec11;
  900. SHUF_SX $0x4e, xvec3, xvec5;
  901. MUL_SX xvec0, xvec3, xvec3;
  902. ADD_SX xvec3, xvec10, xvec10;
  903. EDUP_SX 8*SIZE(ptrbb), xvec2;
  904. MUL_SX xvec0, xvec4, xvec4;
  905. ADD_SX xvec4, xvec9, xvec9;
  906. ODUP_SX 8*SIZE(ptrbb), xvec3;
  907. MUL_SX xvec0, xvec5, xvec5;
  908. ADD_SX xvec5, xvec8, xvec8;
  909. #### Unroll time 2 ####
  910. ADDQ $8*SIZE, ptrba;
  911. SHUF_SX $0x4e, xvec2, xvec4;
  912. MUL_SX xvec1, xvec2, xvec2;
  913. ADD_SX xvec2, xvec15, xvec15;
  914. SHUF_SX $0x4e, xvec3, xvec5;
  915. MUL_SX xvec1, xvec3, xvec3;
  916. ADD_SX xvec3, xvec14, xvec14;
  917. EDUP_SX 12*SIZE(ptrbb), xvec2;
  918. MUL_SX xvec1, xvec4, xvec4;
  919. ADD_SX xvec4, xvec13, xvec13;
  920. ODUP_SX 12*SIZE(ptrbb), xvec3;
  921. MUL_SX xvec1, xvec5, xvec5;
  922. ADD_SX xvec5, xvec12, xvec12;
  923. ADDQ $16*SIZE, ptrbb;
  924. LD_SX 0*SIZE(ptrba), xvec0;
  925. SHUF_SX $0x4e, xvec2, xvec4;
  926. MUL_SX xvec1, xvec2, xvec2;
  927. ADD_SX xvec2, xvec11, xvec11;
  928. SHUF_SX $0x4e, xvec3, xvec5;
  929. MUL_SX xvec1, xvec3, xvec3;
  930. ADD_SX xvec3, xvec10, xvec10;
  931. EDUP_SX 0*SIZE(ptrbb), xvec2;
  932. MUL_SX xvec1, xvec4, xvec4;
  933. ADD_SX xvec4, xvec9, xvec9;
  934. ODUP_SX 0*SIZE(ptrbb), xvec3;
  935. MUL_SX xvec1, xvec5, xvec5;
  936. ADD_SX xvec5, xvec8, xvec8;
  937. .L9_loopE:
  938. #ifndef TRMMKERNEL
  939. TEST $1, bk;
  940. #else
  941. TEST $1, kkk;
  942. #endif
  943. JLE .L10_loopE;
  944. ALIGN_4
  945. .L10_bodyB:
  946. #### Unroll time 1 ####
  947. SHUF_SX $0x4e, xvec2, xvec4;
  948. MUL_SX xvec0, xvec2, xvec2;
  949. ADD_SX xvec2, xvec15, xvec15;
  950. SHUF_SX $0x4e, xvec3, xvec5;
  951. MUL_SX xvec0, xvec3, xvec3;
  952. ADD_SX xvec3, xvec14, xvec14;
  953. ADDQ $4*SIZE, ptrba;
  954. EDUP_SX 4*SIZE(ptrbb), xvec2;
  955. MUL_SX xvec0, xvec4, xvec4;
  956. ADD_SX xvec4, xvec13, xvec13;
  957. ODUP_SX 4*SIZE(ptrbb), xvec3;
  958. MUL_SX xvec0, xvec5, xvec5;
  959. ADD_SX xvec5, xvec12, xvec12;
  960. ADDQ $8*SIZE, ptrbb;
  961. SHUF_SX $0x4e, xvec2, xvec4;
  962. MUL_SX xvec0, xvec2, xvec2;
  963. ADD_SX xvec2, xvec11, xvec11;
  964. SHUF_SX $0x4e, xvec3, xvec5;
  965. MUL_SX xvec0, xvec3, xvec3;
  966. ADD_SX xvec3, xvec10, xvec10;
  967. MUL_SX xvec0, xvec4, xvec4;
  968. ADD_SX xvec4, xvec9, xvec9;
  969. MUL_SX xvec0, xvec5, xvec5;
  970. ADD_SX xvec5, xvec8, xvec8;
  971. .L10_loopE:
  972. #### Multiply Alpha ####
  973. BROAD_SX MEMALPHA, xvec7;
  974. MUL_SX xvec7, xvec15, xvec15;
  975. MUL_SX xvec7, xvec14, xvec14;
  976. MUL_SX xvec7, xvec13, xvec13;
  977. MUL_SX xvec7, xvec12, xvec12;
  978. MUL_SX xvec7, xvec11, xvec11;
  979. MUL_SX xvec7, xvec10, xvec10;
  980. MUL_SX xvec7, xvec9, xvec9;
  981. MUL_SX xvec7, xvec8, xvec8;
  982. #### Reverse Result ####
  983. MOV_SX xvec15, xvec7;
  984. REVS_SX $0xe4, xvec13, xvec15, xvec15;
  985. REVS_SX $0xe4, xvec7, xvec13, xvec13;
  986. MOV_SX xvec14, xvec7;
  987. REVS_SX $0xe4, xvec12, xvec14, xvec14;
  988. REVS_SX $0xe4, xvec7, xvec12, xvec12;
  989. MOV_SX xvec11, xvec7;
  990. REVS_SX $0xe4, xvec9, xvec11, xvec11;
  991. REVS_SX $0xe4, xvec7, xvec9, xvec9;
  992. MOV_SX xvec10, xvec7;
  993. REVS_SX $0xe4, xvec8, xvec10, xvec10;
  994. REVS_SX $0xe4, xvec7, xvec8, xvec8;
  995. #### Testing Alignment ####
  996. MOVQ C0, %rax;
  997. OR ldc, %rax;
  998. TEST $15, %rax;
  999. JNE .L10_loopEx;
  1000. ALIGN_4
  1001. LEAQ (ldc,ldc,2),%rax;
  1002. #ifndef TRMMKERNEL
  1003. ADD_SX 0*SIZE(C0), xvec15, xvec15;
  1004. ADD_SX 0*SIZE(C0, ldc,1), xvec14, xvec14;
  1005. ADD_SX 0*SIZE(C0, ldc,2), xvec13, xvec13;
  1006. ADD_SX 0*SIZE(C0, %rax,1), xvec12, xvec12;
  1007. ADD_SX 0*SIZE(C1), xvec11, xvec11;
  1008. ADD_SX 0*SIZE(C1, ldc,1), xvec10, xvec10;
  1009. ADD_SX 0*SIZE(C1, ldc,2), xvec9, xvec9;
  1010. ADD_SX 0*SIZE(C1, %rax,1), xvec8, xvec8;
  1011. #endif
  1012. ST_SX xvec15, 0*SIZE(C0);
  1013. ST_SX xvec14, 0*SIZE(C0, ldc, 1);
  1014. ST_SX xvec13, 0*SIZE(C0, ldc, 2);
  1015. ST_SX xvec12, 0*SIZE(C0, %rax, 1);
  1016. ST_SX xvec11, 0*SIZE(C1);
  1017. ST_SX xvec10, 0*SIZE(C1, ldc, 1);
  1018. ST_SX xvec9, 0*SIZE(C1, ldc, 2);
  1019. ST_SX xvec8, 0*SIZE(C1, %rax, 1);
  1020. #if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  1021. MOVQ bk, %rax;
  1022. SUBQ kkk, %rax;
  1023. LEAQ (, %rax, SIZE), %rax;
  1024. LEAQ (ptrba, %rax, 4), ptrba;
  1025. LEAQ (ptrbb, %rax, 8), ptrbb;
  1026. #endif
  1027. #if defined(TRMMKERNEL)&&defined(LEFT)
  1028. ADDQ $4, kk
  1029. #endif
  1030. ADDQ $4*SIZE, C0;
  1031. ADDQ $4*SIZE, C1;
  1032. JMP .L5_loopE;
  1033. ALIGN_4
  1034. .L10_loopEx:
  1035. LEAQ (ldc,ldc,2),%rax;
  1036. #ifndef TRMMKERNEL
  1037. LDL_SX 0*SIZE(C0), xvec7, xvec7;
  1038. LDH_SX 2*SIZE(C0), xvec7, xvec7;
  1039. LDL_SX 0*SIZE(C0, ldc, 1), xvec6, xvec6;
  1040. LDH_SX 2*SIZE(C0, ldc, 1), xvec6, xvec6;
  1041. LDL_SX 0*SIZE(C0, ldc, 2), xvec5, xvec5;
  1042. LDH_SX 2*SIZE(C0, ldc, 2), xvec5, xvec5;
  1043. LDL_SX 0*SIZE(C0, %rax, 1), xvec4, xvec4;
  1044. LDH_SX 2*SIZE(C0, %rax, 1), xvec4, xvec4;
  1045. LDL_SX 0*SIZE(C1), xvec3, xvec3;
  1046. LDH_SX 2*SIZE(C1), xvec3, xvec3;
  1047. LDL_SX 0*SIZE(C1, ldc, 1), xvec2, xvec2;
  1048. LDH_SX 2*SIZE(C1, ldc, 1), xvec2, xvec2;
  1049. LDL_SX 0*SIZE(C1, ldc, 2), xvec1, xvec1;
  1050. LDH_SX 2*SIZE(C1, ldc, 2), xvec1, xvec1;
  1051. LDL_SX 0*SIZE(C1, %rax, 1), xvec0, xvec0;
  1052. LDH_SX 2*SIZE(C1, %rax, 1), xvec0, xvec0;
  1053. ADD_SX xvec7, xvec15, xvec15;
  1054. ADD_SX xvec6, xvec14, xvec14;
  1055. ADD_SX xvec5, xvec13, xvec13;
  1056. ADD_SX xvec4, xvec12, xvec12;
  1057. ADD_SX xvec3, xvec11, xvec11;
  1058. ADD_SX xvec2, xvec10, xvec10;
  1059. ADD_SX xvec1, xvec9, xvec9;
  1060. ADD_SX xvec0, xvec8, xvec8;
  1061. #endif
  1062. STL_SX xvec15, 0*SIZE(C0);
  1063. STH_SX xvec15, 2*SIZE(C0);
  1064. STL_SX xvec14, 0*SIZE(C0, ldc, 1);
  1065. STH_SX xvec14, 2*SIZE(C0, ldc, 1);
  1066. STL_SX xvec13, 0*SIZE(C0, ldc, 2);
  1067. STH_SX xvec13, 2*SIZE(C0, ldc, 2);
  1068. STL_SX xvec12, 0*SIZE(C0, %rax, 1);
  1069. STH_SX xvec12, 2*SIZE(C0, %rax, 1);
  1070. STL_SX xvec11, 0*SIZE(C1);
  1071. STH_SX xvec11, 2*SIZE(C1);
  1072. STL_SX xvec10, 0*SIZE(C1, ldc, 1);
  1073. STH_SX xvec10, 2*SIZE(C1, ldc, 1);
  1074. STL_SX xvec9, 0*SIZE(C1, ldc, 2);
  1075. STH_SX xvec9, 2*SIZE(C1, ldc, 2);
  1076. STL_SX xvec8, 0*SIZE(C1, %rax, 1);
  1077. STH_SX xvec8, 2*SIZE(C1, %rax, 1);
  1078. #if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  1079. MOVQ bk, %rax;
  1080. SUBQ kkk, %rax;
  1081. LEAQ (, %rax, SIZE), %rax;
  1082. LEAQ (ptrba, %rax, 4), ptrba;
  1083. LEAQ (ptrbb, %rax, 8), ptrbb;
  1084. #endif
  1085. #if defined(TRMMKERNEL)&&defined(LEFT)
  1086. ADDQ $4, kk
  1087. #endif
  1088. ADDQ $4*SIZE, C0;
  1089. ADDQ $4*SIZE, C1;
  1090. .L5_loopE:
  1091. TEST $2, bm;
  1092. JLE .L6_loopE;
  1093. ALIGN_4
  1094. .L6_bodyB:
  1095. #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  1096. MOVQ bb, ptrbb;
  1097. #else
  1098. MOVQ bb, ptrbb;
  1099. MOVQ kk, %rax;
  1100. LEAQ (, %rax, SIZE), %rax;
  1101. LEAQ (ptrba, %rax, 2), ptrba;
  1102. LEAQ (ptrbb, %rax, 8), ptrbb
  1103. #endif
  1104. #### Initial Results Register ####
  1105. XOR_SY yvec15, yvec15, yvec15;
  1106. XOR_SY yvec14, yvec14, yvec14;
  1107. XOR_SY yvec13, yvec13, yvec13;
  1108. XOR_SY yvec12, yvec12, yvec12;
  1109. MOVQ bk, k;
  1110. #ifndef TRMMKERNEL
  1111. MOVQ bk, k;
  1112. #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
  1113. MOVQ bk, %rax;
  1114. SUBQ kk, %rax;
  1115. MOVQ %rax, kkk;
  1116. #else
  1117. MOVQ kk, %rax;
  1118. #ifdef LEFT
  1119. ADDQ $2, %rax;
  1120. #else
  1121. ADDQ $8, %rax;
  1122. #endif
  1123. MOVQ %rax, kkk;
  1124. #endif
  1125. SARQ $2, k;
  1126. JLE .L11_loopE;
  1127. ALIGN_4
  1128. .L11_bodyB:
  1129. #### Computing kernel
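# Reading note: the 2-row strip keeps the 2x8 tile in xvec12..xvec15. Each k
# step takes 2 floats of A, replicated as pairs by SHUF_SX $0x44 / $0xee, and
# multiplies them against EDUP_SX/ODUP_SX duplicates of 8 floats of B.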
  1130. LD_SX 0*SIZE(ptrba), xvec0; # a1, a2, a3, a4
  1131. SHUF_SX $0x44, xvec0, xvec1; # a1, a2, a1, a2
  1132. EDUP_SX 0*SIZE(ptrbb), xvec2;
  1133. ODUP_SX 0*SIZE(ptrbb), xvec3;
  1134. MUL_SX xvec1, xvec2, xvec2;
  1135. ADD_SX xvec2, xvec15, xvec15;
  1136. MUL_SX xvec1, xvec3, xvec3;
  1137. ADD_SX xvec3, xvec14, xvec14;
  1138. EDUP_SX 4*SIZE(ptrbb), xvec4;
  1139. ODUP_SX 4*SIZE(ptrbb), xvec5;
  1140. MUL_SX xvec1, xvec4, xvec4;
  1141. ADD_SX xvec4, xvec13, xvec13;
  1142. MUL_SX xvec1, xvec5, xvec5;
  1143. ADD_SX xvec5, xvec12, xvec12;
  1144. SHUF_SX $0xee, xvec0, xvec6;
  1145. EDUP_SX 8*SIZE(ptrbb), xvec2;
  1146. ODUP_SX 8*SIZE(ptrbb), xvec3;
  1147. MUL_SX xvec6, xvec2, xvec2;
  1148. ADD_SX xvec2, xvec15, xvec15;
  1149. MUL_SX xvec6, xvec3, xvec3;
  1150. ADD_SX xvec3, xvec14, xvec14;
  1151. EDUP_SX 12*SIZE(ptrbb), xvec4;
  1152. ODUP_SX 12*SIZE(ptrbb), xvec5;
  1153. MUL_SX xvec6, xvec4, xvec4;
  1154. ADD_SX xvec4, xvec13, xvec13;
  1155. MUL_SX xvec6, xvec5, xvec5;
  1156. ADD_SX xvec5, xvec12, xvec12;
  1157. LD_SX 4*SIZE(ptrba), xvec0;
  1158. SHUF_SX $0x44, xvec0, xvec1;
  1159. EDUP_SX 16*SIZE(ptrbb), xvec2;
  1160. ODUP_SX 16*SIZE(ptrbb), xvec3;
  1161. MUL_SX xvec1, xvec2, xvec2;
  1162. ADD_SX xvec2, xvec15, xvec15;
  1163. MUL_SX xvec1, xvec3, xvec3;
  1164. ADD_SX xvec3, xvec14, xvec14;
  1165. EDUP_SX 20*SIZE(ptrbb), xvec4;
  1166. ODUP_SX 20*SIZE(ptrbb), xvec5;
  1167. MUL_SX xvec1, xvec4, xvec4;
  1168. ADD_SX xvec4, xvec13, xvec13;
  1169. MUL_SX xvec1, xvec5, xvec5;
  1170. ADD_SX xvec5, xvec12, xvec12;
  1171. SHUF_SX $0xee, xvec0, xvec6;
  1172. EDUP_SX 24*SIZE(ptrbb), xvec2;
  1173. ODUP_SX 24*SIZE(ptrbb), xvec3;
  1174. MUL_SX xvec6, xvec2, xvec2;
  1175. ADD_SX xvec2, xvec15, xvec15;
  1176. MUL_SX xvec6, xvec3, xvec3;
  1177. ADD_SX xvec3, xvec14, xvec14;
  1178. EDUP_SX 28*SIZE(ptrbb), xvec4;
  1179. ODUP_SX 28*SIZE(ptrbb), xvec5;
  1180. MUL_SX xvec6, xvec4, xvec4;
  1181. ADD_SX xvec4, xvec13, xvec13;
  1182. MUL_SX xvec6, xvec5, xvec5;
  1183. ADD_SX xvec5, xvec12, xvec12;
  1184. ADDQ $8*SIZE, ptrba;
  1185. ADDQ $32*SIZE, ptrbb;
  1186. DECQ k;
  1187. JG .L11_bodyB;
  1188. ALIGN_4
  1189. .L11_loopE:
  1190. #ifndef TRMMKERNEL
  1191. TEST $2, bk;
  1192. #else
  1193. TEST $2, kkk;
  1194. #endif
  1195. JLE .L12_loopE;
  1196. ALIGN_4
  1197. .L12_bodyB:
  1198. LD_SX 0*SIZE(ptrba), xvec0; # a1, a2, a3, a4
  1199. SHUF_SX $0x44, xvec0, xvec1; # a1, a2, a1, a2
  1200. EDUP_SX 0*SIZE(ptrbb), xvec2;
  1201. ODUP_SX 0*SIZE(ptrbb), xvec3;
  1202. MUL_SX xvec1, xvec2, xvec2;
  1203. ADD_SX xvec2, xvec15, xvec15;
  1204. MUL_SX xvec1, xvec3, xvec3;
  1205. ADD_SX xvec3, xvec14, xvec14;
  1206. EDUP_SX 4*SIZE(ptrbb), xvec4;
  1207. ODUP_SX 4*SIZE(ptrbb), xvec5;
  1208. MUL_SX xvec1, xvec4, xvec4;
  1209. ADD_SX xvec4, xvec13, xvec13;
  1210. MUL_SX xvec1, xvec5, xvec5;
  1211. ADD_SX xvec5, xvec12, xvec12;
  1212. SHUF_SX $0xee, xvec0, xvec6;
  1213. EDUP_SX 8*SIZE(ptrbb), xvec2;
  1214. ODUP_SX 8*SIZE(ptrbb), xvec3;
  1215. MUL_SX xvec6, xvec2, xvec2;
  1216. ADD_SX xvec2, xvec15, xvec15;
  1217. MUL_SX xvec6, xvec3, xvec3;
  1218. ADD_SX xvec3, xvec14, xvec14;
  1219. EDUP_SX 12*SIZE(ptrbb), xvec4;
  1220. ODUP_SX 12*SIZE(ptrbb), xvec5;
  1221. MUL_SX xvec6, xvec4, xvec4;
  1222. ADD_SX xvec4, xvec13, xvec13;
  1223. MUL_SX xvec6, xvec5, xvec5;
  1224. ADD_SX xvec5, xvec12, xvec12;
  1225. ADDQ $4*SIZE, ptrba;
  1226. ADDQ $16*SIZE, ptrbb;
  1227. .L12_loopE:
  1228. #ifndef TRMMKERNEL
  1229. TEST $1, bk;
  1230. #else
  1231. TEST $1, kkk;
  1232. #endif
  1233. JLE .L13_loopE;
  1234. ALIGN_4
  1235. .L13_bodyB:
  1236. LD_SX 0*SIZE(ptrba), xvec0; # a1, a2, a3, a4
  1237. SHUF_SX $0x44, xvec0, xvec1; # a1, a2, a1, a2
  1238. EDUP_SX 0*SIZE(ptrbb), xvec2;
  1239. ODUP_SX 0*SIZE(ptrbb), xvec3;
  1240. MUL_SX xvec1, xvec2, xvec2;
  1241. ADD_SX xvec2, xvec15, xvec15;
  1242. MUL_SX xvec1, xvec3, xvec3;
  1243. ADD_SX xvec3, xvec14, xvec14;
  1244. EDUP_SX 4*SIZE(ptrbb), xvec4;
  1245. ODUP_SX 4*SIZE(ptrbb), xvec5;
  1246. MUL_SX xvec1, xvec4, xvec4;
  1247. ADD_SX xvec4, xvec13, xvec13;
  1248. MUL_SX xvec1, xvec5, xvec5;
  1249. ADD_SX xvec5, xvec12, xvec12;
  1250. ADDQ $2*SIZE, ptrba;
  1251. ADDQ $8*SIZE, ptrbb;
  1252. .L13_loopE:
  1253. LEAQ (ldc,ldc,2),%rax;
  1254. #### Multiply Alpha ####
  1255. BROAD_SX MEMALPHA, xvec7;
  1256. MUL_SX xvec7, xvec15, xvec15;
  1257. MUL_SX xvec7, xvec14, xvec14;
  1258. MUL_SX xvec7, xvec13, xvec13;
  1259. MUL_SX xvec7, xvec12, xvec12;
  1260. #ifndef TRMMKERNEL
  1261. LDL_SX 0*SIZE(C0), xvec11, xvec11;
  1262. LDH_SX 0*SIZE(C0, ldc, 2), xvec11, xvec11;
  1263. LDL_SX 0*SIZE(C0, ldc, 1), xvec10, xvec10;
  1264. LDH_SX 0*SIZE(C0, %rax, 1), xvec10, xvec10;
  1265. LDL_SX 0*SIZE(C1), xvec9, xvec9;
  1266. LDH_SX 0*SIZE(C1, ldc, 2), xvec9, xvec9;
  1267. LDL_SX 0*SIZE(C1, ldc, 1), xvec8, xvec8;
  1268. LDH_SX 0*SIZE(C1, %rax,1), xvec8, xvec8;
  1269. ADD_SX xvec11, xvec15, xvec15;
  1270. ADD_SX xvec10, xvec14, xvec14;
  1271. ADD_SX xvec9, xvec13, xvec13;
  1272. ADD_SX xvec8, xvec12, xvec12;
  1273. #endif
  1274. STL_SX xvec15, 0*SIZE(C0);
  1275. STH_SX xvec15, 0*SIZE(C0, ldc, 2);
  1276. STL_SX xvec14, 0*SIZE(C0, ldc, 1);
  1277. STH_SX xvec14, 0*SIZE(C0, %rax, 1);
  1278. STL_SX xvec13, 0*SIZE(C1);
  1279. STH_SX xvec13, 0*SIZE(C1, ldc, 2);
  1280. STL_SX xvec12, 0*SIZE(C1, ldc, 1);
  1281. STH_SX xvec12, 0*SIZE(C1, %rax, 1);
  1282. #if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  1283. MOVQ bk, %rax;
  1284. SUBQ kkk, %rax;
  1285. LEAQ (,%rax, SIZE), %rax;
  1286. LEAQ (ptrba, %rax, 2), ptrba;
  1287. LEAQ (ptrbb, %rax, 8), ptrbb;
  1288. #endif
  1289. #if defined(TRMMKERNEL) && defined(LEFT)
  1290. ADDQ $2, kk
  1291. #endif
  1292. ADDQ $2*SIZE, C0;
  1293. ADDQ $2*SIZE, C1;
  1294. #### Writing Back ####
  1295. .L6_loopE:
  1296. TEST $1, bm;
  1297. JLE .L7_loopE;
  1298. ALIGN_4
  1299. .L7_bodyB:
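#### M-tail (bm & 1) of the 8-column panel: one row of A broadcast against 8 columns of B; xvec15 accumulates c[0..3], xvec14 accumulates c[4..7] ####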
  1300. #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  1301. MOVQ bb, ptrbb;
  1302. #else
  1303. MOVQ bb, ptrbb;
  1304. MOVQ kk, %rax;
  1305. LEAQ (,%rax, SIZE), %rax;
  1306. ADDQ %rax, ptrba;
  1307. LEAQ (ptrbb, %rax, 8), ptrbb;
  1308. #endif
1309. #### Initial ####
  1310. XOR_SY yvec15, yvec15, yvec15;
  1311. XOR_SY yvec14, yvec14, yvec14;
  1312. MOVQ bk, k;
  1313. #ifndef TRMMKERNEL
  1314. MOVQ bk, k;
  1315. #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
  1316. MOVQ bk, %rax;
  1317. SUBQ kk, %rax;
  1318. MOVQ %rax, kkk;
  1319. #else
  1320. MOVQ kk, %rax;
  1321. #ifdef LEFT
  1322. ADDQ $1, %rax;
  1323. #else
  1324. ADDQ $8, %rax;
  1325. #endif
  1326. MOVQ %rax, kkk;
  1327. #endif
  1328. SARQ $2, k;
  1329. JLE .L14_loopE;
  1330. ALIGN_4
  1331. .L14_bodyB:
  1332. BROAD_SX 0*SIZE(ptrba), xvec0;
  1333. LD_SX 0*SIZE(ptrbb), xvec2;
  1334. MUL_SX xvec0, xvec2, xvec2;
  1335. ADD_SX xvec2, xvec15, xvec15;
  1336. LD_SX 4*SIZE(ptrbb), xvec3;
  1337. MUL_SX xvec0, xvec3, xvec3;
  1338. ADD_SX xvec3, xvec14, xvec14;
  1339. BROAD_SX 1*SIZE(ptrba), xvec1;
  1340. LD_SX 8*SIZE(ptrbb), xvec4;
  1341. MUL_SX xvec1, xvec4, xvec4;
  1342. ADD_SX xvec4, xvec15, xvec15;
  1343. LD_SX 12*SIZE(ptrbb), xvec5;
  1344. MUL_SX xvec1, xvec5, xvec5;
  1345. ADD_SX xvec5, xvec14, xvec14;
  1346. BROAD_SX 2*SIZE(ptrba), xvec0;
  1347. LD_SX 16*SIZE(ptrbb), xvec2;
  1348. MUL_SX xvec0, xvec2, xvec2;
  1349. ADD_SX xvec2, xvec15, xvec15;
  1350. LD_SX 20*SIZE(ptrbb), xvec3;
  1351. MUL_SX xvec0, xvec3, xvec3;
  1352. ADD_SX xvec3, xvec14, xvec14;
  1353. BROAD_SX 3*SIZE(ptrba), xvec1;
  1354. LD_SX 24*SIZE(ptrbb), xvec4;
  1355. MUL_SX xvec1, xvec4, xvec4;
  1356. ADD_SX xvec4, xvec15, xvec15;
  1357. LD_SX 28*SIZE(ptrbb), xvec5;
  1358. MUL_SX xvec1, xvec5, xvec5;
  1359. ADD_SX xvec5, xvec14, xvec14;
  1360. ADDQ $4*SIZE, ptrba;
  1361. ADDQ $32*SIZE, ptrbb;
  1362. DECQ k;
  1363. JG .L14_bodyB;
  1364. ALIGN_4
  1365. .L14_loopE:
  1366. #ifndef TRMMKERNEL
  1367. TEST $2, bk;
  1368. #else
  1369. TEST $2, kkk;
  1370. #endif
  1371. JLE .L15_loopE;
  1372. ALIGN_4
  1373. .L15_bodyB:
  1374. BROAD_SX 0*SIZE(ptrba), xvec0;
  1375. LD_SX 0*SIZE(ptrbb), xvec2;
  1376. MUL_SX xvec0, xvec2, xvec2;
  1377. ADD_SX xvec2, xvec15, xvec15;
  1378. LD_SX 4*SIZE(ptrbb), xvec3;
  1379. MUL_SX xvec0, xvec3, xvec3;
  1380. ADD_SX xvec3, xvec14, xvec14;
  1381. BROAD_SX 1*SIZE(ptrba), xvec1;
  1382. LD_SX 8*SIZE(ptrbb), xvec4;
  1383. MUL_SX xvec1, xvec4, xvec4;
  1384. ADD_SX xvec4, xvec15, xvec15;
  1385. LD_SX 12*SIZE(ptrbb), xvec5;
  1386. MUL_SX xvec1, xvec5, xvec5;
  1387. ADD_SX xvec5, xvec14, xvec14;
  1388. ADDQ $2*SIZE, ptrba;
  1389. ADDQ $16*SIZE, ptrbb;
  1390. .L15_loopE:
  1391. #ifndef TRMMKERNEL
  1392. TEST $1, bk;
  1393. #else
  1394. TEST $1, kkk;
  1395. #endif
  1396. JLE .L16_loopE;
  1397. ALIGN_4
  1398. .L16_bodyB:
  1399. BROAD_SX 0*SIZE(ptrba), xvec0;
  1400. LD_SX 0*SIZE(ptrbb), xvec2;
  1401. MUL_SX xvec0, xvec2, xvec2;
  1402. ADD_SX xvec2, xvec15, xvec15;
  1403. LD_SX 4*SIZE(ptrbb), xvec3;
  1404. MUL_SX xvec0, xvec3, xvec3;
  1405. ADD_SX xvec3, xvec14, xvec14;
1406. ADDQ $1*SIZE, ptrba;
1407. ADDQ $8*SIZE, ptrbb;
  1408. .L16_loopE:
  1409. BROAD_SX MEMALPHA, xvec7;
  1410. MUL_SX xvec7, xvec15, xvec15;
  1411. MUL_SX xvec7, xvec14, xvec14;
  1412. LEAQ (ldc,ldc,2),%rax;
  1413. SHUF_SX $0xff, xvec15, xvec13;
  1414. SHUF_SX $0xaa, xvec15, xvec12;
  1415. SHUF_SX $0x55, xvec15, xvec11;
  1416. SHUF_SX $0x00, xvec15, xvec10;
  1417. #ifndef TRMMKERNEL
  1418. addss 0*SIZE(C0), xvec10;
  1419. addss 0*SIZE(C0, ldc, 1), xvec11;
  1420. addss 0*SIZE(C0, ldc, 2), xvec12;
  1421. addss 0*SIZE(C0, %rax, 1), xvec13;
  1422. #endif
  1423. movss xvec10, 0*SIZE(C0);
  1424. movss xvec11, 0*SIZE(C0, ldc, 1);
  1425. movss xvec12, 0*SIZE(C0, ldc, 2);
  1426. movss xvec13, 0*SIZE(C0, %rax, 1);
  1427. SHUF_SX $0xff, xvec14, xvec9;
  1428. SHUF_SX $0xaa, xvec14, xvec8;
  1429. SHUF_SX $0x55, xvec14, xvec7;
  1430. SHUF_SX $0x00, xvec14, xvec6;
  1431. #ifndef TRMMKERNEL
  1432. addss 0*SIZE(C1), xvec6;
  1433. addss 0*SIZE(C1, ldc, 1), xvec7;
  1434. addss 0*SIZE(C1, ldc, 2), xvec8;
  1435. addss 0*SIZE(C1, %rax, 1), xvec9;
  1436. #endif
  1437. movss xvec6, 0*SIZE(C1);
  1438. movss xvec7, 0*SIZE(C1, ldc, 1);
  1439. movss xvec8, 0*SIZE(C1, ldc, 2);
  1440. movss xvec9, 0*SIZE(C1, %rax, 1);
  1441. #if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  1442. MOVQ bk, %rax;
  1443. SUBQ kkk, %rax;
  1444. LEAQ (,%rax, SIZE), %rax;
  1445. ADDQ %rax, ptrba;
  1446. LEAQ (ptrbb, %rax, 8), ptrbb;
  1447. #endif
  1448. #if defined(TRMMKERNEL)&&defined(LEFT)
  1449. ADDQ $1, kk
  1450. #endif
  1451. ADDQ $1*SIZE, C0;
  1452. ADDQ $1*SIZE, C1;
  1453. #### Writing Back ####
  1454. .L7_loopE:
  1455. #if defined(TRMMKERNEL)&&!defined(LEFT)
  1456. ADDQ $8, kk
  1457. #endif
  1458. MOVQ bk,k;
  1459. SALQ $5,k;
  1460. ADDQ k,bb;
  1461. LEAQ (C,ldc,8),C;
  1462. .L0_bodyE:;
  1463. DECQ j;
  1464. JG .L0_bodyB;
  1465. ALIGN_4;
  1466. .L0_loopE:;
  1467. TEST $4, bn; # Rn = 4
  1468. JLE .L20_loopE;
  1469. ALIGN_4;
  1470. .L20_bodyB:
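#### N-tail (bn & 4): remaining 4-column panel of B; C0-based addresses cover the first two columns, C1 = C + 2*ldc the last two ####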
  1471. #if defined(TRMMKERNEL) && defined(LEFT)
  1472. MOVQ OFFSET, %rax;
  1473. MOVQ %rax, kk;
  1474. #endif
  1475. MOVQ C, C0;
  1476. LEAQ (C, ldc, 2), C1;
  1477. MOVQ ba, ptrba;
  1478. MOVQ bm, i;
  1479. SARQ $3, i;
  1480. JLE .L21_loopE;
  1481. ALIGN_4
  1482. .L21_bodyB:
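#### 8x4 tile, k-loop unrolled by 4; accumulators xvec8-xvec15 hold the 8x4 block of C ####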
  1483. #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  1484. MOVQ bb, ptrbb;
  1485. #else
  1486. MOVQ bb, ptrbb;
  1487. MOVQ kk, %rax;
  1488. LEAQ (, %rax, SIZE), %rax;
  1489. LEAQ (ptrba, %rax, 8), ptrba;
  1490. LEAQ (ptrbb, %rax, 4), ptrbb;
  1491. #endif
  1492. #### Initial ####
  1493. XOR_SY yvec15, yvec15, yvec15;
  1494. XOR_SY yvec14, yvec14, yvec14;
  1495. XOR_SY yvec13, yvec13, yvec13;
  1496. XOR_SY yvec12, yvec12, yvec12;
  1497. EDUP_SX 0*SIZE(ptrbb), xvec2;
  1498. XOR_SY yvec11, yvec11, yvec11;
  1499. XOR_SY yvec10, yvec10, yvec10;
  1500. LD_SX 0*SIZE(ptrba), xvec0;
  1501. XOR_SY yvec9, yvec9, yvec9;
  1502. XOR_SY yvec8, yvec8, yvec8;
  1503. LD_SX 4*SIZE(ptrba), xvec1;
  1504. #ifndef TRMMKERNEL
  1505. MOVQ bk,k;
  1506. #elif (defined(LEFT) && !defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
  1507. MOVQ bk, %rax;
  1508. SUBQ kk, %rax;
  1509. MOVQ %rax, kkk;
  1510. #else
  1511. MOVQ kk, %rax;
  1512. #ifdef LEFT
  1513. ADDQ $8, %rax;
  1514. #else
  1515. ADDQ $4, %rax;
  1516. #endif
  1517. MOVQ %rax, kkk;
  1518. #endif
  1519. SARQ $2,k;
  1520. JLE .L211_loopE;
  1521. ALIGN_4
  1522. .L211_bodyB:
  1523. #### Unroll time 1 ####
  1524. ODUP_SX 0*SIZE(ptrbb), xvec3;
  1525. SHUF_SX $0x4e, xvec2, xvec4;
  1526. MOV_SX xvec2, xvec6;
  1527. MUL_SX xvec0, xvec2, xvec2;
  1528. ADD_SX xvec2, xvec15, xvec15;
  1529. MUL_SX xvec1, xvec6, xvec6;
  1530. ADD_SX xvec6, xvec14, xvec14;
  1531. SHUF_SX $0x4e, xvec3, xvec5;
  1532. MOV_SX xvec3, xvec7;
  1533. MUL_SX xvec0, xvec3, xvec3;
  1534. ADD_SX xvec3, xvec13, xvec13;
  1535. MUL_SX xvec1, xvec7, xvec7;
  1536. ADD_SX xvec7, xvec12, xvec12;
  1537. EDUP_SX 4*SIZE(ptrbb), xvec2;
  1538. MOV_SX xvec4, xvec6;
  1539. MUL_SX xvec0, xvec4, xvec4;
  1540. ADD_SX xvec4, xvec11, xvec11;
  1541. MUL_SX xvec1, xvec6, xvec6;
  1542. ADD_SX xvec6, xvec10, xvec10;
  1543. MOV_SX xvec5, xvec7;
  1544. MUL_SX xvec0, xvec5, xvec5;
  1545. ADD_SX xvec5, xvec9, xvec9;
  1546. LD_SX 8*SIZE(ptrba), xvec0;
  1547. MUL_SX xvec1, xvec7, xvec7;
  1548. ADD_SX xvec7, xvec8, xvec8;
  1549. LD_SX 12*SIZE(ptrba), xvec1;
  1550. #### Unroll time 2 ####
  1551. ODUP_SX 4*SIZE(ptrbb), xvec3;
  1552. SHUF_SX $0x4e, xvec2, xvec4;
  1553. MOV_SX xvec2, xvec6;
  1554. MUL_SX xvec0, xvec2, xvec2;
  1555. ADD_SX xvec2, xvec15, xvec15;
  1556. MUL_SX xvec1, xvec6, xvec6;
  1557. ADD_SX xvec6, xvec14, xvec14;
  1558. SHUF_SX $0x4e, xvec3, xvec5;
  1559. MOV_SX xvec3, xvec7;
  1560. MUL_SX xvec0, xvec3, xvec3;
  1561. ADD_SX xvec3, xvec13, xvec13;
  1562. MUL_SX xvec1, xvec7, xvec7;
  1563. ADD_SX xvec7, xvec12, xvec12;
  1564. EDUP_SX 8*SIZE(ptrbb), xvec2;
  1565. MOV_SX xvec4, xvec6;
  1566. MUL_SX xvec0, xvec4, xvec4;
  1567. ADD_SX xvec4, xvec11, xvec11;
  1568. MUL_SX xvec1, xvec6, xvec6;
  1569. ADD_SX xvec6, xvec10, xvec10;
  1570. MOV_SX xvec5, xvec7;
  1571. MUL_SX xvec0, xvec5, xvec5;
  1572. ADD_SX xvec5, xvec9, xvec9;
  1573. LD_SX 16*SIZE(ptrba), xvec0;
  1574. MUL_SX xvec1, xvec7, xvec7;
  1575. ADD_SX xvec7, xvec8, xvec8;
  1576. LD_SX 20*SIZE(ptrba), xvec1;
  1577. #### Unroll time 3 ####
  1578. ODUP_SX 8*SIZE(ptrbb), xvec3;
  1579. SHUF_SX $0x4e, xvec2, xvec4;
  1580. MOV_SX xvec2, xvec6;
  1581. MUL_SX xvec0, xvec2, xvec2;
  1582. ADD_SX xvec2, xvec15, xvec15;
  1583. MUL_SX xvec1, xvec6, xvec6;
  1584. ADD_SX xvec6, xvec14, xvec14;
  1585. SHUF_SX $0x4e, xvec3, xvec5;
  1586. MOV_SX xvec3, xvec7;
  1587. MUL_SX xvec0, xvec3, xvec3;
  1588. ADD_SX xvec3, xvec13, xvec13;
  1589. MUL_SX xvec1, xvec7, xvec7;
  1590. ADD_SX xvec7, xvec12, xvec12;
  1591. EDUP_SX 12*SIZE(ptrbb), xvec2;
  1592. MOV_SX xvec4, xvec6;
  1593. ADDQ $16*SIZE, ptrbb;
  1594. MUL_SX xvec0, xvec4, xvec4;
  1595. ADD_SX xvec4, xvec11, xvec11;
  1596. MUL_SX xvec1, xvec6, xvec6;
  1597. ADD_SX xvec6, xvec10, xvec10;
  1598. MOV_SX xvec5, xvec7;
  1599. MUL_SX xvec0, xvec5, xvec5;
  1600. ADD_SX xvec5, xvec9, xvec9;
  1601. LD_SX 24*SIZE(ptrba), xvec0;
  1602. MUL_SX xvec1, xvec7, xvec7;
  1603. ADD_SX xvec7, xvec8, xvec8;
  1604. LD_SX 28*SIZE(ptrba), xvec1;
  1605. ADDQ $32*SIZE, ptrba;
  1606. #### Unroll time 4 ####
  1607. ODUP_SX -4*SIZE(ptrbb), xvec3;
  1608. SHUF_SX $0x4e, xvec2, xvec4;
  1609. MOV_SX xvec2, xvec6;
  1610. MUL_SX xvec0, xvec2, xvec2;
  1611. ADD_SX xvec2, xvec15, xvec15;
  1612. MUL_SX xvec1, xvec6, xvec6;
  1613. ADD_SX xvec6, xvec14, xvec14;
  1614. SHUF_SX $0x4e, xvec3, xvec5;
  1615. MOV_SX xvec3, xvec7;
  1616. MUL_SX xvec0, xvec3, xvec3;
  1617. ADD_SX xvec3, xvec13, xvec13;
  1618. MUL_SX xvec1, xvec7, xvec7;
  1619. ADD_SX xvec7, xvec12, xvec12;
  1620. EDUP_SX 0*SIZE(ptrbb), xvec2;
  1621. MOV_SX xvec4, xvec6;
  1622. MUL_SX xvec0, xvec4, xvec4;
  1623. ADD_SX xvec4, xvec11, xvec11;
  1624. MUL_SX xvec1, xvec6, xvec6;
  1625. ADD_SX xvec6, xvec10, xvec10;
  1626. MOV_SX xvec5, xvec7;
  1627. MUL_SX xvec0, xvec5, xvec5;
  1628. ADD_SX xvec5, xvec9, xvec9;
  1629. LD_SX 0*SIZE(ptrba), xvec0;
  1630. MUL_SX xvec1, xvec7, xvec7;
  1631. ADD_SX xvec7, xvec8, xvec8;
  1632. LD_SX 4*SIZE(ptrba), xvec1;
  1633. DECQ k;
  1634. JG .L211_bodyB;
  1635. ALIGN_4
  1636. .L211_loopE:
  1637. #ifndef TRMMKERNEL
1638. TEST $2, bk;
  1639. #else
  1640. TEST $2, kkk;
  1641. #endif
  1642. JLE .L212_loopE;
  1643. ALIGN_4
  1644. .L212_bodyB:
  1645. #### Unroll time 1 ####
  1646. ODUP_SX 0*SIZE(ptrbb), xvec3;
  1647. SHUF_SX $0x4e, xvec2, xvec4;
  1648. MOV_SX xvec2, xvec6;
  1649. MUL_SX xvec0, xvec2, xvec2;
  1650. ADD_SX xvec2, xvec15, xvec15;
  1651. MUL_SX xvec1, xvec6, xvec6;
  1652. ADD_SX xvec6, xvec14, xvec14;
  1653. SHUF_SX $0x4e, xvec3, xvec5;
  1654. MOV_SX xvec3, xvec7;
  1655. MUL_SX xvec0, xvec3, xvec3;
  1656. ADD_SX xvec3, xvec13, xvec13;
  1657. MUL_SX xvec1, xvec7, xvec7;
  1658. ADD_SX xvec7, xvec12, xvec12;
  1659. EDUP_SX 4*SIZE(ptrbb), xvec2;
  1660. MOV_SX xvec4, xvec6;
  1661. MUL_SX xvec0, xvec4, xvec4;
  1662. ADD_SX xvec4, xvec11, xvec11;
  1663. ADDQ $8*SIZE, ptrbb;
  1664. MUL_SX xvec1, xvec6, xvec6;
  1665. ADD_SX xvec6, xvec10, xvec10;
  1666. MOV_SX xvec5, xvec7;
  1667. MUL_SX xvec0, xvec5, xvec5;
  1668. ADD_SX xvec5, xvec9, xvec9;
  1669. LD_SX 8*SIZE(ptrba), xvec0;
  1670. MUL_SX xvec1, xvec7, xvec7;
  1671. ADD_SX xvec7, xvec8, xvec8;
  1672. LD_SX 12*SIZE(ptrba), xvec1;
  1673. ADDQ $16*SIZE, ptrba;
  1674. #### Unroll time 2 ####
  1675. ODUP_SX -4*SIZE(ptrbb), xvec3;
  1676. SHUF_SX $0x4e, xvec2, xvec4;
  1677. MOV_SX xvec2, xvec6;
  1678. MUL_SX xvec0, xvec2, xvec2;
  1679. ADD_SX xvec2, xvec15, xvec15;
  1680. MUL_SX xvec1, xvec6, xvec6;
  1681. ADD_SX xvec6, xvec14, xvec14;
  1682. SHUF_SX $0x4e, xvec3, xvec5;
  1683. MOV_SX xvec3, xvec7;
  1684. MUL_SX xvec0, xvec3, xvec3;
  1685. ADD_SX xvec3, xvec13, xvec13;
  1686. MUL_SX xvec1, xvec7, xvec7;
  1687. ADD_SX xvec7, xvec12, xvec12;
  1688. EDUP_SX 0*SIZE(ptrbb), xvec2;
  1689. MOV_SX xvec4, xvec6;
  1690. MUL_SX xvec0, xvec4, xvec4;
  1691. ADD_SX xvec4, xvec11, xvec11;
  1692. MUL_SX xvec1, xvec6, xvec6;
  1693. ADD_SX xvec6, xvec10, xvec10;
  1694. MOV_SX xvec5, xvec7;
  1695. MUL_SX xvec0, xvec5, xvec5;
  1696. ADD_SX xvec5, xvec9, xvec9;
  1697. LD_SX 0*SIZE(ptrba), xvec0;
  1698. MUL_SX xvec1, xvec7, xvec7;
  1699. ADD_SX xvec7, xvec8, xvec8;
  1700. LD_SX 4*SIZE(ptrba), xvec1;
  1701. .L212_loopE:
  1702. #ifndef TRMMKERNEL
  1703. TEST $1, bk;
  1704. #else
  1705. TEST $1, kkk;
  1706. #endif
  1707. JLE .L213_loopE;
  1708. ALIGN_4
  1709. .L213_bodyB:
  1710. ODUP_SX 0*SIZE(ptrbb), xvec3;
  1711. SHUF_SX $0x4e, xvec2, xvec4;
  1712. MOV_SX xvec2, xvec6;
  1713. MUL_SX xvec0, xvec2, xvec2;
  1714. ADD_SX xvec2, xvec15, xvec15;
  1715. ADDQ $4*SIZE, ptrbb;
  1716. SHUF_SX $0x4e, xvec3, xvec5;
  1717. MUL_SX xvec1, xvec6, xvec6;
  1718. ADD_SX xvec6, xvec14, xvec14;
  1719. MOV_SX xvec3, xvec7;
  1720. MUL_SX xvec0, xvec3, xvec3;
  1721. ADD_SX xvec3, xvec13, xvec13;
  1722. MUL_SX xvec1, xvec7, xvec7;
  1723. ADD_SX xvec7, xvec12, xvec12;
  1724. MOV_SX xvec4, xvec6;
  1725. ADDQ $8*SIZE, ptrba;
  1726. MUL_SX xvec0, xvec4, xvec4;
  1727. ADD_SX xvec4, xvec11, xvec11;
  1728. MUL_SX xvec1, xvec6, xvec6;
  1729. ADD_SX xvec6, xvec10, xvec10;
  1730. MOV_SX xvec5, xvec7;
  1731. MUL_SX xvec0, xvec5, xvec5;
  1732. ADD_SX xvec5, xvec9, xvec9;
  1733. MUL_SX xvec1, xvec7, xvec7;
  1734. ADD_SX xvec7, xvec8, xvec8;
  1735. .L213_loopE:
  1736. #### Multiply Alpha ####
  1737. BROAD_SX MEMALPHA, xvec7;
  1738. MUL_SX xvec7, xvec15, xvec15;
  1739. MUL_SX xvec7, xvec14, xvec14;
  1740. MUL_SX xvec7, xvec13, xvec13;
  1741. MUL_SX xvec7, xvec12, xvec12;
  1742. MUL_SX xvec7, xvec11, xvec11;
  1743. MUL_SX xvec7, xvec10, xvec10;
  1744. MUL_SX xvec7, xvec9, xvec9;
  1745. MUL_SX xvec7, xvec8, xvec8;
  1746. #### Writing Back ####
  1747. #ifndef TRMMKERNEL
  1748. LDL_SX 0*SIZE(C0), xvec0, xvec0;
  1749. LDH_SX 2*SIZE(C1), xvec0, xvec0;
  1750. LDL_SX 4*SIZE(C0), xvec1, xvec1;
  1751. LDH_SX 6*SIZE(C1), xvec1, xvec1;
  1752. LDL_SX 0*SIZE(C0, ldc, 1), xvec2, xvec2;
  1753. LDH_SX 2*SIZE(C1, ldc, 1), xvec2, xvec2;
  1754. LDL_SX 4*SIZE(C0, ldc, 1), xvec3, xvec3;
  1755. LDH_SX 6*SIZE(C1, ldc, 1), xvec3, xvec3;
  1756. LDL_SX 0*SIZE(C1), xvec4, xvec4;
  1757. LDH_SX 2*SIZE(C0), xvec4, xvec4;
  1758. LDL_SX 4*SIZE(C1), xvec5, xvec5;
  1759. LDH_SX 6*SIZE(C0), xvec5, xvec5;
  1760. LDL_SX 0*SIZE(C1, ldc, 1), xvec6, xvec6;
  1761. LDH_SX 2*SIZE(C0, ldc, 1), xvec6, xvec6;
  1762. LDL_SX 4*SIZE(C1, ldc, 1), xvec7, xvec7;
  1763. LDH_SX 6*SIZE(C0, ldc, 1), xvec7, xvec7;
  1764. ADD_SX xvec0, xvec15, xvec15;
  1765. ADD_SX xvec1, xvec14, xvec14;
  1766. ADD_SX xvec2, xvec13, xvec13;
  1767. ADD_SX xvec3, xvec12, xvec12;
  1768. ADD_SX xvec4, xvec11, xvec11;
  1769. ADD_SX xvec5, xvec10, xvec10;
  1770. ADD_SX xvec6, xvec9, xvec9;
  1771. ADD_SX xvec7, xvec8, xvec8;
  1772. #endif
  1773. STL_SX xvec15, 0*SIZE(C0);
  1774. STH_SX xvec15, 2*SIZE(C1);
  1775. STL_SX xvec14, 4*SIZE(C0);
  1776. STH_SX xvec14, 6*SIZE(C1);
  1777. STL_SX xvec13, 0*SIZE(C0, ldc, 1);
  1778. STH_SX xvec13, 2*SIZE(C1, ldc, 1);
  1779. STL_SX xvec12, 4*SIZE(C0, ldc, 1);
  1780. STH_SX xvec12, 6*SIZE(C1, ldc, 1);
  1781. STL_SX xvec11, 0*SIZE(C1);
  1782. STH_SX xvec11, 2*SIZE(C0);
  1783. STL_SX xvec10, 4*SIZE(C1);
  1784. STH_SX xvec10, 6*SIZE(C0);
  1785. STL_SX xvec9, 0*SIZE(C1, ldc, 1);
  1786. STH_SX xvec9, 2*SIZE(C0, ldc, 1);
  1787. STL_SX xvec8, 4*SIZE(C1, ldc, 1);
  1788. STH_SX xvec8, 6*SIZE(C0, ldc, 1);
  1789. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) ||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1790. MOVQ bk, %rax;
  1791. SUBQ kkk, %rax;
  1792. LEAQ (, %rax, SIZE), %rax;
  1793. LEAQ (ptrba, %rax, 8), ptrba;
  1794. LEAQ (ptrbb, %rax, 4), ptrbb;
  1795. #endif
  1796. #if defined(TRMMKERNEL) && defined(LEFT)
  1797. ADDQ $8, kk
  1798. #endif
  1799. ADDQ $8*SIZE, C0;
  1800. ADDQ $8*SIZE, C1;
  1801. DECQ i;
  1802. JG .L21_bodyB;
  1803. ALIGN_4
  1804. .L21_loopE:
  1805. TEST $4, bm;
  1806. JLE .L22_loopE;
  1807. ALIGN_4
  1808. .L22_bodyB:
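#### 4x4 tile; accumulators xvec12-xvec15 ####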
  1809. #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  1810. MOVQ bb, ptrbb;
  1811. #else
  1812. MOVQ bb, ptrbb;
  1813. MOVQ kk, %rax;
  1814. LEAQ (, %rax, SIZE), %rax;
  1815. LEAQ (ptrba, %rax, 4), ptrba;
  1816. LEAQ (ptrbb, %rax, 4), ptrbb;
  1817. #endif
  1818. #### Initial Results ####
  1819. XOR_SY yvec15, yvec15, yvec15;
  1820. XOR_SY yvec14, yvec14, yvec14;
  1821. XOR_SY yvec13, yvec13, yvec13;
  1822. XOR_SY yvec12, yvec12, yvec12;
  1823. #ifndef TRMMKERNEL
  1824. MOVQ bk, k;
  1825. #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
  1826. MOVQ bk, %rax;
  1827. SUBQ kk, %rax;
  1828. MOVQ %rax, kkk;
  1829. #else
  1830. MOVQ kk, %rax;
  1831. #ifdef LEFT
  1832. ADDQ $4, %rax;
  1833. #else
  1834. ADDQ $4, %rax;
  1835. #endif
  1836. MOVQ %rax, kkk;
  1837. #endif
  1838. SARQ $2, k;
  1839. JLE .L221_loopE;
  1840. ALIGN_4
  1841. .L221_bodyB:
  1842. LD_SX 0*SIZE(ptrba), xvec0;
  1843. EDUP_SX 0*SIZE(ptrbb), xvec2;
  1844. ODUP_SX 0*SIZE(ptrbb), xvec3;
  1845. SHUF_SX $0x4e, xvec2, xvec4;
  1846. MUL_SX xvec0, xvec2, xvec2;
  1847. ADD_SX xvec2, xvec15, xvec15;
  1848. SHUF_SX $0x4e, xvec3, xvec5;
  1849. MUL_SX xvec0, xvec3, xvec3;
  1850. ADD_SX xvec3, xvec14, xvec14;
  1851. MUL_SX xvec0, xvec4, xvec4;
  1852. ADD_SX xvec4, xvec13, xvec13;
  1853. MUL_SX xvec0, xvec5, xvec5;
  1854. ADD_SX xvec5, xvec12, xvec12;
  1855. LD_SX 4*SIZE(ptrba), xvec1;
  1856. EDUP_SX 4*SIZE(ptrbb), xvec2;
  1857. ODUP_SX 4*SIZE(ptrbb), xvec3;
  1858. SHUF_SX $0x4e, xvec2, xvec4;
  1859. MUL_SX xvec1, xvec2, xvec2;
  1860. ADD_SX xvec2, xvec15, xvec15;
  1861. SHUF_SX $0x4e, xvec3, xvec5;
  1862. MUL_SX xvec1, xvec3, xvec3;
  1863. ADD_SX xvec3, xvec14, xvec14;
  1864. MUL_SX xvec1, xvec4, xvec4;
  1865. ADD_SX xvec4, xvec13, xvec13;
  1866. MUL_SX xvec1, xvec5, xvec5;
  1867. ADD_SX xvec5, xvec12, xvec12;
  1868. LD_SX 8*SIZE(ptrba), xvec0;
  1869. EDUP_SX 8*SIZE(ptrbb), xvec2;
  1870. ODUP_SX 8*SIZE(ptrbb), xvec3;
  1871. SHUF_SX $0x4e, xvec2, xvec4;
  1872. MUL_SX xvec0, xvec2, xvec2;
  1873. ADD_SX xvec2, xvec15, xvec15;
  1874. SHUF_SX $0x4e, xvec3, xvec5;
  1875. MUL_SX xvec0, xvec3, xvec3;
  1876. ADD_SX xvec3, xvec14, xvec14;
  1877. MUL_SX xvec0, xvec4, xvec4;
  1878. ADD_SX xvec4, xvec13, xvec13;
  1879. MUL_SX xvec0, xvec5, xvec5;
  1880. ADD_SX xvec5, xvec12, xvec12;
  1881. LD_SX 12*SIZE(ptrba), xvec1;
  1882. EDUP_SX 12*SIZE(ptrbb), xvec2;
  1883. ODUP_SX 12*SIZE(ptrbb), xvec3;
  1884. SHUF_SX $0x4e, xvec2, xvec4;
  1885. MUL_SX xvec1, xvec2, xvec2;
1886. ADD_SX xvec2, xvec15, xvec15;
  1887. SHUF_SX $0x4e, xvec3, xvec5;
  1888. MUL_SX xvec1, xvec3, xvec3;
  1889. ADD_SX xvec3, xvec14, xvec14;
  1890. MUL_SX xvec1, xvec4, xvec4;
  1891. ADD_SX xvec4, xvec13, xvec13;
  1892. MUL_SX xvec1, xvec5, xvec5;
  1893. ADD_SX xvec5, xvec12, xvec12;
  1894. ADDQ $16*SIZE, ptrba;
  1895. ADDQ $16*SIZE, ptrbb;
  1896. DECQ k;
  1897. JG .L221_bodyB;
  1898. ALIGN_4
  1899. .L221_loopE:
  1900. #ifndef TRMMKERNEL
  1901. TEST $2, bk;
  1902. #else
  1903. TEST $2, kkk;
  1904. #endif
  1905. JLE .L222_loopE;
  1906. ALIGN_4
  1907. .L222_bodyB:
  1908. LD_SX 0*SIZE(ptrba), xvec0;
  1909. EDUP_SX 0*SIZE(ptrbb), xvec2;
  1910. ODUP_SX 0*SIZE(ptrbb), xvec3;
  1911. SHUF_SX $0x4e, xvec2, xvec4;
  1912. MUL_SX xvec0, xvec2, xvec2;
  1913. ADD_SX xvec2, xvec15, xvec15;
  1914. SHUF_SX $0x4e, xvec3, xvec5;
  1915. MUL_SX xvec0, xvec3, xvec3;
  1916. ADD_SX xvec3, xvec14, xvec14;
  1917. MUL_SX xvec0, xvec4, xvec4;
  1918. ADD_SX xvec4, xvec13, xvec13;
  1919. MUL_SX xvec0, xvec5, xvec5;
  1920. ADD_SX xvec5, xvec12, xvec12;
  1921. LD_SX 4*SIZE(ptrba), xvec1;
  1922. EDUP_SX 4*SIZE(ptrbb), xvec2;
  1923. ODUP_SX 4*SIZE(ptrbb), xvec3;
  1924. SHUF_SX $0x4e, xvec2, xvec4;
  1925. MUL_SX xvec1, xvec2, xvec2;
  1926. ADD_SX xvec2, xvec15, xvec15;
  1927. SHUF_SX $0x4e, xvec3, xvec5;
  1928. MUL_SX xvec1, xvec3, xvec3;
  1929. ADD_SX xvec3, xvec14, xvec14;
  1930. MUL_SX xvec1, xvec4, xvec4;
1931. ADD_SX xvec4, xvec13, xvec13;
  1932. MUL_SX xvec1, xvec5, xvec5;
  1933. ADD_SX xvec5, xvec12, xvec12;
  1934. ADDQ $8*SIZE, ptrba;
  1935. ADDQ $8*SIZE, ptrbb;
  1936. .L222_loopE:
  1937. #ifndef TRMMKERNEL
  1938. TEST $1, bk;
  1939. #else
  1940. TEST $1, kkk;
  1941. #endif
  1942. JLE .L223_loopE;
  1943. ALIGN_4
  1944. .L223_bodyB:
  1945. LD_SX 0*SIZE(ptrba), xvec0;
  1946. EDUP_SX 0*SIZE(ptrbb), xvec2;
  1947. ODUP_SX 0*SIZE(ptrbb), xvec3;
  1948. SHUF_SX $0x4e, xvec2, xvec4;
  1949. MUL_SX xvec0, xvec2, xvec2;
  1950. ADD_SX xvec2, xvec15, xvec15;
  1951. SHUF_SX $0x4e, xvec3, xvec5;
  1952. MUL_SX xvec0, xvec3, xvec3;
  1953. ADD_SX xvec3, xvec14, xvec14;
  1954. MUL_SX xvec0, xvec4, xvec4;
  1955. ADD_SX xvec4, xvec13, xvec13;
  1956. MUL_SX xvec0, xvec5, xvec5;
  1957. ADD_SX xvec5, xvec12, xvec12;
  1958. ADDQ $4*SIZE, ptrba;
  1959. ADDQ $4*SIZE, ptrbb;
  1960. .L223_loopE:
  1961. #### Multiply Alpha ####
  1962. BROAD_SX MEMALPHA, xvec7;
  1963. MUL_SX xvec7, xvec15, xvec15;
  1964. MUL_SX xvec7, xvec14, xvec14;
  1965. MUL_SX xvec7, xvec13, xvec13;
  1966. MUL_SX xvec7, xvec12, xvec12;
  1967. #### Writing back ####
  1968. #ifndef TRMMKERNEL
  1969. LDL_SX 0*SIZE(C0), xvec0, xvec0;
  1970. LDH_SX 2*SIZE(C1), xvec0, xvec0;
  1971. LDL_SX 0*SIZE(C0, ldc, 1), xvec1, xvec1;
  1972. LDH_SX 2*SIZE(C1, ldc, 1), xvec1, xvec1;
  1973. LDL_SX 0*SIZE(C1), xvec2, xvec2;
  1974. LDH_SX 2*SIZE(C0), xvec2, xvec2;
  1975. LDL_SX 0*SIZE(C1, ldc, 1), xvec3, xvec3;
  1976. LDH_SX 2*SIZE(C0, ldc, 1), xvec3, xvec3;
  1977. ADD_SX xvec0, xvec15, xvec15;
  1978. ADD_SX xvec1, xvec14, xvec14;
  1979. ADD_SX xvec2, xvec13, xvec13;
  1980. ADD_SX xvec3, xvec12, xvec12;
  1981. #endif
  1982. STL_SX xvec15, 0*SIZE(C0);
  1983. STH_SX xvec15, 2*SIZE(C1);
  1984. STL_SX xvec14, 0*SIZE(C0, ldc, 1);
  1985. STH_SX xvec14, 2*SIZE(C1, ldc, 1);
  1986. STL_SX xvec13, 0*SIZE(C1);
  1987. STH_SX xvec13, 2*SIZE(C0);
  1988. STL_SX xvec12, 0*SIZE(C1, ldc, 1);
  1989. STH_SX xvec12, 2*SIZE(C0, ldc, 1);
  1990. #if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  1991. MOVQ bk, %rax;
  1992. SUBQ kkk, %rax;
  1993. LEAQ (, %rax, SIZE), %rax;
  1994. LEAQ (ptrba, %rax, 4), ptrba;
  1995. LEAQ (ptrbb, %rax, 4), ptrbb;
  1996. #endif
  1997. #if defined(TRMMKERNEL)&&defined(LEFT)
  1998. ADDQ $4, kk
  1999. #endif
  2000. ADDQ $4*SIZE, C0;
  2001. ADDQ $4*SIZE, C1;
  2002. .L22_loopE:
  2003. TEST $2, bm;
  2004. JLE .L23_loopE;
  2005. ALIGN_4
  2006. .L23_bodyB:
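#### 2x4 tile; xvec15 accumulates columns 0/2 and xvec14 columns 1/3 (two rows each) ####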
  2007. #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  2008. MOVQ bb, ptrbb;
  2009. #else
  2010. MOVQ bb, ptrbb;
  2011. MOVQ kk, %rax;
  2012. LEAQ (, %rax, SIZE), %rax;
  2013. LEAQ (ptrba, %rax, 2), ptrba;
2014. LEAQ (ptrbb, %rax, 4), ptrbb;
  2015. #endif
  2016. #### Initial ####
  2017. XOR_SY yvec15, yvec15, yvec15;
  2018. XOR_SY yvec14, yvec14, yvec14;
  2019. #ifndef TRMMKERNEL
  2020. MOVQ bk, k;
  2021. #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
  2022. MOVQ bk, %rax;
  2023. SUBQ kk, %rax;
  2024. MOVQ %rax, kkk;
  2025. #else
  2026. MOVQ kk, %rax;
  2027. #ifdef LEFT
  2028. ADDQ $2, %rax;
  2029. #else
  2030. ADDQ $4, %rax;
  2031. #endif
  2032. MOVQ %rax, kkk;
  2033. #endif
  2034. SARQ $2, k;
  2035. JLE .L231_loopE;
  2036. ALIGN_4
  2037. .L231_bodyB:
  2038. LD_SX 0*SIZE(ptrba), xvec0;
  2039. EDUP_SX 0*SIZE(ptrbb), xvec4;
  2040. ODUP_SX 0*SIZE(ptrbb), xvec5;
  2041. SHUF_SX $0x44, xvec0, xvec1;
  2042. MUL_SX xvec1, xvec4, xvec4;
  2043. ADD_SX xvec4, xvec15, xvec15;
  2044. MUL_SX xvec1, xvec5, xvec5;
  2045. ADD_SX xvec5, xvec14, xvec14;
  2046. SHUF_SX $0xee, xvec0, xvec2;
  2047. EDUP_SX 4*SIZE(ptrbb), xvec6;
  2048. ODUP_SX 4*SIZE(ptrbb), xvec7;
  2049. MUL_SX xvec2, xvec6, xvec6;
  2050. ADD_SX xvec6, xvec15, xvec15;
  2051. MUL_SX xvec2, xvec7, xvec7;
  2052. ADD_SX xvec7, xvec14, xvec14;
  2053. LD_SX 4*SIZE(ptrba), xvec0;
  2054. EDUP_SX 8*SIZE(ptrbb), xvec4;
  2055. ODUP_SX 8*SIZE(ptrbb), xvec5;
  2056. SHUF_SX $0x44, xvec0, xvec1;
  2057. MUL_SX xvec1, xvec4, xvec4;
  2058. ADD_SX xvec4, xvec15, xvec15;
  2059. MUL_SX xvec1, xvec5, xvec5;
  2060. ADD_SX xvec5, xvec14, xvec14;
  2061. SHUF_SX $0xee, xvec0, xvec2;
  2062. EDUP_SX 12*SIZE(ptrbb), xvec6;
  2063. ODUP_SX 12*SIZE(ptrbb), xvec7;
  2064. MUL_SX xvec2, xvec6, xvec6;
  2065. ADD_SX xvec6, xvec15, xvec15;
  2066. MUL_SX xvec2, xvec7, xvec7;
  2067. ADD_SX xvec7, xvec14, xvec14;
  2068. ADDQ $8*SIZE, ptrba;
  2069. ADDQ $16*SIZE, ptrbb;
  2070. DECQ k;
  2071. JG .L231_bodyB;
  2072. ALIGN_4
  2073. .L231_loopE:
  2074. #ifndef TRMMKERNEL
  2075. TEST $2, bk;
  2076. #else
  2077. TEST $2, kkk;
  2078. #endif
  2079. JLE .L232_loopE;
  2080. ALIGN_4
  2081. .L232_bodyB:
  2082. LD_SX 0*SIZE(ptrba), xvec0;
  2083. EDUP_SX 0*SIZE(ptrbb), xvec4;
  2084. ODUP_SX 0*SIZE(ptrbb), xvec5;
  2085. SHUF_SX $0x44, xvec0, xvec1;
  2086. MUL_SX xvec1, xvec4, xvec4;
  2087. ADD_SX xvec4, xvec15, xvec15;
  2088. MUL_SX xvec1, xvec5, xvec5;
  2089. ADD_SX xvec5, xvec14, xvec14;
  2090. SHUF_SX $0xee, xvec0, xvec2;
  2091. EDUP_SX 4*SIZE(ptrbb), xvec6;
  2092. ODUP_SX 4*SIZE(ptrbb), xvec7;
  2093. MUL_SX xvec2, xvec6, xvec6;
  2094. ADD_SX xvec6, xvec15, xvec15;
  2095. MUL_SX xvec2, xvec7, xvec7;
  2096. ADD_SX xvec7, xvec14, xvec14;
  2097. ADDQ $4*SIZE, ptrba;
  2098. ADDQ $8*SIZE, ptrbb;
  2099. .L232_loopE:
  2100. #ifndef TRMMKERNEL
  2101. TEST $1, bk;
  2102. #else
  2103. TEST $1, kkk;
  2104. #endif
  2105. JLE .L233_loopE;
  2106. ALIGN_4
  2107. .L233_bodyB:
  2108. LD_SX 0*SIZE(ptrba), xvec0;
  2109. EDUP_SX 0*SIZE(ptrbb), xvec4;
  2110. ODUP_SX 0*SIZE(ptrbb), xvec5;
  2111. SHUF_SX $0x44, xvec0, xvec1;
  2112. MUL_SX xvec1, xvec4, xvec4;
  2113. ADD_SX xvec4, xvec15, xvec15;
  2114. MUL_SX xvec1, xvec5, xvec5;
  2115. ADD_SX xvec5, xvec14, xvec14;
  2116. ADDQ $2*SIZE, ptrba;
  2117. ADDQ $4*SIZE, ptrbb;
  2118. .L233_loopE:
  2119. #### Multiply Alpha ####
  2120. BROAD_SY MEMALPHA, yvec7;
  2121. MUL_SY xvec7, xvec15, xvec15;
  2122. MUL_SY xvec7, xvec14, xvec14;
  2123. #### Writing Back ####
  2124. SHUF_SX $0xee, xvec15, xvec13;
  2125. SHUF_SX $0xee, xvec14, xvec12;
  2126. #ifndef TRMMKERNEL
  2127. ADD_SY 0*SIZE(C0), xvec15, xvec15;
  2128. ADD_SY 0*SIZE(C0, ldc, 1), xvec14, xvec14;
  2129. ADD_SY 0*SIZE(C1), xvec13, xvec13;
  2130. ADD_SY 0*SIZE(C1, ldc, 1), xvec12, xvec12;
  2131. #endif
  2132. STL_SY xvec15, 0*SIZE(C0);
  2133. STL_SY xvec14, 0*SIZE(C0, ldc, 1);
  2134. STL_SY xvec13, 0*SIZE(C1);
  2135. STL_SY xvec12, 0*SIZE(C1, ldc, 1);
  2136. #if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  2137. MOVQ bk, %rax;
  2138. SUBQ kkk, %rax;
  2139. LEAQ (,%rax, SIZE), %rax;
  2140. LEAQ (ptrba, %rax, 2), ptrba;
  2141. LEAQ (ptrbb, %rax, 4), ptrbb;
  2142. #endif
  2143. #if defined(TRMMKERNEL) && defined(LEFT)
  2144. ADDQ $2, kk
  2145. #endif
  2146. ADDQ $2*SIZE, C0;
  2147. ADDQ $2*SIZE, C1;
  2148. .L23_loopE:
  2149. TEST $1, bm;
  2150. JLE .L24_loopE;
  2151. ALIGN_4
  2152. .L24_bodyB:
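#### 1x4 tile: one A element broadcast against 4 B values per k step; accumulator xvec15 ####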
  2153. #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  2154. MOVQ bb, ptrbb;
  2155. #else
  2156. MOVQ bb, ptrbb;
  2157. MOVQ kk, %rax;
  2158. LEAQ (,%rax, SIZE), %rax;
  2159. ADDQ %rax, ptrba;
  2160. LEAQ (ptrbb, %rax, 4), ptrbb;
  2161. #endif
  2162. #### Initial ####
  2163. XOR_SY yvec15, yvec15, yvec15;
  2164. #ifndef TRMMKERNEL
  2165. MOVQ bk, k;
  2166. #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
  2167. MOVQ bk, %rax;
  2168. SUBQ kk, %rax;
  2169. MOVQ %rax, kkk;
  2170. #else
  2171. MOVQ kk, %rax;
  2172. #ifdef LEFT
  2173. ADDQ $1, %rax;
  2174. #else
  2175. ADDQ $4, %rax;
  2176. #endif
  2177. MOVQ %rax, kkk;
  2178. #endif
  2179. SARQ $2, k;
  2180. JLE .L241_loopE;
  2181. ALIGN_4
  2182. .L241_bodyB:
  2183. BROAD_SX 0*SIZE(ptrba), xvec0;
  2184. LD_SX 0*SIZE(ptrbb), xvec1;
  2185. MUL_SX xvec0, xvec1, xvec1;
  2186. ADD_SX xvec1, xvec15, xvec15;
  2187. BROAD_SX 1*SIZE(ptrba), xvec2;
  2188. LD_SX 4*SIZE(ptrbb), xvec3;
  2189. MUL_SX xvec2, xvec3, xvec3;
  2190. ADD_SX xvec3, xvec15, xvec15;
  2191. BROAD_SX 2*SIZE(ptrba), xvec4;
  2192. LD_SX 8*SIZE(ptrbb), xvec5;
  2193. MUL_SX xvec4, xvec5, xvec5;
  2194. ADD_SX xvec5, xvec15, xvec15;
  2195. BROAD_SX 3*SIZE(ptrba), xvec6;
  2196. LD_SX 12*SIZE(ptrbb), xvec7;
  2197. MUL_SX xvec6, xvec7, xvec7;
  2198. ADD_SX xvec7, xvec15, xvec15;
  2199. ADDQ $4*SIZE, ptrba;
  2200. ADDQ $16*SIZE, ptrbb;
  2201. DECQ k;
  2202. JG .L241_bodyB;
  2203. ALIGN_4
  2204. .L241_loopE:
  2205. #ifndef TRMMKERNEL
  2206. TEST $2, bk;
  2207. #else
  2208. TEST $2, kkk;
  2209. #endif
  2210. JLE .L242_loopE;
  2211. ALIGN_4
  2212. .L242_bodyB:
  2213. BROAD_SX 0*SIZE(ptrba), xvec0;
  2214. LD_SX 0*SIZE(ptrbb), xvec1;
  2215. MUL_SX xvec0, xvec1, xvec1;
  2216. ADD_SX xvec1, xvec15, xvec15;
  2217. BROAD_SX 1*SIZE(ptrba), xvec2;
  2218. LD_SX 4*SIZE(ptrbb), xvec3;
  2219. MUL_SX xvec2, xvec3, xvec3;
  2220. ADD_SX xvec3, xvec15, xvec15;
  2221. ADDQ $2*SIZE, ptrba;
  2222. ADDQ $8*SIZE, ptrbb;
  2223. .L242_loopE:
  2224. #ifndef TRMMKERNEL
  2225. TEST $1, bk;
  2226. #else
  2227. TEST $1, kkk;
  2228. #endif
  2229. JLE .L243_loopE;
  2230. ALIGN_4;
  2231. .L243_bodyB:
  2232. BROAD_SX 0*SIZE(ptrba), xvec0;
  2233. LD_SX 0*SIZE(ptrbb), xvec1;
  2234. MUL_SX xvec0, xvec1, xvec1;
  2235. ADD_SX xvec1, xvec15, xvec15;
  2236. ADDQ $1*SIZE, ptrba;
  2237. ADDQ $4*SIZE, ptrbb;
  2238. .L243_loopE:
  2239. #### Multiply Alpha ####
  2240. BROAD_SX MEMALPHA, xvec7;
  2241. MUL_SX xvec7, xvec15, xvec15;
  2242. SHUF_SX $0xff, xvec15, xvec14;
  2243. SHUF_SX $0xaa, xvec15, xvec13;
  2244. SHUF_SX $0x55, xvec15, xvec12;
  2245. SHUF_SX $0x00, xvec15, xvec11;
  2246. #ifndef TRMMKERNEL
  2247. addss 0*SIZE(C0), xvec11;
  2248. addss 0*SIZE(C0, ldc, 1), xvec12;
  2249. addss 0*SIZE(C1), xvec13;
  2250. addss 0*SIZE(C1, ldc, 1), xvec14;
  2251. #endif
  2252. movss xvec11, 0*SIZE(C0);
  2253. movss xvec12, 0*SIZE(C0, ldc, 1);
  2254. movss xvec13, 0*SIZE(C1);
  2255. movss xvec14, 0*SIZE(C1, ldc, 1);
  2256. #if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  2257. MOVQ bk, %rax;
  2258. SUBQ kkk, %rax;
  2259. LEAQ (,%rax, SIZE), %rax;
  2260. ADDQ %rax, ptrba;
  2261. LEAQ (ptrbb, %rax, 4), ptrbb;
  2262. #endif
  2263. #if defined(TRMMKERNEL)&&defined(LEFT)
  2264. ADDQ $1, kk
  2265. #endif
  2266. ADDQ $1*SIZE, C0;
  2267. ADDQ $1*SIZE, C1;
  2268. .L24_loopE:
  2269. #if defined(TRMMKERNEL)&&!defined(LEFT)
  2270. ADDQ $4, kk
  2271. #endif
  2272. MOVQ bk, k;
  2273. SALQ $4, k;
  2274. ADDQ k, bb;
  2275. LEAQ (C, ldc, 4), C;
  2276. .L20_loopE:
  2277. TEST $2, bn;
  2278. JLE .L30_loopE;
  2279. ALIGN_4
  2280. .L30_bodyB:
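#### N-tail (bn & 2): remaining 2-column panel of B; C1 = C + ldc ####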
  2281. #if defined(TRMMKERNEL) && defined(LEFT)
  2282. MOVQ OFFSET, %rax;
2283. MOVQ %rax, kk;
  2284. #endif
  2285. MOVQ C, C0;
  2286. LEAQ (C, ldc, 1), C1;
  2287. MOVQ ba, ptrba;
  2288. MOVQ bm, i;
  2289. SARQ $3, i;
  2290. JLE .L31_loopE;
  2291. ALIGN_4
  2292. .L31_bodyB:
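#### 8x2 tile; accumulators xvec12-xvec15 ####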
  2293. #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  2294. MOVQ bb, ptrbb;
  2295. #else
  2296. MOVQ bb, ptrbb;
  2297. MOVQ kk, %rax;
  2298. LEAQ (, %rax, SIZE), %rax;
  2299. LEAQ (ptrba, %rax, 8), ptrba;
  2300. LEAQ (ptrbb, %rax, 2), ptrbb;
  2301. #endif
  2302. #### Initial ####
  2303. XOR_SY yvec15, yvec15, yvec15;
  2304. XOR_SY yvec14, yvec14, yvec14;
  2305. XOR_SY yvec13, yvec13, yvec13;
  2306. XOR_SY yvec12, yvec12, yvec12;
  2307. #ifndef TRMMKERNEL
  2308. MOVQ bk, k;
  2309. #elif (defined(LEFT) && !defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
  2310. MOVQ bk, %rax;
  2311. SUBQ kk, %rax;
  2312. MOVQ %rax, kkk;
  2313. #else
  2314. MOVQ kk, %rax;
  2315. #ifdef LEFT
  2316. ADDQ $8, %rax;
  2317. #else
  2318. ADDQ $2, %rax;
  2319. #endif
  2320. MOVQ %rax, kkk;
  2321. #endif
  2322. SARQ $2, k;
  2323. JLE .L311_loopE;
  2324. ALIGN_4
  2325. .L311_bodyB:
  2326. LD_SX 0*SIZE(ptrbb), xvec2;
  2327. SHUF_SX $0x50, xvec2, xvec3;
  2328. LD_SX 0*SIZE(ptrba), xvec0;
  2329. LD_SX 4*SIZE(ptrba), xvec1;
  2330. MOV_SX xvec3, xvec4;
  2331. MUL_SX xvec0, xvec3, xvec3;
  2332. ADD_SX xvec3, xvec15, xvec15;
  2333. SHUF_SX $0x4e, xvec4, xvec5;
  2334. MUL_SX xvec1, xvec4, xvec4;
  2335. ADD_SX xvec4, xvec14, xvec14;
  2336. MOV_SX xvec5, xvec6;
  2337. MUL_SX xvec0, xvec5, xvec5;
  2338. ADD_SX xvec5, xvec13, xvec13;
  2339. MUL_SX xvec1, xvec6, xvec6;
  2340. ADD_SX xvec6, xvec12, xvec12;
  2341. SHUF_SX $0xfa, xvec2, xvec3;
  2342. LD_SX 8*SIZE(ptrba), xvec0;
  2343. LD_SX 12*SIZE(ptrba), xvec1;
  2344. MOV_SX xvec3, xvec4;
  2345. MUL_SX xvec0, xvec3, xvec3;
  2346. ADD_SX xvec3, xvec15, xvec15;
  2347. SHUF_SX $0x4e, xvec4, xvec5;
  2348. MUL_SX xvec1, xvec4, xvec4;
  2349. ADD_SX xvec4, xvec14, xvec14;
  2350. MOV_SX xvec5, xvec6;
  2351. MUL_SX xvec0, xvec5, xvec5;
  2352. ADD_SX xvec5, xvec13, xvec13;
  2353. MUL_SX xvec1, xvec6, xvec6;
  2354. ADD_SX xvec6, xvec12, xvec12;
  2355. LD_SX 4*SIZE(ptrbb), xvec2;
  2356. SHUF_SX $0x50, xvec2, xvec3;
  2357. LD_SX 16*SIZE(ptrba), xvec0;
  2358. LD_SX 20*SIZE(ptrba), xvec1;
  2359. MOV_SX xvec3, xvec4;
  2360. MUL_SX xvec0, xvec3, xvec3;
  2361. ADD_SX xvec3, xvec15, xvec15;
  2362. SHUF_SX $0x4e, xvec4, xvec5;
  2363. MUL_SX xvec1, xvec4, xvec4;
  2364. ADD_SX xvec4, xvec14, xvec14;
  2365. MOV_SX xvec5, xvec6;
  2366. MUL_SX xvec0, xvec5, xvec5;
  2367. ADD_SX xvec5, xvec13, xvec13;
  2368. MUL_SX xvec1, xvec6, xvec6;
  2369. ADD_SX xvec6, xvec12, xvec12;
  2370. SHUF_SX $0xfa, xvec2, xvec3;
  2371. LD_SX 24*SIZE(ptrba), xvec0;
  2372. LD_SX 28*SIZE(ptrba), xvec1;
  2373. MOV_SX xvec3, xvec4;
  2374. MUL_SX xvec0, xvec3, xvec3;
  2375. ADD_SX xvec3, xvec15, xvec15;
  2376. SHUF_SX $0x4e, xvec4, xvec5;
  2377. MUL_SX xvec1, xvec4, xvec4;
  2378. ADD_SX xvec4, xvec14, xvec14;
  2379. MOV_SX xvec5, xvec6;
  2380. MUL_SX xvec0, xvec5, xvec5;
  2381. ADD_SX xvec5, xvec13, xvec13;
  2382. MUL_SX xvec1, xvec6, xvec6;
  2383. ADD_SX xvec6, xvec12, xvec12;
  2384. ADDQ $32*SIZE, ptrba;
  2385. ADDQ $8*SIZE, ptrbb;
  2386. DECQ k;
  2387. JG .L311_bodyB;
  2388. ALIGN_4
  2389. .L311_loopE:
  2390. #ifndef TRMMKERNEL
  2391. TEST $2, bk;
  2392. #else
  2393. TEST $2, kkk;
  2394. #endif
  2395. JLE .L312_loopE;
  2396. ALIGN_4
  2397. .L312_bodyB:
  2398. LD_SX 0*SIZE(ptrbb), xvec2;
  2399. SHUF_SX $0x50, xvec2, xvec3;
  2400. LD_SX 0*SIZE(ptrba), xvec0;
  2401. LD_SX 4*SIZE(ptrba), xvec1;
  2402. MOV_SX xvec3, xvec4;
  2403. MUL_SX xvec0, xvec3, xvec3;
  2404. ADD_SX xvec3, xvec15, xvec15;
  2405. SHUF_SX $0x4e, xvec4, xvec5;
  2406. MUL_SX xvec1, xvec4, xvec4;
  2407. ADD_SX xvec4, xvec14, xvec14;
  2408. MOV_SX xvec5, xvec6;
  2409. MUL_SX xvec0, xvec5, xvec5;
  2410. ADD_SX xvec5, xvec13, xvec13;
  2411. MUL_SX xvec1, xvec6, xvec6;
  2412. ADD_SX xvec6, xvec12, xvec12;
  2413. SHUF_SX $0xfa, xvec2, xvec3;
  2414. LD_SX 8*SIZE(ptrba), xvec0;
  2415. LD_SX 12*SIZE(ptrba), xvec1;
  2416. MOV_SX xvec3, xvec4;
  2417. MUL_SX xvec0, xvec3, xvec3;
  2418. ADD_SX xvec3, xvec15, xvec15;
  2419. SHUF_SX $0x4e, xvec4, xvec5;
  2420. MUL_SX xvec1, xvec4, xvec4;
  2421. ADD_SX xvec4, xvec14, xvec14;
  2422. MOV_SX xvec5, xvec6;
  2423. MUL_SX xvec0, xvec5, xvec5;
  2424. ADD_SX xvec5, xvec13, xvec13;
  2425. MUL_SX xvec1, xvec6, xvec6;
  2426. ADD_SX xvec6, xvec12, xvec12;
  2427. ADDQ $16*SIZE, ptrba;
  2428. ADDQ $4*SIZE, ptrbb;
  2429. .L312_loopE:
  2430. #ifndef TRMMKERNEL
  2431. TEST $1, bk;
  2432. #else
  2433. TEST $1, kkk;
  2434. #endif
  2435. JLE .L313_loopE;
  2436. ALIGN_4
  2437. .L313_bodyB:
  2438. LD_SX 0*SIZE(ptrbb), xvec2;
  2439. SHUF_SX $0x50, xvec2, xvec3;
  2440. LD_SX 0*SIZE(ptrba), xvec0;
  2441. LD_SX 4*SIZE(ptrba), xvec1;
  2442. MOV_SX xvec3, xvec4;
  2443. MUL_SX xvec0, xvec3, xvec3;
  2444. ADD_SX xvec3, xvec15, xvec15;
  2445. SHUF_SX $0x4e, xvec4, xvec5;
  2446. MUL_SX xvec1, xvec4, xvec4;
  2447. ADD_SX xvec4, xvec14, xvec14;
  2448. MOV_SX xvec5, xvec6;
  2449. MUL_SX xvec0, xvec5, xvec5;
  2450. ADD_SX xvec5, xvec13, xvec13;
  2451. MUL_SX xvec1, xvec6, xvec6;
  2452. ADD_SX xvec6, xvec12, xvec12;
  2453. ADDQ $8*SIZE, ptrba;
  2454. ADDQ $2*SIZE, ptrbb;
  2455. .L313_loopE:
  2456. BROAD_SX MEMALPHA, xvec7;
  2457. MUL_SX xvec7, xvec15, xvec15;
  2458. MUL_SX xvec7, xvec14, xvec14;
  2459. MUL_SX xvec7, xvec13, xvec13;
  2460. MUL_SX xvec7, xvec12, xvec12;
  2461. #### Writing Back ####
  2462. #ifndef TRMMKERNEL
  2463. LDL_SX 0*SIZE(C0), xvec0, xvec0;
  2464. LDH_SX 2*SIZE(C1), xvec0, xvec0;
  2465. LDL_SX 4*SIZE(C0), xvec1, xvec1;
  2466. LDH_SX 6*SIZE(C1), xvec1, xvec1;
  2467. LDL_SX 0*SIZE(C1), xvec2, xvec2;
  2468. LDH_SX 2*SIZE(C0), xvec2, xvec2;
  2469. LDL_SX 4*SIZE(C1), xvec3, xvec3;
  2470. LDH_SX 6*SIZE(C0), xvec3, xvec3;
  2471. ADD_SX xvec0, xvec15, xvec15;
  2472. ADD_SX xvec1, xvec14, xvec14;
  2473. ADD_SX xvec2, xvec13, xvec13;
  2474. ADD_SX xvec3, xvec12, xvec12;
  2475. #endif
  2476. STL_SX xvec15, 0*SIZE(C0);
  2477. STH_SX xvec15, 2*SIZE(C1);
  2478. STL_SX xvec14, 4*SIZE(C0);
  2479. STH_SX xvec14, 6*SIZE(C1);
  2480. STL_SX xvec13, 0*SIZE(C1);
  2481. STH_SX xvec13, 2*SIZE(C0);
  2482. STL_SX xvec12, 4*SIZE(C1);
  2483. STH_SX xvec12, 6*SIZE(C0);
  2484. #if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  2485. MOVQ bk, %rax;
  2486. SUBQ kkk, %rax;
  2487. LEAQ (,%rax, SIZE), %rax;
  2488. LEAQ (ptrba, %rax, 8), ptrba;
  2489. LEAQ (ptrbb, %rax, 2), ptrbb;
  2490. #endif
  2491. #if defined(TRMMKERNEL) && defined(LEFT)
  2492. ADDQ $8, kk
  2493. #endif
  2494. ADDQ $8*SIZE, C0;
  2495. ADDQ $8*SIZE, C1;
  2496. DECQ i;
  2497. JG .L31_bodyB;
  2498. ALIGN_4
  2499. .L31_loopE:
  2500. TEST $4, bm;
  2501. JLE .L32_loopE;
  2502. ALIGN_4
  2503. .L32_bodyB:
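#### 4x2 tile; accumulators xvec14/xvec15 ####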
  2504. #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  2505. MOVQ bb, ptrbb;
  2506. #else
  2507. MOVQ bb, ptrbb;
  2508. MOVQ kk, %rax;
  2509. LEAQ (,%rax, SIZE), %rax;
  2510. LEAQ (ptrba, %rax, 4), ptrba;
  2511. LEAQ (ptrbb, %rax, 2), ptrbb;
  2512. #endif
  2513. #### Initial ####
  2514. XOR_SY yvec15, yvec15, yvec15;
  2515. XOR_SY yvec14, yvec14, yvec14;
  2516. #ifndef TRMMKERNEL
  2517. MOVQ bk, k;
  2518. #elif (defined(LEFT) && !defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
  2519. MOVQ bk, %rax;
  2520. SUBQ kk, %rax;
  2521. MOVQ %rax, kkk;
  2522. #else
  2523. MOVQ kk, %rax;
  2524. #ifdef LEFT
  2525. ADDQ $4, %rax;
  2526. #else
  2527. ADDQ $2, %rax;
  2528. #endif
  2529. MOVQ %rax, kkk;
  2530. #endif
  2531. SARQ $2, k;
  2532. JLE .L321_loopE;
  2533. ALIGN_4
  2534. .L321_bodyB:
  2535. LD_SX 0*SIZE(ptrba), xvec0;
  2536. LD_SX 0*SIZE(ptrbb), xvec2;
  2537. SHUF_SX $0x50, xvec2, xvec3;
  2538. SHUF_SX $0x05, xvec2, xvec4;
  2539. MUL_SX xvec0, xvec3, xvec3;
  2540. ADD_SX xvec3, xvec15, xvec15;
  2541. MUL_SX xvec0, xvec4, xvec4;
  2542. ADD_SX xvec4, xvec14, xvec14;
  2543. LD_SX 4*SIZE(ptrba), xvec0;
  2544. SHUF_SX $0xfa, xvec2, xvec5;
  2545. SHUF_SX $0xaf, xvec2, xvec6;
  2546. MUL_SX xvec0, xvec5, xvec5;
  2547. ADD_SX xvec5, xvec15, xvec15;
  2548. MUL_SX xvec0, xvec6, xvec6;
  2549. ADD_SX xvec6, xvec14, xvec14;
  2550. LD_SX 8*SIZE(ptrba), xvec0;
  2551. LD_SX 4*SIZE(ptrbb), xvec2;
  2552. SHUF_SX $0x50, xvec2, xvec3;
  2553. SHUF_SX $0x05, xvec2, xvec4;
  2554. MUL_SX xvec0, xvec3, xvec3;
  2555. ADD_SX xvec3, xvec15, xvec15;
  2556. MUL_SX xvec0, xvec4, xvec4;
  2557. ADD_SX xvec4, xvec14, xvec14;
  2558. LD_SX 12*SIZE(ptrba), xvec0;
  2559. SHUF_SX $0xfa, xvec2, xvec5;
  2560. SHUF_SX $0xaf, xvec2, xvec6;
  2561. MUL_SX xvec0, xvec5, xvec5;
  2562. ADD_SX xvec5, xvec15, xvec15;
  2563. MUL_SX xvec0, xvec6, xvec6;
  2564. ADD_SX xvec6, xvec14, xvec14;
  2565. ADDQ $16*SIZE, ptrba;
  2566. ADDQ $8*SIZE, ptrbb;
  2567. DECQ k;
  2568. JG .L321_bodyB;
  2569. ALIGN_4
  2570. .L321_loopE:
  2571. #ifndef TRMMKERNEL
  2572. TEST $2, bk;
  2573. #else
  2574. TEST $2, kkk;
  2575. #endif
  2576. JLE .L322_loopE;
  2577. ALIGN_4
  2578. .L322_bodyB:
  2579. LD_SX 0*SIZE(ptrba), xvec0;
  2580. LD_SX 0*SIZE(ptrbb), xvec2;
  2581. SHUF_SX $0x50, xvec2, xvec3;
  2582. SHUF_SX $0x05, xvec2, xvec4;
  2583. MUL_SX xvec0, xvec3, xvec3;
  2584. ADD_SX xvec3, xvec15, xvec15;
  2585. MUL_SX xvec0, xvec4, xvec4;
  2586. ADD_SX xvec4, xvec14, xvec14;
  2587. LD_SX 4*SIZE(ptrba), xvec0;
  2588. SHUF_SX $0xfa, xvec2, xvec5;
  2589. SHUF_SX $0xaf, xvec2, xvec6;
  2590. MUL_SX xvec0, xvec5, xvec5;
  2591. ADD_SX xvec5, xvec15, xvec15;
  2592. MUL_SX xvec0, xvec6, xvec6;
  2593. ADD_SX xvec6, xvec14, xvec14;
  2594. ADDQ $8*SIZE, ptrba;
  2595. ADDQ $4*SIZE, ptrbb;
  2596. .L322_loopE:
  2597. #ifndef TRMMKERNEL
  2598. TEST $1, bk;
  2599. #else
  2600. TEST $1, kkk;
  2601. #endif
  2602. JLE .L323_loopE;
  2603. ALIGN_4
  2604. .L323_bodyB:
  2605. LD_SX 0*SIZE(ptrba), xvec0;
  2606. LD_SX 0*SIZE(ptrbb), xvec2;
  2607. SHUF_SX $0x50, xvec2, xvec3;
  2608. SHUF_SX $0x05, xvec2, xvec4;
  2609. MUL_SX xvec0, xvec3, xvec3;
  2610. ADD_SX xvec3, xvec15, xvec15;
  2611. MUL_SX xvec0, xvec4, xvec4;
  2612. ADD_SX xvec4, xvec14, xvec14;
  2613. ADDQ $4*SIZE, ptrba;
  2614. ADDQ $2*SIZE, ptrbb;
  2615. .L323_loopE:
  2616. BROAD_SX MEMALPHA, xvec7;
  2617. MUL_SX xvec7, xvec15, xvec15;
  2618. MUL_SX xvec7, xvec14, xvec14;
  2619. #### Writing back ####
  2620. #ifndef TRMMKERNEL
  2621. LDL_SX 0*SIZE(C0), xvec0, xvec0;
  2622. LDH_SX 2*SIZE(C1), xvec0, xvec0;
  2623. LDL_SX 0*SIZE(C1), xvec1, xvec1;
  2624. LDH_SX 2*SIZE(C0), xvec1, xvec1;
  2625. ADD_SX xvec0, xvec15, xvec15;
  2626. ADD_SX xvec1, xvec14, xvec14;
  2627. #endif
  2628. STL_SX xvec15, 0*SIZE(C0);
  2629. STH_SX xvec15, 2*SIZE(C1);
  2630. STL_SX xvec14, 0*SIZE(C1);
  2631. STH_SX xvec14, 2*SIZE(C0);
  2632. #if (defined(TRMMKERNEL)&& defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&& !defined(TRANSA))
  2633. MOVQ bk, %rax;
  2634. SUBQ kkk, %rax;
  2635. LEAQ (,%rax, SIZE), %rax;
  2636. LEAQ (ptrba, %rax, 4), ptrba;
  2637. LEAQ (ptrbb, %rax, 2), ptrbb;
  2638. #endif
  2639. #if defined(TRMMKERNEL) && defined(LEFT)
  2640. ADDQ $4, kk
  2641. #endif
  2642. ADDQ $4*SIZE, C0;
  2643. ADDQ $4*SIZE, C1;
  2644. .L32_loopE:
  2645. TEST $2, bm;
  2646. JLE .L33_loopE;
  2647. ALIGN_4
  2648. .L33_bodyB:
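#### 2x2 tile; xvec15 accumulates the C0 column, xvec14 the C1 column (halves folded together at write-back) ####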
  2649. #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  2650. MOVQ bb, ptrbb;
  2651. #else
  2652. MOVQ bb, ptrbb;
  2653. MOVQ kk, %rax;
  2654. LEAQ (,%rax, SIZE), %rax;
  2655. LEAQ (ptrba, %rax, 2), ptrba;
  2656. LEAQ (ptrbb, %rax, 2), ptrbb;
  2657. #endif
  2658. #### Initial ####
  2659. XOR_SY yvec15, yvec15, yvec15;
  2660. XOR_SY yvec14, yvec14, yvec14;
  2661. XOR_SY yvec13, yvec13, yvec13;
  2662. XOR_SY yvec12, yvec12, yvec12;
  2663. #ifndef TRMMKERNEL
  2664. MOVQ bk, k;
  2665. #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
  2666. MOVQ bk, %rax;
  2667. SUBQ kk, %rax;
  2668. MOVQ %rax, kkk;
  2669. #else
  2670. MOVQ kk, %rax;
  2671. #ifdef LEFT
  2672. ADDQ $2, %rax;
  2673. #else
  2674. ADDQ $2, %rax;
  2675. #endif
  2676. MOVQ %rax, kkk;
  2677. #endif
  2678. SARQ $2, k;
  2679. JLE .L331_loopE;
  2680. ALIGN_4
  2681. .L331_bodyB:
  2682. LD_SX 0*SIZE(ptrba), xvec0; # a0, a1, a2, a3
  2683. EDUP_SX 0*SIZE(ptrbb), xvec2; # b0, b0, b2, b2
  2684. ODUP_SX 0*SIZE(ptrbb), xvec3; # b1, b1, b3, b3
  2685. MUL_SX xvec0, xvec2, xvec2;
  2686. ADD_SX xvec2, xvec15, xvec15;
  2687. MUL_SX xvec0, xvec3, xvec3;
  2688. ADD_SX xvec3, xvec14, xvec14;
  2689. LD_SX 4*SIZE(ptrba), xvec0;
  2690. EDUP_SX 4*SIZE(ptrbb), xvec2;
  2691. ODUP_SX 4*SIZE(ptrbb), xvec3;
  2692. MUL_SX xvec0, xvec2, xvec2;
  2693. ADD_SX xvec2, xvec15, xvec15;
  2694. MUL_SX xvec0, xvec3, xvec3;
  2695. ADD_SX xvec3, xvec14, xvec14;
  2696. ADDQ $8*SIZE, ptrba;
  2697. ADDQ $8*SIZE, ptrbb;
  2698. DECQ k;
  2699. JG .L331_bodyB;
  2700. ALIGN_4
  2701. .L331_loopE:
  2702. #ifndef TRMMKERNEL
  2703. TEST $2, bk;
  2704. #else
  2705. TEST $2, kkk;
  2706. #endif
  2707. JLE .L332_loopE;
  2708. ALIGN_4
  2709. .L332_bodyB:
  2710. LD_SX 0*SIZE(ptrba), xvec0; # a0, a1, a2, a3
  2711. EDUP_SX 0*SIZE(ptrbb), xvec2; # b0, b0, b2, b2
  2712. ODUP_SX 0*SIZE(ptrbb), xvec3; # b1, b1, b3, b3
  2713. MUL_SX xvec0, xvec2, xvec2;
  2714. ADD_SX xvec2, xvec15, xvec15;
  2715. MUL_SX xvec0, xvec3, xvec3;
  2716. ADD_SX xvec3, xvec14, xvec14;
  2717. ADDQ $4*SIZE, ptrba;
  2718. ADDQ $4*SIZE, ptrbb;
  2719. .L332_loopE:
  2720. #ifndef TRMMKERNEL
  2721. TEST $1, bk;
  2722. #else
  2723. TEST $1, kkk;
  2724. #endif
  2725. JLE .L333_loopE;
  2726. ALIGN_4
  2727. .L333_bodyB:
  2728. movss 0*SIZE(ptrba), xvec0;
  2729. movss 1*SIZE(ptrba), xvec1;
  2730. movss 0*SIZE(ptrbb), xvec2;
  2731. XOR_SY yvec3, yvec3, yvec3;
  2732. movss xvec2, xvec3;
  2733. mulss xvec0, xvec2;
  2734. addss xvec2, xvec15;
  2735. mulss xvec1, xvec3;
  2736. SHUF_SX $0xe1, xvec3, xvec4;
  2737. ADD_SX xvec4, xvec15, xvec15;
  2738. movss 1*SIZE(ptrbb), xvec5;
  2739. XOR_SY yvec6, yvec6, yvec6;
  2740. movss xvec5, xvec6;
  2741. mulss xvec0, xvec5;
  2742. addss xvec5, xvec14;
  2743. mulss xvec1, xvec6;
  2744. SHUF_SX $0xe1, xvec6, xvec7;
2745. ADD_SX xvec7, xvec14, xvec14;
  2746. ADDQ $2*SIZE, ptrba;
  2747. ADDQ $2*SIZE, ptrbb;
  2748. .L333_loopE:
  2749. BROAD_SX MEMALPHA, xvec7;
  2750. MUL_SX xvec7, xvec15, xvec15;
  2751. MUL_SX xvec7, xvec14, xvec14;
  2752. SHUF_SX $0xee, xvec15, xvec13;
  2753. SHUF_SX $0xee, xvec14, xvec12;
  2754. SHUF_SX $0x44, xvec15, xvec11;
  2755. SHUF_SX $0x44, xvec14, xvec10;
  2756. ADD_SX xvec13, xvec11, xvec11;
  2757. ADD_SX xvec12, xvec10, xvec10;
  2758. #ifndef TRMMKERNEL
  2759. LDL_SX 0*SIZE(C0), xvec0, xvec0;
  2760. LDL_SX 0*SIZE(C1), xvec1, xvec1;
  2761. ADD_SX xvec0, xvec11, xvec11;
  2762. ADD_SX xvec1, xvec10, xvec10;
  2763. #endif
  2764. STL_SX xvec11, 0*SIZE(C0);
  2765. STL_SX xvec10, 0*SIZE(C1);
  2766. #if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  2767. MOVQ bk, %rax;
  2768. SUBQ kkk, %rax;
  2769. LEAQ (,%rax, SIZE), %rax;
  2770. LEAQ (ptrba, %rax, 2), ptrba;
  2771. LEAQ (ptrbb, %rax, 2), ptrbb;
  2772. #endif
  2773. #if defined(TRMMKERNEL) && defined(LEFT)
  2774. ADDQ $2, kk;
  2775. #endif
  2776. ADDQ $2*SIZE, C0;
  2777. ADDQ $2*SIZE, C1;
  2778. #### Writing Back ####
  2779. .L33_loopE:
  2780. TEST $1, bm;
  2781. JLE .L34_loopE;
  2782. ALIGN_4
  2783. .L34_bodyB:
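#### 1x2 tile: scalar k-loop; xvec15 accumulates the C0 entry, xvec14 the C1 entry ####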
  2784. #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  2785. MOVQ bb, ptrbb;
  2786. #else
  2787. MOVQ bb, ptrbb;
  2788. MOVQ kk, %rax;
  2789. LEAQ (, %rax, SIZE), %rax;
  2790. ADDQ %rax, ptrba;
  2791. LEAQ (ptrbb, %rax, 2), ptrbb;
  2792. #endif
  2793. #### Initial ####
  2794. XOR_SY yvec15, yvec15, yvec15;
  2795. XOR_SY yvec14, yvec14, yvec14;
  2796. #ifndef TRMMKERNEL
  2797. MOVQ bk, k;
  2798. #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
  2799. MOVQ bk, %rax;
  2800. SUBQ kk, %rax;
  2801. MOVQ %rax, kkk;
  2802. #else
  2803. MOVQ kk, %rax;
  2804. #ifdef LEFT
  2805. ADDQ $1, %rax;
  2806. #else
  2807. ADDQ $2, %rax;
  2808. #endif
  2809. MOVQ %rax, kkk;
  2810. #endif
  2811. SARQ $2, k;
  2812. JLE .L341_loopE;
  2813. ALIGN_4
  2814. .L341_bodyB:
  2815. movss 0*SIZE(ptrba), xvec0;
  2816. movss 0*SIZE(ptrbb), xvec1;
  2817. mulss xvec0, xvec1;
  2818. addss xvec1, xvec15;
  2819. movss 1*SIZE(ptrbb), xvec2;
  2820. mulss xvec0, xvec2;
  2821. addss xvec2, xvec14;
  2822. movss 1*SIZE(ptrba), xvec0;
  2823. movss 2*SIZE(ptrbb), xvec1;
  2824. mulss xvec0, xvec1;
  2825. addss xvec1, xvec15;
  2826. movss 3*SIZE(ptrbb), xvec2;
  2827. mulss xvec0, xvec2;
  2828. addss xvec2, xvec14;
  2829. movss 2*SIZE(ptrba), xvec0;
  2830. movss 4*SIZE(ptrbb), xvec1;
  2831. mulss xvec0, xvec1;
  2832. addss xvec1, xvec15;
  2833. movss 5*SIZE(ptrbb), xvec2;
  2834. mulss xvec0, xvec2;
  2835. addss xvec2, xvec14;
  2836. movss 3*SIZE(ptrba), xvec0;
  2837. movss 6*SIZE(ptrbb), xvec1;
  2838. mulss xvec0, xvec1;
  2839. addss xvec1, xvec15;
  2840. movss 7*SIZE(ptrbb), xvec2;
  2841. mulss xvec0, xvec2;
  2842. addss xvec2, xvec14;
  2843. addq $4*SIZE, ptrba;
  2844. addq $8*SIZE, ptrbb;
  2845. decq k;
  2846. jg .L341_bodyB;
  2847. ALIGN_4
  2848. .L341_loopE:
  2849. #ifndef TRMMKERNEL
  2850. TEST $2, bk;
  2851. #else
  2852. TEST $2, kkk;
  2853. #endif
  2854. JLE .L342_loopE;
  2855. ALIGN_4
  2856. .L342_bodyB:
  2857. movss 0*SIZE(ptrba), xvec0;
  2858. movss 0*SIZE(ptrbb), xvec1;
  2859. mulss xvec0, xvec1;
  2860. addss xvec1, xvec15;
  2861. movss 1*SIZE(ptrbb), xvec2;
  2862. mulss xvec0, xvec2;
  2863. addss xvec2, xvec14;
  2864. movss 1*SIZE(ptrba), xvec0;
  2865. movss 2*SIZE(ptrbb), xvec1;
  2866. mulss xvec0, xvec1;
  2867. addss xvec1, xvec15;
  2868. movss 3*SIZE(ptrbb), xvec2;
  2869. mulss xvec0, xvec2;
  2870. addss xvec2, xvec14;
  2871. addq $2*SIZE, ptrba;
  2872. addq $4*SIZE, ptrbb;
  2873. .L342_loopE:
  2874. #ifndef TRMMKERNEL
  2875. TEST $1, bk;
  2876. #else
  2877. TEST $1, kkk;
  2878. #endif
  2879. JLE .L343_loopE;
  2880. ALIGN_4
  2881. .L343_bodyB:
  2882. movss 0*SIZE(ptrba), xvec0;
  2883. movss 0*SIZE(ptrbb), xvec1;
  2884. mulss xvec0, xvec1;
  2885. addss xvec1, xvec15;
  2886. movss 1*SIZE(ptrbb), xvec2;
  2887. mulss xvec0, xvec2;
  2888. addss xvec2, xvec14;
  2889. addq $1*SIZE, ptrba;
2890. addq $2*SIZE, ptrbb;
  2891. .L343_loopE:
  2892. #### Writing back ####
  2893. movss MEMALPHA, xvec7;
  2894. mulss xvec7, xvec15;
  2895. mulss xvec7, xvec14;
  2896. movss 0*SIZE(C0), xvec0;
  2897. movss 0*SIZE(C1), xvec1;
  2898. #ifndef TRMMKERNEL
  2899. addss xvec0, xvec15;
  2900. addss xvec1, xvec14;
  2901. #endif
  2902. movss xvec15, 0*SIZE(C0);
  2903. movss xvec14, 0*SIZE(C1);
  2904. #if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  2905. MOVQ bk, %rax;
  2906. SUBQ kkk, %rax;
  2907. LEAQ (,%rax, SIZE), %rax;
  2908. ADDQ %rax, ptrba;
  2909. LEAQ (ptrbb, %rax, 2), ptrbb;
  2910. #endif
  2911. #if defined(TRMMKERNEL) && defined(LEFT)
  2912. ADDQ $1, kk;
  2913. #endif
  2914. addq $1*SIZE, C0;
  2915. addq $1*SIZE, C1;
  2916. .L34_loopE:
  2917. #if defined(TRMMKERNEL) && !defined(LEFT)
  2918. ADDQ $2, kk;
  2919. #endif
  2920. MOVQ bk, k;
  2921. SALQ $3, k;
  2922. ADDQ k, bb;
  2923. LEAQ (C, ldc, 2), C;
  2924. .L30_loopE:
  2925. TEST $1, bn;
  2926. JLE .L40_loopE;
  2927. ALIGN_4
  2928. .L40_bodyB:
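#### N-tail (bn & 1): last remaining column of B ####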
  2929. #if defined(TRMMKERNEL)&&defined(LEFT)
  2930. MOVQ OFFSET, %rax;
  2931. MOVQ %rax, kk;
  2932. #endif
  2933. MOVQ C, C0;
  2934. MOVQ ba, ptrba;
  2935. MOVQ bm, i;
  2936. SARQ $3, i;
  2937. JLE .L41_loopE;
  2938. ALIGN_4
  2939. .L41_bodyB:
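#### 8x1 tile: eight A values per k step times one broadcast B value; accumulator yvec15 ####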
  2940. #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  2941. MOVQ bb, ptrbb;
  2942. #else
  2943. MOVQ bb, ptrbb;
2944. MOVQ kk, %rax;
  2945. LEAQ (, %rax, SIZE), %rax;
  2946. LEAQ (ptrba, %rax, 8), ptrba;
  2947. ADDQ %rax, ptrbb;
  2948. #endif
  2949. #### initial ####
  2950. XOR_SY yvec15, yvec15, yvec15;
  2951. #ifndef TRMMKERNEL
  2952. MOVQ bk, k;
  2953. #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
  2954. MOVQ bk, %rax;
  2955. SUBQ kk, %rax;
  2956. MOVQ %rax, kkk;
  2957. #else
  2958. MOVQ kk, %rax;
  2959. #ifdef LEFT
  2960. ADDQ $8, %rax;
  2961. #else
  2962. ADDQ $1, %rax;
  2963. #endif
  2964. MOVQ %rax, kkk;
  2965. #endif
  2966. SARQ $2, k;
  2967. JLE .L411_loopE;
  2968. ALIGN_4
  2969. .L411_bodyB:
  2970. LD_SY 0*SIZE(ptrba), yvec0;
  2971. BROAD_SY 0*SIZE(ptrbb), yvec1;
  2972. MUL_SY yvec0, yvec1, yvec2;
  2973. ADD_SY yvec2, yvec15, yvec15;
  2974. LD_SY 8*SIZE(ptrba), yvec0;
  2975. BROAD_SY 1*SIZE(ptrbb), yvec1;
  2976. MUL_SY yvec0, yvec1, yvec2;
  2977. ADD_SY yvec2, yvec15, yvec15;
  2978. LD_SY 16*SIZE(ptrba), yvec0;
  2979. BROAD_SY 2*SIZE(ptrbb), yvec1;
  2980. MUL_SY yvec0, yvec1, yvec2;
  2981. ADD_SY yvec2, yvec15, yvec15;
  2982. LD_SY 24*SIZE(ptrba), yvec0;
  2983. BROAD_SY 3*SIZE(ptrbb), yvec1;
  2984. MUL_SY yvec0, yvec1, yvec2;
  2985. ADD_SY yvec2, yvec15, yvec15;
  2986. ADDQ $32*SIZE, ptrba;
  2987. ADDQ $4*SIZE, ptrbb;
  2988. DECQ k;
  2989. JG .L411_bodyB;
  2990. ALIGN_4
  2991. .L411_loopE:
  2992. #ifndef TRMMKERNEL
  2993. TEST $2, bk;
  2994. #else
  2995. TEST $2, kkk;
  2996. #endif
  2997. JLE .L412_loopE;
  2998. ALIGN_4
  2999. .L412_bodyB:
  3000. LD_SY 0*SIZE(ptrba), yvec0;
  3001. BROAD_SY 0*SIZE(ptrbb), yvec1;
  3002. MUL_SY yvec0, yvec1, yvec2;
  3003. ADD_SY yvec2, yvec15, yvec15;
  3004. LD_SY 8*SIZE(ptrba), yvec0;
  3005. BROAD_SY 1*SIZE(ptrbb), yvec1;
  3006. MUL_SY yvec0, yvec1, yvec2;
  3007. ADD_SY yvec2, yvec15, yvec15;
  3008. ADDQ $16*SIZE, ptrba;
  3009. ADDQ $2*SIZE, ptrbb;
  3010. .L412_loopE:
  3011. #ifndef TRMMKERNEL
  3012. TEST $1, bk;
  3013. #else
  3014. TEST $1, kkk;
  3015. #endif
  3016. JLE .L413_loopE;
  3017. ALIGN_4
  3018. .L413_bodyB:
  3019. LD_SY 0*SIZE(ptrba), yvec0;
  3020. BROAD_SY 0*SIZE(ptrbb), yvec1;
  3021. MUL_SY yvec0, yvec1, yvec2;
  3022. ADD_SY yvec2, yvec15, yvec15;
  3023. ADDQ $8*SIZE, ptrba;
  3024. ADDQ $1*SIZE, ptrbb;
  3025. .L413_loopE:
  3026. #### Writing ####
  3027. BROAD_SY MEMALPHA, yvec7;
  3028. MUL_SY yvec7, yvec15, yvec15;
  3029. EXTRA_SY $1, yvec15, xvec14;
  3030. SHUF_SX $0x44, xvec15, xvec13;
  3031. SHUF_SX $0xee, xvec15, xvec12;
  3032. SHUF_SX $0x44, xvec14, xvec11;
  3033. SHUF_SX $0xee, xvec14, xvec10;
  3034. #ifndef TRMMKERNEL
  3035. LDL_SX 0*SIZE(C0), xvec0, xvec0;
  3036. LDL_SX 2*SIZE(C0), xvec1, xvec1;
  3037. LDL_SX 4*SIZE(C0), xvec2, xvec2;
  3038. LDL_SX 6*SIZE(C0), xvec3, xvec3;
  3039. ADD_SX xvec0, xvec13, xvec13;
  3040. ADD_SX xvec1, xvec12, xvec12;
  3041. ADD_SX xvec2, xvec11, xvec11;
  3042. ADD_SX xvec3, xvec10, xvec10;
  3043. #endif
  3044. STL_SX xvec13, 0*SIZE(C0);
  3045. STL_SX xvec12, 2*SIZE(C0);
  3046. STL_SX xvec11, 4*SIZE(C0);
  3047. STL_SX xvec10, 6*SIZE(C0);
#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bk, %rax;
SUBQ kkk, %rax;
LEAQ (,%rax, SIZE), %rax;
LEAQ (ptrba, %rax, 8), ptrba;
ADDQ %rax, ptrbb;
#endif
#if defined(TRMMKERNEL)&&defined(LEFT)
ADDQ $8, kk;
#endif
ADDQ $8*SIZE, C0;
DECQ i;
JG .L41_bodyB;
ALIGN_4
.L41_loopE:
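#### bm & 4: one 4x1 tile, accumulated 4-wide in xvec15 ####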
TEST $4, bm;
JLE .L42_loopE;
ALIGN_4
.L42_bodyB:
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb, ptrbb;
#else
MOVQ bb, ptrbb;
MOVQ kk, %rax;
LEAQ (,%rax, SIZE), %rax;
LEAQ (ptrba, %rax, 4), ptrba;
ADDQ %rax, ptrbb;
#endif
XOR_SY yvec15, yvec15, yvec15;
#ifndef TRMMKERNEL
MOVQ bk, k;
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
MOVQ bk, %rax;
SUBQ kk, %rax;
MOVQ %rax, kkk;
#else
MOVQ kk, %rax;
#ifdef LEFT
ADDQ $4, %rax;
#else
ADDQ $1, %rax;
#endif
MOVQ %rax, kkk
#endif
SARQ $2, k;
JLE .L421_loopE;
ALIGN_4
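#### Main k loop, unrolled by 4: load 4 floats of A, broadcast one float of B,
#### multiply and accumulate into xvec15 (acc[m] += ptrba[4*k + m] * ptrbb[k]).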
.L421_bodyB:
LD_SX 0*SIZE(ptrba), xvec0;
BROAD_SX 0*SIZE(ptrbb), xvec1;
MUL_SX xvec0, xvec1, xvec1;
ADD_SX xvec1, xvec15, xvec15;
LD_SX 4*SIZE(ptrba), xvec0;
BROAD_SX 1*SIZE(ptrbb), xvec1;
MUL_SX xvec0, xvec1, xvec1;
ADD_SX xvec1, xvec15, xvec15;
LD_SX 8*SIZE(ptrba), xvec0;
BROAD_SX 2*SIZE(ptrbb), xvec1;
MUL_SX xvec0, xvec1, xvec1;
ADD_SX xvec1, xvec15, xvec15;
LD_SX 12*SIZE(ptrba), xvec0;
BROAD_SX 3*SIZE(ptrbb), xvec1;
MUL_SX xvec0, xvec1, xvec1;
ADD_SX xvec1, xvec15, xvec15;
ADDQ $16*SIZE, ptrba;
ADDQ $4*SIZE, ptrbb;
DECQ k;
JG .L421_bodyB;
ALIGN_4
.L421_loopE:
#ifndef TRMMKERNEL
TEST $2, bk;
#else
TEST $2, kkk;
#endif
JLE .L422_loopE;
ALIGN_4
.L422_bodyB:
LD_SX 0*SIZE(ptrba), xvec0;
BROAD_SX 0*SIZE(ptrbb), xvec1;
MUL_SX xvec0, xvec1, xvec1;
ADD_SX xvec1, xvec15, xvec15;
LD_SX 4*SIZE(ptrba), xvec0;
BROAD_SX 1*SIZE(ptrbb), xvec1;
MUL_SX xvec0, xvec1, xvec1;
ADD_SX xvec1, xvec15, xvec15;
ADDQ $8*SIZE, ptrba;
ADDQ $2*SIZE, ptrbb;
.L422_loopE:
#ifndef TRMMKERNEL
TEST $1, bk;
#else
TEST $1, kkk;
#endif
JLE .L423_loopE;
ALIGN_4
.L423_bodyB:
LD_SX 0*SIZE(ptrba), xvec0;
BROAD_SX 0*SIZE(ptrbb), xvec1;
MUL_SX xvec0, xvec1, xvec1;
ADD_SX xvec1, xvec15, xvec15;
ADDQ $4*SIZE, ptrba;
ADDQ $1*SIZE, ptrbb;
.L423_loopE:
#### Writing back ####
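#### alpha * acc, plus the old C values unless TRMMKERNEL, written back as two
#### 64-bit halves (STL_SX / STH_SX).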
BROAD_SX MEMALPHA, xvec7;
MUL_SX xvec7, xvec15, xvec15;
#ifndef TRMMKERNEL
LDL_SX 0*SIZE(C0), xvec0, xvec0;
LDH_SX 2*SIZE(C0), xvec0, xvec0;
ADD_SX xvec0, xvec15, xvec15;
#endif
STL_SX xvec15, 0*SIZE(C0);
STH_SX xvec15, 2*SIZE(C0);
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bk, %rax;
SUBQ kkk, %rax;
LEAQ (, %rax, SIZE), %rax;
LEAQ (ptrba, %rax, 4), ptrba;
ADDQ %rax, ptrbb;
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
ADDQ $4, kk
#endif
ADDQ $4*SIZE, C0;
.L42_loopE:
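#### bm & 2: one 2x1 tile, two scalar accumulators (xvec15, xvec14) ####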
TEST $2, bm;
JLE .L43_loopE;
ALIGN_4
.L43_bodyB:
#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
MOVQ bb, ptrbb;
#else
MOVQ bb, ptrbb;
MOVQ kk, %rax
LEAQ (, %rax, SIZE), %rax
LEAQ (ptrba, %rax, 2), ptrba
ADDQ %rax, ptrbb;
#endif
XOR_SY yvec15, yvec15, yvec15;
XOR_SY yvec14, yvec14, yvec14;
#ifndef TRMMKERNEL
MOVQ bk, k;
#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
MOVQ bk, %rax;
SUBQ kk, %rax;
MOVQ %rax, kkk;
#else
MOVQ kk, %rax;
#ifdef LEFT
ADDQ $2, %rax;
#else
ADDQ $1, %rax;
#endif
MOVQ %rax, kkk;
#endif
SARQ $2, k;
JLE .L431_loopE;
ALIGN_4
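#### Main k loop, unrolled by 4.  Per step (a sketch):
####   c0 += ptrba[2*k] * ptrbb[k];  c1 += ptrba[2*k+1] * ptrbb[k];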
.L431_bodyB:
vmovss 0*SIZE(ptrba), xvec0;
vmovss 1*SIZE(ptrba), xvec1;
vmovss 0*SIZE(ptrbb), xvec2;
vmulss xvec2, xvec0, xvec0;
vaddss xvec0, xvec15, xvec15;
vmulss xvec2, xvec1, xvec1;
vaddss xvec1, xvec14, xvec14;
vmovss 2*SIZE(ptrba), xvec3;
vmovss 3*SIZE(ptrba), xvec4;
vmovss 1*SIZE(ptrbb), xvec5;
vmulss xvec5, xvec3, xvec3;
vaddss xvec3, xvec15, xvec15;
vmulss xvec5, xvec4, xvec4;
vaddss xvec4, xvec14, xvec14;
vmovss 4*SIZE(ptrba), xvec0;
vmovss 5*SIZE(ptrba), xvec1;
vmovss 2*SIZE(ptrbb), xvec2;
vmulss xvec2, xvec0, xvec0;
vaddss xvec0, xvec15, xvec15;
vmulss xvec2, xvec1, xvec1;
vaddss xvec1, xvec14, xvec14;
vmovss 6*SIZE(ptrba), xvec3;
vmovss 7*SIZE(ptrba), xvec4;
vmovss 3*SIZE(ptrbb), xvec5;
vmulss xvec5, xvec3, xvec3;
vaddss xvec3, xvec15, xvec15;
vmulss xvec5, xvec4, xvec4;
vaddss xvec4, xvec14, xvec14;
addq $8*SIZE, ptrba;
addq $4*SIZE, ptrbb;
decq k;
JG .L431_bodyB;
ALIGN_4
.L431_loopE:
#ifndef TRMMKERNEL
TEST $2, bk;
#else
TEST $2, kkk;
#endif
JLE .L432_loopE;
ALIGN_4
.L432_bodyB:
vmovss 0*SIZE(ptrba), xvec0;
vmovss 1*SIZE(ptrba), xvec1;
vmovss 0*SIZE(ptrbb), xvec2;
vmulss xvec2, xvec0, xvec0;
vaddss xvec0, xvec15, xvec15;
vmulss xvec2, xvec1, xvec1;
vaddss xvec1, xvec14, xvec14;
vmovss 2*SIZE(ptrba), xvec3;
vmovss 3*SIZE(ptrba), xvec4;
vmovss 1*SIZE(ptrbb), xvec5;
vmulss xvec5, xvec3, xvec3;
vaddss xvec3, xvec15, xvec15;
vmulss xvec5, xvec4, xvec4;
vaddss xvec4, xvec14, xvec14;
addq $4*SIZE, ptrba;
addq $2*SIZE, ptrbb;
.L432_loopE:
#ifndef TRMMKERNEL
TEST $1, bk;
#else
TEST $1, kkk;
#endif
JLE .L433_loopE;
ALIGN_4
.L433_bodyB:
vmovss 0*SIZE(ptrba), xvec0;
vmovss 1*SIZE(ptrba), xvec1;
vmovss 0*SIZE(ptrbb), xvec2;
vmulss xvec2, xvec0, xvec0;
vaddss xvec0, xvec15, xvec15;
vmulss xvec2, xvec1, xvec1;
vaddss xvec1, xvec14, xvec14;
addq $2*SIZE, ptrba;
addq $1*SIZE, ptrbb;
.L433_loopE:
#### Writing Back ####
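#### Scale both sums by alpha, add C for the non-TRMM case, store two floats.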
vmovss MEMALPHA, xvec7;
vmulss xvec7, xvec15, xvec15;
vmulss xvec7, xvec14, xvec14;
#ifndef TRMMKERNEL
vaddss 0*SIZE(C0), xvec15, xvec15;
vaddss 1*SIZE(C0), xvec14, xvec14;
#endif
vmovss xvec15, 0*SIZE(C0);
vmovss xvec14, 1*SIZE(C0);
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) ||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
MOVQ bk, %rax;
SUBQ kkk, %rax;
LEAQ (,%rax, SIZE), %rax;
LEAQ (ptrba, %rax, 2), ptrba;
ADDQ %rax, ptrbb;
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $2, kk
#endif
addq $2*SIZE, C0;
.L43_loopE:
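#### bm & 1: final 1x1 element, a scalar dot product over k ####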
TEST $1, bm;
JLE .L44_loopE;
ALIGN_4
.L44_bodyB:
#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
MOVQ bb, ptrbb;
#else
MOVQ bb, ptrbb;
MOVQ kk, %rax;
LEAQ (, %rax, SIZE), %rax;
ADDQ %rax, ptrba;
ADDQ %rax, ptrbb;
#endif
XOR_SY yvec15, yvec15, yvec15;
#ifndef TRMMKERNEL
MOVQ bk, k;
#elif (defined(LEFT)&& !defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
MOVQ bk, %rax;
SUBQ kk, %rax;
MOVQ %rax, kkk;
#else
MOVQ kk, %rax;
#ifdef LEFT
ADDQ $1, %rax;
#else
ADDQ $1, %rax;
#endif
MOVQ %rax, kkk;
#endif
SARQ $2, k;
JLE .L441_loopE;
ALIGN_4
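#### Dot-product loop, unrolled by 4: acc += ptrba[k] * ptrbb[k] ####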
.L441_bodyB:
vmovss 0*SIZE(ptrba), xvec0;
vmovss 0*SIZE(ptrbb), xvec1;
vmulss xvec0, xvec1, xvec1;
vaddss xvec1, xvec15, xvec15;
vmovss 1*SIZE(ptrba), xvec0;
vmovss 1*SIZE(ptrbb), xvec1;
vmulss xvec0, xvec1, xvec1;
vaddss xvec1, xvec15, xvec15;
vmovss 2*SIZE(ptrba), xvec0;
vmovss 2*SIZE(ptrbb), xvec1;
vmulss xvec0, xvec1, xvec1;
vaddss xvec1, xvec15, xvec15;
vmovss 3*SIZE(ptrba), xvec0;
vmovss 3*SIZE(ptrbb), xvec1;
vmulss xvec0, xvec1, xvec1;
vaddss xvec1, xvec15, xvec15;
addq $4*SIZE, ptrba;
addq $4*SIZE, ptrbb;
decq k;
JG .L441_bodyB;
ALIGN_4
.L441_loopE:
#ifndef TRMMKERNEL
TEST $2, bk;
#else
TEST $2, kkk;
#endif
JLE .L442_loopE;
ALIGN_4
.L442_bodyB:
vmovss 0*SIZE(ptrba), xvec0;
vmovss 0*SIZE(ptrbb), xvec1;
vmulss xvec0, xvec1, xvec1;
vaddss xvec1, xvec15, xvec15;
vmovss 1*SIZE(ptrba), xvec0;
vmovss 1*SIZE(ptrbb), xvec1;
vmulss xvec0, xvec1, xvec1;
vaddss xvec1, xvec15, xvec15;
addq $2*SIZE, ptrba;
addq $2*SIZE, ptrbb;
.L442_loopE:
#ifndef TRMMKERNEL
TEST $1, bk;
#else
TEST $1, kkk;
#endif
JLE .L443_loopE;
ALIGN_4
.L443_bodyB:
vmovss 0*SIZE(ptrba), xvec0;
vmovss 0*SIZE(ptrbb), xvec1;
vmulss xvec0, xvec1, xvec1;
vaddss xvec1, xvec15, xvec15;
addq $1*SIZE, ptrba;
addq $1*SIZE, ptrbb;
.L443_loopE:
#### Writing Back ####
vmovss MEMALPHA, xvec7;
vmulss xvec7, xvec15, xvec15;
#ifndef TRMMKERNEL
vaddss 0*SIZE(C0), xvec15, xvec15;
#endif
vmovss xvec15, 0*SIZE(C0);
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) ||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
MOVQ bk, %rax;
SUBQ kkk, %rax;
LEAQ (,%rax, SIZE), %rax;
ADDQ %rax, ptrba;
ADDQ %rax, ptrbb;
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $1, kk
#endif
addq $1*SIZE, C0;
.L44_loopE:
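#### Advance bb past the bk packed floats of this B column (the shift by 2
#### converts the float count to bytes for single precision) and advance C by
#### ldc to the next output column.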
MOVQ bk, k;
SALQ $2, k;
ADDQ k, bb;
ADDQ ldc, C;
.L40_loopE:
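#### Epilogue: restore callee-saved registers (plus rdi/rsi/xmm6-15 on
#### Windows), clear the upper ymm state, release the stack frame and return.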
movq 0(%rsp), %rbx;
movq 8(%rsp), %rbp;
movq 16(%rsp), %r12;
movq 24(%rsp), %r13;
movq 32(%rsp), %r14;
movq 40(%rsp), %r15;
vzeroupper
#ifdef WINDOWS_ABI
movq 48(%rsp), %rdi
movq 56(%rsp), %rsi
movups 64(%rsp), %xmm6
movups 80(%rsp), %xmm7
movups 96(%rsp), %xmm8
movups 112(%rsp), %xmm9
movups 128(%rsp), %xmm10
movups 144(%rsp), %xmm11
movups 160(%rsp), %xmm12
movups 176(%rsp), %xmm13
movups 192(%rsp), %xmm14
movups 208(%rsp), %xmm15
#endif
addq $STACKSIZE, %rsp;
ret
EPILOGUE