
dgemm_kernel_4x8_sandy.S 72 kB

  1. /*****************************************************************************
  2. Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the ISCAS nor the names of its contributors may
  14. be used to endorse or promote products derived from this software
  15. without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. **********************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
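/*
   Double-precision GEMM/TRMM micro-kernel for Sandy Bridge (AVX, 256-bit
   ymm registers).  The defines below first give readable names to the
   argument and working registers (bm/bn/bk = block dimensions, ba/bb =
   packed A and B panels, C/ldc = output matrix and its leading dimension,
   ptrba/ptrbb = panel cursors), then name the stack slots used to spill
   alpha (MEMALPHA), the column counter (j) and the TRMM offsets
   (OFFSET, kk, kkk).
*/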
  29. #define old_bm %rdi
  30. #define old_bn %rsi
  31. #define old_bk %rdx
  32. #define bm %r13
  33. #define bn %r14
  34. #define bk %r15
  35. #define ALPHA %xmm0
  36. #define ba %rcx
  37. #define bb %r8
  38. #define C %r9
  39. #define ldc %r10
  40. #define i %r11
  41. #define k %rax
  42. #define ptrba %rdi
  43. #define ptrbb %rsi
  44. #define C0 %rbx
  45. #define C1 %rbp
  46. #define prebb %r12
  47. #ifndef WINDOWS_ABI
  48. #define STACKSIZE 128
  49. #define old_ldc 8+STACKSIZE(%rsp)
  50. #define old_offset 16+STACKSIZE(%rsp)
  51. #define MEMALPHA 48(%rsp)
  52. #define j 56(%rsp)
  53. #define OFFSET 64(%rsp)
  54. #define kk 72(%rsp)
  55. #define kkk 80(%rsp)
  56. #else
  57. #define STACKSIZE 512
  58. #define OLD_A 40 + STACKSIZE(%rsp)
  59. #define OLD_B 48 + STACKSIZE(%rsp)
  60. #define OLD_C 56 + STACKSIZE(%rsp)
  61. #define old_ldc 64 + STACKSIZE(%rsp)
  62. #define old_offset 72 + STACKSIZE(%rsp)
  63. #define MEMALPHA 224(%rsp)
  64. #define j 232(%rsp)
  65. #define OFFSET 240(%rsp)
  66. #define kk 248(%rsp)
  67. #define kkk 256(%rsp)
  68. #endif
  69. #define PREFETCH0 prefetcht0
  70. #define PREFETCH1 prefetcht0
  71. #define PREFETCH2 prefetcht2
  72. #define xvec0 %xmm0
  73. #define xvec1 %xmm1
  74. #define xvec2 %xmm2
  75. #define xvec3 %xmm3
  76. #define xvec4 %xmm4
  77. #define xvec5 %xmm5
  78. #define xvec6 %xmm6
  79. #define xvec7 %xmm7
  80. #define xvec8 %xmm8
  81. #define xvec9 %xmm9
  82. #define xvec10 %xmm10
  83. #define xvec11 %xmm11
  84. #define xvec12 %xmm12
  85. #define xvec13 %xmm13
  86. #define xvec14 %xmm14
  87. #define xvec15 %xmm15
  88. #define yvec0 %ymm0
  89. #define yvec1 %ymm1
  90. #define yvec2 %ymm2
  91. #define yvec3 %ymm3
  92. #define yvec4 %ymm4
  93. #define yvec5 %ymm5
  94. #define yvec6 %ymm6
  95. #define yvec7 %ymm7
  96. #define yvec8 %ymm8
  97. #define yvec9 %ymm9
  98. #define yvec10 %ymm10
  99. #define yvec11 %ymm11
  100. #define yvec12 %ymm12
  101. #define yvec13 %ymm13
  102. #define yvec14 %ymm14
  103. #define yvec15 %ymm15
  104. #define LEAQ leaq
  105. #define ADDQ addq
  106. #define MULQ imulq
  107. #define SARQ sarq
  108. #define SALQ salq
  109. #define ANDQ andq
  110. #define SUBQ subq
  111. #define DECQ decq
  112. #define JG jg
  113. #define JLE jle
  114. #define TEST testq
  115. #define OR orq
  116. #define JNE jne
  117. #define NOP
  118. #define XOR xorpd
  119. #undef MOVQ
  120. #define MOVQ movq
  121. #define XOR_DY vxorpd
  122. #define XOR_DX vxorpd
  123. #define LD_DY vmovapd
  124. #define LD_DX vmovapd
  125. #define LDL_DX vmovlpd
  126. #define LDL_DY vmovlpd
  127. #define LDH_DX vmovhpd
  128. #define LDH_DY vmovhpd
  129. #define ST_DY vmovapd
  130. #define ST_DX vmovapd
  131. #define STL_DX vmovlpd
  132. #define STL_DY vmovlpd
  133. #define STH_DX vmovhpd
  134. #define STH_DY vmovhpd
  135. #define EDUP_DY vmovddup
  136. #define ADD_DY vaddpd
  137. #define ADD_DX vaddpd
  138. #define ADD1_DY vaddpd
  139. #define ADD2_DY vaddpd
  140. #define ADDSUB_DY vaddsubpd
  141. #define MUL_DY vmulpd
  142. #define MUL_DX vmulpd
  143. #define SHUF_DY vperm2f128
  144. #define SHUF_DX vpshufd
  145. #define VPERMILP_DY vpermilpd
  146. #define BROAD_DY vbroadcastsd
  147. #define BROAD_DX vmovddup
  148. #define MOV_DY vmovapd
  149. #define MOV_DX vmovapd
  150. #define REVS_DY vshufpd
  151. #define REVS_DX vmovsd
  152. #define EXTRA_DY vextractf128
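/*
   Instruction aliases: the *_DY macros are the 256-bit (ymm) forms and the
   *_DX macros the 128-bit (xmm) forms of the same AVX operations, e.g.
   LD_DY/LD_DX -> vmovapd, MUL_DY -> vmulpd, SHUF_DY -> vperm2f128,
   EXTRA_DY -> vextractf128.  Keeping the concrete instructions behind
   macros presumably makes it easier to retarget the kernel body.
*/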
  153. PROLOGUE
  154. subq $STACKSIZE, %rsp;
  155. movq %rbx, 0(%rsp);
  156. movq %rbp, 8(%rsp);
  157. movq %r12, 16(%rsp);
  158. movq %r13, 24(%rsp);
  159. movq %r14, 32(%rsp);
  160. movq %r15, 40(%rsp);
  161. #ifdef WINDOWS_ABI
  162. movq %rdi, 48(%rsp)
  163. movq %rsi, 56(%rsp)
  164. movups %xmm6, 64(%rsp)
  165. movups %xmm7, 80(%rsp)
  166. movups %xmm8, 96(%rsp)
  167. movups %xmm9, 112(%rsp)
  168. movups %xmm10, 128(%rsp)
  169. movups %xmm11, 144(%rsp)
  170. movups %xmm12, 160(%rsp)
  171. movups %xmm13, 176(%rsp)
  172. movups %xmm14, 192(%rsp)
  173. movups %xmm15, 208(%rsp)
  174. movq ARG1, old_bm
  175. movq ARG2, old_bn
  176. movq ARG3, old_bk
  177. movq OLD_A, ba
  178. movq OLD_B, bb
  179. movq OLD_C, C
  180. movq old_ldc, ldc
  181. #ifdef TRMMKERNEL
  182. movq old_offset, %r11
  183. #endif
  184. movaps %xmm3, %xmm0
  185. #else
  186. movq old_ldc, ldc
  187. #ifdef TRMMKERNEL
  188. movq old_offset, %r11
  189. #endif
  190. #endif
  191. vzeroupper
  192. vmovlps ALPHA, MEMALPHA
  193. movq old_bm, bm
  194. movq old_bn, bn
  195. movq old_bk, bk
  196. leaq (, ldc, SIZE), ldc
  197. #ifdef TRMMKERNEL
  198. movq %r11, OFFSET
  199. #ifndef LEFT
  200. negq %r11;
  201. #endif
  202. movq %r11, kk
  203. #endif
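/*
   Loop structure (plain-GEMM path; the TRMMKERNEL variant adjusts pointers
   and trip counts but follows the same shape):
     j counts groups of 4 columns of C (bn >> 2), with 2- and 1-column
       tails handled after .L0_loopE;
     i counts groups of 8 rows (bm >> 3), with 4/2/1-row tails below.
   Roughly, and assuming the usual packed-panel layouts (8 doubles of A and
   4 doubles of B per k step, SIZE == 8 bytes, ldc in elements here), each
   8x4 block computes:

       for (jj = 0; jj < 4; jj++)
         for (ii = 0; ii < 8; ii++) {
           double s = 0.0;
           for (l = 0; l < k; l++)
             s += ptrba[l*8 + ii] * ptrbb[l*4 + jj];
           C[ii + jj*ldc] += alpha * s;    (TRMM path stores instead of adding)
         }
*/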
  204. MOVQ bn,j;
  205. SARQ $2,j; # Rn = 4
  206. JLE .L0_loopE;
  207. ALIGN_5;
  208. .L0_bodyB:;
  209. #if defined(TRMMKERNEL) && defined(LEFT)
  210. MOVQ OFFSET, %rax;
  211. MOVQ %rax, kk;
  212. #endif
  213. MOVQ C,C0;
  214. LEAQ (C,ldc,2),C1;
  215. MOVQ bk, k;
  216. SALQ $5, k;
  217. LEAQ (bb, k, 1), prebb;
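# prebb = bb + bk*32 bytes, i.e. bb + bk*4 doubles (SIZE == 8): the start of
# the next packed-B panel.  PREFETCH2 (prefetcht2) walks prebb during the
# compute loop so that panel is already in cache when bb is advanced by the
# same amount at the end of the j loop.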
  218. MOVQ ba,ptrba;
  219. MOVQ bm,i;
  220. SARQ $3,i; # Rm = 8
  221. JLE .L1_loopE;
  222. ALIGN_5;
  223. .L1_bodyB:;
  224. #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  225. MOVQ bb, ptrbb;
  226. #else
  227. MOVQ bb, ptrbb;
  228. MOVQ kk, %rax;
  229. LEAQ (, %rax, SIZE), %rax;
  230. LEAQ (ptrba, %rax, 8), ptrba;
  231. LEAQ (ptrbb, %rax, 4), ptrbb;
  232. #endif
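# TRMM case: kk gives the k position where this block's contribution starts,
# so ptrba/ptrbb are advanced by kk*8 and kk*4 elements before the multiply.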
  233. #### Initialize result registers ####
  234. PREFETCH2 0*SIZE(prebb);
  235. XOR_DY yvec15, yvec15, yvec15;
  236. PREFETCH2 8*SIZE(prebb);
  237. XOR_DY yvec14, yvec14, yvec14;
  238. XOR_DY yvec13, yvec13, yvec13;
  239. ADDQ $16*SIZE, prebb
  240. XOR_DY yvec12, yvec12, yvec12;
  241. PREFETCH0 3*SIZE(C0)
  242. LD_DY 0*SIZE(ptrbb), yvec2;
  243. PREFETCH0 3*SIZE(C0, ldc, 1)
  244. XOR_DY yvec11, yvec11, yvec11;
  245. PREFETCH0 3*SIZE(C1)
  246. XOR_DY yvec10, yvec10, yvec10;
  247. PREFETCH0 3*SIZE(C1, ldc, 1)
  248. LD_DY 0*SIZE(ptrba), yvec0;
  249. XOR_DY yvec9, yvec9, yvec9;
  250. XOR_DY yvec8, yvec8, yvec8;
  251. VPERMILP_DY $0x05, yvec2, yvec3;
  252. #ifndef TRMMKERNEL
  253. MOVQ bk,k;
  254. #elif (defined(LEFT) && !defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
  255. MOVQ bk, %rax;
  256. SUBQ kk, %rax;
  257. MOVQ %rax, kkk;
  258. #else
  259. MOVQ kk, %rax;
  260. #ifdef LEFT
  261. ADDQ $8, %rax;
  262. #else
  263. ADDQ $4, %rax;
  264. #endif
  265. MOVQ %rax, kkk;
  266. #endif
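# Effective k length: plain GEMM runs the full bk; for TRMM the trip count
# kkk is either bk - kk or kk + 8 (kk + 4 for non-LEFT), depending on which
# operand is triangular (LEFT) and whether it is transposed (TRANSA).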
  267. SARQ $2,k;
  268. JLE .L2_loopE;
  269. ALIGN_5;
  270. .L2_bodyB:;
  271. # Computing kernel
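# Each "Unroll times N" step consumes 8 A values (yvec0/yvec1) and 4 B
# values (yvec2, with rotated copies in yvec3/yvec4/yvec5 produced by
# VPERMILP_DY and SHUF_DY), doing 8 vmulpd + 8 vaddpd into the accumulators
# yvec8-yvec15.  Because B is rotated rather than broadcast, each
# accumulator holds a fixed but interleaved pairing of rows and columns,
# untangled later in the "Reverse the Results" block.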
  272. #### Unroll times 1 ####
  273. LD_DY 4*SIZE(ptrba), yvec1;
  274. MUL_DY yvec0, yvec2, yvec6;
  275. SHUF_DY $0x03, yvec2, yvec2, yvec4;
  276. MUL_DY yvec0, yvec3, yvec7;
  277. SHUF_DY $0x03, yvec3, yvec3, yvec5;
  278. ADD_DY yvec15, yvec6, yvec15;
  279. ADD_DY yvec13, yvec7, yvec13;
  280. PREFETCH0 64*SIZE(ptrba)
  281. MUL_DY yvec1, yvec2, yvec6;
  282. LD_DY 4*SIZE(ptrbb), yvec2;
  283. MUL_DY yvec1, yvec3, yvec7;
  284. VPERMILP_DY $0x05, yvec2, yvec3;
  285. ADD_DY yvec14, yvec6, yvec14;
  286. ADD_DY yvec12, yvec7, yvec12;
  287. MUL_DY yvec0, yvec4, yvec6;
  288. MUL_DY yvec0, yvec5, yvec7;
  289. LD_DY 8*SIZE(ptrba), yvec0;
  290. ADD_DY yvec11, yvec6, yvec11;
  291. ADD_DY yvec9, yvec7, yvec9;
  292. MUL_DY yvec1, yvec4, yvec6;
  293. MUL_DY yvec1, yvec5, yvec7;
  294. ADD_DY yvec10, yvec6, yvec10;
  295. ADD_DY yvec8, yvec7, yvec8;
  296. #### Unroll times 2 ####
  297. LD_DY 12*SIZE(ptrba), yvec1;
  298. MUL_DY yvec0, yvec2, yvec6;
  299. SHUF_DY $0x03, yvec2, yvec2, yvec4;
  300. MUL_DY yvec0, yvec3, yvec7;
  301. SHUF_DY $0x03, yvec3, yvec3, yvec5;
  302. ADD_DY yvec15, yvec6, yvec15;
  303. ADD_DY yvec13, yvec7, yvec13;
  304. PREFETCH0 72*SIZE(ptrba)
  305. MUL_DY yvec1, yvec2, yvec6;
  306. LD_DY 8*SIZE(ptrbb), yvec2;
  307. MUL_DY yvec1, yvec3, yvec7;
  308. VPERMILP_DY $0x05, yvec2, yvec3;
  309. ADD_DY yvec14, yvec6, yvec14;
  310. ADD_DY yvec12, yvec7, yvec12;
  311. MUL_DY yvec0, yvec4, yvec6;
  312. MUL_DY yvec0, yvec5, yvec7;
  313. LD_DY 16*SIZE(ptrba), yvec0;
  314. ADD_DY yvec11, yvec6, yvec11;
  315. ADD_DY yvec9, yvec7, yvec9;
  316. MUL_DY yvec1, yvec4, yvec6;
  317. MUL_DY yvec1, yvec5, yvec7;
  318. ADD_DY yvec10, yvec6, yvec10;
  319. ADD_DY yvec8, yvec7, yvec8;
  320. #### Unroll times 3 ####
  321. LD_DY 20*SIZE(ptrba), yvec1;
  322. MUL_DY yvec0, yvec2, yvec6;
  323. SHUF_DY $0x03, yvec2, yvec2, yvec4;
  324. MUL_DY yvec0, yvec3, yvec7;
  325. SHUF_DY $0x03, yvec3, yvec3, yvec5;
  326. ADD_DY yvec15, yvec6, yvec15;
  327. ADD_DY yvec13, yvec7, yvec13;
  328. PREFETCH0 80*SIZE(ptrba)
  329. MUL_DY yvec1, yvec2, yvec6;
  330. LD_DY 12*SIZE(ptrbb), yvec2;
  331. ADDQ $16*SIZE, ptrbb;
  332. MUL_DY yvec1, yvec3, yvec7;
  333. VPERMILP_DY $0x05, yvec2, yvec3;
  334. ADD_DY yvec14, yvec6, yvec14;
  335. ADD_DY yvec12, yvec7, yvec12;
  336. MUL_DY yvec0, yvec4, yvec6;
  337. MUL_DY yvec0, yvec5, yvec7;
  338. LD_DY 24*SIZE(ptrba), yvec0;
  339. ADD_DY yvec11, yvec6, yvec11;
  340. ADD_DY yvec9, yvec7, yvec9;
  341. MUL_DY yvec1, yvec4, yvec6;
  342. MUL_DY yvec1, yvec5, yvec7;
  343. ADD_DY yvec10, yvec6, yvec10;
  344. ADD_DY yvec8, yvec7, yvec8;
  345. #### Unroll times 4 ####
  346. LD_DY 28*SIZE(ptrba), yvec1;
  347. MUL_DY yvec0, yvec2, yvec6;
  348. SHUF_DY $0x03, yvec2, yvec2, yvec4;
  349. MUL_DY yvec0, yvec3, yvec7;
  350. SHUF_DY $0x03, yvec3, yvec3, yvec5;
  351. ADDQ $32*SIZE, ptrba;
  352. ADD_DY yvec15, yvec6, yvec15;
  353. ADD_DY yvec13, yvec7, yvec13;
  354. PREFETCH0 88*SIZE(ptrba)
  355. MUL_DY yvec1, yvec2, yvec6;
  356. LD_DY 0*SIZE(ptrbb), yvec2;
  357. MUL_DY yvec1, yvec3, yvec7;
  358. VPERMILP_DY $0x05, yvec2, yvec3;
  359. ADD_DY yvec14, yvec6, yvec14;
  360. ADD_DY yvec12, yvec7, yvec12;
  361. MUL_DY yvec0, yvec4, yvec6;
  362. MUL_DY yvec0, yvec5, yvec7;
  363. LD_DY 0*SIZE(ptrba), yvec0;
  364. ADD_DY yvec11, yvec6, yvec11;
  365. ADD_DY yvec9, yvec7, yvec9;
  366. MUL_DY yvec1, yvec4, yvec6;
  367. MUL_DY yvec1, yvec5, yvec7;
  368. ADD_DY yvec10, yvec6, yvec10;
  369. ADD_DY yvec8, yvec7, yvec8;
  370. .L2_bodyE:;
  371. DECQ k;
  372. JG .L2_bodyB;
  373. ALIGN_5
  374. .L2_loopE:;
  375. PREFETCH2 0*SIZE(prebb);
  376. ADDQ $8*SIZE, prebb;
  377. #ifndef TRMMKERNEL
  378. TEST $2, bk;
  379. #else
  380. MOVQ kkk, %rax;
  381. TEST $2, %rax;
  382. #endif
  383. JLE .L3_loopE;
  384. ALIGN_5
  385. .L3_bodyB:
  386. #### Unroll times 1 ####
  387. PREFETCH0 64*SIZE(ptrba)
  388. LD_DY 4*SIZE(ptrba), yvec1;
  389. MUL_DY yvec0, yvec2, yvec6;
  390. SHUF_DY $0x03, yvec2, yvec2, yvec4;
  391. MUL_DY yvec0, yvec3, yvec7;
  392. SHUF_DY $0x03, yvec3, yvec3, yvec5;
  393. ADD_DY yvec15, yvec6, yvec15;
  394. ADD_DY yvec13, yvec7, yvec13;
  395. MUL_DY yvec1, yvec2, yvec6;
  396. LD_DY 4*SIZE(ptrbb), yvec2;
  397. ADDQ $8*SIZE, ptrbb;
  398. MUL_DY yvec1, yvec3, yvec7;
  399. VPERMILP_DY $0x05, yvec2, yvec3;
  400. ADD_DY yvec14, yvec6, yvec14;
  401. ADD_DY yvec12, yvec7, yvec12;
  402. MUL_DY yvec0, yvec4, yvec6;
  403. MUL_DY yvec0, yvec5, yvec7;
  404. LD_DY 8*SIZE(ptrba), yvec0;
  405. ADD_DY yvec11, yvec6, yvec11;
  406. ADD_DY yvec9, yvec7, yvec9;
  407. MUL_DY yvec1, yvec4, yvec6;
  408. MUL_DY yvec1, yvec5, yvec7;
  409. ADD_DY yvec10, yvec6, yvec10;
  410. ADD_DY yvec8, yvec7, yvec8;
  411. #### Unroll times 2 ####
  412. PREFETCH0 72*SIZE(ptrba)
  413. LD_DY 12*SIZE(ptrba), yvec1;
  414. MUL_DY yvec0, yvec2, yvec6;
  415. SHUF_DY $0x03, yvec2, yvec2, yvec4;
  416. MUL_DY yvec0, yvec3, yvec7;
  417. SHUF_DY $0x03, yvec3, yvec3, yvec5;
  418. ADDQ $16*SIZE, ptrba;
  419. ADD_DY yvec15, yvec6, yvec15;
  420. ADD_DY yvec13, yvec7, yvec13;
  421. MUL_DY yvec1, yvec2, yvec6;
  422. LD_DY 0*SIZE(ptrbb), yvec2;
  423. MUL_DY yvec1, yvec3, yvec7;
  424. VPERMILP_DY $0x05, yvec2, yvec3;
  425. ADD_DY yvec14, yvec6, yvec14;
  426. ADD_DY yvec12, yvec7, yvec12;
  427. MUL_DY yvec0, yvec4, yvec6;
  428. MUL_DY yvec0, yvec5, yvec7;
  429. LD_DY 0*SIZE(ptrba), yvec0;
  430. ADD_DY yvec11, yvec6, yvec11;
  431. ADD_DY yvec9, yvec7, yvec9;
  432. MUL_DY yvec1, yvec4, yvec6;
  433. MUL_DY yvec1, yvec5, yvec7;
  434. ADD_DY yvec10, yvec6, yvec10;
  435. ADD_DY yvec8, yvec7, yvec8;
  436. .L3_loopE:
  437. PREFETCH2 0*SIZE(prebb);
  438. ADDQ $8*SIZE, prebb
  439. #ifndef TRMMKERNEL
  440. TEST $1, bk;
  441. #else
  442. MOVQ kkk, %rax;
  443. TEST $1, %rax;
  444. #endif
  445. JLE .L4_loopE;
  446. ALIGN_5
  447. .L4_bodyB:;
  448. #### Unroll times 1 ####
  449. PREFETCH0 64*SIZE(ptrba)
  450. LD_DY 4*SIZE(ptrba), yvec1;
  451. MUL_DY yvec0, yvec2, yvec6;
  452. SHUF_DY $0x03, yvec2, yvec2, yvec4;
  453. MUL_DY yvec0, yvec3, yvec7;
  454. SHUF_DY $0x03, yvec3, yvec3, yvec5;
  455. ADDQ $8*SIZE, ptrba;
  456. ADD_DY yvec15, yvec6, yvec15;
  457. ADD_DY yvec13, yvec7, yvec13;
  458. MUL_DY yvec1, yvec2, yvec6;
  459. MUL_DY yvec1, yvec3, yvec7;
  460. ADDQ $4*SIZE, ptrbb;
  461. ADD_DY yvec14, yvec6, yvec14;
  462. ADD_DY yvec12, yvec7, yvec12;
  463. MUL_DY yvec0, yvec4, yvec6;
  464. MUL_DY yvec0, yvec5, yvec7;
  465. ADD_DY yvec11, yvec6, yvec11;
  466. ADD_DY yvec9, yvec7, yvec9;
  467. MUL_DY yvec1, yvec4, yvec6;
  468. MUL_DY yvec1, yvec5, yvec7;
  469. ADD_DY yvec10, yvec6, yvec10;
  470. ADD_DY yvec8, yvec7, yvec8;
  471. .L4_loopE:;
  472. #### Load Alpha ####
  473. BROAD_DY MEMALPHA,yvec7;
  474. #### Multiply Alpha ####
  475. MUL_DY yvec7,yvec15,yvec15;
  476. MUL_DY yvec7,yvec14,yvec14;
  477. MUL_DY yvec7,yvec13,yvec13;
  478. MUL_DY yvec7,yvec12,yvec12;
  479. MUL_DY yvec7,yvec11,yvec11;
  480. MUL_DY yvec7,yvec10,yvec10;
  481. MUL_DY yvec7,yvec9,yvec9;
  482. MUL_DY yvec7,yvec8,yvec8;
  483. #### Reverse the Results ####
  484. MOV_DY yvec15,yvec7;
  485. REVS_DY $0x0a,yvec13,yvec15,yvec15;
  486. REVS_DY $0x0a,yvec7,yvec13,yvec13;
  487. MOV_DY yvec14,yvec7;
  488. REVS_DY $0x0a,yvec12,yvec14,yvec14;
  489. REVS_DY $0x0a,yvec7,yvec12,yvec12;
  490. MOV_DY yvec11,yvec7;
  491. REVS_DY $0x0a,yvec9,yvec11,yvec11;
  492. REVS_DY $0x0a,yvec7,yvec9,yvec9;
  493. MOV_DY yvec10,yvec7;
  494. REVS_DY $0x0a,yvec8,yvec10,yvec10;
  495. REVS_DY $0x0a,yvec7,yvec8,yvec8;
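# The rotated-B multiply pattern leaves each ymm accumulator holding 64-bit
# elements that belong to two different columns of C; the MOV_DY/REVS_DY
# (vshufpd) pairs swap the mismatched elements between paired accumulators
# so the stores below can write contiguous pieces of single columns.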
  496. #### Testing alignment ####
  497. MOVQ C0, %rax;
  498. OR ldc, %rax;
  499. TEST $15, %rax;
  500. JNE .L4_loopEx; # unaligned C write-back path
  501. ALIGN_5
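# Aligned write-back: OR-ing C0 with ldc and testing the low 4 bits checks
# that C0, C0+ldc, C1 and C1+ldc are all 16-byte aligned, allowing aligned
# vmovapd stores (ST_DY on xmm halves).  Otherwise the JNE above takes the
# .L4_loopEx path, which writes each double with vmovlpd/vmovhpd
# (STL_DY/STH_DY).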
  502. #### Writing Back ####
  503. EXTRA_DY $1,yvec15,xvec7;
  504. EXTRA_DY $1,yvec14,xvec6;
  505. EXTRA_DY $1,yvec13,xvec5;
  506. EXTRA_DY $1,yvec12,xvec4;
  507. EXTRA_DY $1,yvec11,xvec3;
  508. EXTRA_DY $1,yvec10,xvec2;
  509. EXTRA_DY $1,yvec9,xvec1;
  510. EXTRA_DY $1,yvec8,xvec0;
  511. #ifndef TRMMKERNEL
  512. ADD_DY 0*SIZE(C0),xvec15,xvec15;
  513. ADD_DY 2*SIZE(C1),xvec7,xvec7;
  514. ADD_DY 4*SIZE(C0),xvec14,xvec14;
  515. ADD_DY 6*SIZE(C1),xvec6,xvec6;
  516. ADD_DY 0*SIZE(C0,ldc,1),xvec13,xvec13;
  517. ADD_DY 2*SIZE(C1,ldc,1),xvec5,xvec5;
  518. ADD_DY 4*SIZE(C0,ldc,1),xvec12,xvec12;
  519. ADD_DY 6*SIZE(C1,ldc,1),xvec4,xvec4;
  520. ADD_DY 0*SIZE(C1),xvec11,xvec11;
  521. ADD_DY 2*SIZE(C0),xvec3,xvec3;
  522. ADD_DY 4*SIZE(C1),xvec10,xvec10;
  523. ADD_DY 6*SIZE(C0),xvec2,xvec2;
  524. ADD_DY 0*SIZE(C1,ldc,1),xvec9,xvec9;
  525. ADD_DY 2*SIZE(C0,ldc,1),xvec1,xvec1;
  526. ADD_DY 4*SIZE(C1,ldc,1),xvec8,xvec8;
  527. ADD_DY 6*SIZE(C0,ldc,1),xvec0,xvec0;
  528. #endif
  529. ST_DY xvec15, 0*SIZE(C0);
  530. ST_DY xvec7, 2*SIZE(C1);
  531. ST_DY xvec14, 4*SIZE(C0);
  532. ST_DY xvec6, 6*SIZE(C1);
  533. ST_DY xvec13, 0*SIZE(C0,ldc,1);
  534. ST_DY xvec5, 2*SIZE(C1,ldc,1);
  535. ST_DY xvec12, 4*SIZE(C0,ldc,1);
  536. ST_DY xvec4, 6*SIZE(C1,ldc,1);
  537. ST_DY xvec11, 0*SIZE(C1);
  538. ST_DY xvec3, 2*SIZE(C0);
  539. ST_DY xvec10, 4*SIZE(C1);
  540. ST_DY xvec2, 6*SIZE(C0);
  541. ST_DY xvec9, 0*SIZE(C1,ldc,1);
  542. ST_DY xvec1, 2*SIZE(C0,ldc,1);
  543. ST_DY xvec8, 4*SIZE(C1,ldc,1);
  544. ST_DY xvec0, 6*SIZE(C0,ldc,1);
  545. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) ||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  546. MOVQ bk, %rax;
  547. SUBQ kkk, %rax;
  548. LEAQ (, %rax, SIZE), %rax;
  549. LEAQ (ptrba, %rax, 8), ptrba;
  550. LEAQ (ptrbb, %rax, 4), ptrbb;
  551. #endif
  552. #if defined(TRMMKERNEL) && defined(LEFT)
  553. ADDQ $8, kk
  554. #endif
  555. ADDQ $8*SIZE,C0;
  556. ADDQ $8*SIZE,C1;
  557. .L1_bodyE:;
  558. DECQ i;
  559. JG .L1_bodyB;
  560. JMP .L1_loopE;
  561. ALIGN_5;
  562. .L4_loopEx:;
  563. EXTRA_DY $1, yvec15, xvec7;
  564. #ifndef TRMMKERNEL
  565. LDL_DY 0*SIZE(C0), xvec6, xvec6;
  566. LDH_DY 1*SIZE(C0), xvec6, xvec6;
  567. ADD_DY xvec6, xvec15, xvec15;
  568. LDL_DY 2*SIZE(C1), xvec5, xvec5;
  569. LDH_DY 3*SIZE(C1), xvec5, xvec5;
  570. ADD_DY xvec5, xvec7, xvec7;
  571. #endif
  572. STL_DY xvec15, 0*SIZE(C0);
  573. STH_DY xvec15, 1*SIZE(C0);
  574. STL_DY xvec7, 2*SIZE(C1);
  575. STH_DY xvec7, 3*SIZE(C1);
  576. EXTRA_DY $1, yvec14, xvec4;
  577. #ifndef TRMMKERNEL
  578. LDL_DY 4*SIZE(C0), xvec3, xvec3;
  579. LDH_DY 5*SIZE(C0), xvec3, xvec3;
  580. ADD_DY xvec3, xvec14, xvec14;
  581. LDL_DY 6*SIZE(C1), xvec2, xvec2;
  582. LDH_DY 7*SIZE(C1), xvec2, xvec2;
  583. ADD_DY xvec2, xvec4, xvec4;
  584. #endif
  585. STL_DY xvec14, 4*SIZE(C0);
  586. STH_DY xvec14, 5*SIZE(C0);
  587. STL_DY xvec4, 6*SIZE(C1);
  588. STH_DY xvec4, 7*SIZE(C1);
  589. EXTRA_DY $1, yvec13, xvec7;
  590. #ifndef TRMMKERNEL
  591. LDL_DY 0*SIZE(C0, ldc, 1), xvec6, xvec6;
  592. LDH_DY 1*SIZE(C0, ldc, 1), xvec6, xvec6;
  593. ADD_DY xvec6, xvec13, xvec13;
  594. LDL_DY 2*SIZE(C1, ldc, 1), xvec5, xvec5;
  595. LDH_DY 3*SIZE(C1, ldc, 1), xvec5, xvec5;
  596. ADD_DY xvec5, xvec7, xvec7;
  597. #endif
  598. STL_DY xvec13, 0*SIZE(C0, ldc, 1);
  599. STH_DY xvec13, 1*SIZE(C0, ldc, 1);
  600. STL_DY xvec7, 2*SIZE(C1, ldc, 1);
  601. STH_DY xvec7, 3*SIZE(C1, ldc, 1);
  602. EXTRA_DY $1, yvec12, xvec4;
  603. #ifndef TRMMKERNEL
  604. LDL_DY 4*SIZE(C0, ldc, 1), xvec3, xvec3;
  605. LDH_DY 5*SIZE(C0, ldc, 1), xvec3, xvec3;
  606. ADD_DY xvec3, xvec12, xvec12;
  607. LDL_DY 6*SIZE(C1, ldc, 1), xvec2, xvec2;
  608. LDH_DY 7*SIZE(C1, ldc, 1), xvec2, xvec2;
  609. ADD_DY xvec2, xvec4, xvec4;
  610. #endif
  611. STL_DY xvec12, 4*SIZE(C0, ldc, 1);
  612. STH_DY xvec12, 5*SIZE(C0, ldc ,1);
  613. STL_DY xvec4, 6*SIZE(C1, ldc, 1);
  614. STH_DY xvec4, 7*SIZE(C1, ldc, 1);
  615. EXTRA_DY $1, yvec11, xvec7;
  616. #ifndef TRMMKERNEL
  617. LDL_DY 0*SIZE(C1), xvec6, xvec6;
  618. LDH_DY 1*SIZE(C1), xvec6, xvec6;
  619. ADD_DY xvec6, xvec11, xvec11;
  620. LDL_DY 2*SIZE(C0), xvec5, xvec5;
  621. LDH_DY 3*SIZE(C0), xvec5, xvec5;
  622. ADD_DY xvec5, xvec7, xvec7;
  623. #endif
  624. STL_DY xvec11, 0*SIZE(C1);
  625. STH_DY xvec11, 1*SIZE(C1);
  626. STL_DY xvec7, 2*SIZE(C0);
  627. STH_DY xvec7, 3*SIZE(C0);
  628. EXTRA_DY $1, yvec10, xvec4;
  629. #ifndef TRMMKERNEL
  630. LDL_DY 4*SIZE(C1), xvec3, xvec3;
  631. LDH_DY 5*SIZE(C1), xvec3, xvec3;
  632. ADD_DY xvec3, xvec10, xvec10;
  633. LDL_DY 6*SIZE(C0), xvec2, xvec2;
  634. LDH_DY 7*SIZE(C0), xvec2, xvec2;
  635. ADD_DY xvec2, xvec4, xvec4;
  636. #endif
  637. STL_DY xvec10, 4*SIZE(C1);
  638. STH_DY xvec10, 5*SIZE(C1);
  639. STL_DY xvec4, 6*SIZE(C0);
  640. STH_DY xvec4, 7*SIZE(C0);
  641. EXTRA_DY $1, yvec9, xvec7;
  642. #ifndef TRMMKERNEL
  643. LDL_DY 0*SIZE(C1, ldc, 1), xvec6, xvec6;
  644. LDH_DY 1*SIZE(C1, ldc, 1), xvec6, xvec6;
  645. ADD_DY xvec6, xvec9, xvec9;
  646. LDL_DY 2*SIZE(C0, ldc, 1), xvec5, xvec5;
  647. LDH_DY 3*SIZE(C0, ldc ,1), xvec5, xvec5;
  648. ADD_DY xvec5, xvec7, xvec7;
  649. #endif
  650. STL_DY xvec9, 0*SIZE(C1, ldc, 1);
  651. STH_DY xvec9, 1*SIZE(C1, ldc, 1);
  652. STL_DY xvec7, 2*SIZE(C0, ldc, 1);
  653. STH_DY xvec7, 3*SIZE(C0, ldc, 1);
  654. EXTRA_DY $1, yvec8, xvec4;
  655. #ifndef TRMMKERNEL
  656. LDL_DY 4*SIZE(C1, ldc, 1), xvec3, xvec3;
  657. LDH_DY 5*SIZE(C1, ldc, 1), xvec3, xvec3;
  658. ADD_DY xvec3, xvec8, xvec8;
  659. LDL_DY 6*SIZE(C0, ldc, 1), xvec2, xvec2;
  660. LDH_DY 7*SIZE(C0, ldc, 1), xvec2, xvec2;
  661. ADD_DY xvec2, xvec4, xvec4;
  662. #endif
  663. STL_DY xvec8, 4*SIZE(C1, ldc, 1);
  664. STH_DY xvec8, 5*SIZE(C1, ldc, 1);
  665. STL_DY xvec4, 6*SIZE(C0, ldc, 1);
  666. STH_DY xvec4, 7*SIZE(C0, ldc, 1);
  667. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) ||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  668. MOVQ bk, %rax;
  669. SUBQ kkk, %rax;
  670. LEAQ (, %rax, SIZE), %rax;
  671. LEAQ (ptrba, %rax, 8), ptrba;
  672. LEAQ (ptrbb, %rax, 4), ptrbb;
  673. #endif
  674. #if defined(TRMMKERNEL) && defined(LEFT)
  675. ADDQ $8, kk
  676. #endif
  677. ADDQ $8*SIZE, C0;
  678. ADDQ $8*SIZE, C1;
  679. DECQ i;
  680. JG .L1_bodyB;
  681. ALIGN_5
  682. .L1_loopE:;
  683. TEST $4, bm; # Rm = 4
  684. JLE .L5_loopE;
  685. ALIGN_5
  686. .L5_bodyB:;
  687. #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  688. MOVQ bb, ptrbb;
  689. #else
  690. MOVQ bb, ptrbb;
  691. MOVQ kk, %rax;
  692. LEAQ (, %rax, SIZE), %rax;
  693. LEAQ (ptrba, %rax, 4), ptrba;
  694. LEAQ (ptrbb, %rax, 4), ptrbb;
  695. #endif
  696. #### Initialize result registers ####
  697. XOR_DY yvec15, yvec15, yvec15;
  698. XOR_DY yvec13, yvec13, yvec13;
  699. LD_DY 0*SIZE(ptrbb), yvec2;
  700. XOR_DY yvec11, yvec11, yvec11;
  701. XOR_DY yvec9, yvec9, yvec9;
  702. LD_DY 0*SIZE(ptrba), yvec0;
  703. VPERMILP_DY $0x05, yvec2, yvec3;
  704. #ifndef TRMMKERNEL
  705. MOVQ bk, k;
  706. #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
  707. MOVQ bk, %rax;
  708. SUBQ kk, %rax;
  709. MOVQ %rax, kkk;
  710. #else
  711. MOVQ kk, %rax;
  712. #ifdef LEFT
  713. ADDQ $4, %rax;
  714. #else
  715. ADDQ $4, %rax;
  716. #endif
  717. MOVQ %rax, kkk;
  718. #endif
  719. SARQ $2, k;
  720. JLE .L6_loopE;
  721. ALIGN_5;
  722. .L6_bodyB:;
  723. # Computing kernel
  724. #### Unroll time 1 ####
  725. LD_DY 4*SIZE(ptrba), yvec1;
  726. MUL_DY yvec0, yvec2, yvec6;
  727. ADD_DY yvec15, yvec6, yvec15;
  728. SHUF_DY $0x03, yvec2, yvec2, yvec4;
  729. MUL_DY yvec0, yvec3, yvec7;
  730. ADD_DY yvec13, yvec7, yvec13;
  731. SHUF_DY $0x03, yvec3, yvec3, yvec5;
  732. LD_DY 4*SIZE(ptrbb), yvec2;
  733. MUL_DY yvec0, yvec4, yvec6;
  734. ADD_DY yvec11, yvec6, yvec11;
  735. VPERMILP_DY $0x05, yvec2, yvec3;
  736. MUL_DY yvec0, yvec5, yvec7;
  737. ADD_DY yvec9, yvec7, yvec9;
  738. #### Unroll time 2 ####
  739. LD_DY 8*SIZE(ptrba), yvec0;
  740. MUL_DY yvec1, yvec2, yvec6;
  741. ADD_DY yvec15, yvec6, yvec15;
  742. SHUF_DY $0x03, yvec2, yvec2, yvec4;
  743. MUL_DY yvec1, yvec3, yvec7;
  744. ADD_DY yvec13, yvec7, yvec13;
  745. SHUF_DY $0x03, yvec3, yvec3, yvec5;
  746. LD_DY 8*SIZE(ptrbb), yvec2;
  747. MUL_DY yvec1, yvec4, yvec6;
  748. ADD_DY yvec11, yvec6, yvec11;
  749. VPERMILP_DY $0x05, yvec2, yvec3;
  750. MUL_DY yvec1, yvec5, yvec7;
  751. ADD_DY yvec9, yvec7, yvec9;
  752. #### Unroll time 3 ####
  753. LD_DY 12*SIZE(ptrba), yvec1;
  754. MUL_DY yvec0, yvec2, yvec6;
  755. ADD_DY yvec15, yvec6, yvec15;
  756. SHUF_DY $0x03, yvec2, yvec2, yvec4;
  757. ADDQ $16*SIZE, ptrba;
  758. MUL_DY yvec0, yvec3, yvec7;
  759. ADD_DY yvec13, yvec7, yvec13;
  760. SHUF_DY $0x03, yvec3, yvec3, yvec5;
  761. LD_DY 12*SIZE(ptrbb), yvec2;
  762. MUL_DY yvec0, yvec4, yvec6;
  763. ADD_DY yvec11, yvec6, yvec11;
  764. VPERMILP_DY $0x05, yvec2, yvec3;
  765. ADDQ $16*SIZE, ptrbb;
  766. MUL_DY yvec0, yvec5, yvec7;
  767. ADD_DY yvec9, yvec7, yvec9;
  768. #### Unroll time 4 ####
  769. LD_DY 0*SIZE(ptrba), yvec0;
  770. MUL_DY yvec1, yvec2, yvec6;
  771. ADD_DY yvec15, yvec6, yvec15;
  772. SHUF_DY $0x03, yvec2, yvec2, yvec4;
  773. MUL_DY yvec1, yvec3, yvec7;
  774. ADD_DY yvec13, yvec7, yvec13;
  775. SHUF_DY $0x03, yvec3, yvec3, yvec5;
  776. LD_DY 0*SIZE(ptrbb), yvec2;
  777. MUL_DY yvec1, yvec4, yvec6;
  778. ADD_DY yvec11, yvec6, yvec11;
  779. VPERMILP_DY $0x05, yvec2, yvec3;
  780. MUL_DY yvec1, yvec5, yvec7;
  781. ADD_DY yvec9, yvec7, yvec9;
  782. DECQ k;
  783. JG .L6_bodyB;
  784. ALIGN_5
  785. .L6_loopE:;
  786. #ifndef TRMMKERNEL
  787. TEST $2, bk;
  788. #else
  789. MOVQ kkk, %rax;
  790. TEST $2, %rax;
  791. #endif
  792. JLE .L7_loopE;
  793. ALIGN_5
  794. .L7_bodyB:;
  795. #### Unroll time 1 ####
  796. LD_DY 4*SIZE(ptrba), yvec1;
  797. MUL_DY yvec0, yvec2, yvec6;
  798. ADD_DY yvec15, yvec6, yvec15;
  799. SHUF_DY $0x03, yvec2, yvec2, yvec4;
  800. ADDQ $8*SIZE, ptrba;
  801. MUL_DY yvec0, yvec3, yvec7;
  802. ADD_DY yvec13, yvec7, yvec13;
  803. SHUF_DY $0x03, yvec3, yvec3, yvec5;
  804. LD_DY 4*SIZE(ptrbb), yvec2;
  805. MUL_DY yvec0, yvec4, yvec6;
  806. ADD_DY yvec11, yvec6, yvec11;
  807. VPERMILP_DY $0x05, yvec2, yvec3;
  808. ADDQ $8*SIZE, ptrbb;
  809. MUL_DY yvec0, yvec5, yvec7;
  810. ADD_DY yvec9, yvec7, yvec9;
  811. #### Unroll time 2 ####
  812. LD_DY 0*SIZE(ptrba), yvec0;
  813. MUL_DY yvec1, yvec2, yvec6;
  814. ADD_DY yvec15, yvec6, yvec15;
  815. SHUF_DY $0x03, yvec2, yvec2, yvec4;
  816. MUL_DY yvec1, yvec3, yvec7;
  817. ADD_DY yvec13, yvec7, yvec13;
  818. SHUF_DY $0x03, yvec3, yvec3, yvec5;
  819. LD_DY 0*SIZE(ptrbb), yvec2;
  820. MUL_DY yvec1, yvec4, yvec6;
  821. ADD_DY yvec11, yvec6, yvec11;
  822. VPERMILP_DY $0x05, yvec2, yvec3;
  823. MUL_DY yvec1, yvec5, yvec7;
  824. ADD_DY yvec9, yvec7, yvec9;
  825. .L7_loopE:;
  826. #ifndef TRMMKERNEL
  827. TEST $1, bk
  828. #else
  829. MOVQ kkk, %rax;
  830. TEST $1, %rax;
  831. #endif
  832. JLE .L8_loopE;
  833. ALIGN_5
  834. .L8_bodyB:;
  835. #### Unroll time 1 ####
  836. MUL_DY yvec0, yvec2, yvec6;
  837. ADD_DY yvec15, yvec6, yvec15;
  838. SHUF_DY $0x03, yvec2, yvec2, yvec4;
  839. ADDQ $4*SIZE, ptrba;
  840. MUL_DY yvec0, yvec3, yvec7;
  841. ADD_DY yvec13, yvec7, yvec13;
  842. SHUF_DY $0x03, yvec3, yvec3, yvec5;
  843. MUL_DY yvec0, yvec4, yvec6;
  844. ADD_DY yvec11, yvec6, yvec11;
  845. ADDQ $4*SIZE, ptrbb;
  846. MUL_DY yvec0, yvec5, yvec7;
  847. ADD_DY yvec9, yvec7, yvec9;
  848. .L8_loopE:;
  849. #### Load Alpha ####
  850. BROAD_DY MEMALPHA, yvec7;
  851. #### Multiply Alpha ####
  852. MUL_DY yvec7,yvec15,yvec15;
  853. MUL_DY yvec7,yvec13,yvec13;
  854. MUL_DY yvec7,yvec11,yvec11;
  855. MUL_DY yvec7,yvec9,yvec9;
  856. #### Reverse the Results ####
  857. MOV_DY yvec15, yvec7;
  858. REVS_DY $0x0a,yvec13,yvec15,yvec15;
  859. REVS_DY $0x0a,yvec7,yvec13,yvec13;
  860. MOV_DY yvec11,yvec7;
  861. REVS_DY $0x0a,yvec9,yvec11,yvec11;
  862. REVS_DY $0x0a,yvec7,yvec9,yvec9;
  863. #### Testing alignment ####
  864. MOVQ C0, %rax;
  865. OR ldc, %rax;
  866. TEST $15, %rax;
  867. JNE .L8_loopEx; # unaligned C write-back path
  868. ALIGN_5
  869. #### Writing Back ####
  870. EXTRA_DY $1,yvec15,xvec7;
  871. EXTRA_DY $1,yvec13,xvec5;
  872. EXTRA_DY $1,yvec11,xvec3;
  873. EXTRA_DY $1,yvec9,xvec1;
  874. #ifndef TRMMKERNEL
  875. ADD_DX 0*SIZE(C0), xvec15, xvec15;
  876. ADD_DX 2*SIZE(C1), xvec7, xvec7;
  877. ADD_DX 0*SIZE(C0, ldc, 1), xvec13, xvec13;
  878. ADD_DX 2*SIZE(C1, ldc, 1), xvec5, xvec5;
  879. ADD_DX 0*SIZE(C1), xvec11, xvec11;
  880. ADD_DX 2*SIZE(C0), xvec3, xvec3;
  881. ADD_DX 0*SIZE(C1, ldc, 1), xvec9, xvec9;
  882. ADD_DX 2*SIZE(C0, ldc, 1), xvec1, xvec1;
  883. #endif
  884. ST_DX xvec15, 0*SIZE(C0);
  885. ST_DX xvec7, 2*SIZE(C1);
  886. ST_DX xvec13, 0*SIZE(C0,ldc,1);
  887. ST_DX xvec5, 2*SIZE(C1,ldc,1);
  888. ST_DX xvec11, 0*SIZE(C1);
  889. ST_DX xvec3, 2*SIZE(C0);
  890. ST_DX xvec9, 0*SIZE(C1,ldc,1);
  891. ST_DX xvec1, 2*SIZE(C0,ldc,1);
  892. #if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  893. MOVQ bk, %rax;
  894. SUBQ kkk, %rax;
  895. LEAQ (, %rax, SIZE), %rax;
  896. LEAQ (ptrba, %rax, 4), ptrba;
  897. LEAQ (ptrbb, %rax, 4), ptrbb;
  898. #endif
  899. #if defined(TRMMKERNEL)&&defined(LEFT)
  900. ADDQ $4, kk
  901. #endif
  902. ADDQ $4*SIZE, C0;
  903. ADDQ $4*SIZE, C1;
  904. JMP .L5_loopE;
  905. ALIGN_5
  906. .L8_loopEx:;
  907. EXTRA_DY $1,yvec15,xvec7;
  908. EXTRA_DY $1,yvec13,xvec5;
  909. EXTRA_DY $1,yvec11,xvec3;
  910. EXTRA_DY $1,yvec9,xvec1;
  911. #ifndef TRMMKERNEL
  912. LDL_DX 0*SIZE(C0), xvec14, xvec14;
  913. LDH_DX 1*SIZE(C0), xvec14, xvec14;
  914. LDL_DX 0*SIZE(C0, ldc, 1), xvec12, xvec12;
  915. LDH_DX 1*SIZE(C0, ldc, 1), xvec12, xvec12;
  916. LDL_DX 0*SIZE(C1), xvec10, xvec10;
  917. LDH_DX 1*SIZE(C1), xvec10, xvec10;
  918. LDL_DX 0*SIZE(C1, ldc, 1), xvec8, xvec8;
  919. LDH_DX 1*SIZE(C1, ldc, 1), xvec8, xvec8;
  920. ADD_DX xvec14, xvec15, xvec15;
  921. ADD_DX xvec12, xvec13, xvec13;
  922. ADD_DX xvec10, xvec11, xvec11;
  923. ADD_DX xvec8, xvec9, xvec9;
  924. #endif
  925. STL_DX xvec15, 0*SIZE(C0);
  926. STH_DX xvec15, 1*SIZE(C0);
  927. STL_DX xvec13, 0*SIZE(C0, ldc, 1);
  928. STH_DX xvec13, 1*SIZE(C0, ldc, 1);
  929. STL_DX xvec11, 0*SIZE(C1);
  930. STH_DX xvec11, 1*SIZE(C1);
  931. STL_DX xvec9, 0*SIZE(C1, ldc, 1);
  932. STH_DX xvec9, 1*SIZE(C1, ldc, 1);
  933. #ifndef TRMMKERNEL
  934. LDL_DX 2*SIZE(C0), xvec0, xvec0;
  935. LDH_DX 3*SIZE(C0), xvec0, xvec0;
  936. LDL_DX 2*SIZE(C0, ldc, 1), xvec2, xvec2;
  937. LDH_DX 3*SIZE(C0, ldc, 1), xvec2, xvec2;
  938. LDL_DX 2*SIZE(C1), xvec4, xvec4;
  939. LDH_DX 3*SIZE(C1), xvec4, xvec4;
  940. LDL_DX 2*SIZE(C1, ldc, 1), xvec6, xvec6;
  941. LDH_DX 3*SIZE(C1, ldc, 1), xvec6, xvec6;
  942. ADD_DX xvec0, xvec3, xvec3;
  943. ADD_DX xvec2, xvec1, xvec1;
  944. ADD_DX xvec4, xvec7, xvec7;
  945. ADD_DX xvec6, xvec5, xvec5;
  946. #endif
  947. STL_DX xvec3, 2*SIZE(C0);
  948. STH_DX xvec3, 3*SIZE(C0);
  949. STL_DX xvec1, 2*SIZE(C0, ldc, 1);
  950. STH_DX xvec1, 3*SIZE(C0, ldc, 1);
  951. STL_DX xvec7, 2*SIZE(C1);
  952. STH_DX xvec7, 3*SIZE(C1);
  953. STL_DX xvec5, 2*SIZE(C1, ldc, 1);
  954. STH_DX xvec5, 3*SIZE(C1, ldc, 1);
  955. #if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  956. MOVQ bk, %rax;
  957. SUBQ kkk, %rax;
  958. LEAQ (, %rax, SIZE), %rax;
  959. LEAQ (ptrba, %rax, 4), ptrba;
  960. LEAQ (ptrbb, %rax, 4), ptrbb;
  961. #endif
  962. #if defined(TRMMKERNEL)&&defined(LEFT)
  963. ADDQ $4, kk
  964. #endif
  965. ADDQ $4*SIZE, C0;
  966. ADDQ $4*SIZE, C1;
  967. .L5_loopE:;
  968. TEST $2, bm;
  969. JLE .L9_loopE;
  970. ALIGN_5
  971. .L9_bodyB:;
  972. #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  973. MOVQ bb, ptrbb;
  974. #else
  975. MOVQ bb, ptrbb;
  976. MOVQ kk, %rax;
  977. LEAQ (, %rax, SIZE), %rax;
  978. LEAQ (ptrba, %rax, 2), ptrba;
  979. LEAQ (ptrbb, %rax, 4), ptrbb
  980. #endif
  981. #### Initialize result registers ####
  982. LD_DX 0*SIZE(ptrbb), xvec2;
  983. XOR_DY yvec15, yvec15, yvec15;
  984. LD_DX 2*SIZE(ptrbb), xvec3;
  985. XOR_DY yvec13, yvec13, yvec13;
  986. LD_DX 0*SIZE(ptrba), xvec0;
  987. XOR_DY yvec11, yvec11, yvec11;
  988. SHUF_DX $0x4e, xvec2, xvec4;
  989. XOR_DY yvec9, yvec9, yvec9;
  990. #ifndef TRMMKERNEL
  991. MOVQ bk, k;
  992. #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
  993. MOVQ bk, %rax;
  994. SUBQ kk, %rax;
  995. MOVQ %rax, kkk;
  996. #else
  997. MOVQ kk, %rax;
  998. #ifdef LEFT
  999. ADDQ $2, %rax;
  1000. #else
  1001. ADDQ $4, %rax;
  1002. #endif
  1003. MOVQ %rax, kkk;
  1004. #endif
  1005. SARQ $2, k;
  1006. JLE .L10_loopE;
  1007. ALIGN_5;
  1008. .L10_bodyB:;
  1009. # Computing kernel
  1010. #### Unroll time 1 ####
  1011. LD_DX 4*SIZE(ptrbb), xvec6;
  1012. SHUF_DX $0x4e, xvec3, xvec5;
  1013. MUL_DX xvec0, xvec2, xvec2;
  1014. ADD_DX xvec2, xvec15, xvec15;
  1015. LD_DX 6*SIZE(ptrbb), xvec7;
  1016. MUL_DX xvec0, xvec3, xvec3;
  1017. ADD_DX xvec3, xvec11, xvec11;
  1018. LD_DX 2*SIZE(ptrba), xvec1;
  1019. MUL_DX xvec0, xvec4, xvec4;
  1020. ADD_DX xvec4, xvec13, xvec13;
  1021. SHUF_DX $0x4e, xvec6, xvec4;
  1022. MUL_DX xvec0, xvec5, xvec5;
  1023. ADD_DX xvec5, xvec9, xvec9;
  1024. #### Unroll time 2 ####
  1025. LD_DX 8*SIZE(ptrbb), xvec2;
  1026. SHUF_DX $0x4e, xvec7, xvec5;
  1027. MUL_DX xvec1, xvec6, xvec6;
  1028. ADD_DX xvec6, xvec15, xvec15;
  1029. LD_DX 10*SIZE(ptrbb), xvec3;
  1030. MUL_DX xvec1, xvec7, xvec7;
  1031. ADD_DX xvec7, xvec11, xvec11;
  1032. LD_DX 4*SIZE(ptrba), xvec0;
  1033. MUL_DX xvec1, xvec4, xvec4;
  1034. ADD_DX xvec4, xvec13, xvec13;
  1035. SHUF_DX $0x4e, xvec2, xvec4;
  1036. MUL_DX xvec1, xvec5, xvec5;
  1037. ADD_DX xvec5, xvec9, xvec9;
  1038. #### Unroll time 3 ####
  1039. LD_DX 12*SIZE(ptrbb), xvec6;
  1040. SHUF_DX $0x4e, xvec3, xvec5;
  1041. MUL_DX xvec0, xvec2, xvec2;
  1042. ADD_DX xvec2, xvec15, xvec15;
  1043. LD_DX 14*SIZE(ptrbb), xvec7;
  1044. MUL_DX xvec0, xvec3, xvec3;
  1045. ADD_DX xvec3, xvec11, xvec11;
  1046. ADDQ $16*SIZE, ptrbb;
  1047. LD_DX 6*SIZE(ptrba), xvec1;
  1048. MUL_DX xvec0, xvec4, xvec4;
  1049. ADD_DX xvec4, xvec13, xvec13;
  1050. SHUF_DX $0x4e, xvec6, xvec4;
  1051. ADDQ $8*SIZE, ptrba;
  1052. MUL_DX xvec0, xvec5, xvec5;
  1053. ADD_DX xvec5, xvec9, xvec9;
  1054. #### Unroll time 4 ####
  1055. LD_DX 0*SIZE(ptrbb), xvec2;
  1056. SHUF_DX $0x4e, xvec7, xvec5;
  1057. MUL_DX xvec1, xvec6, xvec6;
  1058. ADD_DX xvec6, xvec15, xvec15;
  1059. LD_DX 2*SIZE(ptrbb), xvec3;
  1060. MUL_DX xvec1, xvec7, xvec7;
  1061. ADD_DX xvec7, xvec11, xvec11;
  1062. LD_DX 0*SIZE(ptrba), xvec0;
  1063. MUL_DX xvec1, xvec4, xvec4;
  1064. ADD_DX xvec4, xvec13, xvec13;
  1065. SHUF_DX $0x4e, xvec2, xvec4;
  1066. MUL_DX xvec1, xvec5, xvec5;
  1067. ADD_DX xvec5, xvec9, xvec9;
  1068. DECQ k;
  1069. JG .L10_bodyB;
  1070. ALIGN_5
  1071. .L10_loopE:;
  1072. #ifndef TRMMKERNEL
  1073. TEST $2, bk
  1074. #else
  1075. MOVQ kkk, %rax;
  1076. TEST $2, %rax;
  1077. #endif
  1078. JLE .L11_loopE;
  1079. ALIGN_5
  1080. .L11_bodyB:;
  1081. #### Unroll time 1 ####
  1082. LD_DX 4*SIZE(ptrbb), xvec6;
  1083. SHUF_DX $0x4e, xvec3, xvec5;
  1084. MUL_DX xvec0, xvec2, xvec2;
  1085. ADD_DX xvec2, xvec15, xvec15;
  1086. LD_DX 6*SIZE(ptrbb), xvec7;
  1087. MUL_DX xvec0, xvec3, xvec3;
  1088. ADD_DX xvec3, xvec11, xvec11;
  1089. ADDQ $8*SIZE, ptrbb;
  1090. LD_DX 2*SIZE(ptrba), xvec1;
  1091. MUL_DX xvec0, xvec4, xvec4;
  1092. ADD_DX xvec4, xvec13, xvec13;
  1093. SHUF_DX $0x4e, xvec6, xvec4;
  1094. ADDQ $4*SIZE, ptrba;
  1095. MUL_DX xvec0, xvec5, xvec5;
  1096. ADD_DX xvec5, xvec9, xvec9;
  1097. #### Unroll time 2 ####
  1098. LD_DX 0*SIZE(ptrbb), xvec2;
  1099. SHUF_DX $0x4e, xvec7, xvec5;
  1100. MUL_DX xvec1, xvec6, xvec6;
  1101. ADD_DX xvec6, xvec15, xvec15;
  1102. LD_DX 2*SIZE(ptrbb), xvec3;
  1103. MUL_DX xvec1, xvec7, xvec7;
  1104. ADD_DX xvec7, xvec11, xvec11;
  1105. LD_DX 0*SIZE(ptrba), xvec0;
  1106. MUL_DX xvec1, xvec4, xvec4;
  1107. ADD_DX xvec4, xvec13, xvec13;
  1108. SHUF_DX $0x4e, xvec2, xvec4;
  1109. MUL_DX xvec1, xvec5, xvec5;
  1110. ADD_DX xvec5, xvec9, xvec9;
  1111. .L11_loopE:;
  1112. #ifndef TRMMKERNEL
  1113. TEST $1, bk
  1114. #else
  1115. MOVQ kkk, %rax;
  1116. TEST $1, %rax;
  1117. #endif
  1118. JLE .L12_loopE;
  1119. ALIGN_5
  1120. .L12_bodyB:;
  1121. SHUF_DX $0x4e, xvec3, xvec5;
  1122. MUL_DX xvec0, xvec2, xvec2;
  1123. ADD_DX xvec2, xvec15, xvec15;
  1124. ADDQ $4*SIZE, ptrbb;
  1125. MUL_DX xvec0, xvec3, xvec3;
  1126. ADD_DX xvec3, xvec11, xvec11;
  1127. ADDQ $2*SIZE, ptrba;
  1128. MUL_DX xvec0, xvec4, xvec4;
  1129. ADD_DX xvec4, xvec13, xvec13;
  1130. MUL_DX xvec0, xvec5, xvec5;
  1131. ADD_DX xvec5, xvec9, xvec9;
  1132. .L12_loopE:;
  1133. #### Load Alpha ####
  1134. BROAD_DX MEMALPHA, xvec7;
  1135. #### Multiply Alpha ####
  1136. MUL_DX xvec7, xvec15, xvec15;
  1137. MUL_DX xvec7, xvec13, xvec13;
  1138. MUL_DX xvec7, xvec11, xvec11;
  1139. MUL_DX xvec7, xvec9, xvec9;
  1140. #### Reverse the Results ####
  1141. MOV_DX xvec15, xvec6;
  1142. REVS_DX xvec13, xvec15, xvec15;
  1143. REVS_DX xvec6, xvec13, xvec13;
  1144. MOV_DX xvec11, xvec6;
  1145. REVS_DX xvec9, xvec11, xvec11;
  1146. REVS_DX xvec6, xvec9, xvec9;
  1147. #### Testing Alignment ####
  1148. MOVQ C0, %rax;
  1149. OR ldc, %rax;
  1150. TEST $15, %rax;
  1151. JNE .L12_loopEx;
  1152. ALIGN_5
  1153. #### Writing Back ####
  1154. #ifndef TRMMKERNEL
  1155. ADD_DX 0*SIZE(C0), xvec13, xvec13;
  1156. ADD_DX 0*SIZE(C0, ldc, 1), xvec15, xvec15;
  1157. ADD_DX 0*SIZE(C1), xvec9, xvec9;
  1158. ADD_DX 0*SIZE(C1, ldc, 1), xvec11, xvec11;
  1159. #endif
  1160. ST_DX xvec13, 0*SIZE(C0);
  1161. ST_DX xvec15, 0*SIZE(C0, ldc, 1);
  1162. ST_DX xvec9, 0*SIZE(C1);
  1163. ST_DX xvec11, 0*SIZE(C1, ldc, 1);
  1164. #if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  1165. MOVQ bk, %rax;
  1166. SUBQ kkk, %rax;
  1167. LEAQ (,%rax, SIZE), %rax;
  1168. LEAQ (ptrba, %rax, 2), ptrba;
  1169. LEAQ (ptrbb, %rax, 4), ptrbb;
  1170. #endif
  1171. #if defined(TRMMKERNEL) && defined(LEFT)
  1172. ADDQ $2, kk
  1173. #endif
  1174. ADDQ $2*SIZE, C0
  1175. ADDQ $2*SIZE, C1
  1176. JMP .L9_loopE;
  1177. ALIGN_5
  1178. .L12_loopEx:
  1179. #ifndef TRMMKERNEL
  1180. LDL_DX 0*SIZE(C0), xvec14, xvec14;
  1181. LDH_DX 1*SIZE(C0), xvec14, xvec14;
  1182. LDL_DX 0*SIZE(C0, ldc, 1), xvec12, xvec12;
  1183. LDH_DX 1*SIZE(C0, ldc, 1), xvec12, xvec12;
  1184. LDL_DX 0*SIZE(C1), xvec10, xvec10;
  1185. LDH_DX 1*SIZE(C1), xvec10, xvec10;
  1186. LDL_DX 0*SIZE(C1, ldc, 1), xvec8, xvec8;
  1187. LDH_DX 1*SIZE(C1, ldc, 1), xvec8, xvec8;
  1188. ADD_DX xvec14, xvec13, xvec13;
  1189. ADD_DX xvec12, xvec15, xvec15;
  1190. ADD_DX xvec10, xvec9, xvec9;
  1191. ADD_DX xvec8, xvec11, xvec11;
  1192. #endif
  1193. STL_DX xvec13, 0*SIZE(C0);
  1194. STH_DX xvec13, 1*SIZE(C0);
  1195. STL_DX xvec15, 0*SIZE(C0, ldc, 1);
  1196. STH_DX xvec15, 1*SIZE(C0, ldc, 1);
  1197. STL_DX xvec9, 0*SIZE(C1);
  1198. STH_DX xvec9, 1*SIZE(C1);
  1199. STL_DX xvec11, 0*SIZE(C1, ldc, 1);
  1200. STH_DX xvec11, 1*SIZE(C1, ldc, 1);
  1201. #if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  1202. MOVQ bk, %rax;
  1203. SUBQ kkk, %rax;
  1204. LEAQ (,%rax, SIZE), %rax;
  1205. LEAQ (ptrba, %rax, 2), ptrba;
  1206. LEAQ (ptrbb, %rax, 4), ptrbb;
  1207. #endif
  1208. #if defined(TRMMKERNEL) && defined(LEFT)
  1209. ADDQ $2, kk
  1210. #endif
  1211. ADDQ $2*SIZE, C0;
  1212. ADDQ $2*SIZE, C1;
  1213. .L9_loopE:;
  1214. TEST $1, bm
  1215. JLE .L13_loopE;
  1216. ALIGN_5
  1217. .L13_bodyB:;
  1218. #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  1219. MOVQ bb, ptrbb;
  1220. #else
  1221. MOVQ bb, ptrbb;
  1222. MOVQ kk, %rax;
  1223. LEAQ (,%rax, SIZE), %rax;
  1224. ADDQ %rax, ptrba;
  1225. LEAQ (ptrbb, %rax, 4), ptrbb;
  1226. #endif
  1227. #### Initialize result registers ####
  1228. XOR_DY yvec15, yvec15, yvec15;
  1229. #ifndef TRMMKERNEL
  1230. MOVQ bk, k;
  1231. #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
  1232. MOVQ bk, %rax;
  1233. SUBQ kk, %rax;
  1234. MOVQ %rax, kkk;
  1235. #else
  1236. MOVQ kk, %rax;
  1237. #ifdef LEFT
  1238. ADDQ $1, %rax;
  1239. #else
  1240. ADDQ $4, %rax;
  1241. #endif
  1242. MOVQ %rax, kkk;
  1243. #endif
  1244. SARQ $2, k;
  1245. JLE .L14_loopE;
  1246. ALIGN_5
  1247. .L14_bodyB:;
  1248. BROAD_DY 0*SIZE(ptrba), yvec0;
  1249. LD_DY 0*SIZE(ptrbb), yvec2;
  1250. MUL_DY yvec0, yvec2, yvec6;
  1251. ADD_DY yvec15, yvec6, yvec15;
  1252. BROAD_DY 1*SIZE(ptrba), yvec1;
  1253. LD_DY 4*SIZE(ptrbb), yvec3;
  1254. MUL_DY yvec1, yvec3, yvec7;
  1255. ADD_DY yvec15, yvec7, yvec15;
  1256. BROAD_DY 2*SIZE(ptrba), yvec0;
  1257. LD_DY 8*SIZE(ptrbb), yvec2;
  1258. MUL_DY yvec0, yvec2, yvec6;
  1259. ADD_DY yvec15, yvec6, yvec15;
  1260. BROAD_DY 3*SIZE(ptrba), yvec1;
  1261. LD_DY 12*SIZE(ptrbb), yvec3;
  1262. MUL_DY yvec1, yvec3, yvec7;
  1263. ADD_DY yvec15, yvec7, yvec15;
  1264. ADDQ $4*SIZE, ptrba;
  1265. ADDQ $16*SIZE, ptrbb;
  1266. DECQ k;
  1267. JG .L14_bodyB;
  1268. ALIGN_5
  1269. .L14_loopE:
  1270. #ifndef TRMMKERNEL
  1271. TEST $2, bk;
  1272. #else
  1273. MOVQ kkk, %rax;
  1274. TEST $2, %rax;
  1275. #endif
  1276. JLE .L15_loopE;
  1277. ALIGN_5
  1278. .L15_bodyB:
  1279. BROAD_DY 0*SIZE(ptrba), yvec0;
  1280. LD_DY 0*SIZE(ptrbb), yvec2;
  1281. MUL_DY yvec0, yvec2, yvec6;
  1282. ADD_DY yvec15, yvec6, yvec15;
  1283. BROAD_DY 1*SIZE(ptrba), yvec1;
  1284. LD_DY 4*SIZE(ptrbb), yvec3;
  1285. MUL_DY yvec1, yvec3, yvec7;
  1286. ADD_DY yvec15, yvec7, yvec15;
  1287. ADDQ $2*SIZE, ptrba;
  1288. ADDQ $8*SIZE, ptrbb;
  1289. .L15_loopE:;
  1290. #ifndef TRMMKERNEL
  1291. TEST $1, bk;
  1292. #else
  1293. MOVQ kkk, %rax;
  1294. TEST $1, %rax;
  1295. #endif
  1296. JLE .L16_loopE;
  1297. ALIGN_5
  1298. .L16_bodyB:;
  1299. BROAD_DY 0*SIZE(ptrba), yvec0;
  1300. LD_DY 0*SIZE(ptrbb), yvec2;
  1301. MUL_DY yvec0, yvec2, yvec6;
  1302. ADD_DY yvec15, yvec6, yvec15;
  1303. ADDQ $1*SIZE, ptrba;
  1304. ADDQ $4*SIZE, ptrbb;
  1305. .L16_loopE:
  1306. #### Load Alpha ####
  1307. BROAD_DY MEMALPHA, yvec7;
  1308. #### Multiply Alpha ####
  1309. MUL_DY yvec15, yvec7, yvec15;
  1310. #### Writing Back ####
  1311. EXTRA_DY $1, yvec15, xvec7;
  1312. #ifndef TRMMKERNEL
  1313. LDL_DX 0*SIZE(C0), xvec0, xvec0;
  1314. LDH_DX 0*SIZE(C0, ldc, 1), xvec0, xvec0;
  1315. LDL_DX 0*SIZE(C1), xvec1, xvec1;
  1316. LDH_DX 0*SIZE(C1, ldc, 1), xvec1, xvec1;
  1317. ADD_DX xvec0, xvec15, xvec15;
  1318. ADD_DX xvec1, xvec7, xvec7;
  1319. #endif
  1320. STL_DX xvec15, 0*SIZE(C0);
  1321. STH_DX xvec15, 0*SIZE(C0, ldc, 1);
  1322. STL_DX xvec7, 0*SIZE(C1);
  1323. STH_DX xvec7, 0*SIZE(C1, ldc, 1);
  1324. #if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  1325. MOVQ bk, %rax;
  1326. SUBQ kkk, %rax;
  1327. LEAQ (,%rax, SIZE), %rax;
  1328. ADDQ %rax, ptrba;
  1329. LEAQ (ptrbb, %rax, 4), ptrbb;
  1330. #endif
  1331. #if defined(TRMMKERNEL)&&defined(LEFT)
  1332. ADDQ $1, kk
  1333. #endif
  1334. ADDQ $1*SIZE, C0
  1335. ADDQ $1*SIZE, C1
  1336. .L13_loopE:;
  1337. #if defined(TRMMKERNEL)&&!defined(LEFT)
  1338. ADDQ $4, kk
  1339. #endif
  1340. MOVQ bk,k;
  1341. SALQ $5,k;
  1342. ADDQ k,bb;
  1343. LEAQ (C,ldc,4),C;
  1344. .L0_bodyE:;
  1345. DECQ j;
  1346. JG .L0_bodyB;
  1347. ALIGN_5;
  1348. .L0_loopE:;
  1349. TEST $2, bn;
  1350. JLE .L20_loopE;
  1351. ALIGN_5;
  1352. .L20_loopB:;
  1353. #if defined(TRMMKERNEL) && defined(LEFT)
  1354. MOVQ OFFSET, %rax;
  1355. MOVQ %rax, kk
  1356. #endif
  1357. MOVQ C, C0;
  1358. LEAQ (C, ldc, 1), C1;
  1359. MOVQ ba, ptrba;
  1360. MOVQ bm, i;
  1361. SARQ $3, i; # Rm = 8
  1362. JLE .L21_loopE;
  1363. ALIGN_5;
  1364. .L21_bodyB:;
  1365. #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  1366. MOVQ bb, ptrbb;
  1367. #else
  1368. MOVQ bb, ptrbb;
  1369. MOVQ kk, %rax;
  1370. LEAQ (, %rax, SIZE), %rax;
  1371. LEAQ (ptrba, %rax, 8), ptrba;
  1372. LEAQ (ptrbb, %rax, 2), ptrbb;
  1373. #endif
  1374. #### Initialize result registers ####
  1375. XOR_DY yvec15, yvec15, yvec15;
  1376. XOR_DY yvec14, yvec14, yvec14;
  1377. XOR_DY yvec13, yvec13, yvec13;
  1378. XOR_DY yvec12, yvec12, yvec12;
  1379. XOR_DY yvec11, yvec11, yvec11;
  1380. XOR_DY yvec10, yvec10, yvec10;
  1381. XOR_DY yvec9, yvec9, yvec9;
  1382. XOR_DY yvec8, yvec8, yvec8;
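# Inner-loop depth: full bk for GEMM; for TRMM the effective depth is kept in kkk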
  1383. #ifndef TRMMKERNEL
  1384. MOVQ bk, k;
  1385. #elif (defined(LEFT) && !defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
  1386. MOVQ bk, %rax;
  1387. SUBQ kk, %rax;
  1388. MOVQ %rax, kkk;
  1389. #else
  1390. MOVQ kk, %rax;
  1391. #ifdef LEFT
  1392. ADDQ $8, %rax;
  1393. #else
  1394. ADDQ $2, %rax;
  1395. #endif
  1396. MOVQ %rax, kkk;
  1397. #endif
  1398. SARQ $2, k;
  1399. JLE .L211_loopE;
  1400. ALIGN_5;
  1401. .L211_bodyB:
  1402. # Computing kernel
  1403. #### Unroll time 1 ####
  1404. LD_DX 0*SIZE(ptrba), xvec0;
  1405. LD_DX 0*SIZE(ptrbb), xvec4;
  1406. MOV_DX xvec4, xvec5;
  1407. MUL_DX xvec0, xvec4, xvec4;
  1408. ADD_DX xvec4, xvec15, xvec15;
  1409. LD_DX 2*SIZE(ptrba), xvec1;
  1410. MOV_DX xvec5, xvec6;
  1411. MUL_DX xvec1, xvec5, xvec5;
  1412. ADD_DX xvec5, xvec14, xvec14;
  1413. LD_DX 4*SIZE(ptrba), xvec2;
  1414. MOV_DX xvec6, xvec7;
  1415. MUL_DX xvec2, xvec6, xvec6;
  1416. ADD_DX xvec6, xvec13, xvec13;
  1417. LD_DX 6*SIZE(ptrba), xvec3;
  1418. SHUF_DX $0x4e, xvec7, xvec4;
  1419. MUL_DX xvec3, xvec7, xvec7;
  1420. ADD_DX xvec7, xvec12, xvec12;
  1421. MOV_DX xvec4, xvec5;
  1422. MUL_DX xvec0, xvec4, xvec4;
  1423. ADD_DX xvec4, xvec11, xvec11;
  1424. MOV_DX xvec5, xvec6;
  1425. MUL_DX xvec1, xvec5, xvec5;
  1426. ADD_DX xvec5, xvec10, xvec10;
  1427. MOV_DX xvec6, xvec7;
  1428. MUL_DX xvec2, xvec6, xvec6;
  1429. ADD_DX xvec6, xvec9, xvec9;
  1430. MUL_DX xvec3, xvec7, xvec7;
  1431. ADD_DX xvec7, xvec8, xvec8;
  1432. #### Unroll time 2 ####
  1433. LD_DX 8*SIZE(ptrba), xvec0;
  1434. LD_DX 2*SIZE(ptrbb), xvec4;
  1435. MOV_DX xvec4, xvec5;
  1436. MUL_DX xvec0, xvec4, xvec4;
  1437. ADD_DX xvec4, xvec15, xvec15;
  1438. LD_DX 10*SIZE(ptrba), xvec1;
  1439. MOV_DX xvec5, xvec6;
  1440. MUL_DX xvec1, xvec5, xvec5;
  1441. ADD_DX xvec5, xvec14, xvec14;
  1442. LD_DX 12*SIZE(ptrba), xvec2;
  1443. MOV_DX xvec6, xvec7;
  1444. MUL_DX xvec2, xvec6, xvec6;
  1445. ADD_DX xvec6, xvec13, xvec13;
  1446. LD_DX 14*SIZE(ptrba), xvec3;
  1447. SHUF_DX $0x4e, xvec7, xvec4;
  1448. MUL_DX xvec3, xvec7, xvec7;
  1449. ADD_DX xvec7, xvec12, xvec12;
  1450. MOV_DX xvec4, xvec5;
  1451. MUL_DX xvec0, xvec4, xvec4;
  1452. ADD_DX xvec4, xvec11, xvec11;
  1453. MOV_DX xvec5, xvec6;
  1454. MUL_DX xvec1, xvec5, xvec5;
  1455. ADD_DX xvec5, xvec10, xvec10;
  1456. MOV_DX xvec6, xvec7;
  1457. MUL_DX xvec2, xvec6, xvec6;
  1458. ADD_DX xvec6, xvec9, xvec9;
  1459. MUL_DX xvec3, xvec7, xvec7;
  1460. ADD_DX xvec7, xvec8, xvec8;
  1461. #### Unroll time 3 ####
  1462. LD_DX 16*SIZE(ptrba), xvec0;
  1463. LD_DX 4*SIZE(ptrbb), xvec4;
  1464. MOV_DX xvec4, xvec5;
  1465. MUL_DX xvec0, xvec4, xvec4;
  1466. ADD_DX xvec4, xvec15, xvec15;
  1467. LD_DX 18*SIZE(ptrba), xvec1;
  1468. MOV_DX xvec5, xvec6;
  1469. MUL_DX xvec1, xvec5, xvec5;
  1470. ADD_DX xvec5, xvec14, xvec14;
  1471. LD_DX 20*SIZE(ptrba), xvec2;
  1472. MOV_DX xvec6, xvec7;
  1473. MUL_DX xvec2, xvec6, xvec6;
  1474. ADD_DX xvec6, xvec13, xvec13;
  1475. LD_DX 22*SIZE(ptrba), xvec3;
  1476. SHUF_DX $0x4e, xvec7, xvec4;
  1477. MUL_DX xvec3, xvec7, xvec7;
  1478. ADD_DX xvec7, xvec12, xvec12;
  1479. MOV_DX xvec4, xvec5;
  1480. MUL_DX xvec0, xvec4, xvec4;
  1481. ADD_DX xvec4, xvec11, xvec11;
  1482. MOV_DX xvec5, xvec6;
  1483. MUL_DX xvec1, xvec5, xvec5;
  1484. ADD_DX xvec5, xvec10, xvec10;
  1485. MOV_DX xvec6, xvec7;
  1486. MUL_DX xvec2, xvec6, xvec6;
  1487. ADD_DX xvec6, xvec9, xvec9;
  1488. MUL_DX xvec3, xvec7, xvec7;
  1489. ADD_DX xvec7, xvec8, xvec8;
  1490. #### Unroll time 4 ####
  1491. LD_DX 24*SIZE(ptrba), xvec0;
  1492. LD_DX 6*SIZE(ptrbb), xvec4;
  1493. MOV_DX xvec4, xvec5;
  1494. MUL_DX xvec0, xvec4, xvec4;
  1495. ADD_DX xvec4, xvec15, xvec15;
  1496. ADDQ $8*SIZE, ptrbb;
  1497. LD_DX 26*SIZE(ptrba), xvec1;
  1498. MOV_DX xvec5, xvec6;
  1499. MUL_DX xvec1, xvec5, xvec5;
  1500. ADD_DX xvec5, xvec14, xvec14;
  1501. LD_DX 28*SIZE(ptrba), xvec2;
  1502. MOV_DX xvec6, xvec7;
  1503. MUL_DX xvec2, xvec6, xvec6;
  1504. ADD_DX xvec6, xvec13, xvec13;
  1505. LD_DX 30*SIZE(ptrba), xvec3;
  1506. SHUF_DX $0x4e, xvec7, xvec4;
  1507. MUL_DX xvec3, xvec7, xvec7;
  1508. ADD_DX xvec7, xvec12, xvec12;
  1509. ADDQ $32*SIZE, ptrba;
  1510. MOV_DX xvec4, xvec5;
  1511. MUL_DX xvec0, xvec4, xvec4;
  1512. ADD_DX xvec4, xvec11, xvec11;
  1513. MOV_DX xvec5, xvec6;
  1514. MUL_DX xvec1, xvec5, xvec5;
  1515. ADD_DX xvec5, xvec10, xvec10;
  1516. MOV_DX xvec6, xvec7;
  1517. MUL_DX xvec2, xvec6, xvec6;
  1518. ADD_DX xvec6, xvec9, xvec9;
  1519. MUL_DX xvec3, xvec7, xvec7;
  1520. ADD_DX xvec7, xvec8, xvec8;
  1521. DECQ k;
  1522. JG .L211_bodyB;
  1523. ALIGN_5
  1524. .L211_loopE:
  1525. #ifndef TRMMKERNEL
  1526. TEST $2, bk;
  1527. #else
  1528. MOVQ kkk, %rax;
  1529. TEST $2, %rax;
  1530. #endif
  1531. JLE .L212_loopE;
  1532. ALIGN_5;
  1533. .L212_bodyB:
  1534. # Computing kernel
  1535. #### Unroll time 1 ####
  1536. LD_DX 0*SIZE(ptrba), xvec0;
  1537. LD_DX 0*SIZE(ptrbb), xvec4;
  1538. MOV_DX xvec4, xvec5;
  1539. MUL_DX xvec0, xvec4, xvec4;
  1540. ADD_DX xvec4, xvec15, xvec15;
  1541. LD_DX 2*SIZE(ptrba), xvec1;
  1542. MOV_DX xvec5, xvec6;
  1543. MUL_DX xvec1, xvec5, xvec5;
  1544. ADD_DX xvec5, xvec14, xvec14;
  1545. LD_DX 4*SIZE(ptrba), xvec2;
  1546. MOV_DX xvec6, xvec7;
  1547. MUL_DX xvec2, xvec6, xvec6;
  1548. ADD_DX xvec6, xvec13, xvec13;
  1549. LD_DX 6*SIZE(ptrba), xvec3;
  1550. SHUF_DX $0x4e, xvec7, xvec4;
  1551. MUL_DX xvec3, xvec7, xvec7;
  1552. ADD_DX xvec7, xvec12, xvec12;
  1553. MOV_DX xvec4, xvec5;
  1554. MUL_DX xvec0, xvec4, xvec4;
  1555. ADD_DX xvec4, xvec11, xvec11;
  1556. MOV_DX xvec5, xvec6;
  1557. MUL_DX xvec1, xvec5, xvec5;
  1558. ADD_DX xvec5, xvec10, xvec10;
  1559. MOV_DX xvec6, xvec7;
  1560. MUL_DX xvec2, xvec6, xvec6;
  1561. ADD_DX xvec6, xvec9, xvec9;
  1562. MUL_DX xvec3, xvec7, xvec7;
  1563. ADD_DX xvec7, xvec8, xvec8;
  1564. #### Unroll time 2 ####
  1565. LD_DX 8*SIZE(ptrba), xvec0;
  1566. LD_DX 2*SIZE(ptrbb), xvec4;
  1567. MOV_DX xvec4, xvec5;
  1568. MUL_DX xvec0, xvec4, xvec4;
  1569. ADD_DX xvec4, xvec15, xvec15;
  1570. ADDQ $4*SIZE, ptrbb;
  1571. LD_DX 10*SIZE(ptrba), xvec1;
  1572. MOV_DX xvec5, xvec6;
  1573. MUL_DX xvec1, xvec5, xvec5;
  1574. ADD_DX xvec5, xvec14, xvec14;
  1575. LD_DX 12*SIZE(ptrba), xvec2;
  1576. MOV_DX xvec6, xvec7;
  1577. MUL_DX xvec2, xvec6, xvec6;
  1578. ADD_DX xvec6, xvec13, xvec13;
  1579. LD_DX 14*SIZE(ptrba), xvec3;
  1580. SHUF_DX $0x4e, xvec7, xvec4;
  1581. MUL_DX xvec3, xvec7, xvec7;
  1582. ADD_DX xvec7, xvec12, xvec12;
  1583. ADDQ $16*SIZE, ptrba;
  1584. MOV_DX xvec4, xvec5;
  1585. MUL_DX xvec0, xvec4, xvec4;
  1586. ADD_DX xvec4, xvec11, xvec11;
  1587. MOV_DX xvec5, xvec6;
  1588. MUL_DX xvec1, xvec5, xvec5;
  1589. ADD_DX xvec5, xvec10, xvec10;
  1590. MOV_DX xvec6, xvec7;
  1591. MUL_DX xvec2, xvec6, xvec6;
  1592. ADD_DX xvec6, xvec9, xvec9;
  1593. MUL_DX xvec3, xvec7, xvec7;
  1594. ADD_DX xvec7, xvec8, xvec8;
  1595. .L212_loopE:
  1596. #ifndef TRMMKERNEL
  1597. TEST $1, bk;
  1598. #else
  1599. MOVQ kkk, %rax;
  1600. TEST $1, %rax;
  1601. #endif
  1602. JLE .L213_loopE;
  1603. ALIGN_5
  1604. .L213_bodyB:
  1605. #### Unroll time 1 ####
  1606. LD_DX 0*SIZE(ptrba), xvec0;
  1607. LD_DX 0*SIZE(ptrbb), xvec4;
  1608. MOV_DX xvec4, xvec5;
  1609. MUL_DX xvec0, xvec4, xvec4;
  1610. ADD_DX xvec4, xvec15, xvec15;
  1611. ADDQ $2*SIZE, ptrbb;
  1612. LD_DX 2*SIZE(ptrba), xvec1;
  1613. MOV_DX xvec5, xvec6;
  1614. MUL_DX xvec1, xvec5, xvec5;
  1615. ADD_DX xvec5, xvec14, xvec14;
  1616. LD_DX 4*SIZE(ptrba), xvec2;
  1617. MOV_DX xvec6, xvec7;
  1618. MUL_DX xvec2, xvec6, xvec6;
  1619. ADD_DX xvec6, xvec13, xvec13;
  1620. LD_DX 6*SIZE(ptrba), xvec3;
  1621. SHUF_DX $0x4e, xvec7, xvec4;
  1622. MUL_DX xvec3, xvec7, xvec7;
  1623. ADD_DX xvec7, xvec12, xvec12;
  1624. ADDQ $8*SIZE, ptrba;
  1625. MOV_DX xvec4, xvec5;
  1626. MUL_DX xvec0, xvec4, xvec4;
  1627. ADD_DX xvec4, xvec11, xvec11;
  1628. MOV_DX xvec5, xvec6;
  1629. MUL_DX xvec1, xvec5, xvec5;
  1630. ADD_DX xvec5, xvec10, xvec10;
  1631. MOV_DX xvec6, xvec7;
  1632. MUL_DX xvec2, xvec6, xvec6;
  1633. ADD_DX xvec6, xvec9, xvec9;
  1634. MUL_DX xvec3, xvec7, xvec7;
  1635. ADD_DX xvec7, xvec8, xvec8;
  1636. .L213_loopE:
  1637. #### Multiply Alpha ####
  1638. BROAD_DX MEMALPHA, xvec7;
  1639. MUL_DX xvec7, xvec15, xvec15;
  1640. MUL_DX xvec7, xvec14, xvec14;
  1641. MUL_DX xvec7, xvec13, xvec13;
  1642. MUL_DX xvec7, xvec12, xvec12;
  1643. MUL_DX xvec7, xvec11, xvec11;
  1644. MUL_DX xvec7, xvec10, xvec10;
  1645. MUL_DX xvec7, xvec9, xvec9;
  1646. MUL_DX xvec7, xvec8, xvec8;
1647. #### Reverse ####
  1648. MOV_DX xvec15, xvec6;
  1649. REVS_DX xvec11, xvec15, xvec15;
  1650. REVS_DX xvec6, xvec11, xvec11;
  1651. MOV_DX xvec14, xvec6;
  1652. REVS_DX xvec10, xvec14, xvec14;
  1653. REVS_DX xvec6, xvec10, xvec10;
  1654. MOV_DX xvec13, xvec6;
  1655. REVS_DX xvec9, xvec13, xvec13;
  1656. REVS_DX xvec6, xvec9, xvec9;
  1657. MOV_DX xvec12, xvec6;
  1658. REVS_DX xvec8, xvec12, xvec12;
  1659. REVS_DX xvec6, xvec8, xvec8;
  1660. #### Testing Alignment ####
  1661. MOVQ C0, %rax;
  1662. OR ldc, %rax;
  1663. TEST $15, %rax;
  1664. JNE .L213_loopEx;
  1665. ALIGN_5
  1666. #### Writing Back ####
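# Aligned path: C0 and ldc are both 16-byte aligned, so packed 128-bit accesses to C are safe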
  1667. #ifndef TRMMKERNEL
  1668. ADD_DX 0*SIZE(C0), xvec11, xvec11;
  1669. ADD_DX 2*SIZE(C0), xvec10, xvec10;
  1670. ADD_DX 4*SIZE(C0), xvec9, xvec9;
  1671. ADD_DX 6*SIZE(C0), xvec8, xvec8;
  1672. ADD_DX 0*SIZE(C1), xvec15, xvec15;
  1673. ADD_DX 2*SIZE(C1), xvec14, xvec14;
  1674. ADD_DX 4*SIZE(C1), xvec13, xvec13;
  1675. ADD_DX 6*SIZE(C1), xvec12, xvec12;
  1676. #endif
  1677. ST_DX xvec11, 0*SIZE(C0);
  1678. ST_DX xvec10, 2*SIZE(C0);
  1679. ST_DX xvec9, 4*SIZE(C0);
  1680. ST_DX xvec8, 6*SIZE(C0);
  1681. ST_DX xvec15, 0*SIZE(C1);
  1682. ST_DX xvec14, 2*SIZE(C1);
  1683. ST_DX xvec13, 4*SIZE(C1);
  1684. ST_DX xvec12, 6*SIZE(C1);
  1685. #if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  1686. MOVQ bk, %rax;
  1687. SUBQ kkk, %rax;
  1688. LEAQ (,%rax, SIZE), %rax;
  1689. LEAQ (ptrba, %rax, 8), ptrba;
  1690. LEAQ (ptrbb, %rax, 2), ptrbb;
  1691. #endif
  1692. #if defined(TRMMKERNEL) && defined(LEFT)
  1693. ADDQ $8, kk
  1694. #endif
  1695. ADDQ $8*SIZE, C0;
  1696. ADDQ $8*SIZE, C1;
  1697. DECQ i;
  1698. JG .L21_bodyB;
  1699. JMP .L21_loopE;
  1700. ALIGN_5
  1701. .L213_loopEx:;
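# Unaligned path: merge into C with separate 64-bit low/high loads and stores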
  1702. #ifndef TRMMKERNEL
  1703. LDL_DX 0*SIZE(C0), xvec0, xvec0;
  1704. LDH_DX 1*SIZE(C0), xvec0, xvec0;
  1705. LDL_DX 2*SIZE(C0), xvec1, xvec1;
  1706. LDH_DX 3*SIZE(C0), xvec1, xvec1;
  1707. LDL_DX 4*SIZE(C0), xvec2, xvec2;
  1708. LDH_DX 5*SIZE(C0), xvec2, xvec2;
  1709. LDL_DX 6*SIZE(C0), xvec3, xvec3;
  1710. LDH_DX 7*SIZE(C0), xvec3, xvec3;
  1711. ADD_DX xvec0, xvec11, xvec11;
  1712. ADD_DX xvec1, xvec10, xvec10;
  1713. ADD_DX xvec2, xvec9, xvec9;
  1714. ADD_DX xvec3, xvec8, xvec8;
  1715. #endif
  1716. STL_DX xvec11, 0*SIZE(C0);
  1717. STH_DX xvec11, 1*SIZE(C0);
  1718. STL_DX xvec10, 2*SIZE(C0);
  1719. STH_DX xvec10, 3*SIZE(C0);
  1720. STL_DX xvec9, 4*SIZE(C0);
  1721. STH_DX xvec9, 5*SIZE(C0);
  1722. STL_DX xvec8, 6*SIZE(C0);
  1723. STH_DX xvec8, 7*SIZE(C0);
  1724. #ifndef TRMMKERNEL
  1725. LDL_DX 0*SIZE(C1), xvec4, xvec4;
  1726. LDH_DX 1*SIZE(C1), xvec4, xvec4;
  1727. LDL_DX 2*SIZE(C1), xvec5, xvec5;
  1728. LDH_DX 3*SIZE(C1), xvec5, xvec5;
  1729. LDL_DX 4*SIZE(C1), xvec6, xvec6;
  1730. LDH_DX 5*SIZE(C1), xvec6, xvec6;
  1731. LDL_DX 6*SIZE(C1), xvec7, xvec7;
  1732. LDH_DX 7*SIZE(C1), xvec7, xvec7;
  1733. ADD_DX xvec4, xvec15, xvec15;
  1734. ADD_DX xvec5, xvec14, xvec14;
  1735. ADD_DX xvec6, xvec13, xvec13;
  1736. ADD_DX xvec7, xvec12, xvec12;
  1737. #endif
  1738. STL_DX xvec15, 0*SIZE(C1);
  1739. STH_DX xvec15, 1*SIZE(C1);
  1740. STL_DX xvec14, 2*SIZE(C1);
  1741. STH_DX xvec14, 3*SIZE(C1);
  1742. STL_DX xvec13, 4*SIZE(C1);
  1743. STH_DX xvec13, 5*SIZE(C1);
  1744. STL_DX xvec12, 6*SIZE(C1);
  1745. STH_DX xvec12, 7*SIZE(C1);
  1746. #if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  1747. MOVQ bk, %rax;
  1748. SUBQ kkk, %rax;
  1749. LEAQ (,%rax, SIZE), %rax;
  1750. LEAQ (ptrba, %rax, 8), ptrba;
  1751. LEAQ (ptrbb, %rax, 2), ptrbb;
  1752. #endif
  1753. #if defined(TRMMKERNEL) && defined(LEFT)
  1754. ADDQ $8, kk
  1755. #endif
  1756. ADDQ $8*SIZE, C0;
  1757. ADDQ $8*SIZE, C1;
  1758. DECQ i;
  1759. JG .L21_bodyB;
  1760. .L21_loopE:;
  1761. TEST $4, bm; # Rm = 4
  1762. JLE .L22_loopE;
  1763. ALIGN_5;
  1764. .L22_bodyB:;
  1765. #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  1766. MOVQ bb, ptrbb;
  1767. #else
  1768. MOVQ bb, ptrbb;
  1769. MOVQ kk, %rax;
  1770. LEAQ (,%rax, SIZE), %rax;
  1771. LEAQ (ptrba, %rax, 4), ptrba;
  1772. LEAQ (ptrbb, %rax, 2), ptrbb;
  1773. #endif
  1774. #### Initial Results Register ####
  1775. XOR_DY yvec15, yvec15, yvec15;
  1776. XOR_DY yvec14, yvec14, yvec14;
  1777. XOR_DY yvec11, yvec11, yvec11;
  1778. XOR_DY yvec10, yvec10, yvec10;
  1779. #ifndef TRMMKERNEL
  1780. MOVQ bk, k;
  1781. #elif (defined(LEFT) && !defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
  1782. MOVQ bk, %rax;
  1783. SUBQ kk, %rax;
  1784. MOVQ %rax, kkk;
  1785. #else
  1786. MOVQ kk, %rax;
  1787. #ifdef LEFT
  1788. ADDQ $4, %rax;
  1789. #else
  1790. ADDQ $2, %rax;
  1791. #endif
  1792. MOVQ %rax, kkk;
  1793. #endif
  1794. SARQ $2, k;
  1795. JLE .L221_loopE;
  1796. ALIGN_5
  1797. .L221_bodyB:;
  1798. # Computing kernel
  1799. #### Unroll time 1 ####
  1800. LD_DX 0*SIZE(ptrba), xvec0;
  1801. LD_DX 0*SIZE(ptrbb), xvec4;
  1802. MOV_DX xvec4, xvec5;
  1803. MUL_DX xvec0, xvec4, xvec4;
  1804. ADD_DX xvec4, xvec15, xvec15;
  1805. LD_DX 2*SIZE(ptrba), xvec1;
  1806. SHUF_DX $0x4e, xvec5, xvec4;
  1807. MUL_DX xvec1, xvec5, xvec5;
  1808. ADD_DX xvec5, xvec14, xvec14;
  1809. MOV_DX xvec4, xvec5;
  1810. MUL_DX xvec0, xvec4, xvec4;
  1811. ADD_DX xvec4, xvec11, xvec11;
  1812. MUL_DX xvec1, xvec5, xvec5;
  1813. ADD_DX xvec5, xvec10, xvec10;
  1814. #### Unroll time 2 ####
  1815. LD_DX 4*SIZE(ptrba), xvec0;
  1816. LD_DX 2*SIZE(ptrbb), xvec4;
  1817. MOV_DX xvec4, xvec5;
  1818. MUL_DX xvec0, xvec4, xvec4;
  1819. ADD_DX xvec4, xvec15, xvec15;
  1820. LD_DX 6*SIZE(ptrba), xvec1;
  1821. SHUF_DX $0x4e, xvec5, xvec4;
  1822. MUL_DX xvec1, xvec5, xvec5;
  1823. ADD_DX xvec5, xvec14, xvec14;
  1824. MOV_DX xvec4, xvec5;
  1825. MUL_DX xvec0, xvec4, xvec4;
  1826. ADD_DX xvec4, xvec11, xvec11;
  1827. MUL_DX xvec1, xvec5, xvec5;
  1828. ADD_DX xvec5, xvec10, xvec10;
  1829. #### Unroll time 3 ####
  1830. LD_DX 8*SIZE(ptrba), xvec0;
  1831. LD_DX 4*SIZE(ptrbb), xvec4;
  1832. MOV_DX xvec4, xvec5;
  1833. MUL_DX xvec0, xvec4, xvec4;
  1834. ADD_DX xvec4, xvec15, xvec15;
  1835. LD_DX 10*SIZE(ptrba), xvec1;
  1836. SHUF_DX $0x4e, xvec5, xvec4;
  1837. MUL_DX xvec1, xvec5, xvec5;
  1838. ADD_DX xvec5, xvec14, xvec14;
  1839. MOV_DX xvec4, xvec5;
  1840. MUL_DX xvec0, xvec4, xvec4;
  1841. ADD_DX xvec4, xvec11, xvec11;
  1842. MUL_DX xvec1, xvec5, xvec5;
  1843. ADD_DX xvec5, xvec10, xvec10;
  1844. #### Unroll time 4 ####
  1845. LD_DX 12*SIZE(ptrba), xvec0;
  1846. LD_DX 6*SIZE(ptrbb), xvec4;
  1847. MOV_DX xvec4, xvec5;
  1848. MUL_DX xvec0, xvec4, xvec4;
  1849. ADD_DX xvec4, xvec15, xvec15;
  1850. ADDQ $8*SIZE, ptrbb;
  1851. LD_DX 14*SIZE(ptrba), xvec1;
  1852. SHUF_DX $0x4e, xvec5, xvec4;
  1853. MUL_DX xvec1, xvec5, xvec5;
  1854. ADD_DX xvec5, xvec14, xvec14;
  1855. ADDQ $16*SIZE, ptrba;
  1856. MOV_DX xvec4, xvec5;
  1857. MUL_DX xvec0, xvec4, xvec4;
  1858. ADD_DX xvec4, xvec11, xvec11;
  1859. MUL_DX xvec1, xvec5, xvec5;
  1860. ADD_DX xvec5, xvec10, xvec10;
  1861. DECQ k;
  1862. JG .L221_bodyB;
  1863. ALIGN_5
  1864. .L221_loopE:;
  1865. #ifndef TRMMKERNEL
  1866. TEST $2, bk;
  1867. #else
  1868. MOVQ kkk, %rax;
  1869. TEST $2, %rax;
  1870. #endif
  1871. JLE .L222_loopE;
  1872. ALIGN_5
  1873. .L222_bodyB:
  1874. #### Unroll time 1 ####
  1875. LD_DX 0*SIZE(ptrba), xvec0;
  1876. LD_DX 0*SIZE(ptrbb), xvec4;
  1877. MOV_DX xvec4, xvec5;
  1878. MUL_DX xvec0, xvec4, xvec4;
  1879. ADD_DX xvec4, xvec15, xvec15;
  1880. LD_DX 2*SIZE(ptrba), xvec1;
  1881. SHUF_DX $0x4e, xvec5, xvec4;
  1882. MUL_DX xvec1, xvec5, xvec5;
  1883. ADD_DX xvec5, xvec14, xvec14;
  1884. MOV_DX xvec4, xvec5;
  1885. MUL_DX xvec0, xvec4, xvec4;
  1886. ADD_DX xvec4, xvec11, xvec11;
  1887. MUL_DX xvec1, xvec5, xvec5;
  1888. ADD_DX xvec5, xvec10, xvec10;
  1889. #### Unroll time 2 ####
  1890. LD_DX 4*SIZE(ptrba), xvec0;
  1891. LD_DX 2*SIZE(ptrbb), xvec4;
  1892. MOV_DX xvec4, xvec5;
  1893. MUL_DX xvec0, xvec4, xvec4;
  1894. ADD_DX xvec4, xvec15, xvec15;
  1895. ADDQ $4*SIZE, ptrbb;
  1896. LD_DX 6*SIZE(ptrba), xvec1;
  1897. SHUF_DX $0x4e, xvec5, xvec4;
  1898. MUL_DX xvec1, xvec5, xvec5;
  1899. ADD_DX xvec5, xvec14, xvec14;
  1900. ADDQ $8*SIZE, ptrba;
  1901. MOV_DX xvec4, xvec5;
  1902. MUL_DX xvec0, xvec4, xvec4;
  1903. ADD_DX xvec4, xvec11, xvec11;
  1904. MUL_DX xvec1, xvec5, xvec5;
  1905. ADD_DX xvec5, xvec10, xvec10;
  1906. .L222_loopE:
  1907. #ifndef TRMMKERNEL
  1908. TEST $1, bk
  1909. #else
  1910. MOVQ kkk, %rax;
  1911. TEST $1, %rax;
  1912. #endif
  1913. JLE .L223_loopE;
  1914. ALIGN_5
  1915. .L223_bodyB:
  1916. #### Unroll time 1 ####
  1917. LD_DX 0*SIZE(ptrba), xvec0;
  1918. LD_DX 0*SIZE(ptrbb), xvec4;
  1919. MOV_DX xvec4, xvec5;
  1920. MUL_DX xvec0, xvec4, xvec4;
  1921. ADD_DX xvec4, xvec15, xvec15;
  1922. ADDQ $2*SIZE, ptrbb;
  1923. LD_DX 2*SIZE(ptrba), xvec1;
  1924. SHUF_DX $0x4e, xvec5, xvec4;
  1925. MUL_DX xvec1, xvec5, xvec5;
  1926. ADD_DX xvec5, xvec14, xvec14;
  1927. ADDQ $4*SIZE, ptrba;
  1928. MOV_DX xvec4, xvec5;
  1929. MUL_DX xvec0, xvec4, xvec4;
  1930. ADD_DX xvec4, xvec11, xvec11;
  1931. MUL_DX xvec1, xvec5, xvec5;
  1932. ADD_DX xvec5, xvec10, xvec10;
  1933. .L223_loopE:
  1934. #### Multiply Alpha ####
  1935. BROAD_DX MEMALPHA, xvec7;
  1936. MUL_DX xvec7, xvec15, xvec15;
  1937. MUL_DX xvec7, xvec14, xvec14;
  1938. MUL_DX xvec7, xvec11, xvec11;
  1939. MUL_DX xvec7, xvec10, xvec10;
1940. #### Reverse ####
  1941. MOV_DX xvec15, xvec6;
  1942. REVS_DX xvec11, xvec15, xvec15;
  1943. REVS_DX xvec6, xvec11, xvec11;
  1944. MOV_DX xvec14, xvec6;
  1945. REVS_DX xvec10, xvec14, xvec14;
  1946. REVS_DX xvec6, xvec10, xvec10;
  1947. #### Testing Alignment ####
  1948. MOVQ C0, %rax;
  1949. OR ldc, %rax;
  1950. TEST $15, %rax;
  1951. JNE .L223_loopEx;
  1952. ALIGN_5
  1953. #### Writing Back ####
  1954. #ifndef TRMMKERNEL
  1955. ADD_DX 0*SIZE(C0), xvec11, xvec11;
  1956. ADD_DX 2*SIZE(C0), xvec10, xvec10;
  1957. ADD_DX 0*SIZE(C1), xvec15, xvec15;
  1958. ADD_DX 2*SIZE(C1), xvec14, xvec14;
  1959. #endif
  1960. ST_DX xvec11, 0*SIZE(C0);
  1961. ST_DX xvec10, 2*SIZE(C0);
  1962. ST_DX xvec15, 0*SIZE(C1);
  1963. ST_DX xvec14, 2*SIZE(C1);
  1964. #if (defined(TRMMKERNEL)&& defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&& !defined(TRANSA))
  1965. MOVQ bk, %rax;
  1966. SUBQ kkk, %rax;
  1967. LEAQ (,%rax, SIZE), %rax;
  1968. LEAQ (ptrba, %rax, 4), ptrba;
  1969. LEAQ (ptrbb, %rax, 2), ptrbb;
  1970. #endif
  1971. #if defined(TRMMKERNEL) && defined(LEFT)
  1972. ADDQ $4, kk
  1973. #endif
  1974. ADDQ $4*SIZE, C0;
  1975. ADDQ $4*SIZE, C1;
  1976. JMP .L22_loopE;
  1977. ALIGN_5
  1978. .L223_loopEx:;
  1979. #ifndef TRMMKERNEL
  1980. LDL_DX 0*SIZE(C0), xvec0, xvec0;
  1981. LDH_DX 1*SIZE(C0), xvec0, xvec0;
  1982. LDL_DX 2*SIZE(C0), xvec1, xvec1;
  1983. LDH_DX 3*SIZE(C0), xvec1, xvec1;
  1984. ADD_DX xvec0, xvec11, xvec11;
  1985. ADD_DX xvec1, xvec10, xvec10;
  1986. #endif
  1987. STL_DX xvec11, 0*SIZE(C0);
  1988. STH_DX xvec11, 1*SIZE(C0);
  1989. STL_DX xvec10, 2*SIZE(C0);
  1990. STH_DX xvec10, 3*SIZE(C0);
  1991. #ifndef TRMMKERNEL
  1992. LDL_DX 0*SIZE(C1), xvec4, xvec4;
  1993. LDH_DX 1*SIZE(C1), xvec4, xvec4;
  1994. LDL_DX 2*SIZE(C1), xvec5, xvec5;
  1995. LDH_DX 3*SIZE(C1), xvec5, xvec5;
  1996. ADD_DX xvec4, xvec15, xvec15;
  1997. ADD_DX xvec5, xvec14, xvec14;
  1998. #endif
  1999. STL_DX xvec15, 0*SIZE(C1);
  2000. STH_DX xvec15, 1*SIZE(C1);
  2001. STL_DX xvec14, 2*SIZE(C1);
  2002. STH_DX xvec14, 3*SIZE(C1);
  2003. #if (defined(TRMMKERNEL)&& defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&& !defined(TRANSA))
  2004. MOVQ bk, %rax;
  2005. SUBQ kkk, %rax;
  2006. LEAQ (,%rax, SIZE), %rax;
  2007. LEAQ (ptrba, %rax, 4), ptrba;
  2008. LEAQ (ptrbb, %rax, 2), ptrbb;
  2009. #endif
  2010. #if defined(TRMMKERNEL) && defined(LEFT)
  2011. ADDQ $4, kk
  2012. #endif
  2013. ADDQ $4*SIZE, C0;
  2014. ADDQ $4*SIZE, C1;
  2015. .L22_loopE:;
  2016. TEST $2, bm; # Rm = 2
  2017. JLE .L23_loopE;
  2018. ALIGN_5;
  2019. .L23_bodyB:
  2020. #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  2021. MOVQ bb, ptrbb;
  2022. #else
  2023. MOVQ bb, ptrbb;
  2024. MOVQ kk, %rax;
  2025. LEAQ (,%rax, SIZE), %rax;
  2026. LEAQ (ptrba, %rax, 2), ptrba;
  2027. LEAQ (ptrbb, %rax, 2), ptrbb;
  2028. #endif
  2029. XOR_DY yvec15, yvec15, yvec15;
  2030. XOR_DY yvec11, yvec11, yvec11;
  2031. #ifndef TRMMKERNEL
  2032. MOVQ bk, k;
  2033. #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
  2034. MOVQ bk, %rax;
  2035. SUBQ kk, %rax;
  2036. MOVQ %rax, kkk;
  2037. #else
  2038. MOVQ kk, %rax;
  2039. #ifdef LEFT
  2040. ADDQ $2, %rax;
  2041. #else
  2042. ADDQ $2, %rax;
  2043. #endif
  2044. MOVQ %rax, kkk;
  2045. #endif
  2046. SARQ $2, k;
  2047. JLE .L231_loopE;
  2048. ALIGN_5
  2049. .L231_bodyB:
  2050. # Computing kernel
  2051. #### Unroll time 1 ####
  2052. LD_DX 0*SIZE(ptrba), xvec0;
  2053. LD_DX 0*SIZE(ptrbb), xvec4;
  2054. SHUF_DX $0x4e, xvec4, xvec5;
  2055. MUL_DX xvec0, xvec4, xvec4;
  2056. ADD_DX xvec4, xvec15, xvec15;
  2057. MUL_DX xvec0, xvec5, xvec5;
  2058. ADD_DX xvec5, xvec11, xvec11;
  2059. #### Unroll time 2 ####
  2060. LD_DX 2*SIZE(ptrba), xvec0;
  2061. LD_DX 2*SIZE(ptrbb), xvec4;
  2062. SHUF_DX $0x4e, xvec4, xvec5;
  2063. MUL_DX xvec0, xvec4, xvec4;
  2064. ADD_DX xvec4, xvec15, xvec15;
  2065. MUL_DX xvec0, xvec5, xvec5;
  2066. ADD_DX xvec5, xvec11, xvec11;
  2067. #### Unroll time 3 ####
  2068. LD_DX 4*SIZE(ptrba), xvec0;
  2069. LD_DX 4*SIZE(ptrbb), xvec4;
  2070. SHUF_DX $0x4e, xvec4, xvec5;
  2071. MUL_DX xvec0, xvec4, xvec4;
  2072. ADD_DX xvec4, xvec15, xvec15;
  2073. MUL_DX xvec0, xvec5, xvec5;
  2074. ADD_DX xvec5, xvec11, xvec11;
  2075. #### Unroll time 4 ####
  2076. LD_DX 6*SIZE(ptrba), xvec0;
  2077. LD_DX 6*SIZE(ptrbb), xvec4;
  2078. SHUF_DX $0x4e, xvec4, xvec5;
  2079. MUL_DX xvec0, xvec4, xvec4;
  2080. ADD_DX xvec4, xvec15, xvec15;
  2081. ADDQ $8*SIZE, ptrba;
  2082. MUL_DX xvec0, xvec5, xvec5;
  2083. ADD_DX xvec5, xvec11, xvec11;
  2084. ADDQ $8*SIZE, ptrbb;
  2085. DECQ k;
  2086. JG .L231_bodyB;
  2087. ALIGN_5
  2088. .L231_loopE:
  2089. #ifndef TRMMKERNEL
  2090. TEST $2, bk;
  2091. #else
  2092. MOVQ kkk, %rax;
  2093. TEST $2, %rax;
  2094. #endif
  2095. JLE .L232_loopE;
  2096. ALIGN_5
  2097. .L232_bodyB:
  2098. #### Unroll time 1 ####
  2099. LD_DX 0*SIZE(ptrba), xvec0;
  2100. LD_DX 0*SIZE(ptrbb), xvec4;
  2101. SHUF_DX $0x4e, xvec4, xvec5;
  2102. MUL_DX xvec0, xvec4, xvec4;
  2103. ADD_DX xvec4, xvec15, xvec15;
  2104. MUL_DX xvec0, xvec5, xvec5;
  2105. ADD_DX xvec5, xvec11, xvec11;
  2106. #### Unroll time 2 ####
  2107. LD_DX 2*SIZE(ptrba), xvec0;
  2108. LD_DX 2*SIZE(ptrbb), xvec4;
  2109. SHUF_DX $0x4e, xvec4, xvec5;
  2110. MUL_DX xvec0, xvec4, xvec4;
  2111. ADD_DX xvec4, xvec15, xvec15;
  2112. ADDQ $4*SIZE, ptrba;
  2113. MUL_DX xvec0, xvec5, xvec5;
  2114. ADD_DX xvec5, xvec11, xvec11;
  2115. ADDQ $4*SIZE, ptrbb;
  2116. .L232_loopE:
  2117. #ifndef TRMMKERNEL
  2118. TEST $1, bk;
  2119. #else
  2120. MOVQ kkk, %rax;
  2121. TEST $1, %rax;
  2122. #endif
  2123. JLE .L233_loopE;
  2124. ALIGN_5
  2125. .L233_bodyB:
  2126. #### Unroll time 1 ####
  2127. LD_DX 0*SIZE(ptrba), xvec0;
  2128. LD_DX 0*SIZE(ptrbb), xvec4;
  2129. SHUF_DX $0x4e, xvec4, xvec5;
  2130. MUL_DX xvec0, xvec4, xvec4;
  2131. ADD_DX xvec4, xvec15, xvec15;
  2132. ADDQ $2*SIZE, ptrba;
  2133. MUL_DX xvec0, xvec5, xvec5;
  2134. ADD_DX xvec5, xvec11, xvec11;
  2135. ADDQ $2*SIZE, ptrbb;
  2136. .L233_loopE:
  2137. #### Multiply Alpha ####
  2138. BROAD_DX MEMALPHA, xvec7;
  2139. MUL_DX xvec7, xvec15, xvec15;
  2140. MUL_DX xvec7, xvec11, xvec11;
2141. #### Reverse ####
  2142. MOV_DX xvec15, xvec6;
  2143. REVS_DX xvec11, xvec15, xvec15;
  2144. REVS_DX xvec6, xvec11, xvec11;
  2145. #### Testing Alignment ####
  2146. MOVQ C0, %rax;
  2147. OR ldc, %rax;
  2148. TEST $15, %rax;
  2149. JNE .L233_loopEx;
  2150. ALIGN_5
  2151. #### Writing Back ####
  2152. #ifndef TRMMKERNEL
  2153. ADD_DX 0*SIZE(C0), xvec11, xvec11;
  2154. ADD_DX 0*SIZE(C1), xvec15, xvec15;
  2155. #endif
  2156. ST_DX xvec11, 0*SIZE(C0);
  2157. ST_DX xvec15, 0*SIZE(C1);
  2158. #if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  2159. MOVQ bk, %rax;
  2160. SUBQ kkk, %rax;
  2161. LEAQ (,%rax, SIZE), %rax;
  2162. LEAQ (ptrba, %rax, 2), ptrba;
  2163. LEAQ (ptrbb, %rax, 2), ptrbb;
  2164. #endif
  2165. #if defined(TRMMKERNEL) && defined(LEFT)
  2166. ADDQ $2, kk;
  2167. #endif
  2168. ADDQ $2*SIZE, C0;
  2169. ADDQ $2*SIZE, C1;
  2170. JMP .L23_loopE;
  2171. ALIGN_5
  2172. .L233_loopEx:;
  2173. #ifndef TRMMKERNEL
  2174. LDL_DX 0*SIZE(C0), xvec0, xvec0;
  2175. LDH_DX 1*SIZE(C0), xvec0, xvec0;
  2176. ADD_DX xvec0, xvec11, xvec11;
  2177. #endif
  2178. STL_DX xvec11, 0*SIZE(C0);
  2179. STH_DX xvec11, 1*SIZE(C0);
  2180. #ifndef TRMMKERNEL
  2181. LDL_DX 0*SIZE(C1), xvec4, xvec4;
  2182. LDH_DX 1*SIZE(C1), xvec4, xvec4;
  2183. ADD_DX xvec4, xvec15, xvec15;
  2184. #endif
  2185. STL_DX xvec15, 0*SIZE(C1);
  2186. STH_DX xvec15, 1*SIZE(C1);
  2187. #if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  2188. MOVQ bk, %rax;
  2189. SUBQ kkk, %rax;
  2190. LEAQ (,%rax, SIZE), %rax;
  2191. LEAQ (ptrba, %rax, 2), ptrba;
  2192. LEAQ (ptrbb, %rax, 2), ptrbb;
  2193. #endif
  2194. #if defined(TRMMKERNEL) && defined(LEFT)
  2195. ADDQ $2, kk;
  2196. #endif
  2197. ADDQ $2*SIZE, C0;
  2198. ADDQ $2*SIZE, C1;
  2199. .L23_loopE:
  2200. TEST $1, bm; # Rm = 1
  2201. JLE .L24_loopE;
  2202. ALIGN_5;
  2203. .L24_bodyB:
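# 1x2 tail: broadcast the single A element and multiply by pairs of B elements into xvec15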
  2204. #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  2205. MOVQ bb, ptrbb;
  2206. #else
  2207. MOVQ bb, ptrbb;
  2208. MOVQ kk, %rax;
  2209. LEAQ (, %rax, SIZE), %rax;
  2210. ADDQ %rax, ptrba;
  2211. LEAQ (ptrbb, %rax, 2), ptrbb;
  2212. #endif
  2213. XOR_DY yvec15, yvec15, yvec15;
  2214. #ifndef TRMMKERNEL
  2215. MOVQ bk, k;
  2216. #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
  2217. MOVQ bk, %rax;
  2218. SUBQ kk, %rax;
  2219. MOVQ %rax, kkk;
  2220. #else
  2221. MOVQ kk, %rax;
  2222. #ifdef LEFT
  2223. ADDQ $1, %rax;
  2224. #else
  2225. ADDQ $2, %rax;
  2226. #endif
  2227. MOVQ %rax, kkk;
  2228. #endif
  2229. SARQ $2, k;
  2230. JLE .L241_loopE;
  2231. ALIGN_5
  2232. .L241_bodyB:
  2233. BROAD_DX 0*SIZE(ptrba), xvec0;
  2234. LD_DX 0*SIZE(ptrbb), xvec2;
  2235. MUL_DX xvec0, xvec2, xvec2;
  2236. ADD_DX xvec2, xvec15, xvec15;
  2237. BROAD_DX 1*SIZE(ptrba), xvec1;
  2238. LD_DX 2*SIZE(ptrbb), xvec3;
  2239. MUL_DX xvec1, xvec3, xvec3;
  2240. ADD_DX xvec3, xvec15, xvec15;
  2241. BROAD_DX 2*SIZE(ptrba), xvec0;
  2242. LD_DX 4*SIZE(ptrbb), xvec2;
  2243. MUL_DX xvec0, xvec2, xvec2;
  2244. ADD_DX xvec2, xvec15, xvec15;
  2245. BROAD_DX 3*SIZE(ptrba), xvec1;
  2246. LD_DX 6*SIZE(ptrbb), xvec3;
  2247. MUL_DX xvec1, xvec3, xvec3;
  2248. ADD_DX xvec3, xvec15, xvec15;
  2249. ADDQ $4*SIZE, ptrba;
  2250. ADDQ $8*SIZE, ptrbb;
  2251. DECQ k;
  2252. JG .L241_bodyB;
  2253. ALIGN_5
  2254. .L241_loopE:
  2255. #ifndef TRMMKERNEL
  2256. TEST $2, bk;
  2257. #else
  2258. MOVQ kkk, %rax;
  2259. TEST $2, %rax;
  2260. #endif
  2261. JLE .L242_loopE;
  2262. ALIGN_5
  2263. .L242_bodyB:
  2264. BROAD_DX 0*SIZE(ptrba), xvec0;
  2265. LD_DX 0*SIZE(ptrbb), xvec2;
  2266. MUL_DX xvec0, xvec2, xvec2;
  2267. ADD_DX xvec2, xvec15, xvec15;
  2268. BROAD_DX 1*SIZE(ptrba), xvec1;
  2269. LD_DX 2*SIZE(ptrbb), xvec3;
  2270. MUL_DX xvec1, xvec3, xvec3;
  2271. ADD_DX xvec3, xvec15, xvec15;
  2272. ADDQ $2*SIZE, ptrba;
  2273. ADDQ $4*SIZE, ptrbb;
  2274. .L242_loopE:
  2275. #ifndef TRMMKERNEL
  2276. TEST $1, bk;
  2277. #else
  2278. MOVQ kkk, %rax;
  2279. TEST $1, %rax;
  2280. #endif
  2281. JLE .L243_loopE;
  2282. ALIGN_5
  2283. .L243_bodyB:
  2284. BROAD_DX 0*SIZE(ptrba), xvec0;
  2285. LD_DX 0*SIZE(ptrbb), xvec2;
  2286. MUL_DX xvec0, xvec2, xvec2;
  2287. ADD_DX xvec2, xvec15, xvec15;
  2288. ADDQ $1*SIZE, ptrba;
  2289. ADDQ $2*SIZE, ptrbb;
  2290. .L243_loopE:
  2291. BROAD_DX MEMALPHA, xvec7;
  2292. MUL_DX xvec7, xvec15, xvec15;
  2293. #ifndef TRMMKERNEL
  2294. LDL_DX 0*SIZE(C0), xvec0, xvec0;
  2295. LDH_DX 0*SIZE(C1), xvec0, xvec0;
  2296. ADD_DX xvec0, xvec15, xvec15;
  2297. #endif
  2298. STL_DX xvec15, 0*SIZE(C0);
  2299. STH_DX xvec15, 0*SIZE(C1);
  2300. #if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  2301. MOVQ bk, %rax;
  2302. SUBQ kkk, %rax;
  2303. LEAQ (,%rax, SIZE), %rax;
  2304. ADDQ %rax, ptrba;
  2305. LEAQ (ptrbb, %rax, 2), ptrbb;
  2306. #endif
  2307. #if defined(TRMMKERNEL) && defined(LEFT)
  2308. ADDQ $1, kk;
  2309. #endif
  2310. ADDQ $1*SIZE, C0;
  2311. ADDQ $1*SIZE, C1;
  2312. .L24_loopE:
  2313. #if defined(TRMMKERNEL) && !defined(LEFT)
  2314. ADDQ $2, kk;
  2315. #endif
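# Advance B past the 2-column panel (bk*2 doubles) and move C forward by two columns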
  2316. MOVQ bk, k;
  2317. SALQ $4, k;
  2318. ADDQ k, bb;
  2319. LEAQ (C, ldc, 2), C;
  2320. .L20_loopE:;
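# Last single column of B (bn & 1): 8x1, 4x1, 2x1 and 1x1 blocks written through C0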
  2321. TEST $1, bn; # Rn = 1
  2322. JLE .L30_loopE;
  2323. ALIGN_5
  2324. .L30_bodyB:
  2325. #if defined(TRMMKERNEL)&&defined(LEFT)
  2326. MOVQ OFFSET, %rax;
  2327. MOVQ %rax, kk;
  2328. #endif
  2329. MOVQ C, C0;
  2330. MOVQ ba, ptrba;
  2331. MOVQ bm, i;
2332. SARQ $3, i; # Rm = 8
  2333. JLE .L31_loopE;
  2334. ALIGN_5
  2335. .L31_bodyB:
  2336. #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  2337. MOVQ bb, ptrbb;
  2338. #else
  2339. MOVQ bb, ptrbb;
  2340. MOVQ kk, %rax
  2341. LEAQ (, %rax, SIZE), %rax;
  2342. LEAQ (ptrba, %rax, 8), ptrba;
  2343. ADDQ %rax, ptrbb;
  2344. #endif
  2345. #### Initial Results Register ####
  2346. XOR_DY yvec15, yvec15, yvec15;
  2347. XOR_DY yvec14, yvec14, yvec14;
  2348. #ifndef TRMMKERNEL
  2349. MOVQ bk, k;
  2350. #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
  2351. MOVQ bk, %rax;
  2352. SUBQ kk, %rax;
  2353. MOVQ %rax, kkk;
  2354. #else
  2355. MOVQ kk, %rax;
  2356. #ifdef LEFT
  2357. ADDQ $8, %rax;
  2358. #else
  2359. ADDQ $1, %rax;
  2360. #endif
  2361. MOVQ %rax, kkk;
  2362. #endif
  2363. SARQ $2, k;
  2364. JLE .L311_loopE;
  2365. ALIGN_5
  2366. .L311_bodyB:
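# 8x1 kernel: two 256-bit loads cover eight rows of A; one broadcast B element per k step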
  2367. #### Unroll time 1 ####
  2368. LD_DY 0*SIZE(ptrba), yvec0;
  2369. LD_DY 4*SIZE(ptrba), yvec1;
  2370. BROAD_DY 0*SIZE(ptrbb), yvec2;
  2371. MUL_DY yvec2, yvec0, yvec0;
  2372. ADD_DY yvec0, yvec15, yvec15;
  2373. MUL_DY yvec2, yvec1, yvec1;
  2374. ADD_DY yvec1, yvec14, yvec14;
  2375. #### Unroll time 2 ####
  2376. LD_DY 8*SIZE(ptrba), yvec3;
  2377. LD_DY 12*SIZE(ptrba), yvec4;
  2378. BROAD_DY 1*SIZE(ptrbb), yvec5;
  2379. MUL_DY yvec5, yvec3, yvec3;
  2380. ADD_DY yvec3, yvec15, yvec15;
2381. MUL_DY yvec5, yvec4, yvec4;
  2382. ADD_DY yvec4, yvec14, yvec14;
  2383. #### Unroll time 3 ####
  2384. LD_DY 16*SIZE(ptrba), yvec0;
  2385. LD_DY 20*SIZE(ptrba), yvec1;
  2386. BROAD_DY 2*SIZE(ptrbb), yvec2;
  2387. MUL_DY yvec2, yvec0, yvec0;
  2388. ADD_DY yvec0, yvec15, yvec15;
  2389. MUL_DY yvec2, yvec1, yvec1;
  2390. ADD_DY yvec1, yvec14, yvec14;
2391. #### Unroll time 4 ####
  2392. LD_DY 24*SIZE(ptrba), yvec3;
  2393. LD_DY 28*SIZE(ptrba), yvec4;
  2394. BROAD_DY 3*SIZE(ptrbb), yvec5;
  2395. MUL_DY yvec5, yvec3, yvec3;
  2396. ADD_DY yvec3, yvec15, yvec15;
  2397. ADDQ $32*SIZE, ptrba;
  2398. MUL_DY yvec5, yvec4, yvec4;
  2399. ADD_DY yvec4, yvec14, yvec14;
  2400. ADDQ $4*SIZE, ptrbb;
  2401. DECQ k;
  2402. JG .L311_bodyB;
  2403. ALIGN_5
  2404. .L311_loopE:
  2405. #ifndef TRMMKERNEL
  2406. TEST $2, bk;
  2407. #else
  2408. MOVQ kkk, %rax;
  2409. TEST $2, %rax;
  2410. #endif
  2411. JLE .L312_loopE;
  2412. ALIGN_5
  2413. .L312_bodyB:
  2414. #### Unroll time 1 ####
  2415. LD_DY 0*SIZE(ptrba), yvec0;
  2416. LD_DY 4*SIZE(ptrba), yvec1;
  2417. BROAD_DY 0*SIZE(ptrbb), yvec2;
  2418. MUL_DY yvec2, yvec0, yvec0;
  2419. ADD_DY yvec0, yvec15, yvec15;
  2420. MUL_DY yvec2, yvec1, yvec1;
  2421. ADD_DY yvec1, yvec14, yvec14;
  2422. #### Unroll time 2 ####
  2423. LD_DY 8*SIZE(ptrba), yvec3;
  2424. LD_DY 12*SIZE(ptrba), yvec4;
  2425. BROAD_DY 1*SIZE(ptrbb), yvec5;
  2426. MUL_DY yvec5, yvec3, yvec3;
  2427. ADD_DY yvec3, yvec15, yvec15;
  2428. ADDQ $16*SIZE, ptrba;
2429. MUL_DY yvec5, yvec4, yvec4;
  2430. ADD_DY yvec4, yvec14, yvec14;
  2431. ADDQ $2*SIZE, ptrbb;
  2432. .L312_loopE:
  2433. #ifndef TRMMKERNEL
  2434. TEST $1, bk;
  2435. #else
  2436. MOVQ kkk, %rax;
  2437. TEST $1, %rax;
  2438. #endif
  2439. JLE .L313_loopE;
  2440. ALIGN_5
  2441. .L313_bodyB:
  2442. #### Unroll time 1 ####
  2443. LD_DY 0*SIZE(ptrba), yvec0;
  2444. LD_DY 4*SIZE(ptrba), yvec1;
  2445. BROAD_DY 0*SIZE(ptrbb), yvec2;
  2446. MUL_DY yvec2, yvec0, yvec0;
  2447. ADD_DY yvec0, yvec15, yvec15;
  2448. ADDQ $8*SIZE, ptrba;
  2449. MUL_DY yvec2, yvec1, yvec1;
  2450. ADD_DY yvec1, yvec14, yvec14;
  2451. ADDQ $1*SIZE, ptrbb;
  2452. .L313_loopE:
  2453. #### Multiply Alpha ####
  2454. BROAD_DY MEMALPHA, yvec7;
  2455. MUL_DY yvec7, yvec15, yvec15;
  2456. MUL_DY yvec7, yvec14, yvec14;
  2457. #### Testing Alignment ####
  2458. MOVQ C0, %rax;
  2459. OR ldc, %rax;
  2460. TEST $15, %rax;
  2461. JNE .L313_loopEx;
  2462. ALIGN_5
  2463. #### Writing Back ####
  2464. EXTRA_DY $1, yvec15, xvec13;
  2465. EXTRA_DY $1, yvec14, xvec12;
  2466. #ifndef TRMMKERNEL
  2467. ADD_DX 0*SIZE(C0), xvec15, xvec15;
  2468. ADD_DX 2*SIZE(C0), xvec13, xvec13;
  2469. ADD_DX 4*SIZE(C0), xvec14, xvec14;
  2470. ADD_DX 6*SIZE(C0), xvec12, xvec12;
  2471. #endif
  2472. ST_DX xvec15, 0*SIZE(C0);
  2473. ST_DX xvec13, 2*SIZE(C0);
  2474. ST_DX xvec14, 4*SIZE(C0);
  2475. ST_DX xvec12, 6*SIZE(C0);
  2476. #if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  2477. MOVQ bk, %rax;
  2478. SUBQ kkk, %rax;
  2479. LEAQ (,%rax, SIZE), %rax;
  2480. LEAQ (ptrba, %rax, 8), ptrba;
  2481. ADDQ %rax, ptrbb;
  2482. #endif
  2483. #if defined(TRMMKERNEL)&&defined(LEFT)
  2484. ADDQ $8, kk;
  2485. #endif
  2486. ADDQ $8*SIZE, C0;
  2487. DECQ i;
  2488. JG .L31_bodyB;
  2489. JMP .L31_loopE;
  2490. ALIGN_5
  2491. .L313_loopEx:
  2492. EXTRA_DY $1, yvec15, xvec13;
  2493. EXTRA_DY $1, yvec14, xvec12;
  2494. #ifndef TRMMKERNEL
  2495. LDL_DX 0*SIZE(C0), xvec11, xvec11;
  2496. LDH_DX 1*SIZE(C0), xvec11, xvec11;
  2497. LDL_DX 2*SIZE(C0), xvec10, xvec10;
  2498. LDH_DX 3*SIZE(C0), xvec10, xvec10;
  2499. LDL_DX 4*SIZE(C0), xvec9, xvec9;
  2500. LDH_DX 5*SIZE(C0), xvec9, xvec9;
  2501. LDL_DX 6*SIZE(C0), xvec8, xvec8;
  2502. LDH_DX 7*SIZE(C0), xvec8, xvec8;
  2503. ADD_DX xvec11, xvec15, xvec15;
  2504. ADD_DX xvec10, xvec13, xvec13;
  2505. ADD_DX xvec9, xvec14, xvec14;
  2506. ADD_DX xvec8, xvec12, xvec12;
  2507. #endif
  2508. STL_DX xvec15, 0*SIZE(C0);
  2509. STH_DX xvec15, 1*SIZE(C0);
  2510. STL_DX xvec13, 2*SIZE(C0);
  2511. STH_DX xvec13, 3*SIZE(C0);
  2512. STL_DX xvec14, 4*SIZE(C0);
  2513. STH_DX xvec14, 5*SIZE(C0);
  2514. STL_DX xvec12, 6*SIZE(C0);
  2515. STH_DX xvec12, 7*SIZE(C0);
  2516. #if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  2517. MOVQ bk, %rax;
  2518. SUBQ kkk, %rax;
  2519. LEAQ (,%rax, SIZE), %rax;
  2520. LEAQ (ptrba, %rax, 8), ptrba;
  2521. ADDQ %rax, ptrbb;
  2522. #endif
  2523. #if defined(TRMMKERNEL)&&defined(LEFT)
  2524. ADDQ $8, kk;
  2525. #endif
  2526. ADDQ $8*SIZE, C0;
  2527. DECQ i;
  2528. JG .L31_bodyB;
  2529. .L31_loopE:
2530. TEST $4, bm; # Rm = 4
  2531. JLE .L32_loopE;
  2532. ALIGN_5
  2533. .L32_bodyB:
  2534. #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  2535. MOVQ bb, ptrbb;
  2536. #else
  2537. MOVQ bb, ptrbb;
  2538. MOVQ kk, %rax;
  2539. LEAQ (,%rax, SIZE), %rax;
  2540. LEAQ (ptrba, %rax, 4), ptrba;
  2541. ADDQ %rax, ptrbb;
  2542. #endif
  2543. #### Initial Results Register ####
  2544. XOR_DY yvec15, yvec15, yvec15;
  2545. #ifndef TRMMKERNEL
  2546. MOVQ bk, k;
  2547. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2548. MOVQ bk, %rax;
  2549. SUBQ kk, %rax;
  2550. MOVQ %rax, kkk;
  2551. #else
  2552. MOVQ kk, %rax;
  2553. #ifdef LEFT
  2554. ADDQ $4, %rax;
  2555. #else
  2556. ADDQ $1, %rax;
  2557. #endif
  2558. MOVQ %rax, kkk
  2559. #endif
  2560. SARQ $2, k;
  2561. JLE .L321_loopE;
  2562. ALIGN_5
  2563. .L321_bodyB:
  2564. LD_DY 0*SIZE(ptrba), yvec0;
  2565. BROAD_DY 0*SIZE(ptrbb), yvec1;
  2566. MUL_DY yvec0, yvec1, yvec1;
  2567. ADD_DY yvec1, yvec15, yvec15;
  2568. LD_DY 4*SIZE(ptrba), yvec2;
  2569. BROAD_DY 1*SIZE(ptrbb), yvec3;
  2570. MUL_DY yvec2, yvec3, yvec3;
  2571. ADD_DY yvec3, yvec15, yvec15;
  2572. LD_DY 8*SIZE(ptrba), yvec4;
  2573. BROAD_DY 2*SIZE(ptrbb), yvec5;
  2574. MUL_DY yvec4, yvec5, yvec5;
  2575. ADD_DY yvec5, yvec15, yvec15;
  2576. LD_DY 12*SIZE(ptrba), yvec6;
  2577. BROAD_DY 3*SIZE(ptrbb), yvec7;
  2578. MUL_DY yvec6, yvec7, yvec7;
  2579. ADD_DY yvec7, yvec15, yvec15;
  2580. ADDQ $16*SIZE, ptrba;
  2581. ADDQ $4*SIZE, ptrbb;
  2582. DECQ k;
  2583. JG .L321_bodyB;
  2584. ALIGN_5
  2585. .L321_loopE:
  2586. #ifndef TRMMKERNEL
  2587. TEST $2, bk;
  2588. #else
  2589. MOVQ kkk, %rax;
  2590. TEST $2, %rax;
  2591. #endif
  2592. JLE .L322_loopE;
  2593. ALIGN_5
  2594. .L322_bodyB:
  2595. LD_DY 0*SIZE(ptrba), yvec0;
  2596. BROAD_DY 0*SIZE(ptrbb), yvec1;
  2597. MUL_DY yvec0, yvec1, yvec1;
  2598. ADD_DY yvec1, yvec15, yvec15;
  2599. LD_DY 4*SIZE(ptrba), yvec2;
  2600. BROAD_DY 1*SIZE(ptrbb), yvec3;
  2601. MUL_DY yvec2, yvec3, yvec3;
  2602. ADD_DY yvec3, yvec15, yvec15;
  2603. ADDQ $8*SIZE, ptrba;
  2604. ADDQ $2*SIZE, ptrbb;
  2605. .L322_loopE:
  2606. #ifndef TRMMKERNEL
  2607. TEST $1, bk;
  2608. #else
  2609. MOVQ kkk, %rax;
  2610. TEST $1, %rax;
  2611. #endif
  2612. JLE .L323_loopE;
  2613. ALIGN_5
  2614. .L323_bodyB:
  2615. LD_DY 0*SIZE(ptrba), yvec0;
  2616. BROAD_DY 0*SIZE(ptrbb), yvec1;
  2617. MUL_DY yvec0, yvec1, yvec1;
  2618. ADD_DY yvec1, yvec15, yvec15;
  2619. ADDQ $4*SIZE, ptrba;
  2620. ADDQ $1*SIZE, ptrbb;
  2621. .L323_loopE:
  2622. #### Multiply Alpha ####
  2623. BROAD_DY MEMALPHA, yvec7;
  2624. MUL_DY yvec7, yvec15, yvec15;
  2625. #### Testing Alignment ####
  2626. MOVQ C0, %rax;
  2627. OR ldc, %rax;
  2628. TEST $15, %rax;
  2629. JNE .L323_loopEx;
  2630. ALIGN_5
  2631. #### Writing Back ####
  2632. EXTRA_DY $1, yvec15, xvec14;
  2633. #ifndef TRMMKERNEL
  2634. ADD_DX 0*SIZE(C0), xvec15, xvec15;
  2635. ADD_DX 2*SIZE(C0), xvec14, xvec14;
  2636. #endif
  2637. ST_DX xvec15, 0*SIZE(C0);
  2638. ST_DX xvec14, 2*SIZE(C0);
  2639. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  2640. MOVQ bk, %rax;
  2641. SUBQ kkk, %rax;
  2642. LEAQ (, %rax, SIZE), %rax;
  2643. LEAQ (ptrba, %rax, 4), ptrba;
  2644. ADDQ %rax, ptrbb;
  2645. #endif
  2646. #if defined(TRMMKERNEL) && defined(LEFT)
  2647. ADDQ $4, kk
  2648. #endif
  2649. ADDQ $4*SIZE, C0;
  2650. JMP .L32_loopE;
  2651. ALIGN_5
  2652. .L323_loopEx:
  2653. #### Writing Back ####
  2654. EXTRA_DY $1, yvec15, xvec14;
  2655. #ifndef TRMMKERNEL
  2656. LDL_DX 0*SIZE(C0), xvec13, xvec13;
  2657. LDH_DX 1*SIZE(C0), xvec13, xvec13;
  2658. LDL_DX 2*SIZE(C0), xvec12, xvec12;
  2659. LDH_DX 3*SIZE(C0), xvec12, xvec12;
  2660. ADD_DX xvec13, xvec15, xvec15;
  2661. ADD_DX xvec12, xvec14, xvec14;
  2662. #endif
  2663. STL_DX xvec15, 0*SIZE(C0);
  2664. STH_DX xvec15, 1*SIZE(C0);
  2665. STL_DX xvec14, 2*SIZE(C0);
  2666. STH_DX xvec14, 3*SIZE(C0);
  2667. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  2668. MOVQ bk, %rax;
  2669. SUBQ kkk, %rax;
  2670. LEAQ (, %rax, SIZE), %rax;
  2671. LEAQ (ptrba, %rax, 4), ptrba;
  2672. ADDQ %rax, ptrbb;
  2673. #endif
  2674. #if defined(TRMMKERNEL) && defined(LEFT)
  2675. ADDQ $4, kk
  2676. #endif
  2677. ADDQ $4*SIZE, C0;
  2678. .L32_loopE:
2679. TEST $2, bm; # Rm = 2
  2680. JLE .L33_loopE;
  2681. ALIGN_5
  2682. .L33_bodyB:
  2683. #if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2684. MOVQ bb, ptrbb;
  2685. #else
  2686. MOVQ bb, ptrbb;
  2687. MOVQ kk, %rax
  2688. LEAQ (, %rax, SIZE), %rax
  2689. LEAQ (ptrba, %rax, 2), ptrba
  2690. ADDQ %rax, ptrbb;
  2691. #endif
  2692. #### Initial Result ####
  2693. XOR_DY yvec15, yvec15, yvec15;
  2694. #ifndef TRMMKERNEL
  2695. MOVQ bk, k;
  2696. #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
  2697. MOVQ bk, %rax;
  2698. SUBQ kk, %rax;
  2699. MOVQ %rax, kkk;
  2700. #else
  2701. MOVQ kk, %rax;
  2702. #ifdef LEFT
  2703. ADDQ $2, %rax;
  2704. #else
  2705. ADDQ $1, %rax;
  2706. #endif
  2707. MOVQ %rax, kkk;
  2708. #endif
  2709. SARQ $2, k;
  2710. JLE .L331_loopE;
  2711. ALIGN_5
  2712. .L331_bodyB:
  2713. LD_DX 0*SIZE(ptrba), xvec0;
  2714. BROAD_DX 0*SIZE(ptrbb), xvec2;
  2715. MUL_DX xvec0, xvec2, xvec2;
  2716. ADD_DX xvec2, xvec15, xvec15;
  2717. LD_DX 2*SIZE(ptrba), xvec1;
  2718. BROAD_DX 1*SIZE(ptrbb), xvec3;
  2719. MUL_DX xvec1, xvec3, xvec3;
  2720. ADD_DX xvec3, xvec15, xvec15;
  2721. LD_DX 4*SIZE(ptrba), xvec4;
  2722. BROAD_DX 2*SIZE(ptrbb), xvec5;
  2723. MUL_DX xvec4, xvec5, xvec5;
  2724. ADD_DX xvec5, xvec15, xvec15;
  2725. LD_DX 6*SIZE(ptrba), xvec6;
  2726. BROAD_DX 3*SIZE(ptrbb), xvec7;
  2727. MUL_DX xvec6, xvec7, xvec7;
  2728. ADD_DX xvec7, xvec15, xvec15;
  2729. ADDQ $8*SIZE, ptrba;
  2730. ADDQ $4*SIZE, ptrbb;
  2731. DECQ k;
  2732. JG .L331_bodyB;
  2733. ALIGN_5
  2734. .L331_loopE:
  2735. #ifndef TRMMKERNEL
  2736. TEST $2,bk;
  2737. #else
  2738. MOVQ kkk, %rax;
  2739. TEST $2, %rax
  2740. #endif
  2741. JLE .L332_loopE;
  2742. ALIGN_5
  2743. .L332_bodyB:
  2744. LD_DX 0*SIZE(ptrba), xvec0;
  2745. BROAD_DX 0*SIZE(ptrbb), xvec2;
  2746. MUL_DX xvec0, xvec2, xvec2;
  2747. ADD_DX xvec2, xvec15, xvec15;
  2748. LD_DX 2*SIZE(ptrba), xvec1;
  2749. BROAD_DX 1*SIZE(ptrbb), xvec3;
  2750. MUL_DX xvec1, xvec3, xvec3;
  2751. ADD_DX xvec3, xvec15, xvec15;
  2752. ADDQ $4*SIZE, ptrba;
  2753. ADDQ $2*SIZE, ptrbb;
  2754. .L332_loopE:
  2755. #ifndef TRMMKERNEL
  2756. TEST $1, bk;
  2757. #else
  2758. MOVQ kkk, %rax;
  2759. TEST $1, %rax;
  2760. #endif
  2761. JLE .L333_loopE;
  2762. ALIGN_5
  2763. .L333_bodyB:
  2764. LD_DX 0*SIZE(ptrba), xvec0;
  2765. BROAD_DX 0*SIZE(ptrbb), xvec2;
  2766. MUL_DX xvec0, xvec2, xvec2;
  2767. ADD_DX xvec2, xvec15, xvec15;
  2768. ADDQ $2*SIZE, ptrba;
  2769. ADDQ $1*SIZE, ptrbb;
  2770. .L333_loopE:
  2771. #### Multiply Alpha ####
  2772. BROAD_DX MEMALPHA, xvec7;
  2773. MUL_DX xvec7, xvec15, xvec15;
  2774. #ifndef TRMMKERNEL
  2775. LDL_DX 0*SIZE(C0), xvec14, xvec14;
  2776. LDH_DX 1*SIZE(C0), xvec14, xvec14;
  2777. ADD_DX xvec14, xvec15, xvec15;
  2778. #endif
  2779. STL_DX xvec15, 0*SIZE(C0);
  2780. STH_DX xvec15, 1*SIZE(C0);
  2781. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) ||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2782. MOVQ bk, %rax;
  2783. SUBQ kkk, %rax;
  2784. LEAQ (,%rax, SIZE), %rax;
  2785. LEAQ (ptrba, %rax, 2), ptrba;
  2786. ADDQ %rax, ptrbb;
  2787. #endif
  2788. #if defined(TRMMKERNEL) && defined(LEFT)
  2789. addq $2, kk
  2790. #endif
  2791. ADDQ $2*SIZE, C0;
  2792. .L33_loopE:
2793. TEST $1, bm; # Rm = 1
  2794. JLE .L34_loopE;
  2795. ALIGN_5
  2796. .L34_bodyB:
  2797. #if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2798. MOVQ bb, ptrbb;
  2799. #else
  2800. MOVQ bb, ptrbb;
  2801. MOVQ kk, %rax;
  2802. LEAQ (, %rax, SIZE), %rax;
  2803. ADDQ %rax, ptrba;
  2804. ADDQ %rax, ptrbb;
  2805. #endif
  2806. XOR_DY yvec15, yvec15, yvec15;
  2807. #ifndef TRMMKERNEL
  2808. MOVQ bk, k;
  2809. #elif (defined(LEFT)&& !defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
  2810. MOVQ bk, %rax;
  2811. SUBQ kk, %rax;
  2812. MOVQ %rax, kkk;
  2813. #else
  2814. MOVQ kk, %rax;
  2815. #ifdef LEFT
  2816. ADDQ $1, %rax;
  2817. #else
  2818. ADDQ $1, %rax;
  2819. #endif
  2820. MOVQ %rax, kkk;
  2821. #endif
  2822. SARQ $2, k;
  2823. JLE .L341_loopE;
  2824. ALIGN_5
  2825. .L341_bodyB:
  2826. vmovsd 0*SIZE(ptrba), xvec0;
  2827. vmovsd 0*SIZE(ptrbb), xvec1;
  2828. vmulsd xvec0, xvec1, xvec1;
  2829. vaddsd xvec1, xvec15, xvec15;
  2830. vmovsd 1*SIZE(ptrba), xvec0;
  2831. vmovsd 1*SIZE(ptrbb), xvec1;
  2832. vmulsd xvec0, xvec1, xvec1;
  2833. vaddsd xvec1, xvec15, xvec15;
  2834. vmovsd 2*SIZE(ptrba), xvec0;
  2835. vmovsd 2*SIZE(ptrbb), xvec1;
  2836. vmulsd xvec0, xvec1, xvec1;
  2837. vaddsd xvec1, xvec15, xvec15;
  2838. vmovsd 3*SIZE(ptrba), xvec0;
  2839. vmovsd 3*SIZE(ptrbb), xvec1;
  2840. vmulsd xvec0, xvec1, xvec1;
  2841. vaddsd xvec1, xvec15, xvec15;
  2842. addq $4*SIZE, ptrba;
  2843. addq $4*SIZE, ptrbb;
  2844. decq k;
  2845. JG .L341_bodyB;
  2846. ALIGN_5
  2847. .L341_loopE:
  2848. #ifndef TRMMKERNEL
  2849. TEST $2, bk;
  2850. #else
  2851. MOVQ kkk, %rax;
  2852. TEST $2, %rax;
  2853. #endif
  2854. JLE .L342_loopE;
  2855. ALIGN_5
  2856. .L342_bodyB:
  2857. vmovsd 0*SIZE(ptrba), xvec0;
  2858. vmovsd 0*SIZE(ptrbb), xvec1;
  2859. vmulsd xvec0, xvec1, xvec1;
  2860. vaddsd xvec1, xvec15, xvec15;
  2861. vmovsd 1*SIZE(ptrba), xvec0;
  2862. vmovsd 1*SIZE(ptrbb), xvec1;
  2863. vmulsd xvec0, xvec1, xvec1;
  2864. vaddsd xvec1, xvec15, xvec15;
  2865. addq $2*SIZE, ptrba;
  2866. addq $2*SIZE, ptrbb;
  2867. .L342_loopE:
  2868. #ifndef TRMMKERNEL
  2869. TEST $1, bk
  2870. #else
  2871. MOVQ kkk, %rax;
  2872. TEST $1, %rax;
  2873. #endif
  2874. JLE .L343_loopE;
  2875. ALIGN_5
  2876. .L343_bodyB:
  2877. vmovsd 0*SIZE(ptrba), xvec0;
  2878. vmovsd 0*SIZE(ptrbb), xvec1;
  2879. vmulsd xvec0, xvec1, xvec1;
  2880. vaddsd xvec1, xvec15, xvec15;
  2881. addq $1*SIZE, ptrba;
  2882. addq $1*SIZE, ptrbb;
  2883. .L343_loopE:
2884. #### Multiply Alpha & Writing Back ####
  2885. vmovsd MEMALPHA, xvec7;
  2886. vmulsd xvec7, xvec15, xvec15;
  2887. #ifndef TRMMKERNEL
  2888. vmovsd 0*SIZE(C0), xvec0;
  2889. vaddsd xvec0, xvec15, xvec15;
  2890. #endif
2891. vmovsd xvec15, 0*SIZE(C0);
  2892. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) ||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2893. MOVQ bk, %rax;
  2894. SUBQ kkk, %rax;
  2895. LEAQ (,%rax, SIZE), %rax;
  2896. ADDQ %rax, ptrba;
  2897. ADDQ %rax, ptrbb;
  2898. #endif
  2899. #if defined(TRMMKERNEL) && defined(LEFT)
  2900. addq $1, kk
  2901. #endif
  2902. addq $1*SIZE, C0;
  2903. .L34_loopE:
  2904. MOVQ bk, k
  2905. SALQ $3, k;
  2906. ADDQ k, bb;
  2907. LEAQ (C, ldc, 1), C;
  2908. .L30_loopE:
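# Epilogue: restore callee-saved registers, clear upper YMM state and return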
  2909. movq 0(%rsp), %rbx;
  2910. movq 8(%rsp), %rbp;
  2911. movq 16(%rsp), %r12;
  2912. movq 24(%rsp), %r13;
  2913. movq 32(%rsp), %r14;
  2914. movq 40(%rsp), %r15;
  2915. vzeroupper
  2916. #ifdef WINDOWS_ABI
  2917. movq 48(%rsp), %rdi
  2918. movq 56(%rsp), %rsi
  2919. movups 64(%rsp), %xmm6
  2920. movups 80(%rsp), %xmm7
  2921. movups 96(%rsp), %xmm8
  2922. movups 112(%rsp), %xmm9
  2923. movups 128(%rsp), %xmm10
  2924. movups 144(%rsp), %xmm11
  2925. movups 160(%rsp), %xmm12
  2926. movups 176(%rsp), %xmm13
  2927. movups 192(%rsp), %xmm14
  2928. movups 208(%rsp), %xmm15
  2929. #endif
  2930. addq $STACKSIZE, %rsp;
  2931. ret
  2932. EPILOGUE