You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and can be up to 35 characters long.

dgemm_kernel_6x4_piledriver.S 52 kB

(Line-number gutter captured from the code viewer for lines 1–1734 of dgemm_kernel_6x4_piledriver.S; not part of the source itself.)
  1. /****************************************************************************
  2. Copyright (c) 2013, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. // register blocking = 6x4; the k loop is unrolled by 4.
  28. // Use FMA3 on Piledriver.
  29. // Todo: 1) handle the edge cases (M/N remainders). 2) Add Windows ABI support.
  30. #define ASSEMBLER
  31. #include "common.h"
  // Local stack frame: 48 bytes for the six callee-saved GPRs saved in the
  // prologue, plus scratch slots (alpha at offset 48, j at offset 56).
  32. #define STACKSIZE 128
  // Incoming SysV argument registers holding the three loop bounds
  // (presumably M, N, K of the GEMM — standard OpenBLAS kernel convention;
  // confirm against the packing code).  They are moved to callee-saved
  // homes immediately after the prologue.
  33. #define oldbk_i %rdi
  34. #define oldbk_j %rsi
  35. #define oldbk_l %rdx
  // Loop bounds kept in callee-saved registers for the whole kernel.
  36. #define _bk_i %r13
  37. #define _bk_j %r14
  38. #define _bk_l %r15
  39. #define ALPHA %xmm0
  // Base pointers of the packed A/B panels and the C matrix
  // (SysV integer arguments 4-6).
  40. #define _ptr_A %rcx
  41. #define _ptr_B %r8
  42. #define _ptr_C %r9
  43. #define LDC %r10
  // i = row-block counter, k = inner (dot-product) counter.
  44. #define i %r11
  45. #define k %rax
  46. #define _pre_B %r12
  // Working pointers that walk A, B and two column-pairs of C inside the
  // loops; they reuse %rdi/%rsi after the original arguments are saved.
  47. #define _ptr__A_0 %rdi
  48. #define _ptr__B_0 %rsi
  49. #define _ptr__C_0 %rbx
  50. #define _ptr__C_1 %rbp
  // Stack-based operands: ldc is the 7th argument passed on the caller's
  // stack (hence the +STACKSIZE after the frame adjustment); alpha and j
  // live in the local scratch slots noted above.
  51. #define old_ldc 8+STACKSIZE(%rsp)
  52. #define alpha 48(%rsp)
  53. #define j 56(%rsp)
  // ---- Macro layer, 256-bit ("2560"/"2561") variants -------------------
  // NOTE(review): the numeric suffix appears to encode the vector width of
  // the generator that produced this file ("2560" = YMM forms, "1280" =
  // XMM forms; the trailing digit selects a sub-variant).  Only the
  // 128-bit forms are referenced by the code visible below — confirm
  // against the kernel generator before removing anything.
  // Scalar/integer helpers (thin wrappers over the GAS mnemonics).
  54. #define MOVQ2560(s,d) movq s,d
  55. #define LEAQ2560(s,d) leaq s,d
  56. #define SARQ2560(imm,n) sarq imm,n
  57. #define ADDQ2560(off,addr) addq off,addr
  58. #define SUBQ2560(off,addr) subq off,addr
  // NOTE(review): divq/mulq are one-operand x86 instructions; these
  // two-operand expansions would not assemble if ever instantiated.
  // They appear unused in the visible code path.
  59. #define DIVQ2560(off,addr) divq off,addr
  60. #define MULQ2560(s,d) mulq s,d
  61. #define DECQ2560(addr) decq addr
  62. #define NEGQ2560(s) negq s
  63. #define TESTQ2560(n,addr) testq n,addr
  64. #define SALQ2560(imm,n) salq imm,n
  65. #define MOVQ1280(s,d) movq s,d
  66. #define LEAQ1280(s,d) leaq s,d
  67. #define SARQ1280(imm,n) sarq imm,n
  68. #define ADDQ1280(off,addr) addq off,addr
  69. #define SUBQ1280(off,addr) subq off,addr
  // NOTE(review): same one-operand caveat as DIVQ2560/MULQ2560 above.
  70. #define DIVQ1280(off,addr) divq off,addr
  71. #define CMPQ1280(off,addr) cmpq off,addr
  72. #define MULQ1280(s,d) mulq s,d
  73. #define DECQ1280(addr) decq addr
  74. #define NEGQ1280(s) negq s
  75. #define TESTQ1280(n,addr) testq n,addr
  76. #define SALQ1280(imm,n) salq imm,n
  77. #define JG jg
  78. #define JLE jle
  // 256-bit SIMD operations (unused by the visible 128-bit code path).
  // The "M" prefix swaps the first two operands; the second macro argument
  // of the prefetch forms is ignored.
  79. #define VLD2560(addr,reg) vmovapd addr,reg
  80. #define VST2560(reg,addr) vmovapd reg,addr
  81. #define VMUL2560(a,b,c) vmulpd a,b,c
  82. #define MVMUL2560(a,b,c) vmulpd b,a,c
  83. #define VADD2560(a,b,c) vaddpd a,b,c
  84. #define MVADD2560(a,b,c) vaddpd b,a,c
  85. #define VSHUF2560(imm,s,d) vpermilpd imm,s,d
  86. #define VSHUF2F2560(imm,s1,s2,d) vperm2f128 imm,s1,s2,d
  87. #define BROAD2560(addr,reg) vbroadcastsd addr,reg
  88. #define MOVRR2560(a,b) vmovapd a,b
  89. #define REVS2560(imm,s1,s2,d) vshufpd imm,s1,s2,d
  90. #define EXTR2561(imm,a,b) vextractf128 imm,a,b
  91. #define LDL2561(addr,reg) vmovlpd addr,reg,reg
  92. #define LDH2561(addr,reg) vmovhpd addr,reg,reg
  93. #define STL2561(reg,addr) vmovlpd reg,addr
  94. #define STH2561(reg,addr) vmovhpd reg,addr
  95. #define VADD2561(a,b,c) vaddpd a,b,c
  96. #define VXOR2560(a,b,c) vxorpd a,b,c
  97. #define PREFETCH02560(addr,b) prefetcht0 addr
  98. #define PREFETCH12560(addr,b) prefetcht0 addr
  99. #define PREFETCH22560(addr,b) prefetcht2 addr
  100. #define PREFETCHW2560(addr,b) prefetchw addr
  101. #define PREFETCHN2560(addr,b) prefetchnta addr
  // Fused multiply-add in 4-operand FMA4 form (vfmaddpd d = a*b + c).
  102. #define VMA2560(a,b,c,d) vfmaddpd d,a,b,c
  103. #define MVMA2560(a,b,c,d) vfmaddpd d,a,b,c
  // ---- Macro layer, 128-bit ("1280"/"1281"/"1282") variants ------------
  // These are the forms actually used by the kernel body below.
  // "1282" = packed 2-double XMM op, "1281" = scalar double op.
  // Aligned loads/stores (vmovapd) vs. unaligned (vmovupd) vs. scalar (movsd).
  104. #define VLD1280(addr,reg) vmovapd addr,reg
  105. #define VLD1282(addr,reg) vmovapd addr,reg
  106. #define VLD1281(addr,reg) movsd addr,reg
  107. #define VST1280(reg,addr) vmovapd reg,addr
  108. #define VST1282(reg,addr) vmovapd reg,addr
  109. #define VST1281(reg,addr) movsd reg,addr
  110. #define VLDU1282(addr,reg) vmovupd addr,reg
  111. #define VLDU1281(addr,reg) movsd addr,reg
  112. #define VSTU1282(reg,addr) vmovupd reg,addr
  113. #define VSTU1281(reg,addr) movsd reg,addr
  114. #define VMUL1280(a,b,c) vmulpd a,b,c
  115. #define VMUL1282(a,b,c) vmulpd a,b,c
  116. #define VMUL1281(a,b,c) vmulpd a,b,c
  117. #define MVMUL1280(a,b,c) vmulpd b,a,c
  118. #define VADD1280(a,b,c) vaddpd a,b,c
  119. #define MVADD1280(a,b,c) vaddpd b,a,c
  120. #define VSHUF1280(imm,s,d) vpermilpd imm,s,d
  121. #define VSHUF2F1280(imm,s1,s2,d) vperm2f128 imm,s1,s2,d
  // BROAD* duplicates one double into both lanes of an XMM register.
  122. #define BROAD1280(addr,reg) vmovddup addr,reg
  123. #define BROAD1282(addr,reg) vmovddup addr,reg
  124. #define BROAD1281(addr,reg) movddup addr,reg
  125. #define MOVRR1280(a,b) vmovapd a,b
  126. #define REVS1280(imm,s1,s2,d) vshufpd imm,s1,s2,d
  127. #define EXTR1281(imm,a,b) vextractf128 imm,a,b
  128. #define LDL1281(addr,reg) vmovlpd addr,reg,reg
  129. #define LDH1281(addr,reg) vmovhpd addr,reg,reg
  130. #define STL1281(reg,addr) vmovlpd reg,addr
  131. #define STH1281(reg,addr) vmovhpd reg,addr
  132. #define VADD1281(a,b,c) vaddpd a,b,c
  133. #define VXOR1280(a,b,c) vxorpd a,b,c
  134. #define VXOR1282(a,b,c) vxorpd a,b,c
  135. #define VXOR1281(a,b,c) vxorpd a,b,c
  136. #define PREFETCH01280(addr,b) prefetcht0 addr
  137. #define PREFETCH11280(addr,b) prefetcht0 addr
  138. #define PREFETCH21280(addr,b) prefetcht2 addr
  139. #define PREFETCHW1280(addr,b) prefetchw addr
  140. #define PREFETCHN1280(addr,b) prefetchnta addr
  // Fused multiply-add.  Per the file header ("Use FMA3 on piledriver"),
  // the variants used by the kernel expand to 3-operand FMA3
  // vfmadd231pd (c = a*b + c); the fourth macro argument is ignored
  // (it is kept only so the call sites match the FMA4 signature).
  // VMA1280/MVMA1280 keep the 4-operand FMA4 vfmaddpd form.
  141. #define VMA1280(a,b,c,d) vfmaddpd d,a,b,c
  142. #define VMA1282(a,b,c,d) vfmadd231pd a,b,c
  143. #define VMA1281(a,b,c,d) vfmadd231pd a,b,c
  144. #define VMA21282(a,b,c,d) vfmadd231pd a,b,c
  145. #define VMA21281(a,b,c,d) vfmadd231pd a,b,c
  // Commented-out stubs — presumably left over from measuring the cost of
  // the FMA instructions by replacing them with nops.
  146. //#define VMA1282(a,b,c,d) nop
  147. //#define VMA1281(a,b,c,d) nop
  148. //#define VMA21282(a,b,c,d) nop
  149. //#define VMA21281(a,b,c,d) nop
  150. #define MVMA1280(a,b,c,d) vfmaddpd d,a,b,c
  // Shuffle-control immediates used by the VSHUF*/REVS* macros
  // (0x05 / 0x0a lane-swap patterns).
  151. #define imm1 $0x05
  152. #define imm3 $0x05
  153. #define imm100 $0x05
  154. #define imm200 $0x0a
  // Plain aliases so the generated code can name registers uniformly.
  155. #define XMM0 %xmm0
  156. #define XMM1 %xmm1
  157. #define XMM2 %xmm2
  158. #define XMM3 %xmm3
  159. #define XMM4 %xmm4
  160. #define XMM5 %xmm5
  161. #define XMM6 %xmm6
  162. #define XMM7 %xmm7
  163. #define XMM8 %xmm8
  164. #define XMM9 %xmm9
  165. #define XMM10 %xmm10
  166. #define XMM11 %xmm11
  167. #define XMM12 %xmm12
  168. #define XMM13 %xmm13
  169. #define XMM14 %xmm14
  170. #define XMM15 %xmm15
  171. #define YMM0 %ymm0
  172. #define YMM1 %ymm1
  173. #define YMM2 %ymm2
  174. #define YMM3 %ymm3
  175. #define YMM4 %ymm4
  176. #define YMM5 %ymm5
  177. #define YMM6 %ymm6
  178. #define YMM7 %ymm7
  179. #define YMM8 %ymm8
  180. #define YMM9 %ymm9
  181. #define YMM10 %ymm10
  182. #define YMM11 %ymm11
  183. #define YMM12 %ymm12
  184. #define YMM13 %ymm13
  185. #define YMM14 %ymm14
  186. #define YMM15 %ymm15
  187. PROLOGUE
  // Reserve the local frame, then save all six SysV callee-saved GPRs
  // at the bottom of it (offsets 0..40).
  188. subq $STACKSIZE, %rsp;
  189. movq %rbx, 0(%rsp);
  190. movq %rbp, 8(%rsp);
  191. movq %r12, 16(%rsp);
  192. movq %r13, 24(%rsp);
  193. movq %r14, 32(%rsp);
  194. movq %r15, 40(%rsp);
  // Clear upper YMM state before running SSE/AVX-128 code.
  195. vzeroupper
  // ldc is the 7th argument, read from the caller's stack (32-bit load,
  // then widened into LDC).
  196. movl old_ldc, %eax
  197. movq %rax, LDC
  // Spill alpha to the local frame so %xmm0 is free for accumulators.
  198. movlps ALPHA, alpha
  // Move the loop bounds into their callee-saved homes.
  199. movq oldbk_i, _bk_i
  200. movq oldbk_j, _bk_j
  201. movq oldbk_l, _bk_l
  // Convert ldc from elements to bytes (SIZE = element size from common.h).
  202. leaq (, LDC, SIZE), LDC
  // Outer loop: j = bk_j / 4 passes, handling four columns of C per pass
  // (the stack slot j is the loop counter).
  203. MOVQ1280(_bk_j,j);
  204. SARQ1280($2,j);
  205. JLE ._L_0_loopE;
  206. ALIGN_4;
  207. ._L_0_bodyB:;
  // Reset the A and C walking pointers for this column block;
  // _ptr__C_1 points two columns (2*ldc bytes) past _ptr__C_0.
  208. MOVQ1280(_ptr_A,_ptr__A_0);
  209. MOVQ1280(_ptr_C,_ptr__C_0);
  210. LEAQ1280((_ptr_C,LDC,2),_ptr__C_1);
  // Advance the B-prefetch pointer by bk_l * 32 bytes
  // (presumably 4 doubles of packed B per k step — confirm vs. packing).
  211. MOVQ1280(_bk_l,%rax);
  212. SALQ1280($5,%rax);
  213. ADDQ1280(%rax,_pre_B);
  // Row loop: take blocks of 6 rows while at least 6 remain
  // (the 6x4 register-blocked path), else fall through to the edge code.
  214. MOVQ1280(_bk_i,i);
  215. CMPQ1280($6,i);
  216. JL ._L_1_loopE;
  // 6x4 micro-kernel entry: rewind B for this row block, then clear the
  // twelve accumulators.  XMM0-XMM11 hold the 6x4 block of C as
  // 2-double halves: three XMM registers per column, four columns
  // (vxorpd also breaks any stale dependency on the registers).
  217. ._L_1_bodyB:;
  218. MOVQ1280(_ptr_B,_ptr__B_0);
  219. VXOR1282(XMM0,XMM0,XMM0);
  220. VXOR1282(XMM1,XMM1,XMM1);
  221. VXOR1282(XMM2,XMM2,XMM2);
  222. VXOR1282(XMM3,XMM3,XMM3);
  223. VXOR1282(XMM4,XMM4,XMM4);
  224. VXOR1282(XMM5,XMM5,XMM5);
  225. VXOR1282(XMM6,XMM6,XMM6);
  226. VXOR1282(XMM7,XMM7,XMM7);
  227. VXOR1282(XMM8,XMM8,XMM8);
  228. VXOR1282(XMM9,XMM9,XMM9);
  229. VXOR1282(XMM10,XMM10,XMM10);
  230. VXOR1282(XMM11,XMM11,XMM11);
  // Prime the four target C columns with non-temporal prefetches
  // (prefetchnta via the PREFETCHN1280 macro).
  231. PREFETCHN1280(3*SIZE(_ptr__C_0),N);
  232. PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N);
  233. PREFETCHN1280(3*SIZE(_ptr__C_1),N);
  234. PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N);
  // k = bk_l / 4: the main loop below performs four rank-1 updates per
  // iteration (A advances 24 doubles, B advances 16 per pass).
  235. MOVQ1280(_bk_l,k);
  236. SARQ1280($2,k);
  237. JLE ._L_2_loopE;
  238. ALIGN_4;
  239. ._L_2_bodyB:;
  240. PREFETCH01280(160*SIZE(_ptr__A_0),0);
  241. BROAD1282(0*SIZE(_ptr__B_0),XMM15);
  242. VLD1282(0*SIZE(_ptr__A_0),XMM12);
  243. VMA1282(XMM12,XMM15,XMM0,XMM0);
  244. VLD1282(2*SIZE(_ptr__A_0),XMM13);
  245. VMA1282(XMM13,XMM15,XMM1,XMM1);
  246. VLD1282(4*SIZE(_ptr__A_0),XMM14);
  247. VMA1282(XMM14,XMM15,XMM2,XMM2);
  248. BROAD1282(1*SIZE(_ptr__B_0),XMM15);
  249. VMA1282(XMM12,XMM15,XMM3,XMM3);
  250. VMA1282(XMM13,XMM15,XMM4,XMM4);
  251. VMA1282(XMM14,XMM15,XMM5,XMM5);
  252. BROAD1282(2*SIZE(_ptr__B_0),XMM15);
  253. VMA1282(XMM12,XMM15,XMM6,XMM6);
  254. VMA1282(XMM13,XMM15,XMM7,XMM7);
  255. VMA1282(XMM14,XMM15,XMM8,XMM8);
  256. BROAD1282(3*SIZE(_ptr__B_0),XMM15);
  257. VMA1282(XMM12,XMM15,XMM9,XMM9);
  258. VMA1282(XMM13,XMM15,XMM10,XMM10);
  259. VMA1282(XMM14,XMM15,XMM11,XMM11);
  260. PREFETCH21280(168*SIZE(_ptr__A_0),2);
  261. BROAD1282(4*SIZE(_ptr__B_0),XMM15);
  262. VLD1282(6*SIZE(_ptr__A_0),XMM12);
  263. VMA1282(XMM12,XMM15,XMM0,XMM0);
  264. VLD1282(8*SIZE(_ptr__A_0),XMM13);
  265. VMA1282(XMM13,XMM15,XMM1,XMM1);
  266. VLD1282(10*SIZE(_ptr__A_0),XMM14);
  267. VMA1282(XMM14,XMM15,XMM2,XMM2);
  268. BROAD1282(5*SIZE(_ptr__B_0),XMM15);
  269. VMA1282(XMM12,XMM15,XMM3,XMM3);
  270. VMA1282(XMM13,XMM15,XMM4,XMM4);
  271. VMA1282(XMM14,XMM15,XMM5,XMM5);
  272. BROAD1282(6*SIZE(_ptr__B_0),XMM15);
  273. VMA1282(XMM12,XMM15,XMM6,XMM6);
  274. VMA1282(XMM13,XMM15,XMM7,XMM7);
  275. VMA1282(XMM14,XMM15,XMM8,XMM8);
  276. BROAD1282(7*SIZE(_ptr__B_0),XMM15);
  277. VMA1282(XMM12,XMM15,XMM9,XMM9);
  278. VMA1282(XMM13,XMM15,XMM10,XMM10);
  279. VMA1282(XMM14,XMM15,XMM11,XMM11);
  280. PREFETCH01280(176*SIZE(_ptr__A_0),0);
  281. BROAD1282(8*SIZE(_ptr__B_0),XMM15);
  282. VLD1282(12*SIZE(_ptr__A_0),XMM12);
  283. VMA1282(XMM12,XMM15,XMM0,XMM0);
  284. VLD1282(14*SIZE(_ptr__A_0),XMM13);
  285. VMA1282(XMM13,XMM15,XMM1,XMM1);
  286. VLD1282(16*SIZE(_ptr__A_0),XMM14);
  287. VMA1282(XMM14,XMM15,XMM2,XMM2);
  288. BROAD1282(9*SIZE(_ptr__B_0),XMM15);
  289. VMA1282(XMM12,XMM15,XMM3,XMM3);
  290. VMA1282(XMM13,XMM15,XMM4,XMM4);
  291. VMA1282(XMM14,XMM15,XMM5,XMM5);
  292. BROAD1282(10*SIZE(_ptr__B_0),XMM15);
  293. VMA1282(XMM12,XMM15,XMM6,XMM6);
  294. VMA1282(XMM13,XMM15,XMM7,XMM7);
  295. VMA1282(XMM14,XMM15,XMM8,XMM8);
  296. BROAD1282(11*SIZE(_ptr__B_0),XMM15);
  297. VMA1282(XMM12,XMM15,XMM9,XMM9);
  298. VMA1282(XMM13,XMM15,XMM10,XMM10);
  299. VMA1282(XMM14,XMM15,XMM11,XMM11);
  300. PREFETCH21280(184*SIZE(_ptr__A_0),2);
  301. BROAD1282(12*SIZE(_ptr__B_0),XMM15);
  302. VLD1282(18*SIZE(_ptr__A_0),XMM12);
  303. VMA1282(XMM12,XMM15,XMM0,XMM0);
  304. VLD1282(20*SIZE(_ptr__A_0),XMM13);
  305. VMA1282(XMM13,XMM15,XMM1,XMM1);
  306. VLD1282(22*SIZE(_ptr__A_0),XMM14);
  307. VMA1282(XMM14,XMM15,XMM2,XMM2);
  308. BROAD1282(13*SIZE(_ptr__B_0),XMM15);
  309. VMA1282(XMM12,XMM15,XMM3,XMM3);
  310. VMA1282(XMM13,XMM15,XMM4,XMM4);
  311. VMA1282(XMM14,XMM15,XMM5,XMM5);
  312. BROAD1282(14*SIZE(_ptr__B_0),XMM15);
  313. VMA1282(XMM12,XMM15,XMM6,XMM6);
  314. VMA1282(XMM13,XMM15,XMM7,XMM7);
  315. VMA1282(XMM14,XMM15,XMM8,XMM8);
  316. BROAD1282(15*SIZE(_ptr__B_0),XMM15);
  317. VMA1282(XMM12,XMM15,XMM9,XMM9);
  318. VMA1282(XMM13,XMM15,XMM10,XMM10);
  319. VMA1282(XMM14,XMM15,XMM11,XMM11);
  320. ADDQ1280($24*SIZE,_ptr__A_0);
  321. ADDQ1280($16*SIZE,_ptr__B_0);
  322. ._L_2_bodyE:;
  323. DECQ1280(k);
  324. JG ._L_2_bodyB;
  325. ALIGN_4;
  326. ._L_2_loopE:;
  327. TESTQ1280($2,_bk_l);
  328. JLE ._L_3_loopE;
  329. ALIGN_4;
  330. ._L_3_bodyB:;
  331. PREFETCH01280(160*SIZE(_ptr__A_0),0);
  332. BROAD1282(0*SIZE(_ptr__B_0),XMM15);
  333. VLD1282(0*SIZE(_ptr__A_0),XMM12);
  334. VMA1282(XMM12,XMM15,XMM0,XMM0);
  335. VLD1282(2*SIZE(_ptr__A_0),XMM13);
  336. VMA1282(XMM13,XMM15,XMM1,XMM1);
  337. VLD1282(4*SIZE(_ptr__A_0),XMM14);
  338. VMA1282(XMM14,XMM15,XMM2,XMM2);
  339. BROAD1282(1*SIZE(_ptr__B_0),XMM15);
  340. VMA1282(XMM12,XMM15,XMM3,XMM3);
  341. VMA1282(XMM13,XMM15,XMM4,XMM4);
  342. VMA1282(XMM14,XMM15,XMM5,XMM5);
  343. BROAD1282(2*SIZE(_ptr__B_0),XMM15);
  344. VMA1282(XMM12,XMM15,XMM6,XMM6);
  345. VMA1282(XMM13,XMM15,XMM7,XMM7);
  346. VMA1282(XMM14,XMM15,XMM8,XMM8);
  347. BROAD1282(3*SIZE(_ptr__B_0),XMM15);
  348. VMA1282(XMM12,XMM15,XMM9,XMM9);
  349. VMA1282(XMM13,XMM15,XMM10,XMM10);
  350. VMA1282(XMM14,XMM15,XMM11,XMM11);
  351. PREFETCH21280(168*SIZE(_ptr__A_0),2);
  352. BROAD1282(4*SIZE(_ptr__B_0),XMM15);
  353. VLD1282(6*SIZE(_ptr__A_0),XMM12);
  354. VMA1282(XMM12,XMM15,XMM0,XMM0);
  355. VLD1282(8*SIZE(_ptr__A_0),XMM13);
  356. VMA1282(XMM13,XMM15,XMM1,XMM1);
  357. VLD1282(10*SIZE(_ptr__A_0),XMM14);
  358. VMA1282(XMM14,XMM15,XMM2,XMM2);
  359. BROAD1282(5*SIZE(_ptr__B_0),XMM15);
  360. VMA1282(XMM12,XMM15,XMM3,XMM3);
  361. VMA1282(XMM13,XMM15,XMM4,XMM4);
  362. VMA1282(XMM14,XMM15,XMM5,XMM5);
  363. BROAD1282(6*SIZE(_ptr__B_0),XMM15);
  364. VMA1282(XMM12,XMM15,XMM6,XMM6);
  365. VMA1282(XMM13,XMM15,XMM7,XMM7);
  366. VMA1282(XMM14,XMM15,XMM8,XMM8);
  367. BROAD1282(7*SIZE(_ptr__B_0),XMM15);
  368. VMA1282(XMM12,XMM15,XMM9,XMM9);
  369. VMA1282(XMM13,XMM15,XMM10,XMM10);
  370. VMA1282(XMM14,XMM15,XMM11,XMM11);
  371. ADDQ1280($12*SIZE,_ptr__A_0);
  372. ADDQ1280($8*SIZE,_ptr__B_0);
  373. ._L_3_loopE:;
  374. TESTQ1280($1,_bk_l);
  375. JLE ._L_4_loopE;
  376. ALIGN_4;
  377. ._L_4_bodyB:;
  378. PREFETCH01280(160*SIZE(_ptr__A_0),0);
  379. BROAD1282(0*SIZE(_ptr__B_0),XMM15);
  380. VLD1282(0*SIZE(_ptr__A_0),XMM12);
  381. VMA1282(XMM12,XMM15,XMM0,XMM0);
  382. VLD1282(2*SIZE(_ptr__A_0),XMM13);
  383. VMA1282(XMM13,XMM15,XMM1,XMM1);
  384. VLD1282(4*SIZE(_ptr__A_0),XMM14);
  385. VMA1282(XMM14,XMM15,XMM2,XMM2);
  386. BROAD1282(1*SIZE(_ptr__B_0),XMM15);
  387. VMA1282(XMM12,XMM15,XMM3,XMM3);
  388. VMA1282(XMM13,XMM15,XMM4,XMM4);
  389. VMA1282(XMM14,XMM15,XMM5,XMM5);
  390. BROAD1282(2*SIZE(_ptr__B_0),XMM15);
  391. VMA1282(XMM12,XMM15,XMM6,XMM6);
  392. VMA1282(XMM13,XMM15,XMM7,XMM7);
  393. VMA1282(XMM14,XMM15,XMM8,XMM8);
  394. BROAD1282(3*SIZE(_ptr__B_0),XMM15);
  395. VMA1282(XMM12,XMM15,XMM9,XMM9);
  396. VMA1282(XMM13,XMM15,XMM10,XMM10);
  397. VMA1282(XMM14,XMM15,XMM11,XMM11);
  398. ADDQ1280($6*SIZE,_ptr__A_0);
  399. ADDQ1280($4*SIZE,_ptr__B_0);
  400. ._L_4_loopE:;
  401. BROAD1282(alpha,XMM12);
  402. VLDU1282(0*SIZE(_ptr__C_0),XMM13);
  403. VMA21282(XMM12,XMM0,XMM13,XMM0);
  404. VSTU1282(XMM13,0*SIZE(_ptr__C_0));
  405. VLDU1282(2*SIZE(_ptr__C_0),XMM14);
  406. VMA21282(XMM12,XMM1,XMM14,XMM1);
  407. VSTU1282(XMM14,2*SIZE(_ptr__C_0));
  408. VLDU1282(4*SIZE(_ptr__C_0),XMM15);
  409. VMA21282(XMM12,XMM2,XMM15,XMM2);
  410. VSTU1282(XMM15,4*SIZE(_ptr__C_0));
  411. VLDU1282(0*SIZE(_ptr__C_0,LDC,1),XMM13);
  412. VMA21282(XMM12,XMM3,XMM13,XMM3);
  413. VSTU1282(XMM13,0*SIZE(_ptr__C_0,LDC,1));
  414. VLDU1282(2*SIZE(_ptr__C_0,LDC,1),XMM14);
  415. VMA21282(XMM12,XMM4,XMM14,XMM4);
  416. VSTU1282(XMM14,2*SIZE(_ptr__C_0,LDC,1));
  417. VLDU1282(4*SIZE(_ptr__C_0,LDC,1),XMM15);
  418. VMA21282(XMM12,XMM5,XMM15,XMM5);
  419. VSTU1282(XMM15,4*SIZE(_ptr__C_0,LDC,1));
  420. VLDU1282(0*SIZE(_ptr__C_1),XMM13);
  421. VMA21282(XMM12,XMM6,XMM13,XMM6);
  422. VSTU1282(XMM13,0*SIZE(_ptr__C_1));
  423. VLDU1282(2*SIZE(_ptr__C_1),XMM14);
  424. VMA21282(XMM12,XMM7,XMM14,XMM7);
  425. VSTU1282(XMM14,2*SIZE(_ptr__C_1));
  426. VLDU1282(4*SIZE(_ptr__C_1),XMM15);
  427. VMA21282(XMM12,XMM8,XMM15,XMM8);
  428. VSTU1282(XMM15,4*SIZE(_ptr__C_1));
  429. VLDU1282(0*SIZE(_ptr__C_1,LDC,1),XMM13);
  430. VMA21282(XMM12,XMM9,XMM13,XMM9);
  431. VSTU1282(XMM13,0*SIZE(_ptr__C_1,LDC,1));
  432. VLDU1282(2*SIZE(_ptr__C_1,LDC,1),XMM14);
  433. VMA21282(XMM12,XMM10,XMM14,XMM10);
  434. VSTU1282(XMM14,2*SIZE(_ptr__C_1,LDC,1));
  435. VLDU1282(4*SIZE(_ptr__C_1,LDC,1),XMM15);
  436. VMA21282(XMM12,XMM11,XMM15,XMM11);
  437. VSTU1282(XMM15,4*SIZE(_ptr__C_1,LDC,1));
  438. ADDQ1280($6*SIZE,_ptr__C_0);
  439. ADDQ1280($6*SIZE,_ptr__C_1);
  440. ._L_1_bodyE:;
  441. SUBQ1280($6,i);
  442. JG ._L_1_bodyB;
  443. ALIGN_4;
  444. ._L_1_loopE:;
  445. TESTQ1280($4,i);
  446. JLE ._L_5_loopE;
  447. ALIGN_4;
  448. ._L_5_bodyB:;
  449. MOVQ1280(_ptr_B,_ptr__B_0);
  450. VXOR1282(XMM0,XMM0,XMM0);
  451. VXOR1282(XMM1,XMM1,XMM1);
  452. VXOR1282(XMM2,XMM2,XMM2);
  453. VXOR1282(XMM3,XMM3,XMM3);
  454. VXOR1282(XMM4,XMM4,XMM4);
  455. VXOR1282(XMM5,XMM5,XMM5);
  456. VXOR1282(XMM6,XMM6,XMM6);
  457. VXOR1282(XMM7,XMM7,XMM7);
  458. PREFETCHN1280(3*SIZE(_ptr__C_0),N);
  459. PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N);
  460. PREFETCHN1280(3*SIZE(_ptr__C_1),N);
  461. PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N);
  462. MOVQ1280(_bk_l,k);
  463. SARQ1280($2,k);
  464. JLE ._L_6_loopE;
  465. ALIGN_4;
  466. ._L_6_bodyB:;
  467. PREFETCH01280(160*SIZE(_ptr__A_0),0);
  468. BROAD1282(0*SIZE(_ptr__B_0),XMM15);
  469. VLD1282(0*SIZE(_ptr__A_0),XMM13);
  470. VMA1282(XMM13,XMM15,XMM0,XMM0);
  471. VLD1282(2*SIZE(_ptr__A_0),XMM14);
  472. VMA1282(XMM14,XMM15,XMM1,XMM1);
  473. BROAD1282(1*SIZE(_ptr__B_0),XMM15);
  474. VMA1282(XMM13,XMM15,XMM2,XMM2);
  475. VMA1282(XMM14,XMM15,XMM3,XMM3);
  476. BROAD1282(2*SIZE(_ptr__B_0),XMM15);
  477. VMA1282(XMM13,XMM15,XMM4,XMM4);
  478. VMA1282(XMM14,XMM15,XMM5,XMM5);
  479. BROAD1282(3*SIZE(_ptr__B_0),XMM15);
  480. VMA1282(XMM13,XMM15,XMM6,XMM6);
  481. VMA1282(XMM14,XMM15,XMM7,XMM7);
  482. PREFETCH21280(168*SIZE(_ptr__A_0),2);
  483. BROAD1282(4*SIZE(_ptr__B_0),XMM15);
  484. VLD1282(4*SIZE(_ptr__A_0),XMM13);
  485. VMA1282(XMM13,XMM15,XMM0,XMM0);
  486. VLD1282(6*SIZE(_ptr__A_0),XMM14);
  487. VMA1282(XMM14,XMM15,XMM1,XMM1);
  488. BROAD1282(5*SIZE(_ptr__B_0),XMM15);
  489. VMA1282(XMM13,XMM15,XMM2,XMM2);
  490. VMA1282(XMM14,XMM15,XMM3,XMM3);
  491. BROAD1282(6*SIZE(_ptr__B_0),XMM15);
  492. VMA1282(XMM13,XMM15,XMM4,XMM4);
  493. VMA1282(XMM14,XMM15,XMM5,XMM5);
  494. BROAD1282(7*SIZE(_ptr__B_0),XMM15);
  495. VMA1282(XMM13,XMM15,XMM6,XMM6);
  496. VMA1282(XMM14,XMM15,XMM7,XMM7);
  497. PREFETCH01280(176*SIZE(_ptr__A_0),0);
  498. BROAD1282(8*SIZE(_ptr__B_0),XMM15);
  499. VLD1282(8*SIZE(_ptr__A_0),XMM13);
  500. VMA1282(XMM13,XMM15,XMM0,XMM0);
  501. VLD1282(10*SIZE(_ptr__A_0),XMM14);
  502. VMA1282(XMM14,XMM15,XMM1,XMM1);
  503. BROAD1282(9*SIZE(_ptr__B_0),XMM15);
  504. VMA1282(XMM13,XMM15,XMM2,XMM2);
  505. VMA1282(XMM14,XMM15,XMM3,XMM3);
  506. BROAD1282(10*SIZE(_ptr__B_0),XMM15);
  507. VMA1282(XMM13,XMM15,XMM4,XMM4);
  508. VMA1282(XMM14,XMM15,XMM5,XMM5);
  509. BROAD1282(11*SIZE(_ptr__B_0),XMM15);
  510. VMA1282(XMM13,XMM15,XMM6,XMM6);
  511. VMA1282(XMM14,XMM15,XMM7,XMM7);
  512. PREFETCH21280(184*SIZE(_ptr__A_0),2);
  513. BROAD1282(12*SIZE(_ptr__B_0),XMM15);
  514. VLD1282(12*SIZE(_ptr__A_0),XMM13);
  515. VMA1282(XMM13,XMM15,XMM0,XMM0);
  516. VLD1282(14*SIZE(_ptr__A_0),XMM14);
  517. VMA1282(XMM14,XMM15,XMM1,XMM1);
  518. BROAD1282(13*SIZE(_ptr__B_0),XMM15);
  519. VMA1282(XMM13,XMM15,XMM2,XMM2);
  520. VMA1282(XMM14,XMM15,XMM3,XMM3);
  521. BROAD1282(14*SIZE(_ptr__B_0),XMM15);
  522. VMA1282(XMM13,XMM15,XMM4,XMM4);
  523. VMA1282(XMM14,XMM15,XMM5,XMM5);
  524. BROAD1282(15*SIZE(_ptr__B_0),XMM15);
  525. VMA1282(XMM13,XMM15,XMM6,XMM6);
  526. VMA1282(XMM14,XMM15,XMM7,XMM7);
  527. ADDQ1280($16*SIZE,_ptr__A_0);
  528. ADDQ1280($16*SIZE,_ptr__B_0);
  529. ._L_6_bodyE:;
  530. DECQ1280(k);
  531. JG ._L_6_bodyB;
  532. ALIGN_4;
  533. ._L_6_loopE:;
  534. TESTQ1280($2,_bk_l);
  535. JLE ._L_7_loopE;
  536. ALIGN_4;
  537. ._L_7_bodyB:;
  538. PREFETCH01280(160*SIZE(_ptr__A_0),0);
  539. BROAD1282(0*SIZE(_ptr__B_0),XMM15);
  540. VLD1282(0*SIZE(_ptr__A_0),XMM13);
  541. VMA1282(XMM13,XMM15,XMM0,XMM0);
  542. VLD1282(2*SIZE(_ptr__A_0),XMM14);
  543. VMA1282(XMM14,XMM15,XMM1,XMM1);
  544. BROAD1282(1*SIZE(_ptr__B_0),XMM15);
  545. VMA1282(XMM13,XMM15,XMM2,XMM2);
  546. VMA1282(XMM14,XMM15,XMM3,XMM3);
  547. BROAD1282(2*SIZE(_ptr__B_0),XMM15);
  548. VMA1282(XMM13,XMM15,XMM4,XMM4);
  549. VMA1282(XMM14,XMM15,XMM5,XMM5);
  550. BROAD1282(3*SIZE(_ptr__B_0),XMM15);
  551. VMA1282(XMM13,XMM15,XMM6,XMM6);
  552. VMA1282(XMM14,XMM15,XMM7,XMM7);
  553. PREFETCH21280(168*SIZE(_ptr__A_0),2);
  554. BROAD1282(4*SIZE(_ptr__B_0),XMM15);
  555. VLD1282(4*SIZE(_ptr__A_0),XMM13);
  556. VMA1282(XMM13,XMM15,XMM0,XMM0);
  557. VLD1282(6*SIZE(_ptr__A_0),XMM14);
  558. VMA1282(XMM14,XMM15,XMM1,XMM1);
  559. BROAD1282(5*SIZE(_ptr__B_0),XMM15);
  560. VMA1282(XMM13,XMM15,XMM2,XMM2);
  561. VMA1282(XMM14,XMM15,XMM3,XMM3);
  562. BROAD1282(6*SIZE(_ptr__B_0),XMM15);
  563. VMA1282(XMM13,XMM15,XMM4,XMM4);
  564. VMA1282(XMM14,XMM15,XMM5,XMM5);
  565. BROAD1282(7*SIZE(_ptr__B_0),XMM15);
  566. VMA1282(XMM13,XMM15,XMM6,XMM6);
  567. VMA1282(XMM14,XMM15,XMM7,XMM7);
  568. ADDQ1280($8*SIZE,_ptr__A_0);
  569. ADDQ1280($8*SIZE,_ptr__B_0);
  570. ._L_7_loopE:;
  571. TESTQ1280($1,_bk_l);
  572. JLE ._L_8_loopE;
  573. ALIGN_4;
  574. ._L_8_bodyB:;
  575. PREFETCH01280(160*SIZE(_ptr__A_0),0);
  576. BROAD1282(0*SIZE(_ptr__B_0),XMM15);
  577. VLD1282(0*SIZE(_ptr__A_0),XMM13);
  578. VMA1282(XMM13,XMM15,XMM0,XMM0);
  579. VLD1282(2*SIZE(_ptr__A_0),XMM14);
  580. VMA1282(XMM14,XMM15,XMM1,XMM1);
  581. BROAD1282(1*SIZE(_ptr__B_0),XMM15);
  582. VMA1282(XMM13,XMM15,XMM2,XMM2);
  583. VMA1282(XMM14,XMM15,XMM3,XMM3);
  584. BROAD1282(2*SIZE(_ptr__B_0),XMM15);
  585. VMA1282(XMM13,XMM15,XMM4,XMM4);
  586. VMA1282(XMM14,XMM15,XMM5,XMM5);
  587. BROAD1282(3*SIZE(_ptr__B_0),XMM15);
  588. VMA1282(XMM13,XMM15,XMM6,XMM6);
  589. VMA1282(XMM14,XMM15,XMM7,XMM7);
  590. ADDQ1280($4*SIZE,_ptr__A_0);
  591. ADDQ1280($4*SIZE,_ptr__B_0);
  592. ._L_8_loopE:;
  593. BROAD1282(alpha,XMM8);
  594. VLDU1282(0*SIZE(_ptr__C_0),XMM9);
  595. VMA21282(XMM8,XMM0,XMM9,XMM0);
  596. VSTU1282(XMM9,0*SIZE(_ptr__C_0));
  597. VLDU1282(2*SIZE(_ptr__C_0),XMM10);
  598. VMA21282(XMM8,XMM1,XMM10,XMM1);
  599. VSTU1282(XMM10,2*SIZE(_ptr__C_0));
  600. VLDU1282(0*SIZE(_ptr__C_0,LDC,1),XMM11);
  601. VMA21282(XMM8,XMM2,XMM11,XMM2);
  602. VSTU1282(XMM11,0*SIZE(_ptr__C_0,LDC,1));
  603. VLDU1282(2*SIZE(_ptr__C_0,LDC,1),XMM12);
  604. VMA21282(XMM8,XMM3,XMM12,XMM3);
  605. VSTU1282(XMM12,2*SIZE(_ptr__C_0,LDC,1));
  606. VLDU1282(0*SIZE(_ptr__C_1),XMM13);
  607. VMA21282(XMM8,XMM4,XMM13,XMM4);
  608. VSTU1282(XMM13,0*SIZE(_ptr__C_1));
  609. VLDU1282(2*SIZE(_ptr__C_1),XMM14);
  610. VMA21282(XMM8,XMM5,XMM14,XMM5);
  611. VSTU1282(XMM14,2*SIZE(_ptr__C_1));
  612. VLDU1282(0*SIZE(_ptr__C_1,LDC,1),XMM15);
  613. VMA21282(XMM8,XMM6,XMM15,XMM6);
  614. VSTU1282(XMM15,0*SIZE(_ptr__C_1,LDC,1));
  615. VLDU1282(2*SIZE(_ptr__C_1,LDC,1),XMM9);
  616. VMA21282(XMM8,XMM7,XMM9,XMM7);
  617. VSTU1282(XMM9,2*SIZE(_ptr__C_1,LDC,1));
  618. ADDQ1280($4*SIZE,_ptr__C_0);
  619. ADDQ1280($4*SIZE,_ptr__C_1);
  620. ._L_5_loopE:;
  /* N=4, M=2 panel: executed when the remaining row count i has bit 1 set.
     Accumulators XMM0..XMM3 each hold one 2-double C sub-column; XMM14 holds
     2 A values, XMM15 broadcasts one B value per column.
     VMA1282 presumably performs acc = a*b + acc (macro defined above). */
  621. TESTQ1280($2,i);
  622. JLE ._L_9_loopE;
  623. ALIGN_4;
  624. ._L_9_bodyB:;
  625. MOVQ1280(_ptr_B,_ptr__B_0);
  /* Zero the four accumulators (one per column of the 2x4 tile). */
  626. VXOR1282(XMM0,XMM0,XMM0);
  627. VXOR1282(XMM1,XMM1,XMM1);
  628. VXOR1282(XMM2,XMM2,XMM2);
  629. VXOR1282(XMM3,XMM3,XMM3);
  /* Prefetch the C locations that the write-back below will touch. */
  630. PREFETCHN1280(3*SIZE(_ptr__C_0),N);
  631. PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N);
  632. PREFETCHN1280(3*SIZE(_ptr__C_1),N);
  633. PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N);
  /* k = K/4: main loop is unrolled 4x over the inner dimension. */
  634. MOVQ1280(_bk_l,k);
  635. SARQ1280($2,k);
  636. JLE ._L_10_loopE;
  637. ALIGN_4;
  /* Main K loop: 4 iterations per pass, 2 A elems x 4 B elems each. */
  638. ._L_10_bodyB:;
  639. PREFETCH01280(160*SIZE(_ptr__A_0),0);
  640. BROAD1282(0*SIZE(_ptr__B_0),XMM15);
  641. VLD1282(0*SIZE(_ptr__A_0),XMM14);
  642. VMA1282(XMM14,XMM15,XMM0,XMM0);
  643. BROAD1282(1*SIZE(_ptr__B_0),XMM15);
  644. VMA1282(XMM14,XMM15,XMM1,XMM1);
  645. BROAD1282(2*SIZE(_ptr__B_0),XMM15);
  646. VMA1282(XMM14,XMM15,XMM2,XMM2);
  647. BROAD1282(3*SIZE(_ptr__B_0),XMM15);
  648. VMA1282(XMM14,XMM15,XMM3,XMM3);
  649. PREFETCH21280(168*SIZE(_ptr__A_0),2);
  650. BROAD1282(4*SIZE(_ptr__B_0),XMM15);
  651. VLD1282(2*SIZE(_ptr__A_0),XMM14);
  652. VMA1282(XMM14,XMM15,XMM0,XMM0);
  653. BROAD1282(5*SIZE(_ptr__B_0),XMM15);
  654. VMA1282(XMM14,XMM15,XMM1,XMM1);
  655. BROAD1282(6*SIZE(_ptr__B_0),XMM15);
  656. VMA1282(XMM14,XMM15,XMM2,XMM2);
  657. BROAD1282(7*SIZE(_ptr__B_0),XMM15);
  658. VMA1282(XMM14,XMM15,XMM3,XMM3);
  659. PREFETCH01280(176*SIZE(_ptr__A_0),0);
  660. BROAD1282(8*SIZE(_ptr__B_0),XMM15);
  661. VLD1282(4*SIZE(_ptr__A_0),XMM14);
  662. VMA1282(XMM14,XMM15,XMM0,XMM0);
  663. BROAD1282(9*SIZE(_ptr__B_0),XMM15);
  664. VMA1282(XMM14,XMM15,XMM1,XMM1);
  665. BROAD1282(10*SIZE(_ptr__B_0),XMM15);
  666. VMA1282(XMM14,XMM15,XMM2,XMM2);
  667. BROAD1282(11*SIZE(_ptr__B_0),XMM15);
  668. VMA1282(XMM14,XMM15,XMM3,XMM3);
  669. PREFETCH21280(184*SIZE(_ptr__A_0),2);
  670. BROAD1282(12*SIZE(_ptr__B_0),XMM15);
  671. VLD1282(6*SIZE(_ptr__A_0),XMM14);
  672. VMA1282(XMM14,XMM15,XMM0,XMM0);
  673. BROAD1282(13*SIZE(_ptr__B_0),XMM15);
  674. VMA1282(XMM14,XMM15,XMM1,XMM1);
  675. BROAD1282(14*SIZE(_ptr__B_0),XMM15);
  676. VMA1282(XMM14,XMM15,XMM2,XMM2);
  677. BROAD1282(15*SIZE(_ptr__B_0),XMM15);
  678. VMA1282(XMM14,XMM15,XMM3,XMM3);
  /* Consumed 4 K-steps: 8 A elements (M=2) and 16 B elements (N=4). */
  679. ADDQ1280($8*SIZE,_ptr__A_0);
  680. ADDQ1280($16*SIZE,_ptr__B_0);
  681. ._L_10_bodyE:;
  682. DECQ1280(k);
  683. JG ._L_10_bodyB;
  684. ALIGN_4;
  685. ._L_10_loopE:;
  /* K%4 >= 2 remainder: two more K-steps. */
  686. TESTQ1280($2,_bk_l);
  687. JLE ._L_11_loopE;
  688. ALIGN_4;
  689. ._L_11_bodyB:;
  690. PREFETCH01280(160*SIZE(_ptr__A_0),0);
  691. BROAD1282(0*SIZE(_ptr__B_0),XMM15);
  692. VLD1282(0*SIZE(_ptr__A_0),XMM14);
  693. VMA1282(XMM14,XMM15,XMM0,XMM0);
  694. BROAD1282(1*SIZE(_ptr__B_0),XMM15);
  695. VMA1282(XMM14,XMM15,XMM1,XMM1);
  696. BROAD1282(2*SIZE(_ptr__B_0),XMM15);
  697. VMA1282(XMM14,XMM15,XMM2,XMM2);
  698. BROAD1282(3*SIZE(_ptr__B_0),XMM15);
  699. VMA1282(XMM14,XMM15,XMM3,XMM3);
  700. PREFETCH21280(168*SIZE(_ptr__A_0),2);
  701. BROAD1282(4*SIZE(_ptr__B_0),XMM15);
  702. VLD1282(2*SIZE(_ptr__A_0),XMM14);
  703. VMA1282(XMM14,XMM15,XMM0,XMM0);
  704. BROAD1282(5*SIZE(_ptr__B_0),XMM15);
  705. VMA1282(XMM14,XMM15,XMM1,XMM1);
  706. BROAD1282(6*SIZE(_ptr__B_0),XMM15);
  707. VMA1282(XMM14,XMM15,XMM2,XMM2);
  708. BROAD1282(7*SIZE(_ptr__B_0),XMM15);
  709. VMA1282(XMM14,XMM15,XMM3,XMM3);
  710. ADDQ1280($4*SIZE,_ptr__A_0);
  711. ADDQ1280($8*SIZE,_ptr__B_0);
  712. ._L_11_loopE:;
  /* K%2 == 1 remainder: one final K-step. */
  713. TESTQ1280($1,_bk_l);
  714. JLE ._L_12_loopE;
  715. ALIGN_4;
  716. ._L_12_bodyB:;
  717. PREFETCH01280(160*SIZE(_ptr__A_0),0);
  718. BROAD1282(0*SIZE(_ptr__B_0),XMM15);
  719. VLD1282(0*SIZE(_ptr__A_0),XMM14);
  720. VMA1282(XMM14,XMM15,XMM0,XMM0);
  721. BROAD1282(1*SIZE(_ptr__B_0),XMM15);
  722. VMA1282(XMM14,XMM15,XMM1,XMM1);
  723. BROAD1282(2*SIZE(_ptr__B_0),XMM15);
  724. VMA1282(XMM14,XMM15,XMM2,XMM2);
  725. BROAD1282(3*SIZE(_ptr__B_0),XMM15);
  726. VMA1282(XMM14,XMM15,XMM3,XMM3);
  727. ADDQ1280($2*SIZE,_ptr__A_0);
  728. ADDQ1280($4*SIZE,_ptr__B_0);
  729. ._L_12_loopE:;
  /* Write-back: C[col] = alpha*acc + C[col] for the four columns
     C_0, C_0+LDC, C_1, C_1+LDC (presumed VMA2 semantics — see macro). */
  730. BROAD1282(alpha,XMM4);
  731. VLDU1282(0*SIZE(_ptr__C_0),XMM5);
  732. VMA21282(XMM4,XMM0,XMM5,XMM0);
  733. VSTU1282(XMM5,0*SIZE(_ptr__C_0));
  734. VLDU1282(0*SIZE(_ptr__C_0,LDC,1),XMM6);
  735. VMA21282(XMM4,XMM1,XMM6,XMM1);
  736. VSTU1282(XMM6,0*SIZE(_ptr__C_0,LDC,1));
  737. VLDU1282(0*SIZE(_ptr__C_1),XMM7);
  738. VMA21282(XMM4,XMM2,XMM7,XMM2);
  739. VSTU1282(XMM7,0*SIZE(_ptr__C_1));
  740. VLDU1282(0*SIZE(_ptr__C_1,LDC,1),XMM8);
  741. VMA21282(XMM4,XMM3,XMM8,XMM3);
  742. VSTU1282(XMM8,0*SIZE(_ptr__C_1,LDC,1));
  743. ADDQ1280($2*SIZE,_ptr__C_0);
  744. ADDQ1280($2*SIZE,_ptr__C_1);
  745. ._L_9_loopE:;
  /* N=4, M=1 panel: executed when the remaining row count i is odd.
     Same structure as the M=2 panel, but the *1281 macro variants are used
     (presumably scalar/single-double forms) and A advances 1 elem per K. */
  746. TESTQ1280($1,i);
  747. JLE ._L_13_loopE;
  748. ALIGN_4;
  749. ._L_13_bodyB:;
  750. MOVQ1280(_ptr_B,_ptr__B_0);
  751. VXOR1281(XMM0,XMM0,XMM0);
  752. VXOR1281(XMM1,XMM1,XMM1);
  753. VXOR1281(XMM2,XMM2,XMM2);
  754. VXOR1281(XMM3,XMM3,XMM3);
  755. PREFETCHN1280(3*SIZE(_ptr__C_0),N);
  756. PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N);
  757. PREFETCHN1280(3*SIZE(_ptr__C_1),N);
  758. PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N);
  /* k = K/4 for the 4x-unrolled main loop. */
  759. MOVQ1280(_bk_l,k);
  760. SARQ1280($2,k);
  761. JLE ._L_14_loopE;
  762. ALIGN_4;
  763. ._L_14_bodyB:;
  764. PREFETCH01280(160*SIZE(_ptr__A_0),0);
  765. BROAD1281(0*SIZE(_ptr__B_0),XMM15);
  766. VLD1281(0*SIZE(_ptr__A_0),XMM14);
  767. VMA1281(XMM14,XMM15,XMM0,XMM0);
  768. BROAD1281(1*SIZE(_ptr__B_0),XMM15);
  769. VMA1281(XMM14,XMM15,XMM1,XMM1);
  770. BROAD1281(2*SIZE(_ptr__B_0),XMM15);
  771. VMA1281(XMM14,XMM15,XMM2,XMM2);
  772. BROAD1281(3*SIZE(_ptr__B_0),XMM15);
  773. VMA1281(XMM14,XMM15,XMM3,XMM3);
  774. PREFETCH21280(168*SIZE(_ptr__A_0),2);
  775. BROAD1281(4*SIZE(_ptr__B_0),XMM15);
  776. VLD1281(1*SIZE(_ptr__A_0),XMM14);
  777. VMA1281(XMM14,XMM15,XMM0,XMM0);
  778. BROAD1281(5*SIZE(_ptr__B_0),XMM15);
  779. VMA1281(XMM14,XMM15,XMM1,XMM1);
  780. BROAD1281(6*SIZE(_ptr__B_0),XMM15);
  781. VMA1281(XMM14,XMM15,XMM2,XMM2);
  782. BROAD1281(7*SIZE(_ptr__B_0),XMM15);
  783. VMA1281(XMM14,XMM15,XMM3,XMM3);
  784. PREFETCH01280(176*SIZE(_ptr__A_0),0);
  785. BROAD1281(8*SIZE(_ptr__B_0),XMM15);
  786. VLD1281(2*SIZE(_ptr__A_0),XMM14);
  787. VMA1281(XMM14,XMM15,XMM0,XMM0);
  788. BROAD1281(9*SIZE(_ptr__B_0),XMM15);
  789. VMA1281(XMM14,XMM15,XMM1,XMM1);
  790. BROAD1281(10*SIZE(_ptr__B_0),XMM15);
  791. VMA1281(XMM14,XMM15,XMM2,XMM2);
  792. BROAD1281(11*SIZE(_ptr__B_0),XMM15);
  793. VMA1281(XMM14,XMM15,XMM3,XMM3);
  794. PREFETCH21280(184*SIZE(_ptr__A_0),2);
  795. BROAD1281(12*SIZE(_ptr__B_0),XMM15);
  796. VLD1281(3*SIZE(_ptr__A_0),XMM14);
  797. VMA1281(XMM14,XMM15,XMM0,XMM0);
  798. BROAD1281(13*SIZE(_ptr__B_0),XMM15);
  799. VMA1281(XMM14,XMM15,XMM1,XMM1);
  800. BROAD1281(14*SIZE(_ptr__B_0),XMM15);
  801. VMA1281(XMM14,XMM15,XMM2,XMM2);
  802. BROAD1281(15*SIZE(_ptr__B_0),XMM15);
  803. VMA1281(XMM14,XMM15,XMM3,XMM3);
  /* 4 K-steps consumed: 4 A elements (M=1), 16 B elements (N=4). */
  804. ADDQ1280($4*SIZE,_ptr__A_0);
  805. ADDQ1280($16*SIZE,_ptr__B_0);
  806. ._L_14_bodyE:;
  807. DECQ1280(k);
  808. JG ._L_14_bodyB;
  809. ALIGN_4;
  810. ._L_14_loopE:;
  /* K%4 >= 2 remainder. */
  811. TESTQ1280($2,_bk_l);
  812. JLE ._L_15_loopE;
  813. ALIGN_4;
  814. ._L_15_bodyB:;
  815. PREFETCH01280(160*SIZE(_ptr__A_0),0);
  816. BROAD1281(0*SIZE(_ptr__B_0),XMM15);
  817. VLD1281(0*SIZE(_ptr__A_0),XMM14);
  818. VMA1281(XMM14,XMM15,XMM0,XMM0);
  819. BROAD1281(1*SIZE(_ptr__B_0),XMM15);
  820. VMA1281(XMM14,XMM15,XMM1,XMM1);
  821. BROAD1281(2*SIZE(_ptr__B_0),XMM15);
  822. VMA1281(XMM14,XMM15,XMM2,XMM2);
  823. BROAD1281(3*SIZE(_ptr__B_0),XMM15);
  824. VMA1281(XMM14,XMM15,XMM3,XMM3);
  825. PREFETCH21280(168*SIZE(_ptr__A_0),2);
  826. BROAD1281(4*SIZE(_ptr__B_0),XMM15);
  827. VLD1281(1*SIZE(_ptr__A_0),XMM14);
  828. VMA1281(XMM14,XMM15,XMM0,XMM0);
  829. BROAD1281(5*SIZE(_ptr__B_0),XMM15);
  830. VMA1281(XMM14,XMM15,XMM1,XMM1);
  831. BROAD1281(6*SIZE(_ptr__B_0),XMM15);
  832. VMA1281(XMM14,XMM15,XMM2,XMM2);
  833. BROAD1281(7*SIZE(_ptr__B_0),XMM15);
  834. VMA1281(XMM14,XMM15,XMM3,XMM3);
  835. ADDQ1280($2*SIZE,_ptr__A_0);
  836. ADDQ1280($8*SIZE,_ptr__B_0);
  837. ._L_15_loopE:;
  /* K odd remainder: final single K-step. */
  838. TESTQ1280($1,_bk_l);
  839. JLE ._L_16_loopE;
  840. ALIGN_4;
  841. ._L_16_bodyB:;
  842. PREFETCH01280(160*SIZE(_ptr__A_0),0);
  843. BROAD1281(0*SIZE(_ptr__B_0),XMM15);
  844. VLD1281(0*SIZE(_ptr__A_0),XMM14);
  845. VMA1281(XMM14,XMM15,XMM0,XMM0);
  846. BROAD1281(1*SIZE(_ptr__B_0),XMM15);
  847. VMA1281(XMM14,XMM15,XMM1,XMM1);
  848. BROAD1281(2*SIZE(_ptr__B_0),XMM15);
  849. VMA1281(XMM14,XMM15,XMM2,XMM2);
  850. BROAD1281(3*SIZE(_ptr__B_0),XMM15);
  851. VMA1281(XMM14,XMM15,XMM3,XMM3);
  852. ADDQ1280($1*SIZE,_ptr__A_0);
  853. ADDQ1280($4*SIZE,_ptr__B_0);
  854. ._L_16_loopE:;
  /* Write-back of the single row across the four C columns. */
  855. BROAD1281(alpha,XMM4);
  856. VLDU1281(0*SIZE(_ptr__C_0),XMM5);
  857. VMA21281(XMM4,XMM0,XMM5,XMM0);
  858. VSTU1281(XMM5,0*SIZE(_ptr__C_0));
  859. VLDU1281(0*SIZE(_ptr__C_0,LDC,1),XMM6);
  860. VMA21281(XMM4,XMM1,XMM6,XMM1);
  861. VSTU1281(XMM6,0*SIZE(_ptr__C_0,LDC,1));
  862. VLDU1281(0*SIZE(_ptr__C_1),XMM7);
  863. VMA21281(XMM4,XMM2,XMM7,XMM2);
  864. VSTU1281(XMM7,0*SIZE(_ptr__C_1));
  865. VLDU1281(0*SIZE(_ptr__C_1,LDC,1),XMM8);
  866. VMA21281(XMM4,XMM3,XMM8,XMM3);
  867. VSTU1281(XMM8,0*SIZE(_ptr__C_1,LDC,1));
  868. ADDQ1280($1*SIZE,_ptr__C_0);
  869. ADDQ1280($1*SIZE,_ptr__C_1);
  870. ._L_13_loopE:;
  /* End of one N=4 column-block: advance C by 4*LDC (SALQ $2 == *4) and
     B by _bk_l<<5 bytes (== K * 4 columns * 8 bytes, assuming SIZE=8 —
     TODO confirm against the SIZE definition at the top of the file),
     then loop back while j (N/4 counter) > 0. */
  871. MOVQ1280(LDC,%rax);
  872. SALQ1280($2,%rax);
  873. ADDQ1280(%rax,_ptr_C);
  874. MOVQ1280(_bk_l,%rax);
  875. SALQ1280($5,%rax);
  876. ADDQ1280(%rax,_ptr_B);
  877. ._L_0_bodyE:;
  878. DECQ1280(j);
  879. JG ._L_0_bodyB;
  880. ALIGN_4;
  881. ._L_0_loopE:;
  /* N=2 branch (N has bit 1 set): process two C columns.
     First the M=6 panel (_L_18): six accumulators XMM0..XMM5, three
     2-double A vectors (XMM12/13/14) per B value. */
  882. TESTQ1280($2,_bk_j);
  883. JLE ._L_17_loopE;
  884. ALIGN_4;
  885. ._L_17_bodyB:;
  886. MOVQ1280(_ptr_A,_ptr__A_0);
  887. MOVQ1280(_ptr_C,_ptr__C_0);
  888. LEAQ1280((_ptr_C,LDC,1),_ptr__C_1);
  /* _pre_B advanced by K*16 bytes (B prefetch cursor, presumably). */
  889. MOVQ1280(_bk_l,%rax);
  890. SALQ1280($4,%rax);
  891. ADDQ1280(%rax,_pre_B);
  892. MOVQ1280(_bk_i,i);
  893. CMPQ1280($6,i);
  894. JL ._L_18_loopE;
  895. ._L_18_bodyB:;
  896. MOVQ1280(_ptr_B,_ptr__B_0);
  /* Zero the six accumulators of the 6x2 tile. */
  897. VXOR1282(XMM0,XMM0,XMM0);
  898. VXOR1282(XMM1,XMM1,XMM1);
  899. VXOR1282(XMM2,XMM2,XMM2);
  900. VXOR1282(XMM3,XMM3,XMM3);
  901. VXOR1282(XMM4,XMM4,XMM4);
  902. VXOR1282(XMM5,XMM5,XMM5);
  903. PREFETCHN1280(3*SIZE(_ptr__C_0),N);
  904. PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N);
  /* NOTE(review): _ptr__C_1 rows are prefetched but this N=2 write-back
     only touches C_0 and C_0+LDC — looks copied from the N=4 path. */
  905. PREFETCHN1280(3*SIZE(_ptr__C_1),N);
  906. PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N);
  907. MOVQ1280(_bk_l,k);
  908. SARQ1280($2,k);
  909. JLE ._L_19_loopE;
  910. ALIGN_4;
  /* Main K loop, unrolled 4x: 6 A elems x 2 B elems per K-step. */
  911. ._L_19_bodyB:;
  912. PREFETCH01280(160*SIZE(_ptr__A_0),0);
  913. BROAD1282(0*SIZE(_ptr__B_0),XMM15);
  914. VLD1282(0*SIZE(_ptr__A_0),XMM12);
  915. VMA1282(XMM12,XMM15,XMM0,XMM0);
  916. VLD1282(2*SIZE(_ptr__A_0),XMM13);
  917. VMA1282(XMM13,XMM15,XMM1,XMM1);
  918. VLD1282(4*SIZE(_ptr__A_0),XMM14);
  919. VMA1282(XMM14,XMM15,XMM2,XMM2);
  920. BROAD1282(1*SIZE(_ptr__B_0),XMM15);
  921. VMA1282(XMM12,XMM15,XMM3,XMM3);
  922. VMA1282(XMM13,XMM15,XMM4,XMM4);
  923. VMA1282(XMM14,XMM15,XMM5,XMM5);
  924. PREFETCH21280(168*SIZE(_ptr__A_0),2);
  925. BROAD1282(2*SIZE(_ptr__B_0),XMM15);
  926. VLD1282(6*SIZE(_ptr__A_0),XMM12);
  927. VMA1282(XMM12,XMM15,XMM0,XMM0);
  928. VLD1282(8*SIZE(_ptr__A_0),XMM13);
  929. VMA1282(XMM13,XMM15,XMM1,XMM1);
  930. VLD1282(10*SIZE(_ptr__A_0),XMM14);
  931. VMA1282(XMM14,XMM15,XMM2,XMM2);
  932. BROAD1282(3*SIZE(_ptr__B_0),XMM15);
  933. VMA1282(XMM12,XMM15,XMM3,XMM3);
  934. VMA1282(XMM13,XMM15,XMM4,XMM4);
  935. VMA1282(XMM14,XMM15,XMM5,XMM5);
  936. PREFETCH01280(176*SIZE(_ptr__A_0),0);
  937. BROAD1282(4*SIZE(_ptr__B_0),XMM15);
  938. VLD1282(12*SIZE(_ptr__A_0),XMM12);
  939. VMA1282(XMM12,XMM15,XMM0,XMM0);
  940. VLD1282(14*SIZE(_ptr__A_0),XMM13);
  941. VMA1282(XMM13,XMM15,XMM1,XMM1);
  942. VLD1282(16*SIZE(_ptr__A_0),XMM14);
  943. VMA1282(XMM14,XMM15,XMM2,XMM2);
  944. BROAD1282(5*SIZE(_ptr__B_0),XMM15);
  945. VMA1282(XMM12,XMM15,XMM3,XMM3);
  946. VMA1282(XMM13,XMM15,XMM4,XMM4);
  947. VMA1282(XMM14,XMM15,XMM5,XMM5);
  948. PREFETCH21280(184*SIZE(_ptr__A_0),2);
  949. BROAD1282(6*SIZE(_ptr__B_0),XMM15);
  950. VLD1282(18*SIZE(_ptr__A_0),XMM12);
  951. VMA1282(XMM12,XMM15,XMM0,XMM0);
  952. VLD1282(20*SIZE(_ptr__A_0),XMM13);
  953. VMA1282(XMM13,XMM15,XMM1,XMM1);
  954. VLD1282(22*SIZE(_ptr__A_0),XMM14);
  955. VMA1282(XMM14,XMM15,XMM2,XMM2);
  956. BROAD1282(7*SIZE(_ptr__B_0),XMM15);
  957. VMA1282(XMM12,XMM15,XMM3,XMM3);
  958. VMA1282(XMM13,XMM15,XMM4,XMM4);
  959. VMA1282(XMM14,XMM15,XMM5,XMM5);
  /* 4 K-steps consumed: 24 A elements (M=6), 8 B elements (N=2). */
  960. ADDQ1280($24*SIZE,_ptr__A_0);
  961. ADDQ1280($8*SIZE,_ptr__B_0);
  962. ._L_19_bodyE:;
  963. DECQ1280(k);
  964. JG ._L_19_bodyB;
  965. ALIGN_4;
  966. ._L_19_loopE:;
  /* K%4 >= 2 remainder. */
  967. TESTQ1280($2,_bk_l);
  968. JLE ._L_20_loopE;
  969. ALIGN_4;
  970. ._L_20_bodyB:;
  971. PREFETCH01280(160*SIZE(_ptr__A_0),0);
  972. BROAD1282(0*SIZE(_ptr__B_0),XMM15);
  973. VLD1282(0*SIZE(_ptr__A_0),XMM12);
  974. VMA1282(XMM12,XMM15,XMM0,XMM0);
  975. VLD1282(2*SIZE(_ptr__A_0),XMM13);
  976. VMA1282(XMM13,XMM15,XMM1,XMM1);
  977. VLD1282(4*SIZE(_ptr__A_0),XMM14);
  978. VMA1282(XMM14,XMM15,XMM2,XMM2);
  979. BROAD1282(1*SIZE(_ptr__B_0),XMM15);
  980. VMA1282(XMM12,XMM15,XMM3,XMM3);
  981. VMA1282(XMM13,XMM15,XMM4,XMM4);
  982. VMA1282(XMM14,XMM15,XMM5,XMM5);
  983. PREFETCH21280(168*SIZE(_ptr__A_0),2);
  984. BROAD1282(2*SIZE(_ptr__B_0),XMM15);
  985. VLD1282(6*SIZE(_ptr__A_0),XMM12);
  986. VMA1282(XMM12,XMM15,XMM0,XMM0);
  987. VLD1282(8*SIZE(_ptr__A_0),XMM13);
  988. VMA1282(XMM13,XMM15,XMM1,XMM1);
  989. VLD1282(10*SIZE(_ptr__A_0),XMM14);
  990. VMA1282(XMM14,XMM15,XMM2,XMM2);
  991. BROAD1282(3*SIZE(_ptr__B_0),XMM15);
  992. VMA1282(XMM12,XMM15,XMM3,XMM3);
  993. VMA1282(XMM13,XMM15,XMM4,XMM4);
  994. VMA1282(XMM14,XMM15,XMM5,XMM5);
  995. ADDQ1280($12*SIZE,_ptr__A_0);
  996. ADDQ1280($4*SIZE,_ptr__B_0);
  997. ._L_20_loopE:;
  /* K odd remainder. */
  998. TESTQ1280($1,_bk_l);
  999. JLE ._L_21_loopE;
  1000. ALIGN_4;
  1001. ._L_21_bodyB:;
  1002. PREFETCH01280(160*SIZE(_ptr__A_0),0);
  1003. BROAD1282(0*SIZE(_ptr__B_0),XMM15);
  1004. VLD1282(0*SIZE(_ptr__A_0),XMM12);
  1005. VMA1282(XMM12,XMM15,XMM0,XMM0);
  1006. VLD1282(2*SIZE(_ptr__A_0),XMM13);
  1007. VMA1282(XMM13,XMM15,XMM1,XMM1);
  1008. VLD1282(4*SIZE(_ptr__A_0),XMM14);
  1009. VMA1282(XMM14,XMM15,XMM2,XMM2);
  1010. BROAD1282(1*SIZE(_ptr__B_0),XMM15);
  1011. VMA1282(XMM12,XMM15,XMM3,XMM3);
  1012. VMA1282(XMM13,XMM15,XMM4,XMM4);
  1013. VMA1282(XMM14,XMM15,XMM5,XMM5);
  1014. ADDQ1280($6*SIZE,_ptr__A_0);
  1015. ADDQ1280($2*SIZE,_ptr__B_0);
  1016. ._L_21_loopE:;
  /* Write-back: 6 elements into column C_0 and 6 into C_0+LDC. */
  1017. BROAD1282(alpha,XMM6);
  1018. VLDU1282(0*SIZE(_ptr__C_0),XMM7);
  1019. VMA21282(XMM6,XMM0,XMM7,XMM0);
  1020. VSTU1282(XMM7,0*SIZE(_ptr__C_0));
  1021. VLDU1282(2*SIZE(_ptr__C_0),XMM8);
  1022. VMA21282(XMM6,XMM1,XMM8,XMM1);
  1023. VSTU1282(XMM8,2*SIZE(_ptr__C_0));
  1024. VLDU1282(4*SIZE(_ptr__C_0),XMM9);
  1025. VMA21282(XMM6,XMM2,XMM9,XMM2);
  1026. VSTU1282(XMM9,4*SIZE(_ptr__C_0));
  1027. VLDU1282(0*SIZE(_ptr__C_0,LDC,1),XMM10);
  1028. VMA21282(XMM6,XMM3,XMM10,XMM3);
  1029. VSTU1282(XMM10,0*SIZE(_ptr__C_0,LDC,1));
  1030. VLDU1282(2*SIZE(_ptr__C_0,LDC,1),XMM11);
  1031. VMA21282(XMM6,XMM4,XMM11,XMM4);
  1032. VSTU1282(XMM11,2*SIZE(_ptr__C_0,LDC,1));
  1033. VLDU1282(4*SIZE(_ptr__C_0,LDC,1),XMM12);
  1034. VMA21282(XMM6,XMM5,XMM12,XMM5);
  1035. VSTU1282(XMM12,4*SIZE(_ptr__C_0,LDC,1));
  1036. ADDQ1280($6*SIZE,_ptr__C_0);
  1037. ADDQ1280($6*SIZE,_ptr__C_1);
  1038. ._L_18_bodyE:;
  /* Next 6-row strip while i >= 6. */
  1039. SUBQ1280($6,i);
  1040. JG ._L_18_bodyB;
  1041. ALIGN_4;
  1042. ._L_18_loopE:;
  /* N=2, M=4 panel (remaining rows have bit 2 set): four accumulators,
     two 2-double A vectors (XMM13/14) per B value. */
  1043. TESTQ1280($4,i);
  1044. JLE ._L_22_loopE;
  1045. ALIGN_4;
  1046. ._L_22_bodyB:;
  1047. MOVQ1280(_ptr_B,_ptr__B_0);
  1048. VXOR1282(XMM0,XMM0,XMM0);
  1049. VXOR1282(XMM1,XMM1,XMM1);
  1050. VXOR1282(XMM2,XMM2,XMM2);
  1051. VXOR1282(XMM3,XMM3,XMM3);
  1052. PREFETCHN1280(3*SIZE(_ptr__C_0),N);
  1053. PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N);
  1054. PREFETCHN1280(3*SIZE(_ptr__C_1),N);
  1055. PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N);
  /* k = K/4 for the 4x-unrolled main loop. */
  1056. MOVQ1280(_bk_l,k);
  1057. SARQ1280($2,k);
  1058. JLE ._L_23_loopE;
  1059. ALIGN_4;
  1060. ._L_23_bodyB:;
  1061. PREFETCH01280(160*SIZE(_ptr__A_0),0);
  1062. BROAD1282(0*SIZE(_ptr__B_0),XMM15);
  1063. VLD1282(0*SIZE(_ptr__A_0),XMM13);
  1064. VMA1282(XMM13,XMM15,XMM0,XMM0);
  1065. VLD1282(2*SIZE(_ptr__A_0),XMM14);
  1066. VMA1282(XMM14,XMM15,XMM1,XMM1);
  1067. BROAD1282(1*SIZE(_ptr__B_0),XMM15);
  1068. VMA1282(XMM13,XMM15,XMM2,XMM2);
  1069. VMA1282(XMM14,XMM15,XMM3,XMM3);
  1070. PREFETCH21280(168*SIZE(_ptr__A_0),2);
  1071. BROAD1282(2*SIZE(_ptr__B_0),XMM15);
  1072. VLD1282(4*SIZE(_ptr__A_0),XMM13);
  1073. VMA1282(XMM13,XMM15,XMM0,XMM0);
  1074. VLD1282(6*SIZE(_ptr__A_0),XMM14);
  1075. VMA1282(XMM14,XMM15,XMM1,XMM1);
  1076. BROAD1282(3*SIZE(_ptr__B_0),XMM15);
  1077. VMA1282(XMM13,XMM15,XMM2,XMM2);
  1078. VMA1282(XMM14,XMM15,XMM3,XMM3);
  1079. PREFETCH01280(176*SIZE(_ptr__A_0),0);
  1080. BROAD1282(4*SIZE(_ptr__B_0),XMM15);
  1081. VLD1282(8*SIZE(_ptr__A_0),XMM13);
  1082. VMA1282(XMM13,XMM15,XMM0,XMM0);
  1083. VLD1282(10*SIZE(_ptr__A_0),XMM14);
  1084. VMA1282(XMM14,XMM15,XMM1,XMM1);
  1085. BROAD1282(5*SIZE(_ptr__B_0),XMM15);
  1086. VMA1282(XMM13,XMM15,XMM2,XMM2);
  1087. VMA1282(XMM14,XMM15,XMM3,XMM3);
  1088. PREFETCH21280(184*SIZE(_ptr__A_0),2);
  1089. BROAD1282(6*SIZE(_ptr__B_0),XMM15);
  1090. VLD1282(12*SIZE(_ptr__A_0),XMM13);
  1091. VMA1282(XMM13,XMM15,XMM0,XMM0);
  1092. VLD1282(14*SIZE(_ptr__A_0),XMM14);
  1093. VMA1282(XMM14,XMM15,XMM1,XMM1);
  1094. BROAD1282(7*SIZE(_ptr__B_0),XMM15);
  1095. VMA1282(XMM13,XMM15,XMM2,XMM2);
  1096. VMA1282(XMM14,XMM15,XMM3,XMM3);
  /* 4 K-steps consumed: 16 A elements (M=4), 8 B elements (N=2). */
  1097. ADDQ1280($16*SIZE,_ptr__A_0);
  1098. ADDQ1280($8*SIZE,_ptr__B_0);
  1099. ._L_23_bodyE:;
  1100. DECQ1280(k);
  1101. JG ._L_23_bodyB;
  1102. ALIGN_4;
  1103. ._L_23_loopE:;
  /* K%4 >= 2 remainder. */
  1104. TESTQ1280($2,_bk_l);
  1105. JLE ._L_24_loopE;
  1106. ALIGN_4;
  1107. ._L_24_bodyB:;
  1108. PREFETCH01280(160*SIZE(_ptr__A_0),0);
  1109. BROAD1282(0*SIZE(_ptr__B_0),XMM15);
  1110. VLD1282(0*SIZE(_ptr__A_0),XMM13);
  1111. VMA1282(XMM13,XMM15,XMM0,XMM0);
  1112. VLD1282(2*SIZE(_ptr__A_0),XMM14);
  1113. VMA1282(XMM14,XMM15,XMM1,XMM1);
  1114. BROAD1282(1*SIZE(_ptr__B_0),XMM15);
  1115. VMA1282(XMM13,XMM15,XMM2,XMM2);
  1116. VMA1282(XMM14,XMM15,XMM3,XMM3);
  1117. PREFETCH21280(168*SIZE(_ptr__A_0),2);
  1118. BROAD1282(2*SIZE(_ptr__B_0),XMM15);
  1119. VLD1282(4*SIZE(_ptr__A_0),XMM13);
  1120. VMA1282(XMM13,XMM15,XMM0,XMM0);
  1121. VLD1282(6*SIZE(_ptr__A_0),XMM14);
  1122. VMA1282(XMM14,XMM15,XMM1,XMM1);
  1123. BROAD1282(3*SIZE(_ptr__B_0),XMM15);
  1124. VMA1282(XMM13,XMM15,XMM2,XMM2);
  1125. VMA1282(XMM14,XMM15,XMM3,XMM3);
  1126. ADDQ1280($8*SIZE,_ptr__A_0);
  1127. ADDQ1280($4*SIZE,_ptr__B_0);
  1128. ._L_24_loopE:;
  /* K odd remainder. */
  1129. TESTQ1280($1,_bk_l);
  1130. JLE ._L_25_loopE;
  1131. ALIGN_4;
  1132. ._L_25_bodyB:;
  1133. PREFETCH01280(160*SIZE(_ptr__A_0),0);
  1134. BROAD1282(0*SIZE(_ptr__B_0),XMM15);
  1135. VLD1282(0*SIZE(_ptr__A_0),XMM13);
  1136. VMA1282(XMM13,XMM15,XMM0,XMM0);
  1137. VLD1282(2*SIZE(_ptr__A_0),XMM14);
  1138. VMA1282(XMM14,XMM15,XMM1,XMM1);
  1139. BROAD1282(1*SIZE(_ptr__B_0),XMM15);
  1140. VMA1282(XMM13,XMM15,XMM2,XMM2);
  1141. VMA1282(XMM14,XMM15,XMM3,XMM3);
  1142. ADDQ1280($4*SIZE,_ptr__A_0);
  1143. ADDQ1280($2*SIZE,_ptr__B_0);
  1144. ._L_25_loopE:;
  /* Write-back: 4 elements into C_0 and 4 into C_0+LDC. */
  1145. BROAD1282(alpha,XMM4);
  1146. VLDU1282(0*SIZE(_ptr__C_0),XMM5);
  1147. VMA21282(XMM4,XMM0,XMM5,XMM0);
  1148. VSTU1282(XMM5,0*SIZE(_ptr__C_0));
  1149. VLDU1282(2*SIZE(_ptr__C_0),XMM6);
  1150. VMA21282(XMM4,XMM1,XMM6,XMM1);
  1151. VSTU1282(XMM6,2*SIZE(_ptr__C_0));
  1152. VLDU1282(0*SIZE(_ptr__C_0,LDC,1),XMM7);
  1153. VMA21282(XMM4,XMM2,XMM7,XMM2);
  1154. VSTU1282(XMM7,0*SIZE(_ptr__C_0,LDC,1));
  1155. VLDU1282(2*SIZE(_ptr__C_0,LDC,1),XMM8);
  1156. VMA21282(XMM4,XMM3,XMM8,XMM3);
  1157. VSTU1282(XMM8,2*SIZE(_ptr__C_0,LDC,1));
  1158. ADDQ1280($4*SIZE,_ptr__C_0);
  1159. ADDQ1280($4*SIZE,_ptr__C_1);
  1160. ._L_22_loopE:;
  /* N=2, M=2 panel: two accumulators (XMM0/XMM1), one 2-double A vector
     per B value. */
  1161. TESTQ1280($2,i);
  1162. JLE ._L_26_loopE;
  1163. ALIGN_4;
  1164. ._L_26_bodyB:;
  1165. MOVQ1280(_ptr_B,_ptr__B_0);
  1166. VXOR1282(XMM0,XMM0,XMM0);
  1167. VXOR1282(XMM1,XMM1,XMM1);
  1168. PREFETCHN1280(3*SIZE(_ptr__C_0),N);
  1169. PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N);
  1170. PREFETCHN1280(3*SIZE(_ptr__C_1),N);
  1171. PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N);
  /* k = K/4 for the 4x-unrolled main loop. */
  1172. MOVQ1280(_bk_l,k);
  1173. SARQ1280($2,k);
  1174. JLE ._L_27_loopE;
  1175. ALIGN_4;
  1176. ._L_27_bodyB:;
  1177. PREFETCH01280(160*SIZE(_ptr__A_0),0);
  1178. BROAD1282(0*SIZE(_ptr__B_0),XMM15);
  1179. VLD1282(0*SIZE(_ptr__A_0),XMM14);
  1180. VMA1282(XMM14,XMM15,XMM0,XMM0);
  1181. BROAD1282(1*SIZE(_ptr__B_0),XMM15);
  1182. VMA1282(XMM14,XMM15,XMM1,XMM1);
  1183. PREFETCH21280(168*SIZE(_ptr__A_0),2);
  1184. BROAD1282(2*SIZE(_ptr__B_0),XMM15);
  1185. VLD1282(2*SIZE(_ptr__A_0),XMM14);
  1186. VMA1282(XMM14,XMM15,XMM0,XMM0);
  1187. BROAD1282(3*SIZE(_ptr__B_0),XMM15);
  1188. VMA1282(XMM14,XMM15,XMM1,XMM1);
  1189. PREFETCH01280(176*SIZE(_ptr__A_0),0);
  1190. BROAD1282(4*SIZE(_ptr__B_0),XMM15);
  1191. VLD1282(4*SIZE(_ptr__A_0),XMM14);
  1192. VMA1282(XMM14,XMM15,XMM0,XMM0);
  1193. BROAD1282(5*SIZE(_ptr__B_0),XMM15);
  1194. VMA1282(XMM14,XMM15,XMM1,XMM1);
  1195. PREFETCH21280(184*SIZE(_ptr__A_0),2);
  1196. BROAD1282(6*SIZE(_ptr__B_0),XMM15);
  1197. VLD1282(6*SIZE(_ptr__A_0),XMM14);
  1198. VMA1282(XMM14,XMM15,XMM0,XMM0);
  1199. BROAD1282(7*SIZE(_ptr__B_0),XMM15);
  1200. VMA1282(XMM14,XMM15,XMM1,XMM1);
  /* 4 K-steps consumed: 8 A elements (M=2), 8 B elements (N=2). */
  1201. ADDQ1280($8*SIZE,_ptr__A_0);
  1202. ADDQ1280($8*SIZE,_ptr__B_0);
  1203. ._L_27_bodyE:;
  1204. DECQ1280(k);
  1205. JG ._L_27_bodyB;
  1206. ALIGN_4;
  1207. ._L_27_loopE:;
  /* K%4 >= 2 remainder. */
  1208. TESTQ1280($2,_bk_l);
  1209. JLE ._L_28_loopE;
  1210. ALIGN_4;
  1211. ._L_28_bodyB:;
  1212. PREFETCH01280(160*SIZE(_ptr__A_0),0);
  1213. BROAD1282(0*SIZE(_ptr__B_0),XMM15);
  1214. VLD1282(0*SIZE(_ptr__A_0),XMM14);
  1215. VMA1282(XMM14,XMM15,XMM0,XMM0);
  1216. BROAD1282(1*SIZE(_ptr__B_0),XMM15);
  1217. VMA1282(XMM14,XMM15,XMM1,XMM1);
  1218. PREFETCH21280(168*SIZE(_ptr__A_0),2);
  1219. BROAD1282(2*SIZE(_ptr__B_0),XMM15);
  1220. VLD1282(2*SIZE(_ptr__A_0),XMM14);
  1221. VMA1282(XMM14,XMM15,XMM0,XMM0);
  1222. BROAD1282(3*SIZE(_ptr__B_0),XMM15);
  1223. VMA1282(XMM14,XMM15,XMM1,XMM1);
  1224. ADDQ1280($4*SIZE,_ptr__A_0);
  1225. ADDQ1280($4*SIZE,_ptr__B_0);
  1226. ._L_28_loopE:;
  /* K odd remainder. */
  1227. TESTQ1280($1,_bk_l);
  1228. JLE ._L_29_loopE;
  1229. ALIGN_4;
  1230. ._L_29_bodyB:;
  1231. PREFETCH01280(160*SIZE(_ptr__A_0),0);
  1232. BROAD1282(0*SIZE(_ptr__B_0),XMM15);
  1233. VLD1282(0*SIZE(_ptr__A_0),XMM14);
  1234. VMA1282(XMM14,XMM15,XMM0,XMM0);
  1235. BROAD1282(1*SIZE(_ptr__B_0),XMM15);
  1236. VMA1282(XMM14,XMM15,XMM1,XMM1);
  1237. ADDQ1280($2*SIZE,_ptr__A_0);
  1238. ADDQ1280($2*SIZE,_ptr__B_0);
  1239. ._L_29_loopE:;
  /* Write-back: 2 elements into C_0 and 2 into C_0+LDC. */
  1240. BROAD1282(alpha,XMM2);
  1241. VLDU1282(0*SIZE(_ptr__C_0),XMM3);
  1242. VMA21282(XMM2,XMM0,XMM3,XMM0);
  1243. VSTU1282(XMM3,0*SIZE(_ptr__C_0));
  1244. VLDU1282(0*SIZE(_ptr__C_0,LDC,1),XMM4);
  1245. VMA21282(XMM2,XMM1,XMM4,XMM1);
  1246. VSTU1282(XMM4,0*SIZE(_ptr__C_0,LDC,1));
  1247. ADDQ1280($2*SIZE,_ptr__C_0);
  1248. ADDQ1280($2*SIZE,_ptr__C_1);
  1249. ._L_26_loopE:;
  /* N=2, M=1 panel: scalar (*1281) variants, one A element per K-step. */
  1250. TESTQ1280($1,i);
  1251. JLE ._L_30_loopE;
  1252. ALIGN_4;
  1253. ._L_30_bodyB:;
  1254. MOVQ1280(_ptr_B,_ptr__B_0);
  1255. VXOR1281(XMM0,XMM0,XMM0);
  1256. VXOR1281(XMM1,XMM1,XMM1);
  1257. PREFETCHN1280(3*SIZE(_ptr__C_0),N);
  1258. PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N);
  1259. PREFETCHN1280(3*SIZE(_ptr__C_1),N);
  1260. PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N);
  /* k = K/4 for the 4x-unrolled main loop. */
  1261. MOVQ1280(_bk_l,k);
  1262. SARQ1280($2,k);
  1263. JLE ._L_31_loopE;
  1264. ALIGN_4;
  1265. ._L_31_bodyB:;
  1266. PREFETCH01280(160*SIZE(_ptr__A_0),0);
  1267. BROAD1281(0*SIZE(_ptr__B_0),XMM15);
  1268. VLD1281(0*SIZE(_ptr__A_0),XMM14);
  1269. VMA1281(XMM14,XMM15,XMM0,XMM0);
  1270. BROAD1281(1*SIZE(_ptr__B_0),XMM15);
  1271. VMA1281(XMM14,XMM15,XMM1,XMM1);
  1272. PREFETCH21280(168*SIZE(_ptr__A_0),2);
  1273. BROAD1281(2*SIZE(_ptr__B_0),XMM15);
  1274. VLD1281(1*SIZE(_ptr__A_0),XMM14);
  1275. VMA1281(XMM14,XMM15,XMM0,XMM0);
  1276. BROAD1281(3*SIZE(_ptr__B_0),XMM15);
  1277. VMA1281(XMM14,XMM15,XMM1,XMM1);
  1278. PREFETCH01280(176*SIZE(_ptr__A_0),0);
  1279. BROAD1281(4*SIZE(_ptr__B_0),XMM15);
  1280. VLD1281(2*SIZE(_ptr__A_0),XMM14);
  1281. VMA1281(XMM14,XMM15,XMM0,XMM0);
  1282. BROAD1281(5*SIZE(_ptr__B_0),XMM15);
  1283. VMA1281(XMM14,XMM15,XMM1,XMM1);
  1284. PREFETCH21280(184*SIZE(_ptr__A_0),2);
  1285. BROAD1281(6*SIZE(_ptr__B_0),XMM15);
  1286. VLD1281(3*SIZE(_ptr__A_0),XMM14);
  1287. VMA1281(XMM14,XMM15,XMM0,XMM0);
  1288. BROAD1281(7*SIZE(_ptr__B_0),XMM15);
  1289. VMA1281(XMM14,XMM15,XMM1,XMM1);
  /* 4 K-steps consumed: 4 A elements (M=1), 8 B elements (N=2). */
  1290. ADDQ1280($4*SIZE,_ptr__A_0);
  1291. ADDQ1280($8*SIZE,_ptr__B_0);
  1292. ._L_31_bodyE:;
  1293. DECQ1280(k);
  1294. JG ._L_31_bodyB;
  1295. ALIGN_4;
  1296. ._L_31_loopE:;
  /* K%4 >= 2 remainder. */
  1297. TESTQ1280($2,_bk_l);
  1298. JLE ._L_32_loopE;
  1299. ALIGN_4;
  1300. ._L_32_bodyB:;
  1301. PREFETCH01280(160*SIZE(_ptr__A_0),0);
  1302. BROAD1281(0*SIZE(_ptr__B_0),XMM15);
  1303. VLD1281(0*SIZE(_ptr__A_0),XMM14);
  1304. VMA1281(XMM14,XMM15,XMM0,XMM0);
  1305. BROAD1281(1*SIZE(_ptr__B_0),XMM15);
  1306. VMA1281(XMM14,XMM15,XMM1,XMM1);
  1307. PREFETCH21280(168*SIZE(_ptr__A_0),2);
  1308. BROAD1281(2*SIZE(_ptr__B_0),XMM15);
  1309. VLD1281(1*SIZE(_ptr__A_0),XMM14);
  1310. VMA1281(XMM14,XMM15,XMM0,XMM0);
  1311. BROAD1281(3*SIZE(_ptr__B_0),XMM15);
  1312. VMA1281(XMM14,XMM15,XMM1,XMM1);
  1313. ADDQ1280($2*SIZE,_ptr__A_0);
  1314. ADDQ1280($4*SIZE,_ptr__B_0);
  1315. ._L_32_loopE:;
  /* K odd remainder. */
  1316. TESTQ1280($1,_bk_l);
  1317. JLE ._L_33_loopE;
  1318. ALIGN_4;
  1319. ._L_33_bodyB:;
  1320. PREFETCH01280(160*SIZE(_ptr__A_0),0);
  1321. BROAD1281(0*SIZE(_ptr__B_0),XMM15);
  1322. VLD1281(0*SIZE(_ptr__A_0),XMM14);
  1323. VMA1281(XMM14,XMM15,XMM0,XMM0);
  1324. BROAD1281(1*SIZE(_ptr__B_0),XMM15);
  1325. VMA1281(XMM14,XMM15,XMM1,XMM1);
  1326. ADDQ1280($1*SIZE,_ptr__A_0);
  1327. ADDQ1280($2*SIZE,_ptr__B_0);
  1328. ._L_33_loopE:;
  /* Write-back: one element into C_0 and one into C_0+LDC. */
  1329. BROAD1281(alpha,XMM2);
  1330. VLDU1281(0*SIZE(_ptr__C_0),XMM3);
  1331. VMA21281(XMM2,XMM0,XMM3,XMM0);
  1332. VSTU1281(XMM3,0*SIZE(_ptr__C_0));
  1333. VLDU1281(0*SIZE(_ptr__C_0,LDC,1),XMM4);
  1334. VMA21281(XMM2,XMM1,XMM4,XMM1);
  1335. VSTU1281(XMM4,0*SIZE(_ptr__C_0,LDC,1));
  1336. ADDQ1280($1*SIZE,_ptr__C_0);
  1337. ADDQ1280($1*SIZE,_ptr__C_1);
  1338. ._L_30_loopE:;
  /* End of the N=2 column-block: advance C by 2*LDC (SALQ $1 == *2) and
     B by _bk_l<<4 bytes (== K * 2 columns * 8 bytes, assuming SIZE=8 —
     TODO confirm against the SIZE definition at the top of the file). */
  1339. MOVQ1280(LDC,%rax);
  1340. SALQ1280($1,%rax);
  1341. ADDQ1280(%rax,_ptr_C);
  1342. MOVQ1280(_bk_l,%rax);
  1343. SALQ1280($4,%rax);
  1344. ADDQ1280(%rax,_ptr_B);
  1345. ._L_17_loopE:;
  /* N=1 branch (N odd): one remaining C column.
     First the M=6 panel (_L_35): three accumulators XMM0..XMM2, three
     2-double A vectors (XMM12/13/14) against a single broadcast B value. */
  1346. TESTQ1280($1,_bk_j);
  1347. JLE ._L_34_loopE;
  1348. ALIGN_4;
  1349. ._L_34_bodyB:;
  1350. MOVQ1280(_ptr_A,_ptr__A_0);
  1351. MOVQ1280(_ptr_C,_ptr__C_0);
  /* _pre_B advanced by K*8 bytes (B prefetch cursor, presumably). */
  1352. MOVQ1280(_bk_l,%rax);
  1353. SALQ1280($3,%rax);
  1354. ADDQ1280(%rax,_pre_B);
  1355. MOVQ1280(_bk_i,i);
  1356. CMPQ1280($6,i);
  1357. JL ._L_35_loopE;
  1358. ._L_35_bodyB:;
  1359. MOVQ1280(_ptr_B,_ptr__B_0);
  1360. VXOR1282(XMM0,XMM0,XMM0);
  1361. VXOR1282(XMM1,XMM1,XMM1);
  1362. VXOR1282(XMM2,XMM2,XMM2);
  /* NOTE(review): _ptr__C_1 prefetches below — this N=1 path never stores
     through C_1; _ptr__C_1 is not even re-initialized here. Looks inherited
     from the wider-N paths; confirm it is intentional/harmless. */
  1363. PREFETCHN1280(3*SIZE(_ptr__C_0),N);
  1364. PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N);
  1365. PREFETCHN1280(3*SIZE(_ptr__C_1),N);
  1366. PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N);
  1367. MOVQ1280(_bk_l,k);
  1368. SARQ1280($2,k);
  1369. JLE ._L_36_loopE;
  1370. ALIGN_4;
  /* Main K loop, unrolled 4x: 6 A elems x 1 B elem per K-step. */
  1371. ._L_36_bodyB:;
  1372. PREFETCH01280(160*SIZE(_ptr__A_0),0);
  1373. BROAD1282(0*SIZE(_ptr__B_0),XMM15);
  1374. VLD1282(0*SIZE(_ptr__A_0),XMM12);
  1375. VMA1282(XMM12,XMM15,XMM0,XMM0);
  1376. VLD1282(2*SIZE(_ptr__A_0),XMM13);
  1377. VMA1282(XMM13,XMM15,XMM1,XMM1);
  1378. VLD1282(4*SIZE(_ptr__A_0),XMM14);
  1379. VMA1282(XMM14,XMM15,XMM2,XMM2);
  1380. PREFETCH21280(168*SIZE(_ptr__A_0),2);
  1381. BROAD1282(1*SIZE(_ptr__B_0),XMM15);
  1382. VLD1282(6*SIZE(_ptr__A_0),XMM12);
  1383. VMA1282(XMM12,XMM15,XMM0,XMM0);
  1384. VLD1282(8*SIZE(_ptr__A_0),XMM13);
  1385. VMA1282(XMM13,XMM15,XMM1,XMM1);
  1386. VLD1282(10*SIZE(_ptr__A_0),XMM14);
  1387. VMA1282(XMM14,XMM15,XMM2,XMM2);
  1388. PREFETCH01280(176*SIZE(_ptr__A_0),0);
  1389. BROAD1282(2*SIZE(_ptr__B_0),XMM15);
  1390. VLD1282(12*SIZE(_ptr__A_0),XMM12);
  1391. VMA1282(XMM12,XMM15,XMM0,XMM0);
  1392. VLD1282(14*SIZE(_ptr__A_0),XMM13);
  1393. VMA1282(XMM13,XMM15,XMM1,XMM1);
  1394. VLD1282(16*SIZE(_ptr__A_0),XMM14);
  1395. VMA1282(XMM14,XMM15,XMM2,XMM2);
  1396. PREFETCH21280(184*SIZE(_ptr__A_0),2);
  1397. BROAD1282(3*SIZE(_ptr__B_0),XMM15);
  1398. VLD1282(18*SIZE(_ptr__A_0),XMM12);
  1399. VMA1282(XMM12,XMM15,XMM0,XMM0);
  1400. VLD1282(20*SIZE(_ptr__A_0),XMM13);
  1401. VMA1282(XMM13,XMM15,XMM1,XMM1);
  1402. VLD1282(22*SIZE(_ptr__A_0),XMM14);
  1403. VMA1282(XMM14,XMM15,XMM2,XMM2);
  /* 4 K-steps consumed: 24 A elements (M=6), 4 B elements (N=1). */
  1404. ADDQ1280($24*SIZE,_ptr__A_0);
  1405. ADDQ1280($4*SIZE,_ptr__B_0);
  1406. ._L_36_bodyE:;
  1407. DECQ1280(k);
  1408. JG ._L_36_bodyB;
  1409. ALIGN_4;
  1410. ._L_36_loopE:;
  /* K%4 >= 2 remainder. */
  1411. TESTQ1280($2,_bk_l);
  1412. JLE ._L_37_loopE;
  1413. ALIGN_4;
  1414. ._L_37_bodyB:;
  1415. PREFETCH01280(160*SIZE(_ptr__A_0),0);
  1416. BROAD1282(0*SIZE(_ptr__B_0),XMM15);
  1417. VLD1282(0*SIZE(_ptr__A_0),XMM12);
  1418. VMA1282(XMM12,XMM15,XMM0,XMM0);
  1419. VLD1282(2*SIZE(_ptr__A_0),XMM13);
  1420. VMA1282(XMM13,XMM15,XMM1,XMM1);
  1421. VLD1282(4*SIZE(_ptr__A_0),XMM14);
  1422. VMA1282(XMM14,XMM15,XMM2,XMM2);
  1423. PREFETCH21280(168*SIZE(_ptr__A_0),2);
  1424. BROAD1282(1*SIZE(_ptr__B_0),XMM15);
  1425. VLD1282(6*SIZE(_ptr__A_0),XMM12);
  1426. VMA1282(XMM12,XMM15,XMM0,XMM0);
  1427. VLD1282(8*SIZE(_ptr__A_0),XMM13);
  1428. VMA1282(XMM13,XMM15,XMM1,XMM1);
  1429. VLD1282(10*SIZE(_ptr__A_0),XMM14);
  1430. VMA1282(XMM14,XMM15,XMM2,XMM2);
  1431. ADDQ1280($12*SIZE,_ptr__A_0);
  1432. ADDQ1280($2*SIZE,_ptr__B_0);
  1433. ._L_37_loopE:;
  /* K odd remainder. */
  1434. TESTQ1280($1,_bk_l);
  1435. JLE ._L_38_loopE;
  1436. ALIGN_4;
  1437. ._L_38_bodyB:;
  1438. PREFETCH01280(160*SIZE(_ptr__A_0),0);
  1439. BROAD1282(0*SIZE(_ptr__B_0),XMM15);
  1440. VLD1282(0*SIZE(_ptr__A_0),XMM12);
  1441. VMA1282(XMM12,XMM15,XMM0,XMM0);
  1442. VLD1282(2*SIZE(_ptr__A_0),XMM13);
  1443. VMA1282(XMM13,XMM15,XMM1,XMM1);
  1444. VLD1282(4*SIZE(_ptr__A_0),XMM14);
  1445. VMA1282(XMM14,XMM15,XMM2,XMM2);
  1446. ADDQ1280($6*SIZE,_ptr__A_0);
  1447. ADDQ1280($1*SIZE,_ptr__B_0);
  1448. ._L_38_loopE:;
  /* Write-back: 6 elements into the single column C_0. */
  1449. BROAD1282(alpha,XMM3);
  1450. VLDU1282(0*SIZE(_ptr__C_0),XMM4);
  1451. VMA21282(XMM3,XMM0,XMM4,XMM0);
  1452. VSTU1282(XMM4,0*SIZE(_ptr__C_0));
  1453. VLDU1282(2*SIZE(_ptr__C_0),XMM5);
  1454. VMA21282(XMM3,XMM1,XMM5,XMM1);
  1455. VSTU1282(XMM5,2*SIZE(_ptr__C_0));
  1456. VLDU1282(4*SIZE(_ptr__C_0),XMM6);
  1457. VMA21282(XMM3,XMM2,XMM6,XMM2);
  1458. VSTU1282(XMM6,4*SIZE(_ptr__C_0));
  1459. ADDQ1280($6*SIZE,_ptr__C_0);
  1460. ADDQ1280($6*SIZE,_ptr__C_1);
  1461. ._L_35_bodyE:;
  /* Next 6-row strip while i >= 6. */
  1462. SUBQ1280($6,i);
  1463. JG ._L_35_bodyB;
  1464. ALIGN_4;
  1465. ._L_35_loopE:;
  /* N=1, M=4 panel: two accumulators (XMM0/XMM1), two 2-double A vectors
     per broadcast B value. */
  1466. TESTQ1280($4,i);
  1467. JLE ._L_39_loopE;
  1468. ALIGN_4;
  1469. ._L_39_bodyB:;
  1470. MOVQ1280(_ptr_B,_ptr__B_0);
  1471. VXOR1282(XMM0,XMM0,XMM0);
  1472. VXOR1282(XMM1,XMM1,XMM1);
  1473. PREFETCHN1280(3*SIZE(_ptr__C_0),N);
  1474. PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N);
  1475. PREFETCHN1280(3*SIZE(_ptr__C_1),N);
  1476. PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N);
  /* k = K/4 for the 4x-unrolled main loop. */
  1477. MOVQ1280(_bk_l,k);
  1478. SARQ1280($2,k);
  1479. JLE ._L_40_loopE;
  1480. ALIGN_4;
  1481. ._L_40_bodyB:;
  1482. PREFETCH01280(160*SIZE(_ptr__A_0),0);
  1483. BROAD1282(0*SIZE(_ptr__B_0),XMM15);
  1484. VLD1282(0*SIZE(_ptr__A_0),XMM13);
  1485. VMA1282(XMM13,XMM15,XMM0,XMM0);
  1486. VLD1282(2*SIZE(_ptr__A_0),XMM14);
  1487. VMA1282(XMM14,XMM15,XMM1,XMM1);
  1488. PREFETCH21280(168*SIZE(_ptr__A_0),2);
  1489. BROAD1282(1*SIZE(_ptr__B_0),XMM15);
  1490. VLD1282(4*SIZE(_ptr__A_0),XMM13);
  1491. VMA1282(XMM13,XMM15,XMM0,XMM0);
  1492. VLD1282(6*SIZE(_ptr__A_0),XMM14);
  1493. VMA1282(XMM14,XMM15,XMM1,XMM1);
  1494. PREFETCH01280(176*SIZE(_ptr__A_0),0);
  1495. BROAD1282(2*SIZE(_ptr__B_0),XMM15);
  1496. VLD1282(8*SIZE(_ptr__A_0),XMM13);
  1497. VMA1282(XMM13,XMM15,XMM0,XMM0);
  1498. VLD1282(10*SIZE(_ptr__A_0),XMM14);
  1499. VMA1282(XMM14,XMM15,XMM1,XMM1);
  1500. PREFETCH21280(184*SIZE(_ptr__A_0),2);
  1501. BROAD1282(3*SIZE(_ptr__B_0),XMM15);
  1502. VLD1282(12*SIZE(_ptr__A_0),XMM13);
  1503. VMA1282(XMM13,XMM15,XMM0,XMM0);
  1504. VLD1282(14*SIZE(_ptr__A_0),XMM14);
  1505. VMA1282(XMM14,XMM15,XMM1,XMM1);
  /* 4 K-steps consumed: 16 A elements (M=4), 4 B elements (N=1). */
  1506. ADDQ1280($16*SIZE,_ptr__A_0);
  1507. ADDQ1280($4*SIZE,_ptr__B_0);
  1508. ._L_40_bodyE:;
  1509. DECQ1280(k);
  1510. JG ._L_40_bodyB;
  1511. ALIGN_4;
  1512. ._L_40_loopE:;
  /* K%4 >= 2 remainder. */
  1513. TESTQ1280($2,_bk_l);
  1514. JLE ._L_41_loopE;
  1515. ALIGN_4;
  1516. ._L_41_bodyB:;
  1517. PREFETCH01280(160*SIZE(_ptr__A_0),0);
  1518. BROAD1282(0*SIZE(_ptr__B_0),XMM15);
  1519. VLD1282(0*SIZE(_ptr__A_0),XMM13);
  1520. VMA1282(XMM13,XMM15,XMM0,XMM0);
  1521. VLD1282(2*SIZE(_ptr__A_0),XMM14);
  1522. VMA1282(XMM14,XMM15,XMM1,XMM1);
  1523. PREFETCH21280(168*SIZE(_ptr__A_0),2);
  1524. BROAD1282(1*SIZE(_ptr__B_0),XMM15);
  1525. VLD1282(4*SIZE(_ptr__A_0),XMM13);
  1526. VMA1282(XMM13,XMM15,XMM0,XMM0);
  1527. VLD1282(6*SIZE(_ptr__A_0),XMM14);
  1528. VMA1282(XMM14,XMM15,XMM1,XMM1);
  1529. ADDQ1280($8*SIZE,_ptr__A_0);
  1530. ADDQ1280($2*SIZE,_ptr__B_0);
  1531. ._L_41_loopE:;
  /* K odd remainder. */
  1532. TESTQ1280($1,_bk_l);
  1533. JLE ._L_42_loopE;
  1534. ALIGN_4;
  1535. ._L_42_bodyB:;
  1536. PREFETCH01280(160*SIZE(_ptr__A_0),0);
  1537. BROAD1282(0*SIZE(_ptr__B_0),XMM15);
  1538. VLD1282(0*SIZE(_ptr__A_0),XMM13);
  1539. VMA1282(XMM13,XMM15,XMM0,XMM0);
  1540. VLD1282(2*SIZE(_ptr__A_0),XMM14);
  1541. VMA1282(XMM14,XMM15,XMM1,XMM1);
  1542. ADDQ1280($4*SIZE,_ptr__A_0);
  1543. ADDQ1280($1*SIZE,_ptr__B_0);
  1544. ._L_42_loopE:;
  /* Write-back: 4 elements into the single column C_0. */
  1545. BROAD1282(alpha,XMM2);
  1546. VLDU1282(0*SIZE(_ptr__C_0),XMM3);
  1547. VMA21282(XMM2,XMM0,XMM3,XMM0);
  1548. VSTU1282(XMM3,0*SIZE(_ptr__C_0));
  1549. VLDU1282(2*SIZE(_ptr__C_0),XMM4);
  1550. VMA21282(XMM2,XMM1,XMM4,XMM1);
  1551. VSTU1282(XMM4,2*SIZE(_ptr__C_0));
  1552. ADDQ1280($4*SIZE,_ptr__C_0);
  1553. ADDQ1280($4*SIZE,_ptr__C_1);
  1554. ._L_39_loopE:;
  1555. TESTQ1280($2,i);
  1556. JLE ._L_43_loopE;
  1557. ALIGN_4;
  1558. ._L_43_bodyB:;
  1559. MOVQ1280(_ptr_B,_ptr__B_0);
  1560. VXOR1282(XMM0,XMM0,XMM0);
  1561. PREFETCHN1280(3*SIZE(_ptr__C_0),N);
  1562. PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N);
  1563. PREFETCHN1280(3*SIZE(_ptr__C_1),N);
  1564. PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N);
  1565. MOVQ1280(_bk_l,k);
  1566. SARQ1280($2,k);
  1567. JLE ._L_44_loopE;
  1568. ALIGN_4;
  1569. ._L_44_bodyB:;
  1570. PREFETCH01280(160*SIZE(_ptr__A_0),0);
  1571. BROAD1282(0*SIZE(_ptr__B_0),XMM15);
  1572. VLD1282(0*SIZE(_ptr__A_0),XMM14);
  1573. VMA1282(XMM14,XMM15,XMM0,XMM0);
  1574. PREFETCH21280(168*SIZE(_ptr__A_0),2);
  1575. BROAD1282(1*SIZE(_ptr__B_0),XMM15);
  1576. VLD1282(2*SIZE(_ptr__A_0),XMM14);
  1577. VMA1282(XMM14,XMM15,XMM0,XMM0);
  1578. PREFETCH01280(176*SIZE(_ptr__A_0),0);
  1579. BROAD1282(2*SIZE(_ptr__B_0),XMM15);
  1580. VLD1282(4*SIZE(_ptr__A_0),XMM14);
  1581. VMA1282(XMM14,XMM15,XMM0,XMM0);
  1582. PREFETCH21280(184*SIZE(_ptr__A_0),2);
  1583. BROAD1282(3*SIZE(_ptr__B_0),XMM15);
  1584. VLD1282(6*SIZE(_ptr__A_0),XMM14);
  1585. VMA1282(XMM14,XMM15,XMM0,XMM0);
  1586. ADDQ1280($8*SIZE,_ptr__A_0);
  1587. ADDQ1280($4*SIZE,_ptr__B_0);
  1588. ._L_44_bodyE:;
  1589. DECQ1280(k);
  1590. JG ._L_44_bodyB;
  1591. ALIGN_4;
  1592. ._L_44_loopE:;
  1593. TESTQ1280($2,_bk_l);
  1594. JLE ._L_45_loopE;
  1595. ALIGN_4;
  1596. ._L_45_bodyB:;
  1597. PREFETCH01280(160*SIZE(_ptr__A_0),0);
  1598. BROAD1282(0*SIZE(_ptr__B_0),XMM15);
  1599. VLD1282(0*SIZE(_ptr__A_0),XMM14);
  1600. VMA1282(XMM14,XMM15,XMM0,XMM0);
  1601. PREFETCH21280(168*SIZE(_ptr__A_0),2);
  1602. BROAD1282(1*SIZE(_ptr__B_0),XMM15);
  1603. VLD1282(2*SIZE(_ptr__A_0),XMM14);
  1604. VMA1282(XMM14,XMM15,XMM0,XMM0);
  1605. ADDQ1280($4*SIZE,_ptr__A_0);
  1606. ADDQ1280($2*SIZE,_ptr__B_0);
  1607. ._L_45_loopE:;
  1608. TESTQ1280($1,_bk_l);
  1609. JLE ._L_46_loopE;
  1610. ALIGN_4;
  1611. ._L_46_bodyB:;
  1612. PREFETCH01280(160*SIZE(_ptr__A_0),0);
  1613. BROAD1282(0*SIZE(_ptr__B_0),XMM15);
  1614. VLD1282(0*SIZE(_ptr__A_0),XMM14);
  1615. VMA1282(XMM14,XMM15,XMM0,XMM0);
  1616. ADDQ1280($2*SIZE,_ptr__A_0);
  1617. ADDQ1280($1*SIZE,_ptr__B_0);
  1618. ._L_46_loopE:;
  1619. BROAD1282(alpha,XMM1);
  1620. VLDU1282(0*SIZE(_ptr__C_0),XMM2);
  1621. VMA21282(XMM1,XMM0,XMM2,XMM0);
  1622. VSTU1282(XMM2,0*SIZE(_ptr__C_0));
  1623. ADDQ1280($2*SIZE,_ptr__C_0);
  1624. ADDQ1280($2*SIZE,_ptr__C_1);
  1625. ._L_43_loopE:;
  1626. TESTQ1280($1,i);
  1627. JLE ._L_47_loopE;
  1628. ALIGN_4;
  1629. ._L_47_bodyB:;
  1630. MOVQ1280(_ptr_B,_ptr__B_0);
  1631. VXOR1281(XMM0,XMM0,XMM0);
  1632. PREFETCHN1280(3*SIZE(_ptr__C_0),N);
  1633. PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N);
  1634. PREFETCHN1280(3*SIZE(_ptr__C_1),N);
  1635. PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N);
  1636. MOVQ1280(_bk_l,k);
  1637. SARQ1280($2,k);
  1638. JLE ._L_48_loopE;
  1639. ALIGN_4;
  1640. ._L_48_bodyB:;
  1641. PREFETCH01280(160*SIZE(_ptr__A_0),0);
  1642. BROAD1281(0*SIZE(_ptr__B_0),XMM15);
  1643. VLD1281(0*SIZE(_ptr__A_0),XMM14);
  1644. VMA1281(XMM14,XMM15,XMM0,XMM0);
  1645. PREFETCH21280(168*SIZE(_ptr__A_0),2);
  1646. BROAD1281(1*SIZE(_ptr__B_0),XMM15);
  1647. VLD1281(1*SIZE(_ptr__A_0),XMM14);
  1648. VMA1281(XMM14,XMM15,XMM0,XMM0);
  1649. PREFETCH01280(176*SIZE(_ptr__A_0),0);
  1650. BROAD1281(2*SIZE(_ptr__B_0),XMM15);
  1651. VLD1281(2*SIZE(_ptr__A_0),XMM14);
  1652. VMA1281(XMM14,XMM15,XMM0,XMM0);
  1653. PREFETCH21280(184*SIZE(_ptr__A_0),2);
  1654. BROAD1281(3*SIZE(_ptr__B_0),XMM15);
  1655. VLD1281(3*SIZE(_ptr__A_0),XMM14);
  1656. VMA1281(XMM14,XMM15,XMM0,XMM0);
  1657. ADDQ1280($4*SIZE,_ptr__A_0);
  1658. ADDQ1280($4*SIZE,_ptr__B_0);
  1659. ._L_48_bodyE:;
  1660. DECQ1280(k);
  1661. JG ._L_48_bodyB;
  1662. ALIGN_4;
  1663. ._L_48_loopE:;
  1664. TESTQ1280($2,_bk_l);
  1665. JLE ._L_49_loopE;
  1666. ALIGN_4;
  1667. ._L_49_bodyB:;
  1668. PREFETCH01280(160*SIZE(_ptr__A_0),0);
  1669. BROAD1281(0*SIZE(_ptr__B_0),XMM15);
  1670. VLD1281(0*SIZE(_ptr__A_0),XMM14);
  1671. VMA1281(XMM14,XMM15,XMM0,XMM0);
  1672. PREFETCH21280(168*SIZE(_ptr__A_0),2);
  1673. BROAD1281(1*SIZE(_ptr__B_0),XMM15);
  1674. VLD1281(1*SIZE(_ptr__A_0),XMM14);
  1675. VMA1281(XMM14,XMM15,XMM0,XMM0);
  1676. ADDQ1280($2*SIZE,_ptr__A_0);
  1677. ADDQ1280($2*SIZE,_ptr__B_0);
  1678. ._L_49_loopE:;
  1679. TESTQ1280($1,_bk_l);
  1680. JLE ._L_50_loopE;
  1681. ALIGN_4;
  1682. ._L_50_bodyB:;
  1683. PREFETCH01280(160*SIZE(_ptr__A_0),0);
  1684. BROAD1281(0*SIZE(_ptr__B_0),XMM15);
  1685. VLD1281(0*SIZE(_ptr__A_0),XMM14);
  1686. VMA1281(XMM14,XMM15,XMM0,XMM0);
  1687. ADDQ1280($1*SIZE,_ptr__A_0);
  1688. ADDQ1280($1*SIZE,_ptr__B_0);
  1689. ._L_50_loopE:;
  1690. BROAD1281(alpha,XMM1);
  1691. VLDU1281(0*SIZE(_ptr__C_0),XMM2);
  1692. VMA21281(XMM1,XMM0,XMM2,XMM0);
  1693. VSTU1281(XMM2,0*SIZE(_ptr__C_0));
  1694. ADDQ1280($1*SIZE,_ptr__C_0);
  1695. ADDQ1280($1*SIZE,_ptr__C_1);
  1696. ._L_47_loopE:;
  1697. MOVQ1280(LDC,%rax);
  1698. ADDQ1280(%rax,_ptr_C);
  1699. MOVQ1280(_bk_l,%rax);
  1700. SALQ1280($3,%rax);
  1701. ADDQ1280(%rax,_ptr_B);
  1702. ._L_34_loopE:;
  1703. vzeroupper
  1704. movq 0(%rsp), %rbx;
  1705. movq 8(%rsp), %rbp;
  1706. movq 16(%rsp), %r12;
  1707. movq 24(%rsp), %r13;
  1708. movq 32(%rsp), %r14;
  1709. movq 40(%rsp), %r15;
  1710. addq $STACKSIZE, %rsp;
  1711. ret
  1712. EPILOGUE