
dgemm_kernel_4x8_skylakex_2.c

#include "common.h"
#include <stdint.h>
#include <immintrin.h>
//register usage: zmm3 for alpha, zmm0-zmm2 and zmm4-zmm7 for temporary use, zmm8-zmm31 for accumulators.
/* row-major c_block */
#define INNER_KERNEL_k1m1n8 \
"prefetcht0 384(%1);"\
"vmovupd (%1),%%zmm5; addq $64,%1;"\
"vbroadcastsd (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8;"
#define INNER_KERNEL_k1m2n8 \
INNER_KERNEL_k1m1n8\
"vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm9;"
#define INNER_KERNEL_k1m1n16 \
"prefetcht0 128(%1); prefetcht0 128(%1,%%r12,2);"\
"vmovupd (%1),%%zmm5; vmovupd (%1,%%r12,2),%%zmm6; addq $64,%1;"\
"vbroadcastsd (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8; vfmadd231pd %%zmm6,%%zmm4,%%zmm9;"
#define INNER_KERNEL_k1m2n16 \
INNER_KERNEL_k1m1n16\
"vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm10;vfmadd231pd %%zmm6,%%zmm4,%%zmm11;"
#define INNER_KERNEL_k1m1n24 \
"prefetcht0 128(%1); prefetcht0 128(%1,%%r12,2); prefetcht0 128(%1,%%r12,4);"\
"vmovupd (%1),%%zmm5; vmovupd (%1,%%r12,2),%%zmm6; vmovupd (%1,%%r12,4),%%zmm7; addq $64,%1;"\
"vbroadcastsd (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8; vfmadd231pd %%zmm6,%%zmm4,%%zmm9; vfmadd231pd %%zmm7,%%zmm4,%%zmm10;"
#define INNER_KERNEL_k1m2n24 \
INNER_KERNEL_k1m1n24\
"vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm11;vfmadd231pd %%zmm6,%%zmm4,%%zmm12;vfmadd231pd %%zmm7,%%zmm4,%%zmm13;"
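/* Illustrative sketch (added for exposition, not part of the original kernel):
   one k-step of the row-major macros above is a rank-1 update of a small
   m x (8*n_blocks) tile of C that lives entirely in zmm accumulators. The
   hypothetical helper below mirrors what INNER_KERNEL_k1m2n8 accumulates into
   zmm8 (row 0) and zmm9 (row 1) per k iteration. */
static inline void ref_k1m2n8_step(const double *a, const double *b, double acc[2][8]) {
    for (int i = 0; i < 2; i++)           /* vbroadcastsd (%0) resp. 8(%0) */
        for (int j = 0; j < 8; j++)       /* one vmovupd of 8 consecutive doubles of B */
            acc[i][j] += a[i] * b[j];     /* vfmadd231pd */
}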
/* row-major z-partition c_block */
#define INNER_KERNEL_k1m4n8 \
"vbroadcastf32x4 (%0),%%zmm4; vbroadcastf32x4 16(%0),%%zmm5; addq $32,%0;"\
"vmovddup (%1),%%zmm6; vfmadd231pd %%zmm4,%%zmm6,%%zmm8; vfmadd231pd %%zmm5,%%zmm6,%%zmm10;"\
"vmovddup 8(%1),%%zmm7; vfmadd231pd %%zmm4,%%zmm7,%%zmm9; vfmadd231pd %%zmm5,%%zmm7,%%zmm11;"
#define INNER_KERNEL_k1m4n16 \
INNER_KERNEL_k1m4n8\
"vmovddup (%1,%%r12,2),%%zmm6; vfmadd231pd %%zmm4,%%zmm6,%%zmm12; vfmadd231pd %%zmm5,%%zmm6,%%zmm14;"\
"vmovddup 8(%1,%%r12,2),%%zmm7; vfmadd231pd %%zmm4,%%zmm7,%%zmm13; vfmadd231pd %%zmm5,%%zmm7,%%zmm15;"
#define INNER_KERNEL_k1m4n24 \
INNER_KERNEL_k1m4n16\
"vmovddup (%1,%%r12,4),%%zmm6; vfmadd231pd %%zmm4,%%zmm6,%%zmm16; vfmadd231pd %%zmm5,%%zmm6,%%zmm18;"\
"vmovddup 8(%1,%%r12,4),%%zmm7; vfmadd231pd %%zmm4,%%zmm7,%%zmm17; vfmadd231pd %%zmm5,%%zmm7,%%zmm19;"
#define INNER_KERNEL_k1m8n8 \
"vbroadcastf32x4 (%0),%%zmm4; vbroadcastf32x4 16(%0),%%zmm5;"\
"vbroadcastf32x4 (%0,%%r12,1),%%zmm6; vbroadcastf32x4 16(%0,%%r12,1),%%zmm7; addq $32,%0;"\
"prefetcht0 128(%1);"\
"vmovddup (%1),%%zmm2; vfmadd231pd %%zmm4,%%zmm2,%%zmm8; vfmadd231pd %%zmm5,%%zmm2,%%zmm10;"\
"vfmadd231pd %%zmm6,%%zmm2,%%zmm12; vfmadd231pd %%zmm7,%%zmm2,%%zmm14;"\
"vmovddup 8(%1),%%zmm1; vfmadd231pd %%zmm4,%%zmm1,%%zmm9; vfmadd231pd %%zmm5,%%zmm1,%%zmm11;"\
"vfmadd231pd %%zmm6,%%zmm1,%%zmm13; vfmadd231pd %%zmm7,%%zmm1,%%zmm15;"
#define INNER_KERNEL_k1m8n16 \
INNER_KERNEL_k1m8n8\
"prefetcht0 128(%1,%%r12,2);"\
"vmovddup (%1,%%r12,2),%%zmm2; vfmadd231pd %%zmm4,%%zmm2,%%zmm16; vfmadd231pd %%zmm5,%%zmm2,%%zmm18;"\
"vfmadd231pd %%zmm6,%%zmm2,%%zmm20; vfmadd231pd %%zmm7,%%zmm2,%%zmm22;"\
"vmovddup 8(%1,%%r12,2),%%zmm1; vfmadd231pd %%zmm4,%%zmm1,%%zmm17; vfmadd231pd %%zmm5,%%zmm1,%%zmm19;"\
"vfmadd231pd %%zmm6,%%zmm1,%%zmm21; vfmadd231pd %%zmm7,%%zmm1,%%zmm23;"
#define INNER_KERNEL_k1m8n24 \
INNER_KERNEL_k1m8n16\
"prefetcht0 128(%1,%%r12,4);"\
"vmovddup (%1,%%r12,4),%%zmm2; vfmadd231pd %%zmm4,%%zmm2,%%zmm24; vfmadd231pd %%zmm5,%%zmm2,%%zmm26;"\
"vfmadd231pd %%zmm6,%%zmm2,%%zmm28; vfmadd231pd %%zmm7,%%zmm2,%%zmm30;"\
"vmovddup 8(%1,%%r12,4),%%zmm1; vfmadd231pd %%zmm4,%%zmm1,%%zmm25; vfmadd231pd %%zmm5,%%zmm1,%%zmm27;"\
"vfmadd231pd %%zmm6,%%zmm1,%%zmm29; vfmadd231pd %%zmm7,%%zmm1,%%zmm31;"
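/* Illustrative sketch (added, not part of the original kernel): in the
   z-partition macros above each accumulator holds a 2(m) x 4(n) sub-tile in an
   interleaved lane order. In INNER_KERNEL_k1m4n8, vbroadcastf32x4 gives
   zmm4 = {a0,a1,a0,a1,a0,a1,a0,a1} and vmovddup gives
   zmm6 = {b0,b0,b2,b2,b4,b4,b6,b6}, so zmm8 accumulates
   {a0*b0, a1*b0, a0*b2, a1*b2, a0*b4, a1*b4, a0*b6, a1*b6}. The hypothetical
   helper below reproduces that lane layout in scalar code. */
static inline void ref_k1m4n8_zmm8_step(const double *a, const double *b, double zmm8[8]) {
    for (int lane = 0; lane < 8; lane++) {
        int i = lane & 1;          /* a index alternates a0,a1 inside each 128-bit lane */
        int j = (lane >> 1) * 2;   /* b index 0,2,4,6: even elements duplicated by vmovddup */
        zmm8[lane] += a[i] * b[j];
    }
}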
/* micro kernels */
#define INNER_KERNELm1(nn) \
"cmpq $1,%2;jb "#nn"3f;"\
#nn"4:\n\t"\
INNER_KERNEL_k1m1n##nn "addq $8,%0;"\
"decq %2;cmpq $1,%2;jnb "#nn"4b;"\
#nn"3:\n\t"
#define INNER_KERNELm2(nn) \
"cmpq $1,%2;jb "#nn"0f;"\
#nn"1:\n\t"\
INNER_KERNEL_k1m2n##nn "addq $16,%0;"\
"decq %2;cmpq $1,%2;jnb "#nn"1b;"\
#nn"0:\n\t"
#define INNER_KERNELm4(nn) \
"cmpq $1,%2;jb "#nn"00f;"\
#nn"01:\n\t"\
INNER_KERNEL_k1m4n##nn "addq $64,%1;"\
"decq %2;cmpq $1,%2;jnb "#nn"01b;"\
#nn"00:\n\t"
/* %10 for prefetch of C elements before storage; %4 = ldc (in bytes); %11 for prefetch of next B block */
#define INNER_KERNELm8(nn) \
"movq %3,%10;cmpq $18,%2;jb "#nn"001f;"\
#nn"008:\n\t"\
INNER_KERNEL_k1m8n##nn "addq $64,%1;"\
INNER_KERNEL_k1m8n##nn "addq $64,%1;"\
INNER_KERNEL_k1m8n##nn "addq $64,%1;"\
"prefetcht1 (%10); prefetcht1 63(%10); addq %4,%10;"\
INNER_KERNEL_k1m8n##nn "addq $64,%1;"\
INNER_KERNEL_k1m8n##nn "addq $64,%1;"\
INNER_KERNEL_k1m8n##nn "addq $64,%1;"\
"prefetcht1 (%11); addq $32,%11;"\
"subq $6,%2;cmpq $18,%2;jnb "#nn"008b;"\
"movq %3,%10;"\
#nn"001:\n\t"\
"cmpq $1,%2;jb "#nn"000f;"\
"prefetcht0 (%10); prefetcht0 63(%10); prefetcht0 (%10,%4,1); prefetcht0 63(%10,%4,1); leaq (%10,%4,2),%10;"\
INNER_KERNEL_k1m8n##nn "addq $64,%1;"\
"decq %2;jmp "#nn"001b;"\
""#nn"000:\n\t"
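/* Illustrative sketch (added): control flow of INNER_KERNELm8 above. While at
   least 18 k iterations remain, it runs six unrolled k-steps, prefetching part
   of the C block (stepping by ldc via %10/%4) after the third and the next
   packed B panel (via %11) after the sixth; remaining iterations are handled
   one at a time with C prefetches per step. A hypothetical scalar outline: */
static inline void ref_m8_loop_shape(int64_t k) {
    while (k >= 18) {
        /* 6 x INNER_KERNEL_k1m8n##, C-block and next-B prefetches interleaved */
        k -= 6;
    }
    while (k >= 1) {
        /* prefetch two ldc-strided pieces of the C block, then 1 x INNER_KERNEL_k1m8n## */
        k -= 1;
    }
}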
#define INNER_INIT_m1n8 \
"vpxorq %%zmm8, %%zmm8, %%zmm8;"
#define INNER_INIT_m2n8 \
"vpxorq %%zmm8, %%zmm8, %%zmm8; vpxorq %%zmm9, %%zmm9, %%zmm9;"
#define INNER_INIT_m4n8 \
"vpxorq %%zmm8, %%zmm8, %%zmm8; vpxorq %%zmm9, %%zmm9, %%zmm9; vpxorq %%zmm10,%%zmm10,%%zmm10;vpxorq %%zmm11,%%zmm11,%%zmm11;"
#define INNER_INIT_m8n8 \
INNER_INIT_m4n8\
"vpxorq %%zmm12,%%zmm12,%%zmm12;vpxorq %%zmm13,%%zmm13,%%zmm13;vpxorq %%zmm14,%%zmm14,%%zmm14;vpxorq %%zmm15,%%zmm15,%%zmm15;"
#define INNER_INIT_m1n16 INNER_INIT_m2n8
#define INNER_INIT_m2n16 INNER_INIT_m4n8
#define INNER_INIT_m4n16 INNER_INIT_m8n8
#define INNER_INIT_m8n16 \
INNER_INIT_m8n8\
"vpxorq %%zmm16,%%zmm16,%%zmm16;vpxorq %%zmm17,%%zmm17,%%zmm17;vpxorq %%zmm18,%%zmm18,%%zmm18;vpxorq %%zmm19,%%zmm19,%%zmm19;"\
"vpxorq %%zmm20,%%zmm20,%%zmm20;vpxorq %%zmm21,%%zmm21,%%zmm21;vpxorq %%zmm22,%%zmm22,%%zmm22;vpxorq %%zmm23,%%zmm23,%%zmm23;"
#define INNER_INIT_m1n24 \
"vpxorq %%zmm8, %%zmm8, %%zmm8; vpxorq %%zmm9, %%zmm9, %%zmm9; vpxorq %%zmm10,%%zmm10,%%zmm10;"
#define INNER_INIT_m2n24 \
INNER_INIT_m1n24\
"vpxorq %%zmm11,%%zmm11,%%zmm11; vpxorq %%zmm12,%%zmm12,%%zmm12; vpxorq %%zmm13,%%zmm13,%%zmm13;"
#define INNER_INIT_m4n24 \
INNER_INIT_m4n16\
"vpxorq %%zmm16,%%zmm16,%%zmm16;vpxorq %%zmm17,%%zmm17,%%zmm17;vpxorq %%zmm18,%%zmm18,%%zmm18;vpxorq %%zmm19,%%zmm19,%%zmm19;"
#define INNER_INIT_m8n24 \
INNER_INIT_m8n16\
"vpxorq %%zmm24,%%zmm24,%%zmm24;vpxorq %%zmm25,%%zmm25,%%zmm25;vpxorq %%zmm26,%%zmm26,%%zmm26;vpxorq %%zmm27,%%zmm27,%%zmm27;"\
"vpxorq %%zmm28,%%zmm28,%%zmm28;vpxorq %%zmm29,%%zmm29,%%zmm29;vpxorq %%zmm30,%%zmm30,%%zmm30;vpxorq %%zmm31,%%zmm31,%%zmm31;"
#define INNER_SETINDEX \
"vpinsrq $0,%4,%%xmm4,%%xmm4; vbroadcastsd %%xmm4,%%zmm4;"\
"kxnorw %%k1,%%k1,%%k1; kshiftlw $1,%%k1,%%k1; vpxorq %%zmm6,%%zmm6,%%zmm6; vmovapd %%zmm4,%%zmm6%{%%k1%};"\
"kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\
"kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\
"kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\
"kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\
"kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\
"kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"
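/* Illustrative sketch (added): INNER_SETINDEX above builds zmm6 with lane j
   equal to j*ldc_in_bytes via a chain of masked adds; vgatherqpd/vscatterqpd
   then use it as byte offsets so one zmm register touches the same row of C
   across 8 consecutive columns. Hypothetical scalar equivalent: */
static inline void ref_setindex(int64_t ldc_in_bytes, int64_t idx[8]) {
    for (int j = 0; j < 8; j++)
        idx[j] = (int64_t)j * ldc_in_bytes;  /* byte offset of the C element j columns over */
}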
#define INNER_STORE_m1n8(c1,disp) \
"kxnorw %%k1,%%k1,%%k1;"\
"vgatherqpd "#disp"(%10,%%zmm6,1), %%zmm7 %{%%k1%};"\
"vfmadd132pd %%zmm3,%%zmm7,"#c1";"\
"kxnorw %%k1,%%k1,%%k1;"\
"vscatterqpd "#c1", "#disp"(%10,%%zmm6,1) %{%%k1%};"
#define INNER_SAVE_m1n8 \
"movq %3,%10;"\
INNER_SETINDEX\
INNER_STORE_m1n8(%%zmm8,0)
#define INNER_SAVE_m1n16 \
INNER_SAVE_m1n8\
"leaq (%10,%4,8),%10;"\
INNER_STORE_m1n8(%%zmm9,0)
#define INNER_SAVE_m1n24 \
INNER_SAVE_m1n16\
"leaq (%10,%4,8),%10;"\
INNER_STORE_m1n8(%%zmm10,0)
#define INNER_SAVE_m2n8 \
"movq %3,%10;"\
INNER_SETINDEX\
INNER_STORE_m1n8(%%zmm8,0)\
INNER_STORE_m1n8(%%zmm9,8)
#define INNER_SAVE_m2n16 \
"movq %3,%10;"\
INNER_SETINDEX\
INNER_STORE_m1n8(%%zmm8,0)\
INNER_STORE_m1n8(%%zmm10,8)\
"leaq (%10,%4,8),%10;"\
INNER_STORE_m1n8(%%zmm9,0)\
INNER_STORE_m1n8(%%zmm11,8)
#define INNER_SAVE_m2n24 \
"movq %3,%10;"\
INNER_SETINDEX\
INNER_STORE_m1n8(%%zmm8,0)\
INNER_STORE_m1n8(%%zmm11,8)\
"leaq (%10,%4,8),%10;"\
INNER_STORE_m1n8(%%zmm9,0)\
INNER_STORE_m1n8(%%zmm12,8)\
"leaq (%10,%4,8),%10;"\
INNER_STORE_m1n8(%%zmm10,0)\
INNER_STORE_m1n8(%%zmm13,8)
#define INNER_TRANS_4x8(c1,c2,c3,c4) \
"vblendmpd "#c3","#c1",%%zmm4%{%6%}; vblendmpd "#c4","#c2",%%zmm6%{%6%};"\
"vshuff64x2 $177,%%zmm4,%%zmm4,%%zmm4; vshuff64x2 $177,%%zmm6,%%zmm6,%%zmm6;"\
"vblendmpd "#c1",%%zmm4,"#c1"%{%6%}; vblendmpd "#c2",%%zmm6,"#c2"%{%6%};"\
"vblendmpd %%zmm4,"#c3","#c3"%{%6%}; vblendmpd %%zmm6,"#c4","#c4"%{%6%};"
#define INNER_TRANS_f128_4x4(c1,c2,c3,c4) \
"vshuff64x2 $68,"#c3","#c1",%%zmm4; vshuff64x2 $17,"#c4","#c2",%%zmm5;"\
"vshuff64x2 $238,"#c3","#c1",%%zmm6; vshuff64x2 $187,"#c4","#c2",%%zmm7;"\
"vblendmpd %%zmm5,%%zmm4,"#c2"%{%6%}; vshuff64x2 $177,"#c2","#c2","#c2"; vblendmpd %%zmm4,%%zmm5,"#c1"%{%6%};"\
"vblendmpd %%zmm7,%%zmm6,"#c4"%{%6%}; vshuff64x2 $177,"#c4","#c4","#c4"; vblendmpd %%zmm6,%%zmm7,"#c3"%{%6%};"
#define INNER_TRANS_8x8(c1,c2,c3,c4,c5,c6,c7,c8) \
INNER_TRANS_f128_4x4(c1,c3,c5,c7) INNER_TRANS_f128_4x4(c2,c4,c6,c8)
//%7 for k01(input) only when m=4
#define INNER_STORE_4x8(c1,c2,c3,c4) \
"vmovupd (%10),%%zmm4%{%5%};vmovupd -32(%10,%4,4),%%zmm4%{%7%};vfmadd132pd %%zmm3,%%zmm4,"#c1";"\
"vmovupd "#c1",(%10)%{%5%}; vmovupd "#c1",-32(%10,%4,4)%{%7%}; leaq (%10,%4,1),%10;"\
"vmovupd (%10),%%zmm5%{%5%};vmovupd -32(%10,%4,4),%%zmm5%{%7%};vfmadd132pd %%zmm3,%%zmm5,"#c2";"\
"vmovupd "#c2",(%10)%{%5%}; vmovupd "#c2",-32(%10,%4,4)%{%7%}; leaq (%10,%4,1),%10;"\
"vmovupd (%10),%%zmm6%{%5%};vmovupd -32(%10,%4,4),%%zmm6%{%7%};vfmadd132pd %%zmm3,%%zmm6,"#c3";"\
"vmovupd "#c3",(%10)%{%5%}; vmovupd "#c3",-32(%10,%4,4)%{%7%}; leaq (%10,%4,1),%10;"\
"vmovupd (%10),%%zmm7%{%5%};vmovupd -32(%10,%4,4),%%zmm7%{%7%};vfmadd132pd %%zmm3,%%zmm7,"#c4";"\
"vmovupd "#c4",(%10)%{%5%}; vmovupd "#c4",-32(%10,%4,4)%{%7%}; leaq (%10,%4,1),%10;"\
"leaq (%10,%4,4),%10;"
#define INNER_STORE_8x8(c1,c2,c3,c4,c5,c6,c7,c8) \
"vfmadd213pd (%10),%%zmm3,"#c1"; vmovupd "#c1",(%10); vfmadd213pd (%10,%4,1),%%zmm3,"#c2"; vmovupd "#c2",(%10,%4,1); leaq (%10,%4,2),%10;"\
"vfmadd213pd (%10),%%zmm3,"#c3"; vmovupd "#c3",(%10); vfmadd213pd (%10,%4,1),%%zmm3,"#c4"; vmovupd "#c4",(%10,%4,1); leaq (%10,%4,2),%10;"\
"vfmadd213pd (%10),%%zmm3,"#c5"; vmovupd "#c5",(%10); vfmadd213pd (%10,%4,1),%%zmm3,"#c6"; vmovupd "#c6",(%10,%4,1); leaq (%10,%4,2),%10;"\
"vfmadd213pd (%10),%%zmm3,"#c7"; vmovupd "#c7",(%10); vfmadd213pd (%10,%4,1),%%zmm3,"#c8"; vmovupd "#c8",(%10,%4,1); leaq (%10,%4,2),%10;"
#define INNER_SAVE_m4n8 \
"movq %3,%10;"\
INNER_TRANS_4x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11)\
INNER_STORE_4x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11)
#define INNER_SAVE_m4n16 \
INNER_SAVE_m4n8\
INNER_TRANS_4x8(%%zmm12,%%zmm13,%%zmm14,%%zmm15)\
INNER_STORE_4x8(%%zmm12,%%zmm13,%%zmm14,%%zmm15)
#define INNER_SAVE_m4n24 \
INNER_SAVE_m4n16\
INNER_TRANS_4x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19)\
INNER_STORE_4x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19)
#define INNER_SAVE_m8n8 \
"movq %3,%10;"\
INNER_TRANS_8x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11,%%zmm12,%%zmm13,%%zmm14,%%zmm15)\
INNER_STORE_8x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11,%%zmm12,%%zmm13,%%zmm14,%%zmm15)
#define INNER_SAVE_m8n16 \
INNER_SAVE_m8n8\
INNER_TRANS_8x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19,%%zmm20,%%zmm21,%%zmm22,%%zmm23)\
INNER_STORE_8x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19,%%zmm20,%%zmm21,%%zmm22,%%zmm23)
#define INNER_SAVE_m8n24 \
INNER_SAVE_m8n16\
INNER_TRANS_8x8(%%zmm24,%%zmm25,%%zmm26,%%zmm27,%%zmm28,%%zmm29,%%zmm30,%%zmm31)\
INNER_STORE_8x8(%%zmm24,%%zmm25,%%zmm26,%%zmm27,%%zmm28,%%zmm29,%%zmm30,%%zmm31)
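/* Illustrative sketch (added): after the k loop, the INNER_TRANS_* macros
   rearrange the accumulators from the register layout back to column-major
   order, and the INNER_STORE_* / gather-scatter paths merge them into C with
   vfmadd as C = alpha*acc + C. Functionally, for an m x n tile at c with
   leading dimension ldc (hypothetical helper, acc assumed column-major): */
static inline void ref_save_tile(double *c, int64_t ldc, const double *acc,
                                 int m, int n, double alpha) {
    for (int j = 0; j < n; j++)
        for (int i = 0; i < m; i++)
            c[i + j * ldc] = alpha * acc[i + j * m] + c[i + j * ldc]; /* vfmadd with C in memory */
}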
#define COMPUTE_n8 {\
b_pref = packed_b_pointer + 8 * K;\
__asm__ __volatile__(\
"vbroadcastsd (%9),%%zmm3;"\
"movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $5,%%r12;"\
"cmpq $8,%8; jb 42222f;"\
"42221:\n\t"\
INNER_INIT_m8n8\
INNER_KERNELm8(8)\
INNER_SAVE_m8n8\
"movq %%r13,%2; subq %%r12,%1; subq %%r12,%1; addq %%r12,%0;"\
"addq $64,%3;"\
"subq $8,%8; cmpq $8,%8; jnb 42221b;"\
"42222:\n\t"\
"cmpq $4,%8; jb 42223f;"\
INNER_INIT_m4n8\
INNER_KERNELm4(8)\
INNER_SAVE_m4n8\
"movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\
"addq $32,%3;"\
"subq $4,%8;"\
"42223:\n\t"\
"cmpq $2,%8; jb 42224f;"\
INNER_INIT_m2n8\
INNER_KERNELm2(8)\
INNER_SAVE_m2n8\
"movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\
"addq $16,%3;"\
"subq $2,%8;"\
"42224:\n\t"\
"cmpq $1,%8; jb 42225f;"\
INNER_INIT_m1n8\
INNER_KERNELm1(8)\
INNER_SAVE_m1n8\
"movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\
"addq $8,%3;"\
"42225:\n\t"\
"movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\
"shlq $3,%4;addq %4,%3;shrq $3,%4;"\
:"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\
"+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\
::"zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory","k1","r12","r13","r14");\
a_block_pointer -= M * K;\
}
#define COMPUTE_n16 {\
b_pref = packed_b_pointer + 16 * K;\
__asm__ __volatile__(\
"vbroadcastsd (%9),%%zmm3;"\
"movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $5,%%r12;"\
"cmpq $8,%8; jb 32222f;"\
"32221:\n\t"\
INNER_INIT_m8n16\
INNER_KERNELm8(16)\
INNER_SAVE_m8n16\
"movq %%r13,%2; subq %%r12,%1; subq %%r12,%1; addq %%r12,%0;"\
"addq $64,%3;"\
"subq $8,%8; cmpq $8,%8; jnb 32221b;"\
"32222:\n\t"\
"cmpq $4,%8; jb 32223f;"\
INNER_INIT_m4n16\
INNER_KERNELm4(16)\
INNER_SAVE_m4n16\
"movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\
"addq $32,%3;"\
"subq $4,%8;"\
"32223:\n\t"\
"cmpq $2,%8; jb 32224f;"\
INNER_INIT_m2n16\
INNER_KERNELm2(16)\
INNER_SAVE_m2n16\
"movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\
"addq $16,%3;"\
"subq $2,%8;"\
"32224:\n\t"\
"cmpq $1,%8; jb 32225f;"\
INNER_INIT_m1n16\
INNER_KERNELm1(16)\
INNER_SAVE_m1n16\
"movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\
"addq $8,%3;"\
"32225:\n\t"\
"movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\
"shlq $4,%4;addq %4,%3;shrq $4,%4;"\
"leaq (%1,%%r12,4),%1;"\
:"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\
"+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\
::"zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17",\
"zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","cc","memory","k1","r12","r13","r14");\
a_block_pointer -= M * K;\
}
#define COMPUTE_n24 {\
b_pref = packed_b_pointer + 24 * K;\
__asm__ __volatile__(\
"vbroadcastsd (%9),%%zmm3;"\
"movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $5,%%r12;"\
"cmpq $8,%8; jb 22222f;"\
"22221:\n\t"\
INNER_INIT_m8n24\
INNER_KERNELm8(24)\
INNER_SAVE_m8n24\
"movq %%r13,%2; subq %%r12,%1; subq %%r12,%1; addq %%r12,%0;"\
"addq $64,%3;"\
"subq $8,%8; cmpq $8,%8; jnb 22221b;"\
"22222:\n\t"\
"cmpq $4,%8; jb 22223f;"\
INNER_INIT_m4n24\
INNER_KERNELm4(24)\
INNER_SAVE_m4n24\
"movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\
"addq $32,%3;"\
"subq $4,%8;"\
"22223:\n\t"\
"cmpq $2,%8; jb 22224f;"\
INNER_INIT_m2n24\
INNER_KERNELm2(24)\
INNER_SAVE_m2n24\
"movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\
"addq $16,%3;"\
"subq $2,%8;"\
"22224:\n\t"\
"cmpq $1,%8; jb 22225f;"\
INNER_INIT_m1n24\
INNER_KERNELm1(24)\
INNER_SAVE_m1n24\
"movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\
"addq $8,%3;"\
"22225:\n\t"\
"movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\
"shlq $3,%4;addq %4,%3;shlq $1,%4;addq %4,%3;shrq $4,%4;"\
"leaq (%1,%%r12,4),%1; leaq (%1,%%r12,2),%1;"\
:"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\
"+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)::\
"zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17","zmm18",\
"zmm19","zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31","cc","memory","k1","r12","r13","r14");\
a_block_pointer -= M * K;\
}
static void KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG ndiv8, BLASLONG k, BLASLONG LDC, double *c, double *alpha){//icopy=4,ocopy=8
  //perform C += A<pack> B<pack>
  if(k==0 || m==0 || ndiv8==0) return;
  int64_t ldc_in_bytes = (int64_t)LDC * sizeof(double);
  int64_t K = (int64_t)k; int64_t M = (int64_t)m;
  double *a_block_pointer,*b_pref;
  double *c_pointer = c,*c_store = c;
  __mmask16 k01 = 0x00f0,k02 = 0x000f,k03 = 0x0033;
  BLASLONG ndiv8_count;
  double *packed_b_pointer = packed_b;
  a_block_pointer = packed_a;
  for(ndiv8_count=ndiv8;ndiv8_count>2;ndiv8_count-=3){
    COMPUTE_n24
  }
  for(;ndiv8_count>1;ndiv8_count-=2){
    COMPUTE_n16
  }
  if(ndiv8_count>0){
    COMPUTE_n8
  }
}
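/* Illustrative sketch (added): ignoring the packed storage of A and B, the
   overall effect of KERNEL_MAIN is the rank-k update below on the first
   8*ndiv8 columns of C. The helper is hypothetical and, purely for
   illustration, assumes A column-major (m x k) and B row-major
   (k x 8*ndiv8); the real routine reads A and B in their icopy/ocopy packed
   layouts. */
static void ref_kernel_main(const double *A, const double *B, BLASLONG m, BLASLONG ndiv8,
                            BLASLONG k, BLASLONG LDC, double *C, double alpha) {
    BLASLONG n = 8 * ndiv8;
    for (BLASLONG j = 0; j < n; j++)
        for (BLASLONG i = 0; i < m; i++) {
            double s = 0.0;
            for (BLASLONG l = 0; l < k; l++) s += A[i + l * m] * B[l * n + j];
            C[i + j * LDC] += alpha * s;   /* C += alpha * A * B */
        }
}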
/* __m256d accumulators: yc1-yc4; temporary variables: ya1,yb1-yb2 */
/* __m128d accumulators: xc1-xc2; temporary variables: xa1,xb1-xb2 */
/* double accumulator: sc1; temporary variables: sa1,sb1 */
/* column-major c_block */
#define KERNEL_m4n4k1 {\
ya1 = _mm256_loadu_pd(a_block_pointer);a_block_pointer+=4;\
yb1 = _mm256_broadcast_sd(b_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\
yb2 = _mm256_broadcast_sd(b_block_pointer+1); yc2 = _mm256_fmadd_pd(ya1,yb2,yc2);\
yb1 = _mm256_broadcast_sd(b_block_pointer+2); yc3 = _mm256_fmadd_pd(ya1,yb1,yc3);\
yb2 = _mm256_broadcast_sd(b_block_pointer+3); yc4 = _mm256_fmadd_pd(ya1,yb2,yc4);\
b_block_pointer+=4;\
}
#define KERNEL_m4n2k1 {\
ya1 = _mm256_loadu_pd(a_block_pointer);a_block_pointer+=4;\
yb1 = _mm256_broadcast_sd(b_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\
yb2 = _mm256_broadcast_sd(b_block_pointer+1); yc2 = _mm256_fmadd_pd(ya1,yb2,yc2);\
b_block_pointer+=2;\
}
#define KERNEL_m4n1k1 {\
ya1 = _mm256_loadu_pd(a_block_pointer);a_block_pointer+=4;\
yb1 = _mm256_broadcast_sd(b_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\
b_block_pointer++;\
}
#define INIT_m4n1 yc1=_mm256_setzero_pd();
#define INIT_m4n2 yc2=INIT_m4n1
#define INIT_m4n4 yc4=yc3=INIT_m4n2
#define SAVE_m4n1 {\
yb1 = _mm256_broadcast_sd(alpha);\
ya1 = _mm256_loadu_pd(c_pointer);\
yc1 = _mm256_fmadd_pd(yc1,yb1,ya1);\
_mm256_storeu_pd(c_pointer,yc1);\
c_pointer += 4;\
}
#define SAVE_m4n2 {\
ya1 = _mm256_broadcast_sd(alpha);\
yb1 = _mm256_loadu_pd(c_pointer); yb2 = _mm256_loadu_pd(c_pointer+LDC);\
yc1 = _mm256_fmadd_pd(yc1,ya1,yb1); yc2 = _mm256_fmadd_pd(yc2,ya1,yb2);\
_mm256_storeu_pd(c_pointer,yc1); _mm256_storeu_pd(c_pointer+LDC,yc2);\
c_pointer += 4;\
}
#define SAVE_m4n4 {\
ya1 = _mm256_broadcast_sd(alpha);\
yb1 = _mm256_loadu_pd(c_pointer); yb2 = _mm256_loadu_pd(c_pointer+LDC);\
yc1 = _mm256_fmadd_pd(yc1,ya1,yb1); yc2 = _mm256_fmadd_pd(yc2,ya1,yb2);\
_mm256_storeu_pd(c_pointer,yc1); _mm256_storeu_pd(c_pointer+LDC,yc2);\
c_pointer += LDC*2;\
yb1 = _mm256_loadu_pd(c_pointer); yb2 = _mm256_loadu_pd(c_pointer+LDC);\
yc3 = _mm256_fmadd_pd(yc3,ya1,yb1); yc4 = _mm256_fmadd_pd(yc4,ya1,yb2);\
_mm256_storeu_pd(c_pointer,yc3); _mm256_storeu_pd(c_pointer+LDC,yc4);\
c_pointer += 4-LDC*2;\
}
#define KERNEL_m2n2k1 {\
xa1 = _mm_loadu_pd(a_block_pointer); a_block_pointer+=2;\
xb1 = _mm_loaddup_pd(b_block_pointer); xc1 = _mm_fmadd_pd(xa1,xb1,xc1);\
xb2 = _mm_loaddup_pd(b_block_pointer+1); xc2 = _mm_fmadd_pd(xa1,xb2,xc2);\
b_block_pointer += 2;\
}
#define KERNEL_m2n1k1 {\
xa1 = _mm_loadu_pd(a_block_pointer); a_block_pointer+=2;\
xb1 = _mm_loaddup_pd(b_block_pointer); xc1 = _mm_fmadd_pd(xa1,xb1,xc1);\
b_block_pointer ++;\
}
#define INIT_m2n1 xc1=_mm_setzero_pd();
#define INIT_m2n2 xc2=INIT_m2n1
#define SAVE_m2n1 {\
xb1 = _mm_loaddup_pd(alpha);\
xa1 = _mm_loadu_pd(c_pointer);\
xc1 = _mm_fmadd_pd(xc1,xb1,xa1);\
_mm_storeu_pd(c_pointer,xc1);\
c_pointer += 2;\
}
#define SAVE_m2n2 {\
xa1 = _mm_loaddup_pd(alpha);\
xb1 = _mm_loadu_pd(c_pointer); xb2 = _mm_loadu_pd(c_pointer+LDC);\
xc1 = _mm_fmadd_pd(xc1,xa1,xb1); xc2 = _mm_fmadd_pd(xc2,xa1,xb2);\
_mm_storeu_pd(c_pointer,xc1); _mm_storeu_pd(c_pointer+LDC,xc2);\
c_pointer += 2;\
}
#define KERNEL_m1n1k1 {\
sa1 = *a_block_pointer; a_block_pointer++;\
sb1 = *b_block_pointer; sc1 += sa1 * sb1;\
b_block_pointer ++;\
}
#define INIT_m1n1 sc1=0.0;
#define SAVE_m1n1 {\
*c_pointer += sc1 * (*alpha);\
c_pointer++;\
}
/* row-major c_block */
#define KERNEL_m2n4k1 {\
yb1 = _mm256_loadu_pd(b_block_pointer);b_block_pointer+=4;\
ya1 = _mm256_broadcast_sd(a_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\
ya1 = _mm256_broadcast_sd(a_block_pointer+1);yc2 = _mm256_fmadd_pd(ya1,yb1,yc2);\
a_block_pointer += 2;\
}
#define KERNEL_m1n4k1 {\
yb1 = _mm256_loadu_pd(b_block_pointer);b_block_pointer+=4;\
ya1 = _mm256_broadcast_sd(a_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\
a_block_pointer ++;\
}
#define KERNEL_m1n2k1 {\
xb1 = _mm_loadu_pd(b_block_pointer);b_block_pointer+=2;\
xa1 = _mm_loaddup_pd(a_block_pointer); xc1 = _mm_fmadd_pd(xa1,xb1,xc1);\
a_block_pointer ++;\
}
#define INIT_m1n2 INIT_m2n1
#define INIT_m1n4 INIT_m4n1
#define INIT_m2n4 INIT_m4n2
#define SAVE_m2n4 {\
ya1 = _mm256_broadcast_sd(alpha);\
yc1 = _mm256_mul_pd(yc1,ya1);\
yc2 = _mm256_mul_pd(yc2,ya1);\
yb1 = _mm256_unpacklo_pd(yc1,yc2);\
yb2 = _mm256_unpackhi_pd(yc1,yc2);\
xb1 = _mm_add_pd(_mm_loadu_pd(c_pointer),_mm256_extractf128_pd(yb1,0));\
xb2 = _mm_add_pd(_mm_loadu_pd(c_pointer+LDC),_mm256_extractf128_pd(yb2,0));\
_mm_storeu_pd(c_pointer,xb1);\
_mm_storeu_pd(c_pointer+LDC,xb2);\
xb1 = _mm_add_pd(_mm_loadu_pd(c_pointer+2*LDC),_mm256_extractf128_pd(yb1,1));\
xb2 = _mm_add_pd(_mm_loadu_pd(c_pointer+3*LDC),_mm256_extractf128_pd(yb2,1));\
_mm_storeu_pd(c_pointer+2*LDC,xb1);\
_mm_storeu_pd(c_pointer+3*LDC,xb2);\
c_pointer += 2;\
}
#define SAVE_m1n2 {\
xb1 = _mm_loaddup_pd(alpha);\
xc1 = _mm_mul_pd(xc1,xb1);\
*c_pointer += _mm_cvtsd_f64(xc1);\
xa1 = _mm_unpackhi_pd(xc1,xc1);\
c_pointer[LDC]+= _mm_cvtsd_f64(xa1);\
c_pointer ++;\
}
#define SAVE_m1n4 {\
ya1 = _mm256_broadcast_sd(alpha);\
yc1 = _mm256_mul_pd(yc1,ya1);\
xb1 = _mm256_extractf128_pd(yc1,0);\
*c_pointer += _mm_cvtsd_f64(xb1);\
xb2 = _mm_unpackhi_pd(xb1,xb1);\
c_pointer[LDC] += _mm_cvtsd_f64(xb2);\
xb1 = _mm256_extractf128_pd(yc1,1);\
c_pointer[LDC*2] += _mm_cvtsd_f64(xb1);\
xb2 = _mm_unpackhi_pd(xb1,xb1);\
c_pointer[LDC*3] += _mm_cvtsd_f64(xb2);\
c_pointer ++;\
}
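/* Illustrative sketch (added): SAVE_m2n4 above turns two row-major 1x4
   accumulators into four column-major 2x1 pieces with unpacklo/unpackhi plus
   extractf128, e.g. unpacklo(yc1,yc2) = {yc1[0],yc2[0],yc1[2],yc2[2]}. A
   hypothetical standalone demo of the same shuffle pattern: */
static inline void ref_trans_2x4(const double row0[4], const double row1[4], double cols[4][2]) {
    __m256d r0 = _mm256_loadu_pd(row0), r1 = _mm256_loadu_pd(row1);
    __m256d lo = _mm256_unpacklo_pd(r0, r1);   /* {r0[0], r1[0], r0[2], r1[2]} */
    __m256d hi = _mm256_unpackhi_pd(r0, r1);   /* {r0[1], r1[1], r0[3], r1[3]} */
    _mm_storeu_pd(cols[0], _mm256_extractf128_pd(lo, 0)); /* column 0 */
    _mm_storeu_pd(cols[1], _mm256_extractf128_pd(hi, 0)); /* column 1 */
    _mm_storeu_pd(cols[2], _mm256_extractf128_pd(lo, 1)); /* column 2 */
    _mm_storeu_pd(cols[3], _mm256_extractf128_pd(hi, 1)); /* column 3 */
}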
static void KERNEL_EDGE(double *packed_a, double *packed_b, BLASLONG m, BLASLONG edge_n, BLASLONG k, BLASLONG LDC, double *c, double *alpha){//icopy=8,ocopy=8
  //perform C += A<pack> B<pack>, edge_n<8 must be satisfied.
  if(k==0 || m==0 || edge_n==0 || (*alpha)==0.0) return;
  double *a_block_pointer,*b_block_pointer,*b_base_pointer;
  double *c_pointer = c;
  __m256d yc1,yc2,yc3,yc4,ya1,yb1,yb2;
  __m128d xc1,xc2,xa1,xb1,xb2;
  double sc1,sa1,sb1;
  BLASLONG m_count,n_count,k_count;
  b_base_pointer = packed_b;
  //now start calculation of the edge part
  for(n_count=edge_n;n_count>3;n_count-=4){
    a_block_pointer = packed_a;
    for(m_count=m;m_count>3;m_count-=4){
      b_block_pointer = b_base_pointer;
      INIT_m4n4
      for(k_count=0;k_count<k;k_count++) KERNEL_m4n4k1
      SAVE_m4n4
    }
    for(;m_count>1;m_count-=2){
      b_block_pointer = b_base_pointer;
      INIT_m2n4
      for(k_count=0;k_count<k;k_count++) KERNEL_m2n4k1
      SAVE_m2n4
    }
    if(m_count>0){
      b_block_pointer = b_base_pointer;
      INIT_m1n4
      for(k_count=0;k_count<k;k_count++) KERNEL_m1n4k1
      SAVE_m1n4
    }
    b_base_pointer += 4*k;
    c_pointer += 4 * LDC - m;
  }
  for(;n_count>1;n_count-=2){
    a_block_pointer = packed_a;
    for(m_count=m;m_count>3;m_count-=4){
      b_block_pointer = b_base_pointer;
      INIT_m4n2
      for(k_count=0;k_count<k;k_count++) KERNEL_m4n2k1
      SAVE_m4n2
    }
    for(;m_count>1;m_count-=2){
      b_block_pointer = b_base_pointer;
      INIT_m2n2
      for(k_count=0;k_count<k;k_count++) KERNEL_m2n2k1
      SAVE_m2n2
    }
    if(m_count>0){
      b_block_pointer = b_base_pointer;
      INIT_m1n2
      for(k_count=0;k_count<k;k_count++) KERNEL_m1n2k1
      SAVE_m1n2
    }
    b_base_pointer += 2*k;
    c_pointer += 2 * LDC - m;
  }
  if(n_count>0){
    a_block_pointer = packed_a;
    for(m_count=m;m_count>3;m_count-=4){
      b_block_pointer = b_base_pointer;
      INIT_m4n1
      for(k_count=0;k_count<k;k_count++) KERNEL_m4n1k1
      SAVE_m4n1
    }
    for(;m_count>1;m_count-=2){
      b_block_pointer = b_base_pointer;
      INIT_m2n1
      for(k_count=0;k_count<k;k_count++) KERNEL_m2n1k1
      SAVE_m2n1
    }
    if(m_count>0){
      b_block_pointer = b_base_pointer;
      INIT_m1n1
      for(k_count=0;k_count<k;k_count++) KERNEL_m1n1k1
      SAVE_m1n1
    }
  }
}
int __attribute__ ((noinline)) CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, double * __restrict__ B, double * __restrict__ C, BLASLONG ldc){
  if(m==0 || n==0 || k==0 || alpha == 0.0) return 0;
  BLASLONG ndiv8 = n/8; double ALPHA = alpha;
  double *packed_a = A;
  if(ndiv8>0) KERNEL_MAIN(packed_a,B,m,ndiv8,k,ldc,C,&ALPHA);
  if(n>ndiv8*8) KERNEL_EDGE(packed_a,B+(int64_t)k*(int64_t)ndiv8*8,m,n-ndiv8*8,k,ldc,C+(int64_t)ldc*(int64_t)ndiv8*8,&ALPHA);
  return 0;
}