
dgemm_kernel_8x8_skylakex.c

#include "common.h"
#include <stdint.h>
#include <immintrin.h>
#define ICOPY_4
//register usage: zmm3 for alpha, zmm4-zmm7 for temporary use, zmm8-zmm31 for accumulators.
/* row-major c_block */
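/* operand map used by all the asm blocks below (bound by the COMPUTE_n* constraint lists):
   %0 = a_block_pointer, %1 = packed_b_pointer, %2 = K, %3 = c_pointer, %4 = ldc_in_bytes,
   %5 = k02, %6 = k03, %7 = k01, %8 = M, %9 = alpha pointer;
   r12 = 64*K (byte size of one packed 8-column panel of B), r13 = saved K, r14 = saved M. */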
#define INNER_KERNEL_k1m1n8 \
"prefetcht0 384(%1);"\
"prefetcht0 768(%0); vmovupd (%1),%%zmm5; addq $64,%1;"\
"vbroadcastsd (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8;"
#define INNER_KERNEL_k1m2n8 \
INNER_KERNEL_k1m1n8\
"vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm9;"
#define INNER_KERNEL_k1m4n8 \
INNER_KERNEL_k1m2n8\
"vbroadcastsd 16(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm10;"\
"vbroadcastsd 24(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm11;"
#define INNER_KERNEL_k1m8n8 \
INNER_KERNEL_k1m4n8\
"vbroadcastsd 32(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm12;"\
"vbroadcastsd 40(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm13;"\
"vbroadcastsd 48(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm14;"\
"vbroadcastsd 56(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm15;"
#define INNER_KERNEL_k1m1n16 \
"prefetcht0 128(%1); prefetcht0 128(%1,%%r12,1);"\
"prefetcht0 768(%0); vmovupd (%1),%%zmm5; vmovupd (%1,%%r12,1),%%zmm6; addq $64,%1;"\
"vbroadcastsd (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8; vfmadd231pd %%zmm6,%%zmm4,%%zmm9;"
#define INNER_KERNEL_k1m2n16 \
INNER_KERNEL_k1m1n16\
"vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm10;vfmadd231pd %%zmm6,%%zmm4,%%zmm11;"
#define INNER_KERNEL_k1m4n16 \
INNER_KERNEL_k1m2n16\
"vbroadcastsd 16(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm12;vfmadd231pd %%zmm6,%%zmm4,%%zmm13;"\
"vbroadcastsd 24(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm14;vfmadd231pd %%zmm6,%%zmm4,%%zmm15;"
#define INNER_KERNEL_k1m8n16 \
INNER_KERNEL_k1m4n16\
"vbroadcastsd 32(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm16;vfmadd231pd %%zmm6,%%zmm4,%%zmm17;"\
"vbroadcastsd 40(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm18;vfmadd231pd %%zmm6,%%zmm4,%%zmm19;"\
"vbroadcastsd 48(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm20;vfmadd231pd %%zmm6,%%zmm4,%%zmm21;"\
"vbroadcastsd 56(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm22;vfmadd231pd %%zmm6,%%zmm4,%%zmm23;"
#define INNER_KERNEL_k1m1n24 \
"prefetcht0 128(%1); prefetcht0 128(%1,%%r12,1); prefetcht0 128(%1,%%r12,2);"\
"prefetcht0 768(%0); vmovupd (%1),%%zmm5; vmovupd (%1,%%r12,1),%%zmm6; vmovupd (%1,%%r12,2),%%zmm7; addq $64,%1;"\
"vbroadcastsd (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8; vfmadd231pd %%zmm6,%%zmm4,%%zmm9; vfmadd231pd %%zmm7,%%zmm4,%%zmm10;"
#define INNER_KERNEL_k1m2n24 \
INNER_KERNEL_k1m1n24\
"vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm11;vfmadd231pd %%zmm6,%%zmm4,%%zmm12;vfmadd231pd %%zmm7,%%zmm4,%%zmm13;"
#define INNER_KERNEL_k1m4n24 \
INNER_KERNEL_k1m2n24\
"vbroadcastsd 16(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm14;vfmadd231pd %%zmm6,%%zmm4,%%zmm15;vfmadd231pd %%zmm7,%%zmm4,%%zmm16;"\
"vbroadcastsd 24(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm17;vfmadd231pd %%zmm6,%%zmm4,%%zmm18;vfmadd231pd %%zmm7,%%zmm4,%%zmm19;"
#define INNER_KERNEL_k1m8n24 \
INNER_KERNEL_k1m4n24\
"vbroadcastsd 32(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm20;vfmadd231pd %%zmm6,%%zmm4,%%zmm21;vfmadd231pd %%zmm7,%%zmm4,%%zmm22;"\
"vbroadcastsd 40(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm23;vfmadd231pd %%zmm6,%%zmm4,%%zmm24;vfmadd231pd %%zmm7,%%zmm4,%%zmm25;"\
"vbroadcastsd 48(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm26;vfmadd231pd %%zmm6,%%zmm4,%%zmm27;vfmadd231pd %%zmm7,%%zmm4,%%zmm28;"\
"vbroadcastsd 56(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm29;vfmadd231pd %%zmm6,%%zmm4,%%zmm30;vfmadd231pd %%zmm7,%%zmm4,%%zmm31;"
#define INNER_KERNELm1(nn) \
"cmpq $1,%2;jb "#nn"3f;"\
#nn"4:\n\t"\
INNER_KERNEL_k1m1n##nn "addq $8,%0;"\
"decq %2;cmpq $1,%2;jnb "#nn"4b;"\
#nn"3:\n\t"
#define INNER_KERNELm2(nn) \
"cmpq $1,%2;jb "#nn"0f;"\
#nn"1:\n\t"\
INNER_KERNEL_k1m2n##nn "addq $16,%0;"\
"decq %2;cmpq $1,%2;jnb "#nn"1b;"\
#nn"0:\n\t"
#define INNER_KERNELm4(nn) \
"cmpq $1,%2;jb "#nn"00f;"\
#nn"01:\n\t"\
INNER_KERNEL_k1m4n##nn "addq $32,%0;"\
"decq %2;cmpq $1,%2;jnb "#nn"01b;"\
#nn"00:\n\t"
#define INNER_KERNELm8(nn) \
"cmpq $8,%2;jb "#nn"001f;"\
#nn"008:\n\t"\
INNER_KERNEL_k1m8n##nn "addq $64,%0;"\
INNER_KERNEL_k1m8n##nn "addq $64,%0;"\
INNER_KERNEL_k1m8n##nn "addq $64,%0;"\
INNER_KERNEL_k1m8n##nn "addq $64,%0;"\
INNER_KERNEL_k1m8n##nn "addq $64,%0;"\
INNER_KERNEL_k1m8n##nn "addq $64,%0;"\
INNER_KERNEL_k1m8n##nn "addq $64,%0;"\
INNER_KERNEL_k1m8n##nn "addq $64,%0;"\
"subq $8,%2;cmpq $8,%2;jnb "#nn"008b;"\
#nn"001:\n\t"\
"cmpq $1,%2;jb "#nn"000f;"\
INNER_KERNEL_k1m8n##nn "addq $64,%0;"\
"decq %2;cmpq $1,%2;jnb "#nn"001b;"\
""#nn"000:\n\t"
#define INNER_INIT_m1n8 \
"vpxorq %%zmm8, %%zmm8, %%zmm8;"
#define INNER_INIT_m2n8 \
"vpxorq %%zmm8, %%zmm8, %%zmm8; vpxorq %%zmm9, %%zmm9, %%zmm9;"
#define INNER_INIT_m4n8 \
"vpxorq %%zmm8, %%zmm8, %%zmm8; vpxorq %%zmm9, %%zmm9, %%zmm9; vpxorq %%zmm10,%%zmm10,%%zmm10;vpxorq %%zmm11,%%zmm11,%%zmm11;"
#define INNER_INIT_m8n8 \
INNER_INIT_m4n8\
"vpxorq %%zmm12,%%zmm12,%%zmm12;vpxorq %%zmm13,%%zmm13,%%zmm13;vpxorq %%zmm14,%%zmm14,%%zmm14;vpxorq %%zmm15,%%zmm15,%%zmm15;"
#define INNER_INIT_m1n16 INNER_INIT_m2n8
#define INNER_INIT_m2n16 INNER_INIT_m4n8
#define INNER_INIT_m4n16 INNER_INIT_m8n8
#define INNER_INIT_m8n16 \
INNER_INIT_m8n8\
"vpxorq %%zmm16,%%zmm16,%%zmm16;vpxorq %%zmm17,%%zmm17,%%zmm17;vpxorq %%zmm18,%%zmm18,%%zmm18;vpxorq %%zmm19,%%zmm19,%%zmm19;"\
"vpxorq %%zmm20,%%zmm20,%%zmm20;vpxorq %%zmm21,%%zmm21,%%zmm21;vpxorq %%zmm22,%%zmm22,%%zmm22;vpxorq %%zmm23,%%zmm23,%%zmm23;"
#define INNER_INIT_m1n24 \
"vpxorq %%zmm8, %%zmm8, %%zmm8; vpxorq %%zmm9, %%zmm9, %%zmm9; vpxorq %%zmm10,%%zmm10,%%zmm10;"
#define INNER_INIT_m2n24 \
INNER_INIT_m1n24\
"vpxorq %%zmm11,%%zmm11,%%zmm11; vpxorq %%zmm12,%%zmm12,%%zmm12; vpxorq %%zmm13,%%zmm13,%%zmm13;"
#define INNER_INIT_m4n24 \
INNER_INIT_m4n16\
"vpxorq %%zmm16,%%zmm16,%%zmm16;vpxorq %%zmm17,%%zmm17,%%zmm17;vpxorq %%zmm18,%%zmm18,%%zmm18;vpxorq %%zmm19,%%zmm19,%%zmm19;"
#define INNER_INIT_m8n24 \
INNER_INIT_m8n16\
"vpxorq %%zmm24,%%zmm24,%%zmm24;vpxorq %%zmm25,%%zmm25,%%zmm25;vpxorq %%zmm26,%%zmm26,%%zmm26;vpxorq %%zmm27,%%zmm27,%%zmm27;"\
"vpxorq %%zmm28,%%zmm28,%%zmm28;vpxorq %%zmm29,%%zmm29,%%zmm29;vpxorq %%zmm30,%%zmm30,%%zmm30;vpxorq %%zmm31,%%zmm31,%%zmm31;"
#define INNER_SETINDEX \
"vpinsrq $0,%4,%%xmm4,%%xmm4; vbroadcastsd %%xmm4,%%zmm4;"\
"kxnorw %%k1,%%k1,%%k1; kshiftlw $1,%%k1,%%k1; vpxorq %%zmm6,%%zmm6,%%zmm6; vmovapd %%zmm4,%%zmm6%{%%k1%};"\
"kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\
"kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\
"kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\
"kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\
"kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\
"kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"
#define INNER_STORE_m1n8(c1,disp) \
"kxnorw %%k1,%%k1,%%k1;"\
"vgatherqpd "#disp"(%3,%%zmm6,1), %%zmm7 %{%%k1%};"\
"vfmadd132pd %%zmm3,%%zmm7,"#c1";"\
"kxnorw %%k1,%%k1,%%k1;"\
"vscatterqpd "#c1", "#disp"(%3,%%zmm6,1) %{%%k1%};"
#define INNER_SAVE_m1n8 \
INNER_SETINDEX\
INNER_STORE_m1n8(%%zmm8,0)
#define INNER_SAVE_m1n16 \
INNER_SAVE_m1n8\
"leaq (%3,%4,8),%3;"\
INNER_STORE_m1n8(%%zmm9,0)
#define INNER_SAVE_m1n24 \
INNER_SAVE_m1n16\
"leaq (%3,%4,8),%3;"\
INNER_STORE_m1n8(%%zmm10,0)
#define INNER_SAVE_m2n8 \
INNER_SETINDEX\
INNER_STORE_m1n8(%%zmm8,0)\
INNER_STORE_m1n8(%%zmm9,8)
#define INNER_SAVE_m2n16 \
INNER_SETINDEX\
INNER_STORE_m1n8(%%zmm8,0)\
INNER_STORE_m1n8(%%zmm10,8)\
"leaq (%3,%4,8),%3;"\
INNER_STORE_m1n8(%%zmm9,0)\
INNER_STORE_m1n8(%%zmm11,8)
#define INNER_SAVE_m2n24 \
INNER_SETINDEX\
INNER_STORE_m1n8(%%zmm8,0)\
INNER_STORE_m1n8(%%zmm11,8)\
"leaq (%3,%4,8),%3;"\
INNER_STORE_m1n8(%%zmm9,0)\
INNER_STORE_m1n8(%%zmm12,8)\
"leaq (%3,%4,8),%3;"\
INNER_STORE_m1n8(%%zmm10,0)\
INNER_STORE_m1n8(%%zmm13,8)
#define INNER_PREF_8x8 \
"prefetcht0 (%3); prefetcht0 56(%3); prefetcht0 (%3,%4,1); prefetcht0 56(%3,%4,1); prefetcht0 (%3,%4,2); prefetcht0 56(%3,%4,2);"\
"prefetcht0 (%3,%4,4); prefetcht0 56(%3,%4,4); leaq (%3,%4,2),%3;"\
"prefetcht0 (%3,%4,1); prefetcht0 56(%3,%4,1); prefetcht0 (%3,%4,4); prefetcht0 56(%3,%4,4); leaq (%3,%4,1),%3;"\
"prefetcht0 (%3,%4,2); prefetcht0 56(%3,%4,2); prefetcht0 (%3,%4,4); prefetcht0 56(%3,%4,4);"\
"subq %4,%3; subq %4,%3; subq %4,%3;"
#define INNER_TRANS_4x8(c1,c2,c3,c4) \
"vunpcklpd "#c2","#c1",%%zmm4;vunpckhpd "#c2","#c1",%%zmm5;vunpcklpd "#c4","#c3",%%zmm6;vunpckhpd "#c4","#c3",%%zmm7;"\
"vblendmpd %%zmm6,%%zmm4,"#c1"%{%6%};vblendmpd %%zmm7,%%zmm5,"#c3"%{%6%};"\
"vshuff64x2 $0xb1,"#c1","#c1","#c1";vshuff64x2 $0xb1,"#c3","#c3","#c3";"\
"vblendmpd %%zmm4,"#c1",%%zmm4%{%6%};vblendmpd %%zmm5,"#c3","#c2"%{%6%};"\
"vblendmpd "#c1",%%zmm6,%%zmm6%{%6%};vblendmpd "#c3",%%zmm7,"#c4"%{%6%};"\
"vmovapd %%zmm4,"#c1"; vmovapd %%zmm6,"#c3";"
#define INNER_TRANS_8x8(c1,c2,c3,c4,c5,c6,c7,c8) \
INNER_TRANS_4x8(c1,c2,c3,c4)\
INNER_TRANS_4x8(c5,c6,c7,c8)\
"vblendmpd "#c5","#c1",%%zmm4%{%5%};vshuff64x2 $0x4e,%%zmm4,%%zmm4,%%zmm4;"\
"vblendmpd "#c1",%%zmm4,"#c1"%{%5%};vblendmpd %%zmm4,"#c5","#c5"%{%5%};"\
"vblendmpd "#c6","#c2",%%zmm5%{%5%};vshuff64x2 $0x4e,%%zmm5,%%zmm5,%%zmm5;"\
"vblendmpd "#c2",%%zmm5,"#c2"%{%5%};vblendmpd %%zmm5,"#c6","#c6"%{%5%};"\
"vblendmpd "#c7","#c3",%%zmm6%{%5%};vshuff64x2 $0x4e,%%zmm6,%%zmm6,%%zmm6;"\
"vblendmpd "#c3",%%zmm6,"#c3"%{%5%};vblendmpd %%zmm6,"#c7","#c7"%{%5%};"\
"vblendmpd "#c8","#c4",%%zmm7%{%5%};vshuff64x2 $0x4e,%%zmm7,%%zmm7,%%zmm7;"\
"vblendmpd "#c4",%%zmm7,"#c4"%{%5%};vblendmpd %%zmm7,"#c8","#c8"%{%5%};"
//%7 for k01(input) only when m=4
#define INNER_STORE_4x8(c1,c2,c3,c4) \
"vmovupd (%3),%%zmm4%{%5%};vmovupd -32(%3,%4,4),%%zmm4%{%7%};vfmadd132pd %%zmm3,%%zmm4,"#c1";"\
"vmovupd "#c1",(%3)%{%5%}; vmovupd "#c1",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\
"vmovupd (%3),%%zmm5%{%5%};vmovupd -32(%3,%4,4),%%zmm5%{%7%};vfmadd132pd %%zmm3,%%zmm5,"#c2";"\
"vmovupd "#c2",(%3)%{%5%}; vmovupd "#c2",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\
"vmovupd (%3),%%zmm6%{%5%};vmovupd -32(%3,%4,4),%%zmm6%{%7%};vfmadd132pd %%zmm3,%%zmm6,"#c3";"\
"vmovupd "#c3",(%3)%{%5%}; vmovupd "#c3",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\
"vmovupd (%3),%%zmm7%{%5%};vmovupd -32(%3,%4,4),%%zmm7%{%7%};vfmadd132pd %%zmm3,%%zmm7,"#c4";"\
"vmovupd "#c4",(%3)%{%5%}; vmovupd "#c4",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\
"leaq (%3,%4,4),%3;"
#define INNER_STORE_8x8(c1,c2,c3,c4,c5,c6,c7,c8) \
"prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\
"vfmadd213pd (%3),%%zmm3,"#c1"; vmovupd "#c1",(%3); vfmadd213pd (%3,%4,1),%%zmm3,"#c2"; vmovupd "#c2",(%3,%4,1); leaq (%3,%4,2),%3;"\
"prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\
"vfmadd213pd (%3),%%zmm3,"#c3"; vmovupd "#c3",(%3); vfmadd213pd (%3,%4,1),%%zmm3,"#c4"; vmovupd "#c4",(%3,%4,1); leaq (%3,%4,2),%3;"\
"prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\
"vfmadd213pd (%3),%%zmm3,"#c5"; vmovupd "#c5",(%3); vfmadd213pd (%3,%4,1),%%zmm3,"#c6"; vmovupd "#c6",(%3,%4,1); leaq (%3,%4,2),%3;"\
"prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\
"vfmadd213pd (%3),%%zmm3,"#c7"; vmovupd "#c7",(%3); vfmadd213pd (%3,%4,1),%%zmm3,"#c8"; vmovupd "#c8",(%3,%4,1); leaq (%3,%4,2),%3;"
#define INNER_SAVE_m4n8 \
INNER_TRANS_4x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11)\
INNER_STORE_4x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11)
#define INNER_SAVE_m4n16 \
INNER_TRANS_4x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14)\
INNER_STORE_4x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14)\
INNER_TRANS_4x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15)\
INNER_STORE_4x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15)
#define INNER_SAVE_m4n24 \
INNER_TRANS_4x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17)\
INNER_STORE_4x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17)\
INNER_TRANS_4x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18)\
INNER_STORE_4x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18)\
INNER_TRANS_4x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19)\
INNER_STORE_4x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19)
#define INNER_SAVE_m8n8 \
INNER_PREF_8x8\
INNER_TRANS_8x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11,%%zmm12,%%zmm13,%%zmm14,%%zmm15)\
INNER_STORE_8x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11,%%zmm12,%%zmm13,%%zmm14,%%zmm15)
#define INNER_SAVE_m8n16 \
INNER_PREF_8x8\
INNER_TRANS_8x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14,%%zmm16,%%zmm18,%%zmm20,%%zmm22)\
INNER_STORE_8x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14,%%zmm16,%%zmm18,%%zmm20,%%zmm22)\
INNER_PREF_8x8\
INNER_TRANS_8x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15,%%zmm17,%%zmm19,%%zmm21,%%zmm23)\
INNER_STORE_8x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15,%%zmm17,%%zmm19,%%zmm21,%%zmm23)
#define INNER_SAVE_m8n24 \
INNER_PREF_8x8\
INNER_TRANS_8x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17,%%zmm20,%%zmm23,%%zmm26,%%zmm29)\
INNER_STORE_8x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17,%%zmm20,%%zmm23,%%zmm26,%%zmm29)\
INNER_PREF_8x8\
INNER_TRANS_8x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18,%%zmm21,%%zmm24,%%zmm27,%%zmm30)\
INNER_STORE_8x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18,%%zmm21,%%zmm24,%%zmm27,%%zmm30)\
INNER_PREF_8x8\
INNER_TRANS_8x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19,%%zmm22,%%zmm25,%%zmm28,%%zmm31)\
INNER_STORE_8x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19,%%zmm22,%%zmm25,%%zmm28,%%zmm31)
#define COMPUTE_n8 {\
__asm__ __volatile__(\
"vbroadcastsd (%9),%%zmm3;"\
"movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $6,%%r12;"\
"cmpq $8,%8; jb 42222f;"\
"42221:\n\t"\
INNER_INIT_m8n8\
INNER_KERNELm8(8)\
INNER_SAVE_m8n8\
"movq %%r13,%2; subq %%r12,%1;"\
"shlq $3,%4;subq %4,%3;shrq $3,%4;addq $64,%3;"\
"subq $8,%8; cmpq $8,%8; jnb 42221b;"\
"42222:\n\t"\
"cmpq $4,%8; jb 42223f;"\
INNER_INIT_m4n8\
INNER_KERNELm4(8)\
INNER_SAVE_m4n8\
"movq %%r13,%2; subq %%r12,%1;"\
"shlq $3,%4;subq %4,%3;shrq $3,%4;addq $32,%3;"\
"subq $4,%8;"\
"42223:\n\t"\
"cmpq $2,%8; jb 42224f;"\
INNER_INIT_m2n8\
INNER_KERNELm2(8)\
INNER_SAVE_m2n8\
"movq %%r13,%2; subq %%r12,%1;"\
"addq $16,%3;"\
"subq $2,%8;"\
"42224:\n\t"\
"cmpq $1,%8; jb 42225f;"\
INNER_INIT_m1n8\
INNER_KERNELm1(8)\
INNER_SAVE_m1n8\
"movq %%r13,%2; subq %%r12,%1;"\
"addq $8,%3;"\
"42225:\n\t"\
"movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\
"shlq $3,%4;addq %4,%3;shrq $3,%4;"\
:"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M),"+r"(alpha)\
::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory","k1","r12","r13","r14");\
a_block_pointer -= M * K;\
}
#define COMPUTE_n16 {\
__asm__ __volatile__(\
"vbroadcastsd (%9),%%zmm3;"\
"movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $6,%%r12;"\
"cmpq $8,%8; jb 32222f;"\
"32221:\n\t"\
INNER_INIT_m8n16\
INNER_KERNELm8(16)\
INNER_SAVE_m8n16\
"movq %%r13,%2; subq %%r12,%1;"\
"shlq $4,%4;subq %4,%3;shrq $4,%4;addq $64,%3;"\
"subq $8,%8; cmpq $8,%8; jnb 32221b;"\
"32222:\n\t"\
"cmpq $4,%8; jb 32223f;"\
INNER_INIT_m4n16\
INNER_KERNELm4(16)\
INNER_SAVE_m4n16\
"movq %%r13,%2; subq %%r12,%1;"\
"shlq $4,%4;subq %4,%3;shrq $4,%4;addq $32,%3;"\
"subq $4,%8;"\
"32223:\n\t"\
"cmpq $2,%8; jb 32224f;"\
INNER_INIT_m2n16\
INNER_KERNELm2(16)\
INNER_SAVE_m2n16\
"movq %%r13,%2; subq %%r12,%1;"\
"shlq $3,%4;subq %4,%3;shrq $3,%4;addq $16,%3;"\
"subq $2,%8;"\
"32224:\n\t"\
"cmpq $1,%8; jb 32225f;"\
INNER_INIT_m1n16\
INNER_KERNELm1(16)\
INNER_SAVE_m1n16\
"movq %%r13,%2; subq %%r12,%1;"\
"shlq $3,%4;subq %4,%3;shrq $3,%4;addq $8,%3;"\
"32225:\n\t"\
"movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\
"shlq $4,%4;addq %4,%3;shrq $4,%4;"\
"leaq (%1,%%r12,2),%1;"\
:"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M),"+r"(alpha)\
::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17",\
"zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","cc","memory","k1","r12","r13","r14");\
a_block_pointer -= M * K;\
}
#define COMPUTE_n24 {\
__asm__ __volatile__(\
"vbroadcastsd (%9),%%zmm3;"\
"movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $6,%%r12;"\
"cmpq $8,%8; jb 22222f;"\
"22221:\n\t"\
INNER_INIT_m8n24\
INNER_KERNELm8(24)\
INNER_SAVE_m8n24\
"movq %%r13,%2; subq %%r12,%1;"\
"shlq $3,%4;subq %4,%3;shlq $1,%4;subq %4,%3;shrq $4,%4;addq $64,%3;"\
"subq $8,%8; cmpq $8,%8; jnb 22221b;"\
"22222:\n\t"\
"cmpq $4,%8; jb 22223f;"\
INNER_INIT_m4n24\
INNER_KERNELm4(24)\
INNER_SAVE_m4n24\
"movq %%r13,%2; subq %%r12,%1;"\
"shlq $3,%4;subq %4,%3;shlq $1,%4;subq %4,%3;shrq $4,%4;addq $32,%3;"\
"subq $4,%8;"\
"22223:\n\t"\
"cmpq $2,%8; jb 22224f;"\
INNER_INIT_m2n24\
INNER_KERNELm2(24)\
INNER_SAVE_m2n24\
"movq %%r13,%2; subq %%r12,%1;"\
"shlq $4,%4;subq %4,%3;shrq $4,%4;addq $16,%3;"\
"subq $2,%8;"\
"22224:\n\t"\
"cmpq $1,%8; jb 22225f;"\
INNER_INIT_m1n24\
INNER_KERNELm1(24)\
INNER_SAVE_m1n24\
"movq %%r13,%2; subq %%r12,%1;"\
"shlq $4,%4;subq %4,%3;shrq $4,%4;addq $8,%3;"\
"22225:\n\t"\
"movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\
"shlq $3,%4;addq %4,%3;shlq $1,%4;addq %4,%3;shrq $4,%4;"\
"leaq (%1,%%r12,2),%1; addq %%r12,%1;"\
:"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M),"+r"(alpha)\
::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17","zmm18","zmm19",\
"zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31","cc","memory","k1","r12","r13","r14");\
a_block_pointer -= M * K;\
}
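/* COMPUTE_n8/n16/n24 each process one strip of 8/16/24 columns of C over the whole M,
   tiling m by 8, then 4, 2 and 1. After every tile K is restored from r13, the packed B
   pointer is rewound by the 64*K bytes the k loop consumed, and the C pointer is moved
   to the next tile; at the end M is restored from r14, C is moved to the next strip, and
   (for n16/n24) the B pointer is advanced past the panels just used. */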
static void KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG ndiv8, BLASLONG k, BLASLONG LDC, double *c,double *alpha){//icopy=8,ocopy=8
//perform C += alpha * A<pack> * B<pack>
  if(k==0 || m==0 || ndiv8==0) return;
  int64_t ldc_in_bytes = (int64_t)LDC * sizeof(double);
  int64_t K = (int64_t)k; int64_t M = (int64_t)m;
  double *a_block_pointer;
  double *c_pointer = c;
  __mmask16 k01 = 0x00f0,k02 = 0x000f,k03 = 0x0033;
  BLASLONG m_count,ndiv8_count,k_count;
  double *packed_b_pointer = packed_b;
  a_block_pointer = packed_a;
  for(ndiv8_count=ndiv8;ndiv8_count>2;ndiv8_count-=3){
    COMPUTE_n24
  }
  for(;ndiv8_count>1;ndiv8_count-=2){
    COMPUTE_n16
  }
  if(ndiv8_count>0){
    COMPUTE_n8
  }
}
/* __m512d accumulators: zc1-zc4; temporary variables: za1,zb1-zb2 */
/* __m256d accumulators: yc1-yc4; temporary variables: ya1,yb1-yb2 */
/* __m128d accumulators: xc1-xc4; temporary variables: xa1,xb1-xb2 */
/* double accumulator: sc1; temporary variables: sa1,sb1 */
/* column-major c_block */
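/* the edge path below handles the n%8 leftover columns with intrinsics plus short inline-asm
   snippets; these kernels accumulate column-major C sub-blocks and store them directly, while
   the small group after the later "row-major c_block" comment accumulates rows and shuffles
   before the store. */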
#define KERNEL_m8n4k1 {\
__asm__ __volatile__(\
"vmovupd (%0),%2; addq $64,%0;"\
"vbroadcastsd (%1),%3; vfmadd231pd %2,%3,%5; "\
"vbroadcastsd 8(%1),%4; vfmadd231pd %2,%4,%6; "\
"vbroadcastsd 16(%1),%3; vfmadd231pd %2,%3,%7; "\
"vbroadcastsd 24(%1),%4; vfmadd231pd %2,%4,%8; "\
"addq $32,%1;"\
:"+r"(a_block_pointer),"+r"(b_block_pointer),"+v"(za1),"+v"(zb1),"+v"(zb2),"+v"(zc1),"+v"(zc2),"+v"(zc3),"+v"(zc4)::"cc","memory");\
}
#define KERNEL_m8n2k1 {\
__asm__ __volatile__(\
"vmovupd (%0),%2; addq $64,%0;"\
"vbroadcastsd (%1),%3; vfmadd231pd %2,%3,%5; "\
"vbroadcastsd 8(%1),%4; vfmadd231pd %2,%4,%6; "\
"addq $16,%1;"\
:"+r"(a_block_pointer),"+r"(b_block_pointer),"+v"(za1),"+v"(zb1),"+v"(zb2),"+v"(zc1),"+v"(zc2)::"cc","memory");\
}
#define KERNEL_m8n1k1 {\
__asm__ __volatile__(\
"vmovupd (%0),%2; addq $64,%0;"\
"vbroadcastsd (%1),%3; vfmadd231pd %2,%3,%4; "\
"addq $8,%1;"\
:"+r"(a_block_pointer),"+r"(b_block_pointer),"+v"(za1),"+v"(zb1),"+v"(zc1)::"cc","memory");\
}
#define INIT_m8n1 zc1=_mm512_setzero_pd();
#define INIT_m8n2 zc2=INIT_m8n1
#define INIT_m8n4 zc4=zc3=INIT_m8n2
#define SAVE_m8n1 {\
__asm__ __volatile__("vbroadcastsd (%0),%1;":"+r"(alpha),"+v"(za1)::"memory");\
zb1 = _mm512_loadu_pd(c_pointer);\
zc1 = _mm512_fmadd_pd(zc1,za1,zb1);\
_mm512_storeu_pd(c_pointer,zc1);\
c_pointer += 8;\
}
#define SAVE_m8n2 {\
__asm__ __volatile__("vbroadcastsd (%0),%1;":"+r"(alpha),"+v"(za1)::"memory");\
zb1 = _mm512_loadu_pd(c_pointer); zb2 = _mm512_loadu_pd(c_pointer+LDC);\
zc1 = _mm512_fmadd_pd(zc1,za1,zb1); zc2 = _mm512_fmadd_pd(zc2,za1,zb2);\
_mm512_storeu_pd(c_pointer,zc1); _mm512_storeu_pd(c_pointer+LDC,zc2);\
c_pointer += 8;\
}
#define SAVE_m8n4 {\
__asm__ __volatile__("vbroadcastsd (%0),%1;":"+r"(alpha),"+v"(za1)::"memory");\
zb1 = _mm512_loadu_pd(c_pointer); zb2 = _mm512_loadu_pd(c_pointer+LDC);\
zc1 = _mm512_fmadd_pd(zc1,za1,zb1); zc2 = _mm512_fmadd_pd(zc2,za1,zb2);\
_mm512_storeu_pd(c_pointer,zc1); _mm512_storeu_pd(c_pointer+LDC,zc2);\
c_pointer += LDC*2;\
zb1 = _mm512_loadu_pd(c_pointer); zb2 = _mm512_loadu_pd(c_pointer+LDC);\
zc3 = _mm512_fmadd_pd(zc3,za1,zb1); zc4 = _mm512_fmadd_pd(zc4,za1,zb2);\
_mm512_storeu_pd(c_pointer,zc3); _mm512_storeu_pd(c_pointer+LDC,zc4);\
c_pointer += 8-LDC*2;\
}
#define KERNEL_m4n4k1 {\
ya1 = _mm256_loadu_pd(a_block_pointer);a_block_pointer+=4;\
yb1 = _mm256_broadcast_sd(b_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\
yb2 = _mm256_broadcast_sd(b_block_pointer+1); yc2 = _mm256_fmadd_pd(ya1,yb2,yc2);\
yb1 = _mm256_broadcast_sd(b_block_pointer+2); yc3 = _mm256_fmadd_pd(ya1,yb1,yc3);\
yb2 = _mm256_broadcast_sd(b_block_pointer+3); yc4 = _mm256_fmadd_pd(ya1,yb2,yc4);\
b_block_pointer+=4;\
}
#define KERNEL_m4n2k1 {\
ya1 = _mm256_loadu_pd(a_block_pointer);a_block_pointer+=4;\
yb1 = _mm256_broadcast_sd(b_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\
yb2 = _mm256_broadcast_sd(b_block_pointer+1); yc2 = _mm256_fmadd_pd(ya1,yb2,yc2);\
b_block_pointer+=2;\
}
#define KERNEL_m4n1k1 {\
ya1 = _mm256_loadu_pd(a_block_pointer);a_block_pointer+=4;\
yb1 = _mm256_broadcast_sd(b_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\
b_block_pointer++;\
}
#define INIT_m4n1 yc1=_mm256_setzero_pd();
#define INIT_m4n2 yc2=INIT_m4n1
#define INIT_m4n4 yc4=yc3=INIT_m4n2
#define SAVE_m4n1 {\
yb1 = _mm256_broadcast_sd(alpha);\
ya1 = _mm256_loadu_pd(c_pointer);\
yc1 = _mm256_fmadd_pd(yc1,yb1,ya1);\
_mm256_storeu_pd(c_pointer,yc1);\
c_pointer += 4;\
}
#define SAVE_m4n2 {\
ya1 = _mm256_broadcast_sd(alpha);\
yb1 = _mm256_loadu_pd(c_pointer); yb2 = _mm256_loadu_pd(c_pointer+LDC);\
yc1 = _mm256_fmadd_pd(yc1,ya1,yb1); yc2 = _mm256_fmadd_pd(yc2,ya1,yb2);\
_mm256_storeu_pd(c_pointer,yc1); _mm256_storeu_pd(c_pointer+LDC,yc2);\
c_pointer += 4;\
}
#define SAVE_m4n4 {\
ya1 = _mm256_broadcast_sd(alpha);\
yb1 = _mm256_loadu_pd(c_pointer); yb2 = _mm256_loadu_pd(c_pointer+LDC);\
yc1 = _mm256_fmadd_pd(yc1,ya1,yb1); yc2 = _mm256_fmadd_pd(yc2,ya1,yb2);\
_mm256_storeu_pd(c_pointer,yc1); _mm256_storeu_pd(c_pointer+LDC,yc2);\
c_pointer += LDC*2;\
yb1 = _mm256_loadu_pd(c_pointer); yb2 = _mm256_loadu_pd(c_pointer+LDC);\
yc3 = _mm256_fmadd_pd(yc3,ya1,yb1); yc4 = _mm256_fmadd_pd(yc4,ya1,yb2);\
_mm256_storeu_pd(c_pointer,yc3); _mm256_storeu_pd(c_pointer+LDC,yc4);\
c_pointer += 4-LDC*2;\
}
#define KERNEL_m2n2k1 {\
xa1 = _mm_loadu_pd(a_block_pointer); a_block_pointer+=2;\
xb1 = _mm_loaddup_pd(b_block_pointer); xc1 = _mm_fmadd_pd(xa1,xb1,xc1);\
xb2 = _mm_loaddup_pd(b_block_pointer+1); xc2 = _mm_fmadd_pd(xa1,xb2,xc2);\
b_block_pointer += 2;\
}
#define KERNEL_m2n1k1 {\
xa1 = _mm_loadu_pd(a_block_pointer); a_block_pointer+=2;\
xb1 = _mm_loaddup_pd(b_block_pointer); xc1 = _mm_fmadd_pd(xa1,xb1,xc1);\
b_block_pointer ++;\
}
#define INIT_m2n1 xc1=_mm_setzero_pd();
#define INIT_m2n2 xc2=INIT_m2n1
#define SAVE_m2n1 {\
xb1 = _mm_loaddup_pd(alpha);\
xa1 = _mm_loadu_pd(c_pointer);\
xc1 = _mm_fmadd_pd(xc1,xb1,xa1);\
_mm_storeu_pd(c_pointer,xc1);\
c_pointer += 2;\
}
#define SAVE_m2n2 {\
xa1 = _mm_loaddup_pd(alpha);\
xb1 = _mm_loadu_pd(c_pointer); xb2 = _mm_loadu_pd(c_pointer+LDC);\
xc1 = _mm_fmadd_pd(xc1,xa1,xb1); xc2 = _mm_fmadd_pd(xc2,xa1,xb2);\
_mm_storeu_pd(c_pointer,xc1); _mm_storeu_pd(c_pointer+LDC,xc2);\
c_pointer += 2;\
}
#define KERNEL_m1n1k1 {\
sa1 = *a_block_pointer; a_block_pointer++;\
sb1 = *b_block_pointer; sc1 += sa1 * sb1;\
b_block_pointer ++;\
}
#define INIT_m1n1 sc1=0.0;
#define SAVE_m1n1 {\
*c_pointer += sc1 * (*alpha);\
c_pointer++;\
}
/* row-major c_block */
#define KERNEL_m2n4k1 {\
yb1 = _mm256_loadu_pd(b_block_pointer);b_block_pointer+=4;\
ya1 = _mm256_broadcast_sd(a_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\
ya1 = _mm256_broadcast_sd(a_block_pointer+1);yc2 = _mm256_fmadd_pd(ya1,yb1,yc2);\
a_block_pointer += 2;\
}
#define KERNEL_m1n4k1 {\
yb1 = _mm256_loadu_pd(b_block_pointer);b_block_pointer+=4;\
ya1 = _mm256_broadcast_sd(a_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\
a_block_pointer ++;\
}
#define KERNEL_m1n2k1 {\
xb1 = _mm_loadu_pd(b_block_pointer);b_block_pointer+=2;\
xa1 = _mm_loaddup_pd(a_block_pointer); xc1 = _mm_fmadd_pd(xa1,xb1,xc1);\
a_block_pointer ++;\
}
#define INIT_m1n2 INIT_m2n1
#define INIT_m1n4 INIT_m4n1
#define INIT_m2n4 INIT_m4n2
#define SAVE_m2n4 {\
ya1 = _mm256_broadcast_sd(alpha);\
yc1 = _mm256_mul_pd(yc1,ya1);\
yc2 = _mm256_mul_pd(yc2,ya1);\
yb1 = _mm256_unpacklo_pd(yc1,yc2);\
yb2 = _mm256_unpackhi_pd(yc1,yc2);\
xb1 = _mm_add_pd(_mm_loadu_pd(c_pointer),_mm256_extractf128_pd(yb1,0));\
xb2 = _mm_add_pd(_mm_loadu_pd(c_pointer+LDC),_mm256_extractf128_pd(yb2,0));\
_mm_storeu_pd(c_pointer,xb1);\
_mm_storeu_pd(c_pointer+LDC,xb2);\
xb1 = _mm_add_pd(_mm_loadu_pd(c_pointer+2*LDC),_mm256_extractf128_pd(yb1,1));\
xb2 = _mm_add_pd(_mm_loadu_pd(c_pointer+3*LDC),_mm256_extractf128_pd(yb2,1));\
_mm_storeu_pd(c_pointer+2*LDC,xb1);\
_mm_storeu_pd(c_pointer+3*LDC,xb2);\
c_pointer += 2;\
}
#define SAVE_m1n2 {\
xb1 = _mm_loaddup_pd(alpha);\
xc1 = _mm_mul_pd(xc1,xb1);\
*c_pointer += _mm_cvtsd_f64(xc1);\
xa1 = _mm_unpackhi_pd(xc1,xc1);\
c_pointer[LDC]+= _mm_cvtsd_f64(xa1);\
c_pointer ++;\
}
#define SAVE_m1n4 {\
ya1 = _mm256_broadcast_sd(alpha);\
yc1 = _mm256_mul_pd(yc1,ya1);\
xb1 = _mm256_extractf128_pd(yc1,0);\
*c_pointer += _mm_cvtsd_f64(xb1);\
xb2 = _mm_unpackhi_pd(xb1,xb1);\
c_pointer[LDC] += _mm_cvtsd_f64(xb2);\
xb1 = _mm256_extractf128_pd(yc1,1);\
c_pointer[LDC*2] += _mm_cvtsd_f64(xb1);\
xb2 = _mm_unpackhi_pd(xb1,xb1);\
c_pointer[LDC*3] += _mm_cvtsd_f64(xb2);\
c_pointer ++;\
}
static void KERNEL_EDGE(double *packed_a, double *packed_b, BLASLONG m, BLASLONG edge_n, BLASLONG k, BLASLONG LDC, double *c,double *alpha){//icopy=8,ocopy=8
//perform C += alpha * A<pack> * B<pack>, edge_n<8 must be satisfied.
  if(k==0 || m==0 || edge_n==0) return;
  double *a_block_pointer,*b_block_pointer,*b_base_pointer;
  double *c_pointer = c;
  __m512d zb1,zb2,za1,zc1,zc2,zc3,zc4;
  __m256d yc1,yc2,yc3,yc4,ya1,yb1,yb2;
  __m128d xc1,xc2,xa1,xb1,xb2;
  double sc1,sa1,sb1;
  BLASLONG m_count,n_count,k_count;
  b_base_pointer = packed_b;
  //now start calculation of the edge part
  for(n_count=edge_n;n_count>3;n_count-=4){
    a_block_pointer = packed_a;
    for(m_count=m;m_count>7;m_count-=8){
      b_block_pointer = b_base_pointer;
      INIT_m8n4
      for(k_count=0;k_count<k;k_count++) KERNEL_m8n4k1
      SAVE_m8n4
    }
    for(;m_count>3;m_count-=4){
      b_block_pointer = b_base_pointer;
      INIT_m4n4
      for(k_count=0;k_count<k;k_count++) KERNEL_m4n4k1
      SAVE_m4n4
    }
    for(;m_count>1;m_count-=2){
      b_block_pointer = b_base_pointer;
      INIT_m2n4
      for(k_count=0;k_count<k;k_count++) KERNEL_m2n4k1
      SAVE_m2n4
    }
    if(m_count>0){
      b_block_pointer = b_base_pointer;
      INIT_m1n4
      for(k_count=0;k_count<k;k_count++) KERNEL_m1n4k1
      SAVE_m1n4
    }
    b_base_pointer += 4*k;
    c_pointer += 4 * LDC - m;
  }
  for(;n_count>1;n_count-=2){
    a_block_pointer = packed_a;
    for(m_count=m;m_count>7;m_count-=8){
      b_block_pointer = b_base_pointer;
      INIT_m8n2
      for(k_count=0;k_count<k;k_count++) KERNEL_m8n2k1
      SAVE_m8n2
    }
    for(;m_count>3;m_count-=4){
      b_block_pointer = b_base_pointer;
      INIT_m4n2
      for(k_count=0;k_count<k;k_count++) KERNEL_m4n2k1
      SAVE_m4n2
    }
    for(;m_count>1;m_count-=2){
      b_block_pointer = b_base_pointer;
      INIT_m2n2
      for(k_count=0;k_count<k;k_count++) KERNEL_m2n2k1
      SAVE_m2n2
    }
    if(m_count>0){
      b_block_pointer = b_base_pointer;
      INIT_m1n2
      for(k_count=0;k_count<k;k_count++) KERNEL_m1n2k1
      SAVE_m1n2
    }
    b_base_pointer += 2*k;
    c_pointer += 2 * LDC - m;
  }
  if(n_count>0){
    a_block_pointer = packed_a;
    for(m_count=m;m_count>7;m_count-=8){
      b_block_pointer = b_base_pointer;
      INIT_m8n1
      for(k_count=0;k_count<k;k_count++) KERNEL_m8n1k1
      SAVE_m8n1
    }
    for(;m_count>3;m_count-=4){
      b_block_pointer = b_base_pointer;
      INIT_m4n1
      for(k_count=0;k_count<k;k_count++) KERNEL_m4n1k1
      SAVE_m4n1
    }
    for(;m_count>1;m_count-=2){
      b_block_pointer = b_base_pointer;
      INIT_m2n1
      for(k_count=0;k_count<k;k_count++) KERNEL_m2n1k1
      SAVE_m2n1
    }
    if(m_count>0){
      b_block_pointer = b_base_pointer;
      INIT_m1n1
      for(k_count=0;k_count<k;k_count++) KERNEL_m1n1k1
      SAVE_m1n1
    }
  }
}
#ifdef ICOPY_4
static void copy_4_to_8(double *src,double *dst,BLASLONG m,BLASLONG k){
  BLASLONG m_count,k_count;double *src1,*dst1,*src2;__m256d tmp;
  src1 = src; dst1 = dst; src2 = src1 + 4 * k;
  for(m_count=m;m_count>7;m_count-=8){
    for(k_count=k;k_count>0;k_count--){
      tmp = _mm256_loadu_pd(src1);_mm256_storeu_pd(dst1+0,tmp);src1+=4;
      tmp = _mm256_loadu_pd(src2);_mm256_storeu_pd(dst1+4,tmp);src2+=4;
      dst1+=8;
    }
    src1+=4*k;src2+=4*k;
  }
  for(;m_count>0;m_count--){
    for(k_count=k;k_count>0;k_count--){
      *dst1 = (*src1); src1++; dst1++;
    }
  }
}
#endif
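/* copy_4_to_8 bridges the packing formats: with ICOPY_4 the packed A buffer consists of 4-row
   panels, so pairs of panels are interleaved into the 8-row panels the main kernel expects,
   and the remaining (m%8)*k elements are copied through verbatim. */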
int __attribute__ ((noinline)) CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, double * __restrict__ B, double * __restrict__ C, BLASLONG ldc){
  if(m==0 || n==0 || k==0 || alpha == 0.0) return 0;
  BLASLONG ndiv8 = n/8;double ALPHA = alpha;
#ifdef ICOPY_4
  double *packed_a = (double *)malloc(m*k*sizeof(double));
  copy_4_to_8(A,packed_a,m,k);
#else //ICOPY_8
  double *packed_a = A;
#endif
  if(ndiv8>0) KERNEL_MAIN(packed_a,B,m,ndiv8,k,ldc,C,&ALPHA);
  if(n>ndiv8*8) KERNEL_EDGE(packed_a,B+(int64_t)k*(int64_t)ndiv8*8,m,n-ndiv8*8,k,ldc,C+(int64_t)ldc*(int64_t)ndiv8*8,&ALPHA);
#ifdef ICOPY_4
  free(packed_a);packed_a=NULL;
#endif
  return 0;
}