You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

dgemm_kernel_16x2_skylakex.c 33 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591
/* %0 = a_ptr, %1 = b_ptr, %2 = c_ptr, %3 = c_tmp, %4 = ldc(bytes), %5 = k_counter, %6 = b_pref */
/* r10 = tmp, r11 = m_counter, r12 = size_of_1_tile_in_b, r13 = k, r14 = b_head, r15 = %1+3*r12 */
/* BACKWARDS selects the TRMM traversal direction: 1 for (LEFT && !TRANSA) or
   (!LEFT && TRANSA), 0 otherwise. */
#if (defined (LEFT) && !defined(TRANSA)) || (!defined (LEFT) && defined(TRANSA))
#define BACKWARDS 1
#else
#define BACKWARDS 0
#endif
/* Rewind %1 to the head of packed B (r14) and set r15 = r14 + 3*r12, the start of
   the 4th 2-column tile of B. */
#define GEMM_SET_PB "movq %%r14,%1; leaq (%%r14,%%r12,2),%%r15; addq %%r12,%%r15;"
/* set_p_copyM(ptr): ptr += (M/2)*r12 - 8*M*r13 bytes. r12 (tile size) and r13 (k)
   are temporarily scaled with shifts and restored afterwards; used to reposition
   pointers for TRMM partial-panel traversal. */
#define set_p_copy1(ptr) "sarq $1,%%r12; addq %%r12,"#ptr"; salq $1,%%r12; salq $3,%%r13; subq %%r13,"#ptr"; sarq $3,%%r13;"
#define set_p_copy2(ptr) "addq %%r12,"#ptr"; salq $4,%%r13; subq %%r13,"#ptr"; sarq $4,%%r13;"
#define set_p_copy4(ptr) "leaq ("#ptr",%%r12,2),"#ptr"; salq $5,%%r13; subq %%r13,"#ptr"; sarq $5,%%r13;"
#define set_p_copy8(ptr) "leaq ("#ptr",%%r12,4),"#ptr"; salq $6,%%r13; subq %%r13,"#ptr"; sarq $6,%%r13;"
#define set_p_copy16(ptr) "leaq ("#ptr",%%r12,8),"#ptr"; salq $7,%%r13; subq %%r13,"#ptr"; sarq $7,%%r13;"
/* B is packed in 2-wide column tiles, so every ndim >= 2 advances like width 2. */
#define set_p_b_dim1(ptr) set_p_copy1(ptr)
#define set_p_b_dim2(ptr) set_p_copy2(ptr)
#define set_p_b_dim4(ptr) set_p_copy2(ptr)
#define set_p_b_dim6(ptr) set_p_copy2(ptr)
#define set_p_b_dim8(ptr) set_p_copy2(ptr)
#define set_p_b_dim10(ptr) set_p_copy2(ptr)
#define set_p_b_dim12(ptr) set_p_copy2(ptr)
/* TRMM only: adjust the A pointer (and, going backwards, B/r15 too) either before
   the k loop (INIT) or after the store (SAVE). Plain GEMM only resets B pointers. */
#ifdef TRMMKERNEL
#if BACKWARDS == 1
#define INIT_set_papb(mdim,ndim) GEMM_SET_PB set_p_copy##mdim(%0) set_p_b_dim##ndim(%1) set_p_b_dim##ndim(%%r15)
#define SAVE_set_pa(mdim) ""
#else
#define INIT_set_papb(mdim,ndim) GEMM_SET_PB
#define SAVE_set_pa(mdim) set_p_copy##mdim(%0)
#endif
#else
#define INIT_set_papb(mdim,ndim) GEMM_SET_PB
#define SAVE_set_pa(mdim) ""
#endif
/* TRMM with the triangle on the right (B side): the 2-column tiles of B overlap the
   k range only partially, so the k loop needs a special head (kernel_kstart_*) or
   tail (kernel_kend_*) section, and `off` must be adjusted around each n-block
   (HEAD_SET_OFF / TAIL_SET_OFF). Which side gets the extra work depends on the
   traversal direction (BACKWARDS). */
#if defined(TRMMKERNEL) && !defined(LEFT)
#if BACKWARDS == 1
#define HEAD_SET_OFF(ndim) {}
#define TAIL_SET_OFF(ndim) {off += ndim;}
/* Backwards head: for each already-complete 2-column tile run two extra k1 steps,
   advance r15 where that tile pair consumed it, and drop 2 from the k counter (%5)
   via `updk` (subq here). kstart_nX nests kstart_n(X-2). */
#define kernel_kstart_n4(mdim,updk) KERNEL_k1m##mdim##n2 KERNEL_k1m##mdim##n2 "addq $32,%%r15; "#updk" $2,%5;"
#define kernel_kstart_n6(mdim,updk) kernel_kstart_n4(mdim,updk) KERNEL_k1m##mdim##n4 KERNEL_k1m##mdim##n4 "addq $32,%%r15; "#updk" $2,%5;"
#define kernel_kstart_n8(mdim,updk) kernel_kstart_n6(mdim,updk) KERNEL_k1m##mdim##n6 KERNEL_k1m##mdim##n6 "addq $32,%%r15; "#updk" $2,%5;"
#define kernel_kstart_n10(mdim,updk) kernel_kstart_n8(mdim,updk) KERNEL_k1m##mdim##n8 KERNEL_k1m##mdim##n8 #updk" $2,%5;"
#define kernel_kstart_n12(mdim,updk) kernel_kstart_n10(mdim,updk) KERNEL_k1m##mdim##n10 KERNEL_k1m##mdim##n10 #updk" $2,%5;"
#define kernel_kend_n4(mdim) ""
#define kernel_kend_n6(mdim) ""
#define kernel_kend_n8(mdim) ""
#define kernel_kend_n10(mdim) ""
#define kernel_kend_n12(mdim) ""
#else
#define HEAD_SET_OFF(ndim) {off += (ndim > 2 ? 2 : ndim);}
#define TAIL_SET_OFF(ndim) {off += (ndim > 2 ? (ndim-2) : 0);}
#define kernel_kstart_n4(mdim,updk) ""
#define kernel_kstart_n6(mdim,updk) ""
#define kernel_kstart_n8(mdim,updk) ""
#define kernel_kstart_n10(mdim,updk) ""
#define kernel_kstart_n12(mdim,updk) ""
/* Forwards tail: after the main k loop, replay the trailing k steps for the later
   column tiles only. %3 is zeroed and then used as a running byte offset into A by
   loada_kend_k1mX; each line is one extra k step accumulating into the tiles
   (nc2..nc6) whose B columns extend past the main loop, at growing b offsets. */
#define kernel_kend_n4(mdim) "xorq %3,%3;"\
loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0)\
loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16)
#define kernel_kend_n6(mdim) "xorq %3,%3;"\
loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0) acc_kend_nc3_k1m##mdim(0)\
loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16) acc_kend_nc3_k1m##mdim(16)\
loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32)\
loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48)
#define kernel_kend_n8(mdim) "xorq %3,%3;"\
loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0) acc_kend_nc3_k1m##mdim(0) acc_kend_nc4_k1m##mdim(0)\
loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16) acc_kend_nc3_k1m##mdim(16) acc_kend_nc4_k1m##mdim(16)\
loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32) acc_kend_nc4_k1m##mdim(32)\
loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48) acc_kend_nc4_k1m##mdim(48)\
loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(64)\
loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(80)
#define kernel_kend_n10(mdim) "xorq %3,%3;"\
loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0) acc_kend_nc3_k1m##mdim(0) acc_kend_nc4_k1m##mdim(0) acc_kend_nc5_k1m##mdim(0)\
loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16) acc_kend_nc3_k1m##mdim(16) acc_kend_nc4_k1m##mdim(16) acc_kend_nc5_k1m##mdim(16)\
loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32) acc_kend_nc4_k1m##mdim(32) acc_kend_nc5_k1m##mdim(32)\
loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48) acc_kend_nc4_k1m##mdim(48) acc_kend_nc5_k1m##mdim(48)\
loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(64) acc_kend_nc5_k1m##mdim(64)\
loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(80) acc_kend_nc5_k1m##mdim(80)\
loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(96)\
loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(112)
#define kernel_kend_n12(mdim) "xorq %3,%3;"\
loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0) acc_kend_nc3_k1m##mdim(0) acc_kend_nc4_k1m##mdim(0) acc_kend_nc5_k1m##mdim(0) acc_kend_nc6_k1m##mdim(0)\
loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16) acc_kend_nc3_k1m##mdim(16) acc_kend_nc4_k1m##mdim(16) acc_kend_nc5_k1m##mdim(16) acc_kend_nc6_k1m##mdim(16)\
loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32) acc_kend_nc4_k1m##mdim(32) acc_kend_nc5_k1m##mdim(32) acc_kend_nc6_k1m##mdim(32)\
loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48) acc_kend_nc4_k1m##mdim(48) acc_kend_nc5_k1m##mdim(48) acc_kend_nc6_k1m##mdim(48)\
loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(64) acc_kend_nc5_k1m##mdim(64) acc_kend_nc6_k1m##mdim(64)\
loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(80) acc_kend_nc5_k1m##mdim(80) acc_kend_nc6_k1m##mdim(80)\
loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(96) acc_kend_nc6_k1m##mdim(96)\
loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(112) acc_kend_nc6_k1m##mdim(112)\
loada_kend_k1m##mdim acc_kend_nc6_k1m##mdim(128)\
loada_kend_k1m##mdim acc_kend_nc6_k1m##mdim(144)
#endif
#else
/* Plain GEMM (or TRMM with triangle on the left): no head/tail sections needed. */
#define HEAD_SET_OFF(ndim) {}
#define TAIL_SET_OFF(ndim) {}
#define kernel_kstart_n4(mdim,updk) ""
#define kernel_kstart_n6(mdim,updk) ""
#define kernel_kstart_n8(mdim,updk) ""
#define kernel_kstart_n10(mdim,updk) ""
#define kernel_kstart_n12(mdim,updk) ""
#define kernel_kend_n4(mdim) ""
#define kernel_kend_n6(mdim) ""
#define kernel_kend_n8(mdim) ""
#define kernel_kend_n10(mdim) ""
#define kernel_kend_n12(mdim) ""
#endif
/* n=1 and n=2 use a single B tile, so they never need head/tail sections. */
#define kernel_kstart_n1(mdim,updk) ""
#define kernel_kstart_n2(mdim,updk) ""
#define kernel_kend_n1(mdim) ""
#define kernel_kend_n2(mdim) ""
/* Load the inner k-loop trip count into r13. %9 and %10 are asm operands bound
   outside this chunk — presumably the TRMM offset (kk) and full K; TODO confirm
   against the asm statement that instantiates these macros. */
#ifdef TRMMKERNEL
#if BACKWARDS == 1
#define INITASM_SET_K "movq %10,%%r13; subq %9,%%r13;"
#else
#define INITASM_SET_K "movq %9,%%r13;"
#endif
#else
#define INITASM_SET_K "movq %10,%%r13;"
#endif
/* TRMM with the triangle on the left (A side): the effective k changes by mdim per
   m-block; bump r13 at block entry (forwards) or shrink it after the save
   (backwards). No-ops otherwise. */
#if defined(TRMMKERNEL) && defined(LEFT)
#if BACKWARDS==1
#define init_update_k(mdim) ""
#define save_update_k(mdim) "subq $"#mdim",%%r13;"
#else
#define init_update_k(mdim) "addq $"#mdim",%%r13;"
#define save_update_k(mdim) ""
#endif
#else
#define init_update_k(mdim) ""
#define save_update_k(mdim) ""
#endif
/* ==== m=16 micro-kernels: the 16xN C tile is accumulated in zmm8..zmm31, two zmm
   registers (16 doubles) per C column. ==== */
/* One k step of 16x1: load 16 doubles of A into zmm1:zmm2, FMA with broadcast b. */
#define KERNEL_h_k1m16n1 \
"vmovupd (%0),%%zmm1; vmovupd 64(%0),%%zmm2; addq $128,%0;"\
"vbroadcastsd (%1),%%zmm3; vfmadd231pd %%zmm1,%%zmm3,%%zmm8; vfmadd231pd %%zmm2,%%zmm3,%%zmm9;"
#define KERNEL_k1m16n1 KERNEL_h_k1m16n1 "addq $8,%1;"
#ifdef BROADCAST_KERNEL
/* Broadcast variant: each b element is broadcast individually, so the accumulators
   hold C columns in natural order and can be stored directly. */
#define KERNEL_h_k1m16n2 KERNEL_h_k1m16n1\
"vbroadcastsd 8(%1),%%zmm4; vfmadd231pd %%zmm1,%%zmm4,%%zmm10; vfmadd231pd %%zmm2,%%zmm4,%%zmm11;"
/* Accumulate one 16x2 tile from the b pair at byte offset boff1 off the pointer
   expression given in the variadic arguments. */
#define unit_acc_gen_m16n2(c1_no,c2_no,c3_no,c4_no,boff1,...)\
"vbroadcastsd "#boff1"("#__VA_ARGS__"),%%zmm3; vfmadd231pd %%zmm1,%%zmm3,%%zmm"#c1_no"; vfmadd231pd %%zmm2,%%zmm3,%%zmm"#c2_no";"\
"vbroadcastsd "#boff1"+8("#__VA_ARGS__"),%%zmm4; vfmadd231pd %%zmm1,%%zmm4,%%zmm"#c3_no"; vfmadd231pd %%zmm2,%%zmm4,%%zmm"#c4_no";"
#define unit_acc_m16n2(c1_no,c2_no,c3_no,c4_no,...) unit_acc_gen_m16n2(c1_no,c2_no,c3_no,c4_no,0,__VA_ARGS__)
#else
/* Non-broadcast variant: A is loaded duplicated (vmovddup) and the 2-element b pair
   is splat with vbroadcastf32x4; accumulators end up interleaved and are unpacked
   (vunpcklpd/vunpckhpd) in the save path. */
#define unit_acc_gen_m16n2(c1_no,c2_no,c3_no,c4_no,boff1,...)\
"vbroadcastf32x4 "#boff1"("#__VA_ARGS__"),%%zmm5; vfmadd231pd %%zmm1,%%zmm5,%%zmm"#c1_no"; vfmadd231pd %%zmm2,%%zmm5,%%zmm"#c2_no";"\
"vfmadd231pd %%zmm3,%%zmm5,%%zmm"#c3_no"; vfmadd231pd %%zmm4,%%zmm5,%%zmm"#c4_no";"
#define unit_acc_m16n2(c1_no,c2_no,c3_no,c4_no,...) unit_acc_gen_m16n2(c1_no,c2_no,c3_no,c4_no,0,__VA_ARGS__)
#define KERNEL_h_k1m16n2 \
"vmovddup (%0),%%zmm1; vmovddup 8(%0),%%zmm2; vmovddup 64(%0),%%zmm3; vmovddup 72(%0),%%zmm4; addq $128,%0;"\
unit_acc_m16n2(8,9,10,11,%1)
#endif
#define KERNEL_k1m16n2 KERNEL_h_k1m16n2 "addq $16,%1;"
/* n=4..12 extend n=2: column tiles 2 and 3 read B at %1+r12 and %1+2*r12, tiles
   4..6 read via r15 (+r12, +2*r12); prefetcht0 warms the next A lines. */
#define KERNEL_h_k1m16n4 KERNEL_h_k1m16n2 "prefetcht0 384(%0);" unit_acc_m16n2(12,13,14,15,%1,%%r12,1)
#define KERNEL_k1m16n4 KERNEL_h_k1m16n4 "addq $16,%1;"
#define KERNEL_k1m16n6 KERNEL_h_k1m16n4 unit_acc_m16n2(16,17,18,19,%1,%%r12,2) "addq $16,%1;"
#define KERNEL_h_k1m16n8 KERNEL_k1m16n6 "prefetcht0 448(%0);" unit_acc_m16n2(20,21,22,23,%%r15)
#define KERNEL_k1m16n8 KERNEL_h_k1m16n8 "addq $16,%%r15;"
#define KERNEL_h_k1m16n10 KERNEL_h_k1m16n8 unit_acc_m16n2(24,25,26,27,%%r15,%%r12,1)
#define KERNEL_k1m16n10 KERNEL_h_k1m16n10 "addq $16,%%r15;"
#define KERNEL_h_k1m16n12 KERNEL_h_k1m16n10 unit_acc_m16n2(28,29,30,31,%%r15,%%r12,2)
#define KERNEL_k1m16n12 KERNEL_h_k1m16n12 "addq $16,%%r15;"
/* TRMM (right, forwards) tail helpers for m=16: reload A through %3 as a running
   byte offset, then accumulate only the later column tiles (see kernel_kend_nX). */
#if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0)
#ifdef BROADCAST_KERNEL
#define loada_kend_k1m16 "vmovupd (%0,%3,1),%%zmm1; vmovupd 64(%0,%3,1),%%zmm2; addq $128,%3;"
#else
#define loada_kend_k1m16 "vmovddup (%0,%3,1),%%zmm1; vmovddup 8(%0,%3,1),%%zmm2; vmovddup 64(%0,%3,1),%%zmm3; vmovddup 72(%0,%3,1),%%zmm4; addq $128,%3;"
#endif
#define acc_kend_nc2_k1m16(boff1) unit_acc_gen_m16n2(12,13,14,15,boff1,%1,%%r12,1)
#define acc_kend_nc3_k1m16(boff1) unit_acc_gen_m16n2(16,17,18,19,boff1,%1,%%r12,2)
#define acc_kend_nc4_k1m16(boff1) unit_acc_gen_m16n2(20,21,22,23,boff1,%%r15)
#define acc_kend_nc5_k1m16(boff1) unit_acc_gen_m16n2(24,25,26,27,boff1,%%r15,%%r12,1)
#define acc_kend_nc6_k1m16(boff1) unit_acc_gen_m16n2(28,29,30,31,boff1,%%r15,%%r12,2)
#endif
/* Save path: %3 walks C two columns at a time; zmm0 is used as the alpha factor —
   presumably loaded before the kernel body, outside this chunk. */
#define save_init_m16 "movq %2,%3; addq $128,%2;"
#ifdef TRMMKERNEL
/* TRMM store: C = alpha*acc (old C is not read). */
#define SAVE_m16n1 "vmulpd %%zmm8,%%zmm0,%%zmm8; vmovupd %%zmm8,(%2); vmulpd %%zmm9,%%zmm0,%%zmm9; vmovupd %%zmm9,64(%2); addq $128,%2;"
#ifdef BROADCAST_KERNEL
#define unit_save_m16n2(c1_no,c2_no,c3_no,c4_no)\
"vmulpd %%zmm"#c1_no",%%zmm0,%%zmm"#c1_no"; vmovupd %%zmm"#c1_no",(%3); vmulpd %%zmm"#c2_no",%%zmm0,%%zmm"#c2_no"; vmovupd %%zmm"#c2_no",64(%3);"\
"vmulpd %%zmm"#c3_no",%%zmm0,%%zmm"#c3_no"; vmovupd %%zmm"#c3_no",(%3,%4,1); vmulpd %%zmm"#c4_no",%%zmm0,%%zmm"#c4_no"; vmovupd %%zmm"#c4_no",64(%3,%4,1); leaq (%3,%4,2),%3;"
#else
/* De-interleave the duplicated-A accumulators before scaling and storing. */
#define unit_save_m16n2(c1_no,c2_no,c3_no,c4_no)\
"vunpcklpd %%zmm"#c2_no",%%zmm"#c1_no",%%zmm1; vunpcklpd %%zmm"#c4_no",%%zmm"#c3_no",%%zmm2; vunpckhpd %%zmm"#c2_no",%%zmm"#c1_no",%%zmm3; vunpckhpd %%zmm"#c4_no",%%zmm"#c3_no",%%zmm4;"\
"vmulpd %%zmm1,%%zmm0,%%zmm1; vmovupd %%zmm1,(%3); vmulpd %%zmm2,%%zmm0,%%zmm2; vmovupd %%zmm2,64(%3);"\
"vmulpd %%zmm3,%%zmm0,%%zmm3; vmovupd %%zmm3,(%3,%4,1); vmulpd %%zmm4,%%zmm0,%%zmm4; vmovupd %%zmm4,64(%3,%4,1); leaq (%3,%4,2),%3;"
#endif
#else
/* GEMM store: C = alpha*acc + C via vfmadd213pd against memory. */
#define SAVE_m16n1 "vfmadd213pd (%2),%%zmm0,%%zmm8; vmovupd %%zmm8,(%2); vfmadd213pd 64(%2),%%zmm0,%%zmm9; vmovupd %%zmm9,64(%2); addq $128,%2;"
#ifdef BROADCAST_KERNEL
#define unit_save_m16n2(c1_no,c2_no,c3_no,c4_no)\
"vfmadd213pd (%3),%%zmm0,%%zmm"#c1_no"; vmovupd %%zmm"#c1_no",(%3); vfmadd213pd 64(%3),%%zmm0,%%zmm"#c2_no"; vmovupd %%zmm"#c2_no",64(%3);"\
"vfmadd213pd (%3,%4,1),%%zmm0,%%zmm"#c3_no"; vmovupd %%zmm"#c3_no",(%3,%4,1); vfmadd213pd 64(%3,%4,1),%%zmm0,%%zmm"#c4_no"; vmovupd %%zmm"#c4_no",64(%3,%4,1); leaq (%3,%4,2),%3;"
#else
#define unit_save_m16n2(c1_no,c2_no,c3_no,c4_no)\
"vunpcklpd %%zmm"#c2_no",%%zmm"#c1_no",%%zmm1; vunpcklpd %%zmm"#c4_no",%%zmm"#c3_no",%%zmm2; vunpckhpd %%zmm"#c2_no",%%zmm"#c1_no",%%zmm3; vunpckhpd %%zmm"#c4_no",%%zmm"#c3_no",%%zmm4;"\
"vfmadd213pd (%3),%%zmm0,%%zmm1; vmovupd %%zmm1,(%3); vfmadd213pd 64(%3),%%zmm0,%%zmm2; vmovupd %%zmm2,64(%3);"\
"vfmadd213pd (%3,%4,1),%%zmm0,%%zmm3; vmovupd %%zmm3,(%3,%4,1); vfmadd213pd 64(%3,%4,1),%%zmm0,%%zmm4; vmovupd %%zmm4,64(%3,%4,1); leaq (%3,%4,2),%3;"
#endif
#endif
#define SAVE_m16n2 save_init_m16 unit_save_m16n2(8,9,10,11)
#define SAVE_m16n4 SAVE_m16n2 unit_save_m16n2(12,13,14,15)
#define SAVE_m16n6 SAVE_m16n4 unit_save_m16n2(16,17,18,19)
#define SAVE_m16n8 SAVE_m16n6 unit_save_m16n2(20,21,22,23)
#define SAVE_m16n10 SAVE_m16n8 unit_save_m16n2(24,25,26,27)
#define SAVE_m16n12 SAVE_m16n10 unit_save_m16n2(28,29,30,31)
/* Zero the accumulator registers (vpxorq zmm,zmm,zmm). */
#define unit_init_2zmm(c1_no,c2_no) "vpxorq %%zmm"#c1_no",%%zmm"#c1_no",%%zmm"#c1_no"; vpxorq %%zmm"#c2_no",%%zmm"#c2_no",%%zmm"#c2_no";"
#define unit_init_4zmm(c1_no,c2_no,c3_no,c4_no) unit_init_2zmm(c1_no,c2_no) unit_init_2zmm(c3_no,c4_no)
#define INIT_m16n1 unit_init_2zmm(8,9)
#define INIT_m16n2 unit_init_4zmm(8,9,10,11)
#define INIT_m16n4 INIT_m16n2 unit_init_4zmm(12,13,14,15)
#define INIT_m16n6 INIT_m16n4 unit_init_4zmm(16,17,18,19)
#define INIT_m16n8 INIT_m16n6 unit_init_4zmm(20,21,22,23)
#define INIT_m16n10 INIT_m16n8 unit_init_4zmm(24,25,26,27)
#define INIT_m16n12 INIT_m16n10 unit_init_4zmm(28,29,30,31)
/* ==== m=8 micro-kernels: one zmm per C column (zmm8..zmm19); A is loaded with
   vmovddup and the b pair splat with vbroadcastf32x4, so save unpacks. ==== */
#define KERNEL_k1m8n1 \
"vbroadcastsd (%1),%%zmm1; addq $8,%1;"\
"vfmadd231pd (%0),%%zmm1,%%zmm8; addq $64,%0;"
/* Accumulate one 8x2 tile from the b pair at byte offset boff. */
#define unit_acc_gen_m8n2(c1_no,c2_no,boff,...)\
"vbroadcastf32x4 "#boff"("#__VA_ARGS__"),%%zmm3; vfmadd231pd %%zmm1,%%zmm3,%%zmm"#c1_no"; vfmadd231pd %%zmm2,%%zmm3,%%zmm"#c2_no";"
#define unit_acc_m8n2(c1_no,c2_no,...) unit_acc_gen_m8n2(c1_no,c2_no,0,__VA_ARGS__)
#define KERNEL_h_k1m8n2 \
"vmovddup (%0),%%zmm1; vmovddup 8(%0),%%zmm2; addq $64,%0;" unit_acc_m8n2(8,9,%1)
#define KERNEL_k1m8n2 KERNEL_h_k1m8n2 "addq $16,%1;"
/* n=4..12: same B tile addressing scheme as m=16 (%1, %1+r12, %1+2*r12, then r15). */
#define KERNEL_h_k1m8n4 KERNEL_h_k1m8n2 unit_acc_m8n2(10,11,%1,%%r12,1)
#define KERNEL_k1m8n4 KERNEL_h_k1m8n4 "addq $16,%1;"
#define KERNEL_k1m8n6 KERNEL_h_k1m8n4 unit_acc_m8n2(12,13,%1,%%r12,2) "addq $16,%1;"
#define KERNEL_h_k1m8n8 KERNEL_k1m8n6 unit_acc_m8n2(14,15,%%r15)
#define KERNEL_k1m8n8 KERNEL_h_k1m8n8 "addq $16,%%r15;"
#define KERNEL_h_k1m8n10 KERNEL_h_k1m8n8 unit_acc_m8n2(16,17,%%r15,%%r12,1)
#define KERNEL_k1m8n10 KERNEL_h_k1m8n10 "addq $16,%%r15;"
#define KERNEL_h_k1m8n12 KERNEL_h_k1m8n10 unit_acc_m8n2(18,19,%%r15,%%r12,2)
#define KERNEL_k1m8n12 KERNEL_h_k1m8n12 "addq $16,%%r15;"
/* TRMM (right, forwards) tail helpers for m=8 (see kernel_kend_nX). */
#if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0)
#define loada_kend_k1m8 "vmovddup (%0,%3,1),%%zmm1; vmovddup 8(%0,%3,1),%%zmm2; addq $64,%3;"
#define acc_kend_nc2_k1m8(boff1) unit_acc_gen_m8n2(10,11,boff1,%1,%%r12,1)
#define acc_kend_nc3_k1m8(boff1) unit_acc_gen_m8n2(12,13,boff1,%1,%%r12,2)
#define acc_kend_nc4_k1m8(boff1) unit_acc_gen_m8n2(14,15,boff1,%%r15)
#define acc_kend_nc5_k1m8(boff1) unit_acc_gen_m8n2(16,17,boff1,%%r15,%%r12,1)
#define acc_kend_nc6_k1m8(boff1) unit_acc_gen_m8n2(18,19,boff1,%%r15,%%r12,2)
#endif
/* Save path: unpack the interleaved accumulators, scale by zmm0 (alpha), store;
   TRMM overwrites C, GEMM accumulates into it. */
#define save_init_m8 "movq %2,%3; addq $64,%2;"
#ifdef TRMMKERNEL
#define SAVE_m8n1 "vmulpd %%zmm8,%%zmm0,%%zmm8; vmovupd %%zmm8,(%2); addq $64,%2;"
#define unit_save_m8n2(c1_no,c2_no)\
"vunpcklpd %%zmm"#c2_no",%%zmm"#c1_no",%%zmm1; vmulpd %%zmm1,%%zmm0,%%zmm1; vmovupd %%zmm1,(%3);"\
"vunpckhpd %%zmm"#c2_no",%%zmm"#c1_no",%%zmm2; vmulpd %%zmm2,%%zmm0,%%zmm2; vmovupd %%zmm2,(%3,%4,1); leaq (%3,%4,2),%3;"
#else
#define SAVE_m8n1 "vfmadd213pd (%2),%%zmm0,%%zmm8; vmovupd %%zmm8,(%2); addq $64,%2;"
#define unit_save_m8n2(c1_no,c2_no)\
"vunpcklpd %%zmm"#c2_no",%%zmm"#c1_no",%%zmm1; vfmadd213pd (%3),%%zmm0,%%zmm1; vmovupd %%zmm1,(%3);"\
"vunpckhpd %%zmm"#c2_no",%%zmm"#c1_no",%%zmm2; vfmadd213pd (%3,%4,1),%%zmm0,%%zmm2; vmovupd %%zmm2,(%3,%4,1); leaq (%3,%4,2),%3;"
#endif
#define SAVE_m8n2 save_init_m8 unit_save_m8n2(8,9)
#define SAVE_m8n4 SAVE_m8n2 unit_save_m8n2(10,11)
#define SAVE_m8n6 SAVE_m8n4 unit_save_m8n2(12,13)
#define SAVE_m8n8 SAVE_m8n6 unit_save_m8n2(14,15)
#define SAVE_m8n10 SAVE_m8n8 unit_save_m8n2(16,17)
#define SAVE_m8n12 SAVE_m8n10 unit_save_m8n2(18,19)
#define INIT_m8n1 "vpxorq %%zmm8,%%zmm8,%%zmm8;"
#define INIT_m8n2 unit_init_2zmm(8,9)
#define INIT_m8n4 INIT_m8n2 unit_init_2zmm(10,11)
#define INIT_m8n6 INIT_m8n4 unit_init_2zmm(12,13)
#define INIT_m8n8 INIT_m8n6 unit_init_2zmm(14,15)
#define INIT_m8n10 INIT_m8n8 unit_init_2zmm(16,17)
#define INIT_m8n12 INIT_m8n10 unit_init_2zmm(18,19)
/* ==== m=4 micro-kernels: one ymm per C column (ymm4..ymm15). ==== */
#define KERNEL_k1m4n1 \
"vbroadcastsd (%1),%%ymm1; addq $8,%1;"\
"vfmadd231pd (%0),%%ymm1,%%ymm4; addq $32,%0;"
/* Accumulate one 4x2 tile from the b pair at byte offset boff. */
#define unit_acc_gen_m4n2(c1_no,c2_no,boff,...)\
"vbroadcastf128 "#boff"("#__VA_ARGS__"),%%ymm3; vfmadd231pd %%ymm1,%%ymm3,%%ymm"#c1_no"; vfmadd231pd %%ymm2,%%ymm3,%%ymm"#c2_no";"
#define unit_acc_m4n2(c1_no,c2_no,...) unit_acc_gen_m4n2(c1_no,c2_no,0,__VA_ARGS__)
#define KERNEL_h_k1m4n2 \
"vmovddup (%0),%%ymm1; vmovddup 8(%0),%%ymm2; addq $32,%0;" unit_acc_m4n2(4,5,%1)
#define KERNEL_k1m4n2 KERNEL_h_k1m4n2 "addq $16,%1;"
#define KERNEL_h_k1m4n4 KERNEL_h_k1m4n2 unit_acc_m4n2(6,7,%1,%%r12,1)
#define KERNEL_k1m4n4 KERNEL_h_k1m4n4 "addq $16,%1;"
#define KERNEL_k1m4n6 KERNEL_h_k1m4n4 unit_acc_m4n2(8,9,%1,%%r12,2) "addq $16,%1;"
#define KERNEL_h_k1m4n8 KERNEL_k1m4n6 unit_acc_m4n2(10,11,%%r15)
#define KERNEL_k1m4n8 KERNEL_h_k1m4n8 "addq $16,%%r15;"
#define KERNEL_h_k1m4n10 KERNEL_h_k1m4n8 unit_acc_m4n2(12,13,%%r15,%%r12,1)
#define KERNEL_k1m4n10 KERNEL_h_k1m4n10 "addq $16,%%r15;"
#define KERNEL_h_k1m4n12 KERNEL_h_k1m4n10 unit_acc_m4n2(14,15,%%r15,%%r12,2)
/* Simple (non-unrolled) 4x12 step, superseded by the k2-unrolled version below. */
//#define KERNEL_k1m4n12 KERNEL_h_k1m4n12 "addq $16,%%r15;"
/* k2-unrolled helper: load 4 b values as zmm, permute via the table in zmm30 so two
   consecutive k steps are processed per FMA (accumulators widened to zmm). */
#define unit_acc_k2m4n2(c1_no,c2_no,...)\
"vbroadcastf64x4 ("#__VA_ARGS__"),%%zmm3; vpermpd %%zmm3,%%zmm30,%%zmm3;"\
"vfmadd231pd %%zmm1,%%zmm3,%%zmm"#c1_no"; vfmadd231pd %%zmm2,%%zmm3,%%zmm"#c2_no";"
/* Fold the upper 256 bits of a zmm accumulator into its ymm half. */
#define unit_merge_to_ymm(c1_no) \
"vextractf64x4 $1,%%zmm"#c1_no",%%ymm30; vaddpd %%ymm"#c1_no",%%ymm30,%%ymm"#c1_no";"
/* 4x12 kernel, processing k in pairs: labels 204912 (unrolled loop) / 104912
   (scalar remainder) / 1004912 (exit). zmm30 is loaded with a permute table from
   the memory operand %11 (bound outside this chunk). On exit %5 is restored to 1
   (incq) so the caller's "decq %5; jnz" loop terminates correctly. */
#define KERNEL_k1m4n12 \
"cmpq $2, %5; jb 104912f;"\
"vmovupd 64+%11,%%zmm30;"\
"\n204912:"\
"vmovddup (%0),%%zmm1; vmovddup 8(%0),%%zmm2; addq $64,%0;" \
unit_acc_k2m4n2(4,5,%1) unit_acc_k2m4n2(6,7,%1,%%r12,1) unit_acc_k2m4n2(8, 9, %1, %%r12, 2) "addq $32,%1;" \
unit_acc_k2m4n2(10,11,%%r15) unit_acc_k2m4n2(12,13,%%r15,%%r12,1) unit_acc_k2m4n2(14,15,%%r15,%%r12,2) "addq $32,%%r15;" \
"subq $2, %5; cmpq $2, %5; jnb 204912b;"\
unit_merge_to_ymm(4) unit_merge_to_ymm(5) unit_merge_to_ymm(6) unit_merge_to_ymm(7) \
unit_merge_to_ymm(8) unit_merge_to_ymm(9) unit_merge_to_ymm(10) unit_merge_to_ymm(11) \
unit_merge_to_ymm(12) unit_merge_to_ymm(13) unit_merge_to_ymm(14) unit_merge_to_ymm(15) \
"testq %5, %5; jz 1004912f;"\
"\n104912:"\
KERNEL_h_k1m4n12 "addq $16,%%r15;"\
"decq %5; jnz 104912b;"\
"\n1004912:"\
"incq %5;"
/* TRMM (right, forwards) tail helpers for m=4 (see kernel_kend_nX). */
#if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0)
#define loada_kend_k1m4 "vmovddup (%0,%3,1),%%ymm1; vmovddup 8(%0,%3,1),%%ymm2; addq $32,%3;"
#define acc_kend_nc2_k1m4(boff1) unit_acc_gen_m4n2(6,7,boff1,%1,%%r12,1)
#define acc_kend_nc3_k1m4(boff1) unit_acc_gen_m4n2(8,9,boff1,%1,%%r12,2)
#define acc_kend_nc4_k1m4(boff1) unit_acc_gen_m4n2(10,11,boff1,%%r15)
#define acc_kend_nc5_k1m4(boff1) unit_acc_gen_m4n2(12,13,boff1,%%r15,%%r12,1)
#define acc_kend_nc6_k1m4(boff1) unit_acc_gen_m4n2(14,15,boff1,%%r15,%%r12,2)
#endif
/* Save path: unpack, scale by ymm0 (alpha), store; TRMM overwrites, GEMM adds. */
#define save_init_m4 "movq %2,%3; addq $32,%2;"
#ifdef TRMMKERNEL
#define SAVE_m4n1 "vmulpd %%ymm4,%%ymm0,%%ymm4; vmovupd %%ymm4,(%2); addq $32,%2;"
#define unit_save_m4n2(c1_no,c2_no)\
"vunpcklpd %%ymm"#c2_no",%%ymm"#c1_no",%%ymm1; vmulpd %%ymm1,%%ymm0,%%ymm1; vmovupd %%ymm1,(%3);"\
"vunpckhpd %%ymm"#c2_no",%%ymm"#c1_no",%%ymm2; vmulpd %%ymm2,%%ymm0,%%ymm2; vmovupd %%ymm2,(%3,%4,1); leaq (%3,%4,2),%3;"
#else
#define SAVE_m4n1 "vfmadd213pd (%2),%%ymm0,%%ymm4; vmovupd %%ymm4,(%2); addq $32,%2;"
#define unit_save_m4n2(c1_no,c2_no)\
"vunpcklpd %%ymm"#c2_no",%%ymm"#c1_no",%%ymm1; vfmadd213pd (%3),%%ymm0,%%ymm1; vmovupd %%ymm1,(%3);"\
"vunpckhpd %%ymm"#c2_no",%%ymm"#c1_no",%%ymm2; vfmadd213pd (%3,%4,1),%%ymm0,%%ymm2; vmovupd %%ymm2,(%3,%4,1); leaq (%3,%4,2),%3;"
#endif
#define SAVE_m4n2 save_init_m4 unit_save_m4n2(4,5)
#define SAVE_m4n4 SAVE_m4n2 unit_save_m4n2(6,7)
#define SAVE_m4n6 SAVE_m4n4 unit_save_m4n2(8,9)
#define SAVE_m4n8 SAVE_m4n6 unit_save_m4n2(10,11)
#define SAVE_m4n10 SAVE_m4n8 unit_save_m4n2(12,13)
#define SAVE_m4n12 SAVE_m4n10 unit_save_m4n2(14,15)
#define INIT_m4n1 "vpxor %%ymm4,%%ymm4,%%ymm4;"
#define unit_init_2ymm(c1_no,c2_no) "vpxor %%ymm"#c1_no",%%ymm"#c1_no",%%ymm"#c1_no"; vpxor %%ymm"#c2_no",%%ymm"#c2_no",%%ymm"#c2_no";"
#define INIT_m4n2 unit_init_2ymm(4,5)
#define INIT_m4n4 INIT_m4n2 unit_init_2ymm(6,7)
#define INIT_m4n6 INIT_m4n4 unit_init_2ymm(8,9)
#define INIT_m4n8 INIT_m4n6 unit_init_2ymm(10,11)
#define INIT_m4n10 INIT_m4n8 unit_init_2ymm(12,13)
#define INIT_m4n12 INIT_m4n10 unit_init_2ymm(14,15)
/* ==== m=2 micro-kernels: one xmm per C column (xmm4..xmm15). ==== */
#define KERNEL_k1m2n1 \
"vmovddup (%1),%%xmm1; addq $8,%1;"\
"vfmadd231pd (%0),%%xmm1,%%xmm4; addq $16,%0;"
/* Accumulate one 2x2 tile from the b pair at byte offset boff. */
#define unit_acc_gen_m2n2(c1_no,c2_no,boff,...)\
"vmovupd "#boff"("#__VA_ARGS__"),%%xmm3; vfmadd231pd %%xmm1,%%xmm3,%%xmm"#c1_no"; vfmadd231pd %%xmm2,%%xmm3,%%xmm"#c2_no";"
#define unit_acc_m2n2(c1_no,c2_no,...) unit_acc_gen_m2n2(c1_no,c2_no,0,__VA_ARGS__)
#define KERNEL_h_k1m2n2 \
"vmovddup (%0),%%xmm1; vmovddup 8(%0),%%xmm2; addq $16,%0;" unit_acc_m2n2(4,5,%1)
#define KERNEL_k1m2n2 KERNEL_h_k1m2n2 "addq $16,%1;"
#define KERNEL_h_k1m2n4 KERNEL_h_k1m2n2 unit_acc_m2n2(6,7,%1,%%r12,1)
#define KERNEL_k1m2n4 KERNEL_h_k1m2n4 "addq $16,%1;"
#define KERNEL_k1m2n6 KERNEL_h_k1m2n4 unit_acc_m2n2(8,9,%1,%%r12,2) "addq $16,%1;"
#define KERNEL_h_k1m2n8 KERNEL_k1m2n6 unit_acc_m2n2(10,11,%%r15)
#define KERNEL_k1m2n8 KERNEL_h_k1m2n8 "addq $16,%%r15;"
#define KERNEL_h_k1m2n10 KERNEL_h_k1m2n8 unit_acc_m2n2(12,13,%%r15,%%r12,1)
#define KERNEL_k1m2n10 KERNEL_h_k1m2n10 "addq $16,%%r15;"
#define KERNEL_h_k1m2n12 KERNEL_h_k1m2n10 unit_acc_m2n2(14,15,%%r15,%%r12,2)
/* Simple (non-unrolled) 2x12 step, superseded by the k4-unrolled version below. */
//#define KERNEL_k1m2n12 KERNEL_h_k1m2n12 "addq $16,%%r15;"
/* k4-unrolled helper: process four consecutive k steps per FMA with zmm registers. */
#define unit_acc_k4m2n2(c1_no,c2_no,...) \
"vmovupd ("#__VA_ARGS__"),%%zmm3; vfmadd231pd %%zmm1,%%zmm3,%%zmm"#c1_no"; vfmadd231pd %%zmm2,%%zmm3,%%zmm"#c2_no";"
/* Fold the four 128-bit lanes of a zmm accumulator into one xmm (uses xmm20..23). */
#define unit_merge_to_xmm(c1_no) \
"vextractf64x2 $0,%%zmm"#c1_no",%%xmm20; vextractf64x2 $1,%%zmm"#c1_no",%%xmm21; vextractf64x2 $2,%%zmm"#c1_no",%%xmm22; vextractf64x2 $3,%%zmm"#c1_no",%%xmm23;"\
"vaddpd %%xmm20,%%xmm21,%%xmm20; vaddpd %%xmm22,%%xmm23,%%xmm22; vaddpd %%xmm20,%%xmm22,%%xmm"#c1_no";"
/* 2x12 kernel, processing k in fours: labels 402912 (unrolled loop) / 102912
   (scalar remainder) / 1002912 (exit); %5 is restored to 1 on exit so the caller's
   "decq %5; jnz" loop terminates. */
#define KERNEL_k1m2n12 \
"cmpq $4,%5; jb 102912f;"\
"\n402912:"\
"vmovddup (%0),%%zmm1; vmovddup 8(%0),%%zmm2; addq $64,%0;" \
unit_acc_k4m2n2(4,5,%1) unit_acc_k4m2n2(6,7,%1,%%r12,1) unit_acc_k4m2n2(8,9,%1,%%r12,2) "addq $64,%1;" \
unit_acc_k4m2n2(10,11,%%r15) unit_acc_k4m2n2(12,13,%%r15,%%r12,1) unit_acc_k4m2n2(14,15,%%r15,%%r12,2) "addq $64,%%r15;" \
"subq $4,%5; cmpq $4,%5; jnb 402912b;"\
unit_merge_to_xmm(4) unit_merge_to_xmm(5) unit_merge_to_xmm(6) unit_merge_to_xmm(7) unit_merge_to_xmm(8) unit_merge_to_xmm(9) \
unit_merge_to_xmm(10) unit_merge_to_xmm(11) unit_merge_to_xmm(12) unit_merge_to_xmm(13) unit_merge_to_xmm(14) unit_merge_to_xmm(15) \
"testq %5,%5; jz 1002912f;"\
"\n102912:"\
KERNEL_h_k1m2n12 "addq $16,%%r15;" \
"decq %5; jnz 102912b;" \
"\n1002912:"\
"incq %5;"
/* TRMM (right, forwards) tail helpers for m=2 (see kernel_kend_nX). */
#if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0)
#define loada_kend_k1m2 "vmovddup (%0,%3,1),%%xmm1; vmovddup 8(%0,%3,1),%%xmm2; addq $16,%3;"
#define acc_kend_nc2_k1m2(boff1) unit_acc_gen_m2n2(6,7,boff1,%1,%%r12,1)
#define acc_kend_nc3_k1m2(boff1) unit_acc_gen_m2n2(8,9,boff1,%1,%%r12,2)
#define acc_kend_nc4_k1m2(boff1) unit_acc_gen_m2n2(10,11,boff1,%%r15)
#define acc_kend_nc5_k1m2(boff1) unit_acc_gen_m2n2(12,13,boff1,%%r15,%%r12,1)
#define acc_kend_nc6_k1m2(boff1) unit_acc_gen_m2n2(14,15,boff1,%%r15,%%r12,2)
#endif
/* Save path: unpack, scale by xmm0 (alpha), store; TRMM overwrites, GEMM adds. */
#define save_init_m2 "movq %2,%3; addq $16,%2;"
#ifdef TRMMKERNEL
#define SAVE_m2n1 "vmulpd %%xmm4,%%xmm0,%%xmm4; vmovupd %%xmm4,(%2); addq $16,%2;"
#define unit_save_m2n2(c1_no,c2_no)\
"vunpcklpd %%xmm"#c2_no",%%xmm"#c1_no",%%xmm1; vmulpd %%xmm1,%%xmm0,%%xmm1; vmovupd %%xmm1,(%3);"\
"vunpckhpd %%xmm"#c2_no",%%xmm"#c1_no",%%xmm2; vmulpd %%xmm2,%%xmm0,%%xmm2; vmovupd %%xmm2,(%3,%4,1); leaq (%3,%4,2),%3;"
#else
#define SAVE_m2n1 "vfmadd213pd (%2),%%xmm0,%%xmm4; vmovupd %%xmm4,(%2); addq $16,%2;"
#define unit_save_m2n2(c1_no,c2_no)\
"vunpcklpd %%xmm"#c2_no",%%xmm"#c1_no",%%xmm1; vfmadd213pd (%3),%%xmm0,%%xmm1; vmovupd %%xmm1,(%3);"\
"vunpckhpd %%xmm"#c2_no",%%xmm"#c1_no",%%xmm2; vfmadd213pd (%3,%4,1),%%xmm0,%%xmm2; vmovupd %%xmm2,(%3,%4,1); leaq (%3,%4,2),%3;"
#endif
#define SAVE_m2n2 save_init_m2 unit_save_m2n2(4,5)
#define SAVE_m2n4 SAVE_m2n2 unit_save_m2n2(6,7)
#define SAVE_m2n6 SAVE_m2n4 unit_save_m2n2(8,9)
#define SAVE_m2n8 SAVE_m2n6 unit_save_m2n2(10,11)
#define SAVE_m2n10 SAVE_m2n8 unit_save_m2n2(12,13)
#define SAVE_m2n12 SAVE_m2n10 unit_save_m2n2(14,15)
#define INIT_m2n1 "vpxor %%xmm4,%%xmm4,%%xmm4;"
#define unit_init_2xmm(c1_no,c2_no) "vpxor %%xmm"#c1_no",%%xmm"#c1_no",%%xmm"#c1_no"; vpxor %%xmm"#c2_no",%%xmm"#c2_no",%%xmm"#c2_no";"
#define INIT_m2n2 unit_init_2xmm(4,5)
#define INIT_m2n4 INIT_m2n2 unit_init_2xmm(6,7)
#define INIT_m2n6 INIT_m2n4 unit_init_2xmm(8,9)
#define INIT_m2n8 INIT_m2n6 unit_init_2xmm(10,11)
#define INIT_m2n10 INIT_m2n8 unit_init_2xmm(12,13)
#define INIT_m2n12 INIT_m2n10 unit_init_2xmm(14,15)
/* ==== m=1 micro-kernels: one row of C; columns are paired in xmm4..xmm9, with
   each xmm holding two adjacent C columns. ==== */
#define KERNEL_k1m1n1 \
"vmovsd (%1),%%xmm1; addq $8,%1;"\
"vfmadd231sd (%0),%%xmm1,%%xmm4; addq $8,%0;"
/* Broadcast a single A value and FMA against the 2-wide b tiles directly. */
#define KERNEL_h_k1m1n2 \
"vmovddup (%0),%%xmm1; addq $8,%0;"\
"vfmadd231pd (%1),%%xmm1,%%xmm4;"
#define KERNEL_k1m1n2 KERNEL_h_k1m1n2 "addq $16,%1;"
#define KERNEL_h_k1m1n4 KERNEL_h_k1m1n2 "vfmadd231pd (%1,%%r12,1),%%xmm1,%%xmm5;"
#define KERNEL_k1m1n4 KERNEL_h_k1m1n4 "addq $16,%1;"
#define KERNEL_k1m1n6 KERNEL_h_k1m1n4 "vfmadd231pd (%1,%%r12,2),%%xmm1,%%xmm6; addq $16,%1;"
#define KERNEL_h_k1m1n8 KERNEL_k1m1n6 "vfmadd231pd (%%r15),%%xmm1,%%xmm7;"
#define KERNEL_k1m1n8 KERNEL_h_k1m1n8 "addq $16,%%r15;"
#define KERNEL_h_k1m1n10 KERNEL_h_k1m1n8 "vfmadd231pd (%%r15,%%r12,1),%%xmm1,%%xmm8;"
#define KERNEL_k1m1n10 KERNEL_h_k1m1n10 "addq $16,%%r15;"
#define KERNEL_h_k1m1n12 KERNEL_h_k1m1n10 "vfmadd231pd (%%r15,%%r12,2),%%xmm1,%%xmm9;"
/* Simple (non-unrolled) 1x12 step, superseded by the k4-unrolled version below. */
//#define KERNEL_k1m1n12 KERNEL_h_k1m1n12 "addq $16,%%r15;"
/* 1x12 kernel, processing k in fours: four A values are expanded pairwise via the
   permute table loaded from %11 (bound outside this chunk) into zmm2, then FMAd
   against 64-byte B tiles. Labels 401912 / 101912 / 1001912; %5 restored to 1 on
   exit. unit_merge_to_xmm comes from the m=2 section above. */
#define KERNEL_k1m1n12 \
"cmpq $4,%5; jb 101912f;" \
"vmovupd %11,%%zmm2;"\
"\n401912:"\
"vmovupd (%0),%%ymm1; vpermpd %%zmm1,%%zmm2,%%zmm1; addq $32,%0;" \
"vfmadd231pd (%1),%%zmm1,%%zmm4; vfmadd231pd (%1,%%r12,1),%%zmm1,%%zmm5; vfmadd231pd (%1,%%r12,2),%%zmm1,%%zmm6; addq $64,%1;"\
"vfmadd231pd (%%r15),%%zmm1,%%zmm7; vfmadd231pd (%%r15,%%r12,1),%%zmm1,%%zmm8; vfmadd231pd (%%r15,%%r12,2),%%zmm1,%%zmm9; addq $64,%%r15;"\
"subq $4,%5; cmpq $4,%5; jnb 401912b;"\
unit_merge_to_xmm(4) unit_merge_to_xmm(5) unit_merge_to_xmm(6) \
unit_merge_to_xmm(7) unit_merge_to_xmm(8) unit_merge_to_xmm(9) \
"testq %5,%5; jz 1001912f;"\
"\n101912:"\
KERNEL_h_k1m1n12 "addq $16,%%r15;" \
"decq %5; jnz 101912b;" \
"\n1001912:"\
"incq %5;"
/* TRMM (right, forwards) tail helpers for m=1 (see kernel_kend_nX). */
#if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0)
#define loada_kend_k1m1 "vmovddup (%0,%3,1),%%xmm1; addq $8,%3;"
#define acc_kend_nc2_k1m1(boff1) "vfmadd231pd "#boff1"(%1,%%r12,1),%%xmm1,%%xmm5;"
#define acc_kend_nc3_k1m1(boff1) "vfmadd231pd "#boff1"(%1,%%r12,2),%%xmm1,%%xmm6;"
#define acc_kend_nc4_k1m1(boff1) "vfmadd231pd "#boff1"(%%r15),%%xmm1,%%xmm7;"
#define acc_kend_nc5_k1m1(boff1) "vfmadd231pd "#boff1"(%%r15,%%r12,1),%%xmm1,%%xmm8;"
#define acc_kend_nc6_k1m1(boff1) "vfmadd231pd "#boff1"(%%r15,%%r12,2),%%xmm1,%%xmm9;"
#endif
/* Save path: each xmm holds two columns, stored as scalar low/high halves at
   (%3) and (%3 + ldc); scaled by xmm0 (alpha). TRMM overwrites, GEMM adds. */
#define save_init_m1 "movq %2,%3; addq $8,%2;"
#ifdef TRMMKERNEL
#define SAVE_m1n1 "vmulsd %%xmm4,%%xmm0,%%xmm4; vmovsd %%xmm4,(%2); addq $8,%2;"
#define unit_save_m1n2(c1_no)\
"vmulpd %%xmm"#c1_no",%%xmm0,%%xmm2; vmovsd %%xmm2,(%3); vmovhpd %%xmm2,(%3,%4,1); leaq (%3,%4,2),%3;"
#else
#define SAVE_m1n1 "vfmadd213sd (%2),%%xmm0,%%xmm4; vmovsd %%xmm4,(%2); addq $8,%2;"
#define unit_save_m1n2(c1_no)\
"vmovsd (%3),%%xmm2; vmovhpd (%3,%4,1),%%xmm2,%%xmm2; vfmadd231pd %%xmm"#c1_no",%%xmm0,%%xmm2; vmovsd %%xmm2,(%3); vmovhpd %%xmm2,(%3,%4,1); leaq (%3,%4,2),%3;"
#endif
#define SAVE_m1n2 save_init_m1 unit_save_m1n2(4)
#define SAVE_m1n4 SAVE_m1n2 unit_save_m1n2(5)
#define SAVE_m1n6 SAVE_m1n4 unit_save_m1n2(6)
#define SAVE_m1n8 SAVE_m1n6 unit_save_m1n2(7)
#define SAVE_m1n10 SAVE_m1n8 unit_save_m1n2(8)
#define SAVE_m1n12 SAVE_m1n10 unit_save_m1n2(9)
#define INIT_m1n1 "vpxor %%xmm4,%%xmm4,%%xmm4;"
#define INIT_m1n2 INIT_m1n1
#define INIT_m1n4 INIT_m1n2 "vpxor %%xmm5,%%xmm5,%%xmm5;"
#define INIT_m1n6 INIT_m1n4 "vpxor %%xmm6,%%xmm6,%%xmm6;"
#define INIT_m1n8 INIT_m1n6 "vpxor %%xmm7,%%xmm7,%%xmm7;"
#define INIT_m1n10 INIT_m1n8 "vpxor %%xmm8,%%xmm8,%%xmm8;"
#define INIT_m1n12 INIT_m1n10 "vpxor %%xmm9,%%xmm9,%%xmm9;"
  473. #define COMPUTE_SIMPLE(mdim,ndim)\
  474. init_update_k(mdim) INIT_m##mdim##n##ndim\
  475. "movq %%r13,%5;" INIT_set_papb(mdim,ndim)\
  476. kernel_kstart_n##ndim(mdim,subq)\
  477. "testq %5,%5; jz 7"#mdim"7"#ndim"9f;"\
  478. "7"#mdim"7"#ndim"1:\n\t"\
  479. KERNEL_k1m##mdim##n##ndim "decq %5; jnz 7"#mdim"7"#ndim"1b;"\
  480. "7"#mdim"7"#ndim"9:\n\t"\
  481. kernel_kend_n##ndim(mdim)\
  482. SAVE_set_pa(mdim) SAVE_m##mdim##n##ndim save_update_k(mdim)
  483. #define COMPUTE_m16n1 COMPUTE_SIMPLE(16,1)
  484. #define COMPUTE_m16n2 COMPUTE_SIMPLE(16,2)
  485. #define COMPUTE_m16n4 COMPUTE_SIMPLE(16,4)
  486. #define COMPUTE_m16n6 COMPUTE_SIMPLE(16,6)
  487. #define COMPUTE_m16n8 COMPUTE_SIMPLE(16,8)
  488. #define COMPUTE_m16n10 COMPUTE_SIMPLE(16,10)
  489. #if defined(TRMMKERNEL) && !defined(LEFT) && defined(TRANSA)
  490. #define INVERSE_K_MID "negq %5; leaq 6(%%r13,%5,1),%5;"
  491. #else
  492. #define INVERSE_K_MID "negq %5; leaq 16(%%r13,%5,1),%5;"
  493. #endif
  494. #define COMPUTE_m16n12 \
  495. init_update_k(16) INIT_m16n12 "movq %%r13,%5;" INIT_set_papb(16,12) "movq %2,%3;"\
  496. kernel_kstart_n12(16,subq)\
  497. "cmpq $16,%5; jb 7167123f; movq $16,%5;"\
  498. "7167121:\n\t"\
  499. KERNEL_k1m16n12 "addq $4,%5; testq $12,%5; movq $172,%%r10; cmovz %4,%%r10;"\
  500. KERNEL_k1m16n12 "prefetcht1 (%3); subq $129,%3; addq %%r10,%3;"\
  501. KERNEL_k1m16n12 "prefetcht1 (%6); addq $32,%6; cmpq $208,%5; cmoveq %2,%3;"\
  502. KERNEL_k1m16n12 "cmpq %5,%%r13; jnb 7167121b;"\
  503. "movq %2,%3;" INVERSE_K_MID\
  504. "7167123:\n\t"\
  505. "testq %5,%5; jz 7167129f;"\
  506. "7167125:\n\t"\
  507. "prefetcht0 (%3); prefetcht0 64(%3); prefetcht0 127(%3);"\
  508. KERNEL_k1m16n12 "addq %4,%3; decq %5;jnz 7167125b;"\
  509. "7167129:\n\t"\
  510. kernel_kend_n12(16)\
  511. "prefetcht0 (%%r14);" SAVE_set_pa(16) SAVE_m16n12 save_update_k(16)
  512. #define COMPUTE(ndim) {\
  513. b_pref = b_ptr + ndim * K; HEAD_SET_OFF(ndim)\
  514. __asm__ __volatile__(\
  515. "vbroadcastsd %8,%%zmm0; movq %7,%%r11; movq %1,%%r14; movq %10,%%r12; salq $4,%%r12;" INITASM_SET_K\
  516. "cmpq $16,%%r11; jb "#ndim"33102f;"\
  517. #ndim"33101:\n\t"\
  518. COMPUTE_m16n##ndim "subq $16,%%r11; cmpq $16,%%r11; jnb "#ndim"33101b;"\
  519. #ndim"33102:\n\t"\
  520. "cmpq $8,%%r11; jb "#ndim"33103f;"\
  521. COMPUTE_SIMPLE(8,ndim) "subq $8,%%r11;"\
  522. #ndim"33103:\n\t"\
  523. "cmpq $4,%%r11; jb "#ndim"33104f;"\
  524. COMPUTE_SIMPLE(4,ndim) "subq $4,%%r11;"\
  525. #ndim"33104:\n\t"\
  526. "cmpq $2,%%r11; jb "#ndim"33105f;"\
  527. COMPUTE_SIMPLE(2,ndim) "subq $2,%%r11;"\
  528. #ndim"33105:\n\t"\
  529. "testq %%r11,%%r11; jz "#ndim"33106f;"\
  530. COMPUTE_SIMPLE(1,ndim) "subq $1,%%r11;"\
  531. #ndim"33106:\n\t"\
  532. "movq %%r14,%1;"\
  533. :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_in_bytes),"+r"(k_count),"+r"(b_pref):"m"(M),"m"(ALPHA),"m"(off),"m"(K), "o"(permute_table):"r10","r11","r12","r13","r14","r15","cc","memory",\
  534. "zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15",\
  535. "zmm16","zmm17","zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31");\
  536. a_ptr -= M * K; b_ptr += ndim * K; c_ptr += ndim * ldc - M; TAIL_SET_OFF(ndim)\
  537. }
  538. #include "common.h"
  539. #include <stdint.h>
  540. int __attribute__ ((noinline))
  541. CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, double * __restrict__ B, double * __restrict__ C, BLASLONG ldc
  542. #ifdef TRMMKERNEL
  543. , BLASLONG offset
  544. #endif
  545. )
  546. {
  547. if(m==0||n==0) return 0;
  548. int64_t ldc_in_bytes = (int64_t)ldc * sizeof(double); double ALPHA = alpha;
  549. int64_t M = (int64_t)m, K = (int64_t)k, k_count = 0;
  550. BLASLONG n_count = n, off = 0;
  551. double *a_ptr = A,*b_ptr = B,*c_ptr = C,*c_tmp = C,*b_pref = B;
  552. int64_t permute_table[] = {
  553. 0, 0, 1, 1, 2, 2, 3, 3, // abcdxxxx -> aabbccdd
  554. 0, 1, 0, 1, 2, 3, 2, 3, // abcdxxxx -> ababcdcd
  555. };
  556. #ifdef TRMMKERNEL
  557. #ifdef LEFT
  558. off = offset;
  559. #else
  560. off = -offset;
  561. #endif
  562. #endif
  563. for(;n_count>11;n_count-=12) COMPUTE(12)
  564. for(;n_count>9;n_count-=10) COMPUTE(10)
  565. for(;n_count>7;n_count-=8) COMPUTE(8)
  566. for(;n_count>5;n_count-=6) COMPUTE(6)
  567. for(;n_count>3;n_count-=4) COMPUTE(4)
  568. for(;n_count>1;n_count-=2) COMPUTE(2)
  569. if(n_count>0) COMPUTE(1)
  570. return 0;
  571. }