You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

cgemm_kernel_8x2_haswell.c 16 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291
  1. #include "common.h"
  2. #include <stdint.h>
  3. /* recommended settings: GEMM_P = 256, GEMM_Q = 256 */
  /* Derive the two conjugation flags from the build-time variant macro.
     The first letter of the variant name selects A_CONJ (N or T -> 0, R or C -> 1)
     and the second letter selects B_CONJ in the same way. The rest of the file
     only tests "A_CONJ == B_CONJ" and "A_CONJ == 0". */
  4. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  5. #define A_CONJ 0
  6. #define B_CONJ 0
  7. #endif
  8. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  9. #define A_CONJ 1
  10. #define B_CONJ 0
  11. #endif
  12. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  13. #define A_CONJ 0
  14. #define B_CONJ 1
  15. #endif
  16. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  17. #define A_CONJ 1
  18. #define B_CONJ 1
  19. #endif
  20. /* %0 = a_ptr, %1 = b_ptr, %2 = c_ptr, %3 = c_tmp, %4 = ldc(bytes), %5 = k_counter, %6 = &alpha, %7 = m_counter, %8 = b_pref */
  21. /* r11 = m, r12 = k << 4, r13 = k, r14 = b_head, r15 = temp */
  22. /* m=8, ymm 0-3 temp, ymm 4-15 acc */
  /* Accumulation primitives; all arguments are ymm register numbers that are
     pasted into the instruction text.
     acc_m4n1_exp(ar,ai,b2,cl,cr): cl += ar * b2 and cr += ai * b2; when
       A_CONJ != B_CONJ the second product is subtracted instead (vfnmadd231ps).
     acc_m8n1_con(ua,la,b1,uc,lc): accumulates ua * b1 into uc and la * b1 into lc
       with the alternating add/subtract FMA forms: vfmaddsub231ps when
       A_CONJ == B_CONJ, vfmsubadd231ps otherwise. */
  23. #if A_CONJ == B_CONJ
  24. #define acc_m4n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%ymm"#ar",%%ymm"#b2",%%ymm"#cl"; vfmadd231ps %%ymm"#ai",%%ymm"#b2",%%ymm"#cr";"
  25. #define acc_m8n1_con(ua,la,b1,uc,lc) "vfmaddsub231ps %%ymm"#ua",%%ymm"#b1",%%ymm"#uc"; vfmaddsub231ps %%ymm"#la",%%ymm"#b1",%%ymm"#lc";"
  26. #else
  27. #define acc_m4n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%ymm"#ar",%%ymm"#b2",%%ymm"#cl"; vfnmadd231ps %%ymm"#ai",%%ymm"#b2",%%ymm"#cr";"
  28. #define acc_m8n1_con(ua,la,b1,uc,lc) "vfmsubadd231ps %%ymm"#ua",%%ymm"#b1",%%ymm"#uc"; vfmsubadd231ps %%ymm"#la",%%ymm"#b1",%%ymm"#lc";"
  29. #endif
  30. /* expanded accumulators for m8n1 and m8n2 */
  /* KERNEL_k1m8n1: one k-step. Broadcasts the 8 bytes at (%1) into ymm0 and
     advances %1 by 8; loads the 64 bytes at (%0) in two halves, each once with
     vmovsldup (even-indexed floats duplicated) and once with vmovshdup
     (odd-indexed floats duplicated); accumulates into ymm4/5 and ymm6/7;
     advances %0 by 64. */
  31. #define KERNEL_k1m8n1 \
  32. "vbroadcastsd (%1),%%ymm0; addq $8,%1;"\
  33. "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2;" acc_m4n1_exp(1,2,0,4,5)\
  34. "vmovsldup 32(%0),%%ymm1; vmovshdup 32(%0),%%ymm2;" acc_m4n1_exp(1,2,0,6,7)\
  35. "addq $64,%0;"
  /* KERNEL_k1m8n2: same scheme with two 8-byte broadcasts from (%1) and 8(%1)
     (%1 advances by 16); the second broadcast accumulates into ymm8-11. */
  36. #define KERNEL_k1m8n2 \
  37. "vbroadcastsd (%1),%%ymm0; vbroadcastsd 8(%1),%%ymm1; addq $16,%1;"\
  38. "vmovsldup (%0),%%ymm2; vmovshdup (%0),%%ymm3;" acc_m4n1_exp(2,3,0,4,5) acc_m4n1_exp(2,3,1,8,9)\
  39. "vmovsldup 32(%0),%%ymm2; vmovshdup 32(%0),%%ymm3;" acc_m4n1_exp(2,3,0,6,7) acc_m4n1_exp(2,3,1,10,11)\
  40. "addq $64,%0;"
  41. /* contracted accumulators for m8n4 and m8n6 */
  /* acc_m8n2_con: broadcasts the float at lboff(<addr>) into ymm2 and the float
     at rboff(<addr>) into ymm3, then accumulates ua/la against each into
     (luc,llc) and (ruc,rlc). The variadic arguments are stringized to form the
     address expression placed inside the parentheses. */
  42. #define acc_m8n2_con(ua,la,luc,llc,ruc,rlc,lboff,rboff,...) \
  43. "vbroadcastss "#lboff"("#__VA_ARGS__"),%%ymm2;" acc_m8n1_con(ua,la,2,luc,llc)\
  44. "vbroadcastss "#rboff"("#__VA_ARGS__"),%%ymm3;" acc_m8n1_con(ua,la,3,ruc,rlc)
  /* KERNEL_1_*: loads 64 bytes from (%0) unmodified, prefetches 512(%0),
     advances %0 by 64 and uses the floats at byte offsets 0 and 8 of (%1) and of
     (%1,%r12,1). */
  45. #define KERNEL_1_k1m8n4 \
  46. "vmovups (%0),%%ymm0; vmovups 32(%0),%%ymm1; prefetcht0 512(%0); addq $64,%0;"\
  47. acc_m8n2_con(0,1,4,5,6,7,0,8,%1) acc_m8n2_con(0,1,8,9,10,11,0,8,%1,%%r12,1)
  /* KERNEL_2_*: re-reads the same 64 bytes (now at -64(%0)) with adjacent floats
     swapped (vpermilps $177) and uses the floats at byte offsets 4 and 12. */
  48. #define KERNEL_2_k1m8n4 \
  49. "vpermilps $177,-64(%0),%%ymm0; vpermilps $177,-32(%0),%%ymm1;"\
  50. acc_m8n2_con(0,1,4,5,6,7,4,12,%1) acc_m8n2_con(0,1,8,9,10,11,4,12,%1,%%r12,1)
  /* The n6 variants add a third pair read from (%1,%r12,2) into ymm12-15. */
  51. #define KERNEL_1_k1m8n6 KERNEL_1_k1m8n4 acc_m8n2_con(0,1,12,13,14,15,0,8,%1,%%r12,2)
  52. #define KERNEL_2_k1m8n6 KERNEL_2_k1m8n4 acc_m8n2_con(0,1,12,13,14,15,4,12,%1,%%r12,2)
  /* Full k-step: both halves, then %1 advances by 16 bytes. */
  53. #define KERNEL_k1m8n4 KERNEL_1_k1m8n4 KERNEL_2_k1m8n4 "addq $16,%1;"
  54. #define KERNEL_k1m8n6 KERNEL_1_k1m8n6 KERNEL_2_k1m8n6 "addq $16,%1;"
  /* zero_4ymm: clears four ymm registers by XOR-ing each with itself. */
  55. #define zero_4ymm(no1,no2,no3,no4) \
  56. "vpxor %%ymm"#no1",%%ymm"#no1",%%ymm"#no1"; vpxor %%ymm"#no2",%%ymm"#no2",%%ymm"#no2";"\
  57. "vpxor %%ymm"#no3",%%ymm"#no3",%%ymm"#no3"; vpxor %%ymm"#no4",%%ymm"#no4",%%ymm"#no4";"
  58. /* initialization and storage macros */
  /* INIT_m8n1 clears ymm4-7, INIT_m8n2 and INIT_m8n4 clear ymm4-11,
     INIT_m8n6 clears ymm4-15. */
  59. #define INIT_m8n1 zero_4ymm(4,5,6,7)
  60. #define INIT_m8n2 zero_4ymm(4,5,6,7) zero_4ymm(8,9,10,11)
  61. #define INIT_m8n4 zero_4ymm(4,5,6,7) zero_4ymm(8,9,10,11)
  62. #define INIT_m8n6 INIT_m8n4 zero_4ymm(12,13,14,15)
  /* cont_expacc(cl,cr,dst): contracts an "expanded" accumulator pair into dst.
     cr is first rewritten in place with adjacent floats swapped (vpermilps $177),
     then combined with cl by vaddsubps; which of the two is the minuend depends
     on whether A_CONJ equals B_CONJ. */
  63. #if A_CONJ == B_CONJ
  64. #define cont_expacc(cl,cr,dst) "vpermilps $177,%%ymm"#cr",%%ymm"#cr"; vaddsubps %%ymm"#cl",%%ymm"#cr",%%ymm"#dst";"
  65. #else
  66. #define cont_expacc(cl,cr,dst) "vpermilps $177,%%ymm"#cr",%%ymm"#cr"; vaddsubps %%ymm"#cr",%%ymm"#cl",%%ymm"#dst";"
  67. #endif
  /* save_1ymm(c,tmp,off,alpr,alpi,<addr>): updates the 32 bytes at off(<addr>)
     from accumulator c, a pair-swapped copy of c held in tmp, and the registers
     alpr/alpi, using two alternating add/subtract FMAs whose first one reads the
     current memory contents; the result is written back to the same address.
     With A_CONJ == 0 the result is built in c, otherwise in tmp. */
  68. #if A_CONJ == 0
  69. #define save_1ymm(c,tmp,off,alpr,alpi,...) \
  70. "vpermilps $177,%%ymm"#c",%%ymm"#tmp"; vfmsubadd213ps "#off"("#__VA_ARGS__"),%%ymm"#alpr",%%ymm"#c";"\
  71. "vfmsubadd231ps %%ymm"#tmp",%%ymm"#alpi",%%ymm"#c"; vmovups %%ymm"#c","#off"("#__VA_ARGS__");"
  72. #else
  73. #define save_1ymm(c,tmp,off,alpr,alpi,...) \
  74. "vpermilps $177,%%ymm"#c",%%ymm"#tmp"; vfmaddsub213ps "#off"("#__VA_ARGS__"),%%ymm"#alpi",%%ymm"#tmp";"\
  75. "vfmaddsub231ps %%ymm"#c",%%ymm"#alpr",%%ymm"#tmp"; vmovups %%ymm"#tmp","#off"("#__VA_ARGS__");"
  76. #endif
  /* save_init_m8: copies %2 into %3, advances %2 by 64 bytes, and broadcasts
     the floats at (%6) and 4(%6) into ymm0 and ymm1. */
  77. #define save_init_m8 "movq %2,%3; addq $64,%2; vbroadcastss (%6),%%ymm0; vbroadcastss 4(%6),%%ymm1;"
  /* SAVE_m8n1/n2 contract the expanded accumulators first; SAVE_m8n4/n6 store
     ymm4-15 directly. Consecutive columns are addressed as (%3) and (%3,%4,1);
     %3 then moves forward by 2 * %4 via leaq. */
  78. #define SAVE_m8n1 save_init_m8 cont_expacc(4,5,4) cont_expacc(6,7,6) save_1ymm(4,2,0,0,1,%3) save_1ymm(6,3,32,0,1,%3)
  79. #define SAVE_m8n2 SAVE_m8n1\
  80. cont_expacc(8,9,8) cont_expacc(10,11,10) save_1ymm(8,2,0,0,1,%3,%4,1) save_1ymm(10,3,32,0,1,%3,%4,1)
  81. #define SAVE_m8n4 save_init_m8\
  82. save_1ymm(4,2,0,0,1,%3) save_1ymm(5,3,32,0,1,%3) save_1ymm(6,2,0,0,1,%3,%4,1) save_1ymm(7,3,32,0,1,%3,%4,1) "leaq (%3,%4,2),%3;"\
  83. save_1ymm(8,2,0,0,1,%3) save_1ymm(9,3,32,0,1,%3) save_1ymm(10,2,0,0,1,%3,%4,1) save_1ymm(11,3,32,0,1,%3,%4,1)
  84. #define SAVE_m8n6 SAVE_m8n4 "leaq (%3,%4,2),%3;"\
  85. save_1ymm(12,2,0,0,1,%3) save_1ymm(13,3,32,0,1,%3) save_1ymm(14,2,0,0,1,%3,%4,1) save_1ymm(15,3,32,0,1,%3,%4,1)
  /* COMPUTE_m8(ndim): handles one tile of 8 rows by ndim columns.
     Setup: %1 is reset from r14, the accumulators are cleared, %3 = %2 and
     %5 = r13. If %5 is zero control jumps straight to the save at label
     <ndim>8883; if %5 is below 10 only the tail loop at <ndim>8882 runs.
     Main loop (<ndim>8881): four k-steps per iteration. %5 is reused as a
     counter that starts at 10 and grows by 4, and the loop repeats while
     r13 >= %5 (unsigned compare). Each iteration issues prefetcht1 on (%3) and
     on (%8); %3 moves by r15 - 63, where r15 is 84, or %4 when (%5 & 12) == 0,
     and %8 moves by 16.
     After the main loop %3 is reset to %2 and %5 becomes r13 + 10 - %5, i.e. the
     k-steps still to do.
     Tail loop (<ndim>8882): one k-step per iteration, with prefetcht0 on (%3)
     and 63(%3) while %3 advances by %4.
     Finally (<ndim>8883) two lines at r14 are prefetched and the tile is saved. */
  86. #define COMPUTE_m8(ndim) \
  87. "movq %%r14,%1;" INIT_m8n##ndim "movq %2,%3; movq %%r13,%5;"\
  88. "testq %5,%5; jz "#ndim"8883f; cmpq $10,%5; jb "#ndim"8882f;"\
  89. "movq $10,%5; movq $84,%%r15;"\
  90. #ndim"8881:\n\t"\
  91. "prefetcht1 (%3); subq $63,%3; addq %%r15,%3;"\
  92. KERNEL_k1m8n##ndim KERNEL_k1m8n##ndim\
  93. "testq $12,%5; movq $84,%%r15; cmovz %4,%%r15; prefetcht1 (%8); addq $16,%8;"\
  94. KERNEL_k1m8n##ndim KERNEL_k1m8n##ndim\
  95. "addq $4,%5; cmpq %5,%%r13; jnb "#ndim"8881b;"\
  96. "movq %2,%3; negq %5; leaq 10(%%r13,%5,1),%5; prefetcht0 (%6); prefetcht0 7(%6);"\
  97. #ndim"8882:\n\t"\
  98. "prefetcht0 (%3); prefetcht0 63(%3); addq %4,%3;"\
  99. KERNEL_k1m8n##ndim "decq %5; jnz "#ndim"8882b;"\
  100. #ndim"8883:\n\t"\
  101. "prefetcht0 (%%r14); prefetcht0 64(%%r14);" SAVE_m8n##ndim
  102. /* m=4, ymm 0-3 temp, ymm 4-15 acc, expanded accumulators */
  /* KERNEL_k1m4n1: one k-step. Loads the 32 bytes at (%0) with vmovsldup and
     vmovshdup, advances %0 by 32, broadcasts the 8 bytes at (%1), accumulates
     into ymm4/ymm5 and advances %1 by 8. */
  103. #define KERNEL_k1m4n1 \
  104. "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; addq $32,%0;"\
  105. "vbroadcastsd (%1),%%ymm0;" acc_m4n1_exp(1,2,0,4,5) "addq $8,%1;"
  /* acc_m4n2_exp: broadcasts the 8-byte values at (<addr>) and 8(<addr>) into
     ymm2/ymm3 and accumulates ymm0/ymm1 against them into (c1l,c1r) and
     (c2l,c2r). The variadic arguments are stringized into the address. */
  106. #define acc_m4n2_exp(c1l,c1r,c2l,c2r,...) \
  107. "vbroadcastsd ("#__VA_ARGS__"),%%ymm2;" acc_m4n1_exp(0,1,2,c1l,c1r)\
  108. "vbroadcastsd 8("#__VA_ARGS__"),%%ymm3;" acc_m4n1_exp(0,1,3,c2l,c2r)
  /* KERNEL_h_* do not advance %1; KERNEL_k1m4n2/4/6 append "addq $16,%1".
     The n4 and n6 forms read further column pairs from (%1,%r12,1) and
     (%1,%r12,2). */
  109. #define KERNEL_h_k1m4n2 \
  110. "vmovsldup (%0),%%ymm0; vmovshdup (%0),%%ymm1; addq $32,%0;" acc_m4n2_exp(4,5,6,7,%1)
  111. #define KERNEL_h_k1m4n4 KERNEL_h_k1m4n2 acc_m4n2_exp(8,9,10,11,%1,%%r12,1)
  112. #define KERNEL_h_k1m4n6 KERNEL_h_k1m4n4 acc_m4n2_exp(12,13,14,15,%1,%%r12,2)
  113. #define KERNEL_k1m4n2 KERNEL_h_k1m4n2 "addq $16,%1;"
  114. #define KERNEL_k1m4n4 KERNEL_h_k1m4n4 "addq $16,%1;"
  115. #define KERNEL_k1m4n6 KERNEL_h_k1m4n6 "addq $16,%1;"
  /* Accumulator clearing: two ymm registers per column. */
  116. #define INIT_m4n1 "vpxor %%ymm4,%%ymm4,%%ymm4; vpxor %%ymm5,%%ymm5,%%ymm5;"
  117. #define INIT_m4n2 zero_4ymm(4,5,6,7)
  118. #define INIT_m4n4 INIT_m4n2 zero_4ymm(8,9,10,11)
  119. #define INIT_m4n6 INIT_m4n4 zero_4ymm(12,13,14,15)
  /* save_init_m4: %3 = %2, %2 advances by 32 bytes, and the floats at (%6) and
     4(%6) are broadcast into ymm0/ymm1. Each SAVE_m4n* contracts an accumulator
     pair with cont_expacc and stores it with save_1ymm at (%3) or (%3,%4,1);
     %3 moves forward by 2 * %4 between column pairs. */
  120. #define save_init_m4 "movq %2,%3; addq $32,%2; vbroadcastss (%6),%%ymm0; vbroadcastss 4(%6),%%ymm1;"
  121. #define SAVE_m4n1 save_init_m4 cont_expacc(4,5,4) save_1ymm(4,2,0,0,1,%3)
  122. #define SAVE_m4n2 SAVE_m4n1 cont_expacc(6,7,6) save_1ymm(6,3,0,0,1,%3,%4,1)
  123. #define SAVE_m4n4 SAVE_m4n2 "leaq (%3,%4,2),%3;"\
  124. cont_expacc(8,9,8) cont_expacc(10,11,10) save_1ymm(8,2,0,0,1,%3) save_1ymm(10,3,0,0,1,%3,%4,1)
  125. #define SAVE_m4n6 SAVE_m4n4 "leaq (%3,%4,2),%3;"\
  126. cont_expacc(12,13,12) cont_expacc(14,15,14) save_1ymm(12,2,0,0,1,%3) save_1ymm(14,3,0,0,1,%3,%4,1)
  /* COMPUTE_m4(ndim): resets %1 from r14, clears the accumulators, loads %5
     from r13 and runs one k-step per iteration until %5 reaches zero (the loop
     is skipped entirely when %5 is zero), then saves the tile. */
  127. #define COMPUTE_m4(ndim) \
  128. "movq %%r14,%1;" INIT_m4n##ndim "movq %%r13,%5;"\
  129. "testq %5,%5; jz "#ndim"4442f;"\
  130. #ndim"4441:\n\t"\
  131. KERNEL_k1m4n##ndim\
  132. "decq %5; jnz "#ndim"4441b;"\
  133. #ndim"4442:\n\t"\
  134. SAVE_m4n##ndim
  135. /* m=2, xmm 0-3 temp, xmm 4-15 acc, expanded accumulators */
  /* acc_m2n1_exp: xmm counterpart of acc_m4n1_exp: cl += ar * b2 and
     cr += ai * b2, with the second product subtracted when A_CONJ != B_CONJ. */
  136. #if A_CONJ == B_CONJ
  137. #define acc_m2n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%xmm"#ar",%%xmm"#b2",%%xmm"#cl"; vfmadd231ps %%xmm"#ai",%%xmm"#b2",%%xmm"#cr";"
  138. #else
  139. #define acc_m2n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%xmm"#ar",%%xmm"#b2",%%xmm"#cl"; vfnmadd231ps %%xmm"#ai",%%xmm"#b2",%%xmm"#cr";"
  140. #endif
  /* KERNEL_h_k1m2n1: loads the 16 bytes at (%0) with vmovsldup/vmovshdup into
     xmm0/xmm1, advances %0 by 16, duplicates the 8 bytes at (%1) into xmm2 and
     accumulates into xmm4/xmm5. %1 is not advanced by the _h_ forms. */
  141. #define KERNEL_h_k1m2n1 \
  142. "vmovsldup (%0),%%xmm0; vmovshdup (%0),%%xmm1; addq $16,%0;"\
  143. "vmovddup (%1),%%xmm2;" acc_m2n1_exp(0,1,2,4,5)
  144. #define KERNEL_h_k1m2n2 KERNEL_h_k1m2n1\
  145. "vmovddup 8(%1),%%xmm3;" acc_m2n1_exp(0,1,3,6,7)
  /* acc_m2n2_exp: same as above for the two 8-byte values at (<addr>) and
     8(<addr>); the variadic arguments are stringized into the address. */
  146. #define acc_m2n2_exp(c1,c2,c3,c4,...)\
  147. "vmovddup ("#__VA_ARGS__"),%%xmm2;" acc_m2n1_exp(0,1,2,c1,c2)\
  148. "vmovddup 8("#__VA_ARGS__"),%%xmm3;" acc_m2n1_exp(0,1,3,c3,c4)
  149. #define KERNEL_h_k1m2n4 KERNEL_h_k1m2n2 acc_m2n2_exp(8,9,10,11,%1,%%r12,1)
  150. #define KERNEL_h_k1m2n6 KERNEL_h_k1m2n4 acc_m2n2_exp(12,13,14,15,%1,%%r12,2)
  /* Full k-steps: the _h_ body followed by the %1 advance (8 bytes for n1,
     16 bytes otherwise). */
  151. #define KERNEL_k1m2n1 KERNEL_h_k1m2n1 "addq $8,%1;"
  152. #define KERNEL_k1m2n2 KERNEL_h_k1m2n2 "addq $16,%1;"
  153. #define KERNEL_k1m2n4 KERNEL_h_k1m2n4 "addq $16,%1;"
  154. #define KERNEL_k1m2n6 KERNEL_h_k1m2n6 "addq $16,%1;"
  /* zero_2xmm clears two xmm registers; INIT_m2n* clear two per column. */
  155. #define zero_2xmm(no1,no2) "vpxor %%xmm"#no1",%%xmm"#no1",%%xmm"#no1"; vpxor %%xmm"#no2",%%xmm"#no2",%%xmm"#no2";"
  156. #define INIT_m2n1 zero_2xmm(4,5)
  157. #define INIT_m2n2 INIT_m2n1 zero_2xmm(6,7)
  158. #define INIT_m2n4 INIT_m2n2 zero_2xmm(8,9) zero_2xmm(10,11)
  159. #define INIT_m2n6 INIT_m2n4 zero_2xmm(12,13) zero_2xmm(14,15)
  /*
   * Store-back macros for the m=2 tile. Every instruction here operates on xmm
   * registers, in line with the m=2 kernels and INIT_m2n* which only ever touch
   * xmm4-xmm15.
   *
   * cont_expxmmacc(cl,cr,dst): contracts an expanded accumulator pair into dst.
   * cr is rewritten in place with adjacent floats swapped (vpermilps $177) and
   * then combined with cl by vaddsubps; the operand order depends on whether
   * A_CONJ equals B_CONJ.
   */
  #if A_CONJ == B_CONJ
  #define cont_expxmmacc(cl,cr,dst) "vpermilps $177,%%xmm"#cr",%%xmm"#cr"; vaddsubps %%xmm"#cl",%%xmm"#cr",%%xmm"#dst";"
  #else
  #define cont_expxmmacc(cl,cr,dst) "vpermilps $177,%%xmm"#cr",%%xmm"#cr"; vaddsubps %%xmm"#cr",%%xmm"#cl",%%xmm"#dst";"
  #endif
  /*
   * save_1xmm(c,tmp,alpr,alpi): updates the 16 bytes at (%3) from accumulator c,
   * a pair-swapped copy of c held in tmp, and the registers alpr/alpi; the first
   * FMA reads the current memory contents. The result is stored back to (%3) and
   * %3 is then advanced by %4. With A_CONJ == 0 the result is built in c,
   * otherwise in tmp.
   */
  #if A_CONJ == 0
  #define save_1xmm(c,tmp,alpr,alpi) \
  "vpermilps $177,%%xmm"#c",%%xmm"#tmp"; vfmsubadd213ps (%3),%%xmm"#alpr",%%xmm"#c";"\
  "vfmsubadd231ps %%xmm"#tmp",%%xmm"#alpi",%%xmm"#c"; vmovups %%xmm"#c",(%3); addq %4,%3;"
  #else
  #define save_1xmm(c,tmp,alpr,alpi) \
  "vpermilps $177,%%xmm"#c",%%xmm"#tmp"; vfmaddsub213ps (%3),%%xmm"#alpi",%%xmm"#tmp";"\
  "vfmaddsub231ps %%xmm"#c",%%xmm"#alpr",%%xmm"#tmp"; vmovups %%xmm"#tmp",(%3); addq %4,%3;"
  #endif
  /* save_init_m2: %3 = %2, %2 advances by 16 bytes, and the floats at (%6) and
     4(%6) are broadcast into xmm0 and xmm1. */
  #define save_init_m2 "movq %2,%3; addq $16,%2; vbroadcastss (%6),%%xmm0; vbroadcastss 4(%6),%%xmm1;"
  /*
   * SAVE_m2n<N>: contract and store one accumulator pair per column.
   * All columns use cont_expxmmacc, the same 128-bit contraction as SAVE_m2n1,
   * rather than the 256-bit cont_expacc, so the m=2 path stays xmm-only and does
   * not depend on the upper ymm lanes of the accumulators being zero.
   */
  #define SAVE_m2n1 save_init_m2 cont_expxmmacc(4,5,4) save_1xmm(4,2,0,1)
  #define SAVE_m2n2 SAVE_m2n1 cont_expxmmacc(6,7,6) save_1xmm(6,3,0,1)
  #define SAVE_m2n4 SAVE_m2n2 cont_expxmmacc(8,9,8) save_1xmm(8,2,0,1) cont_expxmmacc(10,11,10) save_1xmm(10,3,0,1)
  #define SAVE_m2n6 SAVE_m2n4 cont_expxmmacc(12,13,12) save_1xmm(12,2,0,1) cont_expxmmacc(14,15,14) save_1xmm(14,3,0,1)
  /* COMPUTE_m2(ndim): handles one tile of 2 rows by ndim columns. Resets %1
     from r14, clears the accumulators, loads %5 from r13 and runs one k-step
     per iteration until %5 reaches zero (the loop is skipped when %5 is zero),
     then saves the tile. */
  179. #define COMPUTE_m2(ndim) \
  180. "movq %%r14,%1;" INIT_m2n##ndim "movq %%r13,%5;"\
  181. "testq %5,%5; jz "#ndim"2222f;"\
  182. #ndim"2221:\n\t"\
  183. KERNEL_k1m2n##ndim\
  184. "decq %5; jnz "#ndim"2221b;"\
  185. #ndim"2222:\n\t"\
  186. SAVE_m2n##ndim
  187. /* m=1, xmm 0-3 temp, xmm 4-9 acc, expanded accumulators */
  /* acc_m1n1_exp and acc_m1n2_exp expand to the same instruction pair (only the
     parameter names differ): cl += first * b and cr += second * b, with the
     second product subtracted when A_CONJ != B_CONJ. */
  188. #if A_CONJ == B_CONJ
  189. #define acc_m1n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%xmm"#ar",%%xmm"#b2",%%xmm"#cl"; vfmadd231ps %%xmm"#ai",%%xmm"#b2",%%xmm"#cr";"
  190. #define acc_m1n2_exp(arb,aib,b4,cl,cr) "vfmadd231ps %%xmm"#arb",%%xmm"#b4",%%xmm"#cl"; vfmadd231ps %%xmm"#aib",%%xmm"#b4",%%xmm"#cr";"
  191. #else
  192. #define acc_m1n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%xmm"#ar",%%xmm"#b2",%%xmm"#cl"; vfnmadd231ps %%xmm"#ai",%%xmm"#b2",%%xmm"#cr";"
  193. #define acc_m1n2_exp(arb,aib,b4,cl,cr) "vfmadd231ps %%xmm"#arb",%%xmm"#b4",%%xmm"#cl"; vfnmadd231ps %%xmm"#aib",%%xmm"#b4",%%xmm"#cr";"
  194. #endif
  /* KERNEL_k1m1n1: broadcasts the floats at (%0) and 4(%0) into xmm0/xmm1,
     advances %0 by 8, loads the 8 bytes at (%1) into xmm2, advances %1 by 8 and
     accumulates into xmm4/xmm5. */
  195. #define KERNEL_k1m1n1 \
  196. "vbroadcastss (%0),%%xmm0; vbroadcastss 4(%0),%%xmm1; addq $8,%0;"\
  197. "vmovsd (%1),%%xmm2; addq $8,%1;" acc_m1n1_exp(0,1,2,4,5)
  /* KERNEL_h_k1m1n2/4/6: same broadcasts, but a full 16 bytes are loaded from
     (%1), (%1,%r12,1) and (%1,%r12,2) per column pair; %1 is advanced by 16 only
     in the KERNEL_k1m1n2/4/6 wrappers. */
  198. #define KERNEL_h_k1m1n2 \
  199. "vbroadcastss (%0),%%xmm0; vbroadcastss 4(%0),%%xmm1; addq $8,%0;"\
  200. "vmovups (%1),%%xmm2;" acc_m1n2_exp(0,1,2,4,5)
  201. #define KERNEL_h_k1m1n4 KERNEL_h_k1m1n2 "vmovups (%1,%%r12,1),%%xmm2;" acc_m1n2_exp(0,1,2,6,7)
  202. #define KERNEL_h_k1m1n6 KERNEL_h_k1m1n4 "vmovups (%1,%%r12,2),%%xmm2;" acc_m1n2_exp(0,1,2,8,9)
  203. #define KERNEL_k1m1n2 KERNEL_h_k1m1n2 "addq $16,%1;"
  204. #define KERNEL_k1m1n4 KERNEL_h_k1m1n4 "addq $16,%1;"
  205. #define KERNEL_k1m1n6 KERNEL_h_k1m1n6 "addq $16,%1;"
  /* One accumulator pair per one or two columns: xmm4/5, xmm6/7, xmm8/9. */
  206. #define INIT_m1n1 zero_2xmm(4,5)
  207. #define INIT_m1n2 zero_2xmm(4,5)
  208. #define INIT_m1n4 INIT_m1n2 zero_2xmm(6,7)
  209. #define INIT_m1n6 INIT_m1n4 zero_2xmm(8,9)
  /* save_m1n1: loads 8 bytes from (%3) into tmp2, combines them with c, its
     pair-swapped copy in tmp1 and alpr/alpi, and stores 8 bytes back to (%3).
     save_m1n2: same, but the low 8 bytes come from and go to (%3) and the high
     8 bytes come from and go to (%3,%4,1); %3 then advances by 2 * %4.
     With A_CONJ == 0 the result is built in c, otherwise in tmp1. */
  210. #if A_CONJ == 0
  211. #define save_m1n1(c,tmp1,tmp2,alpr,alpi) \
  212. "vpermilps $177,%%xmm"#c",%%xmm"#tmp1"; vmovsd (%3),%%xmm"#tmp2"; vfmsubadd213ps %%xmm"#tmp2",%%xmm"#alpr",%%xmm"#c";"\
  213. "vfmsubadd231ps %%xmm"#tmp1",%%xmm"#alpi",%%xmm"#c"; vmovsd %%xmm"#c",(%3);"
  214. #define save_m1n2(c,tmp1,tmp2,alpr,alpi) \
  215. "vpermilps $177,%%xmm"#c",%%xmm"#tmp1"; vmovsd (%3),%%xmm"#tmp2"; vmovhpd (%3,%4,1),%%xmm"#tmp2",%%xmm"#tmp2";"\
  216. "vfmsubadd213ps %%xmm"#tmp2",%%xmm"#alpr",%%xmm"#c"; vfmsubadd231ps %%xmm"#tmp1",%%xmm"#alpi",%%xmm"#c";"\
  217. "vmovsd %%xmm"#c",(%3); vmovhpd %%xmm"#c",(%3,%4,1); leaq (%3,%4,2),%3;"
  218. #else
  219. #define save_m1n1(c,tmp1,tmp2,alpr,alpi) \
  220. "vpermilps $177,%%xmm"#c",%%xmm"#tmp1"; vmovsd (%3),%%xmm"#tmp2"; vfmaddsub213ps %%xmm"#tmp2",%%xmm"#alpi",%%xmm"#tmp1";"\
  221. "vfmaddsub231ps %%xmm"#c",%%xmm"#alpr",%%xmm"#tmp1"; vmovsd %%xmm"#tmp1",(%3);"
  222. #define save_m1n2(c,tmp1,tmp2,alpr,alpi) \
  223. "vpermilps $177,%%xmm"#c",%%xmm"#tmp1"; vmovsd (%3),%%xmm"#tmp2"; vmovhpd (%3,%4,1),%%xmm"#tmp2",%%xmm"#tmp2";"\
  224. "vfmaddsub213ps %%xmm"#tmp2",%%xmm"#alpi",%%xmm"#tmp1"; vfmaddsub231ps %%xmm"#c",%%xmm"#alpr",%%xmm"#tmp1";"\
  225. "vmovsd %%xmm"#tmp1",(%3); vmovhpd %%xmm"#tmp1",(%3,%4,1); leaq (%3,%4,2),%3;"
  226. #endif
  /* save_init_m1: %3 = %2, %2 advances by 8 bytes, and the floats at (%6) and
     4(%6) are broadcast into xmm0/xmm1. */
  227. #define save_init_m1 "movq %2,%3; addq $8,%2; vbroadcastss (%6),%%xmm0; vbroadcastss 4(%6),%%xmm1;"
  228. #define SAVE_m1n1 save_init_m1 cont_expxmmacc(4,5,4) save_m1n1(4,2,3,0,1)
  229. #define SAVE_m1n2 save_init_m1 cont_expxmmacc(4,5,4) save_m1n2(4,2,3,0,1)
  230. #define SAVE_m1n4 SAVE_m1n2 cont_expxmmacc(6,7,6) save_m1n2(6,2,3,0,1)
  231. #define SAVE_m1n6 SAVE_m1n4 cont_expxmmacc(8,9,8) save_m1n2(8,2,3,0,1)
  /* COMPUTE_m1(ndim): handles a single row by ndim columns. Resets %1 from r14,
     clears the accumulators, loads %5 from r13 and runs one k-step per iteration
     until %5 reaches zero (skipped when %5 is zero), then saves. */
  232. #define COMPUTE_m1(ndim) \
  233. "movq %%r14,%1;" INIT_m1n##ndim "movq %%r13,%5;"\
  234. "testq %5,%5; jz "#ndim"1112f;"\
  235. #ndim"1111:\n\t"\
  236. KERNEL_k1m1n##ndim\
  237. "decq %5; jnz "#ndim"1111b;"\
  238. #ndim"1112:\n\t"\
  239. SAVE_m1n##ndim
  /* COMPUTE(ndim): processes one panel of ndim columns with a single asm
     statement. It relies on these names being in scope at the expansion site:
     a_ptr, b_ptr, c_ptr, c_tmp, ldc_in_bytes, K, alp, M, b_pref and LDC.
     Before the asm, b_pref is set to b_ptr + ndim * K * 2.
     Inside the asm, %1, %5 and %7 are saved in r14, r13 and r11, and r12 is set
     to %5 << 4. Rows are then consumed in tiles of 8 (repeated while %7 >= 8),
     then at most one tile each of 4, 2 and 1, with %7 reduced by the tile height
     after the 8-, 4- and 2-row tiles. At the end %1, %5 and %7 are restored from
     the saved registers and vzeroupper is executed.
     All nine operands are read-write ("+r"); r11-r15, xmm0-xmm15, flags and
     memory are declared clobbered.
     After the asm, a_ptr is moved back by M * K * 2, b_ptr forward by
     ndim * K * 2, and c_ptr forward by (ndim * LDC - M) * 2 elements. */
  240. #define COMPUTE(ndim) {\
  241. b_pref = b_ptr + ndim * K *2;\
  242. __asm__ __volatile__ (\
  243. "movq %1,%%r14; movq %5,%%r13; movq %5,%%r12; salq $4,%%r12; movq %7,%%r11;"\
  244. "cmpq $8,%7; jb "#ndim"9992f;"\
  245. #ndim"9991:\n\t"\
  246. COMPUTE_m8(ndim)\
  247. "subq $8,%7; cmpq $8,%7; jnb "#ndim"9991b;"\
  248. #ndim"9992:\n\t"\
  249. "cmpq $4,%7; jb "#ndim"9993f;"\
  250. COMPUTE_m4(ndim) "subq $4,%7;"\
  251. #ndim"9993:\n\t"\
  252. "cmpq $2,%7; jb "#ndim"9994f;"\
  253. COMPUTE_m2(ndim) "subq $2,%7;"\
  254. #ndim"9994:\n\t"\
  255. "testq %7,%7; jz "#ndim"9995f;"\
  256. COMPUTE_m1(ndim)\
  257. #ndim"9995:\n\t"\
  258. "movq %%r14,%1; movq %%r13,%5; movq %%r11,%7; vzeroupper;"\
  259. :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_in_bytes),"+r"(K),"+r"(alp),"+r"(M),"+r"(b_pref)\
  260. ::"cc","memory","r11","r12","r13","r14","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5",\
  261. "xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\
  262. a_ptr -= M * K *2; b_ptr += ndim * K *2; c_ptr += (ndim * LDC - M) * 2;\
  263. }
  264. int __attribute__ ((noinline))
  265. CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alphar, float alphai, float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, BLASLONG LDC)
  266. {
  267. if(m==0||n==0||k==0||(alphar==0.0 && alphai==0.0)) return 0;
  268. int64_t ldc_in_bytes = (int64_t)LDC * sizeof(float) * 2;
  269. #if A_CONJ == B_CONJ
  270. float const_val[2] = {-alphar, -alphai};
  271. #else
  272. float const_val[2] = {alphar, alphai};
  273. #endif
  274. int64_t M = (int64_t)m, K = (int64_t)k;
  275. BLASLONG n_count = n;
  276. float *a_ptr = A,*b_ptr = B,*c_ptr = C,*c_tmp = C,*alp = const_val,*b_pref = B;
  277. for(;n_count>5;n_count-=6) COMPUTE(6)
  278. for(;n_count>3;n_count-=4) COMPUTE(4)
  279. for(;n_count>1;n_count-=2) COMPUTE(2)
  280. if(n_count>0) COMPUTE(1)
  281. return 0;
  282. }