You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; can include dashes ('-'); and can be up to 35 characters long.

dgemm_tcopy_16_skylakex.c 5.4 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130
  1. #include <stdio.h>
  2. #include "common.h"
  3. #include <immintrin.h>
  4. int CNAME(BLASLONG dim_second, BLASLONG dim_first, double *src, BLASLONG lead_dim, double *dst){
  5. double *src1, *src2, *src3, *src4, *dst1;
  6. __m512d z1,z2,z3,z4,z5,z6,z7,z8; __m256d y1,y2,y3,y4; __m128d x1,x2,x3,x4; double s1,s2,s3,s4;
  7. BLASLONG dim1_count, dim2_count, src_inc;
  8. src_inc = 4 * lead_dim - dim_first;
  9. src1 = src; src2 = src + lead_dim; src3 = src2 + lead_dim; src4 = src3 + lead_dim;
  10. for(dim2_count=dim_second; dim2_count>3; dim2_count-=4){
  11. dst1 = dst + 16 * (dim_second - dim2_count);
  12. for(dim1_count=dim_first; dim1_count>15; dim1_count-=16){
  13. z1 = _mm512_loadu_pd(src1); z2 = _mm512_loadu_pd(src1+8); src1 += 16;
  14. z3 = _mm512_loadu_pd(src2); z4 = _mm512_loadu_pd(src2+8); src2 += 16;
  15. z5 = _mm512_loadu_pd(src3); z6 = _mm512_loadu_pd(src3+8); src3 += 16;
  16. z7 = _mm512_loadu_pd(src4); z8 = _mm512_loadu_pd(src4+8); src4 += 16;
  17. _mm512_storeu_pd(dst1+ 0,z1); _mm512_storeu_pd(dst1+ 8,z2);
  18. _mm512_storeu_pd(dst1+16,z3); _mm512_storeu_pd(dst1+24,z4);
  19. _mm512_storeu_pd(dst1+32,z5); _mm512_storeu_pd(dst1+40,z6);
  20. _mm512_storeu_pd(dst1+48,z7); _mm512_storeu_pd(dst1+56,z8); dst1 += 16 * dim_second;
  21. }
  22. dst1 -= 8 * (dim_second - dim2_count);
  23. if(dim1_count>7){
  24. z1 = _mm512_loadu_pd(src1); src1 += 8;
  25. z2 = _mm512_loadu_pd(src2); src2 += 8;
  26. z3 = _mm512_loadu_pd(src3); src3 += 8;
  27. z4 = _mm512_loadu_pd(src4); src4 += 8;
  28. _mm512_storeu_pd(dst1+ 0,z1); _mm512_storeu_pd(dst1+ 8,z2);
  29. _mm512_storeu_pd(dst1+16,z3); _mm512_storeu_pd(dst1+24,z4); dst1 += 8 * dim_second;
  30. dim1_count -= 8;
  31. }
  32. dst1 -= 4 * (dim_second - dim2_count);
  33. if(dim1_count>3){
  34. y1 = _mm256_loadu_pd(src1); src1 += 4;
  35. y2 = _mm256_loadu_pd(src2); src2 += 4;
  36. y3 = _mm256_loadu_pd(src3); src3 += 4;
  37. y4 = _mm256_loadu_pd(src4); src4 += 4;
  38. _mm256_storeu_pd(dst1+ 0,y1); _mm256_storeu_pd(dst1+ 4,y2);
  39. _mm256_storeu_pd(dst1+ 8,y3); _mm256_storeu_pd(dst1+12,y4); dst1 += 4 * dim_second;
  40. dim1_count -= 4;
  41. }
  42. dst1 -= 2 * (dim_second - dim2_count);
  43. if(dim1_count>1){
  44. x1 = _mm_loadu_pd(src1); src1 += 2;
  45. x2 = _mm_loadu_pd(src2); src2 += 2;
  46. x3 = _mm_loadu_pd(src3); src3 += 2;
  47. x4 = _mm_loadu_pd(src4); src4 += 2;
  48. _mm_storeu_pd(dst1+0,x1); _mm_storeu_pd(dst1+2,x2);
  49. _mm_storeu_pd(dst1+4,x3); _mm_storeu_pd(dst1+6,x4); dst1 += 2 * dim_second;
  50. dim1_count -= 2;
  51. }
  52. dst1 -= dim_second - dim2_count;
  53. if(dim1_count>0){
  54. s1 = *src1; src1++; s2 = *src2; src2++; s3 = *src3; src3++; s4 = *src4; src4++;
  55. dst1[0] = s1; dst1[1] = s2; dst1[2] = s3; dst1[3] = s4;
  56. }
  57. src1 += src_inc; src2 += src_inc; src3 += src_inc; src4 += src_inc;
  58. }
  59. src_inc -= 2 * lead_dim;
  60. for(; dim2_count>1; dim2_count-=2){
  61. dst1 = dst + 16 * (dim_second - dim2_count);
  62. for(dim1_count=dim_first; dim1_count>15; dim1_count-=16){
  63. z1 = _mm512_loadu_pd(src1); z2 = _mm512_loadu_pd(src1+8); src1 += 16;
  64. z3 = _mm512_loadu_pd(src2); z4 = _mm512_loadu_pd(src2+8); src2 += 16;
  65. _mm512_storeu_pd(dst1+ 0,z1); _mm512_storeu_pd(dst1+ 8,z2);
  66. _mm512_storeu_pd(dst1+16,z3); _mm512_storeu_pd(dst1+24,z4); dst1 += 16 * dim_second;
  67. }
  68. dst1 -= 8 * (dim_second - dim2_count);
  69. if(dim1_count>7){
  70. z1 = _mm512_loadu_pd(src1); src1 += 8;
  71. z2 = _mm512_loadu_pd(src2); src2 += 8;
  72. _mm512_storeu_pd(dst1+ 0,z1); _mm512_storeu_pd(dst1+ 8,z2); dst1 += 8 * dim_second;
  73. dim1_count -= 8;
  74. }
  75. dst1 -= 4 * (dim_second - dim2_count);
  76. if(dim1_count>3){
  77. y1 = _mm256_loadu_pd(src1); src1 += 4;
  78. y2 = _mm256_loadu_pd(src2); src2 += 4;
  79. _mm256_storeu_pd(dst1+ 0,y1); _mm256_storeu_pd(dst1+ 4,y2); dst1 += 4 * dim_second;
  80. dim1_count -= 4;
  81. }
  82. dst1 -= 2 * (dim_second - dim2_count);
  83. if(dim1_count>1){
  84. x1 = _mm_loadu_pd(src1); src1 += 2;
  85. x2 = _mm_loadu_pd(src2); src2 += 2;
  86. _mm_storeu_pd(dst1+0,x1); _mm_storeu_pd(dst1+2,x2); dst1 += 2 * dim_second;
  87. dim1_count -= 2;
  88. }
  89. dst1 -= dim_second - dim2_count;
  90. if(dim1_count>0){
  91. s1 = *src1; src1++; s2 = *src2; src2++;
  92. dst1[0] = s1; dst1[1] = s2;
  93. }
  94. src1 += src_inc; src2 += src_inc;
  95. }
  96. src_inc -= lead_dim;
  97. for(; dim2_count>0; dim2_count--){
  98. dst1 = dst + 16 * (dim_second - dim2_count);
  99. for(dim1_count=dim_first; dim1_count>15; dim1_count-=16){
  100. z1 = _mm512_loadu_pd(src1); z2 = _mm512_loadu_pd(src1+8); src1 += 16;
  101. _mm512_storeu_pd(dst1+ 0,z1); _mm512_storeu_pd(dst1+ 8,z2); dst1 += 16 * dim_second;
  102. }
  103. dst1 -= 8 * (dim_second - dim2_count);
  104. if(dim1_count>7){
  105. z1 = _mm512_loadu_pd(src1); src1 += 8;
  106. _mm512_storeu_pd(dst1+ 0,z1); dst1 += 8 * dim_second;
  107. dim1_count -= 8;
  108. }
  109. dst1 -= 4 * (dim_second - dim2_count);
  110. if(dim1_count>3){
  111. y1 = _mm256_loadu_pd(src1); src1 += 4;
  112. _mm256_storeu_pd(dst1+ 0,y1); dst1 += 4 * dim_second;
  113. dim1_count -= 4;
  114. }
  115. dst1 -= 2 * (dim_second - dim2_count);
  116. if(dim1_count>1){
  117. x1 = _mm_loadu_pd(src1); src1 += 2;
  118. _mm_storeu_pd(dst1+0,x1); dst1 += 2 * dim_second;
  119. dim1_count -= 2;
  120. }
  121. dst1 -= dim_second - dim2_count;
  122. if(dim1_count>0){
  123. s1 = *src1; src1++;
  124. dst1[0] = s1;
  125. }
  126. src1 += src_inc;
  127. }
  128. return 0;
  129. }