You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

sasum_microk_haswell-2.c 2.5 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182
  1. #if (( defined(__GNUC__) && __GNUC__ > 6 ) || (defined(__clang__) && __clang_major__ >= 6)) && defined(__AVX2__)
  2. #define HAVE_SASUM_KERNEL 1
  3. #include <immintrin.h>
  4. #include <stdint.h>
  5. #ifndef ABS_K
  6. #define ABS_K(a) ((a) > 0 ? (a) : (-(a)))
  7. #endif
  8. static FLOAT sasum_kernel(BLASLONG n, FLOAT *x1)
  9. {
  10. BLASLONG i = 0;
  11. FLOAT sumf = 0.0;
  12. if (n >= 256) {
  13. BLASLONG align_256 = ((32 - ((uintptr_t)x1 & (uintptr_t)0x1f)) >> 2) & 0x7;
  14. for (i = 0; i < align_256; i++) {
  15. sumf += ABS_K(x1[i]);
  16. }
  17. n -= align_256;
  18. x1 += align_256;
  19. }
  20. BLASLONG tail_index_SSE = n&(~7);
  21. BLASLONG tail_index_AVX2 = n&(~255);
  22. if (n >= 256) {
  23. __m256 accum_0, accum_1, accum_2, accum_3;
  24. accum_0 = _mm256_setzero_ps();
  25. accum_1 = _mm256_setzero_ps();
  26. accum_2 = _mm256_setzero_ps();
  27. accum_3 = _mm256_setzero_ps();
  28. __m256i abs_mask = _mm256_set1_epi32(0x7fffffff);
  29. for (i = 0; i < tail_index_AVX2; i += 32) {
  30. accum_0 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+ 0]), abs_mask);
  31. accum_1 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+ 8]), abs_mask);
  32. accum_2 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+16]), abs_mask);
  33. accum_3 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+24]), abs_mask);
  34. }
  35. accum_0 = accum_0 + accum_1 + accum_2 + accum_3;
  36. __m128 half_accum0;
  37. half_accum0 = _mm_add_ps(_mm256_extractf128_ps(accum_0, 0), _mm256_extractf128_ps(accum_0, 1));
  38. half_accum0 = _mm_hadd_ps(half_accum0, half_accum0);
  39. half_accum0 = _mm_hadd_ps(half_accum0, half_accum0);
  40. sumf += half_accum0[0];
  41. }
  42. if (n >= 8) {
  43. __m128 accum_20, accum_21;
  44. accum_20 = _mm_setzero_ps();
  45. accum_21 = _mm_setzero_ps();
  46. __m128i abs_mask2 = _mm_set1_epi32(0x7fffffff);
  47. for (i = tail_index_AVX2; i < tail_index_SSE; i += 8) {
  48. accum_20 += (__m128)_mm_and_si128(_mm_loadu_si128(&x1[i + 0]), abs_mask2);
  49. accum_21 += (__m128)_mm_and_si128(_mm_loadu_si128(&x1[i + 4]), abs_mask2);
  50. }
  51. accum_20 += accum_21;
  52. accum_20 = _mm_hadd_ps(accum_20, accum_20);
  53. accum_20 = _mm_hadd_ps(accum_20, accum_20);
  54. sumf += accum_20[0];
  55. }
  56. for (i = tail_index_SSE; i < n; ++i) {
  57. sumf += ABS_K(x1[i]);
  58. }
  59. return sumf;
  60. }
  61. #endif