You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

sasum_microk_haswell-2.c 2.7 kB

4 years ago
4 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485
  1. #ifdef __NVCOMPILER
  2. #define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
  3. #endif
  4. #if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
  5. #define HAVE_SASUM_KERNEL 1
  6. #include <immintrin.h>
  7. #include <stdint.h>
  8. #ifndef ABS_K
  9. #define ABS_K(a) ((a) > 0 ? (a) : (-(a)))
  10. #endif
  11. static FLOAT sasum_kernel(BLASLONG n, FLOAT *x1)
  12. {
  13. BLASLONG i = 0;
  14. FLOAT sumf = 0.0;
  15. if (n >= 256) {
  16. BLASLONG align_256 = ((32 - ((uintptr_t)x1 & (uintptr_t)0x1f)) >> 2) & 0x7;
  17. for (i = 0; i < align_256; i++) {
  18. sumf += ABS_K(x1[i]);
  19. }
  20. n -= align_256;
  21. x1 += align_256;
  22. }
  23. BLASLONG tail_index_SSE = n&(~7);
  24. BLASLONG tail_index_AVX2 = n&(~255);
  25. if (n >= 256) {
  26. __m256 accum_0, accum_1, accum_2, accum_3;
  27. accum_0 = _mm256_setzero_ps();
  28. accum_1 = _mm256_setzero_ps();
  29. accum_2 = _mm256_setzero_ps();
  30. accum_3 = _mm256_setzero_ps();
  31. __m256i abs_mask = _mm256_set1_epi32(0x7fffffff);
  32. for (i = 0; i < tail_index_AVX2; i += 32) {
  33. accum_0 += (__m256)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+ 0]), abs_mask);
  34. accum_1 += (__m256)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+ 8]), abs_mask);
  35. accum_2 += (__m256)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+16]), abs_mask);
  36. accum_3 += (__m256)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+24]), abs_mask);
  37. }
  38. accum_0 = accum_0 + accum_1 + accum_2 + accum_3;
  39. __m128 half_accum0;
  40. half_accum0 = _mm_add_ps(_mm256_extractf128_ps(accum_0, 0), _mm256_extractf128_ps(accum_0, 1));
  41. half_accum0 = _mm_hadd_ps(half_accum0, half_accum0);
  42. half_accum0 = _mm_hadd_ps(half_accum0, half_accum0);
  43. sumf += half_accum0[0];
  44. }
  45. if (n >= 8) {
  46. __m128 accum_20, accum_21;
  47. accum_20 = _mm_setzero_ps();
  48. accum_21 = _mm_setzero_ps();
  49. __m128i abs_mask2 = _mm_set1_epi32(0x7fffffff);
  50. for (i = tail_index_AVX2; i < tail_index_SSE; i += 8) {
  51. accum_20 += (__m128)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 0]), abs_mask2);
  52. accum_21 += (__m128)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 4]), abs_mask2);
  53. }
  54. accum_20 += accum_21;
  55. accum_20 = _mm_hadd_ps(accum_20, accum_20);
  56. accum_20 = _mm_hadd_ps(accum_20, accum_20);
  57. sumf += accum_20[0];
  58. }
  59. for (i = tail_index_SSE; i < n; ++i) {
  60. sumf += ABS_K(x1[i]);
  61. }
  62. return sumf;
  63. }
  64. #endif