|
|
@@ -58,10 +58,10 @@ static FLOAT dasum_kernel(BLASLONG n, FLOAT *x1) |
|
|
|
|
|
|
|
__m128i abs_mask2 = _mm_set1_epi64x(0x7fffffffffffffff); |
|
|
|
for (i = tail_index_AVX512; i < tail_index_SSE; i += 8) { |
|
|
|
accum_20 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 0]), abs_mask2); |
|
|
|
accum_21 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 2]), abs_mask2); |
|
|
|
accum_22 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 4]), abs_mask2); |
|
|
|
accum_23 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 6]), abs_mask2); |
|
|
|
accum_20 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 0]), abs_mask2); |
|
|
|
accum_21 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 2]), abs_mask2); |
|
|
|
accum_22 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 4]), abs_mask2); |
|
|
|
accum_23 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 6]), abs_mask2); |
|
|
|
} |
|
|
|
|
|
|
|
accum_20 = accum_20 + accum_21 + accum_22 + accum_23; |
|
|
|