Browse Source

align to 64, using SSE when input size is small

tags/v0.3.11^2
Gengxin Xie 5 years ago
parent
commit
1b0f17eeed
6 changed files with 392 additions and 224 deletions
  1. +63
    -77
      kernel/x86_64/dasum.c
  2. +71
    -20
      kernel/x86_64/dasum_microk_haswell-2.c
  3. +66
    -13
      kernel/x86_64/dasum_microk_skylakex-2.c
  4. +66
    -80
      kernel/x86_64/sasum.c
  5. +67
    -21
      kernel/x86_64/sasum_microk_haswell-2.c
  6. +59
    -13
      kernel/x86_64/sasum_microk_skylakex-2.c

+ 63
- 77
kernel/x86_64/dasum.c View File

@@ -1,7 +1,8 @@
#include "common.h"
#include <math.h>

#define ABS fabs
#ifndef ABS_K
#define ABS_K(a) ((a) > 0 ? (a) : (-(a)))
#endif

#if defined(SKYLAKEX)
#include "dasum_microk_skylakex-2.c"
@@ -9,88 +10,73 @@
#include "dasum_microk_haswell-2.c"
#endif

#ifndef HAVE_KERNEL_16
static FLOAT dasum_kernel_16(BLASLONG n, FLOAT *x1)
#ifndef HAVE_DASUM_KERNEL
static FLOAT dasum_kernel(BLASLONG n, FLOAT *x1)
{

BLASLONG i=0;
FLOAT *x = x1;
FLOAT temp0, temp1, temp2, temp3;
FLOAT temp4, temp5, temp6, temp7;
FLOAT sum0 = 0.0;
FLOAT sum1 = 0.0;
FLOAT sum2 = 0.0;
FLOAT sum3 = 0.0;

while ( i< n )
{

temp0 = ABS(x[0]);
temp1 = ABS(x[1]);
temp2 = ABS(x[2]);
temp3 = ABS(x[3]);
temp4 = ABS(x[4]);
temp5 = ABS(x[5]);
temp6 = ABS(x[6]);
temp7 = ABS(x[7]);

sum0 += temp0;
sum1 += temp1;
sum2 += temp2;
sum3 += temp3;

sum0 += temp4;
sum1 += temp5;
sum2 += temp6;
sum3 += temp7;

x+=8;
i+=8;

}

return sum0+sum1+sum2+sum3;
BLASLONG i=0;
BLASLONG n_8 = n & -8;
FLOAT *x = x1;
FLOAT temp0, temp1, temp2, temp3;
FLOAT temp4, temp5, temp6, temp7;
FLOAT sum0 = 0.0;
FLOAT sum1 = 0.0;
FLOAT sum2 = 0.0;
FLOAT sum3 = 0.0;
FLOAT sum4 = 0.0;
while (i < n_8) {
temp0 = ABS_K(x[0]);
temp1 = ABS_K(x[1]);
temp2 = ABS_K(x[2]);
temp3 = ABS_K(x[3]);
temp4 = ABS_K(x[4]);
temp5 = ABS_K(x[5]);
temp6 = ABS_K(x[6]);
temp7 = ABS_K(x[7]);
sum0 += temp0;
sum1 += temp1;
sum2 += temp2;
sum3 += temp3;
sum0 += temp4;
sum1 += temp5;
sum2 += temp6;
sum3 += temp7;
x+=8;
i+=8;
}

while (i < n) {
sum4 += ABS_K(x1[i]);
i++;
}

return sum0+sum1+sum2+sum3+sum4;
}

#endif

FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
BLASLONG i=0;
FLOAT sumf = 0.0;
BLASLONG n1;

if (n <= 0 || inc_x <= 0) return(sumf);

if ( inc_x == 1 )
{

n1 = n & -16;
if ( n1 > 0 )
{

sumf = dasum_kernel_16(n1, x);
i=n1;
}

while(i < n)
{
sumf += ABS(x[i]);
i++;
}

}
else
{

n *= inc_x;
while(i < n)
{
sumf += ABS(x[i]);
i += inc_x;
}

}
return(sumf);
BLASLONG i=0;
FLOAT sumf = 0.0;

if (n <= 0 || inc_x <= 0) return(sumf);

if ( inc_x == 1 ) {
sumf = dasum_kernel(n, x);
}
else {
n *= inc_x;
while(i < n) {
sumf += ABS_K(x[i]);
i += inc_x;
}
}
return(sumf);
}


+ 71
- 20
kernel/x86_64/dasum_microk_haswell-2.c View File

@@ -1,35 +1,86 @@
#if (( defined(__GNUC__) && __GNUC__ > 6 ) || (defined(__clang__) && __clang_major__ >= 6)) && defined(__AVX2__)

#define HAVE_KERNEL_16 1
#define HAVE_DASUM_KERNEL

#include <immintrin.h>
#include <stdint.h>

static FLOAT dasum_kernel_16(BLASLONG n, FLOAT *x1)
#ifndef ABS_K
#define ABS_K(a) ((a) > 0 ? (a) : (-(a)))
#endif

static FLOAT dasum_kernel(BLASLONG n, FLOAT *x1)
{
BLASLONG i = 0;
__m256d accum_0, accum_1, accum_2, accum_3;

accum_0 = _mm256_setzero_pd();
accum_1 = _mm256_setzero_pd();
accum_2 = _mm256_setzero_pd();
accum_3 = _mm256_setzero_pd();

__m256i abs_mask = _mm256_set1_epi64x(0x7fffffffffffffff);
for (; i < n; i += 16) {
accum_0 += (__m256d)_mm256_and_si256(_mm256_loadu_si256(&x1[i+ 0]), abs_mask);
accum_1 += (__m256d)_mm256_and_si256(_mm256_loadu_si256(&x1[i+ 4]), abs_mask);
accum_2 += (__m256d)_mm256_and_si256(_mm256_loadu_si256(&x1[i+ 8]), abs_mask);
accum_3 += (__m256d)_mm256_and_si256(_mm256_loadu_si256(&x1[i+12]), abs_mask);
FLOAT sumf = 0.0;

if (n >= 256) {
BLASLONG align_256 = ((32 - ((uintptr_t)x1 & (uintptr_t)0x1f)) >> 3) & 0x3;

for (i = 0; i < align_256; i++) {
sumf += ABS_K(x1[i]);
}

n -= align_256;
x1 += align_256;
}

BLASLONG tail_index_SSE = n&(~7);
BLASLONG tail_index_AVX2 = n&(~255);

if (n >= 256) {
__m256d accum_0, accum_1, accum_2, accum_3;

accum_0 = _mm256_setzero_pd();
accum_1 = _mm256_setzero_pd();
accum_2 = _mm256_setzero_pd();
accum_3 = _mm256_setzero_pd();

__m256i abs_mask = _mm256_set1_epi64x(0x7fffffffffffffff);
for (i = 0; i < tail_index_AVX2; i += 16) {
accum_0 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+ 0]), abs_mask);
accum_1 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+ 4]), abs_mask);
accum_2 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+ 8]), abs_mask);
accum_3 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+12]), abs_mask);
}

accum_0 = accum_0 + accum_1 + accum_2 + accum_3;

__m128d half_accum0;
half_accum0 = _mm_add_pd(_mm256_extractf128_pd(accum_0, 0), _mm256_extractf128_pd(accum_0, 1));

half_accum0 = _mm_hadd_pd(half_accum0, half_accum0);

sumf += half_accum0[0];
}
if (n >= 8) {
__m128d accum_20, accum_21, accum_22, accum_23;
accum_20 = _mm_setzero_pd();
accum_21 = _mm_setzero_pd();
accum_22 = _mm_setzero_pd();
accum_23 = _mm_setzero_pd();

accum_0 = accum_0 + accum_1 + accum_2 + accum_3;
__m128i abs_mask2 = _mm_set1_epi64x(0x7fffffffffffffff);
for (i = tail_index_AVX2; i < tail_index_SSE; i += 8) {
accum_20 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 0]), abs_mask2);
accum_21 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 2]), abs_mask2);
accum_22 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 4]), abs_mask2);
accum_23 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 6]), abs_mask2);
}

__m128d half_accum0;
half_accum0 = _mm_add_pd(_mm256_extractf128_pd(accum_0, 0), _mm256_extractf128_pd(accum_0, 1));
accum_20 = accum_20 + accum_21 + accum_22 + accum_23;
__m128d half_accum20;
half_accum20 = _mm_hadd_pd(accum_20, accum_20);

half_accum0 = _mm_hadd_pd(half_accum0, half_accum0);
sumf += half_accum20[0];
}
for (i = tail_index_SSE; i < n; ++i) {
sumf += ABS_K(x1[i]);
}

return half_accum0[0];
return sumf;

}
#endif

+ 66
- 13
kernel/x86_64/dasum_microk_skylakex-2.c View File

@@ -1,27 +1,80 @@
/* need a new enough GCC for avx512 support */
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))

#if defined(__AVX512CD__)
#define HAVE_KERNEL_16 1
#define HAVE_DASUM_KERNEL 1

#include <immintrin.h>

static FLOAT dasum_kernel_16(BLASLONG n, FLOAT *x1)
#include <stdint.h>

#ifndef ABS_K
#define ABS_K(a) ((a) > 0 ? (a) : (-(a)))
#endif

static FLOAT dasum_kernel(BLASLONG n, FLOAT *x1)
{
BLASLONG i = 0;
FLOAT sumf = 0.0;

if (n >= 256) {
BLASLONG align_512 = ((64 - ((uintptr_t)x1 & (uintptr_t)0x3f)) >> 3) & 0x7;

__m512d accum_0, accum_1;
for (i = 0; i < align_512; i++) {
sumf += ABS_K(x1[i]);
}
n -= align_512;
x1 += align_512;
}

BLASLONG tail_index_SSE = n&(~7);
BLASLONG tail_index_AVX512 = n&(~255);

accum_0 = _mm512_setzero_pd();
accum_1 = _mm512_setzero_pd();
//
if ( n >= 256 ) {

for (; i < n; i += 16) {
accum_0 += _mm512_abs_pd(_mm512_loadu_pd(&x1[i+ 0]));
accum_1 += _mm512_abs_pd(_mm512_loadu_pd(&x1[i+ 8]));
__m512d accum_0, accum_1, accum_2, accum_3;
accum_0 = _mm512_setzero_pd();
accum_1 = _mm512_setzero_pd();
accum_2 = _mm512_setzero_pd();
accum_3 = _mm512_setzero_pd();
for (i = 0; i < tail_index_AVX512; i += 32) {
accum_0 += _mm512_abs_pd(_mm512_load_pd(&x1[i + 0]));
accum_1 += _mm512_abs_pd(_mm512_load_pd(&x1[i + 8]));
accum_2 += _mm512_abs_pd(_mm512_load_pd(&x1[i +16]));
accum_3 += _mm512_abs_pd(_mm512_load_pd(&x1[i +24]));
}

accum_0 = accum_0 + accum_1 + accum_2 + accum_3;
sumf += _mm512_reduce_add_pd(accum_0);
}

accum_0 += accum_1;
return _mm512_reduce_add_pd(accum_0);
if (n >= 8) {
__m128d accum_20, accum_21, accum_22, accum_23;
accum_20 = _mm_setzero_pd();
accum_21 = _mm_setzero_pd();
accum_22 = _mm_setzero_pd();
accum_23 = _mm_setzero_pd();

__m128i abs_mask2 = _mm_set1_epi64x(0x7fffffffffffffff);
for (i = tail_index_AVX512; i < tail_index_SSE; i += 8) {
accum_20 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 0]), abs_mask2);
accum_21 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 2]), abs_mask2);
accum_22 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 4]), abs_mask2);
accum_23 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 6]), abs_mask2);
}

accum_20 = accum_20 + accum_21 + accum_22 + accum_23;
__m128d half_accum20;
half_accum20 = _mm_hadd_pd(accum_20, accum_20);

sumf += half_accum20[0];
}

for (i = tail_index_SSE; i < n; ++i) {
sumf += ABS_K(x1[i]);
}

return sumf;
}
#endif
#endif

+ 66
- 80
kernel/x86_64/sasum.c View File

@@ -1,13 +1,11 @@
#include "common.h"
#include <math.h>

#if defined(DOUBLE)

#error supports float only

#else

#define ABS fabsf
#ifndef ABS_K
#define ABS_K(a) ((a) > 0 ? (a) : (-(a)))
#endif

#endif

@@ -17,88 +15,76 @@
#include "sasum_microk_haswell-2.c"
#endif

#ifndef HAVE_KERNEL_32
#ifndef HAVE_SASUM_KERNEL

static FLOAT sasum_kernel_32(BLASLONG n, FLOAT *x1)
static FLOAT sasum_kernel(BLASLONG n, FLOAT *x1)
{

BLASLONG i=0;
FLOAT *x = x1;
FLOAT temp0, temp1, temp2, temp3;
FLOAT temp4, temp5, temp6, temp7;
FLOAT sum0 = 0.0;
FLOAT sum1 = 0.0;
FLOAT sum2 = 0.0;
FLOAT sum3 = 0.0;

while ( i< n )
{

temp0 = ABS(x[0]);
temp1 = ABS(x[1]);
temp2 = ABS(x[2]);
temp3 = ABS(x[3]);
temp4 = ABS(x[4]);
temp5 = ABS(x[5]);
temp6 = ABS(x[6]);
temp7 = ABS(x[7]);

sum0 += temp0;
sum1 += temp1;
sum2 += temp2;
sum3 += temp3;

sum0 += temp4;
sum1 += temp5;
sum2 += temp6;
sum3 += temp7;

x+=8;
i+=8;

}

return sum0+sum1+sum2+sum3;
BLASLONG i=0;
BLASLONG n_8 = n & -8;
FLOAT *x = x1;
FLOAT temp0, temp1, temp2, temp3;
FLOAT temp4, temp5, temp6, temp7;
FLOAT sum0 = 0.0;
FLOAT sum1 = 0.0;
FLOAT sum2 = 0.0;
FLOAT sum3 = 0.0;
FLOAT sum4 = 0.0;

while (i < n_8) {

temp0 = ABS_K(x[0]);
temp1 = ABS_K(x[1]);
temp2 = ABS_K(x[2]);
temp3 = ABS_K(x[3]);
temp4 = ABS_K(x[4]);
temp5 = ABS_K(x[5]);
temp6 = ABS_K(x[6]);
temp7 = ABS_K(x[7]);

sum0 += temp0;
sum1 += temp1;
sum2 += temp2;
sum3 += temp3;

sum0 += temp4;
sum1 += temp5;
sum2 += temp6;
sum3 += temp7;

x+=8;
i+=8;

}

while (i < n) {
sum4 += ABS_K(x1[i]);
i++;
}

return sum0+sum1+sum2+sum3+sum4;
}

#endif

FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
BLASLONG i=0;
FLOAT sumf = 0.0;
BLASLONG n1;

if (n <= 0 || inc_x <= 0) return(sumf);

if ( inc_x == 1 )
{

n1 = n & -32;
if ( n1 > 0 )
{

sumf = sasum_kernel_32(n1, x);
i=n1;
}

while(i < n)
{
sumf += ABS(x[i]);
i++;
}

}
else
{

n *= inc_x;
while(i < n)
{
sumf += ABS(x[i]);
i += inc_x;
}

}
return(sumf);
BLASLONG i=0;
FLOAT sumf = 0.0;

if (n <= 0 || inc_x <= 0) return(sumf);

if ( inc_x == 1 ) {
sumf = sasum_kernel(n, x);
}
else {

n *= inc_x;
while(i < n) {
sumf += ABS_K(x[i]);
i += inc_x;
}

}
return(sumf);
}

+ 67
- 21
kernel/x86_64/sasum_microk_haswell-2.c View File

@@ -1,36 +1,82 @@
#if (( defined(__GNUC__) && __GNUC__ > 6 ) || (defined(__clang__) && __clang_major__ >= 6)) && defined(__AVX2__)

#define HAVE_KERNEL_32 1
#define HAVE_SASUM_KERNEL 1

#include <immintrin.h>
#include <stdint.h>

static FLOAT sasum_kernel_32(BLASLONG n, FLOAT *x1)
#ifndef ABS_K
#define ABS_K(a) ((a) > 0 ? (a) : (-(a)))
#endif

static FLOAT sasum_kernel(BLASLONG n, FLOAT *x1)
{
BLASLONG i = 0;
__m256 accum_0, accum_1, accum_2, accum_3;

accum_0 = _mm256_setzero_ps();
accum_1 = _mm256_setzero_ps();
accum_2 = _mm256_setzero_ps();
accum_3 = _mm256_setzero_ps();

__m256i abs_mask = _mm256_set1_epi32(0x7fffffff);
for (; i < n; i += 32) {
accum_0 += (__m256)_mm256_and_si256(_mm256_loadu_si256(&x1[i+ 0]), abs_mask);
accum_1 += (__m256)_mm256_and_si256(_mm256_loadu_si256(&x1[i+ 8]), abs_mask);
accum_2 += (__m256)_mm256_and_si256(_mm256_loadu_si256(&x1[i+16]), abs_mask);
accum_3 += (__m256)_mm256_and_si256(_mm256_loadu_si256(&x1[i+24]), abs_mask);
FLOAT sumf = 0.0;

if (n >= 256) {
BLASLONG align_256 = ((32 - ((uintptr_t)x1 & (uintptr_t)0x1f)) >> 2) & 0x7;

for (i = 0; i < align_256; i++) {
sumf += ABS_K(x1[i]);
}

n -= align_256;
x1 += align_256;
}

accum_0 = accum_0 + accum_1 + accum_2 + accum_3;
BLASLONG tail_index_SSE = n&(~7);
BLASLONG tail_index_AVX2 = n&(~255);

if (n >= 256) {
__m256 accum_0, accum_1, accum_2, accum_3;
accum_0 = _mm256_setzero_ps();
accum_1 = _mm256_setzero_ps();
accum_2 = _mm256_setzero_ps();
accum_3 = _mm256_setzero_ps();

__m128 half_accum0;
half_accum0 = _mm_add_ps(_mm256_extractf128_ps(accum_0, 0), _mm256_extractf128_ps(accum_0, 1));
__m256i abs_mask = _mm256_set1_epi32(0x7fffffff);
for (i = 0; i < tail_index_AVX2; i += 32) {
accum_0 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+ 0]), abs_mask);
accum_1 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+ 8]), abs_mask);
accum_2 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+16]), abs_mask);
accum_3 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+24]), abs_mask);
}

half_accum0 = _mm_hadd_ps(half_accum0, half_accum0);
half_accum0 = _mm_hadd_ps(half_accum0, half_accum0);
accum_0 = accum_0 + accum_1 + accum_2 + accum_3;
__m128 half_accum0;
half_accum0 = _mm_add_ps(_mm256_extractf128_ps(accum_0, 0), _mm256_extractf128_ps(accum_0, 1));

return half_accum0[0];
half_accum0 = _mm_hadd_ps(half_accum0, half_accum0);
half_accum0 = _mm_hadd_ps(half_accum0, half_accum0);

sumf += half_accum0[0];
}
if (n >= 8) {
__m128 accum_20, accum_21;
accum_20 = _mm_setzero_ps();
accum_21 = _mm_setzero_ps();

__m128i abs_mask2 = _mm_set1_epi32(0x7fffffff);
for (i = tail_index_AVX2; i < tail_index_SSE; i += 8) {
accum_20 += (__m128)_mm_and_si128(_mm_loadu_si128(&x1[i + 0]), abs_mask2);
accum_21 += (__m128)_mm_and_si128(_mm_loadu_si128(&x1[i + 4]), abs_mask2);
}
accum_20 += accum_21;
accum_20 = _mm_hadd_ps(accum_20, accum_20);
accum_20 = _mm_hadd_ps(accum_20, accum_20);
sumf += accum_20[0];
}

for (i = tail_index_SSE; i < n; ++i) {
sumf += ABS_K(x1[i]);
}

return sumf;
}
#endif

+ 59
- 13
kernel/x86_64/sasum_microk_skylakex-2.c View File

@@ -1,27 +1,73 @@
/* need a new enough GCC for avx512 support */
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))

#if defined(__AVX512CD__)
#define HAVE_KERNEL_32 1
#define HAVE_SASUM_KERNEL 1

#ifndef ABS_K
#define ABS_K(a) ((a) > 0 ? (a) : (-(a)))
#endif

#include <immintrin.h>
#include <stdint.h>

static FLOAT sasum_kernel_32(BLASLONG n, FLOAT *x1)
static FLOAT sasum_kernel(BLASLONG n, FLOAT *x1)
{
BLASLONG i = 0;
FLOAT sumf = 0.0;

if (n >= 256) {
BLASLONG align_512 = ((64 - ((uintptr_t)x1 & (uintptr_t)0x3f)) >> 2) & 0xf;

for (i = 0; i < align_512; i++) {
sumf += ABS_K(x1[i]);
}
n -= align_512;
x1 += align_512;
}

BLASLONG tail_index_SSE = n&(~7);
BLASLONG tail_index_AVX512 = n&(~255);

__m512 accum_0, accum_1;
if (n >= 256) {
__m512 accum_0, accum_1, accum_2, accum_3;
accum_0 = _mm512_setzero_ps();
accum_1 = _mm512_setzero_ps();
accum_2 = _mm512_setzero_ps();
accum_3 = _mm512_setzero_ps();

accum_0 = _mm512_setzero_ps();
accum_1 = _mm512_setzero_ps();
for (i = 0; i < tail_index_AVX512; i += 64) {
accum_0 += _mm512_abs_ps(_mm512_load_ps(&x1[i + 0]));
accum_1 += _mm512_abs_ps(_mm512_load_ps(&x1[i +16]));
accum_2 += _mm512_abs_ps(_mm512_load_ps(&x1[i +32]));
accum_3 += _mm512_abs_ps(_mm512_load_ps(&x1[i +48]));
}

for (; i < n; i += 32) {
accum_0 += _mm512_abs_ps(_mm512_loadu_ps(&x1[i+ 0]));
accum_1 += _mm512_abs_ps(_mm512_loadu_ps(&x1[i+ 16]));
accum_0 = accum_0 + accum_1 + accum_2 + accum_3;
sumf += _mm512_reduce_add_ps(accum_0);
}

accum_0 += accum_1;
return _mm512_reduce_add_ps(accum_0);
if (n >= 8) {
__m128 accum_20, accum_21;
accum_20 = _mm_setzero_ps();
accum_21 = _mm_setzero_ps();

__m128i abs_mask2 = _mm_set1_epi32(0x7fffffff);
for (i = tail_index_AVX512; i < tail_index_SSE; i += 8) {
accum_20 += (__m128)_mm_and_si128(_mm_loadu_si128(&x1[i + 0]), abs_mask2);
accum_21 += (__m128)_mm_and_si128(_mm_loadu_si128(&x1[i + 4]), abs_mask2);
}
accum_20 += accum_21;
accum_20 = _mm_hadd_ps(accum_20, accum_20);
accum_20 = _mm_hadd_ps(accum_20, accum_20);
sumf += accum_20[0];
}

for (i = tail_index_SSE; i < n; i++) {
sumf += ABS_K(x1[i]);
}

return sumf;
}
#endif
#endif

Loading…
Cancel
Save