Improve the performance of dasum and sasum when SMP is defined (tags/v0.3.13^2)
@@ -58,21 +58,19 @@ static FLOAT dasum_kernel(BLASLONG n, FLOAT *x1) | |||
} | |||
#endif | |||
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
static FLOAT asum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
{ | |||
BLASLONG i=0; | |||
BLASLONG i = 0; | |||
FLOAT sumf = 0.0; | |||
if (n <= 0 || inc_x <= 0) return (sumf); | |||
if (n <= 0 || inc_x <= 0) return(sumf); | |||
if ( inc_x == 1 ) { | |||
if (inc_x == 1) { | |||
sumf = dasum_kernel(n, x); | |||
} | |||
} | |||
else { | |||
n *= inc_x; | |||
while(i < n) { | |||
while (i < n) { | |||
sumf += ABS_K(x[i]); | |||
i += inc_x; | |||
} | |||
@@ -80,3 +78,53 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
return(sumf); | |||
} | |||
#if defined(SMP) | |||
/*
 * Thread worker: runs asum_compute() over the n elements of x (stride inc_x)
 * and stores the value through `result`. Always returns 0.
 *
 * The dummy0..dummy5 parameters are not used in this function.
 * NOTE(review): they presumably exist only to match the callback signature
 * expected by blas_level1_thread_with_return_value() -- confirm there.
 */
static int asum_thread_function(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *dummy3, BLASLONG dummy4, FLOAT *result, BLASLONG dummy5)
{
    FLOAT partial = asum_compute(n, x, inc_x);

    *result = partial;
    return 0;
}
extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int (*function)(), int nthreads); | |||
#endif | |||
/*
 * Public entry point: returns the result of asum_compute() over the n
 * elements of x taken with stride inc_x.
 *
 * Without SMP this is a direct call to asum_compute().
 *
 * With SMP:
 *   - n <= 100000 or inc_x <= 0 is handled on the calling thread;
 *   - otherwise min(num_cpu_avail(1), n / 100000) threads are requested from
 *     blas_level1_thread_with_return_value(), each worker stores its value
 *     through the `result` pointer it receives, and the per-thread values are
 *     added up here.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
#if defined(SMP)
    int nthreads;
    /* Only its address is handed to the dispatcher; initialized so that no
     * indeterminate value is exposed if the dispatcher reads through it. */
    FLOAT dummy_alpha = 0.0;
#endif
    FLOAT sumf = 0.0;

#if defined(SMP)
    int num_cpu = num_cpu_avail(1);

    if (n <= 100000 || inc_x <= 0)
        nthreads = 1;
    else
        nthreads = num_cpu < n / 100000 ? num_cpu : n / 100000;

    if (nthreads == 1) {
        sumf = asum_compute(n, x, inc_x);
    } else {
        int mode, i;
        /* Raw byte buffer for the per-thread results. It is read back below at
         * a stride of 2 * sizeof(double) bytes per thread.
         * NOTE(review): that stride presumably mirrors the slot layout used by
         * blas_level1_thread_with_return_value() -- confirm there. */
        char result[MAX_CPU_NUMBER * sizeof(double) * 2];
        FLOAT *ptr;

#if !defined(DOUBLE)
        mode = BLAS_SINGLE | BLAS_REAL;
#else
        mode = BLAS_DOUBLE | BLAS_REAL;
#endif

        /* There is no second vector for this operation: pass NULL rather
         * than reading an uninitialized pointer variable. */
        blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, NULL, 0, result, 0, (void *)asum_thread_function, nthreads);

        ptr = (FLOAT *)result;
        for (i = 0; i < nthreads; i++) {
            sumf += (*ptr);
            ptr = (FLOAT *)(((char *)ptr) + sizeof(double) * 2);
        }
    }
#else
    sumf = asum_compute(n, x, inc_x);
#endif

    return (sumf);
}
@@ -67,24 +67,71 @@ static FLOAT sasum_kernel(BLASLONG n, FLOAT *x1) | |||
#endif | |||
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
/*
 * Single-threaded worker for the sum.
 *
 * Returns 0.0 when n <= 0 or inc_x <= 0. For unit stride the work is handed
 * to sasum_kernel(n, x); otherwise ABS_K() of every inc_x-th element is
 * accumulated over indices 0, inc_x, 2*inc_x, ... below n * inc_x.
 */
static FLOAT asum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    FLOAT acc = 0.0;
    BLASLONG pos;
    BLASLONG limit;

    if (n <= 0 || inc_x <= 0)
        return acc;

    if (inc_x == 1)
        return sasum_kernel(n, x);

    /* Strided path: `limit` is the exclusive upper bound on the index. */
    limit = n * inc_x;
    for (pos = 0; pos < limit; pos += inc_x)
        acc += ABS_K(x[pos]);

    return acc;
}
#if defined(SMP) | |||
/*
 * Thread worker: runs asum_compute() over the n elements of x (stride inc_x)
 * and stores the value through `result`. Always returns 0.
 *
 * The dummy0..dummy5 parameters are not used in this function.
 * NOTE(review): they presumably exist only to match the callback signature
 * expected by blas_level1_thread_with_return_value() -- confirm there.
 */
static int asum_thread_function(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *dummy3, BLASLONG dummy4, FLOAT *result, BLASLONG dummy5)
{
    FLOAT partial = asum_compute(n, x, inc_x);

    *result = partial;
    return 0;
}
extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void * alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int(*function)(), int nthreads); | |||
#endif | |||
/*
 * Public entry point: returns the result of asum_compute() over the n
 * elements of x taken with stride inc_x.
 *
 * Without SMP this is a direct call to asum_compute().
 *
 * With SMP:
 *   - n <= 100000 or inc_x <= 0 is handled on the calling thread;
 *   - otherwise min(num_cpu_avail(1), n / 100000) threads are requested from
 *     blas_level1_thread_with_return_value(), each worker stores its value
 *     through the `result` pointer it receives, and the per-thread values are
 *     added up here in thread order.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
#if !defined(SMP)
    return asum_compute(n, x, inc_x);
#else
    /* Raw byte buffer for the per-thread results, read back at a stride of
     * 2 * sizeof(double) bytes per thread.
     * NOTE(review): that stride presumably mirrors the slot layout used by
     * blas_level1_thread_with_return_value() -- confirm there. */
    char result[MAX_CPU_NUMBER * sizeof(double) * 2];
    FLOAT unused_alpha;
    FLOAT total = 0.0;
    BLASLONG wanted;
    int cpus = num_cpu_avail(1);
    int nthreads = 1;
    int mode;
    int t;

    /* One thread per full 100000 elements, capped by the available CPUs. */
    if (n > 100000 && inc_x > 0) {
        wanted = n / 100000;
        nthreads = (int)((cpus < wanted) ? cpus : wanted);
    }

    if (nthreads == 1)
        return asum_compute(n, x, inc_x);

#if defined(DOUBLE)
    mode = BLAS_DOUBLE | BLAS_REAL;
#else
    mode = BLAS_SINGLE | BLAS_REAL;
#endif

    blas_level1_thread_with_return_value(mode, n, 0, 0, &unused_alpha, x, inc_x, NULL, 0, result, 0, (void *)asum_thread_function, nthreads);

    for (t = 0; t < nthreads; t++)
        total += *(FLOAT *)(result + t * (sizeof(double) * 2));

    return total;
#endif
}