I don't have as many benchmarks for these as for gemm, but it should still make a difference for small matrices.
tags/v0.3.1
@@ -40,11 +40,11 @@ | |||||
#include "common.h" | #include "common.h" | ||||
#ifdef FUNCTION_PROFILE | #ifdef FUNCTION_PROFILE | ||||
#include "functable.h" | #include "functable.h" | ||||
#endif | |||||
#endif | |||||
#if defined(Z13) | #if defined(Z13) | ||||
#define MULTI_THREAD_MINIMAL 200000 | #define MULTI_THREAD_MINIMAL 200000 | ||||
#else | #else | ||||
#define MULTI_THREAD_MINIMAL 10000 | |||||
#define MULTI_THREAD_MINIMAL 10000 | |||||
#endif | #endif | ||||
#ifndef CBLAS | #ifndef CBLAS | ||||
@@ -83,17 +83,15 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc | |||||
if (incy < 0) y -= (n - 1) * incy; | if (incy < 0) y -= (n - 1) * incy; | ||||
#ifdef SMP | #ifdef SMP | ||||
nthreads = num_cpu_avail(1); | |||||
//disable multi-thread when incx==0 or incy==0 | //disable multi-thread when incx==0 or incy==0 | ||||
//In that case, the threads would be dependent. | //In that case, the threads would be dependent. | ||||
if (incx == 0 || incy == 0) | |||||
nthreads = 1; | |||||
// | |||||
//Temporarily work-around the low performance issue with small imput size & | //Temporarily work-around the low performance issue with small imput size & | ||||
//multithreads. | //multithreads. | ||||
if (n <= MULTI_THREAD_MINIMAL) | |||||
if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL) | |||||
nthreads = 1; | nthreads = 1; | ||||
else | |||||
nthreads = num_cpu_avail(1); | |||||
if (nthreads == 1) { | if (nthreads == 1) { | ||||
#endif | #endif | ||||
@@ -76,10 +76,11 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx){ | |||||
#ifdef SMP | #ifdef SMP | ||||
nthreads = num_cpu_avail(1); | |||||
if (n <= 1048576 ) | if (n <= 1048576 ) | ||||
nthreads = 1; | nthreads = 1; | ||||
else | |||||
nthreads = num_cpu_avail(1); | |||||
if (nthreads == 1) { | if (nthreads == 1) { | ||||
#endif | #endif | ||||
@@ -90,18 +90,16 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in | |||||
if (incy < 0) y -= (n - 1) * incy * 2; | if (incy < 0) y -= (n - 1) * incy * 2; | ||||
#ifdef SMP | #ifdef SMP | ||||
nthreads = num_cpu_avail(1); | |||||
//disable multi-thread when incx==0 or incy==0 | //disable multi-thread when incx==0 or incy==0 | ||||
//In that case, the threads would be dependent. | //In that case, the threads would be dependent. | ||||
if (incx == 0 || incy == 0) | |||||
nthreads = 1; | |||||
//Work around the low performance issue with small imput size & | |||||
// | |||||
//Temporarily work-around the low performance issue with small imput size & | |||||
//multithreads. | //multithreads. | ||||
if (n <= MULTI_THREAD_MINIMAL) { | |||||
if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL) | |||||
nthreads = 1; | nthreads = 1; | ||||
} | |||||
else | |||||
nthreads = num_cpu_avail(1); | |||||
if (nthreads == 1) { | if (nthreads == 1) { | ||||
#endif | #endif | ||||
@@ -90,10 +90,10 @@ void CNAME(blasint n, FLOAT alpha_r, void *vx, blasint incx){ | |||||
FUNCTION_PROFILE_START(); | FUNCTION_PROFILE_START(); | ||||
#ifdef SMP | #ifdef SMP | ||||
nthreads = num_cpu_avail(1); | |||||
if ( n <= 1048576 ) | if ( n <= 1048576 ) | ||||
nthreads = 1; | nthreads = 1; | ||||
else | |||||
nthreads = num_cpu_avail(1); | |||||
if (nthreads == 1) { | if (nthreads == 1) { | ||||
#endif | #endif | ||||
@@ -79,12 +79,12 @@ FLOAT *y = (FLOAT*)vy; | |||||
if (incy < 0) y -= (n - 1) * incy * 2; | if (incy < 0) y -= (n - 1) * incy * 2; | ||||
#ifdef SMP | #ifdef SMP | ||||
nthreads = num_cpu_avail(1); | |||||
//disable multi-thread when incx==0 or incy==0 | //disable multi-thread when incx==0 or incy==0 | ||||
//In that case, the threads would be dependent. | //In that case, the threads would be dependent. | ||||
if (incx == 0 || incy == 0) | if (incx == 0 || incy == 0) | ||||
nthreads = 1; | nthreads = 1; | ||||
else | |||||
nthreads = num_cpu_avail(1); | |||||
if (nthreads == 1) { | if (nthreads == 1) { | ||||
#endif | #endif | ||||
@@ -233,13 +233,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
FLOAT asum = 0.0; | FLOAT asum = 0.0; | ||||
#if defined(SMP) | #if defined(SMP) | ||||
nthreads = num_cpu_avail(1); | |||||
if (inc_x == 0) | |||||
nthreads = 1; | |||||
if (n <= 10000) | |||||
if (inc_x == 0 || n <= 10000) | |||||
nthreads = 1; | nthreads = 1; | ||||
else | |||||
nthreads = num_cpu_avail(1); | |||||
if (nthreads == 1) { | if (nthreads == 1) { | ||||
asum = casum_compute(n, x, inc_x); | asum = casum_compute(n, x, inc_x); | ||||
@@ -183,13 +183,10 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
if (n <= 0) return 0; | if (n <= 0) return 0; | ||||
#if defined(SMP) | #if defined(SMP) | ||||
nthreads = num_cpu_avail(1); | |||||
if (inc_x == 0) | |||||
nthreads = 1; | |||||
if (n <= 10000) | |||||
if (inc_x == 0 || n <= 10000) | |||||
nthreads = 1; | nthreads = 1; | ||||
else | |||||
nthreads = num_cpu_avail(1); | |||||
if (nthreads == 1) { | if (nthreads == 1) { | ||||
do_copy(n, x, inc_x, y, inc_y); | do_copy(n, x, inc_x, y, inc_y); | ||||
@@ -228,13 +228,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
FLOAT asum = 0.0; | FLOAT asum = 0.0; | ||||
#if defined(SMP) | #if defined(SMP) | ||||
nthreads = num_cpu_avail(1); | |||||
if (inc_x == 0) | |||||
nthreads = 1; | |||||
if (n <= 10000) | |||||
if (inc_x == 0 || n <= 10000) | |||||
nthreads = 1; | nthreads = 1; | ||||
else | |||||
nthreads = num_cpu_avail(1); | |||||
if (nthreads == 1) { | if (nthreads == 1) { | ||||
asum = dasum_compute(n, x, inc_x); | asum = dasum_compute(n, x, inc_x); | ||||
@@ -199,7 +199,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
" faddp "DOTF", v0.2d \n" | " faddp "DOTF", v0.2d \n" | ||||
#endif /* !defined(DSDOT) */ | #endif /* !defined(DSDOT) */ | ||||
#else /* !defined(DOUBLE) */ | |||||
#else /* !defined(DOUBLE) */ | |||||
#define KERNEL_F1 \ | #define KERNEL_F1 \ | ||||
" ldr "TMPX", ["X"] \n" \ | " ldr "TMPX", ["X"] \n" \ | ||||
" ldr "TMPY", ["Y"] \n" \ | " ldr "TMPY", ["Y"] \n" \ | ||||
@@ -384,13 +384,10 @@ RETURN_TYPE CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y | |||||
RETURN_TYPE dot = 0.0; | RETURN_TYPE dot = 0.0; | ||||
#if defined(SMP) | #if defined(SMP) | ||||
nthreads = num_cpu_avail(1); | |||||
if (inc_x == 0 || inc_y == 0) | |||||
nthreads = 1; | |||||
if (n <= 10000) | |||||
if (inc_x == 0 || inc_y == 0 || n <= 10000) | |||||
nthreads = 1; | nthreads = 1; | ||||
else | |||||
nthreads = num_cpu_avail(1); | |||||
if (nthreads == 1) { | if (nthreads == 1) { | ||||
dot = dot_compute(n, x, inc_x, y, inc_y); | dot = dot_compute(n, x, inc_x, y, inc_y); | ||||
@@ -328,10 +328,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
if (n <= 0 || inc_x <= 0) return 0.0; | if (n <= 0 || inc_x <= 0) return 0.0; | ||||
#if defined(SMP) | #if defined(SMP) | ||||
nthreads = num_cpu_avail(1); | |||||
if (n <= 10000) | if (n <= 10000) | ||||
nthreads = 1; | nthreads = 1; | ||||
else | |||||
nthreads = num_cpu_avail(1); | |||||
if (nthreads == 1) { | if (nthreads == 1) { | ||||
nrm2_compute(n, x, inc_x, &ssq, &scale); | nrm2_compute(n, x, inc_x, &ssq, &scale); | ||||
@@ -235,10 +235,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
if (n <= 0 || inc_x <= 0) return 0.0; | if (n <= 0 || inc_x <= 0) return 0.0; | ||||
#if defined(SMP) | #if defined(SMP) | ||||
nthreads = num_cpu_avail(1); | |||||
if (n <= 10000) | if (n <= 10000) | ||||
nthreads = 1; | nthreads = 1; | ||||
else | |||||
nthreads = num_cpu_avail(1); | |||||
if (nthreads == 1) { | if (nthreads == 1) { | ||||
nrm2 = nrm2_compute(n, x, inc_x); | nrm2 = nrm2_compute(n, x, inc_x); | ||||
@@ -321,13 +321,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
BLASLONG max_index = 0; | BLASLONG max_index = 0; | ||||
#if defined(SMP) | #if defined(SMP) | ||||
nthreads = num_cpu_avail(1); | |||||
if (inc_x == 0) | |||||
nthreads = 1; | |||||
if (n <= 10000) | |||||
if (inc_x == 0 || n <= 10000) | |||||
nthreads = 1; | nthreads = 1; | ||||
else | |||||
nthreads = num_cpu_avail(1); | |||||
if (nthreads == 1) { | if (nthreads == 1) { | ||||
max_index = iamax_compute(n, x, inc_x); | max_index = iamax_compute(n, x, inc_x); | ||||
@@ -330,13 +330,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
BLASLONG max_index = 0; | BLASLONG max_index = 0; | ||||
#if defined(SMP) | #if defined(SMP) | ||||
nthreads = num_cpu_avail(1); | |||||
if (inc_x == 0) | |||||
nthreads = 1; | |||||
if (n <= 10000) | |||||
if (inc_x == 0 || n <= 10000) | |||||
nthreads = 1; | nthreads = 1; | ||||
else | |||||
nthreads = num_cpu_avail(1); | |||||
if (nthreads == 1) { | if (nthreads == 1) { | ||||
max_index = izamax_compute(n, x, inc_x); | max_index = izamax_compute(n, x, inc_x); | ||||
@@ -230,13 +230,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
FLOAT asum = 0.0; | FLOAT asum = 0.0; | ||||
#if defined(SMP) | #if defined(SMP) | ||||
nthreads = num_cpu_avail(1); | |||||
if (inc_x == 0) | |||||
nthreads = 1; | |||||
if (n <= 10000) | |||||
if (inc_x == 0 || n <= 10000) | |||||
nthreads = 1; | nthreads = 1; | ||||
else | |||||
nthreads = num_cpu_avail(1); | |||||
if (nthreads == 1) { | if (nthreads == 1) { | ||||
asum = sasum_compute(n, x, inc_x); | asum = sasum_compute(n, x, inc_x); | ||||
@@ -318,10 +318,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
if (n <= 0 || inc_x <= 0) return 0.0; | if (n <= 0 || inc_x <= 0) return 0.0; | ||||
#if defined(SMP) | #if defined(SMP) | ||||
nthreads = num_cpu_avail(1); | |||||
if (n <= 10000) | if (n <= 10000) | ||||
nthreads = 1; | nthreads = 1; | ||||
else | |||||
nthreads = num_cpu_avail(1); | |||||
if (nthreads == 1) { | if (nthreads == 1) { | ||||
nrm2_double = nrm2_compute(n, x, inc_x); | nrm2_double = nrm2_compute(n, x, inc_x); | ||||
@@ -230,13 +230,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
FLOAT asum = 0.0; | FLOAT asum = 0.0; | ||||
#if defined(SMP) | #if defined(SMP) | ||||
nthreads = num_cpu_avail(1); | |||||
if (inc_x == 0) | |||||
nthreads = 1; | |||||
if (n <= 10000) | |||||
if (inc_x == 0 || n <= 10000) | |||||
nthreads = 1; | nthreads = 1; | ||||
else | |||||
nthreads = num_cpu_avail(1); | |||||
if (nthreads == 1) { | if (nthreads == 1) { | ||||
asum = zasum_compute(n, x, inc_x); | asum = zasum_compute(n, x, inc_x); | ||||
@@ -317,13 +317,10 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA | |||||
CIMAG(zdot) = 0.0; | CIMAG(zdot) = 0.0; | ||||
#if defined(SMP) | #if defined(SMP) | ||||
nthreads = num_cpu_avail(1); | |||||
if (inc_x == 0 || inc_y == 0) | |||||
nthreads = 1; | |||||
if (n <= 10000) | |||||
if (inc_x == 0 || inc_y == 0 || n <= 10000) | |||||
nthreads = 1; | nthreads = 1; | ||||
else | |||||
nthreads = num_cpu_avail(1); | |||||
if (nthreads == 1) { | if (nthreads == 1) { | ||||
zdot_compute(n, x, inc_x, y, inc_y, &zdot); | zdot_compute(n, x, inc_x, y, inc_y, &zdot); | ||||
@@ -29,13 +29,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#include "common.h" | #include "common.h" | ||||
#if defined(BULLDOZER) | |||||
#if defined(BULLDOZER) | |||||
#include "ddot_microk_bulldozer-2.c" | #include "ddot_microk_bulldozer-2.c" | ||||
#elif defined(STEAMROLLER) || defined(EXCAVATOR) | #elif defined(STEAMROLLER) || defined(EXCAVATOR) | ||||
#include "ddot_microk_steamroller-2.c" | #include "ddot_microk_steamroller-2.c" | ||||
#elif defined(PILEDRIVER) | #elif defined(PILEDRIVER) | ||||
#include "ddot_microk_piledriver-2.c" | #include "ddot_microk_piledriver-2.c" | ||||
#elif defined(NEHALEM) | |||||
#elif defined(NEHALEM) | |||||
#include "ddot_microk_nehalem-2.c" | #include "ddot_microk_nehalem-2.c" | ||||
#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) | #elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) | ||||
#include "ddot_microk_haswell-2.c" | #include "ddot_microk_haswell-2.c" | ||||
@@ -110,7 +110,7 @@ static FLOAT dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLON | |||||
FLOAT temp1 = 0.0; | FLOAT temp1 = 0.0; | ||||
FLOAT temp2 = 0.0; | FLOAT temp2 = 0.0; | ||||
BLASLONG n1 = n & -4; | |||||
BLASLONG n1 = n & -4; | |||||
while(i < n1) | while(i < n1) | ||||
{ | { | ||||
@@ -169,13 +169,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
FLOAT dot = 0.0; | FLOAT dot = 0.0; | ||||
#if defined(SMP) | #if defined(SMP) | ||||
nthreads = num_cpu_avail(1); | |||||
if (inc_x == 0 || inc_y == 0) | |||||
nthreads = 1; | |||||
if (n <= 10000) | |||||
if (inc_x == 0 || inc_y == 0 || n <= 10000) | |||||
nthreads = 1; | nthreads = 1; | ||||
else | |||||
nthreads = num_cpu_avail(1); | |||||
if (nthreads == 1) { | if (nthreads == 1) { | ||||
dot = dot_compute(n, x, inc_x, y, inc_y); | dot = dot_compute(n, x, inc_x, y, inc_y); | ||||