Browse Source

Fixed a few more unnecessary calls to num_cpu_avail.

I don't have as many benchmarks for these as for gemm, but it should still
make a difference for small matrices.
tags/v0.3.1
Craig Donner 7 years ago
parent
commit
c2545b0fd6
18 changed files with 59 additions and 92 deletions
  1. +6
    -8
      interface/axpy.c
  2. +3
    -2
      interface/scal.c
  3. +6
    -8
      interface/zaxpy.c
  4. +2
    -2
      interface/zscal.c
  5. +2
    -2
      interface/zswap.c
  6. +3
    -6
      kernel/arm64/casum_thunderx2t99.c
  7. +3
    -6
      kernel/arm64/copy_thunderx2t99.c
  8. +3
    -6
      kernel/arm64/dasum_thunderx2t99.c
  9. +4
    -7
      kernel/arm64/dot_thunderx2t99.c
  10. +2
    -2
      kernel/arm64/dznrm2_thunderx2t99.c
  11. +2
    -2
      kernel/arm64/dznrm2_thunderx2t99_fast.c
  12. +3
    -6
      kernel/arm64/iamax_thunderx2t99.c
  13. +3
    -6
      kernel/arm64/izamax_thunderx2t99.c
  14. +3
    -6
      kernel/arm64/sasum_thunderx2t99.c
  15. +2
    -2
      kernel/arm64/scnrm2_thunderx2t99.c
  16. +3
    -6
      kernel/arm64/zasum_thunderx2t99.c
  17. +3
    -6
      kernel/arm64/zdot_thunderx2t99.c
  18. +6
    -9
      kernel/x86_64/ddot.c

+ 6
- 8
interface/axpy.c View File

@@ -40,11 +40,11 @@
#include "common.h"
#ifdef FUNCTION_PROFILE
#include "functable.h"
#endif
#endif
#if defined(Z13)
#define MULTI_THREAD_MINIMAL 200000
#else
#define MULTI_THREAD_MINIMAL 10000
#define MULTI_THREAD_MINIMAL 10000
#endif
#ifndef CBLAS

@@ -83,17 +83,15 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc
if (incy < 0) y -= (n - 1) * incy;

#ifdef SMP
nthreads = num_cpu_avail(1);

//disable multi-thread when incx==0 or incy==0
//In that case, the threads would be dependent.
if (incx == 0 || incy == 0)
nthreads = 1;

//
//Temporarily work around the low performance issue with small input size &
//multithreads.
if (n <= MULTI_THREAD_MINIMAL)
if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL)
nthreads = 1;
else
nthreads = num_cpu_avail(1);

if (nthreads == 1) {
#endif


+ 3
- 2
interface/scal.c View File

@@ -76,10 +76,11 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx){


#ifdef SMP
nthreads = num_cpu_avail(1);

if (n <= 1048576 )
nthreads = 1;
else
nthreads = num_cpu_avail(1);


if (nthreads == 1) {
#endif


+ 6
- 8
interface/zaxpy.c View File

@@ -90,18 +90,16 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in
if (incy < 0) y -= (n - 1) * incy * 2;

#ifdef SMP
nthreads = num_cpu_avail(1);

//disable multi-thread when incx==0 or incy==0
//In that case, the threads would be dependent.
if (incx == 0 || incy == 0)
nthreads = 1;

//Work around the low performance issue with small input size &
//
//Temporarily work around the low performance issue with small input size &
//multithreads.
if (n <= MULTI_THREAD_MINIMAL) {
if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL)
nthreads = 1;
}
else
nthreads = num_cpu_avail(1);

if (nthreads == 1) {
#endif



+ 2
- 2
interface/zscal.c View File

@@ -90,10 +90,10 @@ void CNAME(blasint n, FLOAT alpha_r, void *vx, blasint incx){
FUNCTION_PROFILE_START();

#ifdef SMP
nthreads = num_cpu_avail(1);

if ( n <= 1048576 )
nthreads = 1;
else
nthreads = num_cpu_avail(1);

if (nthreads == 1) {
#endif


+ 2
- 2
interface/zswap.c View File

@@ -79,12 +79,12 @@ FLOAT *y = (FLOAT*)vy;
if (incy < 0) y -= (n - 1) * incy * 2;

#ifdef SMP
nthreads = num_cpu_avail(1);

//disable multi-thread when incx==0 or incy==0
//In that case, the threads would be dependent.
if (incx == 0 || incy == 0)
nthreads = 1;
else
nthreads = num_cpu_avail(1);

if (nthreads == 1) {
#endif


+ 3
- 6
kernel/arm64/casum_thunderx2t99.c View File

@@ -233,13 +233,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
FLOAT asum = 0.0;

#if defined(SMP)
nthreads = num_cpu_avail(1);

if (inc_x == 0)
nthreads = 1;

if (n <= 10000)
if (inc_x == 0 || n <= 10000)
nthreads = 1;
else
nthreads = num_cpu_avail(1);

if (nthreads == 1) {
asum = casum_compute(n, x, inc_x);


+ 3
- 6
kernel/arm64/copy_thunderx2t99.c View File

@@ -183,13 +183,10 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
if (n <= 0) return 0;

#if defined(SMP)
nthreads = num_cpu_avail(1);

if (inc_x == 0)
nthreads = 1;

if (n <= 10000)
if (inc_x == 0 || n <= 10000)
nthreads = 1;
else
nthreads = num_cpu_avail(1);

if (nthreads == 1) {
do_copy(n, x, inc_x, y, inc_y);


+ 3
- 6
kernel/arm64/dasum_thunderx2t99.c View File

@@ -228,13 +228,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
FLOAT asum = 0.0;

#if defined(SMP)
nthreads = num_cpu_avail(1);

if (inc_x == 0)
nthreads = 1;

if (n <= 10000)
if (inc_x == 0 || n <= 10000)
nthreads = 1;
else
nthreads = num_cpu_avail(1);

if (nthreads == 1) {
asum = dasum_compute(n, x, inc_x);


+ 4
- 7
kernel/arm64/dot_thunderx2t99.c View File

@@ -199,7 +199,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
" faddp "DOTF", v0.2d \n"
#endif /* !defined(DSDOT) */

#else /* !defined(DOUBLE) */
#else /* !defined(DOUBLE) */
#define KERNEL_F1 \
" ldr "TMPX", ["X"] \n" \
" ldr "TMPY", ["Y"] \n" \
@@ -384,13 +384,10 @@ RETURN_TYPE CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y
RETURN_TYPE dot = 0.0;

#if defined(SMP)
nthreads = num_cpu_avail(1);

if (inc_x == 0 || inc_y == 0)
nthreads = 1;

if (n <= 10000)
if (inc_x == 0 || inc_y == 0 || n <= 10000)
nthreads = 1;
else
nthreads = num_cpu_avail(1);

if (nthreads == 1) {
dot = dot_compute(n, x, inc_x, y, inc_y);


+ 2
- 2
kernel/arm64/dznrm2_thunderx2t99.c View File

@@ -328,10 +328,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
if (n <= 0 || inc_x <= 0) return 0.0;

#if defined(SMP)
nthreads = num_cpu_avail(1);

if (n <= 10000)
nthreads = 1;
else
nthreads = num_cpu_avail(1);

if (nthreads == 1) {
nrm2_compute(n, x, inc_x, &ssq, &scale);


+ 2
- 2
kernel/arm64/dznrm2_thunderx2t99_fast.c View File

@@ -235,10 +235,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
if (n <= 0 || inc_x <= 0) return 0.0;

#if defined(SMP)
nthreads = num_cpu_avail(1);

if (n <= 10000)
nthreads = 1;
else
nthreads = num_cpu_avail(1);

if (nthreads == 1) {
nrm2 = nrm2_compute(n, x, inc_x);


+ 3
- 6
kernel/arm64/iamax_thunderx2t99.c View File

@@ -321,13 +321,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
BLASLONG max_index = 0;

#if defined(SMP)
nthreads = num_cpu_avail(1);

if (inc_x == 0)
nthreads = 1;

if (n <= 10000)
if (inc_x == 0 || n <= 10000)
nthreads = 1;
else
nthreads = num_cpu_avail(1);

if (nthreads == 1) {
max_index = iamax_compute(n, x, inc_x);


+ 3
- 6
kernel/arm64/izamax_thunderx2t99.c View File

@@ -330,13 +330,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
BLASLONG max_index = 0;

#if defined(SMP)
nthreads = num_cpu_avail(1);

if (inc_x == 0)
nthreads = 1;

if (n <= 10000)
if (inc_x == 0 || n <= 10000)
nthreads = 1;
else
nthreads = num_cpu_avail(1);

if (nthreads == 1) {
max_index = izamax_compute(n, x, inc_x);


+ 3
- 6
kernel/arm64/sasum_thunderx2t99.c View File

@@ -230,13 +230,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
FLOAT asum = 0.0;

#if defined(SMP)
nthreads = num_cpu_avail(1);

if (inc_x == 0)
nthreads = 1;

if (n <= 10000)
if (inc_x == 0 || n <= 10000)
nthreads = 1;
else
nthreads = num_cpu_avail(1);

if (nthreads == 1) {
asum = sasum_compute(n, x, inc_x);


+ 2
- 2
kernel/arm64/scnrm2_thunderx2t99.c View File

@@ -318,10 +318,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
if (n <= 0 || inc_x <= 0) return 0.0;

#if defined(SMP)
nthreads = num_cpu_avail(1);

if (n <= 10000)
nthreads = 1;
else
nthreads = num_cpu_avail(1);

if (nthreads == 1) {
nrm2_double = nrm2_compute(n, x, inc_x);


+ 3
- 6
kernel/arm64/zasum_thunderx2t99.c View File

@@ -230,13 +230,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
FLOAT asum = 0.0;

#if defined(SMP)
nthreads = num_cpu_avail(1);

if (inc_x == 0)
nthreads = 1;

if (n <= 10000)
if (inc_x == 0 || n <= 10000)
nthreads = 1;
else
nthreads = num_cpu_avail(1);

if (nthreads == 1) {
asum = zasum_compute(n, x, inc_x);


+ 3
- 6
kernel/arm64/zdot_thunderx2t99.c View File

@@ -317,13 +317,10 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
CIMAG(zdot) = 0.0;

#if defined(SMP)
nthreads = num_cpu_avail(1);

if (inc_x == 0 || inc_y == 0)
nthreads = 1;

if (n <= 10000)
if (inc_x == 0 || inc_y == 0 || n <= 10000)
nthreads = 1;
else
nthreads = num_cpu_avail(1);

if (nthreads == 1) {
zdot_compute(n, x, inc_x, y, inc_y, &zdot);


+ 6
- 9
kernel/x86_64/ddot.c View File

@@ -29,13 +29,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"


#if defined(BULLDOZER)
#if defined(BULLDOZER)
#include "ddot_microk_bulldozer-2.c"
#elif defined(STEAMROLLER) || defined(EXCAVATOR)
#include "ddot_microk_steamroller-2.c"
#elif defined(PILEDRIVER)
#include "ddot_microk_piledriver-2.c"
#elif defined(NEHALEM)
#elif defined(NEHALEM)
#include "ddot_microk_nehalem-2.c"
#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
#include "ddot_microk_haswell-2.c"
@@ -110,7 +110,7 @@ static FLOAT dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLON
FLOAT temp1 = 0.0;
FLOAT temp2 = 0.0;

BLASLONG n1 = n & -4;
BLASLONG n1 = n & -4;

while(i < n1)
{
@@ -169,13 +169,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
FLOAT dot = 0.0;

#if defined(SMP)
nthreads = num_cpu_avail(1);

if (inc_x == 0 || inc_y == 0)
nthreads = 1;

if (n <= 10000)
if (inc_x == 0 || inc_y == 0 || n <= 10000)
nthreads = 1;
else
nthreads = num_cpu_avail(1);

if (nthreads == 1) {
dot = dot_compute(n, x, inc_x, y, inc_y);


Loading…
Cancel
Save