Browse Source

Make sure that range_n (range_m in gbmv) of the last thread never exceeds the actual data size when splitting the workload

tags/v0.3.0
Martin Kroeker GitHub 8 years ago
parent
commit
c4e5ba1bfe
7 changed files with 19 additions and 3 deletions
  1. +2
    -0
      driver/level2/gbmv_thread.c
  2. +3
    -0
      driver/level2/sbmv_thread.c
  3. +2
    -0
      driver/level2/spmv_thread.c
  4. +3
    -1
      driver/level2/symv_thread.c
  5. +3
    -0
      driver/level2/tbmv_thread.c
  6. +3
    -1
      driver/level2/tpmv_thread.c
  7. +3
    -1
      driver/level2/trmv_thread.c

+ 2
- 0
driver/level2/gbmv_thread.c View File

@@ -230,8 +230,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT *alpha, FLOAT


#ifndef TRANSA #ifndef TRANSA
range_m[num_cpu] = num_cpu * ((m + 15) & ~15); range_m[num_cpu] = num_cpu * ((m + 15) & ~15);
if (range_m[num_cpu] > m) range_m[num_cpu] = m;
#else #else
range_m[num_cpu] = num_cpu * ((n + 15) & ~15); range_m[num_cpu] = num_cpu * ((n + 15) & ~15);
if (range_m[num_cpu] > n) range_m[num_cpu] = n;
#endif #endif


queue[num_cpu].mode = mode; queue[num_cpu].mode = mode;


+ 3
- 0
driver/level2/sbmv_thread.c View File

@@ -246,6 +246,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x


range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width;
range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16);
if (range_n[num_cpu] > n) range_n[num_cpu] = n;


queue[num_cpu].mode = mode; queue[num_cpu].mode = mode;
queue[num_cpu].routine = sbmv_kernel; queue[num_cpu].routine = sbmv_kernel;
@@ -285,6 +286,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x


range_m[num_cpu + 1] = range_m[num_cpu] + width; range_m[num_cpu + 1] = range_m[num_cpu] + width;
range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16);
if (range_n[num_cpu] > n) range_n[num_cpu] = n;


queue[num_cpu].mode = mode; queue[num_cpu].mode = mode;
queue[num_cpu].routine = sbmv_kernel; queue[num_cpu].routine = sbmv_kernel;
@@ -316,6 +318,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
range_m[num_cpu + 1] = range_m[num_cpu] + width; range_m[num_cpu + 1] = range_m[num_cpu] + width;


range_n[num_cpu] = num_cpu * ((n + 15) & ~15); range_n[num_cpu] = num_cpu * ((n + 15) & ~15);
if (range_n[num_cpu] > n) range_n[num_cpu] = n;


queue[num_cpu].mode = mode; queue[num_cpu].mode = mode;
queue[num_cpu].routine = sbmv_kernel; queue[num_cpu].routine = sbmv_kernel;


+ 2
- 0
driver/level2/spmv_thread.c View File

@@ -246,6 +246,7 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *y,


range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width;
range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16);
if (range_n[num_cpu] > m) range_n[num_cpu] = m;


queue[num_cpu].mode = mode; queue[num_cpu].mode = mode;
queue[num_cpu].routine = spmv_kernel; queue[num_cpu].routine = spmv_kernel;
@@ -285,6 +286,7 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *y,


range_m[num_cpu + 1] = range_m[num_cpu] + width; range_m[num_cpu + 1] = range_m[num_cpu] + width;
range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16);
if (range_n[num_cpu] > m) range_n[num_cpu] = m;


queue[num_cpu].mode = mode; queue[num_cpu].mode = mode;
queue[num_cpu].routine = spmv_kernel; queue[num_cpu].routine = spmv_kernel;


+ 3
- 1
driver/level2/symv_thread.c View File

@@ -177,7 +177,8 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG i


range_m[num_cpu + 1] = range_m[num_cpu] + width; range_m[num_cpu + 1] = range_m[num_cpu] + width;
range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16);

if (range_n[num_cpu] > m) range_n[num_cpu] = m;
queue[MAX_CPU_NUMBER - num_cpu - 1].mode = mode; queue[MAX_CPU_NUMBER - num_cpu - 1].mode = mode;
queue[MAX_CPU_NUMBER - num_cpu - 1].routine = symv_kernel; queue[MAX_CPU_NUMBER - num_cpu - 1].routine = symv_kernel;
queue[MAX_CPU_NUMBER - num_cpu - 1].args = &args; queue[MAX_CPU_NUMBER - num_cpu - 1].args = &args;
@@ -225,6 +226,7 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG i


range_m[num_cpu + 1] = range_m[num_cpu] + width; range_m[num_cpu + 1] = range_m[num_cpu] + width;
range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16);
if (range_n[num_cpu] > m) range_n[num_cpu] = m;


queue[num_cpu].mode = mode; queue[num_cpu].mode = mode;
queue[num_cpu].routine = symv_kernel; queue[num_cpu].routine = symv_kernel;


+ 3
- 0
driver/level2/tbmv_thread.c View File

@@ -288,6 +288,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc


range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width;
range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16);
if (range_n[num_cpu] > n) range_n[num_cpu] = n;


queue[num_cpu].mode = mode; queue[num_cpu].mode = mode;
queue[num_cpu].routine = trmv_kernel; queue[num_cpu].routine = trmv_kernel;
@@ -327,6 +328,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc


range_m[num_cpu + 1] = range_m[num_cpu] + width; range_m[num_cpu + 1] = range_m[num_cpu] + width;
range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16);
if (range_n[num_cpu] > n) range_n[num_cpu] = n;


queue[num_cpu].mode = mode; queue[num_cpu].mode = mode;
queue[num_cpu].routine = trmv_kernel; queue[num_cpu].routine = trmv_kernel;
@@ -356,6 +358,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc


range_m[num_cpu + 1] = range_m[num_cpu] + width; range_m[num_cpu + 1] = range_m[num_cpu] + width;
range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16);
if (range_n[num_cpu] > n) range_n[num_cpu] = n;


queue[num_cpu].mode = mode; queue[num_cpu].mode = mode;
queue[num_cpu].routine = trmv_kernel; queue[num_cpu].routine = trmv_kernel;


+ 3
- 1
driver/level2/tpmv_thread.c View File

@@ -307,7 +307,8 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthr


range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width;
range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16);

if (range_n[num_cpu] > m) range_n[num_cpu] = m;
queue[num_cpu].mode = mode; queue[num_cpu].mode = mode;
queue[num_cpu].routine = tpmv_kernel; queue[num_cpu].routine = tpmv_kernel;
queue[num_cpu].args = &args; queue[num_cpu].args = &args;
@@ -346,6 +347,7 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthr


range_m[num_cpu + 1] = range_m[num_cpu] + width; range_m[num_cpu + 1] = range_m[num_cpu] + width;
range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16);
if (range_n[num_cpu] > m) range_n[num_cpu] = m;


queue[num_cpu].mode = mode; queue[num_cpu].mode = mode;
queue[num_cpu].routine = tpmv_kernel; queue[num_cpu].routine = tpmv_kernel;


+ 3
- 1
driver/level2/trmv_thread.c View File

@@ -346,7 +346,8 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu


range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width;
range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16);

if (range_n[num_cpu] > m) range_n[num_cpu] = m;
queue[num_cpu].mode = mode; queue[num_cpu].mode = mode;
queue[num_cpu].routine = trmv_kernel; queue[num_cpu].routine = trmv_kernel;
queue[num_cpu].args = &args; queue[num_cpu].args = &args;
@@ -385,6 +386,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu


range_m[num_cpu + 1] = range_m[num_cpu] + width; range_m[num_cpu + 1] = range_m[num_cpu] + width;
range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16);
if (range_n[num_cpu] > m) range_n[num_cpu] = m;


queue[num_cpu].mode = mode; queue[num_cpu].mode = mode;
queue[num_cpu].routine = trmv_kernel; queue[num_cpu].routine = trmv_kernel;


Loading…
Cancel
Save