Make sure that range limit of last thread never exceeds data size (tags/v0.3.0)
| @@ -230,8 +230,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT *alpha, FLOAT | |||||
| #ifndef TRANSA | #ifndef TRANSA | ||||
| range_m[num_cpu] = num_cpu * ((m + 15) & ~15); | range_m[num_cpu] = num_cpu * ((m + 15) & ~15); | ||||
| if (range_m[num_cpu] > m) range_m[num_cpu] = m; | |||||
| #else | #else | ||||
| range_m[num_cpu] = num_cpu * ((n + 15) & ~15); | range_m[num_cpu] = num_cpu * ((n + 15) & ~15); | ||||
| if (range_m[num_cpu] > n) range_m[num_cpu] = n; | |||||
| #endif | #endif | ||||
| queue[num_cpu].mode = mode; | queue[num_cpu].mode = mode; | ||||
| @@ -246,6 +246,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x | |||||
| range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; | range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; | ||||
| range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); | range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); | ||||
| if (range_n[num_cpu] > n) range_n[num_cpu] = n; | |||||
| queue[num_cpu].mode = mode; | queue[num_cpu].mode = mode; | ||||
| queue[num_cpu].routine = sbmv_kernel; | queue[num_cpu].routine = sbmv_kernel; | ||||
| @@ -285,6 +286,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x | |||||
| range_m[num_cpu + 1] = range_m[num_cpu] + width; | range_m[num_cpu + 1] = range_m[num_cpu] + width; | ||||
| range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); | range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); | ||||
| if (range_n[num_cpu] > n) range_n[num_cpu] = n; | |||||
| queue[num_cpu].mode = mode; | queue[num_cpu].mode = mode; | ||||
| queue[num_cpu].routine = sbmv_kernel; | queue[num_cpu].routine = sbmv_kernel; | ||||
| @@ -316,6 +318,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x | |||||
| range_m[num_cpu + 1] = range_m[num_cpu] + width; | range_m[num_cpu + 1] = range_m[num_cpu] + width; | ||||
| range_n[num_cpu] = num_cpu * ((n + 15) & ~15); | range_n[num_cpu] = num_cpu * ((n + 15) & ~15); | ||||
| if (range_n[num_cpu] > n) range_n[num_cpu] = n; | |||||
| queue[num_cpu].mode = mode; | queue[num_cpu].mode = mode; | ||||
| queue[num_cpu].routine = sbmv_kernel; | queue[num_cpu].routine = sbmv_kernel; | ||||
| @@ -246,6 +246,7 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *y, | |||||
| range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; | range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; | ||||
| range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); | range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); | ||||
| if (range_n[num_cpu] > m) range_n[num_cpu] = m; | |||||
| queue[num_cpu].mode = mode; | queue[num_cpu].mode = mode; | ||||
| queue[num_cpu].routine = spmv_kernel; | queue[num_cpu].routine = spmv_kernel; | ||||
| @@ -285,6 +286,7 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *y, | |||||
| range_m[num_cpu + 1] = range_m[num_cpu] + width; | range_m[num_cpu + 1] = range_m[num_cpu] + width; | ||||
| range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); | range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); | ||||
| if (range_n[num_cpu] > m) range_n[num_cpu] = m; | |||||
| queue[num_cpu].mode = mode; | queue[num_cpu].mode = mode; | ||||
| queue[num_cpu].routine = spmv_kernel; | queue[num_cpu].routine = spmv_kernel; | ||||
| @@ -177,7 +177,8 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG i | |||||
| range_m[num_cpu + 1] = range_m[num_cpu] + width; | range_m[num_cpu + 1] = range_m[num_cpu] + width; | ||||
| range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); | range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); | ||||
| if (range_n[num_cpu] > m) range_n[num_cpu] = m; | |||||
| queue[MAX_CPU_NUMBER - num_cpu - 1].mode = mode; | queue[MAX_CPU_NUMBER - num_cpu - 1].mode = mode; | ||||
| queue[MAX_CPU_NUMBER - num_cpu - 1].routine = symv_kernel; | queue[MAX_CPU_NUMBER - num_cpu - 1].routine = symv_kernel; | ||||
| queue[MAX_CPU_NUMBER - num_cpu - 1].args = &args; | queue[MAX_CPU_NUMBER - num_cpu - 1].args = &args; | ||||
| @@ -225,6 +226,7 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG i | |||||
| range_m[num_cpu + 1] = range_m[num_cpu] + width; | range_m[num_cpu + 1] = range_m[num_cpu] + width; | ||||
| range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); | range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); | ||||
| if (range_n[num_cpu] > m) range_n[num_cpu] = m; | |||||
| queue[num_cpu].mode = mode; | queue[num_cpu].mode = mode; | ||||
| queue[num_cpu].routine = symv_kernel; | queue[num_cpu].routine = symv_kernel; | ||||
| @@ -288,6 +288,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc | |||||
| range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; | range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; | ||||
| range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); | range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); | ||||
| if (range_n[num_cpu] > n) range_n[num_cpu] = n; | |||||
| queue[num_cpu].mode = mode; | queue[num_cpu].mode = mode; | ||||
| queue[num_cpu].routine = trmv_kernel; | queue[num_cpu].routine = trmv_kernel; | ||||
| @@ -327,6 +328,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc | |||||
| range_m[num_cpu + 1] = range_m[num_cpu] + width; | range_m[num_cpu + 1] = range_m[num_cpu] + width; | ||||
| range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); | range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); | ||||
| if (range_n[num_cpu] > n) range_n[num_cpu] = n; | |||||
| queue[num_cpu].mode = mode; | queue[num_cpu].mode = mode; | ||||
| queue[num_cpu].routine = trmv_kernel; | queue[num_cpu].routine = trmv_kernel; | ||||
| @@ -356,6 +358,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc | |||||
| range_m[num_cpu + 1] = range_m[num_cpu] + width; | range_m[num_cpu + 1] = range_m[num_cpu] + width; | ||||
| range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); | range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); | ||||
| if (range_n[num_cpu] > n) range_n[num_cpu] = n; | |||||
| queue[num_cpu].mode = mode; | queue[num_cpu].mode = mode; | ||||
| queue[num_cpu].routine = trmv_kernel; | queue[num_cpu].routine = trmv_kernel; | ||||
| @@ -307,7 +307,8 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthr | |||||
| range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; | range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; | ||||
| range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); | range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); | ||||
| if (range_n[num_cpu] > m) range_n[num_cpu] = m; | |||||
| queue[num_cpu].mode = mode; | queue[num_cpu].mode = mode; | ||||
| queue[num_cpu].routine = tpmv_kernel; | queue[num_cpu].routine = tpmv_kernel; | ||||
| queue[num_cpu].args = &args; | queue[num_cpu].args = &args; | ||||
| @@ -346,6 +347,7 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthr | |||||
| range_m[num_cpu + 1] = range_m[num_cpu] + width; | range_m[num_cpu + 1] = range_m[num_cpu] + width; | ||||
| range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); | range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); | ||||
| if (range_n[num_cpu] > m) range_n[num_cpu] = m; | |||||
| queue[num_cpu].mode = mode; | queue[num_cpu].mode = mode; | ||||
| queue[num_cpu].routine = tpmv_kernel; | queue[num_cpu].routine = tpmv_kernel; | ||||
| @@ -346,6 +346,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu | |||||
| range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; | range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; | ||||
| range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); | range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); | ||||
| if (range_n[num_cpu] > m) range_n[num_cpu] = m; | |||||
| queue[num_cpu].mode = mode; | queue[num_cpu].mode = mode; | ||||
| queue[num_cpu].routine = trmv_kernel; | queue[num_cpu].routine = trmv_kernel; | ||||
| @@ -385,6 +386,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu | |||||
| range_m[num_cpu + 1] = range_m[num_cpu] + width; | range_m[num_cpu + 1] = range_m[num_cpu] + width; | ||||
| range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); | range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); | ||||
| if (range_n[num_cpu] > m) range_n[num_cpu] = m; | |||||
| queue[num_cpu].mode = mode; | queue[num_cpu].mode = mode; | ||||
| queue[num_cpu].routine = trmv_kernel; | queue[num_cpu].routine = trmv_kernel; | ||||