|
|
@@ -246,6 +246,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, |
|
|
|
|
|
|
|
BLASLONG nthreads_m; |
|
|
|
BLASLONG mypos_m, mypos_n; |
|
|
|
BLASLONG divide_rate = DIVIDE_RATE; |
|
|
|
|
|
|
|
BLASLONG is, js, ls, bufferside, jjs; |
|
|
|
BLASLONG min_i, min_l, div_n, min_jj; |
|
|
@@ -280,6 +281,11 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, |
|
|
|
alpha = (FLOAT *)args -> alpha; |
|
|
|
beta = (FLOAT *)args -> beta; |
|
|
|
|
|
|
|
/* Disable divide_rate when N of all threads are less than to DIVIDE_LIMIT */ |
|
|
|
#ifdef DIVIDE_LIMIT |
|
|
|
if (N < DIVIDE_LIMIT) divide_rate = 1; |
|
|
|
#endif |
|
|
|
|
|
|
|
/* Initialize 2D CPU distribution */ |
|
|
|
nthreads_m = args -> nthreads; |
|
|
|
if (range_m) { |
|
|
@@ -321,9 +327,9 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, |
|
|
|
) return 0; |
|
|
|
|
|
|
|
/* Initialize workspace for local region of B */ |
|
|
|
div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE; |
|
|
|
div_n = (n_to - n_from + divide_rate - 1) / divide_rate; |
|
|
|
buffer[0] = sb; |
|
|
|
for (i = 1; i < DIVIDE_RATE; i++) { |
|
|
|
for (i = 1; i < divide_rate; i++) { |
|
|
|
buffer[i] = buffer[i - 1] + GEMM_Q * ((div_n + GEMM_UNROLL_N - 1)/GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE; |
|
|
|
} |
|
|
|
|
|
|
@@ -365,7 +371,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, |
|
|
|
STOP_RPCC(copy_A); |
|
|
|
|
|
|
|
/* Copy local region of B into workspace and apply kernel */ |
|
|
|
div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE; |
|
|
|
div_n = (n_to - n_from + divide_rate - 1) / divide_rate; |
|
|
|
for (js = n_from, bufferside = 0; js < n_to; js += div_n, bufferside ++) { |
|
|
|
|
|
|
|
/* Make sure if no one is using workspace */ |
|
|
@@ -434,7 +440,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, |
|
|
|
if (current >= (mypos_n + 1) * nthreads_m) current = mypos_n * nthreads_m; |
|
|
|
|
|
|
|
/* Split other region of B into parts */ |
|
|
|
div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE; |
|
|
|
div_n = (range_n[current + 1] - range_n[current] + divide_rate - 1) / divide_rate; |
|
|
|
for (js = range_n[current], bufferside = 0; js < range_n[current + 1]; js += div_n, bufferside ++) { |
|
|
|
if (current != mypos) { |
|
|
|
|
|
|
@@ -485,7 +491,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, |
|
|
|
do { |
|
|
|
|
|
|
|
/* Split region of B into parts and apply kernel */ |
|
|
|
div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE; |
|
|
|
div_n = (range_n[current + 1] - range_n[current] + divide_rate - 1) / divide_rate; |
|
|
|
for (js = range_n[current], bufferside = 0; js < range_n[current + 1]; js += div_n, bufferside ++) { |
|
|
|
|
|
|
|
/* Apply kernel with local region of A and part of region of B */ |
|
|
@@ -520,7 +526,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, |
|
|
|
/* Wait until all other threads are done with local region of B */ |
|
|
|
START_RPCC(); |
|
|
|
for (i = 0; i < args -> nthreads; i++) { |
|
|
|
for (js = 0; js < DIVIDE_RATE; js++) { |
|
|
|
for (js = 0; js < divide_rate; js++) { |
|
|
|
while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;}; |
|
|
|
} |
|
|
|
} |
|
|
|