|
|
@@ -525,7 +525,7 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG |
|
|
|
BLASLONG range_M_buffer[MAX_CPU_NUMBER + 2]; |
|
|
|
BLASLONG range_N_buffer[MAX_CPU_NUMBER + 2]; |
|
|
|
BLASLONG *range_M, *range_N; |
|
|
|
BLASLONG num_cpu_m, num_cpu_n; |
|
|
|
BLASLONG num_parts; |
|
|
|
|
|
|
|
BLASLONG nthreads = args -> nthreads; |
|
|
|
|
|
|
@@ -596,16 +596,16 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG |
|
|
|
} |
|
|
|
|
|
|
|
/* Partition m into nthreads_m regions */ |
|
|
|
num_cpu_m = 0; |
|
|
|
num_parts = 0; |
|
|
|
while (m > 0){ |
|
|
|
width = blas_quickdivide(m + nthreads_m - num_cpu_m - 1, nthreads_m - num_cpu_m); |
|
|
|
width = blas_quickdivide(m + nthreads_m - num_parts - 1, nthreads_m - num_parts); |
|
|
|
m -= width; |
|
|
|
if (m < 0) width = width + m; |
|
|
|
range_M[num_cpu_m + 1] = range_M[num_cpu_m] + width; |
|
|
|
num_cpu_m ++; |
|
|
|
range_M[num_parts + 1] = range_M[num_parts] + width; |
|
|
|
num_parts ++; |
|
|
|
} |
|
|
|
for (i = num_cpu_m; i < MAX_CPU_NUMBER; i++) { |
|
|
|
range_M[i + 1] = range_M[num_cpu_m]; |
|
|
|
for (i = num_parts; i < MAX_CPU_NUMBER; i++) { |
|
|
|
range_M[i + 1] = range_M[num_parts]; |
|
|
|
} |
|
|
|
|
|
|
|
/* Initialize parameters for parallel execution */ |
|
|
@@ -637,16 +637,19 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG |
|
|
|
|
|
|
|
/* Partition (a step of) n into nthreads regions */ |
|
|
|
range_N[0] = js; |
|
|
|
num_cpu_n = 0; |
|
|
|
num_parts = 0; |
|
|
|
while (n > 0){ |
|
|
|
width = blas_quickdivide(n + nthreads - num_cpu_n - 1, nthreads - num_cpu_n); |
|
|
|
width = blas_quickdivide(n + nthreads - num_parts - 1, nthreads - num_parts); |
|
|
|
if (width < SWITCH_RATIO) { |
|
|
|
width = SWITCH_RATIO; |
|
|
|
} |
|
|
|
n -= width; |
|
|
|
if (n < 0) width = width + n; |
|
|
|
range_N[num_cpu_n + 1] = range_N[num_cpu_n] + width; |
|
|
|
num_cpu_n ++; |
|
|
|
range_N[num_parts + 1] = range_N[num_parts] + width; |
|
|
|
num_parts ++; |
|
|
|
} |
|
|
|
for (j = num_cpu_n; j < MAX_CPU_NUMBER; j++) { |
|
|
|
range_N[j + 1] = range_N[num_cpu_n]; |
|
|
|
for (j = num_parts; j < MAX_CPU_NUMBER; j++) { |
|
|
|
range_N[j + 1] = range_N[num_parts]; |
|
|
|
} |
|
|
|
|
|
|
|
/* Clear synchronization flags */ |
|
|
@@ -683,7 +686,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO |
|
|
|
n = range_n[1] - range_n[0]; |
|
|
|
} |
|
|
|
|
|
|
|
/* CPU partitions in m should have at least SWITCH_RATIO rows */ |
|
|
|
/* Partitions in m should have at least SWITCH_RATIO rows */ |
|
|
|
if (m < 2 * SWITCH_RATIO) { |
|
|
|
nthreads_m = 1; |
|
|
|
} else { |
|
|
@@ -693,11 +696,11 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
/* At most one CPU partition in n should have less than nthreads_m columns */ |
|
|
|
if (n < nthreads_m) { |
|
|
|
/* Partitions in n should have at most SWITCH_RATIO * nthreads_m columns */ |
|
|
|
if (n < SWITCH_RATIO * nthreads_m) { |
|
|
|
nthreads_n = 1; |
|
|
|
} else { |
|
|
|
nthreads_n = blas_quickdivide(n + nthreads_m - 1, nthreads_m); |
|
|
|
nthreads_n = (n + SWITCH_RATIO * nthreads_m - 1) / (SWITCH_RATIO * nthreads_m); |
|
|
|
if (nthreads_m * nthreads_n > args -> nthreads) { |
|
|
|
nthreads_n = blas_quickdivide(args -> nthreads, nthreads_m); |
|
|
|
} |
|
|
|