Browse Source

Multi-thread GEMM Performance Improvement on NeoverseV1 (DIVIDE_RATE=1)

pull/5407/head
Masato Nakagawa 2 months ago
parent
commit
7e29f11396
3 changed files with 18 additions and 6 deletions
  1. +4
    -0
      driver/level3/gemm.c
  2. +12
    -6
      driver/level3/level3_thread.c
  3. +2
    -0
      param.h

+ 4
- 0
driver/level3/gemm.c View File

@@ -63,6 +63,10 @@
#define DIVIDE_RATE GEMM_DIVIDE_RATE
#endif

/* Optional threshold: when GEMM_DIVIDE_LIMIT is defined, expose it under the
 * local name DIVIDE_LIMIT. If it is not defined, DIVIDE_LIMIT stays undefined
 * here. */
#ifdef GEMM_DIVIDE_LIMIT
#define DIVIDE_LIMIT GEMM_DIVIDE_LIMIT
#endif

#ifdef THREADED_LEVEL3
#include "level3_thread.c"
#else


+ 12
- 6
driver/level3/level3_thread.c View File

@@ -246,6 +246,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,

BLASLONG nthreads_m;
BLASLONG mypos_m, mypos_n;
BLASLONG divide_rate = DIVIDE_RATE;

BLASLONG is, js, ls, bufferside, jjs;
BLASLONG min_i, min_l, div_n, min_jj;
@@ -280,6 +281,11 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
alpha = (FLOAT *)args -> alpha;
beta = (FLOAT *)args -> beta;

/* Disable divide_rate (force it to 1) when N of all threads is less than DIVIDE_LIMIT */
#ifdef DIVIDE_LIMIT
if (N < DIVIDE_LIMIT) divide_rate = 1;
#endif

/* Initialize 2D CPU distribution */
nthreads_m = args -> nthreads;
if (range_m) {
@@ -321,9 +327,9 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
) return 0;

/* Initialize workspace for local region of B */
div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
div_n = (n_to - n_from + divide_rate - 1) / divide_rate;
buffer[0] = sb;
for (i = 1; i < DIVIDE_RATE; i++) {
for (i = 1; i < divide_rate; i++) {
buffer[i] = buffer[i - 1] + GEMM_Q * ((div_n + GEMM_UNROLL_N - 1)/GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE;
}

@@ -365,7 +371,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
STOP_RPCC(copy_A);

/* Copy local region of B into workspace and apply kernel */
div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
div_n = (n_to - n_from + divide_rate - 1) / divide_rate;
for (js = n_from, bufferside = 0; js < n_to; js += div_n, bufferside ++) {

/* Make sure if no one is using workspace */
@@ -434,7 +440,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
if (current >= (mypos_n + 1) * nthreads_m) current = mypos_n * nthreads_m;

/* Split other region of B into parts */
div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
div_n = (range_n[current + 1] - range_n[current] + divide_rate - 1) / divide_rate;
for (js = range_n[current], bufferside = 0; js < range_n[current + 1]; js += div_n, bufferside ++) {
if (current != mypos) {

@@ -485,7 +491,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
do {

/* Split region of B into parts and apply kernel */
div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
div_n = (range_n[current + 1] - range_n[current] + divide_rate - 1) / divide_rate;
for (js = range_n[current], bufferside = 0; js < range_n[current + 1]; js += div_n, bufferside ++) {

/* Apply kernel with local region of A and part of region of B */
@@ -520,7 +526,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
/* Wait until all other threads are done with local region of B */
START_RPCC();
for (i = 0; i < args -> nthreads; i++) {
for (js = 0; js < DIVIDE_RATE; js++) {
for (js = 0; js < divide_rate; js++) {
while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;};
}
}


+ 2
- 0
param.h View File

@@ -3585,6 +3585,8 @@ is a big desktop or server with abundant cache rather than a phone or embedded d

#elif defined(NEOVERSEV1) // 256-bit SVE

#define GEMM_DIVIDE_LIMIT 3

#if defined(XDOUBLE) || defined(DOUBLE)
#define SWITCH_RATIO 8
#define GEMM_PREFERED_SIZE 4


Loading…
Cancel
Save