Multi-thread GEMM Performance Improvement on NeoverseV1 (DIVIDE_RATE=1)

3 months ago · 7e29f11396
--- a/driver/level3/gemm.c
+++ b/driver/level3/gemm.c
@@ -63,6 +63,10 @@
 #define DIVIDE_RATE GEMM_DIVIDE_RATE
 #endif

 #ifdef GEMM_DIVIDE_LIMIT
 #define DIVIDE_LIMIT GEMM_DIVIDE_LIMIT
 #endif

 #ifdef THREADED_LEVEL3
 #include "level3_thread.c"
 #else
--- a/driver/level3/level3_thread.c
+++ b/driver/level3/level3_thread.c
@@ -246,6 +246,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,

  BLASLONG nthreads_m;
  BLASLONG mypos_m, mypos_n;
  BLASLONG divide_rate = DIVIDE_RATE;

  BLASLONG is, js, ls, bufferside, jjs;
  BLASLONG min_i, min_l, div_n, min_jj;
@@ -280,6 +281,11 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
  alpha = (FLOAT *)args -> alpha;
  beta  = (FLOAT *)args -> beta;

  /* Disable divide_rate when N of all threads are less than to DIVIDE_LIMIT */
 #ifdef DIVIDE_LIMIT
  if (N < DIVIDE_LIMIT) divide_rate = 1;
 #endif

  /* Initialize 2D CPU distribution */
  nthreads_m = args -> nthreads;
  if (range_m) {
@@ -321,9 +327,9 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
      ) return 0;

  /* Initialize workspace for local region of B */
  div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
  div_n = (n_to - n_from + divide_rate - 1) / divide_rate;
  buffer[0] = sb;
  for (i = 1; i < DIVIDE_RATE; i++) {
  for (i = 1; i < divide_rate; i++) {
    buffer[i] = buffer[i - 1] + GEMM_Q * ((div_n + GEMM_UNROLL_N - 1)/GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE;
  }

@@ -365,7 +371,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
    STOP_RPCC(copy_A);

    /* Copy local region of B into workspace and apply kernel */
    div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
    div_n = (n_to - n_from + divide_rate - 1) / divide_rate;
    for (js = n_from, bufferside = 0; js < n_to; js += div_n, bufferside ++) {

      /* Make sure if no one is using workspace */
@@ -434,7 +440,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
      if (current >= (mypos_n + 1) * nthreads_m) current = mypos_n * nthreads_m;

      /* Split other region of B into parts */
      div_n = (range_n[current + 1]  - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
      div_n = (range_n[current + 1]  - range_n[current] + divide_rate - 1) / divide_rate;
      for (js = range_n[current], bufferside = 0; js < range_n[current + 1]; js += div_n, bufferside ++) {
        if (current != mypos) {

@@ -485,7 +491,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
      do {

        /* Split region of B into parts and apply kernel */
 	div_n = (range_n[current + 1]  - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
 	div_n = (range_n[current + 1]  - range_n[current] + divide_rate - 1) / divide_rate;
 	for (js = range_n[current], bufferside = 0; js < range_n[current + 1]; js += div_n, bufferside ++) {

          /* Apply kernel with local region of A and part of region of B */
@@ -520,7 +526,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
  /* Wait until all other threads are done with local region of B */
  START_RPCC();
  for (i = 0; i < args -> nthreads; i++) {
    for (js = 0; js < DIVIDE_RATE; js++) {
    for (js = 0; js < divide_rate; js++) {
      while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;};
    }
  }
--- a/param.h
+++ b/param.h
@@ -3585,6 +3585,8 @@ is a big desktop or server with abundant cache rather than a phone or embedded d

 #elif defined(NEOVERSEV1) // 256-bit SVE

 #define GEMM_DIVIDE_LIMIT       3

 #if defined(XDOUBLE) || defined(DOUBLE)
 #define SWITCH_RATIO            8
 #define GEMM_PREFERED_SIZE      4