Browse Source

Multi-thread Performance Improvement of GEMM with DIVIDE_RATE=1 for A64FX.
pull/5353/head
Masato Nakagawa 3 months ago
parent
commit
5253c8f165
2 changed files with 6 additions and 0 deletions
  1. +4
    -0
      driver/level3/gemm.c
  2. +2
    -0
      param.h

+ 4
- 0
driver/level3/gemm.c View File

@@ -59,6 +59,10 @@
#define GEMM_Q 128
#endif

#ifdef GEMM_DIVIDE_RATE
#define DIVIDE_RATE GEMM_DIVIDE_RATE
#endif

#ifdef THREADED_LEVEL3
#include "level3_thread.c"
#else


+ 2
- 0
param.h View File

@@ -3701,6 +3701,8 @@ is a big desktop or server with abundant cache rather than a phone or embedded d

#elif defined(A64FX) // 512-bit SVE

#define GEMM_DIVIDE_RATE 1

#if defined(XDOUBLE) || defined(DOUBLE)
#define GEMM_PREFERED_SIZE 8
#else


Loading…
Cancel
Save