sgemm/dgemm: add a way for an arch kernel to specify preferred sizes

The current gemm threading code can make very unfortunate choices. For example, on my 10-core system a 1024x1024x1024 matrix multiply ends up being chunked into blocks of 102..., which is not a vector-friendly size, and performance ends up horrible. This patch adds a helper define with which an architecture can specify a preference for size multiples. This is different from the existing defines, which are minimum sizes and such. The performance increase with this patch for the 1024x1024x1024 sgemm is 2.3x (!!)
7 years ago · 5b708e5eb1
--- a/driver/level3/level3_thread.c
+++ b/driver/level3/level3_thread.c
@@ -48,6 +48,10 @@
 #define SWITCH_RATIO 2
 #endif

 /*
  * Size multiple that gemm_driver passes to round_up() when splitting m and n
  * into per-thread widths. A definition seen earlier (e.g. from an
  * architecture's parameters) takes precedence; the fallback of 1 leaves the
  * computed widths unchanged.
  */
 #ifndef GEMM_PREFERED_SIZE
 #define GEMM_PREFERED_SIZE 1
 #endif

 //The array of job_t may overflow the stack.
 //Instead, use malloc to alloc job_t.
 #if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
@@ -510,6 +514,16 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
  return 0;
 }

 /*
  * Round a proposed chunk width up to a multiple of `multiple`.
  *
  * remainder: amount of work still to be split (callers pass the remaining
  *            m or n).
  * width:     chunk width proposed by the caller.
  * multiple:  preferred size multiple (callers pass GEMM_PREFERED_SIZE).
  *
  * Returns width unchanged when multiple is not positive, when the remaining
  * work is smaller than one multiple, or when width does not exceed multiple.
  * Otherwise returns the smallest multiple of `multiple` that is >= width.
  * The result may exceed remainder; callers clamp it afterwards.
  */
 static int round_up(int remainder, int width, int multiple)
 {
 	/* A non-positive multiple expresses no usable preference; bailing out
 	 * here also keeps the division below from dividing by zero. */
 	if (multiple <= 0)
 		return width;
 	if (multiple > remainder || width <= multiple)
 		return width;
 	width = (width + multiple - 1) / multiple;
 	width = width * multiple;
 	return width;
 }


 static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
 		       *range_n, FLOAT *sa, FLOAT *sb,
                       BLASLONG nthreads_m, BLASLONG nthreads_n) {
@@ -601,9 +615,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
  num_parts = 0;
  while (m > 0){
    width = blas_quickdivide(m + nthreads_m - num_parts - 1, nthreads_m - num_parts);

    width = round_up(m, width, GEMM_PREFERED_SIZE);

    m -= width;

    if (m < 0) width = width + m;
    range_M[num_parts + 1] = range_M[num_parts] + width;

    num_parts ++;
  }
  for (i = num_parts; i < MAX_CPU_NUMBER; i++) {
@@ -645,9 +664,12 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
      if (width < SWITCH_RATIO) {
        width = SWITCH_RATIO;
      }
      width = round_up(n, width, GEMM_PREFERED_SIZE);

      n -= width;
      if (n < 0) width = width + n;
      range_N[num_parts + 1] = range_N[num_parts] + width;

      num_parts ++;
    }
    for (j = num_parts; j < MAX_CPU_NUMBER; j++) {
--- a/param.h
+++ b/param.h
@@ -1627,6 +1627,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SYMV_P  8

 #define SWITCH_RATIO	32
 /* Preferred size multiple for GEMM per-thread partition widths; when seen
  * before driver/level3/level3_thread.c's #ifndef fallback, it replaces the
  * default of 1 there. */
 #define GEMM_PREFERED_SIZE	32

 #ifdef ARCH_X86