Previously, dynamic builds used either the default SWITCH_RATIO or the one from the higher-level architecture; this patch ensures that dynamic builds can use this parameter as well.
tags/v0.3.24
@@ -1,5 +1,6 @@ | |||||
/*********************************************************************/ | /*********************************************************************/ | ||||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | /* Copyright 2009, 2010 The University of Texas at Austin. */ | ||||
/* Copyright 2023 The OpenBLAS Project. */ | |||||
/* All rights reserved. */ | /* All rights reserved. */ | ||||
/* */ | /* */ | ||||
/* Redistribution and use in source and binary forms, with or */ | /* Redistribution and use in source and binary forms, with or */ | ||||
@@ -45,6 +46,7 @@ | |||||
typedef struct { | typedef struct { | ||||
int dtb_entries; | int dtb_entries; | ||||
int switch_ratio; | |||||
int offsetA, offsetB, align; | int offsetA, offsetB, align; | ||||
#ifdef BUILD_BFLOAT16 | #ifdef BUILD_BFLOAT16 | ||||
@@ -1,5 +1,6 @@ | |||||
/*********************************************************************/ | /*********************************************************************/ | ||||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | /* Copyright 2009, 2010 The University of Texas at Austin. */ | ||||
/* Copyright 2023 The OpenBLAS Project. */ | |||||
/* All rights reserved. */ | /* All rights reserved. */ | ||||
/* */ | /* */ | ||||
/* Redistribution and use in source and binary forms, with or */ | /* Redistribution and use in source and binary forms, with or */ | ||||
@@ -44,10 +45,6 @@ | |||||
#define DIVIDE_RATE 2 | #define DIVIDE_RATE 2 | ||||
#endif | #endif | ||||
#ifndef SWITCH_RATIO | |||||
#define SWITCH_RATIO 2 | |||||
#endif | |||||
//The array of job_t may overflow the stack. | //The array of job_t may overflow the stack. | ||||
//Instead, use malloc to alloc job_t. | //Instead, use malloc to alloc job_t. | ||||
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD | #if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD | ||||
@@ -1015,6 +1012,12 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||||
BLASLONG divN, divT; | BLASLONG divN, divT; | ||||
int mode; | int mode; | ||||
#if defined(DYNAMIC_ARCH) | |||||
int switch_ratio = gotoblas->switch_ratio; | |||||
#else | |||||
int switch_ratio = SWITCH_RATIO; | |||||
#endif | |||||
if (range_m) { | if (range_m) { | ||||
BLASLONG m_from = *(((BLASLONG *)range_m) + 0); | BLASLONG m_from = *(((BLASLONG *)range_m) + 0); | ||||
BLASLONG m_to = *(((BLASLONG *)range_m) + 1); | BLASLONG m_to = *(((BLASLONG *)range_m) + 1); | ||||
@@ -1030,7 +1033,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||||
} | } | ||||
*/ | */ | ||||
if ((args -> m < nthreads * SWITCH_RATIO) || (args -> n < nthreads * SWITCH_RATIO)) { | |||||
if ((args -> m < nthreads * switch_ratio) || (args -> n < nthreads * switch_ratio)) { | |||||
GEMM3M_LOCAL(args, range_m, range_n, sa, sb, 0); | GEMM3M_LOCAL(args, range_m, range_n, sa, sb, 0); | ||||
return 0; | return 0; | ||||
} | } | ||||
@@ -1038,7 +1041,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||||
divT = nthreads; | divT = nthreads; | ||||
divN = 1; | divN = 1; | ||||
while ((GEMM3M_P * divT > m * SWITCH_RATIO) && (divT > 1)) { | |||||
while ((GEMM3M_P * divT > m * switch_ratio) && (divT > 1)) { | |||||
do { | do { | ||||
divT --; | divT --; | ||||
divN = 1; | divN = 1; | ||||
@@ -1,5 +1,6 @@ | |||||
/*********************************************************************/ | /*********************************************************************/ | ||||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | /* Copyright 2009, 2010 The University of Texas at Austin. */ | ||||
/* Copyright 2023 The OpenBLAS Project. */ | |||||
/* All rights reserved. */ | /* All rights reserved. */ | ||||
/* */ | /* */ | ||||
/* Redistribution and use in source and binary forms, with or */ | /* Redistribution and use in source and binary forms, with or */ | ||||
@@ -44,10 +45,6 @@ | |||||
#define DIVIDE_RATE 2 | #define DIVIDE_RATE 2 | ||||
#endif | #endif | ||||
#ifndef SWITCH_RATIO | |||||
#define SWITCH_RATIO 2 | |||||
#endif | |||||
//The array of job_t may overflow the stack. | //The array of job_t may overflow the stack. | ||||
//Instead, use malloc to alloc job_t. | //Instead, use malloc to alloc job_t. | ||||
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD | #if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD | ||||
@@ -528,7 +525,13 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||||
int mode, mask; | int mode, mask; | ||||
double dnum, di, dinum; | double dnum, di, dinum; | ||||
if ((nthreads == 1) || (args -> n < nthreads * SWITCH_RATIO)) { | |||||
#if defined(DYNAMIC_ARCH) | |||||
int switch_ratio = gotoblas->switch_ratio; | |||||
#else | |||||
int switch_ratio = SWITCH_RATIO; | |||||
#endif | |||||
if ((nthreads == 1) || (args->n < nthreads * switch_ratio)) { | |||||
SYRK_LOCAL(args, range_m, range_n, sa, sb, 0); | SYRK_LOCAL(args, range_m, range_n, sa, sb, 0); | ||||
return 0; | return 0; | ||||
} | } | ||||
@@ -1,5 +1,6 @@ | |||||
/*********************************************************************/ | /*********************************************************************/ | ||||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | /* Copyright 2009, 2010 The University of Texas at Austin. */ | ||||
/* Copyright 2023 The OpenBLAS Project. */ | |||||
/* All rights reserved. */ | /* All rights reserved. */ | ||||
/* */ | /* */ | ||||
/* Redistribution and use in source and binary forms, with or */ | /* Redistribution and use in source and binary forms, with or */ | ||||
@@ -44,10 +45,6 @@ | |||||
#define DIVIDE_RATE 2 | #define DIVIDE_RATE 2 | ||||
#endif | #endif | ||||
#ifndef SWITCH_RATIO | |||||
#define SWITCH_RATIO 2 | |||||
#endif | |||||
#ifndef GEMM_PREFERED_SIZE | #ifndef GEMM_PREFERED_SIZE | ||||
#define GEMM_PREFERED_SIZE 1 | #define GEMM_PREFERED_SIZE 1 | ||||
#endif | #endif | ||||
@@ -577,6 +574,11 @@ InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock); | |||||
BLASLONG width, i, j, k, js; | BLASLONG width, i, j, k, js; | ||||
BLASLONG m, n, n_from, n_to; | BLASLONG m, n, n_from, n_to; | ||||
int mode; | int mode; | ||||
#if defined(DYNAMIC_ARCH) | |||||
int switch_ratio = gotoblas->switch_ratio; | |||||
#else | |||||
int switch_ratio = SWITCH_RATIO; | |||||
#endif | |||||
/* Get execution mode */ | /* Get execution mode */ | ||||
#ifndef COMPLEX | #ifndef COMPLEX | ||||
@@ -698,8 +700,8 @@ EnterCriticalSection((PCRITICAL_SECTION)&level3_lock); | |||||
num_parts = 0; | num_parts = 0; | ||||
while (n > 0){ | while (n > 0){ | ||||
width = blas_quickdivide(n + nthreads - num_parts - 1, nthreads - num_parts); | width = blas_quickdivide(n + nthreads - num_parts - 1, nthreads - num_parts); | ||||
if (width < SWITCH_RATIO) { | |||||
width = SWITCH_RATIO; | |||||
if (width < switch_ratio) { | |||||
width = switch_ratio; | |||||
} | } | ||||
width = round_up(n, width, GEMM_PREFERED_SIZE); | width = round_up(n, width, GEMM_PREFERED_SIZE); | ||||
@@ -746,6 +748,11 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IF | |||||
BLASLONG m = args -> m; | BLASLONG m = args -> m; | ||||
BLASLONG n = args -> n; | BLASLONG n = args -> n; | ||||
BLASLONG nthreads_m, nthreads_n; | BLASLONG nthreads_m, nthreads_n; | ||||
#if defined(DYNAMIC_ARCH) | |||||
int switch_ratio = gotoblas->switch_ratio; | |||||
#else | |||||
int switch_ratio = SWITCH_RATIO; | |||||
#endif | |||||
/* Get dimensions from index ranges if available */ | /* Get dimensions from index ranges if available */ | ||||
if (range_m) { | if (range_m) { | ||||
@@ -755,21 +762,21 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IF | |||||
n = range_n[1] - range_n[0]; | n = range_n[1] - range_n[0]; | ||||
} | } | ||||
/* Partitions in m should have at least SWITCH_RATIO rows */ | |||||
if (m < 2 * SWITCH_RATIO) { | |||||
/* Partitions in m should have at least switch_ratio rows */ | |||||
if (m < 2 * switch_ratio) { | |||||
nthreads_m = 1; | nthreads_m = 1; | ||||
} else { | } else { | ||||
nthreads_m = args -> nthreads; | nthreads_m = args -> nthreads; | ||||
while (m < nthreads_m * SWITCH_RATIO) { | |||||
while (m < nthreads_m * switch_ratio) { | |||||
nthreads_m = nthreads_m / 2; | nthreads_m = nthreads_m / 2; | ||||
} | } | ||||
} | } | ||||
/* Partitions in n should have at most SWITCH_RATIO * nthreads_m columns */ | |||||
if (n < SWITCH_RATIO * nthreads_m) { | |||||
/* Partitions in n should have at most switch_ratio * nthreads_m columns */ | |||||
if (n < switch_ratio * nthreads_m) { | |||||
nthreads_n = 1; | nthreads_n = 1; | ||||
} else { | } else { | ||||
nthreads_n = (n + SWITCH_RATIO * nthreads_m - 1) / (SWITCH_RATIO * nthreads_m); | |||||
nthreads_n = (n + switch_ratio * nthreads_m - 1) / (switch_ratio * nthreads_m); | |||||
if (nthreads_m * nthreads_n > args -> nthreads) { | if (nthreads_m * nthreads_n > args -> nthreads) { | ||||
nthreads_n = blas_quickdivide(args -> nthreads, nthreads_m); | nthreads_n = blas_quickdivide(args -> nthreads, nthreads_m); | ||||
} | } | ||||
@@ -1,5 +1,6 @@ | |||||
/*********************************************************************/ | /*********************************************************************/ | ||||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | /* Copyright 2009, 2010 The University of Texas at Austin. */ | ||||
/* Copyright 2023 The OpenBLAS Project. */ | |||||
/* All rights reserved. */ | /* All rights reserved. */ | ||||
/* */ | /* */ | ||||
/* Redistribution and use in source and binary forms, with or */ | /* Redistribution and use in source and binary forms, with or */ | ||||
@@ -49,7 +50,9 @@ | |||||
static void init_parameter(void); | static void init_parameter(void); | ||||
gotoblas_t TABLE_NAME = { | gotoblas_t TABLE_NAME = { | ||||
DTB_DEFAULT_ENTRIES , | |||||
DTB_DEFAULT_ENTRIES, | |||||
SWITCH_RATIO, | |||||
GEMM_DEFAULT_OFFSET_A, GEMM_DEFAULT_OFFSET_B, GEMM_DEFAULT_ALIGN, | GEMM_DEFAULT_OFFSET_A, GEMM_DEFAULT_OFFSET_B, GEMM_DEFAULT_ALIGN, | ||||
@@ -80,10 +80,6 @@ static FLOAT dm1 = -1.; | |||||
#define DIVIDE_RATE 2 | #define DIVIDE_RATE 2 | ||||
#endif | #endif | ||||
#ifndef SWITCH_RATIO | |||||
#define SWITCH_RATIO 2 | |||||
#endif | |||||
#ifndef LOWER | #ifndef LOWER | ||||
#define TRANS | #define TRANS | ||||
#endif | #endif | ||||
@@ -3838,6 +3838,10 @@ Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy rout | |||||
#endif | #endif | ||||
#ifndef SWITCH_RATIO | |||||
#define SWITCH_RATIO 2 | |||||
#endif | |||||
#ifndef QGEMM_DEFAULT_UNROLL_M | #ifndef QGEMM_DEFAULT_UNROLL_M | ||||
#define QGEMM_DEFAULT_UNROLL_M 2 | #define QGEMM_DEFAULT_UNROLL_M 2 | ||||
#endif | #endif | ||||