@@ -50,6 +50,7 @@ typedef struct { | |||||
#ifdef BUILD_BFLOAT16 | #ifdef BUILD_BFLOAT16 | ||||
int sbgemm_p, sbgemm_q, sbgemm_r; | int sbgemm_p, sbgemm_q, sbgemm_r; | ||||
int sbgemm_unroll_m, sbgemm_unroll_n, sbgemm_unroll_mn; | int sbgemm_unroll_m, sbgemm_unroll_n, sbgemm_unroll_mn; | ||||
int sbgemm_align_k; | |||||
void (*sbstobf16_k) (BLASLONG, float *, BLASLONG, bfloat16 *, BLASLONG); | void (*sbstobf16_k) (BLASLONG, float *, BLASLONG, bfloat16 *, BLASLONG); | ||||
void (*sbdtobf16_k) (BLASLONG, double *, BLASLONG, bfloat16 *, BLASLONG); | void (*sbdtobf16_k) (BLASLONG, double *, BLASLONG, bfloat16 *, BLASLONG); | ||||
@@ -1193,7 +1194,6 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); | |||||
#ifdef BUILD_COMPLEX16 | #ifdef BUILD_COMPLEX16 | ||||
int (*zgeadd_k) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double, double, double *, BLASLONG); | int (*zgeadd_k) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double, double, double *, BLASLONG); | ||||
#endif | #endif | ||||
int align_k; // must be 2^n | |||||
} gotoblas_t; | } gotoblas_t; | ||||
extern gotoblas_t *gotoblas; | extern gotoblas_t *gotoblas; | ||||
@@ -305,13 +305,12 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||||
} | } | ||||
BLASLONG pad_min_l = min_l; | BLASLONG pad_min_l = min_l; | ||||
#if defined(HALF) && defined(DYNAMIC_ARCH) | |||||
pad_min_l = (min_l + gotoblas->align_k - 1) & ~(gotoblas->align_k-1); | |||||
#if defined(HALF) | |||||
#if defined(DYNAMIC_ARCH) | |||||
pad_min_l = (min_l + gotoblas->sbgemm_align_k - 1) & ~(gotoblas->sbgemm_align_k-1); | |||||
#else | |||||
pad_min_l = (min_l + SBGEMM_ALIGN_K - 1) & ~(SBGEMM_ALIGN_K - 1);; | |||||
#endif | #endif | ||||
#if defined(HALF) && !defined(DYNAMIC_ARCH) && defined(NEOVERSEN2) | |||||
pad_min_l = (min_l + 3) & ~3; | |||||
#endif | #endif | ||||
/* First, we have to move data A to L2 cache */ | /* First, we have to move data A to L2 cache */ | ||||
@@ -327,12 +327,12 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||||
BLASLONG pad_min_l = min_l; | BLASLONG pad_min_l = min_l; | ||||
#if defined(HALF) && defined(DYNAMIC_ARCH) | |||||
pad_min_l = (min_l + gotoblas->align_k - 1) & ~(gotoblas->align_k-1); | |||||
#if defined(HALF) | |||||
#if defined(DYNAMIC_ARCH) | |||||
pad_min_l = (min_l + gotoblas->sbgemm_align_k - 1) & ~(gotoblas->sbgemm_align_k-1); | |||||
#else | |||||
pad_min_l = (min_l + SBGEMM_ALIGN_K - 1) & ~(SBGEMM_ALIGN_K - 1);; | |||||
#endif | #endif | ||||
#if defined(HALF) && !defined(DYNAMIC_ARCH) && defined(NEOVERSEN2) | |||||
pad_min_l = (min_l + 3) & ~3; | |||||
#endif | #endif | ||||
/* Determine step size in m | /* Determine step size in m | ||||
@@ -62,6 +62,8 @@ gotoblas_t TABLE_NAME = { | |||||
MAX(SBGEMM_DEFAULT_UNROLL_M, SBGEMM_DEFAULT_UNROLL_N), | MAX(SBGEMM_DEFAULT_UNROLL_M, SBGEMM_DEFAULT_UNROLL_N), | ||||
#endif | #endif | ||||
SBGEMM_ALIGN_K, | |||||
sbstobf16_kTS, sbdtobf16_kTS, sbf16tos_kTS, dbf16tod_kTS, | sbstobf16_kTS, sbdtobf16_kTS, sbf16tos_kTS, dbf16tod_kTS, | ||||
samax_kTS, samin_kTS, smax_kTS, smin_kTS, | samax_kTS, samin_kTS, smax_kTS, smin_kTS, | ||||
@@ -973,12 +975,6 @@ static void init_parameter(void) { | |||||
TABLE_NAME.xgemm3m_r = TABLE_NAME.qgemm_r; | TABLE_NAME.xgemm3m_r = TABLE_NAME.qgemm_r; | ||||
#endif | #endif | ||||
#endif | #endif | ||||
#if defined(NEOVERSEN2) && BUILD_BFLOAT16 == 1 | |||||
TABLE_NAME.align_k = 4; | |||||
#else | |||||
TABLE_NAME.align_k = 1; | |||||
#endif | |||||
} | } | ||||
#else // (ARCH_ARM64) | #else // (ARCH_ARM64) | ||||
@@ -79,6 +79,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#define SBGEMM_DEFAULT_P 256 | #define SBGEMM_DEFAULT_P 256 | ||||
#define SBGEMM_DEFAULT_R 256 | #define SBGEMM_DEFAULT_R 256 | ||||
#define SBGEMM_DEFAULT_Q 256 | #define SBGEMM_DEFAULT_Q 256 | ||||
// Default granularity to which the SBGEMM K extent (min_l) is rounded up.
// The rounding is done as (k + SBGEMM_ALIGN_K - 1) & ~(SBGEMM_ALIGN_K - 1),
// so the value must be a power of two; 1 leaves K unpadded.
#define SBGEMM_ALIGN_K 1 // must be 2^x
#ifdef OPTERON | #ifdef OPTERON | ||||
#define SNUMOPT 4 | #define SNUMOPT 4 | ||||
@@ -3394,6 +3396,9 @@ is a big desktop or server with abundant cache rather than a phone or embedded d | |||||
#elif defined(NEOVERSEN2) | #elif defined(NEOVERSEN2) | ||||
/* NEOVERSEN2 override of the default: with this value the SBGEMM K extent is
   rounded up to a multiple of 4. Must stay a power of two, since it is used
   as a bit mask when rounding. */
#undef SBGEMM_ALIGN_K | |||||
#define SBGEMM_ALIGN_K 4 | |||||
#undef SBGEMM_DEFAULT_UNROLL_M | #undef SBGEMM_DEFAULT_UNROLL_M | ||||
#undef SBGEMM_DEFAULT_UNROLL_N | #undef SBGEMM_DEFAULT_UNROLL_N | ||||
#define SBGEMM_DEFAULT_UNROLL_M 8 | #define SBGEMM_DEFAULT_UNROLL_M 8 | ||||