Move "direct SGEMM" functionality out of the SkylakeX SGEMM kernel and make it available (on x86_64 targets only for now) in DYNAMIC_ARCH builds * Add sgemm_direct targets in the kernel Makefile.L3 and CMakeLists.txt * Add direct_sgemm functions to the gotoblas struct in common_param.h * Move sgemm_direct_performant helper to separate file * Update gemm.c to macros for sgemm_direct to support dynamic_arch naming via common_s,h * (Conditionally) add sgemm_direct functions in setparam-ref.ctags/v0.3.11^2
@@ -47,12 +47,12 @@ __global__ void cuda_dgemm_kernel(int, int, int, double *, double *, double *); | |||||
extern "C" { | extern "C" { | ||||
#endif | #endif | ||||
extern void sgemm_kernel_direct(BLASLONG M, BLASLONG N, BLASLONG K, | |||||
void sgemm_direct(BLASLONG M, BLASLONG N, BLASLONG K, | |||||
float * A, BLASLONG strideA, | float * A, BLASLONG strideA, | ||||
float * B, BLASLONG strideB, | float * B, BLASLONG strideB, | ||||
float * R, BLASLONG strideR); | float * R, BLASLONG strideR); | ||||
extern int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K); | |||||
int sgemm_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K); | |||||
int shgemm_beta(BLASLONG, BLASLONG, BLASLONG, float, | int shgemm_beta(BLASLONG, BLASLONG, BLASLONG, float, | ||||
@@ -175,6 +175,11 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); | |||||
int (*ssymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); | int (*ssymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); | ||||
int (*ssymv_U) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); | int (*ssymv_U) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); | ||||
#ifdef ARCH_X86_64 | |||||
void (*sgemm_direct) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG , float *, BLASLONG , float * , BLASLONG); | |||||
int (*sgemm_direct_performant) (BLASLONG M, BLASLONG N, BLASLONG K); | |||||
#endif | |||||
int (*sgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG); | int (*sgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG); | ||||
int (*sgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); | int (*sgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); | ||||
@@ -45,6 +45,10 @@ | |||||
#define SSYMV_THREAD_U ssymv_thread_U | #define SSYMV_THREAD_U ssymv_thread_U | ||||
#define SSYMV_THREAD_L ssymv_thread_L | #define SSYMV_THREAD_L ssymv_thread_L | ||||
#define SGEMM_DIRECT_PERFORMANT sgemm_direct_performant | |||||
#define SGEMM_DIRECT sgemm_direct | |||||
#define SGEMM_ONCOPY sgemm_oncopy | #define SGEMM_ONCOPY sgemm_oncopy | ||||
#define SGEMM_OTCOPY sgemm_otcopy | #define SGEMM_OTCOPY sgemm_otcopy | ||||
@@ -204,6 +208,14 @@ | |||||
#define SSYMV_THREAD_U ssymv_thread_U | #define SSYMV_THREAD_U ssymv_thread_U | ||||
#define SSYMV_THREAD_L ssymv_thread_L | #define SSYMV_THREAD_L ssymv_thread_L | ||||
#ifdef ARCH_X86_64 | |||||
#define SGEMM_DIRECT_PERFORMANT gotoblas -> sgemm_direct_performant | |||||
#define SGEMM_DIRECT gotoblas -> sgemm_direct | |||||
#else | |||||
#define SGEMM_DIRECT_PERFORMANT sgemm_direct_performant | |||||
#define SGEMM_DIRECT sgemm_direct | |||||
#endif | |||||
#define SGEMM_ONCOPY gotoblas -> sgemm_oncopy | #define SGEMM_ONCOPY gotoblas -> sgemm_oncopy | ||||
#define SGEMM_OTCOPY gotoblas -> sgemm_otcopy | #define SGEMM_OTCOPY gotoblas -> sgemm_otcopy | ||||
#define SGEMM_INCOPY gotoblas -> sgemm_incopy | #define SGEMM_INCOPY gotoblas -> sgemm_incopy | ||||
@@ -275,8 +275,8 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS | |||||
#ifdef DYNAMIC_ARCH | #ifdef DYNAMIC_ARCH | ||||
if (support_avx512() ) | if (support_avx512() ) | ||||
#endif | #endif | ||||
if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans && sgemm_kernel_direct_performant(m,n,k)) { | |||||
sgemm_kernel_direct(m, n, k, a, lda, b, ldb, c, ldc); | |||||
if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans && SGEMM_DIRECT_PERFORMANT(m,n,k)) { | |||||
SGEMM_DIRECT(m, n, k, a, lda, b, ldb, c, ldc); | |||||
return; | return; | ||||
} | } | ||||
@@ -134,6 +134,20 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||||
set(USE_TRMM true) | set(USE_TRMM true) | ||||
endif () | endif () | ||||
set(USE_DIRECT_SGEMM false) | |||||
if (X86_64) | |||||
set(USE_DIRECT_SGEMM true) | |||||
endif() | |||||
if (USE_DIRECT_SGEMM) | |||||
# if (NOT DEFINED SGEMMDIRECTKERNEL) | |||||
set (SGEMMDIRECTKERNEL sgemm_direct_skylakex.c) | |||||
set (SGEMMDIRECTPERFORMANT sgemm_direct_performant.c) | |||||
# endif() | |||||
GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTKERNEL}" "" "gemm_direct" false "" "" false SINGLE) | |||||
GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTPERFORMANT}" "" "gemm_direct_performant" false "" "" false SINGLE) | |||||
endif() | |||||
foreach (float_type SINGLE DOUBLE HALF) | foreach (float_type SINGLE DOUBLE HALF) | ||||
string(SUBSTRING ${float_type} 0 1 float_char) | string(SUBSTRING ${float_type} 0 1 float_char) | ||||
if (${float_type} STREQUAL "HALF") | if (${float_type} STREQUAL "HALF") | ||||
@@ -9,6 +9,10 @@ ifeq ($(ARCH), x86_64) | |||||
USE_GEMM3M = 1 | USE_GEMM3M = 1 | ||||
endif | endif | ||||
ifeq ($(ARCH), x86_64) | |||||
USE_DIRECT_SGEMM = 1 | |||||
endif | |||||
ifeq ($(ARCH), ia64) | ifeq ($(ARCH), ia64) | ||||
USE_GEMM3M = 1 | USE_GEMM3M = 1 | ||||
endif | endif | ||||
@@ -65,6 +69,13 @@ ifeq ($(CORE), Z14) | |||||
USE_TRMM = 1 | USE_TRMM = 1 | ||||
endif | endif | ||||
ifdef USE_DIRECT_SGEMM | |||||
ifndef SGEMMDIRECTKERNEL | |||||
SGEMMDIRECTKERNEL = sgemm_direct_skylakex.c | |||||
SGEMMDIRECTPERFORMANT = sgemm_direct_performant.c | |||||
endif | |||||
endif | |||||
ifeq ($(BUILD_HALF), 1) | ifeq ($(BUILD_HALF), 1) | ||||
ifndef SHGEMMKERNEL | ifndef SHGEMMKERNEL | ||||
SHGEMM_BETA = ../generic/gemm_beta.c | SHGEMM_BETA = ../generic/gemm_beta.c | ||||
@@ -90,6 +101,12 @@ SKERNELOBJS += \ | |||||
$(SGEMMINCOPYOBJ) $(SGEMMITCOPYOBJ) \ | $(SGEMMINCOPYOBJ) $(SGEMMITCOPYOBJ) \ | ||||
$(SGEMMONCOPYOBJ) $(SGEMMOTCOPYOBJ) | $(SGEMMONCOPYOBJ) $(SGEMMOTCOPYOBJ) | ||||
ifdef USE_DIRECT_SGEMM | |||||
SKERNELOBJS += \ | |||||
sgemm_direct$(TSUFFIX).$(SUFFIX) \ | |||||
sgemm_direct_performant$(TSUFFIX).$(SUFFIX) | |||||
endif | |||||
DKERNELOBJS += \ | DKERNELOBJS += \ | ||||
dgemm_kernel$(TSUFFIX).$(SUFFIX) \ | dgemm_kernel$(TSUFFIX).$(SUFFIX) \ | ||||
$(DGEMMINCOPYOBJ) $(DGEMMITCOPYOBJ) \ | $(DGEMMINCOPYOBJ) $(DGEMMITCOPYOBJ) \ | ||||
@@ -668,6 +685,13 @@ else | |||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ | $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ | ||||
endif | endif | ||||
ifdef USE_DIRECT_SGEMM | |||||
$(KDIR)sgemm_direct_performant$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTPERFORMANT) | |||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ | |||||
$(KDIR)sgemm_direct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTKERNEL) | |||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ | |||||
endif | |||||
ifeq ($(BUILD_HALF), 1) | ifeq ($(BUILD_HALF), 1) | ||||
$(KDIR)shgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMMKERNEL) $(SHGEMMDEPEND) | $(KDIR)shgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMMKERNEL) $(SHGEMMDEPEND) | ||||
@@ -135,6 +135,11 @@ gotoblas_t TABLE_NAME = { | |||||
sgemv_nTS, sgemv_tTS, sger_kTS, | sgemv_nTS, sgemv_tTS, sger_kTS, | ||||
ssymv_LTS, ssymv_UTS, | ssymv_LTS, ssymv_UTS, | ||||
#ifdef ARCH_X86_64 | |||||
sgemm_directTS, | |||||
sgemm_direct_performantTS, | |||||
#endif | |||||
sgemm_kernelTS, sgemm_betaTS, | sgemm_kernelTS, sgemm_betaTS, | ||||
#if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N | #if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N | ||||
sgemm_incopyTS, sgemm_itcopyTS, | sgemm_incopyTS, sgemm_itcopyTS, | ||||
@@ -0,0 +1,30 @@ | |||||
#include "common.h" | |||||
/* helper for the direct sgemm code written by Arjan van der Ven */ | |||||
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K) | |||||
{ | |||||
unsigned long long mnk = M * N * K; | |||||
/* large matrixes -> not performant */ | |||||
if (mnk >= 28 * 512 * 512) | |||||
return 0; | |||||
/* | |||||
* if the B matrix is not a nice multiple if 4 we get many unaligned accesses, | |||||
* and the regular sgemm copy/realignment of data pays off much quicker | |||||
*/ | |||||
if ((N & 3) != 0 && (mnk >= 8 * 512 * 512)) | |||||
return 0; | |||||
#ifdef SMP | |||||
/* if we can run multithreaded, the threading changes the based threshold */ | |||||
if (mnk > 2 * 350 * 512 && num_cpu_avail(3)> 1) | |||||
return 0; | |||||
#endif | |||||
return 1; | |||||
} | |||||
@@ -1,7 +1,7 @@ | |||||
#if defined(SKYLAKEX) || defined (COOPERLAKE) | |||||
/* the direct sgemm code written by Arjan van der Ven */ | /* the direct sgemm code written by Arjan van der Ven */ | ||||
//#include <immintrin.h> | |||||
#include <immintrin.h> | |||||
#include "common.h" | |||||
/* | /* | ||||
* "Direct sgemm" code. This code operates directly on the inputs and outputs | * "Direct sgemm" code. This code operates directly on the inputs and outputs | ||||
* of the sgemm call, avoiding the copies, memory realignments and threading, | * of the sgemm call, avoiding the copies, memory realignments and threading, | ||||
@@ -38,6 +38,7 @@ | |||||
#define MATMUL_SCALAR(N,M) result##N##M += Aval##M * Bval##N; | #define MATMUL_SCALAR(N,M) result##N##M += Aval##M * Bval##N; | ||||
#define STORE_SCALAR(N,M) R[(i+M) * strideR + j + N] = result##N##M; | #define STORE_SCALAR(N,M) R[(i+M) * strideR + j + N] = result##N##M; | ||||
#if 0 | |||||
int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K) | int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K) | ||||
{ | { | ||||
unsigned long long mnk = M * N * K; | unsigned long long mnk = M * N * K; | ||||
@@ -61,9 +62,10 @@ int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K) | |||||
return 1; | return 1; | ||||
} | } | ||||
#endif | |||||
void sgemm_kernel_direct (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A, BLASLONG strideA, float * __restrict B, BLASLONG strideB , float * __restrict R, BLASLONG strideR) | |||||
//void sgemm_kernel_direct (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A, BLASLONG strideA, float * __restrict B, BLASLONG strideB , float * __restrict R, BLASLONG strideR) | |||||
void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A, BLASLONG strideA, float * __restrict B, BLASLONG strideB , float * __restrict R, BLASLONG strideR) | |||||
{ | { | ||||
int i, j, k; | int i, j, k; | ||||
@@ -465,3 +467,8 @@ void sgemm_kernel_direct (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict | |||||
} | } | ||||
} | } | ||||
} | } | ||||
#else | |||||
#include "common.h" | |||||
void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A, BLASLONG strideA, float * __restrict B, BLASLONG strideB , float * __restrict R, BLASLONG strideR) | |||||
{} | |||||
#endif |
@@ -512,4 +512,4 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, f | |||||
return 0; | return 0; | ||||
} | } | ||||
#include <immintrin.h> | #include <immintrin.h> | ||||
#include "sgemm_direct_skylakex.c" | |||||
//#include "sgemm_direct_skylakex.c" |