| @@ -187,3 +187,6 @@ In chronological order: | |||||
| * Marius Hillenbrand <https://github.com/mhillenibm> | * Marius Hillenbrand <https://github.com/mhillenibm> | ||||
| * [2020-05-12] Revise dynamic architecture detection for IBM z | * [2020-05-12] Revise dynamic architecture detection for IBM z | ||||
| * [2020-05-12] Add new sgemm and strmm kernel for IBM z14 | * [2020-05-12] Add new sgemm and strmm kernel for IBM z14 | ||||
| * Danfeng Zhang <https://github.com/craft-zhang> | |||||
| * [2020-05-20] Improve performance of SGEMM and STRMM on Arm Cortex-A53 | |||||
| @@ -21,6 +21,8 @@ ifeq ($(ARCH), amd64) | |||||
| override ARCH=x86_64 | override ARCH=x86_64 | ||||
| else ifeq ($(ARCH), powerpc64) | else ifeq ($(ARCH), powerpc64) | ||||
| override ARCH=power | override ARCH=power | ||||
| else ifeq ($(ARCH), powerpc) | |||||
| override ARCH=power | |||||
| else ifeq ($(ARCH), i386) | else ifeq ($(ARCH), i386) | ||||
| override ARCH=x86 | override ARCH=x86 | ||||
| else ifeq ($(ARCH), aarch64) | else ifeq ($(ARCH), aarch64) | ||||
| @@ -277,6 +279,15 @@ NO_LAPACK = 1 | |||||
| override FEXTRALIB = | override FEXTRALIB = | ||||
| endif | endif | ||||
| ifeq ($(C_COMPILER), GCC) | |||||
| GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) | |||||
| GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4) | |||||
| GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5) | |||||
| GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7) | |||||
| GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) | |||||
| GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7) | |||||
| endif | |||||
| # | # | ||||
| # OS dependent settings | # OS dependent settings | ||||
| # | # | ||||
| @@ -323,13 +334,7 @@ ifeq ($(C_COMPILER), CLANG) | |||||
| CCOMMON_OPT += -DMS_ABI | CCOMMON_OPT += -DMS_ABI | ||||
| endif | endif | ||||
| ifeq ($(C_COMPILER), GCC) | |||||
| #Version tests for supporting specific features (MS_ABI, POWER9 intrinsics) | #Version tests for supporting specific features (MS_ABI, POWER9 intrinsics) | ||||
| GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) | |||||
| GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4) | |||||
| GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7) | |||||
| GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) | |||||
| GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7) | |||||
| ifeq ($(GCCVERSIONGT4), 1) | ifeq ($(GCCVERSIONGT4), 1) | ||||
| # GCC Major version > 4 | # GCC Major version > 4 | ||||
| # It is compatible with MSVC ABI. | # It is compatible with MSVC ABI. | ||||
| @@ -343,7 +348,6 @@ ifeq ($(GCCMINORVERSIONGTEQ7), 1) | |||||
| CCOMMON_OPT += -DMS_ABI | CCOMMON_OPT += -DMS_ABI | ||||
| endif | endif | ||||
| endif | endif | ||||
| endif | |||||
| # Ensure the correct stack alignment on Win32 | # Ensure the correct stack alignment on Win32 | ||||
| # http://permalink.gmane.org/gmane.comp.lib.openblas.general/97 | # http://permalink.gmane.org/gmane.comp.lib.openblas.general/97 | ||||
| @@ -310,6 +310,7 @@ $linker_a = ""; | |||||
| && ($flags !~ /advapi32/) | && ($flags !~ /advapi32/) | ||||
| && ($flags !~ /shell32/) | && ($flags !~ /shell32/) | ||||
| && ($flags !~ /omp/) | && ($flags !~ /omp/) | ||||
| && ($flags !~ /[0-9]+/) | |||||
| ) { | ) { | ||||
| $linker_l .= $flags . " " | $linker_l .= $flags . " " | ||||
| } | } | ||||
| @@ -335,6 +335,7 @@ if ($link ne "") { | |||||
| && ($flags !~ /advapi32/) | && ($flags !~ /advapi32/) | ||||
| && ($flags !~ /shell32/) | && ($flags !~ /shell32/) | ||||
| && ($flags !~ /omp/) | && ($flags !~ /omp/) | ||||
| && ($flags !~ /[0-9]+/) | |||||
| && ($flags !~ /^\-l$/) | && ($flags !~ /^\-l$/) | ||||
| ) { | ) { | ||||
| $linker_l .= $flags . " "; | $linker_l .= $flags . " "; | ||||
| @@ -1,3 +1,194 @@ | |||||
| include $(KERNELDIR)/KERNEL.ARMV8 | |||||
| SAMINKERNEL = ../arm/amin.c | |||||
| DAMINKERNEL = ../arm/amin.c | |||||
| CAMINKERNEL = ../arm/zamin.c | |||||
| ZAMINKERNEL = ../arm/zamin.c | |||||
| SMAXKERNEL = ../arm/max.c | |||||
| DMAXKERNEL = ../arm/max.c | |||||
| SMINKERNEL = ../arm/min.c | |||||
| DMINKERNEL = ../arm/min.c | |||||
| ISAMINKERNEL = ../arm/iamin.c | |||||
| IDAMINKERNEL = ../arm/iamin.c | |||||
| ICAMINKERNEL = ../arm/izamin.c | |||||
| IZAMINKERNEL = ../arm/izamin.c | |||||
| ISMAXKERNEL = ../arm/imax.c | |||||
| IDMAXKERNEL = ../arm/imax.c | |||||
| ISMINKERNEL = ../arm/imin.c | |||||
| IDMINKERNEL = ../arm/imin.c | |||||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| SAMAXKERNEL = amax.S | |||||
| DAMAXKERNEL = amax.S | |||||
| CAMAXKERNEL = zamax.S | |||||
| ZAMAXKERNEL = zamax.S | |||||
| SAXPYKERNEL = axpy.S | |||||
| DAXPYKERNEL = axpy.S | |||||
| CAXPYKERNEL = zaxpy.S | |||||
| ZAXPYKERNEL = zaxpy.S | |||||
| SROTKERNEL = rot.S | |||||
| DROTKERNEL = rot.S | |||||
| CROTKERNEL = zrot.S | |||||
| ZROTKERNEL = zrot.S | |||||
| SSCALKERNEL = scal.S | |||||
| DSCALKERNEL = scal.S | |||||
| CSCALKERNEL = zscal.S | |||||
| ZSCALKERNEL = zscal.S | |||||
| SGEMVNKERNEL = gemv_n.S | |||||
| DGEMVNKERNEL = gemv_n.S | |||||
| CGEMVNKERNEL = zgemv_n.S | |||||
| ZGEMVNKERNEL = zgemv_n.S | |||||
| SGEMVTKERNEL = gemv_t.S | |||||
| DGEMVTKERNEL = gemv_t.S | |||||
| CGEMVTKERNEL = zgemv_t.S | |||||
| ZGEMVTKERNEL = zgemv_t.S | |||||
| SASUMKERNEL = asum.S | |||||
| DASUMKERNEL = asum.S | |||||
| CASUMKERNEL = casum.S | |||||
| ZASUMKERNEL = zasum.S | |||||
| SCOPYKERNEL = copy.S | |||||
| DCOPYKERNEL = copy.S | |||||
| CCOPYKERNEL = copy.S | |||||
| ZCOPYKERNEL = copy.S | |||||
| SSWAPKERNEL = swap.S | |||||
| DSWAPKERNEL = swap.S | |||||
| CSWAPKERNEL = swap.S | |||||
| ZSWAPKERNEL = swap.S | |||||
| ISAMAXKERNEL = iamax.S | |||||
| IDAMAXKERNEL = iamax.S | |||||
| ICAMAXKERNEL = izamax.S | |||||
| IZAMAXKERNEL = izamax.S | |||||
| SNRM2KERNEL = nrm2.S | |||||
| DNRM2KERNEL = nrm2.S | |||||
| CNRM2KERNEL = znrm2.S | |||||
| ZNRM2KERNEL = znrm2.S | |||||
| DDOTKERNEL = dot.S | |||||
| SDOTKERNEL = dot.S | |||||
| CDOTKERNEL = zdot.S | |||||
| ZDOTKERNEL = zdot.S | |||||
| DSDOTKERNEL = dot.S | |||||
| DGEMM_BETA = dgemm_beta.S | |||||
| SGEMM_BETA = sgemm_beta.S | |||||
| ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 8x8) | |||||
| SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S | |||||
| STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S | |||||
| else | |||||
| SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S | |||||
| STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S | |||||
| endif | |||||
| ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) | |||||
| ifeq ($(SGEMM_UNROLL_M), 16) | |||||
| SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S | |||||
| else | |||||
| SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c | |||||
| endif | |||||
| ifeq ($(SGEMM_UNROLL_M), 4) | |||||
| SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S | |||||
| else | |||||
| SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c | |||||
| endif | |||||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| endif | |||||
| ifeq ($(SGEMM_UNROLL_N), 16) | |||||
| SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S | |||||
| else | |||||
| SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c | |||||
| endif | |||||
| ifeq ($(SGEMM_UNROLL_N), 4) | |||||
| SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S | |||||
| else | |||||
| SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c | |||||
| endif | |||||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S | |||||
| DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S | |||||
| ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) | |||||
| ifeq ($(DGEMM_UNROLL_M), 8) | |||||
| DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S | |||||
| DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S | |||||
| else | |||||
| DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c | |||||
| DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c | |||||
| endif | |||||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| endif | |||||
| ifeq ($(DGEMM_UNROLL_N), 4) | |||||
| DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S | |||||
| DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S | |||||
| else | |||||
| DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c | |||||
| DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c | |||||
| endif | |||||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | |||||
| CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | |||||
| ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) | |||||
| CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c | |||||
| CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c | |||||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| endif | |||||
| CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c | |||||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c | |||||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | |||||
| ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | |||||
| ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) | |||||
| ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c | |||||
| ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c | |||||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| endif | |||||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c | |||||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c | |||||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| @@ -87,7 +87,7 @@ CGEMVTKERNEL = cgemv_t_4.c | |||||
| ZGEMVTKERNEL = zgemv_t_4.c | ZGEMVTKERNEL = zgemv_t_4.c | ||||
| STRMMKERNEL = gemm_vec.c | STRMMKERNEL = gemm_vec.c | ||||
| DTRMMKERNEL = trmm8x4V.S | |||||
| DTRMMKERNEL = gemm_vec.c | |||||
| CTRMMKERNEL = ctrmm4x4V.S | CTRMMKERNEL = ctrmm4x4V.S | ||||
| ZTRMMKERNEL = ztrmm4x4V.S | ZTRMMKERNEL = ztrmm4x4V.S | ||||
| @@ -103,7 +103,7 @@ SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c | |||||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | ||||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | ||||
| DGEMMKERNEL = gemm8x4V.S | |||||
| DGEMMKERNEL = gemm_vec.c | |||||
| DGEMMINCOPY = ../generic/gemm_ncopy_8.c | DGEMMINCOPY = ../generic/gemm_ncopy_8.c | ||||
| DGEMMITCOPY = ../generic/gemm_tcopy_8.c | DGEMMITCOPY = ../generic/gemm_tcopy_8.c | ||||
| DGEMMONCOPY = ../generic/gemm_ncopy_4.c | DGEMMONCOPY = ../generic/gemm_ncopy_4.c | ||||
| @@ -203,9 +203,12 @@ static inline vector_float vec_load_hinted(FLOAT const *restrict a) { | |||||
| "rows in block must be multiples of vector length"); \ | "rows in block must be multiples of vector length"); \ | ||||
| vector_float Caux[ROWS / VLEN_FLOATS][COLS]; \ | vector_float Caux[ROWS / VLEN_FLOATS][COLS]; \ | ||||
| \ | \ | ||||
| for (BLASLONG i = 0; i < ROWS / VLEN_FLOATS; i++) \ | |||||
| for (BLASLONG i = 0; i < ROWS / VLEN_FLOATS; i++) { \ | |||||
| vector_float A0 = \ | |||||
| vec_load_hinted(A + i * VLEN_FLOATS); \ | |||||
| for (BLASLONG j = 0; j < COLS; j++) \ | for (BLASLONG j = 0; j < COLS; j++) \ | ||||
| Caux[i][j] = vec_splats(ZERO); \ | |||||
| Caux[i][j] = A0 * B[j]; \ | |||||
| } \ | |||||
| \ | \ | ||||
| /* \ | /* \ | ||||
| * Stream over the row-block of A, which is packed \ | * Stream over the row-block of A, which is packed \ | ||||
| @@ -216,7 +219,7 @@ static inline vector_float vec_load_hinted(FLOAT const *restrict a) { | |||||
| * That equates to unrolling the loop over rows (in i) and \ | * That equates to unrolling the loop over rows (in i) and \ | ||||
| * executing each unrolled iteration as a vector element. \ | * executing each unrolled iteration as a vector element. \ | ||||
| */ \ | */ \ | ||||
| for (BLASLONG k = 0; k < bk; k++) { \ | |||||
| for (BLASLONG k = 1; k < bk; k++) { \ | |||||
| for (BLASLONG i = 0; i < ROWS / VLEN_FLOATS; i++) { \ | for (BLASLONG i = 0; i < ROWS / VLEN_FLOATS; i++) { \ | ||||
| vector_float Ak = vec_load_hinted( \ | vector_float Ak = vec_load_hinted( \ | ||||
| A + i * VLEN_FLOATS + k * ROWS); \ | A + i * VLEN_FLOATS + k * ROWS); \ | ||||
| @@ -12575,7 +12575,7 @@ lapack_int LAPACKE_zhetrs_aa_2stage_work( int matrix_layout, char uplo, lapack_i | |||||
| /* APIs for set/get nancheck flags */ | /* APIs for set/get nancheck flags */ | ||||
| void LAPACKE_set_nancheck( int flag ); | void LAPACKE_set_nancheck( int flag ); | ||||
| int LAPACKE_get_nancheck( ); | |||||
| int LAPACKE_get_nancheck( void ); | |||||
| #ifdef __cplusplus | #ifdef __cplusplus | ||||
| } | } | ||||
| @@ -2623,7 +2623,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define SYMV_P 16 | #define SYMV_P 16 | ||||
| #if defined(CORTEXA53) || defined(CORTEXA57) || \ | |||||
| #if defined(CORTEXA57) || \ | |||||
| defined(CORTEXA72) || defined(CORTEXA73) || \ | defined(CORTEXA72) || defined(CORTEXA73) || \ | ||||
| defined(FALKOR) || defined(TSV110) || defined(EMAG8180) | defined(FALKOR) || defined(TSV110) || defined(EMAG8180) | ||||
| @@ -2669,6 +2669,35 @@ is a big desktop or server with abundant cache rather than a phone or embedded d | |||||
| #define CGEMM_DEFAULT_R 4096 | #define CGEMM_DEFAULT_R 4096 | ||||
| #define ZGEMM_DEFAULT_R 2048 | #define ZGEMM_DEFAULT_R 2048 | ||||
| #elif defined(CORTEXA53) | |||||
| #define SGEMM_DEFAULT_UNROLL_M 8 | |||||
| #define SGEMM_DEFAULT_UNROLL_N 8 | |||||
| #define DGEMM_DEFAULT_UNROLL_M 8 | |||||
| #define DGEMM_DEFAULT_UNROLL_N 4 | |||||
| #define CGEMM_DEFAULT_UNROLL_M 8 | |||||
| #define CGEMM_DEFAULT_UNROLL_N 4 | |||||
| #define ZGEMM_DEFAULT_UNROLL_M 4 | |||||
| #define ZGEMM_DEFAULT_UNROLL_N 4 | |||||
| #define SGEMM_DEFAULT_P 256 | |||||
| #define DGEMM_DEFAULT_P 160 | |||||
| #define CGEMM_DEFAULT_P 128 | |||||
| #define ZGEMM_DEFAULT_P 128 | |||||
| #define SGEMM_DEFAULT_Q 256 | |||||
| #define DGEMM_DEFAULT_Q 128 | |||||
| #define CGEMM_DEFAULT_Q 224 | |||||
| #define ZGEMM_DEFAULT_Q 112 | |||||
| #define SGEMM_DEFAULT_R 4096 | |||||
| #define DGEMM_DEFAULT_R 4096 | |||||
| #define CGEMM_DEFAULT_R 4096 | |||||
| #define ZGEMM_DEFAULT_R 2048 | |||||
| #elif defined(THUNDERX) | #elif defined(THUNDERX) | ||||
| #define SGEMM_DEFAULT_UNROLL_M 4 | #define SGEMM_DEFAULT_UNROLL_M 4 | ||||