Browse Source

Merge pull request #59 from xianyi/develop

rebase
tags/v0.3.10^2
Martin Kroeker GitHub 5 years ago
parent
commit
dd7a650792
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 5403 additions and 15 deletions
  1. +3
    -0
      CONTRIBUTORS.md
  2. +11
    -7
      Makefile.system
  3. +1
    -0
      c_check
  4. +1
    -0
      f_check
  5. +192
    -1
      kernel/arm64/KERNEL.CORTEXA53
  6. +2333
    -0
      kernel/arm64/sgemm_kernel_8x8_cortexa53.S
  7. +2823
    -0
      kernel/arm64/strmm_kernel_8x8_cortexa53.S
  8. +2
    -2
      kernel/zarch/KERNEL.Z14
  9. +6
    -3
      kernel/zarch/gemm_vec.c
  10. +1
    -1
      lapack-netlib/LAPACKE/include/lapacke.h
  11. +30
    -1
      param.h

+ 3
- 0
CONTRIBUTORS.md View File

@@ -187,3 +187,6 @@ In chronological order:
* Marius Hillenbrand <https://github.com/mhillenibm>
* [2020-05-12] Revise dynamic architecture detection for IBM z
* [2020-05-12] Add new sgemm and strmm kernel for IBM z14

* Danfeng Zhang <https://github.com/craft-zhang>
* [2020-05-20] Improve performance of SGEMM and STRMM on Arm Cortex-A53

+ 11
- 7
Makefile.system View File

@@ -21,6 +21,8 @@ ifeq ($(ARCH), amd64)
override ARCH=x86_64
else ifeq ($(ARCH), powerpc64)
override ARCH=power
else ifeq ($(ARCH), powerpc)
override ARCH=power
else ifeq ($(ARCH), i386)
override ARCH=x86
else ifeq ($(ARCH), aarch64)
@@ -277,6 +279,15 @@ NO_LAPACK = 1
override FEXTRALIB =
endif

ifeq ($(C_COMPILER), GCC)
GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4)
GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5)
GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7)
GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9)
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
endif

#
# OS dependent settings
#
@@ -323,13 +334,7 @@ ifeq ($(C_COMPILER), CLANG)
CCOMMON_OPT += -DMS_ABI
endif

ifeq ($(C_COMPILER), GCC)
#Version tests for supporting specific features (MS_ABI, POWER9 intrinsics)
GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4)
GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7)
GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9)
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
ifeq ($(GCCVERSIONGT4), 1)
# GCC Major version > 4
# It is compatible with MSVC ABI.
@@ -343,7 +348,6 @@ ifeq ($(GCCMINORVERSIONGTEQ7), 1)
CCOMMON_OPT += -DMS_ABI
endif
endif
endif

# Ensure the correct stack alignment on Win32
# http://permalink.gmane.org/gmane.comp.lib.openblas.general/97


+ 1
- 0
c_check View File

@@ -310,6 +310,7 @@ $linker_a = "";
&& ($flags !~ /advapi32/)
&& ($flags !~ /shell32/)
&& ($flags !~ /omp/)
&& ($flags !~ /[0-9]+/)
) {
$linker_l .= $flags . " "
}


+ 1
- 0
f_check View File

@@ -335,6 +335,7 @@ if ($link ne "") {
&& ($flags !~ /advapi32/)
&& ($flags !~ /shell32/)
&& ($flags !~ /omp/)
&& ($flags !~ /[0-9]+/)
&& ($flags !~ /^\-l$/)
) {
$linker_l .= $flags . " ";


+ 192
- 1
kernel/arm64/KERNEL.CORTEXA53 View File

@@ -1,3 +1,194 @@
include $(KERNELDIR)/KERNEL.ARMV8
SAMINKERNEL = ../arm/amin.c
DAMINKERNEL = ../arm/amin.c
CAMINKERNEL = ../arm/zamin.c
ZAMINKERNEL = ../arm/zamin.c

SMAXKERNEL = ../arm/max.c
DMAXKERNEL = ../arm/max.c

SMINKERNEL = ../arm/min.c
DMINKERNEL = ../arm/min.c

ISAMINKERNEL = ../arm/iamin.c
IDAMINKERNEL = ../arm/iamin.c
ICAMINKERNEL = ../arm/izamin.c
IZAMINKERNEL = ../arm/izamin.c

ISMAXKERNEL = ../arm/imax.c
IDMAXKERNEL = ../arm/imax.c

ISMINKERNEL = ../arm/imin.c
IDMINKERNEL = ../arm/imin.c

STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

SAMAXKERNEL = amax.S
DAMAXKERNEL = amax.S
CAMAXKERNEL = zamax.S
ZAMAXKERNEL = zamax.S

SAXPYKERNEL = axpy.S
DAXPYKERNEL = axpy.S
CAXPYKERNEL = zaxpy.S
ZAXPYKERNEL = zaxpy.S

SROTKERNEL = rot.S
DROTKERNEL = rot.S
CROTKERNEL = zrot.S
ZROTKERNEL = zrot.S

SSCALKERNEL = scal.S
DSCALKERNEL = scal.S
CSCALKERNEL = zscal.S
ZSCALKERNEL = zscal.S

SGEMVNKERNEL = gemv_n.S
DGEMVNKERNEL = gemv_n.S
CGEMVNKERNEL = zgemv_n.S
ZGEMVNKERNEL = zgemv_n.S

SGEMVTKERNEL = gemv_t.S
DGEMVTKERNEL = gemv_t.S
CGEMVTKERNEL = zgemv_t.S
ZGEMVTKERNEL = zgemv_t.S


SASUMKERNEL = asum.S
DASUMKERNEL = asum.S
CASUMKERNEL = casum.S
ZASUMKERNEL = zasum.S

SCOPYKERNEL = copy.S
DCOPYKERNEL = copy.S
CCOPYKERNEL = copy.S
ZCOPYKERNEL = copy.S

SSWAPKERNEL = swap.S
DSWAPKERNEL = swap.S
CSWAPKERNEL = swap.S
ZSWAPKERNEL = swap.S

ISAMAXKERNEL = iamax.S
IDAMAXKERNEL = iamax.S
ICAMAXKERNEL = izamax.S
IZAMAXKERNEL = izamax.S

SNRM2KERNEL = nrm2.S
DNRM2KERNEL = nrm2.S
CNRM2KERNEL = znrm2.S
ZNRM2KERNEL = znrm2.S

DDOTKERNEL = dot.S
SDOTKERNEL = dot.S
CDOTKERNEL = zdot.S
ZDOTKERNEL = zdot.S
DSDOTKERNEL = dot.S

DGEMM_BETA = dgemm_beta.S
SGEMM_BETA = sgemm_beta.S

ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 8x8)
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S
else
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
endif
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
ifeq ($(SGEMM_UNROLL_M), 16)
SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S
else
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
endif
ifeq ($(SGEMM_UNROLL_M), 4)
SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S
else
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
endif
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
ifeq ($(SGEMM_UNROLL_N), 16)
SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S
else
SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
endif
ifeq ($(SGEMM_UNROLL_N), 4)
SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S
else
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
endif
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)

DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S

ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))

ifeq ($(DGEMM_UNROLL_M), 8)
DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S
DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S
else
DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
endif

DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif

ifeq ($(DGEMM_UNROLL_N), 4)
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
else
DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
endif

DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)

CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)

ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)

+ 2333
- 0
kernel/arm64/sgemm_kernel_8x8_cortexa53.S
File diff suppressed because it is too large
View File


+ 2823
- 0
kernel/arm64/strmm_kernel_8x8_cortexa53.S
File diff suppressed because it is too large
View File


+ 2
- 2
kernel/zarch/KERNEL.Z14 View File

@@ -87,7 +87,7 @@ CGEMVTKERNEL = cgemv_t_4.c
ZGEMVTKERNEL = zgemv_t_4.c

STRMMKERNEL = gemm_vec.c
DTRMMKERNEL = trmm8x4V.S
DTRMMKERNEL = gemm_vec.c
CTRMMKERNEL = ctrmm4x4V.S
ZTRMMKERNEL = ztrmm4x4V.S

@@ -103,7 +103,7 @@ SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)

DGEMMKERNEL = gemm8x4V.S
DGEMMKERNEL = gemm_vec.c
DGEMMINCOPY = ../generic/gemm_ncopy_8.c
DGEMMITCOPY = ../generic/gemm_tcopy_8.c
DGEMMONCOPY = ../generic/gemm_ncopy_4.c


+ 6
- 3
kernel/zarch/gemm_vec.c View File

@@ -203,9 +203,12 @@ static inline vector_float vec_load_hinted(FLOAT const *restrict a) {
"rows in block must be multiples of vector length"); \
vector_float Caux[ROWS / VLEN_FLOATS][COLS]; \
\
for (BLASLONG i = 0; i < ROWS / VLEN_FLOATS; i++) \
for (BLASLONG i = 0; i < ROWS / VLEN_FLOATS; i++) { \
vector_float A0 = \
vec_load_hinted(A + i * VLEN_FLOATS); \
for (BLASLONG j = 0; j < COLS; j++) \
Caux[i][j] = vec_splats(ZERO); \
Caux[i][j] = A0 * B[j]; \
} \
\
/* \
* Stream over the row-block of A, which is packed \
@@ -216,7 +219,7 @@ static inline vector_float vec_load_hinted(FLOAT const *restrict a) {
* That equates to unrolling the loop over rows (in i) and \
* executing each unrolled iteration as a vector element. \
*/ \
for (BLASLONG k = 0; k < bk; k++) { \
for (BLASLONG k = 1; k < bk; k++) { \
for (BLASLONG i = 0; i < ROWS / VLEN_FLOATS; i++) { \
vector_float Ak = vec_load_hinted( \
A + i * VLEN_FLOATS + k * ROWS); \


+ 1
- 1
lapack-netlib/LAPACKE/include/lapacke.h View File

@@ -12575,7 +12575,7 @@ lapack_int LAPACKE_zhetrs_aa_2stage_work( int matrix_layout, char uplo, lapack_i
/* APIs for set/get nancheck flags */
void LAPACKE_set_nancheck( int flag );
int LAPACKE_get_nancheck( );
int LAPACKE_get_nancheck( void );

#ifdef __cplusplus
}


+ 30
- 1
param.h View File

@@ -2623,7 +2623,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#define SYMV_P 16

#if defined(CORTEXA53) || defined(CORTEXA57) || \
#if defined(CORTEXA57) || \
defined(CORTEXA72) || defined(CORTEXA73) || \
defined(FALKOR) || defined(TSV110) || defined(EMAG8180)

@@ -2669,6 +2669,35 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
#define CGEMM_DEFAULT_R 4096
#define ZGEMM_DEFAULT_R 2048

#elif defined(CORTEXA53)

#define SGEMM_DEFAULT_UNROLL_M 8
#define SGEMM_DEFAULT_UNROLL_N 8

#define DGEMM_DEFAULT_UNROLL_M 8
#define DGEMM_DEFAULT_UNROLL_N 4

#define CGEMM_DEFAULT_UNROLL_M 8
#define CGEMM_DEFAULT_UNROLL_N 4

#define ZGEMM_DEFAULT_UNROLL_M 4
#define ZGEMM_DEFAULT_UNROLL_N 4

#define SGEMM_DEFAULT_P 256
#define DGEMM_DEFAULT_P 160
#define CGEMM_DEFAULT_P 128
#define ZGEMM_DEFAULT_P 128

#define SGEMM_DEFAULT_Q 256
#define DGEMM_DEFAULT_Q 128
#define CGEMM_DEFAULT_Q 224
#define ZGEMM_DEFAULT_Q 112

#define SGEMM_DEFAULT_R 4096
#define DGEMM_DEFAULT_R 4096
#define CGEMM_DEFAULT_R 4096
#define ZGEMM_DEFAULT_R 2048

#elif defined(THUNDERX)

#define SGEMM_DEFAULT_UNROLL_M 4


Loading…
Cancel
Save