* Fix ARMV9SME target and add support_sme1 code for MacOS * make sgemm_direct unconditionally available on all arm64 * build a (dummy) sgemm_direct kernel on all arm64 * Update dynamic_arm64.c (tags/v0.3.30)
@@ -224,10 +224,8 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); | |||||
int (*sgemm_direct_performant) (BLASLONG M, BLASLONG N, BLASLONG K); | int (*sgemm_direct_performant) (BLASLONG M, BLASLONG N, BLASLONG K); | ||||
#endif | #endif | ||||
#ifdef ARCH_ARM64 | #ifdef ARCH_ARM64 | ||||
#ifdef HAVE_SME | |||||
void (*sgemm_direct) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG , float *, BLASLONG , float * , BLASLONG); | void (*sgemm_direct) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG , float *, BLASLONG , float * , BLASLONG); | ||||
#endif | #endif | ||||
#endif | |||||
int (*sgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG); | int (*sgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG); | ||||
@@ -43,6 +43,14 @@ | |||||
#include <sys/auxv.h> | #include <sys/auxv.h> | ||||
#endif | #endif | ||||
#ifdef __APPLE__
#include <sys/sysctl.h>
/*
 * Scratch buffers for sysctlbyname() feature queries on Apple platforms.
 * Each length variable starts out as the size of its matching value buffer.
 * They are kept file-local (static) so that these very generic names are not
 * exported from the library and cannot clash with symbols in applications
 * that link against it.
 */
static int32_t value;
static size_t length = sizeof(value);
static int64_t value64;
static size_t length64 = sizeof(value64);
#endif
extern gotoblas_t gotoblas_ARMV8; | extern gotoblas_t gotoblas_ARMV8; | ||||
#ifdef DYNAMIC_LIST | #ifdef DYNAMIC_LIST | ||||
#ifdef DYN_CORTEXA53 | #ifdef DYN_CORTEXA53 | ||||
@@ -120,7 +128,7 @@ extern gotoblas_t gotoblas_ARMV9SME; | |||||
#else | #else | ||||
#define gotoblas_ARMV9SME gotoblas_ARMV8 | #define gotoblas_ARMV9SME gotoblas_ARMV8 | ||||
#endif | #endif | ||||
#ifdef DYN_CORTEX_A55 | |||||
#ifdef DYN_CORTEXA55 | |||||
extern gotoblas_t gotoblas_CORTEXA55; | extern gotoblas_t gotoblas_CORTEXA55; | ||||
#else | #else | ||||
#define gotoblas_CORTEXA55 gotoblas_ARMV8 | #define gotoblas_CORTEXA55 gotoblas_ARMV8 | ||||
@@ -147,17 +155,17 @@ extern gotoblas_t gotoblas_NEOVERSEV1; | |||||
extern gotoblas_t gotoblas_NEOVERSEN2; | extern gotoblas_t gotoblas_NEOVERSEN2; | ||||
extern gotoblas_t gotoblas_ARMV8SVE; | extern gotoblas_t gotoblas_ARMV8SVE; | ||||
extern gotoblas_t gotoblas_A64FX; | extern gotoblas_t gotoblas_A64FX; | ||||
#ifndef NO_SME | |||||
extern gotoblas_t gotoblas_ARMV9SME; | |||||
#else | |||||
#define gotoblas_ARMV9SME gotoblas_ARMV8SVE | |||||
#endif | |||||
#else | #else | ||||
#define gotoblas_NEOVERSEV1 gotoblas_ARMV8 | #define gotoblas_NEOVERSEV1 gotoblas_ARMV8 | ||||
#define gotoblas_NEOVERSEN2 gotoblas_ARMV8 | #define gotoblas_NEOVERSEN2 gotoblas_ARMV8 | ||||
#define gotoblas_ARMV8SVE gotoblas_ARMV8 | #define gotoblas_ARMV8SVE gotoblas_ARMV8 | ||||
#define gotoblas_A64FX gotoblas_ARMV8 | #define gotoblas_A64FX gotoblas_ARMV8 | ||||
#endif | |||||
#ifndef NO_SME | |||||
extern gotoblas_t gotoblas_ARMV9SME; | |||||
#else | |||||
#define gotoblas_ARMV9SME gotoblas_ARMV8SVE | |||||
#define gotoblas_ARMV9SME gotoblas_ARMV8 | |||||
#endif | #endif | ||||
extern gotoblas_t gotoblas_THUNDERX3T110; | extern gotoblas_t gotoblas_THUNDERX3T110; | ||||
@@ -168,7 +176,7 @@ extern void openblas_warning(int verbose, const char * msg); | |||||
#define FALLBACK_VERBOSE 1 | #define FALLBACK_VERBOSE 1 | ||||
#define NEOVERSEN1_FALLBACK "OpenBLAS : Your OS does not support SVE instructions. OpenBLAS is using Neoverse N1 kernels as a fallback, which may give poorer performance.\n" | #define NEOVERSEN1_FALLBACK "OpenBLAS : Your OS does not support SVE instructions. OpenBLAS is using Neoverse N1 kernels as a fallback, which may give poorer performance.\n" | ||||
#define NUM_CORETYPES 18 | |||||
#define NUM_CORETYPES 19 | |||||
/* | /* | ||||
* In case asm/hwcap.h is outdated on the build system, make sure | * In case asm/hwcap.h is outdated on the build system, make sure | ||||
@@ -207,6 +215,7 @@ static char *corename[] = { | |||||
"cortexa55", | "cortexa55", | ||||
"armv8sve", | "armv8sve", | ||||
"a64fx", | "a64fx", | ||||
"armv9sme", | |||||
"unknown" | "unknown" | ||||
}; | }; | ||||
@@ -229,6 +238,7 @@ char *gotoblas_corename(void) { | |||||
if (gotoblas == &gotoblas_CORTEXA55) return corename[15]; | if (gotoblas == &gotoblas_CORTEXA55) return corename[15]; | ||||
if (gotoblas == &gotoblas_ARMV8SVE) return corename[16]; | if (gotoblas == &gotoblas_ARMV8SVE) return corename[16]; | ||||
if (gotoblas == &gotoblas_A64FX) return corename[17]; | if (gotoblas == &gotoblas_A64FX) return corename[17]; | ||||
if (gotoblas == &gotoblas_ARMV9SME) return corename[18]; | |||||
return corename[NUM_CORETYPES]; | return corename[NUM_CORETYPES]; | ||||
} | } | ||||
@@ -266,6 +276,7 @@ static gotoblas_t *force_coretype(char *coretype) { | |||||
case 15: return (&gotoblas_CORTEXA55); | case 15: return (&gotoblas_CORTEXA55); | ||||
case 16: return (&gotoblas_ARMV8SVE); | case 16: return (&gotoblas_ARMV8SVE); | ||||
case 17: return (&gotoblas_A64FX); | case 17: return (&gotoblas_A64FX); | ||||
case 18: return (&gotoblas_ARMV9SME); | |||||
} | } | ||||
snprintf(message, 128, "Core not found: %s\n", coretype); | snprintf(message, 128, "Core not found: %s\n", coretype); | ||||
openblas_warning(1, message); | openblas_warning(1, message); | ||||
@@ -277,6 +288,11 @@ static gotoblas_t *get_coretype(void) { | |||||
char coremsg[128]; | char coremsg[128]; | ||||
#if defined (OS_DARWIN) | #if defined (OS_DARWIN) | ||||
//future #if !defined(NO_SME) | |||||
// if (support_sme1()) { | |||||
// return &gotoblas_ARMV9SME; | |||||
// } | |||||
// #endif | |||||
return &gotoblas_NEOVERSEN1; | return &gotoblas_NEOVERSEN1; | ||||
#endif | #endif | ||||
@@ -439,6 +455,7 @@ static gotoblas_t *get_coretype(void) { | |||||
} | } | ||||
break; | break; | ||||
case 0x61: // Apple | case 0x61: // Apple | ||||
//future if (support_sme1()) return &gotoblas_ARMV9SME; | |||||
return &gotoblas_NEOVERSEN1; | return &gotoblas_NEOVERSEN1; | ||||
break; | break; | ||||
default: | default: | ||||
@@ -446,8 +463,8 @@ static gotoblas_t *get_coretype(void) { | |||||
openblas_warning(1, coremsg); | openblas_warning(1, coremsg); | ||||
} | } | ||||
#if !defined(NO_SME) && defined(HWCAP2_SME) | |||||
if ((getauxval(AT_HWCAP2) & HWCAP2_SME)) { | |||||
#if !defined(NO_SME) | |||||
if (support_sme1()) { | |||||
return &gotoblas_ARMV9SME; | return &gotoblas_ARMV9SME; | ||||
} | } | ||||
#endif | #endif | ||||
@@ -511,6 +528,10 @@ int support_sme1(void) { | |||||
if(getauxval(AT_HWCAP2) & HWCAP2_SME){ | if(getauxval(AT_HWCAP2) & HWCAP2_SME){ | ||||
ret = 1; | ret = 1; | ||||
} | } | ||||
#endif | |||||
#if defined(__APPLE__) | |||||
sysctlbyname("hw.optional.arm.FEAT_SME",&value64,&length64,NULL,0); | |||||
ret = value64; | |||||
#endif | #endif | ||||
return ret; | return ret; | ||||
} | } |
@@ -208,7 +208,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||||
set(USE_TRMM true) | set(USE_TRMM true) | ||||
endif () | endif () | ||||
set(USE_DIRECT_SGEMM false) | set(USE_DIRECT_SGEMM false) | ||||
if (X86_64 OR (ARM64 AND (UC_TARGET_CORE MATCHES ARMV9SME))) | |||||
if (X86_64 OR ARM64) | |||||
set(USE_DIRECT_SGEMM true) | set(USE_DIRECT_SGEMM true) | ||||
endif() | endif() | ||||
@@ -225,9 +225,11 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||||
set (SGEMMDIRECTSMEKERNEL sgemm_direct_sme1.S) | set (SGEMMDIRECTSMEKERNEL sgemm_direct_sme1.S) | ||||
set (SGEMMDIRECTPREKERNEL sgemm_direct_sme1_preprocess.S) | set (SGEMMDIRECTPREKERNEL sgemm_direct_sme1_preprocess.S) | ||||
GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTKERNEL}" "" "gemm_direct" false "" "" false SINGLE) | GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTKERNEL}" "" "gemm_direct" false "" "" false SINGLE) | ||||
if (HAVE_SME) | |||||
GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTSMEKERNEL}" "" "gemm_direct_sme1" false "" "" false SINGLE) | GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTSMEKERNEL}" "" "gemm_direct_sme1" false "" "" false SINGLE) | ||||
GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTPREKERNEL}" "" "gemm_direct_sme1_preprocess" false "" "" false SINGLE) | GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTPREKERNEL}" "" "gemm_direct_sme1_preprocess" false "" "" false SINGLE) | ||||
endif () | endif () | ||||
endif () | |||||
endif() | endif() | ||||
foreach (float_type SINGLE DOUBLE) | foreach (float_type SINGLE DOUBLE) | ||||
@@ -103,8 +103,8 @@ endif | |||||
ifeq ($(ARCH), arm64) | ifeq ($(ARCH), arm64) | ||||
ifeq ($(TARGET_CORE), ARMV9SME) | ifeq ($(TARGET_CORE), ARMV9SME) | ||||
HAVE_SME = 1 | HAVE_SME = 1 | ||||
SGEMMDIRECTKERNEL = sgemm_direct_arm64_sme1.c | |||||
endif | endif | ||||
SGEMMDIRECTKERNEL = sgemm_direct_arm64_sme1.c | |||||
endif | endif | ||||
endif | endif | ||||
endif | endif | ||||
@@ -143,9 +143,10 @@ SKERNELOBJS += \ | |||||
sgemm_direct_performant$(TSUFFIX).$(SUFFIX) | sgemm_direct_performant$(TSUFFIX).$(SUFFIX) | ||||
endif | endif | ||||
ifeq ($(ARCH), arm64) | ifeq ($(ARCH), arm64) | ||||
SKERNELOBJS += \ | |||||
sgemm_direct$(TSUFFIX).$(SUFFIX) | |||||
ifdef HAVE_SME | ifdef HAVE_SME | ||||
SKERNELOBJS += \ | SKERNELOBJS += \ | ||||
sgemm_direct$(TSUFFIX).$(SUFFIX) \ | |||||
sgemm_direct_sme1$(TSUFFIX).$(SUFFIX) \ | sgemm_direct_sme1$(TSUFFIX).$(SUFFIX) \ | ||||
sgemm_direct_sme1_preprocess$(TSUFFIX).$(SUFFIX) | sgemm_direct_sme1_preprocess$(TSUFFIX).$(SUFFIX) | ||||
endif | endif | ||||
@@ -835,9 +836,9 @@ $(KDIR)sgemm_direct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTKERNEL) | |||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ | $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ | ||||
endif | endif | ||||
ifeq ($(ARCH), arm64) | ifeq ($(ARCH), arm64) | ||||
ifdef HAVE_SME | |||||
$(KDIR)sgemm_direct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTKERNEL) | $(KDIR)sgemm_direct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTKERNEL) | ||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ | $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ | ||||
ifdef HAVE_SME | |||||
$(KDIR)sgemm_direct_sme1$(TSUFFIX).$(SUFFIX) : | $(KDIR)sgemm_direct_sme1$(TSUFFIX).$(SUFFIX) : | ||||
$(CC) $(CFLAGS) -c $(KERNELDIR)/sgemm_direct_sme1.S -UDOUBLE -UCOMPLEX -o $@ | $(CC) $(CFLAGS) -c $(KERNELDIR)/sgemm_direct_sme1.S -UDOUBLE -UCOMPLEX -o $@ | ||||
$(KDIR)sgemm_direct_sme1_preprocess$(TSUFFIX).$(SUFFIX) : | $(KDIR)sgemm_direct_sme1_preprocess$(TSUFFIX).$(SUFFIX) : | ||||
@@ -71,4 +71,10 @@ void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A,\ | |||||
free(A_mod); | free(A_mod); | ||||
} | } | ||||
#else | |||||
/*
 * Empty-bodied definition of CNAME for the #else branch: it accepts the
 * M/N/K sizes, the A, B and R pointers and their strides, and does nothing
 * with them (R is left untouched).
 * NOTE(review): the change description calls this the "(dummy) sgemm_direct
 * kernel" built on all arm64; the condition selecting this branch is outside
 * this view -- confirm.
 */
void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A,\ | |||||
BLASLONG strideA, float * __restrict B, BLASLONG strideB ,\ | |||||
float * __restrict R, BLASLONG strideR){} | |||||
#endif | #endif |
@@ -180,9 +180,7 @@ gotoblas_t TABLE_NAME = { | |||||
sgemm_direct_performantTS, | sgemm_direct_performantTS, | ||||
#endif | #endif | ||||
#ifdef ARCH_ARM64 | #ifdef ARCH_ARM64 | ||||
#ifdef HAVE_SME | |||||
sgemm_directTS, | sgemm_directTS, | ||||
#endif | |||||
#endif | #endif | ||||
sgemm_kernelTS, sgemm_betaTS, | sgemm_kernelTS, sgemm_betaTS, | ||||