| @@ -56,6 +56,16 @@ CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 | |||
| FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 | |||
| endif | |||
| # ThunderX3 T110: native -march/-mtune only when GCCVERSIONGTEQ10 is 1; | |||
| # every other toolchain reuses the ThunderX2 T99 flags. | |||
| ifeq ($(CORE), THUNDERX3T110) | |||
| ifneq ($(GCCVERSIONGTEQ10), 1) | |||
| # Fallback path: ARMv8.1-A baseline tuned for ThunderX2 T99. | |||
| FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 | |||
| CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 | |||
| else | |||
| FCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110 | |||
| CCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110 | |||
| endif | |||
| endif | |||
| ifeq ($(GCCVERSIONGTEQ9), 1) | |||
| ifeq ($(CORE), TSV110) | |||
| CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 | |||
| @@ -11,34 +11,34 @@ endif | |||
| ifeq ($(CORE), POWER10) | |||
| ifeq ($(USE_OPENMP), 1) | |||
| COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp | |||
| FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp | |||
| COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -DUSE_OPENMP -fno-fast-math -fopenmp | |||
| FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -DUSE_OPENMP -fno-fast-math -fopenmp | |||
| else | |||
| COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -malign-power -fno-fast-math | |||
| FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -malign-power -fno-fast-math | |||
| COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math | |||
| FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -fno-fast-math | |||
| endif | |||
| endif | |||
| ifeq ($(CORE), POWER9) | |||
| ifeq ($(USE_OPENMP), 1) | |||
| ifneq ($(C_COMPILER), PGI) | |||
| CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp | |||
| CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -DUSE_OPENMP -fno-fast-math -fopenmp | |||
| else | |||
| CCOMMON_OPT += -fast -Mvect=simd -Mcache_align -DUSE_OPENMP -mp | |||
| endif | |||
| ifneq ($(F_COMPILER), PGI) | |||
| FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp | |||
| FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -DUSE_OPENMP -fno-fast-math -fopenmp | |||
| else | |||
| FCOMMON_OPT += -O2 -Mrecursive -DUSE_OPENMP -mp | |||
| endif | |||
| else | |||
| ifneq ($(C_COMPILER), PGI) | |||
| CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -fno-fast-math | |||
| CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math | |||
| else | |||
| CCOMMON_OPT += -fast -Mvect=simd -Mcache_align | |||
| endif | |||
| ifneq ($(F_COMPILER), PGI) | |||
| FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -malign-power -fno-fast-math | |||
| FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -fno-fast-math | |||
| else | |||
| FCOMMON_OPT += -O2 -Mrecursive | |||
| endif | |||
| @@ -48,26 +48,26 @@ endif | |||
| ifeq ($(CORE), POWER8) | |||
| ifeq ($(USE_OPENMP), 1) | |||
| ifneq ($(C_COMPILER), PGI) | |||
| CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp | |||
| CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -DUSE_OPENMP -fno-fast-math -fopenmp | |||
| else | |||
| CCOMMON_OPT += -fast -Mvect=simd -Mcache_align -DUSE_OPENMP -mp | |||
| endif | |||
| ifneq ($(F_COMPILER), PGI) | |||
| FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp | |||
| FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -DUSE_OPENMP -fno-fast-math -fopenmp | |||
| else | |||
| FCOMMON_OPT += -O2 -Mrecursive -DUSE_OPENMP -mp | |||
| endif | |||
| else | |||
| ifneq ($(C_COMPILER), PGI) | |||
| CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -fno-fast-math | |||
| CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math | |||
| else | |||
| CCOMMON_OPT += -fast -Mvect=simd -Mcache_align | |||
| endif | |||
| ifneq ($(F_COMPILER), PGI) | |||
| ifeq ($(OSNAME), AIX) | |||
| FCOMMON_OPT += -O1 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fast-math | |||
| FCOMMON_OPT += -O1 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math | |||
| else | |||
| FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fast-math | |||
| FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math | |||
| endif | |||
| else | |||
| FCOMMON_OPT += -O2 -Mrecursive | |||
| @@ -578,6 +578,7 @@ DYNAMIC_CORE += THUNDERX | |||
| DYNAMIC_CORE += THUNDERX2T99 | |||
| DYNAMIC_CORE += TSV110 | |||
| DYNAMIC_CORE += EMAG8180 | |||
| DYNAMIC_CORE += THUNDERX3T110 | |||
| endif | |||
| ifeq ($(ARCH), zarch) | |||
| @@ -617,7 +618,6 @@ DYNAMIC_CORE += POWER8 | |||
| ifneq ($(C_COMPILER), GCC) | |||
| DYNAMIC_CORE += POWER9 | |||
| DYNAMIC_CORE += POWER10 | |||
| override LDFLAGS += -Wl,-no-power10-stubs | |||
| endif | |||
| ifeq ($(C_COMPILER), GCC) | |||
| ifeq ($(GCCVERSIONGT5), 1) | |||
| @@ -627,11 +627,9 @@ $(info, OpenBLAS: Your gcc version is too old to build the POWER9 kernels.) | |||
| endif | |||
| ifeq ($(GCCVERSIONGTEQ11), 1) | |||
| DYNAMIC_CORE += POWER10 | |||
| override LDFLAGS += -Wl,-no-power10-stubs | |||
| else ifeq ($(GCCVERSIONGTEQ10), 1) | |||
| ifeq ($(GCCMINORVERSIONGTEQ2), 1) | |||
| DYNAMIC_CORE += POWER10 | |||
| override LDFLAGS += -Wl,-no-power10-stubs | |||
| endif | |||
| else | |||
| # The function name must be followed by whitespace: "$(info, ...)" is parsed as a | |||
| # reference to an (undefined) variable and prints nothing. | |||
| $(info OpenBLAS: Your gcc version is too old to build the POWER10 kernels.) | |||
| @@ -1241,7 +1239,9 @@ KERNELDIR = $(TOPDIR)/kernel/$(ARCH) | |||
| include $(TOPDIR)/Makefile.$(ARCH) | |||
| # Undefine the per-object naming macros before they are defined again on the | |||
| # following line; skipped when the C compiler is PGI. | |||
| ifneq ($(C_COMPILER), PGI) | |||
| CCOMMON_OPT += -UASMNAME -UASMFNAME -UNAME | |||
| CCOMMON_OPT += -UCNAME -UCHAR_NAME -UCHAR_CNAME | |||
| endif | |||
| CCOMMON_OPT += -DASMNAME=$(FU)$(*F) -DASMFNAME=$(FU)$(*F)$(BU) -DNAME=$(*F)$(BU) -DCNAME=$(*F) -DCHAR_NAME=\"$(*F)$(BU)\" -DCHAR_CNAME=\"$(*F)\" | |||
| ifeq ($(CORE), PPC440) | |||
| @@ -28,7 +28,8 @@ You can download them from [file hosting on sourceforge.net](https://sourceforge | |||
| ## Installation from Source | |||
| Download from project homepage, https://xianyi.github.com/OpenBLAS/, or check out the code | |||
| using Git from https://github.com/xianyi/OpenBLAS.git. | |||
| using Git from https://github.com/xianyi/OpenBLAS.git. (If you want the most up-to-date version, be | |||
| sure to use the `develop` branch; `master` is several years out of date due to a change of maintainership.) | |||
| Buildtime parameters can be chosen in Makefile.rule, see there for a short description of each option. | |||
| Most can also be given directly on the make or cmake command line. | |||
| @@ -96,6 +96,7 @@ FALKOR | |||
| THUNDERX | |||
| THUNDERX2T99 | |||
| TSV110 | |||
| THUNDERX3T110 | |||
| 9.System Z: | |||
| ZARCH_GENERIC | |||
| @@ -45,7 +45,7 @@ endif () | |||
| if (DYNAMIC_ARCH) | |||
| if (ARM64) | |||
| set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1) | |||
| set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) | |||
| endif () | |||
| if (POWER) | |||
| @@ -195,8 +195,13 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS | |||
| "#define HAVE_VFP\n" | |||
| "#define HAVE_NEON\n" | |||
| "#define ARMV8\n") | |||
| # SGEMM blocking: 16x4 for CORTEXA57, 8x8 for any other core handled by this branch. | |||
| if (NOT "${TCORE}" STREQUAL "CORTEXA57") | |||
| set(SGEMM_UNROLL_N 8) | |||
| set(SGEMM_UNROLL_M 8) | |||
| else () | |||
| set(SGEMM_UNROLL_N 4) | |||
| set(SGEMM_UNROLL_M 16) | |||
| endif () | |||
| set(DGEMM_UNROLL_M 8) | |||
| set(DGEMM_UNROLL_N 4) | |||
| set(CGEMM_UNROLL_M 8) | |||
| @@ -338,6 +343,33 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS | |||
| set(ZGEMM_UNROLL_M 4) | |||
| set(ZGEMM_UNROLL_N 4) | |||
| set(SYMV_P 16) | |||
| # Cross-compile configuration for ThunderX3 T110: cache/TLB macros appended to the | |||
| # target config header, followed by the GEMM unroll factors. | |||
| # L1_DATA_SIZE is 32768 to agree with the runtime-detected configuration and the | |||
| # FORCE_THUNDERX3T110 ARCHCONFIG string (only the code cache is 65536 there). | |||
| elseif ("${TCORE}" STREQUAL "THUNDERX3T110") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define THUNDERX3T110\n" | |||
| "#define L1_CODE_SIZE\t65536\n" | |||
| "#define L1_CODE_LINESIZE\t64\n" | |||
| "#define L1_CODE_ASSOCIATIVE\t8\n" | |||
| "#define L1_DATA_SIZE\t32768\n" | |||
| "#define L1_DATA_LINESIZE\t64\n" | |||
| "#define L1_DATA_ASSOCIATIVE\t8\n" | |||
| "#define L2_SIZE\t524288\n" | |||
| "#define L2_LINESIZE\t64\n" | |||
| "#define L2_ASSOCIATIVE\t8\n" | |||
| "#define L3_SIZE\t94371840\n" | |||
| "#define L3_LINESIZE\t64\n" | |||
| "#define L3_ASSOCIATIVE\t32\n" | |||
| "#define DTB_DEFAULT_ENTRIES\t64\n" | |||
| "#define DTB_SIZE\t4096\n" | |||
| "#define ARMV8\n") | |||
| set(SGEMM_UNROLL_M 16) | |||
| set(SGEMM_UNROLL_N 4) | |||
| set(DGEMM_UNROLL_M 8) | |||
| set(DGEMM_UNROLL_N 4) | |||
| set(CGEMM_UNROLL_M 8) | |||
| set(CGEMM_UNROLL_N 4) | |||
| set(ZGEMM_UNROLL_M 4) | |||
| set(ZGEMM_UNROLL_N 4) | |||
| set(SYMV_P 16) | |||
| elseif ("${TCORE}" STREQUAL "TSV110") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define ARMV8\n" | |||
| @@ -40,6 +40,7 @@ | |||
| // Cavium | |||
| #define CPU_THUNDERX 7 | |||
| #define CPU_THUNDERX2T99 8 | |||
| #define CPU_THUNDERX3T110 12 | |||
| //Hisilicon | |||
| #define CPU_TSV110 9 | |||
| // Ampere | |||
| @@ -57,7 +58,8 @@ static char *cpuname[] = { | |||
| "THUNDERX2T99", | |||
| "TSV110", | |||
| "EMAG8180", | |||
| "NEOVERSEN1" | |||
| "NEOVERSEN1", | |||
| "THUNDERX3T110" | |||
| }; | |||
| static char *cpuname_lower[] = { | |||
| @@ -72,7 +74,8 @@ static char *cpuname_lower[] = { | |||
| "thunderx2t99", | |||
| "tsv110", | |||
| "emag8180", | |||
| "neoversen1" | |||
| "neoversen1", | |||
| "thunderx3t110" | |||
| }; | |||
| int get_feature(char *search) | |||
| @@ -158,6 +161,8 @@ int detect(void) | |||
| return CPU_THUNDERX; | |||
| else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0af")) | |||
| return CPU_THUNDERX2T99; | |||
| else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0b8")) | |||
| return CPU_THUNDERX3T110; | |||
| // HiSilicon | |||
| else if (strstr(cpu_implementer, "0x48") && strstr(cpu_part, "0xd01")) | |||
| return CPU_TSV110; | |||
| @@ -372,7 +377,25 @@ void get_cpuconfig(void) | |||
| printf("#define L2_LINESIZE 64\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| break; | |||
| case CPU_THUNDERX3T110: | |||
| /* Print the ThunderX3 T110 configuration macros with a single call; the | |||
|    adjacent literals concatenate to the same text, line for line. | |||
|    The text contains no '%' so it is safe as a printf format. */ | |||
| printf("#define THUNDERX3T110 \n" | |||
| "#define L1_CODE_SIZE 65536 \n" | |||
| "#define L1_CODE_LINESIZE 64 \n" | |||
| "#define L1_CODE_ASSOCIATIVE 8 \n" | |||
| "#define L1_DATA_SIZE 32768 \n" | |||
| "#define L1_DATA_LINESIZE 64 \n" | |||
| "#define L1_DATA_ASSOCIATIVE 8 \n" | |||
| "#define L2_SIZE 524288 \n" | |||
| "#define L2_LINESIZE 64 \n" | |||
| "#define L2_ASSOCIATIVE 8 \n" | |||
| "#define L3_SIZE 94371840 \n" | |||
| "#define L3_LINESIZE 64 \n" | |||
| "#define L3_ASSOCIATIVE 32 \n" | |||
| "#define DTB_DEFAULT_ENTRIES 64 \n" | |||
| "#define DTB_SIZE 4096 \n"); | |||
| break; | |||
| } | |||
| get_cpucount(); | |||
| } | |||
| @@ -1454,10 +1454,11 @@ int get_cpuname(void){ | |||
| return CPUTYPE_OPTERON; | |||
| case 1: | |||
| case 3: | |||
| case 7: | |||
| case 10: | |||
| // case 7: | |||
| // case 10: | |||
| return CPUTYPE_BARCELONA; | |||
| case 5: | |||
| case 7: | |||
| return CPUTYPE_BOBCAT; | |||
| case 6: | |||
| switch (model) { | |||
| @@ -1507,6 +1508,8 @@ int get_cpuname(void){ | |||
| // AMD Ryzen | |||
| case 8: | |||
| // AMD Ryzen2 | |||
| default: | |||
| // Matisse/Renoir and other recent Ryzen2 | |||
| if(support_avx()) | |||
| #ifndef NO_AVX2 | |||
| return CPUTYPE_ZEN; | |||
| @@ -1516,6 +1519,16 @@ int get_cpuname(void){ | |||
| else | |||
| return CPUTYPE_BARCELONA; | |||
| } | |||
| break; | |||
| case 10: // Zen3 | |||
| if(support_avx()) | |||
| #ifndef NO_AVX2 | |||
| return CPUTYPE_ZEN; | |||
| #else | |||
| return CPUTYPE_SANDYBRIDGE; // Zen is closer in architecture to Sandy Bridge than to Excavator | |||
| #endif | |||
| else | |||
| return CPUTYPE_BARCELONA; | |||
| } | |||
| break; | |||
| } | |||
| @@ -2107,7 +2120,7 @@ int get_coretype(void){ | |||
| return CORE_PILEDRIVER; | |||
| else | |||
| return CORE_BARCELONA; //OS don't support AVX. | |||
| case 5: // New EXCAVATOR | |||
| case 5: // New EXCAVATOR | |||
| if(support_avx()) | |||
| return CORE_EXCAVATOR; | |||
| else | |||
| @@ -2135,12 +2148,14 @@ int get_coretype(void){ | |||
| } | |||
| break; | |||
| } | |||
| } else if (exfamily == 8) { | |||
| } else if (exfamily == 8 || exfamily == 10) { | |||
| switch (model) { | |||
| case 1: | |||
| // AMD Ryzen | |||
| case 8: | |||
| // Ryzen 2 | |||
| // Ryzen 2 | |||
| default: | |||
| // Matisse,Renoir Ryzen2 models | |||
| if(support_avx()) | |||
| #ifndef NO_AVX2 | |||
| return CORE_ZEN; | |||
| @@ -656,7 +656,7 @@ static gotoblas_t *get_coretype(void){ | |||
| if ((exfamily == 0) || (exfamily == 2)) { | |||
| if (ecx & (1 << 0)) return &gotoblas_OPTERON_SSE3; | |||
| else return &gotoblas_OPTERON; | |||
| } else if (exfamily == 5) { | |||
| } else if (exfamily == 5 || exfamily == 7) { | |||
| return &gotoblas_BOBCAT; | |||
| } else if (exfamily == 6) { | |||
| if(model == 1){ | |||
| @@ -710,7 +710,7 @@ static gotoblas_t *get_coretype(void){ | |||
| } | |||
| } | |||
| } else if (exfamily == 8) { | |||
| if (model == 1 || model == 8) { | |||
| /* if (model == 1 || model == 8) */ { | |||
| if(support_avx()) | |||
| return &gotoblas_ZEN; | |||
| else{ | |||
| @@ -718,16 +718,24 @@ static gotoblas_t *get_coretype(void){ | |||
| return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. | |||
| } | |||
| } | |||
| } else if (exfamily == 9) { | |||
| } else if (exfamily == 9) { | |||
| if(support_avx()) | |||
| return &gotoblas_ZEN; | |||
| else{ | |||
| openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); | |||
| return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. | |||
| } | |||
| } | |||
| } else if (exfamily == 10) { | |||
| if(support_avx()) | |||
| return &gotoblas_ZEN; | |||
| else{ | |||
| openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); | |||
| return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. | |||
| } | |||
| }else { | |||
| return &gotoblas_BARCELONA; | |||
| } | |||
| } | |||
| } | |||
| @@ -53,10 +53,11 @@ extern gotoblas_t gotoblas_THUNDERX2T99; | |||
| extern gotoblas_t gotoblas_TSV110; | |||
| extern gotoblas_t gotoblas_EMAG8180; | |||
| extern gotoblas_t gotoblas_NEOVERSEN1; | |||
| extern gotoblas_t gotoblas_THUNDERX3T110; | |||
| extern void openblas_warning(int verbose, const char * msg); | |||
| #define NUM_CORETYPES 11 | |||
| #define NUM_CORETYPES 12 | |||
| /* | |||
| * In case asm/hwcap.h is outdated on the build system, make sure | |||
| @@ -82,6 +83,7 @@ static char *corename[] = { | |||
| "tsv110", | |||
| "emag8180", | |||
| "neoversen1", | |||
| "thunderx3t110", | |||
| "unknown" | |||
| }; | |||
| @@ -97,6 +99,7 @@ char *gotoblas_corename(void) { | |||
| if (gotoblas == &gotoblas_TSV110) return corename[ 8]; | |||
| if (gotoblas == &gotoblas_EMAG8180) return corename[ 9]; | |||
| if (gotoblas == &gotoblas_NEOVERSEN1) return corename[10]; | |||
| if (gotoblas == &gotoblas_THUNDERX3T110) return corename[11]; | |||
| return corename[NUM_CORETYPES]; | |||
| } | |||
| @@ -127,6 +130,7 @@ static gotoblas_t *force_coretype(char *coretype) { | |||
| case 8: return (&gotoblas_TSV110); | |||
| case 9: return (&gotoblas_EMAG8180); | |||
| case 10: return (&gotoblas_NEOVERSEN1); | |||
| case 11: return (&gotoblas_THUNDERX3T110); | |||
| } | |||
| snprintf(message, 128, "Core not found: %s\n", coretype); | |||
| openblas_warning(1, message); | |||
| @@ -190,6 +194,8 @@ static gotoblas_t *get_coretype(void) { | |||
| return &gotoblas_THUNDERX; | |||
| case 0x0af: // ThunderX2 | |||
| return &gotoblas_THUNDERX2T99; | |||
| case 0x0b8: // ThunderX3 | |||
| return &gotoblas_THUNDERX3T110; | |||
| } | |||
| break; | |||
| case 0x48: // HiSilicon | |||
| @@ -1174,6 +1174,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define CORENAME "EMAG8180" | |||
| #endif | |||
| /* Forced target: Marvell ThunderX3 T110 (ARM64). | |||
|    ARCHCONFIG concatenates to the same flag string as before; only the | |||
|    line breaks between the literals differ. */ | |||
| #ifdef FORCE_THUNDERX3T110 | |||
| #define FORCE | |||
| #define ARMV8 | |||
| #define ARCHITECTURE "ARM64" | |||
| #define SUBARCHITECTURE "THUNDERX3T110" | |||
| #define CORENAME "THUNDERX3T110" | |||
| #define LIBNAME "thunderx3t110" | |||
| #define SUBDIRNAME "arm64" | |||
| #define ARCHCONFIG "-DTHUNDERX3T110 -DL1_CODE_SIZE=65536 " \ | |||
| "-DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=8 " \ | |||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | |||
| "-DL1_DATA_ASSOCIATIVE=8 -DL2_SIZE=524288 " \ | |||
| "-DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 -DL3_SIZE=94371840 " \ | |||
| "-DL3_LINESIZE=64 -DL3_ASSOCIATIVE=32 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DHAVE_VFPV4 " \ | |||
| "-DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" | |||
| #endif | |||
| #ifdef FORCE_ZARCH_GENERIC | |||
| #define FORCE | |||
| #define ARCHITECTURE "ZARCH" | |||
| @@ -42,7 +42,7 @@ | |||
| #include "functable.h" | |||
| #endif | |||
| #if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8) | |||
| #if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8) || defined(THUNDERX3T110) | |||
| // Multithreaded swap gives performance benefits in ThunderX2T99 | |||
| #else | |||
| // Disable multi-threading as it does not show any performance | |||
| @@ -42,7 +42,7 @@ | |||
| #include "functable.h" | |||
| #endif | |||
| #if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8) | |||
| #if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8) || defined(THUNDERX3T110) | |||
| // Multithreaded swap gives performance benefits in ThunderX2T99 | |||
| #else | |||
| // Disable multi-threading as it does not show any performance | |||
| @@ -10,6 +10,11 @@ ifeq ($(C_COMPILER), GCC) | |||
| GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) | |||
| endif | |||
| # clang on the power architecture: append -fno-integrated-as to CFLAGS | |||
| # (override, so the flag is kept even when CFLAGS comes from the command line). | |||
| ifeq ($(C_COMPILER), CLANG) | |||
| ifeq ($(ARCH), power) | |||
| override CFLAGS += -fno-integrated-as | |||
| endif | |||
| endif | |||
| AVX2OPT = | |||
| ifeq ($(C_COMPILER), GCC) | |||
| # AVX2 support was added in 4.7.0 | |||
| @@ -44,8 +44,10 @@ USE_TRMM = 1 | |||
| endif | |||
| # POWER8 enables USE_TRMM only when BINARY64 is 1. | |||
| ifeq ($(BINARY64),1) | |||
| ifeq ($(CORE), POWER8) | |||
| USE_TRMM = 1 | |||
| endif | |||
| endif | |||
| ifeq ($(CORE), POWER9) | |||
| USE_TRMM = 1 | |||
| @@ -48,10 +48,12 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA | |||
| dot[0]=0.0; | |||
| dot[1]=0.0; | |||
| #if !defined(__PPC__) | |||
| CREAL(result) = 0.0 ; | |||
| CIMAG(result) = 0.0 ; | |||
| #else | |||
| result = OPENBLAS_MAKE_COMPLEX_FLOAT(0.0,0.0); | |||
| #endif | |||
| if ( n < 1 ) return(result); | |||
| inc_x2 = 2 * inc_x ; | |||
| @@ -71,8 +73,12 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA | |||
| i++ ; | |||
| } | |||
| CREAL(result) = dot[0]; | |||
| /* Store the accumulated dot product in the return value. The platform test is | |||
|    __PPC__, the same macro that guards the initialisation of result earlier in | |||
|    this function, so both sites always take the same branch. */ | |||
| #if !defined(__PPC__) | |||
| CREAL(result) = dot[0]; | |||
| CIMAG(result) = dot[1]; | |||
| #else | |||
| result = OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0],dot[1]); | |||
| #endif | |||
| return(result); | |||
| } | |||
| @@ -0,0 +1,184 @@ | |||
| # --- Kernels built from the C sources under ../arm ------------------------- | |||
| # Absolute-minimum value | |||
| SAMINKERNEL  = ../arm/amin.c | |||
| DAMINKERNEL  = ../arm/amin.c | |||
| CAMINKERNEL  = ../arm/zamin.c | |||
| ZAMINKERNEL  = ../arm/zamin.c | |||
| # Plain maximum / minimum (real precisions only) | |||
| SMINKERNEL   = ../arm/min.c | |||
| DMINKERNEL   = ../arm/min.c | |||
| SMAXKERNEL   = ../arm/max.c | |||
| DMAXKERNEL   = ../arm/max.c | |||
| # Index-returning variants | |||
| ISMINKERNEL  = ../arm/imin.c | |||
| IDMINKERNEL  = ../arm/imin.c | |||
| ISMAXKERNEL  = ../arm/imax.c | |||
| IDMAXKERNEL  = ../arm/imax.c | |||
| ISAMINKERNEL = ../arm/iamin.c | |||
| IDAMINKERNEL = ../arm/iamin.c | |||
| ICAMINKERNEL = ../arm/izamin.c | |||
| IZAMINKERNEL = ../arm/izamin.c | |||
| # --- TRSM kernels: every precision uses the ../generic C sources ----------- | |||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| # --- Assembly kernels from this directory ----------------------------------- | |||
| # Real precisions | |||
| SAMAXKERNEL  = amax.S | |||
| DAMAXKERNEL  = amax.S | |||
| SAXPYKERNEL  = axpy.S | |||
| # DAXPY is the only routine in this group using a ThunderX2 T99 specific source. | |||
| DAXPYKERNEL  = daxpy_thunderx2t99.S | |||
| SROTKERNEL   = rot.S | |||
| DROTKERNEL   = rot.S | |||
| SSCALKERNEL  = scal.S | |||
| DSCALKERNEL  = scal.S | |||
| SGEMVNKERNEL = gemv_n.S | |||
| DGEMVNKERNEL = gemv_n.S | |||
| SGEMVTKERNEL = gemv_t.S | |||
| DGEMVTKERNEL = gemv_t.S | |||
| # Complex precisions | |||
| CAMAXKERNEL  = zamax.S | |||
| ZAMAXKERNEL  = zamax.S | |||
| CAXPYKERNEL  = zaxpy.S | |||
| ZAXPYKERNEL  = zaxpy.S | |||
| CROTKERNEL   = zrot.S | |||
| ZROTKERNEL   = zrot.S | |||
| CSCALKERNEL  = zscal.S | |||
| ZSCALKERNEL  = zscal.S | |||
| CGEMVNKERNEL = zgemv_n.S | |||
| ZGEMVNKERNEL = zgemv_n.S | |||
| CGEMVTKERNEL = zgemv_t.S | |||
| ZGEMVTKERNEL = zgemv_t.S | |||
| # --- Single precision: packing routines and TRMM kernel --------------------- | |||
| # Outer (N-side) copies are always built from the generic sources. | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMONCOPY    = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c | |||
| SGEMMOTCOPY    = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c | |||
| # Inner (M-side) copies are only set when the two unroll factors differ. | |||
| ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) | |||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMINCOPY    = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c | |||
| SGEMMITCOPY    = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c | |||
| endif | |||
| # TRMM kernel file name is derived from the unroll factors. | |||
| STRMMKERNEL    = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S | |||
| # --- Double precision: packing routines and TRMM kernel --------------------- | |||
| # Outer (N-side) copies: assembly when DGEMM_UNROLL_N is 4, generic C otherwise. | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ifneq ($(DGEMM_UNROLL_N), 4) | |||
| DGEMMONCOPY    = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c | |||
| DGEMMOTCOPY    = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c | |||
| else | |||
| DGEMMONCOPY    = dgemm_ncopy_$(DGEMM_UNROLL_N).S | |||
| DGEMMOTCOPY    = dgemm_tcopy_$(DGEMM_UNROLL_N).S | |||
| endif | |||
| # Inner (M-side) copies are only set when the two unroll factors differ: | |||
| # assembly when DGEMM_UNROLL_M is 8, generic C otherwise. | |||
| ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) | |||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| ifneq ($(DGEMM_UNROLL_M), 8) | |||
| DGEMMINCOPY    = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c | |||
| DGEMMITCOPY    = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c | |||
| else | |||
| DGEMMINCOPY    = dgemm_ncopy_$(DGEMM_UNROLL_M).S | |||
| DGEMMITCOPY    = dgemm_tcopy_$(DGEMM_UNROLL_M).S | |||
| endif | |||
| endif | |||
| # TRMM kernel file name is derived from the unroll factors. | |||
| DTRMMKERNEL    = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S | |||
| # --- Complex single precision: packing routines and TRMM kernel ------------- | |||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMONCOPY    = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c | |||
| CGEMMOTCOPY    = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c | |||
| # Inner copies are only set when the two unroll factors differ. | |||
| ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) | |||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMINCOPY    = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c | |||
| CGEMMITCOPY    = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c | |||
| endif | |||
| CTRMMKERNEL    = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | |||
| # --- Complex double precision: packing routines and TRMM kernel ------------- | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMONCOPY    = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c | |||
| ZGEMMOTCOPY    = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c | |||
| # Inner copies are only set when the two unroll factors differ. | |||
| ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) | |||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMINCOPY    = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c | |||
| ZGEMMITCOPY    = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c | |||
| endif | |||
| ZTRMMKERNEL    = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | |||
| # --- Level-1 kernels taken from the *_thunderx2t99 sources ------------------ | |||
| # Absolute sum: one source per precision | |||
| SASUMKERNEL  = sasum_thunderx2t99.c | |||
| DASUMKERNEL  = dasum_thunderx2t99.c | |||
| CASUMKERNEL  = casum_thunderx2t99.c | |||
| ZASUMKERNEL  = zasum_thunderx2t99.c | |||
| # Index of absolute maximum | |||
| ISAMAXKERNEL = iamax_thunderx2t99.c | |||
| IDAMAXKERNEL = iamax_thunderx2t99.c | |||
| ICAMAXKERNEL = izamax_thunderx2t99.c | |||
| IZAMAXKERNEL = izamax_thunderx2t99.c | |||
| # Copy and swap: a single source serves all four precisions | |||
| SCOPYKERNEL  = copy_thunderx2t99.c | |||
| DCOPYKERNEL  = copy_thunderx2t99.c | |||
| CCOPYKERNEL  = copy_thunderx2t99.c | |||
| ZCOPYKERNEL  = copy_thunderx2t99.c | |||
| SSWAPKERNEL  = swap_thunderx2t99.S | |||
| DSWAPKERNEL  = swap_thunderx2t99.S | |||
| CSWAPKERNEL  = swap_thunderx2t99.S | |||
| ZSWAPKERNEL  = swap_thunderx2t99.S | |||
| # Dot products (DSDOT uses the plain dot.S assembly source) | |||
| SDOTKERNEL   = dot_thunderx2t99.c | |||
| DDOTKERNEL   = dot_thunderx2t99.c | |||
| CDOTKERNEL   = zdot_thunderx2t99.c | |||
| ZDOTKERNEL   = zdot_thunderx2t99.c | |||
| DSDOTKERNEL  = dot.S | |||
| # Euclidean norm | |||
| SNRM2KERNEL  = scnrm2_thunderx2t99.c | |||
| CNRM2KERNEL  = scnrm2_thunderx2t99.c | |||
| DNRM2KERNEL  = dznrm2_thunderx2t99.c | |||
| ZNRM2KERNEL  = dznrm2_thunderx2t99.c | |||
| # Alternative double-precision norm source, left disabled as in the original list: | |||
| #DNRM2KERNEL = dznrm2_thunderx2t99_fast.c | |||
| #ZNRM2KERNEL = dznrm2_thunderx2t99_fast.c | |||
| # --- GEMM compute kernels ---------------------------------------------------- | |||
| # Each kernel is assigned only when the configured MxN unroll matches the shape | |||
| # the ThunderX2 T99 assembly source was written for; no other value is set here. | |||
| ifeq (16x4, $(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)) | |||
| SGEMMKERNEL = sgemm_kernel_16x4_thunderx2t99.S | |||
| endif | |||
| ifeq (8x4, $(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)) | |||
| DGEMMKERNEL = dgemm_kernel_8x4_thunderx2t99.S | |||
| endif | |||
| ifeq (8x4, $(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)) | |||
| CGEMMKERNEL = cgemm_kernel_8x4_thunderx2t99.S | |||
| endif | |||
| ifeq (4x4, $(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)) | |||
| ZGEMMKERNEL = zgemm_kernel_4x4_thunderx2t99.S | |||
| endif | |||
| @@ -1,3 +1,44 @@ | |||
| # Big-endian 32bit (AIX) is supported through the POWER6 GEMM kernels, no separate TRMM | |||
| ifeq ($(__BYTE_ORDER__)$(BINARY32),__ORDER_BIG_ENDIAN__1) | |||
| SGEMMKERNEL = gemm_kernel_power6.S | |||
| SGEMMINCOPY = | |||
| SGEMMITCOPY = | |||
| SGEMMONCOPY = gemm_ncopy_4.S | |||
| SGEMMOTCOPY = gemm_tcopy_4.S | |||
| SGEMMINCOPYOBJ = | |||
| SGEMMITCOPYOBJ = | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMKERNEL = gemm_kernel_power6.S | |||
| DGEMMINCOPY = | |||
| DGEMMITCOPY = | |||
| DGEMMONCOPY = gemm_ncopy_4.S | |||
| DGEMMOTCOPY = gemm_tcopy_4.S | |||
| DGEMMINCOPYOBJ = | |||
| DGEMMITCOPYOBJ = | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMKERNEL = zgemm_kernel_power6.S | |||
| CGEMMINCOPY = ../generic/zgemm_ncopy_2.c | |||
| CGEMMITCOPY = ../generic/zgemm_tcopy_2.c | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_4.c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c | |||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMKERNEL = zgemm_kernel_power6.S | |||
| ZGEMMINCOPY = ../generic/zgemm_ncopy_2.c | |||
| ZGEMMITCOPY = ../generic/zgemm_tcopy_2.c | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c | |||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c | |||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| else | |||
| #SGEMM_BETA = ../generic/gemm_beta.c | |||
| #DGEMM_BETA = ../generic/gemm_beta.c | |||
| #CGEMM_BETA = ../generic/zgemm_beta.c | |||
| @@ -47,16 +88,24 @@ ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| # DTRSM kernels: the POWER6 assembly set is used when __BYTE_ORDER__ is | |||
| # __ORDER_BIG_ENDIAN__ and BINARY32 is 1; every other build keeps the generic C | |||
| # kernels plus the POWER8 16x4 LT kernel. | |||
| ifneq ($(__BYTE_ORDER__)$(BINARY32),__ORDER_BIG_ENDIAN__1) | |||
| DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S | |||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| else | |||
| DTRSMKERNEL_LN = trsm_kernel_power6_LN.S | |||
| DTRSMKERNEL_LT = trsm_kernel_power6_LT.S | |||
| # NOTE(review): RN points at the LT source, exactly as before - presumably intentional; confirm. | |||
| DTRSMKERNEL_RN = trsm_kernel_power6_LT.S | |||
| DTRSMKERNEL_RT = trsm_kernel_power6_RT.S | |||
| endif | |||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| @@ -153,6 +202,10 @@ ZASUMKERNEL = zasum.c | |||
| # | |||
| SAXPYKERNEL = saxpy.c | |||
| DAXPYKERNEL = daxpy.c | |||
| # | |||
| ifeq ($(__BYTE_ORDER__)$(BINARY32),__ORDER_BIG_ENDIAN__1) | |||
| CAXPYKERNEL = zaxpy.S | |||
| else | |||
| ifneq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) | |||
| ifneq ($(GCCVERSIONGTEQ9),1) | |||
| CAXPYKERNEL = caxpy_power8.S | |||
| @@ -162,6 +215,7 @@ endif | |||
| else | |||
| CAXPYKERNEL = caxpy.c | |||
| endif | |||
| endif | |||
| # | |||
| ZAXPYKERNEL = zaxpy.c | |||
| # | |||
| @@ -239,4 +293,3 @@ IDAMINKERNEL = ../arm/iamin.c | |||
| IZAMAXKERNEL = ../arm/izamax.c | |||
| IZAMINKERNEL = ../arm/izamin.c | |||
| endif | |||
| @@ -47,8 +47,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #include "casum_microk_power8.c" | |||
| #endif | |||
| #endif | |||
| #ifndef HAVE_KERNEL_16 | |||
| @@ -36,8 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #include "ccopy_microk_power8.c" | |||
| #endif | |||
| #endif | |||
| #ifndef HAVE_KERNEL_32 | |||
| @@ -23,6 +23,9 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /* Use the generic C kernel only when neither AltiVec macro is defined. This is | |||
|    the exact negation of the defined(__VEC__) || defined(__ALTIVEC__) test used | |||
|    by the other POWER kernels, so a compiler defining just one of the two | |||
|    macros still gets the vector code below. */ | |||
| #if !defined(__VEC__) && !defined(__ALTIVEC__) | |||
| #include "../arm/zdot.c" | |||
| #else | |||
| #include "common.h" | |||
| #ifndef HAVE_KERNEL_8 | |||
| @@ -168,3 +171,4 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA | |||
| return (result); | |||
| } | |||
| #endif | |||
| @@ -23,7 +23,10 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| *****************************************************************************/ | |||
| /* Use the generic C kernel only when neither AltiVec macro is defined. This is | |||
|    the exact negation of the defined(__VEC__) || defined(__ALTIVEC__) test used | |||
|    by the other POWER kernels, so a compiler defining just one of the two | |||
|    macros still gets the vector code below. */ | |||
| #if !defined(__VEC__) && !defined(__ALTIVEC__) | |||
| #include "../arm/zgemv_n.c" | |||
| #else | |||
| #include <stdlib.h> | |||
| #include <stdio.h> | |||
| @@ -591,4 +594,4 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, | |||
| return (0); | |||
| } | |||
| #endif | |||
| @@ -23,7 +23,10 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| *****************************************************************************/ | |||
| #if !defined(__VEC__) || !defined(__ALTIVEC__) | |||
| #include "../arm/zgemv_t.c" | |||
| #else | |||
| #include "common.h" | |||
| @@ -595,4 +598,4 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, | |||
| return (0); | |||
| } | |||
| #endif | |||
| @@ -28,6 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| static void crot_kernel_8 (long n, float *x, float *y, float c, float s) | |||
| { | |||
| @@ -169,6 +170,7 @@ static void crot_kernel_8 (long n, float *x, float *y, float c, float s) | |||
| } | |||
| #endif | |||
| #endif | |||
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) | |||
| @@ -183,7 +185,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT | |||
| if ( (inc_x == 1) && (inc_y == 1) ) | |||
| { | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| BLASLONG n1 = n & -8; | |||
| if ( n1 > 0 ) | |||
| { | |||
| @@ -191,7 +193,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT | |||
| i=n1; | |||
| ix=2*n1; | |||
| } | |||
| #endif | |||
| while(i < n) | |||
| { | |||
| temp[0] = c*x[ix] + s*y[ix] ; | |||
| @@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #include "cswap_microk_power8.c" | |||
| #endif | |||
| #endif | |||
| #ifndef HAVE_KERNEL_32 | |||
| @@ -47,8 +47,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #include "dasum_microk_power8.c" | |||
| #endif | |||
| #endif | |||
| #ifndef HAVE_KERNEL_16 | |||
| @@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #include "daxpy_microk_power8.c" | |||
| #endif | |||
| #endif | |||
| #ifndef HAVE_KERNEL_8 | |||
| @@ -36,8 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #include "dcopy_microk_power8.c" | |||
| #endif | |||
| #endif | |||
| #ifndef HAVE_KERNEL_32 | |||
| @@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #include "ddot_microk_power8.c" | |||
| #endif | |||
| #endif | |||
| #ifndef HAVE_KERNEL_8 | |||
| @@ -27,64 +27,64 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #include <altivec.h> | |||
| typedef unsigned char vec_t __attribute__ ((vector_size (16))); | |||
| typedef __vector unsigned char vec_t; | |||
| typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); | |||
| typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); | |||
| #ifdef TRMMKERNEL | |||
| #define SAVE_ACC(ACC, J) \ | |||
| __builtin_mma_disassemble_acc (result, ACC); \ | |||
| __builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||
| rowC = (v4sf_t *) &CO[0* ldc+J]; \ | |||
| rowC[0] = result[3] * alpha; \ | |||
| rowC[0] = result[0] * alpha; \ | |||
| rowC = (v4sf_t *) &CO[1*ldc+J]; \ | |||
| rowC[0] = result[2] * alpha; \ | |||
| rowC = (v4sf_t *) &CO[2*ldc+J]; \ | |||
| rowC[0] = result[1] * alpha; \ | |||
| rowC = (v4sf_t *) &CO[2*ldc+J]; \ | |||
| rowC[0] = result[2] * alpha; \ | |||
| rowC = (v4sf_t *) &CO[3*ldc+J]; \ | |||
| rowC[0] = result[0] * alpha; | |||
| rowC[0] = result[3] * alpha; | |||
| #define SAVE_ACC1(ACC, J) \ | |||
| __builtin_mma_disassemble_acc (result, ACC); \ | |||
| __builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||
| rowC = (v4sf_t *) &CO[4* ldc+J]; \ | |||
| rowC[0] = result[3] * alpha; \ | |||
| rowC[0] = result[0] * alpha; \ | |||
| rowC = (v4sf_t *) &CO[5*ldc+J]; \ | |||
| rowC[0] = result[2] * alpha; \ | |||
| rowC = (v4sf_t *) &CO[6*ldc+J]; \ | |||
| rowC[0] = result[1] * alpha; \ | |||
| rowC = (v4sf_t *) &CO[6*ldc+J]; \ | |||
| rowC[0] = result[2] * alpha; \ | |||
| rowC = (v4sf_t *) &CO[7*ldc+J]; \ | |||
| rowC[0] = result[0] * alpha; | |||
| rowC[0] = result[3] * alpha; | |||
| #define SAVE2x4_ACC(ACC, J) \ | |||
| __builtin_mma_disassemble_acc (result, ACC); \ | |||
| __builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||
| rowC = (v4sf_t *) &CO[0* ldc+J]; \ | |||
| rowC[0] = result[3] * alpha; \ | |||
| rowC[0] = result[0] * alpha; \ | |||
| rowC = (v4sf_t *) &CO[1* ldc+J]; \ | |||
| rowC[0] = result[2] * alpha; | |||
| rowC[0] = result[1] * alpha; | |||
| #else | |||
| #define SAVE_ACC(ACC, J) \ | |||
| __builtin_mma_disassemble_acc (result, ACC); \ | |||
| __builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||
| rowC = (v4sf_t *) &CO[0* ldc+J]; \ | |||
| rowC[0] += result[3] * alpha; \ | |||
| rowC[0] += result[0] * alpha; \ | |||
| rowC = (v4sf_t *) &CO[1*ldc+J]; \ | |||
| rowC[0] += result[2] * alpha; \ | |||
| rowC = (v4sf_t *) &CO[2*ldc+J]; \ | |||
| rowC[0] += result[1] * alpha; \ | |||
| rowC = (v4sf_t *) &CO[2*ldc+J]; \ | |||
| rowC[0] += result[2] * alpha; \ | |||
| rowC = (v4sf_t *) &CO[3*ldc+J]; \ | |||
| rowC[0] += result[0] * alpha; | |||
| rowC[0] += result[3] * alpha; | |||
| #define SAVE_ACC1(ACC, J) \ | |||
| __builtin_mma_disassemble_acc (result, ACC); \ | |||
| __builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||
| rowC = (v4sf_t *) &CO[4* ldc+J]; \ | |||
| rowC[0] += result[3] * alpha; \ | |||
| rowC[0] += result[0] * alpha; \ | |||
| rowC = (v4sf_t *) &CO[5*ldc+J]; \ | |||
| rowC[0] += result[2] * alpha; \ | |||
| rowC = (v4sf_t *) &CO[6*ldc+J]; \ | |||
| rowC[0] += result[1] * alpha; \ | |||
| rowC = (v4sf_t *) &CO[6*ldc+J]; \ | |||
| rowC[0] += result[2] * alpha; \ | |||
| rowC = (v4sf_t *) &CO[7*ldc+J]; \ | |||
| rowC[0] += result[0] * alpha; | |||
| rowC[0] += result[3] * alpha; | |||
| #define SAVE2x4_ACC(ACC, J) \ | |||
| __builtin_mma_disassemble_acc (result, ACC); \ | |||
| __builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||
| rowC = (v4sf_t *) &CO[0* ldc+J]; \ | |||
| rowC[0] += result[3] * alpha; \ | |||
| rowC[0] += result[0] * alpha; \ | |||
| rowC = (v4sf_t *) &CO[1* ldc+J]; \ | |||
| rowC[0] += result[2] * alpha; | |||
| rowC[0] += result[1] * alpha; | |||
| #endif | |||
| #define SET_ACC_ZERO4() \ | |||
| @@ -39,8 +39,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #include "dgemv_n_microk_power8.c" | |||
| #endif | |||
| #endif | |||
| #define NBMAX 4096 | |||
| @@ -25,15 +25,19 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #if !defined(__VEC__) || !defined(__ALTIVEC__) | |||
| #include "../arm/gemv_t.c" | |||
| #else | |||
| #include "common.h" | |||
| #define NBMAX 1024 | |||
| //#define PREFETCH 1 | |||
| #include <altivec.h> | |||
| #define HAVE_KERNEL4x8_ASM 1 | |||
| #if defined(HAVE_KERNEL4x8_ASM) | |||
| static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, double *y, double alpha) { | |||
| @@ -355,7 +359,7 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do | |||
| "stxvd2x 39, %[off], %[y] \n\t" | |||
| "stxvd2x 40, %[off2], %[y] \n\t" | |||
| : [memy] "+m" (*(const double (*)[8])y), | |||
| : [memy] "+m" (*(double (*)[8])y), | |||
| [n] "+&r" (n), | |||
| [a0] "=b" (a0), | |||
| [a1] "=&b" (a1), | |||
| @@ -369,7 +373,7 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do | |||
| [off2]"=&b" (off2), | |||
| [temp] "=&b" (tempR) | |||
| : [memx] "m" (*(const double (*)[n])x), | |||
| [mem_ap] "m" (*(const double (*)[]) ap), | |||
| [mem_ap] "m" (*(const double (*)[n*8]) ap), | |||
| [alpha] "d" (alpha), | |||
| "[a0]" (ap), | |||
| [x] "b" (x), | |||
| @@ -883,4 +887,5 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO | |||
| return (0); | |||
| } | |||
| #endif | |||
| @@ -40,8 +40,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #pragma GCC optimize "O1" | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #include "drot_microk_power8.c" | |||
| #endif | |||
| #endif | |||
| #ifndef HAVE_KERNEL_16 | |||
| @@ -36,8 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #include "dscal_microk_power8.c" | |||
| #endif | |||
| #endif | |||
| #if !defined(HAVE_KERNEL_8) | |||
| @@ -36,8 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #include "dswap_microk_power8.c" | |||
| #endif | |||
| #endif | |||
| #ifndef HAVE_KERNEL_32 | |||
| @@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <math.h> | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #include <altivec.h> | |||
| #endif | |||
| #if defined(DOUBLE) | |||
| #define ABS fabs | |||
| @@ -37,6 +40,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| /** | |||
| * Find maximum index | |||
| * Warning: requirements n>0 and n % 32 == 0 | |||
| @@ -313,6 +318,7 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { | |||
| return index; | |||
| } | |||
| #endif | |||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||
| BLASLONG i = 0; | |||
| @@ -326,12 +332,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||
| BLASLONG n1 = n & -32; | |||
| #if defined(_CALL_ELF) && (_CALL_ELF == 2) | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| if (n1 > 0) { | |||
| max = diamax_kernel_32(n1, x, &maxf); | |||
| i = n1; | |||
| } | |||
| #endif | |||
| #endif | |||
| while (i < n) { | |||
| if (ABS(x[i]) > maxf) { | |||
| @@ -37,6 +37,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| /** | |||
| * Find minimum index | |||
| * Warning: requirements n>0 and n % 32 == 0 | |||
| @@ -313,7 +315,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { | |||
| return index; | |||
| } | |||
| #endif | |||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||
| @@ -327,12 +329,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||
| if (inc_x == 1) { | |||
| #if defined(_CALL_ELF) && (_CALL_ELF == 2) | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| BLASLONG n1 = n & -32; | |||
| if (n1 > 0) { | |||
| min = diamin_kernel_32(n1, x, &minf); | |||
| i = n1; | |||
| } | |||
| #endif | |||
| #endif | |||
| while (i < n) { | |||
| if (ABS(x[i]) < minf) { | |||
| @@ -34,6 +34,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| /** | |||
| * Find maximum index | |||
| @@ -299,7 +300,7 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) { | |||
| } | |||
| #endif | |||
| @@ -317,6 +318,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| if (inc_x == 1) { | |||
| #if defined(_CALL_ELF) && (_CALL_ELF == 2) | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| BLASLONG n1 = n & -16; | |||
| if (n1 > 0) { | |||
| @@ -324,6 +327,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| i = n1; | |||
| ix = n1 << 1; | |||
| } | |||
| #endif | |||
| #endif | |||
| while(i < n) | |||
| @@ -24,7 +24,6 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <math.h> | |||
| @@ -32,6 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define ABS fabs | |||
| #define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| /** | |||
| * Find minimum index | |||
| @@ -296,6 +296,7 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { | |||
| return index; | |||
| } | |||
| #endif | |||
| @@ -316,6 +317,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| minf = CABS1(x,0); //index will not be incremented | |||
| #if defined(_CALL_ELF) && (_CALL_ELF == 2) | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| BLASLONG n1 = n & -16; | |||
| if (n1 > 0) { | |||
| @@ -323,6 +326,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| i = n1; | |||
| ix = n1 << 1; | |||
| } | |||
| #endif | |||
| #endif | |||
| while(i < n) | |||
| @@ -359,5 +363,3 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| } | |||
| } | |||
| @@ -47,8 +47,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #include "sasum_microk_power8.c" | |||
| #endif | |||
| #endif | |||
| #ifndef HAVE_KERNEL_32 | |||
| @@ -28,8 +28,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #define offset_0 0 | |||
| #define offset_1 16 | |||
| #define offset_2 32 | |||
| #define offset_3 48 | |||
| #define offset_4 64 | |||
| #define offset_5 80 | |||
| #define offset_6 96 | |||
| #define offset_7 112 | |||
| #define offset_8 128 | |||
| #define offset_9 144 | |||
| #define offset_10 160 | |||
| #define offset_11 176 | |||
| #define offset_12 192 | |||
| #define offset_13 208 | |||
| #define offset_14 224 | |||
| #define offset_15 240 | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #ifndef HAVE_KERNEL_8 | |||
| #include <altivec.h> | |||
| @@ -37,12 +54,85 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha) | |||
| { | |||
| BLASLONG i = 0; | |||
| __vector float v_a = {alpha,alpha,alpha,alpha}; | |||
| __vector float * v_y=(__vector float *)y; | |||
| __vector float * v_x=(__vector float *)x; | |||
| __vector float v_a __attribute((aligned(16))) = {alpha,alpha,alpha,alpha}; | |||
| __vector float * vptr_y =(__vector float *)y; | |||
| __vector float * vptr_x =(__vector float *)x; | |||
| for(; i<n/4; i+=16){ | |||
| register __vector float vy_0 = vec_vsx_ld( offset_0 ,vptr_y ) ; | |||
| register __vector float vy_1 = vec_vsx_ld( offset_1 ,vptr_y ) ; | |||
| register __vector float vy_2 = vec_vsx_ld( offset_2 ,vptr_y ) ; | |||
| register __vector float vy_3 = vec_vsx_ld( offset_3 ,vptr_y ) ; | |||
| register __vector float vy_4 = vec_vsx_ld( offset_4 ,vptr_y ) ; | |||
| register __vector float vy_5 = vec_vsx_ld( offset_5 ,vptr_y ) ; | |||
| register __vector float vy_6 = vec_vsx_ld( offset_6 ,vptr_y ) ; | |||
| register __vector float vy_7 = vec_vsx_ld( offset_7 ,vptr_y ) ; | |||
| register __vector float vy_8 = vec_vsx_ld( offset_8 ,vptr_y ) ; | |||
| register __vector float vy_9 = vec_vsx_ld( offset_9 ,vptr_y ) ; | |||
| register __vector float vy_10 = vec_vsx_ld( offset_10 ,vptr_y ) ; | |||
| register __vector float vy_11 = vec_vsx_ld( offset_11 ,vptr_y ) ; | |||
| register __vector float vy_12 = vec_vsx_ld( offset_12 ,vptr_y ) ; | |||
| register __vector float vy_13 = vec_vsx_ld( offset_13 ,vptr_y ) ; | |||
| register __vector float vy_14 = vec_vsx_ld( offset_14 ,vptr_y ) ; | |||
| register __vector float vy_15 = vec_vsx_ld( offset_15 ,vptr_y ) ; | |||
| register __vector float vx_0 = vec_vsx_ld( offset_0 ,vptr_x ) ; | |||
| register __vector float vx_1 = vec_vsx_ld( offset_1 ,vptr_x ) ; | |||
| register __vector float vx_2 = vec_vsx_ld( offset_2 ,vptr_x ) ; | |||
| register __vector float vx_3 = vec_vsx_ld( offset_3 ,vptr_x ) ; | |||
| register __vector float vx_4 = vec_vsx_ld( offset_4 ,vptr_x ) ; | |||
| register __vector float vx_5 = vec_vsx_ld( offset_5 ,vptr_x ) ; | |||
| register __vector float vx_6 = vec_vsx_ld( offset_6 ,vptr_x ) ; | |||
| register __vector float vx_7 = vec_vsx_ld( offset_7 ,vptr_x ) ; | |||
| register __vector float vx_8 = vec_vsx_ld( offset_8 ,vptr_x ) ; | |||
| register __vector float vx_9 = vec_vsx_ld( offset_9 ,vptr_x ) ; | |||
| register __vector float vx_10 = vec_vsx_ld( offset_10 ,vptr_x ) ; | |||
| register __vector float vx_11 = vec_vsx_ld( offset_11 ,vptr_x ) ; | |||
| register __vector float vx_12 = vec_vsx_ld( offset_12 ,vptr_x ) ; | |||
| register __vector float vx_13 = vec_vsx_ld( offset_13 ,vptr_x ) ; | |||
| register __vector float vx_14 = vec_vsx_ld( offset_14 ,vptr_x ) ; | |||
| register __vector float vx_15 = vec_vsx_ld( offset_15 ,vptr_x ) ; | |||
| vy_0 += vx_0*v_a; | |||
| vy_1 += vx_1*v_a; | |||
| vy_2 += vx_2*v_a; | |||
| vy_3 += vx_3*v_a; | |||
| vy_4 += vx_4*v_a; | |||
| vy_5 += vx_5*v_a; | |||
| vy_6 += vx_6*v_a; | |||
| vy_7 += vx_7*v_a; | |||
| vy_8 += vx_8*v_a; | |||
| vy_9 += vx_9*v_a; | |||
| vy_10 += vx_10*v_a; | |||
| vy_11 += vx_11*v_a; | |||
| vy_12 += vx_12*v_a; | |||
| vy_13 += vx_13*v_a; | |||
| vy_14 += vx_14*v_a; | |||
| vy_15 += vx_15*v_a; | |||
| vec_vsx_st( vy_0, offset_0 ,vptr_y ) ; | |||
| vec_vsx_st( vy_1, offset_1 ,vptr_y ) ; | |||
| vec_vsx_st( vy_2, offset_2 ,vptr_y ) ; | |||
| vec_vsx_st( vy_3, offset_3 ,vptr_y ) ; | |||
| vec_vsx_st( vy_4, offset_4 ,vptr_y ) ; | |||
| vec_vsx_st( vy_5, offset_5 ,vptr_y ) ; | |||
| vec_vsx_st( vy_6, offset_6 ,vptr_y ) ; | |||
| vec_vsx_st( vy_7, offset_7 ,vptr_y ) ; | |||
| vec_vsx_st( vy_8, offset_8 ,vptr_y ) ; | |||
| vec_vsx_st( vy_9, offset_9 ,vptr_y ) ; | |||
| vec_vsx_st( vy_10, offset_10 ,vptr_y ) ; | |||
| vec_vsx_st( vy_11, offset_11 ,vptr_y ) ; | |||
| vec_vsx_st( vy_12, offset_12 ,vptr_y ) ; | |||
| vec_vsx_st( vy_13, offset_13 ,vptr_y ) ; | |||
| vec_vsx_st( vy_14, offset_14 ,vptr_y ) ; | |||
| vec_vsx_st( vy_15, offset_15 ,vptr_y ) ; | |||
| vptr_x+=16; | |||
| vptr_y+=16; | |||
| /* | |||
| v_y[i] += v_a * v_x[i]; | |||
| v_y[i+1] += v_a * v_x[i+1]; | |||
| v_y[i+2] += v_a * v_x[i+2]; | |||
| @@ -59,9 +149,11 @@ static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha) | |||
| v_y[i+13] += v_a * v_x[i+13]; | |||
| v_y[i+14] += v_a * v_x[i+14]; | |||
| v_y[i+15] += v_a * v_x[i+15]; | |||
| */ | |||
| } | |||
| } | |||
| #endif | |||
| #endif | |||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||
| { | |||
| @@ -74,11 +166,13 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| { | |||
| BLASLONG n1 = n & -64; | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| if ( n1 ) | |||
| saxpy_kernel_64(n1, x, y, da); | |||
| i = n1; | |||
| #endif | |||
| while(i < n) | |||
| { | |||
| @@ -36,8 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #include "scopy_microk_power8.c" | |||
| #endif | |||
| #endif | |||
| #ifndef HAVE_KERNEL_32 | |||
| @@ -36,8 +36,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #include "sdot_microk_power8.c" | |||
| #endif | |||
| #endif | |||
| #ifndef HAVE_KERNEL_16 | |||
| @@ -27,103 +27,103 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #include <altivec.h> | |||
| typedef unsigned char vec_t __attribute__ ((vector_size (16))); | |||
| typedef __vector unsigned char vec_t; | |||
| typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); | |||
| typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); | |||
| #if defined(TRMMKERNEL) | |||
| #define SAVE_ACC(ACC, J) \ | |||
| __builtin_mma_disassemble_acc (result, ACC); \ | |||
| __builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||
| rowC = (v4sf_t *) &CO[0* ldc+J]; \ | |||
| rowC[0] = result[3] * alpha; \ | |||
| rowC[0] = result[0] * alpha; \ | |||
| rowC = (v4sf_t *) &CO[1*ldc+J]; \ | |||
| rowC[0] = result[2] * alpha; \ | |||
| rowC = (v4sf_t *) &CO[2*ldc+J]; \ | |||
| rowC[0] = result[1] * alpha; \ | |||
| rowC = (v4sf_t *) &CO[2*ldc+J]; \ | |||
| rowC[0] = result[2] * alpha; \ | |||
| rowC = (v4sf_t *) &CO[3*ldc+J]; \ | |||
| rowC[0] = result[0] * alpha; | |||
| rowC[0] = result[3] * alpha; | |||
| #define SAVE_ACC1(ACC, J) \ | |||
| __builtin_mma_disassemble_acc (result, ACC); \ | |||
| __builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||
| rowC = (v4sf_t *) &CO[4* ldc+J]; \ | |||
| rowC[0] = result[3] * alpha; \ | |||
| rowC[0] = result[0] * alpha; \ | |||
| rowC = (v4sf_t *) &CO[5*ldc+J]; \ | |||
| rowC[0] = result[2] * alpha; \ | |||
| rowC = (v4sf_t *) &CO[6*ldc+J]; \ | |||
| rowC[0] = result[1] * alpha; \ | |||
| rowC = (v4sf_t *) &CO[6*ldc+J]; \ | |||
| rowC[0] = result[2] * alpha; \ | |||
| rowC = (v4sf_t *) &CO[7*ldc+J]; \ | |||
| rowC[0] = result[0] * alpha; | |||
| rowC[0] = result[3] * alpha; | |||
| #define SAVE4x2_ACC(ACC, J) \ | |||
| __builtin_mma_disassemble_acc (result, ACC); \ | |||
| __builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||
| rowC = (v2sf_t *) &CO[0* ldc+J]; \ | |||
| rowC[0] = result[6] * alpha; \ | |||
| rowC[0] = result[0] * alpha; \ | |||
| rowC = (v2sf_t *) &CO[1* ldc+J]; \ | |||
| rowC[0] = result[4] * alpha; \ | |||
| rowC = (v2sf_t *) &CO[2* ldc+J]; \ | |||
| rowC[0] = result[2] * alpha; \ | |||
| rowC = (v2sf_t *) &CO[2* ldc+J]; \ | |||
| rowC[0] = result[4] * alpha; \ | |||
| rowC = (v2sf_t *) &CO[3* ldc+J]; \ | |||
| rowC[0] = result[0] * alpha; | |||
| rowC[0] = result[6] * alpha; | |||
| #define SAVE4x2_ACC1(ACC, J) \ | |||
| __builtin_mma_disassemble_acc (result, ACC); \ | |||
| __builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||
| rowC = (v2sf_t *) &CO[4* ldc+J]; \ | |||
| rowC[0] = result[6] * alpha; \ | |||
| rowC[0] = result[0] * alpha; \ | |||
| rowC = (v2sf_t *) &CO[5* ldc+J]; \ | |||
| rowC[0] = result[4] * alpha; \ | |||
| rowC = (v2sf_t *) &CO[6* ldc+J]; \ | |||
| rowC[0] = result[2] * alpha; \ | |||
| rowC = (v2sf_t *) &CO[6* ldc+J]; \ | |||
| rowC[0] = result[4] * alpha; \ | |||
| rowC = (v2sf_t *) &CO[7* ldc+J]; \ | |||
| rowC[0] = result[0] * alpha; | |||
| rowC[0] = result[6] * alpha; | |||
| #define SAVE2x4_ACC(ACC, J) \ | |||
| __builtin_mma_disassemble_acc (result, ACC); \ | |||
| __builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||
| rowC = (v4sf_t *) &CO[0* ldc+J]; \ | |||
| rowC[0] = result[3] * alpha; \ | |||
| rowC[0] = result[0] * alpha; \ | |||
| rowC = (v4sf_t *) &CO[1* ldc+J]; \ | |||
| rowC[0] = result[2] * alpha; | |||
| rowC[0] = result[1] * alpha; | |||
| #else | |||
| #define SAVE_ACC(ACC, J) \ | |||
| __builtin_mma_disassemble_acc (result, ACC); \ | |||
| __builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||
| rowC = (v4sf_t *) &CO[0* ldc+J]; \ | |||
| rowC[0] += result[3] * alpha; \ | |||
| rowC[0] += result[0] * alpha; \ | |||
| rowC = (v4sf_t *) &CO[1*ldc+J]; \ | |||
| rowC[0] += result[2] * alpha; \ | |||
| rowC = (v4sf_t *) &CO[2*ldc+J]; \ | |||
| rowC[0] += result[1] * alpha; \ | |||
| rowC = (v4sf_t *) &CO[2*ldc+J]; \ | |||
| rowC[0] += result[2] * alpha; \ | |||
| rowC = (v4sf_t *) &CO[3*ldc+J]; \ | |||
| rowC[0] += result[0] * alpha; | |||
| rowC[0] += result[3] * alpha; | |||
| #define SAVE_ACC1(ACC, J) \ | |||
| __builtin_mma_disassemble_acc (result, ACC); \ | |||
| __builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||
| rowC = (v4sf_t *) &CO[4* ldc+J]; \ | |||
| rowC[0] += result[3] * alpha; \ | |||
| rowC[0] += result[0] * alpha; \ | |||
| rowC = (v4sf_t *) &CO[5*ldc+J]; \ | |||
| rowC[0] += result[2] * alpha; \ | |||
| rowC = (v4sf_t *) &CO[6*ldc+J]; \ | |||
| rowC[0] += result[1] * alpha; \ | |||
| rowC = (v4sf_t *) &CO[6*ldc+J]; \ | |||
| rowC[0] += result[2] * alpha; \ | |||
| rowC = (v4sf_t *) &CO[7*ldc+J]; \ | |||
| rowC[0] += result[0] * alpha; | |||
| rowC[0] += result[3] * alpha; | |||
| #define SAVE4x2_ACC(ACC, J) \ | |||
| __builtin_mma_disassemble_acc (result, ACC); \ | |||
| __builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||
| rowC = (v2sf_t *) &CO[0* ldc+J]; \ | |||
| rowC[0] += result[6] * alpha; \ | |||
| rowC[0] += result[0] * alpha; \ | |||
| rowC = (v2sf_t *) &CO[1* ldc+J]; \ | |||
| rowC[0] += result[4] * alpha; \ | |||
| rowC = (v2sf_t *) &CO[2* ldc+J]; \ | |||
| rowC[0] += result[2] * alpha; \ | |||
| rowC = (v2sf_t *) &CO[2* ldc+J]; \ | |||
| rowC[0] += result[4] * alpha; \ | |||
| rowC = (v2sf_t *) &CO[3* ldc+J]; \ | |||
| rowC[0] += result[0] * alpha; | |||
| rowC[0] += result[6] * alpha; | |||
| #define SAVE4x2_ACC1(ACC, J) \ | |||
| __builtin_mma_disassemble_acc (result, ACC); \ | |||
| __builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||
| rowC = (v2sf_t *) &CO[4* ldc+J]; \ | |||
| rowC[0] += result[6] * alpha; \ | |||
| rowC[0] += result[0] * alpha; \ | |||
| rowC = (v2sf_t *) &CO[5* ldc+J]; \ | |||
| rowC[0] += result[4] * alpha; \ | |||
| rowC = (v2sf_t *) &CO[6* ldc+J]; \ | |||
| rowC[0] += result[2] * alpha; \ | |||
| rowC = (v2sf_t *) &CO[6* ldc+J]; \ | |||
| rowC[0] += result[4] * alpha; \ | |||
| rowC = (v2sf_t *) &CO[7* ldc+J]; \ | |||
| rowC[0] += result[0] * alpha; | |||
| rowC[0] += result[6] * alpha; | |||
| #define SAVE2x4_ACC(ACC, J) \ | |||
| __builtin_mma_disassemble_acc (result, ACC); \ | |||
| __builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||
| rowC = (v4sf_t *) &CO[0* ldc+J]; \ | |||
| rowC[0] += result[3] * alpha; \ | |||
| rowC[0] += result[0] * alpha; \ | |||
| rowC = (v4sf_t *) &CO[1* ldc+J]; \ | |||
| rowC[0] += result[2] * alpha; | |||
| rowC[0] += result[1] * alpha; | |||
| #endif | |||
| #define KERNEL(i, j) \ | |||
| __builtin_mma_xvf32gerpp (&acc0, rowB[i], rowA[j]); \ | |||
| @@ -24,7 +24,10 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #if !defined(__VEC__) || !defined(__ALTIVEC__) | |||
| #include "../arm/gemv_n.c" | |||
| #else | |||
| #include "common.h" | |||
| @@ -463,4 +466,5 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO | |||
| return(0); | |||
| } | |||
| #endif | |||
| @@ -24,6 +24,10 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #if !defined(__VEC__) || !defined(__ALTIVEC__) | |||
| #include "../arm/gemv_t.c" | |||
| #else | |||
| #include "common.h" | |||
| @@ -477,3 +481,4 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO | |||
| } | |||
| #endif | |||
| @@ -45,7 +45,7 @@ bfloat16tof32 (bfloat16 f16) | |||
| #define BF16TOF32(x) x | |||
| #endif | |||
| typedef unsigned char vec_t __attribute__ ((vector_size (16))); | |||
| typedef __vector unsigned char vec_t; | |||
| typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); | |||
| typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); | |||
| @@ -64,54 +64,54 @@ vector char mask = | |||
| #define MERGE_LOW(x, y) (vec_t) vec_mergel ((vector short)x, (vector short)y) | |||
| #define SAVE_ACC(ACC, J) \ | |||
| __builtin_mma_disassemble_acc (result, ACC); \ | |||
| __builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||
| rowC = (v4sf_t *) &CO[0* ldc+J]; \ | |||
| rowC[0] += result[3] * alpha; \ | |||
| rowC[0] += result[0] * alpha; \ | |||
| rowC = (v4sf_t *) &CO[1*ldc+J]; \ | |||
| rowC[0] += result[2] * alpha; \ | |||
| rowC = (v4sf_t *) &CO[2*ldc+J]; \ | |||
| rowC[0] += result[1] * alpha; \ | |||
| rowC = (v4sf_t *) &CO[2*ldc+J]; \ | |||
| rowC[0] += result[2] * alpha; \ | |||
| rowC = (v4sf_t *) &CO[3*ldc+J]; \ | |||
| rowC[0] += result[0] * alpha; | |||
| rowC[0] += result[3] * alpha; | |||
| #define SAVE_ACC1(ACC, J) \ | |||
| __builtin_mma_disassemble_acc (result, ACC); \ | |||
| __builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||
| rowC = (v4sf_t *) &CO[4* ldc+J]; \ | |||
| rowC[0] += result[3] * alpha; \ | |||
| rowC[0] += result[0] * alpha; \ | |||
| rowC = (v4sf_t *) &CO[5*ldc+J]; \ | |||
| rowC[0] += result[2] * alpha; \ | |||
| rowC = (v4sf_t *) &CO[6*ldc+J]; \ | |||
| rowC[0] += result[1] * alpha; \ | |||
| rowC = (v4sf_t *) &CO[6*ldc+J]; \ | |||
| rowC[0] += result[2] * alpha; \ | |||
| rowC = (v4sf_t *) &CO[7*ldc+J]; \ | |||
| rowC[0] += result[0] * alpha; | |||
| rowC[0] += result[3] * alpha; | |||
| #define SAVE4x2_ACC(ACC, J) \ | |||
| __builtin_mma_disassemble_acc (result, ACC); \ | |||
| __builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||
| rowC = (v2sf_t *) &CO[0* ldc+J]; \ | |||
| rowC[0] += result[6] * alpha; \ | |||
| rowC[0] += result[0] * alpha; \ | |||
| rowC = (v2sf_t *) &CO[1* ldc+J]; \ | |||
| rowC[0] += result[4] * alpha; \ | |||
| rowC = (v2sf_t *) &CO[2* ldc+J]; \ | |||
| rowC[0] += result[2] * alpha; \ | |||
| rowC = (v2sf_t *) &CO[2* ldc+J]; \ | |||
| rowC[0] += result[4] * alpha; \ | |||
| rowC = (v2sf_t *) &CO[3* ldc+J]; \ | |||
| rowC[0] += result[0] * alpha; | |||
| rowC[0] += result[6] * alpha; | |||
| #define SAVE4x2_ACC1(ACC, J) \ | |||
| __builtin_mma_disassemble_acc (result, ACC); \ | |||
| __builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||
| rowC = (v2sf_t *) &CO[4* ldc+J]; \ | |||
| rowC[0] += result[6] * alpha; \ | |||
| rowC[0] += result[0] * alpha; \ | |||
| rowC = (v2sf_t *) &CO[5* ldc+J]; \ | |||
| rowC[0] += result[4] * alpha; \ | |||
| rowC = (v2sf_t *) &CO[6* ldc+J]; \ | |||
| rowC[0] += result[2] * alpha; \ | |||
| rowC = (v2sf_t *) &CO[6* ldc+J]; \ | |||
| rowC[0] += result[4] * alpha; \ | |||
| rowC = (v2sf_t *) &CO[7* ldc+J]; \ | |||
| rowC[0] += result[0] * alpha; | |||
| rowC[0] += result[6] * alpha; | |||
| #define MMA __builtin_mma_xvbf16ger2pp | |||
| #define SAVE2x4_ACC(ACC, J) \ | |||
| __builtin_mma_disassemble_acc (result, ACC); \ | |||
| __builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||
| rowC = (v4sf_t *) &CO[0* ldc+J]; \ | |||
| rowC[0] += result[3] * alpha; \ | |||
| rowC[0] += result[0] * alpha; \ | |||
| rowC = (v4sf_t *) &CO[1* ldc+J]; \ | |||
| rowC[0] += result[2] * alpha; | |||
| rowC[0] += result[1] * alpha; | |||
| #define SET_ACC_ZERO4() \ | |||
| __builtin_mma_xxsetaccz (&acc0); \ | |||
| @@ -40,8 +40,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #pragma GCC optimize "O1" | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #include "srot_microk_power8.c" | |||
| #endif | |||
| #endif | |||
| #ifndef HAVE_KERNEL_16 | |||
| @@ -36,8 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #include "sscal_microk_power8.c" | |||
| #endif | |||
| #endif | |||
| #if !defined(HAVE_KERNEL_16) | |||
| @@ -36,8 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #include "sswap_microk_power8.c" | |||
| #endif | |||
| #endif | |||
| #ifndef HAVE_KERNEL_32 | |||
| @@ -47,8 +47,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #include "zasum_microk_power8.c" | |||
| #endif | |||
| #endif | |||
| #ifndef HAVE_KERNEL_8 | |||
| @@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #include "zaxpy_microk_power8.c" | |||
| #endif | |||
| #endif | |||
| #ifndef HAVE_KERNEL_4 | |||
| @@ -36,8 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #include "zcopy_microk_power8.c" | |||
| #endif | |||
| #endif | |||
| #ifndef HAVE_KERNEL_16 | |||
| @@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #include "zdot_microk_power8.c" | |||
| #endif | |||
| #endif | |||
| #ifndef HAVE_KERNEL_8 | |||
| @@ -93,9 +95,11 @@ FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG in | |||
| FLOAT dot[4] = { 0.0, 0.0, 0.0 , 0.0 } ; | |||
| if ( n <= 0 ) | |||
| { | |||
| { /* | |||
| __real__ result = 0.0 ; | |||
| __imag__ result = 0.0 ; | |||
| */ | |||
| result = OPENBLAS_MAKE_COMPLEX_FLOAT(0.0,0.0); | |||
| return(result); | |||
| } | |||
| @@ -149,11 +153,17 @@ FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG in | |||
| } | |||
| #if !defined(CONJ) | |||
| /* | |||
| __real__ result = dot[0] - dot[1]; | |||
| __imag__ result = dot[2] + dot[3]; | |||
| */ | |||
| result = OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0]-dot[1],dot[2]+dot[3]); | |||
| #else | |||
| /* | |||
| __real__ result = dot[0] + dot[1]; | |||
| __imag__ result = dot[2] - dot[3]; | |||
| */ | |||
| result = OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0]+dot[1],dot[2]-dot[3]); | |||
| #endif | |||
| @@ -29,6 +29,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #define HAVE_KERNEL_4x4_VEC 1 | |||
| #define HAVE_KERNEL_4x2_VEC 1 | |||
| #define HAVE_KERNEL_4x1_VEC 1 | |||
| @@ -37,6 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC) | |||
| #include <altivec.h> | |||
| #endif | |||
| #endif | |||
| // | |||
| #define NBMAX 4096 | |||
| @@ -28,10 +28,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #define NBMAX 4096 | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #define HAVE_KERNEL_4x4_VEC 1 | |||
| #define HAVE_KERNEL_4x2_VEC 1 | |||
| #define HAVE_KERNEL_4x1_VEC 1 | |||
| #endif | |||
| #if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC) | |||
| #include <altivec.h> | |||
| #endif | |||
| @@ -24,6 +24,9 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #if !defined(__VEC__) || !defined(__ALTIVEC__) | |||
| #include "../arm/zrot.c" | |||
| #else | |||
| #include "common.h" | |||
| @@ -262,4 +265,4 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT | |||
| } | |||
| #endif | |||
| @@ -39,10 +39,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #pragma GCC optimize "O1" | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #if defined(DOUBLE) | |||
| #include "zscal_microk_power8.c" | |||
| #endif | |||
| #endif | |||
| #endif | |||
| #ifndef HAVE_KERNEL_8 | |||
| @@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #include "zswap_microk_power8.c" | |||
| #endif | |||
| #endif | |||
| #ifndef HAVE_KERNEL_16 | |||
| @@ -1,667 +0,0 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #ifndef USE_SIMPLE_THREADED_LEVEL3 | |||
| //The array of job_t may overflow the stack. | |||
| //Instead, use malloc to alloc job_t. | |||
| #if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD | |||
| #define USE_ALLOC_HEAP | |||
| #endif | |||
| static FLOAT dm1 = -1.; | |||
| #ifndef KERNEL_FUNC | |||
| #ifndef LOWER | |||
| #define KERNEL_FUNC SYRK_KERNEL_U | |||
| #else | |||
| #define KERNEL_FUNC SYRK_KERNEL_L | |||
| #endif | |||
| #endif | |||
| #ifndef LOWER | |||
| #ifndef COMPLEX | |||
| #define TRSM_KERNEL TRSM_KERNEL_LT | |||
| #else | |||
| #define TRSM_KERNEL TRSM_KERNEL_LC | |||
| #endif | |||
| #else | |||
| #ifndef COMPLEX | |||
| #define TRSM_KERNEL TRSM_KERNEL_RN | |||
| #else | |||
| #define TRSM_KERNEL TRSM_KERNEL_RR | |||
| #endif | |||
| #endif | |||
| #ifndef CACHE_LINE_SIZE | |||
| #define CACHE_LINE_SIZE 8 | |||
| #endif | |||
| #ifndef DIVIDE_RATE | |||
| #define DIVIDE_RATE 2 | |||
| #endif | |||
| #ifndef SWITCH_RATIO | |||
| #define SWITCH_RATIO 2 | |||
| #endif | |||
| #ifndef LOWER | |||
| #define TRANS | |||
| #endif | |||
| #ifndef SYRK_LOCAL | |||
| #if !defined(LOWER) && !defined(TRANS) | |||
| #define SYRK_LOCAL SYRK_UN | |||
| #elif !defined(LOWER) && defined(TRANS) | |||
| #define SYRK_LOCAL SYRK_UT | |||
| #elif defined(LOWER) && !defined(TRANS) | |||
| #define SYRK_LOCAL SYRK_LN | |||
| #else | |||
| #define SYRK_LOCAL SYRK_LT | |||
| #endif | |||
| #endif | |||
| typedef struct { /* Per-thread synchronisation record: inner_thread stores packed-buffer addresses in working[] and other threads reset them to 0 when they are done with the buffer. */ | |||
| #ifdef HAVE_C11 | |||
| _Atomic | |||
| #else | |||
| volatile | |||
| #endif | |||
| BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; /* indexed [thread][CACHE_LINE_SIZE * part]; the code in this file only touches every CACHE_LINE_SIZE-th entry (presumably to keep the used slots apart in memory - confirm) */ | |||
| } job_t; | |||
| #ifndef KERNEL_OPERATION | |||
| #ifndef COMPLEX | |||
| #define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \ | |||
| KERNEL_FUNC(M, N, K, ALPHA[0], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y)) | |||
| #else | |||
| #define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \ | |||
| KERNEL_FUNC(M, N, K, ALPHA[0], ALPHA[1], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y)) | |||
| #endif | |||
| #endif | |||
| #ifndef ICOPY_OPERATION | |||
| #ifndef TRANS | |||
| #define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); | |||
| #else | |||
| #define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); | |||
| #endif | |||
| #endif | |||
| #ifndef OCOPY_OPERATION | |||
| #ifdef TRANS | |||
| #define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); | |||
| #else | |||
| #define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); | |||
| #endif | |||
| #endif | |||
| #ifndef S /* Shorthands for fields of the local blas_arg_t pointer named args; each one can be predefined before this point. S is the k x k block packed into sb by inner_thread. */ | |||
| #define S args -> a | |||
| #endif | |||
| #ifndef A /* note: A maps to args -> b, not args -> a */ | |||
| #define A args -> b | |||
| #endif | |||
| #ifndef C /* matrix updated by KERNEL_OPERATION in inner_thread */ | |||
| #define C args -> c | |||
| #endif | |||
| #ifndef LDA /* leading dimension; inner_thread uses it for a and for c alike */ | |||
| #define LDA args -> lda | |||
| #endif | |||
| #ifndef N /* note: N maps to args -> m */ | |||
| #define N args -> m | |||
| #endif | |||
| #ifndef K | |||
| #define K args -> k | |||
| #endif | |||
| static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ /* Worker queued by thread_driver: runs TRSM_KERNEL on this thread's slice, publishes the packed buffers through the shared job table, then runs KERNEL_OPERATION against its own and other threads' buffers. range_m is not read; the slice is [range_n[mypos], range_n[mypos + 1]). Always returns 0. */ | |||
| FLOAT *buffer[DIVIDE_RATE]; | |||
| BLASLONG k, lda; | |||
| BLASLONG m_from, m_to; | |||
| FLOAT *alpha; | |||
| FLOAT *a, *c; | |||
| job_t *job = (job_t *)args -> common; /* job table shared by all workers, set up by thread_driver */ | |||
| BLASLONG xxx, bufferside; | |||
| BLASLONG jjs, min_jj; | |||
| BLASLONG is, min_i, div_n; | |||
| BLASLONG i, current; | |||
| k = K; | |||
| a = (FLOAT *)A; /* A expands to args -> b unless predefined */ | |||
| c = (FLOAT *)C; | |||
| lda = LDA; | |||
| alpha = (FLOAT *)args -> alpha; | |||
| m_from = range_n[mypos + 0]; | |||
| m_to = range_n[mypos + 1]; | |||
| #if 0 | |||
| fprintf(stderr, "Thread[%ld] m_from : %ld m_to : %ld\n", mypos, m_from, m_to); | |||
| #endif | |||
| div_n = (((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; /* slice length split into DIVIDE_RATE parts, rounded up to a multiple of GEMM_UNROLL_MN */ | |||
| buffer[0] = (FLOAT *)((((BLASULONG)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); /* packing buffers start after the first k * k * COMPSIZE elements of sb, which hold the packed S block */ | |||
| for (i = 1; i < DIVIDE_RATE; i++) { | |||
| buffer[i] = buffer[i - 1] + GEMM_Q * div_n * COMPSIZE; | |||
| } | |||
| #ifndef LOWER | |||
| TRSM_IUNCOPY(k, k, (FLOAT *)S, lda, 0, sb); /* presumably packs the k x k block at S into sb for TRSM_KERNEL - confirm against the copy routine */ | |||
| #else | |||
| TRSM_OLTCOPY(k, k, (FLOAT *)S, lda, 0, sb); | |||
| #endif | |||
| for (xxx = m_from, bufferside = 0; xxx < m_to; xxx += div_n, bufferside ++) { /* one pass per part of the slice; bufferside selects the packing buffer */ | |||
| for(jjs = xxx; jjs < MIN(m_to, xxx + div_n); jjs += min_jj){ | |||
| min_jj = MIN(m_to, xxx + div_n) - jjs; | |||
| #ifndef LOWER | |||
| if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN; | |||
| #else | |||
| if (min_jj > GEMM_P) min_jj = GEMM_P; | |||
| #endif | |||
| #ifndef LOWER | |||
| OCOPY_OPERATION (k, min_jj, a, lda, 0, jjs, buffer[bufferside] + k * (jjs - xxx) * COMPSIZE); | |||
| TRSM_KERNEL (k, min_jj, k, dm1, | |||
| #ifdef COMPLEX | |||
| ZERO, | |||
| #endif | |||
| sb, | |||
| buffer[bufferside] + k * (jjs - xxx) * COMPSIZE, | |||
| a + jjs * lda * COMPSIZE, lda, 0); | |||
| #else | |||
| ICOPY_OPERATION (k, min_jj, a, lda, 0, jjs, buffer[bufferside] + k * (jjs - xxx) * COMPSIZE); | |||
| TRSM_KERNEL (min_jj, k, k, dm1, | |||
| #ifdef COMPLEX | |||
| ZERO, | |||
| #endif | |||
| buffer[bufferside] + k * (jjs - xxx) * COMPSIZE, | |||
| sb, | |||
| a + jjs * COMPSIZE, lda, 0); | |||
| #endif | |||
| } | |||
| #ifndef LOWER /* publish the finished buffer: its address goes into this thread's slots for threads 0..mypos (upper) or mypos..nthreads-1 (lower) */ | |||
| for (i = 0; i <= mypos; i++) | |||
| job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; | |||
| #else | |||
| for (i = mypos; i < args -> nthreads; i++) | |||
| job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; | |||
| #endif | |||
| WMB; /* presumably a write memory barrier so the stores above become visible to the waiting threads - confirm macro definition */ | |||
| } | |||
| min_i = m_to - m_from; /* size of the first row block: at most GEMM_P, or about half the slice rounded up to GEMM_UNROLL_MN when the slice is between GEMM_P and 2 * GEMM_P */ | |||
| if (min_i >= GEMM_P * 2) { | |||
| min_i = GEMM_P; | |||
| } else | |||
| if (min_i > GEMM_P) { | |||
| min_i = (((min_i + 1) / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; | |||
| } | |||
| #ifndef LOWER | |||
| ICOPY_OPERATION(k, min_i, a, lda, 0, m_from, sa); | |||
| #else | |||
| OCOPY_OPERATION(k, min_i, a, lda, 0, m_from, sa); | |||
| #endif | |||
| current = mypos; | |||
| #ifndef LOWER /* visit threads mypos..nthreads-1 (upper) or mypos down to 0 (lower) */ | |||
| while (current < args -> nthreads) | |||
| #else | |||
| while (current >= 0) | |||
| #endif | |||
| { | |||
| div_n = (((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; | |||
| for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { | |||
| /* spin until thread `current` has published this buffer (slot becomes nonzero) */ | |||
| if (current != mypos) while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; | |||
| KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, alpha, | |||
| sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], | |||
| c, lda, m_from, xxx); | |||
| if (m_from + min_i >= m_to) { /* the first row block already covers the whole slice, so this thread is done with the buffer: clear the slot */ | |||
| job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; | |||
| WMB; | |||
| } | |||
| } | |||
| #ifndef LOWER | |||
| current ++; | |||
| #else | |||
| current --; | |||
| #endif | |||
| } | |||
| for(is = m_from + min_i; is < m_to; is += min_i){ /* remaining row blocks of the slice; no waiting here because every needed slot was already seen nonzero in the loop above */ | |||
| min_i = m_to - is; | |||
| if (min_i >= GEMM_P * 2) { | |||
| min_i = GEMM_P; | |||
| } else | |||
| if (min_i > GEMM_P) { | |||
| min_i = (((min_i + 1) / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; | |||
| } | |||
| #ifndef LOWER | |||
| ICOPY_OPERATION(k, min_i, a, lda, 0, is, sa); | |||
| #else | |||
| OCOPY_OPERATION(k, min_i, a, lda, 0, is, sa); | |||
| #endif | |||
| current = mypos; | |||
| #ifndef LOWER | |||
| while (current < args -> nthreads) | |||
| #else | |||
| while (current >= 0) | |||
| #endif | |||
| { | |||
| div_n = (((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; | |||
| for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { | |||
| KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, alpha, | |||
| sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], | |||
| c, lda, is, xxx); | |||
| if (is + min_i >= m_to) { /* last row block: clear the slot to tell the owner this thread no longer needs the buffer */ | |||
| job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; | |||
| WMB; | |||
| } | |||
| } | |||
| #ifndef LOWER | |||
| current ++; | |||
| #else | |||
| current --; | |||
| #endif | |||
| } | |||
| } | |||
| for (i = 0; i < args -> nthreads; i++) { /* before returning, wait until every other thread has cleared its slots in this thread's row */ | |||
| if (i != mypos) { | |||
| for (xxx = 0; xxx < DIVIDE_RATE; xxx++) { | |||
| while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;}; | |||
| } | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){ /* Splits the range 0..args->m into per-thread slices, fills one queue entry per slice with inner_thread as its routine, zeroes the job table and runs the queue through exec_blas. Always returns 0; exits the process if the heap allocation of the job table fails. */ | |||
| blas_arg_t newarg; | |||
| #ifndef USE_ALLOC_HEAP /* job table lives on the stack unless USE_ALLOC_HEAP is defined */ | |||
| job_t job[MAX_CPU_NUMBER]; | |||
| #else | |||
| job_t * job = NULL; | |||
| #endif | |||
| blas_queue_t queue[MAX_CPU_NUMBER]; | |||
| BLASLONG range[MAX_CPU_NUMBER + 100]; | |||
| BLASLONG num_cpu; | |||
| BLASLONG nthreads = args -> nthreads; | |||
| BLASLONG width, i, j, k; | |||
| BLASLONG n, n_from, n_to; | |||
| int mode, mask; | |||
| double dnum; | |||
| #ifndef COMPLEX /* pick the precision/type mode and the slice-width granularity: mask is the larger GEMM unroll factor minus one */ | |||
| #ifdef XDOUBLE | |||
| mode = BLAS_XDOUBLE | BLAS_REAL; | |||
| mask = MAX(QGEMM_UNROLL_M, QGEMM_UNROLL_N) - 1; | |||
| #elif defined(DOUBLE) | |||
| mode = BLAS_DOUBLE | BLAS_REAL; | |||
| mask = MAX(DGEMM_UNROLL_M, DGEMM_UNROLL_N) - 1; | |||
| #elif defined(HALF) | |||
| mode = BLAS_HALF | BLAS_REAL; | |||
| mask = MAX(SHGEMM_UNROLL_M, SHGEMM_UNROLL_N) - 1; | |||
| #else | |||
| mode = BLAS_SINGLE | BLAS_REAL; | |||
| mask = MAX(SGEMM_UNROLL_M, SGEMM_UNROLL_N) - 1; | |||
| #endif | |||
| #else | |||
| #ifdef XDOUBLE | |||
| mode = BLAS_XDOUBLE | BLAS_COMPLEX; | |||
| mask = MAX(XGEMM_UNROLL_M, XGEMM_UNROLL_N) - 1; | |||
| #elif defined(DOUBLE) | |||
| mode = BLAS_DOUBLE | BLAS_COMPLEX; | |||
| mask = MAX(ZGEMM_UNROLL_M, ZGEMM_UNROLL_N) - 1; | |||
| #else | |||
| mode = BLAS_SINGLE | BLAS_COMPLEX; | |||
| mask = MAX(CGEMM_UNROLL_M, CGEMM_UNROLL_N) - 1; | |||
| #endif | |||
| #endif | |||
| newarg.m = args -> m; /* copy only the fields set by the caller; nthreads is filled in below with the number of slices actually created */ | |||
| newarg.k = args -> k; | |||
| newarg.a = args -> a; | |||
| newarg.b = args -> b; | |||
| newarg.c = args -> c; | |||
| newarg.lda = args -> lda; | |||
| newarg.alpha = args -> alpha; | |||
| #ifdef USE_ALLOC_HEAP | |||
| job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t)); | |||
| if(job==NULL){ | |||
| fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__); | |||
| exit(1); | |||
| } | |||
| #endif | |||
| newarg.common = (void *)job; /* inner_thread reads the job table back from args -> common */ | |||
| n_from = 0; | |||
| n_to = args -> m; | |||
| #ifndef LOWER /* upper case: the boundaries are written backwards from range[MAX_CPU_NUMBER], so the slice created first sits last in the array */ | |||
| range[MAX_CPU_NUMBER] = n_to - n_from; | |||
| range[0] = 0; | |||
| num_cpu = 0; | |||
| i = 0; | |||
| n = n_to - n_from; | |||
| dnum = (double)n * (double)n /(double)nthreads; | |||
| while (i < n){ | |||
| if (nthreads - num_cpu > 1) { | |||
| double di = (double)i; | |||
| width = ((((BLASLONG)(sqrt(di * di + dnum) - di) + mask)/(mask+1)) * (mask+1)); /* sqrt(i*i + n*n/nthreads) - i, rounded up to a multiple of mask + 1 */ | |||
| if (num_cpu == 0) width = n - (((n - width)/(mask+1)) * (mask+1)); | |||
| if ((width > n - i) || (width < mask)) width = n - i; /* too wide or too narrow: take everything that is left */ | |||
| } else { | |||
| width = n - i; /* last available thread takes the remainder */ | |||
| } | |||
| range[MAX_CPU_NUMBER - num_cpu - 1] = range[MAX_CPU_NUMBER - num_cpu] - width; | |||
| queue[num_cpu].mode = mode; | |||
| queue[num_cpu].routine = inner_thread; | |||
| queue[num_cpu].args = &newarg; | |||
| queue[num_cpu].range_m = NULL; | |||
| queue[num_cpu].sa = NULL; | |||
| queue[num_cpu].sb = NULL; | |||
| queue[num_cpu].next = &queue[num_cpu + 1]; | |||
| num_cpu ++; | |||
| i += width; | |||
| } | |||
| for (i = 0; i < num_cpu; i ++) queue[i].range_n = &range[MAX_CPU_NUMBER - num_cpu]; /* every entry shares the same boundary array, starting at the first boundary written */ | |||
| #else | |||
| range[0] = 0; /* lower case: boundaries are written forwards from range[0] */ | |||
| num_cpu = 0; | |||
| i = 0; | |||
| n = n_to - n_from; | |||
| dnum = (double)n * (double)n /(double)nthreads; | |||
| while (i < n){ | |||
| if (nthreads - num_cpu > 1) { | |||
| double di = (double)i; | |||
| width = ((((BLASLONG)(sqrt(di * di + dnum) - di) + mask)/(mask+1)) * (mask+1)); | |||
| if ((width > n - i) || (width < mask)) width = n - i; | |||
| } else { | |||
| width = n - i; | |||
| } | |||
| range[num_cpu + 1] = range[num_cpu] + width; | |||
| queue[num_cpu].mode = mode; | |||
| queue[num_cpu].routine = inner_thread; | |||
| queue[num_cpu].args = &newarg; | |||
| queue[num_cpu].range_m = NULL; | |||
| queue[num_cpu].range_n = range; | |||
| queue[num_cpu].sa = NULL; | |||
| queue[num_cpu].sb = NULL; | |||
| queue[num_cpu].next = &queue[num_cpu + 1]; | |||
| num_cpu ++; | |||
| i += width; | |||
| } | |||
| #endif | |||
| newarg.nthreads = num_cpu; /* inner_thread loops over args -> nthreads, so it must equal the number of slices */ | |||
| if (num_cpu) { | |||
| for (j = 0; j < num_cpu; j++) { /* reset every slot the workers will use; inner_thread treats 0 as "not published / released" */ | |||
| for (i = 0; i < num_cpu; i++) { | |||
| for (k = 0; k < DIVIDE_RATE; k++) { | |||
| job[j].working[i][CACHE_LINE_SIZE * k] = 0; | |||
| } | |||
| } | |||
| } | |||
| queue[0].sa = sa; /* only the first entry gets the caller's buffers; the others keep NULL (presumably exec_blas supplies buffers for those - confirm) */ | |||
| queue[0].sb = sb; | |||
| queue[num_cpu - 1].next = NULL; /* terminate the linked list of queue entries */ | |||
| exec_blas(num_cpu, queue); | |||
| } | |||
| #ifdef USE_ALLOC_HEAP | |||
| free(job); | |||
| #endif | |||
| return 0; | |||
| } | |||
| #endif | |||
| blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { /* Blocked, recursive parallel driver: handles each diagonal block by calling itself, then updates the rest of the matrix through thread_driver (or gemm_thread_* plus syrk_thread when USE_SIMPLE_THREADED_LEVEL3 is defined). Returns 0, or a nonzero info from a diagonal block increased by that block's start index. range_m and myid are not read. */ | |||
| BLASLONG n, bk, i, blocking, lda; | |||
| BLASLONG info; | |||
| int mode; | |||
| blas_arg_t newarg; | |||
| FLOAT *a; | |||
| FLOAT alpha[2] = { -ONE, ZERO}; /* scaling factor handed to the update step via newarg.alpha */ | |||
| #ifndef COMPLEX /* mode is only used by the USE_SIMPLE_THREADED_LEVEL3 branch below */ | |||
| #ifdef XDOUBLE | |||
| mode = BLAS_XDOUBLE | BLAS_REAL; | |||
| #elif defined(DOUBLE) | |||
| mode = BLAS_DOUBLE | BLAS_REAL; | |||
| #else | |||
| mode = BLAS_SINGLE | BLAS_REAL; | |||
| #endif | |||
| #else | |||
| #ifdef XDOUBLE | |||
| mode = BLAS_XDOUBLE | BLAS_COMPLEX; | |||
| #elif defined(DOUBLE) | |||
| mode = BLAS_DOUBLE | BLAS_COMPLEX; | |||
| #else | |||
| mode = BLAS_SINGLE | BLAS_COMPLEX; | |||
| #endif | |||
| #endif | |||
| if (args -> nthreads == 1) { /* single thread: delegate to the single-threaded routine */ | |||
| #ifndef LOWER | |||
| info = POTRF_U_SINGLE(args, NULL, NULL, sa, sb, 0); | |||
| #else | |||
| info = POTRF_L_SINGLE(args, NULL, NULL, sa, sb, 0); | |||
| #endif | |||
| return info; | |||
| } | |||
| n = args -> n; | |||
| a = (FLOAT *)args -> a; | |||
| lda = args -> lda; | |||
| if (range_n) n = range_n[1] - range_n[0]; | |||
| if (n <= GEMM_UNROLL_N * 2) { /* small problem: also delegate to the single-threaded routine; this ends the recursion */ | |||
| #ifndef LOWER | |||
| info = POTRF_U_SINGLE(args, NULL, range_n, sa, sb, 0); | |||
| #else | |||
| info = POTRF_L_SINGLE(args, NULL, range_n, sa, sb, 0); | |||
| #endif | |||
| return info; | |||
| } | |||
| newarg.lda = lda; | |||
| newarg.ldb = lda; | |||
| newarg.ldc = lda; | |||
| newarg.alpha = alpha; | |||
| newarg.beta = NULL; | |||
| newarg.nthreads = args -> nthreads; | |||
| blocking = ((n / 2 + GEMM_UNROLL_N - 1)/GEMM_UNROLL_N) * GEMM_UNROLL_N; /* block size: n / 2 rounded up to a multiple of GEMM_UNROLL_N, capped at GEMM_Q */ | |||
| if (blocking > GEMM_Q) blocking = GEMM_Q; | |||
| for (i = 0; i < n; i += blocking) { | |||
| bk = n - i; | |||
| if (bk > blocking) bk = blocking; | |||
| newarg.m = bk; | |||
| newarg.n = bk; | |||
| newarg.a = a + (i + i * lda) * COMPSIZE; /* diagonal block starting at (i, i) */ | |||
| info = CNAME(&newarg, NULL, NULL, sa, sb, 0); /* recursive call on the bk x bk diagonal block */ | |||
| if (info) return info + i; /* shift the block-local info by the block start */ | |||
| if (n - i - bk > 0) { /* something remains after this block: update it */ | |||
| #ifndef USE_SIMPLE_THREADED_LEVEL3 | |||
| newarg.m = n - i - bk; | |||
| newarg.k = bk; | |||
| #ifndef LOWER | |||
| newarg.b = a + ( i + (i + bk) * lda) * COMPSIZE; /* panel to the right of the diagonal block */ | |||
| #else | |||
| newarg.b = a + ((i + bk) + i * lda) * COMPSIZE; /* panel below the diagonal block */ | |||
| #endif | |||
| newarg.c = a + ((i + bk) + (i + bk) * lda) * COMPSIZE; /* trailing submatrix starting at (i + bk, i + bk) */ | |||
| thread_driver(&newarg, sa, sb); | |||
| #else | |||
| #ifndef LOWER | |||
| newarg.m = bk; | |||
| newarg.n = n - i - bk; | |||
| newarg.a = a + (i + i * lda) * COMPSIZE; | |||
| newarg.b = a + (i + (i + bk) * lda) * COMPSIZE; | |||
| gemm_thread_n(mode | BLAS_TRANSA_T, | |||
| &newarg, NULL, NULL, (void *)TRSM_LCUN, sa, sb, args -> nthreads); | |||
| newarg.n = n - i - bk; | |||
| newarg.k = bk; | |||
| newarg.a = a + ( i + (i + bk) * lda) * COMPSIZE; | |||
| newarg.c = a + ((i + bk) + (i + bk) * lda) * COMPSIZE; | |||
| #if 0 | |||
| HERK_THREAD_UC(&newarg, NULL, NULL, sa, sb, 0); | |||
| #else | |||
| syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T, | |||
| &newarg, NULL, NULL, (void *)HERK_UC, sa, sb, args -> nthreads); | |||
| #endif | |||
| #else | |||
| newarg.m = n - i - bk; | |||
| newarg.n = bk; | |||
| newarg.a = a + (i + i * lda) * COMPSIZE; | |||
| newarg.b = a + (i + bk + i * lda) * COMPSIZE; | |||
| gemm_thread_m(mode | BLAS_RSIDE | BLAS_TRANSA_T | BLAS_UPLO, | |||
| &newarg, NULL, NULL, (void *)TRSM_RCLN, sa, sb, args -> nthreads); | |||
| newarg.n = n - i - bk; | |||
| newarg.k = bk; | |||
| newarg.a = a + (i + bk + i * lda) * COMPSIZE; | |||
| newarg.c = a + (i + bk + (i + bk) * lda) * COMPSIZE; | |||
| #if 0 | |||
| HERK_THREAD_LN(&newarg, NULL, NULL, sa, sb, 0); | |||
| #else | |||
| syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T | BLAS_UPLO, | |||
| &newarg, NULL, NULL, (void *)HERK_LN, sa, sb, args -> nthreads); | |||
| #endif | |||
| #endif | |||
| #endif | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -101,7 +101,12 @@ static FLOAT dm1 = -1.; | |||
| #endif | |||
| typedef struct { | |||
| volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; | |||
| #ifdef HAVE_C11 | |||
| _Atomic | |||
| #else | |||
| volatile | |||
| #endif | |||
| BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; | |||
| } job_t; | |||
| @@ -375,6 +380,9 @@ static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){ | |||
| #elif defined(DOUBLE) | |||
| mode = BLAS_DOUBLE | BLAS_REAL; | |||
| mask = MAX(DGEMM_UNROLL_M, DGEMM_UNROLL_N) - 1; | |||
| #elif defined(HALF) | |||
| mode = BLAS_HALF | BLAS_REAL; | |||
| mask = MAX(SHGEMM_UNROLL_M, SHGEMM_UNROLL_N) - 1; | |||
| #else | |||
| mode = BLAS_SINGLE | BLAS_REAL; | |||
| mask = MAX(SGEMM_UNROLL_M, SGEMM_UNROLL_N) - 1; | |||
| @@ -2225,7 +2225,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define GEMM_DEFAULT_OFFSET_A 0 | |||
| #define GEMM_DEFAULT_OFFSET_B 65536 | |||
| #define GEMM_DEFAULT_ALIGN 0x0ffffUL | |||
| #if defined(__32BIT__) | |||
| #warning using BINARY32==POWER6 | |||
| #define SGEMM_DEFAULT_UNROLL_M 4 | |||
| #define SGEMM_DEFAULT_UNROLL_N 4 | |||
| #define DGEMM_DEFAULT_UNROLL_M 4 | |||
| #define DGEMM_DEFAULT_UNROLL_N 4 | |||
| #define CGEMM_DEFAULT_UNROLL_M 2 | |||
| #define CGEMM_DEFAULT_UNROLL_N 4 | |||
| #define ZGEMM_DEFAULT_UNROLL_M 2 | |||
| #define ZGEMM_DEFAULT_UNROLL_N 4 | |||
| #else | |||
| #define SGEMM_DEFAULT_UNROLL_M 16 | |||
| #define SGEMM_DEFAULT_UNROLL_N 8 | |||
| #define DGEMM_DEFAULT_UNROLL_M 16 | |||
| @@ -2234,7 +2244,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define CGEMM_DEFAULT_UNROLL_N 4 | |||
| #define ZGEMM_DEFAULT_UNROLL_M 8 | |||
| #define ZGEMM_DEFAULT_UNROLL_N 2 | |||
| #endif | |||
| #define SGEMM_DEFAULT_P 1280UL | |||
| #define DGEMM_DEFAULT_P 640UL | |||
| #define CGEMM_DEFAULT_P 640UL | |||
| @@ -2769,6 +2779,35 @@ is a big desktop or server with abundant cache rather than a phone or embedded d | |||
| #define CGEMM_DEFAULT_R 4096 | |||
| #define ZGEMM_DEFAULT_R 4096 | |||
| #elif defined(THUNDERX3T110) | |||
| #define SGEMM_DEFAULT_UNROLL_M 16 | |||
| #define SGEMM_DEFAULT_UNROLL_N 4 | |||
| #define DGEMM_DEFAULT_UNROLL_M 8 | |||
| #define DGEMM_DEFAULT_UNROLL_N 4 | |||
| #define CGEMM_DEFAULT_UNROLL_M 8 | |||
| #define CGEMM_DEFAULT_UNROLL_N 4 | |||
| #define ZGEMM_DEFAULT_UNROLL_M 4 | |||
| #define ZGEMM_DEFAULT_UNROLL_N 4 | |||
| #define SGEMM_DEFAULT_P 128 | |||
| #define DGEMM_DEFAULT_P 320 | |||
| #define CGEMM_DEFAULT_P 128 | |||
| #define ZGEMM_DEFAULT_P 128 | |||
| #define SGEMM_DEFAULT_Q 352 | |||
| #define DGEMM_DEFAULT_Q 128 | |||
| #define CGEMM_DEFAULT_Q 224 | |||
| #define ZGEMM_DEFAULT_Q 112 | |||
| #define SGEMM_DEFAULT_R 4096 | |||
| #define DGEMM_DEFAULT_R 4096 | |||
| #define CGEMM_DEFAULT_R 4096 | |||
| #define ZGEMM_DEFAULT_R 4096 | |||
| #elif defined(NEOVERSEN1) | |||
| #define SGEMM_DEFAULT_UNROLL_M 16 | |||