| @@ -56,6 +56,16 @@ CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 | |||||
| FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 | FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 | ||||
| endif | endif | ||||
| # THUNDERX3T110: the armv8.3-a / thunderx3t110 flags are used only when | |||||
| # GCCVERSIONGTEQ10 is 1; any other value falls back to the thunderx2t99 flags. | |||||
| ifeq ($(CORE), THUNDERX3T110) | |||||
| ifneq ($(GCCVERSIONGTEQ10), 1) | |||||
| FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 | |||||
| CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 | |||||
| else | |||||
| FCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110 | |||||
| CCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110 | |||||
| endif | |||||
| endif | |||||
| ifeq ($(GCCVERSIONGTEQ9), 1) | ifeq ($(GCCVERSIONGTEQ9), 1) | ||||
| ifeq ($(CORE), TSV110) | ifeq ($(CORE), TSV110) | ||||
| CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 | CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 | ||||
| @@ -11,34 +11,34 @@ endif | |||||
| ifeq ($(CORE), POWER10) | ifeq ($(CORE), POWER10) | ||||
| ifeq ($(USE_OPENMP), 1) | ifeq ($(USE_OPENMP), 1) | ||||
| COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp | |||||
| FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp | |||||
| COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -DUSE_OPENMP -fno-fast-math -fopenmp | |||||
| FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -DUSE_OPENMP -fno-fast-math -fopenmp | |||||
| else | else | ||||
| COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -malign-power -fno-fast-math | |||||
| FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -malign-power -fno-fast-math | |||||
| COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math | |||||
| FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -fno-fast-math | |||||
| endif | endif | ||||
| endif | endif | ||||
| ifeq ($(CORE), POWER9) | ifeq ($(CORE), POWER9) | ||||
| ifeq ($(USE_OPENMP), 1) | ifeq ($(USE_OPENMP), 1) | ||||
| ifneq ($(C_COMPILER), PGI) | ifneq ($(C_COMPILER), PGI) | ||||
| CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp | |||||
| CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -DUSE_OPENMP -fno-fast-math -fopenmp | |||||
| else | else | ||||
| CCOMMON_OPT += -fast -Mvect=simd -Mcache_align -DUSE_OPENMP -mp | CCOMMON_OPT += -fast -Mvect=simd -Mcache_align -DUSE_OPENMP -mp | ||||
| endif | endif | ||||
| ifneq ($(F_COMPILER), PGI) | ifneq ($(F_COMPILER), PGI) | ||||
| FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp | |||||
| FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -DUSE_OPENMP -fno-fast-math -fopenmp | |||||
| else | else | ||||
| FCOMMON_OPT += -O2 -Mrecursive -DUSE_OPENMP -mp | FCOMMON_OPT += -O2 -Mrecursive -DUSE_OPENMP -mp | ||||
| endif | endif | ||||
| else | else | ||||
| ifneq ($(C_COMPILER), PGI) | ifneq ($(C_COMPILER), PGI) | ||||
| CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -fno-fast-math | |||||
| CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math | |||||
| else | else | ||||
| CCOMMON_OPT += -fast -Mvect=simd -Mcache_align | CCOMMON_OPT += -fast -Mvect=simd -Mcache_align | ||||
| endif | endif | ||||
| ifneq ($(F_COMPILER), PGI) | ifneq ($(F_COMPILER), PGI) | ||||
| FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -malign-power -fno-fast-math | |||||
| FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -fno-fast-math | |||||
| else | else | ||||
| FCOMMON_OPT += -O2 -Mrecursive | FCOMMON_OPT += -O2 -Mrecursive | ||||
| endif | endif | ||||
| @@ -48,26 +48,26 @@ endif | |||||
| ifeq ($(CORE), POWER8) | ifeq ($(CORE), POWER8) | ||||
| ifeq ($(USE_OPENMP), 1) | ifeq ($(USE_OPENMP), 1) | ||||
| ifneq ($(C_COMPILER), PGI) | ifneq ($(C_COMPILER), PGI) | ||||
| CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp | |||||
| CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -DUSE_OPENMP -fno-fast-math -fopenmp | |||||
| else | else | ||||
| CCOMMON_OPT += -fast -Mvect=simd -Mcache_align -DUSE_OPENMP -mp | CCOMMON_OPT += -fast -Mvect=simd -Mcache_align -DUSE_OPENMP -mp | ||||
| endif | endif | ||||
| ifneq ($(F_COMPILER), PGI) | ifneq ($(F_COMPILER), PGI) | ||||
| FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp | |||||
| FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -DUSE_OPENMP -fno-fast-math -fopenmp | |||||
| else | else | ||||
| FCOMMON_OPT += -O2 -Mrecursive -DUSE_OPENMP -mp | FCOMMON_OPT += -O2 -Mrecursive -DUSE_OPENMP -mp | ||||
| endif | endif | ||||
| else | else | ||||
| ifneq ($(C_COMPILER), PGI) | ifneq ($(C_COMPILER), PGI) | ||||
| CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -fno-fast-math | |||||
| CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math | |||||
| else | else | ||||
| CCOMMON_OPT += -fast -Mvect=simd -Mcache_align | CCOMMON_OPT += -fast -Mvect=simd -Mcache_align | ||||
| endif | endif | ||||
| ifneq ($(F_COMPILER), PGI) | ifneq ($(F_COMPILER), PGI) | ||||
| ifeq ($(OSNAME), AIX) | ifeq ($(OSNAME), AIX) | ||||
| FCOMMON_OPT += -O1 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fast-math | |||||
| FCOMMON_OPT += -O1 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math | |||||
| else | else | ||||
| FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fast-math | |||||
| FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math | |||||
| endif | endif | ||||
| else | else | ||||
| FCOMMON_OPT += -O2 -Mrecursive | FCOMMON_OPT += -O2 -Mrecursive | ||||
| @@ -578,6 +578,7 @@ DYNAMIC_CORE += THUNDERX | |||||
| DYNAMIC_CORE += THUNDERX2T99 | DYNAMIC_CORE += THUNDERX2T99 | ||||
| DYNAMIC_CORE += TSV110 | DYNAMIC_CORE += TSV110 | ||||
| DYNAMIC_CORE += EMAG8180 | DYNAMIC_CORE += EMAG8180 | ||||
| DYNAMIC_CORE += THUNDERX3T110 | |||||
| endif | endif | ||||
| ifeq ($(ARCH), zarch) | ifeq ($(ARCH), zarch) | ||||
| @@ -617,7 +618,6 @@ DYNAMIC_CORE += POWER8 | |||||
| ifneq ($(C_COMPILER), GCC) | ifneq ($(C_COMPILER), GCC) | ||||
| DYNAMIC_CORE += POWER9 | DYNAMIC_CORE += POWER9 | ||||
| DYNAMIC_CORE += POWER10 | DYNAMIC_CORE += POWER10 | ||||
| override LDFLAGS += -Wl,-no-power10-stubs | |||||
| endif | endif | ||||
| ifeq ($(C_COMPILER), GCC) | ifeq ($(C_COMPILER), GCC) | ||||
| ifeq ($(GCCVERSIONGT5), 1) | ifeq ($(GCCVERSIONGT5), 1) | ||||
| @@ -627,11 +627,9 @@ $(info, OpenBLAS: Your gcc version is too old to build the POWER9 kernels.) | |||||
| endif | endif | ||||
| ifeq ($(GCCVERSIONGTEQ11), 1) | ifeq ($(GCCVERSIONGTEQ11), 1) | ||||
| DYNAMIC_CORE += POWER10 | DYNAMIC_CORE += POWER10 | ||||
| override LDFLAGS += -Wl,-no-power10-stubs | |||||
| else ifeq ($(GCCVERSIONGTEQ10), 1) | else ifeq ($(GCCVERSIONGTEQ10), 1) | ||||
| ifeq ($(GCCMINORVERSIONGTEQ2), 1) | ifeq ($(GCCMINORVERSIONGTEQ2), 1) | ||||
| DYNAMIC_CORE += POWER10 | DYNAMIC_CORE += POWER10 | ||||
| override LDFLAGS += -Wl,-no-power10-stubs | |||||
| endif | endif | ||||
| else | else | ||||
| # No comma after "info": GNU make only recognises a function name that is followed by whitespace, otherwise the text is expanded as an (empty) variable reference and nothing is printed. | # No comma after "info": GNU make only recognises a function name that is followed by whitespace, otherwise the text is expanded as an (empty) variable reference and nothing is printed. | ||||
| $(info OpenBLAS: Your gcc version is too old to build the POWER10 kernels.) | $(info OpenBLAS: Your gcc version is too old to build the POWER10 kernels.) | ||||
| @@ -1241,7 +1239,9 @@ KERNELDIR = $(TOPDIR)/kernel/$(ARCH) | |||||
| include $(TOPDIR)/Makefile.$(ARCH) | include $(TOPDIR)/Makefile.$(ARCH) | ||||
| ifneq ($(C_COMPILER), PGI) | |||||
| CCOMMON_OPT += -UASMNAME -UASMFNAME -UNAME -UCNAME -UCHAR_NAME -UCHAR_CNAME | CCOMMON_OPT += -UASMNAME -UASMFNAME -UNAME -UCNAME -UCHAR_NAME -UCHAR_CNAME | ||||
| endif | |||||
| CCOMMON_OPT += -DASMNAME=$(FU)$(*F) -DASMFNAME=$(FU)$(*F)$(BU) -DNAME=$(*F)$(BU) -DCNAME=$(*F) -DCHAR_NAME=\"$(*F)$(BU)\" -DCHAR_CNAME=\"$(*F)\" | CCOMMON_OPT += -DASMNAME=$(FU)$(*F) -DASMFNAME=$(FU)$(*F)$(BU) -DNAME=$(*F)$(BU) -DCNAME=$(*F) -DCHAR_NAME=\"$(*F)$(BU)\" -DCHAR_CNAME=\"$(*F)\" | ||||
| ifeq ($(CORE), PPC440) | ifeq ($(CORE), PPC440) | ||||
| @@ -28,7 +28,8 @@ You can download them from [file hosting on sourceforge.net](https://sourceforge | |||||
| ## Installation from Source | ## Installation from Source | ||||
| Download from project homepage, https://xianyi.github.com/OpenBLAS/, or check out the code | Download from project homepage, https://xianyi.github.com/OpenBLAS/, or check out the code | ||||
| using Git from https://github.com/xianyi/OpenBLAS.git. | |||||
| using Git from https://github.com/xianyi/OpenBLAS.git. (If you want the most up-to-date version, be | |||||
| sure to use the develop branch; master is several years out of date due to a change of maintainership.) | |||||
| Buildtime parameters can be chosen in Makefile.rule, see there for a short description of each option. | Buildtime parameters can be chosen in Makefile.rule, see there for a short description of each option. | ||||
| Most can also be given directly on the make or cmake command line. | Most can also be given directly on the make or cmake command line. | ||||
| @@ -96,6 +96,7 @@ FALKOR | |||||
| THUNDERX | THUNDERX | ||||
| THUNDERX2T99 | THUNDERX2T99 | ||||
| TSV110 | TSV110 | ||||
| THUNDERX3T110 | |||||
| 9.System Z: | 9.System Z: | ||||
| ZARCH_GENERIC | ZARCH_GENERIC | ||||
| @@ -45,7 +45,7 @@ endif () | |||||
| if (DYNAMIC_ARCH) | if (DYNAMIC_ARCH) | ||||
| if (ARM64) | if (ARM64) | ||||
| set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1) | |||||
| set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) | |||||
| endif () | endif () | ||||
| if (POWER) | if (POWER) | ||||
| @@ -195,8 +195,13 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS | |||||
| "#define HAVE_VFP\n" | "#define HAVE_VFP\n" | ||||
| "#define HAVE_NEON\n" | "#define HAVE_NEON\n" | ||||
| "#define ARMV8\n") | "#define ARMV8\n") | ||||
| if ("${TCORE}" STREQUAL "CORTEXA57") | |||||
| set(SGEMM_UNROLL_M 16) | set(SGEMM_UNROLL_M 16) | ||||
| set(SGEMM_UNROLL_N 4) | set(SGEMM_UNROLL_N 4) | ||||
| else () | |||||
| set(SGEMM_UNROLL_M 8) | |||||
| set(SGEMM_UNROLL_N 8) | |||||
| endif () | |||||
| set(DGEMM_UNROLL_M 8) | set(DGEMM_UNROLL_M 8) | ||||
| set(DGEMM_UNROLL_N 4) | set(DGEMM_UNROLL_N 4) | ||||
| set(CGEMM_UNROLL_M 8) | set(CGEMM_UNROLL_M 8) | ||||
| @@ -338,6 +343,33 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS | |||||
| set(ZGEMM_UNROLL_M 4) | set(ZGEMM_UNROLL_M 4) | ||||
| set(ZGEMM_UNROLL_N 4) | set(ZGEMM_UNROLL_N 4) | ||||
| set(SYMV_P 16) | set(SYMV_P 16) | ||||
| elseif ("${TCORE}" STREQUAL "THUNDERX3T110") | |||||
| # Appends the THUNDERX3T110 cache/TLB defines to TARGET_CONF_TEMP and sets the GEMM unroll factors. | |||||
| # L1_DATA_SIZE is 32768 so that it agrees with the THUNDERX3T110 values printed by get_cpuconfig() | |||||
| # and listed in the FORCE_THUNDERX3T110 ARCHCONFIG string. | |||||
| file(APPEND ${TARGET_CONF_TEMP} | |||||
| "#define THUNDERX3T110\n" | |||||
| "#define L1_CODE_SIZE\t65536\n" | |||||
| "#define L1_CODE_LINESIZE\t64\n" | |||||
| "#define L1_CODE_ASSOCIATIVE\t8\n" | |||||
| "#define L1_DATA_SIZE\t32768\n" | |||||
| "#define L1_DATA_LINESIZE\t64\n" | |||||
| "#define L1_DATA_ASSOCIATIVE\t8\n" | |||||
| "#define L2_SIZE\t524288\n" | |||||
| "#define L2_LINESIZE\t64\n" | |||||
| "#define L2_ASSOCIATIVE\t8\n" | |||||
| "#define L3_SIZE\t94371840\n" | |||||
| "#define L3_LINESIZE\t64\n" | |||||
| "#define L3_ASSOCIATIVE\t32\n" | |||||
| "#define DTB_DEFAULT_ENTRIES\t64\n" | |||||
| "#define DTB_SIZE\t4096\n" | |||||
| "#define ARMV8\n") | |||||
| set(SGEMM_UNROLL_M 16) | |||||
| set(SGEMM_UNROLL_N 4) | |||||
| set(DGEMM_UNROLL_M 8) | |||||
| set(DGEMM_UNROLL_N 4) | |||||
| set(CGEMM_UNROLL_M 8) | |||||
| set(CGEMM_UNROLL_N 4) | |||||
| set(ZGEMM_UNROLL_M 4) | |||||
| set(ZGEMM_UNROLL_N 4) | |||||
| set(SYMV_P 16) | |||||
| elseif ("${TCORE}" STREQUAL "TSV110") | elseif ("${TCORE}" STREQUAL "TSV110") | ||||
| file(APPEND ${TARGET_CONF_TEMP} | file(APPEND ${TARGET_CONF_TEMP} | ||||
| "#define ARMV8\n" | "#define ARMV8\n" | ||||
| @@ -40,6 +40,7 @@ | |||||
| // Cavium | // Cavium | ||||
| #define CPU_THUNDERX 7 | #define CPU_THUNDERX 7 | ||||
| #define CPU_THUNDERX2T99 8 | #define CPU_THUNDERX2T99 8 | ||||
| #define CPU_THUNDERX3T110 12 | |||||
| //Hisilicon | //Hisilicon | ||||
| #define CPU_TSV110 9 | #define CPU_TSV110 9 | ||||
| // Ampere | // Ampere | ||||
| @@ -57,7 +58,8 @@ static char *cpuname[] = { | |||||
| "THUNDERX2T99", | "THUNDERX2T99", | ||||
| "TSV110", | "TSV110", | ||||
| "EMAG8180", | "EMAG8180", | ||||
| "NEOVERSEN1" | |||||
| "NEOVERSEN1", | |||||
| "THUNDERX3T110" | |||||
| }; | }; | ||||
| static char *cpuname_lower[] = { | static char *cpuname_lower[] = { | ||||
| @@ -72,7 +74,8 @@ static char *cpuname_lower[] = { | |||||
| "thunderx2t99", | "thunderx2t99", | ||||
| "tsv110", | "tsv110", | ||||
| "emag8180", | "emag8180", | ||||
| "neoversen1" | |||||
| "neoversen1", | |||||
| "thunderx3t110" | |||||
| }; | }; | ||||
| int get_feature(char *search) | int get_feature(char *search) | ||||
| @@ -158,6 +161,8 @@ int detect(void) | |||||
| return CPU_THUNDERX; | return CPU_THUNDERX; | ||||
| else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0af")) | else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0af")) | ||||
| return CPU_THUNDERX2T99; | return CPU_THUNDERX2T99; | ||||
| else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0b8")) | |||||
| return CPU_THUNDERX3T110; | |||||
| // HiSilicon | // HiSilicon | ||||
| else if (strstr(cpu_implementer, "0x48") && strstr(cpu_part, "0xd01")) | else if (strstr(cpu_implementer, "0x48") && strstr(cpu_part, "0xd01")) | ||||
| return CPU_TSV110; | return CPU_TSV110; | ||||
| @@ -372,7 +377,25 @@ void get_cpuconfig(void) | |||||
| printf("#define L2_LINESIZE 64\n"); | printf("#define L2_LINESIZE 64\n"); | ||||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | printf("#define DTB_DEFAULT_ENTRIES 64\n"); | ||||
| printf("#define DTB_SIZE 4096\n"); | printf("#define DTB_SIZE 4096\n"); | ||||
| break; | |||||
| case CPU_THUNDERX3T110: | |||||
| printf("#define THUNDERX3T110 \n"); | |||||
| printf("#define L1_CODE_SIZE 65536 \n"); | |||||
| printf("#define L1_CODE_LINESIZE 64 \n"); | |||||
| printf("#define L1_CODE_ASSOCIATIVE 8 \n"); | |||||
| printf("#define L1_DATA_SIZE 32768 \n"); | |||||
| printf("#define L1_DATA_LINESIZE 64 \n"); | |||||
| printf("#define L1_DATA_ASSOCIATIVE 8 \n"); | |||||
| printf("#define L2_SIZE 524288 \n"); | |||||
| printf("#define L2_LINESIZE 64 \n"); | |||||
| printf("#define L2_ASSOCIATIVE 8 \n"); | |||||
| printf("#define L3_SIZE 94371840 \n"); | |||||
| printf("#define L3_LINESIZE 64 \n"); | |||||
| printf("#define L3_ASSOCIATIVE 32 \n"); | |||||
| printf("#define DTB_DEFAULT_ENTRIES 64 \n"); | |||||
| printf("#define DTB_SIZE 4096 \n"); | |||||
| break; | |||||
| } | } | ||||
| get_cpucount(); | get_cpucount(); | ||||
| } | } | ||||
| @@ -1454,10 +1454,11 @@ int get_cpuname(void){ | |||||
| return CPUTYPE_OPTERON; | return CPUTYPE_OPTERON; | ||||
| case 1: | case 1: | ||||
| case 3: | case 3: | ||||
| case 7: | |||||
| case 10: | |||||
| // case 7: | |||||
| // case 10: | |||||
| return CPUTYPE_BARCELONA; | return CPUTYPE_BARCELONA; | ||||
| case 5: | case 5: | ||||
| case 7: | |||||
| return CPUTYPE_BOBCAT; | return CPUTYPE_BOBCAT; | ||||
| case 6: | case 6: | ||||
| switch (model) { | switch (model) { | ||||
| @@ -1507,6 +1508,8 @@ int get_cpuname(void){ | |||||
| // AMD Ryzen | // AMD Ryzen | ||||
| case 8: | case 8: | ||||
| // AMD Ryzen2 | // AMD Ryzen2 | ||||
| default: | |||||
| // Matisse/Renoir and other recent Ryzen2 | |||||
| if(support_avx()) | if(support_avx()) | ||||
| #ifndef NO_AVX2 | #ifndef NO_AVX2 | ||||
| return CPUTYPE_ZEN; | return CPUTYPE_ZEN; | ||||
| @@ -1516,6 +1519,16 @@ int get_cpuname(void){ | |||||
| else | else | ||||
| return CPUTYPE_BARCELONA; | return CPUTYPE_BARCELONA; | ||||
| } | } | ||||
| break; | |||||
| case 10: // Zen3 | |||||
| if(support_avx()) | |||||
| #ifndef NO_AVX2 | |||||
| return CPUTYPE_ZEN; | |||||
| #else | |||||
| return CPUTYPE_SANDYBRIDGE; // Zen is closer in architecture to Sandy Bridge than to Excavator | |||||
| #endif | |||||
| else | |||||
| return CPUTYPE_BARCELONA; | |||||
| } | } | ||||
| break; | break; | ||||
| } | } | ||||
| @@ -2107,7 +2120,7 @@ int get_coretype(void){ | |||||
| return CORE_PILEDRIVER; | return CORE_PILEDRIVER; | ||||
| else | else | ||||
| return CORE_BARCELONA; //OS don't support AVX. | return CORE_BARCELONA; //OS don't support AVX. | ||||
| case 5: // New EXCAVATOR | |||||
| case 5: // New EXCAVATOR | |||||
| if(support_avx()) | if(support_avx()) | ||||
| return CORE_EXCAVATOR; | return CORE_EXCAVATOR; | ||||
| else | else | ||||
| @@ -2135,12 +2148,14 @@ int get_coretype(void){ | |||||
| } | } | ||||
| break; | break; | ||||
| } | } | ||||
| } else if (exfamily == 8) { | |||||
| } else if (exfamily == 8 || exfamily == 10) { | |||||
| switch (model) { | switch (model) { | ||||
| case 1: | case 1: | ||||
| // AMD Ryzen | // AMD Ryzen | ||||
| case 8: | case 8: | ||||
| // Ryzen 2 | |||||
| // Ryzen 2 | |||||
| default: | |||||
| // Matisse,Renoir Ryzen2 models | |||||
| if(support_avx()) | if(support_avx()) | ||||
| #ifndef NO_AVX2 | #ifndef NO_AVX2 | ||||
| return CORE_ZEN; | return CORE_ZEN; | ||||
| @@ -656,7 +656,7 @@ static gotoblas_t *get_coretype(void){ | |||||
| if ((exfamily == 0) || (exfamily == 2)) { | if ((exfamily == 0) || (exfamily == 2)) { | ||||
| if (ecx & (1 << 0)) return &gotoblas_OPTERON_SSE3; | if (ecx & (1 << 0)) return &gotoblas_OPTERON_SSE3; | ||||
| else return &gotoblas_OPTERON; | else return &gotoblas_OPTERON; | ||||
| } else if (exfamily == 5) { | |||||
| } else if (exfamily == 5 || exfamily == 7) { | |||||
| return &gotoblas_BOBCAT; | return &gotoblas_BOBCAT; | ||||
| } else if (exfamily == 6) { | } else if (exfamily == 6) { | ||||
| if(model == 1){ | if(model == 1){ | ||||
| @@ -710,7 +710,7 @@ static gotoblas_t *get_coretype(void){ | |||||
| } | } | ||||
| } | } | ||||
| } else if (exfamily == 8) { | } else if (exfamily == 8) { | ||||
| if (model == 1 || model == 8) { | |||||
| /* if (model == 1 || model == 8) */ { | |||||
| if(support_avx()) | if(support_avx()) | ||||
| return &gotoblas_ZEN; | return &gotoblas_ZEN; | ||||
| else{ | else{ | ||||
| @@ -718,16 +718,24 @@ static gotoblas_t *get_coretype(void){ | |||||
| return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. | return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. | ||||
| } | } | ||||
| } | } | ||||
| } else if (exfamily == 9) { | |||||
| } else if (exfamily == 9) { | |||||
| if(support_avx()) | if(support_avx()) | ||||
| return &gotoblas_ZEN; | return &gotoblas_ZEN; | ||||
| else{ | else{ | ||||
| openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); | openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); | ||||
| return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. | return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. | ||||
| } | |||||
| } | |||||
| } else if (exfamily == 10) { | |||||
| if(support_avx()) | |||||
| return &gotoblas_ZEN; | |||||
| else{ | |||||
| openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); | |||||
| return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. | |||||
| } | |||||
| }else { | }else { | ||||
| return &gotoblas_BARCELONA; | return &gotoblas_BARCELONA; | ||||
| } | } | ||||
| } | } | ||||
| } | } | ||||
| @@ -53,10 +53,11 @@ extern gotoblas_t gotoblas_THUNDERX2T99; | |||||
| extern gotoblas_t gotoblas_TSV110; | extern gotoblas_t gotoblas_TSV110; | ||||
| extern gotoblas_t gotoblas_EMAG8180; | extern gotoblas_t gotoblas_EMAG8180; | ||||
| extern gotoblas_t gotoblas_NEOVERSEN1; | extern gotoblas_t gotoblas_NEOVERSEN1; | ||||
| extern gotoblas_t gotoblas_THUNDERX3T110; | |||||
| extern void openblas_warning(int verbose, const char * msg); | extern void openblas_warning(int verbose, const char * msg); | ||||
| #define NUM_CORETYPES 11 | |||||
| #define NUM_CORETYPES 12 | |||||
| /* | /* | ||||
| * In case asm/hwcap.h is outdated on the build system, make sure | * In case asm/hwcap.h is outdated on the build system, make sure | ||||
| @@ -82,6 +83,7 @@ static char *corename[] = { | |||||
| "tsv110", | "tsv110", | ||||
| "emag8180", | "emag8180", | ||||
| "neoversen1", | "neoversen1", | ||||
| "thunderx3t110", | |||||
| "unknown" | "unknown" | ||||
| }; | }; | ||||
| @@ -97,6 +99,7 @@ char *gotoblas_corename(void) { | |||||
| if (gotoblas == &gotoblas_TSV110) return corename[ 8]; | if (gotoblas == &gotoblas_TSV110) return corename[ 8]; | ||||
| if (gotoblas == &gotoblas_EMAG8180) return corename[ 9]; | if (gotoblas == &gotoblas_EMAG8180) return corename[ 9]; | ||||
| if (gotoblas == &gotoblas_NEOVERSEN1) return corename[10]; | if (gotoblas == &gotoblas_NEOVERSEN1) return corename[10]; | ||||
| if (gotoblas == &gotoblas_THUNDERX3T110) return corename[11]; | |||||
| return corename[NUM_CORETYPES]; | return corename[NUM_CORETYPES]; | ||||
| } | } | ||||
| @@ -127,6 +130,7 @@ static gotoblas_t *force_coretype(char *coretype) { | |||||
| case 8: return (&gotoblas_TSV110); | case 8: return (&gotoblas_TSV110); | ||||
| case 9: return (&gotoblas_EMAG8180); | case 9: return (&gotoblas_EMAG8180); | ||||
| case 10: return (&gotoblas_NEOVERSEN1); | case 10: return (&gotoblas_NEOVERSEN1); | ||||
| case 11: return (&gotoblas_THUNDERX3T110); | |||||
| } | } | ||||
| snprintf(message, 128, "Core not found: %s\n", coretype); | snprintf(message, 128, "Core not found: %s\n", coretype); | ||||
| openblas_warning(1, message); | openblas_warning(1, message); | ||||
| @@ -190,6 +194,8 @@ static gotoblas_t *get_coretype(void) { | |||||
| return &gotoblas_THUNDERX; | return &gotoblas_THUNDERX; | ||||
| case 0x0af: // ThunderX2 | case 0x0af: // ThunderX2 | ||||
| return &gotoblas_THUNDERX2T99; | return &gotoblas_THUNDERX2T99; | ||||
| case 0x0b8: // ThunderX3 | |||||
| return &gotoblas_THUNDERX3T110; | |||||
| } | } | ||||
| break; | break; | ||||
| case 0x48: // HiSilicon | case 0x48: // HiSilicon | ||||
| @@ -1174,6 +1174,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define CORENAME "EMAG8180" | #define CORENAME "EMAG8180" | ||||
| #endif | #endif | ||||
| /* Forced target THUNDERX3T110: fixed architecture/core names and the cache parameters passed through ARCHCONFIG. */ | |||||
| #ifdef FORCE_THUNDERX3T110 | |||||
| #define ARMV8 | |||||
| #define FORCE | |||||
| #define ARCHITECTURE "ARM64" | |||||
| #define SUBARCHITECTURE "THUNDERX3T110" | |||||
| #define SUBDIRNAME "arm64" | |||||
| #define ARCHCONFIG "-DTHUNDERX3T110 " \ | |||||
| "-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=8 " \ | |||||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=8 " \ | |||||
| "-DL2_SIZE=524288 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \ | |||||
| "-DL3_SIZE=94371840 -DL3_LINESIZE=64 -DL3_ASSOCIATIVE=32 " \ | |||||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||||
| "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" | |||||
| #define LIBNAME "thunderx3t110" | |||||
| #define CORENAME "THUNDERX3T110" | |||||
| #endif | |||||
| #ifdef FORCE_ZARCH_GENERIC | #ifdef FORCE_ZARCH_GENERIC | ||||
| #define FORCE | #define FORCE | ||||
| #define ARCHITECTURE "ZARCH" | #define ARCHITECTURE "ZARCH" | ||||
| @@ -42,7 +42,7 @@ | |||||
| #include "functable.h" | #include "functable.h" | ||||
| #endif | #endif | ||||
| #if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8) | |||||
| #if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8) || defined(THUNDERX3T110) | |||||
| // Multithreaded swap gives performance benefits in ThunderX2T99 | // Multithreaded swap gives performance benefits in ThunderX2T99 | ||||
| #else | #else | ||||
| // Disable multi-threading as it does not show any performance | // Disable multi-threading as it does not show any performance | ||||
| @@ -42,7 +42,7 @@ | |||||
| #include "functable.h" | #include "functable.h" | ||||
| #endif | #endif | ||||
| #if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8) | |||||
| #if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8) || defined(THUNDERX3T110) | |||||
| // Multithreaded swap gives performance benefits in ThunderX2T99 | // Multithreaded swap gives performance benefits in ThunderX2T99 | ||||
| #else | #else | ||||
| // Disable multi-threading as it does not show any performance | // Disable multi-threading as it does not show any performance | ||||
| @@ -10,6 +10,11 @@ ifeq ($(C_COMPILER), GCC) | |||||
| GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) | GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) | ||||
| endif | endif | ||||
| # clang targeting power: append -fno-integrated-as to CFLAGS. | |||||
| # "override" keeps the flag even when CFLAGS is given on the make command line. | |||||
| ifeq ($(C_COMPILER), CLANG) | |||||
| ifeq ($(ARCH), power) | |||||
| override CFLAGS += -fno-integrated-as | |||||
| endif | |||||
| endif | |||||
| AVX2OPT = | AVX2OPT = | ||||
| ifeq ($(C_COMPILER), GCC) | ifeq ($(C_COMPILER), GCC) | ||||
| # AVX2 support was added in 4.7.0 | # AVX2 support was added in 4.7.0 | ||||
| @@ -44,8 +44,10 @@ USE_TRMM = 1 | |||||
| endif | endif | ||||
| ifeq ($(CORE), POWER8) | ifeq ($(CORE), POWER8) | ||||
| ifeq ($(BINARY64),1) | |||||
| USE_TRMM = 1 | USE_TRMM = 1 | ||||
| endif | endif | ||||
| endif | |||||
| ifeq ($(CORE), POWER9) | ifeq ($(CORE), POWER9) | ||||
| USE_TRMM = 1 | USE_TRMM = 1 | ||||
| @@ -48,10 +48,12 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA | |||||
| dot[0]=0.0; | dot[0]=0.0; | ||||
| dot[1]=0.0; | dot[1]=0.0; | ||||
| #if !defined(__PPC__) | |||||
| CREAL(result) = 0.0 ; | CREAL(result) = 0.0 ; | ||||
| CIMAG(result) = 0.0 ; | CIMAG(result) = 0.0 ; | ||||
| #else | |||||
| result = OPENBLAS_MAKE_COMPLEX_FLOAT(0.0,0.0); | |||||
| #endif | |||||
| if ( n < 1 ) return(result); | if ( n < 1 ) return(result); | ||||
| inc_x2 = 2 * inc_x ; | inc_x2 = 2 * inc_x ; | ||||
| @@ -71,8 +73,12 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA | |||||
| i++ ; | i++ ; | ||||
| } | } | ||||
| CREAL(result) = dot[0]; | |||||
| /* Use the same condition as the zero-initialisation of result earlier in this function (__PPC__), */ | |||||
| /* while still honouring __POWER__, so both halves pick the same way of building the complex value. */ | |||||
| #if !defined(__PPC__) && !defined(__POWER__) | |||||
| CREAL(result) = dot[0]; | |||||
| CIMAG(result) = dot[1]; | CIMAG(result) = dot[1]; | ||||
| #else | |||||
| result = OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0],dot[1]); | |||||
| #endif | |||||
| return(result); | return(result); | ||||
| } | } | ||||
| @@ -0,0 +1,184 @@ | |||||
| # TRSM kernels (LN/LT/RN/RT): generic C sources for all four precisions. | |||||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| # I?AMIN / I?MAX / I?MIN kernels: C sources from ../arm. | |||||
| ISAMINKERNEL = ../arm/iamin.c | |||||
| IDAMINKERNEL = ../arm/iamin.c | |||||
| ICAMINKERNEL = ../arm/izamin.c | |||||
| IZAMINKERNEL = ../arm/izamin.c | |||||
| ISMAXKERNEL = ../arm/imax.c | |||||
| IDMAXKERNEL = ../arm/imax.c | |||||
| ISMINKERNEL = ../arm/imin.c | |||||
| IDMINKERNEL = ../arm/imin.c | |||||
| # ?AMIN / ?MAX / ?MIN kernels: C sources from ../arm. | |||||
| SAMINKERNEL = ../arm/amin.c | |||||
| DAMINKERNEL = ../arm/amin.c | |||||
| CAMINKERNEL = ../arm/zamin.c | |||||
| ZAMINKERNEL = ../arm/zamin.c | |||||
| SMAXKERNEL = ../arm/max.c | |||||
| DMAXKERNEL = ../arm/max.c | |||||
| SMINKERNEL = ../arm/min.c | |||||
| DMINKERNEL = ../arm/min.c | |||||
| # GEMV kernels (N and T variants): assembly (.S) sources. | |||||
| SGEMVNKERNEL = gemv_n.S | |||||
| DGEMVNKERNEL = gemv_n.S | |||||
| CGEMVNKERNEL = zgemv_n.S | |||||
| ZGEMVNKERNEL = zgemv_n.S | |||||
| SGEMVTKERNEL = gemv_t.S | |||||
| DGEMVTKERNEL = gemv_t.S | |||||
| CGEMVTKERNEL = zgemv_t.S | |||||
| ZGEMVTKERNEL = zgemv_t.S | |||||
| # AMAX, AXPY, ROT and SCAL kernels: assembly (.S) sources; only DAXPY uses a thunderx2t99-specific file. | |||||
| SAMAXKERNEL = amax.S | |||||
| DAMAXKERNEL = amax.S | |||||
| CAMAXKERNEL = zamax.S | |||||
| ZAMAXKERNEL = zamax.S | |||||
| DAXPYKERNEL = daxpy_thunderx2t99.S | |||||
| SAXPYKERNEL = axpy.S | |||||
| CAXPYKERNEL = zaxpy.S | |||||
| ZAXPYKERNEL = zaxpy.S | |||||
| SROTKERNEL = rot.S | |||||
| DROTKERNEL = rot.S | |||||
| CROTKERNEL = zrot.S | |||||
| ZROTKERNEL = zrot.S | |||||
| SSCALKERNEL = scal.S | |||||
| DSCALKERNEL = scal.S | |||||
| CSCALKERNEL = zscal.S | |||||
| ZSCALKERNEL = zscal.S | |||||
| # SGEMM packing routines and STRMM kernel, named after SGEMM_UNROLL_M / SGEMM_UNROLL_N. | |||||
| # ONCOPY/OTCOPY are always set and use the generic C sources for SGEMM_UNROLL_N. | |||||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c | |||||
| SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c | |||||
| # INCOPY/ITCOPY are set only when the two unroll factors differ. | |||||
| ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) | |||||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c | |||||
| SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c | |||||
| endif | |||||
| STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S | |||||
| # DGEMM packing routines and DTRMM kernel, selected from DGEMM_UNROLL_M / DGEMM_UNROLL_N. | |||||
| # ONCOPY/OTCOPY: assembly sources when DGEMM_UNROLL_N is 4, generic C sources otherwise. | |||||
| ifneq ($(DGEMM_UNROLL_N), 4) | |||||
| DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c | |||||
| DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c | |||||
| else | |||||
| DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S | |||||
| DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S | |||||
| endif | |||||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| # INCOPY/ITCOPY are set only when the two unroll factors differ: | |||||
| # assembly sources when DGEMM_UNROLL_M is 8, generic C sources otherwise. | |||||
| ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) | |||||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| ifneq ($(DGEMM_UNROLL_M), 8) | |||||
| DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c | |||||
| DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c | |||||
| else | |||||
| DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S | |||||
| DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S | |||||
| endif | |||||
| endif | |||||
| DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S | |||||
| # CGEMM packing routines and CTRMM kernel, named after CGEMM_UNROLL_M / CGEMM_UNROLL_N. | |||||
| # ONCOPY/OTCOPY are always set and use the generic zgemm copy sources for CGEMM_UNROLL_N. | |||||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c | |||||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c | |||||
| # INCOPY/ITCOPY are set only when the two unroll factors differ. | |||||
| ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) | |||||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c | |||||
| CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c | |||||
| endif | |||||
| CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | |||||
| # ZGEMM packing routines and ZTRMM kernel, named after ZGEMM_UNROLL_M / ZGEMM_UNROLL_N. | |||||
| # ONCOPY/OTCOPY are always set and use the generic zgemm copy sources for ZGEMM_UNROLL_N. | |||||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c | |||||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c | |||||
| # INCOPY/ITCOPY are set only when the two unroll factors differ. | |||||
| ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) | |||||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c | |||||
| ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c | |||||
| endif | |||||
| ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | |||||
| # Level-1 kernels taken from the *_thunderx2t99 sources (DSDOT uses the plain dot.S). | |||||
| # DOT | |||||
| SDOTKERNEL = dot_thunderx2t99.c | |||||
| DDOTKERNEL = dot_thunderx2t99.c | |||||
| CDOTKERNEL = zdot_thunderx2t99.c | |||||
| ZDOTKERNEL = zdot_thunderx2t99.c | |||||
| DSDOTKERNEL = dot.S | |||||
| # NRM2: the dznrm2_thunderx2t99_fast.c assignments are left commented out. | |||||
| SNRM2KERNEL = scnrm2_thunderx2t99.c | |||||
| CNRM2KERNEL = scnrm2_thunderx2t99.c | |||||
| #DNRM2KERNEL = dznrm2_thunderx2t99_fast.c | |||||
| #ZNRM2KERNEL = dznrm2_thunderx2t99_fast.c | |||||
| DNRM2KERNEL = dznrm2_thunderx2t99.c | |||||
| ZNRM2KERNEL = dznrm2_thunderx2t99.c | |||||
| # I?AMAX | |||||
| ISAMAXKERNEL = iamax_thunderx2t99.c | |||||
| IDAMAXKERNEL = iamax_thunderx2t99.c | |||||
| ICAMAXKERNEL = izamax_thunderx2t99.c | |||||
| IZAMAXKERNEL = izamax_thunderx2t99.c | |||||
| # ASUM | |||||
| SASUMKERNEL = sasum_thunderx2t99.c | |||||
| DASUMKERNEL = dasum_thunderx2t99.c | |||||
| CASUMKERNEL = casum_thunderx2t99.c | |||||
| ZASUMKERNEL = zasum_thunderx2t99.c | |||||
| # COPY | |||||
| SCOPYKERNEL = copy_thunderx2t99.c | |||||
| DCOPYKERNEL = copy_thunderx2t99.c | |||||
| CCOPYKERNEL = copy_thunderx2t99.c | |||||
| ZCOPYKERNEL = copy_thunderx2t99.c | |||||
| # SWAP | |||||
| SSWAPKERNEL = swap_thunderx2t99.S | |||||
| DSWAPKERNEL = swap_thunderx2t99.S | |||||
| CSWAPKERNEL = swap_thunderx2t99.S | |||||
| ZSWAPKERNEL = swap_thunderx2t99.S | |||||
| # Each GEMM kernel below is assigned only when the configured MxN unroll pair matches the | |||||
| # tile size the assembly file is named for; otherwise this section leaves the variable as it was. | |||||
| ifeq ($(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N), 8x4) | |||||
| DGEMMKERNEL = dgemm_kernel_8x4_thunderx2t99.S | |||||
| endif | |||||
| ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 16x4) | |||||
| SGEMMKERNEL = sgemm_kernel_16x4_thunderx2t99.S | |||||
| endif | |||||
| ifeq ($(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N), 8x4) | |||||
| CGEMMKERNEL = cgemm_kernel_8x4_thunderx2t99.S | |||||
| endif | |||||
| ifeq ($(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N), 4x4) | |||||
| ZGEMMKERNEL = zgemm_kernel_4x4_thunderx2t99.S | |||||
| endif | |||||
| @@ -1,3 +1,44 @@ | |||||
| # Big-endian 32-bit builds (AIX) are supported through the POWER6 GEMM kernels; no separate TRMM kernels are set here. | |||||
| # The test concatenates $(__BYTE_ORDER__) and $(BINARY32), so it matches only big-endian with BINARY32 = 1. | |||||
| ifeq ($(__BYTE_ORDER__)$(BINARY32),__ORDER_BIG_ENDIAN__1) | |||||
| # SGEMM: the INCOPY/ITCOPY sources and objects are assigned empty values; only ONCOPY/OTCOPY are populated. | |||||
| SGEMMKERNEL = gemm_kernel_power6.S | |||||
| SGEMMINCOPY = | |||||
| SGEMMITCOPY = | |||||
| SGEMMONCOPY = gemm_ncopy_4.S | |||||
| SGEMMOTCOPY = gemm_tcopy_4.S | |||||
| SGEMMINCOPYOBJ = | |||||
| SGEMMITCOPYOBJ = | |||||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| # DGEMM: same layout as SGEMM (same kernel and copy source files, empty INCOPY/ITCOPY). | |||||
| DGEMMKERNEL = gemm_kernel_power6.S | |||||
| DGEMMINCOPY = | |||||
| DGEMMITCOPY = | |||||
| DGEMMONCOPY = gemm_ncopy_4.S | |||||
| DGEMMOTCOPY = gemm_tcopy_4.S | |||||
| DGEMMINCOPYOBJ = | |||||
| DGEMMITCOPYOBJ = | |||||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| # CGEMM: shares zgemm_kernel_power6.S with ZGEMM; all four copy routines use generic C sources | |||||
| # (size 2 for INCOPY/ITCOPY, size 4 for ONCOPY/OTCOPY). | |||||
| CGEMMKERNEL = zgemm_kernel_power6.S | |||||
| CGEMMINCOPY = ../generic/zgemm_ncopy_2.c | |||||
| CGEMMITCOPY = ../generic/zgemm_tcopy_2.c | |||||
| CGEMMONCOPY = ../generic/zgemm_ncopy_4.c | |||||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c | |||||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| # ZGEMM: same kernel and generic copy sources as CGEMM, with zgemm_* object names. | |||||
| ZGEMMKERNEL = zgemm_kernel_power6.S | |||||
| ZGEMMINCOPY = ../generic/zgemm_ncopy_2.c | |||||
| ZGEMMITCOPY = ../generic/zgemm_tcopy_2.c | |||||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c | |||||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c | |||||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| # Every other configuration falls through to the pre-existing kernel settings that follow. | |||||
| else | |||||
| #SGEMM_BETA = ../generic/gemm_beta.c | #SGEMM_BETA = ../generic/gemm_beta.c | ||||
| #DGEMM_BETA = ../generic/gemm_beta.c | #DGEMM_BETA = ../generic/gemm_beta.c | ||||
| #CGEMM_BETA = ../generic/zgemm_beta.c | #CGEMM_BETA = ../generic/zgemm_beta.c | ||||
| @@ -47,16 +88,24 @@ ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | ||||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | ||||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | ||||
| endif | |||||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | ||||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | ||||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | ||||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | ||||
| # Big-endian builds with BINARY32 = 1 use the POWER6 assembly TRSM kernels. | |||||
| # Note that DTRSMKERNEL_RN is assigned the same LT source file as DTRSMKERNEL_LT. | |||||
| # NOTE(review): presumably intentional reuse of the LT file for RN - confirm against the POWER6 kernel list. | |||||
| ifeq ($(__BYTE_ORDER__)$(BINARY32),__ORDER_BIG_ENDIAN__1) | |||||
| DTRSMKERNEL_LN = trsm_kernel_power6_LN.S | |||||
| DTRSMKERNEL_LT = trsm_kernel_power6_LT.S | |||||
| DTRSMKERNEL_RN = trsm_kernel_power6_LT.S | |||||
| DTRSMKERNEL_RT = trsm_kernel_power6_RT.S | |||||
| # All other builds: generic C kernels, except LT which uses the POWER8 16x4 assembly kernel. | |||||
| else | |||||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | ||||
| DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S | DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S | ||||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | ||||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | ||||
| endif | |||||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | ||||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | ||||
| @@ -153,6 +202,10 @@ ZASUMKERNEL = zasum.c | |||||
| # | # | ||||
| SAXPYKERNEL = saxpy.c | SAXPYKERNEL = saxpy.c | ||||
| DAXPYKERNEL = daxpy.c | DAXPYKERNEL = daxpy.c | ||||
| # | |||||
| ifeq ($(__BYTE_ORDER__)$(BINARY32),__ORDER_BIG_ENDIAN__1) | |||||
| CAXPYKERNEL = zaxpy.S | |||||
| else | |||||
| ifneq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) | ifneq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) | ||||
| ifneq ($(GCCVERSIONGTEQ9),1) | ifneq ($(GCCVERSIONGTEQ9),1) | ||||
| CAXPYKERNEL = caxpy_power8.S | CAXPYKERNEL = caxpy_power8.S | ||||
| @@ -162,6 +215,7 @@ endif | |||||
| else | else | ||||
| CAXPYKERNEL = caxpy.c | CAXPYKERNEL = caxpy.c | ||||
| endif | endif | ||||
| endif | |||||
| # | # | ||||
| ZAXPYKERNEL = zaxpy.c | ZAXPYKERNEL = zaxpy.c | ||||
| # | # | ||||
| @@ -239,4 +293,3 @@ IDAMINKERNEL = ../arm/iamin.c | |||||
| IZAMAXKERNEL = ../arm/izamax.c | IZAMAXKERNEL = ../arm/izamax.c | ||||
| IZAMINKERNEL = ../arm/izamin.c | IZAMINKERNEL = ../arm/izamin.c | ||||
| endif | endif | ||||
| @@ -47,8 +47,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | #if defined(POWER8) || defined(POWER9) || defined(POWER10) | ||||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||||
| #include "casum_microk_power8.c" | #include "casum_microk_power8.c" | ||||
| #endif | #endif | ||||
| #endif | |||||
| #ifndef HAVE_KERNEL_16 | #ifndef HAVE_KERNEL_16 | ||||
| @@ -36,8 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | #if defined(POWER8) || defined(POWER9) || defined(POWER10) | ||||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||||
| #include "ccopy_microk_power8.c" | #include "ccopy_microk_power8.c" | ||||
| #endif | #endif | ||||
| #endif | |||||
| #ifndef HAVE_KERNEL_32 | #ifndef HAVE_KERNEL_32 | ||||
| @@ -23,6 +23,9 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | ||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||||
| *****************************************************************************/ | *****************************************************************************/ | ||||
| /* Use the generic C zdot kernel only when NEITHER __VEC__ nor __ALTIVEC__ is defined. | |||||
|    This is the exact negation of the "defined(__VEC__) || defined(__ALTIVEC__)" test that | |||||
|    guards the vector code in the other POWER kernels, so a compiler that defines just one | |||||
|    of the two macros takes the same (vector) path everywhere. */ | |||||
| #if !defined(__VEC__) && !defined(__ALTIVEC__) | |||||
| #include "../arm/zdot.c" | |||||
| #else | |||||
| #include "common.h" | #include "common.h" | ||||
| #ifndef HAVE_KERNEL_8 | #ifndef HAVE_KERNEL_8 | ||||
| @@ -168,3 +171,4 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA | |||||
| return (result); | return (result); | ||||
| } | } | ||||
| #endif | |||||
| @@ -23,7 +23,10 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | ||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | ||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||||
| *****************************************************************************/ | |||||
| *****************************************************************************/ | |||||
| /* Use the generic C zgemv_n kernel only when NEITHER __VEC__ nor __ALTIVEC__ is defined. | |||||
|    This is the exact negation of the "defined(__VEC__) || defined(__ALTIVEC__)" test that | |||||
|    guards the vector code in the other POWER kernels, so a compiler that defines just one | |||||
|    of the two macros takes the same (vector) path everywhere. */ | |||||
| #if !defined(__VEC__) && !defined(__ALTIVEC__) | |||||
| #include "../arm/zgemv_n.c" | |||||
| #else | |||||
| #include <stdlib.h> | #include <stdlib.h> | ||||
| #include <stdio.h> | #include <stdio.h> | ||||
| @@ -591,4 +594,4 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, | |||||
| return (0); | return (0); | ||||
| } | } | ||||
| #endif | |||||
| @@ -23,7 +23,10 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | ||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | ||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||||
| *****************************************************************************/ | |||||
| *****************************************************************************/ | |||||
| /* Use the generic C zgemv_t kernel only when NEITHER __VEC__ nor __ALTIVEC__ is defined. | |||||
|    This is the exact negation of the "defined(__VEC__) || defined(__ALTIVEC__)" test that | |||||
|    guards the vector code in the other POWER kernels, so a compiler that defines just one | |||||
|    of the two macros takes the same (vector) path everywhere. */ | |||||
| #if !defined(__VEC__) && !defined(__ALTIVEC__) | |||||
| #include "../arm/zgemv_t.c" | |||||
| #else | |||||
| #include "common.h" | #include "common.h" | ||||
| @@ -595,4 +598,4 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, | |||||
| return (0); | return (0); | ||||
| } | } | ||||
| #endif | |||||
| @@ -28,6 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | #if defined(POWER8) || defined(POWER9) || defined(POWER10) | ||||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||||
| static void crot_kernel_8 (long n, float *x, float *y, float c, float s) | static void crot_kernel_8 (long n, float *x, float *y, float c, float s) | ||||
| { | { | ||||
| @@ -169,6 +170,7 @@ static void crot_kernel_8 (long n, float *x, float *y, float c, float s) | |||||
| } | } | ||||
| #endif | #endif | ||||
| #endif | |||||
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) | int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) | ||||
| @@ -183,7 +185,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT | |||||
| if ( (inc_x == 1) && (inc_y == 1) ) | if ( (inc_x == 1) && (inc_y == 1) ) | ||||
| { | { | ||||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||||
| BLASLONG n1 = n & -8; | BLASLONG n1 = n & -8; | ||||
| if ( n1 > 0 ) | if ( n1 > 0 ) | ||||
| { | { | ||||
| @@ -191,7 +193,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT | |||||
| i=n1; | i=n1; | ||||
| ix=2*n1; | ix=2*n1; | ||||
| } | } | ||||
| #endif | |||||
| while(i < n) | while(i < n) | ||||
| { | { | ||||
| temp[0] = c*x[ix] + s*y[ix] ; | temp[0] = c*x[ix] + s*y[ix] ; | ||||
| @@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | #if defined(POWER8) || defined(POWER9) || defined(POWER10) | ||||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||||
| #include "cswap_microk_power8.c" | #include "cswap_microk_power8.c" | ||||
| #endif | #endif | ||||
| #endif | |||||
| #ifndef HAVE_KERNEL_32 | #ifndef HAVE_KERNEL_32 | ||||
| @@ -47,8 +47,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | #if defined(POWER8) || defined(POWER9) || defined(POWER10) | ||||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||||
| #include "dasum_microk_power8.c" | #include "dasum_microk_power8.c" | ||||
| #endif | #endif | ||||
| #endif | |||||
| #ifndef HAVE_KERNEL_16 | #ifndef HAVE_KERNEL_16 | ||||
| @@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | #if defined(POWER8) || defined(POWER9) || defined(POWER10) | ||||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||||
| #include "daxpy_microk_power8.c" | #include "daxpy_microk_power8.c" | ||||
| #endif | #endif | ||||
| #endif | |||||
| #ifndef HAVE_KERNEL_8 | #ifndef HAVE_KERNEL_8 | ||||
| @@ -36,8 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | #if defined(POWER8) || defined(POWER9) || defined(POWER10) | ||||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||||
| #include "dcopy_microk_power8.c" | #include "dcopy_microk_power8.c" | ||||
| #endif | #endif | ||||
| #endif | |||||
| #ifndef HAVE_KERNEL_32 | #ifndef HAVE_KERNEL_32 | ||||
| @@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | #if defined(POWER8) || defined(POWER9) || defined(POWER10) | ||||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||||
| #include "ddot_microk_power8.c" | #include "ddot_microk_power8.c" | ||||
| #endif | #endif | ||||
| #endif | |||||
| #ifndef HAVE_KERNEL_8 | #ifndef HAVE_KERNEL_8 | ||||
| @@ -27,64 +27,64 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #include <altivec.h> | #include <altivec.h> | ||||
| typedef unsigned char vec_t __attribute__ ((vector_size (16))); | |||||
| typedef __vector unsigned char vec_t; | |||||
| typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); | typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); | ||||
| typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); | typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); | ||||
| #ifdef TRMMKERNEL | #ifdef TRMMKERNEL | ||||
| #define SAVE_ACC(ACC, J) \ | #define SAVE_ACC(ACC, J) \ | ||||
| __builtin_mma_disassemble_acc (result, ACC); \ | |||||
| __builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||||
| rowC = (v4sf_t *) &CO[0* ldc+J]; \ | rowC = (v4sf_t *) &CO[0* ldc+J]; \ | ||||
| rowC[0] = result[3] * alpha; \ | |||||
| rowC[0] = result[0] * alpha; \ | |||||
| rowC = (v4sf_t *) &CO[1*ldc+J]; \ | rowC = (v4sf_t *) &CO[1*ldc+J]; \ | ||||
| rowC[0] = result[2] * alpha; \ | |||||
| rowC = (v4sf_t *) &CO[2*ldc+J]; \ | |||||
| rowC[0] = result[1] * alpha; \ | rowC[0] = result[1] * alpha; \ | ||||
| rowC = (v4sf_t *) &CO[2*ldc+J]; \ | |||||
| rowC[0] = result[2] * alpha; \ | |||||
| rowC = (v4sf_t *) &CO[3*ldc+J]; \ | rowC = (v4sf_t *) &CO[3*ldc+J]; \ | ||||
| rowC[0] = result[0] * alpha; | |||||
| rowC[0] = result[3] * alpha; | |||||
| #define SAVE_ACC1(ACC, J) \ | #define SAVE_ACC1(ACC, J) \ | ||||
| __builtin_mma_disassemble_acc (result, ACC); \ | |||||
| __builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||||
| rowC = (v4sf_t *) &CO[4* ldc+J]; \ | rowC = (v4sf_t *) &CO[4* ldc+J]; \ | ||||
| rowC[0] = result[3] * alpha; \ | |||||
| rowC[0] = result[0] * alpha; \ | |||||
| rowC = (v4sf_t *) &CO[5*ldc+J]; \ | rowC = (v4sf_t *) &CO[5*ldc+J]; \ | ||||
| rowC[0] = result[2] * alpha; \ | |||||
| rowC = (v4sf_t *) &CO[6*ldc+J]; \ | |||||
| rowC[0] = result[1] * alpha; \ | rowC[0] = result[1] * alpha; \ | ||||
| rowC = (v4sf_t *) &CO[6*ldc+J]; \ | |||||
| rowC[0] = result[2] * alpha; \ | |||||
| rowC = (v4sf_t *) &CO[7*ldc+J]; \ | rowC = (v4sf_t *) &CO[7*ldc+J]; \ | ||||
| rowC[0] = result[0] * alpha; | |||||
| rowC[0] = result[3] * alpha; | |||||
| #define SAVE2x4_ACC(ACC, J) \ | #define SAVE2x4_ACC(ACC, J) \ | ||||
| __builtin_mma_disassemble_acc (result, ACC); \ | |||||
| __builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||||
| rowC = (v4sf_t *) &CO[0* ldc+J]; \ | rowC = (v4sf_t *) &CO[0* ldc+J]; \ | ||||
| rowC[0] = result[3] * alpha; \ | |||||
| rowC[0] = result[0] * alpha; \ | |||||
| rowC = (v4sf_t *) &CO[1* ldc+J]; \ | rowC = (v4sf_t *) &CO[1* ldc+J]; \ | ||||
| rowC[0] = result[2] * alpha; | |||||
| rowC[0] = result[1] * alpha; | |||||
| #else | #else | ||||
| #define SAVE_ACC(ACC, J) \ | #define SAVE_ACC(ACC, J) \ | ||||
| __builtin_mma_disassemble_acc (result, ACC); \ | |||||
| __builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||||
| rowC = (v4sf_t *) &CO[0* ldc+J]; \ | rowC = (v4sf_t *) &CO[0* ldc+J]; \ | ||||
| rowC[0] += result[3] * alpha; \ | |||||
| rowC[0] += result[0] * alpha; \ | |||||
| rowC = (v4sf_t *) &CO[1*ldc+J]; \ | rowC = (v4sf_t *) &CO[1*ldc+J]; \ | ||||
| rowC[0] += result[2] * alpha; \ | |||||
| rowC = (v4sf_t *) &CO[2*ldc+J]; \ | |||||
| rowC[0] += result[1] * alpha; \ | rowC[0] += result[1] * alpha; \ | ||||
| rowC = (v4sf_t *) &CO[2*ldc+J]; \ | |||||
| rowC[0] += result[2] * alpha; \ | |||||
| rowC = (v4sf_t *) &CO[3*ldc+J]; \ | rowC = (v4sf_t *) &CO[3*ldc+J]; \ | ||||
| rowC[0] += result[0] * alpha; | |||||
| rowC[0] += result[3] * alpha; | |||||
| #define SAVE_ACC1(ACC, J) \ | #define SAVE_ACC1(ACC, J) \ | ||||
| __builtin_mma_disassemble_acc (result, ACC); \ | |||||
| __builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||||
| rowC = (v4sf_t *) &CO[4* ldc+J]; \ | rowC = (v4sf_t *) &CO[4* ldc+J]; \ | ||||
| rowC[0] += result[3] * alpha; \ | |||||
| rowC[0] += result[0] * alpha; \ | |||||
| rowC = (v4sf_t *) &CO[5*ldc+J]; \ | rowC = (v4sf_t *) &CO[5*ldc+J]; \ | ||||
| rowC[0] += result[2] * alpha; \ | |||||
| rowC = (v4sf_t *) &CO[6*ldc+J]; \ | |||||
| rowC[0] += result[1] * alpha; \ | rowC[0] += result[1] * alpha; \ | ||||
| rowC = (v4sf_t *) &CO[6*ldc+J]; \ | |||||
| rowC[0] += result[2] * alpha; \ | |||||
| rowC = (v4sf_t *) &CO[7*ldc+J]; \ | rowC = (v4sf_t *) &CO[7*ldc+J]; \ | ||||
| rowC[0] += result[0] * alpha; | |||||
| rowC[0] += result[3] * alpha; | |||||
| #define SAVE2x4_ACC(ACC, J) \ | #define SAVE2x4_ACC(ACC, J) \ | ||||
| __builtin_mma_disassemble_acc (result, ACC); \ | |||||
| __builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||||
| rowC = (v4sf_t *) &CO[0* ldc+J]; \ | rowC = (v4sf_t *) &CO[0* ldc+J]; \ | ||||
| rowC[0] += result[3] * alpha; \ | |||||
| rowC[0] += result[0] * alpha; \ | |||||
| rowC = (v4sf_t *) &CO[1* ldc+J]; \ | rowC = (v4sf_t *) &CO[1* ldc+J]; \ | ||||
| rowC[0] += result[2] * alpha; | |||||
| rowC[0] += result[1] * alpha; | |||||
| #endif | #endif | ||||
| #define SET_ACC_ZERO4() \ | #define SET_ACC_ZERO4() \ | ||||
| @@ -39,8 +39,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | #if defined(POWER8) || defined(POWER9) || defined(POWER10) | ||||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||||
| #include "dgemv_n_microk_power8.c" | #include "dgemv_n_microk_power8.c" | ||||
| #endif | #endif | ||||
| #endif | |||||
| #define NBMAX 4096 | #define NBMAX 4096 | ||||
| @@ -25,15 +25,19 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||||
| *****************************************************************************/ | *****************************************************************************/ | ||||
| /* Use the generic C gemv_t kernel only when NEITHER __VEC__ nor __ALTIVEC__ is defined. | |||||
|    This is the exact negation of the "defined(__VEC__) || defined(__ALTIVEC__)" test that | |||||
|    guards the vector code in the other POWER kernels, so a compiler that defines just one | |||||
|    of the two macros takes the same (vector) path everywhere. */ | |||||
| #if !defined(__VEC__) && !defined(__ALTIVEC__) | |||||
| #include "../arm/gemv_t.c" | |||||
| #else | |||||
| #include "common.h" | #include "common.h" | ||||
| #define NBMAX 1024 | #define NBMAX 1024 | ||||
| //#define PREFETCH 1 | //#define PREFETCH 1 | ||||
| #include <altivec.h> | #include <altivec.h> | ||||
| #define HAVE_KERNEL4x8_ASM 1 | #define HAVE_KERNEL4x8_ASM 1 | ||||
| #if defined(HAVE_KERNEL4x8_ASM) | #if defined(HAVE_KERNEL4x8_ASM) | ||||
| static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, double *y, double alpha) { | static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, double *y, double alpha) { | ||||
| @@ -355,7 +359,7 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do | |||||
| "stxvd2x 39, %[off], %[y] \n\t" | "stxvd2x 39, %[off], %[y] \n\t" | ||||
| "stxvd2x 40, %[off2], %[y] \n\t" | "stxvd2x 40, %[off2], %[y] \n\t" | ||||
| : [memy] "+m" (*(const double (*)[8])y), | |||||
| : [memy] "+m" (*(double (*)[8])y), | |||||
| [n] "+&r" (n), | [n] "+&r" (n), | ||||
| [a0] "=b" (a0), | [a0] "=b" (a0), | ||||
| [a1] "=&b" (a1), | [a1] "=&b" (a1), | ||||
| @@ -369,7 +373,7 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do | |||||
| [off2]"=&b" (off2), | [off2]"=&b" (off2), | ||||
| [temp] "=&b" (tempR) | [temp] "=&b" (tempR) | ||||
| : [memx] "m" (*(const double (*)[n])x), | : [memx] "m" (*(const double (*)[n])x), | ||||
| [mem_ap] "m" (*(const double (*)[]) ap), | |||||
| [mem_ap] "m" (*(const double (*)[n*8]) ap), | |||||
| [alpha] "d" (alpha), | [alpha] "d" (alpha), | ||||
| "[a0]" (ap), | "[a0]" (ap), | ||||
| [x] "b" (x), | [x] "b" (x), | ||||
| @@ -883,4 +887,5 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO | |||||
| return (0); | return (0); | ||||
| } | } | ||||
| #endif | |||||
| @@ -40,8 +40,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #pragma GCC optimize "O1" | #pragma GCC optimize "O1" | ||||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | #if defined(POWER8) || defined(POWER9) || defined(POWER10) | ||||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||||
| #include "drot_microk_power8.c" | #include "drot_microk_power8.c" | ||||
| #endif | #endif | ||||
| #endif | |||||
| #ifndef HAVE_KERNEL_16 | #ifndef HAVE_KERNEL_16 | ||||
| @@ -36,8 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | #if defined(POWER8) || defined(POWER9) || defined(POWER10) | ||||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||||
| #include "dscal_microk_power8.c" | #include "dscal_microk_power8.c" | ||||
| #endif | #endif | ||||
| #endif | |||||
| #if !defined(HAVE_KERNEL_8) | #if !defined(HAVE_KERNEL_8) | ||||
| @@ -36,8 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | #if defined(POWER8) || defined(POWER9) || defined(POWER10) | ||||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||||
| #include "dswap_microk_power8.c" | #include "dswap_microk_power8.c" | ||||
| #endif | #endif | ||||
| #endif | |||||
| #ifndef HAVE_KERNEL_32 | #ifndef HAVE_KERNEL_32 | ||||
| @@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | *****************************************************************************/ | ||||
| #include "common.h" | #include "common.h" | ||||
| #include <math.h> | #include <math.h> | ||||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||||
| #include <altivec.h> | #include <altivec.h> | ||||
| #endif | |||||
| #if defined(DOUBLE) | #if defined(DOUBLE) | ||||
| #define ABS fabs | #define ABS fabs | ||||
| @@ -37,6 +40,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||||
| /** | /** | ||||
| * Find maximum index | * Find maximum index | ||||
| * Warning: requirements n>0 and n % 32 == 0 | * Warning: requirements n>0 and n % 32 == 0 | ||||
| @@ -313,6 +318,7 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { | |||||
| return index; | return index; | ||||
| } | } | ||||
| #endif | |||||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | ||||
| BLASLONG i = 0; | BLASLONG i = 0; | ||||
| @@ -326,12 +332,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||||
| BLASLONG n1 = n & -32; | BLASLONG n1 = n & -32; | ||||
| #if defined(_CALL_ELF) && (_CALL_ELF == 2) | #if defined(_CALL_ELF) && (_CALL_ELF == 2) | ||||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||||
| if (n1 > 0) { | if (n1 > 0) { | ||||
| max = diamax_kernel_32(n1, x, &maxf); | max = diamax_kernel_32(n1, x, &maxf); | ||||
| i = n1; | i = n1; | ||||
| } | } | ||||
| #endif | |||||
| #endif | #endif | ||||
| while (i < n) { | while (i < n) { | ||||
| if (ABS(x[i]) > maxf) { | if (ABS(x[i]) > maxf) { | ||||
| @@ -37,6 +37,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||||
| /** | /** | ||||
| * Find minimum index | * Find minimum index | ||||
| * Warning: requirements n>0 and n % 32 == 0 | * Warning: requirements n>0 and n % 32 == 0 | ||||
| @@ -313,7 +315,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { | |||||
| return index; | return index; | ||||
| } | } | ||||
| #endif | |||||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | ||||
| @@ -327,12 +329,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||||
| if (inc_x == 1) { | if (inc_x == 1) { | ||||
| #if defined(_CALL_ELF) && (_CALL_ELF == 2) | #if defined(_CALL_ELF) && (_CALL_ELF == 2) | ||||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||||
| BLASLONG n1 = n & -32; | BLASLONG n1 = n & -32; | ||||
| if (n1 > 0) { | if (n1 > 0) { | ||||
| min = diamin_kernel_32(n1, x, &minf); | min = diamin_kernel_32(n1, x, &minf); | ||||
| i = n1; | i = n1; | ||||
| } | } | ||||
| #endif | |||||
| #endif | #endif | ||||
| while (i < n) { | while (i < n) { | ||||
| if (ABS(x[i]) < minf) { | if (ABS(x[i]) < minf) { | ||||
| @@ -34,6 +34,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||||
| /** | /** | ||||
| * Find maximum index | * Find maximum index | ||||
| @@ -299,7 +300,7 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) { | |||||
| } | } | ||||
| #endif | |||||
| @@ -317,6 +318,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| if (inc_x == 1) { | if (inc_x == 1) { | ||||
| #if defined(_CALL_ELF) && (_CALL_ELF == 2) | #if defined(_CALL_ELF) && (_CALL_ELF == 2) | ||||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||||
| BLASLONG n1 = n & -16; | BLASLONG n1 = n & -16; | ||||
| if (n1 > 0) { | if (n1 > 0) { | ||||
| @@ -324,6 +327,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| i = n1; | i = n1; | ||||
| ix = n1 << 1; | ix = n1 << 1; | ||||
| } | } | ||||
| #endif | |||||
| #endif | #endif | ||||
| while(i < n) | while(i < n) | ||||
| @@ -24,7 +24,6 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | ||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||||
| *****************************************************************************/ | *****************************************************************************/ | ||||
| #include "common.h" | #include "common.h" | ||||
| #include <math.h> | #include <math.h> | ||||
| @@ -32,6 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define ABS fabs | #define ABS fabs | ||||
| #define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) | #define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) | ||||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||||
| /** | /** | ||||
| * Find minimum index | * Find minimum index | ||||
| @@ -296,6 +296,7 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { | |||||
| return index; | return index; | ||||
| } | } | ||||
| #endif | |||||
| @@ -316,6 +317,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| minf = CABS1(x,0); //index will not be incremented | minf = CABS1(x,0); //index will not be incremented | ||||
| #if defined(_CALL_ELF) && (_CALL_ELF == 2) | #if defined(_CALL_ELF) && (_CALL_ELF == 2) | ||||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||||
| BLASLONG n1 = n & -16; | BLASLONG n1 = n & -16; | ||||
| if (n1 > 0) { | if (n1 > 0) { | ||||
| @@ -323,6 +326,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| i = n1; | i = n1; | ||||
| ix = n1 << 1; | ix = n1 << 1; | ||||
| } | } | ||||
| #endif | |||||
| #endif | #endif | ||||
| while(i < n) | while(i < n) | ||||
| @@ -359,5 +363,3 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| } | } | ||||
| } | } | ||||
| @@ -47,8 +47,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | #if defined(POWER8) || defined(POWER9) || defined(POWER10) | ||||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||||
| #include "sasum_microk_power8.c" | #include "sasum_microk_power8.c" | ||||
| #endif | #endif | ||||
| #endif | |||||
| #ifndef HAVE_KERNEL_32 | #ifndef HAVE_KERNEL_32 | ||||
| @@ -28,8 +28,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| /* Offsets offset_0 .. offset_15, in steps of 16 (0 .. 240). | |||||
|    They are passed as the first argument of vec_vsx_ld() in saxpy_kernel_64 to address | |||||
|    sixteen consecutive vectors relative to the x and y vector pointers. */ | |||||
| #define offset_0 0 | |||||
| #define offset_1 16 | |||||
| #define offset_2 32 | |||||
| #define offset_3 48 | |||||
| #define offset_4 64 | |||||
| #define offset_5 80 | |||||
| #define offset_6 96 | |||||
| #define offset_7 112 | |||||
| #define offset_8 128 | |||||
| #define offset_9 144 | |||||
| #define offset_10 160 | |||||
| #define offset_11 176 | |||||
| #define offset_12 192 | |||||
| #define offset_13 208 | |||||
| #define offset_14 224 | |||||
| #define offset_15 240 | |||||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||||
| #ifndef HAVE_KERNEL_8 | #ifndef HAVE_KERNEL_8 | ||||
| #include <altivec.h> | #include <altivec.h> | ||||
| @@ -37,12 +54,85 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha) | static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha) | ||||
| { | { | ||||
| BLASLONG i = 0; | BLASLONG i = 0; | ||||
| __vector float v_a = {alpha,alpha,alpha,alpha}; | |||||
| __vector float * v_y=(__vector float *)y; | |||||
| __vector float * v_x=(__vector float *)x; | |||||
| __vector float v_a __attribute((aligned(16))) = {alpha,alpha,alpha,alpha}; | |||||
| __vector float * vptr_y =(__vector float *)y; | |||||
| __vector float * vptr_x =(__vector float *)x; | |||||
| for(; i<n/4; i+=16){ | for(; i<n/4; i+=16){ | ||||
| register __vector float vy_0 = vec_vsx_ld( offset_0 ,vptr_y ) ; | |||||
| register __vector float vy_1 = vec_vsx_ld( offset_1 ,vptr_y ) ; | |||||
| register __vector float vy_2 = vec_vsx_ld( offset_2 ,vptr_y ) ; | |||||
| register __vector float vy_3 = vec_vsx_ld( offset_3 ,vptr_y ) ; | |||||
| register __vector float vy_4 = vec_vsx_ld( offset_4 ,vptr_y ) ; | |||||
| register __vector float vy_5 = vec_vsx_ld( offset_5 ,vptr_y ) ; | |||||
| register __vector float vy_6 = vec_vsx_ld( offset_6 ,vptr_y ) ; | |||||
| register __vector float vy_7 = vec_vsx_ld( offset_7 ,vptr_y ) ; | |||||
| register __vector float vy_8 = vec_vsx_ld( offset_8 ,vptr_y ) ; | |||||
| register __vector float vy_9 = vec_vsx_ld( offset_9 ,vptr_y ) ; | |||||
| register __vector float vy_10 = vec_vsx_ld( offset_10 ,vptr_y ) ; | |||||
| register __vector float vy_11 = vec_vsx_ld( offset_11 ,vptr_y ) ; | |||||
| register __vector float vy_12 = vec_vsx_ld( offset_12 ,vptr_y ) ; | |||||
| register __vector float vy_13 = vec_vsx_ld( offset_13 ,vptr_y ) ; | |||||
| register __vector float vy_14 = vec_vsx_ld( offset_14 ,vptr_y ) ; | |||||
| register __vector float vy_15 = vec_vsx_ld( offset_15 ,vptr_y ) ; | |||||
| register __vector float vx_0 = vec_vsx_ld( offset_0 ,vptr_x ) ; | |||||
| register __vector float vx_1 = vec_vsx_ld( offset_1 ,vptr_x ) ; | |||||
| register __vector float vx_2 = vec_vsx_ld( offset_2 ,vptr_x ) ; | |||||
| register __vector float vx_3 = vec_vsx_ld( offset_3 ,vptr_x ) ; | |||||
| register __vector float vx_4 = vec_vsx_ld( offset_4 ,vptr_x ) ; | |||||
| register __vector float vx_5 = vec_vsx_ld( offset_5 ,vptr_x ) ; | |||||
| register __vector float vx_6 = vec_vsx_ld( offset_6 ,vptr_x ) ; | |||||
| register __vector float vx_7 = vec_vsx_ld( offset_7 ,vptr_x ) ; | |||||
| register __vector float vx_8 = vec_vsx_ld( offset_8 ,vptr_x ) ; | |||||
| register __vector float vx_9 = vec_vsx_ld( offset_9 ,vptr_x ) ; | |||||
| register __vector float vx_10 = vec_vsx_ld( offset_10 ,vptr_x ) ; | |||||
| register __vector float vx_11 = vec_vsx_ld( offset_11 ,vptr_x ) ; | |||||
| register __vector float vx_12 = vec_vsx_ld( offset_12 ,vptr_x ) ; | |||||
| register __vector float vx_13 = vec_vsx_ld( offset_13 ,vptr_x ) ; | |||||
| register __vector float vx_14 = vec_vsx_ld( offset_14 ,vptr_x ) ; | |||||
| register __vector float vx_15 = vec_vsx_ld( offset_15 ,vptr_x ) ; | |||||
| vy_0 += vx_0*v_a; | |||||
| vy_1 += vx_1*v_a; | |||||
| vy_2 += vx_2*v_a; | |||||
| vy_3 += vx_3*v_a; | |||||
| vy_4 += vx_4*v_a; | |||||
| vy_5 += vx_5*v_a; | |||||
| vy_6 += vx_6*v_a; | |||||
| vy_7 += vx_7*v_a; | |||||
| vy_8 += vx_8*v_a; | |||||
| vy_9 += vx_9*v_a; | |||||
| vy_10 += vx_10*v_a; | |||||
| vy_11 += vx_11*v_a; | |||||
| vy_12 += vx_12*v_a; | |||||
| vy_13 += vx_13*v_a; | |||||
| vy_14 += vx_14*v_a; | |||||
| vy_15 += vx_15*v_a; | |||||
| vec_vsx_st( vy_0, offset_0 ,vptr_y ) ; | |||||
| vec_vsx_st( vy_1, offset_1 ,vptr_y ) ; | |||||
| vec_vsx_st( vy_2, offset_2 ,vptr_y ) ; | |||||
| vec_vsx_st( vy_3, offset_3 ,vptr_y ) ; | |||||
| vec_vsx_st( vy_4, offset_4 ,vptr_y ) ; | |||||
| vec_vsx_st( vy_5, offset_5 ,vptr_y ) ; | |||||
| vec_vsx_st( vy_6, offset_6 ,vptr_y ) ; | |||||
| vec_vsx_st( vy_7, offset_7 ,vptr_y ) ; | |||||
| vec_vsx_st( vy_8, offset_8 ,vptr_y ) ; | |||||
| vec_vsx_st( vy_9, offset_9 ,vptr_y ) ; | |||||
| vec_vsx_st( vy_10, offset_10 ,vptr_y ) ; | |||||
| vec_vsx_st( vy_11, offset_11 ,vptr_y ) ; | |||||
| vec_vsx_st( vy_12, offset_12 ,vptr_y ) ; | |||||
| vec_vsx_st( vy_13, offset_13 ,vptr_y ) ; | |||||
| vec_vsx_st( vy_14, offset_14 ,vptr_y ) ; | |||||
| vec_vsx_st( vy_15, offset_15 ,vptr_y ) ; | |||||
| vptr_x+=16; | |||||
| vptr_y+=16; | |||||
| /* | |||||
| v_y[i] += v_a * v_x[i]; | v_y[i] += v_a * v_x[i]; | ||||
| v_y[i+1] += v_a * v_x[i+1]; | v_y[i+1] += v_a * v_x[i+1]; | ||||
| v_y[i+2] += v_a * v_x[i+2]; | v_y[i+2] += v_a * v_x[i+2]; | ||||
| @@ -59,9 +149,11 @@ static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha) | |||||
| v_y[i+13] += v_a * v_x[i+13]; | v_y[i+13] += v_a * v_x[i+13]; | ||||
| v_y[i+14] += v_a * v_x[i+14]; | v_y[i+14] += v_a * v_x[i+14]; | ||||
| v_y[i+15] += v_a * v_x[i+15]; | v_y[i+15] += v_a * v_x[i+15]; | ||||
| */ | |||||
| } | } | ||||
| } | } | ||||
| #endif | #endif | ||||
| #endif | |||||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | ||||
| { | { | ||||
| @@ -74,11 +166,13 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||||
| { | { | ||||
| BLASLONG n1 = n & -64; | BLASLONG n1 = n & -64; | ||||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||||
| if ( n1 ) | if ( n1 ) | ||||
| saxpy_kernel_64(n1, x, y, da); | saxpy_kernel_64(n1, x, y, da); | ||||
| i = n1; | i = n1; | ||||
| #endif | |||||
| while(i < n) | while(i < n) | ||||
| { | { | ||||
| @@ -36,8 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | #if defined(POWER8) || defined(POWER9) || defined(POWER10) | ||||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||||
| #include "scopy_microk_power8.c" | #include "scopy_microk_power8.c" | ||||
| #endif | #endif | ||||
| #endif | |||||
| #ifndef HAVE_KERNEL_32 | #ifndef HAVE_KERNEL_32 | ||||
| @@ -36,8 +36,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | #if defined(POWER8) || defined(POWER9) || defined(POWER10) | ||||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||||
| #include "sdot_microk_power8.c" | #include "sdot_microk_power8.c" | ||||
| #endif | #endif | ||||
| #endif | |||||
| #ifndef HAVE_KERNEL_16 | #ifndef HAVE_KERNEL_16 | ||||
| @@ -27,103 +27,103 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #include <altivec.h> | #include <altivec.h> | ||||
| typedef unsigned char vec_t __attribute__ ((vector_size (16))); | |||||
| typedef __vector unsigned char vec_t; | |||||
| typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); | typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); | ||||
| typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); | typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); | ||||
| #if defined(TRMMKERNEL) | #if defined(TRMMKERNEL) | ||||
| #define SAVE_ACC(ACC, J) \ | #define SAVE_ACC(ACC, J) \ | ||||
| __builtin_mma_disassemble_acc (result, ACC); \ | |||||
| __builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||||
| rowC = (v4sf_t *) &CO[0* ldc+J]; \ | rowC = (v4sf_t *) &CO[0* ldc+J]; \ | ||||
| rowC[0] = result[3] * alpha; \ | |||||
| rowC[0] = result[0] * alpha; \ | |||||
| rowC = (v4sf_t *) &CO[1*ldc+J]; \ | rowC = (v4sf_t *) &CO[1*ldc+J]; \ | ||||
| rowC[0] = result[2] * alpha; \ | |||||
| rowC = (v4sf_t *) &CO[2*ldc+J]; \ | |||||
| rowC[0] = result[1] * alpha; \ | rowC[0] = result[1] * alpha; \ | ||||
| rowC = (v4sf_t *) &CO[2*ldc+J]; \ | |||||
| rowC[0] = result[2] * alpha; \ | |||||
| rowC = (v4sf_t *) &CO[3*ldc+J]; \ | rowC = (v4sf_t *) &CO[3*ldc+J]; \ | ||||
| rowC[0] = result[0] * alpha; | |||||
| rowC[0] = result[3] * alpha; | |||||
| #define SAVE_ACC1(ACC, J) \ | #define SAVE_ACC1(ACC, J) \ | ||||
| __builtin_mma_disassemble_acc (result, ACC); \ | |||||
| __builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||||
| rowC = (v4sf_t *) &CO[4* ldc+J]; \ | rowC = (v4sf_t *) &CO[4* ldc+J]; \ | ||||
| rowC[0] = result[3] * alpha; \ | |||||
| rowC[0] = result[0] * alpha; \ | |||||
| rowC = (v4sf_t *) &CO[5*ldc+J]; \ | rowC = (v4sf_t *) &CO[5*ldc+J]; \ | ||||
| rowC[0] = result[2] * alpha; \ | |||||
| rowC = (v4sf_t *) &CO[6*ldc+J]; \ | |||||
| rowC[0] = result[1] * alpha; \ | rowC[0] = result[1] * alpha; \ | ||||
| rowC = (v4sf_t *) &CO[6*ldc+J]; \ | |||||
| rowC[0] = result[2] * alpha; \ | |||||
| rowC = (v4sf_t *) &CO[7*ldc+J]; \ | rowC = (v4sf_t *) &CO[7*ldc+J]; \ | ||||
| rowC[0] = result[0] * alpha; | |||||
| rowC[0] = result[3] * alpha; | |||||
| #define SAVE4x2_ACC(ACC, J) \ | #define SAVE4x2_ACC(ACC, J) \ | ||||
| __builtin_mma_disassemble_acc (result, ACC); \ | |||||
| __builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||||
| rowC = (v2sf_t *) &CO[0* ldc+J]; \ | rowC = (v2sf_t *) &CO[0* ldc+J]; \ | ||||
| rowC[0] = result[6] * alpha; \ | |||||
| rowC[0] = result[0] * alpha; \ | |||||
| rowC = (v2sf_t *) &CO[1* ldc+J]; \ | rowC = (v2sf_t *) &CO[1* ldc+J]; \ | ||||
| rowC[0] = result[4] * alpha; \ | |||||
| rowC = (v2sf_t *) &CO[2* ldc+J]; \ | |||||
| rowC[0] = result[2] * alpha; \ | rowC[0] = result[2] * alpha; \ | ||||
| rowC = (v2sf_t *) &CO[2* ldc+J]; \ | |||||
| rowC[0] = result[4] * alpha; \ | |||||
| rowC = (v2sf_t *) &CO[3* ldc+J]; \ | rowC = (v2sf_t *) &CO[3* ldc+J]; \ | ||||
| rowC[0] = result[0] * alpha; | |||||
| rowC[0] = result[6] * alpha; | |||||
| #define SAVE4x2_ACC1(ACC, J) \ | #define SAVE4x2_ACC1(ACC, J) \ | ||||
| __builtin_mma_disassemble_acc (result, ACC); \ | |||||
| __builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||||
| rowC = (v2sf_t *) &CO[4* ldc+J]; \ | rowC = (v2sf_t *) &CO[4* ldc+J]; \ | ||||
| rowC[0] = result[6] * alpha; \ | |||||
| rowC[0] = result[0] * alpha; \ | |||||
| rowC = (v2sf_t *) &CO[5* ldc+J]; \ | rowC = (v2sf_t *) &CO[5* ldc+J]; \ | ||||
| rowC[0] = result[4] * alpha; \ | |||||
| rowC = (v2sf_t *) &CO[6* ldc+J]; \ | |||||
| rowC[0] = result[2] * alpha; \ | rowC[0] = result[2] * alpha; \ | ||||
| rowC = (v2sf_t *) &CO[6* ldc+J]; \ | |||||
| rowC[0] = result[4] * alpha; \ | |||||
| rowC = (v2sf_t *) &CO[7* ldc+J]; \ | rowC = (v2sf_t *) &CO[7* ldc+J]; \ | ||||
| rowC[0] = result[0] * alpha; | |||||
| rowC[0] = result[6] * alpha; | |||||
| #define SAVE2x4_ACC(ACC, J) \ | #define SAVE2x4_ACC(ACC, J) \ | ||||
| __builtin_mma_disassemble_acc (result, ACC); \ | |||||
| __builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||||
| rowC = (v4sf_t *) &CO[0* ldc+J]; \ | rowC = (v4sf_t *) &CO[0* ldc+J]; \ | ||||
| rowC[0] = result[3] * alpha; \ | |||||
| rowC[0] = result[0] * alpha; \ | |||||
| rowC = (v4sf_t *) &CO[1* ldc+J]; \ | rowC = (v4sf_t *) &CO[1* ldc+J]; \ | ||||
| rowC[0] = result[2] * alpha; | |||||
| rowC[0] = result[1] * alpha; | |||||
| #else | #else | ||||
| #define SAVE_ACC(ACC, J) \ | #define SAVE_ACC(ACC, J) \ | ||||
| __builtin_mma_disassemble_acc (result, ACC); \ | |||||
| __builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||||
| rowC = (v4sf_t *) &CO[0* ldc+J]; \ | rowC = (v4sf_t *) &CO[0* ldc+J]; \ | ||||
| rowC[0] += result[3] * alpha; \ | |||||
| rowC[0] += result[0] * alpha; \ | |||||
| rowC = (v4sf_t *) &CO[1*ldc+J]; \ | rowC = (v4sf_t *) &CO[1*ldc+J]; \ | ||||
| rowC[0] += result[2] * alpha; \ | |||||
| rowC = (v4sf_t *) &CO[2*ldc+J]; \ | |||||
| rowC[0] += result[1] * alpha; \ | rowC[0] += result[1] * alpha; \ | ||||
| rowC = (v4sf_t *) &CO[2*ldc+J]; \ | |||||
| rowC[0] += result[2] * alpha; \ | |||||
| rowC = (v4sf_t *) &CO[3*ldc+J]; \ | rowC = (v4sf_t *) &CO[3*ldc+J]; \ | ||||
| rowC[0] += result[0] * alpha; | |||||
| rowC[0] += result[3] * alpha; | |||||
| #define SAVE_ACC1(ACC, J) \ | #define SAVE_ACC1(ACC, J) \ | ||||
| __builtin_mma_disassemble_acc (result, ACC); \ | |||||
| __builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||||
| rowC = (v4sf_t *) &CO[4* ldc+J]; \ | rowC = (v4sf_t *) &CO[4* ldc+J]; \ | ||||
| rowC[0] += result[3] * alpha; \ | |||||
| rowC[0] += result[0] * alpha; \ | |||||
| rowC = (v4sf_t *) &CO[5*ldc+J]; \ | rowC = (v4sf_t *) &CO[5*ldc+J]; \ | ||||
| rowC[0] += result[2] * alpha; \ | |||||
| rowC = (v4sf_t *) &CO[6*ldc+J]; \ | |||||
| rowC[0] += result[1] * alpha; \ | rowC[0] += result[1] * alpha; \ | ||||
| rowC = (v4sf_t *) &CO[6*ldc+J]; \ | |||||
| rowC[0] += result[2] * alpha; \ | |||||
| rowC = (v4sf_t *) &CO[7*ldc+J]; \ | rowC = (v4sf_t *) &CO[7*ldc+J]; \ | ||||
| rowC[0] += result[0] * alpha; | |||||
| rowC[0] += result[3] * alpha; | |||||
| #define SAVE4x2_ACC(ACC, J) \ | #define SAVE4x2_ACC(ACC, J) \ | ||||
| __builtin_mma_disassemble_acc (result, ACC); \ | |||||
| __builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||||
| rowC = (v2sf_t *) &CO[0* ldc+J]; \ | rowC = (v2sf_t *) &CO[0* ldc+J]; \ | ||||
| rowC[0] += result[6] * alpha; \ | |||||
| rowC[0] += result[0] * alpha; \ | |||||
| rowC = (v2sf_t *) &CO[1* ldc+J]; \ | rowC = (v2sf_t *) &CO[1* ldc+J]; \ | ||||
| rowC[0] += result[4] * alpha; \ | |||||
| rowC = (v2sf_t *) &CO[2* ldc+J]; \ | |||||
| rowC[0] += result[2] * alpha; \ | rowC[0] += result[2] * alpha; \ | ||||
| rowC = (v2sf_t *) &CO[2* ldc+J]; \ | |||||
| rowC[0] += result[4] * alpha; \ | |||||
| rowC = (v2sf_t *) &CO[3* ldc+J]; \ | rowC = (v2sf_t *) &CO[3* ldc+J]; \ | ||||
| rowC[0] += result[0] * alpha; | |||||
| rowC[0] += result[6] * alpha; | |||||
| #define SAVE4x2_ACC1(ACC, J) \ | #define SAVE4x2_ACC1(ACC, J) \ | ||||
| __builtin_mma_disassemble_acc (result, ACC); \ | |||||
| __builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||||
| rowC = (v2sf_t *) &CO[4* ldc+J]; \ | rowC = (v2sf_t *) &CO[4* ldc+J]; \ | ||||
| rowC[0] += result[6] * alpha; \ | |||||
| rowC[0] += result[0] * alpha; \ | |||||
| rowC = (v2sf_t *) &CO[5* ldc+J]; \ | rowC = (v2sf_t *) &CO[5* ldc+J]; \ | ||||
| rowC[0] += result[4] * alpha; \ | |||||
| rowC = (v2sf_t *) &CO[6* ldc+J]; \ | |||||
| rowC[0] += result[2] * alpha; \ | rowC[0] += result[2] * alpha; \ | ||||
| rowC = (v2sf_t *) &CO[6* ldc+J]; \ | |||||
| rowC[0] += result[4] * alpha; \ | |||||
| rowC = (v2sf_t *) &CO[7* ldc+J]; \ | rowC = (v2sf_t *) &CO[7* ldc+J]; \ | ||||
| rowC[0] += result[0] * alpha; | |||||
| rowC[0] += result[6] * alpha; | |||||
| #define SAVE2x4_ACC(ACC, J) \ | #define SAVE2x4_ACC(ACC, J) \ | ||||
| __builtin_mma_disassemble_acc (result, ACC); \ | |||||
| __builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||||
| rowC = (v4sf_t *) &CO[0* ldc+J]; \ | rowC = (v4sf_t *) &CO[0* ldc+J]; \ | ||||
| rowC[0] += result[3] * alpha; \ | |||||
| rowC[0] += result[0] * alpha; \ | |||||
| rowC = (v4sf_t *) &CO[1* ldc+J]; \ | rowC = (v4sf_t *) &CO[1* ldc+J]; \ | ||||
| rowC[0] += result[2] * alpha; | |||||
| rowC[0] += result[1] * alpha; | |||||
| #endif | #endif | ||||
| #define KERNEL(i, j) \ | #define KERNEL(i, j) \ | ||||
| __builtin_mma_xvf32gerpp (&acc0, rowB[i], rowA[j]); \ | __builtin_mma_xvf32gerpp (&acc0, rowB[i], rowA[j]); \ | ||||
| @@ -24,7 +24,10 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | ||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||||
| *****************************************************************************/ | *****************************************************************************/ | ||||
| #if !defined(__VEC__) || !defined(__ALTIVEC__) | |||||
| #include "../arm/gemv_n.c" | |||||
| #else | |||||
| #include "common.h" | #include "common.h" | ||||
| @@ -463,4 +466,5 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO | |||||
| return(0); | return(0); | ||||
| } | } | ||||
| #endif | |||||
| @@ -24,6 +24,10 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | ||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||||
| *****************************************************************************/ | *****************************************************************************/ | ||||
| #if !defined(__VEC__) || !defined(__ALTIVEC__) | |||||
| #include "../arm/gemv_t.c" | |||||
| #else | |||||
| #include "common.h" | #include "common.h" | ||||
| @@ -477,3 +481,4 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO | |||||
| } | } | ||||
| #endif | |||||
| @@ -45,7 +45,7 @@ bfloat16tof32 (bfloat16 f16) | |||||
| #define BF16TOF32(x) x | #define BF16TOF32(x) x | ||||
| #endif | #endif | ||||
| typedef unsigned char vec_t __attribute__ ((vector_size (16))); | |||||
| typedef __vector unsigned char vec_t; | |||||
| typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); | typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); | ||||
| typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); | typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); | ||||
| @@ -64,54 +64,54 @@ vector char mask = | |||||
| #define MERGE_LOW(x, y) (vec_t) vec_mergel ((vector short)x, (vector short)y) | #define MERGE_LOW(x, y) (vec_t) vec_mergel ((vector short)x, (vector short)y) | ||||
| #define SAVE_ACC(ACC, J) \ | #define SAVE_ACC(ACC, J) \ | ||||
| __builtin_mma_disassemble_acc (result, ACC); \ | |||||
| __builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||||
| rowC = (v4sf_t *) &CO[0* ldc+J]; \ | rowC = (v4sf_t *) &CO[0* ldc+J]; \ | ||||
| rowC[0] += result[3] * alpha; \ | |||||
| rowC[0] += result[0] * alpha; \ | |||||
| rowC = (v4sf_t *) &CO[1*ldc+J]; \ | rowC = (v4sf_t *) &CO[1*ldc+J]; \ | ||||
| rowC[0] += result[2] * alpha; \ | |||||
| rowC = (v4sf_t *) &CO[2*ldc+J]; \ | |||||
| rowC[0] += result[1] * alpha; \ | rowC[0] += result[1] * alpha; \ | ||||
| rowC = (v4sf_t *) &CO[2*ldc+J]; \ | |||||
| rowC[0] += result[2] * alpha; \ | |||||
| rowC = (v4sf_t *) &CO[3*ldc+J]; \ | rowC = (v4sf_t *) &CO[3*ldc+J]; \ | ||||
| rowC[0] += result[0] * alpha; | |||||
| rowC[0] += result[3] * alpha; | |||||
| #define SAVE_ACC1(ACC, J) \ | #define SAVE_ACC1(ACC, J) \ | ||||
| __builtin_mma_disassemble_acc (result, ACC); \ | |||||
| __builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||||
| rowC = (v4sf_t *) &CO[4* ldc+J]; \ | rowC = (v4sf_t *) &CO[4* ldc+J]; \ | ||||
| rowC[0] += result[3] * alpha; \ | |||||
| rowC[0] += result[0] * alpha; \ | |||||
| rowC = (v4sf_t *) &CO[5*ldc+J]; \ | rowC = (v4sf_t *) &CO[5*ldc+J]; \ | ||||
| rowC[0] += result[2] * alpha; \ | |||||
| rowC = (v4sf_t *) &CO[6*ldc+J]; \ | |||||
| rowC[0] += result[1] * alpha; \ | rowC[0] += result[1] * alpha; \ | ||||
| rowC = (v4sf_t *) &CO[6*ldc+J]; \ | |||||
| rowC[0] += result[2] * alpha; \ | |||||
| rowC = (v4sf_t *) &CO[7*ldc+J]; \ | rowC = (v4sf_t *) &CO[7*ldc+J]; \ | ||||
| rowC[0] += result[0] * alpha; | |||||
| rowC[0] += result[3] * alpha; | |||||
| #define SAVE4x2_ACC(ACC, J) \ | #define SAVE4x2_ACC(ACC, J) \ | ||||
| __builtin_mma_disassemble_acc (result, ACC); \ | |||||
| __builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||||
| rowC = (v2sf_t *) &CO[0* ldc+J]; \ | rowC = (v2sf_t *) &CO[0* ldc+J]; \ | ||||
| rowC[0] += result[6] * alpha; \ | |||||
| rowC[0] += result[0] * alpha; \ | |||||
| rowC = (v2sf_t *) &CO[1* ldc+J]; \ | rowC = (v2sf_t *) &CO[1* ldc+J]; \ | ||||
| rowC[0] += result[4] * alpha; \ | |||||
| rowC = (v2sf_t *) &CO[2* ldc+J]; \ | |||||
| rowC[0] += result[2] * alpha; \ | rowC[0] += result[2] * alpha; \ | ||||
| rowC = (v2sf_t *) &CO[2* ldc+J]; \ | |||||
| rowC[0] += result[4] * alpha; \ | |||||
| rowC = (v2sf_t *) &CO[3* ldc+J]; \ | rowC = (v2sf_t *) &CO[3* ldc+J]; \ | ||||
| rowC[0] += result[0] * alpha; | |||||
| rowC[0] += result[6] * alpha; | |||||
| #define SAVE4x2_ACC1(ACC, J) \ | #define SAVE4x2_ACC1(ACC, J) \ | ||||
| __builtin_mma_disassemble_acc (result, ACC); \ | |||||
| __builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||||
| rowC = (v2sf_t *) &CO[4* ldc+J]; \ | rowC = (v2sf_t *) &CO[4* ldc+J]; \ | ||||
| rowC[0] += result[6] * alpha; \ | |||||
| rowC[0] += result[0] * alpha; \ | |||||
| rowC = (v2sf_t *) &CO[5* ldc+J]; \ | rowC = (v2sf_t *) &CO[5* ldc+J]; \ | ||||
| rowC[0] += result[4] * alpha; \ | |||||
| rowC = (v2sf_t *) &CO[6* ldc+J]; \ | |||||
| rowC[0] += result[2] * alpha; \ | rowC[0] += result[2] * alpha; \ | ||||
| rowC = (v2sf_t *) &CO[6* ldc+J]; \ | |||||
| rowC[0] += result[4] * alpha; \ | |||||
| rowC = (v2sf_t *) &CO[7* ldc+J]; \ | rowC = (v2sf_t *) &CO[7* ldc+J]; \ | ||||
| rowC[0] += result[0] * alpha; | |||||
| rowC[0] += result[6] * alpha; | |||||
| #define MMA __builtin_mma_xvbf16ger2pp | #define MMA __builtin_mma_xvbf16ger2pp | ||||
| #define SAVE2x4_ACC(ACC, J) \ | #define SAVE2x4_ACC(ACC, J) \ | ||||
| __builtin_mma_disassemble_acc (result, ACC); \ | |||||
| __builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||||
| rowC = (v4sf_t *) &CO[0* ldc+J]; \ | rowC = (v4sf_t *) &CO[0* ldc+J]; \ | ||||
| rowC[0] += result[3] * alpha; \ | |||||
| rowC[0] += result[0] * alpha; \ | |||||
| rowC = (v4sf_t *) &CO[1* ldc+J]; \ | rowC = (v4sf_t *) &CO[1* ldc+J]; \ | ||||
| rowC[0] += result[2] * alpha; | |||||
| rowC[0] += result[1] * alpha; | |||||
| #define SET_ACC_ZERO4() \ | #define SET_ACC_ZERO4() \ | ||||
| __builtin_mma_xxsetaccz (&acc0); \ | __builtin_mma_xxsetaccz (&acc0); \ | ||||
| @@ -40,8 +40,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #pragma GCC optimize "O1" | #pragma GCC optimize "O1" | ||||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | #if defined(POWER8) || defined(POWER9) || defined(POWER10) | ||||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||||
| #include "srot_microk_power8.c" | #include "srot_microk_power8.c" | ||||
| #endif | #endif | ||||
| #endif | |||||
| #ifndef HAVE_KERNEL_16 | #ifndef HAVE_KERNEL_16 | ||||
| @@ -36,8 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | #if defined(POWER8) || defined(POWER9) || defined(POWER10) | ||||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||||
| #include "sscal_microk_power8.c" | #include "sscal_microk_power8.c" | ||||
| #endif | #endif | ||||
| #endif | |||||
| #if !defined(HAVE_KERNEL_16) | #if !defined(HAVE_KERNEL_16) | ||||
| @@ -36,8 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | #if defined(POWER8) || defined(POWER9) || defined(POWER10) | ||||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||||
| #include "sswap_microk_power8.c" | #include "sswap_microk_power8.c" | ||||
| #endif | #endif | ||||
| #endif | |||||
| #ifndef HAVE_KERNEL_32 | #ifndef HAVE_KERNEL_32 | ||||
| @@ -47,8 +47,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | #if defined(POWER8) || defined(POWER9) || defined(POWER10) | ||||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||||
| #include "zasum_microk_power8.c" | #include "zasum_microk_power8.c" | ||||
| #endif | #endif | ||||
| #endif | |||||
| #ifndef HAVE_KERNEL_8 | #ifndef HAVE_KERNEL_8 | ||||
| @@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | #if defined(POWER8) || defined(POWER9) || defined(POWER10) | ||||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||||
| #include "zaxpy_microk_power8.c" | #include "zaxpy_microk_power8.c" | ||||
| #endif | #endif | ||||
| #endif | |||||
| #ifndef HAVE_KERNEL_4 | #ifndef HAVE_KERNEL_4 | ||||
| @@ -36,8 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | #if defined(POWER8) || defined(POWER9) || defined(POWER10) | ||||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||||
| #include "zcopy_microk_power8.c" | #include "zcopy_microk_power8.c" | ||||
| #endif | #endif | ||||
| #endif | |||||
| #ifndef HAVE_KERNEL_16 | #ifndef HAVE_KERNEL_16 | ||||
| @@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | #if defined(POWER8) || defined(POWER9) || defined(POWER10) | ||||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||||
| #include "zdot_microk_power8.c" | #include "zdot_microk_power8.c" | ||||
| #endif | #endif | ||||
| #endif | |||||
| #ifndef HAVE_KERNEL_8 | #ifndef HAVE_KERNEL_8 | ||||
| @@ -93,9 +95,11 @@ FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG in | |||||
| FLOAT dot[4] = { 0.0, 0.0, 0.0 , 0.0 } ; | FLOAT dot[4] = { 0.0, 0.0, 0.0 , 0.0 } ; | ||||
| if ( n <= 0 ) | if ( n <= 0 ) | ||||
| { | |||||
| { /* | |||||
| __real__ result = 0.0 ; | __real__ result = 0.0 ; | ||||
| __imag__ result = 0.0 ; | __imag__ result = 0.0 ; | ||||
| */ | |||||
| result = OPENBLAS_MAKE_COMPLEX_FLOAT(0.0,0.0); | |||||
| return(result); | return(result); | ||||
| } | } | ||||
| @@ -149,11 +153,17 @@ FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG in | |||||
| } | } | ||||
| #if !defined(CONJ) | #if !defined(CONJ) | ||||
| /* | |||||
| __real__ result = dot[0] - dot[1]; | __real__ result = dot[0] - dot[1]; | ||||
| __imag__ result = dot[2] + dot[3]; | __imag__ result = dot[2] + dot[3]; | ||||
| */ | |||||
| result = OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0]-dot[1],dot[2]+dot[3]); | |||||
| #else | #else | ||||
| /* | |||||
| __real__ result = dot[0] + dot[1]; | __real__ result = dot[0] + dot[1]; | ||||
| __imag__ result = dot[2] - dot[3]; | __imag__ result = dot[2] - dot[3]; | ||||
| */ | |||||
| result = OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0]+dot[1],dot[2]-dot[3]); | |||||
| #endif | #endif | ||||
| @@ -29,6 +29,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include <stdio.h> | #include <stdio.h> | ||||
| #include "common.h" | #include "common.h" | ||||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||||
| #define HAVE_KERNEL_4x4_VEC 1 | #define HAVE_KERNEL_4x4_VEC 1 | ||||
| #define HAVE_KERNEL_4x2_VEC 1 | #define HAVE_KERNEL_4x2_VEC 1 | ||||
| #define HAVE_KERNEL_4x1_VEC 1 | #define HAVE_KERNEL_4x1_VEC 1 | ||||
| @@ -37,6 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC) | #if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC) | ||||
| #include <altivec.h> | #include <altivec.h> | ||||
| #endif | #endif | ||||
| #endif | |||||
| // | // | ||||
| #define NBMAX 4096 | #define NBMAX 4096 | ||||
| @@ -28,10 +28,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #define NBMAX 4096 | #define NBMAX 4096 | ||||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||||
| #define HAVE_KERNEL_4x4_VEC 1 | #define HAVE_KERNEL_4x4_VEC 1 | ||||
| #define HAVE_KERNEL_4x2_VEC 1 | #define HAVE_KERNEL_4x2_VEC 1 | ||||
| #define HAVE_KERNEL_4x1_VEC 1 | #define HAVE_KERNEL_4x1_VEC 1 | ||||
| #endif | |||||
| #if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC) | #if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC) | ||||
| #include <altivec.h> | #include <altivec.h> | ||||
| #endif | #endif | ||||
| @@ -24,6 +24,9 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | ||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||||
| *****************************************************************************/ | *****************************************************************************/ | ||||
| #if !defined(__VEC__) || !defined(__ALTIVEC__) | |||||
| #include "../arm/zrot.c" | |||||
| #else | |||||
| #include "common.h" | #include "common.h" | ||||
| @@ -262,4 +265,4 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT | |||||
| } | } | ||||
| #endif | |||||
| @@ -39,10 +39,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #pragma GCC optimize "O1" | #pragma GCC optimize "O1" | ||||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | #if defined(POWER8) || defined(POWER9) || defined(POWER10) | ||||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||||
| #if defined(DOUBLE) | #if defined(DOUBLE) | ||||
| #include "zscal_microk_power8.c" | #include "zscal_microk_power8.c" | ||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| #endif | |||||
| #ifndef HAVE_KERNEL_8 | #ifndef HAVE_KERNEL_8 | ||||
| @@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | #if defined(POWER8) || defined(POWER9) || defined(POWER10) | ||||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||||
| #include "zswap_microk_power8.c" | #include "zswap_microk_power8.c" | ||||
| #endif | #endif | ||||
| #endif | |||||
| #ifndef HAVE_KERNEL_16 | #ifndef HAVE_KERNEL_16 | ||||
| @@ -1,667 +0,0 @@ | |||||
| /*********************************************************************/ | |||||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||||
| /* All rights reserved. */ | |||||
| /* */ | |||||
| /* Redistribution and use in source and binary forms, with or */ | |||||
| /* without modification, are permitted provided that the following */ | |||||
| /* conditions are met: */ | |||||
| /* */ | |||||
| /* 1. Redistributions of source code must retain the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer. */ | |||||
| /* */ | |||||
| /* 2. Redistributions in binary form must reproduce the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer in the documentation and/or other materials */ | |||||
| /* provided with the distribution. */ | |||||
| /* */ | |||||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||||
| /* */ | |||||
| /* The views and conclusions contained in the software and */ | |||||
| /* documentation are those of the authors and should not be */ | |||||
| /* interpreted as representing official policies, either expressed */ | |||||
| /* or implied, of The University of Texas at Austin. */ | |||||
| /*********************************************************************/ | |||||
| #include <stdio.h> | |||||
| #include "common.h" | |||||
| #ifndef USE_SIMPLE_THREADED_LEVEL3 | |||||
/* Build-time configuration for this file. Every macro below that is wrapped in #ifndef can be predefined by the including file or build system to override the default chosen here. */
| //The array of job_t may overflow the stack. | |||||
| //Instead, use malloc to alloc job_t. | |||||
/* When USE_ALLOC_HEAP is defined, thread_driver obtains its job_t array from malloc instead of declaring it as a local array. */
| #if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD | |||||
| #define USE_ALLOC_HEAP | |||||
| #endif | |||||
/* Constant -1, passed as the scalar argument of every TRSM_KERNEL call in inner_thread. */
| static FLOAT dm1 = -1.; | |||||
/* KERNEL_FUNC: kernel invoked by KERNEL_OPERATION; SYRK_KERNEL_U unless LOWER is defined, then SYRK_KERNEL_L. */
| #ifndef KERNEL_FUNC | |||||
| #ifndef LOWER | |||||
| #define KERNEL_FUNC SYRK_KERNEL_U | |||||
| #else | |||||
| #define KERNEL_FUNC SYRK_KERNEL_L | |||||
| #endif | |||||
| #endif | |||||
/* TRSM_KERNEL: one of four kernels selected by LOWER and COMPLEX (LT / LC without LOWER, RN / RR with LOWER). */
| #ifndef LOWER | |||||
| #ifndef COMPLEX | |||||
| #define TRSM_KERNEL TRSM_KERNEL_LT | |||||
| #else | |||||
| #define TRSM_KERNEL TRSM_KERNEL_LC | |||||
| #endif | |||||
| #else | |||||
| #ifndef COMPLEX | |||||
| #define TRSM_KERNEL TRSM_KERNEL_RN | |||||
| #else | |||||
| #define TRSM_KERNEL TRSM_KERNEL_RR | |||||
| #endif | |||||
| #endif | |||||
/* CACHE_LINE_SIZE: distance, counted in BLASLONG elements, between consecutive flag slots inside one row of job_t.working. */
| #ifndef CACHE_LINE_SIZE | |||||
| #define CACHE_LINE_SIZE 8 | |||||
| #endif | |||||
/* DIVIDE_RATE: number of packing buffers (and flag slots) per thread; each thread's range is cut into at most this many pieces. */
| #ifndef DIVIDE_RATE | |||||
| #define DIVIDE_RATE 2 | |||||
| #endif | |||||
/* SWITCH_RATIO: not referenced by the functions defined in this part of the file. */
| #ifndef SWITCH_RATIO | |||||
| #define SWITCH_RATIO 2 | |||||
| #endif | |||||
/* TRANS is forced on whenever LOWER is not defined; it selects the variant of ICOPY_OPERATION / OCOPY_OPERATION. */
| #ifndef LOWER | |||||
| #define TRANS | |||||
| #endif | |||||
/* SYRK_LOCAL: chosen from LOWER/TRANS. Because TRANS is always defined when LOWER is not, the SYRK_UN branch cannot be taken with the definitions above. */
| #ifndef SYRK_LOCAL | |||||
| #if !defined(LOWER) && !defined(TRANS) | |||||
| #define SYRK_LOCAL SYRK_UN | |||||
| #elif !defined(LOWER) && defined(TRANS) | |||||
| #define SYRK_LOCAL SYRK_UT | |||||
| #elif defined(LOWER) && !defined(TRANS) | |||||
| #define SYRK_LOCAL SYRK_LN | |||||
| #else | |||||
| #define SYRK_LOCAL SYRK_LT | |||||
| #endif | |||||
| #endif | |||||
/*
 * job_t - per-thread table of hand-off flags shared between the worker threads.
 *
 * For the thread that owns job[p], working[i][CACHE_LINE_SIZE * s] is the slot
 * through which buffer s is handed to thread i: inner_thread stores the buffer
 * address there once the buffer is filled, and thread i resets the slot to 0
 * when it no longer needs the buffer. thread_driver zeroes all used slots
 * before the threads start.
 *
 * The array is qualified _Atomic when HAVE_C11 is defined and volatile otherwise.
 */
| typedef struct { | |||||
| #ifdef HAVE_C11 | |||||
| _Atomic | |||||
| #else | |||||
| volatile | |||||
| #endif | |||||
| BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; | |||||
| } job_t; | |||||
/*
 * KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y)
 * Calls KERNEL_FUNC on the part of C that starts at element offset
 * ((X) + (Y) * LDC) * COMPSIZE, passing (X) - (Y) as the final argument.
 * The real variant passes ALPHA[0]; the COMPLEX variant passes ALPHA[0] and ALPHA[1].
 */
| #ifndef KERNEL_OPERATION | |||||
| #ifndef COMPLEX | |||||
| #define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \ | |||||
| KERNEL_FUNC(M, N, K, ALPHA[0], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y)) | |||||
| #else | |||||
| #define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \ | |||||
| KERNEL_FUNC(M, N, K, ALPHA[0], ALPHA[1], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y)) | |||||
| #endif | |||||
| #endif | |||||
/*
 * ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER)
 * Copies from A into BUFFER with GEMM_ITCOPY (source offset (Y) + (X) * LDA) when TRANS is
 * not defined, or GEMM_INCOPY (source offset (X) + (Y) * LDA) when it is.
 * Note that the expansion already ends with a semicolon.
 */
| #ifndef ICOPY_OPERATION | |||||
| #ifndef TRANS | |||||
| #define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); | |||||
| #else | |||||
| #define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); | |||||
| #endif | |||||
| #endif | |||||
/*
 * OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER)
 * Counterpart of ICOPY_OPERATION using GEMM_ONCOPY (TRANS defined) or GEMM_OTCOPY
 * (TRANS not defined), with the same offset convention. The expansion also ends with a semicolon.
 */
| #ifndef OCOPY_OPERATION | |||||
| #ifdef TRANS | |||||
| #define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); | |||||
| #else | |||||
| #define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); | |||||
| #endif | |||||
| #endif | |||||
/*
 * Shorthand accessors for fields of a blas_arg_t pointer that must be named `args`
 * in the scope where they are used. Beware the naming: S reads args->a, A reads
 * args->b, and N reads args->m.
 */
| #ifndef S | |||||
| #define S args -> a | |||||
| #endif | |||||
| #ifndef A | |||||
| #define A args -> b | |||||
| #endif | |||||
| #ifndef C | |||||
| #define C args -> c | |||||
| #endif | |||||
| #ifndef LDA | |||||
| #define LDA args -> lda | |||||
| #endif | |||||
| #ifndef N | |||||
| #define N args -> m | |||||
| #endif | |||||
| #ifndef K | |||||
| #define K args -> k | |||||
| #endif | |||||
/*
 * inner_thread - worker routine installed by thread_driver for every queue entry.
 *
 * Parameters:
 *   args    - shared arguments: a (macro S), b (macro A), c (macro C), lda, k, alpha,
 *             nthreads, and common, which points to the job_t array used for hand-off.
 *   range_m - not used by this function.
 *   range_n - boundary array; this thread works on [range_n[mypos], range_n[mypos + 1]).
 *   sa, sb  - work buffers; sb first receives the packed k x k block from S, and the
 *             per-thread panel buffers are carved out of the space behind it.
 *   mypos   - index of the calling thread.
 *
 * Phases:
 *   1. Pack the k x k block S into sb.
 *   2. For each piece of the own range: pack the panel of A, run TRSM_KERNEL on it,
 *      then publish the buffer address in job[mypos].working[...] for the threads
 *      that will read it.
 *   3. For each row chunk of the own range: pack it into sa and run KERNEL_OPERATION
 *      against the published buffers of every relevant thread, waiting for a buffer
 *      only during the first chunk. While processing the last chunk the slot is reset to 0.
 *   4. Wait until every other thread has reset the slots belonging to this thread.
 *
 * Returns 0.
 */
| static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ | |||||
| FLOAT *buffer[DIVIDE_RATE]; | |||||
| BLASLONG k, lda; | |||||
| BLASLONG m_from, m_to; | |||||
| FLOAT *alpha; | |||||
| FLOAT *a, *c; | |||||
| job_t *job = (job_t *)args -> common; | |||||
| BLASLONG xxx, bufferside; | |||||
| BLASLONG jjs, min_jj; | |||||
| BLASLONG is, min_i, div_n; | |||||
| BLASLONG i, current; | |||||
/* K, A, C and LDA are macros that expand to fields of args. */
| k = K; | |||||
| a = (FLOAT *)A; | |||||
| c = (FLOAT *)C; | |||||
| lda = LDA; | |||||
| alpha = (FLOAT *)args -> alpha; | |||||
| m_from = range_n[mypos + 0]; | |||||
| m_to = range_n[mypos + 1]; | |||||
| #if 0 | |||||
| fprintf(stderr, "Thread[%ld] m_from : %ld m_to : %ld\n", mypos, m_from, m_to); | |||||
| #endif | |||||
/* div_n: own range length divided by DIVIDE_RATE (rounded up), then rounded up to a multiple of GEMM_UNROLL_MN. */
| div_n = (((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; | |||||
/* buffer[0] starts behind the k * k * COMPSIZE elements reserved in sb, rounded with the GEMM_ALIGN mask and shifted by GEMM_OFFSET_B; the remaining buffers follow at GEMM_Q * div_n * COMPSIZE element intervals. */
| buffer[0] = (FLOAT *)((((BLASULONG)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); | |||||
| for (i = 1; i < DIVIDE_RATE; i++) { | |||||
| buffer[i] = buffer[i - 1] + GEMM_Q * div_n * COMPSIZE; | |||||
| } | |||||
/* Phase 1: pack the k x k block S into sb. */
| #ifndef LOWER | |||||
| TRSM_IUNCOPY(k, k, (FLOAT *)S, lda, 0, sb); | |||||
| #else | |||||
| TRSM_OLTCOPY(k, k, (FLOAT *)S, lda, 0, sb); | |||||
| #endif | |||||
/* Phase 2: one iteration per piece of the own range; bufferside selects the buffer and flag slot. */
| for (xxx = m_from, bufferside = 0; xxx < m_to; xxx += div_n, bufferside ++) { | |||||
| for(jjs = xxx; jjs < MIN(m_to, xxx + div_n); jjs += min_jj){ | |||||
| min_jj = MIN(m_to, xxx + div_n) - jjs; | |||||
/* Cap the strip width: GEMM_UNROLL_MN without LOWER, GEMM_P with LOWER. */
| #ifndef LOWER | |||||
| if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN; | |||||
| #else | |||||
| if (min_jj > GEMM_P) min_jj = GEMM_P; | |||||
| #endif | |||||
/* Pack the strip of a into the current buffer, then call TRSM_KERNEL with the packed block in sb, the packed strip, and a pointer back to the same strip of a. */
| #ifndef LOWER | |||||
| OCOPY_OPERATION (k, min_jj, a, lda, 0, jjs, buffer[bufferside] + k * (jjs - xxx) * COMPSIZE); | |||||
| TRSM_KERNEL (k, min_jj, k, dm1, | |||||
| #ifdef COMPLEX | |||||
| ZERO, | |||||
| #endif | |||||
| sb, | |||||
| buffer[bufferside] + k * (jjs - xxx) * COMPSIZE, | |||||
| a + jjs * lda * COMPSIZE, lda, 0); | |||||
| #else | |||||
| ICOPY_OPERATION (k, min_jj, a, lda, 0, jjs, buffer[bufferside] + k * (jjs - xxx) * COMPSIZE); | |||||
| TRSM_KERNEL (min_jj, k, k, dm1, | |||||
| #ifdef COMPLEX | |||||
| ZERO, | |||||
| #endif | |||||
| buffer[bufferside] + k * (jjs - xxx) * COMPSIZE, | |||||
| sb, | |||||
| a + jjs * COMPSIZE, lda, 0); | |||||
| #endif | |||||
| } | |||||
/* Publish the finished buffer: threads 0..mypos without LOWER, threads mypos..nthreads-1 with LOWER. This matches the direction in which those threads walk `current` below. */
| #ifndef LOWER | |||||
| for (i = 0; i <= mypos; i++) | |||||
| job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; | |||||
| #else | |||||
| for (i = mypos; i < args -> nthreads; i++) | |||||
| job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; | |||||
| #endif | |||||
| WMB; | |||||
| } | |||||
/* Phase 3a: size of the first row chunk - GEMM_P if at least 2 * GEMM_P rows remain, otherwise about half of the remainder (rounded up to GEMM_UNROLL_MN) when more than GEMM_P remain, otherwise everything. */
| min_i = m_to - m_from; | |||||
| if (min_i >= GEMM_P * 2) { | |||||
| min_i = GEMM_P; | |||||
| } else | |||||
| if (min_i > GEMM_P) { | |||||
| min_i = (((min_i + 1) / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; | |||||
| } | |||||
| #ifndef LOWER | |||||
| ICOPY_OPERATION(k, min_i, a, lda, 0, m_from, sa); | |||||
| #else | |||||
| OCOPY_OPERATION(k, min_i, a, lda, 0, m_from, sa); | |||||
| #endif | |||||
/* Visit the own buffers first, then those of the other threads: upwards without LOWER, downwards with LOWER. */
| current = mypos; | |||||
| #ifndef LOWER | |||||
| while (current < args -> nthreads) | |||||
| #else | |||||
| while (current >= 0) | |||||
| #endif | |||||
| { | |||||
/* Recompute div_n for the thread being visited so that xxx steps over its pieces exactly as that thread did. */
| div_n = (((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; | |||||
| for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { | |||||
| /* thread has to wait */ | |||||
| if (current != mypos) while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; | |||||
/* The slot value is the address of the published buffer and is used directly as the second packed operand. */
| KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, alpha, | |||||
| sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], | |||||
| c, lda, m_from, xxx); | |||||
/* If the first chunk already covers the whole own range, this was the last use of the buffer: reset the slot. */
| if (m_from + min_i >= m_to) { | |||||
| job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; | |||||
| WMB; | |||||
| } | |||||
| } | |||||
| #ifndef LOWER | |||||
| current ++; | |||||
| #else | |||||
| current --; | |||||
| #endif | |||||
| } | |||||
/* Phase 3b: remaining row chunks. No waiting is done here; every buffer was already waited for during the first chunk. */
| for(is = m_from + min_i; is < m_to; is += min_i){ | |||||
| min_i = m_to - is; | |||||
| if (min_i >= GEMM_P * 2) { | |||||
| min_i = GEMM_P; | |||||
| } else | |||||
| if (min_i > GEMM_P) { | |||||
| min_i = (((min_i + 1) / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; | |||||
| } | |||||
| #ifndef LOWER | |||||
| ICOPY_OPERATION(k, min_i, a, lda, 0, is, sa); | |||||
| #else | |||||
| OCOPY_OPERATION(k, min_i, a, lda, 0, is, sa); | |||||
| #endif | |||||
| current = mypos; | |||||
| #ifndef LOWER | |||||
| while (current < args -> nthreads) | |||||
| #else | |||||
| while (current >= 0) | |||||
| #endif | |||||
| { | |||||
| div_n = (((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; | |||||
| for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { | |||||
| KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, alpha, | |||||
| sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], | |||||
| c, lda, is, xxx); | |||||
/* Last row chunk: reset the slot so the owning thread can see the buffer is no longer needed. */
| if (is + min_i >= m_to) { | |||||
| job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; | |||||
| WMB; | |||||
| } | |||||
| } | |||||
| #ifndef LOWER | |||||
| current ++; | |||||
| #else | |||||
| current --; | |||||
| #endif | |||||
| } | |||||
| } | |||||
/* Phase 4: do not return until every other thread has reset all slots of this thread's job entry. */
| for (i = 0; i < args -> nthreads; i++) { | |||||
| if (i != mypos) { | |||||
| for (xxx = 0; xxx < DIVIDE_RATE; xxx++) { | |||||
| while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;}; | |||||
| } | |||||
| } | |||||
| } | |||||
| return 0; | |||||
| } | |||||
/*
 * thread_driver - split the range [0, args->m) among worker threads and run
 * inner_thread on each of them through exec_blas.
 *
 * Parameters:
 *   args   - supplies m, k, a, b, c, lda, alpha and nthreads; these are copied into
 *            a local blas_arg_t that all queue entries share.
 *   sa, sb - work buffers, attached to queue[0] only; the other entries get NULL.
 *
 * The job_t hand-off table is a local array, or a malloc'ed one when
 * USE_ALLOC_HEAP is defined; a failed allocation prints a message and calls exit(1).
 *
 * Returns 0.
 */
| static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){ | |||||
| blas_arg_t newarg; | |||||
| #ifndef USE_ALLOC_HEAP | |||||
| job_t job[MAX_CPU_NUMBER]; | |||||
| #else | |||||
| job_t * job = NULL; | |||||
| #endif | |||||
| blas_queue_t queue[MAX_CPU_NUMBER]; | |||||
| BLASLONG range[MAX_CPU_NUMBER + 100]; | |||||
| BLASLONG num_cpu; | |||||
| BLASLONG nthreads = args -> nthreads; | |||||
| BLASLONG width, i, j, k; | |||||
| BLASLONG n, n_from, n_to; | |||||
| int mode, mask; | |||||
| double dnum; | |||||
/* mode: precision/type flags stored in every queue entry. mask: larger of the GEMM unroll factors minus one; mask + 1 is the granularity to which slice widths are rounded. */
| #ifndef COMPLEX | |||||
| #ifdef XDOUBLE | |||||
| mode = BLAS_XDOUBLE | BLAS_REAL; | |||||
| mask = MAX(QGEMM_UNROLL_M, QGEMM_UNROLL_N) - 1; | |||||
| #elif defined(DOUBLE) | |||||
| mode = BLAS_DOUBLE | BLAS_REAL; | |||||
| mask = MAX(DGEMM_UNROLL_M, DGEMM_UNROLL_N) - 1; | |||||
| #elif defined(HALF) | |||||
| mode = BLAS_HALF | BLAS_REAL; | |||||
| mask = MAX(SHGEMM_UNROLL_M, SHGEMM_UNROLL_N) - 1; | |||||
| #else | |||||
| mode = BLAS_SINGLE | BLAS_REAL; | |||||
| mask = MAX(SGEMM_UNROLL_M, SGEMM_UNROLL_N) - 1; | |||||
| #endif | |||||
| #else | |||||
| #ifdef XDOUBLE | |||||
| mode = BLAS_XDOUBLE | BLAS_COMPLEX; | |||||
| mask = MAX(XGEMM_UNROLL_M, XGEMM_UNROLL_N) - 1; | |||||
| #elif defined(DOUBLE) | |||||
| mode = BLAS_DOUBLE | BLAS_COMPLEX; | |||||
| mask = MAX(ZGEMM_UNROLL_M, ZGEMM_UNROLL_N) - 1; | |||||
| #else | |||||
| mode = BLAS_SINGLE | BLAS_COMPLEX; | |||||
| mask = MAX(CGEMM_UNROLL_M, CGEMM_UNROLL_N) - 1; | |||||
| #endif | |||||
| #endif | |||||
/* Copy the fields inner_thread reads; nthreads and common are filled in below. */
| newarg.m = args -> m; | |||||
| newarg.k = args -> k; | |||||
| newarg.a = args -> a; | |||||
| newarg.b = args -> b; | |||||
| newarg.c = args -> c; | |||||
| newarg.lda = args -> lda; | |||||
| newarg.alpha = args -> alpha; | |||||
| #ifdef USE_ALLOC_HEAP | |||||
| job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t)); | |||||
| if(job==NULL){ | |||||
| fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__); | |||||
| exit(1); | |||||
| } | |||||
| #endif | |||||
| newarg.common = (void *)job; | |||||
| n_from = 0; | |||||
| n_to = args -> m; | |||||
/* Without LOWER the boundaries are written backwards from range[MAX_CPU_NUMBER], so the first width computed belongs to the last thread; range_n is pointed at the first used entry once the thread count is known. */
| #ifndef LOWER | |||||
| range[MAX_CPU_NUMBER] = n_to - n_from; | |||||
| range[0] = 0; | |||||
| num_cpu = 0; | |||||
| i = 0; | |||||
| n = n_to - n_from; | |||||
/* dnum = n^2 / nthreads; each width below is chosen so that (i + width)^2 - i^2 is roughly dnum. */
| dnum = (double)n * (double)n /(double)nthreads; | |||||
| while (i < n){ | |||||
| if (nthreads - num_cpu > 1) { | |||||
| double di = (double)i; | |||||
| width = ((((BLASLONG)(sqrt(di * di + dnum) - di) + mask)/(mask+1)) * (mask+1)); | |||||
/* First slice only: enlarge it so that the remainder n - width is a multiple of mask + 1. */
| if (num_cpu == 0) width = n - (((n - width)/(mask+1)) * (mask+1)); | |||||
/* A width that overshoots or is smaller than mask is replaced by everything that is left. */
| if ((width > n - i) || (width < mask)) width = n - i; | |||||
| } else { | |||||
| width = n - i; | |||||
| } | |||||
| range[MAX_CPU_NUMBER - num_cpu - 1] = range[MAX_CPU_NUMBER - num_cpu] - width; | |||||
| queue[num_cpu].mode = mode; | |||||
| queue[num_cpu].routine = inner_thread; | |||||
| queue[num_cpu].args = &newarg; | |||||
| queue[num_cpu].range_m = NULL; | |||||
| queue[num_cpu].sa = NULL; | |||||
| queue[num_cpu].sb = NULL; | |||||
| queue[num_cpu].next = &queue[num_cpu + 1]; | |||||
| num_cpu ++; | |||||
| i += width; | |||||
| } | |||||
| for (i = 0; i < num_cpu; i ++) queue[i].range_n = &range[MAX_CPU_NUMBER - num_cpu]; | |||||
/* With LOWER the boundaries are written forwards starting at range[0], using the same width rule except for the first-slice adjustment. */
| #else | |||||
| range[0] = 0; | |||||
| num_cpu = 0; | |||||
| i = 0; | |||||
| n = n_to - n_from; | |||||
| dnum = (double)n * (double)n /(double)nthreads; | |||||
| while (i < n){ | |||||
| if (nthreads - num_cpu > 1) { | |||||
| double di = (double)i; | |||||
| width = ((((BLASLONG)(sqrt(di * di + dnum) - di) + mask)/(mask+1)) * (mask+1)); | |||||
| if ((width > n - i) || (width < mask)) width = n - i; | |||||
| } else { | |||||
| width = n - i; | |||||
| } | |||||
| range[num_cpu + 1] = range[num_cpu] + width; | |||||
| queue[num_cpu].mode = mode; | |||||
| queue[num_cpu].routine = inner_thread; | |||||
| queue[num_cpu].args = &newarg; | |||||
| queue[num_cpu].range_m = NULL; | |||||
| queue[num_cpu].range_n = range; | |||||
| queue[num_cpu].sa = NULL; | |||||
| queue[num_cpu].sb = NULL; | |||||
| queue[num_cpu].next = &queue[num_cpu + 1]; | |||||
| num_cpu ++; | |||||
| i += width; | |||||
| } | |||||
| #endif | |||||
/* inner_thread sees the number of slices actually created, which may be smaller than the requested nthreads. */
| newarg.nthreads = num_cpu; | |||||
| if (num_cpu) { | |||||
/* Clear every hand-off slot that the workers will use before any of them starts. */
| for (j = 0; j < num_cpu; j++) { | |||||
| for (i = 0; i < num_cpu; i++) { | |||||
| for (k = 0; k < DIVIDE_RATE; k++) { | |||||
| job[j].working[i][CACHE_LINE_SIZE * k] = 0; | |||||
| } | |||||
| } | |||||
| } | |||||
| queue[0].sa = sa; | |||||
| queue[0].sb = sb; | |||||
/* Terminate the linked list of queue entries at the last one created. */
| queue[num_cpu - 1].next = NULL; | |||||
| exec_blas(num_cpu, queue); | |||||
| } | |||||
| #ifdef USE_ALLOC_HEAP | |||||
| free(job); | |||||
| #endif | |||||
| return 0; | |||||
| } | |||||
| #endif | |||||
/*
 * CNAME - blocked, recursive, multi-threaded entry point of this POTRF variant.
 *
 * Parameters:
 *   args    - a, lda, n and nthreads are read here.
 *   range_m - not used by this function.
 *   range_n - if non-NULL, n is taken as range_n[1] - range_n[0].
 *   sa, sb  - work buffers forwarded to every nested call.
 *   myid    - not used by this function.
 *
 * Behaviour:
 *   - With one thread, or when n <= 2 * GEMM_UNROLL_N, the work is delegated to
 *     POTRF_U_SINGLE / POTRF_L_SINGLE (selected by LOWER).
 *   - Otherwise the matrix is walked in diagonal blocks of size `blocking`; each
 *     diagonal block is handled by a recursive CNAME call, after which the trailing
 *     part is updated by thread_driver or, when USE_SIMPLE_THREADED_LEVEL3 is
 *     defined, by a threaded TRSM followed by a threaded HERK.
 *
 * Returns 0 on success; a non-zero info from a nested CNAME call is returned with the
 * block offset i added. The result of a *_SINGLE call is returned unchanged.
 */
| blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { | |||||
| BLASLONG n, bk, i, blocking, lda; | |||||
| BLASLONG info; | |||||
| int mode; | |||||
| blas_arg_t newarg; | |||||
| FLOAT *a; | |||||
/* Scalar pair (-ONE, ZERO) handed to the update routines through newarg.alpha. */
| FLOAT alpha[2] = { -ONE, ZERO}; | |||||
/* mode is only consumed by the USE_SIMPLE_THREADED_LEVEL3 code path further down. */
| #ifndef COMPLEX | |||||
| #ifdef XDOUBLE | |||||
| mode = BLAS_XDOUBLE | BLAS_REAL; | |||||
| #elif defined(DOUBLE) | |||||
| mode = BLAS_DOUBLE | BLAS_REAL; | |||||
| #else | |||||
| mode = BLAS_SINGLE | BLAS_REAL; | |||||
| #endif | |||||
| #else | |||||
| #ifdef XDOUBLE | |||||
| mode = BLAS_XDOUBLE | BLAS_COMPLEX; | |||||
| #elif defined(DOUBLE) | |||||
| mode = BLAS_DOUBLE | BLAS_COMPLEX; | |||||
| #else | |||||
| mode = BLAS_SINGLE | BLAS_COMPLEX; | |||||
| #endif | |||||
| #endif | |||||
/* Single-thread request: use the sequential routine for the whole problem (range_n is not forwarded here). */
| if (args -> nthreads == 1) { | |||||
| #ifndef LOWER | |||||
| info = POTRF_U_SINGLE(args, NULL, NULL, sa, sb, 0); | |||||
| #else | |||||
| info = POTRF_L_SINGLE(args, NULL, NULL, sa, sb, 0); | |||||
| #endif | |||||
| return info; | |||||
| } | |||||
| n = args -> n; | |||||
| a = (FLOAT *)args -> a; | |||||
| lda = args -> lda; | |||||
/* NOTE(review): range_n only changes n; the base pointer a is not offset by range_n[0] in this function - confirm that callers expect this. */
| if (range_n) n = range_n[1] - range_n[0]; | |||||
/* Small problems also go to the sequential routine; this is the base case of the recursion. */
| if (n <= GEMM_UNROLL_N * 2) { | |||||
| #ifndef LOWER | |||||
| info = POTRF_U_SINGLE(args, NULL, range_n, sa, sb, 0); | |||||
| #else | |||||
| info = POTRF_L_SINGLE(args, NULL, range_n, sa, sb, 0); | |||||
| #endif | |||||
| return info; | |||||
| } | |||||
| newarg.lda = lda; | |||||
| newarg.ldb = lda; | |||||
| newarg.ldc = lda; | |||||
| newarg.alpha = alpha; | |||||
| newarg.beta = NULL; | |||||
| newarg.nthreads = args -> nthreads; | |||||
/* Block size: n / 2 rounded up to a multiple of GEMM_UNROLL_N, capped at GEMM_Q. */
| blocking = ((n / 2 + GEMM_UNROLL_N - 1)/GEMM_UNROLL_N) * GEMM_UNROLL_N; | |||||
| if (blocking > GEMM_Q) blocking = GEMM_Q; | |||||
| for (i = 0; i < n; i += blocking) { | |||||
| bk = n - i; | |||||
| if (bk > blocking) bk = blocking; | |||||
/* Recurse on the bk x bk diagonal block that starts at (i, i). */
| newarg.m = bk; | |||||
| newarg.n = bk; | |||||
| newarg.a = a + (i + i * lda) * COMPSIZE; | |||||
| info = CNAME(&newarg, NULL, NULL, sa, sb, 0); | |||||
| if (info) return info + i; | |||||
/* Update the trailing part only if something is left beyond this diagonal block. */
| if (n - i - bk > 0) { | |||||
| #ifndef USE_SIMPLE_THREADED_LEVEL3 | |||||
/* Default path: newarg.a still points at the diagonal block; b is the adjacent panel (to the right without LOWER, below with LOWER) and c is the trailing square block. */
| newarg.m = n - i - bk; | |||||
| newarg.k = bk; | |||||
| #ifndef LOWER | |||||
| newarg.b = a + ( i + (i + bk) * lda) * COMPSIZE; | |||||
| #else | |||||
| newarg.b = a + ((i + bk) + i * lda) * COMPSIZE; | |||||
| #endif | |||||
| newarg.c = a + ((i + bk) + (i + bk) * lda) * COMPSIZE; | |||||
| thread_driver(&newarg, sa, sb); | |||||
| #else | |||||
/* Simple path: a threaded TRSM on the panel, followed by a threaded HERK on the trailing block via syrk_thread. */
| #ifndef LOWER | |||||
| newarg.m = bk; | |||||
| newarg.n = n - i - bk; | |||||
| newarg.a = a + (i + i * lda) * COMPSIZE; | |||||
| newarg.b = a + (i + (i + bk) * lda) * COMPSIZE; | |||||
| gemm_thread_n(mode | BLAS_TRANSA_T, | |||||
| &newarg, NULL, NULL, (void *)TRSM_LCUN, sa, sb, args -> nthreads); | |||||
| newarg.n = n - i - bk; | |||||
| newarg.k = bk; | |||||
| newarg.a = a + ( i + (i + bk) * lda) * COMPSIZE; | |||||
| newarg.c = a + ((i + bk) + (i + bk) * lda) * COMPSIZE; | |||||
| #if 0 | |||||
| HERK_THREAD_UC(&newarg, NULL, NULL, sa, sb, 0); | |||||
| #else | |||||
| syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T, | |||||
| &newarg, NULL, NULL, (void *)HERK_UC, sa, sb, args -> nthreads); | |||||
| #endif | |||||
| #else | |||||
| newarg.m = n - i - bk; | |||||
| newarg.n = bk; | |||||
| newarg.a = a + (i + i * lda) * COMPSIZE; | |||||
| newarg.b = a + (i + bk + i * lda) * COMPSIZE; | |||||
| gemm_thread_m(mode | BLAS_RSIDE | BLAS_TRANSA_T | BLAS_UPLO, | |||||
| &newarg, NULL, NULL, (void *)TRSM_RCLN, sa, sb, args -> nthreads); | |||||
| newarg.n = n - i - bk; | |||||
| newarg.k = bk; | |||||
| newarg.a = a + (i + bk + i * lda) * COMPSIZE; | |||||
| newarg.c = a + (i + bk + (i + bk) * lda) * COMPSIZE; | |||||
| #if 0 | |||||
| HERK_THREAD_LN(&newarg, NULL, NULL, sa, sb, 0); | |||||
| #else | |||||
| syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T | BLAS_UPLO, | |||||
| &newarg, NULL, NULL, (void *)HERK_LN, sa, sb, args -> nthreads); | |||||
| #endif | |||||
| #endif | |||||
| #endif | |||||
| } | |||||
| } | |||||
| return 0; | |||||
| } | |||||
| @@ -101,7 +101,12 @@ static FLOAT dm1 = -1.; | |||||
| #endif | #endif | ||||
| typedef struct { | typedef struct { | ||||
| volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; | |||||
| #ifdef HAVE_C11 | |||||
| _Atomic | |||||
| #else | |||||
| volatile | |||||
| #endif | |||||
| BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; | |||||
| } job_t; | } job_t; | ||||
| @@ -375,6 +380,9 @@ static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){ | |||||
| #elif defined(DOUBLE) | #elif defined(DOUBLE) | ||||
| mode = BLAS_DOUBLE | BLAS_REAL; | mode = BLAS_DOUBLE | BLAS_REAL; | ||||
| mask = MAX(DGEMM_UNROLL_M, DGEMM_UNROLL_N) - 1; | mask = MAX(DGEMM_UNROLL_M, DGEMM_UNROLL_N) - 1; | ||||
| #elif defined(HALF) | |||||
| mode = BLAS_HALF | BLAS_REAL; | |||||
| mask = MAX(SHGEMM_UNROLL_M, SHGEMM_UNROLL_N) - 1; | |||||
| #else | #else | ||||
| mode = BLAS_SINGLE | BLAS_REAL; | mode = BLAS_SINGLE | BLAS_REAL; | ||||
| mask = MAX(SGEMM_UNROLL_M, SGEMM_UNROLL_N) - 1; | mask = MAX(SGEMM_UNROLL_M, SGEMM_UNROLL_N) - 1; | ||||
| @@ -2225,7 +2225,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define GEMM_DEFAULT_OFFSET_A 0 | #define GEMM_DEFAULT_OFFSET_A 0 | ||||
| #define GEMM_DEFAULT_OFFSET_B 65536 | #define GEMM_DEFAULT_OFFSET_B 65536 | ||||
| #define GEMM_DEFAULT_ALIGN 0x0ffffUL | #define GEMM_DEFAULT_ALIGN 0x0ffffUL | ||||
| #if defined(__32BIT__) | |||||
| #warning using BINARY32==POWER6 | |||||
| #define SGEMM_DEFAULT_UNROLL_M 4 | |||||
| #define SGEMM_DEFAULT_UNROLL_N 4 | |||||
| #define DGEMM_DEFAULT_UNROLL_M 4 | |||||
| #define DGEMM_DEFAULT_UNROLL_N 4 | |||||
| #define CGEMM_DEFAULT_UNROLL_M 2 | |||||
| #define CGEMM_DEFAULT_UNROLL_N 4 | |||||
| #define ZGEMM_DEFAULT_UNROLL_M 2 | |||||
| #define ZGEMM_DEFAULT_UNROLL_N 4 | |||||
| #else | |||||
| #define SGEMM_DEFAULT_UNROLL_M 16 | #define SGEMM_DEFAULT_UNROLL_M 16 | ||||
| #define SGEMM_DEFAULT_UNROLL_N 8 | #define SGEMM_DEFAULT_UNROLL_N 8 | ||||
| #define DGEMM_DEFAULT_UNROLL_M 16 | #define DGEMM_DEFAULT_UNROLL_M 16 | ||||
| @@ -2234,7 +2244,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define CGEMM_DEFAULT_UNROLL_N 4 | #define CGEMM_DEFAULT_UNROLL_N 4 | ||||
| #define ZGEMM_DEFAULT_UNROLL_M 8 | #define ZGEMM_DEFAULT_UNROLL_M 8 | ||||
| #define ZGEMM_DEFAULT_UNROLL_N 2 | #define ZGEMM_DEFAULT_UNROLL_N 2 | ||||
| #endif | |||||
| #define SGEMM_DEFAULT_P 1280UL | #define SGEMM_DEFAULT_P 1280UL | ||||
| #define DGEMM_DEFAULT_P 640UL | #define DGEMM_DEFAULT_P 640UL | ||||
| #define CGEMM_DEFAULT_P 640UL | #define CGEMM_DEFAULT_P 640UL | ||||
| @@ -2769,6 +2779,35 @@ is a big desktop or server with abundant cache rather than a phone or embedded d | |||||
| #define CGEMM_DEFAULT_R 4096 | #define CGEMM_DEFAULT_R 4096 | ||||
| #define ZGEMM_DEFAULT_R 4096 | #define ZGEMM_DEFAULT_R 4096 | ||||
| #elif defined(THUNDERX3T110) | |||||
| #define SGEMM_DEFAULT_UNROLL_M 16 | |||||
| #define SGEMM_DEFAULT_UNROLL_N 4 | |||||
| #define DGEMM_DEFAULT_UNROLL_M 8 | |||||
| #define DGEMM_DEFAULT_UNROLL_N 4 | |||||
| #define CGEMM_DEFAULT_UNROLL_M 8 | |||||
| #define CGEMM_DEFAULT_UNROLL_N 4 | |||||
| #define ZGEMM_DEFAULT_UNROLL_M 4 | |||||
| #define ZGEMM_DEFAULT_UNROLL_N 4 | |||||
| #define SGEMM_DEFAULT_P 128 | |||||
| #define DGEMM_DEFAULT_P 320 | |||||
| #define CGEMM_DEFAULT_P 128 | |||||
| #define ZGEMM_DEFAULT_P 128 | |||||
| #define SGEMM_DEFAULT_Q 352 | |||||
| #define DGEMM_DEFAULT_Q 128 | |||||
| #define CGEMM_DEFAULT_Q 224 | |||||
| #define ZGEMM_DEFAULT_Q 112 | |||||
| #define SGEMM_DEFAULT_R 4096 | |||||
| #define DGEMM_DEFAULT_R 4096 | |||||
| #define CGEMM_DEFAULT_R 4096 | |||||
| #define ZGEMM_DEFAULT_R 4096 | |||||
| #elif defined(NEOVERSEN1) | #elif defined(NEOVERSEN1) | ||||
| #define SGEMM_DEFAULT_UNROLL_M 16 | #define SGEMM_DEFAULT_UNROLL_M 16 | ||||