Use microarchitecture names instead of meaningless strings to name the cores; the legacy core names are still retained.
1. Rename LOONGSONGENERIC to LA64_GENERIC
2. Rename LOONGSON3R5 to LA464
3. Rename LOONGSON2K1000 to LA264
tags/v0.3.29
@@ -23,6 +23,15 @@ jobs: | |||||
- target: LOONGSON2K1000 | - target: LOONGSON2K1000 | ||||
triple: loongarch64-unknown-linux-gnu | triple: loongarch64-unknown-linux-gnu | ||||
opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON2K1000 | opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON2K1000 | ||||
- target: LA64_GENERIC | |||||
triple: loongarch64-unknown-linux-gnu | |||||
opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LA64_GENERIC | |||||
- target: LA464 | |||||
triple: loongarch64-unknown-linux-gnu | |||||
opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LA464 | |||||
- target: LA264 | |||||
triple: loongarch64-unknown-linux-gnu | |||||
opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LA264 | |||||
- target: DYNAMIC_ARCH | - target: DYNAMIC_ARCH | ||||
triple: loongarch64-unknown-linux-gnu | triple: loongarch64-unknown-linux-gnu | ||||
opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=GENERIC | opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=GENERIC | ||||
@@ -20,6 +20,12 @@ jobs: | |||||
opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON3R5 | opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON3R5 | ||||
- target: LOONGSON2K1000 | - target: LOONGSON2K1000 | ||||
opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON2K1000 | opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON2K1000 | ||||
- target: LA64_GENERIC | |||||
opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LA64_GENERIC | |||||
- target: LA464 | |||||
opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LA464 | |||||
- target: LA264 | |||||
opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LA264 | |||||
- target: DYNAMIC_ARCH | - target: DYNAMIC_ARCH | ||||
opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=GENERIC | opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=GENERIC | ||||
@@ -727,7 +727,7 @@ endif | |||||
endif | endif | ||||
ifeq ($(ARCH), loongarch64) | ifeq ($(ARCH), loongarch64) | ||||
DYNAMIC_CORE = LOONGSON3R5 LOONGSON2K1000 LOONGSONGENERIC | |||||
DYNAMIC_CORE = LA64_GENERIC LA264 LA464 | |||||
endif | endif | ||||
ifeq ($(ARCH), riscv64) | ifeq ($(ARCH), riscv64) | ||||
@@ -126,9 +126,17 @@ x280 | |||||
RISCV64_ZVL256B | RISCV64_ZVL256B | ||||
11.LOONGARCH64: | 11.LOONGARCH64: | ||||
// LOONGSONGENERIC/LOONGSON2K1000/LOONGSON3R5 are legacy names; | |||||
// the standardized names LA64_GENERIC/LA264/LA464 are recommended instead. | |||||
// You can still specify TARGET as LOONGSONGENERIC/LOONGSON2K1000/LOONGSON3R5 | |||||
// at build time or at run time; these names are mapped internally | |||||
// to LA64_GENERIC/LA264/LA464. | |||||
LOONGSONGENERIC | LOONGSONGENERIC | ||||
LOONGSON3R5 | |||||
LOONGSON2K1000 | LOONGSON2K1000 | ||||
LOONGSON3R5 | |||||
LA64_GENERIC | |||||
LA264 | |||||
LA464 | |||||
12. Elbrus E2000: | 12. Elbrus E2000: | ||||
E2K | E2K | ||||
@@ -1,5 +1,5 @@ | |||||
/***************************************************************************** | /***************************************************************************** | ||||
Copyright (c) 2011-2020, The OpenBLAS Project | |||||
Copyright (c) 2011-2024, The OpenBLAS Project | |||||
All rights reserved. | All rights reserved. | ||||
Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
@@ -32,53 +32,299 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
**********************************************************************************/ | **********************************************************************************/ | ||||
#include <stdint.h> | #include <stdint.h> | ||||
#include <sys/auxv.h> | |||||
#include <stdio.h> | #include <stdio.h> | ||||
#include <math.h> | |||||
#include <string.h> | |||||
#include <sys/auxv.h> | |||||
/* If LASX extension instructions supported, | |||||
* using core LOONGSON3R5 | |||||
* If only LSX extension instructions supported, | |||||
* using core LOONGSON2K1000 | |||||
* If neither LASX nor LSX extension instructions supported, | |||||
* using core LOONGSONGENERIC (As far as I know, there is no such | |||||
* CPU yet) | |||||
*/ | |||||
#define CPU_LA64_GENERIC 0 | |||||
#define CPU_LA264 1 | |||||
#define CPU_LA364 2 | |||||
#define CPU_LA464 3 | |||||
#define CPU_LA664 4 | |||||
#define CPU_GENERIC 0 | |||||
#define CPU_LOONGSON3R5 1 | |||||
#define CPU_LOONGSON2K1000 2 | |||||
#define CORE_LA64_GENERIC 0 | |||||
#define CORE_LA264 1 | |||||
#define CORE_LA464 2 | |||||
#define LA_HWCAP_LSX (1U << 4) | #define LA_HWCAP_LSX (1U << 4) | ||||
#define LA_HWCAP_LASX (1U << 5) | #define LA_HWCAP_LASX (1U << 5) | ||||
#define LOONGARCH_CFG0 0x00 | |||||
#define LOONGARCH_CFG2 0x02 | |||||
#define LOONGARCH_CFG10 0x10 | |||||
#define LOONGARCH_CFG11 0x11 | |||||
#define LOONGARCH_CFG12 0x12 | |||||
#define LOONGARCH_CFG13 0x13 | |||||
#define LOONGARCH_CFG14 0x14 | |||||
/* Single-bit masks (bit 7 and bit 6). The expansions are parenthesized so
 * that the shift cannot re-associate with neighbouring operators when the
 * macro is used inside a larger expression (e.g. `LSX_MASK + 1`). */
#define LASX_MASK (1 << 7)
#define LSX_MASK  (1 << 6)
#define PRID_SERIES_MASK 0xf000 | |||||
#define PRID_SERIES_LA264 0xa000 | |||||
#define PRID_SERIES_LA364 0xb000 | |||||
#define PRID_SERIES_LA464 0xc000 | |||||
#define PRID_SERIES_LA664 0xd000 | |||||
#define CACHE_INFO_L1_IU 0 | |||||
#define CACHE_INFO_L1_D 1 | |||||
#define CACHE_INFO_L2_IU 2 | |||||
#define CACHE_INFO_L2_D 3 | |||||
#define CACHE_INFO_L3_IU 4 | |||||
#define CACHE_INFO_L3_D 5 | |||||
#define L1_IU_PRESENT_MASK 0x0001 | |||||
#define L1_IU_UNITY_MASK 0x0002 | |||||
#define L1_D_PRESENT_MASK 0x0004 | |||||
#define L2_IU_PRESENT_MASK 0x0008 | |||||
#define L2_IU_UNITY_MASK 0x0010 | |||||
#define L2_D_PRESENT_MASK 0x0080 | |||||
#define L3_IU_PRESENT_MASK 0x0400 | |||||
#define L3_IU_UNITY_MASK 0x0800 | |||||
#define L3_D_PRESENT_MASK 0x4000 | |||||
#define CACHE_WAY_MINUS_1_MASK 0x0000ffff | |||||
#define CACHE_INDEX_LOG2_MASK 0x00ff0000 | |||||
#define CACHE_LINESIZE_LOG2_MASK 0x7f000000 | |||||
typedef struct { | |||||
int size; | |||||
int associative; | |||||
int linesize; | |||||
int unify; | |||||
int present; | |||||
} cache_info_t; | |||||
/* Using microarchitecture representation */ | |||||
static char *cpuname[] = { | static char *cpuname[] = { | ||||
"LOONGSONGENERIC", | |||||
"LOONGSON3R5", | |||||
"LOONGSON2K1000" | |||||
"LA64_GENERIC", | |||||
"LA264", /* Loongson 64bit, 2-issue, Like 2K1000LA */ | |||||
"LA364", /* Loongson 64bit, 3-issue, Like 2K2000 */ | |||||
"LA464", /* Loongson 64bit, 4-issue, Like 3A5000, 3C5000L, 3C5000 and 3D5000 */ | |||||
"LA664" /* Loongson 64bit, 6-issue, Like 3A6000, 3C6000 and 3D6000 */ | |||||
}; | }; | ||||
static char *cpuname_lower[] = { | static char *cpuname_lower[] = { | ||||
"loongsongeneric", | |||||
"loongson3r5", | |||||
"loongson2k1000" | |||||
"la64_generic", | |||||
"la264", | |||||
"la364", | |||||
"la464", | |||||
"la664" | |||||
}; | |||||
static char *corename[] = { | |||||
"LA64_GENERIC", /* Implies using scalar instructions for optimization */ | |||||
"LA264", /* Implies using LSX instructions for optimization */ | |||||
"LA464", /* Implies using LASX instructions for optimization */ | |||||
}; | |||||
static char *corename_lower[] = { | |||||
"la64_generic", | |||||
"la264", | |||||
"la464", | |||||
}; | }; | ||||
int detect(void) { | |||||
#ifdef __linux | |||||
/* | |||||
* Obtain cache and processor identification | |||||
* through the cpucfg command. | |||||
*/ | |||||
static void get_cacheinfo(int type, cache_info_t *cacheinfo) { | |||||
cache_info_t cache_info; | |||||
memset(&cache_info, 0, sizeof(cache_info)); | |||||
uint32_t reg_10 = 0; | |||||
__asm__ volatile ( | |||||
"cpucfg %0, %1 \n\t" | |||||
: "+&r"(reg_10) | |||||
: "r"(LOONGARCH_CFG10) | |||||
); | |||||
switch (type) { | |||||
case CACHE_INFO_L1_IU: | |||||
if (reg_10 & L1_IU_PRESENT_MASK) { | |||||
uint32_t reg_11 = 0; | |||||
cache_info.present = reg_10 & L1_IU_PRESENT_MASK; | |||||
cache_info.unify = reg_10 & L1_IU_UNITY_MASK; | |||||
__asm__ volatile ( | |||||
"cpucfg %0, %1 \n\t" | |||||
: "+&r"(reg_11) | |||||
: "r"(LOONGARCH_CFG11) | |||||
); | |||||
cache_info.associative = (reg_11 & CACHE_WAY_MINUS_1_MASK) + 1; | |||||
cache_info.linesize = 1 << ((reg_11 & CACHE_LINESIZE_LOG2_MASK) >> 24); | |||||
cache_info.size = cache_info.associative * cache_info.linesize * | |||||
(1 << ((reg_11 & CACHE_INDEX_LOG2_MASK) >> 16)); | |||||
} | |||||
break; | |||||
case CACHE_INFO_L1_D: | |||||
if (reg_10 & L1_D_PRESENT_MASK) { | |||||
uint32_t reg_12 = 0; | |||||
cache_info.present = reg_10 & L1_D_PRESENT_MASK; | |||||
__asm__ volatile ( | |||||
"cpucfg %0, %1 \n\t" | |||||
: "+&r"(reg_12) | |||||
: "r"(LOONGARCH_CFG12) | |||||
); | |||||
cache_info.associative = (reg_12 & CACHE_WAY_MINUS_1_MASK) + 1; | |||||
cache_info.linesize = 1 << ((reg_12 & CACHE_LINESIZE_LOG2_MASK) >> 24); | |||||
cache_info.size = cache_info.associative * cache_info.linesize * | |||||
(1 << ((reg_12 & CACHE_INDEX_LOG2_MASK) >> 16)); | |||||
} | |||||
break; | |||||
case CACHE_INFO_L2_IU: | |||||
if (reg_10 & L2_IU_PRESENT_MASK) { | |||||
uint32_t reg_13 = 0; | |||||
cache_info.present = reg_10 & L2_IU_PRESENT_MASK; | |||||
cache_info.unify = reg_10 & L2_IU_UNITY_MASK; | |||||
__asm__ volatile ( | |||||
"cpucfg %0, %1 \n\t" | |||||
: "+&r"(reg_13) | |||||
: "r"(LOONGARCH_CFG13) | |||||
); | |||||
cache_info.associative = (reg_13 & CACHE_WAY_MINUS_1_MASK) + 1; | |||||
cache_info.linesize = 1 << ((reg_13 & CACHE_LINESIZE_LOG2_MASK) >> 24); | |||||
cache_info.size = cache_info.associative * cache_info.linesize * | |||||
(1 << ((reg_13 & CACHE_INDEX_LOG2_MASK) >> 16)); | |||||
} | |||||
break; | |||||
case CACHE_INFO_L2_D: | |||||
if (reg_10 & L2_D_PRESENT_MASK) { | |||||
cache_info.present = reg_10 & L2_D_PRESENT_MASK; | |||||
// No date fetch | |||||
} | |||||
break; | |||||
case CACHE_INFO_L3_IU: | |||||
if (reg_10 & L3_IU_PRESENT_MASK) { | |||||
uint32_t reg_14 = 0; | |||||
cache_info.present = reg_10 & L3_IU_PRESENT_MASK; | |||||
cache_info.unify = reg_10 & L3_IU_UNITY_MASK; | |||||
__asm__ volatile ( | |||||
"cpucfg %0, %1 \n\t" | |||||
: "+&r"(reg_14) | |||||
: "r"(LOONGARCH_CFG14) | |||||
); | |||||
cache_info.associative = (reg_14 & CACHE_WAY_MINUS_1_MASK) + 1; | |||||
cache_info.linesize = 1 << ((reg_14 & CACHE_LINESIZE_LOG2_MASK) >> 24); | |||||
cache_info.size = cache_info.associative * cache_info.linesize * | |||||
(1 << ((reg_14 & CACHE_INDEX_LOG2_MASK) >> 16)); | |||||
} | |||||
break; | |||||
case CACHE_INFO_L3_D: | |||||
if (reg_10 & L3_D_PRESENT_MASK) { | |||||
cache_info.present = reg_10 & L3_D_PRESENT_MASK; | |||||
// No data fetch | |||||
} | |||||
break; | |||||
default: | |||||
break; | |||||
} | |||||
*cacheinfo = cache_info; | |||||
} | |||||
static uint32_t get_prid() { | |||||
uint32_t reg = 0; | |||||
__asm__ volatile ( | |||||
"cpucfg %0, %1 \n\t" | |||||
: "+&r"(reg) | |||||
: "r"(LOONGARCH_CFG0) | |||||
); | |||||
return reg; | |||||
} | |||||
/*
 * Count the lines of /proc/cpuinfo that begin with "processor" and store
 * the total in *count.
 *
 * *count is always written: it is 0 when /proc/cpuinfo cannot be opened,
 * so the result no longer depends on the caller having pre-initialized it.
 */
static void get_cpucount(uint32_t *count) {
  uint32_t num = 0;
  FILE *f = fopen("/proc/cpuinfo", "r");

  if (f) {
    char buf[200];
    while (fgets(buf, sizeof(buf), f)) {
      /* Only the first 9 characters are compared, i.e. the key itself. */
      if (!strncmp("processor", buf, 9))
        num++;
    }
    fclose(f);
  }

  *count = num;
}
/* Detect whether the OS supports the LASX instruction set */ | |||||
static int os_support_lasx() { | |||||
int hwcap = (int)getauxval(AT_HWCAP); | int hwcap = (int)getauxval(AT_HWCAP); | ||||
if (hwcap & LA_HWCAP_LASX) | if (hwcap & LA_HWCAP_LASX) | ||||
return CPU_LOONGSON3R5; | |||||
else if (hwcap & LA_HWCAP_LSX) | |||||
return CPU_LOONGSON2K1000; | |||||
return 1; | |||||
else | |||||
return 0; | |||||
} | |||||
/* Detect whether the OS supports the LSX instruction set */ | |||||
static int os_support_lsx() { | |||||
int hwcap = (int)getauxval(AT_HWCAP); | |||||
if (hwcap & LA_HWCAP_LSX) | |||||
return 1; | |||||
else | else | ||||
return CPU_GENERIC; | |||||
#endif | |||||
return CPU_GENERIC; | |||||
return 0; | |||||
} | |||||
int get_coretype(void) { | |||||
uint32_t prid = get_prid(); | |||||
switch (prid & PRID_SERIES_MASK) { | |||||
case (PRID_SERIES_LA464): | |||||
case (PRID_SERIES_LA664): | |||||
if (os_support_lasx()) | |||||
return CORE_LA464; | |||||
else if (os_support_lsx()) | |||||
return CORE_LA264; | |||||
else | |||||
return CORE_LA64_GENERIC; | |||||
break; | |||||
case (PRID_SERIES_LA264): | |||||
case (PRID_SERIES_LA364): | |||||
if (os_support_lsx()) | |||||
return CORE_LA264; | |||||
else | |||||
return CORE_LA64_GENERIC; | |||||
break; | |||||
default: | |||||
return CORE_LA64_GENERIC; | |||||
break; | |||||
} | |||||
} | |||||
int get_cputype(void) { | |||||
uint32_t prid = get_prid(); | |||||
switch (prid & PRID_SERIES_MASK) { | |||||
case (PRID_SERIES_LA264): | |||||
return CPU_LA264; | |||||
break; | |||||
case (PRID_SERIES_LA364): | |||||
return CPU_LA364; | |||||
break; | |||||
case (PRID_SERIES_LA464): | |||||
return CPU_LA464; | |||||
break; | |||||
case (PRID_SERIES_LA664): | |||||
return CPU_LA664; | |||||
break; | |||||
default: | |||||
return CPU_LA64_GENERIC; | |||||
break; | |||||
} | |||||
} | } | ||||
char *get_corename(void) { | char *get_corename(void) { | ||||
return cpuname[detect()]; | |||||
return corename[get_coretype()]; | |||||
} | |||||
void get_libname(void){ | |||||
printf("%s", corename_lower[get_coretype()]); | |||||
} | } | ||||
void get_architecture(void) { | void get_architecture(void) { | ||||
@@ -86,8 +332,7 @@ void get_architecture(void) { | |||||
} | } | ||||
void get_subarchitecture(void) { | void get_subarchitecture(void) { | ||||
int d = detect(); | |||||
printf("%s", cpuname[d]); | |||||
printf("%s", cpuname[get_cputype()]); | |||||
} | } | ||||
void get_subdirname(void) { | void get_subdirname(void) { | ||||
@@ -95,50 +340,69 @@ void get_subdirname(void) { | |||||
} | } | ||||
void get_cpuconfig(void) { | void get_cpuconfig(void) { | ||||
uint32_t hwcaps = 0; | |||||
int d = detect(); | |||||
switch (d) { | |||||
case CPU_LOONGSON3R5: | |||||
printf("#define LOONGSON3R5\n"); | |||||
printf("#define L1_DATA_SIZE 65536\n"); | |||||
printf("#define L1_DATA_LINESIZE 64\n"); | |||||
printf("#define L2_SIZE 1048576\n"); | |||||
printf("#define L2_LINESIZE 64\n"); | |||||
printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||||
printf("#define DTB_SIZE 4096\n"); | |||||
printf("#define L2_ASSOCIATIVE 16\n"); | |||||
break; | |||||
cache_info_t info; | |||||
uint32_t num_cores = 0; | |||||
case CPU_LOONGSON2K1000: | |||||
printf("#define LOONGSON2K1000\n"); | |||||
printf("#define L1_DATA_SIZE 65536\n"); | |||||
printf("#define L1_DATA_LINESIZE 64\n"); | |||||
printf("#define L2_SIZE 262144\n"); | |||||
printf("#define L2_LINESIZE 64\n"); | |||||
printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||||
printf("#define DTB_SIZE 4096\n"); | |||||
printf("#define L2_ASSOCIATIVE 16\n"); | |||||
break; | |||||
printf("#define %s\n", corename[get_coretype()]); // Core name | |||||
default: | |||||
printf("#define LOONGSONGENERIC\n"); | |||||
printf("#define L1_DATA_SIZE 65536\n"); | |||||
printf("#define L1_DATA_LINESIZE 64\n"); | |||||
printf("#define L2_SIZE 262144\n"); | |||||
printf("#define L2_LINESIZE 64\n"); | |||||
printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||||
printf("#define DTB_SIZE 4096\n"); | |||||
printf("#define L2_ASSOCIATIVE 16\n"); | |||||
break; | |||||
printf("#define CPU_NAME %s\n", cpuname[get_cputype()]); // Cpu microarchitecture name | |||||
get_cacheinfo(CACHE_INFO_L1_IU, &info); | |||||
if (info.present) { | |||||
if (info.unify) { // Unified cache, without distinguishing between instructions and data | |||||
printf("#define L1_SIZE %d\n", info.size); | |||||
printf("#define L1_ASSOCIATIVE %d\n", info.associative); | |||||
printf("#define L1_LINESIZE %d\n", info.linesize); | |||||
} else { | |||||
printf("#define L1_CODE_SIZE %d\n", info.size); | |||||
printf("#define L1_CODE_ASSOCIATIVE %d\n", info.associative); | |||||
printf("#define L1_CODE_LINESIZE %d\n", info.linesize); | |||||
} | |||||
} | } | ||||
hwcaps = (uint32_t)getauxval( AT_HWCAP ); | |||||
if (hwcaps & LA_HWCAP_LSX) printf("#define HAVE_LSX\n"); | |||||
if (hwcaps & LA_HWCAP_LASX) printf("#define HAVE_LASX\n"); | |||||
} | |||||
if (!info.unify) { | |||||
get_cacheinfo(CACHE_INFO_L1_D, &info); | |||||
if (info.present) { | |||||
printf("#define L1_DATA_SIZE %d\n", info.size); | |||||
printf("#define L1_DATA_ASSOCIATIVE %d\n", info.associative); | |||||
printf("#define L1_DATA_LINESIZE %d\n", info.linesize); | |||||
} | |||||
} | |||||
void get_libname(void){ | |||||
int d = detect(); | |||||
printf("%s", cpuname_lower[d]); | |||||
get_cacheinfo(CACHE_INFO_L2_IU, &info); | |||||
if (info.present > 0) { | |||||
if (info.unify) { | |||||
printf("#define L2_SIZE %d\n", info.size); | |||||
printf("#define L2_ASSOCIATIVE %d\n", info.associative); | |||||
printf("#define L2_LINESIZE %d\n", info.linesize); | |||||
} else { | |||||
printf("#define L2_CODE_SIZE %d\n", info.size); | |||||
printf("#define L2_CODE_ASSOCIATIVE %d\n", info.associative); | |||||
printf("#define L2_CODE_LINESIZE %d\n", info.linesize); | |||||
} | |||||
} | |||||
get_cacheinfo(CACHE_INFO_L3_IU, &info); | |||||
if (info.present > 0) { | |||||
if (info.unify) { | |||||
printf("#define L3_SIZE %d\n", info.size); | |||||
printf("#define L3_ASSOCIATIVE %d\n", info.associative); | |||||
printf("#define L3_LINESIZE %d\n", info.linesize); | |||||
} else { | |||||
printf("#define L3_CODE_SIZE %d\n", info.size); | |||||
printf("#define L3_CODE_ASSOCIATIVE %d\n", info.associative); | |||||
printf("#define L3_CODE_LINESIZE %d\n", info.linesize); | |||||
} | |||||
} | |||||
/* Call the probes. A bare function name decays to a non-null pointer, so
 * testing it without () is always true and would emit both macros on
 * every machine regardless of what the OS reports. */
if (os_support_lsx()) printf("#define HAVE_LSX\n");
if (os_support_lasx()) printf("#define HAVE_LASX\n");
get_cpucount(&num_cores); | |||||
if (num_cores) | |||||
printf("#define NUM_CORES %d\n", num_cores); | |||||
//TODO: It’s unclear what this entry represents, but it is indeed necessary. | |||||
//It has been set based on reference to other platforms. | |||||
printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||||
} | } |
@@ -1082,7 +1082,7 @@ if (buffer == NULL) { | |||||
} | } | ||||
//For target LOONGSON3R5, applying an offset to the buffer is essential | |||||
//For LOONGARCH64, applying an offset to the buffer is essential | |||||
//for minimizing cache conflicts and optimizing performance. | //for minimizing cache conflicts and optimizing performance. | ||||
#if defined(ARCH_LOONGARCH64) && !defined(NO_AFFINITY) | #if defined(ARCH_LOONGARCH64) && !defined(NO_AFFINITY) | ||||
if (sa == NULL) sa = (void *)((BLASLONG)buffer + (WhereAmI() & 0xf) * GEMM_OFFSET_A); | if (sa == NULL) sa = (void *)((BLASLONG)buffer + (WhereAmI() & 0xf) * GEMM_OFFSET_A); | ||||
@@ -28,25 +28,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#include <sys/auxv.h> | #include <sys/auxv.h> | ||||
#include "common.h" | #include "common.h" | ||||
extern gotoblas_t gotoblas_LOONGSON3R5; | |||||
extern gotoblas_t gotoblas_LOONGSON2K1000; | |||||
extern gotoblas_t gotoblas_LOONGSONGENERIC; | |||||
#define NUM_CORETYPES 6 | |||||
#define LOONGARCH_CFG0 0x00 | |||||
#define LA_HWCAP_LSX (1U << 4) | |||||
#define LA_HWCAP_LASX (1U << 5) | |||||
#define PRID_SERIES_MASK 0xf000 | |||||
#define PRID_SERIES_LA264 0xa000 | |||||
#define PRID_SERIES_LA364 0xb000 | |||||
#define PRID_SERIES_LA464 0xc000 | |||||
#define PRID_SERIES_LA664 0xd000 | |||||
extern gotoblas_t gotoblas_LA64_GENERIC; | |||||
extern gotoblas_t gotoblas_LA264; | |||||
extern gotoblas_t gotoblas_LA464; | |||||
extern void openblas_warning(int verbose, const char * msg); | extern void openblas_warning(int verbose, const char * msg); | ||||
#define NUM_CORETYPES 3 | |||||
static char *corename[] = { | static char *corename[] = { | ||||
"loongson3r5", | |||||
"loongson2k1000", | |||||
"la64_generic", | |||||
"la264", | |||||
"la464", | |||||
"loongsongeneric", | "loongsongeneric", | ||||
"loongson2k1000", | |||||
"loongson3r5", | |||||
"unknown" | "unknown" | ||||
}; | }; | ||||
char *gotoblas_corename(void) { | char *gotoblas_corename(void) { | ||||
if (gotoblas == &gotoblas_LOONGSON3R5) return corename[0]; | |||||
if (gotoblas == &gotoblas_LOONGSON2K1000) return corename[1]; | |||||
if (gotoblas == &gotoblas_LOONGSONGENERIC) return corename[2]; | |||||
if (gotoblas == &gotoblas_LA64_GENERIC) return corename[0]; | |||||
if (gotoblas == &gotoblas_LA264) return corename[1]; | |||||
if (gotoblas == &gotoblas_LA464) return corename[2]; | |||||
return corename[NUM_CORETYPES]; | return corename[NUM_CORETYPES]; | ||||
} | } | ||||
@@ -66,27 +77,78 @@ static gotoblas_t *force_coretype(char *coretype) { | |||||
switch (found) | switch (found) | ||||
{ | { | ||||
case 0: return (&gotoblas_LOONGSON3R5); | |||||
case 1: return (&gotoblas_LOONGSON2K1000); | |||||
case 2: return (&gotoblas_LOONGSONGENERIC); | |||||
case 0: return (&gotoblas_LA64_GENERIC); | |||||
case 1: return (&gotoblas_LA264); | |||||
case 2: return (&gotoblas_LA464); | |||||
case 3: return (&gotoblas_LA64_GENERIC); | |||||
case 4: return (&gotoblas_LA264); | |||||
case 5: return (&gotoblas_LA464); | |||||
} | } | ||||
snprintf(message, 128, "Core not found: %s\n", coretype); | snprintf(message, 128, "Core not found: %s\n", coretype); | ||||
openblas_warning(1, message); | openblas_warning(1, message); | ||||
return NULL; | return NULL; | ||||
} | } | ||||
#define LA_HWCAP_LSX (1U << 4) | |||||
#define LA_HWCAP_LASX (1U << 5) | |||||
static gotoblas_t *get_coretype(void) { | |||||
int hwcap = (int)getauxval(AT_HWCAP); | |||||
/* Detect whether the OS supports the LASX instruction set */ | |||||
static int os_support_lasx() { | |||||
int hwcap = (int)getauxval(AT_HWCAP); | |||||
if (hwcap & LA_HWCAP_LASX) | if (hwcap & LA_HWCAP_LASX) | ||||
return &gotoblas_LOONGSON3R5; | |||||
else if (hwcap & LA_HWCAP_LSX) | |||||
return &gotoblas_LOONGSON2K1000; | |||||
return 1; | |||||
else | |||||
return 0; | |||||
} | |||||
/* Detect whether the OS supports the LSX instruction set */ | |||||
static int os_support_lsx() { | |||||
int hwcap = (int)getauxval(AT_HWCAP); | |||||
if (hwcap & LA_HWCAP_LSX) | |||||
return 1; | |||||
else | else | ||||
return &gotoblas_LOONGSONGENERIC; | |||||
return 0; | |||||
} | |||||
static uint32_t get_prid() { | |||||
uint32_t reg = 0; | |||||
__asm__ volatile ( | |||||
"cpucfg %0, %1 \n\t" | |||||
: "+&r"(reg) | |||||
: "r"(LOONGARCH_CFG0) | |||||
); | |||||
return reg; | |||||
} | |||||
/* Select core at runtime based on the | |||||
* cpu name and SIMD instructions supported | |||||
* by the system | |||||
*/ | |||||
static gotoblas_t *get_coretype(void) { | |||||
uint32_t prid = get_prid(); | |||||
switch (prid & PRID_SERIES_MASK) { | |||||
case (PRID_SERIES_LA464): | |||||
case (PRID_SERIES_LA664): | |||||
if (os_support_lasx()) | |||||
return &gotoblas_LA464; | |||||
else if (os_support_lsx()) | |||||
return &gotoblas_LA264; | |||||
else | |||||
return &gotoblas_LA64_GENERIC; | |||||
break; | |||||
case (PRID_SERIES_LA264): | |||||
case (PRID_SERIES_LA364): | |||||
if (os_support_lsx()) | |||||
return &gotoblas_LA264; | |||||
else | |||||
return &gotoblas_LA64_GENERIC; | |||||
break; | |||||
default: | |||||
return &gotoblas_LA64_GENERIC; | |||||
break; | |||||
} | |||||
} | } | ||||
void gotoblas_dynamic_init(void) { | void gotoblas_dynamic_init(void) { | ||||
@@ -752,7 +752,7 @@ int get_L3_size() { | |||||
} | } | ||||
void blas_set_parameter(void){ | void blas_set_parameter(void){ | ||||
#if defined(LOONGSON3R5) | |||||
#if defined(LA464) | |||||
int L3_size = get_L3_size(); | int L3_size = get_L3_size(); | ||||
#ifdef SMP | #ifdef SMP | ||||
if(blas_num_threads == 1){ | if(blas_num_threads == 1){ | ||||
@@ -135,11 +135,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
/* #define FORCE_CELL */ | /* #define FORCE_CELL */ | ||||
/* #define FORCE_MIPS64_GENERIC */ | /* #define FORCE_MIPS64_GENERIC */ | ||||
/* #define FORCE_SICORTEX */ | /* #define FORCE_SICORTEX */ | ||||
/* #define FORCE_LOONGSON3R3 */ | |||||
/* #define FORCE_LOONGSON3R4 */ | |||||
/* #define FORCE_LOONGSON3R3 */ | |||||
/* #define FORCE_LOONGSON3R4 */ | |||||
/* #define FORCE_LOONGSON3R5 */ | /* #define FORCE_LOONGSON3R5 */ | ||||
/* #define FORCE_LOONGSON2K1000 */ | /* #define FORCE_LOONGSON2K1000 */ | ||||
/* #define FORCE_LOONGSONGENERIC */ | /* #define FORCE_LOONGSONGENERIC */ | ||||
/* #define FORCE_LA64_GENERIC */ | |||||
/* #define FORCE_LA264 */ | |||||
/* #define FORCE_LA464 */ | |||||
/* #define FORCE_I6400 */ | /* #define FORCE_I6400 */ | ||||
/* #define FORCE_P6600 */ | /* #define FORCE_P6600 */ | ||||
/* #define FORCE_P5600 */ | /* #define FORCE_P5600 */ | ||||
@@ -153,7 +156,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
/* #define FORCE_EV5 */ | /* #define FORCE_EV5 */ | ||||
/* #define FORCE_EV6 */ | /* #define FORCE_EV6 */ | ||||
/* #define FORCE_CSKY */ | /* #define FORCE_CSKY */ | ||||
/* #define FORCE_CK860FV */ | |||||
/* #define FORCE_CK860FV */ | |||||
/* #define FORCE_GENERIC */ | /* #define FORCE_GENERIC */ | ||||
#ifdef FORCE_P2 | #ifdef FORCE_P2 | ||||
@@ -979,46 +982,76 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#else | #else | ||||
#endif | #endif | ||||
#ifdef FORCE_LOONGSON3R5 | |||||
#if defined(FORCE_LA464) || defined(FORCE_LOONGSON3R5) | |||||
#define FORCE | #define FORCE | ||||
#define ARCHITECTURE "LOONGARCH" | #define ARCHITECTURE "LOONGARCH" | ||||
#define SUBARCHITECTURE "LOONGSON3R5" | |||||
#ifdef NO_LASX | |||||
#ifdef NO_LSX | |||||
#define SUBARCHITECTURE "LA64_GENERIC" | |||||
#define SUBDIRNAME "loongarch64" | #define SUBDIRNAME "loongarch64" | ||||
#define ARCHCONFIG "-DLOONGSON3R5 " \ | |||||
#define ARCHCONFIG "-DLA64_GENERIC " \ | |||||
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ | "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ | ||||
"-DL2_SIZE=1048576 -DL2_LINESIZE=64 " \ | |||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 -DHAVE_MSA" | |||||
#define LIBNAME "loongson3r5" | |||||
#define CORENAME "LOONGSON3R5" | |||||
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | |||||
"-DDTB_DEFAULT_ENTRIES=64 " | |||||
#define LIBNAME "la64_generic" | |||||
#define CORENAME "LA64_GENERIC" | |||||
#else | #else | ||||
#define SUBARCHITECTURE "LA264" | |||||
#define SUBDIRNAME "loongarch64" | |||||
#define ARCHCONFIG "-DLA264 " \ | |||||
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ | |||||
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | |||||
"-DDTB_DEFAULT_ENTRIES=64 " | |||||
#define LIBNAME "la264" | |||||
#define CORENAME "LA264" | |||||
#endif | |||||
#else | |||||
#define SUBARCHITECTURE "LA464" | |||||
#define SUBDIRNAME "loongarch64" | |||||
#define ARCHCONFIG "-DLA464 " \ | |||||
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ | |||||
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | |||||
"-DDTB_DEFAULT_ENTRIES=64 " | |||||
#define LIBNAME "la464" | |||||
#define CORENAME "LA464" | |||||
#endif | |||||
#endif | #endif | ||||
#ifdef FORCE_LOONGSON2K1000 | |||||
#if defined(FORCE_LA264) || defined(FORCE_LOONGSON2K1000) | |||||
#define FORCE | #define FORCE | ||||
#define ARCHITECTURE "LOONGARCH" | #define ARCHITECTURE "LOONGARCH" | ||||
#define SUBARCHITECTURE "LOONGSON2K1000" | |||||
#ifdef NO_LSX | |||||
#define SUBARCHITECTURE "LA64_GENERIC" | |||||
#define SUBDIRNAME "loongarch64" | #define SUBDIRNAME "loongarch64" | ||||
#define ARCHCONFIG "-DLOONGSON2K1000 " \ | |||||
#define ARCHCONFIG "-DLA64_GENERIC " \ | |||||
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ | "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ | ||||
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | ||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 -DHAVE_MSA" | |||||
#define LIBNAME "loongson2k1000" | |||||
#define CORENAME "LOONGSON2K1000" | |||||
"-DDTB_DEFAULT_ENTRIES=64 " | |||||
#define LIBNAME "la64_generic" | |||||
#define CORENAME "LA64_GENERIC" | |||||
#else | #else | ||||
#define SUBARCHITECTURE "LA264" | |||||
#define SUBDIRNAME "loongarch64" | |||||
#define ARCHCONFIG "-DLA264 " \ | |||||
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ | |||||
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | |||||
"-DDTB_DEFAULT_ENTRIES=64 " | |||||
#define LIBNAME "la264" | |||||
#define CORENAME "LA264" | |||||
#endif | |||||
#endif | #endif | ||||
#ifdef FORCE_LOONGSONGENERIC | |||||
#if defined(FORCE_LA64_GENERIC) || defined(FORCE_LOONGSONGENERIC) | |||||
#define FORCE | #define FORCE | ||||
#define ARCHITECTURE "LOONGARCH" | #define ARCHITECTURE "LOONGARCH" | ||||
#define SUBARCHITECTURE "LOONGSONGENERIC" | |||||
#define SUBARCHITECTURE "LA64_GENERIC" | |||||
#define SUBDIRNAME "loongarch64" | #define SUBDIRNAME "loongarch64" | ||||
#define ARCHCONFIG "-DLOONGSONGENERIC " \ | |||||
#define ARCHCONFIG "-DLA64_GENERIC " \ | |||||
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ | "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ | ||||
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | ||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 -DHAVE_MSA" | |||||
#define LIBNAME "loongsongeneric" | |||||
#define CORENAME "LOONGSONGENERIC" | |||||
#else | |||||
"-DDTB_DEFAULT_ENTRIES=64 " | |||||
#define LIBNAME "la64_generic" | |||||
#define CORENAME "LA64_GENERIC" | |||||
#endif | #endif | ||||
#ifdef FORCE_I6400 | #ifdef FORCE_I6400 | ||||
@@ -572,7 +572,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS | |||||
buffer = (XFLOAT *)blas_memory_alloc(0); | buffer = (XFLOAT *)blas_memory_alloc(0); | ||||
//For target LOONGSON3R5, applying an offset to the buffer is essential | |||||
//For LOONGARCH64, applying an offset to the buffer is essential | |||||
//for minimizing cache conflicts and optimizing performance. | //for minimizing cache conflicts and optimizing performance. | ||||
#if defined(ARCH_LOONGARCH64) && !defined(NO_AFFINITY) | #if defined(ARCH_LOONGARCH64) && !defined(NO_AFFINITY) | ||||
sa = (XFLOAT *)((BLASLONG)buffer + (WhereAmI() & 0xf) * GEMM_OFFSET_A); | sa = (XFLOAT *)((BLASLONG)buffer + (WhereAmI() & 0xf) * GEMM_OFFSET_A); | ||||
@@ -1086,7 +1086,7 @@ static void init_parameter(void) { | |||||
TABLE_NAME.sbgemm_r = SBGEMM_DEFAULT_R; | TABLE_NAME.sbgemm_r = SBGEMM_DEFAULT_R; | ||||
#endif | #endif | ||||
#if defined(LOONGSON3R5) | |||||
#if defined(LA464) | |||||
int L3_size = get_L3_size(); | int L3_size = get_L3_size(); | ||||
#ifdef SMP | #ifdef SMP | ||||
if(blas_num_threads == 1){ | if(blas_num_threads == 1){ | ||||
@@ -2838,7 +2838,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#define SYMV_P 16 | #define SYMV_P 16 | ||||
#endif | #endif | ||||
#if defined (LOONGSON3R5) | |||||
#if defined (LA464) | |||||
#define SNUMOPT 2 | #define SNUMOPT 2 | ||||
#define DNUMOPT 2 | #define DNUMOPT 2 | ||||
@@ -2891,7 +2891,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#define SYMV_P 16 | #define SYMV_P 16 | ||||
#endif | #endif | ||||
#ifdef LOONGSON2K1000 | |||||
#ifdef LA264 | |||||
#define GEMM_DEFAULT_OFFSET_A 0 | #define GEMM_DEFAULT_OFFSET_A 0 | ||||
#define GEMM_DEFAULT_OFFSET_B 0 | #define GEMM_DEFAULT_OFFSET_B 0 | ||||
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL | #define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL | ||||
@@ -2926,7 +2926,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#define SYMV_P 16 | #define SYMV_P 16 | ||||
#endif | #endif | ||||
#ifdef LOONGSONGENERIC | |||||
#ifdef LA64_GENERIC | |||||
#define GEMM_DEFAULT_OFFSET_A 0 | #define GEMM_DEFAULT_OFFSET_A 0 | ||||
#define GEMM_DEFAULT_OFFSET_B 0 | #define GEMM_DEFAULT_OFFSET_B 0 | ||||
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL | #define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL | ||||