Browse Source

LoongArch64: Rename core

Use microarchitecture name instead of meaningless strings to name the core,
the legacy core is still retained.
1. Rename LOONGSONGENERIC to LA64_GENERIC
2. Rename LOONGSON3R5 to LA464
3. Rename LOONGSON2K1000 to LA264
tags/v0.3.29
gxw 1 year ago
parent
commit
48698b2b1d
14 changed files with 506 additions and 124 deletions
  1. +9
    -0
      .github/workflows/loongarch64.yml
  2. +6
    -0
      .github/workflows/loongarch64_clang.yml
  3. +1
    -1
      Makefile.system
  4. +9
    -1
      TargetList.txt
  5. +335
    -71
      cpuid_loongarch64.c
  6. +1
    -1
      driver/others/blas_server.c
  7. +83
    -21
      driver/others/dynamic_loongarch64.c
  8. +1
    -1
      driver/others/parameter.c
  9. +56
    -23
      getarch.c
  10. +1
    -1
      interface/gemm.c
  11. +0
    -0
      kernel/loongarch64/KERNEL.LA264
  12. +0
    -0
      kernel/loongarch64/KERNEL.LA464
  13. +1
    -1
      kernel/setparam-ref.c
  14. +3
    -3
      param.h

+ 9
- 0
.github/workflows/loongarch64.yml View File

@@ -23,6 +23,15 @@ jobs:
- target: LOONGSON2K1000
triple: loongarch64-unknown-linux-gnu
opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON2K1000
- target: LA64_GENERIC
triple: loongarch64-unknown-linux-gnu
opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LA64_GENERIC
- target: LA464
triple: loongarch64-unknown-linux-gnu
opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LA464
- target: LA264
triple: loongarch64-unknown-linux-gnu
opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LA264
- target: DYNAMIC_ARCH
triple: loongarch64-unknown-linux-gnu
opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=GENERIC


+ 6
- 0
.github/workflows/loongarch64_clang.yml View File

@@ -20,6 +20,12 @@ jobs:
opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON3R5
- target: LOONGSON2K1000
opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON2K1000
- target: LA64_GENERIC
opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LA64_GENERIC
- target: LA464
opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LA464
- target: LA264
opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LA264
- target: DYNAMIC_ARCH
opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=GENERIC



+ 1
- 1
Makefile.system View File

@@ -727,7 +727,7 @@ endif
endif

ifeq ($(ARCH), loongarch64)
DYNAMIC_CORE = LOONGSON3R5 LOONGSON2K1000 LOONGSONGENERIC
DYNAMIC_CORE = LA64_GENERIC LA264 LA464
endif

ifeq ($(ARCH), riscv64)


+ 9
- 1
TargetList.txt View File

@@ -126,9 +126,17 @@ x280
RISCV64_ZVL256B

11.LOONGARCH64:
// LOONGSONGENERIC/LOONGSON2K1000/LOONGSON3R5 are legacy names,
// and it is recommended to use the more standardized naming conventions
// LA64_GENERIC/LA264/LA464. You can still specify TARGET as
// LOONGSONGENERIC/LOONGSON2K1000/LOONGSON3R5 during compilation or runtime,
// and they will be internally relocated to LA64_GENERIC/LA264/LA464.
LOONGSONGENERIC
LOONGSON3R5
LOONGSON2K1000
LOONGSON3R5
LA64_GENERIC
LA264
LA464

12. Elbrus E2000:
E2K


+ 335
- 71
cpuid_loongarch64.c View File

@@ -1,5 +1,5 @@
/*****************************************************************************
Copyright (c) 2011-2020, The OpenBLAS Project
Copyright (c) 2011-2024, The OpenBLAS Project
All rights reserved.

Redistribution and use in source and binary forms, with or without
@@ -32,53 +32,299 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/

#include <stdint.h>
#include <sys/auxv.h>
#include <stdio.h>
#include <math.h>
#include <string.h>
#include <sys/auxv.h>

/* If LASX extension instructions supported,
* using core LOONGSON3R5
* If only LSX extension instructions supported,
* using core LOONGSON2K1000
* If neither LASX nor LSX extension instructions supported,
* using core LOONGSONGENERIC (As far as I know, there is no such
* CPU yet)
*/
#define CPU_LA64_GENERIC 0
#define CPU_LA264 1
#define CPU_LA364 2
#define CPU_LA464 3
#define CPU_LA664 4

#define CPU_GENERIC 0
#define CPU_LOONGSON3R5 1
#define CPU_LOONGSON2K1000 2
#define CORE_LA64_GENERIC 0
#define CORE_LA264 1
#define CORE_LA464 2

#define LA_HWCAP_LSX (1U << 4)
#define LA_HWCAP_LASX (1U << 5)

#define LOONGARCH_CFG0 0x00
#define LOONGARCH_CFG2 0x02
#define LOONGARCH_CFG10 0x10
#define LOONGARCH_CFG11 0x11
#define LOONGARCH_CFG12 0x12
#define LOONGARCH_CFG13 0x13
#define LOONGARCH_CFG14 0x14
#define LASX_MASK 1<<7
#define LSX_MASK 1<<6
#define PRID_SERIES_MASK 0xf000
#define PRID_SERIES_LA264 0xa000
#define PRID_SERIES_LA364 0xb000
#define PRID_SERIES_LA464 0xc000
#define PRID_SERIES_LA664 0xd000

#define CACHE_INFO_L1_IU 0
#define CACHE_INFO_L1_D 1
#define CACHE_INFO_L2_IU 2
#define CACHE_INFO_L2_D 3
#define CACHE_INFO_L3_IU 4
#define CACHE_INFO_L3_D 5
#define L1_IU_PRESENT_MASK 0x0001
#define L1_IU_UNITY_MASK 0x0002
#define L1_D_PRESENT_MASK 0x0004
#define L2_IU_PRESENT_MASK 0x0008
#define L2_IU_UNITY_MASK 0x0010
#define L2_D_PRESENT_MASK 0x0080
#define L3_IU_PRESENT_MASK 0x0400
#define L3_IU_UNITY_MASK 0x0800
#define L3_D_PRESENT_MASK 0x4000
#define CACHE_WAY_MINUS_1_MASK 0x0000ffff
#define CACHE_INDEX_LOG2_MASK 0x00ff0000
#define CACHE_LINESIZE_LOG2_MASK 0x7f000000

typedef struct {
int size;
int associative;
int linesize;
int unify;
int present;
} cache_info_t;

/* Using microarchitecture representation */
static char *cpuname[] = {
"LOONGSONGENERIC",
"LOONGSON3R5",
"LOONGSON2K1000"
"LA64_GENERIC",
"LA264", /* Loongson 64bit, 2-issue, Like 2K1000LA */
"LA364", /* Loongson 64bit, 3-issue, Like 2K2000 */
"LA464", /* Loongson 64bit, 4-issue, Like 3A5000, 3C5000L, 3C5000 and 3D5000 */
"LA664" /* Loongson 64bit, 6-issue, Like 3A6000, 3C6000 and 3D6000 */
};

static char *cpuname_lower[] = {
"loongsongeneric",
"loongson3r5",
"loongson2k1000"
"la64_generic",
"la264",
"la364",
"la464",
"la664"
};

static char *corename[] = {
"LA64_GENERIC", /* Implies using scalar instructions for optimization */
"LA264", /* Implies using LSX instructions for optimization */
"LA464", /* Implies using LASX instructions for optimization */
};

static char *corename_lower[] = {
"la64_generic",
"la264",
"la464",
};

int detect(void) {
#ifdef __linux
/*
* Obtain cache and processor identification
* through the cpucfg command.
*/
static void get_cacheinfo(int type, cache_info_t *cacheinfo) {
cache_info_t cache_info;
memset(&cache_info, 0, sizeof(cache_info));
uint32_t reg_10 = 0;
__asm__ volatile (
"cpucfg %0, %1 \n\t"
: "+&r"(reg_10)
: "r"(LOONGARCH_CFG10)
);

switch (type) {
case CACHE_INFO_L1_IU:
if (reg_10 & L1_IU_PRESENT_MASK) {
uint32_t reg_11 = 0;
cache_info.present = reg_10 & L1_IU_PRESENT_MASK;
cache_info.unify = reg_10 & L1_IU_UNITY_MASK;
__asm__ volatile (
"cpucfg %0, %1 \n\t"
: "+&r"(reg_11)
: "r"(LOONGARCH_CFG11)
);
cache_info.associative = (reg_11 & CACHE_WAY_MINUS_1_MASK) + 1;
cache_info.linesize = 1 << ((reg_11 & CACHE_LINESIZE_LOG2_MASK) >> 24);
cache_info.size = cache_info.associative * cache_info.linesize *
(1 << ((reg_11 & CACHE_INDEX_LOG2_MASK) >> 16));
}
break;

case CACHE_INFO_L1_D:
if (reg_10 & L1_D_PRESENT_MASK) {
uint32_t reg_12 = 0;
cache_info.present = reg_10 & L1_D_PRESENT_MASK;
__asm__ volatile (
"cpucfg %0, %1 \n\t"
: "+&r"(reg_12)
: "r"(LOONGARCH_CFG12)
);
cache_info.associative = (reg_12 & CACHE_WAY_MINUS_1_MASK) + 1;
cache_info.linesize = 1 << ((reg_12 & CACHE_LINESIZE_LOG2_MASK) >> 24);
cache_info.size = cache_info.associative * cache_info.linesize *
(1 << ((reg_12 & CACHE_INDEX_LOG2_MASK) >> 16));
}
break;

case CACHE_INFO_L2_IU:
if (reg_10 & L2_IU_PRESENT_MASK) {
uint32_t reg_13 = 0;
cache_info.present = reg_10 & L2_IU_PRESENT_MASK;
cache_info.unify = reg_10 & L2_IU_UNITY_MASK;
__asm__ volatile (
"cpucfg %0, %1 \n\t"
: "+&r"(reg_13)
: "r"(LOONGARCH_CFG13)
);
cache_info.associative = (reg_13 & CACHE_WAY_MINUS_1_MASK) + 1;
cache_info.linesize = 1 << ((reg_13 & CACHE_LINESIZE_LOG2_MASK) >> 24);
cache_info.size = cache_info.associative * cache_info.linesize *
(1 << ((reg_13 & CACHE_INDEX_LOG2_MASK) >> 16));
}
break;

case CACHE_INFO_L2_D:
if (reg_10 & L2_D_PRESENT_MASK) {
cache_info.present = reg_10 & L2_D_PRESENT_MASK;
// No date fetch
}
break;

case CACHE_INFO_L3_IU:
if (reg_10 & L3_IU_PRESENT_MASK) {
uint32_t reg_14 = 0;
cache_info.present = reg_10 & L3_IU_PRESENT_MASK;
cache_info.unify = reg_10 & L3_IU_UNITY_MASK;
__asm__ volatile (
"cpucfg %0, %1 \n\t"
: "+&r"(reg_14)
: "r"(LOONGARCH_CFG14)
);
cache_info.associative = (reg_14 & CACHE_WAY_MINUS_1_MASK) + 1;
cache_info.linesize = 1 << ((reg_14 & CACHE_LINESIZE_LOG2_MASK) >> 24);
cache_info.size = cache_info.associative * cache_info.linesize *
(1 << ((reg_14 & CACHE_INDEX_LOG2_MASK) >> 16));
}
break;

case CACHE_INFO_L3_D:
if (reg_10 & L3_D_PRESENT_MASK) {
cache_info.present = reg_10 & L3_D_PRESENT_MASK;
// No data fetch
}
break;

default:
break;
}
*cacheinfo = cache_info;
}

static uint32_t get_prid() {
uint32_t reg = 0;
__asm__ volatile (
"cpucfg %0, %1 \n\t"
: "+&r"(reg)
: "r"(LOONGARCH_CFG0)
);
return reg;
}

static void get_cpucount(uint32_t *count) {
uint32_t num = 0;
FILE *f = fopen("/proc/cpuinfo", "r");
if (!f) return;
char buf[200];
while (fgets(buf, sizeof(buf), f))
{
if (!strncmp("processor", buf, 9))
num ++;
}
fclose(f);
*count = num;
}

/* Detect whether the OS supports the LASX instruction set */
static int os_support_lasx() {
int hwcap = (int)getauxval(AT_HWCAP);

if (hwcap & LA_HWCAP_LASX)
return CPU_LOONGSON3R5;
else if (hwcap & LA_HWCAP_LSX)
return CPU_LOONGSON2K1000;
return 1;
else
return 0;
}

/* Detect whether the OS supports the LSX instruction set */
static int os_support_lsx() {
int hwcap = (int)getauxval(AT_HWCAP);

if (hwcap & LA_HWCAP_LSX)
return 1;
else
return CPU_GENERIC;
#endif
return CPU_GENERIC;
return 0;
}

int get_coretype(void) {
uint32_t prid = get_prid();
switch (prid & PRID_SERIES_MASK) {
case (PRID_SERIES_LA464):
case (PRID_SERIES_LA664):
if (os_support_lasx())
return CORE_LA464;
else if (os_support_lsx())
return CORE_LA264;
else
return CORE_LA64_GENERIC;
break;

case (PRID_SERIES_LA264):
case (PRID_SERIES_LA364):
if (os_support_lsx())
return CORE_LA264;
else
return CORE_LA64_GENERIC;
break;

default:
return CORE_LA64_GENERIC;
break;
}
}

int get_cputype(void) {
uint32_t prid = get_prid();
switch (prid & PRID_SERIES_MASK) {
case (PRID_SERIES_LA264):
return CPU_LA264;
break;

case (PRID_SERIES_LA364):
return CPU_LA364;
break;

case (PRID_SERIES_LA464):
return CPU_LA464;
break;

case (PRID_SERIES_LA664):
return CPU_LA664;
break;

default:
return CPU_LA64_GENERIC;
break;
}
}

char *get_corename(void) {
return cpuname[detect()];
return corename[get_coretype()];
}

void get_libname(void){
printf("%s", corename_lower[get_coretype()]);
}

void get_architecture(void) {
@@ -86,8 +332,7 @@ void get_architecture(void) {
}

void get_subarchitecture(void) {
int d = detect();
printf("%s", cpuname[d]);
printf("%s", cpuname[get_cputype()]);
}

void get_subdirname(void) {
@@ -95,50 +340,69 @@ void get_subdirname(void) {
}

void get_cpuconfig(void) {
uint32_t hwcaps = 0;
int d = detect();

switch (d) {
case CPU_LOONGSON3R5:
printf("#define LOONGSON3R5\n");
printf("#define L1_DATA_SIZE 65536\n");
printf("#define L1_DATA_LINESIZE 64\n");
printf("#define L2_SIZE 1048576\n");
printf("#define L2_LINESIZE 64\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
printf("#define L2_ASSOCIATIVE 16\n");
break;
cache_info_t info;
uint32_t num_cores = 0;

case CPU_LOONGSON2K1000:
printf("#define LOONGSON2K1000\n");
printf("#define L1_DATA_SIZE 65536\n");
printf("#define L1_DATA_LINESIZE 64\n");
printf("#define L2_SIZE 262144\n");
printf("#define L2_LINESIZE 64\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
printf("#define L2_ASSOCIATIVE 16\n");
break;
printf("#define %s\n", corename[get_coretype()]); // Core name

default:
printf("#define LOONGSONGENERIC\n");
printf("#define L1_DATA_SIZE 65536\n");
printf("#define L1_DATA_LINESIZE 64\n");
printf("#define L2_SIZE 262144\n");
printf("#define L2_LINESIZE 64\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
printf("#define L2_ASSOCIATIVE 16\n");
break;
printf("#define CPU_NAME %s\n", cpuname[get_cputype()]); // Cpu microarchitecture name

get_cacheinfo(CACHE_INFO_L1_IU, &info);
if (info.present) {
if (info.unify) { // Unified cache, without distinguishing between instructions and data
printf("#define L1_SIZE %d\n", info.size);
printf("#define L1_ASSOCIATIVE %d\n", info.associative);
printf("#define L1_LINESIZE %d\n", info.linesize);
} else {
printf("#define L1_CODE_SIZE %d\n", info.size);
printf("#define L1_CODE_ASSOCIATIVE %d\n", info.associative);
printf("#define L1_CODE_LINESIZE %d\n", info.linesize);
}
}

hwcaps = (uint32_t)getauxval( AT_HWCAP );
if (hwcaps & LA_HWCAP_LSX) printf("#define HAVE_LSX\n");
if (hwcaps & LA_HWCAP_LASX) printf("#define HAVE_LASX\n");
}
if (!info.unify) {
get_cacheinfo(CACHE_INFO_L1_D, &info);
if (info.present) {
printf("#define L1_DATA_SIZE %d\n", info.size);
printf("#define L1_DATA_ASSOCIATIVE %d\n", info.associative);
printf("#define L1_DATA_LINESIZE %d\n", info.linesize);
}
}

void get_libname(void){
int d = detect();
printf("%s", cpuname_lower[d]);
get_cacheinfo(CACHE_INFO_L2_IU, &info);
if (info.present > 0) {
if (info.unify) {
printf("#define L2_SIZE %d\n", info.size);
printf("#define L2_ASSOCIATIVE %d\n", info.associative);
printf("#define L2_LINESIZE %d\n", info.linesize);
} else {
printf("#define L2_CODE_SIZE %d\n", info.size);
printf("#define L2_CODE_ASSOCIATIVE %d\n", info.associative);
printf("#define L2_CODE_LINESIZE %d\n", info.linesize);
}
}

get_cacheinfo(CACHE_INFO_L3_IU, &info);
if (info.present > 0) {
if (info.unify) {
printf("#define L3_SIZE %d\n", info.size);
printf("#define L3_ASSOCIATIVE %d\n", info.associative);
printf("#define L3_LINESIZE %d\n", info.linesize);
} else {
printf("#define L3_CODE_SIZE %d\n", info.size);
printf("#define L3_CODE_ASSOCIATIVE %d\n", info.associative);
printf("#define L3_CODE_LINESIZE %d\n", info.linesize);
}
}

if(os_support_lsx) printf("#define HAVE_LSX\n");
if(os_support_lasx) printf("#define HAVE_LASX\n");

get_cpucount(&num_cores);
if (num_cores)
printf("#define NUM_CORES %d\n", num_cores);

//TODO: It’s unclear what this entry represents, but it is indeed necessary.
//It has been set based on reference to other platforms.
printf("#define DTB_DEFAULT_ENTRIES 64\n");
}

+ 1
- 1
driver/others/blas_server.c View File

@@ -1082,7 +1082,7 @@ if (buffer == NULL) {
}

//For target LOONGSON3R5, applying an offset to the buffer is essential
//For LOONGARCH64, applying an offset to the buffer is essential
//for minimizing cache conflicts and optimizing performance.
#if defined(ARCH_LOONGARCH64) && !defined(NO_AFFINITY)
if (sa == NULL) sa = (void *)((BLASLONG)buffer + (WhereAmI() & 0xf) * GEMM_OFFSET_A);


+ 83
- 21
driver/others/dynamic_loongarch64.c View File

@@ -28,25 +28,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <sys/auxv.h>
#include "common.h"

extern gotoblas_t gotoblas_LOONGSON3R5;
extern gotoblas_t gotoblas_LOONGSON2K1000;
extern gotoblas_t gotoblas_LOONGSONGENERIC;
#define NUM_CORETYPES 6
#define LOONGARCH_CFG0 0x00
#define LA_HWCAP_LSX (1U << 4)
#define LA_HWCAP_LASX (1U << 5)
#define PRID_SERIES_MASK 0xf000
#define PRID_SERIES_LA264 0xa000
#define PRID_SERIES_LA364 0xb000
#define PRID_SERIES_LA464 0xc000
#define PRID_SERIES_LA664 0xd000

extern gotoblas_t gotoblas_LA64_GENERIC;
extern gotoblas_t gotoblas_LA264;
extern gotoblas_t gotoblas_LA464;

extern void openblas_warning(int verbose, const char * msg);

#define NUM_CORETYPES 3

static char *corename[] = {
"loongson3r5",
"loongson2k1000",
"la64_generic",
"la264",
"la464",
"loongsongeneric",
"loongson2k1000",
"loongson3r5",
"unknown"
};

char *gotoblas_corename(void) {
if (gotoblas == &gotoblas_LOONGSON3R5) return corename[0];
if (gotoblas == &gotoblas_LOONGSON2K1000) return corename[1];
if (gotoblas == &gotoblas_LOONGSONGENERIC) return corename[2];
if (gotoblas == &gotoblas_LA64_GENERIC) return corename[0];
if (gotoblas == &gotoblas_LA264) return corename[1];
if (gotoblas == &gotoblas_LA464) return corename[2];
return corename[NUM_CORETYPES];
}

@@ -66,27 +77,78 @@ static gotoblas_t *force_coretype(char *coretype) {

switch (found)
{
case 0: return (&gotoblas_LOONGSON3R5);
case 1: return (&gotoblas_LOONGSON2K1000);
case 2: return (&gotoblas_LOONGSONGENERIC);
case 0: return (&gotoblas_LA64_GENERIC);
case 1: return (&gotoblas_LA264);
case 2: return (&gotoblas_LA464);
case 3: return (&gotoblas_LA64_GENERIC);
case 4: return (&gotoblas_LA264);
case 5: return (&gotoblas_LA464);
}
snprintf(message, 128, "Core not found: %s\n", coretype);
openblas_warning(1, message);
return NULL;
}

#define LA_HWCAP_LSX (1U << 4)
#define LA_HWCAP_LASX (1U << 5)

static gotoblas_t *get_coretype(void) {
int hwcap = (int)getauxval(AT_HWCAP);
/* Detect whether the OS supports the LASX instruction set */
static int os_support_lasx() {
int hwcap = (int)getauxval(AT_HWCAP);

if (hwcap & LA_HWCAP_LASX)
return &gotoblas_LOONGSON3R5;
else if (hwcap & LA_HWCAP_LSX)
return &gotoblas_LOONGSON2K1000;
return 1;
else
return 0;
}

/* Detect whether the OS supports the LSX instruction set */
static int os_support_lsx() {
int hwcap = (int)getauxval(AT_HWCAP);

if (hwcap & LA_HWCAP_LSX)
return 1;
else
return &gotoblas_LOONGSONGENERIC;
return 0;
}

static uint32_t get_prid() {
uint32_t reg = 0;
__asm__ volatile (
"cpucfg %0, %1 \n\t"
: "+&r"(reg)
: "r"(LOONGARCH_CFG0)
);
return reg;
}

/* Select core at runtime based on the
* cpu name and SIMD instructions supported
* by the system
*/
static gotoblas_t *get_coretype(void) {
uint32_t prid = get_prid();
switch (prid & PRID_SERIES_MASK) {
case (PRID_SERIES_LA464):
case (PRID_SERIES_LA664):
if (os_support_lasx())
return &gotoblas_LA464;
else if (os_support_lsx())
return &gotoblas_LA264;
else
return &gotoblas_LA64_GENERIC;
break;

case (PRID_SERIES_LA264):
case (PRID_SERIES_LA364):
if (os_support_lsx())
return &gotoblas_LA264;
else
return &gotoblas_LA64_GENERIC;
break;

default:
return &gotoblas_LA64_GENERIC;
break;
}
}

void gotoblas_dynamic_init(void) {


+ 1
- 1
driver/others/parameter.c View File

@@ -752,7 +752,7 @@ int get_L3_size() {
}

void blas_set_parameter(void){
#if defined(LOONGSON3R5)
#if defined(LA464)
int L3_size = get_L3_size();
#ifdef SMP
if(blas_num_threads == 1){


+ 56
- 23
getarch.c View File

@@ -135,11 +135,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/* #define FORCE_CELL */
/* #define FORCE_MIPS64_GENERIC */
/* #define FORCE_SICORTEX */
/* #define FORCE_LOONGSON3R3 */
/* #define FORCE_LOONGSON3R4 */
/* #define FORCE_LOONGSON3R3 */
/* #define FORCE_LOONGSON3R4 */
/* #define FORCE_LOONGSON3R5 */
/* #define FORCE_LOONGSON2K1000 */
/* #define FORCE_LOONGSONGENERIC */
/* #define FORCE_LA64_GENERIC */
/* #define FORCE_LA264 */
/* #define FORCE_LA464 */
/* #define FORCE_I6400 */
/* #define FORCE_P6600 */
/* #define FORCE_P5600 */
@@ -153,7 +156,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/* #define FORCE_EV5 */
/* #define FORCE_EV6 */
/* #define FORCE_CSKY */
/* #define FORCE_CK860FV */
/* #define FORCE_CK860FV */
/* #define FORCE_GENERIC */

#ifdef FORCE_P2
@@ -979,46 +982,76 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#else
#endif

#ifdef FORCE_LOONGSON3R5
#if defined(FORCE_LA464) || defined(FORCE_LOONGSON3R5)
#define FORCE
#define ARCHITECTURE "LOONGARCH"
#define SUBARCHITECTURE "LOONGSON3R5"
#ifdef NO_LASX
#ifdef NO_LSX
#define SUBARCHITECTURE "LA64_GENERIC"
#define SUBDIRNAME "loongarch64"
#define ARCHCONFIG "-DLOONGSON3R5 " \
#define ARCHCONFIG "-DLA64_GENERIC " \
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=1048576 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 -DHAVE_MSA"
#define LIBNAME "loongson3r5"
#define CORENAME "LOONGSON3R5"
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 "
#define LIBNAME "la64_generic"
#define CORENAME "LA64_GENERIC"
#else
#define SUBARCHITECTURE "LA264"
#define SUBDIRNAME "loongarch64"
#define ARCHCONFIG "-DLA264 " \
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 "
#define LIBNAME "la264"
#define CORENAME "LA264"
#endif
#else
#define SUBARCHITECTURE "LA464"
#define SUBDIRNAME "loongarch64"
#define ARCHCONFIG "-DLA464 " \
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 "
#define LIBNAME "la464"
#define CORENAME "LA464"
#endif
#endif

#ifdef FORCE_LOONGSON2K1000
#if defined(FORCE_LA264) || defined(FORCE_LOONGSON2K1000)
#define FORCE
#define ARCHITECTURE "LOONGARCH"
#define SUBARCHITECTURE "LOONGSON2K1000"
#ifdef NO_LSX
#define SUBARCHITECTURE "LA64_GENERIC"
#define SUBDIRNAME "loongarch64"
#define ARCHCONFIG "-DLOONGSON2K1000 " \
#define ARCHCONFIG "-DLA64_GENERIC " \
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 -DHAVE_MSA"
#define LIBNAME "loongson2k1000"
#define CORENAME "LOONGSON2K1000"
"-DDTB_DEFAULT_ENTRIES=64 "
#define LIBNAME "la64_generic"
#define CORENAME "LA64_GENERIC"
#else
#define SUBARCHITECTURE "LA264"
#define SUBDIRNAME "loongarch64"
#define ARCHCONFIG "-DLA264 " \
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 "
#define LIBNAME "la264"
#define CORENAME "LA264"
#endif
#endif

#ifdef FORCE_LOONGSONGENERIC
#if defined(FORCE_LA64_GENERIC) || defined(FORCE_LOONGSONGENERIC)
#define FORCE
#define ARCHITECTURE "LOONGARCH"
#define SUBARCHITECTURE "LOONGSONGENERIC"
#define SUBARCHITECTURE "LA64_GENERIC"
#define SUBDIRNAME "loongarch64"
#define ARCHCONFIG "-DLOONGSONGENERIC " \
#define ARCHCONFIG "-DLA64_GENERIC " \
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 -DHAVE_MSA"
#define LIBNAME "loongsongeneric"
#define CORENAME "LOONGSONGENERIC"
#else
"-DDTB_DEFAULT_ENTRIES=64 "
#define LIBNAME "la64_generic"
#define CORENAME "LA64_GENERIC"
#endif

#ifdef FORCE_I6400


+ 1
- 1
interface/gemm.c View File

@@ -572,7 +572,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS

buffer = (XFLOAT *)blas_memory_alloc(0);

//For target LOONGSON3R5, applying an offset to the buffer is essential
//For LOONGARCH64, applying an offset to the buffer is essential
//for minimizing cache conflicts and optimizing performance.
#if defined(ARCH_LOONGARCH64) && !defined(NO_AFFINITY)
sa = (XFLOAT *)((BLASLONG)buffer + (WhereAmI() & 0xf) * GEMM_OFFSET_A);


kernel/loongarch64/KERNEL.LOONGSON2K1000 → kernel/loongarch64/KERNEL.LA264 View File


kernel/loongarch64/KERNEL.LOONGSON3R5 → kernel/loongarch64/KERNEL.LA464 View File


+ 1
- 1
kernel/setparam-ref.c View File

@@ -1086,7 +1086,7 @@ static void init_parameter(void) {
TABLE_NAME.sbgemm_r = SBGEMM_DEFAULT_R;
#endif

#if defined(LOONGSON3R5)
#if defined(LA464)
int L3_size = get_L3_size();
#ifdef SMP
if(blas_num_threads == 1){


+ 3
- 3
param.h View File

@@ -2838,7 +2838,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define SYMV_P 16
#endif

#if defined (LOONGSON3R5)
#if defined (LA464)
#define SNUMOPT 2
#define DNUMOPT 2

@@ -2891,7 +2891,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define SYMV_P 16
#endif

#ifdef LOONGSON2K1000
#ifdef LA264
#define GEMM_DEFAULT_OFFSET_A 0
#define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL
@@ -2926,7 +2926,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define SYMV_P 16
#endif

#ifdef LOONGSONGENERIC
#ifdef LA64_GENERIC
#define GEMM_DEFAULT_OFFSET_A 0
#define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL


Loading…
Cancel
Save