1. Use the LOONGSON3R3 and LOONGSON3R4 cores for Loongson.
2. Add DYNAMIC_ARCH support for Loongson.
Change-Id: I1c6b54dbeca3a0cc31d1222af36a7e9bd6ab54c1
tags/v0.3.13^2
@@ -625,6 +625,10 @@ DYNAMIC_CORE += EMAG8180 | |||||
DYNAMIC_CORE += THUNDERX3T110 | DYNAMIC_CORE += THUNDERX3T110 | ||||
endif | endif | ||||
ifeq ($(ARCH), mips64) | |||||
DYNAMIC_CORE = LOONGSON3R3 LOONGSON3R4 | |||||
endif | |||||
ifeq ($(ARCH), zarch) | ifeq ($(ARCH), zarch) | ||||
DYNAMIC_CORE = ZARCH_GENERIC | DYNAMIC_CORE = ZARCH_GENERIC | ||||
@@ -787,14 +791,9 @@ CCOMMON_OPT += -mabi=32 | |||||
BINARY_DEFINED = 1 | BINARY_DEFINED = 1 | ||||
endif | endif | ||||
ifeq ($(CORE), LOONGSON3A) | |||||
CCOMMON_OPT += -march=mips64 | |||||
FCOMMON_OPT += -march=mips64 | |||||
endif | |||||
ifeq ($(CORE), LOONGSON3B) | |||||
CCOMMON_OPT += -march=mips64 | |||||
FCOMMON_OPT += -march=mips64 | |||||
ifeq ($(CORE), $(filter $(CORE),LOONGSON3R3 LOONGSON3R4)) | |||||
CCOMMON_OPT += -march=loongson3a | |||||
FCOMMON_OPT += -march=loongson3a | |||||
endif | endif | ||||
ifeq ($(CORE), MIPS24K) | ifeq ($(CORE), MIPS24K) | ||||
@@ -1078,11 +1077,11 @@ FCOMMON_OPT += -n32 | |||||
else | else | ||||
FCOMMON_OPT += -n64 | FCOMMON_OPT += -n64 | ||||
endif | endif | ||||
ifeq ($(CORE), LOONGSON3A) | |||||
ifeq ($(CORE), LOONGSON3R3) | |||||
FCOMMON_OPT += -loongson3 -static | FCOMMON_OPT += -loongson3 -static | ||||
endif | endif | ||||
ifeq ($(CORE), LOONGSON3B) | |||||
ifeq ($(CORE), LOONGSON3R4) | |||||
FCOMMON_OPT += -loongson3 -static | FCOMMON_OPT += -loongson3 -static | ||||
endif | endif | ||||
@@ -1108,11 +1107,11 @@ CCOMMON_OPT += -n32 | |||||
else | else | ||||
CCOMMON_OPT += -n64 | CCOMMON_OPT += -n64 | ||||
endif | endif | ||||
ifeq ($(CORE), LOONGSON3A) | |||||
ifeq ($(CORE), LOONGSON3R3) | |||||
CCOMMON_OPT += -loongson3 -static | CCOMMON_OPT += -loongson3 -static | ||||
endif | endif | ||||
ifeq ($(CORE), LOONGSON3B) | |||||
ifeq ($(CORE), LOONGSON3R4) | |||||
CCOMMON_OPT += -loongson3 -static | CCOMMON_OPT += -loongson3 -static | ||||
endif | endif | ||||
@@ -1223,10 +1222,8 @@ ifdef SMP | |||||
CCOMMON_OPT += -DSMP_SERVER | CCOMMON_OPT += -DSMP_SERVER | ||||
ifeq ($(ARCH), mips64) | ifeq ($(ARCH), mips64) | ||||
ifneq ($(CORE), LOONGSON3B) | |||||
USE_SIMPLE_THREADED_LEVEL3 = 1 | USE_SIMPLE_THREADED_LEVEL3 = 1 | ||||
endif | endif | ||||
endif | |||||
ifeq ($(USE_OPENMP), 1) | ifeq ($(USE_OPENMP), 1) | ||||
# USE_SIMPLE_THREADED_LEVEL3 = 1 | # USE_SIMPLE_THREADED_LEVEL3 = 1 | ||||
@@ -1342,11 +1339,9 @@ endif | |||||
ifneq ($(ARCH), x86_64) | ifneq ($(ARCH), x86_64) | ||||
ifneq ($(ARCH), x86) | ifneq ($(ARCH), x86) | ||||
ifneq ($(CORE), LOONGSON3B) | |||||
NO_AFFINITY = 1 | NO_AFFINITY = 1 | ||||
endif | endif | ||||
endif | endif | ||||
endif | |||||
ifdef NO_AFFINITY | ifdef NO_AFFINITY | ||||
ifeq ($(NO_AFFINITY), 0) | ifeq ($(NO_AFFINITY), 0) | ||||
@@ -75,18 +75,10 @@ static inline int my_mbind(void *addr, unsigned long len, int mode, | |||||
// https://lsbbugs.linuxfoundation.org/show_bug.cgi?id=3482 | // https://lsbbugs.linuxfoundation.org/show_bug.cgi?id=3482 | ||||
return 0; | return 0; | ||||
#else | #else | ||||
#if defined (LOONGSON3B) | |||||
#if defined (__64BIT__) | |||||
return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags); | |||||
#else | |||||
return 0; //NULL Implementation on Loongson 3B 32bit. | |||||
#endif | |||||
#else | |||||
//Fixed randomly SEGFAULT when nodemask==NULL with above Linux 2.6.34 | //Fixed randomly SEGFAULT when nodemask==NULL with above Linux 2.6.34 | ||||
// unsigned long null_nodemask=0; | // unsigned long null_nodemask=0; | ||||
return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags); | return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags); | ||||
#endif | #endif | ||||
#endif | |||||
} | } | ||||
static inline int my_set_mempolicy(int mode, const unsigned long *addr, unsigned long flag) { | static inline int my_set_mempolicy(int mode, const unsigned long *addr, unsigned long flag) { | ||||
@@ -229,12 +229,7 @@ REALNAME: ;\ | |||||
#define BUFFER_SIZE ( 32 << 21) | #define BUFFER_SIZE ( 32 << 21) | ||||
#if defined(LOONGSON3A) | |||||
#define PAGESIZE (16UL << 10) | |||||
#define FIXED_PAGESIZE (16UL << 10) | |||||
#endif | |||||
#if defined(LOONGSON3B) | |||||
#if defined(LOONGSON3R3) || defined(LOONGSON3R4) | |||||
#define PAGESIZE (16UL << 10) | #define PAGESIZE (16UL << 10) | ||||
#define FIXED_PAGESIZE (16UL << 10) | #define FIXED_PAGESIZE (16UL << 10) | ||||
#endif | #endif | ||||
@@ -250,7 +245,7 @@ REALNAME: ;\ | |||||
#define MAP_ANONYMOUS MAP_ANON | #define MAP_ANONYMOUS MAP_ANON | ||||
#endif | #endif | ||||
#if defined(LOONGSON3A) || defined(LOONGSON3B) | |||||
#if defined(LOONGSON3R3) || defined(LOONGSON3R4) | |||||
#define PREFETCHD_(x) ld $0, x | #define PREFETCHD_(x) ld $0, x | ||||
#define PREFETCHD(x) PREFETCHD_(x) | #define PREFETCHD(x) PREFETCHD_(x) | ||||
#else | #else | ||||
@@ -70,19 +70,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
/* or implied, of The University of Texas at Austin. */ | /* or implied, of The University of Texas at Austin. */ | ||||
/*********************************************************************/ | /*********************************************************************/ | ||||
#define CPU_UNKNOWN 0 | |||||
#define CPU_SICORTEX 1 | |||||
#define CPU_LOONGSON3A 2 | |||||
#define CPU_LOONGSON3B 3 | |||||
#define CPU_I6400 4 | |||||
#define CPU_P6600 5 | |||||
#define CPU_I6500 6 | |||||
#define CPU_UNKNOWN 0 | |||||
#define CPU_SICORTEX 1 | |||||
#define CPU_LOONGSON3R3 2 | |||||
#define CPU_LOONGSON3R4 3 | |||||
#define CPU_I6400 4 | |||||
#define CPU_P6600 5 | |||||
#define CPU_I6500 6 | |||||
static char *cpuname[] = { | static char *cpuname[] = { | ||||
"UNKNOWN", | "UNKNOWN", | ||||
"SICORTEX", | "SICORTEX", | ||||
"LOONGSON3A", | |||||
"LOONGSON3B", | |||||
"LOONGSON3R3", | |||||
"LOONGSON3R4", | |||||
"I6400", | "I6400", | ||||
"P6600", | "P6600", | ||||
"I6500" | "I6500" | ||||
@@ -90,48 +90,13 @@ static char *cpuname[] = { | |||||
int detect(void){ | int detect(void){ | ||||
#ifdef __linux | |||||
#ifdef linux | |||||
FILE *infile; | FILE *infile; | ||||
char buffer[512], *p; | char buffer[512], *p; | ||||
p = (char *)NULL; | p = (char *)NULL; | ||||
infile = fopen("/proc/cpuinfo", "r"); | |||||
while (fgets(buffer, sizeof(buffer), infile)){ | |||||
if (!strncmp("cpu", buffer, 3)){ | |||||
p = strchr(buffer, ':') + 2; | |||||
#if 0 | |||||
fprintf(stderr, "%s\n", p); | |||||
#endif | |||||
break; | |||||
} | |||||
} | |||||
fclose(infile); | |||||
if(p != NULL){ | |||||
if (strstr(p, "Loongson-3A")){ | |||||
return CPU_LOONGSON3A; | |||||
}else if(strstr(p, "Loongson-3B")){ | |||||
return CPU_LOONGSON3B; | |||||
}else if (strstr(p, "Loongson-3")){ | |||||
infile = fopen("/proc/cpuinfo", "r"); | |||||
p = (char *)NULL; | |||||
while (fgets(buffer, sizeof(buffer), infile)){ | |||||
if (!strncmp("system type", buffer, 11)){ | |||||
p = strchr(buffer, ':') + 2; | |||||
break; | |||||
} | |||||
} | |||||
fclose(infile); | |||||
if (strstr(p, "loongson3a")) | |||||
return CPU_LOONGSON3A; | |||||
}else{ | |||||
return CPU_SICORTEX; | |||||
} | |||||
} | |||||
//Check model name for Loongson3 | //Check model name for Loongson3 | ||||
infile = fopen("/proc/cpuinfo", "r"); | infile = fopen("/proc/cpuinfo", "r"); | ||||
p = (char *)NULL; | |||||
while (fgets(buffer, sizeof(buffer), infile)){ | while (fgets(buffer, sizeof(buffer), infile)){ | ||||
if (!strncmp("model name", buffer, 10)){ | if (!strncmp("model name", buffer, 10)){ | ||||
p = strchr(buffer, ':') + 2; | p = strchr(buffer, ':') + 2; | ||||
@@ -140,14 +105,16 @@ int detect(void){ | |||||
} | } | ||||
fclose(infile); | fclose(infile); | ||||
if(p != NULL){ | if(p != NULL){ | ||||
if (strstr(p, "Loongson-3A")){ | |||||
return CPU_LOONGSON3A; | |||||
}else if(strstr(p, "Loongson-3B")){ | |||||
return CPU_LOONGSON3B; | |||||
} | |||||
if (strstr(p, "Loongson-3A3000") || strstr(p, "Loongson-3B3000")){ | |||||
return CPU_LOONGSON3R3; | |||||
}else if(strstr(p, "Loongson-3A4000") || strstr(p, "Loongson-3B4000")){ | |||||
return CPU_LOONGSON3R4; | |||||
} else{ | |||||
return CPU_SICORTEX; | |||||
} | } | ||||
#endif | #endif | ||||
return CPU_UNKNOWN; | return CPU_UNKNOWN; | ||||
} | |||||
} | } | ||||
char *get_corename(void){ | char *get_corename(void){ | ||||
@@ -159,10 +126,10 @@ void get_architecture(void){ | |||||
} | } | ||||
void get_subarchitecture(void){ | void get_subarchitecture(void){ | ||||
if(detect()==CPU_LOONGSON3A) { | |||||
printf("LOONGSON3A"); | |||||
}else if(detect()==CPU_LOONGSON3B){ | |||||
printf("LOONGSON3B"); | |||||
if(detect()==CPU_LOONGSON3R3) { | |||||
printf("LOONGSON3R3"); | |||||
}else if(detect()==CPU_LOONGSON3R4){ | |||||
printf("LOONGSON3R4"); | |||||
}else if(detect()==CPU_I6400){ | }else if(detect()==CPU_I6400){ | ||||
printf("I6400"); | printf("I6400"); | ||||
}else if(detect()==CPU_P6600){ | }else if(detect()==CPU_P6600){ | ||||
@@ -179,8 +146,8 @@ void get_subdirname(void){ | |||||
} | } | ||||
void get_cpuconfig(void){ | void get_cpuconfig(void){ | ||||
if(detect()==CPU_LOONGSON3A) { | |||||
printf("#define LOONGSON3A\n"); | |||||
if(detect()==CPU_LOONGSON3R3) { | |||||
printf("#define LOONGSON3R3\n"); | |||||
printf("#define L1_DATA_SIZE 65536\n"); | printf("#define L1_DATA_SIZE 65536\n"); | ||||
printf("#define L1_DATA_LINESIZE 32\n"); | printf("#define L1_DATA_LINESIZE 32\n"); | ||||
printf("#define L2_SIZE 512488\n"); | printf("#define L2_SIZE 512488\n"); | ||||
@@ -188,8 +155,8 @@ void get_cpuconfig(void){ | |||||
printf("#define DTB_DEFAULT_ENTRIES 64\n"); | printf("#define DTB_DEFAULT_ENTRIES 64\n"); | ||||
printf("#define DTB_SIZE 4096\n"); | printf("#define DTB_SIZE 4096\n"); | ||||
printf("#define L2_ASSOCIATIVE 4\n"); | printf("#define L2_ASSOCIATIVE 4\n"); | ||||
}else if(detect()==CPU_LOONGSON3B){ | |||||
printf("#define LOONGSON3B\n"); | |||||
}else if(detect()==CPU_LOONGSON3R4){ | |||||
printf("#define LOONGSON3R4\n"); | |||||
printf("#define L1_DATA_SIZE 65536\n"); | printf("#define L1_DATA_SIZE 65536\n"); | ||||
printf("#define L1_DATA_LINESIZE 32\n"); | printf("#define L1_DATA_LINESIZE 32\n"); | ||||
printf("#define L2_SIZE 512488\n"); | printf("#define L2_SIZE 512488\n"); | ||||
@@ -237,10 +204,10 @@ void get_cpuconfig(void){ | |||||
} | } | ||||
void get_libname(void){ | void get_libname(void){ | ||||
if(detect()==CPU_LOONGSON3A) { | |||||
printf("loongson3a\n"); | |||||
}else if(detect()==CPU_LOONGSON3B) { | |||||
printf("loongson3b\n"); | |||||
if(detect()==CPU_LOONGSON3R3) { | |||||
printf("loongson3r3\n"); | |||||
}else if(detect()==CPU_LOONGSON3R4) { | |||||
printf("loongson3r4\n"); | |||||
}else if(detect()==CPU_I6400) { | }else if(detect()==CPU_I6400) { | ||||
printf("i6400\n"); | printf("i6400\n"); | ||||
}else if(detect()==CPU_P6600) { | }else if(detect()==CPU_P6600) { | ||||
@@ -24,10 +24,14 @@ else | |||||
ifeq ($(ARCH),zarch) | ifeq ($(ARCH),zarch) | ||||
COMMONOBJS += dynamic_zarch.$(SUFFIX) | COMMONOBJS += dynamic_zarch.$(SUFFIX) | ||||
else | else | ||||
ifeq ($(ARCH),mips64) | |||||
COMMONOBJS += dynamic_mips64.$(SUFFIX) | |||||
else | |||||
COMMONOBJS += dynamic.$(SUFFIX) | COMMONOBJS += dynamic.$(SUFFIX) | ||||
endif | endif | ||||
endif | endif | ||||
endif | endif | ||||
endif | |||||
else | else | ||||
COMMONOBJS += parameter.$(SUFFIX) | COMMONOBJS += parameter.$(SUFFIX) | ||||
endif | endif | ||||
@@ -92,10 +96,14 @@ else | |||||
ifeq ($(ARCH),zarch) | ifeq ($(ARCH),zarch) | ||||
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_zarch.$(SUFFIX) | HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_zarch.$(SUFFIX) | ||||
else | else | ||||
ifeq ($(ARCH),mips64) | |||||
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_mips64.$(SUFFIX) | |||||
else | |||||
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX) | HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX) | ||||
endif | endif | ||||
endif | endif | ||||
endif | endif | ||||
endif | |||||
else | else | ||||
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX) | HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX) | ||||
endif | endif | ||||
@@ -967,9 +967,11 @@ void goto_set_num_threads(int num_threads) { | |||||
blas_cpu_number = num_threads; | blas_cpu_number = num_threads; | ||||
#if defined(ARCH_MIPS64) | #if defined(ARCH_MIPS64) | ||||
#ifndef DYNAMIC_ARCH | |||||
//set parameters for different number of threads. | //set parameters for different number of threads. | ||||
blas_set_parameter(); | blas_set_parameter(); | ||||
#endif | #endif | ||||
#endif | |||||
} | } | ||||
@@ -0,0 +1,230 @@ | |||||
/***************************************************************************** | |||||
Copyright (c) 2020, The OpenBLAS Project | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the OpenBLAS project nor the names of | |||||
its contributors may be used to endorse or promote products | |||||
derived from this software without specific prior written | |||||
permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
**********************************************************************************/ | |||||
#include <sys/wait.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
#include <sys/resource.h>
#include "common.h"
/* Kernel tables of the two cores this dispatcher can choose between. */
extern gotoblas_t gotoblas_LOONGSON3R3;
extern gotoblas_t gotoblas_LOONGSON3R4;

extern void openblas_warning(int verbose, const char * msg);

/* Number of real core names in corename[]; the entry at this index is
 * the "UNKNOWN" placeholder. */
#define NUM_CORETYPES 2

static char *corename[] = { "loongson3r3", "loongson3r4", "UNKNOWN" };
char *gotoblas_corename(void) { | |||||
if (gotoblas == &gotoblas_LOONGSON3R3) return corename[0]; | |||||
if (gotoblas == &gotoblas_LOONGSON3R4) return corename[1]; | |||||
return corename[NUM_CORETYPES]; | |||||
} | |||||
static gotoblas_t *force_coretype(char *coretype) { | |||||
int i; | |||||
int found = -1; | |||||
char message[128]; | |||||
for ( i=0 ; i < NUM_CORETYPES; i++) | |||||
{ | |||||
if (!strncasecmp(coretype, corename[i], 20)) | |||||
{ | |||||
found = i; | |||||
break; | |||||
} | |||||
} | |||||
switch (found) | |||||
{ | |||||
case 0: return (&gotoblas_LOONGSON3R3); | |||||
case 1: return (&gotoblas_LOONGSON3R4); | |||||
} | |||||
snprintf(message, 128, "Core not found: %s\n", coretype); | |||||
openblas_warning(1, message); | |||||
return NULL; | |||||
} | |||||
/* Bits tested in the word returned by cpucfg (see get_coretype_from_cpucfg). */
#define MMI_MASK    0x00000010
#define MSA_MASK    0x00000020

/* Scratch state shared by cpucfg_test() and its SIGILL handler.
 * Kept file-local so these generic names are not exported from the library. */
static int fd[2];           /* pipe between the probe subprocess and its parent */
static int support_cpucfg;  /* 1 when the cpucfg probe succeeded, else 0 */
static void handler(int signum) | |||||
{ | |||||
close(fd[1]); | |||||
exit(1); | |||||
} | |||||
/* Brief : Function to check if cpucfg supported on loongson | |||||
* Return: 1 supported | |||||
* 0 not supported | |||||
*/ | |||||
static int cpucfg_test(void) { | |||||
pid_t pid; | |||||
int status = 0; | |||||
support_cpucfg = 0; | |||||
pipe(fd); | |||||
pid = fork(); | |||||
if (pid == 0) { /* Subprocess */ | |||||
struct sigaction act; | |||||
close(fd[0]); | |||||
/* Set signal action for SIGILL. */ | |||||
act.sa_handler = handler; | |||||
sigaction(SIGILL,&act,NULL); | |||||
/* Execute cpucfg in subprocess. */ | |||||
__asm__ volatile( | |||||
".insn \n\t" | |||||
".word (0xc8080118) \n\t" | |||||
::: | |||||
); | |||||
support_cpucfg = 1; | |||||
write(fd[1],&support_cpucfg,sizeof(support_cpucfg)); | |||||
close(fd[1]); | |||||
exit(0); | |||||
} else if (pid > 0){ /* Parent process*/ | |||||
close(fd[1]); | |||||
if ((waitpid(pid,&status,0) <= 0) || | |||||
(read(fd[0],&support_cpucfg,sizeof(support_cpucfg)) <= 0)) | |||||
support_cpucfg = 0; | |||||
close(fd[0]); | |||||
} else { | |||||
support_cpucfg = 0; | |||||
} | |||||
return support_cpucfg; | |||||
} | |||||
static gotoblas_t *get_coretype_from_cpucfg(void) { | |||||
int flag = 0; | |||||
__asm__ volatile( | |||||
".insn \n\t" | |||||
"dli $8, 0x01 \n\t" | |||||
".word (0xc9084918) \n\t" | |||||
"usw $9, 0x00(%0) \n\t" | |||||
: | |||||
: "r"(&flag) | |||||
: "memory" | |||||
); | |||||
if (flag & MSA_MASK) | |||||
return (&gotoblas_LOONGSON3R4); | |||||
if (flag & MMI_MASK) | |||||
return (&gotoblas_LOONGSON3R3); | |||||
return NULL; | |||||
} | |||||
static gotoblas_t *get_coretype_from_cpuinfo(void) { | |||||
#ifdef linux | |||||
FILE *infile; | |||||
char buffer[512], *p; | |||||
p = (char *)NULL; | |||||
//Check model name for Loongson3 | |||||
infile = fopen("/proc/cpuinfo", "r"); | |||||
while (fgets(buffer, sizeof(buffer), infile)){ | |||||
if (!strncmp("model name", buffer, 10)){ | |||||
p = strchr(buffer, ':') + 2; | |||||
break; | |||||
} | |||||
} | |||||
fclose(infile); | |||||
if(p != NULL){ | |||||
if (strstr(p, "Loongson-3A3000") || strstr(p, "Loongson-3B3000")) | |||||
return (&gotoblas_LOONGSON3R3); | |||||
else if(strstr(p, "Loongson-3A4000") || strstr(p, "Loongson-3B4000")) | |||||
return (&gotoblas_LOONGSON3R4); | |||||
else | |||||
return NULL; | |||||
} | |||||
#endif | |||||
return NULL; | |||||
} | |||||
/* Detect the running core: query cpucfg when the probe reports that the
 * instruction is available, otherwise fall back to /proc/cpuinfo. */
static gotoblas_t *get_coretype(void) {
  const int has_cpucfg = (cpucfg_test() == 1);

  return has_cpucfg ? get_coretype_from_cpucfg()
                    : get_coretype_from_cpuinfo();
}
void gotoblas_dynamic_init(void) { | |||||
char coremsg[128]; | |||||
char coren[22]; | |||||
char *p; | |||||
if (gotoblas) return; | |||||
p = getenv("OPENBLAS_CORETYPE"); | |||||
if ( p ) | |||||
{ | |||||
gotoblas = force_coretype(p); | |||||
} | |||||
else | |||||
{ | |||||
gotoblas = get_coretype(); | |||||
} | |||||
if (gotoblas == NULL) | |||||
{ | |||||
snprintf(coremsg, 128, "Falling back to loongson3r3 core\n"); | |||||
openblas_warning(1, coremsg); | |||||
gotoblas = &gotoblas_LOONGSON3R3; | |||||
} | |||||
if (gotoblas && gotoblas->init) { | |||||
strncpy(coren, gotoblas_corename(), 20); | |||||
sprintf(coremsg, "Core: %s\n", coren); | |||||
openblas_warning(2, coremsg); | |||||
gotoblas -> init(); | |||||
} else { | |||||
openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n"); | |||||
exit(1); | |||||
} | |||||
} | |||||
void gotoblas_dynamic_quit(void) { | |||||
gotoblas = NULL; | |||||
} |
@@ -717,7 +717,7 @@ void blas_set_parameter(void){ | |||||
#if defined(ARCH_MIPS64) | #if defined(ARCH_MIPS64) | ||||
void blas_set_parameter(void){ | void blas_set_parameter(void){ | ||||
#if defined(LOONGSON3A) | |||||
#if defined(LOONGSON3R3) || defined(LOONGSON3R4) | |||||
#ifdef SMP | #ifdef SMP | ||||
if(blas_num_threads == 1){ | if(blas_num_threads == 1){ | ||||
#endif | #endif | ||||
@@ -731,20 +731,6 @@ void blas_set_parameter(void){ | |||||
#endif | #endif | ||||
#endif | #endif | ||||
#if defined(LOONGSON3B) | |||||
#ifdef SMP | |||||
if(blas_num_threads == 1 || blas_num_threads == 2){ | |||||
#endif | |||||
//single thread | |||||
dgemm_r = 640; | |||||
#ifdef SMP | |||||
}else{ | |||||
//multi thread | |||||
dgemm_r = 160; | |||||
} | |||||
#endif | |||||
#endif | |||||
} | } | ||||
#endif | #endif | ||||
@@ -140,8 +140,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
/* #define FORCE_PPC440FP2 */ | /* #define FORCE_PPC440FP2 */ | ||||
/* #define FORCE_CELL */ | /* #define FORCE_CELL */ | ||||
/* #define FORCE_SICORTEX */ | /* #define FORCE_SICORTEX */ | ||||
/* #define FORCE_LOONGSON3A */ | |||||
/* #define FORCE_LOONGSON3B */ | |||||
/* #define FORCE_LOONGSON3R3 */ | |||||
/* #define FORCE_LOONGSON3R4 */ | |||||
/* #define FORCE_I6400 */ | /* #define FORCE_I6400 */ | ||||
/* #define FORCE_P6600 */ | /* #define FORCE_P6600 */ | ||||
/* #define FORCE_P5600 */ | /* #define FORCE_P5600 */ | ||||
@@ -814,31 +814,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#endif | #endif | ||||
#ifdef FORCE_LOONGSON3A | |||||
#ifdef FORCE_LOONGSON3R3 | |||||
#define FORCE | #define FORCE | ||||
#define ARCHITECTURE "MIPS" | #define ARCHITECTURE "MIPS" | ||||
#define SUBARCHITECTURE "LOONGSON3A" | |||||
#define SUBARCHITECTURE "LOONGSON3R3" | |||||
#define SUBDIRNAME "mips64" | #define SUBDIRNAME "mips64" | ||||
#define ARCHCONFIG "-DLOONGSON3A " \ | |||||
#define ARCHCONFIG "-DLOONGSON3R3 " \ | |||||
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ | "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ | ||||
"-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ | "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ | ||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " | "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " | ||||
#define LIBNAME "loongson3a" | |||||
#define CORENAME "LOONGSON3A" | |||||
#define LIBNAME "loongson3r3" | |||||
#define CORENAME "LOONGSON3R3" | |||||
#else | #else | ||||
#endif | #endif | ||||
#ifdef FORCE_LOONGSON3B | |||||
#ifdef FORCE_LOONGSON3R4 | |||||
#define FORCE | #define FORCE | ||||
#define ARCHITECTURE "MIPS" | #define ARCHITECTURE "MIPS" | ||||
#define SUBARCHITECTURE "LOONGSON3B" | |||||
#define SUBARCHITECTURE "LOONGSON3R4" | |||||
#define SUBDIRNAME "mips64" | #define SUBDIRNAME "mips64" | ||||
#define ARCHCONFIG "-DLOONGSON3B " \ | |||||
#define ARCHCONFIG "-DLOONGSON3R4 " \ | |||||
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ | "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ | ||||
"-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ | "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ | ||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " | "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " | ||||
#define LIBNAME "loongson3b" | |||||
#define CORENAME "LOONGSON3B" | |||||
#define LIBNAME "loongson3r4" | |||||
#define CORENAME "LOONGSON3R4" | |||||
#else | #else | ||||
#endif | #endif | ||||
@@ -58,6 +58,8 @@ else ifeq ($(TARGET_CORE), SKYLAKEX) | |||||
endif | endif | ||||
else ifeq ($(TARGET_CORE), HASWELL) | else ifeq ($(TARGET_CORE), HASWELL) | ||||
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(AVX2OPT) | override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(AVX2OPT) | ||||
else ifeq ($(TARGET_CORE), LOONGSON3R4) | |||||
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(MSA_FLAGS) | |||||
else | else | ||||
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) | override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) | ||||
endif | endif | ||||
@@ -68,6 +70,9 @@ else | |||||
TARGET_CORE = $(CORE) | TARGET_CORE = $(CORE) | ||||
KDIR = | KDIR = | ||||
TSUFFIX = | TSUFFIX = | ||||
ifeq ($(TARGET_CORE), LOONGSON3R4) | |||||
override CFLAGS += $(MSA_FLAGS) | |||||
endif | |||||
endif | endif | ||||
-include $(KERNELDIR)/KERNEL.$(TARGET_CORE) | -include $(KERNELDIR)/KERNEL.$(TARGET_CORE) | ||||
@@ -29,10 +29,6 @@ ifeq ($(ARCH), riscv64) | |||||
USE_TRMM = 1 | USE_TRMM = 1 | ||||
endif | endif | ||||
ifeq ($(TARGET), LOONGSON3B) | |||||
USE_TRMM = 1 | |||||
endif | |||||
ifneq ($(DYNAMIC_ARCH), 1) | ifneq ($(DYNAMIC_ARCH), 1) | ||||
ifeq ($(TARGET), GENERIC) | ifeq ($(TARGET), GENERIC) | ||||
USE_TRMM = 1 | USE_TRMM = 1 | ||||
@@ -121,7 +121,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#define CGEMM_KERNEL_8X1_MSA(OP0, OP1, OP2, OP3, OP4) \ | #define CGEMM_KERNEL_8X1_MSA(OP0, OP1, OP2, OP3, OP4) \ | ||||
{ \ | { \ | ||||
LD_SP4_INC(pa0, 4, src_a0, src_a1, src_a2, src_a3); \ | LD_SP4_INC(pa0, 4, src_a0, src_a1, src_a2, src_a3); \ | ||||
src_bi = (v4f32) __msa_cast_to_vector_double(*((double *) pb0)); \ | |||||
src_bi = (v4f32) COPY_DOUBLE_TO_VECTOR(*((double *) pb0)); \ | |||||
SPLATI_W2_SP(src_bi, 0, src_br, src_bi); \ | SPLATI_W2_SP(src_bi, 0, src_br, src_bi); \ | ||||
\ | \ | ||||
PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \ | PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \ | ||||
@@ -200,7 +200,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#define CGEMM_KERNEL_4X1_MSA(OP0, OP1, OP2, OP3, OP4) \ | #define CGEMM_KERNEL_4X1_MSA(OP0, OP1, OP2, OP3, OP4) \ | ||||
{ \ | { \ | ||||
LD_SP2_INC(pa0, 4, src_a0, src_a1); \ | LD_SP2_INC(pa0, 4, src_a0, src_a1); \ | ||||
src_bi = (v4f32) __msa_cast_to_vector_double(*((double *) pb0)); \ | |||||
src_bi = (v4f32) COPY_DOUBLE_TO_VECTOR(*((double *) pb0)); \ | |||||
SPLATI_W2_SP(src_bi, 0, src_br, src_bi); \ | SPLATI_W2_SP(src_bi, 0, src_br, src_bi); \ | ||||
\ | \ | ||||
PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \ | PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \ | ||||
@@ -49,11 +49,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, | |||||
{ | { | ||||
if ((0 == c) && (0 == s)) | if ((0 == c) && (0 == s)) | ||||
{ | { | ||||
v4f32 zero = __msa_cast_to_vector_float(0); | |||||
zero = (v4f32) __msa_insert_w((v4i32) zero, 0, 0.0); | |||||
zero = (v4f32) __msa_insert_w((v4i32) zero, 1, 0.0); | |||||
zero = (v4f32) __msa_insert_w((v4i32) zero, 2, 0.0); | |||||
zero = (v4f32) __msa_insert_w((v4i32) zero, 3, 0.0); | |||||
v4f32 zero = {0.0, 0.0, 0.0, 0.0}; | |||||
/* process 2 elements */ | /* process 2 elements */ | ||||
for (j = (n >> 1); j--;) | for (j = (n >> 1); j--;) | ||||
@@ -49,11 +49,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||||
{ | { | ||||
if ((0.0 == da_r) && (0.0 == da_i)) | if ((0.0 == da_r) && (0.0 == da_i)) | ||||
{ | { | ||||
v4f32 zero_v = __msa_cast_to_vector_float(0); | |||||
zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 0, 0.0); | |||||
zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 1, 0.0); | |||||
zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 2, 0.0); | |||||
zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 3, 0.0); | |||||
v4f32 zero_v = {0.0, 0.0, 0.0, 0.0}; | |||||
for (i = (n >> 5); i--;) | for (i = (n >> 5); i--;) | ||||
{ | { | ||||
@@ -44,9 +44,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, | |||||
{ | { | ||||
if (0.0 == da) | if (0.0 == da) | ||||
{ | { | ||||
v2f64 zero_v = __msa_cast_to_vector_double(0); | |||||
zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0); | |||||
zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0); | |||||
v2f64 zero_v = {0.0, 0.0}; | |||||
for (i = (n >> 5); i--;) | for (i = (n >> 5); i--;) | ||||
{ | { | ||||
@@ -186,8 +186,7 @@ void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) | |||||
ILVRL_D2_DP(src_c14, src_c10, res_c12, res_c13); | ILVRL_D2_DP(src_c14, src_c10, res_c12, res_c13); | ||||
ILVRL_D2_DP(src_c15, src_c11, res_c14, res_c15); | ILVRL_D2_DP(src_c15, src_c11, res_c14, res_c15); | ||||
src_a54 = __msa_cast_to_vector_double(*(a + 54)); | |||||
src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); | |||||
src_a54 = COPY_DOUBLE_TO_VECTOR(*(a + 54)); | |||||
src_a62 = LD_DP(a + 62); | src_a62 = LD_DP(a + 62); | ||||
src_a63 = (v2f64) __msa_splati_d((v2i64) src_a62, 1); | src_a63 = (v2f64) __msa_splati_d((v2i64) src_a62, 1); | ||||
src_a62 = (v2f64) __msa_splati_d((v2i64) src_a62, 0); | src_a62 = (v2f64) __msa_splati_d((v2i64) src_a62, 0); | ||||
@@ -200,8 +199,7 @@ void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) | |||||
src_a44 = LD_DP(a + 44); | src_a44 = LD_DP(a + 44); | ||||
src_a45 = (v2f64) __msa_splati_d((v2i64) src_a44, 1); | src_a45 = (v2f64) __msa_splati_d((v2i64) src_a44, 1); | ||||
src_a44 = (v2f64) __msa_splati_d((v2i64) src_a44, 0); | src_a44 = (v2f64) __msa_splati_d((v2i64) src_a44, 0); | ||||
src_a36 = __msa_cast_to_vector_double(*(a + 36)); | |||||
src_a36 = (v2f64) __msa_splati_d((v2i64) src_a36, 0); | |||||
src_a36 = COPY_DOUBLE_TO_VECTOR(*(a + 36)); | |||||
res_c7 *= src_a63; | res_c7 *= src_a63; | ||||
res_c6 -= res_c7 * src_a62; | res_c6 -= res_c7 * src_a62; | ||||
@@ -271,8 +269,7 @@ void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) | |||||
src_a26 = LD_DP(a + 26); | src_a26 = LD_DP(a + 26); | ||||
src_a27 = (v2f64) __msa_splati_d((v2i64) src_a26, 1); | src_a27 = (v2f64) __msa_splati_d((v2i64) src_a26, 1); | ||||
src_a26 = (v2f64) __msa_splati_d((v2i64) src_a26, 0); | src_a26 = (v2f64) __msa_splati_d((v2i64) src_a26, 0); | ||||
src_a18 = __msa_cast_to_vector_double(*(a + 18)); | |||||
src_a18 = (v2f64) __msa_splati_d((v2i64) src_a18, 0); | |||||
src_a18 = COPY_DOUBLE_TO_VECTOR(*(a + 18)); | |||||
res_c3 -= res_c7 * src_a59; | res_c3 -= res_c7 * src_a59; | ||||
res_c2 -= res_c7 * src_a58; | res_c2 -= res_c7 * src_a58; | ||||
@@ -358,8 +355,7 @@ void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) | |||||
src_a8 = LD_DP(a + 8); | src_a8 = LD_DP(a + 8); | ||||
src_a9 = (v2f64) __msa_splati_d((v2i64) src_a8, 1); | src_a9 = (v2f64) __msa_splati_d((v2i64) src_a8, 1); | ||||
src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); | src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); | ||||
src_a0 = __msa_cast_to_vector_double(*(a + 0)); | |||||
src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); | |||||
src_a0 = COPY_DOUBLE_TO_VECTOR(*(a + 0)); | |||||
res_c1 -= res_c2 * src_a17; | res_c1 -= res_c2 * src_a17; | ||||
res_c1 *= src_a9; | res_c1 *= src_a9; | ||||
@@ -488,8 +484,7 @@ static void dsolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||||
src_a52 = LD_DP(a - 12); | src_a52 = LD_DP(a - 12); | ||||
src_a53 = (v2f64) __msa_splati_d((v2i64) src_a52, 1); | src_a53 = (v2f64) __msa_splati_d((v2i64) src_a52, 1); | ||||
src_a52 = (v2f64) __msa_splati_d((v2i64) src_a52, 0); | src_a52 = (v2f64) __msa_splati_d((v2i64) src_a52, 0); | ||||
src_a54 = __msa_cast_to_vector_double(*(a - 10)); | |||||
src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); | |||||
src_a54 = COPY_DOUBLE_TO_VECTOR(*(a -10)); | |||||
src_a40 = LD_DP(a - 24); | src_a40 = LD_DP(a - 24); | ||||
src_a41 = (v2f64) __msa_splati_d((v2i64) src_a40, 1); | src_a41 = (v2f64) __msa_splati_d((v2i64) src_a40, 1); | ||||
@@ -526,8 +521,7 @@ static void dsolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||||
src_a34 = LD_DP(a - 30); | src_a34 = LD_DP(a - 30); | ||||
src_a35 = (v2f64) __msa_splati_d((v2i64) src_a34, 1); | src_a35 = (v2f64) __msa_splati_d((v2i64) src_a34, 1); | ||||
src_a34 = (v2f64) __msa_splati_d((v2i64) src_a34, 0); | src_a34 = (v2f64) __msa_splati_d((v2i64) src_a34, 0); | ||||
src_a36 = __msa_cast_to_vector_double(*(a - 28)); | |||||
src_a36 = (v2f64) __msa_splati_d((v2i64) src_a36, 0); | |||||
src_a36 = COPY_DOUBLE_TO_VECTOR(*(a -28)); | |||||
res_c4 *= src_a36; | res_c4 *= src_a36; | ||||
res_c3 -= res_c4 * src_a35; | res_c3 -= res_c4 * src_a35; | ||||
@@ -544,10 +538,8 @@ static void dsolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||||
src_a16 = LD_DP(a - 48); | src_a16 = LD_DP(a - 48); | ||||
src_a17 = (v2f64) __msa_splati_d((v2i64) src_a16, 1); | src_a17 = (v2f64) __msa_splati_d((v2i64) src_a16, 1); | ||||
src_a16 = (v2f64) __msa_splati_d((v2i64) src_a16, 0); | src_a16 = (v2f64) __msa_splati_d((v2i64) src_a16, 0); | ||||
src_a18 = __msa_cast_to_vector_double(*(a - 46)); | |||||
src_a18 = (v2f64) __msa_splati_d((v2i64) src_a18, 0); | |||||
src_a0 = __msa_cast_to_vector_double(*(a - 64)); | |||||
src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); | |||||
src_a18 = COPY_DOUBLE_TO_VECTOR(*(a - 46)); | |||||
src_a0 = COPY_DOUBLE_TO_VECTOR(*(a - 64)); | |||||
src_a8 = LD_DP(a - 56); | src_a8 = LD_DP(a - 56); | ||||
src_a9 = (v2f64) __msa_splati_d((v2i64) src_a8, 1); | src_a9 = (v2f64) __msa_splati_d((v2i64) src_a8, 1); | ||||
src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); | src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); | ||||
@@ -785,11 +777,8 @@ static void dsolve_4x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||||
src_a10 = (v2f64) __msa_splati_d((v2i64) src_a9, 1); | src_a10 = (v2f64) __msa_splati_d((v2i64) src_a9, 1); | ||||
src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); | src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); | ||||
src_a8 = __msa_cast_to_vector_double(*(a + 8)); | |||||
src_a0 = __msa_cast_to_vector_double(*(a + 0)); | |||||
src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); | |||||
src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); | |||||
src_a8 = COPY_DOUBLE_TO_VECTOR(*(a + 8)); | |||||
src_a0 = COPY_DOUBLE_TO_VECTOR(*(a + 0)); | |||||
src_a4 = LD_DP(a + 4); | src_a4 = LD_DP(a + 4); | ||||
src_a5 = (v2f64) __msa_splati_d((v2i64) src_a4, 1); | src_a5 = (v2f64) __msa_splati_d((v2i64) src_a4, 1); | ||||
@@ -890,11 +879,8 @@ static void dsolve_4x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||||
src_a10 = (v2f64) __msa_splati_d((v2i64) src_a9, 1); | src_a10 = (v2f64) __msa_splati_d((v2i64) src_a9, 1); | ||||
src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); | src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); | ||||
src_a8 = __msa_cast_to_vector_double(*(a + 8)); | |||||
src_a0 = __msa_cast_to_vector_double(*(a + 0)); | |||||
src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); | |||||
src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); | |||||
src_a8 = COPY_DOUBLE_TO_VECTOR(*(a + 8)); | |||||
src_a0 = COPY_DOUBLE_TO_VECTOR(*(a + 0)); | |||||
src_a4 = LD_DP(a + 4); | src_a4 = LD_DP(a + 4); | ||||
src_a5 = (v2f64) __msa_splati_d((v2i64) src_a4, 1); | src_a5 = (v2f64) __msa_splati_d((v2i64) src_a4, 1); | ||||
@@ -215,8 +215,7 @@ void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) | |||||
res_c14 -= res_c8 * src_a6; | res_c14 -= res_c8 * src_a6; | ||||
res_c15 -= res_c8 * src_a7; | res_c15 -= res_c8 * src_a7; | ||||
src_a9 = __msa_cast_to_vector_double(*(a + 9)); | |||||
src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); | |||||
src_a9 = COPY_DOUBLE_TO_VECTOR(*(a + 9)); | |||||
src_a10 = LD_DP(a + 10); | src_a10 = LD_DP(a + 10); | ||||
src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); | src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); | ||||
src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); | src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); | ||||
@@ -280,8 +279,7 @@ void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) | |||||
res_c14 -= res_c10 * src_a22; | res_c14 -= res_c10 * src_a22; | ||||
res_c15 -= res_c10 * src_a23; | res_c15 -= res_c10 * src_a23; | ||||
src_a27 = __msa_cast_to_vector_double(*(a + 27)); | |||||
src_a27 = (v2f64) __msa_splati_d((v2i64) src_a27, 0); | |||||
src_a27 = COPY_DOUBLE_TO_VECTOR(*(a + 27)); | |||||
src_a28 = LD_DP(a + 28); | src_a28 = LD_DP(a + 28); | ||||
src_a29 = (v2f64) __msa_splati_d((v2i64) src_a28, 1); | src_a29 = (v2f64) __msa_splati_d((v2i64) src_a28, 1); | ||||
src_a28 = (v2f64) __msa_splati_d((v2i64) src_a28, 0); | src_a28 = (v2f64) __msa_splati_d((v2i64) src_a28, 0); | ||||
@@ -326,8 +324,7 @@ void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) | |||||
res_c14 -= res_c12 * src_a38; | res_c14 -= res_c12 * src_a38; | ||||
res_c15 -= res_c12 * src_a39; | res_c15 -= res_c12 * src_a39; | ||||
src_a45 = __msa_cast_to_vector_double(*(a + 45)); | |||||
src_a45 = (v2f64) __msa_splati_d((v2i64) src_a45, 0); | |||||
src_a45 = COPY_DOUBLE_TO_VECTOR(*(a + 45)); | |||||
src_a46 = LD_DP(a + 46); | src_a46 = LD_DP(a + 46); | ||||
src_a47 = (v2f64) __msa_splati_d((v2i64) src_a46, 1); | src_a47 = (v2f64) __msa_splati_d((v2i64) src_a46, 1); | ||||
src_a46 = (v2f64) __msa_splati_d((v2i64) src_a46, 0); | src_a46 = (v2f64) __msa_splati_d((v2i64) src_a46, 0); | ||||
@@ -353,8 +350,7 @@ void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) | |||||
ILVRL_D2_DP(res_c5, res_c4, src_c2, src_c6); | ILVRL_D2_DP(res_c5, res_c4, src_c2, src_c6); | ||||
ILVRL_D2_DP(res_c13, res_c12, src_c10, src_c14); | ILVRL_D2_DP(res_c13, res_c12, src_c10, src_c14); | ||||
src_a63 = __msa_cast_to_vector_double(*(a + 63)); | |||||
src_a63 = (v2f64) __msa_splati_d((v2i64) src_a63, 0); | |||||
src_a63 = COPY_DOUBLE_TO_VECTOR(*(a + 63)); | |||||
src_a54 = LD_DP(a + 54); | src_a54 = LD_DP(a + 54); | ||||
src_a55 = (v2f64) __msa_splati_d((v2i64) src_a54, 1); | src_a55 = (v2f64) __msa_splati_d((v2i64) src_a54, 1); | ||||
src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); | src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); | ||||
@@ -478,8 +474,7 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||||
res_c6 -= res_c0 * src_a6; | res_c6 -= res_c0 * src_a6; | ||||
res_c7 -= res_c0 * src_a7; | res_c7 -= res_c0 * src_a7; | ||||
src_a9 = __msa_cast_to_vector_double(*(a + 9)); | |||||
src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); | |||||
src_a9 = COPY_DOUBLE_TO_VECTOR(*(a + 9)); | |||||
src_a10 = LD_DP(a + 10); | src_a10 = LD_DP(a + 10); | ||||
src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); | src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); | ||||
src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); | src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); | ||||
@@ -515,8 +510,7 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||||
res_c6 -= res_c2 * src_a22; | res_c6 -= res_c2 * src_a22; | ||||
res_c7 -= res_c2 * src_a23; | res_c7 -= res_c2 * src_a23; | ||||
src_a27 = __msa_cast_to_vector_double(*(a + 27)); | |||||
src_a27 = (v2f64) __msa_splati_d((v2i64) src_a27, 0); | |||||
src_a27 = COPY_DOUBLE_TO_VECTOR(*(a + 27)); | |||||
src_a28 = LD_DP(a + 28); | src_a28 = LD_DP(a + 28); | ||||
src_a29 = (v2f64) __msa_splati_d((v2i64) src_a28, 1); | src_a29 = (v2f64) __msa_splati_d((v2i64) src_a28, 1); | ||||
src_a28 = (v2f64) __msa_splati_d((v2i64) src_a28, 0); | src_a28 = (v2f64) __msa_splati_d((v2i64) src_a28, 0); | ||||
@@ -553,8 +547,7 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||||
res_c6 -= res_c4 * src_a38; | res_c6 -= res_c4 * src_a38; | ||||
res_c7 -= res_c4 * src_a39; | res_c7 -= res_c4 * src_a39; | ||||
src_a45 = __msa_cast_to_vector_double(*(a + 45)); | |||||
src_a45 = (v2f64) __msa_splati_d((v2i64) src_a45, 0); | |||||
src_a45 = COPY_DOUBLE_TO_VECTOR(*(a + 45)); | |||||
src_a46 = LD_DP(a + 46); | src_a46 = LD_DP(a + 46); | ||||
src_a47 = (v2f64) __msa_splati_d((v2i64) src_a46, 1); | src_a47 = (v2f64) __msa_splati_d((v2i64) src_a46, 1); | ||||
src_a46 = (v2f64) __msa_splati_d((v2i64) src_a46, 0); | src_a46 = (v2f64) __msa_splati_d((v2i64) src_a46, 0); | ||||
@@ -563,8 +556,7 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||||
res_c6 -= res_c5 * src_a46; | res_c6 -= res_c5 * src_a46; | ||||
res_c7 -= res_c5 * src_a47; | res_c7 -= res_c5 * src_a47; | ||||
src_a63 = __msa_cast_to_vector_double(*(a + 63)); | |||||
src_a63 = (v2f64) __msa_splati_d((v2i64) src_a63, 0); | |||||
src_a63 = COPY_DOUBLE_TO_VECTOR(*(a + 63)); | |||||
src_a54 = LD_DP(a + 54); | src_a54 = LD_DP(a + 54); | ||||
src_a55 = (v2f64) __msa_splati_d((v2i64) src_a54, 1); | src_a55 = (v2f64) __msa_splati_d((v2i64) src_a54, 1); | ||||
src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); | src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); | ||||
@@ -786,8 +778,7 @@ static void dsolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||||
res_c6 -= res_c4 * src_a2; | res_c6 -= res_c4 * src_a2; | ||||
res_c7 -= res_c4 * src_a3; | res_c7 -= res_c4 * src_a3; | ||||
src_a5 = __msa_cast_to_vector_double(*(a + 5)); | |||||
src_a5 = (v2f64) __msa_splati_d((v2i64) src_a5, 0); | |||||
src_a5 = COPY_DOUBLE_TO_VECTOR(*(a + 5)); | |||||
src_a6 = LD_DP(a + 6); | src_a6 = LD_DP(a + 6); | ||||
src_a7 = (v2f64) __msa_splati_d((v2i64) src_a6, 1); | src_a7 = (v2f64) __msa_splati_d((v2i64) src_a6, 1); | ||||
src_a6 = (v2f64) __msa_splati_d((v2i64) src_a6, 0); | src_a6 = (v2f64) __msa_splati_d((v2i64) src_a6, 0); | ||||
@@ -803,8 +794,7 @@ static void dsolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||||
src_a10 = LD_DP(a + 10); | src_a10 = LD_DP(a + 10); | ||||
src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); | src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); | ||||
src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); | src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); | ||||
src_a15 = __msa_cast_to_vector_double(*(a + 15)); | |||||
src_a15 = (v2f64) __msa_splati_d((v2i64) src_a15, 0); | |||||
src_a15 = COPY_DOUBLE_TO_VECTOR(*(a + 15)); | |||||
res_c2 *= src_a10; | res_c2 *= src_a10; | ||||
res_c3 -= res_c2 * src_a11; | res_c3 -= res_c2 * src_a11; | ||||
@@ -881,8 +871,7 @@ static void dsolve_4x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||||
res_c2 -= res_c0 * src_a2; | res_c2 -= res_c0 * src_a2; | ||||
res_c3 -= res_c0 * src_a3; | res_c3 -= res_c0 * src_a3; | ||||
src_a5 = __msa_cast_to_vector_double(*(a + 5)); | |||||
src_a5 = (v2f64) __msa_splati_d((v2i64) src_a5, 0); | |||||
src_a5 = COPY_DOUBLE_TO_VECTOR(*(a + 5)); | |||||
src_a6 = LD_DP(a + 6); | src_a6 = LD_DP(a + 6); | ||||
src_a7 = (v2f64) __msa_splati_d((v2i64) src_a6, 1); | src_a7 = (v2f64) __msa_splati_d((v2i64) src_a6, 1); | ||||
src_a6 = (v2f64) __msa_splati_d((v2i64) src_a6, 0); | src_a6 = (v2f64) __msa_splati_d((v2i64) src_a6, 0); | ||||
@@ -894,8 +883,7 @@ static void dsolve_4x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||||
src_a10 = LD_DP(a + 10); | src_a10 = LD_DP(a + 10); | ||||
src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); | src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); | ||||
src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); | src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); | ||||
src_a15 = __msa_cast_to_vector_double(*(a + 15)); | |||||
src_a15 = (v2f64) __msa_splati_d((v2i64) src_a15, 0); | |||||
src_a15 = COPY_DOUBLE_TO_VECTOR(*(a + 15)); | |||||
res_c2 *= src_a10; | res_c2 *= src_a10; | ||||
res_c3 -= res_c2 * src_a11; | res_c3 -= res_c2 * src_a11; | ||||
@@ -161,16 +161,14 @@ void dsolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) | |||||
src_b2 = LD_DP(b + 2); | src_b2 = LD_DP(b + 2); | ||||
src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); | src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); | ||||
src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); | src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); | ||||
src_b5 = __msa_cast_to_vector_double(*(b + 5)); | |||||
src_b5 = (v2f64) __msa_splati_d((v2i64) src_b5, 0); | |||||
src_b5 = COPY_DOUBLE_TO_VECTOR(*(b + 5)); | |||||
src_b6 = LD_DP(b + 6); | src_b6 = LD_DP(b + 6); | ||||
src_b7 = (v2f64) __msa_splati_d((v2i64) src_b6, 1); | src_b7 = (v2f64) __msa_splati_d((v2i64) src_b6, 1); | ||||
src_b6 = (v2f64) __msa_splati_d((v2i64) src_b6, 0); | src_b6 = (v2f64) __msa_splati_d((v2i64) src_b6, 0); | ||||
src_b10 = LD_DP(b + 10); | src_b10 = LD_DP(b + 10); | ||||
src_b11 = (v2f64) __msa_splati_d((v2i64) src_b10, 1); | src_b11 = (v2f64) __msa_splati_d((v2i64) src_b10, 1); | ||||
src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); | src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); | ||||
src_b15 = __msa_cast_to_vector_double(*(b + 15)); | |||||
src_b15 = (v2f64) __msa_splati_d((v2i64) src_b15, 0); | |||||
src_b15 = COPY_DOUBLE_TO_VECTOR(*(b + 15)); | |||||
src_c0 *= src_b0; | src_c0 *= src_b0; | ||||
src_c1 *= src_b0; | src_c1 *= src_b0; | ||||
@@ -294,8 +292,7 @@ static void dsolve_8x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||||
src_b0 = LD_DP(b + 0); | src_b0 = LD_DP(b + 0); | ||||
src_b1 = (v2f64) __msa_splati_d((v2i64) src_b0, 1); | src_b1 = (v2f64) __msa_splati_d((v2i64) src_b0, 1); | ||||
src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); | src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); | ||||
src_b3 = __msa_cast_to_vector_double(*(b + 3)); | |||||
src_b3 = (v2f64) __msa_splati_d((v2i64) src_b3, 0); | |||||
src_b3 = COPY_DOUBLE_TO_VECTOR(*(b + 3)); | |||||
src_c0 *= src_b0; | src_c0 *= src_b0; | ||||
src_c1 *= src_b0; | src_c1 *= src_b0; | ||||
@@ -347,8 +344,7 @@ static void dsolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) | |||||
} | } | ||||
} | } | ||||
src_b0 = __msa_cast_to_vector_double(*b); | |||||
src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); | |||||
src_b0 = COPY_DOUBLE_TO_VECTOR(*b); | |||||
src_c0 *= src_b0; | src_c0 *= src_b0; | ||||
src_c1 *= src_b0; | src_c1 *= src_b0; | ||||
@@ -407,16 +403,14 @@ static void dsolve_4x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||||
src_b2 = LD_DP(b + 2); | src_b2 = LD_DP(b + 2); | ||||
src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); | src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); | ||||
src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); | src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); | ||||
src_b5 = __msa_cast_to_vector_double(*(b + 5)); | |||||
src_b5 = (v2f64) __msa_splati_d((v2i64) src_b5, 0); | |||||
src_b5 = COPY_DOUBLE_TO_VECTOR(*(b + 5)); | |||||
src_b6 = LD_DP(b + 6); | src_b6 = LD_DP(b + 6); | ||||
src_b7 = (v2f64) __msa_splati_d((v2i64) src_b6, 1); | src_b7 = (v2f64) __msa_splati_d((v2i64) src_b6, 1); | ||||
src_b6 = (v2f64) __msa_splati_d((v2i64) src_b6, 0); | src_b6 = (v2f64) __msa_splati_d((v2i64) src_b6, 0); | ||||
src_b10 = LD_DP(b + 10); | src_b10 = LD_DP(b + 10); | ||||
src_b11 = (v2f64) __msa_splati_d((v2i64) src_b10, 1); | src_b11 = (v2f64) __msa_splati_d((v2i64) src_b10, 1); | ||||
src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); | src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); | ||||
src_b15 = __msa_cast_to_vector_double(*(b + 15)); | |||||
src_b15 = (v2f64) __msa_splati_d((v2i64) src_b15, 0); | |||||
src_b15 = COPY_DOUBLE_TO_VECTOR(*(b + 15)); | |||||
src_c0 *= src_b0; | src_c0 *= src_b0; | ||||
src_c1 *= src_b0; | src_c1 *= src_b0; | ||||
@@ -490,8 +484,7 @@ static void dsolve_4x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||||
src_b0 = LD_DP(b + 0); | src_b0 = LD_DP(b + 0); | ||||
src_b1 = (v2f64) __msa_splati_d((v2i64) src_b0, 1); | src_b1 = (v2f64) __msa_splati_d((v2i64) src_b0, 1); | ||||
src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); | src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); | ||||
src_b3 = __msa_cast_to_vector_double(*(b + 3)); | |||||
src_b3 = (v2f64) __msa_splati_d((v2i64) src_b3, 0); | |||||
src_b3 = COPY_DOUBLE_TO_VECTOR(*(b + 3)); | |||||
src_c0 *= src_b0; | src_c0 *= src_b0; | ||||
src_c1 *= src_b0; | src_c1 *= src_b0; | ||||
@@ -168,11 +168,9 @@ void dsolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) | |||||
src_b8 = LD_DP(b + 8); | src_b8 = LD_DP(b + 8); | ||||
src_b9 = (v2f64) __msa_splati_d((v2i64) src_b8, 1); | src_b9 = (v2f64) __msa_splati_d((v2i64) src_b8, 1); | ||||
src_b8 = (v2f64) __msa_splati_d((v2i64) src_b8, 0); | src_b8 = (v2f64) __msa_splati_d((v2i64) src_b8, 0); | ||||
src_b10 = __msa_cast_to_vector_double(*(b + 10)); | |||||
src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); | |||||
src_b10 = COPY_DOUBLE_TO_VECTOR(*(b + 10)); | |||||
src_b0 = __msa_cast_to_vector_double(*(b + 0)); | |||||
src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); | |||||
src_b0 = COPY_DOUBLE_TO_VECTOR(*(b + 0)); | |||||
src_b4 = LD_DP(b + 4); | src_b4 = LD_DP(b + 4); | ||||
src_b5 = (v2f64) __msa_splati_d((v2i64) src_b4, 1); | src_b5 = (v2f64) __msa_splati_d((v2i64) src_b4, 1); | ||||
src_b4 = (v2f64) __msa_splati_d((v2i64) src_b4, 0); | src_b4 = (v2f64) __msa_splati_d((v2i64) src_b4, 0); | ||||
@@ -298,8 +296,7 @@ static void dsolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||||
a -= 16; | a -= 16; | ||||
b -= 4; | b -= 4; | ||||
src_b0 = __msa_cast_to_vector_double(*(b + 0)); | |||||
src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); | |||||
src_b0 = COPY_DOUBLE_TO_VECTOR(*(b + 0)); | |||||
src_b2 = LD_DP(b + 2); | src_b2 = LD_DP(b + 2); | ||||
src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); | src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); | ||||
src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); | src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); | ||||
@@ -377,8 +374,7 @@ static void dsolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) | |||||
a -= 8; | a -= 8; | ||||
b -= 1; | b -= 1; | ||||
src_b0 = __msa_cast_to_vector_double(*b); | |||||
src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); | |||||
src_b0 = COPY_DOUBLE_TO_VECTOR(*b); | |||||
src_c0 *= src_b0; | src_c0 *= src_b0; | ||||
src_c1 *= src_b0; | src_c1 *= src_b0; | ||||
@@ -445,11 +441,9 @@ static void dsolve_4x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||||
src_b8 = LD_DP(b + 8); | src_b8 = LD_DP(b + 8); | ||||
src_b9 = (v2f64) __msa_splati_d((v2i64) src_b8, 1); | src_b9 = (v2f64) __msa_splati_d((v2i64) src_b8, 1); | ||||
src_b8 = (v2f64) __msa_splati_d((v2i64) src_b8, 0); | src_b8 = (v2f64) __msa_splati_d((v2i64) src_b8, 0); | ||||
src_b10 = __msa_cast_to_vector_double(*(b + 10)); | |||||
src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); | |||||
src_b10 = COPY_DOUBLE_TO_VECTOR(*(b + 10)); | |||||
src_b0 = __msa_cast_to_vector_double(*(b + 0)); | |||||
src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); | |||||
src_b0 = COPY_DOUBLE_TO_VECTOR(*(b + 0)); | |||||
src_b4 = LD_DP(b + 4); | src_b4 = LD_DP(b + 4); | ||||
src_b5 = (v2f64) __msa_splati_d((v2i64) src_b4, 1); | src_b5 = (v2f64) __msa_splati_d((v2i64) src_b4, 1); | ||||
src_b4 = (v2f64) __msa_splati_d((v2i64) src_b4, 0); | src_b4 = (v2f64) __msa_splati_d((v2i64) src_b4, 0); | ||||
@@ -527,8 +521,7 @@ static void dsolve_4x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||||
a -= 8; | a -= 8; | ||||
b -= 4; | b -= 4; | ||||
src_b0 = __msa_cast_to_vector_double(*(b + 0)); | |||||
src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); | |||||
src_b0 = COPY_DOUBLE_TO_VECTOR(*(b + 0)); | |||||
src_b2 = LD_DP(b + 2); | src_b2 = LD_DP(b + 2); | ||||
src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); | src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); | ||||
src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); | src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); | ||||
@@ -63,16 +63,12 @@ inline static void prefetch_load_lf(unsigned char *src) | |||||
#define ST_DP(...) ST_D(v2f64, __VA_ARGS__) | #define ST_DP(...) ST_D(v2f64, __VA_ARGS__) | ||||
#define COPY_FLOAT_TO_VECTOR(a) ( { \ | #define COPY_FLOAT_TO_VECTOR(a) ( { \ | ||||
v4f32 out; \ | |||||
out = __msa_cast_to_vector_float(a); \ | |||||
out = (v4f32) __msa_splati_w((v4i32) out, 0); \ | |||||
v4f32 out = {a, a, a, a}; \ | |||||
out; \ | out; \ | ||||
} ) | } ) | ||||
#define COPY_DOUBLE_TO_VECTOR(a) ( { \ | #define COPY_DOUBLE_TO_VECTOR(a) ( { \ | ||||
v2f64 out; \ | |||||
out = __msa_cast_to_vector_double(a); \ | |||||
out = (v2f64) __msa_splati_d((v2i64) out, 0); \ | |||||
v2f64 out = {a, a}; \ | |||||
out; \ | out; \ | ||||
} ) | } ) | ||||
@@ -48,11 +48,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, | |||||
{ | { | ||||
if ((0 == c) && (0 == s)) | if ((0 == c) && (0 == s)) | ||||
{ | { | ||||
v4f32 zero = __msa_cast_to_vector_float(0); | |||||
zero = (v4f32) __msa_insert_w((v4i32) zero, 0, 0.0); | |||||
zero = (v4f32) __msa_insert_w((v4i32) zero, 1, 0.0); | |||||
zero = (v4f32) __msa_insert_w((v4i32) zero, 2, 0.0); | |||||
zero = (v4f32) __msa_insert_w((v4i32) zero, 3, 0.0); | |||||
v4f32 zero = {0.0, 0.0, 0.0, 0.0}; | |||||
/* process 4 floats */ | /* process 4 floats */ | ||||
for (j = (n >> 2); j--;) | for (j = (n >> 2); j--;) | ||||
@@ -44,11 +44,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, | |||||
{ | { | ||||
if (0.0 == da) | if (0.0 == da) | ||||
{ | { | ||||
v4f32 zero_v = __msa_cast_to_vector_float(0); | |||||
zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 0, 0.0); | |||||
zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 1, 0.0); | |||||
zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 2, 0.0); | |||||
zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 3, 0.0); | |||||
v4f32 zero_v = {0.0, 0.0, 0.0, 0.0}; | |||||
for (i = (n >> 6); i--;) | for (i = (n >> 6); i--;) | ||||
{ | { | ||||
@@ -49,9 +49,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||||
{ | { | ||||
if ((0.0 == da_r) && (0.0 == da_i)) | if ((0.0 == da_r) && (0.0 == da_i)) | ||||
{ | { | ||||
v2f64 zero_v = __msa_cast_to_vector_double(0); | |||||
zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0); | |||||
zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0); | |||||
v2f64 zero_v = {0.0, 0.0}; | |||||
for (i = (n >> 4); i--;) | for (i = (n >> 4); i--;) | ||||
{ | { | ||||
@@ -475,9 +473,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||||
if ((0.0 == da_r) && (0.0 == da_i)) | if ((0.0 == da_r) && (0.0 == da_i)) | ||||
{ | { | ||||
v2f64 zero_v = __msa_cast_to_vector_double(0); | |||||
zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0); | |||||
zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0); | |||||
v2f64 zero_v = {0.0, 0.0}; | |||||
for (i = (n >> 4); i--;) | for (i = (n >> 4); i--;) | ||||
{ | { | ||||
@@ -1,64 +0,0 @@ | |||||
SAXPYKERNEL=axpy_loongson3a.S | |||||
DAXPYKERNEL=daxpy_loongson3a_simd.S | |||||
SGEMVNKERNEL = gemv_n_loongson3a.c | |||||
SGEMVTKERNEL = gemv_t_loongson3a.c | |||||
DGEMVNKERNEL = gemv_n_loongson3a.c | |||||
DGEMVTKERNEL = gemv_t_loongson3a.c | |||||
CGEMVNKERNEL = zgemv_n_loongson3a.c | |||||
CGEMVTKERNEL = zgemv_t_loongson3a.c | |||||
ZGEMVNKERNEL = zgemv_n_loongson3a.c | |||||
ZGEMVTKERNEL = zgemv_t_loongson3a.c | |||||
STRMMKERNEL = ../generic/trmmkernel_2x2.c | |||||
DTRMMKERNEL = ../generic/trmmkernel_2x2.c | |||||
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c | |||||
ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c | |||||
SGEMMKERNEL = ../generic/gemmkernel_2x2.c | |||||
SGEMMONCOPY = ../generic/gemm_ncopy_2.c | |||||
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c | |||||
SGEMMONCOPYOBJ = sgemm_oncopy.o | |||||
SGEMMOTCOPYOBJ = sgemm_otcopy.o | |||||
DGEMMKERNEL = ../generic/gemmkernel_2x2.c | |||||
DGEMMONCOPY = ../generic/gemm_ncopy_2.c | |||||
DGEMMOTCOPY = ../generic/gemm_tcopy_2.c | |||||
DGEMMONCOPYOBJ = dgemm_oncopy.o | |||||
DGEMMOTCOPYOBJ = dgemm_otcopy.o | |||||
CGEMMKERNEL = ../generic/zgemmkernel_2x2.c | |||||
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||||
CGEMMONCOPYOBJ = cgemm_oncopy.o | |||||
CGEMMOTCOPYOBJ = cgemm_otcopy.o | |||||
ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c | |||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||||
ZGEMMONCOPYOBJ = zgemm_oncopy.o | |||||
ZGEMMOTCOPYOBJ = zgemm_otcopy.o | |||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
@@ -16,32 +16,32 @@ SGEMMINCOPY = ../generic/gemm_ncopy_8.c | |||||
SGEMMITCOPY = ../generic/gemm_tcopy_8.c | SGEMMITCOPY = ../generic/gemm_tcopy_8.c | ||||
SGEMMONCOPY = ../generic/gemm_ncopy_4.c | SGEMMONCOPY = ../generic/gemm_ncopy_4.c | ||||
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c | SGEMMOTCOPY = ../generic/gemm_tcopy_4.c | ||||
SGEMMINCOPYOBJ = sgemm_incopy.o | |||||
SGEMMITCOPYOBJ = sgemm_itcopy.o | |||||
SGEMMONCOPYOBJ = sgemm_oncopy.o | |||||
SGEMMOTCOPYOBJ = sgemm_otcopy.o | |||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
DGEMMKERNEL = dgemm_kernel_loongson3a_4x4.S | DGEMMKERNEL = dgemm_kernel_loongson3a_4x4.S | ||||
DGEMMONCOPY = ../generic/gemm_ncopy_4.c | DGEMMONCOPY = ../generic/gemm_ncopy_4.c | ||||
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c | DGEMMOTCOPY = ../generic/gemm_tcopy_4.c | ||||
DGEMMONCOPYOBJ = dgemm_oncopy.o | |||||
DGEMMOTCOPYOBJ = dgemm_otcopy.o | |||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
CGEMMKERNEL = cgemm_kernel_loongson3a_4x2_ps.S | CGEMMKERNEL = cgemm_kernel_loongson3a_4x2_ps.S | ||||
CGEMMINCOPY = ../generic/zgemm_ncopy_4.c | CGEMMINCOPY = ../generic/zgemm_ncopy_4.c | ||||
CGEMMITCOPY = ../generic/zgemm_tcopy_4.c | CGEMMITCOPY = ../generic/zgemm_tcopy_4.c | ||||
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c | CGEMMONCOPY = ../generic/zgemm_ncopy_2.c | ||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | ||||
CGEMMINCOPYOBJ = cgemm_incopy.o | |||||
CGEMMITCOPYOBJ = cgemm_itcopy.o | |||||
CGEMMONCOPYOBJ = cgemm_oncopy.o | |||||
CGEMMOTCOPYOBJ = cgemm_otcopy.o | |||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
ZGEMMKERNEL = zgemm_kernel_loongson3a_2x2.S | ZGEMMKERNEL = zgemm_kernel_loongson3a_2x2.S | ||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | ||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | ||||
ZGEMMONCOPYOBJ = zgemm_oncopy.o | |||||
ZGEMMOTCOPYOBJ = zgemm_otcopy.o | |||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | ||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | ||||
@@ -64,6 +64,3 @@ ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | ||||
DSDOTKERNEL = ../mips/dot.c | DSDOTKERNEL = ../mips/dot.c | ||||
@@ -0,0 +1,192 @@ | |||||
ifdef HAVE_MSA | |||||
SAXPYKERNEL = ../mips/saxpy_msa.c | |||||
DAXPYKERNEL = ../mips/daxpy_msa.c | |||||
CAXPYKERNEL = ../mips/caxpy_msa.c | |||||
ZAXPYKERNEL = ../mips/zaxpy_msa.c | |||||
else | |||||
SAXPYKERNEL = axpy_loongson3a.S | |||||
DAXPYKERNEL = daxpy_loongson3a_simd.S | |||||
endif | |||||
ifdef HAVE_MSA | |||||
SCOPYKERNEL = ../mips/scopy_msa.c | |||||
DCOPYKERNEL = ../mips/dcopy_msa.c | |||||
CCOPYKERNEL = ../mips/ccopy_msa.c | |||||
ZCOPYKERNEL = ../mips/zcopy_msa.c | |||||
endif | |||||
ifdef HAVE_MSA | |||||
SDOTKERNEL = ../mips/sdot_msa.c | |||||
DDOTKERNEL = ../mips/ddot_msa.c | |||||
CDOTKERNEL = ../mips/cdot_msa.c | |||||
ZDOTKERNEL = ../mips/zdot_msa.c | |||||
endif | |||||
DSDOTKERNEL = ../mips/dot.c | |||||
ifdef HAVE_MSA | |||||
SROTKERNEL = ../mips/srot_msa.c | |||||
DROTKERNEL = ../mips/drot_msa.c | |||||
CROTKERNEL = ../mips/crot_msa.c | |||||
ZROTKERNEL = ../mips/zrot_msa.c | |||||
endif | |||||
ifdef HAVE_MSA | |||||
SSCALKERNEL = ../mips/sscal_msa.c | |||||
DSCALKERNEL = ../mips/dscal_msa.c | |||||
CSCALKERNEL = ../mips/cscal_msa.c | |||||
ZSCALKERNEL = ../mips/zscal_msa.c | |||||
endif | |||||
ifdef HAVE_MSA | |||||
SGEMVNKERNEL = ../mips/sgemv_n_msa.c | |||||
DGEMVNKERNEL = ../mips/dgemv_n_msa.c | |||||
SGEMVTKERNEL = ../mips/sgemv_t_msa.c | |||||
DGEMVTKERNEL = ../mips/dgemv_t_msa.c | |||||
CGEMVNKERNEL = ../mips/cgemv_n_msa.c | |||||
CGEMVTKERNEL = ../mips/cgemv_t_msa.c | |||||
ZGEMVNKERNEL = ../mips/zgemv_n_msa.c | |||||
ZGEMVTKERNEL = ../mips/zgemv_t_msa.c | |||||
else | |||||
SGEMVNKERNEL = gemv_n_loongson3a.c | |||||
SGEMVTKERNEL = gemv_t_loongson3a.c | |||||
DGEMVNKERNEL = gemv_n_loongson3a.c | |||||
DGEMVTKERNEL = gemv_t_loongson3a.c | |||||
CGEMVNKERNEL = zgemv_n_loongson3a.c | |||||
CGEMVTKERNEL = zgemv_t_loongson3a.c | |||||
ZGEMVNKERNEL = zgemv_n_loongson3a.c | |||||
ZGEMVTKERNEL = zgemv_t_loongson3a.c | |||||
endif | |||||
ifdef HAVE_MSA | |||||
SASUMKERNEL = ../mips/sasum_msa.c | |||||
DASUMKERNEL = ../mips/dasum_msa.c | |||||
CASUMKERNEL = ../mips/casum_msa.c | |||||
ZASUMKERNEL = ../mips/zasum_msa.c | |||||
endif | |||||
ifdef HAVE_MSA | |||||
SSWAPKERNEL = ../mips/sswap_msa.c | |||||
DSWAPKERNEL = ../mips/dswap_msa.c | |||||
CSWAPKERNEL = ../mips/cswap_msa.c | |||||
ZSWAPKERNEL = ../mips/zswap_msa.c | |||||
endif | |||||
ifdef HAVE_MSA | |||||
SGEMMKERNEL = ../mips/sgemm_kernel_8x8_msa.c | |||||
SGEMMONCOPY = ../mips/sgemm_ncopy_8_msa.c | |||||
SGEMMOTCOPY = ../mips/sgemm_tcopy_8_msa.c | |||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
else | |||||
SGEMMKERNEL = sgemm_kernel_8x4_ps.S | |||||
SGEMMINCOPY = ../generic/gemm_ncopy_8.c | |||||
SGEMMITCOPY = ../generic/gemm_tcopy_8.c | |||||
SGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||||
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
endif | |||||
ifdef HAVE_MSA | |||||
DGEMMKERNEL = ../mips/dgemm_kernel_8x4_msa.c | |||||
DGEMMINCOPY = ../mips/dgemm_ncopy_8_msa.c | |||||
DGEMMITCOPY = ../mips/dgemm_tcopy_8_msa.c | |||||
DGEMMONCOPY = ../mips/dgemm_ncopy_4_msa.c | |||||
DGEMMOTCOPY = ../mips/dgemm_tcopy_4_msa.c | |||||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
else | |||||
DGEMMKERNEL = dgemm_kernel_loongson3a_4x4.S | |||||
DGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||||
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
endif | |||||
ifdef HAVE_MSA | |||||
CGEMMKERNEL = ../mips/cgemm_kernel_8x4_msa.c | |||||
CGEMMINCOPY = ../mips/cgemm_ncopy_8_msa.c | |||||
CGEMMITCOPY = ../mips/cgemm_tcopy_8_msa.c | |||||
CGEMMONCOPY = ../mips/cgemm_ncopy_4_msa.c | |||||
CGEMMOTCOPY = ../mips/cgemm_tcopy_4_msa.c | |||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
else | |||||
CGEMMKERNEL = cgemm_kernel_loongson3a_4x2_ps.S | |||||
CGEMMINCOPY = ../generic/zgemm_ncopy_4.c | |||||
CGEMMITCOPY = ../generic/zgemm_tcopy_4.c | |||||
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
endif | |||||
ifdef HAVE_MSA | |||||
ZGEMMKERNEL = ../mips/zgemm_kernel_4x4_msa.c | |||||
ZGEMMONCOPY = ../mips/zgemm_ncopy_4_msa.c | |||||
ZGEMMOTCOPY = ../mips/zgemm_tcopy_4_msa.c | |||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
else | |||||
ZGEMMKERNEL = zgemm_kernel_loongson3a_2x2.S | |||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
endif | |||||
ifdef HAVE_MSA | |||||
STRSMKERNEL_LN = ../mips/strsm_kernel_LN_8x8_msa.c | |||||
STRSMKERNEL_LT = ../mips/strsm_kernel_LT_8x8_msa.c | |||||
STRSMKERNEL_RN = ../mips/strsm_kernel_RN_8x8_msa.c | |||||
STRSMKERNEL_RT = ../mips/strsm_kernel_RT_8x8_msa.c | |||||
else | |||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
endif | |||||
ifdef HAVE_MSA | |||||
DTRSMKERNEL_LN = ../mips/dtrsm_kernel_LN_8x4_msa.c | |||||
DTRSMKERNEL_LT = ../mips/dtrsm_kernel_LT_8x4_msa.c | |||||
DTRSMKERNEL_RN = ../mips/dtrsm_kernel_RN_8x4_msa.c | |||||
DTRSMKERNEL_RT = ../mips/dtrsm_kernel_RT_8x4_msa.c | |||||
else | |||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
endif | |||||
ifdef HAVE_MSA | |||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
else | |||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
endif | |||||
ifdef HAVE_MSA | |||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
else | |||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
endif |
@@ -933,6 +933,77 @@ static void init_parameter(void) { | |||||
} | } | ||||
#else // (ARCH_ARM64) | #else // (ARCH_ARM64) | ||||
#if defined(ARCH_MIPS64) | |||||
/*
 * init_parameter (variant compiled when ARCH_MIPS64 is defined).
 *
 * Fills the GEMM blocking fields (*gemm_p, *gemm_q, *gemm_r) of TABLE_NAME
 * from the corresponding *_DEFAULT_P / *_DEFAULT_Q / *_DEFAULT_R macros.
 * Extended-precision (q/x) fields are written only when EXPRECISION is
 * defined, and the *gemm3m_* fields only when USE_GEMM3M is defined.
 * Takes no arguments and returns nothing; its only effect is the
 * assignments to TABLE_NAME members below.
 */
static void init_parameter(void) { | |||||
TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; | |||||
TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; | |||||
TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; | |||||
TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; | |||||
TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; | |||||
TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; | |||||
TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; | |||||
TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q; | |||||
TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; | |||||
/*
 * dgemm_r is the only field assigned a literal instead of its
 * DGEMM_DEFAULT_R macro.
 * NOTE(review): the LOONGSON3R3 parameter section defines DGEMM_DEFAULT_R
 * as the identifier dgemm_r rather than a number, which is presumably why
 * a constant is used here; 640 matches the S/C/ZGEMM_DEFAULT_R values in
 * that section -- confirm it is also right for every other MIPS64 core.
 */
TABLE_NAME.dgemm_r = 640; | |||||
TABLE_NAME.cgemm_r = CGEMM_DEFAULT_R; | |||||
TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R; | |||||
#ifdef EXPRECISION | |||||
TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; | |||||
TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; | |||||
TABLE_NAME.qgemm_q = QGEMM_DEFAULT_Q; | |||||
TABLE_NAME.xgemm_q = XGEMM_DEFAULT_Q; | |||||
TABLE_NAME.qgemm_r = QGEMM_DEFAULT_R; | |||||
TABLE_NAME.xgemm_r = XGEMM_DEFAULT_R; | |||||
#endif | |||||
/*
 * GEMM3M fields: each one takes its own *GEMM3M_DEFAULT_* macro when that
 * macro is defined; otherwise cgemm3m_* falls back to the sgemm_* value
 * and zgemm3m_* to the dgemm_* value assigned above.
 */
#if defined(USE_GEMM3M) | |||||
#ifdef CGEMM3M_DEFAULT_P | |||||
TABLE_NAME.cgemm3m_p = CGEMM3M_DEFAULT_P; | |||||
#else | |||||
TABLE_NAME.cgemm3m_p = TABLE_NAME.sgemm_p; | |||||
#endif | |||||
#ifdef ZGEMM3M_DEFAULT_P | |||||
TABLE_NAME.zgemm3m_p = ZGEMM3M_DEFAULT_P; | |||||
#else | |||||
TABLE_NAME.zgemm3m_p = TABLE_NAME.dgemm_p; | |||||
#endif | |||||
#ifdef CGEMM3M_DEFAULT_Q | |||||
TABLE_NAME.cgemm3m_q = CGEMM3M_DEFAULT_Q; | |||||
#else | |||||
TABLE_NAME.cgemm3m_q = TABLE_NAME.sgemm_q; | |||||
#endif | |||||
#ifdef ZGEMM3M_DEFAULT_Q | |||||
TABLE_NAME.zgemm3m_q = ZGEMM3M_DEFAULT_Q; | |||||
#else | |||||
TABLE_NAME.zgemm3m_q = TABLE_NAME.dgemm_q; | |||||
#endif | |||||
#ifdef CGEMM3M_DEFAULT_R | |||||
TABLE_NAME.cgemm3m_r = CGEMM3M_DEFAULT_R; | |||||
#else | |||||
TABLE_NAME.cgemm3m_r = TABLE_NAME.sgemm_r; | |||||
#endif | |||||
#ifdef ZGEMM3M_DEFAULT_R | |||||
TABLE_NAME.zgemm3m_r = ZGEMM3M_DEFAULT_R; | |||||
#else | |||||
TABLE_NAME.zgemm3m_r = TABLE_NAME.dgemm_r; | |||||
#endif | |||||
/* The extended-precision 3M fields simply reuse the qgemm_* values. */
#ifdef EXPRECISION | |||||
TABLE_NAME.xgemm3m_p = TABLE_NAME.qgemm_p; | |||||
TABLE_NAME.xgemm3m_q = TABLE_NAME.qgemm_q; | |||||
TABLE_NAME.xgemm3m_r = TABLE_NAME.qgemm_r; | |||||
#endif | |||||
#endif | |||||
} | |||||
#else // (ARCH_MIPS64) | |||||
#if (ARCH_POWER) | #if (ARCH_POWER) | ||||
static void init_parameter(void) { | static void init_parameter(void) { | ||||
@@ -1780,4 +1851,5 @@ static void init_parameter(void) { | |||||
} | } | ||||
#endif //POWER | #endif //POWER | ||||
#endif //ZARCH | #endif //ZARCH | ||||
#endif //(ARCH_MIPS64) | |||||
#endif //(ARCH_ARM64) | #endif //(ARCH_ARM64) |
@@ -2570,8 +2570,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#define SYMV_P 16 | #define SYMV_P 16 | ||||
#endif | #endif | ||||
#ifdef LOONGSON3A | |||||
/*Copy from SICORTEX*/ | |||||
#if defined(LOONGSON3R4) | |||||
#define SNUMOPT 2 | #define SNUMOPT 2 | ||||
#define DNUMOPT 2 | #define DNUMOPT 2 | ||||
@@ -2579,6 +2578,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#define GEMM_DEFAULT_OFFSET_B 0 | #define GEMM_DEFAULT_OFFSET_B 0 | ||||
#define GEMM_DEFAULT_ALIGN 0x03fffUL | #define GEMM_DEFAULT_ALIGN 0x03fffUL | ||||
#ifdef HAVE_MSA | |||||
#define SGEMM_DEFAULT_UNROLL_M 8 | |||||
#define SGEMM_DEFAULT_UNROLL_N 8 | |||||
#define DGEMM_DEFAULT_UNROLL_M 8 | |||||
#define DGEMM_DEFAULT_UNROLL_N 4 | |||||
#define CGEMM_DEFAULT_UNROLL_M 8 | |||||
#define CGEMM_DEFAULT_UNROLL_N 4 | |||||
#define ZGEMM_DEFAULT_UNROLL_M 4 | |||||
#define ZGEMM_DEFAULT_UNROLL_N 4 | |||||
#else | |||||
#define SGEMM_DEFAULT_UNROLL_M 8 | #define SGEMM_DEFAULT_UNROLL_M 8 | ||||
#define SGEMM_DEFAULT_UNROLL_N 4 | #define SGEMM_DEFAULT_UNROLL_N 4 | ||||
@@ -2590,6 +2602,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#define ZGEMM_DEFAULT_UNROLL_M 2 | #define ZGEMM_DEFAULT_UNROLL_M 2 | ||||
#define ZGEMM_DEFAULT_UNROLL_N 2 | #define ZGEMM_DEFAULT_UNROLL_N 2 | ||||
#endif | |||||
#define SGEMM_DEFAULT_P 64 | #define SGEMM_DEFAULT_P 64 | ||||
#define DGEMM_DEFAULT_P 44 | #define DGEMM_DEFAULT_P 44 | ||||
@@ -2612,7 +2625,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#define SYMV_P 16 | #define SYMV_P 16 | ||||
#endif | #endif | ||||
#ifdef LOONGSON3B | |||||
#if defined(LOONGSON3R3) | |||||
////Copy from SICORTEX | |||||
#define SNUMOPT 2 | #define SNUMOPT 2 | ||||
#define DNUMOPT 2 | #define DNUMOPT 2 | ||||
@@ -2620,32 +2634,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#define GEMM_DEFAULT_OFFSET_B 0 | #define GEMM_DEFAULT_OFFSET_B 0 | ||||
#define GEMM_DEFAULT_ALIGN 0x03fffUL | #define GEMM_DEFAULT_ALIGN 0x03fffUL | ||||
#define SGEMM_DEFAULT_UNROLL_M 2 | |||||
#define SGEMM_DEFAULT_UNROLL_N 2 | |||||
#define SGEMM_DEFAULT_UNROLL_M 8 | |||||
#define SGEMM_DEFAULT_UNROLL_N 4 | |||||
#define DGEMM_DEFAULT_UNROLL_M 2 | |||||
#define DGEMM_DEFAULT_UNROLL_N 2 | |||||
#define DGEMM_DEFAULT_UNROLL_M 4 | |||||
#define DGEMM_DEFAULT_UNROLL_N 4 | |||||
#define CGEMM_DEFAULT_UNROLL_M 2 | |||||
#define CGEMM_DEFAULT_UNROLL_M 4 | |||||
#define CGEMM_DEFAULT_UNROLL_N 2 | #define CGEMM_DEFAULT_UNROLL_N 2 | ||||
#define ZGEMM_DEFAULT_UNROLL_M 2 | #define ZGEMM_DEFAULT_UNROLL_M 2 | ||||
#define ZGEMM_DEFAULT_UNROLL_N 2 | #define ZGEMM_DEFAULT_UNROLL_N 2 | ||||
#define SGEMM_DEFAULT_P 64 | #define SGEMM_DEFAULT_P 64 | ||||
#define DGEMM_DEFAULT_P 24 | |||||
#define CGEMM_DEFAULT_P 24 | |||||
#define ZGEMM_DEFAULT_P 20 | |||||
#define DGEMM_DEFAULT_P 44 | |||||
#define CGEMM_DEFAULT_P 64 | |||||
#define ZGEMM_DEFAULT_P 32 | |||||
#define SGEMM_DEFAULT_Q 192 | #define SGEMM_DEFAULT_Q 192 | ||||
#define DGEMM_DEFAULT_Q 128 | |||||
#define DGEMM_DEFAULT_Q 92 | |||||
#define CGEMM_DEFAULT_Q 128 | #define CGEMM_DEFAULT_Q 128 | ||||
#define ZGEMM_DEFAULT_Q 64 | |||||
#define ZGEMM_DEFAULT_Q 80 | |||||
#define SGEMM_DEFAULT_R 512 | |||||
#define DGEMM_DEFAULT_R 512 | |||||
#define CGEMM_DEFAULT_R 512 | |||||
#define ZGEMM_DEFAULT_R 512 | |||||
#define SGEMM_DEFAULT_R 640 | |||||
#define DGEMM_DEFAULT_R dgemm_r | |||||
#define CGEMM_DEFAULT_R 640 | |||||
#define ZGEMM_DEFAULT_R 640 | |||||
#define GEMM_OFFSET_A1 0x10000 | #define GEMM_OFFSET_A1 0x10000 | ||||
#define GEMM_OFFSET_B1 0x100000 | #define GEMM_OFFSET_B1 0x100000 | ||||