@@ -625,6 +625,10 @@ DYNAMIC_CORE += EMAG8180 | |||
DYNAMIC_CORE += THUNDERX3T110 | |||
endif | |||
ifeq ($(ARCH), mips64) | |||
DYNAMIC_CORE = LOONGSON3R3 LOONGSON3R4 | |||
endif | |||
ifeq ($(ARCH), zarch) | |||
DYNAMIC_CORE = ZARCH_GENERIC | |||
@@ -672,7 +676,7 @@ DYNAMIC_CORE += POWER9 | |||
else | |||
$(info, OpenBLAS: Your gcc version is too old to build the POWER9 kernels.) | |||
endif | |||
LDVERSIONGTEQ35 := $(shell expr `ld --version | head -1 | cut -f2 -d "." | cut -f1 -d "-"` >= 35) | |||
LDVERSIONGTEQ35 := $(shell expr `$(CC) -Wl,--version 2> /dev/null | head -1 | cut -f2 -d "." | cut -f1 -d "-"` \>= 35) | |||
ifeq ($(GCCVERSIONGTEQ11)$(LDVERSIONGTEQ35), 11) | |||
DYNAMIC_CORE += POWER10 | |||
CCOMMON_OPT += -DHAVE_P10_SUPPORT | |||
@@ -787,14 +791,9 @@ CCOMMON_OPT += -mabi=32 | |||
BINARY_DEFINED = 1 | |||
endif | |||
ifeq ($(CORE), LOONGSON3A) | |||
CCOMMON_OPT += -march=mips64 | |||
FCOMMON_OPT += -march=mips64 | |||
endif | |||
ifeq ($(CORE), LOONGSON3B) | |||
CCOMMON_OPT += -march=mips64 | |||
FCOMMON_OPT += -march=mips64 | |||
ifeq ($(CORE), $(filter $(CORE),LOONGSON3R3 LOONGSON3R4)) | |||
CCOMMON_OPT += -march=loongson3a | |||
FCOMMON_OPT += -march=loongson3a | |||
endif | |||
ifeq ($(CORE), MIPS24K) | |||
@@ -1078,11 +1077,11 @@ FCOMMON_OPT += -n32 | |||
else | |||
FCOMMON_OPT += -n64 | |||
endif | |||
ifeq ($(CORE), LOONGSON3A) | |||
ifeq ($(CORE), LOONGSON3R3) | |||
FCOMMON_OPT += -loongson3 -static | |||
endif | |||
ifeq ($(CORE), LOONGSON3B) | |||
ifeq ($(CORE), LOONGSON3R4) | |||
FCOMMON_OPT += -loongson3 -static | |||
endif | |||
@@ -1108,11 +1107,11 @@ CCOMMON_OPT += -n32 | |||
else | |||
CCOMMON_OPT += -n64 | |||
endif | |||
ifeq ($(CORE), LOONGSON3A) | |||
ifeq ($(CORE), LOONGSON3R3) | |||
CCOMMON_OPT += -loongson3 -static | |||
endif | |||
ifeq ($(CORE), LOONGSON3B) | |||
ifeq ($(CORE), LOONGSON3R4) | |||
CCOMMON_OPT += -loongson3 -static | |||
endif | |||
@@ -1223,10 +1222,8 @@ ifdef SMP | |||
CCOMMON_OPT += -DSMP_SERVER | |||
ifeq ($(ARCH), mips64) | |||
ifneq ($(CORE), LOONGSON3B) | |||
USE_SIMPLE_THREADED_LEVEL3 = 1 | |||
endif | |||
endif | |||
ifeq ($(USE_OPENMP), 1) | |||
# USE_SIMPLE_THREADED_LEVEL3 = 1 | |||
@@ -1342,11 +1339,9 @@ endif | |||
ifneq ($(ARCH), x86_64) | |||
ifneq ($(ARCH), x86) | |||
ifneq ($(CORE), LOONGSON3B) | |||
NO_AFFINITY = 1 | |||
endif | |||
endif | |||
endif | |||
ifdef NO_AFFINITY | |||
ifeq ($(NO_AFFINITY), 0) | |||
@@ -199,7 +199,7 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) { | |||
} else { | |||
$tmpf = new File::Temp( SUFFIX => '.c' , UNLINK => 1 ); | |||
$code = '"addvi.b $w0, $w1, 1"'; | |||
$msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs"; | |||
$msa_flags = "-mmsa -mfp64 -mload-store-pairs"; | |||
print $tmpf "#include <msa.h>\n\n"; | |||
print $tmpf "void main(void){ __asm__ volatile($code); }\n"; | |||
@@ -75,18 +75,10 @@ static inline int my_mbind(void *addr, unsigned long len, int mode, | |||
// https://lsbbugs.linuxfoundation.org/show_bug.cgi?id=3482 | |||
return 0; | |||
#else | |||
#if defined (LOONGSON3B) | |||
#if defined (__64BIT__) | |||
return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags); | |||
#else | |||
return 0; //NULL Implementation on Loongson 3B 32bit. | |||
#endif | |||
#else | |||
//Fixes a random SEGFAULT when nodemask==NULL on Linux 2.6.34 and above | |||
// unsigned long null_nodemask=0; | |||
return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags); | |||
#endif | |||
#endif | |||
} | |||
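For context, a hedged usage sketch of the simplified wrapper above: with the Loongson-3B special case gone, every mips64 build now issues the real mbind syscall. Constants follow <numaif.h> (MPOL_BIND == 2); the helper name is illustrative.

#include <sys/syscall.h>
#include <unistd.h>

/* Bind an already-mapped region to NUMA node 0 via the raw syscall,
   exactly as my_mbind now does unconditionally on mips64. */
static long bind_to_node0(void *addr, unsigned long len) {
    unsigned long nodemask = 1UL;            /* bit 0 => node 0 */
    return syscall(SYS_mbind, addr, len, 2 /* MPOL_BIND */,
                   &nodemask, sizeof(nodemask) * 8, 0);
}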
static inline int my_set_mempolicy(int mode, const unsigned long *addr, unsigned long flag) { | |||
@@ -229,12 +229,7 @@ REALNAME: ;\ | |||
#define BUFFER_SIZE ( 32 << 21) | |||
#if defined(LOONGSON3A) | |||
#define PAGESIZE (16UL << 10) | |||
#define FIXED_PAGESIZE (16UL << 10) | |||
#endif | |||
#if defined(LOONGSON3B) | |||
#if defined(LOONGSON3R3) || defined(LOONGSON3R4) | |||
#define PAGESIZE (16UL << 10) | |||
#define FIXED_PAGESIZE (16UL << 10) | |||
#endif | |||
@@ -250,7 +245,7 @@ REALNAME: ;\ | |||
#define MAP_ANONYMOUS MAP_ANON | |||
#endif | |||
#if defined(LOONGSON3A) || defined(LOONGSON3B) | |||
#if defined(LOONGSON3R3) || defined(LOONGSON3R4) | |||
#define PREFETCHD_(x) ld $0, x | |||
#define PREFETCHD(x) PREFETCHD_(x) | |||
#else | |||
@@ -70,19 +70,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
/* or implied, of The University of Texas at Austin. */ | |||
/*********************************************************************/ | |||
#define CPU_UNKNOWN 0 | |||
#define CPU_SICORTEX 1 | |||
#define CPU_LOONGSON3A 2 | |||
#define CPU_LOONGSON3B 3 | |||
#define CPU_I6400 4 | |||
#define CPU_P6600 5 | |||
#define CPU_I6500 6 | |||
#define CPU_UNKNOWN 0 | |||
#define CPU_SICORTEX 1 | |||
#define CPU_LOONGSON3R3 2 | |||
#define CPU_LOONGSON3R4 3 | |||
#define CPU_I6400 4 | |||
#define CPU_P6600 5 | |||
#define CPU_I6500 6 | |||
static char *cpuname[] = { | |||
"UNKNOWN", | |||
"SICORTEX", | |||
"LOONGSON3A", | |||
"LOONGSON3B", | |||
"LOONGSON3R3", | |||
"LOONGSON3R4", | |||
"I6400", | |||
"P6600", | |||
"I6500" | |||
@@ -90,48 +90,13 @@ static char *cpuname[] = { | |||
int detect(void){ | |||
#ifdef __linux | |||
#ifdef linux | |||
FILE *infile; | |||
char buffer[512], *p; | |||
p = (char *)NULL; | |||
infile = fopen("/proc/cpuinfo", "r"); | |||
while (fgets(buffer, sizeof(buffer), infile)){ | |||
if (!strncmp("cpu", buffer, 3)){ | |||
p = strchr(buffer, ':') + 2; | |||
#if 0 | |||
fprintf(stderr, "%s\n", p); | |||
#endif | |||
break; | |||
} | |||
} | |||
fclose(infile); | |||
if(p != NULL){ | |||
if (strstr(p, "Loongson-3A")){ | |||
return CPU_LOONGSON3A; | |||
}else if(strstr(p, "Loongson-3B")){ | |||
return CPU_LOONGSON3B; | |||
}else if (strstr(p, "Loongson-3")){ | |||
infile = fopen("/proc/cpuinfo", "r"); | |||
p = (char *)NULL; | |||
while (fgets(buffer, sizeof(buffer), infile)){ | |||
if (!strncmp("system type", buffer, 11)){ | |||
p = strchr(buffer, ':') + 2; | |||
break; | |||
} | |||
} | |||
fclose(infile); | |||
if (strstr(p, "loongson3a")) | |||
return CPU_LOONGSON3A; | |||
}else{ | |||
return CPU_SICORTEX; | |||
} | |||
} | |||
//Check model name for Loongson3 | |||
infile = fopen("/proc/cpuinfo", "r"); | |||
p = (char *)NULL; | |||
while (fgets(buffer, sizeof(buffer), infile)){ | |||
if (!strncmp("model name", buffer, 10)){ | |||
p = strchr(buffer, ':') + 2; | |||
@@ -140,14 +105,16 @@ int detect(void){ | |||
} | |||
fclose(infile); | |||
if(p != NULL){ | |||
if (strstr(p, "Loongson-3A")){ | |||
return CPU_LOONGSON3A; | |||
}else if(strstr(p, "Loongson-3B")){ | |||
return CPU_LOONGSON3B; | |||
} | |||
if (strstr(p, "Loongson-3A3000") || strstr(p, "Loongson-3B3000")){ | |||
return CPU_LOONGSON3R3; | |||
}else if(strstr(p, "Loongson-3A4000") || strstr(p, "Loongson-3B4000")){ | |||
return CPU_LOONGSON3R4; | |||
} else{ | |||
return CPU_SICORTEX; | |||
} | |||
} | |||
#endif | |||
return CPU_UNKNOWN; | |||
} | |||
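A defensive restatement of the new lookup (sketch only; same model-name mapping as above, but tolerating a missing or malformed /proc/cpuinfo, which the code above assumes is always present):

#include <stdio.h>
#include <string.h>

static int detect_sketch(void) {
    char buffer[512], *p = NULL;
    FILE *infile = fopen("/proc/cpuinfo", "r");
    if (!infile) return 0;                        /* CPU_UNKNOWN */
    while (fgets(buffer, sizeof(buffer), infile))
        if (!strncmp("model name", buffer, 10)) { p = strchr(buffer, ':'); break; }
    fclose(infile);
    if (!p) return 0;                             /* CPU_UNKNOWN */
    if (strstr(p, "Loongson-3A3000") || strstr(p, "Loongson-3B3000"))
        return 2;                                 /* CPU_LOONGSON3R3 */
    if (strstr(p, "Loongson-3A4000") || strstr(p, "Loongson-3B4000"))
        return 3;                                 /* CPU_LOONGSON3R4 */
    return 1;                                     /* CPU_SICORTEX */
}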
char *get_corename(void){ | |||
@@ -159,10 +126,10 @@ void get_architecture(void){ | |||
} | |||
void get_subarchitecture(void){ | |||
if(detect()==CPU_LOONGSON3A) { | |||
printf("LOONGSON3A"); | |||
}else if(detect()==CPU_LOONGSON3B){ | |||
printf("LOONGSON3B"); | |||
if(detect()==CPU_LOONGSON3R3) { | |||
printf("LOONGSON3R3"); | |||
}else if(detect()==CPU_LOONGSON3R4){ | |||
printf("LOONGSON3R4"); | |||
}else if(detect()==CPU_I6400){ | |||
printf("I6400"); | |||
}else if(detect()==CPU_P6600){ | |||
@@ -179,8 +146,8 @@ void get_subdirname(void){ | |||
} | |||
void get_cpuconfig(void){ | |||
if(detect()==CPU_LOONGSON3A) { | |||
printf("#define LOONGSON3A\n"); | |||
if(detect()==CPU_LOONGSON3R3) { | |||
printf("#define LOONGSON3R3\n"); | |||
printf("#define L1_DATA_SIZE 65536\n"); | |||
printf("#define L1_DATA_LINESIZE 32\n"); | |||
printf("#define L2_SIZE 512488\n"); | |||
@@ -188,8 +155,8 @@ void get_cpuconfig(void){ | |||
printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
printf("#define DTB_SIZE 4096\n"); | |||
printf("#define L2_ASSOCIATIVE 4\n"); | |||
}else if(detect()==CPU_LOONGSON3B){ | |||
printf("#define LOONGSON3B\n"); | |||
}else if(detect()==CPU_LOONGSON3R4){ | |||
printf("#define LOONGSON3R4\n"); | |||
printf("#define L1_DATA_SIZE 65536\n"); | |||
printf("#define L1_DATA_LINESIZE 32\n"); | |||
printf("#define L2_SIZE 512488\n"); | |||
@@ -237,10 +204,10 @@ void get_cpuconfig(void){ | |||
} | |||
void get_libname(void){ | |||
if(detect()==CPU_LOONGSON3A) { | |||
printf("loongson3a\n"); | |||
}else if(detect()==CPU_LOONGSON3B) { | |||
printf("loongson3b\n"); | |||
if(detect()==CPU_LOONGSON3R3) { | |||
printf("loongson3r3\n"); | |||
}else if(detect()==CPU_LOONGSON3R4) { | |||
printf("loongson3r4\n"); | |||
}else if(detect()==CPU_I6400) { | |||
printf("i6400\n"); | |||
}else if(detect()==CPU_P6600) { | |||
@@ -339,8 +339,10 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
#else | |||
if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; | |||
else | |||
if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N; | |||
/* | |||
if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N; | |||
else | |||
*/ | |||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; | |||
#endif | |||
@@ -373,8 +373,10 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
#else | |||
if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; | |||
else | |||
/* | |||
if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N; | |||
else | |||
*/ | |||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; | |||
#endif | |||
/* Copy part of local region of B into workspace */ | |||
@@ -24,10 +24,14 @@ else | |||
ifeq ($(ARCH),zarch) | |||
COMMONOBJS += dynamic_zarch.$(SUFFIX) | |||
else | |||
ifeq ($(ARCH),mips64) | |||
COMMONOBJS += dynamic_mips64.$(SUFFIX) | |||
else | |||
COMMONOBJS += dynamic.$(SUFFIX) | |||
endif | |||
endif | |||
endif | |||
endif | |||
else | |||
COMMONOBJS += parameter.$(SUFFIX) | |||
endif | |||
@@ -92,10 +96,14 @@ else | |||
ifeq ($(ARCH),zarch) | |||
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_zarch.$(SUFFIX) | |||
else | |||
ifeq ($(ARCH),mips64) | |||
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_mips64.$(SUFFIX) | |||
else | |||
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX) | |||
endif | |||
endif | |||
endif | |||
endif | |||
else | |||
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX) | |||
endif | |||
@@ -967,9 +967,11 @@ void goto_set_num_threads(int num_threads) { | |||
blas_cpu_number = num_threads; | |||
#if defined(ARCH_MIPS64) | |||
#ifndef DYNAMIC_ARCH | |||
//set parameters for different numbers of threads. | |||
blas_set_parameter(); | |||
#endif | |||
#endif | |||
} | |||
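How this is reached from user code (a sketch; assumes the public OpenBLAS entry point, which forwards to goto_set_num_threads()):

#include <cblas.h>   /* declares openblas_set_num_threads() in OpenBLAS builds */

void resize_pool(void) {
    /* On a static (non-DYNAMIC_ARCH) mips64 build this also re-derives the
       blocking parameters via blas_set_parameter(), per the guard above. */
    openblas_set_num_threads(4);
}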
@@ -0,0 +1,230 @@ | |||
/***************************************************************************** | |||
Copyright (c) 2020, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written | |||
permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
**********************************************************************************/ | |||
#include <sys/wait.h> | |||
#include <stdio.h> | |||
#include <unistd.h> | |||
#include <stdlib.h> | |||
#include <string.h> | |||
#include <sys/resource.h> | |||
#include "common.h" | |||
extern gotoblas_t gotoblas_LOONGSON3R3; | |||
extern gotoblas_t gotoblas_LOONGSON3R4; | |||
extern void openblas_warning(int verbose, const char * msg); | |||
#define NUM_CORETYPES 2 | |||
static char *corename[] = { | |||
"loongson3r3", | |||
"loongson3r4", | |||
"UNKNOWN" | |||
}; | |||
char *gotoblas_corename(void) { | |||
if (gotoblas == &gotoblas_LOONGSON3R3) return corename[0]; | |||
if (gotoblas == &gotoblas_LOONGSON3R4) return corename[1]; | |||
return corename[NUM_CORETYPES]; | |||
} | |||
static gotoblas_t *force_coretype(char *coretype) { | |||
int i; | |||
int found = -1; | |||
char message[128]; | |||
for ( i=0 ; i < NUM_CORETYPES; i++) | |||
{ | |||
if (!strncasecmp(coretype, corename[i], 20)) | |||
{ | |||
found = i; | |||
break; | |||
} | |||
} | |||
switch (found) | |||
{ | |||
case 0: return (&gotoblas_LOONGSON3R3); | |||
case 1: return (&gotoblas_LOONGSON3R4); | |||
} | |||
snprintf(message, 128, "Core not found: %s\n", coretype); | |||
openblas_warning(1, message); | |||
return NULL; | |||
} | |||
#define MMI_MASK 0x00000010 | |||
#define MSA_MASK 0x00000020 | |||
int fd[2]; | |||
int support_cpucfg; | |||
static void handler(int signum) | |||
{ | |||
close(fd[1]); | |||
exit(1); | |||
} | |||
/* Brief : Check whether the cpucfg instruction is supported on Loongson | |||
* Return: 1 supported | |||
* 0 not supported | |||
*/ | |||
static int cpucfg_test(void) { | |||
pid_t pid; | |||
int status = 0; | |||
support_cpucfg = 0; | |||
pipe(fd); | |||
pid = fork(); | |||
if (pid == 0) { /* Subprocess */ | |||
struct sigaction act; | |||
close(fd[0]); | |||
/* Set signal action for SIGILL. */ | |||
act.sa_handler = handler; | |||
sigaction(SIGILL,&act,NULL); | |||
/* Execute cpucfg in subprocess. */ | |||
__asm__ volatile( | |||
".insn \n\t" | |||
".word (0xc8080118) \n\t" | |||
::: | |||
); | |||
support_cpucfg = 1; | |||
write(fd[1],&support_cpucfg,sizeof(support_cpucfg)); | |||
close(fd[1]); | |||
exit(0); | |||
} else if (pid > 0){ /* Parent process*/ | |||
close(fd[1]); | |||
if ((waitpid(pid,&status,0) <= 0) || | |||
(read(fd[0],&support_cpucfg,sizeof(support_cpucfg)) <= 0)) | |||
support_cpucfg = 0; | |||
close(fd[0]); | |||
} else { | |||
support_cpucfg = 0; | |||
} | |||
return support_cpucfg; | |||
} | |||
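The fork/pipe dance above keeps a SIGILL from an unsupported cpucfg out of the caller's process entirely. An in-process alternative (a sketch, not what this patch does) is the classic sigsetjmp probe; it is lighter, but it temporarily rewires the caller's SIGILL handler:

#include <setjmp.h>
#include <signal.h>

static sigjmp_buf probe_env;
static void probe_sigill(int sig) { (void)sig; siglongjmp(probe_env, 1); }

/* Returns 1 if cpucfg executed, 0 if it raised SIGILL. */
static int cpucfg_probe_inprocess(void) {
    struct sigaction sa = {0}, old;
    volatile int ok = 0;
    sa.sa_handler = probe_sigill;
    sigaction(SIGILL, &sa, &old);
    if (sigsetjmp(probe_env, 1) == 0) {
        __asm__ volatile(".insn \n\t .word (0xc8080118) \n\t":::);
        ok = 1;
    }
    sigaction(SIGILL, &old, NULL);
    return ok;
}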
static gotoblas_t *get_coretype_from_cpucfg(void) { | |||
int flag = 0; | |||
__asm__ volatile( | |||
".insn \n\t" | |||
"dli $8, 0x01 \n\t" | |||
".word (0xc9084918) \n\t" | |||
"usw $9, 0x00(%0) \n\t" | |||
: | |||
: "r"(&flag) | |||
: "memory" | |||
); | |||
if (flag & MSA_MASK) | |||
return (&gotoblas_LOONGSON3R4); | |||
if (flag & MMI_MASK) | |||
return (&gotoblas_LOONGSON3R3); | |||
return NULL; | |||
} | |||
static gotoblas_t *get_coretype_from_cpuinfo(void) { | |||
#ifdef linux | |||
FILE *infile; | |||
char buffer[512], *p; | |||
p = (char *)NULL; | |||
//Check model name for Loongson3 | |||
infile = fopen("/proc/cpuinfo", "r"); | |||
while (fgets(buffer, sizeof(buffer), infile)){ | |||
if (!strncmp("model name", buffer, 10)){ | |||
p = strchr(buffer, ':') + 2; | |||
break; | |||
} | |||
} | |||
fclose(infile); | |||
if(p != NULL){ | |||
if (strstr(p, "Loongson-3A3000") || strstr(p, "Loongson-3B3000")) | |||
return (&gotoblas_LOONGSON3R3); | |||
else if(strstr(p, "Loongson-3A4000") || strstr(p, "Loongson-3B4000")) | |||
return (&gotoblas_LOONGSON3R4); | |||
else | |||
return NULL; | |||
} | |||
#endif | |||
return NULL; | |||
} | |||
static gotoblas_t *get_coretype(void) { | |||
int ret = 0; | |||
ret = cpucfg_test(); | |||
if (ret == 1) | |||
return get_coretype_from_cpucfg(); | |||
else | |||
return get_coretype_from_cpuinfo(); | |||
} | |||
void gotoblas_dynamic_init(void) { | |||
char coremsg[128]; | |||
char coren[22]; | |||
char *p; | |||
if (gotoblas) return; | |||
p = getenv("OPENBLAS_CORETYPE"); | |||
if ( p ) | |||
{ | |||
gotoblas = force_coretype(p); | |||
} | |||
else | |||
{ | |||
gotoblas = get_coretype(); | |||
} | |||
if (gotoblas == NULL) | |||
{ | |||
snprintf(coremsg, 128, "Falling back to loongson3r3 core\n"); | |||
openblas_warning(1, coremsg); | |||
gotoblas = &gotoblas_LOONGSON3R3; | |||
} | |||
if (gotoblas && gotoblas->init) { | |||
strncpy(coren, gotoblas_corename(), 20); | |||
sprintf(coremsg, "Core: %s\n", coren); | |||
openblas_warning(2, coremsg); | |||
gotoblas -> init(); | |||
} else { | |||
openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n"); | |||
exit(1); | |||
} | |||
} | |||
void gotoblas_dynamic_quit(void) { | |||
gotoblas = NULL; | |||
} |
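Exercising the new dispatcher (a usage sketch; assumes the standard OPENBLAS_CORETYPE handling shown in gotoblas_dynamic_init() above):

#include <stdlib.h>

int main(void) {
    /* Must be set before the first BLAS call, since the core table is
       chosen once in gotoblas_dynamic_init(). With verbosity >= 2 the
       library reports "Core: loongson3r4" through openblas_warning(). */
    setenv("OPENBLAS_CORETYPE", "loongson3r4", 1);
    /* ... call any BLAS routine here ... */
    return 0;
}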
@@ -52,6 +52,9 @@ static gotoblas_t *get_coretype(void) { | |||
if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")) | |||
return &gotoblas_POWER10; | |||
#endif | |||
/* Fall back to the POWER9 implementation if the toolchain is too old or the MMA feature is not set */ | |||
if (__builtin_cpu_is("power10")) | |||
return &gotoblas_POWER9; | |||
return NULL; | |||
} | |||
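Tying this to the Makefile hunk earlier: HAVE_P10_SUPPORT is only defined when gcc >= 11 and ld >= 2.35, so on an older toolchain the POWER10 branch does not exist at all, and a POWER10 machine previously could not select native kernels. The resulting ladder, sketched with its compile-time guard (guard name per the Makefile hunk above):

#ifdef HAVE_P10_SUPPORT
    if (__builtin_cpu_supports("arch_3_1") && __builtin_cpu_supports("mma"))
        return &gotoblas_POWER10;   /* full POWER10 kernels */
#endif
    if (__builtin_cpu_is("power10"))
        return &gotoblas_POWER9;    /* old toolchain or MMA unset: POWER9 path */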
@@ -717,7 +717,7 @@ void blas_set_parameter(void){ | |||
#if defined(ARCH_MIPS64) | |||
void blas_set_parameter(void){ | |||
#if defined(LOONGSON3A) | |||
#if defined(LOONGSON3R3) || defined(LOONGSON3R4) | |||
#ifdef SMP | |||
if(blas_num_threads == 1){ | |||
#endif | |||
@@ -731,20 +731,6 @@ void blas_set_parameter(void){ | |||
#endif | |||
#endif | |||
#if defined(LOONGSON3B) | |||
#ifdef SMP | |||
if(blas_num_threads == 1 || blas_num_threads == 2){ | |||
#endif | |||
//single thread | |||
dgemm_r = 640; | |||
#ifdef SMP | |||
}else{ | |||
//multi thread | |||
dgemm_r = 160; | |||
} | |||
#endif | |||
#endif | |||
} | |||
#endif | |||
@@ -140,8 +140,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
/* #define FORCE_PPC440FP2 */ | |||
/* #define FORCE_CELL */ | |||
/* #define FORCE_SICORTEX */ | |||
/* #define FORCE_LOONGSON3A */ | |||
/* #define FORCE_LOONGSON3B */ | |||
/* #define FORCE_LOONGSON3R3 */ | |||
/* #define FORCE_LOONGSON3R4 */ | |||
/* #define FORCE_I6400 */ | |||
/* #define FORCE_P6600 */ | |||
/* #define FORCE_P5600 */ | |||
@@ -814,31 +814,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#endif | |||
#ifdef FORCE_LOONGSON3A | |||
#if defined FORCE_LOONGSON3R3 || defined FORCE_LOONGSON3A || defined FORCE_LOONGSON3B | |||
#define FORCE | |||
#define ARCHITECTURE "MIPS" | |||
#define SUBARCHITECTURE "LOONGSON3A" | |||
#define SUBARCHITECTURE "LOONGSON3R3" | |||
#define SUBDIRNAME "mips64" | |||
#define ARCHCONFIG "-DLOONGSON3A " \ | |||
#define ARCHCONFIG "-DLOONGSON3R3 " \ | |||
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ | |||
"-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ | |||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " | |||
#define LIBNAME "loongson3a" | |||
#define CORENAME "LOONGSON3A" | |||
#define LIBNAME "loongson3r3" | |||
#define CORENAME "LOONGSON3R3" | |||
#else | |||
#endif | |||
#ifdef FORCE_LOONGSON3B | |||
#ifdef FORCE_LOONGSON3R4 | |||
#define FORCE | |||
#define ARCHITECTURE "MIPS" | |||
#define SUBARCHITECTURE "LOONGSON3B" | |||
#define SUBARCHITECTURE "LOONGSON3R4" | |||
#define SUBDIRNAME "mips64" | |||
#define ARCHCONFIG "-DLOONGSON3B " \ | |||
#define ARCHCONFIG "-DLOONGSON3R4 " \ | |||
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ | |||
"-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ | |||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " | |||
#define LIBNAME "loongson3b" | |||
#define CORENAME "LOONGSON3B" | |||
#define LIBNAME "loongson3r4" | |||
#define CORENAME "LOONGSON3R4" | |||
#else | |||
#endif | |||
@@ -58,6 +58,8 @@ else ifeq ($(TARGET_CORE), SKYLAKEX) | |||
endif | |||
else ifeq ($(TARGET_CORE), HASWELL) | |||
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(AVX2OPT) | |||
else ifeq ($(TARGET_CORE), LOONGSON3R4) | |||
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(MSA_FLAGS) | |||
else | |||
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) | |||
endif | |||
@@ -68,6 +70,9 @@ else | |||
TARGET_CORE = $(CORE) | |||
KDIR = | |||
TSUFFIX = | |||
ifeq ($(TARGET_CORE), LOONGSON3R4) | |||
override CFLAGS += $(MSA_FLAGS) | |||
endif | |||
endif | |||
-include $(KERNELDIR)/KERNEL.$(TARGET_CORE) | |||
@@ -29,10 +29,6 @@ ifeq ($(ARCH), riscv64) | |||
USE_TRMM = 1 | |||
endif | |||
ifeq ($(TARGET), LOONGSON3B) | |||
USE_TRMM = 1 | |||
endif | |||
ifneq ($(DYNAMIC_ARCH), 1) | |||
ifeq ($(TARGET), GENERIC) | |||
USE_TRMM = 1 | |||
@@ -121,7 +121,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#define CGEMM_KERNEL_8X1_MSA(OP0, OP1, OP2, OP3, OP4) \ | |||
{ \ | |||
LD_SP4_INC(pa0, 4, src_a0, src_a1, src_a2, src_a3); \ | |||
src_bi = (v4f32) __msa_cast_to_vector_double(*((double *) pb0)); \ | |||
src_bi = (v4f32) COPY_DOUBLE_TO_VECTOR(*((double *) pb0)); \ | |||
SPLATI_W2_SP(src_bi, 0, src_br, src_bi); \ | |||
\ | |||
PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \ | |||
@@ -200,7 +200,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#define CGEMM_KERNEL_4X1_MSA(OP0, OP1, OP2, OP3, OP4) \ | |||
{ \ | |||
LD_SP2_INC(pa0, 4, src_a0, src_a1); \ | |||
src_bi = (v4f32) __msa_cast_to_vector_double(*((double *) pb0)); \ | |||
src_bi = (v4f32) COPY_DOUBLE_TO_VECTOR(*((double *) pb0)); \ | |||
SPLATI_W2_SP(src_bi, 0, src_br, src_bi); \ | |||
\ | |||
PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \ | |||
@@ -49,11 +49,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, | |||
{ | |||
if ((0 == c) && (0 == s)) | |||
{ | |||
v4f32 zero = __msa_cast_to_vector_float(0); | |||
zero = (v4f32) __msa_insert_w((v4i32) zero, 0, 0.0); | |||
zero = (v4f32) __msa_insert_w((v4i32) zero, 1, 0.0); | |||
zero = (v4f32) __msa_insert_w((v4i32) zero, 2, 0.0); | |||
zero = (v4f32) __msa_insert_w((v4i32) zero, 3, 0.0); | |||
v4f32 zero = {0.0, 0.0, 0.0, 0.0}; | |||
/* process 2 elements */ | |||
for (j = (n >> 1); j--;) | |||
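This same rewrite repeats through the MSA kernels below: the __msa_cast_to_vector_* builtin followed by per-lane inserts is replaced by a plain brace initializer, which GCC's generic vector extensions accept directly (the cast builtins are apparently no longer provided by current GCC). A minimal standalone sketch:

typedef float  v4f32_s __attribute__((vector_size(16)));
typedef double v2f64_s __attribute__((vector_size(16)));

v4f32_s zero4 = {0.0f, 0.0f, 0.0f, 0.0f};  /* replaces cast + four inserts */
v2f64_s zero2 = {0.0, 0.0};                /* replaces cast + two inserts  */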
@@ -49,11 +49,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
{ | |||
if ((0.0 == da_r) && (0.0 == da_i)) | |||
{ | |||
v4f32 zero_v = __msa_cast_to_vector_float(0); | |||
zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 0, 0.0); | |||
zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 1, 0.0); | |||
zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 2, 0.0); | |||
zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 3, 0.0); | |||
v4f32 zero_v = {0.0, 0.0, 0.0, 0.0}; | |||
for (i = (n >> 5); i--;) | |||
{ | |||
@@ -44,9 +44,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, | |||
{ | |||
if (0.0 == da) | |||
{ | |||
v2f64 zero_v = __msa_cast_to_vector_double(0); | |||
zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0); | |||
zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0); | |||
v2f64 zero_v = {0.0, 0.0}; | |||
for (i = (n >> 5); i--;) | |||
{ | |||
@@ -186,8 +186,7 @@ void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) | |||
ILVRL_D2_DP(src_c14, src_c10, res_c12, res_c13); | |||
ILVRL_D2_DP(src_c15, src_c11, res_c14, res_c15); | |||
src_a54 = __msa_cast_to_vector_double(*(a + 54)); | |||
src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); | |||
src_a54 = COPY_DOUBLE_TO_VECTOR(*(a + 54)); | |||
src_a62 = LD_DP(a + 62); | |||
src_a63 = (v2f64) __msa_splati_d((v2i64) src_a62, 1); | |||
src_a62 = (v2f64) __msa_splati_d((v2i64) src_a62, 0); | |||
@@ -200,8 +199,7 @@ void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) | |||
src_a44 = LD_DP(a + 44); | |||
src_a45 = (v2f64) __msa_splati_d((v2i64) src_a44, 1); | |||
src_a44 = (v2f64) __msa_splati_d((v2i64) src_a44, 0); | |||
src_a36 = __msa_cast_to_vector_double(*(a + 36)); | |||
src_a36 = (v2f64) __msa_splati_d((v2i64) src_a36, 0); | |||
src_a36 = COPY_DOUBLE_TO_VECTOR(*(a + 36)); | |||
res_c7 *= src_a63; | |||
res_c6 -= res_c7 * src_a62; | |||
@@ -271,8 +269,7 @@ void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) | |||
src_a26 = LD_DP(a + 26); | |||
src_a27 = (v2f64) __msa_splati_d((v2i64) src_a26, 1); | |||
src_a26 = (v2f64) __msa_splati_d((v2i64) src_a26, 0); | |||
src_a18 = __msa_cast_to_vector_double(*(a + 18)); | |||
src_a18 = (v2f64) __msa_splati_d((v2i64) src_a18, 0); | |||
src_a18 = COPY_DOUBLE_TO_VECTOR(*(a + 18)); | |||
res_c3 -= res_c7 * src_a59; | |||
res_c2 -= res_c7 * src_a58; | |||
@@ -358,8 +355,7 @@ void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) | |||
src_a8 = LD_DP(a + 8); | |||
src_a9 = (v2f64) __msa_splati_d((v2i64) src_a8, 1); | |||
src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); | |||
src_a0 = __msa_cast_to_vector_double(*(a + 0)); | |||
src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); | |||
src_a0 = COPY_DOUBLE_TO_VECTOR(*(a + 0)); | |||
res_c1 -= res_c2 * src_a17; | |||
res_c1 *= src_a9; | |||
@@ -488,8 +484,7 @@ static void dsolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
src_a52 = LD_DP(a - 12); | |||
src_a53 = (v2f64) __msa_splati_d((v2i64) src_a52, 1); | |||
src_a52 = (v2f64) __msa_splati_d((v2i64) src_a52, 0); | |||
src_a54 = __msa_cast_to_vector_double(*(a - 10)); | |||
src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); | |||
src_a54 = COPY_DOUBLE_TO_VECTOR(*(a - 10)); | |||
src_a40 = LD_DP(a - 24); | |||
src_a41 = (v2f64) __msa_splati_d((v2i64) src_a40, 1); | |||
@@ -526,8 +521,7 @@ static void dsolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
src_a34 = LD_DP(a - 30); | |||
src_a35 = (v2f64) __msa_splati_d((v2i64) src_a34, 1); | |||
src_a34 = (v2f64) __msa_splati_d((v2i64) src_a34, 0); | |||
src_a36 = __msa_cast_to_vector_double(*(a - 28)); | |||
src_a36 = (v2f64) __msa_splati_d((v2i64) src_a36, 0); | |||
src_a36 = COPY_DOUBLE_TO_VECTOR(*(a - 28)); | |||
res_c4 *= src_a36; | |||
res_c3 -= res_c4 * src_a35; | |||
@@ -544,10 +538,8 @@ static void dsolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
src_a16 = LD_DP(a - 48); | |||
src_a17 = (v2f64) __msa_splati_d((v2i64) src_a16, 1); | |||
src_a16 = (v2f64) __msa_splati_d((v2i64) src_a16, 0); | |||
src_a18 = __msa_cast_to_vector_double(*(a - 46)); | |||
src_a18 = (v2f64) __msa_splati_d((v2i64) src_a18, 0); | |||
src_a0 = __msa_cast_to_vector_double(*(a - 64)); | |||
src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); | |||
src_a18 = COPY_DOUBLE_TO_VECTOR(*(a - 46)); | |||
src_a0 = COPY_DOUBLE_TO_VECTOR(*(a - 64)); | |||
src_a8 = LD_DP(a - 56); | |||
src_a9 = (v2f64) __msa_splati_d((v2i64) src_a8, 1); | |||
src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); | |||
@@ -785,11 +777,8 @@ static void dsolve_4x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
src_a10 = (v2f64) __msa_splati_d((v2i64) src_a9, 1); | |||
src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); | |||
src_a8 = __msa_cast_to_vector_double(*(a + 8)); | |||
src_a0 = __msa_cast_to_vector_double(*(a + 0)); | |||
src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); | |||
src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); | |||
src_a8 = COPY_DOUBLE_TO_VECTOR(*(a + 8)); | |||
src_a0 = COPY_DOUBLE_TO_VECTOR(*(a + 0)); | |||
src_a4 = LD_DP(a + 4); | |||
src_a5 = (v2f64) __msa_splati_d((v2i64) src_a4, 1); | |||
@@ -890,11 +879,8 @@ static void dsolve_4x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
src_a10 = (v2f64) __msa_splati_d((v2i64) src_a9, 1); | |||
src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); | |||
src_a8 = __msa_cast_to_vector_double(*(a + 8)); | |||
src_a0 = __msa_cast_to_vector_double(*(a + 0)); | |||
src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); | |||
src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); | |||
src_a8 = COPY_DOUBLE_TO_VECTOR(*(a + 8)); | |||
src_a0 = COPY_DOUBLE_TO_VECTOR(*(a + 0)); | |||
src_a4 = LD_DP(a + 4); | |||
src_a5 = (v2f64) __msa_splati_d((v2i64) src_a4, 1); | |||
@@ -215,8 +215,7 @@ void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) | |||
res_c14 -= res_c8 * src_a6; | |||
res_c15 -= res_c8 * src_a7; | |||
src_a9 = __msa_cast_to_vector_double(*(a + 9)); | |||
src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); | |||
src_a9 = COPY_DOUBLE_TO_VECTOR(*(a + 9)); | |||
src_a10 = LD_DP(a + 10); | |||
src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); | |||
src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); | |||
@@ -280,8 +279,7 @@ void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) | |||
res_c14 -= res_c10 * src_a22; | |||
res_c15 -= res_c10 * src_a23; | |||
src_a27 = __msa_cast_to_vector_double(*(a + 27)); | |||
src_a27 = (v2f64) __msa_splati_d((v2i64) src_a27, 0); | |||
src_a27 = COPY_DOUBLE_TO_VECTOR(*(a + 27)); | |||
src_a28 = LD_DP(a + 28); | |||
src_a29 = (v2f64) __msa_splati_d((v2i64) src_a28, 1); | |||
src_a28 = (v2f64) __msa_splati_d((v2i64) src_a28, 0); | |||
@@ -326,8 +324,7 @@ void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) | |||
res_c14 -= res_c12 * src_a38; | |||
res_c15 -= res_c12 * src_a39; | |||
src_a45 = __msa_cast_to_vector_double(*(a + 45)); | |||
src_a45 = (v2f64) __msa_splati_d((v2i64) src_a45, 0); | |||
src_a45 = COPY_DOUBLE_TO_VECTOR(*(a + 45)); | |||
src_a46 = LD_DP(a + 46); | |||
src_a47 = (v2f64) __msa_splati_d((v2i64) src_a46, 1); | |||
src_a46 = (v2f64) __msa_splati_d((v2i64) src_a46, 0); | |||
@@ -353,8 +350,7 @@ void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) | |||
ILVRL_D2_DP(res_c5, res_c4, src_c2, src_c6); | |||
ILVRL_D2_DP(res_c13, res_c12, src_c10, src_c14); | |||
src_a63 = __msa_cast_to_vector_double(*(a + 63)); | |||
src_a63 = (v2f64) __msa_splati_d((v2i64) src_a63, 0); | |||
src_a63 = COPY_DOUBLE_TO_VECTOR(*(a + 63)); | |||
src_a54 = LD_DP(a + 54); | |||
src_a55 = (v2f64) __msa_splati_d((v2i64) src_a54, 1); | |||
src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); | |||
@@ -478,8 +474,7 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
res_c6 -= res_c0 * src_a6; | |||
res_c7 -= res_c0 * src_a7; | |||
src_a9 = __msa_cast_to_vector_double(*(a + 9)); | |||
src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); | |||
src_a9 = COPY_DOUBLE_TO_VECTOR(*(a + 9)); | |||
src_a10 = LD_DP(a + 10); | |||
src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); | |||
src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); | |||
@@ -515,8 +510,7 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
res_c6 -= res_c2 * src_a22; | |||
res_c7 -= res_c2 * src_a23; | |||
src_a27 = __msa_cast_to_vector_double(*(a + 27)); | |||
src_a27 = (v2f64) __msa_splati_d((v2i64) src_a27, 0); | |||
src_a27 = COPY_DOUBLE_TO_VECTOR(*(a + 27)); | |||
src_a28 = LD_DP(a + 28); | |||
src_a29 = (v2f64) __msa_splati_d((v2i64) src_a28, 1); | |||
src_a28 = (v2f64) __msa_splati_d((v2i64) src_a28, 0); | |||
@@ -553,8 +547,7 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
res_c6 -= res_c4 * src_a38; | |||
res_c7 -= res_c4 * src_a39; | |||
src_a45 = __msa_cast_to_vector_double(*(a + 45)); | |||
src_a45 = (v2f64) __msa_splati_d((v2i64) src_a45, 0); | |||
src_a45 = COPY_DOUBLE_TO_VECTOR(*(a + 45)); | |||
src_a46 = LD_DP(a + 46); | |||
src_a47 = (v2f64) __msa_splati_d((v2i64) src_a46, 1); | |||
src_a46 = (v2f64) __msa_splati_d((v2i64) src_a46, 0); | |||
@@ -563,8 +556,7 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
res_c6 -= res_c5 * src_a46; | |||
res_c7 -= res_c5 * src_a47; | |||
src_a63 = __msa_cast_to_vector_double(*(a + 63)); | |||
src_a63 = (v2f64) __msa_splati_d((v2i64) src_a63, 0); | |||
src_a63 = COPY_DOUBLE_TO_VECTOR(*(a + 63)); | |||
src_a54 = LD_DP(a + 54); | |||
src_a55 = (v2f64) __msa_splati_d((v2i64) src_a54, 1); | |||
src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); | |||
@@ -786,8 +778,7 @@ static void dsolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
res_c6 -= res_c4 * src_a2; | |||
res_c7 -= res_c4 * src_a3; | |||
src_a5 = __msa_cast_to_vector_double(*(a + 5)); | |||
src_a5 = (v2f64) __msa_splati_d((v2i64) src_a5, 0); | |||
src_a5 = COPY_DOUBLE_TO_VECTOR(*(a + 5)); | |||
src_a6 = LD_DP(a + 6); | |||
src_a7 = (v2f64) __msa_splati_d((v2i64) src_a6, 1); | |||
src_a6 = (v2f64) __msa_splati_d((v2i64) src_a6, 0); | |||
@@ -803,8 +794,7 @@ static void dsolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
src_a10 = LD_DP(a + 10); | |||
src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); | |||
src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); | |||
src_a15 = __msa_cast_to_vector_double(*(a + 15)); | |||
src_a15 = (v2f64) __msa_splati_d((v2i64) src_a15, 0); | |||
src_a15 = COPY_DOUBLE_TO_VECTOR(*(a + 15)); | |||
res_c2 *= src_a10; | |||
res_c3 -= res_c2 * src_a11; | |||
@@ -881,8 +871,7 @@ static void dsolve_4x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
res_c2 -= res_c0 * src_a2; | |||
res_c3 -= res_c0 * src_a3; | |||
src_a5 = __msa_cast_to_vector_double(*(a + 5)); | |||
src_a5 = (v2f64) __msa_splati_d((v2i64) src_a5, 0); | |||
src_a5 = COPY_DOUBLE_TO_VECTOR(*(a + 5)); | |||
src_a6 = LD_DP(a + 6); | |||
src_a7 = (v2f64) __msa_splati_d((v2i64) src_a6, 1); | |||
src_a6 = (v2f64) __msa_splati_d((v2i64) src_a6, 0); | |||
@@ -894,8 +883,7 @@ static void dsolve_4x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
src_a10 = LD_DP(a + 10); | |||
src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); | |||
src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); | |||
src_a15 = __msa_cast_to_vector_double(*(a + 15)); | |||
src_a15 = (v2f64) __msa_splati_d((v2i64) src_a15, 0); | |||
src_a15 = COPY_DOUBLE_TO_VECTOR(*(a + 15)); | |||
res_c2 *= src_a10; | |||
res_c3 -= res_c2 * src_a11; | |||
@@ -161,16 +161,14 @@ void dsolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) | |||
src_b2 = LD_DP(b + 2); | |||
src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); | |||
src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); | |||
src_b5 = __msa_cast_to_vector_double(*(b + 5)); | |||
src_b5 = (v2f64) __msa_splati_d((v2i64) src_b5, 0); | |||
src_b5 = COPY_DOUBLE_TO_VECTOR(*(b + 5)); | |||
src_b6 = LD_DP(b + 6); | |||
src_b7 = (v2f64) __msa_splati_d((v2i64) src_b6, 1); | |||
src_b6 = (v2f64) __msa_splati_d((v2i64) src_b6, 0); | |||
src_b10 = LD_DP(b + 10); | |||
src_b11 = (v2f64) __msa_splati_d((v2i64) src_b10, 1); | |||
src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); | |||
src_b15 = __msa_cast_to_vector_double(*(b + 15)); | |||
src_b15 = (v2f64) __msa_splati_d((v2i64) src_b15, 0); | |||
src_b15 = COPY_DOUBLE_TO_VECTOR(*(b + 15)); | |||
src_c0 *= src_b0; | |||
src_c1 *= src_b0; | |||
@@ -294,8 +292,7 @@ static void dsolve_8x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
src_b0 = LD_DP(b + 0); | |||
src_b1 = (v2f64) __msa_splati_d((v2i64) src_b0, 1); | |||
src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); | |||
src_b3 = __msa_cast_to_vector_double(*(b + 3)); | |||
src_b3 = (v2f64) __msa_splati_d((v2i64) src_b3, 0); | |||
src_b3 = COPY_DOUBLE_TO_VECTOR(*(b + 3)); | |||
src_c0 *= src_b0; | |||
src_c1 *= src_b0; | |||
@@ -347,8 +344,7 @@ static void dsolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) | |||
} | |||
} | |||
src_b0 = __msa_cast_to_vector_double(*b); | |||
src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); | |||
src_b0 = COPY_DOUBLE_TO_VECTOR(*b); | |||
src_c0 *= src_b0; | |||
src_c1 *= src_b0; | |||
@@ -407,16 +403,14 @@ static void dsolve_4x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
src_b2 = LD_DP(b + 2); | |||
src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); | |||
src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); | |||
src_b5 = __msa_cast_to_vector_double(*(b + 5)); | |||
src_b5 = (v2f64) __msa_splati_d((v2i64) src_b5, 0); | |||
src_b5 = COPY_DOUBLE_TO_VECTOR(*(b + 5)); | |||
src_b6 = LD_DP(b + 6); | |||
src_b7 = (v2f64) __msa_splati_d((v2i64) src_b6, 1); | |||
src_b6 = (v2f64) __msa_splati_d((v2i64) src_b6, 0); | |||
src_b10 = LD_DP(b + 10); | |||
src_b11 = (v2f64) __msa_splati_d((v2i64) src_b10, 1); | |||
src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); | |||
src_b15 = __msa_cast_to_vector_double(*(b + 15)); | |||
src_b15 = (v2f64) __msa_splati_d((v2i64) src_b15, 0); | |||
src_b15 = COPY_DOUBLE_TO_VECTOR(*(b + 15)); | |||
src_c0 *= src_b0; | |||
src_c1 *= src_b0; | |||
@@ -490,8 +484,7 @@ static void dsolve_4x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
src_b0 = LD_DP(b + 0); | |||
src_b1 = (v2f64) __msa_splati_d((v2i64) src_b0, 1); | |||
src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); | |||
src_b3 = __msa_cast_to_vector_double(*(b + 3)); | |||
src_b3 = (v2f64) __msa_splati_d((v2i64) src_b3, 0); | |||
src_b3 = COPY_DOUBLE_TO_VECTOR(*(b + 3)); | |||
src_c0 *= src_b0; | |||
src_c1 *= src_b0; | |||
@@ -168,11 +168,9 @@ void dsolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) | |||
src_b8 = LD_DP(b + 8); | |||
src_b9 = (v2f64) __msa_splati_d((v2i64) src_b8, 1); | |||
src_b8 = (v2f64) __msa_splati_d((v2i64) src_b8, 0); | |||
src_b10 = __msa_cast_to_vector_double(*(b + 10)); | |||
src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); | |||
src_b10 = COPY_DOUBLE_TO_VECTOR(*(b + 10)); | |||
src_b0 = __msa_cast_to_vector_double(*(b + 0)); | |||
src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); | |||
src_b0 = COPY_DOUBLE_TO_VECTOR(*(b + 0)); | |||
src_b4 = LD_DP(b + 4); | |||
src_b5 = (v2f64) __msa_splati_d((v2i64) src_b4, 1); | |||
src_b4 = (v2f64) __msa_splati_d((v2i64) src_b4, 0); | |||
@@ -298,8 +296,7 @@ static void dsolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
a -= 16; | |||
b -= 4; | |||
src_b0 = __msa_cast_to_vector_double(*(b + 0)); | |||
src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); | |||
src_b0 = COPY_DOUBLE_TO_VECTOR(*(b + 0)); | |||
src_b2 = LD_DP(b + 2); | |||
src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); | |||
src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); | |||
@@ -377,8 +374,7 @@ static void dsolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) | |||
a -= 8; | |||
b -= 1; | |||
src_b0 = __msa_cast_to_vector_double(*b); | |||
src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); | |||
src_b0 = COPY_DOUBLE_TO_VECTOR(*b); | |||
src_c0 *= src_b0; | |||
src_c1 *= src_b0; | |||
@@ -445,11 +441,9 @@ static void dsolve_4x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
src_b8 = LD_DP(b + 8); | |||
src_b9 = (v2f64) __msa_splati_d((v2i64) src_b8, 1); | |||
src_b8 = (v2f64) __msa_splati_d((v2i64) src_b8, 0); | |||
src_b10 = __msa_cast_to_vector_double(*(b + 10)); | |||
src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); | |||
src_b10 = COPY_DOUBLE_TO_VECTOR(*(b + 10)); | |||
src_b0 = __msa_cast_to_vector_double(*(b + 0)); | |||
src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); | |||
src_b0 = COPY_DOUBLE_TO_VECTOR(*(b + 0)); | |||
src_b4 = LD_DP(b + 4); | |||
src_b5 = (v2f64) __msa_splati_d((v2i64) src_b4, 1); | |||
src_b4 = (v2f64) __msa_splati_d((v2i64) src_b4, 0); | |||
@@ -527,8 +521,7 @@ static void dsolve_4x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
a -= 8; | |||
b -= 4; | |||
src_b0 = __msa_cast_to_vector_double(*(b + 0)); | |||
src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); | |||
src_b0 = COPY_DOUBLE_TO_VECTOR(*(b + 0)); | |||
src_b2 = LD_DP(b + 2); | |||
src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); | |||
src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); | |||
@@ -63,16 +63,12 @@ inline static void prefetch_load_lf(unsigned char *src) | |||
#define ST_DP(...) ST_D(v2f64, __VA_ARGS__) | |||
#define COPY_FLOAT_TO_VECTOR(a) ( { \ | |||
v4f32 out; \ | |||
out = __msa_cast_to_vector_float(a); \ | |||
out = (v4f32) __msa_splati_w((v4i32) out, 0); \ | |||
v4f32 out = {a, a, a, a}; \ | |||
out; \ | |||
} ) | |||
#define COPY_DOUBLE_TO_VECTOR(a) ( { \ | |||
v2f64 out; \ | |||
out = __msa_cast_to_vector_double(a); \ | |||
out = (v2f64) __msa_splati_d((v2i64) out, 0); \ | |||
v2f64 out = {a, a}; \ | |||
out; \ | |||
} ) | |||
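The macros keep their GNU statement-expression form, ({ ... }), so they can sit inside larger expressions; a standalone equivalent of the new broadcast, as a sketch with hypothetical names:

typedef double v2f64_h __attribute__((vector_size(16)));

/* Splat one scalar across both lanes, as COPY_DOUBLE_TO_VECTOR now does. */
static inline v2f64_h splat_d(double a) { v2f64_h out = {a, a}; return out; }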
@@ -48,11 +48,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, | |||
{ | |||
if ((0 == c) && (0 == s)) | |||
{ | |||
v4f32 zero = __msa_cast_to_vector_float(0); | |||
zero = (v4f32) __msa_insert_w((v4i32) zero, 0, 0.0); | |||
zero = (v4f32) __msa_insert_w((v4i32) zero, 1, 0.0); | |||
zero = (v4f32) __msa_insert_w((v4i32) zero, 2, 0.0); | |||
zero = (v4f32) __msa_insert_w((v4i32) zero, 3, 0.0); | |||
v4f32 zero = {0.0, 0.0, 0.0, 0.0}; | |||
/* process 4 floats */ | |||
for (j = (n >> 2); j--;) | |||
@@ -44,11 +44,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, | |||
{ | |||
if (0.0 == da) | |||
{ | |||
v4f32 zero_v = __msa_cast_to_vector_float(0); | |||
zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 0, 0.0); | |||
zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 1, 0.0); | |||
zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 2, 0.0); | |||
zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 3, 0.0); | |||
v4f32 zero_v = {0.0, 0.0, 0.0, 0.0}; | |||
for (i = (n >> 6); i--;) | |||
{ | |||
@@ -49,9 +49,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
{ | |||
if ((0.0 == da_r) && (0.0 == da_i)) | |||
{ | |||
v2f64 zero_v = __msa_cast_to_vector_double(0); | |||
zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0); | |||
zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0); | |||
v2f64 zero_v = {0.0, 0.0}; | |||
for (i = (n >> 4); i--;) | |||
{ | |||
@@ -475,9 +473,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
if ((0.0 == da_r) && (0.0 == da_i)) | |||
{ | |||
v2f64 zero_v = __msa_cast_to_vector_double(0); | |||
zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0); | |||
zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0); | |||
v2f64 zero_v = {0.0, 0.0}; | |||
for (i = (n >> 4); i--;) | |||
{ | |||
@@ -1,64 +0,0 @@ | |||
SAXPYKERNEL=axpy_loongson3a.S | |||
DAXPYKERNEL=daxpy_loongson3a_simd.S | |||
SGEMVNKERNEL = gemv_n_loongson3a.c | |||
SGEMVTKERNEL = gemv_t_loongson3a.c | |||
DGEMVNKERNEL = gemv_n_loongson3a.c | |||
DGEMVTKERNEL = gemv_t_loongson3a.c | |||
CGEMVNKERNEL = zgemv_n_loongson3a.c | |||
CGEMVTKERNEL = zgemv_t_loongson3a.c | |||
ZGEMVNKERNEL = zgemv_n_loongson3a.c | |||
ZGEMVTKERNEL = zgemv_t_loongson3a.c | |||
STRMMKERNEL = ../generic/trmmkernel_2x2.c | |||
DTRMMKERNEL = ../generic/trmmkernel_2x2.c | |||
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c | |||
ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c | |||
SGEMMKERNEL = ../generic/gemmkernel_2x2.c | |||
SGEMMONCOPY = ../generic/gemm_ncopy_2.c | |||
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c | |||
SGEMMONCOPYOBJ = sgemm_oncopy.o | |||
SGEMMOTCOPYOBJ = sgemm_otcopy.o | |||
DGEMMKERNEL = ../generic/gemmkernel_2x2.c | |||
DGEMMONCOPY = ../generic/gemm_ncopy_2.c | |||
DGEMMOTCOPY = ../generic/gemm_tcopy_2.c | |||
DGEMMONCOPYOBJ = dgemm_oncopy.o | |||
DGEMMOTCOPYOBJ = dgemm_otcopy.o | |||
CGEMMKERNEL = ../generic/zgemmkernel_2x2.c | |||
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
CGEMMONCOPYOBJ = cgemm_oncopy.o | |||
CGEMMOTCOPYOBJ = cgemm_otcopy.o | |||
ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c | |||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
ZGEMMONCOPYOBJ = zgemm_oncopy.o | |||
ZGEMMOTCOPYOBJ = zgemm_otcopy.o | |||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
@@ -16,32 +16,32 @@ SGEMMINCOPY = ../generic/gemm_ncopy_8.c | |||
SGEMMITCOPY = ../generic/gemm_tcopy_8.c | |||
SGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||
SGEMMINCOPYOBJ = sgemm_incopy.o | |||
SGEMMITCOPYOBJ = sgemm_itcopy.o | |||
SGEMMONCOPYOBJ = sgemm_oncopy.o | |||
SGEMMOTCOPYOBJ = sgemm_otcopy.o | |||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
DGEMMKERNEL = dgemm_kernel_loongson3a_4x4.S | |||
DGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||
DGEMMONCOPYOBJ = dgemm_oncopy.o | |||
DGEMMOTCOPYOBJ = dgemm_otcopy.o | |||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
CGEMMKERNEL = cgemm_kernel_loongson3a_4x2_ps.S | |||
CGEMMINCOPY = ../generic/zgemm_ncopy_4.c | |||
CGEMMITCOPY = ../generic/zgemm_tcopy_4.c | |||
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
CGEMMINCOPYOBJ = cgemm_incopy.o | |||
CGEMMITCOPYOBJ = cgemm_itcopy.o | |||
CGEMMONCOPYOBJ = cgemm_oncopy.o | |||
CGEMMOTCOPYOBJ = cgemm_otcopy.o | |||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
ZGEMMKERNEL = zgemm_kernel_loongson3a_2x2.S | |||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
ZGEMMONCOPYOBJ = zgemm_oncopy.o | |||
ZGEMMOTCOPYOBJ = zgemm_otcopy.o | |||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
@@ -64,6 +64,3 @@ ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
DSDOTKERNEL = ../mips/dot.c | |||
@@ -0,0 +1,192 @@ | |||
ifdef HAVE_MSA | |||
SAXPYKERNEL = ../mips/saxpy_msa.c | |||
DAXPYKERNEL = ../mips/daxpy_msa.c | |||
CAXPYKERNEL = ../mips/caxpy_msa.c | |||
ZAXPYKERNEL = ../mips/zaxpy_msa.c | |||
else | |||
SAXPYKERNEL = axpy_loongson3a.S | |||
DAXPYKERNEL = daxpy_loongson3a_simd.S | |||
endif | |||
ifdef HAVE_MSA | |||
SCOPYKERNEL = ../mips/scopy_msa.c | |||
DCOPYKERNEL = ../mips/dcopy_msa.c | |||
CCOPYKERNEL = ../mips/ccopy_msa.c | |||
ZCOPYKERNEL = ../mips/zcopy_msa.c | |||
endif | |||
ifdef HAVE_MSA | |||
SDOTKERNEL = ../mips/sdot_msa.c | |||
DDOTKERNEL = ../mips/ddot_msa.c | |||
CDOTKERNEL = ../mips/cdot_msa.c | |||
ZDOTKERNEL = ../mips/zdot_msa.c | |||
endif | |||
DSDOTKERNEL = ../mips/dot.c | |||
ifdef HAVE_MSA | |||
SROTKERNEL = ../mips/srot_msa.c | |||
DROTKERNEL = ../mips/drot_msa.c | |||
CROTKERNEL = ../mips/crot_msa.c | |||
ZROTKERNEL = ../mips/zrot_msa.c | |||
endif | |||
ifdef HAVE_MSA | |||
SSCALKERNEL = ../mips/sscal_msa.c | |||
DSCALKERNEL = ../mips/dscal_msa.c | |||
CSCALKERNEL = ../mips/cscal_msa.c | |||
ZSCALKERNEL = ../mips/zscal_msa.c | |||
endif | |||
ifdef HAVE_MSA | |||
SGEMVNKERNEL = ../mips/sgemv_n_msa.c | |||
DGEMVNKERNEL = ../mips/dgemv_n_msa.c | |||
SGEMVTKERNEL = ../mips/sgemv_t_msa.c | |||
DGEMVTKERNEL = ../mips/dgemv_t_msa.c | |||
CGEMVNKERNEL = ../mips/cgemv_n_msa.c | |||
CGEMVTKERNEL = ../mips/cgemv_t_msa.c | |||
ZGEMVNKERNEL = ../mips/zgemv_n_msa.c | |||
ZGEMVTKERNEL = ../mips/zgemv_t_msa.c | |||
else | |||
SGEMVNKERNEL = gemv_n_loongson3a.c | |||
SGEMVTKERNEL = gemv_t_loongson3a.c | |||
DGEMVNKERNEL = gemv_n_loongson3a.c | |||
DGEMVTKERNEL = gemv_t_loongson3a.c | |||
CGEMVNKERNEL = zgemv_n_loongson3a.c | |||
CGEMVTKERNEL = zgemv_t_loongson3a.c | |||
ZGEMVNKERNEL = zgemv_n_loongson3a.c | |||
ZGEMVTKERNEL = zgemv_t_loongson3a.c | |||
endif | |||
ifdef HAVE_MSA | |||
SASUMKERNEL = ../mips/sasum_msa.c | |||
DASUMKERNEL = ../mips/dasum_msa.c | |||
CASUMKERNEL = ../mips/casum_msa.c | |||
ZASUMKERNEL = ../mips/zasum_msa.c | |||
endif | |||
ifdef HAVE_MSA | |||
SSWAPKERNEL = ../mips/sswap_msa.c | |||
DSWAPKERNEL = ../mips/dswap_msa.c | |||
CSWAPKERNEL = ../mips/cswap_msa.c | |||
ZSWAPKERNEL = ../mips/zswap_msa.c | |||
endif | |||
ifdef HAVE_MSA | |||
SGEMMKERNEL = ../mips/sgemm_kernel_8x8_msa.c | |||
SGEMMONCOPY = ../mips/sgemm_ncopy_8_msa.c | |||
SGEMMOTCOPY = ../mips/sgemm_tcopy_8_msa.c | |||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
else | |||
SGEMMKERNEL = sgemm_kernel_8x4_ps.S | |||
SGEMMINCOPY = ../generic/gemm_ncopy_8.c | |||
SGEMMITCOPY = ../generic/gemm_tcopy_8.c | |||
SGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
endif | |||
ifdef HAVE_MSA | |||
DGEMMKERNEL = ../mips/dgemm_kernel_8x4_msa.c | |||
DGEMMINCOPY = ../mips/dgemm_ncopy_8_msa.c | |||
DGEMMITCOPY = ../mips/dgemm_tcopy_8_msa.c | |||
DGEMMONCOPY = ../mips/dgemm_ncopy_4_msa.c | |||
DGEMMOTCOPY = ../mips/dgemm_tcopy_4_msa.c | |||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
else | |||
DGEMMKERNEL = dgemm_kernel_loongson3a_4x4.S | |||
DGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
endif | |||
ifdef HAVE_MSA | |||
CGEMMKERNEL = ../mips/cgemm_kernel_8x4_msa.c | |||
CGEMMINCOPY = ../mips/cgemm_ncopy_8_msa.c | |||
CGEMMITCOPY = ../mips/cgemm_tcopy_8_msa.c | |||
CGEMMONCOPY = ../mips/cgemm_ncopy_4_msa.c | |||
CGEMMOTCOPY = ../mips/cgemm_tcopy_4_msa.c | |||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
else | |||
CGEMMKERNEL = cgemm_kernel_loongson3a_4x2_ps.S | |||
CGEMMINCOPY = ../generic/zgemm_ncopy_4.c | |||
CGEMMITCOPY = ../generic/zgemm_tcopy_4.c | |||
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
endif | |||
ifdef HAVE_MSA | |||
ZGEMMKERNEL = ../mips/zgemm_kernel_4x4_msa.c | |||
ZGEMMONCOPY = ../mips/zgemm_ncopy_4_msa.c | |||
ZGEMMOTCOPY = ../mips/zgemm_tcopy_4_msa.c | |||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
else | |||
ZGEMMKERNEL = zgemm_kernel_loongson3a_2x2.S | |||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
endif | |||
ifdef HAVE_MSA | |||
STRSMKERNEL_LN = ../mips/strsm_kernel_LN_8x8_msa.c | |||
STRSMKERNEL_LT = ../mips/strsm_kernel_LT_8x8_msa.c | |||
STRSMKERNEL_RN = ../mips/strsm_kernel_RN_8x8_msa.c | |||
STRSMKERNEL_RT = ../mips/strsm_kernel_RT_8x8_msa.c | |||
else | |||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
endif | |||
ifdef HAVE_MSA | |||
DTRSMKERNEL_LN = ../mips/dtrsm_kernel_LN_8x4_msa.c | |||
DTRSMKERNEL_LT = ../mips/dtrsm_kernel_LT_8x4_msa.c | |||
DTRSMKERNEL_RN = ../mips/dtrsm_kernel_RN_8x4_msa.c | |||
DTRSMKERNEL_RT = ../mips/dtrsm_kernel_RT_8x4_msa.c | |||
else | |||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
endif | |||
ifdef HAVE_MSA | |||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
else | |||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
endif | |||
ifdef HAVE_MSA | |||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
else | |||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
endif |
@@ -63,15 +63,15 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
STRSMKERNEL_LN = trsm_kernel_LN_power10.c | |||
STRSMKERNEL_LT = trsm_kernel_LT_power10.c | |||
STRSMKERNEL_RN = trsm_kernel_RN_power10.c | |||
STRSMKERNEL_RT = trsm_kernel_RT_power10.c | |||
DTRSMKERNEL_LN = trsm_kernel_LN_power10.c | |||
DTRSMKERNEL_LT = trsm_kernel_LT_power10.c | |||
DTRSMKERNEL_RN = trsm_kernel_RN_power10.c | |||
DTRSMKERNEL_RT = trsm_kernel_RT_power10.c | |||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
@@ -52,15 +52,15 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
STRSMKERNEL_LN = trsm_kernel_LN_power10.c | |||
STRSMKERNEL_LT = trsm_kernel_LT_power10.c | |||
STRSMKERNEL_RN = trsm_kernel_RN_power10.c | |||
STRSMKERNEL_RT = trsm_kernel_RT_power10.c | |||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
DTRSMKERNEL_LN = trsm_kernel_LN_power10.c | |||
DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S | |||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
DTRSMKERNEL_RN = trsm_kernel_RN_power10.c | |||
DTRSMKERNEL_RT = trsm_kernel_RT_power10.c | |||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
@@ -112,10 +112,14 @@ static void caxpy_kernel_8 (long n, float *x, float *y, | |||
"xvmaddasp 38, 58, 33 \n\t" | |||
"xvmaddasp 39, 59, 33 \n\t" | |||
"stxvp 48, 0(%4) \n\t" | |||
"stxvp 50, 32(%4) \n\t" | |||
"stxvp 34, 64(%4) \n\t" | |||
"stxvp 38, 96(%4) \n\t" | |||
"stxv 49, 0(%4) \n\t" | |||
"stxv 48, 16(%4) \n\t" | |||
"stxv 51, 32(%4) \n\t" | |||
"stxv 50, 48(%4) \n\t" | |||
"stxv 35, 64(%4) \n\t" | |||
"stxv 34, 80(%4) \n\t" | |||
"stxv 39, 96(%4) \n\t" | |||
"stxv 38, 112(%4) \n\t" | |||
"addi %4, %4, 128 \n\t" | |||
"xxperm 52, 40, %x10 \n\t" // exchange real and imag part | |||
@@ -159,10 +163,14 @@ static void caxpy_kernel_8 (long n, float *x, float *y, | |||
"xvmaddasp 38, 58, 33 \n\t" | |||
"xvmaddasp 39, 59, 33 \n\t" | |||
"stxvp 48, 0(%4) \n\t" | |||
"stxvp 50, 32(%4) \n\t" | |||
"stxvp 34, 64(%4) \n\t" | |||
"stxvp 38, 96(%4) \n\t" | |||
"stxv 49, 0(%4) \n\t" | |||
"stxv 48, 16(%4) \n\t" | |||
"stxv 51, 32(%4) \n\t" | |||
"stxv 50, 48(%4) \n\t" | |||
"stxv 35, 64(%4) \n\t" | |||
"stxv 34, 80(%4) \n\t" | |||
"stxv 39, 96(%4) \n\t" | |||
"stxv 38, 112(%4) \n\t" | |||
"#n=%1 x=%5=%2 y=%0=%3 alpha=(%7,%8) mvecp=%6=%9 ytmp=%4\n" | |||
: | |||
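Note on the two caxpy hunks above: each POWER10 `stxvp` paired store is expanded into two explicit `stxv` stores, with the odd-numbered VSX register of each pair written to the lower offset (49 at 0, 48 at 16, and so on). This spells out the memory ordering of each register pair directly rather than relying on `stxvp`'s pair semantics.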
@@ -66,12 +66,19 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
if ( (inc_x == 1) && (inc_y == 1) ) | |||
{ | |||
BLASLONG n1 = n & -16; | |||
if ( n >= 16 ) | |||
{ | |||
BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3; | |||
for (i = 0; i < align; i++) { | |||
y[i] += da * x[i] ; | |||
} | |||
} | |||
BLASLONG n1 = (n-i) & -16; | |||
if ( n1 ) | |||
daxpy_kernel_8(n1, &x[i], &y[i], da); | |||
i += n1; | |||
if ( n1 ) | |||
daxpy_kernel_8(n1, x, y, da); | |||
i = n1; | |||
while(i < n) | |||
{ | |||
@@ -64,12 +64,18 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
if ( (inc_x == 1) && (inc_y == 1) ) | |||
{ | |||
BLASLONG n1 = n & -64; | |||
if ( n >= 64 ) | |||
{ | |||
BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7; | |||
for (i = 0; i < align; i++) { | |||
y[i] += da * x[i] ; | |||
} | |||
} | |||
BLASLONG n1 = (n-i) & -64; | |||
if ( n1 ) | |||
saxpy_kernel_64(n1, x, y, da); | |||
saxpy_kernel_64(n1, &x[i], &y[i], da); | |||
i = n1; | |||
i += n1; | |||
while(i < n) | |||
{ | |||
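Both axpy hunks apply the same idiom: peel a few scalar iterations until y reaches a 32-byte boundary, run the vector kernel on an aligned multiple of the unroll width, then finish the remainder in scalar code. Below is a minimal standalone sketch of the double-precision case; the mask arithmetic mirrors the diff, while axpy_kernel is a hypothetical stand-in for the real daxpy_kernel_8.

#include <stdint.h>
#include <stddef.h>

/* Peel up to three scalar iterations so y lands on a 32-byte boundary,
   hand the aligned bulk (a multiple of 16 elements) to the vector kernel,
   and mop up the tail in scalar code. */
static void axpy_aligned(size_t n, const double *x, double *y, double da,
                         void (*axpy_kernel)(size_t, const double *,
                                             double *, double))
{
    size_t i = 0;
    if (n >= 16) {
        /* distance to the next 32-byte boundary, in 8-byte elements (0..3) */
        size_t align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3;
        for (; i < align; i++)
            y[i] += da * x[i];
    }
    size_t n1 = (n - i) & ~(size_t)15;   /* same effect as (n - i) & -16 */
    if (n1)
        axpy_kernel(n1, &x[i], &y[i], da);   /* placeholder for the platform kernel */
    for (i += n1; i < n; i++)
        y[i] += da * x[i];
}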
@@ -0,0 +1,828 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
/* without modification, are permitted provided that the following */ | |||
/* conditions are met: */ | |||
/* */ | |||
/* 1. Redistributions of source code must retain the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer. */ | |||
/* */ | |||
/* 2. Redistributions in binary form must reproduce the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer in the documentation and/or other materials */ | |||
/* provided with the distribution. */ | |||
/* */ | |||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
/* POSSIBILITY OF SUCH DAMAGE. */ | |||
/* */ | |||
/* The views and conclusions contained in the software and */ | |||
/* documentation are those of the authors and should not be */ | |||
/* interpreted as representing official policies, either expressed */ | |||
/* or implied, of The University of Texas at Austin. */ | |||
/*********************************************************************/ | |||
#include "common.h" | |||
#include <altivec.h> | |||
static FLOAT dm1 = -1.; | |||
#ifdef CONJ | |||
#define GEMM_KERNEL GEMM_KERNEL_R | |||
#else | |||
#define GEMM_KERNEL GEMM_KERNEL_N | |||
#endif | |||
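/* Translate the configured unroll factors into shift amounts so the drivers
   below can form block counts with shifts (e.g. i = m >> GEMM_UNROLL_M_SHIFT). */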
#if GEMM_DEFAULT_UNROLL_M == 1 | |||
#define GEMM_UNROLL_M_SHIFT 0 | |||
#endif | |||
#if GEMM_DEFAULT_UNROLL_M == 2 | |||
#define GEMM_UNROLL_M_SHIFT 1 | |||
#endif | |||
#if GEMM_DEFAULT_UNROLL_M == 4 | |||
#define GEMM_UNROLL_M_SHIFT 2 | |||
#endif | |||
#if GEMM_DEFAULT_UNROLL_M == 6 | |||
#define GEMM_UNROLL_M_SHIFT 2 | |||
#endif | |||
#if GEMM_DEFAULT_UNROLL_M == 8 | |||
#define GEMM_UNROLL_M_SHIFT 3 | |||
#endif | |||
#if GEMM_DEFAULT_UNROLL_M == 16 | |||
#define GEMM_UNROLL_M_SHIFT 4 | |||
#endif | |||
#if GEMM_DEFAULT_UNROLL_N == 1 | |||
#define GEMM_UNROLL_N_SHIFT 0 | |||
#endif | |||
#if GEMM_DEFAULT_UNROLL_N == 2 | |||
#define GEMM_UNROLL_N_SHIFT 1 | |||
#endif | |||
#if GEMM_DEFAULT_UNROLL_N == 4 | |||
#define GEMM_UNROLL_N_SHIFT 2 | |||
#endif | |||
#if GEMM_DEFAULT_UNROLL_N == 8 | |||
#define GEMM_UNROLL_N_SHIFT 3 | |||
#endif | |||
#if GEMM_DEFAULT_UNROLL_N == 16 | |||
#define GEMM_UNROLL_N_SHIFT 4 | |||
#endif | |||
#ifndef COMPLEX | |||
#ifdef DOUBLE | |||
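/* 8x8 forward substitution with VSX intrinsics: the packed factor b carries
   pre-inverted diagonal entries (so each division becomes a multiply), each
   finished column of C is written back into the packed a panel, and its
   contribution is subtracted from all later columns. */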
static inline __attribute__ ((always_inline)) void solve8x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { | |||
FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7; | |||
c0 = &c[0*ldc]; | |||
c1 = &c[1*ldc]; | |||
c2 = &c[2*ldc]; | |||
c3 = &c[3*ldc]; | |||
c4 = &c[4*ldc]; | |||
c5 = &c[5*ldc]; | |||
c6 = &c[6*ldc]; | |||
c7 = &c[7*ldc]; | |||
vector FLOAT *Vb = (vector FLOAT *) b; | |||
vector FLOAT *Vc0 = (vector FLOAT *) c0; | |||
vector FLOAT *Vc1 = (vector FLOAT *) c1; | |||
vector FLOAT *Vc2 = (vector FLOAT *) c2; | |||
vector FLOAT *Vc3 = (vector FLOAT *) c3; | |||
vector FLOAT *Vc4 = (vector FLOAT *) c4; | |||
vector FLOAT *Vc5 = (vector FLOAT *) c5; | |||
vector FLOAT *Vc6 = (vector FLOAT *) c6; | |||
vector FLOAT *Vc7 = (vector FLOAT *) c7; | |||
vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6; | |||
a[0] = (c0[0] *= b[0]); | |||
a[1] = (c0[1] *= b[0]); | |||
a[2] = (c0[2] *= b[0]); | |||
a[3] = (c0[3] *= b[0]); | |||
a[4] = (c0[4] *= b[0]); | |||
a[5] = (c0[5] *= b[0]); | |||
a[6] = (c0[6] *= b[0]); | |||
a[7] = (c0[7] *= b[0]); | |||
VbS0 = vec_splat(Vb[0], 1); | |||
VbS1 = vec_splat(Vb[1], 0); | |||
VbS2 = vec_splat(Vb[1], 1); | |||
VbS3 = vec_splat(Vb[2], 0); | |||
VbS4 = vec_splat(Vb[2], 1); | |||
VbS5 = vec_splat(Vb[3], 0); | |||
VbS6 = vec_splat(Vb[3], 1); | |||
Vc1[0] = vec_nmsub(Vc0[ 0], VbS0, Vc1[0]); | |||
Vc1[1] = vec_nmsub(Vc0[ 1], VbS0, Vc1[1]); | |||
Vc1[2] = vec_nmsub(Vc0[ 2], VbS0, Vc1[2]); | |||
Vc1[3] = vec_nmsub(Vc0[ 3], VbS0, Vc1[3]); | |||
Vc2[0] = vec_nmsub(Vc0[ 0], VbS1, Vc2[0]); | |||
Vc2[1] = vec_nmsub(Vc0[ 1], VbS1, Vc2[1]); | |||
Vc2[2] = vec_nmsub(Vc0[ 2], VbS1, Vc2[2]); | |||
Vc2[3] = vec_nmsub(Vc0[ 3], VbS1, Vc2[3]); | |||
Vc3[0] = vec_nmsub(Vc0[ 0], VbS2, Vc3[0]); | |||
Vc3[1] = vec_nmsub(Vc0[ 1], VbS2, Vc3[1]); | |||
Vc3[2] = vec_nmsub(Vc0[ 2], VbS2, Vc3[2]); | |||
Vc3[3] = vec_nmsub(Vc0[ 3], VbS2, Vc3[3]); | |||
Vc4[0] = vec_nmsub(Vc0[ 0], VbS3, Vc4[0]); | |||
Vc4[1] = vec_nmsub(Vc0[ 1], VbS3, Vc4[1]); | |||
Vc4[2] = vec_nmsub(Vc0[ 2], VbS3, Vc4[2]); | |||
Vc4[3] = vec_nmsub(Vc0[ 3], VbS3, Vc4[3]); | |||
Vc5[0] = vec_nmsub(Vc0[ 0], VbS4, Vc5[0]); | |||
Vc5[1] = vec_nmsub(Vc0[ 1], VbS4, Vc5[1]); | |||
Vc5[2] = vec_nmsub(Vc0[ 2], VbS4, Vc5[2]); | |||
Vc5[3] = vec_nmsub(Vc0[ 3], VbS4, Vc5[3]); | |||
Vc6[0] = vec_nmsub(Vc0[ 0], VbS5, Vc6[0]); | |||
Vc6[1] = vec_nmsub(Vc0[ 1], VbS5, Vc6[1]); | |||
Vc6[2] = vec_nmsub(Vc0[ 2], VbS5, Vc6[2]); | |||
Vc6[3] = vec_nmsub(Vc0[ 3], VbS5, Vc6[3]); | |||
Vc7[0] = vec_nmsub(Vc0[ 0], VbS6, Vc7[0]); | |||
Vc7[1] = vec_nmsub(Vc0[ 1], VbS6, Vc7[1]); | |||
Vc7[2] = vec_nmsub(Vc0[ 2], VbS6, Vc7[2]); | |||
Vc7[3] = vec_nmsub(Vc0[ 3], VbS6, Vc7[3]); | |||
a[ 8] = (c1[0] *= b[9]); | |||
a[ 9] = (c1[1] *= b[9]); | |||
a[10] = (c1[2] *= b[9]); | |||
a[11] = (c1[3] *= b[9]); | |||
a[12] = (c1[4] *= b[9]); | |||
a[13] = (c1[5] *= b[9]); | |||
a[14] = (c1[6] *= b[9]); | |||
a[15] = (c1[7] *= b[9]); | |||
VbS0 = vec_splat(Vb[5], 0); | |||
VbS1 = vec_splat(Vb[5], 1); | |||
VbS2 = vec_splat(Vb[6], 0); | |||
VbS3 = vec_splat(Vb[6], 1); | |||
VbS4 = vec_splat(Vb[7], 0); | |||
VbS5 = vec_splat(Vb[7], 1); | |||
Vc2[0] = vec_nmsub(Vc1[0], VbS0, Vc2[0]); | |||
Vc2[1] = vec_nmsub(Vc1[1], VbS0, Vc2[1]); | |||
Vc2[2] = vec_nmsub(Vc1[2], VbS0, Vc2[2]); | |||
Vc2[3] = vec_nmsub(Vc1[3], VbS0, Vc2[3]); | |||
Vc3[0] = vec_nmsub(Vc1[0], VbS1, Vc3[0]); | |||
Vc3[1] = vec_nmsub(Vc1[1], VbS1, Vc3[1]); | |||
Vc3[2] = vec_nmsub(Vc1[2], VbS1, Vc3[2]); | |||
Vc3[3] = vec_nmsub(Vc1[3], VbS1, Vc3[3]); | |||
Vc4[0] = vec_nmsub(Vc1[0], VbS2, Vc4[0]); | |||
Vc4[1] = vec_nmsub(Vc1[1], VbS2, Vc4[1]); | |||
Vc4[2] = vec_nmsub(Vc1[2], VbS2, Vc4[2]); | |||
Vc4[3] = vec_nmsub(Vc1[3], VbS2, Vc4[3]); | |||
Vc5[0] = vec_nmsub(Vc1[0], VbS3, Vc5[0]); | |||
Vc5[1] = vec_nmsub(Vc1[1], VbS3, Vc5[1]); | |||
Vc5[2] = vec_nmsub(Vc1[2], VbS3, Vc5[2]); | |||
Vc5[3] = vec_nmsub(Vc1[3], VbS3, Vc5[3]); | |||
Vc6[0] = vec_nmsub(Vc1[0], VbS4, Vc6[0]); | |||
Vc6[1] = vec_nmsub(Vc1[1], VbS4, Vc6[1]); | |||
Vc6[2] = vec_nmsub(Vc1[2], VbS4, Vc6[2]); | |||
Vc6[3] = vec_nmsub(Vc1[3], VbS4, Vc6[3]); | |||
Vc7[0] = vec_nmsub(Vc1[0], VbS5, Vc7[0]); | |||
Vc7[1] = vec_nmsub(Vc1[1], VbS5, Vc7[1]); | |||
Vc7[2] = vec_nmsub(Vc1[2], VbS5, Vc7[2]); | |||
Vc7[3] = vec_nmsub(Vc1[3], VbS5, Vc7[3]); | |||
a[16] = (c2[0] *= b[18]); | |||
a[17] = (c2[1] *= b[18]); | |||
a[18] = (c2[2] *= b[18]); | |||
a[19] = (c2[3] *= b[18]); | |||
a[20] = (c2[4] *= b[18]); | |||
a[21] = (c2[5] *= b[18]); | |||
a[22] = (c2[6] *= b[18]); | |||
a[23] = (c2[7] *= b[18]); | |||
VbS0 = vec_splat(Vb[ 9], 1); | |||
VbS1 = vec_splat(Vb[10], 0); | |||
VbS2 = vec_splat(Vb[10], 1); | |||
VbS3 = vec_splat(Vb[11], 0); | |||
VbS4 = vec_splat(Vb[11], 1); | |||
Vc3[0] = vec_nmsub(Vc2[0], VbS0, Vc3[0]); | |||
Vc3[1] = vec_nmsub(Vc2[1], VbS0, Vc3[1]); | |||
Vc3[2] = vec_nmsub(Vc2[2], VbS0, Vc3[2]); | |||
Vc3[3] = vec_nmsub(Vc2[3], VbS0, Vc3[3]); | |||
Vc4[0] = vec_nmsub(Vc2[0], VbS1, Vc4[0]); | |||
Vc4[1] = vec_nmsub(Vc2[1], VbS1, Vc4[1]); | |||
Vc4[2] = vec_nmsub(Vc2[2], VbS1, Vc4[2]); | |||
Vc4[3] = vec_nmsub(Vc2[3], VbS1, Vc4[3]); | |||
Vc5[0] = vec_nmsub(Vc2[0], VbS2, Vc5[0]); | |||
Vc5[1] = vec_nmsub(Vc2[1], VbS2, Vc5[1]); | |||
Vc5[2] = vec_nmsub(Vc2[2], VbS2, Vc5[2]); | |||
Vc5[3] = vec_nmsub(Vc2[3], VbS2, Vc5[3]); | |||
Vc6[0] = vec_nmsub(Vc2[0], VbS3, Vc6[0]); | |||
Vc6[1] = vec_nmsub(Vc2[1], VbS3, Vc6[1]); | |||
Vc6[2] = vec_nmsub(Vc2[2], VbS3, Vc6[2]); | |||
Vc6[3] = vec_nmsub(Vc2[3], VbS3, Vc6[3]); | |||
Vc7[0] = vec_nmsub(Vc2[0], VbS4, Vc7[0]); | |||
Vc7[1] = vec_nmsub(Vc2[1], VbS4, Vc7[1]); | |||
Vc7[2] = vec_nmsub(Vc2[2], VbS4, Vc7[2]); | |||
Vc7[3] = vec_nmsub(Vc2[3], VbS4, Vc7[3]); | |||
a[24] = (c3[0] *= b[27]); | |||
a[25] = (c3[1] *= b[27]); | |||
a[26] = (c3[2] *= b[27]); | |||
a[27] = (c3[3] *= b[27]); | |||
a[28] = (c3[4] *= b[27]); | |||
a[29] = (c3[5] *= b[27]); | |||
a[30] = (c3[6] *= b[27]); | |||
a[31] = (c3[7] *= b[27]); | |||
VbS0 = vec_splat(Vb[14], 0); | |||
VbS1 = vec_splat(Vb[14], 1); | |||
VbS2 = vec_splat(Vb[15], 0); | |||
VbS3 = vec_splat(Vb[15], 1); | |||
Vc4[0] = vec_nmsub(Vc3[0], VbS0, Vc4[0]); | |||
Vc4[1] = vec_nmsub(Vc3[1], VbS0, Vc4[1]); | |||
Vc4[2] = vec_nmsub(Vc3[2], VbS0, Vc4[2]); | |||
Vc4[3] = vec_nmsub(Vc3[3], VbS0, Vc4[3]); | |||
Vc5[0] = vec_nmsub(Vc3[0], VbS1, Vc5[0]); | |||
Vc5[1] = vec_nmsub(Vc3[1], VbS1, Vc5[1]); | |||
Vc5[2] = vec_nmsub(Vc3[2], VbS1, Vc5[2]); | |||
Vc5[3] = vec_nmsub(Vc3[3], VbS1, Vc5[3]); | |||
Vc6[0] = vec_nmsub(Vc3[0], VbS2, Vc6[0]); | |||
Vc6[1] = vec_nmsub(Vc3[1], VbS2, Vc6[1]); | |||
Vc6[2] = vec_nmsub(Vc3[2], VbS2, Vc6[2]); | |||
Vc6[3] = vec_nmsub(Vc3[3], VbS2, Vc6[3]); | |||
Vc7[0] = vec_nmsub(Vc3[0], VbS3, Vc7[0]); | |||
Vc7[1] = vec_nmsub(Vc3[1], VbS3, Vc7[1]); | |||
Vc7[2] = vec_nmsub(Vc3[2], VbS3, Vc7[2]); | |||
Vc7[3] = vec_nmsub(Vc3[3], VbS3, Vc7[3]); | |||
a[32] = (c4[0] *= b[36]); | |||
a[33] = (c4[1] *= b[36]); | |||
a[34] = (c4[2] *= b[36]); | |||
a[35] = (c4[3] *= b[36]); | |||
a[36] = (c4[4] *= b[36]); | |||
a[37] = (c4[5] *= b[36]); | |||
a[38] = (c4[6] *= b[36]); | |||
a[39] = (c4[7] *= b[36]); | |||
VbS0 = vec_splat(Vb[18], 1); | |||
VbS1 = vec_splat(Vb[19], 0); | |||
VbS2 = vec_splat(Vb[19], 1); | |||
Vc5[0] = vec_nmsub(Vc4[0], VbS0, Vc5[0]); | |||
Vc5[1] = vec_nmsub(Vc4[1], VbS0, Vc5[1]); | |||
Vc5[2] = vec_nmsub(Vc4[2], VbS0, Vc5[2]); | |||
Vc5[3] = vec_nmsub(Vc4[3], VbS0, Vc5[3]); | |||
Vc6[0] = vec_nmsub(Vc4[0], VbS1, Vc6[0]); | |||
Vc6[1] = vec_nmsub(Vc4[1], VbS1, Vc6[1]); | |||
Vc6[2] = vec_nmsub(Vc4[2], VbS1, Vc6[2]); | |||
Vc6[3] = vec_nmsub(Vc4[3], VbS1, Vc6[3]); | |||
Vc7[0] = vec_nmsub(Vc4[0], VbS2, Vc7[0]); | |||
Vc7[1] = vec_nmsub(Vc4[1], VbS2, Vc7[1]); | |||
Vc7[2] = vec_nmsub(Vc4[2], VbS2, Vc7[2]); | |||
Vc7[3] = vec_nmsub(Vc4[3], VbS2, Vc7[3]); | |||
a[40] = (c5[0] *= b[45]); | |||
a[41] = (c5[1] *= b[45]); | |||
a[42] = (c5[2] *= b[45]); | |||
a[43] = (c5[3] *= b[45]); | |||
a[44] = (c5[4] *= b[45]); | |||
a[45] = (c5[5] *= b[45]); | |||
a[46] = (c5[6] *= b[45]); | |||
a[47] = (c5[7] *= b[45]); | |||
VbS0 = vec_splat(Vb[23], 0); | |||
VbS1 = vec_splat(Vb[23], 1); | |||
Vc6[0] = vec_nmsub(Vc5[0], VbS0, Vc6[0]); | |||
Vc6[1] = vec_nmsub(Vc5[1], VbS0, Vc6[1]); | |||
Vc6[2] = vec_nmsub(Vc5[2], VbS0, Vc6[2]); | |||
Vc6[3] = vec_nmsub(Vc5[3], VbS0, Vc6[3]); | |||
Vc7[0] = vec_nmsub(Vc5[0], VbS1, Vc7[0]); | |||
Vc7[1] = vec_nmsub(Vc5[1], VbS1, Vc7[1]); | |||
Vc7[2] = vec_nmsub(Vc5[2], VbS1, Vc7[2]); | |||
Vc7[3] = vec_nmsub(Vc5[3], VbS1, Vc7[3]); | |||
a[48] = (c6[0] *= b[54]); | |||
a[49] = (c6[1] *= b[54]); | |||
a[50] = (c6[2] *= b[54]); | |||
a[51] = (c6[3] *= b[54]); | |||
a[52] = (c6[4] *= b[54]); | |||
a[53] = (c6[5] *= b[54]); | |||
a[54] = (c6[6] *= b[54]); | |||
a[55] = (c6[7] *= b[54]); | |||
VbS0 = vec_splat(Vb[27], 1); | |||
Vc7[0] = vec_nmsub(Vc6[0], VbS0, Vc7[0]); | |||
Vc7[1] = vec_nmsub(Vc6[1], VbS0, Vc7[1]); | |||
Vc7[2] = vec_nmsub(Vc6[2], VbS0, Vc7[2]); | |||
Vc7[3] = vec_nmsub(Vc6[3], VbS0, Vc7[3]); | |||
a[56] = (c7[0] *= b[63]); | |||
a[57] = (c7[1] *= b[63]); | |||
a[58] = (c7[2] *= b[63]); | |||
a[59] = (c7[3] *= b[63]); | |||
a[60] = (c7[4] *= b[63]); | |||
a[61] = (c7[5] *= b[63]); | |||
a[62] = (c7[6] *= b[63]); | |||
a[63] = (c7[7] *= b[63]); | |||
} | |||
#else | |||
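/* Single-precision forward solver: 16x8 panel, four 4-float vectors per
   column; each scaled column is saved into the packed a panel before being
   eliminated from the later columns. */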
static inline __attribute__ ((always_inline)) void solve16x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { | |||
FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7; | |||
c0 = &c[0*ldc]; | |||
c1 = &c[1*ldc]; | |||
c2 = &c[2*ldc]; | |||
c3 = &c[3*ldc]; | |||
c4 = &c[4*ldc]; | |||
c5 = &c[5*ldc]; | |||
c6 = &c[6*ldc]; | |||
c7 = &c[7*ldc]; | |||
vector FLOAT *Va = (vector FLOAT *) a; | |||
vector FLOAT *Vb = (vector FLOAT *) b; | |||
vector FLOAT *Vc0 = (vector FLOAT *) c0; | |||
vector FLOAT *Vc1 = (vector FLOAT *) c1; | |||
vector FLOAT *Vc2 = (vector FLOAT *) c2; | |||
vector FLOAT *Vc3 = (vector FLOAT *) c3; | |||
vector FLOAT *Vc4 = (vector FLOAT *) c4; | |||
vector FLOAT *Vc5 = (vector FLOAT *) c5; | |||
vector FLOAT *Vc6 = (vector FLOAT *) c6; | |||
vector FLOAT *Vc7 = (vector FLOAT *) c7; | |||
vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7; | |||
VbS0 = vec_splat(Vb[0], 0); | |||
VbS1 = vec_splat(Vb[0], 1); | |||
VbS2 = vec_splat(Vb[0], 2); | |||
VbS3 = vec_splat(Vb[0], 3); | |||
VbS4 = vec_splat(Vb[1], 0); | |||
VbS5 = vec_splat(Vb[1], 1); | |||
VbS6 = vec_splat(Vb[1], 2); | |||
VbS7 = vec_splat(Vb[1], 3); | |||
Vc0[ 0] = vec_mul(VbS0, Vc0[ 0]); | |||
Vc0[ 1] = vec_mul(VbS0, Vc0[ 1]); | |||
Vc0[ 2] = vec_mul(VbS0, Vc0[ 2]); | |||
Vc0[ 3] = vec_mul(VbS0, Vc0[ 3]); | |||
Va[0] = Vc0[0]; | |||
Va[1] = Vc0[1]; | |||
Va[2] = Vc0[2]; | |||
Va[3] = Vc0[3]; | |||
Vc1[0] = vec_nmsub(VbS1, Va[0], Vc1[0]); | |||
Vc1[1] = vec_nmsub(VbS1, Va[1], Vc1[1]); | |||
Vc1[2] = vec_nmsub(VbS1, Va[2], Vc1[2]); | |||
Vc1[3] = vec_nmsub(VbS1, Va[3], Vc1[3]); | |||
Vc2[0] = vec_nmsub(VbS2, Va[0], Vc2[0]); | |||
Vc2[1] = vec_nmsub(VbS2, Va[1], Vc2[1]); | |||
Vc2[2] = vec_nmsub(VbS2, Va[2], Vc2[2]); | |||
Vc2[3] = vec_nmsub(VbS2, Va[3], Vc2[3]); | |||
Vc3[0] = vec_nmsub(VbS3, Va[0], Vc3[0]); | |||
Vc3[1] = vec_nmsub(VbS3, Va[1], Vc3[1]); | |||
Vc3[2] = vec_nmsub(VbS3, Va[2], Vc3[2]); | |||
Vc3[3] = vec_nmsub(VbS3, Va[3], Vc3[3]); | |||
Vc4[0] = vec_nmsub(VbS4, Va[0], Vc4[0]); | |||
Vc4[1] = vec_nmsub(VbS4, Va[1], Vc4[1]); | |||
Vc4[2] = vec_nmsub(VbS4, Va[2], Vc4[2]); | |||
Vc4[3] = vec_nmsub(VbS4, Va[3], Vc4[3]); | |||
Vc5[0] = vec_nmsub(VbS5, Va[0], Vc5[0]); | |||
Vc5[1] = vec_nmsub(VbS5, Va[1], Vc5[1]); | |||
Vc5[2] = vec_nmsub(VbS5, Va[2], Vc5[2]); | |||
Vc5[3] = vec_nmsub(VbS5, Va[3], Vc5[3]); | |||
Vc6[0] = vec_nmsub(VbS6, Va[0], Vc6[0]); | |||
Vc6[1] = vec_nmsub(VbS6, Va[1], Vc6[1]); | |||
Vc6[2] = vec_nmsub(VbS6, Va[2], Vc6[2]); | |||
Vc6[3] = vec_nmsub(VbS6, Va[3], Vc6[3]); | |||
Vc7[0] = vec_nmsub(VbS7, Va[0], Vc7[0]); | |||
Vc7[1] = vec_nmsub(VbS7, Va[1], Vc7[1]); | |||
Vc7[2] = vec_nmsub(VbS7, Va[2], Vc7[2]); | |||
Vc7[3] = vec_nmsub(VbS7, Va[3], Vc7[3]); | |||
VbS0 = vec_splat(Vb[2], 1); | |||
VbS1 = vec_splat(Vb[2], 2); | |||
VbS2 = vec_splat(Vb[2], 3); | |||
VbS3 = vec_splat(Vb[3], 0); | |||
VbS4 = vec_splat(Vb[3], 1); | |||
VbS5 = vec_splat(Vb[3], 2); | |||
VbS6 = vec_splat(Vb[3], 3); | |||
Vc1[0] = vec_mul(VbS0, Vc1[0]); | |||
Vc1[1] = vec_mul(VbS0, Vc1[1]); | |||
Vc1[2] = vec_mul(VbS0, Vc1[2]); | |||
Vc1[3] = vec_mul(VbS0, Vc1[3]); | |||
Va[4] = Vc1[0]; | |||
Va[5] = Vc1[1]; | |||
Va[6] = Vc1[2]; | |||
Va[7] = Vc1[3]; | |||
Vc2[0] = vec_nmsub(VbS1, Va[4], Vc2[0]); | |||
Vc2[1] = vec_nmsub(VbS1, Va[5], Vc2[1]); | |||
Vc2[2] = vec_nmsub(VbS1, Va[6], Vc2[2]); | |||
Vc2[3] = vec_nmsub(VbS1, Va[7], Vc2[3]); | |||
Vc3[0] = vec_nmsub(VbS2, Va[4], Vc3[0]); | |||
Vc3[1] = vec_nmsub(VbS2, Va[5], Vc3[1]); | |||
Vc3[2] = vec_nmsub(VbS2, Va[6], Vc3[2]); | |||
Vc3[3] = vec_nmsub(VbS2, Va[7], Vc3[3]); | |||
Vc4[0] = vec_nmsub(VbS3, Va[4], Vc4[0]); | |||
Vc4[1] = vec_nmsub(VbS3, Va[5], Vc4[1]); | |||
Vc4[2] = vec_nmsub(VbS3, Va[6], Vc4[2]); | |||
Vc4[3] = vec_nmsub(VbS3, Va[7], Vc4[3]); | |||
Vc5[0] = vec_nmsub(VbS4, Va[4], Vc5[0]); | |||
Vc5[1] = vec_nmsub(VbS4, Va[5], Vc5[1]); | |||
Vc5[2] = vec_nmsub(VbS4, Va[6], Vc5[2]); | |||
Vc5[3] = vec_nmsub(VbS4, Va[7], Vc5[3]); | |||
Vc6[0] = vec_nmsub(VbS5, Va[4], Vc6[0]); | |||
Vc6[1] = vec_nmsub(VbS5, Va[5], Vc6[1]); | |||
Vc6[2] = vec_nmsub(VbS5, Va[6], Vc6[2]); | |||
Vc6[3] = vec_nmsub(VbS5, Va[7], Vc6[3]); | |||
Vc7[0] = vec_nmsub(VbS6, Va[4], Vc7[0]); | |||
Vc7[1] = vec_nmsub(VbS6, Va[5], Vc7[1]); | |||
Vc7[2] = vec_nmsub(VbS6, Va[6], Vc7[2]); | |||
Vc7[3] = vec_nmsub(VbS6, Va[7], Vc7[3]); | |||
VbS0 = vec_splat(Vb[4], 2); | |||
VbS1 = vec_splat(Vb[4], 3); | |||
VbS2 = vec_splat(Vb[5], 0); | |||
VbS3 = vec_splat(Vb[5], 1); | |||
VbS4 = vec_splat(Vb[5], 2); | |||
VbS5 = vec_splat(Vb[5], 3); | |||
Vc2[0] = vec_mul(VbS0, Vc2[0]); | |||
Vc2[1] = vec_mul(VbS0, Vc2[1]); | |||
Vc2[2] = vec_mul(VbS0, Vc2[2]); | |||
Vc2[3] = vec_mul(VbS0, Vc2[3]); | |||
Va[ 8] = Vc2[0]; | |||
Va[ 9] = Vc2[1]; | |||
Va[10] = Vc2[2]; | |||
Va[11] = Vc2[3]; | |||
Vc3[0] = vec_nmsub(VbS1, Va[ 8], Vc3[0]); | |||
Vc3[1] = vec_nmsub(VbS1, Va[ 9], Vc3[1]); | |||
Vc3[2] = vec_nmsub(VbS1, Va[10], Vc3[2]); | |||
Vc3[3] = vec_nmsub(VbS1, Va[11], Vc3[3]); | |||
Vc4[0] = vec_nmsub(VbS2, Va[ 8], Vc4[0]); | |||
Vc4[1] = vec_nmsub(VbS2, Va[ 9], Vc4[1]); | |||
Vc4[2] = vec_nmsub(VbS2, Va[10], Vc4[2]); | |||
Vc4[3] = vec_nmsub(VbS2, Va[11], Vc4[3]); | |||
Vc5[0] = vec_nmsub(VbS3, Va[ 8], Vc5[0]); | |||
Vc5[1] = vec_nmsub(VbS3, Va[ 9], Vc5[1]); | |||
Vc5[2] = vec_nmsub(VbS3, Va[10], Vc5[2]); | |||
Vc5[3] = vec_nmsub(VbS3, Va[11], Vc5[3]); | |||
Vc6[0] = vec_nmsub(VbS4, Va[ 8], Vc6[0]); | |||
Vc6[1] = vec_nmsub(VbS4, Va[ 9], Vc6[1]); | |||
Vc6[2] = vec_nmsub(VbS4, Va[10], Vc6[2]); | |||
Vc6[3] = vec_nmsub(VbS4, Va[11], Vc6[3]); | |||
Vc7[0] = vec_nmsub(VbS5, Va[ 8], Vc7[0]); | |||
Vc7[1] = vec_nmsub(VbS5, Va[ 9], Vc7[1]); | |||
Vc7[2] = vec_nmsub(VbS5, Va[10], Vc7[2]); | |||
Vc7[3] = vec_nmsub(VbS5, Va[11], Vc7[3]); | |||
VbS0 = vec_splat(Vb[6], 3); | |||
VbS1 = vec_splat(Vb[7], 0); | |||
VbS2 = vec_splat(Vb[7], 1); | |||
VbS3 = vec_splat(Vb[7], 2); | |||
VbS4 = vec_splat(Vb[7], 3); | |||
Vc3[0] = vec_mul(VbS0, Vc3[0]); | |||
Vc3[1] = vec_mul(VbS0, Vc3[1]); | |||
Vc3[2] = vec_mul(VbS0, Vc3[2]); | |||
Vc3[3] = vec_mul(VbS0, Vc3[3]); | |||
Va[12] = Vc3[0]; | |||
Va[13] = Vc3[1]; | |||
Va[14] = Vc3[2]; | |||
Va[15] = Vc3[3]; | |||
Vc4[0] = vec_nmsub(VbS1, Va[12], Vc4[0]); | |||
Vc4[1] = vec_nmsub(VbS1, Va[13], Vc4[1]); | |||
Vc4[2] = vec_nmsub(VbS1, Va[14], Vc4[2]); | |||
Vc4[3] = vec_nmsub(VbS1, Va[15], Vc4[3]); | |||
Vc5[0] = vec_nmsub(VbS2, Va[12], Vc5[0]); | |||
Vc5[1] = vec_nmsub(VbS2, Va[13], Vc5[1]); | |||
Vc5[2] = vec_nmsub(VbS2, Va[14], Vc5[2]); | |||
Vc5[3] = vec_nmsub(VbS2, Va[15], Vc5[3]); | |||
Vc6[0] = vec_nmsub(VbS3, Va[12], Vc6[0]); | |||
Vc6[1] = vec_nmsub(VbS3, Va[13], Vc6[1]); | |||
Vc6[2] = vec_nmsub(VbS3, Va[14], Vc6[2]); | |||
Vc6[3] = vec_nmsub(VbS3, Va[15], Vc6[3]); | |||
Vc7[0] = vec_nmsub(VbS4, Va[12], Vc7[0]); | |||
Vc7[1] = vec_nmsub(VbS4, Va[13], Vc7[1]); | |||
Vc7[2] = vec_nmsub(VbS4, Va[14], Vc7[2]); | |||
Vc7[3] = vec_nmsub(VbS4, Va[15], Vc7[3]); | |||
VbS0 = vec_splat(Vb[9], 0); | |||
VbS1 = vec_splat(Vb[9], 1); | |||
VbS2 = vec_splat(Vb[9], 2); | |||
VbS3 = vec_splat(Vb[9], 3); | |||
Vc4[0] = vec_mul(VbS0, Vc4[0]); | |||
Vc4[1] = vec_mul(VbS0, Vc4[1]); | |||
Vc4[2] = vec_mul(VbS0, Vc4[2]); | |||
Vc4[3] = vec_mul(VbS0, Vc4[3]); | |||
Va[16] = Vc4[0]; | |||
Va[17] = Vc4[1]; | |||
Va[18] = Vc4[2]; | |||
Va[19] = Vc4[3]; | |||
Vc5[0] = vec_nmsub(VbS1, Va[16], Vc5[0]); | |||
Vc5[1] = vec_nmsub(VbS1, Va[17], Vc5[1]); | |||
Vc5[2] = vec_nmsub(VbS1, Va[18], Vc5[2]); | |||
Vc5[3] = vec_nmsub(VbS1, Va[19], Vc5[3]); | |||
Vc6[0] = vec_nmsub(VbS2, Va[16], Vc6[0]); | |||
Vc6[1] = vec_nmsub(VbS2, Va[17], Vc6[1]); | |||
Vc6[2] = vec_nmsub(VbS2, Va[18], Vc6[2]); | |||
Vc6[3] = vec_nmsub(VbS2, Va[19], Vc6[3]); | |||
Vc7[0] = vec_nmsub(VbS3, Va[16], Vc7[0]); | |||
Vc7[1] = vec_nmsub(VbS3, Va[17], Vc7[1]); | |||
Vc7[2] = vec_nmsub(VbS3, Va[18], Vc7[2]); | |||
Vc7[3] = vec_nmsub(VbS3, Va[19], Vc7[3]); | |||
VbS0 = vec_splat(Vb[11], 1); | |||
VbS1 = vec_splat(Vb[11], 2); | |||
VbS2 = vec_splat(Vb[11], 3); | |||
Vc5[0] = vec_mul(VbS0, Vc5[0]); | |||
Vc5[1] = vec_mul(VbS0, Vc5[1]); | |||
Vc5[2] = vec_mul(VbS0, Vc5[2]); | |||
Vc5[3] = vec_mul(VbS0, Vc5[3]); | |||
Va[20] = Vc5[0]; | |||
Va[21] = Vc5[1]; | |||
Va[22] = Vc5[2]; | |||
Va[23] = Vc5[3]; | |||
Vc6[0] = vec_nmsub(VbS1, Va[20], Vc6[0]); | |||
Vc6[1] = vec_nmsub(VbS1, Va[21], Vc6[1]); | |||
Vc6[2] = vec_nmsub(VbS1, Va[22], Vc6[2]); | |||
Vc6[3] = vec_nmsub(VbS1, Va[23], Vc6[3]); | |||
Vc7[0] = vec_nmsub(VbS2, Va[20], Vc7[0]); | |||
Vc7[1] = vec_nmsub(VbS2, Va[21], Vc7[1]); | |||
Vc7[2] = vec_nmsub(VbS2, Va[22], Vc7[2]); | |||
Vc7[3] = vec_nmsub(VbS2, Va[23], Vc7[3]); | |||
VbS0 = vec_splat(Vb[13], 2); | |||
VbS1 = vec_splat(Vb[13], 3); | |||
Vc6[0] = vec_mul(VbS0, Vc6[0]); | |||
Vc6[1] = vec_mul(VbS0, Vc6[1]); | |||
Vc6[2] = vec_mul(VbS0, Vc6[2]); | |||
Vc6[3] = vec_mul(VbS0, Vc6[3]); | |||
Va[24] = Vc6[0]; | |||
Va[25] = Vc6[1]; | |||
Va[26] = Vc6[2]; | |||
Va[27] = Vc6[3]; | |||
Vc7[0] = vec_nmsub(VbS1, Va[24], Vc7[0]); | |||
Vc7[1] = vec_nmsub(VbS1, Va[25], Vc7[1]); | |||
Vc7[2] = vec_nmsub(VbS1, Va[26], Vc7[2]); | |||
Vc7[3] = vec_nmsub(VbS1, Va[27], Vc7[3]); | |||
VbS0 = vec_splat(Vb[15], 3); | |||
Vc7[0] = vec_mul(VbS0, Vc7[0]); | |||
Vc7[1] = vec_mul(VbS0, Vc7[1]); | |||
Vc7[2] = vec_mul(VbS0, Vc7[2]); | |||
Vc7[3] = vec_mul(VbS0, Vc7[3]); | |||
Va[28] = Vc7[0]; | |||
Va[29] = Vc7[1]; | |||
Va[30] = Vc7[2]; | |||
Va[31] = Vc7[3]; | |||
} | |||
#endif | |||
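/* Generic scalar forward substitution, used for partial blocks and whenever
   the packed panel fails the well_aligned test below. */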
static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { | |||
FLOAT aa, bb; | |||
int i, j, k; | |||
for (i = 0; i < n; i++) { | |||
bb = *(b + i); | |||
for (j = 0; j < m; j ++) { | |||
aa = *(c + j + i * ldc); | |||
aa *= bb; | |||
*a = aa; | |||
*(c + j + i * ldc) = aa; | |||
a ++; | |||
for (k = i + 1; k < n; k ++){ | |||
*(c + j + k * ldc) -= aa * *(b + k); | |||
} | |||
} | |||
b += n; | |||
} | |||
} | |||
#else | |||
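/* Complex variant: the same forward substitution on interleaved (re,im)
   pairs; CONJ toggles the conjugated form of the complex multiply. */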
static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { | |||
FLOAT aa1, aa2; | |||
FLOAT bb1, bb2; | |||
FLOAT cc1, cc2; | |||
int i, j, k; | |||
ldc *= 2; | |||
for (i = 0; i < n; i++) { | |||
bb1 = *(b + i * 2 + 0); | |||
bb2 = *(b + i * 2 + 1); | |||
for (j = 0; j < m; j ++) { | |||
aa1 = *(c + j * 2 + 0 + i * ldc); | |||
aa2 = *(c + j * 2 + 1 + i * ldc); | |||
#ifndef CONJ | |||
cc1 = aa1 * bb1 - aa2 * bb2; | |||
cc2 = aa1 * bb2 + aa2 * bb1; | |||
#else | |||
cc1 = aa1 * bb1 + aa2 * bb2; | |||
cc2 = -aa1 * bb2 + aa2 * bb1; | |||
#endif | |||
*(a + 0) = cc1; | |||
*(a + 1) = cc2; | |||
*(c + j * 2 + 0 + i * ldc) = cc1; | |||
*(c + j * 2 + 1 + i * ldc) = cc2; | |||
a += 2; | |||
for (k = i + 1; k < n; k ++){ | |||
#ifndef CONJ | |||
*(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); | |||
*(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); | |||
#else | |||
*(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); | |||
*(c + j * 2 + 1 + k * ldc) -= - cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); | |||
#endif | |||
} | |||
} | |||
b += n * 2; | |||
} | |||
} | |||
#endif | |||
int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, | |||
#ifdef COMPLEX | |||
FLOAT dummy2, | |||
#endif | |||
FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ | |||
FLOAT *aa, *cc; | |||
BLASLONG kk; | |||
BLASLONG i, j, jj; | |||
#if 0 | |||
fprintf(stderr, "TRSM RN KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", | |||
m, n, k, offset); | |||
#endif | |||
jj = 0; | |||
j = (n >> GEMM_UNROLL_N_SHIFT); | |||
kk = -offset; | |||
#ifdef DOUBLE | |||
int well_aligned = (GEMM_UNROLL_M==8) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0); | |||
#else | |||
int well_aligned = (GEMM_UNROLL_M==16) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0); | |||
#endif | |||
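/* Use the hand-vectorized 8x8 (double) / 16x8 (single) solvers only for the
   default unroll shape and when the packed A panel is at least 8-byte
   aligned; everything else falls back to the scalar solve(). */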
while (j > 0) { | |||
aa = a; | |||
cc = c; | |||
i = (m >> GEMM_UNROLL_M_SHIFT); | |||
if (i > 0) { | |||
do { | |||
if (kk > 0) { | |||
GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, kk, dm1, | |||
#ifdef COMPLEX | |||
ZERO, | |||
#endif | |||
aa, b, cc, ldc); | |||
} | |||
if (well_aligned) { | |||
#ifdef DOUBLE | |||
solve8x8(aa + kk * GEMM_UNROLL_M * COMPSIZE, | |||
b + kk * GEMM_UNROLL_N * COMPSIZE, cc, ldc); | |||
#else | |||
solve16x8(aa + kk * GEMM_UNROLL_M * COMPSIZE, | |||
b + kk * GEMM_UNROLL_N * COMPSIZE, cc, ldc); | |||
#endif | |||
} | |||
else { | |||
solve(GEMM_UNROLL_M, GEMM_UNROLL_N, | |||
aa + kk * GEMM_UNROLL_M * COMPSIZE, | |||
b + kk * GEMM_UNROLL_N * COMPSIZE, | |||
cc, ldc); | |||
} | |||
aa += GEMM_UNROLL_M * k * COMPSIZE; | |||
cc += GEMM_UNROLL_M * COMPSIZE; | |||
i --; | |||
} while (i > 0); | |||
} | |||
if (m & (GEMM_UNROLL_M - 1)) { | |||
i = (GEMM_UNROLL_M >> 1); | |||
while (i > 0) { | |||
if (m & i) { | |||
if (kk > 0) { | |||
GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, | |||
#ifdef COMPLEX | |||
ZERO, | |||
#endif | |||
aa, b, cc, ldc); | |||
} | |||
solve(i, GEMM_UNROLL_N, | |||
aa + kk * i * COMPSIZE, | |||
b + kk * GEMM_UNROLL_N * COMPSIZE, | |||
cc, ldc); | |||
aa += i * k * COMPSIZE; | |||
cc += i * COMPSIZE; | |||
} | |||
i >>= 1; | |||
} | |||
} | |||
kk += GEMM_UNROLL_N; | |||
b += GEMM_UNROLL_N * k * COMPSIZE; | |||
c += GEMM_UNROLL_N * ldc * COMPSIZE; | |||
j --; | |||
jj += GEMM_UNROLL_M; | |||
} | |||
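/* Tail columns: each set bit of n below GEMM_UNROLL_N is handled with a
   correspondingly smaller block. */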
if (n & (GEMM_UNROLL_N - 1)) { | |||
j = (GEMM_UNROLL_N >> 1); | |||
while (j > 0) { | |||
if (n & j) { | |||
aa = a; | |||
cc = c; | |||
i = (m >> GEMM_UNROLL_M_SHIFT); | |||
while (i > 0) { | |||
if (kk > 0) { | |||
GEMM_KERNEL(GEMM_UNROLL_M, j, kk, dm1, | |||
#ifdef COMPLEX | |||
ZERO, | |||
#endif | |||
aa, | |||
b, | |||
cc, | |||
ldc); | |||
} | |||
solve(GEMM_UNROLL_M, j, | |||
aa + kk * GEMM_UNROLL_M * COMPSIZE, | |||
b + kk * j * COMPSIZE, cc, ldc); | |||
aa += GEMM_UNROLL_M * k * COMPSIZE; | |||
cc += GEMM_UNROLL_M * COMPSIZE; | |||
i --; | |||
} | |||
if (m & (GEMM_UNROLL_M - 1)) { | |||
i = (GEMM_UNROLL_M >> 1); | |||
while (i > 0) { | |||
if (m & i) { | |||
if (kk > 0) { | |||
GEMM_KERNEL(i, j, kk, dm1, | |||
#ifdef COMPLEX | |||
ZERO, | |||
#endif | |||
aa, | |||
b, | |||
cc, | |||
ldc); | |||
} | |||
solve(i, j, | |||
aa + kk * i * COMPSIZE, | |||
b + kk * j * COMPSIZE, cc, ldc); | |||
aa += i * k * COMPSIZE; | |||
cc += i * COMPSIZE; | |||
} | |||
i >>= 1; | |||
} | |||
} | |||
b += j * k * COMPSIZE; | |||
c += j * ldc * COMPSIZE; | |||
kk += j; | |||
} | |||
j >>= 1; | |||
} | |||
} | |||
return 0; | |||
} |
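To see what the RN driver and solve() compute, here is a minimal scalar model with a worked example. It assumes the packing convention visible above: the triangular factor is stored row-packed at full stride n with pre-inverted diagonal entries, which is why solve() multiplies by b[i] instead of dividing; the write-back into the packed a panel is omitted here, and solve_model is an illustrative name, not part of the source.

#include <stdio.h>

/* Scalar model of solve(): computes X = B * U^{-1} for an n x n upper
   triangular U, with U packed row-major at stride n and its diagonal
   entries stored pre-inverted. */
static void solve_model(int m, int n, double *b, double *c, int ldc)
{
    for (int i = 0; i < n; i++) {
        double bb = b[i];                    /* holds 1 / U[i][i] */
        for (int j = 0; j < m; j++) {
            double aa = c[j + i * ldc] * bb; /* "divide" by the diagonal */
            c[j + i * ldc] = aa;
            for (int k = i + 1; k < n; k++)  /* eliminate from later columns */
                c[j + k * ldc] -= aa * b[k];
        }
        b += n;                              /* next packed row */
    }
}

int main(void)
{
    /* U = [2 1; 0 4], packed with inverted diagonal: {0.5, 1.0, 0.0, 0.25} */
    double u[4] = { 0.5, 1.0, 0.0, 0.25 };
    /* B = X * U with X = [3 5], so B = [6, 23] */
    double c[2] = { 6.0, 23.0 };
    solve_model(1, 2, u, c, 1);
    printf("X = [%g %g]\n", c[0], c[1]);     /* prints X = [3 5] */
    return 0;
}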
@@ -0,0 +1,855 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
/* without modification, are permitted provided that the following */ | |||
/* conditions are met: */ | |||
/* */ | |||
/* 1. Redistributions of source code must retain the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer. */ | |||
/* */ | |||
/* 2. Redistributions in binary form must reproduce the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer in the documentation and/or other materials */ | |||
/* provided with the distribution. */ | |||
/* */ | |||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
/* POSSIBILITY OF SUCH DAMAGE. */ | |||
/* */ | |||
/* The views and conclusions contained in the software and */ | |||
/* documentation are those of the authors and should not be */ | |||
/* interpreted as representing official policies, either expressed */ | |||
/* or implied, of The University of Texas at Austin. */ | |||
/*********************************************************************/ | |||
#include "common.h" | |||
#include <altivec.h> | |||
static FLOAT dm1 = -1.; | |||
#ifdef CONJ | |||
#define GEMM_KERNEL GEMM_KERNEL_R | |||
#else | |||
#define GEMM_KERNEL GEMM_KERNEL_N | |||
#endif | |||
#if GEMM_DEFAULT_UNROLL_M == 1 | |||
#define GEMM_UNROLL_M_SHIFT 0 | |||
#endif | |||
#if GEMM_DEFAULT_UNROLL_M == 2 | |||
#define GEMM_UNROLL_M_SHIFT 1 | |||
#endif | |||
#if GEMM_DEFAULT_UNROLL_M == 4 | |||
#define GEMM_UNROLL_M_SHIFT 2 | |||
#endif | |||
#if GEMM_DEFAULT_UNROLL_M == 6 | |||
#define GEMM_UNROLL_M_SHIFT 2 | |||
#endif | |||
#if GEMM_DEFAULT_UNROLL_M == 8 | |||
#define GEMM_UNROLL_M_SHIFT 3 | |||
#endif | |||
#if GEMM_DEFAULT_UNROLL_M == 16 | |||
#define GEMM_UNROLL_M_SHIFT 4 | |||
#endif | |||
#if GEMM_DEFAULT_UNROLL_N == 1 | |||
#define GEMM_UNROLL_N_SHIFT 0 | |||
#endif | |||
#if GEMM_DEFAULT_UNROLL_N == 2 | |||
#define GEMM_UNROLL_N_SHIFT 1 | |||
#endif | |||
#if GEMM_DEFAULT_UNROLL_N == 4 | |||
#define GEMM_UNROLL_N_SHIFT 2 | |||
#endif | |||
#if GEMM_DEFAULT_UNROLL_N == 8 | |||
#define GEMM_UNROLL_N_SHIFT 3 | |||
#endif | |||
#if GEMM_DEFAULT_UNROLL_N == 16 | |||
#define GEMM_UNROLL_N_SHIFT 4 | |||
#endif | |||
#ifndef COMPLEX | |||
#ifdef DOUBLE | |||
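/* Backward 8x8 solver: the last column block c7 is finished first, its
   contribution eliminated from c0..c6, and so on down toward c0. */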
static inline __attribute__ ((always_inline)) void solve8x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { | |||
FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7; | |||
c0 = &c[0*ldc]; | |||
c1 = &c[1*ldc]; | |||
c2 = &c[2*ldc]; | |||
c3 = &c[3*ldc]; | |||
c4 = &c[4*ldc]; | |||
c5 = &c[5*ldc]; | |||
c6 = &c[6*ldc]; | |||
c7 = &c[7*ldc]; | |||
vector FLOAT *Vb = (vector FLOAT *) b; | |||
vector FLOAT *Vc0 = (vector FLOAT *) c0; | |||
vector FLOAT *Vc1 = (vector FLOAT *) c1; | |||
vector FLOAT *Vc2 = (vector FLOAT *) c2; | |||
vector FLOAT *Vc3 = (vector FLOAT *) c3; | |||
vector FLOAT *Vc4 = (vector FLOAT *) c4; | |||
vector FLOAT *Vc5 = (vector FLOAT *) c5; | |||
vector FLOAT *Vc6 = (vector FLOAT *) c6; | |||
vector FLOAT *Vc7 = (vector FLOAT *) c7; | |||
vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6; | |||
a[56] = (c7[0] *= b[63]); | |||
a[57] = (c7[1] *= b[63]); | |||
a[58] = (c7[2] *= b[63]); | |||
a[59] = (c7[3] *= b[63]); | |||
a[60] = (c7[4] *= b[63]); | |||
a[61] = (c7[5] *= b[63]); | |||
a[62] = (c7[6] *= b[63]); | |||
a[63] = (c7[7] *= b[63]); | |||
VbS0 = vec_splat(Vb[28], 0); | |||
VbS1 = vec_splat(Vb[28], 1); | |||
VbS2 = vec_splat(Vb[29], 0); | |||
VbS3 = vec_splat(Vb[29], 1); | |||
VbS4 = vec_splat(Vb[30], 0); | |||
VbS5 = vec_splat(Vb[30], 1); | |||
VbS6 = vec_splat(Vb[31], 0); | |||
Vc0[0] = vec_nmsub(Vc7[0], VbS0, Vc0[0]); | |||
Vc0[1] = vec_nmsub(Vc7[1], VbS0, Vc0[1]); | |||
Vc0[2] = vec_nmsub(Vc7[2], VbS0, Vc0[2]); | |||
Vc0[3] = vec_nmsub(Vc7[3], VbS0, Vc0[3]); | |||
Vc1[0] = vec_nmsub(Vc7[0], VbS1, Vc1[0]); | |||
Vc1[1] = vec_nmsub(Vc7[1], VbS1, Vc1[1]); | |||
Vc1[2] = vec_nmsub(Vc7[2], VbS1, Vc1[2]); | |||
Vc1[3] = vec_nmsub(Vc7[3], VbS1, Vc1[3]); | |||
Vc2[0] = vec_nmsub(Vc7[0], VbS2, Vc2[0]); | |||
Vc2[1] = vec_nmsub(Vc7[1], VbS2, Vc2[1]); | |||
Vc2[2] = vec_nmsub(Vc7[2], VbS2, Vc2[2]); | |||
Vc2[3] = vec_nmsub(Vc7[3], VbS2, Vc2[3]); | |||
Vc3[0] = vec_nmsub(Vc7[0], VbS3, Vc3[0]); | |||
Vc3[1] = vec_nmsub(Vc7[1], VbS3, Vc3[1]); | |||
Vc3[2] = vec_nmsub(Vc7[2], VbS3, Vc3[2]); | |||
Vc3[3] = vec_nmsub(Vc7[3], VbS3, Vc3[3]); | |||
Vc4[0] = vec_nmsub(Vc7[0], VbS4, Vc4[0]); | |||
Vc4[1] = vec_nmsub(Vc7[1], VbS4, Vc4[1]); | |||
Vc4[2] = vec_nmsub(Vc7[2], VbS4, Vc4[2]); | |||
Vc4[3] = vec_nmsub(Vc7[3], VbS4, Vc4[3]); | |||
Vc5[0] = vec_nmsub(Vc7[0], VbS5, Vc5[0]); | |||
Vc5[1] = vec_nmsub(Vc7[1], VbS5, Vc5[1]); | |||
Vc5[2] = vec_nmsub(Vc7[2], VbS5, Vc5[2]); | |||
Vc5[3] = vec_nmsub(Vc7[3], VbS5, Vc5[3]); | |||
Vc6[0] = vec_nmsub(Vc7[0], VbS6, Vc6[0]); | |||
Vc6[1] = vec_nmsub(Vc7[1], VbS6, Vc6[1]); | |||
Vc6[2] = vec_nmsub(Vc7[2], VbS6, Vc6[2]); | |||
Vc6[3] = vec_nmsub(Vc7[3], VbS6, Vc6[3]); | |||
a[48] = (c6[0] *= b[54]); | |||
a[49] = (c6[1] *= b[54]); | |||
a[50] = (c6[2] *= b[54]); | |||
a[51] = (c6[3] *= b[54]); | |||
a[52] = (c6[4] *= b[54]); | |||
a[53] = (c6[5] *= b[54]); | |||
a[54] = (c6[6] *= b[54]); | |||
a[55] = (c6[7] *= b[54]); | |||
VbS0 = vec_splat(Vb[24], 0); | |||
VbS1 = vec_splat(Vb[24], 1); | |||
VbS2 = vec_splat(Vb[25], 0); | |||
VbS3 = vec_splat(Vb[25], 1); | |||
VbS4 = vec_splat(Vb[26], 0); | |||
VbS5 = vec_splat(Vb[26], 1); | |||
Vc0[0] = vec_nmsub(Vc6[0], VbS0, Vc0[0]); | |||
Vc0[1] = vec_nmsub(Vc6[1], VbS0, Vc0[1]); | |||
Vc0[2] = vec_nmsub(Vc6[2], VbS0, Vc0[2]); | |||
Vc0[3] = vec_nmsub(Vc6[3], VbS0, Vc0[3]); | |||
Vc1[0] = vec_nmsub(Vc6[0], VbS1, Vc1[0]); | |||
Vc1[1] = vec_nmsub(Vc6[1], VbS1, Vc1[1]); | |||
Vc1[2] = vec_nmsub(Vc6[2], VbS1, Vc1[2]); | |||
Vc1[3] = vec_nmsub(Vc6[3], VbS1, Vc1[3]); | |||
Vc2[0] = vec_nmsub(Vc6[0], VbS2, Vc2[0]); | |||
Vc2[1] = vec_nmsub(Vc6[1], VbS2, Vc2[1]); | |||
Vc2[2] = vec_nmsub(Vc6[2], VbS2, Vc2[2]); | |||
Vc2[3] = vec_nmsub(Vc6[3], VbS2, Vc2[3]); | |||
Vc3[0] = vec_nmsub(Vc6[0], VbS3, Vc3[0]); | |||
Vc3[1] = vec_nmsub(Vc6[1], VbS3, Vc3[1]); | |||
Vc3[2] = vec_nmsub(Vc6[2], VbS3, Vc3[2]); | |||
Vc3[3] = vec_nmsub(Vc6[3], VbS3, Vc3[3]); | |||
Vc4[0] = vec_nmsub(Vc6[0], VbS4, Vc4[0]); | |||
Vc4[1] = vec_nmsub(Vc6[1], VbS4, Vc4[1]); | |||
Vc4[2] = vec_nmsub(Vc6[2], VbS4, Vc4[2]); | |||
Vc4[3] = vec_nmsub(Vc6[3], VbS4, Vc4[3]); | |||
Vc5[0] = vec_nmsub(Vc6[0], VbS5, Vc5[0]); | |||
Vc5[1] = vec_nmsub(Vc6[1], VbS5, Vc5[1]); | |||
Vc5[2] = vec_nmsub(Vc6[2], VbS5, Vc5[2]); | |||
Vc5[3] = vec_nmsub(Vc6[3], VbS5, Vc5[3]); | |||
a[40] = (c5[0] *= b[45]); | |||
a[41] = (c5[1] *= b[45]); | |||
a[42] = (c5[2] *= b[45]); | |||
a[43] = (c5[3] *= b[45]); | |||
a[44] = (c5[4] *= b[45]); | |||
a[45] = (c5[5] *= b[45]); | |||
a[46] = (c5[6] *= b[45]); | |||
a[47] = (c5[7] *= b[45]); | |||
VbS0 = vec_splat(Vb[20], 0); | |||
VbS1 = vec_splat(Vb[20], 1); | |||
VbS2 = vec_splat(Vb[21], 0); | |||
VbS3 = vec_splat(Vb[21], 1); | |||
VbS4 = vec_splat(Vb[22], 0); | |||
Vc0[0] = vec_nmsub(Vc5[0], VbS0, Vc0[0]); | |||
Vc0[1] = vec_nmsub(Vc5[1], VbS0, Vc0[1]); | |||
Vc0[2] = vec_nmsub(Vc5[2], VbS0, Vc0[2]); | |||
Vc0[3] = vec_nmsub(Vc5[3], VbS0, Vc0[3]); | |||
Vc1[0] = vec_nmsub(Vc5[0], VbS1, Vc1[0]); | |||
Vc1[1] = vec_nmsub(Vc5[1], VbS1, Vc1[1]); | |||
Vc1[2] = vec_nmsub(Vc5[2], VbS1, Vc1[2]); | |||
Vc1[3] = vec_nmsub(Vc5[3], VbS1, Vc1[3]); | |||
Vc2[0] = vec_nmsub(Vc5[0], VbS2, Vc2[0]); | |||
Vc2[1] = vec_nmsub(Vc5[1], VbS2, Vc2[1]); | |||
Vc2[2] = vec_nmsub(Vc5[2], VbS2, Vc2[2]); | |||
Vc2[3] = vec_nmsub(Vc5[3], VbS2, Vc2[3]); | |||
Vc3[0] = vec_nmsub(Vc5[0], VbS3, Vc3[0]); | |||
Vc3[1] = vec_nmsub(Vc5[1], VbS3, Vc3[1]); | |||
Vc3[2] = vec_nmsub(Vc5[2], VbS3, Vc3[2]); | |||
Vc3[3] = vec_nmsub(Vc5[3], VbS3, Vc3[3]); | |||
Vc4[0] = vec_nmsub(Vc5[0], VbS4, Vc4[0]); | |||
Vc4[1] = vec_nmsub(Vc5[1], VbS4, Vc4[1]); | |||
Vc4[2] = vec_nmsub(Vc5[2], VbS4, Vc4[2]); | |||
Vc4[3] = vec_nmsub(Vc5[3], VbS4, Vc4[3]); | |||
a[32] = (c4[0] *= b[36]); | |||
a[33] = (c4[1] *= b[36]); | |||
a[34] = (c4[2] *= b[36]); | |||
a[35] = (c4[3] *= b[36]); | |||
a[36] = (c4[4] *= b[36]); | |||
a[37] = (c4[5] *= b[36]); | |||
a[38] = (c4[6] *= b[36]); | |||
a[39] = (c4[7] *= b[36]); | |||
VbS0 = vec_splat(Vb[16], 0); | |||
VbS1 = vec_splat(Vb[16], 1); | |||
VbS2 = vec_splat(Vb[17], 0); | |||
VbS3 = vec_splat(Vb[17], 1); | |||
Vc0[0] = vec_nmsub(Vc4[0], VbS0, Vc0[0]); | |||
Vc0[1] = vec_nmsub(Vc4[1], VbS0, Vc0[1]); | |||
Vc0[2] = vec_nmsub(Vc4[2], VbS0, Vc0[2]); | |||
Vc0[3] = vec_nmsub(Vc4[3], VbS0, Vc0[3]); | |||
Vc1[0] = vec_nmsub(Vc4[0], VbS1, Vc1[0]); | |||
Vc1[1] = vec_nmsub(Vc4[1], VbS1, Vc1[1]); | |||
Vc1[2] = vec_nmsub(Vc4[2], VbS1, Vc1[2]); | |||
Vc1[3] = vec_nmsub(Vc4[3], VbS1, Vc1[3]); | |||
Vc2[0] = vec_nmsub(Vc4[0], VbS2, Vc2[0]); | |||
Vc2[1] = vec_nmsub(Vc4[1], VbS2, Vc2[1]); | |||
Vc2[2] = vec_nmsub(Vc4[2], VbS2, Vc2[2]); | |||
Vc2[3] = vec_nmsub(Vc4[3], VbS2, Vc2[3]); | |||
Vc3[0] = vec_nmsub(Vc4[0], VbS3, Vc3[0]); | |||
Vc3[1] = vec_nmsub(Vc4[1], VbS3, Vc3[1]); | |||
Vc3[2] = vec_nmsub(Vc4[2], VbS3, Vc3[2]); | |||
Vc3[3] = vec_nmsub(Vc4[3], VbS3, Vc3[3]); | |||
a[24] = (c3[0] *= b[27]); | |||
a[25] = (c3[1] *= b[27]); | |||
a[26] = (c3[2] *= b[27]); | |||
a[27] = (c3[3] *= b[27]); | |||
a[28] = (c3[4] *= b[27]); | |||
a[29] = (c3[5] *= b[27]); | |||
a[30] = (c3[6] *= b[27]); | |||
a[31] = (c3[7] *= b[27]); | |||
VbS0 = vec_splat(Vb[12], 0); | |||
VbS1 = vec_splat(Vb[12], 1); | |||
VbS2 = vec_splat(Vb[13], 0); | |||
Vc0[0] = vec_nmsub(Vc3[0], VbS0, Vc0[0]); | |||
Vc0[1] = vec_nmsub(Vc3[1], VbS0, Vc0[1]); | |||
Vc0[2] = vec_nmsub(Vc3[2], VbS0, Vc0[2]); | |||
Vc0[3] = vec_nmsub(Vc3[3], VbS0, Vc0[3]); | |||
Vc1[0] = vec_nmsub(Vc3[0], VbS1, Vc1[0]); | |||
Vc1[1] = vec_nmsub(Vc3[1], VbS1, Vc1[1]); | |||
Vc1[2] = vec_nmsub(Vc3[2], VbS1, Vc1[2]); | |||
Vc1[3] = vec_nmsub(Vc3[3], VbS1, Vc1[3]); | |||
Vc2[0] = vec_nmsub(Vc3[0], VbS2, Vc2[0]); | |||
Vc2[1] = vec_nmsub(Vc3[1], VbS2, Vc2[1]); | |||
Vc2[2] = vec_nmsub(Vc3[2], VbS2, Vc2[2]); | |||
Vc2[3] = vec_nmsub(Vc3[3], VbS2, Vc2[3]); | |||
a[16] = (c2[0] *= b[18]); | |||
a[17] = (c2[1] *= b[18]); | |||
a[18] = (c2[2] *= b[18]); | |||
a[19] = (c2[3] *= b[18]); | |||
a[20] = (c2[4] *= b[18]); | |||
a[21] = (c2[5] *= b[18]); | |||
a[22] = (c2[6] *= b[18]); | |||
a[23] = (c2[7] *= b[18]); | |||
VbS0 = vec_splat(Vb[8], 0); | |||
VbS1 = vec_splat(Vb[8], 1); | |||
Vc0[0] = vec_nmsub(Vc2[0], VbS0, Vc0[0]); | |||
Vc0[1] = vec_nmsub(Vc2[1], VbS0, Vc0[1]); | |||
Vc0[2] = vec_nmsub(Vc2[2], VbS0, Vc0[2]); | |||
Vc0[3] = vec_nmsub(Vc2[3], VbS0, Vc0[3]); | |||
Vc1[0] = vec_nmsub(Vc2[0], VbS1, Vc1[0]); | |||
Vc1[1] = vec_nmsub(Vc2[1], VbS1, Vc1[1]); | |||
Vc1[2] = vec_nmsub(Vc2[2], VbS1, Vc1[2]); | |||
Vc1[3] = vec_nmsub(Vc2[3], VbS1, Vc1[3]); | |||
a[ 8] = (c1[0] *= b[9]); | |||
a[ 9] = (c1[1] *= b[9]); | |||
a[10] = (c1[2] *= b[9]); | |||
a[11] = (c1[3] *= b[9]); | |||
a[12] = (c1[4] *= b[9]); | |||
a[13] = (c1[5] *= b[9]); | |||
a[14] = (c1[6] *= b[9]); | |||
a[15] = (c1[7] *= b[9]); | |||
VbS0 = vec_splat(Vb[4], 0); | |||
Vc0[0] = vec_nmsub(Vc1[0], VbS0, Vc0[0]); | |||
Vc0[1] = vec_nmsub(Vc1[1], VbS0, Vc0[1]); | |||
Vc0[2] = vec_nmsub(Vc1[2], VbS0, Vc0[2]); | |||
Vc0[3] = vec_nmsub(Vc1[3], VbS0, Vc0[3]); | |||
a[0] = (c0[0] *= b[0]); | |||
a[1] = (c0[1] *= b[0]); | |||
a[2] = (c0[2] *= b[0]); | |||
a[3] = (c0[3] *= b[0]); | |||
a[4] = (c0[4] *= b[0]); | |||
a[5] = (c0[5] *= b[0]); | |||
a[6] = (c0[6] *= b[0]); | |||
a[7] = (c0[7] *= b[0]); | |||
} | |||
#else | |||
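/* Single-precision backward solver: 16x8 panel, four 4-float vectors per
   column, processed from the last column toward the first. */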
static inline __attribute__ ((always_inline)) void solve16x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { | |||
FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7; | |||
c0 = &c[0*ldc]; | |||
c1 = &c[1*ldc]; | |||
c2 = &c[2*ldc]; | |||
c3 = &c[3*ldc]; | |||
c4 = &c[4*ldc]; | |||
c5 = &c[5*ldc]; | |||
c6 = &c[6*ldc]; | |||
c7 = &c[7*ldc]; | |||
vector FLOAT *Va = (vector FLOAT *) a; | |||
vector FLOAT *Vb = (vector FLOAT *) b; | |||
vector FLOAT *Vc0 = (vector FLOAT *) c0; | |||
vector FLOAT *Vc1 = (vector FLOAT *) c1; | |||
vector FLOAT *Vc2 = (vector FLOAT *) c2; | |||
vector FLOAT *Vc3 = (vector FLOAT *) c3; | |||
vector FLOAT *Vc4 = (vector FLOAT *) c4; | |||
vector FLOAT *Vc5 = (vector FLOAT *) c5; | |||
vector FLOAT *Vc6 = (vector FLOAT *) c6; | |||
vector FLOAT *Vc7 = (vector FLOAT *) c7; | |||
vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7; | |||
VbS0 = vec_splat(Vb[14], 0); | |||
VbS1 = vec_splat(Vb[14], 1); | |||
VbS2 = vec_splat(Vb[14], 2); | |||
VbS3 = vec_splat(Vb[14], 3); | |||
VbS4 = vec_splat(Vb[15], 0); | |||
VbS5 = vec_splat(Vb[15], 1); | |||
VbS6 = vec_splat(Vb[15], 2); | |||
VbS7 = vec_splat(Vb[15], 3); | |||
Vc7[0] = vec_mul(VbS7, Vc7[0]); | |||
Vc7[1] = vec_mul(VbS7, Vc7[1]); | |||
Vc7[2] = vec_mul(VbS7, Vc7[2]); | |||
Vc7[3] = vec_mul(VbS7, Vc7[3]); | |||
Va[28] = Vc7[0]; | |||
Va[29] = Vc7[1]; | |||
Va[30] = Vc7[2]; | |||
Va[31] = Vc7[3]; | |||
Vc0[0] = vec_nmsub(VbS0, Va[28], Vc0[0]); | |||
Vc0[1] = vec_nmsub(VbS0, Va[29], Vc0[1]); | |||
Vc0[2] = vec_nmsub(VbS0, Va[30], Vc0[2]); | |||
Vc0[3] = vec_nmsub(VbS0, Va[31], Vc0[3]); | |||
Vc1[0] = vec_nmsub(VbS1, Va[28], Vc1[0]); | |||
Vc1[1] = vec_nmsub(VbS1, Va[29], Vc1[1]); | |||
Vc1[2] = vec_nmsub(VbS1, Va[30], Vc1[2]); | |||
Vc1[3] = vec_nmsub(VbS1, Va[31], Vc1[3]); | |||
Vc2[0] = vec_nmsub(VbS2, Va[28], Vc2[0]); | |||
Vc2[1] = vec_nmsub(VbS2, Va[29], Vc2[1]); | |||
Vc2[2] = vec_nmsub(VbS2, Va[30], Vc2[2]); | |||
Vc2[3] = vec_nmsub(VbS2, Va[31], Vc2[3]); | |||
Vc3[0] = vec_nmsub(VbS3, Va[28], Vc3[0]); | |||
Vc3[1] = vec_nmsub(VbS3, Va[29], Vc3[1]); | |||
Vc3[2] = vec_nmsub(VbS3, Va[30], Vc3[2]); | |||
Vc3[3] = vec_nmsub(VbS3, Va[31], Vc3[3]); | |||
Vc4[0] = vec_nmsub(VbS4, Va[28], Vc4[0]); | |||
Vc4[1] = vec_nmsub(VbS4, Va[29], Vc4[1]); | |||
Vc4[2] = vec_nmsub(VbS4, Va[30], Vc4[2]); | |||
Vc4[3] = vec_nmsub(VbS4, Va[31], Vc4[3]); | |||
Vc5[0] = vec_nmsub(VbS5, Va[28], Vc5[0]); | |||
Vc5[1] = vec_nmsub(VbS5, Va[29], Vc5[1]); | |||
Vc5[2] = vec_nmsub(VbS5, Va[30], Vc5[2]); | |||
Vc5[3] = vec_nmsub(VbS5, Va[31], Vc5[3]); | |||
Vc6[0] = vec_nmsub(VbS6, Va[28], Vc6[0]); | |||
Vc6[1] = vec_nmsub(VbS6, Va[29], Vc6[1]); | |||
Vc6[2] = vec_nmsub(VbS6, Va[30], Vc6[2]); | |||
Vc6[3] = vec_nmsub(VbS6, Va[31], Vc6[3]); | |||
VbS0 = vec_splat(Vb[12], 0); | |||
VbS1 = vec_splat(Vb[12], 1); | |||
VbS2 = vec_splat(Vb[12], 2); | |||
VbS3 = vec_splat(Vb[12], 3); | |||
VbS4 = vec_splat(Vb[13], 0); | |||
VbS5 = vec_splat(Vb[13], 1); | |||
VbS6 = vec_splat(Vb[13], 2); | |||
Vc6[0] = vec_mul(VbS6, Vc6[0]); | |||
Vc6[1] = vec_mul(VbS6, Vc6[1]); | |||
Vc6[2] = vec_mul(VbS6, Vc6[2]); | |||
Vc6[3] = vec_mul(VbS6, Vc6[3]); | |||
Va[24] = Vc6[0]; | |||
Va[25] = Vc6[1]; | |||
Va[26] = Vc6[2]; | |||
Va[27] = Vc6[3]; | |||
Vc0[0] = vec_nmsub(VbS0, Va[24], Vc0[0]); | |||
Vc0[1] = vec_nmsub(VbS0, Va[25], Vc0[1]); | |||
Vc0[2] = vec_nmsub(VbS0, Va[26], Vc0[2]); | |||
Vc0[3] = vec_nmsub(VbS0, Va[27], Vc0[3]); | |||
Vc1[0] = vec_nmsub(VbS1, Va[24], Vc1[0]); | |||
Vc1[1] = vec_nmsub(VbS1, Va[25], Vc1[1]); | |||
Vc1[2] = vec_nmsub(VbS1, Va[26], Vc1[2]); | |||
Vc1[3] = vec_nmsub(VbS1, Va[27], Vc1[3]); | |||
Vc2[0] = vec_nmsub(VbS2, Va[24], Vc2[0]); | |||
Vc2[1] = vec_nmsub(VbS2, Va[25], Vc2[1]); | |||
Vc2[2] = vec_nmsub(VbS2, Va[26], Vc2[2]); | |||
Vc2[3] = vec_nmsub(VbS2, Va[27], Vc2[3]); | |||
Vc3[0] = vec_nmsub(VbS3, Va[24], Vc3[0]); | |||
Vc3[1] = vec_nmsub(VbS3, Va[25], Vc3[1]); | |||
Vc3[2] = vec_nmsub(VbS3, Va[26], Vc3[2]); | |||
Vc3[3] = vec_nmsub(VbS3, Va[27], Vc3[3]); | |||
Vc4[0] = vec_nmsub(VbS4, Va[24], Vc4[0]); | |||
Vc4[1] = vec_nmsub(VbS4, Va[25], Vc4[1]); | |||
Vc4[2] = vec_nmsub(VbS4, Va[26], Vc4[2]); | |||
Vc4[3] = vec_nmsub(VbS4, Va[27], Vc4[3]); | |||
Vc5[0] = vec_nmsub(VbS5, Va[24], Vc5[0]); | |||
Vc5[1] = vec_nmsub(VbS5, Va[25], Vc5[1]); | |||
Vc5[2] = vec_nmsub(VbS5, Va[26], Vc5[2]); | |||
Vc5[3] = vec_nmsub(VbS5, Va[27], Vc5[3]); | |||
VbS0 = vec_splat(Vb[10], 0); | |||
VbS1 = vec_splat(Vb[10], 1); | |||
VbS2 = vec_splat(Vb[10], 2); | |||
VbS3 = vec_splat(Vb[10], 3); | |||
VbS4 = vec_splat(Vb[11], 0); | |||
VbS5 = vec_splat(Vb[11], 1); | |||
Vc5[0] = vec_mul(VbS5, Vc5[0]); | |||
Vc5[1] = vec_mul(VbS5, Vc5[1]); | |||
Vc5[2] = vec_mul(VbS5, Vc5[2]); | |||
Vc5[3] = vec_mul(VbS5, Vc5[3]); | |||
Va[20] = Vc5[0]; | |||
Va[21] = Vc5[1]; | |||
Va[22] = Vc5[2]; | |||
Va[23] = Vc5[3]; | |||
Vc0[0] = vec_nmsub(VbS0, Va[20], Vc0[0]); | |||
Vc0[1] = vec_nmsub(VbS0, Va[21], Vc0[1]); | |||
Vc0[2] = vec_nmsub(VbS0, Va[22], Vc0[2]); | |||
Vc0[3] = vec_nmsub(VbS0, Va[23], Vc0[3]); | |||
Vc1[0] = vec_nmsub(VbS1, Va[20], Vc1[0]); | |||
Vc1[1] = vec_nmsub(VbS1, Va[21], Vc1[1]); | |||
Vc1[2] = vec_nmsub(VbS1, Va[22], Vc1[2]); | |||
Vc1[3] = vec_nmsub(VbS1, Va[23], Vc1[3]); | |||
Vc2[0] = vec_nmsub(VbS2, Va[20], Vc2[0]); | |||
Vc2[1] = vec_nmsub(VbS2, Va[21], Vc2[1]); | |||
Vc2[2] = vec_nmsub(VbS2, Va[22], Vc2[2]); | |||
Vc2[3] = vec_nmsub(VbS2, Va[23], Vc2[3]); | |||
Vc3[0] = vec_nmsub(VbS3, Va[20], Vc3[0]); | |||
Vc3[1] = vec_nmsub(VbS3, Va[21], Vc3[1]); | |||
Vc3[2] = vec_nmsub(VbS3, Va[22], Vc3[2]); | |||
Vc3[3] = vec_nmsub(VbS3, Va[23], Vc3[3]); | |||
Vc4[0] = vec_nmsub(VbS4, Va[20], Vc4[0]); | |||
Vc4[1] = vec_nmsub(VbS4, Va[21], Vc4[1]); | |||
Vc4[2] = vec_nmsub(VbS4, Va[22], Vc4[2]); | |||
Vc4[3] = vec_nmsub(VbS4, Va[23], Vc4[3]); | |||
VbS0 = vec_splat(Vb[8], 0); | |||
VbS1 = vec_splat(Vb[8], 1); | |||
VbS2 = vec_splat(Vb[8], 2); | |||
VbS3 = vec_splat(Vb[8], 3); | |||
VbS4 = vec_splat(Vb[9], 0); | |||
Vc4[0] = vec_mul(VbS4, Vc4[0]); | |||
Vc4[1] = vec_mul(VbS4, Vc4[1]); | |||
Vc4[2] = vec_mul(VbS4, Vc4[2]); | |||
Vc4[3] = vec_mul(VbS4, Vc4[3]); | |||
Va[16] = Vc4[0]; | |||
Va[17] = Vc4[1]; | |||
Va[18] = Vc4[2]; | |||
Va[19] = Vc4[3]; | |||
Vc0[0] = vec_nmsub(VbS0, Va[16], Vc0[0]); | |||
Vc0[1] = vec_nmsub(VbS0, Va[17], Vc0[1]); | |||
Vc0[2] = vec_nmsub(VbS0, Va[18], Vc0[2]); | |||
Vc0[3] = vec_nmsub(VbS0, Va[19], Vc0[3]); | |||
Vc1[0] = vec_nmsub(VbS1, Va[16], Vc1[0]); | |||
Vc1[1] = vec_nmsub(VbS1, Va[17], Vc1[1]); | |||
Vc1[2] = vec_nmsub(VbS1, Va[18], Vc1[2]); | |||
Vc1[3] = vec_nmsub(VbS1, Va[19], Vc1[3]); | |||
Vc2[0] = vec_nmsub(VbS2, Va[16], Vc2[0]); | |||
Vc2[1] = vec_nmsub(VbS2, Va[17], Vc2[1]); | |||
Vc2[2] = vec_nmsub(VbS2, Va[18], Vc2[2]); | |||
Vc2[3] = vec_nmsub(VbS2, Va[19], Vc2[3]); | |||
Vc3[0] = vec_nmsub(VbS3, Va[16], Vc3[0]); | |||
Vc3[1] = vec_nmsub(VbS3, Va[17], Vc3[1]); | |||
Vc3[2] = vec_nmsub(VbS3, Va[18], Vc3[2]); | |||
Vc3[3] = vec_nmsub(VbS3, Va[19], Vc3[3]); | |||
VbS0 = vec_splat(Vb[6], 0); | |||
VbS1 = vec_splat(Vb[6], 1); | |||
VbS2 = vec_splat(Vb[6], 2); | |||
VbS3 = vec_splat(Vb[6], 3); | |||
Vc3[0] = vec_mul(VbS3, Vc3[0]); | |||
Vc3[1] = vec_mul(VbS3, Vc3[1]); | |||
Vc3[2] = vec_mul(VbS3, Vc3[2]); | |||
Vc3[3] = vec_mul(VbS3, Vc3[3]); | |||
Va[12] = Vc3[0]; | |||
Va[13] = Vc3[1]; | |||
Va[14] = Vc3[2]; | |||
Va[15] = Vc3[3]; | |||
Vc0[0] = vec_nmsub(VbS0, Va[12], Vc0[0]); | |||
Vc0[1] = vec_nmsub(VbS0, Va[13], Vc0[1]); | |||
Vc0[2] = vec_nmsub(VbS0, Va[14], Vc0[2]); | |||
Vc0[3] = vec_nmsub(VbS0, Va[15], Vc0[3]); | |||
Vc1[0] = vec_nmsub(VbS1, Va[12], Vc1[0]); | |||
Vc1[1] = vec_nmsub(VbS1, Va[13], Vc1[1]); | |||
Vc1[2] = vec_nmsub(VbS1, Va[14], Vc1[2]); | |||
Vc1[3] = vec_nmsub(VbS1, Va[15], Vc1[3]); | |||
Vc2[0] = vec_nmsub(VbS2, Va[12], Vc2[0]); | |||
Vc2[1] = vec_nmsub(VbS2, Va[13], Vc2[1]); | |||
Vc2[2] = vec_nmsub(VbS2, Va[14], Vc2[2]); | |||
Vc2[3] = vec_nmsub(VbS2, Va[15], Vc2[3]); | |||
VbS0 = vec_splat(Vb[4], 0); | |||
VbS1 = vec_splat(Vb[4], 1); | |||
VbS2 = vec_splat(Vb[4], 2); | |||
Vc2[0] = vec_mul(VbS2, Vc2[0]); | |||
Vc2[1] = vec_mul(VbS2, Vc2[1]); | |||
Vc2[2] = vec_mul(VbS2, Vc2[2]); | |||
Vc2[3] = vec_mul(VbS2, Vc2[3]); | |||
Va[ 8] = Vc2[0]; | |||
Va[ 9] = Vc2[1]; | |||
Va[10] = Vc2[2]; | |||
Va[11] = Vc2[3]; | |||
Vc0[0] = vec_nmsub(VbS0, Va[ 8], Vc0[0]); | |||
Vc0[1] = vec_nmsub(VbS0, Va[ 9], Vc0[1]); | |||
Vc0[2] = vec_nmsub(VbS0, Va[10], Vc0[2]); | |||
Vc0[3] = vec_nmsub(VbS0, Va[11], Vc0[3]); | |||
Vc1[0] = vec_nmsub(VbS1, Va[ 8], Vc1[0]); | |||
Vc1[1] = vec_nmsub(VbS1, Va[ 9], Vc1[1]); | |||
Vc1[2] = vec_nmsub(VbS1, Va[10], Vc1[2]); | |||
Vc1[3] = vec_nmsub(VbS1, Va[11], Vc1[3]); | |||
VbS0 = vec_splat(Vb[2], 0); | |||
VbS1 = vec_splat(Vb[2], 1); | |||
Vc1[0] = vec_mul(VbS1, Vc1[0]); | |||
Vc1[1] = vec_mul(VbS1, Vc1[1]); | |||
Vc1[2] = vec_mul(VbS1, Vc1[2]); | |||
Vc1[3] = vec_mul(VbS1, Vc1[3]); | |||
Va[4] = Vc1[0]; | |||
Va[5] = Vc1[1]; | |||
Va[6] = Vc1[2]; | |||
Va[7] = Vc1[3]; | |||
Vc0[0] = vec_nmsub(VbS0, Va[4], Vc0[0]); | |||
Vc0[1] = vec_nmsub(VbS0, Va[5], Vc0[1]); | |||
Vc0[2] = vec_nmsub(VbS0, Va[6], Vc0[2]); | |||
Vc0[3] = vec_nmsub(VbS0, Va[7], Vc0[3]); | |||
VbS0 = vec_splat(Vb[0], 0); | |||
Vc0[0] = vec_mul(VbS0, Vc0[0]); | |||
Vc0[1] = vec_mul(VbS0, Vc0[1]); | |||
Vc0[2] = vec_mul(VbS0, Vc0[2]); | |||
Vc0[3] = vec_mul(VbS0, Vc0[3]); | |||
Va[0] = Vc0[0]; | |||
Va[1] = Vc0[1]; | |||
Va[2] = Vc0[2]; | |||
Va[3] = Vc0[3]; | |||
} | |||
#endif | |||
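/* Scalar backward substitution: a and b start at the last packed row and
   walk toward the front (a -= 2 * m undoes the inner j loop's advance and
   steps one row back). */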
static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { | |||
FLOAT aa, bb; | |||
int i, j, k; | |||
a += (n - 1) * m; | |||
b += (n - 1) * n; | |||
for (i = n - 1; i >= 0; i--) { | |||
bb = *(b + i); | |||
for (j = 0; j < m; j ++) { | |||
aa = *(c + j + i * ldc); | |||
aa *= bb; | |||
*a = aa; | |||
*(c + j + i * ldc) = aa; | |||
a ++; | |||
for (k = 0; k < i; k ++){ | |||
*(c + j + k * ldc) -= aa * *(b + k); | |||
} | |||
} | |||
b -= n; | |||
a -= 2 * m; | |||
} | |||
} | |||
#else | |||
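/* Complex backward variant; CONJ again selects the conjugated form of the
   complex multiply, as in the RN kernel. */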
static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { | |||
FLOAT aa1, aa2; | |||
FLOAT bb1, bb2; | |||
FLOAT cc1, cc2; | |||
int i, j, k; | |||
ldc *= 2; | |||
a += (n - 1) * m * 2; | |||
b += (n - 1) * n * 2; | |||
for (i = n - 1; i >= 0; i--) { | |||
bb1 = *(b + i * 2 + 0); | |||
bb2 = *(b + i * 2 + 1); | |||
for (j = 0; j < m; j ++) { | |||
aa1 = *(c + j * 2 + 0 + i * ldc); | |||
aa2 = *(c + j * 2 + 1 + i * ldc); | |||
#ifndef CONJ | |||
cc1 = aa1 * bb1 - aa2 * bb2; | |||
cc2 = aa1 * bb2 + aa2 * bb1; | |||
#else | |||
cc1 = aa1 * bb1 + aa2 * bb2; | |||
cc2 = - aa1 * bb2 + aa2 * bb1; | |||
#endif | |||
*(a + 0) = cc1; | |||
*(a + 1) = cc2; | |||
*(c + j * 2 + 0 + i * ldc) = cc1; | |||
*(c + j * 2 + 1 + i * ldc) = cc2; | |||
a += 2; | |||
for (k = 0; k < i; k ++){ | |||
#ifndef CONJ | |||
*(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); | |||
*(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); | |||
#else | |||
*(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); | |||
*(c + j * 2 + 1 + k * ldc) -= -cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); | |||
#endif | |||
} | |||
} | |||
b -= n * 2; | |||
a -= 4 * m; | |||
} | |||
} | |||
#endif | |||
int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, | |||
#ifdef COMPLEX | |||
FLOAT dummy2, | |||
#endif | |||
FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ | |||
BLASLONG i, j; | |||
FLOAT *aa, *cc; | |||
BLASLONG kk; | |||
#if 0 | |||
fprintf(stderr, "TRSM RT KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", | |||
m, n, k, offset); | |||
#endif | |||
#ifdef DOUBLE | |||
int well_aligned = (GEMM_UNROLL_M==8) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0); | |||
#else | |||
int well_aligned = (GEMM_UNROLL_M==16) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0); | |||
#endif | |||
kk = n - offset; | |||
c += n * ldc * COMPSIZE; | |||
b += n * k * COMPSIZE; | |||
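/* Walk the column blocks from the end of the matrix: b and c now point one
   full pass past the last column, and each block steps them back before it
   is solved, so the n-tail is handled first. */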
if (n & (GEMM_UNROLL_N - 1)) { | |||
j = 1; | |||
while (j < GEMM_UNROLL_N) { | |||
if (n & j) { | |||
aa = a; | |||
b -= j * k * COMPSIZE; | |||
          c -= j * ldc * COMPSIZE;
cc = c; | |||
i = (m >> GEMM_UNROLL_M_SHIFT); | |||
if (i > 0) { | |||
do { | |||
if (k - kk > 0) { | |||
GEMM_KERNEL(GEMM_UNROLL_M, j, k - kk, dm1, | |||
#ifdef COMPLEX | |||
ZERO, | |||
#endif | |||
aa + GEMM_UNROLL_M * kk * COMPSIZE, | |||
b + j * kk * COMPSIZE, | |||
cc, | |||
ldc); | |||
} | |||
solve(GEMM_UNROLL_M, j, | |||
aa + (kk - j) * GEMM_UNROLL_M * COMPSIZE, | |||
b + (kk - j) * j * COMPSIZE, | |||
cc, ldc); | |||
aa += GEMM_UNROLL_M * k * COMPSIZE; | |||
cc += GEMM_UNROLL_M * COMPSIZE; | |||
i --; | |||
} while (i > 0); | |||
} | |||
if (m & (GEMM_UNROLL_M - 1)) { | |||
i = (GEMM_UNROLL_M >> 1); | |||
do { | |||
if (m & i) { | |||
if (k - kk > 0) { | |||
GEMM_KERNEL(i, j, k - kk, dm1, | |||
#ifdef COMPLEX | |||
ZERO, | |||
#endif | |||
aa + i * kk * COMPSIZE, | |||
b + j * kk * COMPSIZE, | |||
cc, ldc); | |||
} | |||
solve(i, j, | |||
aa + (kk - j) * i * COMPSIZE, | |||
b + (kk - j) * j * COMPSIZE, | |||
cc, ldc); | |||
aa += i * k * COMPSIZE; | |||
cc += i * COMPSIZE; | |||
} | |||
i >>= 1; | |||
} while (i > 0); | |||
} | |||
kk -= j; | |||
} | |||
j <<= 1; | |||
} | |||
} | |||
j = (n >> GEMM_UNROLL_N_SHIFT); | |||
if (j > 0) { | |||
do { | |||
aa = a; | |||
b -= GEMM_UNROLL_N * k * COMPSIZE; | |||
c -= GEMM_UNROLL_N * ldc * COMPSIZE; | |||
cc = c; | |||
i = (m >> GEMM_UNROLL_M_SHIFT); | |||
if (i > 0) { | |||
do { | |||
if (k - kk > 0) { | |||
GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, k - kk, dm1, | |||
#ifdef COMPLEX | |||
ZERO, | |||
#endif | |||
aa + GEMM_UNROLL_M * kk * COMPSIZE, | |||
b + GEMM_UNROLL_N * kk * COMPSIZE, | |||
cc, | |||
ldc); | |||
} | |||
if (well_aligned) { | |||
#ifdef DOUBLE | |||
solve8x8(aa + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_M * COMPSIZE, | |||
b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, cc, ldc); | |||
#else | |||
solve16x8(aa + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_M * COMPSIZE, | |||
b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, cc, ldc); | |||
#endif | |||
} | |||
else { | |||
solve(GEMM_UNROLL_M, GEMM_UNROLL_N, | |||
aa + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_M * COMPSIZE, | |||
b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, | |||
cc, ldc); | |||
} | |||
aa += GEMM_UNROLL_M * k * COMPSIZE; | |||
cc += GEMM_UNROLL_M * COMPSIZE; | |||
i --; | |||
} while (i > 0); | |||
} | |||
if (m & (GEMM_UNROLL_M - 1)) { | |||
i = (GEMM_UNROLL_M >> 1); | |||
do { | |||
if (m & i) { | |||
if (k - kk > 0) { | |||
GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, | |||
#ifdef COMPLEX | |||
ZERO, | |||
#endif | |||
aa + i * kk * COMPSIZE, | |||
b + GEMM_UNROLL_N * kk * COMPSIZE, | |||
cc, | |||
ldc); | |||
} | |||
solve(i, GEMM_UNROLL_N, | |||
aa + (kk - GEMM_UNROLL_N) * i * COMPSIZE, | |||
b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, | |||
cc, ldc); | |||
aa += i * k * COMPSIZE; | |||
cc += i * COMPSIZE; | |||
} | |||
i >>= 1; | |||
} while (i > 0); | |||
} | |||
kk -= GEMM_UNROLL_N; | |||
j --; | |||
} while (j > 0); | |||
} | |||
return 0; | |||
} | |||
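The RT kernel mirrors the RN driver above with the iteration direction reversed: b and c are first advanced past the end of the matrix, kk starts at n - offset and decreases, the GEMM update consumes the trailing k - kk panel rather than the leading kk panel, and the leftover columns (n modulo GEMM_UNROLL_N) are processed before the full-width blocks instead of after them.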
@@ -933,6 +933,77 @@ static void init_parameter(void) { | |||
} | |||
#else // (ARCH_ARM64) | |||
#if defined(ARCH_MIPS64) | |||
static void init_parameter(void) { | |||
TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; | |||
TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; | |||
TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; | |||
TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; | |||
TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; | |||
TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; | |||
TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; | |||
TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q; | |||
TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; | |||
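  /* DGEMM_DEFAULT_R resolves to the runtime variable dgemm_r on
     LOONGSON3R3 (see the param.h hunk below), so a concrete value must be
     assigned here; 640 matches the R defaults of the other precisions. */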
TABLE_NAME.dgemm_r = 640; | |||
TABLE_NAME.cgemm_r = CGEMM_DEFAULT_R; | |||
TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R; | |||
#ifdef EXPRECISION | |||
TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; | |||
TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; | |||
TABLE_NAME.qgemm_q = QGEMM_DEFAULT_Q; | |||
TABLE_NAME.xgemm_q = XGEMM_DEFAULT_Q; | |||
TABLE_NAME.qgemm_r = QGEMM_DEFAULT_R; | |||
TABLE_NAME.xgemm_r = XGEMM_DEFAULT_R; | |||
#endif | |||
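  /* GEMM3M computes a complex GEMM via three real GEMMs, so when no
     explicit 3M blocking is defined the real-precision sizes are reused:
     sgemm for cgemm3m, dgemm for zgemm3m. */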
#if defined(USE_GEMM3M) | |||
#ifdef CGEMM3M_DEFAULT_P | |||
TABLE_NAME.cgemm3m_p = CGEMM3M_DEFAULT_P; | |||
#else | |||
TABLE_NAME.cgemm3m_p = TABLE_NAME.sgemm_p; | |||
#endif | |||
#ifdef ZGEMM3M_DEFAULT_P | |||
TABLE_NAME.zgemm3m_p = ZGEMM3M_DEFAULT_P; | |||
#else | |||
TABLE_NAME.zgemm3m_p = TABLE_NAME.dgemm_p; | |||
#endif | |||
#ifdef CGEMM3M_DEFAULT_Q | |||
TABLE_NAME.cgemm3m_q = CGEMM3M_DEFAULT_Q; | |||
#else | |||
TABLE_NAME.cgemm3m_q = TABLE_NAME.sgemm_q; | |||
#endif | |||
#ifdef ZGEMM3M_DEFAULT_Q | |||
TABLE_NAME.zgemm3m_q = ZGEMM3M_DEFAULT_Q; | |||
#else | |||
TABLE_NAME.zgemm3m_q = TABLE_NAME.dgemm_q; | |||
#endif | |||
#ifdef CGEMM3M_DEFAULT_R | |||
TABLE_NAME.cgemm3m_r = CGEMM3M_DEFAULT_R; | |||
#else | |||
TABLE_NAME.cgemm3m_r = TABLE_NAME.sgemm_r; | |||
#endif | |||
#ifdef ZGEMM3M_DEFAULT_R | |||
TABLE_NAME.zgemm3m_r = ZGEMM3M_DEFAULT_R; | |||
#else | |||
TABLE_NAME.zgemm3m_r = TABLE_NAME.dgemm_r; | |||
#endif | |||
#ifdef EXPRECISION | |||
TABLE_NAME.xgemm3m_p = TABLE_NAME.qgemm_p; | |||
TABLE_NAME.xgemm3m_q = TABLE_NAME.qgemm_q; | |||
TABLE_NAME.xgemm3m_r = TABLE_NAME.qgemm_r; | |||
#endif | |||
#endif | |||
} | |||
#else // (ARCH_MIPS64) | |||
#if (ARCH_POWER) | |||
static void init_parameter(void) { | |||
@@ -1780,4 +1851,5 @@ static void init_parameter(void) { | |||
} | |||
#endif //POWER | |||
#endif //ZARCH | |||
#endif //(ARCH_MIPS64) | |||
#endif //(ARCH_ARM64) | |||
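The param.h hunks below replace the old LOONGSON3A/LOONGSON3B parameter blocks with LOONGSON3R4/LOONGSON3R3 and retune the unroll and P/Q/R blocking factors. GEMM_DEFAULT_ALIGN is a mask-style constant; a minimal sketch of how such a mask rounds a scratch buffer up to the next 16 KB boundary (align_up is a hypothetical helper, not the library's allocator):

#include <stdint.h>
#include <stdio.h>

/* 0x03fffUL masks the low 14 bits, i.e. a 16 KB alignment granule. */
#define GEMM_ALIGN_MASK 0x03fffUL

static void *align_up(void *p) {
  /* Round the address up to the next multiple of (mask + 1). */
  return (void *)(((uintptr_t)p + GEMM_ALIGN_MASK) & ~GEMM_ALIGN_MASK);
}

int main(void) {
  static char scratch[64 * 1024];
  void *a = align_up(scratch);
  printf("buffer %p -> aligned %p (16 KB boundary)\n", (void *)scratch, a);
  return 0;
}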
@@ -644,9 +644,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#define CGEMM_DEFAULT_UNROLL_N 2 | |||
#define ZGEMM_DEFAULT_UNROLL_N 2 | |||
#define XGEMM_DEFAULT_UNROLL_N 1 | |||
/* | |||
#define SGEMM_DEFAULT_UNROLL_MN 32 | |||
#define DGEMM_DEFAULT_UNROLL_MN 32 | |||
*/ | |||
#endif | |||
#ifdef ARCH_X86 | |||
@@ -1552,9 +1553,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#define CGEMM_DEFAULT_UNROLL_N 2 | |||
#define ZGEMM_DEFAULT_UNROLL_N 2 | |||
#define XGEMM_DEFAULT_UNROLL_N 1 | |||
/* | |||
#define SGEMM_DEFAULT_UNROLL_MN 32 | |||
#define DGEMM_DEFAULT_UNROLL_MN 32 | |||
*/ | |||
#endif | |||
#ifdef ARCH_X86 | |||
@@ -2570,8 +2572,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#define SYMV_P 16 | |||
#endif | |||
#ifdef LOONGSON3A | |||
/*Copy from SICORTEX*/ | |||
#if defined(LOONGSON3R4) | |||
#define SNUMOPT 2 | |||
#define DNUMOPT 2 | |||
@@ -2579,6 +2580,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#define GEMM_DEFAULT_OFFSET_B 0 | |||
#define GEMM_DEFAULT_ALIGN 0x03fffUL | |||
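/* MSA (the MIPS SIMD Architecture, 128-bit vector registers) permits wider
   register blocking, hence the larger unroll factors in this branch. */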
#ifdef HAVE_MSA | |||
#define SGEMM_DEFAULT_UNROLL_M 8 | |||
#define SGEMM_DEFAULT_UNROLL_N 8 | |||
#define DGEMM_DEFAULT_UNROLL_M 8 | |||
#define DGEMM_DEFAULT_UNROLL_N 4 | |||
#define CGEMM_DEFAULT_UNROLL_M 8 | |||
#define CGEMM_DEFAULT_UNROLL_N 4 | |||
#define ZGEMM_DEFAULT_UNROLL_M 4 | |||
#define ZGEMM_DEFAULT_UNROLL_N 4 | |||
#else | |||
#define SGEMM_DEFAULT_UNROLL_M 8 | |||
#define SGEMM_DEFAULT_UNROLL_N 4 | |||
@@ -2590,6 +2604,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#define ZGEMM_DEFAULT_UNROLL_M 2 | |||
#define ZGEMM_DEFAULT_UNROLL_N 2 | |||
#endif | |||
#define SGEMM_DEFAULT_P 64 | |||
#define DGEMM_DEFAULT_P 44 | |||
@@ -2612,7 +2627,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#define SYMV_P 16 | |||
#endif | |||
#ifdef LOONGSON3B | |||
#if defined(LOONGSON3R3) | |||
// Copy from SICORTEX | |||
#define SNUMOPT 2 | |||
#define DNUMOPT 2 | |||
@@ -2620,32 +2636,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#define GEMM_DEFAULT_OFFSET_B 0 | |||
#define GEMM_DEFAULT_ALIGN 0x03fffUL | |||
#define SGEMM_DEFAULT_UNROLL_M 2 | |||
#define SGEMM_DEFAULT_UNROLL_N 2 | |||
#define SGEMM_DEFAULT_UNROLL_M 8 | |||
#define SGEMM_DEFAULT_UNROLL_N 4 | |||
#define DGEMM_DEFAULT_UNROLL_M 2 | |||
#define DGEMM_DEFAULT_UNROLL_N 2 | |||
#define DGEMM_DEFAULT_UNROLL_M 4 | |||
#define DGEMM_DEFAULT_UNROLL_N 4 | |||
#define CGEMM_DEFAULT_UNROLL_M 2 | |||
#define CGEMM_DEFAULT_UNROLL_M 4 | |||
#define CGEMM_DEFAULT_UNROLL_N 2 | |||
#define ZGEMM_DEFAULT_UNROLL_M 2 | |||
#define ZGEMM_DEFAULT_UNROLL_N 2 | |||
#define SGEMM_DEFAULT_P 64 | |||
#define DGEMM_DEFAULT_P 24 | |||
#define CGEMM_DEFAULT_P 24 | |||
#define ZGEMM_DEFAULT_P 20 | |||
#define DGEMM_DEFAULT_P 44 | |||
#define CGEMM_DEFAULT_P 64 | |||
#define ZGEMM_DEFAULT_P 32 | |||
#define SGEMM_DEFAULT_Q 192 | |||
#define DGEMM_DEFAULT_Q 128 | |||
#define DGEMM_DEFAULT_Q 92 | |||
#define CGEMM_DEFAULT_Q 128 | |||
#define ZGEMM_DEFAULT_Q 64 | |||
#define ZGEMM_DEFAULT_Q 80 | |||
#define SGEMM_DEFAULT_R 512 | |||
#define DGEMM_DEFAULT_R 512 | |||
#define CGEMM_DEFAULT_R 512 | |||
#define ZGEMM_DEFAULT_R 512 | |||
#define SGEMM_DEFAULT_R 640 | |||
#define DGEMM_DEFAULT_R dgemm_r | |||
#define CGEMM_DEFAULT_R 640 | |||
#define ZGEMM_DEFAULT_R 640 | |||
#define GEMM_OFFSET_A1 0x10000 | |||
#define GEMM_OFFSET_B1 0x100000 | |||
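As a rough sanity check on the retuned double-precision blocking above (P=44, Q=92, R=640 via dgemm_r), the packed-panel footprints work out as below; the sketch assumes 8-byte doubles and is illustrative only, without claiming which cache level each panel is sized for:

#include <stdio.h>

int main(void) {
  const long P = 44, Q = 92, R = 640, elem = 8; /* sizeof(double) */
  /* A is packed in P x Q panels, B in Q x R panels. */
  printf("A panel (P x Q): %ld bytes (~%.1f KB)\n",
         P * Q * elem, P * Q * elem / 1024.0);
  printf("B panel (Q x R): %ld bytes (~%.1f KB)\n",
         Q * R * elem, Q * R * elem / 1024.0);
  return 0;
}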