@@ -247,11 +247,11 @@ endif | |||
ifdef DYNAMIC_ARCH | |||
# Core list used when DYNAMIC_ARCH is set and ARCH is x86.
# The AMD Bobcat entry must be spelled BOBCAT, matching the BOBCAT /
# CORE_BOBCAT identifiers used by the rest of the tree.
ifeq ($(ARCH), x86)
DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \
	       CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
endif
# Core list used when DYNAMIC_ARCH is set and ARCH is x86_64.
# The AMD Bobcat entry must be spelled BOBCAT, matching the BOBCAT /
# CORE_BOBCAT identifiers used by the rest of the tree.
ifeq ($(ARCH), x86_64)
DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
endif
ifndef DYNAMIC_CORE | |||
@@ -28,6 +28,7 @@ OPTERON_SSE3 | |||
BARCELONA | |||
SHANGHAI | |||
ISTANBUL | |||
BOBCAT | |||
c)VIA CPU: | |||
SSE_GENERIC | |||
@@ -356,4 +356,11 @@ REALNAME: | |||
#ifndef ALIGN_6 | |||
#define ALIGN_6 .align 64 | |||
// ffreep %st(0). | |||
// Because Clang didn't support ffreep, we directly use the opcode. | |||
// Please check out http://www.sandpile.org/x86/opc_fpu.htm | |||
#ifndef ffreep | |||
#define ffreep .byte 0xdf, 0xc0 # | |||
#endif | |||
#endif |
@@ -448,4 +448,10 @@ REALNAME: | |||
#define ALIGN_6 .align 64 | |||
#endif | |||
// ffreep %st(0). | |||
// Because Clang didn't support ffreep, we directly use the opcode. | |||
// Please check out http://www.sandpile.org/x86/opc_fpu.htm | |||
#ifndef ffreep | |||
#define ffreep .byte 0xdf, 0xc0 # | |||
#endif | |||
#endif |
@@ -104,6 +104,7 @@ | |||
#define CORE_ATOM 18 | |||
#define CORE_NANO 19 | |||
#define CORE_SANDYBRIDGE 20 | |||
#define CORE_BOBCAT 21 | |||
#define HAVE_SSE (1 << 0) | |||
#define HAVE_SSE2 (1 << 1) | |||
@@ -191,4 +192,5 @@ typedef struct { | |||
#define CPUTYPE_VIAC3 42 | |||
#define CPUTYPE_NANO 43 | |||
#define CPUTYPE_SANDYBRIDGE 44 | |||
#define CPUTYPE_BOBCAT 45 | |||
#endif |
@@ -1028,6 +1028,8 @@ int get_cpuname(void){ | |||
case 1: | |||
case 10: | |||
return CPUTYPE_BARCELONA; | |||
case 5: | |||
return CPUTYPE_BOBCAT; | |||
} | |||
break; | |||
} | |||
@@ -1148,6 +1150,7 @@ static char *cpuname[] = { | |||
"VIAC3", | |||
"NANO", | |||
"SANDYBRIDGE", | |||
"BOBCAT", | |||
}; | |||
static char *lowercpuname[] = { | |||
@@ -1195,6 +1198,7 @@ static char *lowercpuname[] = { | |||
"nsgeode", | |||
"nano", | |||
"sandybridge", | |||
"bobcat", | |||
}; | |||
static char *corename[] = { | |||
@@ -1219,6 +1223,7 @@ static char *corename[] = { | |||
"ATOM", | |||
"NANO", | |||
"SANDYBRIDGE", | |||
"BOBCAT", | |||
}; | |||
static char *corename_lower[] = { | |||
@@ -1243,6 +1248,7 @@ static char *corename_lower[] = { | |||
"atom", | |||
"nano", | |||
"sandybridge", | |||
"bobcat", | |||
}; | |||
@@ -1351,7 +1357,9 @@ int get_coretype(void){ | |||
if (family <= 0x5) return CORE_80486; | |||
if (family <= 0xe) return CORE_ATHLON; | |||
if (family == 0xf){ | |||
if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON; else return CORE_BARCELONA; | |||
if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON; | |||
else if (exfamily == 5) return CORE_BOBCAT; | |||
else return CORE_BARCELONA; | |||
} | |||
} | |||
@@ -1,5 +1,5 @@ | |||
/***************************************************************************** | |||
Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS | |||
Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
@@ -85,6 +85,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#define MAX_NODES 16 | |||
#define MAX_CPUS 256 | |||
#define NCPUBITS (8*sizeof(unsigned long)) | |||
#define MAX_BITMASK_LEN (MAX_CPUS/NCPUBITS) | |||
#define CPUELT(cpu) ((cpu) / NCPUBITS) | |||
#define CPUMASK(cpu) ((unsigned long) 1UL << ((cpu) % NCPUBITS)) | |||
#define SH_MAGIC 0x510510 | |||
@@ -103,10 +108,10 @@ typedef struct { | |||
int num_nodes; | |||
int num_procs; | |||
int final_num_procs; | |||
unsigned long avail; | |||
unsigned long avail [MAX_BITMASK_LEN]; | |||
int avail_count; | |||
unsigned long cpu_info [MAX_CPUS]; | |||
unsigned long node_info [MAX_NODES]; | |||
unsigned long node_info [MAX_NODES][MAX_BITMASK_LEN]; | |||
int cpu_use[MAX_CPUS]; | |||
} shm_t; | |||
@@ -126,7 +131,8 @@ static shm_t *common = (void *)-1; | |||
static int shmid, pshmid; | |||
static void *paddr; | |||
static unsigned long lprocmask, lnodemask; | |||
static unsigned long lprocmask[MAX_BITMASK_LEN], lnodemask; | |||
static int lprocmask_count = 0; | |||
static int numprocs = 1; | |||
static int numnodes = 1; | |||
@@ -177,70 +183,114 @@ static inline int rcount(unsigned long number) { | |||
than sizeof(unsigned long). On 64 bits, the limit | |||
is 64. On 32 bits, it is 32. | |||
***/ | |||
static inline unsigned long get_cpumap(int node) { | |||
static inline void get_cpumap(int node, unsigned long * node_info) { | |||
int infile; | |||
unsigned long affinity; | |||
unsigned long affinity[32]; | |||
char name[160]; | |||
char cpumap[160]; | |||
char *p, *dummy; | |||
char *dummy; | |||
int i=0; | |||
int count=0; | |||
int k=0; | |||
sprintf(name, CPUMAP_NAME, node); | |||
infile = open(name, O_RDONLY); | |||
for(i=0; i<32; i++){ | |||
affinity[i] = 0; | |||
} | |||
affinity = 0; | |||
if (infile != -1) { | |||
read(infile, cpumap, sizeof(cpumap)); | |||
p = cpumap; | |||
while (*p != '\n' && i<160){ | |||
if(*p != ',') { | |||
name[i++]=*p; | |||
} | |||
p++; | |||
} | |||
p = name; | |||
// while ((*p == '0') || (*p == ',')) p++; | |||
for(i=0; i<160; i++){ | |||
if(cpumap[i] == '\n') | |||
break; | |||
if(cpumap[i] != ','){ | |||
name[k++]=cpumap[i]; | |||
//Enough data for Hex | |||
if(k >= NCPUBITS/4){ | |||
affinity[count++] = strtoul(name, &dummy, 16); | |||
k=0; | |||
} | |||
} | |||
affinity = strtoul(p, &dummy, 16); | |||
} | |||
if(k!=0){ | |||
name[k]='\0'; | |||
affinity[count++] = strtoul(name, &dummy, 16); | |||
k=0; | |||
} | |||
// 0-63bit -> node_info[0], 64-128bit -> node_info[1] .... | |||
// revert the sequence | |||
for(i=0; i<count && i<MAX_BITMASK_LEN; i++){ | |||
node_info[i]=affinity[count-i-1]; | |||
} | |||
close(infile); | |||
} | |||
return affinity; | |||
return ; | |||
} | |||
static inline unsigned long get_share(int cpu, int level) { | |||
static inline void get_share(int cpu, int level, unsigned long * share) { | |||
int infile; | |||
unsigned long affinity; | |||
unsigned long affinity[32]; | |||
char cpumap[160]; | |||
char name[160]; | |||
char *p; | |||
char *dummy; | |||
int count=0; | |||
int i=0,k=0; | |||
int bitmask_idx = 0; | |||
sprintf(name, SHARE_NAME, cpu, level); | |||
infile = open(name, O_RDONLY); | |||
affinity = (1UL << cpu); | |||
// Init share | |||
for(i=0; i<MAX_BITMASK_LEN; i++){ | |||
share[i]=0; | |||
} | |||
bitmask_idx = CPUELT(cpu); | |||
share[bitmask_idx] = CPUMASK(cpu); | |||
if (infile != -1) { | |||
read(infile, name, sizeof(name)); | |||
p = name; | |||
read(infile, cpumap, sizeof(cpumap)); | |||
while ((*p == '0') || (*p == ',')) p++; | |||
for(i=0; i<160; i++){ | |||
if(cpumap[i] == '\n') | |||
break; | |||
if(cpumap[i] != ','){ | |||
name[k++]=cpumap[i]; | |||
//Enough data | |||
if(k >= NCPUBITS/4){ | |||
affinity[count++] = strtoul(name, &dummy, 16); | |||
k=0; | |||
} | |||
} | |||
affinity = strtol(p, &p, 16); | |||
} | |||
if(k!=0){ | |||
name[k]='\0'; | |||
affinity[count++] = strtoul(name, &dummy, 16); | |||
k=0; | |||
} | |||
// 0-63bit -> node_info[0], 64-128bit -> node_info[1] .... | |||
// revert the sequence | |||
for(i=0; i<count && i<MAX_BITMASK_LEN; i++){ | |||
share[i]=affinity[count-i-1]; | |||
} | |||
close(infile); | |||
} | |||
return affinity; | |||
return ; | |||
} | |||
static int numa_check(void) { | |||
@@ -248,6 +298,7 @@ static int numa_check(void) { | |||
DIR *dp; | |||
struct dirent *dir; | |||
int node; | |||
int j; | |||
common -> num_nodes = 0; | |||
@@ -258,7 +309,9 @@ static int numa_check(void) { | |||
return 0; | |||
} | |||
for (node = 0; node < MAX_NODES; node ++) common -> node_info[node] = 0; | |||
for (node = 0; node < MAX_NODES; node ++) { | |||
for (j = 0; j<MAX_BITMASK_LEN; j++) common -> node_info[node][j] = 0; | |||
} | |||
while ((dir = readdir(dp)) != NULL) { | |||
if (*(unsigned int *) dir -> d_name == 0x065646f6eU) { | |||
@@ -266,12 +319,12 @@ static int numa_check(void) { | |||
node = atoi(&dir -> d_name[4]); | |||
// node indexes common->node_info[], which has MAX_NODES entries, so
// node == MAX_NODES is already out of range.
if (node >= MAX_NODES) {
  fprintf(stderr, "\nOpenBLAS Warning : MAX_NODES (NUMA) is too small. Terminated.\n");
  exit(1);
}
common -> num_nodes ++; | |||
common -> node_info[node] = get_cpumap(node); | |||
get_cpumap(node, common->node_info[node]); | |||
} | |||
} | |||
@@ -284,7 +337,7 @@ static int numa_check(void) { | |||
fprintf(stderr, "Numa found : number of Nodes = %2d\n", common -> num_nodes); | |||
for (node = 0; node < common -> num_nodes; node ++) | |||
fprintf(stderr, "MASK (%2d) : %08lx\n", node, common -> node_info[node]); | |||
fprintf(stderr, "MASK (%2d) : %08lx\n", node, common -> node_info[node][0]); | |||
#endif | |||
return common -> num_nodes; | |||
@@ -296,11 +349,13 @@ static void numa_mapping(void) { | |||
int i, j, h; | |||
unsigned long work, bit; | |||
int count = 0; | |||
int bitmask_idx = 0; | |||
for (node = 0; node < common -> num_nodes; node ++) { | |||
core = 0; | |||
for (cpu = 0; cpu < common -> num_procs; cpu ++) { | |||
if (common -> node_info[node] & common -> avail & (1UL << cpu)) { | |||
bitmask_idx = CPUELT(cpu); | |||
if (common -> node_info[node][bitmask_idx] & common -> avail[bitmask_idx] & CPUMASK(cpu)) { | |||
common -> cpu_info[count] = WRITE_CORE(core) | WRITE_NODE(node) | WRITE_CPU(cpu); | |||
count ++; | |||
core ++; | |||
@@ -357,58 +412,89 @@ static void numa_mapping(void) { | |||
static void disable_hyperthread(void) { | |||
unsigned long share; | |||
unsigned long share[MAX_BITMASK_LEN]; | |||
int cpu; | |||
int bitmask_idx = 0; | |||
int i=0, count=0; | |||
bitmask_idx = CPUELT(common -> num_procs); | |||
if(common->num_procs > 64){ | |||
fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(64). Terminated.\n", common->num_procs); | |||
exit(1); | |||
}else if(common->num_procs == 64){ | |||
common -> avail = 0xFFFFFFFFFFFFFFFFUL; | |||
}else | |||
common -> avail = (1UL << common -> num_procs) - 1; | |||
for(i=0; i< bitmask_idx; i++){ | |||
common -> avail[count++] = 0xFFFFFFFFFFFFFFFFUL; | |||
} | |||
if(CPUMASK(common -> num_procs) != 1){ | |||
common -> avail[count++] = CPUMASK(common -> num_procs) - 1; | |||
} | |||
common -> avail_count = count; | |||
/* if(common->num_procs > 64){ */ | |||
/* fprintf(stderr, "\nOpenBLAS Warning : The number of CPU/Cores(%d) is beyond the limit(64). Terminated.\n", common->num_procs); */ | |||
/* exit(1); */ | |||
/* }else if(common->num_procs == 64){ */ | |||
/* common -> avail = 0xFFFFFFFFFFFFFFFFUL; */ | |||
/* }else */ | |||
/* common -> avail = (1UL << common -> num_procs) - 1; */ | |||
#ifdef DEBUG | |||
fprintf(stderr, "\nAvail CPUs : %04lx.\n", common -> avail); | |||
fprintf(stderr, "\nAvail CPUs : "); | |||
for(i=0; i<count; i++) | |||
fprintf(stderr, "%04lx ", common -> avail[i]); | |||
fprintf(stderr, ".\n"); | |||
#endif | |||
for (cpu = 0; cpu < common -> num_procs; cpu ++) { | |||
share = (get_share(cpu, 1) & common -> avail); | |||
if (popcount(share) > 1) { | |||
get_share(cpu, 1, share); | |||
//When the shared cpu are in different element of share & avail array, this may be a bug. | |||
for (i = 0; i < count ; i++){ | |||
if (popcount(share[i]) > 1) { | |||
#ifdef DEBUG | |||
fprintf(stderr, "Detected Hyper Threading on CPU %4x; disabled CPU %04lx.\n", | |||
cpu, share & ~(1UL << cpu)); | |||
fprintf(stderr, "Detected Hyper Threading on CPU %4x; disabled CPU %04lx.\n", | |||
cpu, share[i] & ~(CPUMASK(cpu))); | |||
#endif | |||
common -> avail &= ~((share & ~(1UL << cpu))); | |||
common -> avail[i] &= ~((share[i] & ~ CPUMASK(cpu))); | |||
} | |||
} | |||
} | |||
} | |||
static void disable_affinity(void) { | |||
int i=0; | |||
int bitmask_idx=0; | |||
int count=0; | |||
#ifdef DEBUG | |||
fprintf(stderr, "Final all available CPUs : %04lx.\n\n", common -> avail); | |||
fprintf(stderr, "Final all available CPUs : %04lx.\n\n", common -> avail[0]); | |||
fprintf(stderr, "CPU mask : %04lx.\n\n", *(unsigned long *)&cpu_orig_mask[0]); | |||
#endif | |||
if(common->final_num_procs > 64){ | |||
fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(64). Terminated.\n", common->final_num_procs); | |||
exit(1); | |||
}else if(common->final_num_procs == 64){ | |||
lprocmask = 0xFFFFFFFFFFFFFFFFUL; | |||
}else | |||
lprocmask = (1UL << common -> final_num_procs) - 1; | |||
/* if(common->final_num_procs > 64){ */ | |||
/* fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(64). Terminated.\n", common->final_num_procs); */ | |||
/* exit(1); */ | |||
/* }else if(common->final_num_procs == 64){ */ | |||
/* lprocmask = 0xFFFFFFFFFFFFFFFFUL; */ | |||
/* }else */ | |||
/* lprocmask = (1UL << common -> final_num_procs) - 1; */ | |||
bitmask_idx = CPUELT(common -> final_num_procs); | |||
for(i=0; i< bitmask_idx; i++){ | |||
lprocmask[count++] = 0xFFFFFFFFFFFFFFFFUL; | |||
} | |||
if(CPUMASK(common -> final_num_procs) != 1){ | |||
lprocmask[count++] = CPUMASK(common -> final_num_procs) - 1; | |||
} | |||
lprocmask_count = count; | |||
#ifndef USE_OPENMP | |||
lprocmask &= *(unsigned long *)&cpu_orig_mask[0]; | |||
for(i=0; i< count; i++){ | |||
lprocmask[i] &= ((unsigned long *)&cpu_orig_mask[0])[i]; | |||
} | |||
#endif | |||
#ifdef DEBUG | |||
fprintf(stderr, "I choose these CPUs : %04lx.\n\n", lprocmask); | |||
fprintf(stderr, "I choose these CPUs : %04lx.\n\n", lprocmask[0]); | |||
#endif | |||
} | |||
@@ -498,7 +584,7 @@ static void create_pshmem(void) { | |||
static void local_cpu_map(void) { | |||
int cpu, id, mapping; | |||
int bitmask_idx = 0; | |||
cpu = 0; | |||
mapping = 0; | |||
@@ -508,8 +594,9 @@ static void local_cpu_map(void) { | |||
if (id > 0) { | |||
if (is_dead(id)) common -> cpu_use[cpu] = 0; | |||
} | |||
if ((common -> cpu_use[cpu] == 0) && (lprocmask & (1UL << cpu))) { | |||
bitmask_idx = CPUELT(cpu); | |||
if ((common -> cpu_use[cpu] == 0) && (lprocmask[bitmask_idx] & CPUMASK(cpu))) { | |||
common -> cpu_use[cpu] = pshmid; | |||
cpu_mapping[mapping] = READ_CPU(common -> cpu_info[cpu]); | |||
@@ -595,6 +682,7 @@ void gotoblas_affinity_init(void) { | |||
#ifndef USE_OPENMP | |||
cpu_set_t cpu_mask; | |||
#endif | |||
int i; | |||
if (initialized) return; | |||
@@ -646,6 +734,11 @@ void gotoblas_affinity_init(void) { | |||
common -> num_procs = get_nprocs(); | |||
// Refuse to continue when the machine reports more CPUs than MAX_CPUS.
if(common -> num_procs > MAX_CPUS) {
  fprintf(stderr, "\nOpenBLAS Warning : The number of CPU/Cores(%d) is beyond the limit(%d). Terminated.\n", common->num_procs, MAX_CPUS);
  exit(1);
}
for (cpu = 0; cpu < common -> num_procs; cpu++) common -> cpu_info[cpu] = cpu; | |||
numa_check(); | |||
@@ -654,7 +747,8 @@ void gotoblas_affinity_init(void) { | |||
if (common -> num_nodes > 1) numa_mapping(); | |||
common -> final_num_procs = popcount(common -> avail); | |||
common -> final_num_procs = 0; | |||
for(i = 0; i < common -> avail_count; i++) common -> final_num_procs += popcount(common -> avail[i]); | |||
for (cpu = 0; cpu < common -> final_num_procs; cpu ++) common -> cpu_use[cpu] = 0; | |||
@@ -664,7 +758,8 @@ void gotoblas_affinity_init(void) { | |||
disable_affinity(); | |||
num_avail = popcount(lprocmask); | |||
num_avail = 0; | |||
for(i=0; i<lprocmask_count; i++) num_avail += popcount(lprocmask[i]); | |||
if ((numprocs <= 0) || (numprocs > num_avail)) numprocs = num_avail; | |||
@@ -163,7 +163,7 @@ int get_L2_size(void){ | |||
int eax, ebx, ecx, edx; | |||
#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || \ | |||
#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || \ | |||
defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ | |||
defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) | |||
@@ -446,7 +446,7 @@ void blas_set_parameter(void){ | |||
#endif | |||
#endif | |||
#if defined(CORE_BARCELONA) | |||
#if defined(CORE_BARCELONA) || defined(CORE_BOBCAT) | |||
size >>= 8; | |||
sgemm_p = 232 * size; | |||
@@ -1,5 +1,5 @@ | |||
/***************************************************************************** | |||
Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS | |||
Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
@@ -102,6 +102,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
/* #define FORCE_BARCELONA */ | |||
/* #define FORCE_SHANGHAI */ | |||
/* #define FORCE_ISTANBUL */ | |||
/* #define FORCE_BOBCAT */ | |||
/* #define FORCE_SSE_GENERIC */ | |||
/* #define FORCE_VIAC3 */ | |||
/* #define FORCE_NANO */ | |||
@@ -363,6 +364,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#define CORENAME "BARCELONA" | |||
#endif | |||
#if defined(FORCE_BOBCAT) | |||
#define FORCE | |||
#define FORCE_INTEL | |||
#define ARCHITECTURE "X86" | |||
#define SUBARCHITECTURE "BOBCAT" | |||
#define ARCHCONFIG "-DBOBCAT " \ | |||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | |||
"-DL2_SIZE=524288 -DL2_LINESIZE=64 " \ | |||
"-DDTB_DEFAULT_ENTRIES=40 -DDTB_SIZE=4096 " \ | |||
"-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 " \ | |||
"-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_CFLUSH -DHAVE_CMOV" | |||
#define LIBNAME "bobcat" | |||
#define CORENAME "BOBCAT" | |||
#endif | |||
#ifdef FORCE_SSE_GENERIC | |||
#define FORCE | |||
#define FORCE_INTEL | |||
@@ -794,6 +794,22 @@ static void init_parameter(void) { | |||
#endif | |||
#endif | |||
#ifdef BOBCAT

#ifdef DEBUG
  fprintf(stderr, "Bobcat\n");
#endif

  // Bobcat uses the compile-time default GEMM P blocking factors.
  TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
  TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
  TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
  TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
#ifdef EXPRECISION
  TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
  TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
#endif
#endif
#ifdef NANO | |||
#ifdef DEBUG | |||
@@ -0,0 +1,59 @@ | |||
SGEMMKERNEL = gemm_kernel_4x4_barcelona.S | |||
SGEMMINCOPY = | |||
SGEMMITCOPY = | |||
SGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||
SGEMMINCOPYOBJ = | |||
SGEMMITCOPYOBJ = | |||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
DGEMMKERNEL = gemm_kernel_2x4_barcelona.S | |||
DGEMMINCOPY = ../generic/gemm_ncopy_2.c | |||
DGEMMITCOPY = ../generic/gemm_tcopy_2.c | |||
DGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
CGEMMKERNEL = zgemm_kernel_2x2_barcelona.S | |||
CGEMMINCOPY = | |||
CGEMMITCOPY = | |||
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
CGEMMINCOPYOBJ = | |||
CGEMMITCOPYOBJ = | |||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
ZGEMMKERNEL = zgemm_kernel_1x2_barcelona.S | |||
ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c | |||
ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c | |||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
STRSMKERNEL_LN = trsm_kernel_LN_4x4_sse.S | |||
STRSMKERNEL_LT = trsm_kernel_LT_4x4_sse.S | |||
STRSMKERNEL_RN = trsm_kernel_LT_4x4_sse.S | |||
STRSMKERNEL_RT = trsm_kernel_RT_4x4_sse.S | |||
DTRSMKERNEL_LN = trsm_kernel_LN_2x4_sse2.S | |||
DTRSMKERNEL_LT = trsm_kernel_LT_2x4_sse2.S | |||
DTRSMKERNEL_RN = trsm_kernel_LT_2x4_sse2.S | |||
DTRSMKERNEL_RT = trsm_kernel_RT_2x4_sse2.S | |||
CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse.S | |||
CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse.S | |||
CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse.S | |||
CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse.S | |||
ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_sse2.S | |||
ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_sse2.S | |||
ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_sse2.S | |||
ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_sse2.S | |||
CGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S | |||
ZGEMM3MKERNEL = zgemm3m_kernel_2x4_barcelona.S |
@@ -69,7 +69,7 @@ | |||
#define STACK_ALIGN 4096 | |||
#define STACK_OFFSET 1024 | |||
#if defined(OPTERON) || defined(BARCELONA) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
#define PREFETCH prefetch | |||
#define PREFETCHSIZE (8 * 10 + 4) | |||
#endif | |||
@@ -439,7 +439,7 @@ | |||
.L22: | |||
mulsd %xmm0, %xmm2 | |||
addsd %xmm2, %xmm4 | |||
#if defined(OPTERON) || defined(BARCELONA) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) | |||
#endif | |||
movlpd 2 * SIZE(BB), %xmm2 | |||
@@ -488,7 +488,7 @@ | |||
movlpd 40 * SIZE(BB), %xmm3 | |||
addsd %xmm0, %xmm7 | |||
movlpd 8 * SIZE(AA), %xmm0 | |||
#if defined(OPTERON) || defined(BARCELONA) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) | |||
#endif | |||
mulsd %xmm1, %xmm2 | |||
@@ -1697,7 +1697,7 @@ | |||
.L42: | |||
mulpd %xmm0, %xmm2 | |||
#if defined(OPTERON) || defined(BARCELONA) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
#endif | |||
mulpd 2 * SIZE(BB), %xmm0 | |||
@@ -1727,7 +1727,7 @@ | |||
addpd %xmm0, %xmm7 | |||
movapd 16 * SIZE(AA), %xmm0 | |||
#if defined(OPTERON) || defined(BARCELONA) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) | |||
#endif | |||
mulpd %xmm1, %xmm2 | |||
@@ -64,7 +64,7 @@ | |||
#define BORIG 60(%esp) | |||
#define BUFFER 128(%esp) | |||
#if defined(OPTERON) || defined(BARCELONA) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
#define PREFETCH prefetch | |||
#define PREFETCHW prefetchw | |||
#define PREFETCHSIZE (16 * 10 + 8) | |||
@@ -437,7 +437,7 @@ | |||
.L32: | |||
mulss %xmm0, %xmm2 | |||
addss %xmm2, %xmm4 | |||
#if defined(OPTERON) || defined(BARCELONA) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
#endif | |||
movss 4 * SIZE(BB), %xmm2 | |||
@@ -833,7 +833,7 @@ | |||
.L22: | |||
mulps %xmm0, %xmm2 | |||
addps %xmm2, %xmm4 | |||
#if defined(OPTERON) || defined(BARCELONA) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
#endif | |||
movaps 4 * SIZE(BB), %xmm2 | |||
@@ -1848,7 +1848,7 @@ | |||
.L72: | |||
mulss %xmm0, %xmm2 | |||
#if defined(OPTERON) || defined(BARCELONA) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
#endif | |||
mulss 4 * SIZE(BB), %xmm0 | |||
@@ -2109,7 +2109,7 @@ | |||
ALIGN_4 | |||
.L62: | |||
#if defined(OPTERON) || defined(BARCELONA) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
#endif | |||
@@ -2429,7 +2429,7 @@ | |||
.L52: | |||
mulps %xmm0, %xmm2 | |||
#if defined(OPTERON) || defined(BARCELONA) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
#endif | |||
mulps 4 * SIZE(BB), %xmm0 | |||
@@ -2459,7 +2459,7 @@ | |||
addps %xmm0, %xmm5 | |||
movaps 32 * SIZE(AA), %xmm0 | |||
#if defined(OPTERON) || defined(BARCELONA) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | |||
#endif | |||
mulps %xmm1, %xmm2 | |||
@@ -2952,7 +2952,7 @@ | |||
.L112: | |||
mulss %xmm0, %xmm2 | |||
#if defined(OPTERON) || defined(BARCELONA) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
#endif | |||
movss 1 * SIZE(AA), %xmm0 | |||
@@ -3148,7 +3148,7 @@ | |||
.L102: | |||
mulps %xmm0, %xmm2 | |||
#if defined(OPTERON) || defined(BARCELONA) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
#endif | |||
movsd 2 * SIZE(AA), %xmm0 | |||
@@ -3389,7 +3389,7 @@ | |||
.L92: | |||
mulps %xmm0, %xmm2 | |||
#if defined(OPTERON) || defined(BARCELONA) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
#endif | |||
movaps 4 * SIZE(AA), %xmm0 | |||
@@ -3404,7 +3404,7 @@ | |||
mulps 12 * SIZE(BB), %xmm0 | |||
addps %xmm0, %xmm7 | |||
movaps 32 * SIZE(AA), %xmm0 | |||
#if defined(OPTERON) || defined(BARCELONA) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | |||
#endif | |||
mulps %xmm1, %xmm3 | |||
@@ -69,7 +69,7 @@ | |||
#define STACK_ALIGN 4096 | |||
#define STACK_OFFSET 1024 | |||
#if defined(OPTERON) || defined(BARCELONA) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
#define PREFETCH prefetch | |||
#define PREFETCHSIZE (8 * 10 + 4) | |||
#endif | |||
@@ -910,7 +910,7 @@ | |||
.L22: | |||
mulsd %xmm0, %xmm2 | |||
addsd %xmm2, %xmm4 | |||
#if defined(OPTERON) || defined(BARCELONA) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) | |||
#endif | |||
movlpd 2 * SIZE(BB), %xmm2 | |||
@@ -959,7 +959,7 @@ | |||
movlpd 40 * SIZE(BB), %xmm3 | |||
addsd %xmm0, %xmm7 | |||
movlpd 8 * SIZE(AA), %xmm0 | |||
#if defined(OPTERON) || defined(BARCELONA) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) | |||
#endif | |||
mulsd %xmm1, %xmm2 | |||
@@ -1439,7 +1439,7 @@ | |||
.L42: | |||
mulpd %xmm0, %xmm2 | |||
#if defined(OPTERON) || defined(BARCELONA) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
#endif | |||
mulpd 2 * SIZE(BB), %xmm0 | |||
@@ -1469,7 +1469,7 @@ | |||
addpd %xmm0, %xmm7 | |||
movapd 16 * SIZE(AA), %xmm0 | |||
#if defined(OPTERON) || defined(BARCELONA) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) | |||
#endif | |||
mulpd %xmm1, %xmm2 | |||
@@ -64,7 +64,7 @@ | |||
#define BORIG 60(%esp) | |||
#define BUFFER 128(%esp) | |||
#if defined(OPTERON) || defined(BARCELONA) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
#define PREFETCH prefetch | |||
#define PREFETCHW prefetchw | |||
#define PREFETCHSIZE (16 * 10 + 8) | |||
@@ -872,7 +872,7 @@ | |||
.L22: | |||
mulps %xmm0, %xmm2 | |||
addps %xmm2, %xmm4 | |||
#if defined(OPTERON) || defined(BARCELONA) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
#endif | |||
movaps 4 * SIZE(BB), %xmm2 | |||
@@ -1316,7 +1316,7 @@ | |||
.L32: | |||
mulss %xmm0, %xmm2 | |||
addss %xmm2, %xmm4 | |||
#if defined(OPTERON) || defined(BARCELONA) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
#endif | |||
movss 4 * SIZE(BB), %xmm2 | |||
@@ -1855,7 +1855,7 @@ | |||
.L52: | |||
mulps %xmm0, %xmm2 | |||
#if defined(OPTERON) || defined(BARCELONA) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
#endif | |||
mulps 4 * SIZE(BB), %xmm0 | |||
@@ -1885,7 +1885,7 @@ | |||
addps %xmm0, %xmm5 | |||
movaps 32 * SIZE(AA), %xmm0 | |||
#if defined(OPTERON) || defined(BARCELONA) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | |||
#endif | |||
mulps %xmm1, %xmm2 | |||
@@ -2249,7 +2249,7 @@ | |||
ALIGN_4 | |||
.L62: | |||
#if defined(OPTERON) || defined(BARCELONA) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
#endif | |||
@@ -2562,7 +2562,7 @@ | |||
.L72: | |||
mulss %xmm0, %xmm2 | |||
#if defined(OPTERON) || defined(BARCELONA) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
#endif | |||
mulss 4 * SIZE(BB), %xmm0 | |||
@@ -2957,7 +2957,7 @@ | |||
.L92: | |||
mulps %xmm0, %xmm2 | |||
#if defined(OPTERON) || defined(BARCELONA) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
#endif | |||
movaps 4 * SIZE(AA), %xmm0 | |||
@@ -2972,7 +2972,7 @@ | |||
mulps 12 * SIZE(BB), %xmm0 | |||
addps %xmm0, %xmm7 | |||
movaps 32 * SIZE(AA), %xmm0 | |||
#if defined(OPTERON) || defined(BARCELONA) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | |||
#endif | |||
mulps %xmm1, %xmm3 | |||
@@ -3280,7 +3280,7 @@ | |||
.L102: | |||
mulps %xmm0, %xmm2 | |||
#if defined(OPTERON) || defined(BARCELONA) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
#endif | |||
movsd 2 * SIZE(AA), %xmm0 | |||
@@ -3515,7 +3515,7 @@ | |||
.L112: | |||
mulss %xmm0, %xmm2 | |||
#if defined(OPTERON) || defined(BARCELONA) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
#endif | |||
movss 1 * SIZE(AA), %xmm0 | |||
@@ -69,7 +69,7 @@ | |||
#define STACK_ALIGN 4096 | |||
#define STACK_OFFSET 1024 | |||
#if defined(OPTERON) || defined(BARCELONA) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
#define PREFETCH prefetch | |||
#define PREFETCHSIZE (8 * 10 + 4) | |||
#endif | |||
@@ -1036,7 +1036,7 @@ | |||
.L42: | |||
mulpd %xmm0, %xmm2 | |||
#if defined(OPTERON) || defined(BARCELONA) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
#endif | |||
mulpd 2 * SIZE(BB), %xmm0 | |||
@@ -1066,7 +1066,7 @@ | |||
addpd %xmm0, %xmm7 | |||
movapd 16 * SIZE(AA), %xmm0 | |||
#if defined(OPTERON) || defined(BARCELONA) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) | |||
#endif | |||
mulpd %xmm1, %xmm2 | |||
@@ -2224,7 +2224,7 @@ | |||
.L22: | |||
mulsd %xmm0, %xmm2 | |||
addsd %xmm2, %xmm4 | |||
#if defined(OPTERON) || defined(BARCELONA) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) | |||
#endif | |||
movlpd 2 * SIZE(BB), %xmm2 | |||
@@ -2273,7 +2273,7 @@ | |||
movlpd 40 * SIZE(BB), %xmm3 | |||
addsd %xmm0, %xmm7 | |||
movlpd 8 * SIZE(AA), %xmm0 | |||
#if defined(OPTERON) || defined(BARCELONA) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) | |||
#endif | |||
mulsd %xmm1, %xmm2 | |||
@@ -64,7 +64,7 @@ | |||
#define BORIG 60(%esp) | |||
#define BUFFER 128(%esp) | |||
#if defined(OPTERON) || defined(BARCELONA) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
#define PREFETCH prefetch | |||
#define PREFETCHW prefetchw | |||
#define PREFETCHSIZE (16 * 10 + 8) | |||
@@ -439,7 +439,7 @@ | |||
.L92: | |||
mulps %xmm0, %xmm2 | |||
#if defined(OPTERON) || defined(BARCELONA) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
#endif | |||
movaps 4 * SIZE(AA), %xmm0 | |||
@@ -454,7 +454,7 @@ | |||
mulps 12 * SIZE(BB), %xmm0 | |||
addps %xmm0, %xmm7 | |||
movaps 32 * SIZE(AA), %xmm0 | |||
#if defined(OPTERON) || defined(BARCELONA) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | |||
#endif | |||
mulps %xmm1, %xmm3 | |||
@@ -758,7 +758,7 @@ | |||
.L102: | |||
mulps %xmm0, %xmm2 | |||
#if defined(OPTERON) || defined(BARCELONA) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
#endif | |||
movsd 2 * SIZE(AA), %xmm0 | |||
@@ -993,7 +993,7 @@ | |||
.L112: | |||
mulss %xmm0, %xmm2 | |||
#if defined(OPTERON) || defined(BARCELONA) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
#endif | |||
movss 1 * SIZE(AA), %xmm0 | |||
@@ -1324,7 +1324,7 @@ | |||
.L52: | |||
mulps %xmm0, %xmm2 | |||
#if defined(OPTERON) || defined(BARCELONA) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
#endif | |||
mulps 4 * SIZE(BB), %xmm0 | |||
@@ -1354,7 +1354,7 @@ | |||
addps %xmm0, %xmm5 | |||
movaps 32 * SIZE(AA), %xmm0 | |||
#if defined(OPTERON) || defined(BARCELONA) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | |||
#endif | |||
mulps %xmm1, %xmm2 | |||
@@ -1718,7 +1718,7 @@ | |||
ALIGN_4 | |||
.L62: | |||
#if defined(OPTERON) || defined(BARCELONA) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
#endif | |||
@@ -2031,7 +2031,7 @@ | |||
.L72: | |||
mulss %xmm0, %xmm2 | |||
#if defined(OPTERON) || defined(BARCELONA) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
#endif | |||
mulss 4 * SIZE(BB), %xmm0 | |||
@@ -2859,7 +2859,7 @@ | |||
.L22: | |||
mulps %xmm0, %xmm2 | |||
addps %xmm2, %xmm4 | |||
#if defined(OPTERON) || defined(BARCELONA) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
#endif | |||
movaps 4 * SIZE(BB), %xmm2 | |||
@@ -3303,7 +3303,7 @@ | |||
.L32: | |||
mulss %xmm0, %xmm2 | |||
addss %xmm2, %xmm4 | |||
#if defined(OPTERON) || defined(BARCELONA) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
#endif | |||
movss 4 * SIZE(BB), %xmm2 | |||
@@ -75,7 +75,7 @@ | |||
#define STACK_ALIGN 4096 | |||
#define STACK_OFFSET 1024 | |||
#if defined(OPTERON) || defined(BARCELONA) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
#define PREFETCHSIZE (16 * 10 + 8) | |||
#define WPREFETCHSIZE 112 | |||
#define PREFETCH prefetch | |||
@@ -533,7 +533,7 @@ | |||
addps %xmm0, %xmm7 | |||
movsd 16 * SIZE(AA), %xmm0 | |||
mulps %xmm1, %xmm2 | |||
#if defined(OPTERON) || defined(BARCELONA) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) | |||
#endif | |||
addps %xmm2, %xmm4 | |||
@@ -75,7 +75,7 @@ | |||
#define STACK_ALIGN 4096 | |||
#define STACK_OFFSET 1024 | |||
#if defined(OPTERON) || defined(BARCELONA) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
#define PREFETCHSIZE (16 * 10 + 8) | |||
#define WPREFETCHSIZE 112 | |||
#define PREFETCH prefetch | |||
@@ -994,7 +994,7 @@ | |||
addps %xmm0, %xmm7 | |||
movsd 16 * SIZE(AA), %xmm0 | |||
mulps %xmm1, %xmm2 | |||
#if defined(OPTERON) || defined(BARCELONA) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) | |||
#endif | |||
addps %xmm2, %xmm4 | |||
@@ -75,7 +75,7 @@ | |||
#define STACK_ALIGN 4096 | |||
#define STACK_OFFSET 1024 | |||
#if defined(OPTERON) || defined(BARCELONA) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
#define PREFETCHSIZE (16 * 10 + 8) | |||
#define WPREFETCHSIZE 112 | |||
#define PREFETCH prefetch | |||
@@ -1820,7 +1820,7 @@ | |||
addps %xmm0, %xmm7 | |||
movsd 16 * SIZE(AA), %xmm0 | |||
mulps %xmm1, %xmm2 | |||
#if defined(OPTERON) || defined(BARCELONA) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) | |||
#endif | |||
addps %xmm2, %xmm4 | |||
@@ -0,0 +1,62 @@ | |||
# Kernel source selection list (presumably the per-target kernel file for
# BOBCAT added by this patch -- TODO confirm the file name, it is not visible
# here). Each variable names the source file used to build one routine; most
# entries point at existing *_barcelona.S, *_opteron.S or *_sse*.S sources
# rather than Bobcat-specific files.

# Complex double GEMV kernels (N and T variants).
ZGEMVNKERNEL = zgemv_n_dup.S | |||
ZGEMVTKERNEL = zgemv_t_dup.S | |||
# Single-precision GEMM: compute kernel, the four copy sources and the
# object names they are built into.
SGEMMKERNEL = gemm_kernel_8x4_barcelona.S | |||
SGEMMINCOPY = ../generic/gemm_ncopy_8.c | |||
SGEMMITCOPY = ../generic/gemm_tcopy_8.c | |||
SGEMMONCOPY = gemm_ncopy_4_opteron.S | |||
SGEMMOTCOPY = gemm_tcopy_4_opteron.S | |||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
# Double-precision GEMM: the INCOPY/ITCOPY sources and objects are left empty;
# only the ONCOPY/OTCOPY pair is set.
DGEMMKERNEL = gemm_kernel_4x4_barcelona.S | |||
DGEMMINCOPY = | |||
DGEMMITCOPY = | |||
DGEMMONCOPY = gemm_ncopy_4_opteron.S | |||
DGEMMOTCOPY = gemm_tcopy_4_opteron.S | |||
DGEMMINCOPYOBJ = | |||
DGEMMITCOPYOBJ = | |||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
# Complex single-precision GEMM: all four copy sources and objects are set.
CGEMMKERNEL = zgemm_kernel_4x2_barcelona.S | |||
CGEMMINCOPY = ../generic/zgemm_ncopy_4.c | |||
CGEMMITCOPY = ../generic/zgemm_tcopy_4.c | |||
CGEMMONCOPY = zgemm_ncopy_2.S | |||
CGEMMOTCOPY = zgemm_tcopy_2.S | |||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
# Complex double-precision GEMM: INCOPY/ITCOPY left empty, as for DGEMM.
ZGEMMKERNEL = zgemm_kernel_2x2_barcelona.S | |||
ZGEMMINCOPY = | |||
ZGEMMITCOPY = | |||
ZGEMMONCOPY = zgemm_ncopy_2.S | |||
ZGEMMOTCOPY = zgemm_tcopy_2.S | |||
ZGEMMINCOPYOBJ = | |||
ZGEMMITCOPYOBJ = | |||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
# TRSM kernels. In every precision the RN entry names the same source file as
# the LT entry; only LN, LT and RT have distinct sources.
STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S | |||
STRSMKERNEL_LT = trsm_kernel_LT_8x4_sse.S | |||
STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S | |||
STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S | |||
DTRSMKERNEL_LN = trsm_kernel_LN_4x4_barcelona.S | |||
DTRSMKERNEL_LT = trsm_kernel_LT_4x4_barcelona.S | |||
DTRSMKERNEL_RN = trsm_kernel_LT_4x4_barcelona.S | |||
DTRSMKERNEL_RT = trsm_kernel_RT_4x4_barcelona.S | |||
CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S | |||
CTRSMKERNEL_LT = ztrsm_kernel_LT_4x2_sse.S | |||
CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S | |||
CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S | |||
ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse2.S | |||
ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse2.S | |||
ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse2.S | |||
ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse2.S | |||
# GEMM3M kernels for the two complex precisions.
CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S | |||
ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S | |
@@ -76,7 +76,7 @@ | |||
#define movsd movlps | |||
#endif | |||
#if defined(BARCELONA) || defined(SHANGHAI) | |||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | |||
#define PREFETCH prefetch | |||
#define PREFETCHW prefetchw | |||
#define PREFETCHSIZE (16 * 16) | |||
@@ -76,7 +76,7 @@ | |||
#define movsd movlpd | |||
#endif | |||
#if defined(BARCELONA) || defined(SHANGHAI) | |||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | |||
#define PREFETCH prefetch | |||
#define PREFETCHW prefetchw | |||
#define PREFETCHSIZE (16 * 16) | |||
@@ -76,7 +76,7 @@ | |||
#define movsd movlps | |||
#endif | |||
#if defined(BARCELONA) || defined(SHANGHAI) | |||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | |||
#define PREFETCH prefetch | |||
#define PREFETCHW prefetchw | |||
#define PREFETCHSIZE (16 * 16) | |||
@@ -76,7 +76,7 @@ | |||
#define movsd movlpd | |||
#endif | |||
#if defined(BARCELONA) || defined(SHANGHAI) | |||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | |||
#define PREFETCH prefetch | |||
#define PREFETCHW prefetchw | |||
#define PREFETCHSIZE (16 * 16) | |||
@@ -160,7 +160,7 @@ | |||
#define a3 %xmm14 | |||
#define xt1 %xmm15 | |||
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) | |||
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | |||
#define MOVDDUP(a, b, c) movddup a(b), c | |||
#define MOVDDUP2(a, b, c) movddup a##b, c | |||
#else | |||
@@ -76,7 +76,7 @@ | |||
#define movsd movlpd | |||
#endif | |||
#if defined(BARCELONA) || defined(SHANGHAI) | |||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | |||
#define PREFETCH prefetch | |||
#define PREFETCHW prefetchw | |||
#define PREFETCHSIZE (16 * 16) | |||
@@ -76,7 +76,7 @@ | |||
#define movsd movlpd | |||
#endif | |||
#if defined(BARCELONA) || defined(SHANGHAI) | |||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | |||
#define PREFETCH prefetch | |||
#define PREFETCHW prefetchw | |||
#define PREFETCHSIZE (16 * 16) | |||
@@ -76,7 +76,7 @@ | |||
#define movsd movlpd | |||
#endif | |||
#if defined(BARCELONA) || defined(SHANGHAI) | |||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | |||
#define PREFETCH prefetch | |||
#define PREFETCHW prefetchw | |||
#define PREFETCHSIZE (16 * 16) | |||
@@ -86,7 +86,7 @@ | |||
#define BORIG 72(%rsp) | |||
#define BUFFER 128(%rsp) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | |||
#define PREFETCH prefetch | |||
#define PREFETCHW prefetchw | |||
#define PREFETCHNTA prefetchnta | |||
@@ -95,7 +95,7 @@ | |||
#define PREFETCHSIZE (8 * 6 + 4) | |||
#endif | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | |||
#define PREFETCH prefetch | |||
#define PREFETCHW prefetchw | |||
#define PREFETCHNTA prefetchnta | |||
@@ -86,7 +86,7 @@ | |||
#define BORIG 72(%rsp) | |||
#define BUFFER 128(%rsp) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | |||
#define PREFETCH prefetch | |||
#define PREFETCHW prefetchw | |||
#define PREFETCHNTA prefetchnta | |||
@@ -95,7 +95,7 @@ | |||
#define PREFETCHSIZE (8 * 6 + 4) | |||
#endif | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | |||
#define PREFETCH prefetch | |||
#define PREFETCHW prefetchw | |||
#define PREFETCHNTA prefetchnta | |||
@@ -86,7 +86,7 @@ | |||
#define BORIG 72(%rsp) | |||
#define BUFFER 128(%rsp) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | |||
#define PREFETCH prefetch | |||
#define PREFETCHW prefetchw | |||
#define PREFETCHNTA prefetchnta | |||
@@ -95,7 +95,7 @@ | |||
#define PREFETCHSIZE (8 * 6 + 4) | |||
#endif | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | |||
#define PREFETCH prefetch | |||
#define PREFETCHW prefetchw | |||
#define PREFETCHNTA prefetchnta | |||
@@ -67,6 +67,13 @@ | |||
#define ALIGNED_ACCESS | |||
#endif | |||
/*
 * Prefetch configuration used when BOBCAT is defined: PREFETCH and PREFETCHW
 * expand to the prefetch / prefetchw mnemonics, the prefetch distance is
 * 128 * 5, and ALIGNED_ACCESS is defined.
 * NOTE(review): the unit of PREFETCHSIZE (bytes vs. elements) depends on how
 * this file scales it at the use sites, which are not visible here -- confirm.
 */
#ifdef BOBCAT | |||
#define PREFETCH prefetch | |||
#define PREFETCHW prefetchw | |||
#define PREFETCHSIZE (128 * 5) | |||
#define ALIGNED_ACCESS | |||
#endif | |||
#ifdef NANO | |||
#define PREFETCH prefetcht0 | |||
#define PREFETCHW prefetcht0 | |||
@@ -85,7 +85,7 @@ | |||
#define movsd movlps | |||
#endif | |||
#if defined(BARCELONA) || defined(SHANGHAI) | |||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | |||
#define ALIGNED_ACCESS | |||
#define MOVUPS_A movaps | |||
#define MOVUPS_XL movaps | |||
@@ -1,5 +1,5 @@ | |||
/***************************************************************************** | |||
Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS | |||
Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
@@ -208,6 +208,68 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#endif | |||
#define SGEMM_DEFAULT_R sgemm_r | |||
#define QGEMM_DEFAULT_R qgemm_r | |||
#define DGEMM_DEFAULT_R dgemm_r | |||
#define CGEMM_DEFAULT_R cgemm_r | |||
#define ZGEMM_DEFAULT_R zgemm_r | |||
#define XGEMM_DEFAULT_R xgemm_r | |||
#define SYMV_P 16 | |||
#define HAVE_EXCLUSIVE_CACHE | |||
#define GEMM_THREAD gemm_thread_mn | |||
#endif | |||
#if defined(BOBCAT) | |||
#define SNUMOPT 8 | |||
#define DNUMOPT 4 | |||
#define GEMM_DEFAULT_OFFSET_A 64 | |||
#define GEMM_DEFAULT_OFFSET_B 832 | |||
#define GEMM_DEFAULT_ALIGN 0x0fffUL | |||
#define SGEMM_DEFAULT_UNROLL_N 4 | |||
#define DGEMM_DEFAULT_UNROLL_N 4 | |||
#define QGEMM_DEFAULT_UNROLL_N 2 | |||
#define CGEMM_DEFAULT_UNROLL_N 2 | |||
#define ZGEMM_DEFAULT_UNROLL_N 2 | |||
#define XGEMM_DEFAULT_UNROLL_N 1 | |||
#ifdef ARCH_X86 | |||
#define SGEMM_DEFAULT_UNROLL_M 4 | |||
#define DGEMM_DEFAULT_UNROLL_M 2 | |||
#define QGEMM_DEFAULT_UNROLL_M 2 | |||
#define CGEMM_DEFAULT_UNROLL_M 2 | |||
#define ZGEMM_DEFAULT_UNROLL_M 1 | |||
#define XGEMM_DEFAULT_UNROLL_M 1 | |||
#else | |||
#define SGEMM_DEFAULT_UNROLL_M 8 | |||
#define DGEMM_DEFAULT_UNROLL_M 4 | |||
#define QGEMM_DEFAULT_UNROLL_M 2 | |||
#define CGEMM_DEFAULT_UNROLL_M 4 | |||
#define ZGEMM_DEFAULT_UNROLL_M 2 | |||
#define XGEMM_DEFAULT_UNROLL_M 1 | |||
#endif | |||
#define SGEMM_DEFAULT_P 448 | |||
#define DGEMM_DEFAULT_P 224 | |||
#define QGEMM_DEFAULT_P 112 | |||
#define CGEMM_DEFAULT_P 224 | |||
#define ZGEMM_DEFAULT_P 112 | |||
#define XGEMM_DEFAULT_P 56 | |||
#define SGEMM_DEFAULT_Q 224 | |||
#define DGEMM_DEFAULT_Q 224 | |||
#define QGEMM_DEFAULT_Q 224 | |||
#define CGEMM_DEFAULT_Q 224 | |||
#define ZGEMM_DEFAULT_Q 224 | |||
#define XGEMM_DEFAULT_Q 224 | |||
#define SGEMM_DEFAULT_R sgemm_r | |||
#define QGEMM_DEFAULT_R qgemm_r | |||
#define DGEMM_DEFAULT_R dgemm_r | |||