@@ -311,14 +311,14 @@ ifeq ($(ARCH), x86) | |||
DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ | |||
CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO | |||
ifneq ($(NO_AVX), 1) | |||
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER | |||
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER | |||
endif | |||
endif | |||
ifeq ($(ARCH), x86_64) | |||
DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO | |||
ifneq ($(NO_AVX), 1) | |||
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER | |||
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER | |||
endif | |||
endif | |||
@@ -48,6 +48,7 @@ Please read GotoBLAS_01Readme.txt | |||
- **Intel Haswell**: Optimized Level-3 BLAS with AVX on x86-64 (identical to Sandy Bridge). | |||
- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes. | |||
- **AMD Bulldozer**: x86-64 S/DGEMM AVX kernels. (Thanks to Werner Saar)
- **AMD Piledriver**: Used Bulldozer codes.
#### MIPS64: | |||
- **ICT Loongson 3A**: Optimized Level-3 BLAS and part of Level-1 and Level-2.
@@ -171,6 +171,11 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ | |||
#define MMXSTORE movd | |||
#endif | |||
// Bulldozer and Piledriver builds define BARCELONA_OPTIMIZATION.
// Kernel sources test this macro alongside OPTERON/BARCELONA/BOBCAT to
// enable their prefetch definitions and prefetch instructions.
#if defined(PILEDRIVER) || defined(BULLDOZER) | |||
#define BARCELONA_OPTIMIZATION | |||
#endif | |||
#if defined(HAVE_3DNOW) | |||
#define EMMS femms | |||
#elif defined(HAVE_MMX) | |||
@@ -218,6 +218,11 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ | |||
#ifdef ASSEMBLER | |||
// Bulldozer and Piledriver builds define BARCELONA_OPTIMIZATION.
// Kernel sources test this macro alongside OPTERON/BARCELONA/BOBCAT to
// enable their prefetch definitions and prefetch instructions.
#if defined(PILEDRIVER) || defined(BULLDOZER) | |||
#define BARCELONA_OPTIMIZATION | |||
#endif | |||
#if defined(HAVE_3DNOW) | |||
#define EMMS femms | |||
#elif defined(HAVE_MMX) | |||
@@ -106,6 +106,7 @@ | |||
#define CORE_SANDYBRIDGE 20 | |||
#define CORE_BOBCAT 21 | |||
#define CORE_BULLDOZER 22 | |||
#define CORE_PILEDRIVER 23 | |||
#define CORE_HASWELL CORE_SANDYBRIDGE | |||
#define HAVE_SSE (1 << 0) | |||
@@ -128,6 +129,7 @@ | |||
#define HAVE_FASTMOVU (1 << 17) | |||
#define HAVE_AVX (1 << 18) | |||
#define HAVE_FMA4 (1 << 19) | |||
#define HAVE_FMA3 (1 << 20) | |||
#define CACHE_INFO_L1_I 1 | |||
#define CACHE_INFO_L1_D 2 | |||
@@ -197,6 +199,7 @@ typedef struct { | |||
#define CPUTYPE_SANDYBRIDGE 44 | |||
#define CPUTYPE_BOBCAT 45 | |||
#define CPUTYPE_BULLDOZER 46 | |||
#define CPUTYPE_PILEDRIVER 47 | |||
// this define is because BLAS doesn't have haswell specific optimizations yet | |||
#define CPUTYPE_HASWELL CPUTYPE_SANDYBRIDGE | |||
@@ -47,6 +47,8 @@ | |||
#define CORE_SANDYBRIDGE CORE_NEHALEM | |||
#define CPUTYPE_BULLDOZER CPUTYPE_BARCELONA | |||
#define CORE_BULLDOZER CORE_BARCELONA | |||
#define CPUTYPE_PILEDRIVER CPUTYPE_BARCELONA | |||
#define CORE_PILEDRIVER CORE_BARCELONA | |||
#endif | |||
#ifndef CPUIDEMU | |||
@@ -228,6 +230,7 @@ int get_cputype(int gettype){ | |||
#ifndef NO_AVX | |||
if (support_avx()) feature |= HAVE_AVX; | |||
#endif | |||
if ((ecx & (1 << 20)) != 0) feature |= HAVE_FMA3; | |||
if (have_excpuid() >= 0x01) { | |||
cpuid(0x80000001, &eax, &ebx, &ecx, &edx); | |||
@@ -1100,11 +1103,21 @@ int get_cpuname(void){ | |||
case 1: | |||
case 10: | |||
return CPUTYPE_BARCELONA; | |||
case 6: //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series | |||
if(support_avx()) | |||
return CPUTYPE_BULLDOZER; | |||
else | |||
return CPUTYPE_BARCELONA; //OS don't support AVX. | |||
case 6: | |||
switch (model) { | |||
case 1: | |||
//AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series | |||
if(support_avx()) | |||
return CPUTYPE_BULLDOZER; | |||
else | |||
return CPUTYPE_BARCELONA; //OS don't support AVX. | |||
case 2: | |||
if(support_avx()) | |||
return CPUTYPE_PILEDRIVER; | |||
else | |||
return CPUTYPE_BARCELONA; //OS don't support AVX. | |||
} | |||
break; | |||
case 5: | |||
return CPUTYPE_BOBCAT; | |||
} | |||
@@ -1229,6 +1242,7 @@ static char *cpuname[] = { | |||
"SANDYBRIDGE", | |||
"BOBCAT", | |||
"BULLDOZER", | |||
"PILEDRIVER", | |||
}; | |||
static char *lowercpuname[] = { | |||
@@ -1278,6 +1292,7 @@ static char *lowercpuname[] = { | |||
"sandybridge", | |||
"bobcat", | |||
"bulldozer", | |||
"piledriver", | |||
}; | |||
static char *corename[] = { | |||
@@ -1304,6 +1319,7 @@ static char *corename[] = { | |||
"SANDYBRIDGE", | |||
"BOBCAT", | |||
"BULLDOZER", | |||
"PILEDRIVER", | |||
}; | |||
static char *corename_lower[] = { | |||
@@ -1330,6 +1346,7 @@ static char *corename_lower[] = { | |||
"sandybridge", | |||
"bobcat", | |||
"bulldozer", | |||
"piledriver", | |||
}; | |||
@@ -1472,11 +1489,19 @@ int get_coretype(void){ | |||
if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON; | |||
else if (exfamily == 5) return CORE_BOBCAT; | |||
else if (exfamily == 6) { | |||
//AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series | |||
if(support_avx()) | |||
return CORE_BULLDOZER; | |||
else | |||
return CORE_BARCELONA; //OS don't support AVX. Use old kernels. | |||
switch (model) { | |||
case 1: | |||
//AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series | |||
if(support_avx()) | |||
return CORE_BULLDOZER; | |||
else | |||
return CORE_BARCELONA; //OS don't support AVX. | |||
case 2: | |||
if(support_avx()) | |||
return CORE_PILEDRIVER; | |||
else | |||
return CORE_BARCELONA; //OS don't support AVX. | |||
} | |||
}else return CORE_BARCELONA; | |||
} | |||
} | |||
@@ -1564,6 +1589,7 @@ void get_cpuconfig(void){ | |||
if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n"); | |||
if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n"); | |||
if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n"); | |||
if (features & HAVE_FMA3 ) printf("#define HAVE_FMA3\n"); | |||
if (features & HAVE_CFLUSH) printf("#define HAVE_CFLUSH\n"); | |||
if (features & HAVE_HIT) printf("#define HAVE_HIT 1\n"); | |||
if (features & HAVE_MISALIGNSSE) printf("#define HAVE_MISALIGNSSE\n"); | |||
@@ -1631,5 +1657,6 @@ void get_sse(void){ | |||
if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n"); | |||
if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n"); | |||
if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n"); | |||
if (features & HAVE_FMA3 ) printf("HAVE_FMA3=1\n"); | |||
} |
@@ -64,10 +64,12 @@ extern gotoblas_t gotoblas_BOBCAT; | |||
// When NO_AVX is not defined, SANDYBRIDGE, BULLDOZER and PILEDRIVER each
// refer to their own externally defined gotoblas_t object.
#ifndef NO_AVX | |||
extern gotoblas_t gotoblas_SANDYBRIDGE; | |||
extern gotoblas_t gotoblas_BULLDOZER; | |||
extern gotoblas_t gotoblas_PILEDRIVER; | |||
#else | |||
// With NO_AVX the three names become aliases of older cores:
// Sandy Bridge uses the NEHALEM object, Bulldozer and Piledriver use
// the BARCELONA object.
#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM | |||
#define gotoblas_BULLDOZER gotoblas_BARCELONA | |||
#define gotoblas_PILEDRIVER gotoblas_BARCELONA | |||
#endif | |||
//Use sandy bridge kernels for haswell. | |||
#define gotoblas_HASWELL gotoblas_SANDYBRIDGE | |||
@@ -228,13 +230,23 @@ static gotoblas_t *get_coretype(void){ | |||
} else if (exfamily == 5) { | |||
return &gotoblas_BOBCAT; | |||
} else if (exfamily == 6) { | |||
//AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series | |||
if(model == 1){ | |||
//AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series | |||
if(support_avx()) | |||
return &gotoblas_BULLDOZER; | |||
else{ | |||
fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n"); | |||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. | |||
} | |||
} | |||
}else if(model == 2){ | |||
//AMD Piledriver Opteron 6300 / Opteron 4300 / Opteron 3300
if(support_avx()) | |||
return &gotoblas_PILEDRIVER; | |||
else{ | |||
fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n"); | |||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. | |||
} | |||
} | |||
} else { | |||
return &gotoblas_BARCELONA; | |||
} | |||
@@ -272,6 +284,7 @@ static char *corename[] = { | |||
"Sandybridge", | |||
"Bobcat", | |||
"Bulldozer", | |||
"Piledriver", | |||
}; | |||
char *gotoblas_corename(void) { | |||
@@ -294,6 +307,7 @@ char *gotoblas_corename(void) { | |||
if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16]; | |||
if (gotoblas == &gotoblas_BOBCAT) return corename[17]; | |||
if (gotoblas == &gotoblas_BULLDOZER) return corename[18]; | |||
if (gotoblas == &gotoblas_PILEDRIVER) return corename[19]; | |||
return corename[0]; | |||
} | |||
@@ -106,6 +106,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
/* #define FORCE_ISTANBUL */ | |||
/* #define FORCE_BOBCAT */ | |||
/* #define FORCE_BULLDOZER */ | |||
/* #define FORCE_PILEDRIVER */ | |||
/* #define FORCE_SSE_GENERIC */ | |||
/* #define FORCE_VIAC3 */ | |||
/* #define FORCE_NANO */ | |||
@@ -398,6 +399,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#define CORENAME "BULLDOZER" | |||
#endif | |||
/*
 * Forced-target configuration for AMD Piledriver, selected by defining
 * FORCE_PILEDRIVER.
 *
 * ARCHCONFIG is one string assembled from adjacent string literals, so
 * every fragment except the last must end with a space. Without it the
 * neighbouring tokens are fused (e.g. "-DHAVE_SSE4_2-DHAVE_SSE4A") and
 * no longer read as two separate -D definitions.
 */
#if defined (FORCE_PILEDRIVER)
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE    "X86"
#define SUBARCHITECTURE "PILEDRIVER"
#define ARCHCONFIG   "-DPILEDRIVER " \
		     "-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=64 " \
		     "-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL3_SIZE=12582912 " \
		     "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
		     "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \
		     "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \
		     "-DHAVE_AVX -DHAVE_FMA4 -DHAVE_FMA3"
#define LIBNAME   "piledriver"
#define CORENAME  "PILEDRIVER"
#endif
#ifdef FORCE_SSE_GENERIC | |||
#define FORCE | |||
#define FORCE_INTEL | |||
@@ -826,6 +826,22 @@ static void init_parameter(void) { | |||
#endif | |||
#endif | |||
#ifdef PILEDRIVER | |||
#ifdef DEBUG | |||
fprintf(stderr, "Piledriver\n"); | |||
#endif | |||
TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; | |||
TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; | |||
TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; | |||
TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; | |||
#ifdef EXPRECISION | |||
TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; | |||
TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; | |||
#endif | |||
#endif | |||
#ifdef NANO | |||
#ifdef DEBUG | |||
@@ -0,0 +1,59 @@ | |||
# Kernel source selection. GEMM and GEMM3M kernels use the *_barcelona.S
# files, TRSM kernels use the *_sse.S / *_sse2.S files, and all copy
# routines come from ../generic.
# NOTE(review): presumably the kernel list added for the new PILEDRIVER
# target - confirm against the file path.

# SGEMM: 4x4 kernel; only the ON/OT copy routines are set, the IN/IT
# entries (and their object names) are left empty.
SGEMMKERNEL = gemm_kernel_4x4_barcelona.S | |||
SGEMMINCOPY = | |||
SGEMMITCOPY = | |||
SGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||
SGEMMINCOPYOBJ = | |||
SGEMMITCOPYOBJ = | |||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
# DGEMM: 2x4 kernel; IN/IT copies use the *_2 generic routines and
# ON/OT copies use the *_4 generic routines.
DGEMMKERNEL = gemm_kernel_2x4_barcelona.S | |||
DGEMMINCOPY = ../generic/gemm_ncopy_2.c | |||
DGEMMITCOPY = ../generic/gemm_tcopy_2.c | |||
DGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
# CGEMM: 2x2 kernel; only the ON/OT copy routines are set, the IN/IT
# entries (and their object names) are left empty.
CGEMMKERNEL = zgemm_kernel_2x2_barcelona.S | |||
CGEMMINCOPY = | |||
CGEMMITCOPY = | |||
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
CGEMMINCOPYOBJ = | |||
CGEMMITCOPYOBJ = | |||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
# ZGEMM: 1x2 kernel; IN/IT copies use the *_1 generic routines and
# ON/OT copies use the *_2 generic routines.
ZGEMMKERNEL = zgemm_kernel_1x2_barcelona.S | |||
ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c | |||
ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c | |||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
# STRSM / DTRSM / CTRSM: the RN variant is assigned the same source file
# as the LT variant.
STRSMKERNEL_LN = trsm_kernel_LN_4x4_sse.S | |||
STRSMKERNEL_LT = trsm_kernel_LT_4x4_sse.S | |||
STRSMKERNEL_RN = trsm_kernel_LT_4x4_sse.S | |||
STRSMKERNEL_RT = trsm_kernel_RT_4x4_sse.S | |||
DTRSMKERNEL_LN = trsm_kernel_LN_2x4_sse2.S | |||
DTRSMKERNEL_LT = trsm_kernel_LT_2x4_sse2.S | |||
DTRSMKERNEL_RN = trsm_kernel_LT_2x4_sse2.S | |||
DTRSMKERNEL_RT = trsm_kernel_RT_2x4_sse2.S | |||
CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse.S | |||
CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse.S | |||
CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse.S | |||
CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse.S | |||
# ZTRSM: LN, LT and RN are all assigned the LT source file; only RT has
# its own file.
ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_sse2.S | |||
ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_sse2.S | |||
ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_sse2.S | |||
ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_sse2.S | |||
# GEMM3M kernels for the C and Z variants.
CGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S | |||
ZGEMM3MKERNEL = zgemm3m_kernel_2x4_barcelona.S |
@@ -69,7 +69,7 @@ | |||
#define STACK_ALIGN 4096 | |||
#define STACK_OFFSET 1024 | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||
#define PREFETCH prefetch | |||
#define PREFETCHSIZE (8 * 10 + 4) | |||
#endif | |||
@@ -439,7 +439,7 @@ | |||
.L22: | |||
mulsd %xmm0, %xmm2 | |||
addsd %xmm2, %xmm4 | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) | |||
#endif | |||
movlpd 2 * SIZE(BB), %xmm2 | |||
@@ -488,7 +488,7 @@ | |||
movlpd 40 * SIZE(BB), %xmm3 | |||
addsd %xmm0, %xmm7 | |||
movlpd 8 * SIZE(AA), %xmm0 | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) | |||
#endif | |||
mulsd %xmm1, %xmm2 | |||
@@ -1697,7 +1697,7 @@ | |||
.L42: | |||
mulpd %xmm0, %xmm2 | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
#endif | |||
mulpd 2 * SIZE(BB), %xmm0 | |||
@@ -1727,7 +1727,7 @@ | |||
addpd %xmm0, %xmm7 | |||
movapd 16 * SIZE(AA), %xmm0 | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) | |||
#endif | |||
mulpd %xmm1, %xmm2 | |||
@@ -64,7 +64,7 @@ | |||
#define BORIG 60(%esp) | |||
#define BUFFER 128(%esp) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||
#define PREFETCH prefetch | |||
#define PREFETCHW prefetchw | |||
#define PREFETCHSIZE (16 * 10 + 8) | |||
@@ -437,7 +437,7 @@ | |||
.L32: | |||
mulss %xmm0, %xmm2 | |||
addss %xmm2, %xmm4 | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
#endif | |||
movss 4 * SIZE(BB), %xmm2 | |||
@@ -833,7 +833,7 @@ | |||
.L22: | |||
mulps %xmm0, %xmm2 | |||
addps %xmm2, %xmm4 | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
#endif | |||
movaps 4 * SIZE(BB), %xmm2 | |||
@@ -1848,7 +1848,7 @@ | |||
.L72: | |||
mulss %xmm0, %xmm2 | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
#endif | |||
mulss 4 * SIZE(BB), %xmm0 | |||
@@ -2109,7 +2109,7 @@ | |||
ALIGN_4 | |||
.L62: | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
#endif | |||
@@ -2429,7 +2429,7 @@ | |||
.L52: | |||
mulps %xmm0, %xmm2 | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
#endif | |||
mulps 4 * SIZE(BB), %xmm0 | |||
@@ -2459,7 +2459,7 @@ | |||
addps %xmm0, %xmm5 | |||
movaps 32 * SIZE(AA), %xmm0 | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | |||
#endif | |||
mulps %xmm1, %xmm2 | |||
@@ -2952,7 +2952,7 @@ | |||
.L112: | |||
mulss %xmm0, %xmm2 | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
#endif | |||
movss 1 * SIZE(AA), %xmm0 | |||
@@ -3148,7 +3148,7 @@ | |||
.L102: | |||
mulps %xmm0, %xmm2 | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
#endif | |||
movsd 2 * SIZE(AA), %xmm0 | |||
@@ -3389,7 +3389,7 @@ | |||
.L92: | |||
mulps %xmm0, %xmm2 | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
#endif | |||
movaps 4 * SIZE(AA), %xmm0 | |||
@@ -3404,7 +3404,7 @@ | |||
mulps 12 * SIZE(BB), %xmm0 | |||
addps %xmm0, %xmm7 | |||
movaps 32 * SIZE(AA), %xmm0 | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | |||
#endif | |||
mulps %xmm1, %xmm3 | |||
@@ -69,7 +69,7 @@ | |||
#define STACK_ALIGN 4096 | |||
#define STACK_OFFSET 1024 | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||
#define PREFETCH prefetch | |||
#define PREFETCHSIZE (8 * 10 + 4) | |||
#endif | |||
@@ -910,7 +910,7 @@ | |||
.L22: | |||
mulsd %xmm0, %xmm2 | |||
addsd %xmm2, %xmm4 | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) | |||
#endif | |||
movlpd 2 * SIZE(BB), %xmm2 | |||
@@ -959,7 +959,7 @@ | |||
movlpd 40 * SIZE(BB), %xmm3 | |||
addsd %xmm0, %xmm7 | |||
movlpd 8 * SIZE(AA), %xmm0 | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) | |||
#endif | |||
mulsd %xmm1, %xmm2 | |||
@@ -1439,7 +1439,7 @@ | |||
.L42: | |||
mulpd %xmm0, %xmm2 | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
#endif | |||
mulpd 2 * SIZE(BB), %xmm0 | |||
@@ -1469,7 +1469,7 @@ | |||
addpd %xmm0, %xmm7 | |||
movapd 16 * SIZE(AA), %xmm0 | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) | |||
#endif | |||
mulpd %xmm1, %xmm2 | |||
@@ -64,7 +64,7 @@ | |||
#define BORIG 60(%esp) | |||
#define BUFFER 128(%esp) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||
#define PREFETCH prefetch | |||
#define PREFETCHW prefetchw | |||
#define PREFETCHSIZE (16 * 10 + 8) | |||
@@ -872,7 +872,7 @@ | |||
.L22: | |||
mulps %xmm0, %xmm2 | |||
addps %xmm2, %xmm4 | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
#endif | |||
movaps 4 * SIZE(BB), %xmm2 | |||
@@ -1316,7 +1316,7 @@ | |||
.L32: | |||
mulss %xmm0, %xmm2 | |||
addss %xmm2, %xmm4 | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
#endif | |||
movss 4 * SIZE(BB), %xmm2 | |||
@@ -1855,7 +1855,7 @@ | |||
.L52: | |||
mulps %xmm0, %xmm2 | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
#endif | |||
mulps 4 * SIZE(BB), %xmm0 | |||
@@ -1885,7 +1885,7 @@ | |||
addps %xmm0, %xmm5 | |||
movaps 32 * SIZE(AA), %xmm0 | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | |||
#endif | |||
mulps %xmm1, %xmm2 | |||
@@ -2249,7 +2249,7 @@ | |||
ALIGN_4 | |||
.L62: | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
#endif | |||
@@ -2562,7 +2562,7 @@ | |||
.L72: | |||
mulss %xmm0, %xmm2 | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
#endif | |||
mulss 4 * SIZE(BB), %xmm0 | |||
@@ -2957,7 +2957,7 @@ | |||
.L92: | |||
mulps %xmm0, %xmm2 | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
#endif | |||
movaps 4 * SIZE(AA), %xmm0 | |||
@@ -2972,7 +2972,7 @@ | |||
mulps 12 * SIZE(BB), %xmm0 | |||
addps %xmm0, %xmm7 | |||
movaps 32 * SIZE(AA), %xmm0 | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | |||
#endif | |||
mulps %xmm1, %xmm3 | |||
@@ -3280,7 +3280,7 @@ | |||
.L102: | |||
mulps %xmm0, %xmm2 | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
#endif | |||
movsd 2 * SIZE(AA), %xmm0 | |||
@@ -3515,7 +3515,7 @@ | |||
.L112: | |||
mulss %xmm0, %xmm2 | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
#endif | |||
movss 1 * SIZE(AA), %xmm0 | |||
@@ -69,7 +69,7 @@ | |||
#define STACK_ALIGN 4096 | |||
#define STACK_OFFSET 1024 | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||
#define PREFETCH prefetch | |||
#define PREFETCHSIZE (8 * 10 + 4) | |||
#endif | |||
@@ -1036,7 +1036,7 @@ | |||
.L42: | |||
mulpd %xmm0, %xmm2 | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
#endif | |||
mulpd 2 * SIZE(BB), %xmm0 | |||
@@ -1066,7 +1066,7 @@ | |||
addpd %xmm0, %xmm7 | |||
movapd 16 * SIZE(AA), %xmm0 | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) | |||
#endif | |||
mulpd %xmm1, %xmm2 | |||
@@ -2224,7 +2224,7 @@ | |||
.L22: | |||
mulsd %xmm0, %xmm2 | |||
addsd %xmm2, %xmm4 | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) | |||
#endif | |||
movlpd 2 * SIZE(BB), %xmm2 | |||
@@ -2273,7 +2273,7 @@ | |||
movlpd 40 * SIZE(BB), %xmm3 | |||
addsd %xmm0, %xmm7 | |||
movlpd 8 * SIZE(AA), %xmm0 | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) | |||
#endif | |||
mulsd %xmm1, %xmm2 | |||
@@ -64,7 +64,7 @@ | |||
#define BORIG 60(%esp) | |||
#define BUFFER 128(%esp) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||
#define PREFETCH prefetch | |||
#define PREFETCHW prefetchw | |||
#define PREFETCHSIZE (16 * 10 + 8) | |||
@@ -439,7 +439,7 @@ | |||
.L92: | |||
mulps %xmm0, %xmm2 | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
#endif | |||
movaps 4 * SIZE(AA), %xmm0 | |||
@@ -454,7 +454,7 @@ | |||
mulps 12 * SIZE(BB), %xmm0 | |||
addps %xmm0, %xmm7 | |||
movaps 32 * SIZE(AA), %xmm0 | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | |||
#endif | |||
mulps %xmm1, %xmm3 | |||
@@ -758,7 +758,7 @@ | |||
.L102: | |||
mulps %xmm0, %xmm2 | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
#endif | |||
movsd 2 * SIZE(AA), %xmm0 | |||
@@ -993,7 +993,7 @@ | |||
.L112: | |||
mulss %xmm0, %xmm2 | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
#endif | |||
movss 1 * SIZE(AA), %xmm0 | |||
@@ -1324,7 +1324,7 @@ | |||
.L52: | |||
mulps %xmm0, %xmm2 | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
#endif | |||
mulps 4 * SIZE(BB), %xmm0 | |||
@@ -1354,7 +1354,7 @@ | |||
addps %xmm0, %xmm5 | |||
movaps 32 * SIZE(AA), %xmm0 | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | |||
#endif | |||
mulps %xmm1, %xmm2 | |||
@@ -1718,7 +1718,7 @@ | |||
ALIGN_4 | |||
.L62: | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
#endif | |||
@@ -2031,7 +2031,7 @@ | |||
.L72: | |||
mulss %xmm0, %xmm2 | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
#endif | |||
mulss 4 * SIZE(BB), %xmm0 | |||
@@ -2859,7 +2859,7 @@ | |||
.L22: | |||
mulps %xmm0, %xmm2 | |||
addps %xmm2, %xmm4 | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
#endif | |||
movaps 4 * SIZE(BB), %xmm2 | |||
@@ -3303,7 +3303,7 @@ | |||
.L32: | |||
mulss %xmm0, %xmm2 | |||
addss %xmm2, %xmm4 | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
#endif | |||
movss 4 * SIZE(BB), %xmm2 | |||
@@ -75,7 +75,7 @@ | |||
#define STACK_ALIGN 4096 | |||
#define STACK_OFFSET 1024 | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||
#define PREFETCHSIZE (16 * 10 + 8) | |||
#define WPREFETCHSIZE 112 | |||
#define PREFETCH prefetch | |||
@@ -533,7 +533,7 @@ | |||
addps %xmm0, %xmm7 | |||
movsd 16 * SIZE(AA), %xmm0 | |||
mulps %xmm1, %xmm2 | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) | |||
#endif | |||
addps %xmm2, %xmm4 | |||
@@ -75,7 +75,7 @@ | |||
#define STACK_ALIGN 4096 | |||
#define STACK_OFFSET 1024 | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||
#define PREFETCHSIZE (16 * 10 + 8) | |||
#define WPREFETCHSIZE 112 | |||
#define PREFETCH prefetch | |||
@@ -994,7 +994,7 @@ | |||
addps %xmm0, %xmm7 | |||
movsd 16 * SIZE(AA), %xmm0 | |||
mulps %xmm1, %xmm2 | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) | |||
#endif | |||
addps %xmm2, %xmm4 | |||
@@ -75,7 +75,7 @@ | |||
#define STACK_ALIGN 4096 | |||
#define STACK_OFFSET 1024 | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||
#define PREFETCHSIZE (16 * 10 + 8) | |||
#define WPREFETCHSIZE 112 | |||
#define PREFETCH prefetch | |||
@@ -1820,7 +1820,7 @@ | |||
addps %xmm0, %xmm7 | |||
movsd 16 * SIZE(AA), %xmm0 | |||
mulps %xmm1, %xmm2 | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) | |||
#endif | |||
addps %xmm2, %xmm4 | |||
@@ -0,0 +1,70 @@ | |||
# Kernel source selection: one variable per BLAS routine family, each naming
# the source file (or object file, for the *OBJ variables) to build.
# NOTE(review): the hunk header marks this as a newly added file; presumably
# it is the PILEDRIVER kernel list -- confirm the file name.

# Complex-double GEMV uses the *_dup.S sources.
ZGEMVNKERNEL = zgemv_n_dup.S | |||
ZGEMVTKERNEL = zgemv_t_dup.S | |||
# Double-precision GEMV, AXPY, DOT and COPY use the *_bulldozer.S sources.
DGEMVNKERNEL = dgemv_n_bulldozer.S | |||
DGEMVTKERNEL = dgemv_t_bulldozer.S | |||
DAXPYKERNEL = daxpy_bulldozer.S | |||
DDOTKERNEL = ddot_bulldozer.S | |||
DCOPYKERNEL = dcopy_bulldozer.S | |||
# SGEMM: 16x2 Bulldozer kernel. The IN/IT copy routines are generic C
# (width 16); the ON/OT copy routines are Bulldozer assembly (width 2).
SGEMMKERNEL = sgemm_kernel_16x2_bulldozer.S | |||
SGEMMINCOPY = ../generic/gemm_ncopy_16.c | |||
SGEMMITCOPY = ../generic/gemm_tcopy_16.c | |||
SGEMMONCOPY = gemm_ncopy_2_bulldozer.S | |||
SGEMMOTCOPY = gemm_tcopy_2_bulldozer.S | |||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
# DGEMM: 8x2 Bulldozer kernel; all four copy routines are Bulldozer assembly.
DGEMMKERNEL = dgemm_kernel_8x2_bulldozer.S | |||
DGEMMINCOPY = dgemm_ncopy_8_bulldozer.S | |||
DGEMMITCOPY = dgemm_tcopy_8_bulldozer.S | |||
DGEMMONCOPY = gemm_ncopy_2_bulldozer.S | |||
DGEMMOTCOPY = gemm_tcopy_2_bulldozer.S | |||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
# CGEMM: 4x2 Bulldozer kernel; all copy routines are the generic C zgemm_*copy
# sources (width 4 for IN/IT, width 2 for ON/OT).
CGEMMKERNEL = cgemm_kernel_4x2_bulldozer.S | |||
CGEMMINCOPY = ../generic/zgemm_ncopy_4.c | |||
CGEMMITCOPY = ../generic/zgemm_tcopy_4.c | |||
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
# ZGEMM: 2x2 Bulldozer kernel. The IN/IT copy sources and objects are left
# empty on purpose; only the ON/OT copies (generic C, width 2) are built.
# NOTE(review): presumably no separate IN/IT copy is needed because the kernel
# is square (2x2) -- confirm against the kernel Makefile rules.
ZGEMMKERNEL = zgemm_kernel_2x2_bulldozer.S | |||
ZGEMMINCOPY = | |||
ZGEMMITCOPY = | |||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
ZGEMMINCOPYOBJ = | |||
ZGEMMITCOPYOBJ = | |||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
# 3M complex GEMM kernels come from the Barcelona sources; both variables
# name zgemm3m_* files (8x4 for CGEMM3M, 4x4 for ZGEMM3M).
CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S | |||
ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S | |||
# Every TRSM variant (S/D/C/Z x LN/LT/RN/RT) uses the generic C kernels.
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
@@ -76,7 +76,7 @@ | |||
#define movsd movlps | |||
#endif | |||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) | |||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||
#define PREFETCH prefetch | |||
#define PREFETCHW prefetchw | |||
#define PREFETCHSIZE (16 * 16) | |||
@@ -76,7 +76,7 @@ | |||
#define movsd movlpd | |||
#endif | |||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) | |||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||
#define PREFETCH prefetch | |||
#define PREFETCHW prefetchw | |||
#define PREFETCHSIZE (16 * 16) | |||
@@ -76,7 +76,7 @@ | |||
#define movsd movlps | |||
#endif | |||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) | |||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||
#define PREFETCH prefetch | |||
#define PREFETCHW prefetchw | |||
#define PREFETCHSIZE (16 * 16) | |||
@@ -76,7 +76,7 @@ | |||
#define movsd movlpd | |||
#endif | |||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) | |||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||
#define PREFETCH prefetch | |||
#define PREFETCHW prefetchw | |||
#define PREFETCHSIZE (16 * 16) | |||
@@ -160,7 +160,7 @@ | |||
#define a3 %xmm14 | |||
#define xt1 %xmm15 | |||
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) | |||
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||
#define MOVDDUP(a, b, c) movddup a(b), c | |||
#define MOVDDUP2(a, b, c) movddup a##b, c | |||
#else | |||
@@ -76,7 +76,7 @@ | |||
#define movsd movlpd | |||
#endif | |||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) | |||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||
#define PREFETCH prefetch | |||
#define PREFETCHW prefetchw | |||
#define PREFETCHSIZE (16 * 16) | |||
@@ -167,7 +167,7 @@ | |||
#define a3 %xmm14 | |||
#define xt1 %xmm15 | |||
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | |||
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BARCELONA_OPTIMIZATION) | |||
#define MOVDDUP(a, b, c) movddup a(b), c | |||
#define MOVDDUP2(a, b, c) movddup a##b, c | |||
#else | |||
@@ -76,7 +76,7 @@ | |||
#define movsd movlpd | |||
#endif | |||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) | |||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||
#define PREFETCH prefetch | |||
#define PREFETCHW prefetchw | |||
#define PREFETCHSIZE (16 * 16) | |||
@@ -166,7 +166,7 @@ | |||
#define xt1 %xmm14 | |||
#define xt2 %xmm15 | |||
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | |||
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BARCELONA_OPTIMIZATION) | |||
#define MOVDDUP(a, b, c) movddup a(b), c | |||
#define MOVDDUP2(a, b, c) movddup a##b, c | |||
#else | |||
@@ -76,7 +76,7 @@ | |||
#define movsd movlpd | |||
#endif | |||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) | |||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||
#define PREFETCH prefetch | |||
#define PREFETCHW prefetchw | |||
#define PREFETCHSIZE (16 * 16) | |||
@@ -166,7 +166,7 @@ | |||
#define a3 %xmm14 | |||
#define xt1 %xmm15 | |||
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | |||
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BARCELONA_OPTIMIZATION) | |||
#define MOVDDUP(a, b, c) movddup a(b), c | |||
#define MOVDDUP2(a, b, c) movddup a##b, c | |||
#else | |||
@@ -85,7 +85,7 @@ | |||
#define movsd movlps | |||
#endif | |||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) | |||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||
#define ALIGNED_ACCESS | |||
#define MOVUPS_A movaps | |||
#define MOVUPS_XL movaps | |||
@@ -234,6 +234,86 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#define QGEMM_DEFAULT_UNROLL_N 2 | |||
#define CGEMM_DEFAULT_UNROLL_N 2 | |||
#define ZGEMM_DEFAULT_UNROLL_N 2 | |||
#define XGEMM_DEFAULT_UNROLL_N 1 | |||
/*
 * Default GEMM unroll factors, selected by target word size.
 *
 * ARCH_X86 branch:  SGEMM M=4 N=4, DGEMM M=2 N=4, QGEMM M=2, CGEMM M=2,
 *                   ZGEMM M=1, XGEMM M=1.
 * Other branch:     SGEMM M=16 N=2, DGEMM M=8 N=2, QGEMM M=2, CGEMM M=4,
 *                   ZGEMM M=2, XGEMM M=1.
 *
 * Only the non-ARCH_X86 branch also defines the CGEMM3M/ZGEMM3M unroll
 * factors and GEMV_UNROLL; the ARCH_X86 branch leaves them undefined here.
 * NOTE(review): these values presumably have to match the dimensions of the
 * kernel sources selected for this core -- confirm before changing either.
 */
#ifdef ARCH_X86 | |||
#define SGEMM_DEFAULT_UNROLL_N 4 | |||
#define DGEMM_DEFAULT_UNROLL_N 4 | |||
#define SGEMM_DEFAULT_UNROLL_M 4 | |||
#define DGEMM_DEFAULT_UNROLL_M 2 | |||
#define QGEMM_DEFAULT_UNROLL_M 2 | |||
#define CGEMM_DEFAULT_UNROLL_M 2 | |||
#define ZGEMM_DEFAULT_UNROLL_M 1 | |||
#define XGEMM_DEFAULT_UNROLL_M 1 | |||
#else | |||
#define SGEMM_DEFAULT_UNROLL_N 2 | |||
#define DGEMM_DEFAULT_UNROLL_N 2 | |||
#define SGEMM_DEFAULT_UNROLL_M 16 | |||
#define DGEMM_DEFAULT_UNROLL_M 8 | |||
#define QGEMM_DEFAULT_UNROLL_M 2 | |||
#define CGEMM_DEFAULT_UNROLL_M 4 | |||
#define ZGEMM_DEFAULT_UNROLL_M 2 | |||
#define XGEMM_DEFAULT_UNROLL_M 1 | |||
#define CGEMM3M_DEFAULT_UNROLL_N 4 | |||
#define CGEMM3M_DEFAULT_UNROLL_M 8 | |||
#define ZGEMM3M_DEFAULT_UNROLL_N 4 | |||
#define ZGEMM3M_DEFAULT_UNROLL_M 4 | |||
#define GEMV_UNROLL 8 | |||
#endif | |||
/*
 * Default P parameter for single/double GEMM: 768/384 when ARCH_X86_64 is
 * defined, 448/224 otherwise. In both branches the DGEMM value is half the
 * SGEMM value.
 * NOTE(review): P is presumably a cache-blocking dimension used by the GEMM
 * driver -- confirm its exact meaning there before retuning.
 */
#if defined(ARCH_X86_64) | |||
#define SGEMM_DEFAULT_P 768 | |||
#define DGEMM_DEFAULT_P 384 | |||
#else | |||
#define SGEMM_DEFAULT_P 448 | |||
#define DGEMM_DEFAULT_P 224 | |||
#endif | |||
#define QGEMM_DEFAULT_P 112 | |||
#define CGEMM_DEFAULT_P 224 | |||
#define ZGEMM_DEFAULT_P 112 | |||
#define XGEMM_DEFAULT_P 56 | |||
/*
 * Default Q parameter for single/double GEMM: 168 for both precisions when
 * ARCH_X86_64 is defined, 224 for both otherwise.
 * NOTE(review): Q is presumably a cache-blocking dimension used by the GEMM
 * driver -- confirm its exact meaning there before retuning.
 */
#if defined(ARCH_X86_64) | |||
#define SGEMM_DEFAULT_Q 168 | |||
#define DGEMM_DEFAULT_Q 168 | |||
#else | |||
#define SGEMM_DEFAULT_Q 224 | |||
#define DGEMM_DEFAULT_Q 224 | |||
#endif | |||
#define QGEMM_DEFAULT_Q 224 | |||
#define CGEMM_DEFAULT_Q 224 | |||
#define ZGEMM_DEFAULT_Q 224 | |||
#define XGEMM_DEFAULT_Q 224 | |||
#define SGEMM_DEFAULT_R sgemm_r | |||
#define QGEMM_DEFAULT_R qgemm_r | |||
#define DGEMM_DEFAULT_R dgemm_r | |||
#define CGEMM_DEFAULT_R cgemm_r | |||
#define ZGEMM_DEFAULT_R zgemm_r | |||
#define XGEMM_DEFAULT_R xgemm_r | |||
#define SYMV_P 16 | |||
#define HAVE_EXCLUSIVE_CACHE | |||
#define GEMM_THREAD gemm_thread_mn | |||
#endif | |||
#ifdef PILEDRIVER | |||
#define SNUMOPT 8 | |||
#define DNUMOPT 4 | |||
#define GEMM_DEFAULT_OFFSET_A 64 | |||
#define GEMM_DEFAULT_OFFSET_B 832 | |||
#define GEMM_DEFAULT_ALIGN 0x0fffUL | |||
#define QGEMM_DEFAULT_UNROLL_N 2 | |||
#define CGEMM_DEFAULT_UNROLL_N 2 | |||
#define ZGEMM_DEFAULT_UNROLL_N 2 | |||