| @@ -311,14 +311,14 @@ ifeq ($(ARCH), x86) | |||||
| DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ | DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ | ||||
| CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO | CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO | ||||
| ifneq ($(NO_AVX), 1) | ifneq ($(NO_AVX), 1) | ||||
| DYNAMIC_CORE += SANDYBRIDGE BULLDOZER | |||||
| DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER | |||||
| endif | endif | ||||
| endif | endif | ||||
| ifeq ($(ARCH), x86_64) | ifeq ($(ARCH), x86_64) | ||||
| DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO | DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO | ||||
| ifneq ($(NO_AVX), 1) | ifneq ($(NO_AVX), 1) | ||||
| DYNAMIC_CORE += SANDYBRIDGE BULLDOZER | |||||
| DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER | |||||
| endif | endif | ||||
| endif | endif | ||||
| @@ -48,6 +48,7 @@ Please read GotoBLAS_01Readme.txt | |||||
| - **Intel Haswell**: Optimized Level-3 BLAS with AVX on x86-64 (identical to Sandy Bridge). | - **Intel Haswell**: Optimized Level-3 BLAS with AVX on x86-64 (identical to Sandy Bridge). | ||||
| - **AMD Bobcat**: Used GotoBLAS2 Barcelona codes. | - **AMD Bobcat**: Used GotoBLAS2 Barcelona codes. | ||||
| - **AMD Bulldozer**: x86-64 S/DGEMM AVX kernels. (Thank Werner Saar) | - **AMD Bulldozer**: x86-64 S/DGEMM AVX kernels. (Thank Werner Saar) | ||||
| - **AMD PILEDRIVER**: Used Bulldozer codes. | |||||
| #### MIPS64: | #### MIPS64: | ||||
| - **ICT Loongson 3A**: Optimized Level-3 BLAS and the part of Level-1,2. | - **ICT Loongson 3A**: Optimized Level-3 BLAS and the part of Level-1,2. | ||||
| @@ -171,6 +171,11 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ | |||||
| #define MMXSTORE movd | #define MMXSTORE movd | ||||
| #endif | #endif | ||||
| #if defined(PILEDRIVER) || defined(BULLDOZER) | |||||
| //Enable some optimazation for barcelona. | |||||
| #define BARCELONA_OPTIMIZATION | |||||
| #endif | |||||
| #if defined(HAVE_3DNOW) | #if defined(HAVE_3DNOW) | ||||
| #define EMMS femms | #define EMMS femms | ||||
| #elif defined(HAVE_MMX) | #elif defined(HAVE_MMX) | ||||
| @@ -218,6 +218,11 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ | |||||
| #ifdef ASSEMBLER | #ifdef ASSEMBLER | ||||
| #if defined(PILEDRIVER) || defined(BULLDOZER) | |||||
| //Enable some optimazation for barcelona. | |||||
| #define BARCELONA_OPTIMIZATION | |||||
| #endif | |||||
| #if defined(HAVE_3DNOW) | #if defined(HAVE_3DNOW) | ||||
| #define EMMS femms | #define EMMS femms | ||||
| #elif defined(HAVE_MMX) | #elif defined(HAVE_MMX) | ||||
| @@ -106,6 +106,7 @@ | |||||
| #define CORE_SANDYBRIDGE 20 | #define CORE_SANDYBRIDGE 20 | ||||
| #define CORE_BOBCAT 21 | #define CORE_BOBCAT 21 | ||||
| #define CORE_BULLDOZER 22 | #define CORE_BULLDOZER 22 | ||||
| #define CORE_PILEDRIVER 23 | |||||
| #define CORE_HASWELL CORE_SANDYBRIDGE | #define CORE_HASWELL CORE_SANDYBRIDGE | ||||
| #define HAVE_SSE (1 << 0) | #define HAVE_SSE (1 << 0) | ||||
| @@ -128,6 +129,7 @@ | |||||
| #define HAVE_FASTMOVU (1 << 17) | #define HAVE_FASTMOVU (1 << 17) | ||||
| #define HAVE_AVX (1 << 18) | #define HAVE_AVX (1 << 18) | ||||
| #define HAVE_FMA4 (1 << 19) | #define HAVE_FMA4 (1 << 19) | ||||
| #define HAVE_FMA3 (1 << 20) | |||||
| #define CACHE_INFO_L1_I 1 | #define CACHE_INFO_L1_I 1 | ||||
| #define CACHE_INFO_L1_D 2 | #define CACHE_INFO_L1_D 2 | ||||
| @@ -197,6 +199,7 @@ typedef struct { | |||||
| #define CPUTYPE_SANDYBRIDGE 44 | #define CPUTYPE_SANDYBRIDGE 44 | ||||
| #define CPUTYPE_BOBCAT 45 | #define CPUTYPE_BOBCAT 45 | ||||
| #define CPUTYPE_BULLDOZER 46 | #define CPUTYPE_BULLDOZER 46 | ||||
| #define CPUTYPE_PILEDRIVER 47 | |||||
| // this define is because BLAS doesn't have haswell specific optimizations yet | // this define is because BLAS doesn't have haswell specific optimizations yet | ||||
| #define CPUTYPE_HASWELL CPUTYPE_SANDYBRIDGE | #define CPUTYPE_HASWELL CPUTYPE_SANDYBRIDGE | ||||
| @@ -47,6 +47,8 @@ | |||||
| #define CORE_SANDYBRIDGE CORE_NEHALEM | #define CORE_SANDYBRIDGE CORE_NEHALEM | ||||
| #define CPUTYPE_BULLDOZER CPUTYPE_BARCELONA | #define CPUTYPE_BULLDOZER CPUTYPE_BARCELONA | ||||
| #define CORE_BULLDOZER CORE_BARCELONA | #define CORE_BULLDOZER CORE_BARCELONA | ||||
| #define CPUTYPE_PILEDRIVER CPUTYPE_BARCELONA | |||||
| #define CORE_PILEDRIVER CORE_BARCELONA | |||||
| #endif | #endif | ||||
| #ifndef CPUIDEMU | #ifndef CPUIDEMU | ||||
| @@ -228,6 +230,7 @@ int get_cputype(int gettype){ | |||||
| #ifndef NO_AVX | #ifndef NO_AVX | ||||
| if (support_avx()) feature |= HAVE_AVX; | if (support_avx()) feature |= HAVE_AVX; | ||||
| #endif | #endif | ||||
| if ((ecx & (1 << 20)) != 0) feature |= HAVE_FMA3; | |||||
| if (have_excpuid() >= 0x01) { | if (have_excpuid() >= 0x01) { | ||||
| cpuid(0x80000001, &eax, &ebx, &ecx, &edx); | cpuid(0x80000001, &eax, &ebx, &ecx, &edx); | ||||
| @@ -1100,11 +1103,21 @@ int get_cpuname(void){ | |||||
| case 1: | case 1: | ||||
| case 10: | case 10: | ||||
| return CPUTYPE_BARCELONA; | return CPUTYPE_BARCELONA; | ||||
| case 6: //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series | |||||
| if(support_avx()) | |||||
| return CPUTYPE_BULLDOZER; | |||||
| else | |||||
| return CPUTYPE_BARCELONA; //OS don't support AVX. | |||||
| case 6: | |||||
| switch (model) { | |||||
| case 1: | |||||
| //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series | |||||
| if(support_avx()) | |||||
| return CPUTYPE_BULLDOZER; | |||||
| else | |||||
| return CPUTYPE_BARCELONA; //OS don't support AVX. | |||||
| case 2: | |||||
| if(support_avx()) | |||||
| return CPUTYPE_PILEDRIVER; | |||||
| else | |||||
| return CPUTYPE_BARCELONA; //OS don't support AVX. | |||||
| } | |||||
| break; | |||||
| case 5: | case 5: | ||||
| return CPUTYPE_BOBCAT; | return CPUTYPE_BOBCAT; | ||||
| } | } | ||||
| @@ -1229,6 +1242,7 @@ static char *cpuname[] = { | |||||
| "SANDYBRIDGE", | "SANDYBRIDGE", | ||||
| "BOBCAT", | "BOBCAT", | ||||
| "BULLDOZER", | "BULLDOZER", | ||||
| "PILEDRIVER", | |||||
| }; | }; | ||||
| static char *lowercpuname[] = { | static char *lowercpuname[] = { | ||||
| @@ -1278,6 +1292,7 @@ static char *lowercpuname[] = { | |||||
| "sandybridge", | "sandybridge", | ||||
| "bobcat", | "bobcat", | ||||
| "bulldozer", | "bulldozer", | ||||
| "piledriver", | |||||
| }; | }; | ||||
| static char *corename[] = { | static char *corename[] = { | ||||
| @@ -1304,6 +1319,7 @@ static char *corename[] = { | |||||
| "SANDYBRIDGE", | "SANDYBRIDGE", | ||||
| "BOBCAT", | "BOBCAT", | ||||
| "BULLDOZER", | "BULLDOZER", | ||||
| "PILEDRIVER", | |||||
| }; | }; | ||||
| static char *corename_lower[] = { | static char *corename_lower[] = { | ||||
| @@ -1330,6 +1346,7 @@ static char *corename_lower[] = { | |||||
| "sandybridge", | "sandybridge", | ||||
| "bobcat", | "bobcat", | ||||
| "bulldozer", | "bulldozer", | ||||
| "piledriver", | |||||
| }; | }; | ||||
| @@ -1472,11 +1489,19 @@ int get_coretype(void){ | |||||
| if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON; | if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON; | ||||
| else if (exfamily == 5) return CORE_BOBCAT; | else if (exfamily == 5) return CORE_BOBCAT; | ||||
| else if (exfamily == 6) { | else if (exfamily == 6) { | ||||
| //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series | |||||
| if(support_avx()) | |||||
| return CORE_BULLDOZER; | |||||
| else | |||||
| return CORE_BARCELONA; //OS don't support AVX. Use old kernels. | |||||
| switch (model) { | |||||
| case 1: | |||||
| //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series | |||||
| if(support_avx()) | |||||
| return CORE_BULLDOZER; | |||||
| else | |||||
| return CORE_BARCELONA; //OS don't support AVX. | |||||
| case 2: | |||||
| if(support_avx()) | |||||
| return CORE_PILEDRIVER; | |||||
| else | |||||
| return CORE_BARCELONA; //OS don't support AVX. | |||||
| } | |||||
| }else return CORE_BARCELONA; | }else return CORE_BARCELONA; | ||||
| } | } | ||||
| } | } | ||||
| @@ -1564,6 +1589,7 @@ void get_cpuconfig(void){ | |||||
| if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n"); | if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n"); | ||||
| if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n"); | if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n"); | ||||
| if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n"); | if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n"); | ||||
| if (features & HAVE_FMA3 ) printf("#define HAVE_FMA3\n"); | |||||
| if (features & HAVE_CFLUSH) printf("#define HAVE_CFLUSH\n"); | if (features & HAVE_CFLUSH) printf("#define HAVE_CFLUSH\n"); | ||||
| if (features & HAVE_HIT) printf("#define HAVE_HIT 1\n"); | if (features & HAVE_HIT) printf("#define HAVE_HIT 1\n"); | ||||
| if (features & HAVE_MISALIGNSSE) printf("#define HAVE_MISALIGNSSE\n"); | if (features & HAVE_MISALIGNSSE) printf("#define HAVE_MISALIGNSSE\n"); | ||||
| @@ -1631,5 +1657,6 @@ void get_sse(void){ | |||||
| if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n"); | if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n"); | ||||
| if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n"); | if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n"); | ||||
| if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n"); | if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n"); | ||||
| if (features & HAVE_FMA3 ) printf("HAVE_FMA3=1\n"); | |||||
| } | } | ||||
| @@ -64,10 +64,12 @@ extern gotoblas_t gotoblas_BOBCAT; | |||||
| #ifndef NO_AVX | #ifndef NO_AVX | ||||
| extern gotoblas_t gotoblas_SANDYBRIDGE; | extern gotoblas_t gotoblas_SANDYBRIDGE; | ||||
| extern gotoblas_t gotoblas_BULLDOZER; | extern gotoblas_t gotoblas_BULLDOZER; | ||||
| extern gotoblas_t gotoblas_PILEDRIVER; | |||||
| #else | #else | ||||
| //Use NEHALEM kernels for sandy bridge | //Use NEHALEM kernels for sandy bridge | ||||
| #define gotoblas_SANDYBRIDGE gotoblas_NEHALEM | #define gotoblas_SANDYBRIDGE gotoblas_NEHALEM | ||||
| #define gotoblas_BULLDOZER gotoblas_BARCELONA | #define gotoblas_BULLDOZER gotoblas_BARCELONA | ||||
| #define gotoblas_PILEDRIVER gotoblas_BARCELONA | |||||
| #endif | #endif | ||||
| //Use sandy bridge kernels for haswell. | //Use sandy bridge kernels for haswell. | ||||
| #define gotoblas_HASWELL gotoblas_SANDYBRIDGE | #define gotoblas_HASWELL gotoblas_SANDYBRIDGE | ||||
| @@ -228,13 +230,23 @@ static gotoblas_t *get_coretype(void){ | |||||
| } else if (exfamily == 5) { | } else if (exfamily == 5) { | ||||
| return &gotoblas_BOBCAT; | return &gotoblas_BOBCAT; | ||||
| } else if (exfamily == 6) { | } else if (exfamily == 6) { | ||||
| //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series | |||||
| if(model == 1){ | |||||
| //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series | |||||
| if(support_avx()) | if(support_avx()) | ||||
| return &gotoblas_BULLDOZER; | return &gotoblas_BULLDOZER; | ||||
| else{ | else{ | ||||
| fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n"); | fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n"); | ||||
| return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. | return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. | ||||
| } | |||||
| } | |||||
| }else if(model == 2){ | |||||
| //AMD Bulldozer Opteron 6300 / Opteron 4300 / Opteron 3300 | |||||
| if(support_avx()) | |||||
| return &gotoblas_PILEDRIVER; | |||||
| else{ | |||||
| fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n"); | |||||
| return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. | |||||
| } | |||||
| } | |||||
| } else { | } else { | ||||
| return &gotoblas_BARCELONA; | return &gotoblas_BARCELONA; | ||||
| } | } | ||||
| @@ -272,6 +284,7 @@ static char *corename[] = { | |||||
| "Sandybridge", | "Sandybridge", | ||||
| "Bobcat", | "Bobcat", | ||||
| "Bulldozer", | "Bulldozer", | ||||
| "Piledriver", | |||||
| }; | }; | ||||
| char *gotoblas_corename(void) { | char *gotoblas_corename(void) { | ||||
| @@ -294,6 +307,7 @@ char *gotoblas_corename(void) { | |||||
| if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16]; | if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16]; | ||||
| if (gotoblas == &gotoblas_BOBCAT) return corename[17]; | if (gotoblas == &gotoblas_BOBCAT) return corename[17]; | ||||
| if (gotoblas == &gotoblas_BULLDOZER) return corename[18]; | if (gotoblas == &gotoblas_BULLDOZER) return corename[18]; | ||||
| if (gotoblas == &gotoblas_PILEDRIVER) return corename[19]; | |||||
| return corename[0]; | return corename[0]; | ||||
| } | } | ||||
| @@ -106,6 +106,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| /* #define FORCE_ISTANBUL */ | /* #define FORCE_ISTANBUL */ | ||||
| /* #define FORCE_BOBCAT */ | /* #define FORCE_BOBCAT */ | ||||
| /* #define FORCE_BULLDOZER */ | /* #define FORCE_BULLDOZER */ | ||||
| /* #define FORCE_PILEDRIVER */ | |||||
| /* #define FORCE_SSE_GENERIC */ | /* #define FORCE_SSE_GENERIC */ | ||||
| /* #define FORCE_VIAC3 */ | /* #define FORCE_VIAC3 */ | ||||
| /* #define FORCE_NANO */ | /* #define FORCE_NANO */ | ||||
| @@ -398,6 +399,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define CORENAME "BULLDOZER" | #define CORENAME "BULLDOZER" | ||||
| #endif | #endif | ||||
| #if defined (FORCE_PILEDRIVER) | |||||
| #define FORCE | |||||
| #define FORCE_INTEL | |||||
| #define ARCHITECTURE "X86" | |||||
| #define SUBARCHITECTURE "PILEDRIVER" | |||||
| #define ARCHCONFIG "-DPILEDRIVER " \ | |||||
| "-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=64 " \ | |||||
| "-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL3_SIZE=12582912 " \ | |||||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||||
| "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2" \ | |||||
| "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH" \ | |||||
| "-DHAVE_AVX -DHAVE_FMA4 -DHAVE_FMA3" | |||||
| #define LIBNAME "piledriver" | |||||
| #define CORENAME "PILEDRIVER" | |||||
| #endif | |||||
| #ifdef FORCE_SSE_GENERIC | #ifdef FORCE_SSE_GENERIC | ||||
| #define FORCE | #define FORCE | ||||
| #define FORCE_INTEL | #define FORCE_INTEL | ||||
| @@ -826,6 +826,22 @@ static void init_parameter(void) { | |||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| #ifdef PILEDRIVER | |||||
| #ifdef DEBUG | |||||
| fprintf(stderr, "Piledriver\n"); | |||||
| #endif | |||||
| TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; | |||||
| TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; | |||||
| TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; | |||||
| TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; | |||||
| #ifdef EXPRECISION | |||||
| TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; | |||||
| TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; | |||||
| #endif | |||||
| #endif | |||||
| #ifdef NANO | #ifdef NANO | ||||
| #ifdef DEBUG | #ifdef DEBUG | ||||
| @@ -0,0 +1,59 @@ | |||||
| SGEMMKERNEL = gemm_kernel_4x4_barcelona.S | |||||
| SGEMMINCOPY = | |||||
| SGEMMITCOPY = | |||||
| SGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||||
| SGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||||
| SGEMMINCOPYOBJ = | |||||
| SGEMMITCOPYOBJ = | |||||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMKERNEL = gemm_kernel_2x4_barcelona.S | |||||
| DGEMMINCOPY = ../generic/gemm_ncopy_2.c | |||||
| DGEMMITCOPY = ../generic/gemm_tcopy_2.c | |||||
| DGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||||
| DGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMKERNEL = zgemm_kernel_2x2_barcelona.S | |||||
| CGEMMINCOPY = | |||||
| CGEMMITCOPY = | |||||
| CGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||||
| CGEMMINCOPYOBJ = | |||||
| CGEMMITCOPYOBJ = | |||||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMKERNEL = zgemm_kernel_1x2_barcelona.S | |||||
| ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c | |||||
| ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c | |||||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| STRSMKERNEL_LN = trsm_kernel_LN_4x4_sse.S | |||||
| STRSMKERNEL_LT = trsm_kernel_LT_4x4_sse.S | |||||
| STRSMKERNEL_RN = trsm_kernel_LT_4x4_sse.S | |||||
| STRSMKERNEL_RT = trsm_kernel_RT_4x4_sse.S | |||||
| DTRSMKERNEL_LN = trsm_kernel_LN_2x4_sse2.S | |||||
| DTRSMKERNEL_LT = trsm_kernel_LT_2x4_sse2.S | |||||
| DTRSMKERNEL_RN = trsm_kernel_LT_2x4_sse2.S | |||||
| DTRSMKERNEL_RT = trsm_kernel_RT_2x4_sse2.S | |||||
| CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse.S | |||||
| CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse.S | |||||
| CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse.S | |||||
| CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse.S | |||||
| ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_sse2.S | |||||
| ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_sse2.S | |||||
| ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_sse2.S | |||||
| ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_sse2.S | |||||
| CGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S | |||||
| ZGEMM3MKERNEL = zgemm3m_kernel_2x4_barcelona.S | |||||
| @@ -69,7 +69,7 @@ | |||||
| #define STACK_ALIGN 4096 | #define STACK_ALIGN 4096 | ||||
| #define STACK_OFFSET 1024 | #define STACK_OFFSET 1024 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHSIZE (8 * 10 + 4) | #define PREFETCHSIZE (8 * 10 + 4) | ||||
| #endif | #endif | ||||
| @@ -439,7 +439,7 @@ | |||||
| .L22: | .L22: | ||||
| mulsd %xmm0, %xmm2 | mulsd %xmm0, %xmm2 | ||||
| addsd %xmm2, %xmm4 | addsd %xmm2, %xmm4 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) | PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movlpd 2 * SIZE(BB), %xmm2 | movlpd 2 * SIZE(BB), %xmm2 | ||||
| @@ -488,7 +488,7 @@ | |||||
| movlpd 40 * SIZE(BB), %xmm3 | movlpd 40 * SIZE(BB), %xmm3 | ||||
| addsd %xmm0, %xmm7 | addsd %xmm0, %xmm7 | ||||
| movlpd 8 * SIZE(AA), %xmm0 | movlpd 8 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) | PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulsd %xmm1, %xmm2 | mulsd %xmm1, %xmm2 | ||||
| @@ -1697,7 +1697,7 @@ | |||||
| .L42: | .L42: | ||||
| mulpd %xmm0, %xmm2 | mulpd %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulpd 2 * SIZE(BB), %xmm0 | mulpd 2 * SIZE(BB), %xmm0 | ||||
| @@ -1727,7 +1727,7 @@ | |||||
| addpd %xmm0, %xmm7 | addpd %xmm0, %xmm7 | ||||
| movapd 16 * SIZE(AA), %xmm0 | movapd 16 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulpd %xmm1, %xmm2 | mulpd %xmm1, %xmm2 | ||||
| @@ -64,7 +64,7 @@ | |||||
| #define BORIG 60(%esp) | #define BORIG 60(%esp) | ||||
| #define BUFFER 128(%esp) | #define BUFFER 128(%esp) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
| #define PREFETCHSIZE (16 * 10 + 8) | #define PREFETCHSIZE (16 * 10 + 8) | ||||
| @@ -437,7 +437,7 @@ | |||||
| .L32: | .L32: | ||||
| mulss %xmm0, %xmm2 | mulss %xmm0, %xmm2 | ||||
| addss %xmm2, %xmm4 | addss %xmm2, %xmm4 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movss 4 * SIZE(BB), %xmm2 | movss 4 * SIZE(BB), %xmm2 | ||||
| @@ -833,7 +833,7 @@ | |||||
| .L22: | .L22: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| addps %xmm2, %xmm4 | addps %xmm2, %xmm4 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movaps 4 * SIZE(BB), %xmm2 | movaps 4 * SIZE(BB), %xmm2 | ||||
| @@ -1848,7 +1848,7 @@ | |||||
| .L72: | .L72: | ||||
| mulss %xmm0, %xmm2 | mulss %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulss 4 * SIZE(BB), %xmm0 | mulss 4 * SIZE(BB), %xmm0 | ||||
| @@ -2109,7 +2109,7 @@ | |||||
| ALIGN_4 | ALIGN_4 | ||||
| .L62: | .L62: | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| @@ -2429,7 +2429,7 @@ | |||||
| .L52: | .L52: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulps 4 * SIZE(BB), %xmm0 | mulps 4 * SIZE(BB), %xmm0 | ||||
| @@ -2459,7 +2459,7 @@ | |||||
| addps %xmm0, %xmm5 | addps %xmm0, %xmm5 | ||||
| movaps 32 * SIZE(AA), %xmm0 | movaps 32 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulps %xmm1, %xmm2 | mulps %xmm1, %xmm2 | ||||
| @@ -2952,7 +2952,7 @@ | |||||
| .L112: | .L112: | ||||
| mulss %xmm0, %xmm2 | mulss %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movss 1 * SIZE(AA), %xmm0 | movss 1 * SIZE(AA), %xmm0 | ||||
| @@ -3148,7 +3148,7 @@ | |||||
| .L102: | .L102: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movsd 2 * SIZE(AA), %xmm0 | movsd 2 * SIZE(AA), %xmm0 | ||||
| @@ -3389,7 +3389,7 @@ | |||||
| .L92: | .L92: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movaps 4 * SIZE(AA), %xmm0 | movaps 4 * SIZE(AA), %xmm0 | ||||
| @@ -3404,7 +3404,7 @@ | |||||
| mulps 12 * SIZE(BB), %xmm0 | mulps 12 * SIZE(BB), %xmm0 | ||||
| addps %xmm0, %xmm7 | addps %xmm0, %xmm7 | ||||
| movaps 32 * SIZE(AA), %xmm0 | movaps 32 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulps %xmm1, %xmm3 | mulps %xmm1, %xmm3 | ||||
| @@ -69,7 +69,7 @@ | |||||
| #define STACK_ALIGN 4096 | #define STACK_ALIGN 4096 | ||||
| #define STACK_OFFSET 1024 | #define STACK_OFFSET 1024 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHSIZE (8 * 10 + 4) | #define PREFETCHSIZE (8 * 10 + 4) | ||||
| #endif | #endif | ||||
| @@ -910,7 +910,7 @@ | |||||
| .L22: | .L22: | ||||
| mulsd %xmm0, %xmm2 | mulsd %xmm0, %xmm2 | ||||
| addsd %xmm2, %xmm4 | addsd %xmm2, %xmm4 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) | PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movlpd 2 * SIZE(BB), %xmm2 | movlpd 2 * SIZE(BB), %xmm2 | ||||
| @@ -959,7 +959,7 @@ | |||||
| movlpd 40 * SIZE(BB), %xmm3 | movlpd 40 * SIZE(BB), %xmm3 | ||||
| addsd %xmm0, %xmm7 | addsd %xmm0, %xmm7 | ||||
| movlpd 8 * SIZE(AA), %xmm0 | movlpd 8 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) | PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulsd %xmm1, %xmm2 | mulsd %xmm1, %xmm2 | ||||
| @@ -1439,7 +1439,7 @@ | |||||
| .L42: | .L42: | ||||
| mulpd %xmm0, %xmm2 | mulpd %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulpd 2 * SIZE(BB), %xmm0 | mulpd 2 * SIZE(BB), %xmm0 | ||||
| @@ -1469,7 +1469,7 @@ | |||||
| addpd %xmm0, %xmm7 | addpd %xmm0, %xmm7 | ||||
| movapd 16 * SIZE(AA), %xmm0 | movapd 16 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulpd %xmm1, %xmm2 | mulpd %xmm1, %xmm2 | ||||
| @@ -64,7 +64,7 @@ | |||||
| #define BORIG 60(%esp) | #define BORIG 60(%esp) | ||||
| #define BUFFER 128(%esp) | #define BUFFER 128(%esp) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
| #define PREFETCHSIZE (16 * 10 + 8) | #define PREFETCHSIZE (16 * 10 + 8) | ||||
| @@ -872,7 +872,7 @@ | |||||
| .L22: | .L22: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| addps %xmm2, %xmm4 | addps %xmm2, %xmm4 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movaps 4 * SIZE(BB), %xmm2 | movaps 4 * SIZE(BB), %xmm2 | ||||
| @@ -1316,7 +1316,7 @@ | |||||
| .L32: | .L32: | ||||
| mulss %xmm0, %xmm2 | mulss %xmm0, %xmm2 | ||||
| addss %xmm2, %xmm4 | addss %xmm2, %xmm4 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movss 4 * SIZE(BB), %xmm2 | movss 4 * SIZE(BB), %xmm2 | ||||
| @@ -1855,7 +1855,7 @@ | |||||
| .L52: | .L52: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulps 4 * SIZE(BB), %xmm0 | mulps 4 * SIZE(BB), %xmm0 | ||||
| @@ -1885,7 +1885,7 @@ | |||||
| addps %xmm0, %xmm5 | addps %xmm0, %xmm5 | ||||
| movaps 32 * SIZE(AA), %xmm0 | movaps 32 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulps %xmm1, %xmm2 | mulps %xmm1, %xmm2 | ||||
| @@ -2249,7 +2249,7 @@ | |||||
| ALIGN_4 | ALIGN_4 | ||||
| .L62: | .L62: | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| @@ -2562,7 +2562,7 @@ | |||||
| .L72: | .L72: | ||||
| mulss %xmm0, %xmm2 | mulss %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulss 4 * SIZE(BB), %xmm0 | mulss 4 * SIZE(BB), %xmm0 | ||||
| @@ -2957,7 +2957,7 @@ | |||||
| .L92: | .L92: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movaps 4 * SIZE(AA), %xmm0 | movaps 4 * SIZE(AA), %xmm0 | ||||
| @@ -2972,7 +2972,7 @@ | |||||
| mulps 12 * SIZE(BB), %xmm0 | mulps 12 * SIZE(BB), %xmm0 | ||||
| addps %xmm0, %xmm7 | addps %xmm0, %xmm7 | ||||
| movaps 32 * SIZE(AA), %xmm0 | movaps 32 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulps %xmm1, %xmm3 | mulps %xmm1, %xmm3 | ||||
| @@ -3280,7 +3280,7 @@ | |||||
| .L102: | .L102: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movsd 2 * SIZE(AA), %xmm0 | movsd 2 * SIZE(AA), %xmm0 | ||||
| @@ -3515,7 +3515,7 @@ | |||||
| .L112: | .L112: | ||||
| mulss %xmm0, %xmm2 | mulss %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movss 1 * SIZE(AA), %xmm0 | movss 1 * SIZE(AA), %xmm0 | ||||
| @@ -69,7 +69,7 @@ | |||||
| #define STACK_ALIGN 4096 | #define STACK_ALIGN 4096 | ||||
| #define STACK_OFFSET 1024 | #define STACK_OFFSET 1024 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHSIZE (8 * 10 + 4) | #define PREFETCHSIZE (8 * 10 + 4) | ||||
| #endif | #endif | ||||
| @@ -1036,7 +1036,7 @@ | |||||
| .L42: | .L42: | ||||
| mulpd %xmm0, %xmm2 | mulpd %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulpd 2 * SIZE(BB), %xmm0 | mulpd 2 * SIZE(BB), %xmm0 | ||||
| @@ -1066,7 +1066,7 @@ | |||||
| addpd %xmm0, %xmm7 | addpd %xmm0, %xmm7 | ||||
| movapd 16 * SIZE(AA), %xmm0 | movapd 16 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulpd %xmm1, %xmm2 | mulpd %xmm1, %xmm2 | ||||
| @@ -2224,7 +2224,7 @@ | |||||
| .L22: | .L22: | ||||
| mulsd %xmm0, %xmm2 | mulsd %xmm0, %xmm2 | ||||
| addsd %xmm2, %xmm4 | addsd %xmm2, %xmm4 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) | PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movlpd 2 * SIZE(BB), %xmm2 | movlpd 2 * SIZE(BB), %xmm2 | ||||
| @@ -2273,7 +2273,7 @@ | |||||
| movlpd 40 * SIZE(BB), %xmm3 | movlpd 40 * SIZE(BB), %xmm3 | ||||
| addsd %xmm0, %xmm7 | addsd %xmm0, %xmm7 | ||||
| movlpd 8 * SIZE(AA), %xmm0 | movlpd 8 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) | PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulsd %xmm1, %xmm2 | mulsd %xmm1, %xmm2 | ||||
| @@ -64,7 +64,7 @@ | |||||
| #define BORIG 60(%esp) | #define BORIG 60(%esp) | ||||
| #define BUFFER 128(%esp) | #define BUFFER 128(%esp) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
| #define PREFETCHSIZE (16 * 10 + 8) | #define PREFETCHSIZE (16 * 10 + 8) | ||||
| @@ -439,7 +439,7 @@ | |||||
| .L92: | .L92: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movaps 4 * SIZE(AA), %xmm0 | movaps 4 * SIZE(AA), %xmm0 | ||||
| @@ -454,7 +454,7 @@ | |||||
| mulps 12 * SIZE(BB), %xmm0 | mulps 12 * SIZE(BB), %xmm0 | ||||
| addps %xmm0, %xmm7 | addps %xmm0, %xmm7 | ||||
| movaps 32 * SIZE(AA), %xmm0 | movaps 32 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulps %xmm1, %xmm3 | mulps %xmm1, %xmm3 | ||||
| @@ -758,7 +758,7 @@ | |||||
| .L102: | .L102: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movsd 2 * SIZE(AA), %xmm0 | movsd 2 * SIZE(AA), %xmm0 | ||||
| @@ -993,7 +993,7 @@ | |||||
| .L112: | .L112: | ||||
| mulss %xmm0, %xmm2 | mulss %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movss 1 * SIZE(AA), %xmm0 | movss 1 * SIZE(AA), %xmm0 | ||||
| @@ -1324,7 +1324,7 @@ | |||||
| .L52: | .L52: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulps 4 * SIZE(BB), %xmm0 | mulps 4 * SIZE(BB), %xmm0 | ||||
| @@ -1354,7 +1354,7 @@ | |||||
| addps %xmm0, %xmm5 | addps %xmm0, %xmm5 | ||||
| movaps 32 * SIZE(AA), %xmm0 | movaps 32 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulps %xmm1, %xmm2 | mulps %xmm1, %xmm2 | ||||
| @@ -1718,7 +1718,7 @@ | |||||
| ALIGN_4 | ALIGN_4 | ||||
| .L62: | .L62: | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| @@ -2031,7 +2031,7 @@ | |||||
| .L72: | .L72: | ||||
| mulss %xmm0, %xmm2 | mulss %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulss 4 * SIZE(BB), %xmm0 | mulss 4 * SIZE(BB), %xmm0 | ||||
| @@ -2859,7 +2859,7 @@ | |||||
| .L22: | .L22: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| addps %xmm2, %xmm4 | addps %xmm2, %xmm4 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movaps 4 * SIZE(BB), %xmm2 | movaps 4 * SIZE(BB), %xmm2 | ||||
| @@ -3303,7 +3303,7 @@ | |||||
| .L32: | .L32: | ||||
| mulss %xmm0, %xmm2 | mulss %xmm0, %xmm2 | ||||
| addss %xmm2, %xmm4 | addss %xmm2, %xmm4 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movss 4 * SIZE(BB), %xmm2 | movss 4 * SIZE(BB), %xmm2 | ||||
| @@ -75,7 +75,7 @@ | |||||
| #define STACK_ALIGN 4096 | #define STACK_ALIGN 4096 | ||||
| #define STACK_OFFSET 1024 | #define STACK_OFFSET 1024 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| #define PREFETCHSIZE (16 * 10 + 8) | #define PREFETCHSIZE (16 * 10 + 8) | ||||
| #define WPREFETCHSIZE 112 | #define WPREFETCHSIZE 112 | ||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| @@ -533,7 +533,7 @@ | |||||
| addps %xmm0, %xmm7 | addps %xmm0, %xmm7 | ||||
| movsd 16 * SIZE(AA), %xmm0 | movsd 16 * SIZE(AA), %xmm0 | ||||
| mulps %xmm1, %xmm2 | mulps %xmm1, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) | prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) | ||||
| #endif | #endif | ||||
| addps %xmm2, %xmm4 | addps %xmm2, %xmm4 | ||||
| @@ -75,7 +75,7 @@ | |||||
| #define STACK_ALIGN 4096 | #define STACK_ALIGN 4096 | ||||
| #define STACK_OFFSET 1024 | #define STACK_OFFSET 1024 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| #define PREFETCHSIZE (16 * 10 + 8) | #define PREFETCHSIZE (16 * 10 + 8) | ||||
| #define WPREFETCHSIZE 112 | #define WPREFETCHSIZE 112 | ||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| @@ -994,7 +994,7 @@ | |||||
| addps %xmm0, %xmm7 | addps %xmm0, %xmm7 | ||||
| movsd 16 * SIZE(AA), %xmm0 | movsd 16 * SIZE(AA), %xmm0 | ||||
| mulps %xmm1, %xmm2 | mulps %xmm1, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) | prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) | ||||
| #endif | #endif | ||||
| addps %xmm2, %xmm4 | addps %xmm2, %xmm4 | ||||
| @@ -75,7 +75,7 @@ | |||||
| #define STACK_ALIGN 4096 | #define STACK_ALIGN 4096 | ||||
| #define STACK_OFFSET 1024 | #define STACK_OFFSET 1024 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| #define PREFETCHSIZE (16 * 10 + 8) | #define PREFETCHSIZE (16 * 10 + 8) | ||||
| #define WPREFETCHSIZE 112 | #define WPREFETCHSIZE 112 | ||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| @@ -1820,7 +1820,7 @@ | |||||
| addps %xmm0, %xmm7 | addps %xmm0, %xmm7 | ||||
| movsd 16 * SIZE(AA), %xmm0 | movsd 16 * SIZE(AA), %xmm0 | ||||
| mulps %xmm1, %xmm2 | mulps %xmm1, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) | prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) | ||||
| #endif | #endif | ||||
| addps %xmm2, %xmm4 | addps %xmm2, %xmm4 | ||||
| @@ -0,0 +1,70 @@ | |||||
| ZGEMVNKERNEL = zgemv_n_dup.S | |||||
| ZGEMVTKERNEL = zgemv_t_dup.S | |||||
| DGEMVNKERNEL = dgemv_n_bulldozer.S | |||||
| DGEMVTKERNEL = dgemv_t_bulldozer.S | |||||
| DAXPYKERNEL = daxpy_bulldozer.S | |||||
| DDOTKERNEL = ddot_bulldozer.S | |||||
| DCOPYKERNEL = dcopy_bulldozer.S | |||||
| SGEMMKERNEL = sgemm_kernel_16x2_bulldozer.S | |||||
| SGEMMINCOPY = ../generic/gemm_ncopy_16.c | |||||
| SGEMMITCOPY = ../generic/gemm_tcopy_16.c | |||||
| SGEMMONCOPY = gemm_ncopy_2_bulldozer.S | |||||
| SGEMMOTCOPY = gemm_tcopy_2_bulldozer.S | |||||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMKERNEL = dgemm_kernel_8x2_bulldozer.S | |||||
| DGEMMINCOPY = dgemm_ncopy_8_bulldozer.S | |||||
| DGEMMITCOPY = dgemm_tcopy_8_bulldozer.S | |||||
| DGEMMONCOPY = gemm_ncopy_2_bulldozer.S | |||||
| DGEMMOTCOPY = gemm_tcopy_2_bulldozer.S | |||||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMKERNEL = cgemm_kernel_4x2_bulldozer.S | |||||
| CGEMMINCOPY = ../generic/zgemm_ncopy_4.c | |||||
| CGEMMITCOPY = ../generic/zgemm_tcopy_4.c | |||||
| CGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMKERNEL = zgemm_kernel_2x2_bulldozer.S | |||||
| ZGEMMINCOPY = | |||||
| ZGEMMITCOPY = | |||||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||||
| ZGEMMINCOPYOBJ = | |||||
| ZGEMMITCOPYOBJ = | |||||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S | |||||
| ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S | |||||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| @@ -76,7 +76,7 @@ | |||||
| #define movsd movlps | #define movsd movlps | ||||
| #endif | #endif | ||||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
| #define PREFETCHSIZE (16 * 16) | #define PREFETCHSIZE (16 * 16) | ||||
| @@ -76,7 +76,7 @@ | |||||
| #define movsd movlpd | #define movsd movlpd | ||||
| #endif | #endif | ||||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
| #define PREFETCHSIZE (16 * 16) | #define PREFETCHSIZE (16 * 16) | ||||
| @@ -76,7 +76,7 @@ | |||||
| #define movsd movlps | #define movsd movlps | ||||
| #endif | #endif | ||||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
| #define PREFETCHSIZE (16 * 16) | #define PREFETCHSIZE (16 * 16) | ||||
| @@ -76,7 +76,7 @@ | |||||
| #define movsd movlpd | #define movsd movlpd | ||||
| #endif | #endif | ||||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
| #define PREFETCHSIZE (16 * 16) | #define PREFETCHSIZE (16 * 16) | ||||
| @@ -160,7 +160,7 @@ | |||||
| #define a3 %xmm14 | #define a3 %xmm14 | ||||
| #define xt1 %xmm15 | #define xt1 %xmm15 | ||||
| #if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| #define MOVDDUP(a, b, c) movddup a(b), c | #define MOVDDUP(a, b, c) movddup a(b), c | ||||
| #define MOVDDUP2(a, b, c) movddup a##b, c | #define MOVDDUP2(a, b, c) movddup a##b, c | ||||
| #else | #else | ||||
| @@ -76,7 +76,7 @@ | |||||
| #define movsd movlpd | #define movsd movlpd | ||||
| #endif | #endif | ||||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
| #define PREFETCHSIZE (16 * 16) | #define PREFETCHSIZE (16 * 16) | ||||
| @@ -167,7 +167,7 @@ | |||||
| #define a3 %xmm14 | #define a3 %xmm14 | ||||
| #define xt1 %xmm15 | #define xt1 %xmm15 | ||||
| #if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | |||||
| #if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BARCELONA_OPTIMIZATION) | |||||
| #define MOVDDUP(a, b, c) movddup a(b), c | #define MOVDDUP(a, b, c) movddup a(b), c | ||||
| #define MOVDDUP2(a, b, c) movddup a##b, c | #define MOVDDUP2(a, b, c) movddup a##b, c | ||||
| #else | #else | ||||
| @@ -76,7 +76,7 @@ | |||||
| #define movsd movlpd | #define movsd movlpd | ||||
| #endif | #endif | ||||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
| #define PREFETCHSIZE (16 * 16) | #define PREFETCHSIZE (16 * 16) | ||||
| @@ -166,7 +166,7 @@ | |||||
| #define xt1 %xmm14 | #define xt1 %xmm14 | ||||
| #define xt2 %xmm15 | #define xt2 %xmm15 | ||||
| #if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | |||||
| #if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BARCELONA_OPTIMIZATION) | |||||
| #define MOVDDUP(a, b, c) movddup a(b), c | #define MOVDDUP(a, b, c) movddup a(b), c | ||||
| #define MOVDDUP2(a, b, c) movddup a##b, c | #define MOVDDUP2(a, b, c) movddup a##b, c | ||||
| #else | #else | ||||
| @@ -76,7 +76,7 @@ | |||||
| #define movsd movlpd | #define movsd movlpd | ||||
| #endif | #endif | ||||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
| #define PREFETCHSIZE (16 * 16) | #define PREFETCHSIZE (16 * 16) | ||||
| @@ -166,7 +166,7 @@ | |||||
| #define a3 %xmm14 | #define a3 %xmm14 | ||||
| #define xt1 %xmm15 | #define xt1 %xmm15 | ||||
| #if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | |||||
| #if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BARCELONA_OPTIMIZATION) | |||||
| #define MOVDDUP(a, b, c) movddup a(b), c | #define MOVDDUP(a, b, c) movddup a(b), c | ||||
| #define MOVDDUP2(a, b, c) movddup a##b, c | #define MOVDDUP2(a, b, c) movddup a##b, c | ||||
| #else | #else | ||||
| @@ -85,7 +85,7 @@ | |||||
| #define movsd movlps | #define movsd movlps | ||||
| #endif | #endif | ||||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| #define ALIGNED_ACCESS | #define ALIGNED_ACCESS | ||||
| #define MOVUPS_A movaps | #define MOVUPS_A movaps | ||||
| #define MOVUPS_XL movaps | #define MOVUPS_XL movaps | ||||
| @@ -234,6 +234,86 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define QGEMM_DEFAULT_UNROLL_N 2 | |||||
| #define CGEMM_DEFAULT_UNROLL_N 2 | |||||
| #define ZGEMM_DEFAULT_UNROLL_N 2 | |||||
| #define XGEMM_DEFAULT_UNROLL_N 1 | |||||
| #ifdef ARCH_X86 | |||||
| #define SGEMM_DEFAULT_UNROLL_N 4 | |||||
| #define DGEMM_DEFAULT_UNROLL_N 4 | |||||
| #define SGEMM_DEFAULT_UNROLL_M 4 | |||||
| #define DGEMM_DEFAULT_UNROLL_M 2 | |||||
| #define QGEMM_DEFAULT_UNROLL_M 2 | |||||
| #define CGEMM_DEFAULT_UNROLL_M 2 | |||||
| #define ZGEMM_DEFAULT_UNROLL_M 1 | |||||
| #define XGEMM_DEFAULT_UNROLL_M 1 | |||||
| #else | |||||
| #define SGEMM_DEFAULT_UNROLL_N 2 | |||||
| #define DGEMM_DEFAULT_UNROLL_N 2 | |||||
| #define SGEMM_DEFAULT_UNROLL_M 16 | |||||
| #define DGEMM_DEFAULT_UNROLL_M 8 | |||||
| #define QGEMM_DEFAULT_UNROLL_M 2 | |||||
| #define CGEMM_DEFAULT_UNROLL_M 4 | |||||
| #define ZGEMM_DEFAULT_UNROLL_M 2 | |||||
| #define XGEMM_DEFAULT_UNROLL_M 1 | |||||
| #define CGEMM3M_DEFAULT_UNROLL_N 4 | |||||
| #define CGEMM3M_DEFAULT_UNROLL_M 8 | |||||
| #define ZGEMM3M_DEFAULT_UNROLL_N 4 | |||||
| #define ZGEMM3M_DEFAULT_UNROLL_M 4 | |||||
| #define GEMV_UNROLL 8 | |||||
| #endif | |||||
| #if defined(ARCH_X86_64) | |||||
| #define SGEMM_DEFAULT_P 768 | |||||
| #define DGEMM_DEFAULT_P 384 | |||||
| #else | |||||
| #define SGEMM_DEFAULT_P 448 | |||||
| #define DGEMM_DEFAULT_P 224 | |||||
| #endif | |||||
| #define QGEMM_DEFAULT_P 112 | |||||
| #define CGEMM_DEFAULT_P 224 | |||||
| #define ZGEMM_DEFAULT_P 112 | |||||
| #define XGEMM_DEFAULT_P 56 | |||||
| #if defined(ARCH_X86_64) | |||||
| #define SGEMM_DEFAULT_Q 168 | |||||
| #define DGEMM_DEFAULT_Q 168 | |||||
| #else | |||||
| #define SGEMM_DEFAULT_Q 224 | |||||
| #define DGEMM_DEFAULT_Q 224 | |||||
| #endif | |||||
| #define QGEMM_DEFAULT_Q 224 | |||||
| #define CGEMM_DEFAULT_Q 224 | |||||
| #define ZGEMM_DEFAULT_Q 224 | |||||
| #define XGEMM_DEFAULT_Q 224 | |||||
| #define SGEMM_DEFAULT_R sgemm_r | |||||
| #define QGEMM_DEFAULT_R qgemm_r | |||||
| #define DGEMM_DEFAULT_R dgemm_r | |||||
| #define CGEMM_DEFAULT_R cgemm_r | |||||
| #define ZGEMM_DEFAULT_R zgemm_r | |||||
| #define XGEMM_DEFAULT_R xgemm_r | |||||
| #define SYMV_P 16 | |||||
| #define HAVE_EXCLUSIVE_CACHE | |||||
| #define GEMM_THREAD gemm_thread_mn | |||||
| #endif | |||||
| #ifdef PILEDRIVER | |||||
| #define SNUMOPT 8 | |||||
| #define DNUMOPT 4 | |||||
| #define GEMM_DEFAULT_OFFSET_A 64 | |||||
| #define GEMM_DEFAULT_OFFSET_B 832 | |||||
| #define GEMM_DEFAULT_ALIGN 0x0fffUL | |||||
| #define QGEMM_DEFAULT_UNROLL_N 2 | #define QGEMM_DEFAULT_UNROLL_N 2 | ||||
| #define CGEMM_DEFAULT_UNROLL_N 2 | #define CGEMM_DEFAULT_UNROLL_N 2 | ||||
| #define ZGEMM_DEFAULT_UNROLL_N 2 | #define ZGEMM_DEFAULT_UNROLL_N 2 | ||||