@@ -277,14 +277,14 @@ ifeq ($(ARCH), x86) | |||||
DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ | DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ | ||||
CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO | CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO | ||||
ifneq ($(NO_AVX), 1) | ifneq ($(NO_AVX), 1) | ||||
DYNAMIC_CORE += SANDYBRIDGE | |||||
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER | |||||
endif | endif | ||||
endif | endif | ||||
ifeq ($(ARCH), x86_64) | ifeq ($(ARCH), x86_64) | ||||
DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO | DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO | ||||
ifneq ($(NO_AVX), 1) | ifneq ($(NO_AVX), 1) | ||||
DYNAMIC_CORE += SANDYBRIDGE | |||||
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER | |||||
endif | endif | ||||
endif | endif | ||||
@@ -125,7 +125,8 @@ | |||||
#define HAVE_MISALIGNSSE (1 << 15) | #define HAVE_MISALIGNSSE (1 << 15) | ||||
#define HAVE_128BITFPU (1 << 16) | #define HAVE_128BITFPU (1 << 16) | ||||
#define HAVE_FASTMOVU (1 << 17) | #define HAVE_FASTMOVU (1 << 17) | ||||
#define HAVE_AVX (1 << 18) | |||||
#define HAVE_AVX (1 << 18) | |||||
#define HAVE_FMA4 (1 << 19) | |||||
#define CACHE_INFO_L1_I 1 | #define CACHE_INFO_L1_I 1 | ||||
#define CACHE_INFO_L1_D 2 | #define CACHE_INFO_L1_D 2 | ||||
@@ -43,6 +43,8 @@ | |||||
/* When NO_AVX is defined, the SANDYBRIDGE identifiers are aliased to NEHALEM
 * and the BULLDOZER identifiers to BARCELONA, for both the CPUTYPE_* and the
 * CORE_* names. */
#ifdef NO_AVX | #ifdef NO_AVX | ||||
#define CPUTYPE_SANDYBRIDGE CPUTYPE_NEHALEM | #define CPUTYPE_SANDYBRIDGE CPUTYPE_NEHALEM | ||||
#define CORE_SANDYBRIDGE CORE_NEHALEM | #define CORE_SANDYBRIDGE CORE_NEHALEM | ||||
#define CPUTYPE_BULLDOZER CPUTYPE_BARCELONA | |||||
#define CORE_BULLDOZER CORE_BARCELONA | |||||
#endif | #endif | ||||
#ifndef CPUIDEMU | #ifndef CPUIDEMU | ||||
@@ -228,6 +230,9 @@ int get_cputype(int gettype){ | |||||
cpuid(0x80000001, &eax, &ebx, &ecx, &edx); | cpuid(0x80000001, &eax, &ebx, &ecx, &edx); | ||||
if ((ecx & (1 << 6)) != 0) feature |= HAVE_SSE4A; | if ((ecx & (1 << 6)) != 0) feature |= HAVE_SSE4A; | ||||
if ((ecx & (1 << 7)) != 0) feature |= HAVE_MISALIGNSSE; | if ((ecx & (1 << 7)) != 0) feature |= HAVE_MISALIGNSSE; | ||||
#ifndef NO_AVX | |||||
if ((ecx & (1 << 16)) != 0) feature |= HAVE_FMA4; | |||||
#endif | |||||
if ((edx & (1 << 30)) != 0) feature |= HAVE_3DNOWEX; | if ((edx & (1 << 30)) != 0) feature |= HAVE_3DNOWEX; | ||||
if ((edx & (1 << 31)) != 0) feature |= HAVE_3DNOW; | if ((edx & (1 << 31)) != 0) feature |= HAVE_3DNOW; | ||||
} | } | ||||
@@ -1075,8 +1080,12 @@ int get_cpuname(void){ | |||||
return CPUTYPE_OPTERON; | return CPUTYPE_OPTERON; | ||||
case 1: | case 1: | ||||
case 10: | case 10: | ||||
case 6: //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series | |||||
return CPUTYPE_BARCELONA; | return CPUTYPE_BARCELONA; | ||||
case 6: //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series | |||||
if(support_avx()) | |||||
return CPUTYPE_BULLDOZER; | |||||
else | |||||
return CPUTYPE_BARCELONA; //OS don't support AVX. | |||||
case 5: | case 5: | ||||
return CPUTYPE_BOBCAT; | return CPUTYPE_BOBCAT; | ||||
} | } | ||||
@@ -1427,8 +1436,13 @@ int get_coretype(void){ | |||||
if (family == 0xf){ | if (family == 0xf){ | ||||
if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON; | if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON; | ||||
else if (exfamily == 5) return CORE_BOBCAT; | else if (exfamily == 5) return CORE_BOBCAT; | ||||
else if (exfamily == 6) return CORE_BARCELONA; //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series | |||||
else return CORE_BARCELONA; | |||||
else if (exfamily == 6) { | |||||
//AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series | |||||
if(support_avx()) | |||||
return CORE_BULLDOZER; | |||||
else | |||||
return CORE_BARCELONA; //OS don't support AVX. Use old kernels. | |||||
}else return CORE_BARCELONA; | |||||
} | } | ||||
} | } | ||||
@@ -63,9 +63,11 @@ extern gotoblas_t gotoblas_BARCELONA; | |||||
extern gotoblas_t gotoblas_BOBCAT; | extern gotoblas_t gotoblas_BOBCAT; | ||||
/* Without NO_AVX, SANDYBRIDGE and BULLDOZER have their own gotoblas_t tables
 * (declared here, defined elsewhere). With NO_AVX, both names are macro
 * aliases for the NEHALEM and BARCELONA tables respectively. */
#ifndef NO_AVX | #ifndef NO_AVX | ||||
extern gotoblas_t gotoblas_SANDYBRIDGE; | extern gotoblas_t gotoblas_SANDYBRIDGE; | ||||
extern gotoblas_t gotoblas_BULLDOZER; | |||||
#else | #else | ||||
//Use NEHALEM kernels for sandy bridge | //Use NEHALEM kernels for sandy bridge | ||||
#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM | #define gotoblas_SANDYBRIDGE gotoblas_NEHALEM | ||||
//Use BARCELONA kernels for bulldozer
#define gotoblas_BULLDOZER gotoblas_BARCELONA | |||||
#endif | #endif | ||||
@@ -202,6 +204,14 @@ static gotoblas_t *get_coretype(void){ | |||||
else return &gotoblas_OPTERON; | else return &gotoblas_OPTERON; | ||||
} else if (exfamily == 5) { | } else if (exfamily == 5) { | ||||
return &gotoblas_BOBCAT; | return &gotoblas_BOBCAT; | ||||
} else if (exfamily == 6) { | |||||
//AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series | |||||
if(support_avx()) | |||||
return &gotoblas_BULLDOZER; | |||||
else{ | |||||
fprintf(stderr, "OpenBLAS : Your OS doesn't support AVX. Use Barcelona kernels.\n"); | |||||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. | |||||
} | |||||
} else { | } else { | ||||
return &gotoblas_BARCELONA; | return &gotoblas_BARCELONA; | ||||
} | } | ||||
@@ -238,6 +248,7 @@ static char *corename[] = { | |||||
"Nano", | "Nano", | ||||
"Sandybridge", | "Sandybridge", | ||||
"Bobcat", | "Bobcat", | ||||
"Bulldozer", | |||||
}; | }; | ||||
char *gotoblas_corename(void) { | char *gotoblas_corename(void) { | ||||
@@ -259,6 +270,7 @@ char *gotoblas_corename(void) { | |||||
if (gotoblas == &gotoblas_NANO) return corename[15]; | if (gotoblas == &gotoblas_NANO) return corename[15]; | ||||
if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16]; | if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16]; | ||||
if (gotoblas == &gotoblas_BOBCAT) return corename[17]; | if (gotoblas == &gotoblas_BOBCAT) return corename[17]; | ||||
if (gotoblas == &gotoblas_BULLDOZER) return corename[18]; | |||||
return corename[0]; | return corename[0]; | ||||
} | } | ||||
@@ -350,7 +350,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#define CORENAME "OPTERON" | #define CORENAME "OPTERON" | ||||
#endif | #endif | ||||
#if defined(FORCE_BARCELONA) || defined(FORCE_SHANGHAI) || defined(FORCE_ISTANBUL) || defined (FORCE_BULLDOZER) | |||||
#if defined(FORCE_BARCELONA) || defined(FORCE_SHANGHAI) || defined(FORCE_ISTANBUL) | |||||
#define FORCE | #define FORCE | ||||
#define FORCE_INTEL | #define FORCE_INTEL | ||||
#define ARCHITECTURE "X86" | #define ARCHITECTURE "X86" | ||||
@@ -380,6 +380,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#define CORENAME "BOBCAT" | #define CORENAME "BOBCAT" | ||||
#endif | #endif | ||||
/*
 * Forced-target configuration used when FORCE_BULLDOZER is defined.
 *
 * ARCHCONFIG is one string assembled from adjacent literals, so every
 * fragment except the last must end with a space; otherwise two -D options
 * fuse into a single bogus one (e.g. "-DHAVE_FASTMOVU-DHAVE_AVX").
 * The kernel-selection define passed here is -DBARCELONA; the feature list
 * additionally carries -DHAVE_AVX and -DHAVE_FMA4.
 */
#if defined (FORCE_BULLDOZER)
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE    "X86"
#define SUBARCHITECTURE "BULLDOZER"
#define ARCHCONFIG   "-DBARCELONA " \
                     "-DL1_DATA_SIZE=49152 -DL1_DATA_LINESIZE=64 " \
                     "-DL2_SIZE=1024000 -DL2_LINESIZE=64 -DL3_SIZE=16777216 " \
                     "-DDTB_DEFAULT_ENTRIES=32 -DDTB_SIZE=4096 " \
                     "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 " \
                     "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU " \
                     "-DHAVE_AVX -DHAVE_FMA4"
#define LIBNAME   "bulldozer"
#define CORENAME  "BULLDOZER"
#endif
#ifdef FORCE_SSE_GENERIC | #ifdef FORCE_SSE_GENERIC | ||||
#define FORCE | #define FORCE | ||||
#define FORCE_INTEL | #define FORCE_INTEL | ||||
@@ -0,0 +1,59 @@ | |||||
# Kernel selection table (new file in this patch).
# GEMM and GEMM3M entries point at *_barcelona.S kernels, TRSM entries at the
# SSE/SSE2 trsm kernels, and all copy routines at ../generic/ C sources.
# An empty *INCOPY/*ITCOPY value means no inner-copy source is listed for
# that precision (its *OBJ variable is empty as well).
# NOTE(review): presumably the 32-bit x86 kernel list for BULLDOZER; the file
# name is not visible here, confirm before relying on it.
SGEMMKERNEL = gemm_kernel_4x4_barcelona.S | |||||
SGEMMINCOPY = | |||||
SGEMMITCOPY = | |||||
SGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||||
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||||
SGEMMINCOPYOBJ = | |||||
SGEMMITCOPYOBJ = | |||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
DGEMMKERNEL = gemm_kernel_2x4_barcelona.S | |||||
DGEMMINCOPY = ../generic/gemm_ncopy_2.c | |||||
DGEMMITCOPY = ../generic/gemm_tcopy_2.c | |||||
DGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||||
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
CGEMMKERNEL = zgemm_kernel_2x2_barcelona.S | |||||
CGEMMINCOPY = | |||||
CGEMMITCOPY = | |||||
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||||
CGEMMINCOPYOBJ = | |||||
CGEMMITCOPYOBJ = | |||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
ZGEMMKERNEL = zgemm_kernel_1x2_barcelona.S | |||||
ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c | |||||
ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c | |||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
STRSMKERNEL_LN = trsm_kernel_LN_4x4_sse.S | |||||
STRSMKERNEL_LT = trsm_kernel_LT_4x4_sse.S | |||||
STRSMKERNEL_RN = trsm_kernel_LT_4x4_sse.S | |||||
STRSMKERNEL_RT = trsm_kernel_RT_4x4_sse.S | |||||
DTRSMKERNEL_LN = trsm_kernel_LN_2x4_sse2.S | |||||
DTRSMKERNEL_LT = trsm_kernel_LT_2x4_sse2.S | |||||
DTRSMKERNEL_RN = trsm_kernel_LT_2x4_sse2.S | |||||
DTRSMKERNEL_RT = trsm_kernel_RT_2x4_sse2.S | |||||
CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse.S | |||||
CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse.S | |||||
CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse.S | |||||
CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse.S | |||||
# ZTRSM LN, LT and RN all name the same LT source file.
ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_sse2.S | |||||
ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_sse2.S | |||||
ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_sse2.S | |||||
ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_sse2.S | |||||
CGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S | |||||
ZGEMM3MKERNEL = zgemm3m_kernel_2x4_barcelona.S |
@@ -0,0 +1,62 @@ | |||||
# Kernel selection table (new file in this patch).
# GEMM, GEMM3M and DTRSM entries point at *_barcelona.S kernels; the S/C/Z
# TRSM entries use the SSE/SSE2 trsm kernels. Outer-copy routines use the
# *_opteron.S / zgemm_*copy_2.S assembly sources, while inner-copy routines,
# where listed, use ../generic/ C sources. ZGEMV uses the *_dup.S kernels.
# An empty *INCOPY/*ITCOPY value means no inner-copy source is listed for
# that precision (its *OBJ variable is empty as well).
# NOTE(review): presumably the x86_64 kernel list for BULLDOZER; the file
# name is not visible here, confirm before relying on it.
ZGEMVNKERNEL = zgemv_n_dup.S | |||||
ZGEMVTKERNEL = zgemv_t_dup.S | |||||
SGEMMKERNEL = gemm_kernel_8x4_barcelona.S | |||||
SGEMMINCOPY = ../generic/gemm_ncopy_8.c | |||||
SGEMMITCOPY = ../generic/gemm_tcopy_8.c | |||||
SGEMMONCOPY = gemm_ncopy_4_opteron.S | |||||
SGEMMOTCOPY = gemm_tcopy_4_opteron.S | |||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
DGEMMKERNEL = gemm_kernel_4x4_barcelona.S | |||||
DGEMMINCOPY = | |||||
DGEMMITCOPY = | |||||
DGEMMONCOPY = gemm_ncopy_4_opteron.S | |||||
DGEMMOTCOPY = gemm_tcopy_4_opteron.S | |||||
DGEMMINCOPYOBJ = | |||||
DGEMMITCOPYOBJ = | |||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
CGEMMKERNEL = zgemm_kernel_4x2_barcelona.S | |||||
CGEMMINCOPY = ../generic/zgemm_ncopy_4.c | |||||
CGEMMITCOPY = ../generic/zgemm_tcopy_4.c | |||||
CGEMMONCOPY = zgemm_ncopy_2.S | |||||
CGEMMOTCOPY = zgemm_tcopy_2.S | |||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
ZGEMMKERNEL = zgemm_kernel_2x2_barcelona.S | |||||
ZGEMMINCOPY = | |||||
ZGEMMITCOPY = | |||||
ZGEMMONCOPY = zgemm_ncopy_2.S | |||||
ZGEMMOTCOPY = zgemm_tcopy_2.S | |||||
ZGEMMINCOPYOBJ = | |||||
ZGEMMITCOPYOBJ = | |||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S | |||||
STRSMKERNEL_LT = trsm_kernel_LT_8x4_sse.S | |||||
STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S | |||||
STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S | |||||
DTRSMKERNEL_LN = trsm_kernel_LN_4x4_barcelona.S | |||||
DTRSMKERNEL_LT = trsm_kernel_LT_4x4_barcelona.S | |||||
DTRSMKERNEL_RN = trsm_kernel_LT_4x4_barcelona.S | |||||
DTRSMKERNEL_RT = trsm_kernel_RT_4x4_barcelona.S | |||||
CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S | |||||
CTRSMKERNEL_LT = ztrsm_kernel_LT_4x2_sse.S | |||||
CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S | |||||
CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S | |||||
ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse2.S | |||||
ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse2.S | |||||
ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse2.S | |||||
ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse2.S | |||||
CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S | |||||
ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S |
@@ -143,7 +143,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#endif | #endif | ||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | |||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) | |||||
#define SNUMOPT 8 | #define SNUMOPT 8 | ||||
#define DNUMOPT 4 | #define DNUMOPT 4 | ||||