@@ -277,14 +277,14 @@ ifeq ($(ARCH), x86) | |||
DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ | |||
CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO | |||
ifneq ($(NO_AVX), 1) | |||
DYNAMIC_CORE += SANDYBRIDGE | |||
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER | |||
endif | |||
endif | |||
# DYNAMIC_CORE list for ARCH=x86_64. The AVX-dependent cores are appended
# only when NO_AVX is not set to 1.
ifeq ($(ARCH), x86_64) | |||
DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO | |||
ifneq ($(NO_AVX), 1) | |||
DYNAMIC_CORE += SANDYBRIDGE | |||
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER | |||
endif | |||
endif | |||
@@ -125,7 +125,8 @@ | |||
/* CPU feature flags: each HAVE_* macro below is a distinct single-bit mask. */
#define HAVE_MISALIGNSSE (1 << 15) | |||
#define HAVE_128BITFPU (1 << 16) | |||
#define HAVE_FASTMOVU (1 << 17) | |||
#define HAVE_AVX (1 << 18) | |||
#define HAVE_AVX (1 << 18) | |||
/* FMA4 feature flag; takes the bit directly above HAVE_AVX. */
#define HAVE_FMA4 (1 << 19) | |||
#define CACHE_INFO_L1_I 1 | |||
#define CACHE_INFO_L1_D 2 | |||
@@ -43,6 +43,8 @@ | |||
/*
 * When NO_AVX is defined, the AVX-capable CPU/core identifiers are aliased
 * to non-AVX ones: SANDYBRIDGE -> NEHALEM, BULLDOZER -> BARCELONA.
 */
#ifdef NO_AVX | |||
#define CPUTYPE_SANDYBRIDGE CPUTYPE_NEHALEM | |||
#define CORE_SANDYBRIDGE CORE_NEHALEM | |||
#define CPUTYPE_BULLDOZER CPUTYPE_BARCELONA | |||
#define CORE_BULLDOZER CORE_BARCELONA | |||
#endif | |||
#ifndef CPUIDEMU | |||
@@ -228,6 +230,9 @@ int get_cputype(int gettype){ | |||
cpuid(0x80000001, &eax, &ebx, &ecx, &edx); | |||
if ((ecx & (1 << 6)) != 0) feature |= HAVE_SSE4A; | |||
if ((ecx & (1 << 7)) != 0) feature |= HAVE_MISALIGNSSE; | |||
#ifndef NO_AVX | |||
if ((ecx & (1 << 16)) != 0) feature |= HAVE_FMA4; | |||
#endif | |||
if ((edx & (1 << 30)) != 0) feature |= HAVE_3DNOWEX; | |||
if ((edx & (1 << 31)) != 0) feature |= HAVE_3DNOW; | |||
} | |||
@@ -1075,8 +1080,12 @@ int get_cpuname(void){ | |||
return CPUTYPE_OPTERON; | |||
case 1: | |||
case 10: | |||
case 6: //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series | |||
return CPUTYPE_BARCELONA; | |||
case 6: //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series | |||
if(support_avx()) | |||
return CPUTYPE_BULLDOZER; | |||
else | |||
return CPUTYPE_BARCELONA; //OS doesn't support AVX. | |||
case 5: | |||
return CPUTYPE_BOBCAT; | |||
} | |||
@@ -1427,8 +1436,13 @@ int get_coretype(void){ | |||
if (family == 0xf){ | |||
if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON; | |||
else if (exfamily == 5) return CORE_BOBCAT; | |||
else if (exfamily == 6) return CORE_BARCELONA; //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series | |||
else return CORE_BARCELONA; | |||
else if (exfamily == 6) { | |||
//AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series | |||
if(support_avx()) | |||
return CORE_BULLDOZER; | |||
else | |||
return CORE_BARCELONA; //OS doesn't support AVX. Use old kernels. | |||
}else return CORE_BARCELONA; | |||
} | |||
} | |||
@@ -63,9 +63,11 @@ extern gotoblas_t gotoblas_BARCELONA; | |||
extern gotoblas_t gotoblas_BOBCAT; | |||
// AVX-enabled builds declare dedicated SANDYBRIDGE and BULLDOZER tables;
// otherwise both names are aliased to existing non-AVX tables.
#ifndef NO_AVX | |||
extern gotoblas_t gotoblas_SANDYBRIDGE; | |||
extern gotoblas_t gotoblas_BULLDOZER; | |||
#else | |||
//Use NEHALEM kernels for sandy bridge | |||
#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM | |||
//Use BARCELONA kernels for bulldozer
#define gotoblas_BULLDOZER gotoblas_BARCELONA | |||
#endif | |||
@@ -202,6 +204,14 @@ static gotoblas_t *get_coretype(void){ | |||
else return &gotoblas_OPTERON; | |||
} else if (exfamily == 5) { | |||
return &gotoblas_BOBCAT; | |||
} else if (exfamily == 6) { | |||
//AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series | |||
if(support_avx()) | |||
return &gotoblas_BULLDOZER; | |||
else{ | |||
fprintf(stderr, "OpenBLAS : Your OS doesn't support AVX. Use Barcelona kernels.\n"); | |||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. | |||
} | |||
} else { | |||
return &gotoblas_BARCELONA; | |||
} | |||
@@ -238,6 +248,7 @@ static char *corename[] = { | |||
"Nano", | |||
"Sandybridge", | |||
"Bobcat", | |||
"Bulldozer", | |||
}; | |||
char *gotoblas_corename(void) { | |||
@@ -259,6 +270,7 @@ char *gotoblas_corename(void) { | |||
if (gotoblas == &gotoblas_NANO) return corename[15]; | |||
if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16]; | |||
if (gotoblas == &gotoblas_BOBCAT) return corename[17]; | |||
if (gotoblas == &gotoblas_BULLDOZER) return corename[18]; | |||
return corename[0]; | |||
} | |||
@@ -350,7 +350,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#define CORENAME "OPTERON" | |||
#endif | |||
#if defined(FORCE_BARCELONA) || defined(FORCE_SHANGHAI) || defined(FORCE_ISTANBUL) || defined (FORCE_BULLDOZER) | |||
#if defined(FORCE_BARCELONA) || defined(FORCE_SHANGHAI) || defined(FORCE_ISTANBUL) | |||
#define FORCE | |||
#define FORCE_INTEL | |||
#define ARCHITECTURE "X86" | |||
@@ -380,6 +380,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#define CORENAME "BOBCAT" | |||
#endif | |||
/*
 * Forced target: AMD Bulldozer.
 *
 * Defines the architecture / sub-architecture names, the library and core
 * names, and ARCHCONFIG, the string of -D options describing the core.
 *
 * NOTE(review): ARCHCONFIG defines BARCELONA rather than BULLDOZER --
 * presumably because the Barcelona kernels are reused for this core; confirm.
 */
#if defined (FORCE_BULLDOZER)
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE "X86"
#define SUBARCHITECTURE "BULLDOZER"
/*
 * Adjacent string literals are concatenated with nothing in between, so
 * every fragment except the last must end with a space. Without it the last
 * two fragments fuse into "-DHAVE_FASTMOVU-DHAVE_AVX" and both options are
 * lost.
 */
#define ARCHCONFIG "-DBARCELONA " \
                   "-DL1_DATA_SIZE=49152 -DL1_DATA_LINESIZE=64 " \
                   "-DL2_SIZE=1024000 -DL2_LINESIZE=64 -DL3_SIZE=16777216 " \
                   "-DDTB_DEFAULT_ENTRIES=32 -DDTB_SIZE=4096 " \
                   "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 " \
                   "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU " \
                   "-DHAVE_AVX -DHAVE_FMA4"
#define LIBNAME "bulldozer"
#define CORENAME "BULLDOZER"
#endif
#ifdef FORCE_SSE_GENERIC | |||
#define FORCE | |||
#define FORCE_INTEL | |||
@@ -0,0 +1,59 @@ | |||
# Kernel selection list for the BULLDOZER target.
# Every GEMM and GEMM3M kernel named here is a *_barcelona.S source and the
# TRSM kernels are the *_sse.S / *_sse2.S ones, so this list introduces no
# Bulldozer-specific assembly.
# NOTE(review): presumably the 32-bit x86 variant (smaller unroll factors than
# the other BULLDOZER list) -- confirm against the file path.
# NOTE(review): empty *INCOPY/*ITCOPY entries presumably mean that no separate
# inner copy routine is built for that precision -- confirm in the kernel Makefile.
SGEMMKERNEL = gemm_kernel_4x4_barcelona.S | |||
SGEMMINCOPY = | |||
SGEMMITCOPY = | |||
SGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||
SGEMMINCOPYOBJ = | |||
SGEMMITCOPYOBJ = | |||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
DGEMMKERNEL = gemm_kernel_2x4_barcelona.S | |||
DGEMMINCOPY = ../generic/gemm_ncopy_2.c | |||
DGEMMITCOPY = ../generic/gemm_tcopy_2.c | |||
DGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
CGEMMKERNEL = zgemm_kernel_2x2_barcelona.S | |||
CGEMMINCOPY = | |||
CGEMMITCOPY = | |||
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
CGEMMINCOPYOBJ = | |||
CGEMMITCOPYOBJ = | |||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
ZGEMMKERNEL = zgemm_kernel_1x2_barcelona.S | |||
ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c | |||
ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c | |||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
# In each TRSM group below the RN entry names the same source as the LT entry.
# NOTE(review): ZTRSMKERNEL_LN also names the LT source -- confirm this is intended.
STRSMKERNEL_LN = trsm_kernel_LN_4x4_sse.S | |||
STRSMKERNEL_LT = trsm_kernel_LT_4x4_sse.S | |||
STRSMKERNEL_RN = trsm_kernel_LT_4x4_sse.S | |||
STRSMKERNEL_RT = trsm_kernel_RT_4x4_sse.S | |||
DTRSMKERNEL_LN = trsm_kernel_LN_2x4_sse2.S | |||
DTRSMKERNEL_LT = trsm_kernel_LT_2x4_sse2.S | |||
DTRSMKERNEL_RN = trsm_kernel_LT_2x4_sse2.S | |||
DTRSMKERNEL_RT = trsm_kernel_RT_2x4_sse2.S | |||
CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse.S | |||
CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse.S | |||
CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse.S | |||
CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse.S | |||
ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_sse2.S | |||
ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_sse2.S | |||
ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_sse2.S | |||
ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_sse2.S | |||
CGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S | |||
ZGEMM3MKERNEL = zgemm3m_kernel_2x4_barcelona.S |
@@ -0,0 +1,62 @@ | |||
# Kernel selection list for the BULLDOZER target.
# In addition to the GEMM/TRSM/GEMM3M entries, this list sets the ZGEMV N/T
# kernels. All GEMM and GEMM3M kernels and the DTRSM kernels are *_barcelona.S
# sources; the remaining TRSM kernels are the *_sse.S / *_sse2.S ones.
# NOTE(review): presumably the x86_64 variant (larger unroll factors than the
# other BULLDOZER list) -- confirm against the file path.
# NOTE(review): empty *INCOPY/*ITCOPY entries presumably mean that no separate
# inner copy routine is built for that precision -- confirm in the kernel Makefile.
ZGEMVNKERNEL = zgemv_n_dup.S | |||
ZGEMVTKERNEL = zgemv_t_dup.S | |||
SGEMMKERNEL = gemm_kernel_8x4_barcelona.S | |||
SGEMMINCOPY = ../generic/gemm_ncopy_8.c | |||
SGEMMITCOPY = ../generic/gemm_tcopy_8.c | |||
SGEMMONCOPY = gemm_ncopy_4_opteron.S | |||
SGEMMOTCOPY = gemm_tcopy_4_opteron.S | |||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
DGEMMKERNEL = gemm_kernel_4x4_barcelona.S | |||
DGEMMINCOPY = | |||
DGEMMITCOPY = | |||
DGEMMONCOPY = gemm_ncopy_4_opteron.S | |||
DGEMMOTCOPY = gemm_tcopy_4_opteron.S | |||
DGEMMINCOPYOBJ = | |||
DGEMMITCOPYOBJ = | |||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
CGEMMKERNEL = zgemm_kernel_4x2_barcelona.S | |||
CGEMMINCOPY = ../generic/zgemm_ncopy_4.c | |||
CGEMMITCOPY = ../generic/zgemm_tcopy_4.c | |||
CGEMMONCOPY = zgemm_ncopy_2.S | |||
CGEMMOTCOPY = zgemm_tcopy_2.S | |||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
ZGEMMKERNEL = zgemm_kernel_2x2_barcelona.S | |||
ZGEMMINCOPY = | |||
ZGEMMITCOPY = | |||
ZGEMMONCOPY = zgemm_ncopy_2.S | |||
ZGEMMOTCOPY = zgemm_tcopy_2.S | |||
ZGEMMINCOPYOBJ = | |||
ZGEMMITCOPYOBJ = | |||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
# In each TRSM group below the RN entry names the same source as the LT entry.
STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S | |||
STRSMKERNEL_LT = trsm_kernel_LT_8x4_sse.S | |||
STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S | |||
STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S | |||
DTRSMKERNEL_LN = trsm_kernel_LN_4x4_barcelona.S | |||
DTRSMKERNEL_LT = trsm_kernel_LT_4x4_barcelona.S | |||
DTRSMKERNEL_RN = trsm_kernel_LT_4x4_barcelona.S | |||
DTRSMKERNEL_RT = trsm_kernel_RT_4x4_barcelona.S | |||
CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S | |||
CTRSMKERNEL_LT = ztrsm_kernel_LT_4x2_sse.S | |||
CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S | |||
CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S | |||
ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse2.S | |||
ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse2.S | |||
ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse2.S | |||
ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse2.S | |||
CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S | |||
ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S |
@@ -143,7 +143,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#endif | |||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | |||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) | |||
#define SNUMOPT 8 | |||
#define DNUMOPT 4 | |||