Just copy the kernel codes from Nehalem. The optimization is ongoing.tags/v0.2.0^2
| @@ -247,11 +247,11 @@ endif | |||||
| ifdef DYNAMIC_ARCH | ifdef DYNAMIC_ARCH | ||||
| ifeq ($(ARCH), x86) | ifeq ($(ARCH), x86) | ||||
| DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ | DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ | ||||
| CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA ATOM NANO | |||||
| CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE ATHLON OPTERON OPTERON_SSE3 BARCELONA ATOM NANO | |||||
| endif | endif | ||||
| ifeq ($(ARCH), x86_64) | ifeq ($(ARCH), x86_64) | ||||
| DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA ATOM NANO | |||||
| DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE OPTERON OPTERON_SSE3 BARCELONA ATOM NANO | |||||
| endif | endif | ||||
| ifndef DYNAMIC_CORE | ifndef DYNAMIC_CORE | ||||
| @@ -770,6 +770,7 @@ export HAVE_SSE4_1 | |||||
| export HAVE_SSE4_2 | export HAVE_SSE4_2 | ||||
| export HAVE_SSE4A | export HAVE_SSE4A | ||||
| export HAVE_SSE5 | export HAVE_SSE5 | ||||
| export HAVE_AVX | |||||
| export KERNELDIR | export KERNELDIR | ||||
| export FUNCTION_PROFILE | export FUNCTION_PROFILE | ||||
| export TARGET_CORE | export TARGET_CORE | ||||
| @@ -18,6 +18,7 @@ CORE2 | |||||
| PENRYN | PENRYN | ||||
| DUNNINGTON | DUNNINGTON | ||||
| NEHALEM | NEHALEM | ||||
| SANDYBRIDGE | |||||
| ATOM | ATOM | ||||
| b)AMD CPU: | b)AMD CPU: | ||||
| @@ -47,6 +48,7 @@ CELL | |||||
| 3.MIPS64 CPU: | 3.MIPS64 CPU: | ||||
| SICORTEX | SICORTEX | ||||
| LOONGSON3A | LOONGSON3A | ||||
| LOONGSON3B | |||||
| 4.IA64 CPU: | 4.IA64 CPU: | ||||
| ITANIUM2 | ITANIUM2 | ||||
| @@ -103,6 +103,7 @@ | |||||
| #define CORE_NEHALEM 17 | #define CORE_NEHALEM 17 | ||||
| #define CORE_ATOM 18 | #define CORE_ATOM 18 | ||||
| #define CORE_NANO 19 | #define CORE_NANO 19 | ||||
| #define CORE_SANDYBRIDGE 20 | |||||
| #define HAVE_SSE (1 << 0) | #define HAVE_SSE (1 << 0) | ||||
| #define HAVE_SSE2 (1 << 1) | #define HAVE_SSE2 (1 << 1) | ||||
| @@ -122,6 +123,7 @@ | |||||
| #define HAVE_MISALIGNSSE (1 << 15) | #define HAVE_MISALIGNSSE (1 << 15) | ||||
| #define HAVE_128BITFPU (1 << 16) | #define HAVE_128BITFPU (1 << 16) | ||||
| #define HAVE_FASTMOVU (1 << 17) | #define HAVE_FASTMOVU (1 << 17) | ||||
| #define HAVE_AVX (1 << 18) | |||||
| #define CACHE_INFO_L1_I 1 | #define CACHE_INFO_L1_I 1 | ||||
| #define CACHE_INFO_L1_D 2 | #define CACHE_INFO_L1_D 2 | ||||
| @@ -188,4 +190,5 @@ typedef struct { | |||||
| #define CPUTYPE_NSGEODE 41 | #define CPUTYPE_NSGEODE 41 | ||||
| #define CPUTYPE_VIAC3 42 | #define CPUTYPE_VIAC3 42 | ||||
| #define CPUTYPE_NANO 43 | #define CPUTYPE_NANO 43 | ||||
| #define CPUTYPE_SANDYBRIDGE 44 | |||||
| #endif | #endif | ||||
| @@ -189,6 +189,7 @@ int get_cputype(int gettype){ | |||||
| if ((ecx & (1 << 9)) != 0) feature |= HAVE_SSSE3; | if ((ecx & (1 << 9)) != 0) feature |= HAVE_SSSE3; | ||||
| if ((ecx & (1 << 19)) != 0) feature |= HAVE_SSE4_1; | if ((ecx & (1 << 19)) != 0) feature |= HAVE_SSE4_1; | ||||
| if ((ecx & (1 << 20)) != 0) feature |= HAVE_SSE4_2; | if ((ecx & (1 << 20)) != 0) feature |= HAVE_SSE4_2; | ||||
| if ((ecx & (1 << 28)) != 0) feature |= HAVE_AVX; | |||||
| if (have_excpuid() >= 0x01) { | if (have_excpuid() >= 0x01) { | ||||
| cpuid(0x80000001, &eax, &ebx, &ecx, &edx); | cpuid(0x80000001, &eax, &ebx, &ecx, &edx); | ||||
| @@ -983,13 +984,13 @@ int get_cpuname(void){ | |||||
| return CPUTYPE_NEHALEM; | return CPUTYPE_NEHALEM; | ||||
| case 10: | case 10: | ||||
| //Intel Core i5-2000 /i7-2000 (Sandy Bridge) | //Intel Core i5-2000 /i7-2000 (Sandy Bridge) | ||||
| return CPUTYPE_NEHALEM; | |||||
| return CPUTYPE_SANDYBRIDGE; | |||||
| case 12: | case 12: | ||||
| //Xeon Processor 5600 (Westmere-EP) | //Xeon Processor 5600 (Westmere-EP) | ||||
| return CPUTYPE_NEHALEM; | return CPUTYPE_NEHALEM; | ||||
| case 13: | case 13: | ||||
| //Intel Core i7-3000 / Xeon E5 (Sandy Bridge) | //Intel Core i7-3000 / Xeon E5 (Sandy Bridge) | ||||
| return CPUTYPE_NEHALEM; | |||||
| return CPUTYPE_SANDYBRIDGE; | |||||
| case 15: | case 15: | ||||
| //Xeon Processor E7 (Westmere-EX) | //Xeon Processor E7 (Westmere-EX) | ||||
| return CPUTYPE_NEHALEM; | return CPUTYPE_NEHALEM; | ||||
| @@ -1146,6 +1147,7 @@ static char *cpuname[] = { | |||||
| "NSGEODE", | "NSGEODE", | ||||
| "VIAC3", | "VIAC3", | ||||
| "NANO", | "NANO", | ||||
| "SANDYBRIDGE", | |||||
| }; | }; | ||||
| static char *lowercpuname[] = { | static char *lowercpuname[] = { | ||||
| @@ -1192,6 +1194,7 @@ static char *lowercpuname[] = { | |||||
| "tms3x00", | "tms3x00", | ||||
| "nsgeode", | "nsgeode", | ||||
| "nano", | "nano", | ||||
| "sandybridge", | |||||
| }; | }; | ||||
| static char *corename[] = { | static char *corename[] = { | ||||
| @@ -1215,6 +1218,7 @@ static char *corename[] = { | |||||
| "NEHALEM", | "NEHALEM", | ||||
| "ATOM", | "ATOM", | ||||
| "NANO", | "NANO", | ||||
| "SANDYBRIDGE", | |||||
| }; | }; | ||||
| static char *corename_lower[] = { | static char *corename_lower[] = { | ||||
| @@ -1238,6 +1242,7 @@ static char *corename_lower[] = { | |||||
| "nehalem", | "nehalem", | ||||
| "atom", | "atom", | ||||
| "nano", | "nano", | ||||
| "sandybridge", | |||||
| }; | }; | ||||
| @@ -1321,13 +1326,13 @@ int get_coretype(void){ | |||||
| return CORE_NEHALEM; | return CORE_NEHALEM; | ||||
| case 10: | case 10: | ||||
| //Intel Core i5-2000 /i7-2000 (Sandy Bridge) | //Intel Core i5-2000 /i7-2000 (Sandy Bridge) | ||||
| return CORE_NEHALEM; | |||||
| return CORE_SANDYBRIDGE; | |||||
| case 12: | case 12: | ||||
| //Xeon Processor 5600 (Westmere-EP) | //Xeon Processor 5600 (Westmere-EP) | ||||
| return CORE_NEHALEM; | return CORE_NEHALEM; | ||||
| case 13: | case 13: | ||||
| //Intel Core i7-3000 / Xeon E5 (Sandy Bridge) | //Intel Core i7-3000 / Xeon E5 (Sandy Bridge) | ||||
| return CORE_NEHALEM; | |||||
| return CORE_SANDYBRIDGE; | |||||
| case 15: | case 15: | ||||
| //Xeon Processor E7 (Westmere-EX) | //Xeon Processor E7 (Westmere-EX) | ||||
| return CORE_NEHALEM; | return CORE_NEHALEM; | ||||
| @@ -1426,6 +1431,7 @@ void get_cpuconfig(void){ | |||||
| if (features & HAVE_SSE4_2) printf("#define HAVE_SSE4_2\n"); | if (features & HAVE_SSE4_2) printf("#define HAVE_SSE4_2\n"); | ||||
| if (features & HAVE_SSE4A) printf("#define HAVE_SSE4A\n"); | if (features & HAVE_SSE4A) printf("#define HAVE_SSE4A\n"); | ||||
| if (features & HAVE_SSE5 ) printf("#define HAVE_SSSE5\n"); | if (features & HAVE_SSE5 ) printf("#define HAVE_SSSE5\n"); | ||||
| if (features & HAVE_AVX ) printf("#define HAVE_AVX\n"); | |||||
| if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n"); | if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n"); | ||||
| if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n"); | if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n"); | ||||
| if (features & HAVE_CFLUSH) printf("#define HAVE_CFLUSH\n"); | if (features & HAVE_CFLUSH) printf("#define HAVE_CFLUSH\n"); | ||||
| @@ -1491,6 +1497,7 @@ void get_sse(void){ | |||||
| if (features & HAVE_SSE4_2) printf("HAVE_SSE4_2=1\n"); | if (features & HAVE_SSE4_2) printf("HAVE_SSE4_2=1\n"); | ||||
| if (features & HAVE_SSE4A) printf("HAVE_SSE4A=1\n"); | if (features & HAVE_SSE4A) printf("HAVE_SSE4A=1\n"); | ||||
| if (features & HAVE_SSE5 ) printf("HAVE_SSSE5=1\n"); | if (features & HAVE_SSE5 ) printf("HAVE_SSSE5=1\n"); | ||||
| if (features & HAVE_AVX ) printf("HAVE_AVX=1\n"); | |||||
| if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n"); | if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n"); | ||||
| if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n"); | if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n"); | ||||
| @@ -165,7 +165,7 @@ int get_L2_size(void){ | |||||
| #if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || \ | #if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || \ | ||||
| defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ | defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ | ||||
| defined(CORE_NEHALEM) || defined(ATOM) || defined(GENERIC) | |||||
| defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) | |||||
| cpuid(0x80000006, &eax, &ebx, &ecx, &edx); | cpuid(0x80000006, &eax, &ebx, &ecx, &edx); | ||||
| @@ -384,6 +384,17 @@ void blas_set_parameter(void){ | |||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| #if defined(SANDYBRIDGE) | |||||
| sgemm_p = 1024; | |||||
| dgemm_p = 512; | |||||
| cgemm_p = 512; | |||||
| zgemm_p = 256; | |||||
| #ifdef EXPRECISION | |||||
| qgemm_p = 256; | |||||
| xgemm_p = 128; | |||||
| #endif | |||||
| #endif | |||||
| #if defined(CORE_PRESCOTT) || defined(GENERIC) | #if defined(CORE_PRESCOTT) || defined(GENERIC) | ||||
| size >>= 6; | size >>= 6; | ||||
| @@ -278,6 +278,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define CORENAME "NEHALEM" | #define CORENAME "NEHALEM" | ||||
| #endif | #endif | ||||
| #ifdef FORCE_SANDYBRIDGE | |||||
| #define FORCE | |||||
| #define FORCE_INTEL | |||||
| #define ARCHITECTURE "X86" | |||||
| #define SUBARCHITECTURE "SANDYBRIDGE" | |||||
| #define ARCHCONFIG "-DSANDYBRIDGE " \ | |||||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | |||||
| "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | |||||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||||
| "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX" | |||||
| #define LIBNAME "sandybridge" | |||||
| #define CORENAME "SANDYBRIDGE" | |||||
| #endif | |||||
| #ifdef FORCE_ATOM | #ifdef FORCE_ATOM | ||||
| #define FORCE | #define FORCE | ||||
| #define FORCE_INTEL | #define FORCE_INTEL | ||||
| @@ -746,6 +746,22 @@ static void init_parameter(void) { | |||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| #ifdef SANDYBRIDGE | |||||
| #ifdef DEBUG | |||||
| fprintf(stderr, "Sandybridge\n"); | |||||
| #endif | |||||
| TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; | |||||
| TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; | |||||
| TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; | |||||
| TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; | |||||
| #ifdef EXPRECISION | |||||
| TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; | |||||
| TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; | |||||
| #endif | |||||
| #endif | |||||
| #ifdef OPTERON | #ifdef OPTERON | ||||
| #ifdef DEBUG | #ifdef DEBUG | ||||
| @@ -0,0 +1 @@ | |||||
| include $(KERNELDIR)/KERNEL.PENRYN | |||||
| @@ -76,6 +76,12 @@ | |||||
| #define PREFETCHB prefetcht0 | #define PREFETCHB prefetcht0 | ||||
| #endif | #endif | ||||
| #ifdef SANDYBRIDGE | |||||
| #define PREFETCHSIZE (8 * 1 - 4) | |||||
| #define PREFETCHW prefetcht0 | |||||
| #define PREFETCHB prefetcht0 | |||||
| #endif | |||||
| #ifndef PREFETCH | #ifndef PREFETCH | ||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #endif | #endif | ||||
| @@ -69,6 +69,12 @@ | |||||
| #define PREFETCHB prefetcht0 | #define PREFETCHB prefetcht0 | ||||
| #endif | #endif | ||||
| #ifdef SANDYBRIDGE | |||||
| #define PREFETCHSIZE (16 * 1 - 8) | |||||
| #define PREFETCHW prefetcht0 | |||||
| #define PREFETCHB prefetcht0 | |||||
| #endif | |||||
| #ifndef PREFETCH | #ifndef PREFETCH | ||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #endif | #endif | ||||
| @@ -262,7 +268,7 @@ | |||||
| movaps -16 * SIZE(AA), %xmm0 | movaps -16 * SIZE(AA), %xmm0 | ||||
| addps %xmm2, %xmm7 | addps %xmm2, %xmm7 | ||||
| #ifndef NEHALEM | |||||
| #if !(defined(NEHALEM) || defined(SANDYBRIDGE)) | |||||
| PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) | PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) | ||||
| #endif | #endif | ||||
| pshufd $0x93, %xmm1, %xmm2 | pshufd $0x93, %xmm1, %xmm2 | ||||
| @@ -58,7 +58,7 @@ | |||||
| #define PREFETCHSIZE (16 * 4) | #define PREFETCHSIZE (16 * 4) | ||||
| #endif | #endif | ||||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) | |||||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #define PREFETCHW prefetcht0 | #define PREFETCHW prefetcht0 | ||||
| #define PREFETCHSIZE (16 * 7) | #define PREFETCHSIZE (16 * 7) | ||||
| @@ -45,7 +45,7 @@ | |||||
| #define PREFETCHSIZE (8 * 2) | #define PREFETCHSIZE (8 * 2) | ||||
| #endif | #endif | ||||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) | |||||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #define PREFETCHW prefetcht0 | #define PREFETCHW prefetcht0 | ||||
| #define PREFETCHSIZE (8 * 7) | #define PREFETCHSIZE (8 * 7) | ||||
| @@ -58,7 +58,7 @@ | |||||
| #define PREFETCHSIZE (16 * 4) | #define PREFETCHSIZE (16 * 4) | ||||
| #endif | #endif | ||||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) | |||||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #define PREFETCHW prefetcht0 | #define PREFETCHW prefetcht0 | ||||
| #define PREFETCHSIZE (16 * 7) | #define PREFETCHSIZE (16 * 7) | ||||
| @@ -45,7 +45,7 @@ | |||||
| #define PREFETCHSIZE (8 * 2) | #define PREFETCHSIZE (8 * 2) | ||||
| #endif | #endif | ||||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) | |||||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #define PREFETCHW prefetcht0 | #define PREFETCHW prefetcht0 | ||||
| #define PREFETCHSIZE (8 * 7) | #define PREFETCHSIZE (8 * 7) | ||||
| @@ -62,7 +62,7 @@ | |||||
| #define PREFETCHSIZE (8 * 21 + 4) | #define PREFETCHSIZE (8 * 21 + 4) | ||||
| #endif | #endif | ||||
| #ifdef NEHALEM | |||||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #define PREFETCHSIZE (8 * 21 + 4) | #define PREFETCHSIZE (8 * 21 + 4) | ||||
| #endif | #endif | ||||
| @@ -62,7 +62,7 @@ | |||||
| #define PREFETCHSIZE (8 * 21 + 4) | #define PREFETCHSIZE (8 * 21 + 4) | ||||
| #endif | #endif | ||||
| #ifdef NEHALEM | |||||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #define PREFETCHSIZE (8 * 21 + 4) | #define PREFETCHSIZE (8 * 21 + 4) | ||||
| #endif | #endif | ||||
| @@ -62,7 +62,7 @@ | |||||
| #define PREFETCHSIZE (8 * 21 + 4) | #define PREFETCHSIZE (8 * 21 + 4) | ||||
| #endif | #endif | ||||
| #ifdef NEHALEM | |||||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #define PREFETCHSIZE (8 * 21 + 4) | #define PREFETCHSIZE (8 * 21 + 4) | ||||
| #endif | #endif | ||||
| @@ -62,7 +62,7 @@ | |||||
| #define PREFETCHSIZE (8 * 21 + 4) | #define PREFETCHSIZE (8 * 21 + 4) | ||||
| #endif | #endif | ||||
| #ifdef NEHALEM | |||||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #define PREFETCHSIZE (8 * 21 + 4) | #define PREFETCHSIZE (8 * 21 + 4) | ||||
| #endif | #endif | ||||
| @@ -62,7 +62,7 @@ | |||||
| #define PREFETCHSIZE (8 * 21 + 4) | #define PREFETCHSIZE (8 * 21 + 4) | ||||
| #endif | #endif | ||||
| #ifdef NEHALEM | |||||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #define PREFETCHSIZE (8 * 21 + 4) | #define PREFETCHSIZE (8 * 21 + 4) | ||||
| #endif | #endif | ||||
| @@ -62,7 +62,7 @@ | |||||
| #define PREFETCHSIZE (8 * 21 + 4) | #define PREFETCHSIZE (8 * 21 + 4) | ||||
| #endif | #endif | ||||
| #ifdef NEHALEM | |||||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #define PREFETCHSIZE (8 * 21 + 4) | #define PREFETCHSIZE (8 * 21 + 4) | ||||
| #endif | #endif | ||||
| @@ -64,7 +64,7 @@ | |||||
| #define PREFETCHB prefetcht0 | #define PREFETCHB prefetcht0 | ||||
| #endif | #endif | ||||
| #ifdef NEHALEM | |||||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCHSIZE (8 * 1 - 4) | #define PREFETCHSIZE (8 * 1 - 4) | ||||
| #define PREFETCHW prefetcht0 | #define PREFETCHW prefetcht0 | ||||
| #define PREFETCHB prefetcht0 | #define PREFETCHB prefetcht0 | ||||
| @@ -64,7 +64,7 @@ | |||||
| #define PREFETCHB prefetcht0 | #define PREFETCHB prefetcht0 | ||||
| #endif | #endif | ||||
| #ifdef NEHALEM | |||||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCHSIZE (16 * 1 + 8) | #define PREFETCHSIZE (16 * 1 + 8) | ||||
| #define PREFETCHW prefetcht0 | #define PREFETCHW prefetcht0 | ||||
| #define PREFETCHB prefetcht0 | #define PREFETCHB prefetcht0 | ||||
| @@ -58,7 +58,7 @@ | |||||
| #define PREFETCHSIZE (16 * 2) | #define PREFETCHSIZE (16 * 2) | ||||
| #endif | #endif | ||||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) | |||||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #define PREFETCHW prefetcht0 | #define PREFETCHW prefetcht0 | ||||
| #define PREFETCHSIZE (16 * 7) | #define PREFETCHSIZE (16 * 7) | ||||
| @@ -45,7 +45,7 @@ | |||||
| #define PREFETCHSIZE (8 * 2) | #define PREFETCHSIZE (8 * 2) | ||||
| #endif | #endif | ||||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) | |||||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #define PREFETCHW prefetcht0 | #define PREFETCHW prefetcht0 | ||||
| #define PREFETCHSIZE (8 * 7) | #define PREFETCHSIZE (8 * 7) | ||||
| @@ -58,7 +58,7 @@ | |||||
| #define PREFETCHSIZE (16 * 2) | #define PREFETCHSIZE (16 * 2) | ||||
| #endif | #endif | ||||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) | |||||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #define PREFETCHW prefetcht0 | #define PREFETCHW prefetcht0 | ||||
| #define PREFETCHSIZE (16 * 7) | #define PREFETCHSIZE (16 * 7) | ||||
| @@ -45,7 +45,7 @@ | |||||
| #define PREFETCHSIZE (8 * 2) | #define PREFETCHSIZE (8 * 2) | ||||
| #endif | #endif | ||||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) | |||||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #define PREFETCHW prefetcht0 | #define PREFETCHW prefetcht0 | ||||
| #define PREFETCHSIZE (8 * 7) | #define PREFETCHSIZE (8 * 7) | ||||
| @@ -55,7 +55,7 @@ | |||||
| #define XX %edi | #define XX %edi | ||||
| #define FLAG %ebp | #define FLAG %ebp | ||||
| #if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) | |||||
| #if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) || defined(SANDYBRIDGE) | |||||
| #define USE_PSHUFD | #define USE_PSHUFD | ||||
| #else | #else | ||||
| #define USE_PSHUFD_HALF | #define USE_PSHUFD_HALF | ||||
| @@ -697,7 +697,7 @@ | |||||
| cmpl $2 * SIZE, INCX | cmpl $2 * SIZE, INCX | ||||
| jne .L120 | jne .L120 | ||||
| #if defined(ALIGNED_ACCESS) && !defined(NEHALEM) | |||||
| #if defined(ALIGNED_ACCESS) && !defined(NEHALEM) && !defined(SANDYBRIDGE) | |||||
| PSHUFD2($0, %xmm0, %xmm6) | PSHUFD2($0, %xmm0, %xmm6) | ||||
| PSHUFD2($0, %xmm1, %xmm1) | PSHUFD2($0, %xmm1, %xmm1) | ||||
| @@ -57,7 +57,7 @@ | |||||
| #include "l1param.h" | #include "l1param.h" | ||||
| #if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) | |||||
| #if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) || defined(SANDYBRIDGE) | |||||
| #define USE_PSHUFD | #define USE_PSHUFD | ||||
| #else | #else | ||||
| #define USE_PSHUFD_HALF | #define USE_PSHUFD_HALF | ||||
| @@ -860,7 +860,7 @@ | |||||
| cmpl $2 * SIZE, INCX | cmpl $2 * SIZE, INCX | ||||
| jne .L220 | jne .L220 | ||||
| #if defined(ALIGNED_ACCESS) && !defined(NEHALEM) | |||||
| #if defined(ALIGNED_ACCESS) && !defined(NEHALEM) && !defined(SANDYBRIDGE) | |||||
| #ifdef HAVE_SSE3 | #ifdef HAVE_SSE3 | ||||
| movddup %xmm0, %xmm6 | movddup %xmm0, %xmm6 | ||||
| @@ -61,7 +61,7 @@ | |||||
| #define PREFETCHSIZE 84 | #define PREFETCHSIZE 84 | ||||
| #endif | #endif | ||||
| #ifdef NEHALEM | |||||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht1 | #define PREFETCH prefetcht1 | ||||
| #define PREFETCHSIZE 84 | #define PREFETCHSIZE 84 | ||||
| #endif | #endif | ||||
| @@ -63,7 +63,7 @@ | |||||
| #define PREFETCHSIZE 84 | #define PREFETCHSIZE 84 | ||||
| #endif | #endif | ||||
| #ifdef NEHALEM | |||||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht1 | #define PREFETCH prefetcht1 | ||||
| #define PREFETCHSIZE 84 | #define PREFETCHSIZE 84 | ||||
| #endif | #endif | ||||
| @@ -61,7 +61,7 @@ | |||||
| #define PREFETCHSIZE 84 | #define PREFETCHSIZE 84 | ||||
| #endif | #endif | ||||
| #ifdef NEHALEM | |||||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht1 | #define PREFETCH prefetcht1 | ||||
| #define PREFETCHSIZE 84 | #define PREFETCHSIZE 84 | ||||
| #endif | #endif | ||||
| @@ -63,7 +63,7 @@ | |||||
| #define PREFETCHSIZE 84 | #define PREFETCHSIZE 84 | ||||
| #endif | #endif | ||||
| #ifdef NEHALEM | |||||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht1 | #define PREFETCH prefetcht1 | ||||
| #define PREFETCHSIZE 84 | #define PREFETCHSIZE 84 | ||||
| #endif | #endif | ||||
| @@ -61,7 +61,7 @@ | |||||
| #define PREFETCHSIZE 84 | #define PREFETCHSIZE 84 | ||||
| #endif | #endif | ||||
| #ifdef NEHALEM | |||||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht1 | #define PREFETCH prefetcht1 | ||||
| #define PREFETCHSIZE 84 | #define PREFETCHSIZE 84 | ||||
| #endif | #endif | ||||
| @@ -0,0 +1,59 @@ | |||||
| SGEMMKERNEL = gemm_kernel_4x8_nehalem.S | |||||
| SGEMMINCOPY = gemm_ncopy_4.S | |||||
| SGEMMITCOPY = gemm_tcopy_4.S | |||||
| SGEMMONCOPY = ../generic/gemm_ncopy_8.c | |||||
| SGEMMOTCOPY = ../generic/gemm_tcopy_8.c | |||||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMKERNEL = gemm_kernel_2x8_nehalem.S | |||||
| DGEMMINCOPY = dgemm_ncopy_2.S | |||||
| DGEMMITCOPY = dgemm_tcopy_2.S | |||||
| DGEMMONCOPY = dgemm_ncopy_8.S | |||||
| DGEMMOTCOPY = dgemm_tcopy_8.S | |||||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMKERNEL = zgemm_kernel_2x4_nehalem.S | |||||
| CGEMMINCOPY = zgemm_ncopy_2.S | |||||
| CGEMMITCOPY = zgemm_tcopy_2.S | |||||
| CGEMMONCOPY = ../generic/zgemm_ncopy_4.c | |||||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c | |||||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMKERNEL = zgemm_kernel_1x4_nehalem.S | |||||
| ZGEMMINCOPY = zgemm_ncopy_1.S | |||||
| ZGEMMITCOPY = zgemm_tcopy_1.S | |||||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c | |||||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c | |||||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| STRSMKERNEL_LN = trsm_kernel_LN_4x8_nehalem.S | |||||
| STRSMKERNEL_LT = trsm_kernel_LT_4x8_nehalem.S | |||||
| STRSMKERNEL_RN = trsm_kernel_LT_4x8_nehalem.S | |||||
| STRSMKERNEL_RT = trsm_kernel_RT_4x8_nehalem.S | |||||
| DTRSMKERNEL_LN = trsm_kernel_LN_2x8_nehalem.S | |||||
| DTRSMKERNEL_LT = trsm_kernel_LT_2x8_nehalem.S | |||||
| DTRSMKERNEL_RN = trsm_kernel_LT_2x8_nehalem.S | |||||
| DTRSMKERNEL_RT = trsm_kernel_RT_2x8_nehalem.S | |||||
| CTRSMKERNEL_LN = ztrsm_kernel_LN_2x4_nehalem.S | |||||
| CTRSMKERNEL_LT = ztrsm_kernel_LT_2x4_nehalem.S | |||||
| CTRSMKERNEL_RN = ztrsm_kernel_LT_2x4_nehalem.S | |||||
| CTRSMKERNEL_RT = ztrsm_kernel_RT_2x4_nehalem.S | |||||
| ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x4_nehalem.S | |||||
| ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x4_nehalem.S | |||||
| ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x4_nehalem.S | |||||
| ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x4_nehalem.S | |||||
| CGEMM3MKERNEL = zgemm3m_kernel_4x8_nehalem.S | |||||
| ZGEMM3MKERNEL = zgemm3m_kernel_2x8_nehalem.S | |||||
| @@ -45,6 +45,12 @@ | |||||
| #define PREFETCHW prefetcht0 | #define PREFETCHW prefetcht0 | ||||
| #endif | #endif | ||||
| #ifdef SANDYBRIDGE | |||||
| #define PREFETCHSIZE 16 | |||||
| #define PREFETCH prefetcht0 | |||||
| #define PREFETCHW prefetcht0 | |||||
| #endif | |||||
| #ifndef MOVAPS | #ifndef MOVAPS | ||||
| #define MOVAPS movaps | #define MOVAPS movaps | ||||
| #endif | #endif | ||||
| @@ -45,7 +45,7 @@ | |||||
| #define PREFETCHW prefetcht0 | #define PREFETCHW prefetcht0 | ||||
| #endif | #endif | ||||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) | |||||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCHSIZE 16 | #define PREFETCHSIZE 16 | ||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #define PREFETCHW prefetcht0 | #define PREFETCHW prefetcht0 | ||||
| @@ -45,6 +45,12 @@ | |||||
| #define PREFETCHW prefetcht0 | #define PREFETCHW prefetcht0 | ||||
| #endif | #endif | ||||
| #ifdef SANDYBRIDGE | |||||
| #define PREFETCHSIZE 12 | |||||
| #define PREFETCH prefetcht0 | |||||
| #define PREFETCHW prefetcht0 | |||||
| #endif | |||||
| #ifndef MOVAPS | #ifndef MOVAPS | ||||
| #define MOVAPS movaps | #define MOVAPS movaps | ||||
| #endif | #endif | ||||
| @@ -52,6 +52,13 @@ | |||||
| #define MOVUPS_A movups | #define MOVUPS_A movups | ||||
| #endif | #endif | ||||
| #ifdef SANDYBRIDGE | |||||
| #define PREFETCHSIZE 12 | |||||
| #define PREFETCH prefetcht0 | |||||
| #define PREFETCHW prefetcht0 | |||||
| #define MOVUPS_A movups | |||||
| #endif | |||||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) | #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) | ||||
| #define PREFETCHSIZE 16 | #define PREFETCHSIZE 16 | ||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| @@ -51,6 +51,12 @@ | |||||
| #define MOVUPS_A movups | #define MOVUPS_A movups | ||||
| #endif | #endif | ||||
| #ifdef SANDYBRIDGE | |||||
| #define PREFETCHSIZE 12 | |||||
| #define PREFETCH prefetcht0 | |||||
| #define MOVUPS_A movups | |||||
| #endif | |||||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) | #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) | ||||
| #define PREFETCHSIZE 16 | #define PREFETCHSIZE 16 | ||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| @@ -46,6 +46,13 @@ | |||||
| #define MOVUPS_A movups | #define MOVUPS_A movups | ||||
| #endif | #endif | ||||
| #ifdef SANDYBRIDGE | |||||
| #define PREFETCHSIZE 16 | |||||
| #define PREFETCH prefetcht0 | |||||
| #define PREFETCHW prefetcht0 | |||||
| #define MOVUPS_A movups | |||||
| #endif | |||||
| #ifdef MOVUPS_A | #ifdef MOVUPS_A | ||||
| #define MOVUPS_A1(OFF, ADDR, REGS) MOVUPS_A OFF(ADDR), REGS | #define MOVUPS_A1(OFF, ADDR, REGS) MOVUPS_A OFF(ADDR), REGS | ||||
| #define MOVUPS_A2(OFF, ADDR, BASE, SCALE, REGS) MOVUPS_A OFF(ADDR, BASE, SCALE), REGS | #define MOVUPS_A2(OFF, ADDR, BASE, SCALE, REGS) MOVUPS_A OFF(ADDR, BASE, SCALE), REGS | ||||
| @@ -46,6 +46,13 @@ | |||||
| #define PREFETCHW prefetcht0 | #define PREFETCHW prefetcht0 | ||||
| #endif | #endif | ||||
| #if defined(SANDYBRIDGE) | |||||
| #define RPREFETCHSIZE 12 | |||||
| #define WPREFETCHSIZE (RPREFETCHSIZE * 2) | |||||
| #define PREFETCH prefetcht0 | |||||
| #define PREFETCHW prefetcht0 | |||||
| #endif | |||||
| #ifndef WINDOWS_ABI | #ifndef WINDOWS_ABI | ||||
| #define M ARG1 /* rdi */ | #define M ARG1 /* rdi */ | ||||
| @@ -46,7 +46,7 @@ | |||||
| #define PREFETCHW prefetcht0 | #define PREFETCHW prefetcht0 | ||||
| #endif | #endif | ||||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) | |||||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define RPREFETCHSIZE 12 | #define RPREFETCHSIZE 12 | ||||
| #define WPREFETCHSIZE (RPREFETCHSIZE * 4) | #define WPREFETCHSIZE (RPREFETCHSIZE * 4) | ||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| @@ -46,6 +46,13 @@ | |||||
| #define PREFETCHW prefetcht0 | #define PREFETCHW prefetcht0 | ||||
| #endif | #endif | ||||
| #if defined(SANDYBRIDGE) | |||||
| #define RPREFETCHSIZE 12 | |||||
| #define WPREFETCHSIZE (RPREFETCHSIZE * 2) | |||||
| #define PREFETCH prefetcht0 | |||||
| #define PREFETCHW prefetcht0 | |||||
| #endif | |||||
| #ifndef WINDOWS_ABI | #ifndef WINDOWS_ABI | ||||
| #define M ARG1 /* rdi */ | #define M ARG1 /* rdi */ | ||||
| @@ -46,7 +46,7 @@ | |||||
| #define PREFETCHW prefetcht0 | #define PREFETCHW prefetcht0 | ||||
| #endif | #endif | ||||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) | |||||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define RPREFETCHSIZE 12 | #define RPREFETCHSIZE 12 | ||||
| #define WPREFETCHSIZE (RPREFETCHSIZE * 4) | #define WPREFETCHSIZE (RPREFETCHSIZE * 4) | ||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| @@ -57,7 +57,7 @@ | |||||
| #define PREFETCHSIZE (16 * 12) | #define PREFETCHSIZE (16 * 12) | ||||
| #endif | #endif | ||||
| #ifdef NEHALEM | |||||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #define PREFETCHW prefetcht0 | #define PREFETCHW prefetcht0 | ||||
| #define PREFETCHSIZE (16 * 12) | #define PREFETCHSIZE (16 * 12) | ||||
| @@ -57,7 +57,7 @@ | |||||
| #define PREFETCHSIZE (16 * 12) | #define PREFETCHSIZE (16 * 12) | ||||
| #endif | #endif | ||||
| #ifdef NEHALEM | |||||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #define PREFETCHW prefetcht0 | #define PREFETCHW prefetcht0 | ||||
| #define PREFETCHSIZE (16 * 12) | #define PREFETCHSIZE (16 * 12) | ||||
| @@ -57,7 +57,7 @@ | |||||
| #define PREFETCHSIZE (16 * 12) | #define PREFETCHSIZE (16 * 12) | ||||
| #endif | #endif | ||||
| #ifdef NEHALEM | |||||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #define PREFETCHW prefetcht0 | #define PREFETCHW prefetcht0 | ||||
| #define PREFETCHSIZE (16 * 12) | #define PREFETCHSIZE (16 * 12) | ||||
| @@ -57,7 +57,7 @@ | |||||
| #define PREFETCHSIZE (16 * 12) | #define PREFETCHSIZE (16 * 12) | ||||
| #endif | #endif | ||||
| #ifdef NEHALEM | |||||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #define PREFETCHW prefetcht0 | #define PREFETCHW prefetcht0 | ||||
| #define PREFETCHSIZE (16 * 24) | #define PREFETCHSIZE (16 * 24) | ||||
| @@ -685,7 +685,7 @@ | |||||
| cmpq $2 * SIZE, INCX | cmpq $2 * SIZE, INCX | ||||
| jne .L120 | jne .L120 | ||||
| #if defined(ALIGNED_ACCESS) && !defined(NEHALEM) | |||||
| #if defined(ALIGNED_ACCESS) && !defined(NEHALEM) && !defined(SANDYBRIDGE) | |||||
| pshufd $0, %xmm0, %xmm14 | pshufd $0, %xmm0, %xmm14 | ||||
| pshufd $0, %xmm1, %xmm1 | pshufd $0, %xmm1, %xmm1 | ||||
| @@ -55,7 +55,7 @@ | |||||
| #include "l1param.h" | #include "l1param.h" | ||||
| #if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) || defined(BARCELONA) || defined(NANO) | |||||
| #if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) || defined(BARCELONA) || defined(NANO) || defined(SANDYBRIDGE) | |||||
| #define USE_PSHUFD | #define USE_PSHUFD | ||||
| #else | #else | ||||
| #define USE_PSHUFD_HALF | #define USE_PSHUFD_HALF | ||||
| @@ -803,7 +803,7 @@ | |||||
| cmpq $2 * SIZE, INCX | cmpq $2 * SIZE, INCX | ||||
| jne .L220 | jne .L220 | ||||
| #if defined(ALIGNED_ACCESS) && !defined(NEHALEM) | |||||
| #if defined(ALIGNED_ACCESS) && !defined(NEHALEM) && !defined(SANDYBRIDGE) | |||||
| movddup %xmm0, %xmm14 | movddup %xmm0, %xmm14 | ||||
| pxor %xmm15, %xmm15 | pxor %xmm15, %xmm15 | ||||
| @@ -57,7 +57,7 @@ | |||||
| #define PREFETCHSIZE (16 * 24) | #define PREFETCHSIZE (16 * 24) | ||||
| #endif | #endif | ||||
| #ifdef NEHALEM | |||||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #define PREFETCHW prefetcht0 | #define PREFETCHW prefetcht0 | ||||
| #define PREFETCHSIZE (16 * 24) | #define PREFETCHSIZE (16 * 24) | ||||
| @@ -57,7 +57,7 @@ | |||||
| #define PREFETCHSIZE (16 * 24) | #define PREFETCHSIZE (16 * 24) | ||||
| #endif | #endif | ||||
| #ifdef NEHALEM | |||||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #define PREFETCHW prefetcht0 | #define PREFETCHW prefetcht0 | ||||
| #define PREFETCHSIZE (16 * 24) | #define PREFETCHSIZE (16 * 24) | ||||
| @@ -57,7 +57,7 @@ | |||||
| #define PREFETCHSIZE (16 * 24) | #define PREFETCHSIZE (16 * 24) | ||||
| #endif | #endif | ||||
| #ifdef NEHALEM | |||||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #define PREFETCHW prefetcht0 | #define PREFETCHW prefetcht0 | ||||
| #define PREFETCHSIZE (16 * 24) | #define PREFETCHSIZE (16 * 24) | ||||
| @@ -57,7 +57,7 @@ | |||||
| #define PREFETCHSIZE (16 * 24) | #define PREFETCHSIZE (16 * 24) | ||||
| #endif | #endif | ||||
| #ifdef NEHALEM | |||||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #define PREFETCHW prefetcht0 | #define PREFETCHW prefetcht0 | ||||
| #define PREFETCHSIZE (16 * 24) | #define PREFETCHSIZE (16 * 24) | ||||
| @@ -9,6 +9,13 @@ | |||||
| #define ALIGNED_ACCESS | #define ALIGNED_ACCESS | ||||
| #endif | #endif | ||||
| #ifdef SANDYBRIDGE | |||||
| #define PREFETCH prefetcht0 | |||||
| #define PREFETCHW prefetcht0 | |||||
| #define PREFETCHSIZE (128 * 12) | |||||
| #define ALIGNED_ACCESS | |||||
| #endif | |||||
| #ifdef ATHLON | #ifdef ATHLON | ||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
| @@ -63,6 +63,17 @@ | |||||
| #define PREFETCHSIZE 64 * 3 | #define PREFETCHSIZE 64 * 3 | ||||
| #endif | #endif | ||||
| #ifdef SANDYBRIDGE | |||||
| #define MOVUPS_A movups | |||||
| #define MOVUPS_XL movups | |||||
| #define MOVUPS_XS movups | |||||
| #define MOVUPS_YL movups | |||||
| #define MOVUPS_YS movups | |||||
| #define PREFETCH prefetcht0 | |||||
| #define PREFETCHW prefetcht0 | |||||
| #define PREFETCHSIZE 64 * 3 | |||||
| #endif | |||||
| #ifdef OPTERON | #ifdef OPTERON | ||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
| @@ -913,6 +913,80 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| #ifdef SANDYBRIDGE | |||||
| #define SNUMOPT 8 | |||||
| #define DNUMOPT 4 | |||||
| #define GEMM_DEFAULT_OFFSET_A 32 | |||||
| #define GEMM_DEFAULT_OFFSET_B 0 | |||||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||||
| #define SYMV_P 8 | |||||
| #define SWITCH_RATIO 4 | |||||
| #ifdef ARCH_X86 | |||||
| #define SGEMM_DEFAULT_UNROLL_M 4 | |||||
| #define DGEMM_DEFAULT_UNROLL_M 2 | |||||
| #define QGEMM_DEFAULT_UNROLL_M 2 | |||||
| #define CGEMM_DEFAULT_UNROLL_M 2 | |||||
| #define ZGEMM_DEFAULT_UNROLL_M 1 | |||||
| #define XGEMM_DEFAULT_UNROLL_M 1 | |||||
| #define SGEMM_DEFAULT_UNROLL_N 4 | |||||
| #define DGEMM_DEFAULT_UNROLL_N 4 | |||||
| #define QGEMM_DEFAULT_UNROLL_N 2 | |||||
| #define CGEMM_DEFAULT_UNROLL_N 2 | |||||
| #define ZGEMM_DEFAULT_UNROLL_N 2 | |||||
| #define XGEMM_DEFAULT_UNROLL_N 1 | |||||
| #else | |||||
| #define SGEMM_DEFAULT_UNROLL_M 4 | |||||
| #define DGEMM_DEFAULT_UNROLL_M 2 | |||||
| #define QGEMM_DEFAULT_UNROLL_M 2 | |||||
| #define CGEMM_DEFAULT_UNROLL_M 2 | |||||
| #define ZGEMM_DEFAULT_UNROLL_M 1 | |||||
| #define XGEMM_DEFAULT_UNROLL_M 1 | |||||
| #define SGEMM_DEFAULT_UNROLL_N 8 | |||||
| #define DGEMM_DEFAULT_UNROLL_N 8 | |||||
| #define QGEMM_DEFAULT_UNROLL_N 2 | |||||
| #define CGEMM_DEFAULT_UNROLL_N 4 | |||||
| #define ZGEMM_DEFAULT_UNROLL_N 4 | |||||
| #define XGEMM_DEFAULT_UNROLL_N 1 | |||||
| #endif | |||||
| #define SGEMM_DEFAULT_P 504 | |||||
| #define SGEMM_DEFAULT_R sgemm_r | |||||
| #define DGEMM_DEFAULT_P 504 | |||||
| #define DGEMM_DEFAULT_R dgemm_r | |||||
| #define QGEMM_DEFAULT_P 504 | |||||
| #define QGEMM_DEFAULT_R qgemm_r | |||||
| #define CGEMM_DEFAULT_P 252 | |||||
| #define CGEMM_DEFAULT_R cgemm_r | |||||
| #define ZGEMM_DEFAULT_P 252 | |||||
| #define ZGEMM_DEFAULT_R zgemm_r | |||||
| #define XGEMM_DEFAULT_P 252 | |||||
| #define XGEMM_DEFAULT_R xgemm_r | |||||
| #define SGEMM_DEFAULT_Q 512 | |||||
| #define DGEMM_DEFAULT_Q 256 | |||||
| #define QGEMM_DEFAULT_Q 128 | |||||
| #define CGEMM_DEFAULT_Q 512 | |||||
| #define ZGEMM_DEFAULT_Q 256 | |||||
| #define XGEMM_DEFAULT_Q 128 | |||||
| #define GETRF_FACTOR 0.72 | |||||
| #endif | |||||
| #ifdef ATOM | #ifdef ATOM | ||||
| #define SNUMOPT 2 | #define SNUMOPT 2 | ||||