Browse Source

Merge branch 'sandybridge' into develop

For now, the kernel code is simply copied from Nehalem. Sandy Bridge-specific optimization is ongoing.
tags/v0.2.0^2
Zhang Xianyi 13 years ago
parent
commit
90d6ad569d
57 changed files with 311 additions and 47 deletions
  1. +3
    -2
      Makefile.system
  2. +2
    -0
      TargetList.txt
  3. +3
    -0
      cpuid.h
  4. +11
    -4
      cpuid_x86.c
  5. +12
    -1
      driver/others/parameter.c
  6. +14
    -0
      getarch.c
  7. +16
    -0
      kernel/setparam-ref.c
  8. +1
    -0
      kernel/x86/KERNEL.SANDYBRIDGE
  9. +6
    -0
      kernel/x86/gemm_kernel_2x4_penryn.S
  10. +7
    -1
      kernel/x86/gemm_kernel_4x4_penryn.S
  11. +1
    -1
      kernel/x86/gemv_n_sse.S
  12. +1
    -1
      kernel/x86/gemv_n_sse2.S
  13. +1
    -1
      kernel/x86/gemv_t_sse.S
  14. +1
    -1
      kernel/x86/gemv_t_sse2.S
  15. +1
    -1
      kernel/x86/trsm_kernel_LN_2x4_penryn.S
  16. +1
    -1
      kernel/x86/trsm_kernel_LN_4x4_penryn.S
  17. +1
    -1
      kernel/x86/trsm_kernel_LT_2x4_penryn.S
  18. +1
    -1
      kernel/x86/trsm_kernel_LT_4x4_penryn.S
  19. +1
    -1
      kernel/x86/trsm_kernel_RT_2x4_penryn.S
  20. +1
    -1
      kernel/x86/trsm_kernel_RT_4x4_penryn.S
  21. +1
    -1
      kernel/x86/zgemm_kernel_1x2_penryn.S
  22. +1
    -1
      kernel/x86/zgemm_kernel_2x2_penryn.S
  23. +1
    -1
      kernel/x86/zgemv_n_sse.S
  24. +1
    -1
      kernel/x86/zgemv_n_sse2.S
  25. +1
    -1
      kernel/x86/zgemv_t_sse.S
  26. +1
    -1
      kernel/x86/zgemv_t_sse2.S
  27. +2
    -2
      kernel/x86/zscal_sse.S
  28. +2
    -2
      kernel/x86/zscal_sse2.S
  29. +1
    -1
      kernel/x86/ztrsm_kernel_LN_2x2_penryn.S
  30. +1
    -1
      kernel/x86/ztrsm_kernel_LT_1x2_penryn.S
  31. +1
    -1
      kernel/x86/ztrsm_kernel_LT_2x2_penryn.S
  32. +1
    -1
      kernel/x86/ztrsm_kernel_RT_1x2_penryn.S
  33. +1
    -1
      kernel/x86/ztrsm_kernel_RT_2x2_penryn.S
  34. +59
    -0
      kernel/x86_64/KERNEL.SANDYBRIDGE
  35. +6
    -0
      kernel/x86_64/dgemm_ncopy_2.S
  36. +1
    -1
      kernel/x86_64/dgemm_ncopy_4.S
  37. +6
    -0
      kernel/x86_64/dgemm_ncopy_8.S
  38. +7
    -0
      kernel/x86_64/dgemm_tcopy_2.S
  39. +6
    -0
      kernel/x86_64/dgemm_tcopy_4.S
  40. +7
    -0
      kernel/x86_64/dgemm_tcopy_8.S
  41. +7
    -0
      kernel/x86_64/gemm_ncopy_2.S
  42. +1
    -1
      kernel/x86_64/gemm_ncopy_4.S
  43. +7
    -0
      kernel/x86_64/gemm_tcopy_2.S
  44. +1
    -1
      kernel/x86_64/gemm_tcopy_4.S
  45. +1
    -1
      kernel/x86_64/symv_L_sse.S
  46. +1
    -1
      kernel/x86_64/symv_L_sse2.S
  47. +1
    -1
      kernel/x86_64/symv_U_sse.S
  48. +1
    -1
      kernel/x86_64/symv_U_sse2.S
  49. +1
    -1
      kernel/x86_64/zscal_sse.S
  50. +2
    -2
      kernel/x86_64/zscal_sse2.S
  51. +1
    -1
      kernel/x86_64/zsymv_L_sse.S
  52. +1
    -1
      kernel/x86_64/zsymv_L_sse2.S
  53. +1
    -1
      kernel/x86_64/zsymv_U_sse.S
  54. +1
    -1
      kernel/x86_64/zsymv_U_sse2.S
  55. +7
    -0
      l1param.h
  56. +11
    -0
      l2param.h
  57. +74
    -0
      param.h

+ 3
- 2
Makefile.system View File

@@ -247,11 +247,11 @@ endif
ifdef DYNAMIC_ARCH ifdef DYNAMIC_ARCH
ifeq ($(ARCH), x86) ifeq ($(ARCH), x86)
DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \
CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA ATOM NANO
CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE ATHLON OPTERON OPTERON_SSE3 BARCELONA ATOM NANO
endif endif


ifeq ($(ARCH), x86_64) ifeq ($(ARCH), x86_64)
DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA ATOM NANO
DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE OPTERON OPTERON_SSE3 BARCELONA ATOM NANO
endif endif


ifndef DYNAMIC_CORE ifndef DYNAMIC_CORE
@@ -770,6 +770,7 @@ export HAVE_SSE4_1
export HAVE_SSE4_2 export HAVE_SSE4_2
export HAVE_SSE4A export HAVE_SSE4A
export HAVE_SSE5 export HAVE_SSE5
export HAVE_AVX
export KERNELDIR export KERNELDIR
export FUNCTION_PROFILE export FUNCTION_PROFILE
export TARGET_CORE export TARGET_CORE


+ 2
- 0
TargetList.txt View File

@@ -18,6 +18,7 @@ CORE2
PENRYN PENRYN
DUNNINGTON DUNNINGTON
NEHALEM NEHALEM
SANDYBRIDGE
ATOM ATOM


b)AMD CPU: b)AMD CPU:
@@ -47,6 +48,7 @@ CELL
3.MIPS64 CPU: 3.MIPS64 CPU:
SICORTEX SICORTEX
LOONGSON3A LOONGSON3A
LOONGSON3B


4.IA64 CPU: 4.IA64 CPU:
ITANIUM2 ITANIUM2


+ 3
- 0
cpuid.h View File

@@ -103,6 +103,7 @@
#define CORE_NEHALEM 17 #define CORE_NEHALEM 17
#define CORE_ATOM 18 #define CORE_ATOM 18
#define CORE_NANO 19 #define CORE_NANO 19
#define CORE_SANDYBRIDGE 20


#define HAVE_SSE (1 << 0) #define HAVE_SSE (1 << 0)
#define HAVE_SSE2 (1 << 1) #define HAVE_SSE2 (1 << 1)
@@ -122,6 +123,7 @@
#define HAVE_MISALIGNSSE (1 << 15) #define HAVE_MISALIGNSSE (1 << 15)
#define HAVE_128BITFPU (1 << 16) #define HAVE_128BITFPU (1 << 16)
#define HAVE_FASTMOVU (1 << 17) #define HAVE_FASTMOVU (1 << 17)
#define HAVE_AVX (1 << 18)


#define CACHE_INFO_L1_I 1 #define CACHE_INFO_L1_I 1
#define CACHE_INFO_L1_D 2 #define CACHE_INFO_L1_D 2
@@ -188,4 +190,5 @@ typedef struct {
#define CPUTYPE_NSGEODE 41 #define CPUTYPE_NSGEODE 41
#define CPUTYPE_VIAC3 42 #define CPUTYPE_VIAC3 42
#define CPUTYPE_NANO 43 #define CPUTYPE_NANO 43
#define CPUTYPE_SANDYBRIDGE 44
#endif #endif

+ 11
- 4
cpuid_x86.c View File

@@ -189,6 +189,7 @@ int get_cputype(int gettype){
if ((ecx & (1 << 9)) != 0) feature |= HAVE_SSSE3; if ((ecx & (1 << 9)) != 0) feature |= HAVE_SSSE3;
if ((ecx & (1 << 19)) != 0) feature |= HAVE_SSE4_1; if ((ecx & (1 << 19)) != 0) feature |= HAVE_SSE4_1;
if ((ecx & (1 << 20)) != 0) feature |= HAVE_SSE4_2; if ((ecx & (1 << 20)) != 0) feature |= HAVE_SSE4_2;
if ((ecx & (1 << 28)) != 0) feature |= HAVE_AVX;


if (have_excpuid() >= 0x01) { if (have_excpuid() >= 0x01) {
cpuid(0x80000001, &eax, &ebx, &ecx, &edx); cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
@@ -983,13 +984,13 @@ int get_cpuname(void){
return CPUTYPE_NEHALEM; return CPUTYPE_NEHALEM;
case 10: case 10:
//Intel Core i5-2000 /i7-2000 (Sandy Bridge) //Intel Core i5-2000 /i7-2000 (Sandy Bridge)
return CPUTYPE_NEHALEM;
return CPUTYPE_SANDYBRIDGE;
case 12: case 12:
//Xeon Processor 5600 (Westmere-EP) //Xeon Processor 5600 (Westmere-EP)
return CPUTYPE_NEHALEM; return CPUTYPE_NEHALEM;
case 13: case 13:
//Intel Core i7-3000 / Xeon E5 (Sandy Bridge) //Intel Core i7-3000 / Xeon E5 (Sandy Bridge)
return CPUTYPE_NEHALEM;
return CPUTYPE_SANDYBRIDGE;
case 15: case 15:
//Xeon Processor E7 (Westmere-EX) //Xeon Processor E7 (Westmere-EX)
return CPUTYPE_NEHALEM; return CPUTYPE_NEHALEM;
@@ -1146,6 +1147,7 @@ static char *cpuname[] = {
"NSGEODE", "NSGEODE",
"VIAC3", "VIAC3",
"NANO", "NANO",
"SANDYBRIDGE",
}; };


static char *lowercpuname[] = { static char *lowercpuname[] = {
@@ -1192,6 +1194,7 @@ static char *lowercpuname[] = {
"tms3x00", "tms3x00",
"nsgeode", "nsgeode",
"nano", "nano",
"sandybridge",
}; };


static char *corename[] = { static char *corename[] = {
@@ -1215,6 +1218,7 @@ static char *corename[] = {
"NEHALEM", "NEHALEM",
"ATOM", "ATOM",
"NANO", "NANO",
"SANDYBRIDGE",
}; };


static char *corename_lower[] = { static char *corename_lower[] = {
@@ -1238,6 +1242,7 @@ static char *corename_lower[] = {
"nehalem", "nehalem",
"atom", "atom",
"nano", "nano",
"sandybridge",
}; };




@@ -1321,13 +1326,13 @@ int get_coretype(void){
return CORE_NEHALEM; return CORE_NEHALEM;
case 10: case 10:
//Intel Core i5-2000 /i7-2000 (Sandy Bridge) //Intel Core i5-2000 /i7-2000 (Sandy Bridge)
return CORE_NEHALEM;
return CORE_SANDYBRIDGE;
case 12: case 12:
//Xeon Processor 5600 (Westmere-EP) //Xeon Processor 5600 (Westmere-EP)
return CORE_NEHALEM; return CORE_NEHALEM;
case 13: case 13:
//Intel Core i7-3000 / Xeon E5 (Sandy Bridge) //Intel Core i7-3000 / Xeon E5 (Sandy Bridge)
return CORE_NEHALEM;
return CORE_SANDYBRIDGE;
case 15: case 15:
//Xeon Processor E7 (Westmere-EX) //Xeon Processor E7 (Westmere-EX)
return CORE_NEHALEM; return CORE_NEHALEM;
@@ -1426,6 +1431,7 @@ void get_cpuconfig(void){
if (features & HAVE_SSE4_2) printf("#define HAVE_SSE4_2\n"); if (features & HAVE_SSE4_2) printf("#define HAVE_SSE4_2\n");
if (features & HAVE_SSE4A) printf("#define HAVE_SSE4A\n"); if (features & HAVE_SSE4A) printf("#define HAVE_SSE4A\n");
if (features & HAVE_SSE5 ) printf("#define HAVE_SSSE5\n"); if (features & HAVE_SSE5 ) printf("#define HAVE_SSSE5\n");
if (features & HAVE_AVX ) printf("#define HAVE_AVX\n");
if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n"); if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n");
if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n"); if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n");
if (features & HAVE_CFLUSH) printf("#define HAVE_CFLUSH\n"); if (features & HAVE_CFLUSH) printf("#define HAVE_CFLUSH\n");
@@ -1491,6 +1497,7 @@ void get_sse(void){
if (features & HAVE_SSE4_2) printf("HAVE_SSE4_2=1\n"); if (features & HAVE_SSE4_2) printf("HAVE_SSE4_2=1\n");
if (features & HAVE_SSE4A) printf("HAVE_SSE4A=1\n"); if (features & HAVE_SSE4A) printf("HAVE_SSE4A=1\n");
if (features & HAVE_SSE5 ) printf("HAVE_SSSE5=1\n"); if (features & HAVE_SSE5 ) printf("HAVE_SSSE5=1\n");
if (features & HAVE_AVX ) printf("HAVE_AVX=1\n");
if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n"); if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n");
if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n"); if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n");




+ 12
- 1
driver/others/parameter.c View File

@@ -165,7 +165,7 @@ int get_L2_size(void){


#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || \ #if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || \
defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \
defined(CORE_NEHALEM) || defined(ATOM) || defined(GENERIC)
defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC)


cpuid(0x80000006, &eax, &ebx, &ecx, &edx); cpuid(0x80000006, &eax, &ebx, &ecx, &edx);


@@ -384,6 +384,17 @@ void blas_set_parameter(void){
#endif #endif
#endif #endif


/*
 * SANDYBRIDGE builds: assign fixed values to the sgemm_p / dgemm_p /
 * cgemm_p / zgemm_p parameters, plus qgemm_p / xgemm_p when EXPRECISION
 * is defined.
 * NOTE(review): these values presumably mirror the Nehalem settings
 * (the kernels are reused from Nehalem) -- confirm before retuning.
 */
#if defined(SANDYBRIDGE)
sgemm_p = 1024;
dgemm_p = 512;
cgemm_p = 512;
zgemm_p = 256;
#ifdef EXPRECISION
qgemm_p = 256;
xgemm_p = 128;
#endif
#endif

#if defined(CORE_PRESCOTT) || defined(GENERIC) #if defined(CORE_PRESCOTT) || defined(GENERIC)
size >>= 6; size >>= 6;




+ 14
- 0
getarch.c View File

@@ -278,6 +278,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "NEHALEM" #define CORENAME "NEHALEM"
#endif #endif


/*
 * Forced-target description for Sandy Bridge, active when
 * FORCE_SANDYBRIDGE is defined.  It defines FORCE and FORCE_INTEL and
 * supplies the architecture strings plus the ARCHCONFIG flag string:
 *   - L1 data cache: 32768 bytes, 64-byte lines
 *   - L2 cache:      262144 bytes, 64-byte lines
 *   - DTB:           64 default entries, size 4096
 *   - feature flags: CMOV, MMX, SSE through SSE4.2, and AVX
 * ARCHCONFIG is built from adjacent string literals; every piece except
 * the last ends in a space so the flags stay separated once joined.
 */
#ifdef FORCE_SANDYBRIDGE
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE "X86"
#define SUBARCHITECTURE "SANDYBRIDGE"
#define ARCHCONFIG "-DSANDYBRIDGE " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX"
#define LIBNAME "sandybridge"
#define CORENAME "SANDYBRIDGE"
#endif

#ifdef FORCE_ATOM #ifdef FORCE_ATOM
#define FORCE #define FORCE
#define FORCE_INTEL #define FORCE_INTEL


+ 16
- 0
kernel/setparam-ref.c View File

@@ -746,6 +746,22 @@ static void init_parameter(void) {
#endif #endif
#endif #endif


/*
 * SANDYBRIDGE: fill the *gemm_p fields of TABLE_NAME from the
 * corresponding compile-time *GEMM_DEFAULT_P macros.  The q/x fields are
 * only set when EXPRECISION is defined.
 */
#ifdef SANDYBRIDGE

#ifdef DEBUG
/* Trace to stderr which target's parameter branch was compiled in. */
fprintf(stderr, "Sandybridge\n");
#endif

TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
#ifdef EXPRECISION
TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
#endif
#endif

#ifdef OPTERON #ifdef OPTERON


#ifdef DEBUG #ifdef DEBUG


+ 1
- 0
kernel/x86/KERNEL.SANDYBRIDGE View File

@@ -0,0 +1 @@
# 32-bit x86 SANDYBRIDGE has no kernel list of its own; it pulls in the
# PENRYN kernel selection unchanged.
include $(KERNELDIR)/KERNEL.PENRYN

+ 6
- 0
kernel/x86/gemm_kernel_2x4_penryn.S View File

@@ -76,6 +76,12 @@
#define PREFETCHB prefetcht0 #define PREFETCHB prefetcht0
#endif #endif


/*
 * SANDYBRIDGE prefetch settings for this kernel: prefetch distance
 * PREFETCHSIZE, with prefetcht0 used for both PREFETCHW and PREFETCHB.
 * PREFETCH itself is not set here and falls through to the #ifndef
 * default that follows.
 */
#ifdef SANDYBRIDGE
#define PREFETCHSIZE (8 * 1 - 4)
#define PREFETCHW prefetcht0
#define PREFETCHB prefetcht0
#endif

#ifndef PREFETCH #ifndef PREFETCH
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#endif #endif


+ 7
- 1
kernel/x86/gemm_kernel_4x4_penryn.S View File

@@ -69,6 +69,12 @@
#define PREFETCHB prefetcht0 #define PREFETCHB prefetcht0
#endif #endif


/*
 * SANDYBRIDGE prefetch settings for this kernel: prefetch distance
 * PREFETCHSIZE, with prefetcht0 used for both PREFETCHW and PREFETCHB.
 * PREFETCH itself is not set here and falls through to the #ifndef
 * default that follows.
 */
#ifdef SANDYBRIDGE
#define PREFETCHSIZE (16 * 1 - 8)
#define PREFETCHW prefetcht0
#define PREFETCHB prefetcht0
#endif

#ifndef PREFETCH #ifndef PREFETCH
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#endif #endif
@@ -262,7 +268,7 @@
movaps -16 * SIZE(AA), %xmm0 movaps -16 * SIZE(AA), %xmm0


addps %xmm2, %xmm7 addps %xmm2, %xmm7
#ifndef NEHALEM
#if !(defined(NEHALEM) || defined(SANDYBRIDGE))
PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) PREFETCH (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
pshufd $0x93, %xmm1, %xmm2 pshufd $0x93, %xmm1, %xmm2


+ 1
- 1
kernel/x86/gemv_n_sse.S View File

@@ -58,7 +58,7 @@
#define PREFETCHSIZE (16 * 4) #define PREFETCHSIZE (16 * 4)
#endif #endif


#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM)
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 7) #define PREFETCHSIZE (16 * 7)


+ 1
- 1
kernel/x86/gemv_n_sse2.S View File

@@ -45,7 +45,7 @@
#define PREFETCHSIZE (8 * 2) #define PREFETCHSIZE (8 * 2)
#endif #endif


#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM)
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#define PREFETCHSIZE (8 * 7) #define PREFETCHSIZE (8 * 7)


+ 1
- 1
kernel/x86/gemv_t_sse.S View File

@@ -58,7 +58,7 @@
#define PREFETCHSIZE (16 * 4) #define PREFETCHSIZE (16 * 4)
#endif #endif


#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM)
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 7) #define PREFETCHSIZE (16 * 7)


+ 1
- 1
kernel/x86/gemv_t_sse2.S View File

@@ -45,7 +45,7 @@
#define PREFETCHSIZE (8 * 2) #define PREFETCHSIZE (8 * 2)
#endif #endif


#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM)
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#define PREFETCHSIZE (8 * 7) #define PREFETCHSIZE (8 * 7)


+ 1
- 1
kernel/x86/trsm_kernel_LN_2x4_penryn.S View File

@@ -62,7 +62,7 @@
#define PREFETCHSIZE (8 * 21 + 4) #define PREFETCHSIZE (8 * 21 + 4)
#endif #endif


#ifdef NEHALEM
#if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHSIZE (8 * 21 + 4) #define PREFETCHSIZE (8 * 21 + 4)
#endif #endif


+ 1
- 1
kernel/x86/trsm_kernel_LN_4x4_penryn.S View File

@@ -62,7 +62,7 @@
#define PREFETCHSIZE (8 * 21 + 4) #define PREFETCHSIZE (8 * 21 + 4)
#endif #endif


#ifdef NEHALEM
#if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHSIZE (8 * 21 + 4) #define PREFETCHSIZE (8 * 21 + 4)
#endif #endif


+ 1
- 1
kernel/x86/trsm_kernel_LT_2x4_penryn.S View File

@@ -62,7 +62,7 @@
#define PREFETCHSIZE (8 * 21 + 4) #define PREFETCHSIZE (8 * 21 + 4)
#endif #endif


#ifdef NEHALEM
#if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHSIZE (8 * 21 + 4) #define PREFETCHSIZE (8 * 21 + 4)
#endif #endif


+ 1
- 1
kernel/x86/trsm_kernel_LT_4x4_penryn.S View File

@@ -62,7 +62,7 @@
#define PREFETCHSIZE (8 * 21 + 4) #define PREFETCHSIZE (8 * 21 + 4)
#endif #endif


#ifdef NEHALEM
#if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHSIZE (8 * 21 + 4) #define PREFETCHSIZE (8 * 21 + 4)
#endif #endif


+ 1
- 1
kernel/x86/trsm_kernel_RT_2x4_penryn.S View File

@@ -62,7 +62,7 @@
#define PREFETCHSIZE (8 * 21 + 4) #define PREFETCHSIZE (8 * 21 + 4)
#endif #endif


#ifdef NEHALEM
#if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHSIZE (8 * 21 + 4) #define PREFETCHSIZE (8 * 21 + 4)
#endif #endif


+ 1
- 1
kernel/x86/trsm_kernel_RT_4x4_penryn.S View File

@@ -62,7 +62,7 @@
#define PREFETCHSIZE (8 * 21 + 4) #define PREFETCHSIZE (8 * 21 + 4)
#endif #endif


#ifdef NEHALEM
#if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHSIZE (8 * 21 + 4) #define PREFETCHSIZE (8 * 21 + 4)
#endif #endif


+ 1
- 1
kernel/x86/zgemm_kernel_1x2_penryn.S View File

@@ -64,7 +64,7 @@
#define PREFETCHB prefetcht0 #define PREFETCHB prefetcht0
#endif #endif


#ifdef NEHALEM
#if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCHSIZE (8 * 1 - 4) #define PREFETCHSIZE (8 * 1 - 4)
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#define PREFETCHB prefetcht0 #define PREFETCHB prefetcht0


+ 1
- 1
kernel/x86/zgemm_kernel_2x2_penryn.S View File

@@ -64,7 +64,7 @@
#define PREFETCHB prefetcht0 #define PREFETCHB prefetcht0
#endif #endif


#ifdef NEHALEM
#if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCHSIZE (16 * 1 + 8) #define PREFETCHSIZE (16 * 1 + 8)
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#define PREFETCHB prefetcht0 #define PREFETCHB prefetcht0


+ 1
- 1
kernel/x86/zgemv_n_sse.S View File

@@ -58,7 +58,7 @@
#define PREFETCHSIZE (16 * 2) #define PREFETCHSIZE (16 * 2)
#endif #endif


#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM)
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 7) #define PREFETCHSIZE (16 * 7)


+ 1
- 1
kernel/x86/zgemv_n_sse2.S View File

@@ -45,7 +45,7 @@
#define PREFETCHSIZE (8 * 2) #define PREFETCHSIZE (8 * 2)
#endif #endif


#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM)
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#define PREFETCHSIZE (8 * 7) #define PREFETCHSIZE (8 * 7)


+ 1
- 1
kernel/x86/zgemv_t_sse.S View File

@@ -58,7 +58,7 @@
#define PREFETCHSIZE (16 * 2) #define PREFETCHSIZE (16 * 2)
#endif #endif


#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM)
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 7) #define PREFETCHSIZE (16 * 7)


+ 1
- 1
kernel/x86/zgemv_t_sse2.S View File

@@ -45,7 +45,7 @@
#define PREFETCHSIZE (8 * 2) #define PREFETCHSIZE (8 * 2)
#endif #endif


#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM)
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#define PREFETCHSIZE (8 * 7) #define PREFETCHSIZE (8 * 7)


+ 2
- 2
kernel/x86/zscal_sse.S View File

@@ -55,7 +55,7 @@
#define XX %edi #define XX %edi
#define FLAG %ebp #define FLAG %ebp


#if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON)
#if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) || defined(SANDYBRIDGE)
#define USE_PSHUFD #define USE_PSHUFD
#else #else
#define USE_PSHUFD_HALF #define USE_PSHUFD_HALF
@@ -697,7 +697,7 @@
cmpl $2 * SIZE, INCX cmpl $2 * SIZE, INCX
jne .L120 jne .L120


#if defined(ALIGNED_ACCESS) && !defined(NEHALEM)
#if defined(ALIGNED_ACCESS) && !defined(NEHALEM) && !defined(SANDYBRIDGE)


PSHUFD2($0, %xmm0, %xmm6) PSHUFD2($0, %xmm0, %xmm6)
PSHUFD2($0, %xmm1, %xmm1) PSHUFD2($0, %xmm1, %xmm1)


+ 2
- 2
kernel/x86/zscal_sse2.S View File

@@ -57,7 +57,7 @@


#include "l1param.h" #include "l1param.h"


#if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON)
#if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) || defined(SANDYBRIDGE)
#define USE_PSHUFD #define USE_PSHUFD
#else #else
#define USE_PSHUFD_HALF #define USE_PSHUFD_HALF
@@ -860,7 +860,7 @@
cmpl $2 * SIZE, INCX cmpl $2 * SIZE, INCX
jne .L220 jne .L220


#if defined(ALIGNED_ACCESS) && !defined(NEHALEM)
#if defined(ALIGNED_ACCESS) && !defined(NEHALEM) && !defined(SANDYBRIDGE)


#ifdef HAVE_SSE3 #ifdef HAVE_SSE3
movddup %xmm0, %xmm6 movddup %xmm0, %xmm6


+ 1
- 1
kernel/x86/ztrsm_kernel_LN_2x2_penryn.S View File

@@ -61,7 +61,7 @@
#define PREFETCHSIZE 84 #define PREFETCHSIZE 84
#endif #endif


#ifdef NEHALEM
#if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht1 #define PREFETCH prefetcht1
#define PREFETCHSIZE 84 #define PREFETCHSIZE 84
#endif #endif


+ 1
- 1
kernel/x86/ztrsm_kernel_LT_1x2_penryn.S View File

@@ -63,7 +63,7 @@
#define PREFETCHSIZE 84 #define PREFETCHSIZE 84
#endif #endif


#ifdef NEHALEM
#if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht1 #define PREFETCH prefetcht1
#define PREFETCHSIZE 84 #define PREFETCHSIZE 84
#endif #endif


+ 1
- 1
kernel/x86/ztrsm_kernel_LT_2x2_penryn.S View File

@@ -61,7 +61,7 @@
#define PREFETCHSIZE 84 #define PREFETCHSIZE 84
#endif #endif


#ifdef NEHALEM
#if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht1 #define PREFETCH prefetcht1
#define PREFETCHSIZE 84 #define PREFETCHSIZE 84
#endif #endif


+ 1
- 1
kernel/x86/ztrsm_kernel_RT_1x2_penryn.S View File

@@ -63,7 +63,7 @@
#define PREFETCHSIZE 84 #define PREFETCHSIZE 84
#endif #endif


#ifdef NEHALEM
#if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht1 #define PREFETCH prefetcht1
#define PREFETCHSIZE 84 #define PREFETCHSIZE 84
#endif #endif


+ 1
- 1
kernel/x86/ztrsm_kernel_RT_2x2_penryn.S View File

@@ -61,7 +61,7 @@
#define PREFETCHSIZE 84 #define PREFETCHSIZE 84
#endif #endif


#ifdef NEHALEM
#if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht1 #define PREFETCH prefetcht1
#define PREFETCHSIZE 84 #define PREFETCHSIZE 84
#endif #endif


+ 59
- 0
kernel/x86_64/KERNEL.SANDYBRIDGE View File

@@ -0,0 +1,59 @@
# Kernel source selection for the SANDYBRIDGE target on x86_64.
# Every target-specific assembly kernel named below is a *_nehalem.S file;
# no Sandy Bridge-specific kernel source is referenced here.

# GEMM compute kernels and their inner (I) / outer (O) copy routines,
# listed per precision prefix (S, D, C, Z), followed by the object names
# the copy routines are built into.
SGEMMKERNEL = gemm_kernel_4x8_nehalem.S
SGEMMINCOPY = gemm_ncopy_4.S
SGEMMITCOPY = gemm_tcopy_4.S
SGEMMONCOPY = ../generic/gemm_ncopy_8.c
SGEMMOTCOPY = ../generic/gemm_tcopy_8.c
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = gemm_kernel_2x8_nehalem.S
DGEMMINCOPY = dgemm_ncopy_2.S
DGEMMITCOPY = dgemm_tcopy_2.S
DGEMMONCOPY = dgemm_ncopy_8.S
DGEMMOTCOPY = dgemm_tcopy_8.S
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMMKERNEL = zgemm_kernel_2x4_nehalem.S
CGEMMINCOPY = zgemm_ncopy_2.S
CGEMMITCOPY = zgemm_tcopy_2.S
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZGEMMKERNEL = zgemm_kernel_1x4_nehalem.S
ZGEMMINCOPY = zgemm_ncopy_1.S
ZGEMMITCOPY = zgemm_tcopy_1.S
ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)

# TRSM kernels. In every precision the RN variant is built from the same
# source file as the LT variant.
STRSMKERNEL_LN = trsm_kernel_LN_4x8_nehalem.S
STRSMKERNEL_LT = trsm_kernel_LT_4x8_nehalem.S
STRSMKERNEL_RN = trsm_kernel_LT_4x8_nehalem.S
STRSMKERNEL_RT = trsm_kernel_RT_4x8_nehalem.S

DTRSMKERNEL_LN = trsm_kernel_LN_2x8_nehalem.S
DTRSMKERNEL_LT = trsm_kernel_LT_2x8_nehalem.S
DTRSMKERNEL_RN = trsm_kernel_LT_2x8_nehalem.S
DTRSMKERNEL_RT = trsm_kernel_RT_2x8_nehalem.S

CTRSMKERNEL_LN = ztrsm_kernel_LN_2x4_nehalem.S
CTRSMKERNEL_LT = ztrsm_kernel_LT_2x4_nehalem.S
CTRSMKERNEL_RN = ztrsm_kernel_LT_2x4_nehalem.S
CTRSMKERNEL_RT = ztrsm_kernel_RT_2x4_nehalem.S

# NOTE(review): ZTRSMKERNEL_LN points at the LT source, unlike S/D/C which
# name a dedicated LN file -- presumably the 1x4 LT source also covers LN;
# confirm against the Nehalem kernel list.
ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x4_nehalem.S
ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x4_nehalem.S
ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x4_nehalem.S
ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x4_nehalem.S

# GEMM3M kernels for the C and Z prefixes.
CGEMM3MKERNEL = zgemm3m_kernel_4x8_nehalem.S
ZGEMM3MKERNEL = zgemm3m_kernel_2x8_nehalem.S

+ 6
- 0
kernel/x86_64/dgemm_ncopy_2.S View File

@@ -45,6 +45,12 @@
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#endif #endif


/*
 * SANDYBRIDGE prefetch settings for this copy routine: distance
 * PREFETCHSIZE, with prefetcht0 used for both PREFETCH and PREFETCHW.
 */
#ifdef SANDYBRIDGE
#define PREFETCHSIZE 16
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#endif

#ifndef MOVAPS #ifndef MOVAPS
#define MOVAPS movaps #define MOVAPS movaps
#endif #endif


+ 1
- 1
kernel/x86_64/dgemm_ncopy_4.S View File

@@ -45,7 +45,7 @@
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#endif #endif


#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM)
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCHSIZE 16 #define PREFETCHSIZE 16
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0


+ 6
- 0
kernel/x86_64/dgemm_ncopy_8.S View File

@@ -45,6 +45,12 @@
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#endif #endif


/*
 * SANDYBRIDGE prefetch settings for this copy routine: distance
 * PREFETCHSIZE, with prefetcht0 used for both PREFETCH and PREFETCHW.
 */
#ifdef SANDYBRIDGE
#define PREFETCHSIZE 12
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#endif

#ifndef MOVAPS #ifndef MOVAPS
#define MOVAPS movaps #define MOVAPS movaps
#endif #endif


+ 7
- 0
kernel/x86_64/dgemm_tcopy_2.S View File

@@ -52,6 +52,13 @@
#define MOVUPS_A movups #define MOVUPS_A movups
#endif #endif


/*
 * SANDYBRIDGE settings for this copy routine: prefetch distance
 * PREFETCHSIZE, prefetcht0 for both PREFETCH and PREFETCHW, and movups
 * as the MOVUPS_A instruction.
 */
#ifdef SANDYBRIDGE
#define PREFETCHSIZE 12
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#define MOVUPS_A movups
#endif

#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON)
#define PREFETCHSIZE 16 #define PREFETCHSIZE 16
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0


+ 6
- 0
kernel/x86_64/dgemm_tcopy_4.S View File

@@ -51,6 +51,12 @@
#define MOVUPS_A movups #define MOVUPS_A movups
#endif #endif


/*
 * SANDYBRIDGE settings for this copy routine: prefetch distance
 * PREFETCHSIZE, prefetcht0 for PREFETCH, and movups as the MOVUPS_A
 * instruction.
 * NOTE(review): PREFETCHW is not defined in this block -- confirm that
 * this file does not need it for SANDYBRIDGE.
 */
#ifdef SANDYBRIDGE
#define PREFETCHSIZE 12
#define PREFETCH prefetcht0
#define MOVUPS_A movups
#endif

#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON)
#define PREFETCHSIZE 16 #define PREFETCHSIZE 16
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0


+ 7
- 0
kernel/x86_64/dgemm_tcopy_8.S View File

@@ -46,6 +46,13 @@
#define MOVUPS_A movups #define MOVUPS_A movups
#endif #endif


/*
 * SANDYBRIDGE settings for this copy routine: prefetch distance
 * PREFETCHSIZE and prefetcht0 for both PREFETCH and PREFETCHW.
 * Defining MOVUPS_A as movups selects the single-instruction
 * MOVUPS_A1 / MOVUPS_A2 forms in the #ifdef MOVUPS_A section that
 * follows.
 */
#ifdef SANDYBRIDGE
#define PREFETCHSIZE 16
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#define MOVUPS_A movups
#endif

#ifdef MOVUPS_A #ifdef MOVUPS_A
#define MOVUPS_A1(OFF, ADDR, REGS) MOVUPS_A OFF(ADDR), REGS #define MOVUPS_A1(OFF, ADDR, REGS) MOVUPS_A OFF(ADDR), REGS
#define MOVUPS_A2(OFF, ADDR, BASE, SCALE, REGS) MOVUPS_A OFF(ADDR, BASE, SCALE), REGS #define MOVUPS_A2(OFF, ADDR, BASE, SCALE, REGS) MOVUPS_A OFF(ADDR, BASE, SCALE), REGS


+ 7
- 0
kernel/x86_64/gemm_ncopy_2.S View File

@@ -46,6 +46,13 @@
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#endif #endif


/*
 * SANDYBRIDGE prefetch settings for this copy routine.  WPREFETCHSIZE is
 * derived as twice RPREFETCHSIZE (presumably the write-side and read-side
 * prefetch distances -- confirm against their uses in the routine body).
 * prefetcht0 is used for both PREFETCH and PREFETCHW.
 */
#if defined(SANDYBRIDGE)
#define RPREFETCHSIZE 12
#define WPREFETCHSIZE (RPREFETCHSIZE * 2)
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#endif

#ifndef WINDOWS_ABI #ifndef WINDOWS_ABI


#define M ARG1 /* rdi */ #define M ARG1 /* rdi */


+ 1
- 1
kernel/x86_64/gemm_ncopy_4.S View File

@@ -46,7 +46,7 @@
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#endif #endif


#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM)
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE)
#define RPREFETCHSIZE 12 #define RPREFETCHSIZE 12
#define WPREFETCHSIZE (RPREFETCHSIZE * 4) #define WPREFETCHSIZE (RPREFETCHSIZE * 4)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0


+ 7
- 0
kernel/x86_64/gemm_tcopy_2.S View File

@@ -46,6 +46,13 @@
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#endif #endif


/*
 * SANDYBRIDGE prefetch settings for this copy routine.  WPREFETCHSIZE is
 * derived as twice RPREFETCHSIZE (presumably the write-side and read-side
 * prefetch distances -- confirm against their uses in the routine body).
 * prefetcht0 is used for both PREFETCH and PREFETCHW.
 */
#if defined(SANDYBRIDGE)
#define RPREFETCHSIZE 12
#define WPREFETCHSIZE (RPREFETCHSIZE * 2)
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#endif

#ifndef WINDOWS_ABI #ifndef WINDOWS_ABI


#define M ARG1 /* rdi */ #define M ARG1 /* rdi */


+ 1
- 1
kernel/x86_64/gemm_tcopy_4.S View File

@@ -46,7 +46,7 @@
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#endif #endif


#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM)
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE)
#define RPREFETCHSIZE 12 #define RPREFETCHSIZE 12
#define WPREFETCHSIZE (RPREFETCHSIZE * 4) #define WPREFETCHSIZE (RPREFETCHSIZE * 4)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0


+ 1
- 1
kernel/x86_64/symv_L_sse.S View File

@@ -57,7 +57,7 @@
#define PREFETCHSIZE (16 * 12) #define PREFETCHSIZE (16 * 12)
#endif #endif


#ifdef NEHALEM
#if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 12) #define PREFETCHSIZE (16 * 12)


+ 1
- 1
kernel/x86_64/symv_L_sse2.S View File

@@ -57,7 +57,7 @@
#define PREFETCHSIZE (16 * 12) #define PREFETCHSIZE (16 * 12)
#endif #endif


#ifdef NEHALEM
#if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 12) #define PREFETCHSIZE (16 * 12)


+ 1
- 1
kernel/x86_64/symv_U_sse.S View File

@@ -57,7 +57,7 @@
#define PREFETCHSIZE (16 * 12) #define PREFETCHSIZE (16 * 12)
#endif #endif


#ifdef NEHALEM
#if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 12) #define PREFETCHSIZE (16 * 12)


+ 1
- 1
kernel/x86_64/symv_U_sse2.S View File

@@ -57,7 +57,7 @@
#define PREFETCHSIZE (16 * 12) #define PREFETCHSIZE (16 * 12)
#endif #endif


#ifdef NEHALEM
#if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 24) #define PREFETCHSIZE (16 * 24)


+ 1
- 1
kernel/x86_64/zscal_sse.S View File

@@ -685,7 +685,7 @@
cmpq $2 * SIZE, INCX cmpq $2 * SIZE, INCX
jne .L120 jne .L120


#if defined(ALIGNED_ACCESS) && !defined(NEHALEM)
#if defined(ALIGNED_ACCESS) && !defined(NEHALEM) && !defined(SANDYBRIDGE)


pshufd $0, %xmm0, %xmm14 pshufd $0, %xmm0, %xmm14
pshufd $0, %xmm1, %xmm1 pshufd $0, %xmm1, %xmm1


+ 2
- 2
kernel/x86_64/zscal_sse2.S View File

@@ -55,7 +55,7 @@


#include "l1param.h" #include "l1param.h"


#if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) || defined(BARCELONA) || defined(NANO)
#if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) || defined(BARCELONA) || defined(NANO) || defined(SANDYBRIDGE)
#define USE_PSHUFD #define USE_PSHUFD
#else #else
#define USE_PSHUFD_HALF #define USE_PSHUFD_HALF
@@ -803,7 +803,7 @@
cmpq $2 * SIZE, INCX cmpq $2 * SIZE, INCX
jne .L220 jne .L220


#if defined(ALIGNED_ACCESS) && !defined(NEHALEM)
#if defined(ALIGNED_ACCESS) && !defined(NEHALEM) && !defined(SANDYBRIDGE)
movddup %xmm0, %xmm14 movddup %xmm0, %xmm14
pxor %xmm15, %xmm15 pxor %xmm15, %xmm15


+ 1
- 1
kernel/x86_64/zsymv_L_sse.S View File

@@ -57,7 +57,7 @@
#define PREFETCHSIZE (16 * 24) #define PREFETCHSIZE (16 * 24)
#endif #endif


#ifdef NEHALEM
#if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 24) #define PREFETCHSIZE (16 * 24)


+ 1
- 1
kernel/x86_64/zsymv_L_sse2.S View File

@@ -57,7 +57,7 @@
#define PREFETCHSIZE (16 * 24) #define PREFETCHSIZE (16 * 24)
#endif #endif


#ifdef NEHALEM
#if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 24) #define PREFETCHSIZE (16 * 24)


+ 1
- 1
kernel/x86_64/zsymv_U_sse.S View File

@@ -57,7 +57,7 @@
#define PREFETCHSIZE (16 * 24) #define PREFETCHSIZE (16 * 24)
#endif #endif


#ifdef NEHALEM
#if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 24) #define PREFETCHSIZE (16 * 24)


+ 1
- 1
kernel/x86_64/zsymv_U_sse2.S View File

@@ -57,7 +57,7 @@
#define PREFETCHSIZE (16 * 24) #define PREFETCHSIZE (16 * 24)
#endif #endif


#ifdef NEHALEM
#if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 24) #define PREFETCHSIZE (16 * 24)


+ 7
- 0
l1param.h View File

@@ -9,6 +9,13 @@
#define ALIGNED_ACCESS #define ALIGNED_ACCESS
#endif #endif


/* Settings used when SANDYBRIDGE is defined: PREFETCH and PREFETCHW both
 * expand to the prefetcht0 instruction, PREFETCHSIZE is (128 * 12), and
 * ALIGNED_ACCESS is defined.
 * NOTE(review): the zscal_sse.S and zscal_sse2.S hunks of this same change
 * guard their ALIGNED_ACCESS branch with !defined(SANDYBRIDGE), so defining
 * ALIGNED_ACCESS here does not select that branch in those two kernels.
 * Units of PREFETCHSIZE depend on how each kernel uses it - confirm there. */
#ifdef SANDYBRIDGE
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#define PREFETCHSIZE (128 * 12)
#define ALIGNED_ACCESS
#endif

#ifdef ATHLON #ifdef ATHLON
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw


+ 11
- 0
l2param.h View File

@@ -63,6 +63,17 @@
#define PREFETCHSIZE 64 * 3 #define PREFETCHSIZE 64 * 3
#endif #endif


/* Settings used when SANDYBRIDGE is defined. */
#ifdef SANDYBRIDGE
/* Every MOVUPS_* macro maps to the same movups instruction. */
#define MOVUPS_A movups
#define MOVUPS_XL movups
#define MOVUPS_XS movups
#define MOVUPS_YL movups
#define MOVUPS_YS movups
/* Both the read and the write prefetch macros expand to prefetcht0. */
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
/* NOTE(review): the expansion is not parenthesized, so a use such as
 * PREFETCHSIZE + n or x / PREFETCHSIZE would bind unexpectedly unless the
 * use site adds its own parentheses - verify the kernels that consume it. */
#define PREFETCHSIZE 64 * 3
#endif

#ifdef OPTERON #ifdef OPTERON
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw


+ 74
- 0
param.h View File

@@ -913,6 +913,80 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif




/* Default tuning parameters used when SANDYBRIDGE is defined.
 * NOTE(review): the commit message says the kernels were copied from
 * Nehalem; presumably these values mirror the NEHALEM block - confirm. */
#ifdef SANDYBRIDGE

#define SNUMOPT 8
#define DNUMOPT 4

#define GEMM_DEFAULT_OFFSET_A 32
#define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN 0x03fffUL

#define SYMV_P 8

#define SWITCH_RATIO 4

/* GEMM unroll factors. The two branches use identical UNROLL_M values and
 * differ only in UNROLL_N for the S, D, C and Z variants. */
#ifdef ARCH_X86
#define SGEMM_DEFAULT_UNROLL_M 4
#define DGEMM_DEFAULT_UNROLL_M 2
#define QGEMM_DEFAULT_UNROLL_M 2
#define CGEMM_DEFAULT_UNROLL_M 2
#define ZGEMM_DEFAULT_UNROLL_M 1
#define XGEMM_DEFAULT_UNROLL_M 1

#define SGEMM_DEFAULT_UNROLL_N 4
#define DGEMM_DEFAULT_UNROLL_N 4
#define QGEMM_DEFAULT_UNROLL_N 2
#define CGEMM_DEFAULT_UNROLL_N 2
#define ZGEMM_DEFAULT_UNROLL_N 2
#define XGEMM_DEFAULT_UNROLL_N 1
#else
/* Used when ARCH_X86 is not defined (presumably the x86_64 build - confirm):
 * UNROLL_N is 8/8/4/4 for S/D/C/Z instead of 4/4/2/2. */
#define SGEMM_DEFAULT_UNROLL_M 4
#define DGEMM_DEFAULT_UNROLL_M 2
#define QGEMM_DEFAULT_UNROLL_M 2
#define CGEMM_DEFAULT_UNROLL_M 2
#define ZGEMM_DEFAULT_UNROLL_M 1
#define XGEMM_DEFAULT_UNROLL_M 1

#define SGEMM_DEFAULT_UNROLL_N 8
#define DGEMM_DEFAULT_UNROLL_N 8
#define QGEMM_DEFAULT_UNROLL_N 2
#define CGEMM_DEFAULT_UNROLL_N 4
#define ZGEMM_DEFAULT_UNROLL_N 4
#define XGEMM_DEFAULT_UNROLL_N 1
#endif

/* P is a fixed constant: 504 for the S/D/Q variants and 252 for the C/Z/X
 * variants. R expands to an identifier (sgemm_r, dgemm_r, ...) that is not
 * defined in this block - presumably a value set elsewhere; verify. */
#define SGEMM_DEFAULT_P 504
#define SGEMM_DEFAULT_R sgemm_r

#define DGEMM_DEFAULT_P 504
#define DGEMM_DEFAULT_R dgemm_r

#define QGEMM_DEFAULT_P 504
#define QGEMM_DEFAULT_R qgemm_r

#define CGEMM_DEFAULT_P 252
#define CGEMM_DEFAULT_R cgemm_r

#define ZGEMM_DEFAULT_P 252
#define ZGEMM_DEFAULT_R zgemm_r

#define XGEMM_DEFAULT_P 252
#define XGEMM_DEFAULT_R xgemm_r

/* Q is 512/256/128 for S/D/Q and the same sequence for C/Z/X. */
#define SGEMM_DEFAULT_Q 512
#define DGEMM_DEFAULT_Q 256
#define QGEMM_DEFAULT_Q 128
#define CGEMM_DEFAULT_Q 512
#define ZGEMM_DEFAULT_Q 256
#define XGEMM_DEFAULT_Q 128

#define GETRF_FACTOR 0.72

#endif



#ifdef ATOM #ifdef ATOM


#define SNUMOPT 2 #define SNUMOPT 2


Loading…
Cancel
Save