Browse Source

Init code base for Intel Haswell.

tags/v0.2.9.rc1
Zhang Xianyi 12 years ago
parent
commit
2638370844
29 changed files with 220 additions and 27 deletions
  1. +2
    -2
      Makefile.system
  2. +5
    -0
      common_x86.h
  3. +6
    -0
      common_x86_64.h
  4. +2
    -3
      cpuid.h
  5. +4
    -0
      cpuid_x86.c
  6. +6
    -3
      driver/others/dynamic.c
  7. +15
    -0
      getarch.c
  8. +1
    -0
      kernel/x86/KERNEL.HASWELL
  9. +1
    -1
      kernel/x86/trsm_kernel_LN_2x4_penryn.S
  10. +1
    -1
      kernel/x86/trsm_kernel_LN_4x4_penryn.S
  11. +1
    -1
      kernel/x86/trsm_kernel_LT_2x4_penryn.S
  12. +1
    -1
      kernel/x86/trsm_kernel_LT_4x4_penryn.S
  13. +1
    -1
      kernel/x86/trsm_kernel_RT_2x4_penryn.S
  14. +1
    -1
      kernel/x86/trsm_kernel_RT_4x4_penryn.S
  15. +1
    -1
      kernel/x86/ztrsm_kernel_LN_2x2_penryn.S
  16. +1
    -1
      kernel/x86/ztrsm_kernel_LT_1x2_penryn.S
  17. +1
    -1
      kernel/x86/ztrsm_kernel_LT_2x2_penryn.S
  18. +1
    -1
      kernel/x86/ztrsm_kernel_RT_1x2_penryn.S
  19. +1
    -1
      kernel/x86/ztrsm_kernel_RT_2x2_penryn.S
  20. +84
    -0
      kernel/x86_64/KERNEL.HASWELL
  21. +1
    -1
      kernel/x86_64/symv_L_sse.S
  22. +1
    -1
      kernel/x86_64/symv_L_sse2.S
  23. +1
    -1
      kernel/x86_64/symv_U_sse.S
  24. +1
    -1
      kernel/x86_64/symv_U_sse2.S
  25. +1
    -1
      kernel/x86_64/zsymv_L_sse.S
  26. +1
    -1
      kernel/x86_64/zsymv_L_sse2.S
  27. +1
    -1
      kernel/x86_64/zsymv_U_sse.S
  28. +1
    -1
      kernel/x86_64/zsymv_U_sse2.S
  29. +76
    -0
      param.h

+ 2
- 2
Makefile.system View File

@@ -324,14 +324,14 @@ ifeq ($(ARCH), x86)
DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \
CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
ifneq ($(NO_AVX), 1)
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER HASWELL
endif
endif

ifeq ($(ARCH), x86_64)
DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
ifneq ($(NO_AVX), 1)
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER HASWELL
endif
endif



+ 5
- 0
common_x86.h View File

@@ -171,6 +171,11 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
#define MMXSTORE movd
#endif

#if defined(SANDYBRIDGE) || defined(HASWELL)
//Enable some optimizations for nehalem.
#define NEHALEM_OPTIMIZATION
#endif

#if defined(PILEDRIVER) || defined(BULLDOZER)
//Enable some optimizations for barcelona.
#define BARCELONA_OPTIMIZATION


+ 6
- 0
common_x86_64.h View File

@@ -218,6 +218,12 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){

#ifdef ASSEMBLER

#if defined(SANDYBRIDGE) || defined(HASWELL)
//Enable some optimizations for nehalem.
#define NEHALEM_OPTIMIZATION
#endif


#if defined(PILEDRIVER) || defined(BULLDOZER)
//Enable some optimizations for barcelona.
#define BARCELONA_OPTIMIZATION


+ 2
- 3
cpuid.h View File

@@ -107,7 +107,7 @@
#define CORE_BOBCAT 21
#define CORE_BULLDOZER 22
#define CORE_PILEDRIVER 23
#define CORE_HASWELL CORE_SANDYBRIDGE
#define CORE_HASWELL 24

#define HAVE_SSE (1 << 0)
#define HAVE_SSE2 (1 << 1)
@@ -200,7 +200,6 @@ typedef struct {
#define CPUTYPE_BOBCAT 45
#define CPUTYPE_BULLDOZER 46
#define CPUTYPE_PILEDRIVER 47
// this define is because BLAS doesn't have haswell specific optimizations yet
#define CPUTYPE_HASWELL CPUTYPE_SANDYBRIDGE
#define CPUTYPE_HASWELL 48

#endif

+ 4
- 0
cpuid_x86.c View File

@@ -1243,6 +1243,7 @@ static char *cpuname[] = {
"BOBCAT",
"BULLDOZER",
"PILEDRIVER",
"HASWELL",
};

static char *lowercpuname[] = {
@@ -1293,6 +1294,7 @@ static char *lowercpuname[] = {
"bobcat",
"bulldozer",
"piledriver",
"haswell",
};

static char *corename[] = {
@@ -1320,6 +1322,7 @@ static char *corename[] = {
"BOBCAT",
"BULLDOZER",
"PILEDRIVER",
"HASWELL",
};

static char *corename_lower[] = {
@@ -1347,6 +1350,7 @@ static char *corename_lower[] = {
"bobcat",
"bulldozer",
"piledriver",
"haswell",
};




+ 6
- 3
driver/others/dynamic.c View File

@@ -65,14 +65,15 @@ extern gotoblas_t gotoblas_BOBCAT;
extern gotoblas_t gotoblas_SANDYBRIDGE;
extern gotoblas_t gotoblas_BULLDOZER;
extern gotoblas_t gotoblas_PILEDRIVER;
extern gotoblas_t gotoblas_HASWELL;
#else
//Use NEHALEM kernels for sandy bridge and haswell
#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
#define gotoblas_HASWELL gotoblas_NEHALEM
#define gotoblas_BULLDOZER gotoblas_BARCELONA
#define gotoblas_PILEDRIVER gotoblas_BARCELONA
#endif
//Use sandy bridge kernels for haswell.
#define gotoblas_HASWELL gotoblas_SANDYBRIDGE


#define VENDOR_INTEL 1
#define VENDOR_AMD 2
@@ -285,6 +286,7 @@ static char *corename[] = {
"Bobcat",
"Bulldozer",
"Piledriver",
"Haswell",
};

char *gotoblas_corename(void) {
@@ -307,7 +309,8 @@ char *gotoblas_corename(void) {
if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16];
if (gotoblas == &gotoblas_BOBCAT) return corename[17];
if (gotoblas == &gotoblas_BULLDOZER) return corename[18];
if (gotoblas == &gotoblas_PILEDRIVER) return corename[19];
if (gotoblas == &gotoblas_PILEDRIVER) return corename[19];
if (gotoblas == &gotoblas_HASWELL) return corename[20];

return corename[0];
}


+ 15
- 0
getarch.c View File

@@ -298,6 +298,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "SANDYBRIDGE"
#endif

/* Forced-target settings, active only when FORCE_HASWELL is defined.
 * Defines FORCE and FORCE_INTEL, the architecture / sub-architecture name
 * strings, and ARCHCONFIG: a single string of -D flags built from the
 * adjacent literals below (cache size and line-size values, DTB values,
 * HAVE_* instruction-set feature macros through AVX, and FMA3). */
#ifdef FORCE_HASWELL
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE "X86"
#define SUBARCHITECTURE "HASWELL"
/* Every fragment except the last ends with a space so the concatenated flags stay separated. */
#define ARCHCONFIG "-DHASWELL " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \
"-DFMA3"
/* Lower-case library name and upper-case core name for this target. */
#define LIBNAME "haswell"
#define CORENAME "HASWELL"
#endif

#ifdef FORCE_ATOM
#define FORCE
#define FORCE_INTEL


+ 1
- 0
kernel/x86/KERNEL.HASWELL View File

@@ -0,0 +1 @@
# HASWELL has no kernel list of its own in this directory; it pulls in the PENRYN list unchanged.
include $(KERNELDIR)/KERNEL.PENRYN

+ 1
- 1
kernel/x86/trsm_kernel_LN_2x4_penryn.S View File

@@ -62,7 +62,7 @@
#define PREFETCHSIZE (8 * 21 + 4)
#endif

#if defined(NEHALEM) || defined(SANDYBRIDGE)
#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION)
#define PREFETCH prefetcht0
#define PREFETCHSIZE (8 * 21 + 4)
#endif


+ 1
- 1
kernel/x86/trsm_kernel_LN_4x4_penryn.S View File

@@ -62,7 +62,7 @@
#define PREFETCHSIZE (8 * 21 + 4)
#endif

#if defined(NEHALEM) || defined(SANDYBRIDGE)
#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION)
#define PREFETCH prefetcht0
#define PREFETCHSIZE (8 * 21 + 4)
#endif


+ 1
- 1
kernel/x86/trsm_kernel_LT_2x4_penryn.S View File

@@ -62,7 +62,7 @@
#define PREFETCHSIZE (8 * 21 + 4)
#endif

#if defined(NEHALEM) || defined(SANDYBRIDGE)
#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION)
#define PREFETCH prefetcht0
#define PREFETCHSIZE (8 * 21 + 4)
#endif


+ 1
- 1
kernel/x86/trsm_kernel_LT_4x4_penryn.S View File

@@ -62,7 +62,7 @@
#define PREFETCHSIZE (8 * 21 + 4)
#endif

#if defined(NEHALEM) || defined(SANDYBRIDGE)
#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION)
#define PREFETCH prefetcht0
#define PREFETCHSIZE (8 * 21 + 4)
#endif


+ 1
- 1
kernel/x86/trsm_kernel_RT_2x4_penryn.S View File

@@ -62,7 +62,7 @@
#define PREFETCHSIZE (8 * 21 + 4)
#endif

#if defined(NEHALEM) || defined(SANDYBRIDGE)
#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION)
#define PREFETCH prefetcht0
#define PREFETCHSIZE (8 * 21 + 4)
#endif


+ 1
- 1
kernel/x86/trsm_kernel_RT_4x4_penryn.S View File

@@ -62,7 +62,7 @@
#define PREFETCHSIZE (8 * 21 + 4)
#endif

#if defined(NEHALEM) || defined(SANDYBRIDGE)
#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION)
#define PREFETCH prefetcht0
#define PREFETCHSIZE (8 * 21 + 4)
#endif


+ 1
- 1
kernel/x86/ztrsm_kernel_LN_2x2_penryn.S View File

@@ -61,7 +61,7 @@
#define PREFETCHSIZE 84
#endif

#if defined(NEHALEM) || defined(SANDYBRIDGE)
#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION)
#define PREFETCH prefetcht1
#define PREFETCHSIZE 84
#endif


+ 1
- 1
kernel/x86/ztrsm_kernel_LT_1x2_penryn.S View File

@@ -63,7 +63,7 @@
#define PREFETCHSIZE 84
#endif

#if defined(NEHALEM) || defined(SANDYBRIDGE)
#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION)
#define PREFETCH prefetcht1
#define PREFETCHSIZE 84
#endif


+ 1
- 1
kernel/x86/ztrsm_kernel_LT_2x2_penryn.S View File

@@ -61,7 +61,7 @@
#define PREFETCHSIZE 84
#endif

#if defined(NEHALEM) || defined(SANDYBRIDGE)
#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION)
#define PREFETCH prefetcht1
#define PREFETCHSIZE 84
#endif


+ 1
- 1
kernel/x86/ztrsm_kernel_RT_1x2_penryn.S View File

@@ -63,7 +63,7 @@
#define PREFETCHSIZE 84
#endif

#if defined(NEHALEM) || defined(SANDYBRIDGE)
#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION)
#define PREFETCH prefetcht1
#define PREFETCHSIZE 84
#endif


+ 1
- 1
kernel/x86/ztrsm_kernel_RT_2x2_penryn.S View File

@@ -61,7 +61,7 @@
#define PREFETCHSIZE 84
#endif

#if defined(NEHALEM) || defined(SANDYBRIDGE)
#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION)
#define PREFETCH prefetcht1
#define PREFETCHSIZE 84
#endif


+ 84
- 0
kernel/x86_64/KERNEL.HASWELL View File

@@ -0,0 +1,84 @@
# Kernel selection for the HASWELL target in this directory.
# No Haswell-specific sources are named here: the GEMM kernels are the
# *_sandy.S files, the TRSM kernels are the generic C implementations, and
# the GEMM3M kernels are the *_nehalem.S files.
#
# For each precision (S, D, C, Z) the variables give the compute kernel, the
# four copy-routine sources (INCOPY/ITCOPY/ONCOPY/OTCOPY) and the matching
# object names. SGEMM and ZGEMM leave the IN/IT copy variables empty.
SGEMMKERNEL = sgemm_kernel_8x8_sandy.S
SGEMMINCOPY =
SGEMMITCOPY =
SGEMMONCOPY = ../generic/gemm_ncopy_8.c
SGEMMOTCOPY = ../generic/gemm_tcopy_8.c
SGEMMINCOPYOBJ =
SGEMMITCOPYOBJ =
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = dgemm_kernel_4x8_sandy.S
DGEMMINCOPY = ../generic/gemm_ncopy_8.c
DGEMMITCOPY = ../generic/gemm_tcopy_8.c
# The assembly ON/OT copy routines are left disabled; the generic C versions are used.
#DGEMMONCOPY = gemm_ncopy_4.S
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
#DGEMMOTCOPY = gemm_tcopy_4.S
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
#CGEMMKERNEL = zgemm_kernel_2x4_nehalem.S
CGEMMKERNEL = cgemm_kernel_4x8_sandy.S
CGEMMINCOPY = ../generic/zgemm_ncopy_8_sandy.c
CGEMMITCOPY = ../generic/zgemm_tcopy_8_sandy.c
CGEMMONCOPY = ../generic/zgemm_ncopy_4_sandy.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_4_sandy.c
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
#ZGEMMKERNEL = zgemm_kernel_1x4_nehalem.S
ZGEMMKERNEL = zgemm_kernel_4x4_sandy.S
ZGEMMINCOPY =
ZGEMMITCOPY =
ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
ZGEMMINCOPYOBJ =
ZGEMMITCOPYOBJ =
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)

# Nehalem assembly TRSM kernels, kept for reference but disabled; the active
# TRSM assignments further down point at the generic C kernels instead.
#STRSMKERNEL_LN = trsm_kernel_LN_4x8_nehalem.S
#STRSMKERNEL_LT = trsm_kernel_LT_4x8_nehalem.S
#STRSMKERNEL_RN = trsm_kernel_LT_4x8_nehalem.S
#STRSMKERNEL_RT = trsm_kernel_RT_4x8_nehalem.S

#DTRSMKERNEL_LN = trsm_kernel_LN_2x8_nehalem.S
#DTRSMKERNEL_LT = trsm_kernel_LT_2x8_nehalem.S
#DTRSMKERNEL_RN = trsm_kernel_LT_2x8_nehalem.S
#DTRSMKERNEL_RT = trsm_kernel_RT_2x8_nehalem.S

#CTRSMKERNEL_LN = ztrsm_kernel_LN_2x4_nehalem.S
#CTRSMKERNEL_LT = ztrsm_kernel_LT_2x4_nehalem.S
#CTRSMKERNEL_RN = ztrsm_kernel_LT_2x4_nehalem.S
#CTRSMKERNEL_RT = ztrsm_kernel_RT_2x4_nehalem.S

#ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x4_nehalem.S
#ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x4_nehalem.S
#ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x4_nehalem.S
#ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x4_nehalem.S
# Active TRSM kernels: the same four generic C sources are used for all precisions.
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c



# GEMM3M kernels are the Nehalem assembly files.
CGEMM3MKERNEL = zgemm3m_kernel_4x8_nehalem.S
ZGEMM3MKERNEL = zgemm3m_kernel_2x8_nehalem.S

+ 1
- 1
kernel/x86_64/symv_L_sse.S View File

@@ -57,7 +57,7 @@
#define PREFETCHSIZE (16 * 12)
#endif

#if defined(NEHALEM) || defined(SANDYBRIDGE)
#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION)
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 12)


+ 1
- 1
kernel/x86_64/symv_L_sse2.S View File

@@ -57,7 +57,7 @@
#define PREFETCHSIZE (16 * 12)
#endif

#if defined(NEHALEM) || defined(SANDYBRIDGE)
#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION)
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 12)


+ 1
- 1
kernel/x86_64/symv_U_sse.S View File

@@ -57,7 +57,7 @@
#define PREFETCHSIZE (16 * 12)
#endif

#if defined(NEHALEM) || defined(SANDYBRIDGE)
#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION)
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 12)


+ 1
- 1
kernel/x86_64/symv_U_sse2.S View File

@@ -57,7 +57,7 @@
#define PREFETCHSIZE (16 * 12)
#endif

#if defined(NEHALEM) || defined(SANDYBRIDGE)
#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION)
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 24)


+ 1
- 1
kernel/x86_64/zsymv_L_sse.S View File

@@ -57,7 +57,7 @@
#define PREFETCHSIZE (16 * 24)
#endif

#if defined(NEHALEM) || defined(SANDYBRIDGE)
#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION)
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 24)


+ 1
- 1
kernel/x86_64/zsymv_L_sse2.S View File

@@ -57,7 +57,7 @@
#define PREFETCHSIZE (16 * 24)
#endif

#if defined(NEHALEM) || defined(SANDYBRIDGE)
#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION)
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 24)


+ 1
- 1
kernel/x86_64/zsymv_U_sse.S View File

@@ -57,7 +57,7 @@
#define PREFETCHSIZE (16 * 24)
#endif

#if defined(NEHALEM) || defined(SANDYBRIDGE)
#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION)
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 24)


+ 1
- 1
kernel/x86_64/zsymv_U_sse2.S View File

@@ -57,7 +57,7 @@
#define PREFETCHSIZE (16 * 24)
#endif

#if defined(NEHALEM) || defined(SANDYBRIDGE)
#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION)
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 24)


+ 76
- 0
param.h View File

@@ -1150,6 +1150,82 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#endif

// Default tuning parameters used when HASWELL is defined.
// The macros fall into groups: SNUMOPT/DNUMOPT, GEMM offset/alignment
// defaults, SYMV_P, SWITCH_RATIO, per-precision GEMM unroll factors (chosen
// by ARCH_X86), per-precision GEMM P/R/Q defaults, and GETRF_FACTOR.
#ifdef HASWELL

#define SNUMOPT 8
#define DNUMOPT 4

#define GEMM_DEFAULT_OFFSET_A 0
#define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN 0x03fffUL

#define SYMV_P 8

#define SWITCH_RATIO 4

// Unroll factors used when ARCH_X86 is defined.
// NOTE(review): presumably the 32-bit build, whose kernel list is the PENRYN one - confirm.
#ifdef ARCH_X86
#define SGEMM_DEFAULT_UNROLL_M 4
#define DGEMM_DEFAULT_UNROLL_M 2
#define QGEMM_DEFAULT_UNROLL_M 2
#define CGEMM_DEFAULT_UNROLL_M 2
#define ZGEMM_DEFAULT_UNROLL_M 1
#define XGEMM_DEFAULT_UNROLL_M 1

#define SGEMM_DEFAULT_UNROLL_N 4
#define DGEMM_DEFAULT_UNROLL_N 4
#define QGEMM_DEFAULT_UNROLL_N 2
#define CGEMM_DEFAULT_UNROLL_N 2
#define ZGEMM_DEFAULT_UNROLL_N 2
#define XGEMM_DEFAULT_UNROLL_N 1
#else
// Unroll factors used when ARCH_X86 is not defined.
// NOTE(review): these look matched to the MxN tile sizes in the x86_64 kernel file names - confirm.
#define SGEMM_DEFAULT_UNROLL_M 8
#define DGEMM_DEFAULT_UNROLL_M 8
#define QGEMM_DEFAULT_UNROLL_M 2
#define CGEMM_DEFAULT_UNROLL_M 8
#define ZGEMM_DEFAULT_UNROLL_M 4
#define XGEMM_DEFAULT_UNROLL_M 1

#define SGEMM_DEFAULT_UNROLL_N 8
#define DGEMM_DEFAULT_UNROLL_N 4
#define QGEMM_DEFAULT_UNROLL_N 2
#define CGEMM_DEFAULT_UNROLL_N 4
#define ZGEMM_DEFAULT_UNROLL_N 4
#define XGEMM_DEFAULT_UNROLL_N 1
#endif

// P and R defaults per precision. Every R default expands to an identifier
// (sgemm_r, dgemm_r, ...) defined outside this block, except CGEMM, which is
// pinned to the constant 1024. The commented-out lines are the alternatives.
#define SGEMM_DEFAULT_P 512
#define SGEMM_DEFAULT_R sgemm_r
//#define SGEMM_DEFAULT_R 1024

#define DGEMM_DEFAULT_P 512
#define DGEMM_DEFAULT_R dgemm_r
//#define DGEMM_DEFAULT_R 1024

#define QGEMM_DEFAULT_P 504
#define QGEMM_DEFAULT_R qgemm_r

#define CGEMM_DEFAULT_P 128
//#define CGEMM_DEFAULT_R cgemm_r
#define CGEMM_DEFAULT_R 1024

#define ZGEMM_DEFAULT_P 512
#define ZGEMM_DEFAULT_R zgemm_r
//#define ZGEMM_DEFAULT_R 1024

#define XGEMM_DEFAULT_P 252
#define XGEMM_DEFAULT_R xgemm_r

// Q defaults per precision.
#define SGEMM_DEFAULT_Q 256
#define DGEMM_DEFAULT_Q 256
#define QGEMM_DEFAULT_Q 128
#define CGEMM_DEFAULT_Q 256
#define ZGEMM_DEFAULT_Q 192
#define XGEMM_DEFAULT_Q 128

#define GETRF_FACTOR 0.72

#endif



#ifdef ATOM


Loading…
Cancel
Save