Browse Source

Refs #113. Support AMD Bobcate using Barcelona kernel codes. Replace 3DNow! with MMX.

tags/v0.2.0^2
Zhang Xianyi 13 years ago
parent
commit
d6cab3f37e
29 changed files with 303 additions and 70 deletions
  1. +2
    -2
      Makefile.system
  2. +1
    -0
      TargetList.txt
  3. +2
    -0
      cpuid.h
  4. +9
    -1
      cpuid_x86.c
  5. +2
    -2
      driver/others/parameter.c
  6. +17
    -1
      getarch.c
  7. +16
    -0
      kernel/setparam-ref.c
  8. +59
    -0
      kernel/x86/KERNEL.BOBCATE
  9. +5
    -5
      kernel/x86/trsm_kernel_LN_2x4_sse2.S
  10. +11
    -11
      kernel/x86/trsm_kernel_LN_4x4_sse.S
  11. +5
    -5
      kernel/x86/trsm_kernel_LT_2x4_sse2.S
  12. +11
    -11
      kernel/x86/trsm_kernel_LT_4x4_sse.S
  13. +5
    -5
      kernel/x86/trsm_kernel_RT_2x4_sse2.S
  14. +11
    -11
      kernel/x86/trsm_kernel_RT_4x4_sse.S
  15. +2
    -2
      kernel/x86/ztrsm_kernel_LN_2x2_sse.S
  16. +2
    -2
      kernel/x86/ztrsm_kernel_LT_2x2_sse.S
  17. +2
    -2
      kernel/x86/ztrsm_kernel_RT_2x2_sse.S
  18. +62
    -0
      kernel/x86_64/KERNEL.BOBCATE
  19. +1
    -1
      kernel/x86_64/symv_L_sse.S
  20. +1
    -1
      kernel/x86_64/symv_L_sse2.S
  21. +1
    -1
      kernel/x86_64/symv_U_sse.S
  22. +1
    -1
      kernel/x86_64/symv_U_sse2.S
  23. +1
    -1
      kernel/x86_64/zsymv_L_sse.S
  24. +1
    -1
      kernel/x86_64/zsymv_L_sse2.S
  25. +1
    -1
      kernel/x86_64/zsymv_U_sse.S
  26. +1
    -1
      kernel/x86_64/zsymv_U_sse2.S
  27. +7
    -0
      l1param.h
  28. +1
    -1
      l2param.h
  29. +63
    -1
      param.h

+ 2
- 2
Makefile.system View File

@@ -247,11 +247,11 @@ endif
ifdef DYNAMIC_ARCH ifdef DYNAMIC_ARCH
ifeq ($(ARCH), x86) ifeq ($(ARCH), x86)
DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \
CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE ATHLON OPTERON OPTERON_SSE3 BARCELONA ATOM NANO
CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCATE ATOM NANO
endif endif


ifeq ($(ARCH), x86_64) ifeq ($(ARCH), x86_64)
DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE OPTERON OPTERON_SSE3 BARCELONA ATOM NANO
DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE OPTERON OPTERON_SSE3 BARCELONA BOBCATE ATOM NANO
endif endif


ifndef DYNAMIC_CORE ifndef DYNAMIC_CORE


+ 1
- 0
TargetList.txt View File

@@ -28,6 +28,7 @@ OPTERON_SSE3
BARCELONA BARCELONA
SHANGHAI SHANGHAI
ISTANBUL ISTANBUL
BOBCATE


c)VIA CPU: c)VIA CPU:
SSE_GENERIC SSE_GENERIC


+ 2
- 0
cpuid.h View File

@@ -104,6 +104,7 @@
#define CORE_ATOM 18 #define CORE_ATOM 18
#define CORE_NANO 19 #define CORE_NANO 19
#define CORE_SANDYBRIDGE 20 #define CORE_SANDYBRIDGE 20
#define CORE_BOBCATE 21


#define HAVE_SSE (1 << 0) #define HAVE_SSE (1 << 0)
#define HAVE_SSE2 (1 << 1) #define HAVE_SSE2 (1 << 1)
@@ -191,4 +192,5 @@ typedef struct {
#define CPUTYPE_VIAC3 42 #define CPUTYPE_VIAC3 42
#define CPUTYPE_NANO 43 #define CPUTYPE_NANO 43
#define CPUTYPE_SANDYBRIDGE 44 #define CPUTYPE_SANDYBRIDGE 44
#define CPUTYPE_BOBCATE 45
#endif #endif

+ 9
- 1
cpuid_x86.c View File

@@ -1028,6 +1028,8 @@ int get_cpuname(void){
case 1: case 1:
case 10: case 10:
return CPUTYPE_BARCELONA; return CPUTYPE_BARCELONA;
case 5:
return CPUTYPE_BOBCATE;
} }
break; break;
} }
@@ -1148,6 +1150,7 @@ static char *cpuname[] = {
"VIAC3", "VIAC3",
"NANO", "NANO",
"SANDYBRIDGE", "SANDYBRIDGE",
"BOBCATE",
}; };


static char *lowercpuname[] = { static char *lowercpuname[] = {
@@ -1195,6 +1198,7 @@ static char *lowercpuname[] = {
"nsgeode", "nsgeode",
"nano", "nano",
"sandybridge", "sandybridge",
"bobcate",
}; };


static char *corename[] = { static char *corename[] = {
@@ -1219,6 +1223,7 @@ static char *corename[] = {
"ATOM", "ATOM",
"NANO", "NANO",
"SANDYBRIDGE", "SANDYBRIDGE",
"BOBCATE",
}; };


static char *corename_lower[] = { static char *corename_lower[] = {
@@ -1243,6 +1248,7 @@ static char *corename_lower[] = {
"atom", "atom",
"nano", "nano",
"sandybridge", "sandybridge",
"bobcate",
}; };




@@ -1351,7 +1357,9 @@ int get_coretype(void){
if (family <= 0x5) return CORE_80486; if (family <= 0x5) return CORE_80486;
if (family <= 0xe) return CORE_ATHLON; if (family <= 0xe) return CORE_ATHLON;
if (family == 0xf){ if (family == 0xf){
if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON; else return CORE_BARCELONA;
if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON;
else if (exfamily == 5) return CORE_BOBCATE;
else return CORE_BARCELONA;
} }
} }




+ 2
- 2
driver/others/parameter.c View File

@@ -163,7 +163,7 @@ int get_L2_size(void){


int eax, ebx, ecx, edx; int eax, ebx, ecx, edx;


#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || \
#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) || \
defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \
defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC)


@@ -446,7 +446,7 @@ void blas_set_parameter(void){
#endif #endif
#endif #endif


#if defined(CORE_BARCELONA)
#if defined(CORE_BARCELONA) || defined(CORE_BOBCATE)
size >>= 8; size >>= 8;


sgemm_p = 232 * size; sgemm_p = 232 * size;


+ 17
- 1
getarch.c View File

@@ -1,5 +1,5 @@
/***************************************************************************** /*****************************************************************************
Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS
Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
All rights reserved. All rights reserved.


Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@@ -102,6 +102,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/* #define FORCE_BARCELONA */ /* #define FORCE_BARCELONA */
/* #define FORCE_SHANGHAI */ /* #define FORCE_SHANGHAI */
/* #define FORCE_ISTANBUL */ /* #define FORCE_ISTANBUL */
/* #define FORCE_BOBCATE */
/* #define FORCE_SSE_GENERIC */ /* #define FORCE_SSE_GENERIC */
/* #define FORCE_VIAC3 */ /* #define FORCE_VIAC3 */
/* #define FORCE_NANO */ /* #define FORCE_NANO */
@@ -363,6 +364,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "BARCELONA" #define CORENAME "BARCELONA"
#endif #endif


#if defined(FORCE_BOBCATE)
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE "X86"
#define SUBARCHITECTURE "BOBCATE"
#define ARCHCONFIG "-DBOBCATE " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=524288 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=40 -DDTB_SIZE=4096 " \
"-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 " \
"-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_CFLUSH -DHAVE_CMOV"
#define LIBNAME "bobcate"
#define CORENAME "BOBCATE"
#endif

#ifdef FORCE_SSE_GENERIC #ifdef FORCE_SSE_GENERIC
#define FORCE #define FORCE
#define FORCE_INTEL #define FORCE_INTEL


+ 16
- 0
kernel/setparam-ref.c View File

@@ -794,6 +794,22 @@ static void init_parameter(void) {
#endif #endif
#endif #endif


#ifdef BOBCATE

#ifdef DEBUG
fprintf(stderr, "Bobcate\n");
#endif

TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
#ifdef EXPRECISION
TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
#endif
#endif

#ifdef NANO #ifdef NANO


#ifdef DEBUG #ifdef DEBUG


+ 59
- 0
kernel/x86/KERNEL.BOBCATE View File

@@ -0,0 +1,59 @@
SGEMMKERNEL = gemm_kernel_4x4_barcelona.S
SGEMMINCOPY =
SGEMMITCOPY =
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
SGEMMINCOPYOBJ =
SGEMMITCOPYOBJ =
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = gemm_kernel_2x4_barcelona.S
DGEMMINCOPY = ../generic/gemm_ncopy_2.c
DGEMMITCOPY = ../generic/gemm_tcopy_2.c
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMMKERNEL = zgemm_kernel_2x2_barcelona.S
CGEMMINCOPY =
CGEMMITCOPY =
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMINCOPYOBJ =
CGEMMITCOPYOBJ =
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZGEMMKERNEL = zgemm_kernel_1x2_barcelona.S
ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c
ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)

STRSMKERNEL_LN = trsm_kernel_LN_4x4_sse.S
STRSMKERNEL_LT = trsm_kernel_LT_4x4_sse.S
STRSMKERNEL_RN = trsm_kernel_LT_4x4_sse.S
STRSMKERNEL_RT = trsm_kernel_RT_4x4_sse.S

DTRSMKERNEL_LN = trsm_kernel_LN_2x4_sse2.S
DTRSMKERNEL_LT = trsm_kernel_LT_2x4_sse2.S
DTRSMKERNEL_RN = trsm_kernel_LT_2x4_sse2.S
DTRSMKERNEL_RT = trsm_kernel_RT_2x4_sse2.S

CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse.S
CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse.S
CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse.S
CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse.S

ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_sse2.S
ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_sse2.S
ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_sse2.S
ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_sse2.S

CGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S
ZGEMM3MKERNEL = zgemm3m_kernel_2x4_barcelona.S

+ 5
- 5
kernel/x86/trsm_kernel_LN_2x4_sse2.S View File

@@ -69,7 +69,7 @@
#define STACK_ALIGN 4096 #define STACK_ALIGN 4096
#define STACK_OFFSET 1024 #define STACK_OFFSET 1024


#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHSIZE (8 * 10 + 4) #define PREFETCHSIZE (8 * 10 + 4)
#endif #endif
@@ -439,7 +439,7 @@
.L22: .L22:
mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm2
addsd %xmm2, %xmm4 addsd %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movlpd 2 * SIZE(BB), %xmm2 movlpd 2 * SIZE(BB), %xmm2
@@ -488,7 +488,7 @@
movlpd 40 * SIZE(BB), %xmm3 movlpd 40 * SIZE(BB), %xmm3
addsd %xmm0, %xmm7 addsd %xmm0, %xmm7
movlpd 8 * SIZE(AA), %xmm0 movlpd 8 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
#endif #endif
mulsd %xmm1, %xmm2 mulsd %xmm1, %xmm2
@@ -1697,7 +1697,7 @@


.L42: .L42:
mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulpd 2 * SIZE(BB), %xmm0 mulpd 2 * SIZE(BB), %xmm0
@@ -1727,7 +1727,7 @@
addpd %xmm0, %xmm7 addpd %xmm0, %xmm7
movapd 16 * SIZE(AA), %xmm0 movapd 16 * SIZE(AA), %xmm0


#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
#endif #endif
mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm2


+ 11
- 11
kernel/x86/trsm_kernel_LN_4x4_sse.S View File

@@ -64,7 +64,7 @@
#define BORIG 60(%esp) #define BORIG 60(%esp)
#define BUFFER 128(%esp) #define BUFFER 128(%esp)


#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 10 + 8) #define PREFETCHSIZE (16 * 10 + 8)
@@ -437,7 +437,7 @@
.L32: .L32:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
addss %xmm2, %xmm4 addss %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movss 4 * SIZE(BB), %xmm2 movss 4 * SIZE(BB), %xmm2
@@ -833,7 +833,7 @@
.L22: .L22:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
addps %xmm2, %xmm4 addps %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movaps 4 * SIZE(BB), %xmm2 movaps 4 * SIZE(BB), %xmm2
@@ -1848,7 +1848,7 @@


.L72: .L72:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulss 4 * SIZE(BB), %xmm0 mulss 4 * SIZE(BB), %xmm0
@@ -2109,7 +2109,7 @@
ALIGN_4 ALIGN_4


.L62: .L62:
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif


@@ -2429,7 +2429,7 @@


.L52: .L52:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulps 4 * SIZE(BB), %xmm0 mulps 4 * SIZE(BB), %xmm0
@@ -2459,7 +2459,7 @@
addps %xmm0, %xmm5 addps %xmm0, %xmm5
movaps 32 * SIZE(AA), %xmm0 movaps 32 * SIZE(AA), %xmm0


#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
mulps %xmm1, %xmm2 mulps %xmm1, %xmm2
@@ -2952,7 +2952,7 @@


.L112: .L112:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movss 1 * SIZE(AA), %xmm0 movss 1 * SIZE(AA), %xmm0
@@ -3148,7 +3148,7 @@


.L102: .L102:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movsd 2 * SIZE(AA), %xmm0 movsd 2 * SIZE(AA), %xmm0
@@ -3389,7 +3389,7 @@


.L92: .L92:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movaps 4 * SIZE(AA), %xmm0 movaps 4 * SIZE(AA), %xmm0
@@ -3404,7 +3404,7 @@
mulps 12 * SIZE(BB), %xmm0 mulps 12 * SIZE(BB), %xmm0
addps %xmm0, %xmm7 addps %xmm0, %xmm7
movaps 32 * SIZE(AA), %xmm0 movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
mulps %xmm1, %xmm3 mulps %xmm1, %xmm3


+ 5
- 5
kernel/x86/trsm_kernel_LT_2x4_sse2.S View File

@@ -69,7 +69,7 @@
#define STACK_ALIGN 4096 #define STACK_ALIGN 4096
#define STACK_OFFSET 1024 #define STACK_OFFSET 1024


#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHSIZE (8 * 10 + 4) #define PREFETCHSIZE (8 * 10 + 4)
#endif #endif
@@ -910,7 +910,7 @@
.L22: .L22:
mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm2
addsd %xmm2, %xmm4 addsd %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movlpd 2 * SIZE(BB), %xmm2 movlpd 2 * SIZE(BB), %xmm2
@@ -959,7 +959,7 @@
movlpd 40 * SIZE(BB), %xmm3 movlpd 40 * SIZE(BB), %xmm3
addsd %xmm0, %xmm7 addsd %xmm0, %xmm7
movlpd 8 * SIZE(AA), %xmm0 movlpd 8 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
#endif #endif
mulsd %xmm1, %xmm2 mulsd %xmm1, %xmm2
@@ -1439,7 +1439,7 @@


.L42: .L42:
mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulpd 2 * SIZE(BB), %xmm0 mulpd 2 * SIZE(BB), %xmm0
@@ -1469,7 +1469,7 @@
addpd %xmm0, %xmm7 addpd %xmm0, %xmm7
movapd 16 * SIZE(AA), %xmm0 movapd 16 * SIZE(AA), %xmm0


#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
#endif #endif
mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm2


+ 11
- 11
kernel/x86/trsm_kernel_LT_4x4_sse.S View File

@@ -64,7 +64,7 @@
#define BORIG 60(%esp) #define BORIG 60(%esp)
#define BUFFER 128(%esp) #define BUFFER 128(%esp)


#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 10 + 8) #define PREFETCHSIZE (16 * 10 + 8)
@@ -872,7 +872,7 @@
.L22: .L22:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
addps %xmm2, %xmm4 addps %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movaps 4 * SIZE(BB), %xmm2 movaps 4 * SIZE(BB), %xmm2
@@ -1316,7 +1316,7 @@
.L32: .L32:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
addss %xmm2, %xmm4 addss %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movss 4 * SIZE(BB), %xmm2 movss 4 * SIZE(BB), %xmm2
@@ -1855,7 +1855,7 @@


.L52: .L52:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulps 4 * SIZE(BB), %xmm0 mulps 4 * SIZE(BB), %xmm0
@@ -1885,7 +1885,7 @@
addps %xmm0, %xmm5 addps %xmm0, %xmm5
movaps 32 * SIZE(AA), %xmm0 movaps 32 * SIZE(AA), %xmm0


#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
mulps %xmm1, %xmm2 mulps %xmm1, %xmm2
@@ -2249,7 +2249,7 @@
ALIGN_4 ALIGN_4


.L62: .L62:
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif


@@ -2562,7 +2562,7 @@


.L72: .L72:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulss 4 * SIZE(BB), %xmm0 mulss 4 * SIZE(BB), %xmm0
@@ -2957,7 +2957,7 @@


.L92: .L92:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movaps 4 * SIZE(AA), %xmm0 movaps 4 * SIZE(AA), %xmm0
@@ -2972,7 +2972,7 @@
mulps 12 * SIZE(BB), %xmm0 mulps 12 * SIZE(BB), %xmm0
addps %xmm0, %xmm7 addps %xmm0, %xmm7
movaps 32 * SIZE(AA), %xmm0 movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
mulps %xmm1, %xmm3 mulps %xmm1, %xmm3
@@ -3280,7 +3280,7 @@


.L102: .L102:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movsd 2 * SIZE(AA), %xmm0 movsd 2 * SIZE(AA), %xmm0
@@ -3515,7 +3515,7 @@


.L112: .L112:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movss 1 * SIZE(AA), %xmm0 movss 1 * SIZE(AA), %xmm0


+ 5
- 5
kernel/x86/trsm_kernel_RT_2x4_sse2.S View File

@@ -69,7 +69,7 @@
#define STACK_ALIGN 4096 #define STACK_ALIGN 4096
#define STACK_OFFSET 1024 #define STACK_OFFSET 1024


#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHSIZE (8 * 10 + 4) #define PREFETCHSIZE (8 * 10 + 4)
#endif #endif
@@ -1036,7 +1036,7 @@


.L42: .L42:
mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulpd 2 * SIZE(BB), %xmm0 mulpd 2 * SIZE(BB), %xmm0
@@ -1066,7 +1066,7 @@
addpd %xmm0, %xmm7 addpd %xmm0, %xmm7
movapd 16 * SIZE(AA), %xmm0 movapd 16 * SIZE(AA), %xmm0


#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
#endif #endif
mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm2
@@ -2224,7 +2224,7 @@
.L22: .L22:
mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm2
addsd %xmm2, %xmm4 addsd %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movlpd 2 * SIZE(BB), %xmm2 movlpd 2 * SIZE(BB), %xmm2
@@ -2273,7 +2273,7 @@
movlpd 40 * SIZE(BB), %xmm3 movlpd 40 * SIZE(BB), %xmm3
addsd %xmm0, %xmm7 addsd %xmm0, %xmm7
movlpd 8 * SIZE(AA), %xmm0 movlpd 8 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
#endif #endif
mulsd %xmm1, %xmm2 mulsd %xmm1, %xmm2


+ 11
- 11
kernel/x86/trsm_kernel_RT_4x4_sse.S View File

@@ -64,7 +64,7 @@
#define BORIG 60(%esp) #define BORIG 60(%esp)
#define BUFFER 128(%esp) #define BUFFER 128(%esp)


#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 10 + 8) #define PREFETCHSIZE (16 * 10 + 8)
@@ -439,7 +439,7 @@


.L92: .L92:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movaps 4 * SIZE(AA), %xmm0 movaps 4 * SIZE(AA), %xmm0
@@ -454,7 +454,7 @@
mulps 12 * SIZE(BB), %xmm0 mulps 12 * SIZE(BB), %xmm0
addps %xmm0, %xmm7 addps %xmm0, %xmm7
movaps 32 * SIZE(AA), %xmm0 movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
mulps %xmm1, %xmm3 mulps %xmm1, %xmm3
@@ -758,7 +758,7 @@


.L102: .L102:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movsd 2 * SIZE(AA), %xmm0 movsd 2 * SIZE(AA), %xmm0
@@ -993,7 +993,7 @@


.L112: .L112:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movss 1 * SIZE(AA), %xmm0 movss 1 * SIZE(AA), %xmm0
@@ -1324,7 +1324,7 @@


.L52: .L52:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulps 4 * SIZE(BB), %xmm0 mulps 4 * SIZE(BB), %xmm0
@@ -1354,7 +1354,7 @@
addps %xmm0, %xmm5 addps %xmm0, %xmm5
movaps 32 * SIZE(AA), %xmm0 movaps 32 * SIZE(AA), %xmm0


#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
mulps %xmm1, %xmm2 mulps %xmm1, %xmm2
@@ -1718,7 +1718,7 @@
ALIGN_4 ALIGN_4


.L62: .L62:
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif


@@ -2031,7 +2031,7 @@


.L72: .L72:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulss 4 * SIZE(BB), %xmm0 mulss 4 * SIZE(BB), %xmm0
@@ -2859,7 +2859,7 @@
.L22: .L22:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
addps %xmm2, %xmm4 addps %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movaps 4 * SIZE(BB), %xmm2 movaps 4 * SIZE(BB), %xmm2
@@ -3303,7 +3303,7 @@
.L32: .L32:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
addss %xmm2, %xmm4 addss %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movss 4 * SIZE(BB), %xmm2 movss 4 * SIZE(BB), %xmm2


+ 2
- 2
kernel/x86/ztrsm_kernel_LN_2x2_sse.S View File

@@ -75,7 +75,7 @@
#define STACK_ALIGN 4096 #define STACK_ALIGN 4096
#define STACK_OFFSET 1024 #define STACK_OFFSET 1024


#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
#define PREFETCHSIZE (16 * 10 + 8) #define PREFETCHSIZE (16 * 10 + 8)
#define WPREFETCHSIZE 112 #define WPREFETCHSIZE 112
#define PREFETCH prefetch #define PREFETCH prefetch
@@ -533,7 +533,7 @@
addps %xmm0, %xmm7 addps %xmm0, %xmm7
movsd 16 * SIZE(AA), %xmm0 movsd 16 * SIZE(AA), %xmm0
mulps %xmm1, %xmm2 mulps %xmm1, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
addps %xmm2, %xmm4 addps %xmm2, %xmm4


+ 2
- 2
kernel/x86/ztrsm_kernel_LT_2x2_sse.S View File

@@ -75,7 +75,7 @@
#define STACK_ALIGN 4096 #define STACK_ALIGN 4096
#define STACK_OFFSET 1024 #define STACK_OFFSET 1024


#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
#define PREFETCHSIZE (16 * 10 + 8) #define PREFETCHSIZE (16 * 10 + 8)
#define WPREFETCHSIZE 112 #define WPREFETCHSIZE 112
#define PREFETCH prefetch #define PREFETCH prefetch
@@ -994,7 +994,7 @@
addps %xmm0, %xmm7 addps %xmm0, %xmm7
movsd 16 * SIZE(AA), %xmm0 movsd 16 * SIZE(AA), %xmm0
mulps %xmm1, %xmm2 mulps %xmm1, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
addps %xmm2, %xmm4 addps %xmm2, %xmm4


+ 2
- 2
kernel/x86/ztrsm_kernel_RT_2x2_sse.S View File

@@ -75,7 +75,7 @@
#define STACK_ALIGN 4096 #define STACK_ALIGN 4096
#define STACK_OFFSET 1024 #define STACK_OFFSET 1024


#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
#define PREFETCHSIZE (16 * 10 + 8) #define PREFETCHSIZE (16 * 10 + 8)
#define WPREFETCHSIZE 112 #define WPREFETCHSIZE 112
#define PREFETCH prefetch #define PREFETCH prefetch
@@ -1820,7 +1820,7 @@
addps %xmm0, %xmm7 addps %xmm0, %xmm7
movsd 16 * SIZE(AA), %xmm0 movsd 16 * SIZE(AA), %xmm0
mulps %xmm1, %xmm2 mulps %xmm1, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
addps %xmm2, %xmm4 addps %xmm2, %xmm4


+ 62
- 0
kernel/x86_64/KERNEL.BOBCATE View File

@@ -0,0 +1,62 @@
ZGEMVNKERNEL = zgemv_n_dup.S
ZGEMVTKERNEL = zgemv_t_dup.S

SGEMMKERNEL = gemm_kernel_8x4_barcelona.S
SGEMMINCOPY = ../generic/gemm_ncopy_8.c
SGEMMITCOPY = ../generic/gemm_tcopy_8.c
SGEMMONCOPY = gemm_ncopy_4_opteron.S
SGEMMOTCOPY = gemm_tcopy_4_opteron.S
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = gemm_kernel_4x4_barcelona.S
DGEMMINCOPY =
DGEMMITCOPY =
DGEMMONCOPY = gemm_ncopy_4_opteron.S
DGEMMOTCOPY = gemm_tcopy_4_opteron.S
DGEMMINCOPYOBJ =
DGEMMITCOPYOBJ =
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMMKERNEL = zgemm_kernel_4x2_barcelona.S
CGEMMINCOPY = ../generic/zgemm_ncopy_4.c
CGEMMITCOPY = ../generic/zgemm_tcopy_4.c
CGEMMONCOPY = zgemm_ncopy_2.S
CGEMMOTCOPY = zgemm_tcopy_2.S
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZGEMMKERNEL = zgemm_kernel_2x2_barcelona.S
ZGEMMINCOPY =
ZGEMMITCOPY =
ZGEMMONCOPY = zgemm_ncopy_2.S
ZGEMMOTCOPY = zgemm_tcopy_2.S
ZGEMMINCOPYOBJ =
ZGEMMITCOPYOBJ =
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)

STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S
STRSMKERNEL_LT = trsm_kernel_LT_8x4_sse.S
STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S
STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S

DTRSMKERNEL_LN = trsm_kernel_LN_4x4_barcelona.S
DTRSMKERNEL_LT = trsm_kernel_LT_4x4_barcelona.S
DTRSMKERNEL_RN = trsm_kernel_LT_4x4_barcelona.S
DTRSMKERNEL_RT = trsm_kernel_RT_4x4_barcelona.S

CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S
CTRSMKERNEL_LT = ztrsm_kernel_LT_4x2_sse.S
CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S
CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S

ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse2.S
ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse2.S
ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse2.S
ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse2.S

CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S
ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S

+ 1
- 1
kernel/x86_64/symv_L_sse.S View File

@@ -76,7 +76,7 @@
#define movsd movlps #define movsd movlps
#endif #endif


#if defined(BARCELONA) || defined(SHANGHAI)
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCATE)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16) #define PREFETCHSIZE (16 * 16)


+ 1
- 1
kernel/x86_64/symv_L_sse2.S View File

@@ -76,7 +76,7 @@
#define movsd movlpd #define movsd movlpd
#endif #endif


#if defined(BARCELONA) || defined(SHANGHAI)
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCATE)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16) #define PREFETCHSIZE (16 * 16)


+ 1
- 1
kernel/x86_64/symv_U_sse.S View File

@@ -76,7 +76,7 @@
#define movsd movlps #define movsd movlps
#endif #endif


#if defined(BARCELONA) || defined(SHANGHAI)
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCATE)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16) #define PREFETCHSIZE (16 * 16)


+ 1
- 1
kernel/x86_64/symv_U_sse2.S View File

@@ -76,7 +76,7 @@
#define movsd movlpd #define movsd movlpd
#endif #endif


#if defined(BARCELONA) || defined(SHANGHAI)
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCATE)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16) #define PREFETCHSIZE (16 * 16)


+ 1
- 1
kernel/x86_64/zsymv_L_sse.S View File

@@ -160,7 +160,7 @@
#define a3 %xmm14 #define a3 %xmm14
#define xt1 %xmm15 #define xt1 %xmm15


#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI)
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCATE)
#define MOVDDUP(a, b, c) movddup a(b), c #define MOVDDUP(a, b, c) movddup a(b), c
#define MOVDDUP2(a, b, c) movddup a##b, c #define MOVDDUP2(a, b, c) movddup a##b, c
#else #else


+ 1
- 1
kernel/x86_64/zsymv_L_sse2.S View File

@@ -76,7 +76,7 @@
#define movsd movlpd #define movsd movlpd
#endif #endif


#if defined(BARCELONA) || defined(SHANGHAI)
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCATE)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16) #define PREFETCHSIZE (16 * 16)


+ 1
- 1
kernel/x86_64/zsymv_U_sse.S View File

@@ -76,7 +76,7 @@
#define movsd movlpd #define movsd movlpd
#endif #endif


#if defined(BARCELONA) || defined(SHANGHAI)
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCATE)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16) #define PREFETCHSIZE (16 * 16)


+ 1
- 1
kernel/x86_64/zsymv_U_sse2.S View File

@@ -76,7 +76,7 @@
#define movsd movlpd #define movsd movlpd
#endif #endif


#if defined(BARCELONA) || defined(SHANGHAI)
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCATE)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16) #define PREFETCHSIZE (16 * 16)


+ 7
- 0
l1param.h View File

@@ -67,6 +67,13 @@
#define ALIGNED_ACCESS #define ALIGNED_ACCESS
#endif #endif


#ifdef BOBCATE
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (128 * 5)
#define ALIGNED_ACCESS
#endif

#ifdef NANO #ifdef NANO
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0


+ 1
- 1
l2param.h View File

@@ -85,7 +85,7 @@
#define movsd movlps #define movsd movlps
#endif #endif


#if defined(BARCELONA) || defined(SHANGHAI)
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCATE)
#define ALIGNED_ACCESS #define ALIGNED_ACCESS
#define MOVUPS_A movaps #define MOVUPS_A movaps
#define MOVUPS_XL movaps #define MOVUPS_XL movaps


+ 63
- 1
param.h View File

@@ -1,5 +1,5 @@
/***************************************************************************** /*****************************************************************************
Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS
Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
All rights reserved. All rights reserved.


Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@@ -208,6 +208,68 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


#endif #endif


#define SGEMM_DEFAULT_R sgemm_r
#define QGEMM_DEFAULT_R qgemm_r
#define DGEMM_DEFAULT_R dgemm_r
#define CGEMM_DEFAULT_R cgemm_r
#define ZGEMM_DEFAULT_R zgemm_r
#define XGEMM_DEFAULT_R xgemm_r

#define SYMV_P 16
#define HAVE_EXCLUSIVE_CACHE

#define GEMM_THREAD gemm_thread_mn

#endif

#if defined(BOBCATE)

#define SNUMOPT 8
#define DNUMOPT 4

#define GEMM_DEFAULT_OFFSET_A 64
#define GEMM_DEFAULT_OFFSET_B 832
#define GEMM_DEFAULT_ALIGN 0x0fffUL

#define SGEMM_DEFAULT_UNROLL_N 4
#define DGEMM_DEFAULT_UNROLL_N 4
#define QGEMM_DEFAULT_UNROLL_N 2
#define CGEMM_DEFAULT_UNROLL_N 2
#define ZGEMM_DEFAULT_UNROLL_N 2
#define XGEMM_DEFAULT_UNROLL_N 1

#ifdef ARCH_X86
#define SGEMM_DEFAULT_UNROLL_M 4
#define DGEMM_DEFAULT_UNROLL_M 2
#define QGEMM_DEFAULT_UNROLL_M 2
#define CGEMM_DEFAULT_UNROLL_M 2
#define ZGEMM_DEFAULT_UNROLL_M 1
#define XGEMM_DEFAULT_UNROLL_M 1
#else
#define SGEMM_DEFAULT_UNROLL_M 8
#define DGEMM_DEFAULT_UNROLL_M 4
#define QGEMM_DEFAULT_UNROLL_M 2
#define CGEMM_DEFAULT_UNROLL_M 4
#define ZGEMM_DEFAULT_UNROLL_M 2
#define XGEMM_DEFAULT_UNROLL_M 1
#endif


#define SGEMM_DEFAULT_P 448
#define DGEMM_DEFAULT_P 224
#define QGEMM_DEFAULT_P 112
#define CGEMM_DEFAULT_P 224
#define ZGEMM_DEFAULT_P 112
#define XGEMM_DEFAULT_P 56

#define SGEMM_DEFAULT_Q 224
#define DGEMM_DEFAULT_Q 224
#define QGEMM_DEFAULT_Q 224
#define CGEMM_DEFAULT_Q 224
#define ZGEMM_DEFAULT_Q 224
#define XGEMM_DEFAULT_Q 224


#define SGEMM_DEFAULT_R sgemm_r #define SGEMM_DEFAULT_R sgemm_r
#define QGEMM_DEFAULT_R qgemm_r #define QGEMM_DEFAULT_R qgemm_r
#define DGEMM_DEFAULT_R dgemm_r #define DGEMM_DEFAULT_R dgemm_r


Loading…
Cancel
Save