Browse Source

Added BULLDOZER target. So far it uses the Barcelona kernels.

tags/v0.2.6
Zhang Xianyi 13 years ago
parent
commit
bfaaa975e6
47 changed files with 156 additions and 132 deletions
  1. +1
    -0
      TargetList.txt
  2. +1
    -1
      driver/others/parameter.c
  3. +2
    -2
      getarch.c
  4. +16
    -0
      kernel/setparam-ref.c
  5. +10
    -10
      kernel/x86/gemm_kernel_4x4_barcelona.S
  6. +1
    -1
      kernel/x86/scal_sse.S
  7. +1
    -1
      kernel/x86/scal_sse2.S
  8. +5
    -5
      kernel/x86/trsm_kernel_LN_2x4_sse2.S
  9. +11
    -11
      kernel/x86/trsm_kernel_LN_4x4_sse.S
  10. +5
    -5
      kernel/x86/trsm_kernel_LT_2x4_sse2.S
  11. +11
    -11
      kernel/x86/trsm_kernel_LT_4x4_sse.S
  12. +5
    -5
      kernel/x86/trsm_kernel_RT_2x4_sse2.S
  13. +11
    -11
      kernel/x86/trsm_kernel_RT_4x4_sse.S
  14. +11
    -11
      kernel/x86/zgemm3m_kernel_4x4_barcelona.S
  15. +1
    -1
      kernel/x86/zgemv_n_sse.S
  16. +1
    -1
      kernel/x86/zgemv_n_sse2.S
  17. +1
    -1
      kernel/x86/zgemv_t_sse.S
  18. +1
    -1
      kernel/x86/zgemv_t_sse2.S
  19. +2
    -2
      kernel/x86/ztrsm_kernel_LN_2x2_sse.S
  20. +2
    -2
      kernel/x86/ztrsm_kernel_LT_2x2_sse.S
  21. +2
    -2
      kernel/x86/ztrsm_kernel_RT_2x2_sse.S
  22. +19
    -19
      kernel/x86_64/gemm_kernel_8x4_barcelona.S
  23. +2
    -2
      kernel/x86_64/gemm_ncopy_4_opteron.S
  24. +2
    -2
      kernel/x86_64/gemm_tcopy_4_opteron.S
  25. +1
    -1
      kernel/x86_64/izamax_sse2.S
  26. +1
    -1
      kernel/x86_64/scal_sse.S
  27. +1
    -1
      kernel/x86_64/scal_sse2.S
  28. +1
    -1
      kernel/x86_64/symv_L_sse.S
  29. +1
    -1
      kernel/x86_64/symv_L_sse2.S
  30. +1
    -1
      kernel/x86_64/symv_U_sse.S
  31. +1
    -1
      kernel/x86_64/symv_U_sse2.S
  32. +1
    -1
      kernel/x86_64/trsm_kernel_LN_8x4_sse.S
  33. +1
    -1
      kernel/x86_64/trsm_kernel_LT_8x4_sse.S
  34. +1
    -1
      kernel/x86_64/trsm_kernel_RT_8x4_sse.S
  35. +1
    -1
      kernel/x86_64/zgemm_ncopy_2.S
  36. +1
    -1
      kernel/x86_64/zsymv_L_sse.S
  37. +2
    -2
      kernel/x86_64/zsymv_L_sse2.S
  38. +2
    -2
      kernel/x86_64/zsymv_U_sse.S
  39. +2
    -2
      kernel/x86_64/zsymv_U_sse2.S
  40. +1
    -1
      kernel/x86_64/ztrsm_kernel_LN_2x2_sse2.S
  41. +1
    -1
      kernel/x86_64/ztrsm_kernel_LN_4x2_sse.S
  42. +1
    -1
      kernel/x86_64/ztrsm_kernel_LT_2x2_sse2.S
  43. +1
    -1
      kernel/x86_64/ztrsm_kernel_LT_4x2_sse.S
  44. +1
    -1
      kernel/x86_64/ztrsm_kernel_RT_2x2_sse2.S
  45. +1
    -1
      kernel/x86_64/ztrsm_kernel_RT_4x2_sse.S
  46. +7
    -0
      l1param.h
  47. +1
    -1
      l2param.h

+ 1
- 0
TargetList.txt View File

@@ -29,6 +29,7 @@ BARCELONA
SHANGHAI SHANGHAI
ISTANBUL ISTANBUL
BOBCAT BOBCAT
BULLDOZER


c)VIA CPU: c)VIA CPU:
SSE_GENERIC SSE_GENERIC


+ 1
- 1
driver/others/parameter.c View File

@@ -163,7 +163,7 @@ int get_L2_size(void){


int eax, ebx, ecx, edx; int eax, ebx, ecx, edx;


#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || \
#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \
defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \
defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC)




+ 2
- 2
getarch.c View File

@@ -385,12 +385,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FORCE_INTEL #define FORCE_INTEL
#define ARCHITECTURE "X86" #define ARCHITECTURE "X86"
#define SUBARCHITECTURE "BULLDOZER" #define SUBARCHITECTURE "BULLDOZER"
#define ARCHCONFIG "-DBARCELONA " \
#define ARCHCONFIG "-DBULLDOZER " \
"-DL1_DATA_SIZE=49152 -DL1_DATA_LINESIZE=64 " \ "-DL1_DATA_SIZE=49152 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=1024000 -DL2_LINESIZE=64 -DL3_SIZE=16777216 " \ "-DL2_SIZE=1024000 -DL2_LINESIZE=64 -DL3_SIZE=16777216 " \
"-DDTB_DEFAULT_ENTRIES=32 -DDTB_SIZE=4096 " \ "-DDTB_DEFAULT_ENTRIES=32 -DDTB_SIZE=4096 " \
"-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 " \ "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 " \
"-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU" \
"-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU " \
"-DHAVE_AVX -DHAVE_FMA4" "-DHAVE_AVX -DHAVE_FMA4"
#define LIBNAME "bulldozer" #define LIBNAME "bulldozer"
#define CORENAME "BULLDOZER" #define CORENAME "BULLDOZER"


+ 16
- 0
kernel/setparam-ref.c View File

@@ -810,6 +810,22 @@ static void init_parameter(void) {
#endif #endif
#endif #endif


#ifdef BULLDOZER

#ifdef DEBUG
fprintf(stderr, "Bulldozer\n");
#endif

TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
#ifdef EXPRECISION
TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
#endif
#endif

#ifdef NANO #ifdef NANO


#ifdef DEBUG #ifdef DEBUG


+ 10
- 10
kernel/x86/gemm_kernel_4x4_barcelona.S View File

@@ -596,7 +596,7 @@
.L22: .L22:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
addps %xmm2, %xmm4 addps %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movsd 4 * SIZE(BB), %xmm2 movsd 4 * SIZE(BB), %xmm2
@@ -842,7 +842,7 @@
.L32: .L32:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
addss %xmm2, %xmm4 addss %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movss 4 * SIZE(BB), %xmm2 movss 4 * SIZE(BB), %xmm2
@@ -1168,7 +1168,7 @@


.L52: .L52:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulps 4 * SIZE(BB), %xmm0 mulps 4 * SIZE(BB), %xmm0
@@ -1198,7 +1198,7 @@
addps %xmm0, %xmm5 addps %xmm0, %xmm5
movaps 32 * SIZE(AA), %xmm0 movaps 32 * SIZE(AA), %xmm0


#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
mulps %xmm1, %xmm2 mulps %xmm1, %xmm2
@@ -1347,7 +1347,7 @@
ALIGN_4 ALIGN_4


.L62: .L62:
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif


@@ -1531,7 +1531,7 @@


.L72: .L72:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulss 4 * SIZE(BB), %xmm0 mulss 4 * SIZE(BB), %xmm0
@@ -1778,7 +1778,7 @@


.L92: .L92:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movaps 4 * SIZE(AA), %xmm0 movaps 4 * SIZE(AA), %xmm0
@@ -1793,7 +1793,7 @@
mulps 12 * SIZE(BB), %xmm0 mulps 12 * SIZE(BB), %xmm0
addps %xmm0, %xmm7 addps %xmm0, %xmm7
movaps 32 * SIZE(AA), %xmm0 movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
mulps %xmm1, %xmm3 mulps %xmm1, %xmm3
@@ -1924,7 +1924,7 @@


.L102: .L102:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movsd 2 * SIZE(AA), %xmm0 movsd 2 * SIZE(AA), %xmm0
@@ -2069,7 +2069,7 @@


.L112: .L112:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movss 1 * SIZE(AA), %xmm0 movss 1 * SIZE(AA), %xmm0


+ 1
- 1
kernel/x86/scal_sse.S View File

@@ -269,7 +269,7 @@
sarl $5, I sarl $5, I
jle .L113 jle .L113


#if defined(BARCELONA)
#if defined(BARCELONA) || defined(BULLDOZER)


movaps %xmm0, %xmm1 movaps %xmm0, %xmm1
mulps -32 * SIZE(X), %xmm1 mulps -32 * SIZE(X), %xmm1


+ 1
- 1
kernel/x86/scal_sse2.S View File

@@ -253,7 +253,7 @@
sarl $4, I sarl $4, I
jle .L113 jle .L113


#if defined(BARCELONA)
#if defined(BARCELONA) || defined(BULLDOZER)


movaps %xmm0, %xmm1 movaps %xmm0, %xmm1
mulpd -16 * SIZE(X), %xmm1 mulpd -16 * SIZE(X), %xmm1


+ 5
- 5
kernel/x86/trsm_kernel_LN_2x4_sse2.S View File

@@ -69,7 +69,7 @@
#define STACK_ALIGN 4096 #define STACK_ALIGN 4096
#define STACK_OFFSET 1024 #define STACK_OFFSET 1024


#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHSIZE (8 * 10 + 4) #define PREFETCHSIZE (8 * 10 + 4)
#endif #endif
@@ -439,7 +439,7 @@
.L22: .L22:
mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm2
addsd %xmm2, %xmm4 addsd %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movlpd 2 * SIZE(BB), %xmm2 movlpd 2 * SIZE(BB), %xmm2
@@ -488,7 +488,7 @@
movlpd 40 * SIZE(BB), %xmm3 movlpd 40 * SIZE(BB), %xmm3
addsd %xmm0, %xmm7 addsd %xmm0, %xmm7
movlpd 8 * SIZE(AA), %xmm0 movlpd 8 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
#endif #endif
mulsd %xmm1, %xmm2 mulsd %xmm1, %xmm2
@@ -1697,7 +1697,7 @@


.L42: .L42:
mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulpd 2 * SIZE(BB), %xmm0 mulpd 2 * SIZE(BB), %xmm0
@@ -1727,7 +1727,7 @@
addpd %xmm0, %xmm7 addpd %xmm0, %xmm7
movapd 16 * SIZE(AA), %xmm0 movapd 16 * SIZE(AA), %xmm0


#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
#endif #endif
mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm2


+ 11
- 11
kernel/x86/trsm_kernel_LN_4x4_sse.S View File

@@ -64,7 +64,7 @@
#define BORIG 60(%esp) #define BORIG 60(%esp)
#define BUFFER 128(%esp) #define BUFFER 128(%esp)


#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 10 + 8) #define PREFETCHSIZE (16 * 10 + 8)
@@ -437,7 +437,7 @@
.L32: .L32:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
addss %xmm2, %xmm4 addss %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movss 4 * SIZE(BB), %xmm2 movss 4 * SIZE(BB), %xmm2
@@ -833,7 +833,7 @@
.L22: .L22:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
addps %xmm2, %xmm4 addps %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movaps 4 * SIZE(BB), %xmm2 movaps 4 * SIZE(BB), %xmm2
@@ -1848,7 +1848,7 @@


.L72: .L72:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulss 4 * SIZE(BB), %xmm0 mulss 4 * SIZE(BB), %xmm0
@@ -2109,7 +2109,7 @@
ALIGN_4 ALIGN_4


.L62: .L62:
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif


@@ -2429,7 +2429,7 @@


.L52: .L52:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulps 4 * SIZE(BB), %xmm0 mulps 4 * SIZE(BB), %xmm0
@@ -2459,7 +2459,7 @@
addps %xmm0, %xmm5 addps %xmm0, %xmm5
movaps 32 * SIZE(AA), %xmm0 movaps 32 * SIZE(AA), %xmm0


#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
mulps %xmm1, %xmm2 mulps %xmm1, %xmm2
@@ -2952,7 +2952,7 @@


.L112: .L112:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movss 1 * SIZE(AA), %xmm0 movss 1 * SIZE(AA), %xmm0
@@ -3148,7 +3148,7 @@


.L102: .L102:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movsd 2 * SIZE(AA), %xmm0 movsd 2 * SIZE(AA), %xmm0
@@ -3389,7 +3389,7 @@


.L92: .L92:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movaps 4 * SIZE(AA), %xmm0 movaps 4 * SIZE(AA), %xmm0
@@ -3404,7 +3404,7 @@
mulps 12 * SIZE(BB), %xmm0 mulps 12 * SIZE(BB), %xmm0
addps %xmm0, %xmm7 addps %xmm0, %xmm7
movaps 32 * SIZE(AA), %xmm0 movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
mulps %xmm1, %xmm3 mulps %xmm1, %xmm3


+ 5
- 5
kernel/x86/trsm_kernel_LT_2x4_sse2.S View File

@@ -69,7 +69,7 @@
#define STACK_ALIGN 4096 #define STACK_ALIGN 4096
#define STACK_OFFSET 1024 #define STACK_OFFSET 1024


#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHSIZE (8 * 10 + 4) #define PREFETCHSIZE (8 * 10 + 4)
#endif #endif
@@ -910,7 +910,7 @@
.L22: .L22:
mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm2
addsd %xmm2, %xmm4 addsd %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movlpd 2 * SIZE(BB), %xmm2 movlpd 2 * SIZE(BB), %xmm2
@@ -959,7 +959,7 @@
movlpd 40 * SIZE(BB), %xmm3 movlpd 40 * SIZE(BB), %xmm3
addsd %xmm0, %xmm7 addsd %xmm0, %xmm7
movlpd 8 * SIZE(AA), %xmm0 movlpd 8 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
#endif #endif
mulsd %xmm1, %xmm2 mulsd %xmm1, %xmm2
@@ -1439,7 +1439,7 @@


.L42: .L42:
mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulpd 2 * SIZE(BB), %xmm0 mulpd 2 * SIZE(BB), %xmm0
@@ -1469,7 +1469,7 @@
addpd %xmm0, %xmm7 addpd %xmm0, %xmm7
movapd 16 * SIZE(AA), %xmm0 movapd 16 * SIZE(AA), %xmm0


#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
#endif #endif
mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm2


+ 11
- 11
kernel/x86/trsm_kernel_LT_4x4_sse.S View File

@@ -64,7 +64,7 @@
#define BORIG 60(%esp) #define BORIG 60(%esp)
#define BUFFER 128(%esp) #define BUFFER 128(%esp)


#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 10 + 8) #define PREFETCHSIZE (16 * 10 + 8)
@@ -872,7 +872,7 @@
.L22: .L22:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
addps %xmm2, %xmm4 addps %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movaps 4 * SIZE(BB), %xmm2 movaps 4 * SIZE(BB), %xmm2
@@ -1316,7 +1316,7 @@
.L32: .L32:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
addss %xmm2, %xmm4 addss %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movss 4 * SIZE(BB), %xmm2 movss 4 * SIZE(BB), %xmm2
@@ -1855,7 +1855,7 @@


.L52: .L52:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulps 4 * SIZE(BB), %xmm0 mulps 4 * SIZE(BB), %xmm0
@@ -1885,7 +1885,7 @@
addps %xmm0, %xmm5 addps %xmm0, %xmm5
movaps 32 * SIZE(AA), %xmm0 movaps 32 * SIZE(AA), %xmm0


#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
mulps %xmm1, %xmm2 mulps %xmm1, %xmm2
@@ -2249,7 +2249,7 @@
ALIGN_4 ALIGN_4


.L62: .L62:
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif


@@ -2562,7 +2562,7 @@


.L72: .L72:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulss 4 * SIZE(BB), %xmm0 mulss 4 * SIZE(BB), %xmm0
@@ -2957,7 +2957,7 @@


.L92: .L92:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movaps 4 * SIZE(AA), %xmm0 movaps 4 * SIZE(AA), %xmm0
@@ -2972,7 +2972,7 @@
mulps 12 * SIZE(BB), %xmm0 mulps 12 * SIZE(BB), %xmm0
addps %xmm0, %xmm7 addps %xmm0, %xmm7
movaps 32 * SIZE(AA), %xmm0 movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
mulps %xmm1, %xmm3 mulps %xmm1, %xmm3
@@ -3280,7 +3280,7 @@


.L102: .L102:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movsd 2 * SIZE(AA), %xmm0 movsd 2 * SIZE(AA), %xmm0
@@ -3515,7 +3515,7 @@


.L112: .L112:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movss 1 * SIZE(AA), %xmm0 movss 1 * SIZE(AA), %xmm0


+ 5
- 5
kernel/x86/trsm_kernel_RT_2x4_sse2.S View File

@@ -69,7 +69,7 @@
#define STACK_ALIGN 4096 #define STACK_ALIGN 4096
#define STACK_OFFSET 1024 #define STACK_OFFSET 1024


#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHSIZE (8 * 10 + 4) #define PREFETCHSIZE (8 * 10 + 4)
#endif #endif
@@ -1036,7 +1036,7 @@


.L42: .L42:
mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulpd 2 * SIZE(BB), %xmm0 mulpd 2 * SIZE(BB), %xmm0
@@ -1066,7 +1066,7 @@
addpd %xmm0, %xmm7 addpd %xmm0, %xmm7
movapd 16 * SIZE(AA), %xmm0 movapd 16 * SIZE(AA), %xmm0


#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
#endif #endif
mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm2
@@ -2224,7 +2224,7 @@
.L22: .L22:
mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm2
addsd %xmm2, %xmm4 addsd %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movlpd 2 * SIZE(BB), %xmm2 movlpd 2 * SIZE(BB), %xmm2
@@ -2273,7 +2273,7 @@
movlpd 40 * SIZE(BB), %xmm3 movlpd 40 * SIZE(BB), %xmm3
addsd %xmm0, %xmm7 addsd %xmm0, %xmm7
movlpd 8 * SIZE(AA), %xmm0 movlpd 8 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
#endif #endif
mulsd %xmm1, %xmm2 mulsd %xmm1, %xmm2


+ 11
- 11
kernel/x86/trsm_kernel_RT_4x4_sse.S View File

@@ -64,7 +64,7 @@
#define BORIG 60(%esp) #define BORIG 60(%esp)
#define BUFFER 128(%esp) #define BUFFER 128(%esp)


#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 10 + 8) #define PREFETCHSIZE (16 * 10 + 8)
@@ -439,7 +439,7 @@


.L92: .L92:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movaps 4 * SIZE(AA), %xmm0 movaps 4 * SIZE(AA), %xmm0
@@ -454,7 +454,7 @@
mulps 12 * SIZE(BB), %xmm0 mulps 12 * SIZE(BB), %xmm0
addps %xmm0, %xmm7 addps %xmm0, %xmm7
movaps 32 * SIZE(AA), %xmm0 movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
mulps %xmm1, %xmm3 mulps %xmm1, %xmm3
@@ -758,7 +758,7 @@


.L102: .L102:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movsd 2 * SIZE(AA), %xmm0 movsd 2 * SIZE(AA), %xmm0
@@ -993,7 +993,7 @@


.L112: .L112:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movss 1 * SIZE(AA), %xmm0 movss 1 * SIZE(AA), %xmm0
@@ -1324,7 +1324,7 @@


.L52: .L52:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulps 4 * SIZE(BB), %xmm0 mulps 4 * SIZE(BB), %xmm0
@@ -1354,7 +1354,7 @@
addps %xmm0, %xmm5 addps %xmm0, %xmm5
movaps 32 * SIZE(AA), %xmm0 movaps 32 * SIZE(AA), %xmm0


#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
mulps %xmm1, %xmm2 mulps %xmm1, %xmm2
@@ -1718,7 +1718,7 @@
ALIGN_4 ALIGN_4


.L62: .L62:
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif


@@ -2031,7 +2031,7 @@


.L72: .L72:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulss 4 * SIZE(BB), %xmm0 mulss 4 * SIZE(BB), %xmm0
@@ -2859,7 +2859,7 @@
.L22: .L22:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
addps %xmm2, %xmm4 addps %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movaps 4 * SIZE(BB), %xmm2 movaps 4 * SIZE(BB), %xmm2
@@ -3303,7 +3303,7 @@
.L32: .L32:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
addss %xmm2, %xmm4 addss %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movss 4 * SIZE(BB), %xmm2 movss 4 * SIZE(BB), %xmm2


+ 11
- 11
kernel/x86/zgemm3m_kernel_4x4_barcelona.S View File

@@ -74,7 +74,7 @@
#define BB %ecx #define BB %ecx
#define LDC %ebp #define LDC %ebp


#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
#define movsd movlps #define movsd movlps
#endif #endif


@@ -625,7 +625,7 @@
.L22: .L22:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
addps %xmm2, %xmm4 addps %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movsd 4 * SIZE(BB), %xmm2 movsd 4 * SIZE(BB), %xmm2
@@ -870,7 +870,7 @@
.L32: .L32:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
addss %xmm2, %xmm4 addss %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movss 4 * SIZE(BB), %xmm2 movss 4 * SIZE(BB), %xmm2
@@ -1173,7 +1173,7 @@


.L52: .L52:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulps 4 * SIZE(BB), %xmm0 mulps 4 * SIZE(BB), %xmm0
@@ -1203,7 +1203,7 @@
addps %xmm0, %xmm5 addps %xmm0, %xmm5
movaps 32 * SIZE(AA), %xmm0 movaps 32 * SIZE(AA), %xmm0


#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
mulps %xmm1, %xmm2 mulps %xmm1, %xmm2
@@ -1359,7 +1359,7 @@
ALIGN_4 ALIGN_4


.L62: .L62:
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif


@@ -1536,7 +1536,7 @@


.L72: .L72:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulss 4 * SIZE(BB), %xmm0 mulss 4 * SIZE(BB), %xmm0
@@ -1794,7 +1794,7 @@


.L92: .L92:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movaps 4 * SIZE(AA), %xmm0 movaps 4 * SIZE(AA), %xmm0
@@ -1809,7 +1809,7 @@
mulps 12 * SIZE(BB), %xmm0 mulps 12 * SIZE(BB), %xmm0
addps %xmm0, %xmm7 addps %xmm0, %xmm7
movaps 32 * SIZE(AA), %xmm0 movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
mulps %xmm1, %xmm3 mulps %xmm1, %xmm3
@@ -1936,7 +1936,7 @@


.L102: .L102:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movsd 2 * SIZE(AA), %xmm0 movsd 2 * SIZE(AA), %xmm0
@@ -2069,7 +2069,7 @@


.L112: .L112:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movss 1 * SIZE(AA), %xmm0 movss 1 * SIZE(AA), %xmm0


+ 1
- 1
kernel/x86/zgemv_n_sse.S View File

@@ -71,7 +71,7 @@
#define movsd movlps #define movsd movlps
#endif #endif


#ifdef BARCELONA
#if defined(BARCELONA) || defined(BULLDOZER)
#define PREFETCH prefetchnta #define PREFETCH prefetchnta
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 5) #define PREFETCHSIZE (16 * 5)


+ 1
- 1
kernel/x86/zgemv_n_sse2.S View File

@@ -58,7 +58,7 @@
#define movsd movlps #define movsd movlps
#endif #endif


#ifdef BARCELONA
#if defined(BARCELONA) || defined(BULLDOZER)
#define PREFETCH prefetchnta #define PREFETCH prefetchnta
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (8 * 5) #define PREFETCHSIZE (8 * 5)


+ 1
- 1
kernel/x86/zgemv_t_sse.S View File

@@ -71,7 +71,7 @@
#define movsd movlps #define movsd movlps
#endif #endif


#ifdef BARCELONA
#if defined(BARCELONA) || defined(BULLDOZER)
#define PREFETCH prefetchnta #define PREFETCH prefetchnta
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 5) #define PREFETCHSIZE (16 * 5)


+ 1
- 1
kernel/x86/zgemv_t_sse2.S View File

@@ -58,7 +58,7 @@
#define movsd movlps #define movsd movlps
#endif #endif


#ifdef BARCELONA
#if defined(BARCELONA) || defined(BULLDOZER)
#define PREFETCH prefetchnta #define PREFETCH prefetchnta
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (8 * 5) #define PREFETCHSIZE (8 * 5)


+ 2
- 2
kernel/x86/ztrsm_kernel_LN_2x2_sse.S View File

@@ -75,7 +75,7 @@
#define STACK_ALIGN 4096 #define STACK_ALIGN 4096
#define STACK_OFFSET 1024 #define STACK_OFFSET 1024


#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCHSIZE (16 * 10 + 8) #define PREFETCHSIZE (16 * 10 + 8)
#define WPREFETCHSIZE 112 #define WPREFETCHSIZE 112
#define PREFETCH prefetch #define PREFETCH prefetch
@@ -533,7 +533,7 @@
addps %xmm0, %xmm7 addps %xmm0, %xmm7
movsd 16 * SIZE(AA), %xmm0 movsd 16 * SIZE(AA), %xmm0
mulps %xmm1, %xmm2 mulps %xmm1, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
addps %xmm2, %xmm4 addps %xmm2, %xmm4


+ 2
- 2
kernel/x86/ztrsm_kernel_LT_2x2_sse.S View File

@@ -75,7 +75,7 @@
#define STACK_ALIGN 4096 #define STACK_ALIGN 4096
#define STACK_OFFSET 1024 #define STACK_OFFSET 1024


#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCHSIZE (16 * 10 + 8) #define PREFETCHSIZE (16 * 10 + 8)
#define WPREFETCHSIZE 112 #define WPREFETCHSIZE 112
#define PREFETCH prefetch #define PREFETCH prefetch
@@ -994,7 +994,7 @@
addps %xmm0, %xmm7 addps %xmm0, %xmm7
movsd 16 * SIZE(AA), %xmm0 movsd 16 * SIZE(AA), %xmm0
mulps %xmm1, %xmm2 mulps %xmm1, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
addps %xmm2, %xmm4 addps %xmm2, %xmm4


+ 2
- 2
kernel/x86/ztrsm_kernel_RT_2x2_sse.S View File

@@ -75,7 +75,7 @@
#define STACK_ALIGN 4096 #define STACK_ALIGN 4096
#define STACK_OFFSET 1024 #define STACK_OFFSET 1024


#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCHSIZE (16 * 10 + 8) #define PREFETCHSIZE (16 * 10 + 8)
#define WPREFETCHSIZE 112 #define WPREFETCHSIZE 112
#define PREFETCH prefetch #define PREFETCH prefetch
@@ -1820,7 +1820,7 @@
addps %xmm0, %xmm7 addps %xmm0, %xmm7
movsd 16 * SIZE(AA), %xmm0 movsd 16 * SIZE(AA), %xmm0
mulps %xmm1, %xmm2 mulps %xmm1, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
addps %xmm2, %xmm4 addps %xmm2, %xmm4


+ 19
- 19
kernel/x86_64/gemm_kernel_8x4_barcelona.S View File

@@ -930,7 +930,7 @@
.L22: .L22:
mulps %xmm8, %xmm9 mulps %xmm8, %xmm9
addps %xmm9, %xmm0 addps %xmm9, %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
#endif #endif
movaps 4 * SIZE(BO), %xmm9 movaps 4 * SIZE(BO), %xmm9
@@ -983,7 +983,7 @@
addps %xmm8, %xmm3 addps %xmm8, %xmm3
movaps 0 * SIZE(AO), %xmm8 movaps 0 * SIZE(AO), %xmm8


#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) PREFETCH (PREFETCHSIZE + 16) * SIZE(AO)
#endif #endif
mulps %xmm10, %xmm9 mulps %xmm10, %xmm9
@@ -1178,7 +1178,7 @@
.L32: .L32:
mulps %xmm8, %xmm9 mulps %xmm8, %xmm9
addps %xmm9, %xmm0 addps %xmm9, %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
#endif #endif
movsd 4 * SIZE(BO), %xmm9 movsd 4 * SIZE(BO), %xmm9
@@ -1423,7 +1423,7 @@
.L42: .L42:
mulss %xmm8, %xmm9 mulss %xmm8, %xmm9
addss %xmm9, %xmm0 addss %xmm9, %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
#endif #endif
movss 4 * SIZE(BO), %xmm9 movss 4 * SIZE(BO), %xmm9
@@ -1765,7 +1765,7 @@


.L62: .L62:
mulps %xmm8, %xmm9 mulps %xmm8, %xmm9
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
#endif #endif
mulps 4 * SIZE(BO), %xmm8 mulps 4 * SIZE(BO), %xmm8
@@ -1793,7 +1793,7 @@
addps %xmm8, %xmm5 addps %xmm8, %xmm5
movaps 32 * SIZE(AO), %xmm8 movaps 32 * SIZE(AO), %xmm8


#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) PREFETCH (PREFETCHSIZE + 16) * SIZE(AO)
#endif #endif
mulps %xmm10, %xmm11 mulps %xmm10, %xmm11
@@ -1822,7 +1822,7 @@
addps %xmm10, %xmm5 addps %xmm10, %xmm5
movaps 48 * SIZE(AO), %xmm10 movaps 48 * SIZE(AO), %xmm10


#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) PREFETCH (PREFETCHSIZE + 32) * SIZE(AO)
#endif #endif
mulps %xmm12, %xmm13 mulps %xmm12, %xmm13
@@ -1851,7 +1851,7 @@
addps %xmm12, %xmm5 addps %xmm12, %xmm5
movaps 64 * SIZE(AO), %xmm12 movaps 64 * SIZE(AO), %xmm12


#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) PREFETCH (PREFETCHSIZE + 48) * SIZE(AO)
#endif #endif
mulps %xmm14, %xmm15 mulps %xmm14, %xmm15
@@ -2024,7 +2024,7 @@


.L72: .L72:
mulps %xmm8, %xmm9 mulps %xmm8, %xmm9
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
#endif #endif


@@ -2208,7 +2208,7 @@
.L82: .L82:
mulps %xmm8, %xmm9 mulps %xmm8, %xmm9
addps %xmm9, %xmm0 addps %xmm9, %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
#endif #endif
movsd 4 * SIZE(BO), %xmm9 movsd 4 * SIZE(BO), %xmm9
@@ -2395,7 +2395,7 @@
.L92: .L92:
mulps %xmm8, %xmm9 mulps %xmm8, %xmm9
addps %xmm9, %xmm0 addps %xmm9, %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
#endif #endif
movss 4 * SIZE(BO), %xmm9 movss 4 * SIZE(BO), %xmm9
@@ -2670,7 +2670,7 @@


.L112: .L112:
mulps %xmm9, %xmm8 mulps %xmm9, %xmm8
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
#endif #endif


@@ -2687,7 +2687,7 @@
addps %xmm9, %xmm4 addps %xmm9, %xmm4
movaps 8 * SIZE(BO), %xmm9 movaps 8 * SIZE(BO), %xmm9


#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) PREFETCH (PREFETCHSIZE + 16) * SIZE(AO)
#endif #endif
mulps %xmm9, %xmm10 mulps %xmm9, %xmm10
@@ -2704,7 +2704,7 @@
addps %xmm9, %xmm4 addps %xmm9, %xmm4
movaps 32 * SIZE(BO), %xmm9 movaps 32 * SIZE(BO), %xmm9


#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) PREFETCH (PREFETCHSIZE + 32) * SIZE(AO)
#endif #endif
mulps %xmm11, %xmm12 mulps %xmm11, %xmm12
@@ -2721,7 +2721,7 @@
addps %xmm11, %xmm4 addps %xmm11, %xmm4
movaps 24 * SIZE(BO), %xmm11 movaps 24 * SIZE(BO), %xmm11


#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) PREFETCH (PREFETCHSIZE + 48) * SIZE(AO)
#endif #endif
mulps %xmm11, %xmm14 mulps %xmm11, %xmm14
@@ -2857,7 +2857,7 @@


.L122: .L122:
mulps %xmm8, %xmm9 mulps %xmm8, %xmm9
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
#endif #endif
movaps -28 * SIZE(AO), %xmm8 movaps -28 * SIZE(AO), %xmm8
@@ -2873,7 +2873,7 @@
addps %xmm8, %xmm3 addps %xmm8, %xmm3
movaps 0 * SIZE(AO), %xmm8 movaps 0 * SIZE(AO), %xmm8


#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) PREFETCH (PREFETCHSIZE + 16) * SIZE(AO)
#endif #endif
mulps %xmm10, %xmm11 mulps %xmm10, %xmm11
@@ -3003,7 +3003,7 @@


.L132: .L132:
mulps %xmm8, %xmm9 mulps %xmm8, %xmm9
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
#endif #endif
movsd -30 * SIZE(AO), %xmm8 movsd -30 * SIZE(AO), %xmm8
@@ -3150,7 +3150,7 @@


.L142: .L142:
mulss %xmm8, %xmm9 mulss %xmm8, %xmm9
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
#endif #endif
movss -31 * SIZE(AO), %xmm8 movss -31 * SIZE(AO), %xmm8


+ 2
- 2
kernel/x86_64/gemm_ncopy_4_opteron.S View File

@@ -39,7 +39,7 @@
#define ASSEMBLER #define ASSEMBLER
#include "common.h" #include "common.h"


#if defined(BARCELONA) || defined(SHANGHAI)
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
#define RPREFETCHSIZE (12 + 4) #define RPREFETCHSIZE (12 + 4)
#define WPREFETCHSIZE (48 + 4) #define WPREFETCHSIZE (48 + 4)
#define MOVNTQ MOVQ #define MOVNTQ MOVQ
@@ -79,7 +79,7 @@
#define AO3 %r13 #define AO3 %r13
#define AO4 %rax #define AO4 %rax


#if defined(BARCELONA) || defined(SHANGHAI)
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
#define RPREFETCH prefetch #define RPREFETCH prefetch
#else #else
#define RPREFETCH prefetch #define RPREFETCH prefetch


+ 2
- 2
kernel/x86_64/gemm_tcopy_4_opteron.S View File

@@ -39,7 +39,7 @@
#define ASSEMBLER #define ASSEMBLER
#include "common.h" #include "common.h"


#if defined(BARCELONA) || defined(SHANGHAI)
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
#define RPREFETCHSIZE (12 + 4) #define RPREFETCHSIZE (12 + 4)
#define WPREFETCHSIZE (12 + 4) #define WPREFETCHSIZE (12 + 4)
#define MOVNTQ MOVQ #define MOVNTQ MOVQ
@@ -96,7 +96,7 @@


#endif #endif


#if defined(BARCELONA) || defined(SHANGHAI)
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
#define RPREFETCH prefetch #define RPREFETCH prefetch
#else #else
#define RPREFETCH prefetch #define RPREFETCH prefetch


+ 1
- 1
kernel/x86_64/izamax_sse2.S View File

@@ -469,7 +469,7 @@
ALIGN_4 ALIGN_4


.L71: .L71:
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
prefetch PREFETCHSIZE * SIZE(X) prefetch PREFETCHSIZE * SIZE(X)
#endif #endif




+ 1
- 1
kernel/x86_64/scal_sse.S View File

@@ -266,7 +266,7 @@
sarq $5, I sarq $5, I
jle .L113 jle .L113


#if defined(BARCELONA) || defined(SHANGHAI)
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)


movaps %xmm0, %xmm1 movaps %xmm0, %xmm1
mulps -32 * SIZE(X), %xmm1 mulps -32 * SIZE(X), %xmm1


+ 1
- 1
kernel/x86_64/scal_sse2.S View File

@@ -251,7 +251,7 @@
sarq $4, I sarq $4, I
jle .L113 jle .L113


#if defined(BARCELONA) || defined(SHANGHAI)
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)


movaps %xmm0, %xmm1 movaps %xmm0, %xmm1
mulpd -16 * SIZE(X), %xmm1 mulpd -16 * SIZE(X), %xmm1


+ 1
- 1
kernel/x86_64/symv_L_sse.S View File

@@ -76,7 +76,7 @@
#define movsd movlps #define movsd movlps
#endif #endif


#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16) #define PREFETCHSIZE (16 * 16)


+ 1
- 1
kernel/x86_64/symv_L_sse2.S View File

@@ -76,7 +76,7 @@
#define movsd movlpd #define movsd movlpd
#endif #endif


#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16) #define PREFETCHSIZE (16 * 16)


+ 1
- 1
kernel/x86_64/symv_U_sse.S View File

@@ -76,7 +76,7 @@
#define movsd movlps #define movsd movlps
#endif #endif


#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16) #define PREFETCHSIZE (16 * 16)


+ 1
- 1
kernel/x86_64/symv_U_sse2.S View File

@@ -76,7 +76,7 @@
#define movsd movlpd #define movsd movlpd
#endif #endif


#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16) #define PREFETCHSIZE (16 * 16)


+ 1
- 1
kernel/x86_64/trsm_kernel_LN_8x4_sse.S View File

@@ -86,7 +86,7 @@
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#endif #endif


#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define movsd movlps #define movsd movlps


+ 1
- 1
kernel/x86_64/trsm_kernel_LT_8x4_sse.S View File

@@ -86,7 +86,7 @@
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#endif #endif


#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define movsd movlps #define movsd movlps


+ 1
- 1
kernel/x86_64/trsm_kernel_RT_8x4_sse.S View File

@@ -86,7 +86,7 @@
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#endif #endif


#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define movsd movlps #define movsd movlps


+ 1
- 1
kernel/x86_64/zgemm_ncopy_2.S View File

@@ -85,7 +85,7 @@
#define movsd movlpd #define movsd movlpd
#endif #endif


#if defined(BARCELONA) || defined(SHANGHAI)
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
#define RPREFETCHSIZE 32 #define RPREFETCHSIZE 32
#define WPREFETCHSIZE 48 #define WPREFETCHSIZE 48
#endif #endif


+ 1
- 1
kernel/x86_64/zsymv_L_sse.S View File

@@ -160,7 +160,7 @@
#define a3 %xmm14 #define a3 %xmm14
#define xt1 %xmm15 #define xt1 %xmm15


#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#define MOVDDUP(a, b, c) movddup a(b), c #define MOVDDUP(a, b, c) movddup a(b), c
#define MOVDDUP2(a, b, c) movddup a##b, c #define MOVDDUP2(a, b, c) movddup a##b, c
#else #else


+ 2
- 2
kernel/x86_64/zsymv_L_sse2.S View File

@@ -76,7 +76,7 @@
#define movsd movlpd #define movsd movlpd
#endif #endif


#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16) #define PREFETCHSIZE (16 * 16)
@@ -167,7 +167,7 @@
#define a3 %xmm14 #define a3 %xmm14
#define xt1 %xmm15 #define xt1 %xmm15


#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI)
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
#define MOVDDUP(a, b, c) movddup a(b), c #define MOVDDUP(a, b, c) movddup a(b), c
#define MOVDDUP2(a, b, c) movddup a##b, c #define MOVDDUP2(a, b, c) movddup a##b, c
#else #else


+ 2
- 2
kernel/x86_64/zsymv_U_sse.S View File

@@ -76,7 +76,7 @@
#define movsd movlpd #define movsd movlpd
#endif #endif


#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16) #define PREFETCHSIZE (16 * 16)
@@ -166,7 +166,7 @@
#define xt1 %xmm14 #define xt1 %xmm14
#define xt2 %xmm15 #define xt2 %xmm15


#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI)
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
#define MOVDDUP(a, b, c) movddup a(b), c #define MOVDDUP(a, b, c) movddup a(b), c
#define MOVDDUP2(a, b, c) movddup a##b, c #define MOVDDUP2(a, b, c) movddup a##b, c
#else #else


+ 2
- 2
kernel/x86_64/zsymv_U_sse2.S View File

@@ -76,7 +76,7 @@
#define movsd movlpd #define movsd movlpd
#endif #endif


#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16) #define PREFETCHSIZE (16 * 16)
@@ -166,7 +166,7 @@
#define a3 %xmm14 #define a3 %xmm14
#define xt1 %xmm15 #define xt1 %xmm15


#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI)
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
#define MOVDDUP(a, b, c) movddup a(b), c #define MOVDDUP(a, b, c) movddup a(b), c
#define MOVDDUP2(a, b, c) movddup a##b, c #define MOVDDUP2(a, b, c) movddup a##b, c
#else #else


+ 1
- 1
kernel/x86_64/ztrsm_kernel_LN_2x2_sse2.S View File

@@ -86,7 +86,7 @@
#define BORIG 72(%rsp) #define BORIG 72(%rsp)
#define BUFFER 128(%rsp) #define BUFFER 128(%rsp)


#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHNTA prefetchnta #define PREFETCHNTA prefetchnta


+ 1
- 1
kernel/x86_64/ztrsm_kernel_LN_4x2_sse.S View File

@@ -95,7 +95,7 @@
#define PREFETCHSIZE (8 * 6 + 4) #define PREFETCHSIZE (8 * 6 + 4)
#endif #endif


#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHNTA prefetchnta #define PREFETCHNTA prefetchnta


+ 1
- 1
kernel/x86_64/ztrsm_kernel_LT_2x2_sse2.S View File

@@ -86,7 +86,7 @@
#define BORIG 72(%rsp) #define BORIG 72(%rsp)
#define BUFFER 128(%rsp) #define BUFFER 128(%rsp)


#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHNTA prefetchnta #define PREFETCHNTA prefetchnta


+ 1
- 1
kernel/x86_64/ztrsm_kernel_LT_4x2_sse.S View File

@@ -95,7 +95,7 @@
#define PREFETCHSIZE (8 * 6 + 4) #define PREFETCHSIZE (8 * 6 + 4)
#endif #endif


#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHNTA prefetchnta #define PREFETCHNTA prefetchnta


+ 1
- 1
kernel/x86_64/ztrsm_kernel_RT_2x2_sse2.S View File

@@ -86,7 +86,7 @@
#define BORIG 72(%rsp) #define BORIG 72(%rsp)
#define BUFFER 128(%rsp) #define BUFFER 128(%rsp)


#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHNTA prefetchnta #define PREFETCHNTA prefetchnta


+ 1
- 1
kernel/x86_64/ztrsm_kernel_RT_4x2_sse.S View File

@@ -95,7 +95,7 @@
#define PREFETCHSIZE (8 * 6 + 4) #define PREFETCHSIZE (8 * 6 + 4)
#endif #endif


#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHNTA prefetchnta #define PREFETCHNTA prefetchnta


+ 7
- 0
l1param.h View File

@@ -74,6 +74,13 @@
#define ALIGNED_ACCESS #define ALIGNED_ACCESS
#endif #endif


#ifdef BULLDOZER
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (128 * 5)
#define ALIGNED_ACCESS
#endif

#ifdef NANO #ifdef NANO
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0


+ 1
- 1
l2param.h View File

@@ -85,7 +85,7 @@
#define movsd movlps #define movsd movlps
#endif #endif


#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#define ALIGNED_ACCESS #define ALIGNED_ACCESS
#define MOVUPS_A movaps #define MOVUPS_A movaps
#define MOVUPS_XL movaps #define MOVUPS_XL movaps


Loading…
Cancel
Save