@@ -247,11 +247,11 @@ endif | |||||
ifdef DYNAMIC_ARCH | ifdef DYNAMIC_ARCH | ||||
ifeq ($(ARCH), x86) | ifeq ($(ARCH), x86) | ||||
DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ | DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ | ||||
CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE ATHLON OPTERON OPTERON_SSE3 BARCELONA ATOM NANO | |||||
CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCATE ATOM NANO | |||||
endif | endif | ||||
ifeq ($(ARCH), x86_64) | ifeq ($(ARCH), x86_64) | ||||
DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE OPTERON OPTERON_SSE3 BARCELONA ATOM NANO | |||||
DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE OPTERON OPTERON_SSE3 BARCELONA BOBCATE ATOM NANO | |||||
endif | endif | ||||
ifndef DYNAMIC_CORE | ifndef DYNAMIC_CORE | ||||
@@ -28,6 +28,7 @@ OPTERON_SSE3 | |||||
BARCELONA | BARCELONA | ||||
SHANGHAI | SHANGHAI | ||||
ISTANBUL | ISTANBUL | ||||
BOBCATE | |||||
c)VIA CPU: | c)VIA CPU: | ||||
SSE_GENERIC | SSE_GENERIC | ||||
@@ -104,6 +104,7 @@ | |||||
#define CORE_ATOM 18 | #define CORE_ATOM 18 | ||||
#define CORE_NANO 19 | #define CORE_NANO 19 | ||||
#define CORE_SANDYBRIDGE 20 | #define CORE_SANDYBRIDGE 20 | ||||
#define CORE_BOBCATE 21 | |||||
#define HAVE_SSE (1 << 0) | #define HAVE_SSE (1 << 0) | ||||
#define HAVE_SSE2 (1 << 1) | #define HAVE_SSE2 (1 << 1) | ||||
@@ -191,4 +192,5 @@ typedef struct { | |||||
#define CPUTYPE_VIAC3 42 | #define CPUTYPE_VIAC3 42 | ||||
#define CPUTYPE_NANO 43 | #define CPUTYPE_NANO 43 | ||||
#define CPUTYPE_SANDYBRIDGE 44 | #define CPUTYPE_SANDYBRIDGE 44 | ||||
#define CPUTYPE_BOBCATE 45 | |||||
#endif | #endif |
@@ -1028,6 +1028,8 @@ int get_cpuname(void){ | |||||
case 1: | case 1: | ||||
case 10: | case 10: | ||||
return CPUTYPE_BARCELONA; | return CPUTYPE_BARCELONA; | ||||
case 5: | |||||
return CPUTYPE_BOBCATE; | |||||
} | } | ||||
break; | break; | ||||
} | } | ||||
@@ -1148,6 +1150,7 @@ static char *cpuname[] = { | |||||
"VIAC3", | "VIAC3", | ||||
"NANO", | "NANO", | ||||
"SANDYBRIDGE", | "SANDYBRIDGE", | ||||
"BOBCATE", | |||||
}; | }; | ||||
static char *lowercpuname[] = { | static char *lowercpuname[] = { | ||||
@@ -1195,6 +1198,7 @@ static char *lowercpuname[] = { | |||||
"nsgeode", | "nsgeode", | ||||
"nano", | "nano", | ||||
"sandybridge", | "sandybridge", | ||||
"bobcate", | |||||
}; | }; | ||||
static char *corename[] = { | static char *corename[] = { | ||||
@@ -1219,6 +1223,7 @@ static char *corename[] = { | |||||
"ATOM", | "ATOM", | ||||
"NANO", | "NANO", | ||||
"SANDYBRIDGE", | "SANDYBRIDGE", | ||||
"BOBCATE", | |||||
}; | }; | ||||
static char *corename_lower[] = { | static char *corename_lower[] = { | ||||
@@ -1243,6 +1248,7 @@ static char *corename_lower[] = { | |||||
"atom", | "atom", | ||||
"nano", | "nano", | ||||
"sandybridge", | "sandybridge", | ||||
"bobcate", | |||||
}; | }; | ||||
@@ -1351,7 +1357,9 @@ int get_coretype(void){ | |||||
if (family <= 0x5) return CORE_80486; | if (family <= 0x5) return CORE_80486; | ||||
if (family <= 0xe) return CORE_ATHLON; | if (family <= 0xe) return CORE_ATHLON; | ||||
if (family == 0xf){ | if (family == 0xf){ | ||||
if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON; else return CORE_BARCELONA; | |||||
if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON; | |||||
else if (exfamily == 5) return CORE_BOBCATE; | |||||
else return CORE_BARCELONA; | |||||
} | } | ||||
} | } | ||||
@@ -163,7 +163,7 @@ int get_L2_size(void){ | |||||
int eax, ebx, ecx, edx; | int eax, ebx, ecx, edx; | ||||
#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || \ | |||||
#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) || \ | |||||
defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ | defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ | ||||
defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) | defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) | ||||
@@ -446,7 +446,7 @@ void blas_set_parameter(void){ | |||||
#endif | #endif | ||||
#endif | #endif | ||||
#if defined(CORE_BARCELONA) | |||||
#if defined(CORE_BARCELONA) || defined(CORE_BOBCATE) | |||||
size >>= 8; | size >>= 8; | ||||
sgemm_p = 232 * size; | sgemm_p = 232 * size; | ||||
@@ -1,5 +1,5 @@ | |||||
/***************************************************************************** | /***************************************************************************** | ||||
Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS | |||||
Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS | |||||
All rights reserved. | All rights reserved. | ||||
Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
@@ -102,6 +102,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
/* #define FORCE_BARCELONA */ | /* #define FORCE_BARCELONA */ | ||||
/* #define FORCE_SHANGHAI */ | /* #define FORCE_SHANGHAI */ | ||||
/* #define FORCE_ISTANBUL */ | /* #define FORCE_ISTANBUL */ | ||||
/* #define FORCE_BOBCATE */ | |||||
/* #define FORCE_SSE_GENERIC */ | /* #define FORCE_SSE_GENERIC */ | ||||
/* #define FORCE_VIAC3 */ | /* #define FORCE_VIAC3 */ | ||||
/* #define FORCE_NANO */ | /* #define FORCE_NANO */ | ||||
@@ -363,6 +364,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#define CORENAME "BARCELONA" | #define CORENAME "BARCELONA" | ||||
#endif | #endif | ||||
/*
 * Forced-target configuration used when FORCE_BOBCATE is defined.
 *
 * Defines FORCE and FORCE_INTEL (the FORCE_SSE_GENERIC block that follows
 * defines the same pair), sets ARCHITECTURE to "X86" and SUBARCHITECTURE /
 * CORENAME to "BOBCATE", and LIBNAME to "bobcate".
 *
 * ARCHCONFIG is a single string of -D options built from adjacent string
 * literals: -DBOBCATE, L1_DATA_SIZE=32768, L1_DATA_LINESIZE=64,
 * L2_SIZE=524288, L2_LINESIZE=64, DTB_DEFAULT_ENTRIES=40, DTB_SIZE=4096 and
 * the HAVE_* feature flags (MMX, SSE, SSE2, SSE3, SSSE3, SSE4A, MISALIGNSSE,
 * CFLUSH, CMOV).
 *
 * NOTE(review): the cache/TLB numbers are hard-coded here; confirm them
 * against the vendor documentation for this core before changing.
 */
#if defined(FORCE_BOBCATE) | |||||
#define FORCE | |||||
#define FORCE_INTEL | |||||
#define ARCHITECTURE "X86" | |||||
#define SUBARCHITECTURE "BOBCATE" | |||||
#define ARCHCONFIG "-DBOBCATE " \ | |||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | |||||
"-DL2_SIZE=524288 -DL2_LINESIZE=64 " \ | |||||
"-DDTB_DEFAULT_ENTRIES=40 -DDTB_SIZE=4096 " \ | |||||
"-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 " \ | |||||
"-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_CFLUSH -DHAVE_CMOV" | |||||
#define LIBNAME "bobcate" | |||||
#define CORENAME "BOBCATE" | |||||
#endif | |||||
#ifdef FORCE_SSE_GENERIC | #ifdef FORCE_SSE_GENERIC | ||||
#define FORCE | #define FORCE | ||||
#define FORCE_INTEL | #define FORCE_INTEL | ||||
@@ -794,6 +794,22 @@ static void init_parameter(void) { | |||||
#endif | #endif | ||||
#endif | #endif | ||||
/*
 * BOBCATE section: when BOBCATE is defined, copy the compile-time
 * SGEMM/DGEMM/CGEMM/ZGEMM_DEFAULT_P values into the matching *gemm_p fields
 * of TABLE_NAME. The qgemm_p / xgemm_p fields are filled only when
 * EXPRECISION is defined. With DEBUG defined, the selected target name is
 * printed to stderr first.
 */
#ifdef BOBCATE | |||||
#ifdef DEBUG | |||||
fprintf(stderr, "Bobcate\n"); | |||||
#endif | |||||
TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; | |||||
TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; | |||||
TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; | |||||
TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; | |||||
#ifdef EXPRECISION | |||||
TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; | |||||
TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; | |||||
#endif | |||||
#endif | |||||
#ifdef NANO | #ifdef NANO | ||||
#ifdef DEBUG | #ifdef DEBUG | ||||
@@ -0,0 +1,59 @@ | |||||
# Kernel source selection table (a new file in this change; the file name is
# not visible here). Each variable names the source file used for one routine.
# The GEMM and GEMM3M kernels all point at *_barcelona.S sources.

# Single-precision real GEMM: 4x4 kernel. INCOPY/ITCOPY and their OBJ names
# are left empty; only the ONCOPY/OTCOPY (generic C, unroll 4) pair is set.
SGEMMKERNEL = gemm_kernel_4x4_barcelona.S | |||||
SGEMMINCOPY = | |||||
SGEMMITCOPY = | |||||
SGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||||
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||||
SGEMMINCOPYOBJ = | |||||
SGEMMITCOPYOBJ = | |||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
# Double-precision real GEMM: 2x4 kernel, so both the 2-wide (IN/IT) and
# 4-wide (ON/OT) generic copy routines are listed.
DGEMMKERNEL = gemm_kernel_2x4_barcelona.S | |||||
DGEMMINCOPY = ../generic/gemm_ncopy_2.c | |||||
DGEMMITCOPY = ../generic/gemm_tcopy_2.c | |||||
DGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||||
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
# Single-precision complex GEMM: 2x2 kernel; IN/IT entries left empty.
CGEMMKERNEL = zgemm_kernel_2x2_barcelona.S | |||||
CGEMMINCOPY = | |||||
CGEMMITCOPY = | |||||
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||||
CGEMMINCOPYOBJ = | |||||
CGEMMITCOPYOBJ = | |||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
# Double-precision complex GEMM: 1x2 kernel with 1-wide and 2-wide copies.
ZGEMMKERNEL = zgemm_kernel_1x2_barcelona.S | |||||
ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c | |||||
ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c | |||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
# TRSM kernels use the SSE / SSE2 sources rather than *_barcelona.S files.
# In every precision the _RN entry names the LT source file.
# NOTE(review): presumably the LT source is shared between variants on
# purpose; confirm against the other targets' tables.
STRSMKERNEL_LN = trsm_kernel_LN_4x4_sse.S | |||||
STRSMKERNEL_LT = trsm_kernel_LT_4x4_sse.S | |||||
STRSMKERNEL_RN = trsm_kernel_LT_4x4_sse.S | |||||
STRSMKERNEL_RT = trsm_kernel_RT_4x4_sse.S | |||||
DTRSMKERNEL_LN = trsm_kernel_LN_2x4_sse2.S | |||||
DTRSMKERNEL_LT = trsm_kernel_LT_2x4_sse2.S | |||||
DTRSMKERNEL_RN = trsm_kernel_LT_2x4_sse2.S | |||||
DTRSMKERNEL_RT = trsm_kernel_RT_2x4_sse2.S | |||||
CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse.S | |||||
CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse.S | |||||
CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse.S | |||||
CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse.S | |||||
# ZTRSM: LN, LT and RN all name the LT 1x2 source; only RT has its own file.
# NOTE(review): unlike S/D/C above, _LN also uses the LT file here; verify
# this is intended.
ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_sse2.S | |||||
ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_sse2.S | |||||
ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_sse2.S | |||||
ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_sse2.S | |||||
# 3M complex GEMM kernels.
CGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S | |||||
ZGEMM3MKERNEL = zgemm3m_kernel_2x4_barcelona.S | |||||
@@ -69,7 +69,7 @@ | |||||
#define STACK_ALIGN 4096 | #define STACK_ALIGN 4096 | ||||
#define STACK_OFFSET 1024 | #define STACK_OFFSET 1024 | ||||
#if defined(OPTERON) || defined(BARCELONA) | |||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) | |||||
#define PREFETCH prefetch | #define PREFETCH prefetch | ||||
#define PREFETCHSIZE (8 * 10 + 4) | #define PREFETCHSIZE (8 * 10 + 4) | ||||
#endif | #endif | ||||
@@ -439,7 +439,7 @@ | |||||
.L22: | .L22: | ||||
mulsd %xmm0, %xmm2 | mulsd %xmm0, %xmm2 | ||||
addsd %xmm2, %xmm4 | addsd %xmm2, %xmm4 | ||||
#if defined(OPTERON) || defined(BARCELONA) | |||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) | |||||
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) | PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) | ||||
#endif | #endif | ||||
movlpd 2 * SIZE(BB), %xmm2 | movlpd 2 * SIZE(BB), %xmm2 | ||||
@@ -488,7 +488,7 @@ | |||||
movlpd 40 * SIZE(BB), %xmm3 | movlpd 40 * SIZE(BB), %xmm3 | ||||
addsd %xmm0, %xmm7 | addsd %xmm0, %xmm7 | ||||
movlpd 8 * SIZE(AA), %xmm0 | movlpd 8 * SIZE(AA), %xmm0 | ||||
#if defined(OPTERON) || defined(BARCELONA) | |||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) | |||||
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) | PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) | ||||
#endif | #endif | ||||
mulsd %xmm1, %xmm2 | mulsd %xmm1, %xmm2 | ||||
@@ -1697,7 +1697,7 @@ | |||||
.L42: | .L42: | ||||
mulpd %xmm0, %xmm2 | mulpd %xmm0, %xmm2 | ||||
#if defined(OPTERON) || defined(BARCELONA) | |||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) | |||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
#endif | #endif | ||||
mulpd 2 * SIZE(BB), %xmm0 | mulpd 2 * SIZE(BB), %xmm0 | ||||
@@ -1727,7 +1727,7 @@ | |||||
addpd %xmm0, %xmm7 | addpd %xmm0, %xmm7 | ||||
movapd 16 * SIZE(AA), %xmm0 | movapd 16 * SIZE(AA), %xmm0 | ||||
#if defined(OPTERON) || defined(BARCELONA) | |||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) | |||||
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) | ||||
#endif | #endif | ||||
mulpd %xmm1, %xmm2 | mulpd %xmm1, %xmm2 | ||||
@@ -64,7 +64,7 @@ | |||||
#define BORIG 60(%esp) | #define BORIG 60(%esp) | ||||
#define BUFFER 128(%esp) | #define BUFFER 128(%esp) | ||||
#if defined(OPTERON) || defined(BARCELONA) | |||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) | |||||
#define PREFETCH prefetch | #define PREFETCH prefetch | ||||
#define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
#define PREFETCHSIZE (16 * 10 + 8) | #define PREFETCHSIZE (16 * 10 + 8) | ||||
@@ -437,7 +437,7 @@ | |||||
.L32: | .L32: | ||||
mulss %xmm0, %xmm2 | mulss %xmm0, %xmm2 | ||||
addss %xmm2, %xmm4 | addss %xmm2, %xmm4 | ||||
#if defined(OPTERON) || defined(BARCELONA) | |||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) | |||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
#endif | #endif | ||||
movss 4 * SIZE(BB), %xmm2 | movss 4 * SIZE(BB), %xmm2 | ||||
@@ -833,7 +833,7 @@ | |||||
.L22: | .L22: | ||||
mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
addps %xmm2, %xmm4 | addps %xmm2, %xmm4 | ||||
#if defined(OPTERON) || defined(BARCELONA) | |||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) | |||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
#endif | #endif | ||||
movaps 4 * SIZE(BB), %xmm2 | movaps 4 * SIZE(BB), %xmm2 | ||||
@@ -1848,7 +1848,7 @@ | |||||
.L72: | .L72: | ||||
mulss %xmm0, %xmm2 | mulss %xmm0, %xmm2 | ||||
#if defined(OPTERON) || defined(BARCELONA) | |||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) | |||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
#endif | #endif | ||||
mulss 4 * SIZE(BB), %xmm0 | mulss 4 * SIZE(BB), %xmm0 | ||||
@@ -2109,7 +2109,7 @@ | |||||
ALIGN_4 | ALIGN_4 | ||||
.L62: | .L62: | ||||
#if defined(OPTERON) || defined(BARCELONA) | |||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) | |||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
#endif | #endif | ||||
@@ -2429,7 +2429,7 @@ | |||||
.L52: | .L52: | ||||
mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
#if defined(OPTERON) || defined(BARCELONA) | |||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) | |||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
#endif | #endif | ||||
mulps 4 * SIZE(BB), %xmm0 | mulps 4 * SIZE(BB), %xmm0 | ||||
@@ -2459,7 +2459,7 @@ | |||||
addps %xmm0, %xmm5 | addps %xmm0, %xmm5 | ||||
movaps 32 * SIZE(AA), %xmm0 | movaps 32 * SIZE(AA), %xmm0 | ||||
#if defined(OPTERON) || defined(BARCELONA) | |||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) | |||||
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | ||||
#endif | #endif | ||||
mulps %xmm1, %xmm2 | mulps %xmm1, %xmm2 | ||||
@@ -2952,7 +2952,7 @@ | |||||
.L112: | .L112: | ||||
mulss %xmm0, %xmm2 | mulss %xmm0, %xmm2 | ||||
#if defined(OPTERON) || defined(BARCELONA) | |||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) | |||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
#endif | #endif | ||||
movss 1 * SIZE(AA), %xmm0 | movss 1 * SIZE(AA), %xmm0 | ||||
@@ -3148,7 +3148,7 @@ | |||||
.L102: | .L102: | ||||
mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
#if defined(OPTERON) || defined(BARCELONA) | |||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) | |||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
#endif | #endif | ||||
movsd 2 * SIZE(AA), %xmm0 | movsd 2 * SIZE(AA), %xmm0 | ||||
@@ -3389,7 +3389,7 @@ | |||||
.L92: | .L92: | ||||
mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
#if defined(OPTERON) || defined(BARCELONA) | |||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) | |||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
#endif | #endif | ||||
movaps 4 * SIZE(AA), %xmm0 | movaps 4 * SIZE(AA), %xmm0 | ||||
@@ -3404,7 +3404,7 @@ | |||||
mulps 12 * SIZE(BB), %xmm0 | mulps 12 * SIZE(BB), %xmm0 | ||||
addps %xmm0, %xmm7 | addps %xmm0, %xmm7 | ||||
movaps 32 * SIZE(AA), %xmm0 | movaps 32 * SIZE(AA), %xmm0 | ||||
#if defined(OPTERON) || defined(BARCELONA) | |||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) | |||||
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | ||||
#endif | #endif | ||||
mulps %xmm1, %xmm3 | mulps %xmm1, %xmm3 | ||||
@@ -69,7 +69,7 @@ | |||||
#define STACK_ALIGN 4096 | #define STACK_ALIGN 4096 | ||||
#define STACK_OFFSET 1024 | #define STACK_OFFSET 1024 | ||||
#if defined(OPTERON) || defined(BARCELONA) | |||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) | |||||
#define PREFETCH prefetch | #define PREFETCH prefetch | ||||
#define PREFETCHSIZE (8 * 10 + 4) | #define PREFETCHSIZE (8 * 10 + 4) | ||||
#endif | #endif | ||||
@@ -910,7 +910,7 @@ | |||||
.L22: | .L22: | ||||
mulsd %xmm0, %xmm2 | mulsd %xmm0, %xmm2 | ||||
addsd %xmm2, %xmm4 | addsd %xmm2, %xmm4 | ||||
#if defined(OPTERON) || defined(BARCELONA) | |||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) | |||||
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) | PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) | ||||
#endif | #endif | ||||
movlpd 2 * SIZE(BB), %xmm2 | movlpd 2 * SIZE(BB), %xmm2 | ||||
@@ -959,7 +959,7 @@ | |||||
movlpd 40 * SIZE(BB), %xmm3 | movlpd 40 * SIZE(BB), %xmm3 | ||||
addsd %xmm0, %xmm7 | addsd %xmm0, %xmm7 | ||||
movlpd 8 * SIZE(AA), %xmm0 | movlpd 8 * SIZE(AA), %xmm0 | ||||
#if defined(OPTERON) || defined(BARCELONA) | |||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) | |||||
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) | PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) | ||||
#endif | #endif | ||||
mulsd %xmm1, %xmm2 | mulsd %xmm1, %xmm2 | ||||
@@ -1439,7 +1439,7 @@ | |||||
.L42: | .L42: | ||||
mulpd %xmm0, %xmm2 | mulpd %xmm0, %xmm2 | ||||
#if defined(OPTERON) || defined(BARCELONA) | |||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) | |||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
#endif | #endif | ||||
mulpd 2 * SIZE(BB), %xmm0 | mulpd 2 * SIZE(BB), %xmm0 | ||||
@@ -1469,7 +1469,7 @@ | |||||
addpd %xmm0, %xmm7 | addpd %xmm0, %xmm7 | ||||
movapd 16 * SIZE(AA), %xmm0 | movapd 16 * SIZE(AA), %xmm0 | ||||
#if defined(OPTERON) || defined(BARCELONA) | |||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) | |||||
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) | ||||
#endif | #endif | ||||
mulpd %xmm1, %xmm2 | mulpd %xmm1, %xmm2 | ||||
@@ -64,7 +64,7 @@ | |||||
#define BORIG 60(%esp) | #define BORIG 60(%esp) | ||||
#define BUFFER 128(%esp) | #define BUFFER 128(%esp) | ||||
#if defined(OPTERON) || defined(BARCELONA) | |||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) | |||||
#define PREFETCH prefetch | #define PREFETCH prefetch | ||||
#define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
#define PREFETCHSIZE (16 * 10 + 8) | #define PREFETCHSIZE (16 * 10 + 8) | ||||
@@ -872,7 +872,7 @@ | |||||
.L22: | .L22: | ||||
mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
addps %xmm2, %xmm4 | addps %xmm2, %xmm4 | ||||
#if defined(OPTERON) || defined(BARCELONA) | |||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) | |||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
#endif | #endif | ||||
movaps 4 * SIZE(BB), %xmm2 | movaps 4 * SIZE(BB), %xmm2 | ||||
@@ -1316,7 +1316,7 @@ | |||||
.L32: | .L32: | ||||
mulss %xmm0, %xmm2 | mulss %xmm0, %xmm2 | ||||
addss %xmm2, %xmm4 | addss %xmm2, %xmm4 | ||||
#if defined(OPTERON) || defined(BARCELONA) | |||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) | |||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
#endif | #endif | ||||
movss 4 * SIZE(BB), %xmm2 | movss 4 * SIZE(BB), %xmm2 | ||||
@@ -1855,7 +1855,7 @@ | |||||
.L52: | .L52: | ||||
mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
#if defined(OPTERON) || defined(BARCELONA) | |||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) | |||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
#endif | #endif | ||||
mulps 4 * SIZE(BB), %xmm0 | mulps 4 * SIZE(BB), %xmm0 | ||||
@@ -1885,7 +1885,7 @@ | |||||
addps %xmm0, %xmm5 | addps %xmm0, %xmm5 | ||||
movaps 32 * SIZE(AA), %xmm0 | movaps 32 * SIZE(AA), %xmm0 | ||||
#if defined(OPTERON) || defined(BARCELONA) | |||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) | |||||
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | ||||
#endif | #endif | ||||
mulps %xmm1, %xmm2 | mulps %xmm1, %xmm2 | ||||
@@ -2249,7 +2249,7 @@ | |||||
ALIGN_4 | ALIGN_4 | ||||
.L62: | .L62: | ||||
#if defined(OPTERON) || defined(BARCELONA) | |||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) | |||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
#endif | #endif | ||||
@@ -2562,7 +2562,7 @@ | |||||
.L72: | .L72: | ||||
mulss %xmm0, %xmm2 | mulss %xmm0, %xmm2 | ||||
#if defined(OPTERON) || defined(BARCELONA) | |||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) | |||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
#endif | #endif | ||||
mulss 4 * SIZE(BB), %xmm0 | mulss 4 * SIZE(BB), %xmm0 | ||||
@@ -2957,7 +2957,7 @@ | |||||
.L92: | .L92: | ||||
mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
#if defined(OPTERON) || defined(BARCELONA) | |||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) | |||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
#endif | #endif | ||||
movaps 4 * SIZE(AA), %xmm0 | movaps 4 * SIZE(AA), %xmm0 | ||||
@@ -2972,7 +2972,7 @@ | |||||
mulps 12 * SIZE(BB), %xmm0 | mulps 12 * SIZE(BB), %xmm0 | ||||
addps %xmm0, %xmm7 | addps %xmm0, %xmm7 | ||||
movaps 32 * SIZE(AA), %xmm0 | movaps 32 * SIZE(AA), %xmm0 | ||||
#if defined(OPTERON) || defined(BARCELONA) | |||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) | |||||
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | ||||
#endif | #endif | ||||
mulps %xmm1, %xmm3 | mulps %xmm1, %xmm3 | ||||
@@ -3280,7 +3280,7 @@ | |||||
.L102: | .L102: | ||||
mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
#if defined(OPTERON) || defined(BARCELONA) | |||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) | |||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
#endif | #endif | ||||
movsd 2 * SIZE(AA), %xmm0 | movsd 2 * SIZE(AA), %xmm0 | ||||
@@ -3515,7 +3515,7 @@ | |||||
.L112: | .L112: | ||||
mulss %xmm0, %xmm2 | mulss %xmm0, %xmm2 | ||||
#if defined(OPTERON) || defined(BARCELONA) | |||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) | |||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
#endif | #endif | ||||
movss 1 * SIZE(AA), %xmm0 | movss 1 * SIZE(AA), %xmm0 | ||||
@@ -69,7 +69,7 @@ | |||||
#define STACK_ALIGN 4096 | #define STACK_ALIGN 4096 | ||||
#define STACK_OFFSET 1024 | #define STACK_OFFSET 1024 | ||||
#if defined(OPTERON) || defined(BARCELONA) | |||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) | |||||
#define PREFETCH prefetch | #define PREFETCH prefetch | ||||
#define PREFETCHSIZE (8 * 10 + 4) | #define PREFETCHSIZE (8 * 10 + 4) | ||||
#endif | #endif | ||||
@@ -1036,7 +1036,7 @@ | |||||
.L42: | .L42: | ||||
mulpd %xmm0, %xmm2 | mulpd %xmm0, %xmm2 | ||||
#if defined(OPTERON) || defined(BARCELONA) | |||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) | |||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
#endif | #endif | ||||
mulpd 2 * SIZE(BB), %xmm0 | mulpd 2 * SIZE(BB), %xmm0 | ||||
@@ -1066,7 +1066,7 @@ | |||||
addpd %xmm0, %xmm7 | addpd %xmm0, %xmm7 | ||||
movapd 16 * SIZE(AA), %xmm0 | movapd 16 * SIZE(AA), %xmm0 | ||||
#if defined(OPTERON) || defined(BARCELONA) | |||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) | |||||
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) | ||||
#endif | #endif | ||||
mulpd %xmm1, %xmm2 | mulpd %xmm1, %xmm2 | ||||
@@ -2224,7 +2224,7 @@ | |||||
.L22: | .L22: | ||||
mulsd %xmm0, %xmm2 | mulsd %xmm0, %xmm2 | ||||
addsd %xmm2, %xmm4 | addsd %xmm2, %xmm4 | ||||
#if defined(OPTERON) || defined(BARCELONA) | |||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) | |||||
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) | PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) | ||||
#endif | #endif | ||||
movlpd 2 * SIZE(BB), %xmm2 | movlpd 2 * SIZE(BB), %xmm2 | ||||
@@ -2273,7 +2273,7 @@ | |||||
movlpd 40 * SIZE(BB), %xmm3 | movlpd 40 * SIZE(BB), %xmm3 | ||||
addsd %xmm0, %xmm7 | addsd %xmm0, %xmm7 | ||||
movlpd 8 * SIZE(AA), %xmm0 | movlpd 8 * SIZE(AA), %xmm0 | ||||
#if defined(OPTERON) || defined(BARCELONA) | |||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) | |||||
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) | PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) | ||||
#endif | #endif | ||||
mulsd %xmm1, %xmm2 | mulsd %xmm1, %xmm2 | ||||
@@ -64,7 +64,7 @@ | |||||
#define BORIG 60(%esp) | #define BORIG 60(%esp) | ||||
#define BUFFER 128(%esp) | #define BUFFER 128(%esp) | ||||
#if defined(OPTERON) || defined(BARCELONA) | |||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) | |||||
#define PREFETCH prefetch | #define PREFETCH prefetch | ||||
#define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
#define PREFETCHSIZE (16 * 10 + 8) | #define PREFETCHSIZE (16 * 10 + 8) | ||||
@@ -439,7 +439,7 @@ | |||||
.L92: | .L92: | ||||
mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
#if defined(OPTERON) || defined(BARCELONA) | |||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) | |||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
#endif | #endif | ||||
movaps 4 * SIZE(AA), %xmm0 | movaps 4 * SIZE(AA), %xmm0 | ||||
@@ -454,7 +454,7 @@ | |||||
mulps 12 * SIZE(BB), %xmm0 | mulps 12 * SIZE(BB), %xmm0 | ||||
addps %xmm0, %xmm7 | addps %xmm0, %xmm7 | ||||
movaps 32 * SIZE(AA), %xmm0 | movaps 32 * SIZE(AA), %xmm0 | ||||
#if defined(OPTERON) || defined(BARCELONA) | |||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) | |||||
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | ||||
#endif | #endif | ||||
mulps %xmm1, %xmm3 | mulps %xmm1, %xmm3 | ||||
@@ -758,7 +758,7 @@ | |||||
.L102: | .L102: | ||||
mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
#if defined(OPTERON) || defined(BARCELONA) | |||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) | |||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
#endif | #endif | ||||
movsd 2 * SIZE(AA), %xmm0 | movsd 2 * SIZE(AA), %xmm0 | ||||
@@ -993,7 +993,7 @@ | |||||
.L112: | .L112: | ||||
mulss %xmm0, %xmm2 | mulss %xmm0, %xmm2 | ||||
#if defined(OPTERON) || defined(BARCELONA) | |||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) | |||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
#endif | #endif | ||||
movss 1 * SIZE(AA), %xmm0 | movss 1 * SIZE(AA), %xmm0 | ||||
@@ -1324,7 +1324,7 @@ | |||||
.L52: | .L52: | ||||
mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
#if defined(OPTERON) || defined(BARCELONA) | |||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) | |||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
#endif | #endif | ||||
mulps 4 * SIZE(BB), %xmm0 | mulps 4 * SIZE(BB), %xmm0 | ||||
@@ -1354,7 +1354,7 @@ | |||||
addps %xmm0, %xmm5 | addps %xmm0, %xmm5 | ||||
movaps 32 * SIZE(AA), %xmm0 | movaps 32 * SIZE(AA), %xmm0 | ||||
#if defined(OPTERON) || defined(BARCELONA) | |||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) | |||||
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | ||||
#endif | #endif | ||||
mulps %xmm1, %xmm2 | mulps %xmm1, %xmm2 | ||||
@@ -1718,7 +1718,7 @@ | |||||
ALIGN_4 | ALIGN_4 | ||||
.L62: | .L62: | ||||
#if defined(OPTERON) || defined(BARCELONA) | |||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) | |||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
#endif | #endif | ||||
@@ -2031,7 +2031,7 @@ | |||||
.L72: | .L72: | ||||
mulss %xmm0, %xmm2 | mulss %xmm0, %xmm2 | ||||
#if defined(OPTERON) || defined(BARCELONA) | |||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) | |||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
#endif | #endif | ||||
mulss 4 * SIZE(BB), %xmm0 | mulss 4 * SIZE(BB), %xmm0 | ||||
@@ -2859,7 +2859,7 @@ | |||||
.L22: | .L22: | ||||
mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
addps %xmm2, %xmm4 | addps %xmm2, %xmm4 | ||||
#if defined(OPTERON) || defined(BARCELONA) | |||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) | |||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
#endif | #endif | ||||
movaps 4 * SIZE(BB), %xmm2 | movaps 4 * SIZE(BB), %xmm2 | ||||
@@ -3303,7 +3303,7 @@ | |||||
.L32: | .L32: | ||||
mulss %xmm0, %xmm2 | mulss %xmm0, %xmm2 | ||||
addss %xmm2, %xmm4 | addss %xmm2, %xmm4 | ||||
#if defined(OPTERON) || defined(BARCELONA) | |||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) | |||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
#endif | #endif | ||||
movss 4 * SIZE(BB), %xmm2 | movss 4 * SIZE(BB), %xmm2 | ||||
@@ -75,7 +75,7 @@ | |||||
#define STACK_ALIGN 4096 | #define STACK_ALIGN 4096 | ||||
#define STACK_OFFSET 1024 | #define STACK_OFFSET 1024 | ||||
#if defined(OPTERON) || defined(BARCELONA) | |||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) | |||||
#define PREFETCHSIZE (16 * 10 + 8) | #define PREFETCHSIZE (16 * 10 + 8) | ||||
#define WPREFETCHSIZE 112 | #define WPREFETCHSIZE 112 | ||||
#define PREFETCH prefetch | #define PREFETCH prefetch | ||||
@@ -533,7 +533,7 @@ | |||||
addps %xmm0, %xmm7 | addps %xmm0, %xmm7 | ||||
movsd 16 * SIZE(AA), %xmm0 | movsd 16 * SIZE(AA), %xmm0 | ||||
mulps %xmm1, %xmm2 | mulps %xmm1, %xmm2 | ||||
#if defined(OPTERON) || defined(BARCELONA) | |||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) | |||||
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) | prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) | ||||
#endif | #endif | ||||
addps %xmm2, %xmm4 | addps %xmm2, %xmm4 | ||||
@@ -75,7 +75,7 @@ | |||||
#define STACK_ALIGN 4096 | #define STACK_ALIGN 4096 | ||||
#define STACK_OFFSET 1024 | #define STACK_OFFSET 1024 | ||||
#if defined(OPTERON) || defined(BARCELONA) | |||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) | |||||
#define PREFETCHSIZE (16 * 10 + 8) | #define PREFETCHSIZE (16 * 10 + 8) | ||||
#define WPREFETCHSIZE 112 | #define WPREFETCHSIZE 112 | ||||
#define PREFETCH prefetch | #define PREFETCH prefetch | ||||
@@ -994,7 +994,7 @@ | |||||
addps %xmm0, %xmm7 | addps %xmm0, %xmm7 | ||||
movsd 16 * SIZE(AA), %xmm0 | movsd 16 * SIZE(AA), %xmm0 | ||||
mulps %xmm1, %xmm2 | mulps %xmm1, %xmm2 | ||||
#if defined(OPTERON) || defined(BARCELONA) | |||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) | |||||
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) | prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) | ||||
#endif | #endif | ||||
addps %xmm2, %xmm4 | addps %xmm2, %xmm4 | ||||
@@ -75,7 +75,7 @@ | |||||
#define STACK_ALIGN 4096 | #define STACK_ALIGN 4096 | ||||
#define STACK_OFFSET 1024 | #define STACK_OFFSET 1024 | ||||
#if defined(OPTERON) || defined(BARCELONA) | |||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) | |||||
#define PREFETCHSIZE (16 * 10 + 8) | #define PREFETCHSIZE (16 * 10 + 8) | ||||
#define WPREFETCHSIZE 112 | #define WPREFETCHSIZE 112 | ||||
#define PREFETCH prefetch | #define PREFETCH prefetch | ||||
@@ -1820,7 +1820,7 @@ | |||||
addps %xmm0, %xmm7 | addps %xmm0, %xmm7 | ||||
movsd 16 * SIZE(AA), %xmm0 | movsd 16 * SIZE(AA), %xmm0 | ||||
mulps %xmm1, %xmm2 | mulps %xmm1, %xmm2 | ||||
#if defined(OPTERON) || defined(BARCELONA) | |||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) | |||||
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) | prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) | ||||
#endif | #endif | ||||
addps %xmm2, %xmm4 | addps %xmm2, %xmm4 | ||||
@@ -0,0 +1,62 @@ | |||||
# Kernel source selection table (presumably for the BOBCATE target that the
# surrounding changes introduce -- TODO confirm the file this belongs to).
# None of the sources named below is specific to this core: every entry
# points at a Barcelona, Opteron, generic C, or plain SSE/SSE2 file.

# ZGEMV: both the N and T variants use the "_dup" sources.
ZGEMVNKERNEL = zgemv_n_dup.S | |||||
ZGEMVTKERNEL = zgemv_t_dup.S | |||||
# SGEMM: 8x4 Barcelona kernel. The IN/IT copy routines are the generic C
# versions (suffix 8); the ON/OT copy routines are the Opteron assembly
# versions (suffix 4).
SGEMMKERNEL = gemm_kernel_8x4_barcelona.S | |||||
SGEMMINCOPY = ../generic/gemm_ncopy_8.c | |||||
SGEMMITCOPY = ../generic/gemm_tcopy_8.c | |||||
SGEMMONCOPY = gemm_ncopy_4_opteron.S | |||||
SGEMMOTCOPY = gemm_tcopy_4_opteron.S | |||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
# DGEMM: 4x4 Barcelona kernel. The IN/IT copy sources and objects are left
# empty; only the ON/OT copies are built.
# NOTE(review): presumably the ON/OT copies serve both operands because the
# kernel is square (4x4) -- confirm against the build rules.
DGEMMKERNEL = gemm_kernel_4x4_barcelona.S | |||||
DGEMMINCOPY = | |||||
DGEMMITCOPY = | |||||
DGEMMONCOPY = gemm_ncopy_4_opteron.S | |||||
DGEMMOTCOPY = gemm_tcopy_4_opteron.S | |||||
DGEMMINCOPYOBJ = | |||||
DGEMMITCOPYOBJ = | |||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
# CGEMM: 4x2 Barcelona kernel. IN/IT copies are the generic C versions
# (suffix 4); ON/OT copies are the assembly versions (suffix 2).
CGEMMKERNEL = zgemm_kernel_4x2_barcelona.S | |||||
CGEMMINCOPY = ../generic/zgemm_ncopy_4.c | |||||
CGEMMITCOPY = ../generic/zgemm_tcopy_4.c | |||||
CGEMMONCOPY = zgemm_ncopy_2.S | |||||
CGEMMOTCOPY = zgemm_tcopy_2.S | |||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
# ZGEMM: 2x2 Barcelona kernel. As with DGEMM, the IN/IT copy sources and
# objects are left empty and only the ON/OT copies are built.
ZGEMMKERNEL = zgemm_kernel_2x2_barcelona.S | |||||
ZGEMMINCOPY = | |||||
ZGEMMITCOPY = | |||||
ZGEMMONCOPY = zgemm_ncopy_2.S | |||||
ZGEMMOTCOPY = zgemm_tcopy_2.S | |||||
ZGEMMINCOPYOBJ = | |||||
ZGEMMITCOPYOBJ = | |||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
# TRSM kernels. In all four groups below the RN variable is assigned the
# same source file as LT; there is no separate *_RN_* source.
# NOTE(review): the pattern is identical across S/D/C/Z, so it looks
# deliberate rather than a copy-paste slip -- confirm.
STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S | |||||
STRSMKERNEL_LT = trsm_kernel_LT_8x4_sse.S | |||||
STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S | |||||
STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S | |||||
DTRSMKERNEL_LN = trsm_kernel_LN_4x4_barcelona.S | |||||
DTRSMKERNEL_LT = trsm_kernel_LT_4x4_barcelona.S | |||||
DTRSMKERNEL_RN = trsm_kernel_LT_4x4_barcelona.S | |||||
DTRSMKERNEL_RT = trsm_kernel_RT_4x4_barcelona.S | |||||
CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S | |||||
CTRSMKERNEL_LT = ztrsm_kernel_LT_4x2_sse.S | |||||
CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S | |||||
CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S | |||||
ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse2.S | |||||
ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse2.S | |||||
ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse2.S | |||||
ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse2.S | |||||
# GEMM3M kernels: Barcelona sources, 8x4 for C and 4x4 for Z.
CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S | |||||
ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S |
@@ -76,7 +76,7 @@ | |||||
#define movsd movlps | #define movsd movlps | ||||
#endif | #endif | ||||
#if defined(BARCELONA) || defined(SHANGHAI) | |||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCATE) | |||||
#define PREFETCH prefetch | #define PREFETCH prefetch | ||||
#define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
#define PREFETCHSIZE (16 * 16) | #define PREFETCHSIZE (16 * 16) | ||||
@@ -76,7 +76,7 @@ | |||||
#define movsd movlpd | #define movsd movlpd | ||||
#endif | #endif | ||||
#if defined(BARCELONA) || defined(SHANGHAI) | |||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCATE) | |||||
#define PREFETCH prefetch | #define PREFETCH prefetch | ||||
#define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
#define PREFETCHSIZE (16 * 16) | #define PREFETCHSIZE (16 * 16) | ||||
@@ -76,7 +76,7 @@ | |||||
#define movsd movlps | #define movsd movlps | ||||
#endif | #endif | ||||
#if defined(BARCELONA) || defined(SHANGHAI) | |||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCATE) | |||||
#define PREFETCH prefetch | #define PREFETCH prefetch | ||||
#define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
#define PREFETCHSIZE (16 * 16) | #define PREFETCHSIZE (16 * 16) | ||||
@@ -76,7 +76,7 @@ | |||||
#define movsd movlpd | #define movsd movlpd | ||||
#endif | #endif | ||||
#if defined(BARCELONA) || defined(SHANGHAI) | |||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCATE) | |||||
#define PREFETCH prefetch | #define PREFETCH prefetch | ||||
#define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
#define PREFETCHSIZE (16 * 16) | #define PREFETCHSIZE (16 * 16) | ||||
@@ -160,7 +160,7 @@ | |||||
#define a3 %xmm14 | #define a3 %xmm14 | ||||
#define xt1 %xmm15 | #define xt1 %xmm15 | ||||
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) | |||||
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCATE) | |||||
#define MOVDDUP(a, b, c) movddup a(b), c | #define MOVDDUP(a, b, c) movddup a(b), c | ||||
#define MOVDDUP2(a, b, c) movddup a##b, c | #define MOVDDUP2(a, b, c) movddup a##b, c | ||||
#else | #else | ||||
@@ -76,7 +76,7 @@ | |||||
#define movsd movlpd | #define movsd movlpd | ||||
#endif | #endif | ||||
#if defined(BARCELONA) || defined(SHANGHAI) | |||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCATE) | |||||
#define PREFETCH prefetch | #define PREFETCH prefetch | ||||
#define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
#define PREFETCHSIZE (16 * 16) | #define PREFETCHSIZE (16 * 16) | ||||
@@ -76,7 +76,7 @@ | |||||
#define movsd movlpd | #define movsd movlpd | ||||
#endif | #endif | ||||
#if defined(BARCELONA) || defined(SHANGHAI) | |||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCATE) | |||||
#define PREFETCH prefetch | #define PREFETCH prefetch | ||||
#define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
#define PREFETCHSIZE (16 * 16) | #define PREFETCHSIZE (16 * 16) | ||||
@@ -76,7 +76,7 @@ | |||||
#define movsd movlpd | #define movsd movlpd | ||||
#endif | #endif | ||||
#if defined(BARCELONA) || defined(SHANGHAI) | |||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCATE) | |||||
#define PREFETCH prefetch | #define PREFETCH prefetch | ||||
#define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
#define PREFETCHSIZE (16 * 16) | #define PREFETCHSIZE (16 * 16) | ||||
@@ -67,6 +67,13 @@ | |||||
#define ALIGNED_ACCESS | #define ALIGNED_ACCESS | ||||
#endif | #endif | ||||
/*
 * Prefetch configuration selected when BOBCATE is defined:
 *   PREFETCH      expands to the `prefetch` mnemonic,
 *   PREFETCHW     expands to the `prefetchw` mnemonic,
 *   PREFETCHSIZE  is 128 * 5 (how this value is scaled into an address
 *                 offset depends on the code that uses it, not shown here),
 *   ALIGNED_ACCESS is defined (as an empty flag macro).
 */
#ifdef BOBCATE | |||||
#define PREFETCH prefetch | |||||
#define PREFETCHW prefetchw | |||||
#define PREFETCHSIZE (128 * 5) | |||||
#define ALIGNED_ACCESS | |||||
#endif | |||||
#ifdef NANO | #ifdef NANO | ||||
#define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
#define PREFETCHW prefetcht0 | #define PREFETCHW prefetcht0 | ||||
@@ -85,7 +85,7 @@ | |||||
#define movsd movlps | #define movsd movlps | ||||
#endif | #endif | ||||
#if defined(BARCELONA) || defined(SHANGHAI) | |||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCATE) | |||||
#define ALIGNED_ACCESS | #define ALIGNED_ACCESS | ||||
#define MOVUPS_A movaps | #define MOVUPS_A movaps | ||||
#define MOVUPS_XL movaps | #define MOVUPS_XL movaps | ||||
@@ -1,5 +1,5 @@ | |||||
/***************************************************************************** | /***************************************************************************** | ||||
Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS | |||||
Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS | |||||
All rights reserved. | All rights reserved. | ||||
Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
@@ -208,6 +208,68 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#endif | #endif | ||||
#define SGEMM_DEFAULT_R sgemm_r | |||||
#define QGEMM_DEFAULT_R qgemm_r | |||||
#define DGEMM_DEFAULT_R dgemm_r | |||||
#define CGEMM_DEFAULT_R cgemm_r | |||||
#define ZGEMM_DEFAULT_R zgemm_r | |||||
#define XGEMM_DEFAULT_R xgemm_r | |||||
#define SYMV_P 16 | |||||
#define HAVE_EXCLUSIVE_CACHE | |||||
#define GEMM_THREAD gemm_thread_mn | |||||
#endif | |||||
#if defined(BOBCATE) | |||||
#define SNUMOPT 8 | |||||
#define DNUMOPT 4 | |||||
#define GEMM_DEFAULT_OFFSET_A 64 | |||||
#define GEMM_DEFAULT_OFFSET_B 832 | |||||
#define GEMM_DEFAULT_ALIGN 0x0fffUL | |||||
#define SGEMM_DEFAULT_UNROLL_N 4 | |||||
#define DGEMM_DEFAULT_UNROLL_N 4 | |||||
#define QGEMM_DEFAULT_UNROLL_N 2 | |||||
#define CGEMM_DEFAULT_UNROLL_N 2 | |||||
#define ZGEMM_DEFAULT_UNROLL_N 2 | |||||
#define XGEMM_DEFAULT_UNROLL_N 1 | |||||
/*
 * Default M-direction unroll factors for the S/D/Q/C/Z/X GEMM variants.
 * When ARCH_X86 is defined the values are 4/2/2/2/1/1; otherwise they are
 * 8/4/2/4/2/1. Only the S, D, C and Z values differ between the branches;
 * Q and X are the same in both.
 * NOTE(review): these presumably have to match the register-block shape of
 * the kernels chosen for each architecture -- confirm against the kernel
 * selection files.
 */
#ifdef ARCH_X86 | |||||
#define SGEMM_DEFAULT_UNROLL_M 4 | |||||
#define DGEMM_DEFAULT_UNROLL_M 2 | |||||
#define QGEMM_DEFAULT_UNROLL_M 2 | |||||
#define CGEMM_DEFAULT_UNROLL_M 2 | |||||
#define ZGEMM_DEFAULT_UNROLL_M 1 | |||||
#define XGEMM_DEFAULT_UNROLL_M 1 | |||||
#else | |||||
#define SGEMM_DEFAULT_UNROLL_M 8 | |||||
#define DGEMM_DEFAULT_UNROLL_M 4 | |||||
#define QGEMM_DEFAULT_UNROLL_M 2 | |||||
#define CGEMM_DEFAULT_UNROLL_M 4 | |||||
#define ZGEMM_DEFAULT_UNROLL_M 2 | |||||
#define XGEMM_DEFAULT_UNROLL_M 1 | |||||
#endif | |||||
#define SGEMM_DEFAULT_P 448 | |||||
#define DGEMM_DEFAULT_P 224 | |||||
#define QGEMM_DEFAULT_P 112 | |||||
#define CGEMM_DEFAULT_P 224 | |||||
#define ZGEMM_DEFAULT_P 112 | |||||
#define XGEMM_DEFAULT_P 56 | |||||
#define SGEMM_DEFAULT_Q 224 | |||||
#define DGEMM_DEFAULT_Q 224 | |||||
#define QGEMM_DEFAULT_Q 224 | |||||
#define CGEMM_DEFAULT_Q 224 | |||||
#define ZGEMM_DEFAULT_Q 224 | |||||
#define XGEMM_DEFAULT_Q 224 | |||||
#define SGEMM_DEFAULT_R sgemm_r | #define SGEMM_DEFAULT_R sgemm_r | ||||
#define QGEMM_DEFAULT_R qgemm_r | #define QGEMM_DEFAULT_R qgemm_r | ||||
#define DGEMM_DEFAULT_R dgemm_r | #define DGEMM_DEFAULT_R dgemm_r | ||||