@@ -93,6 +93,12 @@ endif | |||
# A user-supplied TARGET is forwarded to getarch so that CPU autodetection
# is bypassed (-DFORCE_<target>) and flagged as an explicit user choice.
ifdef TARGET
GETARCH_FLAGS := -DFORCE_$(TARGET)
GETARCH_FLAGS += -DUSER_TARGET
# TARGET=GENERIC combined with DYNAMIC_ARCH=1 forces NO_EXPRECISION on.
# The exported name must match the overridden one exactly, otherwise the
# setting is not passed down to sub-makes.
ifeq ($(TARGET), GENERIC)
ifeq ($(DYNAMIC_ARCH), 1)
override NO_EXPRECISION=1
export NO_EXPRECISION
endif
endif
endif
# Force fallbacks for 32bit | |||
@@ -84,6 +84,14 @@ if (X86) | |||
set(NO_EXPRECISION 1) | |||
endif () | |||
# A DYNAMIC_ARCH build whose TARGET is GENERIC sets NO_EXPRECISION.
# TARGET is expanded inside quotes so an unset or empty value compares as an
# empty string instead of producing a malformed if() expression; that makes
# a separate "is TARGET set" guard unnecessary.
if (DYNAMIC_ARCH AND "${TARGET}" STREQUAL "GENERIC")
  set(NO_EXPRECISION 1)
endif ()
if (UTEST_CHECK) | |||
set(CCOMMON_OPT "${CCOMMON_OPT} -DUTEST_CHECK") | |||
set(SANITY_CHECK 1) | |||
@@ -139,6 +139,36 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS | |||
set(CGEMM3M_UNROLL_N 4) | |||
set(ZGEMM3M_UNROLL_M 4) | |||
set(ZGEMM3M_UNROLL_N 4) | |||
elseif ("${TCORE}" STREQUAL "BARCELONA") | |||
file(APPEND ${TARGET_CONF_TEMP} | |||
"#define HAVE_SSE3\n") | |||
elseif ("${TCORE}" STREQUAL "STEAMROLLER") | |||
file(APPEND ${TARGET_CONF_TEMP} | |||
"#define HAVE_SSE3\n") | |||
elseif ("${TCORE}" STREQUAL "EXCAVATOR") | |||
file(APPEND ${TARGET_CONF_TEMP} | |||
"#define HAVE_SSE3\n") | |||
elseif ("${TCORE}" STREQUAL "NEHALEM") | |||
file(APPEND ${TARGET_CONF_TEMP} | |||
"#define HAVE_SSE3\n") | |||
elseif ("${TCORE}" STREQUAL "PRESCOTT") | |||
file(APPEND ${TARGET_CONF_TEMP} | |||
"#define HAVE_SSE3\n") | |||
elseif ("${TCORE}" STREQUAL "SANDYBRIDGE") | |||
file(APPEND ${TARGET_CONF_TEMP} | |||
"#define HAVE_AVX\n") | |||
elseif ("${TCORE}" STREQUAL "HASWELL") | |||
file(APPEND ${TARGET_CONF_TEMP} | |||
"#define HAVE_AVX2\n") | |||
elseif ("${TCORE}" STREQUAL "ZEN") | |||
file(APPEND ${TARGET_CONF_TEMP} | |||
"#define HAVE_AVX2\n") | |||
elseif ("${TCORE}" STREQUAL "SKYLAKEX") | |||
file(APPEND ${TARGET_CONF_TEMP} | |||
"#define HAVE_AVX512\n") | |||
elseif ("${TCORE}" STREQUAL "COOPERLAKE") | |||
file(APPEND ${TARGET_CONF_TEMP} | |||
"#define HAVE_AVX512\n") | |||
elseif ("${TCORE}" STREQUAL "ARMV7") | |||
file(APPEND ${TARGET_CONF_TEMP} | |||
"#define L1_DATA_SIZE\t65536\n" | |||
@@ -64,12 +64,36 @@ if (DEFINED TARGET) | |||
# Append the SSE3/AVX2 code-generation flags to KERNEL_DEFINITIONS for the
# compilers that understand them.
if ("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU")
  # -dumpversion prints a trailing newline; strip it before comparing.
  execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion
                  OUTPUT_VARIABLE GCC_VERSION
                  OUTPUT_STRIP_TRAILING_WHITESPACE)
  # Only gcc 4.7 and newer get -mavx2 here.
  if (NOT "${GCC_VERSION}" VERSION_LESS 4.7)
    set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3 -mavx2")
  endif()
# CMake reports clang as "Clang" (or "AppleClang"); the comparison is
# case-sensitive, so an upper-case "CLANG" literal can never match.
elseif ("${CMAKE_C_COMPILER_ID}" MATCHES "^(Apple)?Clang$")
  set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse -msse3 -mavx2")
endif()
endif() | |||
# Per-target SIMD flags appended to KERNEL_DEFINITIONS.
# TARGET is always expanded inside quotes so that an empty value cannot
# break the if() expression.

# AVX2-capable targets (unless AVX2 has been disabled).
if (("${TARGET}" STREQUAL "HASWELL" OR "${TARGET}" STREQUAL "ZEN") AND NOT NO_AVX2)
  set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3 -mavx2")
endif()

# AVX-only target (unless AVX has been disabled).
if ("${TARGET}" STREQUAL "SANDYBRIDGE" AND NOT NO_AVX)
  set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3 -mavx")
endif()

# Targets that only get -msse3. TARGET holds a single name, so at most one
# alternative matches and the flag is appended at most once.
if ("${TARGET}" MATCHES "^(BARCELONA|STEAMROLLER|BULLDOZER|EXCAVATOR|PILEDRIVER|BOBCAT|OPTERON_SSE3|PRESCOTT|NANO|NEHALEM|ATOM|CORE2|PENRYN|DUNNINGTON)$")
  set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3")
endif()

# Independently of TARGET, -msse is added whenever HAVE_SSE is defined.
if (DEFINED HAVE_SSE)
  set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse")
endif()
@@ -142,14 +142,8 @@ REALNAME: | |||
#define HUGE_PAGESIZE ( 4 << 20) | |||
/*
 * BUFFER_SIZE selection.
 * A build-time BUFFERSIZE takes precedence and is used as a shift count;
 * otherwise a per-core default is chosen.
 */
#if defined(BUFFERSIZE)
#define BUFFER_SIZE (32 << BUFFERSIZE)
#elif defined(CORTEXA57)
#define BUFFER_SIZE (20 << 20)
#elif defined(TSV110) || defined(EMAG8180)
#define BUFFER_SIZE (32 << 20)
#else
#define BUFFER_SIZE (16 << 20)
#endif
@@ -33,7 +33,7 @@ if ($compiler eq "") { | |||
"ppuf77", "ppuf95", "ppuf90", "ppuxlf", | |||
"pathf90", "pathf95", | |||
"pgf95", "pgf90", "pgf77", | |||
"flang", | |||
"flang", "egfortran", | |||
"ifort"); | |||
OUTER: | |||
@@ -73,7 +73,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA | |||
i++ ; | |||
} | |||
#if !defined(__POWER__) | |||
#if !defined(__PPC__) | |||
CREAL(result) = dot[0]; | |||
CIMAG(result) = dot[1]; | |||
#else | |||
@@ -34,12 +34,12 @@ SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
# DGEMM kernel and its packing ("copy") routines.
# Each variable is assigned exactly once; the superseded power8 / generic
# 16x4 values were dead because a later "=" assignment replaced them.
DGEMMKERNEL    = dgemm_kernel_power10.c
# The "inner" copy routines and their objects are intentionally left empty.
# NOTE(review): presumably because the kernel now uses the same 8-wide
# unroll for both operands - confirm against the POWER10 parameter settings.
DGEMMINCOPY    =
DGEMMITCOPY    =
DGEMMONCOPY    = dgemm_ncopy_8_power10.c
DGEMMOTCOPY    = ../generic/gemm_tcopy_8.c
DGEMMINCOPYOBJ =
DGEMMITCOPYOBJ =
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
@@ -69,7 +69,7 @@ STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
# Double-precision TRSM kernels: all four variants use the generic C
# implementation. The earlier 16x4 power8 assembly value for the LT variant
# was overridden by a later assignment and has been dropped.
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
@@ -149,7 +149,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
#endif | |||
) | |||
{ | |||
BLASLONG N = n; | |||
BLASLONG i1; | |||
#if defined(TRMMKERNEL) | |||
BLASLONG off; | |||
@@ -158,85 +157,232 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
off = -offset; | |||
#endif | |||
v4sf_t valpha = { alpha, alpha }; | |||
N = n >> 2; | |||
for (i1 = 0; i1 < N; i1++) | |||
for (i1 = 0; i1 < (n >> 3); i1++) | |||
{ | |||
BLASLONG i, j, temp; | |||
BLASLONG j, temp; | |||
FLOAT *CO; | |||
FLOAT *AO; | |||
#if defined(TRMMKERNEL) && defined(LEFT) | |||
off = offset; | |||
#endif | |||
CO = C; | |||
C += ldc << 2; | |||
C += ldc << 3; | |||
AO = A; | |||
PREFETCH1 (A, 128); | |||
PREFETCH1 (A, 256); | |||
i = m >> 4; | |||
for (j = 0; j < i; j++) | |||
for (j = 0; j < (m >> 3); j++) | |||
{ | |||
FLOAT *BO; | |||
FLOAT *BO; | |||
#if defined(TRMMKERNEL) | |||
REFRESH_POINTERS (16, 4); | |||
REFRESH_POINTERS (8, 8); | |||
#else | |||
BO = B; | |||
temp = k; | |||
#endif | |||
v4sf_t *rowC; | |||
v4sf_t result[4]; | |||
__vector_quad acc0, acc1, acc2, acc3, acc4,acc5,acc6,acc7; | |||
BLASLONG l = 0; | |||
PREFETCH1 (CO, 0); | |||
PREFETCH1 (CO + ldc, 0); | |||
PREFETCH1 (CO + ldc + ldc, 0); | |||
PREFETCH1 (CO + ldc + ldc + ldc, 0); | |||
PREFETCH1 (CO, 128); | |||
PREFETCH1 (CO + ldc, 128); | |||
PREFETCH1 (CO + ldc + ldc, 128); | |||
PREFETCH1 (CO + ldc + ldc + ldc, 128); | |||
__vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; | |||
vec_t *rowA = (vec_t *) & AO[0]; | |||
__vector_pair rowB; | |||
vec_t *rb = (vec_t *) & BO[0]; | |||
__vector_pair rowB, rowB1; | |||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); | |||
__builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); | |||
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); | |||
__builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); | |||
__builtin_mma_xvf64ger (&acc2, rowB, rowA[2]); | |||
__builtin_mma_xvf64ger (&acc3, rowB, rowA[3]); | |||
__builtin_mma_xvf64ger (&acc4, rowB, rowA[4]); | |||
__builtin_mma_xvf64ger (&acc5, rowB, rowA[5]); | |||
__builtin_mma_xvf64ger (&acc6, rowB, rowA[6]); | |||
__builtin_mma_xvf64ger (&acc7, rowB, rowA[7]); | |||
__builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); | |||
__builtin_mma_xvf64ger (&acc2, rowB, rowA[1]); | |||
__builtin_mma_xvf64ger (&acc3, rowB1, rowA[1]); | |||
__builtin_mma_xvf64ger (&acc4, rowB, rowA[2]); | |||
__builtin_mma_xvf64ger (&acc5, rowB1, rowA[2]); | |||
__builtin_mma_xvf64ger (&acc6, rowB, rowA[3]); | |||
__builtin_mma_xvf64ger (&acc7, rowB1, rowA[3]); | |||
for (l = 1; l < temp; l++) | |||
{ | |||
rowA = (vec_t *) & AO[l << 4]; | |||
rb = (vec_t *) & BO[l << 2]; | |||
rowA = (vec_t *) & AO[l << 3]; | |||
rb = (vec_t *) & BO[l << 3]; | |||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); | |||
__builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); | |||
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); | |||
__builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); | |||
__builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]); | |||
__builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]); | |||
__builtin_mma_xvf64gerpp (&acc4, rowB, rowA[4]); | |||
__builtin_mma_xvf64gerpp (&acc5, rowB, rowA[5]); | |||
__builtin_mma_xvf64gerpp (&acc6, rowB, rowA[6]); | |||
__builtin_mma_xvf64gerpp (&acc7, rowB, rowA[7]); | |||
__builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); | |||
__builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]); | |||
__builtin_mma_xvf64gerpp (&acc3, rowB1, rowA[1]); | |||
__builtin_mma_xvf64gerpp (&acc4, rowB, rowA[2]); | |||
__builtin_mma_xvf64gerpp (&acc5, rowB1, rowA[2]); | |||
__builtin_mma_xvf64gerpp (&acc6, rowB, rowA[3]); | |||
__builtin_mma_xvf64gerpp (&acc7, rowB1, rowA[3]); | |||
} | |||
SAVE_ACC (&acc0, 0); | |||
SAVE_ACC (&acc2, 4); | |||
SAVE_ACC (&acc1, 2); | |||
SAVE_ACC (&acc3, 6); | |||
SAVE_ACC (&acc4, 8); | |||
SAVE_ACC (&acc6, 12); | |||
SAVE_ACC (&acc5, 10); | |||
SAVE_ACC (&acc7, 14); | |||
AO += temp << 4; | |||
BO += temp << 2; | |||
SAVE_ACC1 (&acc1, 0); | |||
SAVE_ACC (&acc2, 2); | |||
SAVE_ACC1 (&acc3, 2); | |||
SAVE_ACC (&acc4, 4); | |||
SAVE_ACC1 (&acc5, 4); | |||
SAVE_ACC (&acc6, 6); | |||
SAVE_ACC1 (&acc7, 6); | |||
CO += 8; | |||
AO += temp << 3; | |||
BO += temp << 3; | |||
#if defined(TRMMKERNEL) | |||
REFRESH_AFTER_SAVE (8, 8) | |||
#endif | |||
} | |||
if (m & 4) | |||
{ | |||
FLOAT *BO; | |||
#if defined(TRMMKERNEL) | |||
REFRESH_POINTERS (4, 8); | |||
#else | |||
BO = B; | |||
temp = k; | |||
#endif | |||
v4sf_t *rowC; | |||
v4sf_t result[4]; | |||
__vector_quad acc0, acc1, acc2, acc3; | |||
BLASLONG l = 0; | |||
vec_t *rowA = (vec_t *) & AO[0]; | |||
__vector_pair rowB, rowB1; | |||
vec_t *rb = (vec_t *) & BO[0]; | |||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); | |||
__builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); | |||
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); | |||
__builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); | |||
__builtin_mma_xvf64ger (&acc2, rowB, rowA[1]); | |||
__builtin_mma_xvf64ger (&acc3, rowB1, rowA[1]); | |||
for (l = 1; l < temp; l++) | |||
{ | |||
rowA = (vec_t *) & AO[l << 2]; | |||
rb = (vec_t *) & BO[l << 3]; | |||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); | |||
__builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); | |||
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); | |||
__builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); | |||
__builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]); | |||
__builtin_mma_xvf64gerpp (&acc3, rowB1, rowA[1]); | |||
} | |||
SAVE_ACC (&acc0, 0); | |||
SAVE_ACC1 (&acc1, 0); | |||
SAVE_ACC (&acc2, 2); | |||
SAVE_ACC1 (&acc3, 2); | |||
CO += 4; | |||
AO += temp << 2; | |||
BO += temp << 3; | |||
#if defined(TRMMKERNEL) | |||
REFRESH_AFTER_SAVE (4, 8) | |||
#endif | |||
} | |||
if (m & 2) | |||
{ | |||
FLOAT *BO; | |||
#if defined(TRMMKERNEL) | |||
REFRESH_POINTERS (2, 8); | |||
#else | |||
BO = B; | |||
temp = k; | |||
#endif | |||
v4sf_t *rowC; | |||
v4sf_t result[4]; | |||
__vector_quad acc0, acc1; | |||
BLASLONG l = 0; | |||
vec_t *rowA = (vec_t *) & AO[0]; | |||
__vector_pair rowB, rowB1; | |||
vec_t *rb = (vec_t *) & BO[0]; | |||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); | |||
__builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); | |||
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); | |||
__builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); | |||
for (l = 1; l < temp; l++) | |||
{ | |||
rowA = (vec_t *) & AO[l << 1]; | |||
rb = (vec_t *) & BO[l << 3]; | |||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); | |||
__builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); | |||
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); | |||
__builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); | |||
} | |||
SAVE_ACC (&acc0, 0); | |||
SAVE_ACC1 (&acc1, 0); | |||
CO += 2; | |||
AO += temp << 1; | |||
BO += temp << 3; | |||
#if defined(TRMMKERNEL) | |||
REFRESH_AFTER_SAVE (16, 4) | |||
REFRESH_AFTER_SAVE (2, 8) | |||
#endif | |||
CO += 16; | |||
} | |||
i = (m & 15) >> 3; | |||
for (j = 0; j < i; j++) | |||
if (m & 1) | |||
{ | |||
FLOAT *BO; | |||
#if defined(TRMMKERNEL) | |||
REFRESH_POINTERS (1, 8); | |||
#else | |||
BO = B; | |||
temp = k; | |||
#endif | |||
BLASLONG l = 0; | |||
v4sf_t t = { 0, 0 }; | |||
v4sf_t t1 = { 0, 0 }; | |||
v4sf_t t2 = { 0, 0 }; | |||
v4sf_t t3 = { 0, 0 }; | |||
for (l = 0; l < temp; l++) | |||
{ | |||
v4sf_t rowA = { AO[l], AO[l] }; | |||
v4sf_t rowB = { BO[l << 3], BO[(l << 3) + 1] }; | |||
v4sf_t rowB1 = { BO[(l << 3) + 2], BO[(l << 3) + 3] }; | |||
v4sf_t rowB2 = { BO[(l << 3) + 4], BO[(l << 3) + 5] }; | |||
v4sf_t rowB3 = { BO[(l << 3) + 6], BO[(l << 3) + 7] }; | |||
t += rowA * rowB; | |||
t1 += rowA * rowB1; | |||
t2 += rowA * rowB2; | |||
t3 += rowA * rowB3; | |||
} | |||
t = t * valpha; | |||
t1 = t1 * valpha; | |||
t2 = t2 * valpha; | |||
t3 = t3 * valpha; | |||
#if defined(TRMMKERNEL) | |||
CO[0 * ldc] = t[0]; | |||
CO[1 * ldc] = t[1]; | |||
CO[2 * ldc] = t1[0]; | |||
CO[3 * ldc] = t1[1]; | |||
CO[4 * ldc] = t2[0]; | |||
CO[5 * ldc] = t2[1]; | |||
CO[6 * ldc] = t3[0]; | |||
CO[7 * ldc] = t3[1]; | |||
#else | |||
CO[0 * ldc] += t[0]; | |||
CO[1 * ldc] += t[1]; | |||
CO[2 * ldc] += t1[0]; | |||
CO[3 * ldc] += t1[1]; | |||
CO[4 * ldc] += t2[0]; | |||
CO[5 * ldc] += t2[1]; | |||
CO[6 * ldc] += t3[0]; | |||
CO[7 * ldc] += t3[1]; | |||
#endif | |||
CO += 1; | |||
AO += temp; | |||
BO += temp << 3; | |||
#if defined(TRMMKERNEL) | |||
REFRESH_AFTER_SAVE (1, 8) | |||
#endif | |||
} | |||
#if defined(TRMMKERNEL) && !defined(LEFT) | |||
off += 8; // number of values in A | |||
#endif | |||
B += k << 3; | |||
} | |||
if (n & 4) | |||
{ | |||
BLASLONG j, temp; | |||
FLOAT *CO; | |||
FLOAT *AO; | |||
#if defined(TRMMKERNEL) && defined(LEFT) | |||
off = offset; | |||
#endif | |||
CO = C; | |||
C += ldc << 2; | |||
AO = A; | |||
PREFETCH1 (A, 128); | |||
PREFETCH1 (A, 256); | |||
for (j = 0; j < (m >> 3); j++) | |||
{ | |||
FLOAT *BO; | |||
#if defined(TRMMKERNEL) | |||
@@ -278,8 +424,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
REFRESH_AFTER_SAVE (8, 4) | |||
#endif | |||
} | |||
i = (m & 7) >> 2; | |||
for (j = 0; j < i; j++) | |||
if (m & 4) | |||
{ | |||
FLOAT *BO; | |||
#if defined(TRMMKERNEL) | |||
@@ -315,8 +460,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
REFRESH_AFTER_SAVE (4, 4) | |||
#endif | |||
} | |||
i = (m & 3) >> 1; | |||
for (j = 0; j < i; j++) | |||
if (m & 2) | |||
{ | |||
FLOAT *BO; | |||
#if defined(TRMMKERNEL) | |||
@@ -349,8 +493,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
REFRESH_AFTER_SAVE (2, 4) | |||
#endif | |||
} | |||
i = (m & 1) >> 0; | |||
for (j = 0; j < i; j++) | |||
if (m & 1) | |||
{ | |||
FLOAT *BO; | |||
#if defined(TRMMKERNEL) | |||
@@ -395,10 +538,9 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
#endif | |||
B += k << 2; | |||
} | |||
N = (n & 3) >> 1; | |||
for (i1 = 0; i1 < N; i1++) | |||
if (n & 2) | |||
{ | |||
BLASLONG i, j, temp; | |||
BLASLONG j, temp; | |||
#if defined(TRMMKERNEL) && defined(LEFT) | |||
off = offset; | |||
#endif | |||
@@ -407,66 +549,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
CO = C; | |||
C += ldc << 1; | |||
AO = A; | |||
i = m >> 4; | |||
for (j = 0; j < i; j++) | |||
{ | |||
FLOAT *BO; | |||
#if defined(TRMMKERNEL) | |||
REFRESH_POINTERS (16, 2); | |||
#else | |||
BO = B; | |||
temp = k; | |||
#endif | |||
v4sf_t *rowC; | |||
v4sf_t result[4]; | |||
__vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; | |||
BLASLONG l = 0; | |||
FLOAT t[4] = { 0, 0, 0, 0 }; | |||
t[0] = BO[0], t[1] = BO[1]; | |||
__vector_pair rowB; | |||
vec_t *rb = (vec_t *) & t[0]; | |||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); | |||
vec_t *rowA = (vec_t *) & AO[0]; | |||
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); | |||
__builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); | |||
__builtin_mma_xvf64ger (&acc2, rowB, rowA[2]); | |||
__builtin_mma_xvf64ger (&acc3, rowB, rowA[3]); | |||
__builtin_mma_xvf64ger (&acc4, rowB, rowA[4]); | |||
__builtin_mma_xvf64ger (&acc5, rowB, rowA[5]); | |||
__builtin_mma_xvf64ger (&acc6, rowB, rowA[6]); | |||
__builtin_mma_xvf64ger (&acc7, rowB, rowA[7]); | |||
for (l = 1; l < temp; l++) | |||
{ | |||
t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; | |||
rb = (vec_t *) & t[0]; | |||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); | |||
rowA = (vec_t *) & AO[l << 4]; | |||
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); | |||
__builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); | |||
__builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]); | |||
__builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]); | |||
__builtin_mma_xvf64gerpp (&acc4, rowB, rowA[4]); | |||
__builtin_mma_xvf64gerpp (&acc5, rowB, rowA[5]); | |||
__builtin_mma_xvf64gerpp (&acc6, rowB, rowA[6]); | |||
__builtin_mma_xvf64gerpp (&acc7, rowB, rowA[7]); | |||
} | |||
SAVE2x4_ACC (&acc0, 0); | |||
SAVE2x4_ACC (&acc1, 2); | |||
SAVE2x4_ACC (&acc2, 4); | |||
SAVE2x4_ACC (&acc3, 6); | |||
SAVE2x4_ACC (&acc4, 8); | |||
SAVE2x4_ACC (&acc5, 10); | |||
SAVE2x4_ACC (&acc6, 12); | |||
SAVE2x4_ACC (&acc7, 14); | |||
CO += 16; | |||
AO += temp << 4; | |||
BO += temp << 1; | |||
#if defined(TRMMKERNEL) | |||
REFRESH_AFTER_SAVE (16, 2) | |||
#endif | |||
} | |||
i = (m & 15) >> 3; | |||
for (j = 0; j < i; j++) | |||
for (j = 0; j < (m >> 3); j++) | |||
{ | |||
FLOAT *BO; | |||
#if defined(TRMMKERNEL) | |||
@@ -511,8 +594,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
REFRESH_AFTER_SAVE (8, 2) | |||
#endif | |||
} | |||
i = (m & 7) >> 2; | |||
for (j = 0; j < i; j++) | |||
if (m & 4) | |||
{ | |||
FLOAT *BO; | |||
#if defined(TRMMKERNEL) | |||
@@ -551,8 +633,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
REFRESH_AFTER_SAVE (4, 2) | |||
#endif | |||
} | |||
i = (m & 3) >> 1; | |||
for (j = 0; j < i; j++) | |||
if (m & 2) | |||
{ | |||
FLOAT *BO; | |||
#if defined(TRMMKERNEL) | |||
@@ -588,8 +669,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
REFRESH_AFTER_SAVE (2, 2) | |||
#endif | |||
} | |||
i = (m & 1) >> 0; | |||
for (j = 0; j < i; j++) | |||
if (m & 1) | |||
{ | |||
FLOAT *BO; | |||
#if defined(TRMMKERNEL) | |||
@@ -626,8 +706,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
#endif | |||
B += k << 1; | |||
} | |||
N = (n & 1) >> 0; | |||
for (i1 = 0; i1 < N; i1++) | |||
if (n & 1) | |||
{ | |||
BLASLONG i, temp; | |||
#if defined(TRMMKERNEL) && defined(LEFT) | |||
@@ -638,97 +717,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
CO = C; | |||
C += ldc; | |||
AO = A; | |||
i = m; | |||
while (i >= 16) | |||
{ | |||
FLOAT *BO; | |||
#if defined(TRMMKERNEL) | |||
REFRESH_POINTERS (16, 1) | |||
#else | |||
BO = B; | |||
temp = k; | |||
#endif | |||
BLASLONG l = 0; | |||
v4sf_t t = { 0, 0 }; | |||
v4sf_t t1 = { 0, 0 }; | |||
v4sf_t t2 = { 0, 0 }; | |||
v4sf_t t3 = { 0, 0 }; | |||
v4sf_t t4 = { 0, 0 }; | |||
v4sf_t t5 = { 0, 0 }; | |||
v4sf_t t6 = { 0, 0 }; | |||
v4sf_t t7 = { 0, 0 }; | |||
for (l = 0; l < temp; l++) | |||
{ | |||
v4sf_t rowB = { BO[l], BO[l] }; | |||
v4sf_t rowA = { AO[l << 4], AO[(l << 4) + 1] }; | |||
v4sf_t rowA1 = { AO[(l << 4) + 2], AO[(l << 4) + 3] }; | |||
v4sf_t rowA2 = { AO[(l << 4) + 4], AO[(l << 4) + 5] }; | |||
v4sf_t rowA3 = { AO[(l << 4) + 6], AO[(l << 4) + 7] }; | |||
v4sf_t rowA4 = { AO[(l << 4) + 8], AO[(l << 4) + 9] }; | |||
v4sf_t rowA5 = { AO[(l << 4) + 10], AO[(l << 4) + 11] }; | |||
v4sf_t rowA6 = { AO[(l << 4) + 12], AO[(l << 4) + 13] }; | |||
v4sf_t rowA7 = { AO[(l << 4) + 14], AO[(l << 4) + 15] }; | |||
t += rowA * rowB; | |||
t1 += rowA1 * rowB; | |||
t2 += rowA2 * rowB; | |||
t3 += rowA3 * rowB; | |||
t4 += rowA4 * rowB; | |||
t5 += rowA5 * rowB; | |||
t6 += rowA6 * rowB; | |||
t7 += rowA7 * rowB; | |||
} | |||
t = t * valpha; | |||
t1 = t1 * valpha; | |||
t2 = t2 * valpha; | |||
t3 = t3 * valpha; | |||
t4 = t4 * valpha; | |||
t5 = t5 * valpha; | |||
t6 = t6 * valpha; | |||
t7 = t7 * valpha; | |||
#if defined(TRMMKERNEL) | |||
CO[0] = t[0]; | |||
CO[1] = t[1]; | |||
CO[2] = t1[0]; | |||
CO[3] = t1[1]; | |||
CO[4] = t2[0]; | |||
CO[5] = t2[1]; | |||
CO[6] = t3[0]; | |||
CO[7] = t3[1]; | |||
CO[8] = t4[0]; | |||
CO[9] = t4[1]; | |||
CO[10] = t5[0]; | |||
CO[11] = t5[1]; | |||
CO[12] = t6[0]; | |||
CO[13] = t6[1]; | |||
CO[14] = t7[0]; | |||
CO[15] = t7[1]; | |||
#else | |||
CO[0] += t[0]; | |||
CO[1] += t[1]; | |||
CO[2] += t1[0]; | |||
CO[3] += t1[1]; | |||
CO[4] += t2[0]; | |||
CO[5] += t2[1]; | |||
CO[6] += t3[0]; | |||
CO[7] += t3[1]; | |||
CO[8] += t4[0]; | |||
CO[9] += t4[1]; | |||
CO[10] += t5[0]; | |||
CO[11] += t5[1]; | |||
CO[12] += t6[0]; | |||
CO[13] += t6[1]; | |||
CO[14] += t7[0]; | |||
CO[15] += t7[1]; | |||
#endif | |||
AO += temp << 4; | |||
BO += temp; | |||
CO += 16; | |||
i -= 16; | |||
#if defined(TRMMKERNEL) | |||
REFRESH_AFTER_SAVE (16, 1) | |||
#endif | |||
} | |||
while (i >= 8) | |||
for (i = 0; i < (m >> 3); i++) | |||
{ | |||
FLOAT *BO; | |||
#if defined(TRMMKERNEL) | |||
@@ -780,12 +769,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
AO += temp << 3; | |||
BO += temp; | |||
CO += 8; | |||
i -= 8; | |||
#if defined(TRMMKERNEL) | |||
REFRESH_AFTER_SAVE (8, 1) | |||
#endif | |||
} | |||
while (i >= 4) | |||
if (m & 4) | |||
{ | |||
FLOAT *BO; | |||
#if defined(TRMMKERNEL) | |||
@@ -821,12 +809,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
AO += temp << 2; | |||
BO += temp; | |||
CO += 4; | |||
i -= 4; | |||
#if defined(TRMMKERNEL) | |||
REFRESH_AFTER_SAVE (4, 1) | |||
#endif | |||
} | |||
while (i >= 2) | |||
if (m & 2) | |||
{ | |||
FLOAT *BO; | |||
#if defined(TRMMKERNEL) | |||
@@ -854,12 +841,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
AO += temp << 1; | |||
BO += temp; | |||
CO += 2; | |||
i -= 2; | |||
#if defined(TRMMKERNEL) | |||
REFRESH_AFTER_SAVE (2, 1) | |||
#endif | |||
} | |||
while (i >= 1) | |||
if (m & 1) | |||
{ | |||
FLOAT *BO; | |||
#if defined(TRMMKERNEL) | |||
@@ -882,7 +868,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
CO[0] += t * alpha; | |||
#endif | |||
CO += 1; | |||
i -= 1; | |||
#if defined(TRMMKERNEL) | |||
REFRESH_AFTER_SAVE (1, 1) | |||
#endif | |||
@@ -0,0 +1,326 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
/* without modification, are permitted provided that the following */ | |||
/* conditions are met: */ | |||
/* */ | |||
/* 1. Redistributions of source code must retain the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer. */ | |||
/* */ | |||
/* 2. Redistributions in binary form must reproduce the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer in the documentation and/or other materials */ | |||
/* provided with the distribution. */ | |||
/* */ | |||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
/* POSSIBILITY OF SUCH DAMAGE. */ | |||
/* */ | |||
/* The views and conclusions contained in the software and */ | |||
/* documentation are those of the authors and should not be */ | |||
/* interpreted as representing official policies, either expressed */ | |||
/* or implied, of The University of Texas at Austin. */ | |||
/*********************************************************************/ | |||
#include <stdio.h> | |||
#include "common.h" | |||
#include <altivec.h> | |||
/*
 * Issue a data-cache-touch hint for the address x + y.
 * "dcbt RA,RB" forms its address as (RA|0) + (RB): if the RA operand is
 * allocated to r0 the instruction reads it as literal zero. The pointer x
 * therefore uses the "b" (base register, never r0) constraint, while the
 * offset y in the RB slot may live in any general register.
 * The macro expands to a single statement without a trailing semicolon;
 * callers supply their own.
 */
#define PREFETCHA(x, y) asm volatile ("dcbt %0, %1" : : "b" (x), "r" (y) : "memory")
/*
 * Repack an m x n block of `a` into the contiguous buffer `b`.
 *
 * `a` is read as n "lines" that start lda elements apart, each holding m
 * consecutive elements. Lines are consumed in groups of 8, then at most
 * one group of 4, one of 2 and one single line. For a group of G lines the
 * output holds, for every index i in [0, m), the G values
 * line0[i], line1[i], ..., line(G-1)[i] stored contiguously.
 *
 * The bulk of each group is moved with 2-element vector loads and
 * vec_xxpermdi (selector 0 pairs the first elements of its two inputs,
 * selector 3 pairs the second elements); leftover indices are copied one
 * element at a time.
 *
 * NOTE(review): the vector accesses go through casted pointers, so this
 * assumes the target tolerates addresses that are only element-aligned -
 * confirm.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){

  typedef __vector double vd_t;

  IFLOAT *src = a;                       /* first line of the next group */
  IFLOAT *dst = b;                       /* next free slot in the output */
  IFLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7;
  IFLOAT t0, t1, t2, t3, t4, t5, t6, t7;
  BLASLONG groups, rows;

  /* ---- groups of eight lines ---- */
  for (groups = n >> 3; groups > 0; groups--) {
    c0 = src;
    c1 = c0 + lda;
    c2 = c1 + lda;
    c3 = c2 + lda;
    c4 = c3 + lda;
    c5 = c4 + lda;
    c6 = c5 + lda;
    c7 = c6 + lda;
    src += 8 * lda;

    /* 8 x 8 tiles: 32 vector loads, 32 permuted vector stores */
    for (rows = m >> 3; rows > 0; rows--) {
      PREFETCHA (c0, 384);
      PREFETCHA (c1, 384);
      PREFETCHA (c2, 384);
      PREFETCHA (c3, 384);
      PREFETCHA (c4, 384);
      PREFETCHA (c5, 384);
      PREFETCHA (c6, 384);
      PREFETCHA (c7, 384);

      /* sL_Q: elements 2*Q and 2*Q+1 of line L */
      vd_t s0_0 = *(vd_t *)(c0 + 0);
      vd_t s0_1 = *(vd_t *)(c0 + 2);
      vd_t s0_2 = *(vd_t *)(c0 + 4);
      vd_t s0_3 = *(vd_t *)(c0 + 6);

      vd_t s1_0 = *(vd_t *)(c1 + 0);
      vd_t s1_1 = *(vd_t *)(c1 + 2);
      vd_t s1_2 = *(vd_t *)(c1 + 4);
      vd_t s1_3 = *(vd_t *)(c1 + 6);

      vd_t s2_0 = *(vd_t *)(c2 + 0);
      vd_t s2_1 = *(vd_t *)(c2 + 2);
      vd_t s2_2 = *(vd_t *)(c2 + 4);
      vd_t s2_3 = *(vd_t *)(c2 + 6);

      vd_t s3_0 = *(vd_t *)(c3 + 0);
      vd_t s3_1 = *(vd_t *)(c3 + 2);
      vd_t s3_2 = *(vd_t *)(c3 + 4);
      vd_t s3_3 = *(vd_t *)(c3 + 6);

      vd_t s4_0 = *(vd_t *)(c4 + 0);
      vd_t s4_1 = *(vd_t *)(c4 + 2);
      vd_t s4_2 = *(vd_t *)(c4 + 4);
      vd_t s4_3 = *(vd_t *)(c4 + 6);

      vd_t s5_0 = *(vd_t *)(c5 + 0);
      vd_t s5_1 = *(vd_t *)(c5 + 2);
      vd_t s5_2 = *(vd_t *)(c5 + 4);
      vd_t s5_3 = *(vd_t *)(c5 + 6);

      vd_t s6_0 = *(vd_t *)(c6 + 0);
      vd_t s6_1 = *(vd_t *)(c6 + 2);
      vd_t s6_2 = *(vd_t *)(c6 + 4);
      vd_t s6_3 = *(vd_t *)(c6 + 6);

      vd_t s7_0 = *(vd_t *)(c7 + 0);
      vd_t s7_1 = *(vd_t *)(c7 + 2);
      vd_t s7_2 = *(vd_t *)(c7 + 4);
      vd_t s7_3 = *(vd_t *)(c7 + 6);

      /* index 0 and index 1 of the tile */
      *(vd_t *)(dst +  0) = vec_xxpermdi(s0_0, s1_0, 0);
      *(vd_t *)(dst +  2) = vec_xxpermdi(s2_0, s3_0, 0);
      *(vd_t *)(dst +  4) = vec_xxpermdi(s4_0, s5_0, 0);
      *(vd_t *)(dst +  6) = vec_xxpermdi(s6_0, s7_0, 0);
      *(vd_t *)(dst +  8) = vec_xxpermdi(s0_0, s1_0, 3);
      *(vd_t *)(dst + 10) = vec_xxpermdi(s2_0, s3_0, 3);
      *(vd_t *)(dst + 12) = vec_xxpermdi(s4_0, s5_0, 3);
      *(vd_t *)(dst + 14) = vec_xxpermdi(s6_0, s7_0, 3);

      /* index 2 and index 3 */
      *(vd_t *)(dst + 16) = vec_xxpermdi(s0_1, s1_1, 0);
      *(vd_t *)(dst + 18) = vec_xxpermdi(s2_1, s3_1, 0);
      *(vd_t *)(dst + 20) = vec_xxpermdi(s4_1, s5_1, 0);
      *(vd_t *)(dst + 22) = vec_xxpermdi(s6_1, s7_1, 0);
      *(vd_t *)(dst + 24) = vec_xxpermdi(s0_1, s1_1, 3);
      *(vd_t *)(dst + 26) = vec_xxpermdi(s2_1, s3_1, 3);
      *(vd_t *)(dst + 28) = vec_xxpermdi(s4_1, s5_1, 3);
      *(vd_t *)(dst + 30) = vec_xxpermdi(s6_1, s7_1, 3);

      /* index 4 and index 5 */
      *(vd_t *)(dst + 32) = vec_xxpermdi(s0_2, s1_2, 0);
      *(vd_t *)(dst + 34) = vec_xxpermdi(s2_2, s3_2, 0);
      *(vd_t *)(dst + 36) = vec_xxpermdi(s4_2, s5_2, 0);
      *(vd_t *)(dst + 38) = vec_xxpermdi(s6_2, s7_2, 0);
      *(vd_t *)(dst + 40) = vec_xxpermdi(s0_2, s1_2, 3);
      *(vd_t *)(dst + 42) = vec_xxpermdi(s2_2, s3_2, 3);
      *(vd_t *)(dst + 44) = vec_xxpermdi(s4_2, s5_2, 3);
      *(vd_t *)(dst + 46) = vec_xxpermdi(s6_2, s7_2, 3);

      /* index 6 and index 7 */
      *(vd_t *)(dst + 48) = vec_xxpermdi(s0_3, s1_3, 0);
      *(vd_t *)(dst + 50) = vec_xxpermdi(s2_3, s3_3, 0);
      *(vd_t *)(dst + 52) = vec_xxpermdi(s4_3, s5_3, 0);
      *(vd_t *)(dst + 54) = vec_xxpermdi(s6_3, s7_3, 0);
      *(vd_t *)(dst + 56) = vec_xxpermdi(s0_3, s1_3, 3);
      *(vd_t *)(dst + 58) = vec_xxpermdi(s2_3, s3_3, 3);
      *(vd_t *)(dst + 60) = vec_xxpermdi(s4_3, s5_3, 3);
      *(vd_t *)(dst + 62) = vec_xxpermdi(s6_3, s7_3, 3);

      c0 += 8;
      c1 += 8;
      c2 += 8;
      c3 += 8;
      c4 += 8;
      c5 += 8;
      c6 += 8;
      c7 += 8;
      dst += 64;
    }

    /* up to seven leftover indices, one element per line at a time */
    for (rows = m & 7; rows > 0; rows--) {
      t0 = c0[0];
      t1 = c1[0];
      t2 = c2[0];
      t3 = c3[0];
      t4 = c4[0];
      t5 = c5[0];
      t6 = c6[0];
      t7 = c7[0];

      dst[0] = t0;
      dst[1] = t1;
      dst[2] = t2;
      dst[3] = t3;
      dst[4] = t4;
      dst[5] = t5;
      dst[6] = t6;
      dst[7] = t7;

      c0++;
      c1++;
      c2++;
      c3++;
      c4++;
      c5++;
      c6++;
      c7++;
      dst += 8;
    }
  }

  /* ---- one group of four lines ---- */
  if (n & 4) {
    c0 = src;
    c1 = c0 + lda;
    c2 = c1 + lda;
    c3 = c2 + lda;
    src += 4 * lda;

    /* 4 x 4 tiles */
    for (rows = m >> 2; rows > 0; rows--) {
      PREFETCHA (c0, 384);
      PREFETCHA (c1, 384);
      PREFETCHA (c2, 384);
      PREFETCHA (c3, 384);

      vd_t s0_0 = *(vd_t *)(c0 + 0);
      vd_t s0_1 = *(vd_t *)(c0 + 2);
      vd_t s1_0 = *(vd_t *)(c1 + 0);
      vd_t s1_1 = *(vd_t *)(c1 + 2);
      vd_t s2_0 = *(vd_t *)(c2 + 0);
      vd_t s2_1 = *(vd_t *)(c2 + 2);
      vd_t s3_0 = *(vd_t *)(c3 + 0);
      vd_t s3_1 = *(vd_t *)(c3 + 2);

      *(vd_t *)(dst +  0) = vec_xxpermdi(s0_0, s1_0, 0);
      *(vd_t *)(dst +  2) = vec_xxpermdi(s2_0, s3_0, 0);
      *(vd_t *)(dst +  4) = vec_xxpermdi(s0_0, s1_0, 3);
      *(vd_t *)(dst +  6) = vec_xxpermdi(s2_0, s3_0, 3);
      *(vd_t *)(dst +  8) = vec_xxpermdi(s0_1, s1_1, 0);
      *(vd_t *)(dst + 10) = vec_xxpermdi(s2_1, s3_1, 0);
      *(vd_t *)(dst + 12) = vec_xxpermdi(s0_1, s1_1, 3);
      *(vd_t *)(dst + 14) = vec_xxpermdi(s2_1, s3_1, 3);

      c0 += 4;
      c1 += 4;
      c2 += 4;
      c3 += 4;
      dst += 16;
    }

    /* up to three leftover indices */
    for (rows = m & 3; rows > 0; rows--) {
      t0 = c0[0];
      t1 = c1[0];
      t2 = c2[0];
      t3 = c3[0];

      dst[0] = t0;
      dst[1] = t1;
      dst[2] = t2;
      dst[3] = t3;

      c0++;
      c1++;
      c2++;
      c3++;
      dst += 4;
    }
  }

  /* ---- one group of two lines ---- */
  if (n & 2) {
    c0 = src;
    c1 = c0 + lda;
    src += 2 * lda;

    /* 2 x 2 tiles */
    for (rows = m >> 1; rows > 0; rows--) {
      vd_t s0_0 = *(vd_t *)(c0 + 0);
      vd_t s1_0 = *(vd_t *)(c1 + 0);

      *(vd_t *)(dst + 0) = vec_xxpermdi(s0_0, s1_0, 0);
      *(vd_t *)(dst + 2) = vec_xxpermdi(s0_0, s1_0, 3);

      c0 += 2;
      c1 += 2;
      dst += 4;
    }

    /* odd m: one last element from each of the two lines */
    if (m & 1) {
      t0 = c0[0];
      t1 = c1[0];

      dst[0] = t0;
      dst[1] = t1;
      dst += 2;
    }
  }

  /* ---- final single line: straight element copy ---- */
  if (n & 1) {
    c0 = src;

    for (rows = m; rows > 0; rows--) {
      t0 = c0[0];
      dst[0] = t0;
      c0++;
      dst++;
    }
  }

  return 0;
}
@@ -197,7 +197,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
#endif | |||
) | |||
{ | |||
BLASLONG N = n; | |||
BLASLONG i1; | |||
#if defined(TRMMKERNEL) | |||
BLASLONG off; | |||
@@ -207,10 +206,9 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
#endif | |||
v4sf_t valpha = { alpha, alpha, alpha, alpha }; | |||
N = n >> 3; | |||
for (i1 = 0; i1 < N; i1++) | |||
for (i1 = 0; i1 < (n >> 3); i1++) | |||
{ | |||
BLASLONG i, j, temp; | |||
BLASLONG j, temp; | |||
FLOAT *CO; | |||
FLOAT *AO; | |||
#if defined(TRMMKERNEL) && defined(LEFT) | |||
@@ -221,8 +219,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
AO = A; | |||
PREFETCH1 (A, 128); | |||
PREFETCH1 (A, 256); | |||
i = m >> 4; | |||
for (j = 0; j < i; j++) | |||
for (j = 0; j < (m >> 4); j++) | |||
{ | |||
FLOAT *BO; | |||
#if defined(TRMMKERNEL) | |||
@@ -438,8 +435,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
#endif | |||
CO += 16; | |||
} | |||
i = (m & 15) >> 3; | |||
for (j = 0; j < i; j++) | |||
if (m & 8) | |||
{ | |||
FLOAT *BO; | |||
#if defined(TRMMKERNEL) | |||
@@ -478,8 +474,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
REFRESH_AFTER_SAVE (8, 8) | |||
#endif | |||
} | |||
i = (m & 7) >> 2; | |||
for (j = 0; j < i; j++) | |||
if (m & 4) | |||
{ | |||
FLOAT *BO; | |||
#if defined(TRMMKERNEL) | |||
@@ -512,8 +507,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
REFRESH_AFTER_SAVE (4, 8) | |||
#endif | |||
} | |||
i = (m & 3) >> 1; | |||
for (j = 0; j < i; j++) | |||
if (m & 2) | |||
{ | |||
FLOAT *BO; | |||
#if defined(TRMMKERNEL) | |||
@@ -550,8 +544,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
REFRESH_AFTER_SAVE (2, 8) | |||
#endif | |||
} | |||
i = (m & 1) >> 0; | |||
for (j = 0; j < i; j++) | |||
if (m & 1) | |||
{ | |||
FLOAT *BO; | |||
#if defined(TRMMKERNEL) | |||
@@ -610,8 +603,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
B += k << 3; | |||
} | |||
N = (n & 7) >> 2; | |||
for (i1 = 0; i1 < N; i1++) | |||
if (n & 4) | |||
{ | |||
BLASLONG i, j, temp; | |||
#if defined(TRMMKERNEL) && defined(LEFT) | |||
@@ -719,8 +711,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
REFRESH_AFTER_SAVE (16, 4) | |||
#endif | |||
} | |||
i = (m & 15) >> 3; | |||
for (j = 0; j < i; j++) | |||
if (m & 8) | |||
{ | |||
FLOAT *BO; | |||
#if defined(TRMMKERNEL) | |||
@@ -753,8 +744,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
REFRESH_AFTER_SAVE (8, 4) | |||
#endif | |||
} | |||
i = (m & 7) >> 2; | |||
for (j = 0; j < i; j++) | |||
if (m & 4) | |||
{ | |||
FLOAT *BO; | |||
#if defined(TRMMKERNEL) | |||
@@ -784,8 +774,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
REFRESH_AFTER_SAVE (4, 4) | |||
#endif | |||
} | |||
i = (m & 3) >> 1; | |||
for (j = 0; j < i; j++) | |||
if (m & 2) | |||
{ | |||
FLOAT *BO; | |||
#if defined(TRMMKERNEL) | |||
@@ -818,8 +807,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
REFRESH_AFTER_SAVE (2, 4) | |||
#endif | |||
} | |||
i = (m & 1) >> 0; | |||
for (j = 0; j < i; j++) | |||
if (m & 1) | |||
{ | |||
FLOAT *BO; | |||
#if defined(TRMMKERNEL) | |||
@@ -863,8 +851,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
B += k << 2; | |||
} | |||
N = (n & 3) >> 1; | |||
for (i1 = 0; i1 < N; i1++) | |||
if (n & 2) | |||
{ | |||
BLASLONG i, j, temp; | |||
#if defined(TRMMKERNEL) && defined(LEFT) | |||
@@ -973,8 +960,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
REFRESH_AFTER_SAVE (16, 2) | |||
#endif | |||
} | |||
i = (m & 15) >> 3; | |||
for (j = 0; j < i; j++) | |||
if (m & 8) | |||
{ | |||
FLOAT *BO; | |||
v4sf_t *rowC; | |||
@@ -1010,8 +996,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
REFRESH_AFTER_SAVE (8, 2) | |||
#endif | |||
} | |||
i = (m & 7) >> 2; | |||
for (j = 0; j < i; j++) | |||
if (m & 4) | |||
{ | |||
FLOAT *BO; | |||
v4sf_t *rowC; | |||
@@ -1044,8 +1029,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
REFRESH_AFTER_SAVE (4, 2) | |||
#endif | |||
} | |||
i = (m & 3) >> 1; | |||
for (j = 0; j < i; j++) | |||
if (m & 2) | |||
{ | |||
FLOAT *BO; | |||
BLASLONG l = 0; | |||
@@ -1081,8 +1065,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
REFRESH_AFTER_SAVE (2, 2) | |||
#endif | |||
} | |||
i = (m & 1) >> 0; | |||
for (j = 0; j < i; j++) | |||
if (m & 1) | |||
{ | |||
FLOAT *BO; | |||
BLASLONG l = 0; | |||
@@ -1120,8 +1103,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
B += k << 1; | |||
} | |||
N = (n & 1) >> 0; | |||
for (i1 = 0; i1 < N; i1++) | |||
if (n & 1) | |||
{ | |||
BLASLONG i, temp; | |||
#if defined(TRMMKERNEL) && defined(LEFT) | |||
@@ -1132,8 +1114,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
CO = C; | |||
C += ldc; | |||
AO = A; | |||
i = m; | |||
while (i >= 16) | |||
for (i = 0; i < (m >> 4); i++) | |||
{ | |||
FLOAT *BO; | |||
BLASLONG l = 0; | |||
@@ -1213,12 +1194,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
AO += temp << 4; | |||
BO += temp; | |||
CO += 16; | |||
i -= 16; | |||
#if defined(TRMMKERNEL) | |||
REFRESH_AFTER_SAVE (16, 1) | |||
#endif | |||
} | |||
while (i >= 8) | |||
if (m & 8) | |||
{ | |||
FLOAT *BO; | |||
BLASLONG l = 0; | |||
@@ -1268,12 +1248,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
AO += temp << 3; | |||
BO += temp; | |||
CO += 8; | |||
i -= 8; | |||
#if defined(TRMMKERNEL) | |||
REFRESH_AFTER_SAVE (8, 1) | |||
#endif | |||
} | |||
while (i >= 4) | |||
if (m & 4) | |||
{ | |||
FLOAT *BO; | |||
BLASLONG l = 0; | |||
@@ -1308,12 +1287,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
AO += temp << 2; | |||
BO += temp; | |||
CO += 4; | |||
i -= 4; | |||
#if defined(TRMMKERNEL) | |||
REFRESH_AFTER_SAVE (4, 1) | |||
#endif | |||
} | |||
while (i >= 2) | |||
if (m & 2) | |||
{ | |||
FLOAT *BO; | |||
BLASLONG l = 0; | |||
@@ -1342,12 +1320,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
AO += temp << 1; | |||
BO += temp; | |||
CO += 2; | |||
i -= 2; | |||
#if defined(TRMMKERNEL) | |||
REFRESH_AFTER_SAVE (2, 1) | |||
#endif | |||
} | |||
while (i >= 1) | |||
if (m & 1) | |||
{ | |||
FLOAT *BO; | |||
#if defined(TRMMKERNEL) | |||
@@ -1371,7 +1348,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
CO[0] += t * alpha; | |||
#endif | |||
CO += 1; | |||
i -= 1; | |||
#if defined(TRMMKERNEL) | |||
REFRESH_AFTER_SAVE (1, 1) | |||
#endif | |||
@@ -2436,6 +2436,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#define SBGEMM_DEFAULT_P 832 | |||
#define SBGEMM_DEFAULT_Q 1026 | |||
#define SBGEMM_DEFAULT_R 4096 | |||
#undef DGEMM_DEFAULT_UNROLL_M | |||
#undef DGEMM_DEFAULT_UNROLL_N | |||
#define DGEMM_DEFAULT_UNROLL_M 8 | |||
#define DGEMM_DEFAULT_UNROLL_N 8 | |||
#endif | |||
#if defined(SPARC) && defined(V7) | |||