improved power9 zgemm, sgemm (tag: v0.3.7)
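Summary of the change set below: KERNEL.POWER9 routes ZGEMM and ZTRMM to a new POWER9 kernel (zgemm_kernel_power9.S, added in full further down) instead of the power8 8x2 kernels; the SGEMM POWER9 kernel's vector save/restore is rewritten in terms of vs52-vs63; the SGEMM prologue and epilogue now preserve the link register so that sgemm_logic_power9.S can factor its unrolled loops into bl-called subroutines with a deeper (128-iteration) main loop; the permute-constant setup is rescheduled for instruction-level parallelism; and param.h retunes the SGEMM/ZGEMM blocking parameters. The first hunk only touches the benchmark's output header ("SIZE Flops Time").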
@@ -207,7 +207,7 @@ int main(int argc, char *argv[]){
for (i = 0; i < m * n * COMPSIZE; i++) {
c[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
fprintf(stderr, " SIZE Flops Time\n");
for (i = from; i <= to; i += step) {
@@ -6,7 +6,7 @@
STRMMKERNEL = sgemm_kernel_power9.S
DTRMMKERNEL = dgemm_kernel_power9.S
CTRMMKERNEL = ctrmm_kernel_8x4_power8.S
ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S
ZTRMMKERNEL = zgemm_kernel_power9.S
SGEMMKERNEL = sgemm_kernel_power9.S
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
@@ -38,7 +38,7 @@ CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
ZGEMMKERNEL = zgemm_kernel_8x2_power8.S
ZGEMMKERNEL = zgemm_kernel_power9.S
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c
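ZGEMM and ZTRMM now share the new zgemm_kernel_power9.S, mirroring how SGEMM and STRMM already share sgemm_kernel_power9.S above. The packing routines stay generic (ncopy_2/tcopy_2 on the N side, ncopy_8 on the M side), which matches the 8x2 unrolling declared for ZGEMM in param.h at the end of this diff.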
@@ -135,18 +135,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
std r14, 280(SP)
stxv v20, 288(SP)
stxv v21, 304(SP)
stxv v22, 320(SP)
stxv v23, 336(SP)
stxv v24, 352(SP)
stxv v25, 368(SP)
stxv v26, 384(SP)
stxv v27, 400(SP)
stxv v28, 416(SP)
stxv v29, 432(SP)
stxv v30, 448(SP)
stxv v31, 464(SP)
stxv vs52, 288(SP)
stxv vs53, 304(SP)
stxv vs54, 320(SP)
stxv vs55, 336(SP)
stxv vs56, 352(SP)
stxv vs57, 368(SP)
stxv vs58, 384(SP)
stxv vs59, 400(SP)
stxv vs60, 416(SP)
stxv vs61, 432(SP)
stxv vs62, 448(SP)
stxv vs63, 464(SP)
stfd f1, ALPHA_SP
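In the VSX register file, vector registers v0-v31 occupy VSRs 32-63, so vs52-vs63 name exactly the same storage as v20-v31. stxv and lxv take a VSX register operand, and spelling the nonvolatile vectors as vs52-vs63 makes the intended encoding explicit; the matching reload sequence in the next hunk gets the same rewrite.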
@@ -229,18 +229,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld r15, 272(SP)
ld r14, 280(SP)
lxv v20, 288(SP)
lxv v21, 304(SP)
lxv v22, 320(SP)
lxv v23, 336(SP)
lxv v24, 352(SP)
lxv v25, 368(SP)
lxv v26, 384(SP)
lxv v27, 400(SP)
lxv v28, 416(SP)
lxv v29, 432(SP)
lxv v30, 448(SP)
lxv v31, 464(SP)
lxv vs52, 288(SP)
lxv vs53, 304(SP)
lxv vs54, 320(SP)
lxv vs55, 336(SP)
lxv vs56, 352(SP)
lxv vs57, 368(SP)
lxv vs58, 384(SP)
lxv vs59, 400(SP)
lxv vs60, 416(SP)
lxv vs61, 432(SP)
lxv vs62, 448(SP)
lxv vs63, 464(SP)
addi SP, SP, STACKSIZE
blr
@@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define LOAD ld
#define STACKSIZE (512 )
#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */
#define M r3
#define N r4
#define K r5
@@ -91,7 +91,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROFCODE
addi SP, SP, -STACKSIZE
li r0, 0
mflr r0
stfd f14, 0(SP)
stfd f15, 8(SP)
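li r0, 0 gives way to mflr r0: the prologue now captures the link register (spilled to FLINK_SAVE a few lines further down) because the reworked SGEMM logic reaches its unrolled loop bodies with bl and returns with blr, and the epilogue restores it with mtlr r0 before its own blr.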
@@ -137,19 +138,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
std r14, 280(SP)
stxv v20, 288(SP)
stxv v21, 304(SP)
stxv v22, 320(SP)
stxv v23, 336(SP)
stxv v24, 352(SP)
stxv v25, 368(SP)
stxv v26, 384(SP)
stxv v27, 400(SP)
stxv v28, 416(SP)
stxv v29, 432(SP)
stxv v30, 448(SP)
stxv v31, 464(SP)
stxv vs52, 288(SP)
stxv vs53, 304(SP)
stxv vs54, 320(SP)
stxv vs55, 336(SP)
stxv vs56, 352(SP)
stxv vs57, 368(SP)
stxv vs58, 384(SP)
stxv vs59, 400(SP)
stxv vs60, 416(SP)
stxv vs61, 432(SP)
stxv vs62, 448(SP)
stxv vs63, 464(SP)
std r0, FLINK_SAVE(SP)
#if defined(TRMMKERNEL)
@@ -157,72 +158,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
slwi LDC, LDC, 2
/* cmpwi cr0, M, 0
ble .L999_H1
cmpwi cr0, N, 0
ble .L999_H1
cmpwi cr0, K, 0
ble .L999_H1
*/
/*alpha is stored in f1. convert to single and splat*/
xscvdpspn alpha_r,vs1
xxspltw alpha_r,alpha_r,0
/*load reverse permute mask for big endian
uint128 = 0xc0d0e0f08090a0b0405060700010203
*/
lis T2, perm_const2@highest
ori T2, T2, perm_const2@higher
rldicr T2, T2, 32, 31
oris T2, T2, perm_const2@h
ori T2, T2, perm_const2@l
lis T1, perm_const1@highest
lis T3, save_permute_12@highest
lis T4, save_permute_11@highest
lis T5, save_permute_22@highest
lis T6, save_permute_21@highest
ori T2, T2, perm_const2@higher
ori T1, T1, perm_const1@higher
ori T3, T3, save_permute_12@higher
ori T4, T4, save_permute_11@higher
ori T5, T5, save_permute_22@higher
ori T6, T6, save_permute_21@higher
rldicr T2, T2, 32, 31
rldicr T1, T1, 32, 31
rldicr T3, T3, 32, 31
rldicr T4, T4, 32, 31
rldicr T5, T5, 32, 31
rldicr T6, T6, 32, 31
oris T2, T2, perm_const2@h
oris T1, T1, perm_const1@h
oris T3, T3, save_permute_12@h
oris T4, T4, save_permute_11@h
oris T5, T5, save_permute_22@h
oris T6, T6, save_permute_21@h
ori T2, T2, perm_const2@l
ori T1, T1, perm_const1@l
ori T3, T3, save_permute_12@l
ori T4, T4, save_permute_11@l
ori T5, T5, save_permute_22@l
ori T6, T6, save_permute_21@l
li r0,0
mtvsrdd permute_mask,T2,T1
lis T2, save_permute_12@highest
ori T2, T2, save_permute_12@higher
rldicr T2, T2, 32, 31
oris T2, T2, save_permute_12@h
ori T2, T2, save_permute_12@l
lis T1, save_permute_11@highest
ori T1, T1, save_permute_11@higher
rldicr T1, T1, 32, 31
oris T1, T1, save_permute_11@h
ori T1, T1, save_permute_11@l
mtvsrdd save_permute_1,T2,T1
lis T2, save_permute_22@highest
ori T2, T2, save_permute_22@higher
rldicr T2, T2, 32, 31
oris T2, T2, save_permute_22@h
ori T2, T2, save_permute_22@l
lis T1, save_permute_21@highest
ori T1, T1, save_permute_21@higher
rldicr T1, T1, 32, 31
oris T1, T1, save_permute_21@h
ori T1, T1, save_permute_21@l
mtvsrdd save_permute_2,T2,T1
mtvsrdd save_permute_1,T3,T4
mtvsrdd save_permute_2,T5,T6
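The constants themselves are unchanged; what changes is the schedule. Each 64-bit permute constant is still materialized with the usual five-instruction lis/ori/rldicr/oris/ori sequence (@highest/@higher/@h/@l select the four 16-bit chunks of the value), but the sequences for the six constants are now interleaved across T1-T6 so their five-deep dependency chains overlap, and the mtvsrdd moves consume the finished pairs at the end. For reference, here is the pattern spelled out on a made-up constant (an illustration, not taken from the patch):

/* build 0x1122334455667788 in a GPR, one 16-bit chunk at a time */
lis T1, 0x1122 /* T1 = 0x11220000 */
ori T1, T1, 0x3344 /* T1 = 0x11223344 */
rldicr T1, T1, 32, 31 /* shift to the upper doubleword: 0x1122334400000000 */
oris T1, T1, 0x5566 /* T1 = 0x1122334455660000 */
ori T1, T1, 0x7788 /* T1 = 0x1122334455667788 */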
#include "sgemm_logic_power9.S" | |||
.L999: | |||
addi r3, 0, 0 | |||
lfd f14, 0(SP)
lfd f15, 8(SP)
lfd f16, 16(SP)
@@ -264,23 +247,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld r16, 264(SP)
ld r15, 272(SP)
ld r14, 280(SP)
lxv v20, 288(SP)
lxv v21, 304(SP)
lxv v22, 320(SP)
lxv v23, 336(SP)
lxv v24, 352(SP)
lxv v25, 368(SP)
lxv v26, 384(SP)
lxv v27, 400(SP)
lxv v28, 416(SP)
lxv v29, 432(SP)
lxv v30, 448(SP)
lxv v31, 464(SP)
ld r0, FLINK_SAVE(SP)
addi SP, SP, STACKSIZE
lxv vs52, 288(SP)
lxv vs53, 304(SP)
lxv vs54, 320(SP)
lxv vs55, 336(SP)
lxv vs56, 352(SP)
lxv vs57, 368(SP)
lxv vs58, 384(SP)
lxv vs59, 400(SP)
mtlr r0
lxv vs60, 416(SP)
lxv vs61, 432(SP)
lxv vs62, 448(SP)
lxv vs63, 464(SP)
addi SP, SP, STACKSIZE
blr
EPILOGUE
#endif
@@ -1,5 +1,94 @@
#define MY_ALIGN .align 3
b L8
MY_ALIGN
LSGEMM_L8x16_LMAIN_SUB:
LOAD8x16_0
mtctr L
MY_ALIGN
LSGEMM_L8x16_LOOP:
KERNEL8x16_I1_L4_2 64,32, 0,0
KERNEL8x16_I1_L4_2 64,32, 1,0
KERNEL8x16_I1_L4_2 64,32, 2,0
KERNEL8x16_I1_L4_2 64,32, 3,0
KERNEL8x16_I1_L4_2 64,32, 4,0
KERNEL8x16_I1_L4_2 64,32, 5,0
KERNEL8x16_I1_L4_2 64,32, 6,0
KERNEL8x16_I1_L4_2 64,32, 7,0
KERNEL8x16_I1_L4_2 64,32, 8,0
KERNEL8x16_I1_L4_2 64,32, 9,0
KERNEL8x16_I1_L4_2 64,32, 10,0
KERNEL8x16_I1_L4_2 64,32, 11,0
KERNEL8x16_I1_L4_2 64,32, 12,0
KERNEL8x16_I1_L4_2 64,32, 13,0
KERNEL8x16_I1_L4_2 64,32, 14,0
KERNEL8x16_I1_L4_2 64,32, 15,0
KERNEL8x16_I1_L4_2 64,32, 16,0
KERNEL8x16_I1_L4_2 64,32, 17,0
KERNEL8x16_I1_L4_2 64,32, 18,0
KERNEL8x16_I1_L4_2 64,32, 19,0
KERNEL8x16_I1_L4_2 64,32, 20,0
KERNEL8x16_I1_L4_2 64,32, 21,0
KERNEL8x16_I1_L4_2 64,32, 22,0
KERNEL8x16_I1_L4_2 64,32, 23,0
KERNEL8x16_I1_L4_2 64,32, 24,0
KERNEL8x16_I1_L4_2 64,32, 25,0
KERNEL8x16_I1_L4_2 64,32, 26,0
KERNEL8x16_I1_L4_2 64,32, 27,0
KERNEL8x16_I1_L4_2 64,32, 28,0
KERNEL8x16_I1_L4_2 64,32, 29,0
KERNEL8x16_I1_L4_2 64,32, 30,0
KERNEL8x16_I1_L4_2 64,32, 31,1
bdnz LSGEMM_L8x16_LOOP
MY_ALIGN
LSGEMM_L8x16_LOOP_END:
END8x16 0, AO, BO, 64, 32
blr
MY_ALIGN
LSGEMM_L8x16_L64_SUB:
LOAD8x16_0
KERNEL8x16_I1_L4_2 64,32, 0,0
KERNEL8x16_I1_L4_2 64,32, 1,0
KERNEL8x16_I1_L4_2 64,32, 2,0
KERNEL8x16_I1_L4_2 64,32, 3,0
KERNEL8x16_I1_L4_2 64,32, 4,0
KERNEL8x16_I1_L4_2 64,32, 5,0
KERNEL8x16_I1_L4_2 64,32, 6,0
KERNEL8x16_I1_L4_2 64,32, 7,0
KERNEL8x16_I1_L4_2 64,32, 8,0
KERNEL8x16_I1_L4_2 64,32, 9,0
KERNEL8x16_I1_L4_2 64,32, 10,0
KERNEL8x16_I1_L4_2 64,32, 11,0
KERNEL8x16_I1_L4_2 64,32, 12,0
KERNEL8x16_I1_L4_2 64,32, 13,0
KERNEL8x16_I1_L4_2 64,32, 14,0
KERNEL8x16_I1_L4_3 64,32, 15,1
blr
LSGEMM_L8x16_L32_SUB:
LOAD8x16_0
KERNEL8x16_I1_L4_2 64,32, 0,0
KERNEL8x16_I1_L4_2 64,32, 1,0
KERNEL8x16_I1_L4_2 64,32, 2,0
KERNEL8x16_I1_L4_2 64,32, 3,0
KERNEL8x16_I1_L4_2 64,32, 4,0
KERNEL8x16_I1_L4_2 64,32, 5,0
KERNEL8x16_I1_L4_2 64,32, 6,0
KERNEL8x16_I1_L4_3 64,32, 7,1
blr
LSGEMM_L8x16_L16_SUB:
LOAD8x16_0
KERNEL8x16_I1_L4_2 64,32, 0,0
KERNEL8x16_I1_L4_2 64,32, 1,0
KERNEL8x16_I1_L4_2 64,32, 2,0
KERNEL8x16_I1_L4_3 64,32, 3,1
blr
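These are the new shared loop bodies: LSGEMM_L8x16_LMAIN_SUB runs the CTR-driven main loop at 32 KERNEL8x16_I1_L4_2 macros of 4 iterations each (128 per CTR tick), while the L64/L32/L16 variants consume fixed-size remainder chunks. All of them are entered with bl and end in blr, which is why the kernel prologue now has to preserve the link register.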
L8:
#if defined(TRMMKERNEL) && !defined(LEFT)
neg TEMP_REG, OFFSET
#endif
@@ -39,98 +128,50 @@ LSGEMM_L8x16_BEGIN:
#if defined(TRMMKERNEL)
REFRESH_TEMP_BK T11,K,TEMP_REG,16,8
mr T12, T11
addi T12,T12, -1
srawi. L, T12, 6 /**(T11-1) % 64x */
srawi. L, T12, 7 /**(T11-1) % 128x */
#else
mr T12, K
addi T12,T12, -1
srawi. L, T12, 6 /**(K-1) % 64x */
srawi. L, T12, 7 /**(K-1) % 128x */
#endif
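The main-loop trip count becomes (K-1)/128 instead of (K-1)/64, matching the 128-deep subroutine above; the remainder is then peeled off below by testing the 64, 32 and 16 bits of L and branching into the fixed-size subroutines (the SUB0 path additionally special-cases a full 128-iteration chunk via two L64 calls), with the 8/4/2/1 tail handled as before.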
ZERO8x16
ble LSGEMM_L8x16_SUB0
MY_ALIGN
LSGEMM_L8x16_LOOP_START:
LOAD8x16_0 /*we already zeroed */
##OffsetA=64 OffsetB=32
addi AO,AO,2112
addi BO,BO,32
mtctr L
MY_ALIGN
LSGEMM_L8x16_LOOP:
KERNEL8x16_I1_L4_2 -2048,0, 0,0
KERNEL8x16_I1_L4_2 -2048,0, 1,0
KERNEL8x16_I1_L4_2 -2048,0, 2,0
KERNEL8x16_I1_L4_2 -2048,0, 3,0
KERNEL8x16_I1_L4_2 -2048,0, 4,0
KERNEL8x16_I1_L4_2 -2048,0, 5,0
KERNEL8x16_I1_L4_2 -2048,0, 6,0
KERNEL8x16_I1_L4_2 -2048,0, 7,0
KERNEL8x16_I1_L4_2 -2048,0, 8,0
KERNEL8x16_I1_L4_2 -2048,0, 9,0
KERNEL8x16_I1_L4_2 -2048,0, 10,0
KERNEL8x16_I1_L4_2 -2048,0, 11,0
KERNEL8x16_I1_L4_2 -2048,0, 12,0
KERNEL8x16_I1_L4_2 -2048,0, 13,0
KERNEL8x16_I1_L4_2 -2048,0, 14,0
KERNEL8x16_I1_L4_2 -2048,0, 15,1
bdnz LSGEMM_L8x16_LOOP
MY_ALIGN
LSGEMM_L8x16_LOOP_END:
END8x16 0, AO, BO, -2048, 0
b LSGEMM_L8x16_SUB1
bl LSGEMM_L8x16_LMAIN_SUB
andi. L, T12, 127
ble LSGEMM_L8x16_SAVE
b LSGEMM_L8x16_SUB2
MY_ALIGN
LSGEMM_L8x16_SUB0:
#if defined(TRMMKERNEL)
andi. L, T11, 127
andi. L, T11, 255
cmpwi T11,128
#else
andi. L, K, 127
andi. L, K, 255
cmpwi K,128
#endif
b LSGEMM_L8x16_SUB2
MY_ALIGN
LSGEMM_L8x16_SUB1:
#if defined(TRMMKERNEL)
andi. L, T12, 63
#else
andi. L, T12, 63
#endif
ble LSGEMM_L8x16_SAVE
bne LSGEMM_L8x16_SUB2
MY_ALIGN
LSGEMM_L8x16_SUB2_128:
bl LSGEMM_L8x16_L64_SUB
bl LSGEMM_L8x16_L64_SUB
b LSGEMM_L8x16_SAVE
MY_ALIGN
LSGEMM_L8x16_SUB2:
srawi. T10,L, 5
andi. T10,L,64
ble LSGEMM_L8x16_SUB2_32
bl LSGEMM_L8x16_L64_SUB
MY_ALIGN
LSGEMM_L8x16_SUB2_32:
andi. T10,L, 32
ble LSGEMM_L8x16_SUB2_16
mtctr T10
MY_ALIGN
LSGEMM_L8x16_SUB2_LOOP:
LOAD8x16_0
KERNEL8x16_I1_L4_2 64,32, 0,0
KERNEL8x16_I1_L4_2 64,32, 1,0
KERNEL8x16_I1_L4_2 64,32, 2,0
KERNEL8x16_I1_L4_2 64,32, 3,0
KERNEL8x16_I1_L4_2 64,32, 4,0
KERNEL8x16_I1_L4_2 64,32, 5,0
KERNEL8x16_I1_L4_2 64,32, 6,0
KERNEL8x16_I1_L4_3 64,32, 7,1
bdnz LSGEMM_L8x16_SUB2_LOOP
MY_ALIGN
bl LSGEMM_L8x16_L32_SUB
MY_ALIGN
LSGEMM_L8x16_SUB2_16:
andi. T10,L, 16
ble LSGEMM_L8x16_SUB2_8
LOAD8x16_0
KERNEL8x16_I1_L4_2 64,32, 0,0
KERNEL8x16_I1_L4_2 64,32, 1,0
KERNEL8x16_I1_L4_2 64,32, 2,0
KERNEL8x16_I1_L4_3 64,32, 3,1
bl LSGEMM_L8x16_L16_SUB
MY_ALIGN
LSGEMM_L8x16_SUB2_8:
andi. T10,L, 8
@@ -155,8 +196,7 @@ LSGEMM_L8x16_SUB2_1:
andi. T10,L, 1
ble LSGEMM_L8x16_SAVE
KERNEL8x16 0
# addic. L, L, -1
# bgt LSGEMM_L8x16_SUB2
MY_ALIGN
LSGEMM_L8x16_SAVE:
@@ -0,0 +1,245 @@
/***************************************************************************
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
#include "def_vsx.h"
#define LOAD ld
#define STACKSIZE 512
#define FZERO 312+192(SP)
#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */
#define M r3
#define N r4
#define K r5
#define A r8
#define B r9
#define C r10
#define LDC r6
#define OFFSET r7
#define o0 0
#define alpha_r vs30
#define alpha_i vs31
#define VECSAVE r11
#define FRAMEPOINTER r12
#define T10 r14
#define L r15
#define T8 r16
#define T5 r17
#define T2 r19
#define TEMP_REG r20
#define T6 r21
#define I r22
#define J r23
#define AO r24
#define BO r25
#define CO r26
#define T7 r27
#define T3 r28
#define T4 r29
#define PRE r30
#define T1 r31
#ifndef NEEDPARAM
PROLOGUE
PROFCODE
mr FRAMEPOINTER, SP
addi SP, SP, -STACKSIZE
mflr r0
stfd f14, 0(SP)
stfd f15, 8(SP)
stfd f16, 16(SP)
stfd f17, 24(SP)
stfd f18, 32(SP)
stfd f19, 40(SP)
stfd f20, 48(SP)
stfd f21, 56(SP)
stfd f22, 64(SP)
stfd f23, 72(SP)
stfd f24, 80(SP)
stfd f25, 88(SP)
stfd f26, 96(SP)
stfd f27, 104(SP)
stfd f28, 112(SP)
stfd f29, 120(SP)
stfd f30, 128(SP)
stfd f31, 136(SP)
xxspltd alpha_r,vs1,0 /*copy from register f1 */
xxspltd alpha_i,vs2,0 /*copy from register f2 */
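alpha arrives in f1/f2, i.e. in doubleword 0 of vs1/vs2; xxspltd replicates the real and imaginary parts into both doubleword lanes of alpha_r and alpha_i so the vector multiply macros can apply them lane-wise.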
std r31, 144(SP)
std r30, 152(SP)
std r29, 160(SP)
std r28, 168(SP)
std r27, 176(SP)
std r26, 184(SP)
std r25, 192(SP)
std r24, 200(SP)
std r23, 208(SP)
std r22, 216(SP)
std r21, 224(SP)
std r20, 232(SP)
std r19, 240(SP)
std r18, 248(SP)
std r17, 256(SP)
std r16, 264(SP)
std r15, 272(SP)
std r14, 280(SP)
stxv vs52, 288(SP)
stxv vs53, 304(SP)
stxv vs54, 320(SP)
stxv vs55, 336(SP)
stxv vs56, 352(SP)
stxv vs57, 368(SP)
stxv vs58, 384(SP)
stxv vs59, 400(SP)
stxv vs60, 416(SP)
stxv vs61, 432(SP)
stxv vs62, 448(SP)
stxv vs63, 464(SP)
std r0, FLINK_SAVE(SP)
#ifdef linux
ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
#endif
#ifdef TRMMKERNEL
#if defined(linux) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
#endif
#endif
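LDC (and OFFSET in the TRMM build) are loaded from the caller's frame because they do not fit in registers: in the 64-bit ELF ABI the first eight integer/pointer arguments travel in r3-r10, and with M, N, K, alpha_r, alpha_i, A, B and C ahead of it, ldc is argument nine. FRAMEPOINTER holds the caller's SP captured in the prologue.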
#include "zgemm_macros_power9.S" | |||
slwi LDC, LDC, ZBASE_SHIFT | |||
li PRE, 512 | |||
li r0, 0 | |||
#if defined(CC) || defined(CR) || defined(RC) || defined(RR) | |||
/*negate for this case as we will use addition -1*(a+b) */ | |||
xvnegdp alpha_r,alpha_r | |||
xvnegdp alpha_i,alpha_i | |||
#endif | |||
.align 4 | |||
#include "zgemm_logic_power9.S" | |||
L999: | |||
lfd f14, 0(SP) | |||
lfd f15, 8(SP) | |||
lfd f16, 16(SP) | |||
lfd f17, 24(SP) | |||
lfd f18, 32(SP) | |||
lfd f19, 40(SP) | |||
lfd f20, 48(SP) | |||
lfd f21, 56(SP) | |||
lfd f22, 64(SP) | |||
lfd f23, 72(SP) | |||
lfd f24, 80(SP) | |||
lfd f25, 88(SP) | |||
lfd f26, 96(SP) | |||
lfd f27, 104(SP) | |||
lfd f28, 112(SP) | |||
lfd f29, 120(SP) | |||
lfd f30, 128(SP) | |||
lfd f31, 136(SP) | |||
ld r31, 144(SP) | |||
ld r30, 152(SP) | |||
ld r29, 160(SP) | |||
ld r28, 168(SP) | |||
ld r27, 176(SP) | |||
ld r26, 184(SP) | |||
ld r25, 192(SP) | |||
ld r24, 200(SP) | |||
ld r23, 208(SP) | |||
ld r22, 216(SP) | |||
ld r21, 224(SP) | |||
ld r20, 232(SP) | |||
ld r19, 240(SP) | |||
ld r18, 248(SP) | |||
ld r17, 256(SP) | |||
ld r16, 264(SP) | |||
ld r15, 272(SP) | |||
ld r14, 280(SP) | |||
ld r0, FLINK_SAVE(SP) | |||
lxv vs52, 288(SP) | |||
lxv vs53, 304(SP) | |||
lxv vs54, 320(SP) | |||
lxv vs55, 336(SP) | |||
lxv vs56, 352(SP) | |||
lxv vs57, 368(SP) | |||
lxv vs58, 384(SP) | |||
lxv vs59, 400(SP) | |||
mtlr r0 | |||
lxv vs60, 416(SP) | |||
lxv vs61, 432(SP) | |||
lxv vs62, 448(SP) | |||
lxv vs63, 464(SP) | |||
addi SP, SP, STACKSIZE | |||
blr | |||
EPILOGUE | |||
#endif |
@@ -2248,15 +2248,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ZGEMM_DEFAULT_UNROLL_M 8
#define ZGEMM_DEFAULT_UNROLL_N 2
#define SGEMM_DEFAULT_P 640
#define SGEMM_DEFAULT_P 832
#define DGEMM_DEFAULT_P 128
#define CGEMM_DEFAULT_P 640
#define ZGEMM_DEFAULT_P 320
#define ZGEMM_DEFAULT_P 256
#define SGEMM_DEFAULT_Q 1408
#define SGEMM_DEFAULT_Q 1025
#define DGEMM_DEFAULT_Q 384
#define CGEMM_DEFAULT_Q 640
#define ZGEMM_DEFAULT_Q 640
#define ZGEMM_DEFAULT_Q 1026
#define SYMV_P 8
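For scale (an illustration, not part of the patch): the packed A block is roughly P x Q elements, so ZGEMM's block grows from 320 x 640 x 16 B ≈ 3.1 MiB to 256 x 1026 x 16 B ≈ 4.0 MiB, while SGEMM trades a longer Q for a wider P (640 x 1408 x 4 B ≈ 3.4 MiB versus 832 x 1025 x 4 B ≈ 3.3 MiB), presumably retuned against POWER9's cache hierarchy.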