@@ -332,6 +332,13 @@ typedef int blasint; | |||
#endif | |||
#endif | |||
/*
 * POWER8 builds: provide a default YIELDING if nothing has defined it yet.
 * The macro expands to a single volatile inline-asm statement that issues
 * eight `nop` instructions. The expansion already ends with ';'.
 * NOTE(review): presumably this is the pause used inside spin-wait loops --
 * confirm against the call sites.
 */
#ifdef POWER8 | |||
#ifndef YIELDING | |||
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n"); | |||
#endif | |||
#endif | |||
/* | |||
#ifdef PILEDRIVER | |||
#ifndef YIELDING | |||
@@ -33,6 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
* LAPACK-TEST : OK | |||
**************************************************************************************/ | |||
#define MY_ALIGN .align 3 | |||
srawi. J, N, 2 | |||
ble LDGEMM_L4_END | |||
@@ -53,7 +54,7 @@ LDGEMM_L4_BEGIN: | |||
srawi. I, M, 4 | |||
ble LDGEMM_L4x16_END | |||
.align 4 | |||
MY_ALIGN | |||
LDGEMM_L4x16_BEGIN_FIRST: | |||
li L, -128 | |||
@@ -90,7 +91,7 @@ LDGEMM_L4x16_BEGIN_FIRST: | |||
cmpwi cr0, L, 1 | |||
ble LDGEMM_L4x16_SUB4_FIRST | |||
.align 4 | |||
MY_ALIGN | |||
LDGEMM_L4x16_LOOP_START_FIRST: | |||
li T2, 512 | |||
@@ -115,7 +116,7 @@ LDGEMM_L4x16_LOOP_START_FIRST: | |||
ble LDGEMM_L4x16_LOOP_END_FIRST | |||
mtctr L | |||
.align 4 | |||
MY_ALIGN | |||
LDGEMM_L4x16_LOOP_FIRST: | |||
@@ -132,7 +133,7 @@ LDGEMM_L4x16_LOOP_FIRST: | |||
bdnz LDGEMM_L4x16_LOOP_FIRST | |||
.align 4 | |||
MY_ALIGN | |||
LDGEMM_L4x16_LOOP_END_FIRST: | |||
@@ -175,7 +176,7 @@ LDGEMM_L4x16_SUB2_FIRST: | |||
addic. L, L, -1 | |||
bgt LDGEMM_L4x16_SUB2_FIRST | |||
.align 4 | |||
MY_ALIGN | |||
LDGEMM_L4x16_SAVE_FIRST: | |||
SAVE4x16 | |||
@@ -185,7 +186,8 @@ LDGEMM_L4x16_SAVE_FIRST: | |||
LDGEMM_L4x16_END_FIRST: | |||
.align 4 | |||
MY_ALIGN | |||
LDGEMM_L4x16_BEGIN: | |||
li L, -128 | |||
@@ -222,7 +224,8 @@ LDGEMM_L4x16_BEGIN: | |||
cmpwi cr0, L, 1 | |||
ble- LDGEMM_L4x16_SUB4 | |||
.align 4 | |||
MY_ALIGN | |||
LDGEMM_L4x16_LOOP_START: | |||
li o40, 40 | |||
@@ -239,20 +242,19 @@ LDGEMM_L4x16_LOOP_START: | |||
ble- LDGEMM_L4x16_LOOP_END | |||
mtctr L | |||
.align 4 | |||
MY_ALIGN | |||
LDGEMM_L4x16_LOOP: | |||
dcbt AO, PRE | |||
KERNEL4x16_L1 | |||
dcbt AO, PRE | |||
// addic. L, L, -1 | |||
KERNEL4x16_L2 | |||
bdnz+ LDGEMM_L4x16_LOOP | |||
.align 4 | |||
MY_ALIGN | |||
LDGEMM_L4x16_LOOP_END: | |||
@@ -261,6 +263,8 @@ LDGEMM_L4x16_LOOP_END: | |||
b LDGEMM_L4x16_SUB1 | |||
MY_ALIGN | |||
LDGEMM_L4x16_SUB4: | |||
KERNEL4x16_SUBI1 | |||
@@ -268,6 +272,8 @@ LDGEMM_L4x16_SUB4: | |||
b LDGEMM_L4x16_SUB1 | |||
MY_ALIGN | |||
LDGEMM_L4x16_SUB0: | |||
andi. L, K, 1 | |||
@@ -278,11 +284,15 @@ LDGEMM_L4x16_SUB0: | |||
ble LDGEMM_L4x16_SAVE | |||
b LDGEMM_L4x16_SUB2 | |||
MY_ALIGN | |||
LDGEMM_L4x16_SUB1: | |||
andi. L, K, 1 | |||
ble LDGEMM_L4x16_SAVE | |||
MY_ALIGN | |||
LDGEMM_L4x16_SUB2: | |||
KERNEL4x16_SUB1 | |||
@@ -290,7 +300,8 @@ LDGEMM_L4x16_SUB2: | |||
addic. L, L, -1 | |||
bgt LDGEMM_L4x16_SUB2 | |||
.align 4 | |||
MY_ALIGN | |||
LDGEMM_L4x16_SAVE: | |||
SAVE4x16 | |||
@@ -334,7 +345,7 @@ LDGEMM_L4x8_LOOP_START: | |||
addic. L, L, -2 | |||
ble LDGEMM_L4x8_LOOP_END | |||
.align 5 | |||
MY_ALIGN | |||
LDGEMM_L4x8_LOOP: | |||
@@ -441,7 +452,7 @@ LDGEMM_L4x4_LOOP_START: | |||
addic. L, L, -2 | |||
ble LDGEMM_L4x4_LOOP_END | |||
.align 5 | |||
MY_ALIGN | |||
LDGEMM_L4x4_LOOP: | |||
@@ -543,7 +554,7 @@ LDGEMM_L4x2_LOOP_START: | |||
addic. L, L, -2 | |||
ble LDGEMM_L4x2_LOOP_END | |||
.align 5 | |||
MY_ALIGN | |||
LDGEMM_L4x2_LOOP: | |||
@@ -643,7 +654,7 @@ LDGEMM_L4x1_LOOP_START: | |||
addic. L, L, -2 | |||
ble LDGEMM_L4x1_LOOP_END | |||
.align 5 | |||
MY_ALIGN | |||
LDGEMM_L4x1_LOOP: | |||
@@ -778,7 +789,7 @@ LDGEMM_L2x16_LOOP_START: | |||
addic. L, L, -2 | |||
ble LDGEMM_L2x16_LOOP_END | |||
.align 5 | |||
MY_ALIGN | |||
LDGEMM_L2x16_LOOP: | |||
@@ -907,7 +918,7 @@ LDGEMM_L2x8_LOOP_START: | |||
addic. L, L, -2 | |||
ble LDGEMM_L2x8_LOOP_END | |||
.align 5 | |||
MY_ALIGN | |||
LDGEMM_L2x8_LOOP: | |||
@@ -1011,7 +1022,7 @@ LDGEMM_L2x4_LOOP_START: | |||
addic. L, L, -2 | |||
ble LDGEMM_L2x4_LOOP_END | |||
.align 5 | |||
MY_ALIGN | |||
LDGEMM_L2x4_LOOP: | |||
@@ -1111,7 +1122,7 @@ LDGEMM_L2x2_LOOP_START: | |||
addic. L, L, -2 | |||
ble LDGEMM_L2x2_LOOP_END | |||
.align 5 | |||
MY_ALIGN | |||
LDGEMM_L2x2_LOOP: | |||
@@ -1211,7 +1222,7 @@ LDGEMM_L2x1_LOOP_START: | |||
addic. L, L, -2 | |||
ble LDGEMM_L2x1_LOOP_END | |||
.align 5 | |||
MY_ALIGN | |||
LDGEMM_L2x1_LOOP: | |||
@@ -1331,7 +1342,7 @@ LDGEMM_L1x16_LOOP_START: | |||
addic. L, L, -2 | |||
ble LDGEMM_L1x16_LOOP_END | |||
.align 5 | |||
MY_ALIGN | |||
LDGEMM_L1x16_LOOP: | |||
@@ -1460,7 +1471,7 @@ LDGEMM_L1x8_LOOP_START: | |||
addic. L, L, -2 | |||
ble LDGEMM_L1x8_LOOP_END | |||
.align 5 | |||
MY_ALIGN | |||
LDGEMM_L1x8_LOOP: | |||
@@ -1564,7 +1575,7 @@ LDGEMM_L1x4_LOOP_START: | |||
addic. L, L, -2 | |||
ble LDGEMM_L1x4_LOOP_END | |||
.align 5 | |||
MY_ALIGN | |||
LDGEMM_L1x4_LOOP: | |||
@@ -1664,7 +1675,7 @@ LDGEMM_L1x2_LOOP_START: | |||
addic. L, L, -2 | |||
ble LDGEMM_L1x2_LOOP_END | |||
.align 5 | |||
MY_ALIGN | |||
LDGEMM_L1x2_LOOP: | |||
@@ -1764,7 +1775,7 @@ LDGEMM_L1x1_LOOP_START: | |||
addic. L, L, -2 | |||
ble LDGEMM_L1x1_LOOP_END | |||
.align 5 | |||
MY_ALIGN | |||
LDGEMM_L1x1_LOOP: | |||
@@ -127,6 +127,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
xxpermdi vs62, vs7, vs15, 3 | |||
xxpermdi vs63, vs23, vs31, 3 | |||
dcbt BO, PREB | |||
stxvd2x vs32, o0, BO | |||
stxvd2x vs33, o16, BO | |||
@@ -138,6 +139,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
stxvd2x vs39, o112, BO | |||
addi BO, BO, 128 | |||
dcbt BO, PREB | |||
stxvd2x vs40, o0, BO | |||
stxvd2x vs41, o16, BO | |||
stxvd2x vs42, o32, BO | |||
@@ -148,6 +151,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
stxvd2x vs47, o112, BO | |||
addi BO, BO, 128 | |||
dcbt BO, PREB | |||
stxvd2x vs48, o0, BO | |||
stxvd2x vs49, o16, BO | |||
stxvd2x vs50, o32, BO | |||
@@ -158,6 +163,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
stxvd2x vs55, o112, BO | |||
addi BO, BO, 128 | |||
dcbt BO, PREB | |||
stxvd2x vs56, o0, BO | |||
stxvd2x vs57, o16, BO | |||
stxvd2x vs58, o32, BO | |||
@@ -170,7 +170,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
add B2, B2, B | |||
add B1, B1, B | |||
li PREA, 256 | |||
li PREA, 384 | |||
addi PREB, M16, 128 | |||
li o8, 8 | |||
@@ -52,31 +52,31 @@ DCOPYT_L4_BEGIN: | |||
ble DCOPYT_L4x8_BEGIN | |||
mr BO, B16 | |||
addi T2, M16, 384 | |||
mtctr J | |||
.align 5 | |||
DCOPYT_L4x16_LOOP: | |||
/* | |||
addi T1, PREB, 128 | |||
addi T2, PREB, 256 | |||
*/ | |||
addi T1, M16, 256 | |||
dcbt A0, PREA | |||
dcbt A1, PREA | |||
dcbt A2, PREA | |||
dcbt A3, PREA | |||
/* | |||
dcbtst BO, M16 | |||
dcbtst BO, PREB | |||
dcbtst BO, T1 | |||
dcbtst BO, T2 | |||
*/ | |||
dcbt BO, M16 | |||
dcbt BO, PREB | |||
dcbt BO, T1 | |||
dcbt BO, T2 | |||
COPY_4x16 | |||
add BO, BO, M16 | |||
addic. J, J, -1 | |||
bgt DCOPYT_L4x16_LOOP | |||
// addic. J, J, -1 | |||
bdnz+ DCOPYT_L4x16_LOOP | |||
DCOPYT_L4x8_BEGIN: | |||
@@ -46,52 +46,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
lxvd2x vs35, o48, A0 | |||
addi A0, A0, 64 | |||
lxvd2x vs36, o0, A0 | |||
lxvd2x vs37, o16, A0 | |||
lxvd2x vs38, o32, A0 | |||
lxvd2x vs39, o48, A0 | |||
addi A0, A0, 64 | |||
lxvd2x vs40, o0, A1 | |||
lxvd2x vs41, o16, A1 | |||
lxvd2x vs42, o32, A1 | |||
lxvd2x vs43, o48, A1 | |||
addi A1, A1, 64 | |||
lxvd2x vs44, o0, A1 | |||
lxvd2x vs45, o16, A1 | |||
lxvd2x vs46, o32, A1 | |||
lxvd2x vs47, o48, A1 | |||
addi A1, A1, 64 | |||
lxvd2x vs48, o0, A2 | |||
lxvd2x vs49, o16, A2 | |||
lxvd2x vs50, o32, A2 | |||
lxvd2x vs51, o48, A2 | |||
addi A2, A2, 64 | |||
lxvd2x vs52, o0, A2 | |||
lxvd2x vs53, o16, A2 | |||
lxvd2x vs54, o32, A2 | |||
lxvd2x vs55, o48, A2 | |||
addi A2, A2, 64 | |||
lxvd2x vs56, o0, A3 | |||
lxvd2x vs57, o16, A3 | |||
lxvd2x vs58, o32, A3 | |||
lxvd2x vs59, o48, A3 | |||
addi A3, A3, 64 | |||
lxvd2x vs36, o0, A0 | |||
lxvd2x vs37, o16, A0 | |||
lxvd2x vs38, o32, A0 | |||
lxvd2x vs39, o48, A0 | |||
addi A0, A0, 64 | |||
lxvd2x vs44, o0, A1 | |||
lxvd2x vs45, o16, A1 | |||
lxvd2x vs46, o32, A1 | |||
lxvd2x vs47, o48, A1 | |||
addi A1, A1, 64 | |||
lxvd2x vs52, o0, A2 | |||
lxvd2x vs53, o16, A2 | |||
lxvd2x vs54, o32, A2 | |||
lxvd2x vs55, o48, A2 | |||
addi A2, A2, 64 | |||
lxvd2x vs60, o0, A3 | |||
lxvd2x vs61, o16, A3 | |||
lxvd2x vs62, o32, A3 | |||
lxvd2x vs63, o48, A3 | |||
addi A3, A3, 64 | |||
mr T1, BO | |||
stxvd2x vs32, o0, T1 | |||
@@ -173,10 +173,17 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, | |||
blocking = (mn / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); | |||
if (blocking > GEMM_Q) blocking = GEMM_Q; | |||
if (blocking <= GEMM_UNROLL_N * 2) { | |||
/*
 * Small-problem cutoff: when `blocking` is at or below the threshold, the
 * whole request is handed to GETF2 and its result is returned immediately,
 * so nothing after this point runs.
 * The threshold is GEMM_UNROLL_N on POWER8 builds and GEMM_UNROLL_N * 2 on
 * every other build.
 * NOTE(review): GETF2 is presumably the unblocked factorization -- confirm.
 */
#ifdef POWER8 | |||
if (blocking <= GEMM_UNROLL_N) { | |||
info = GETF2(args, NULL, range_n, sa, sb, 0); | |||
return info; | |||
} | |||
#else | |||
if (blocking <= GEMM_UNROLL_N*2) { | |||
info = GETF2(args, NULL, range_n, sa, sb, 0); | |||
return info; | |||
} | |||
#endif | |||
sbb = (FLOAT *)((((BLASULONG)(sb + blocking * blocking * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); | |||