
optimized dgemm and dgetrf for POWER8

tags/v0.2.19^2
Werner Saar, 9 years ago
commit 6a2bde7a2d
7 changed files with 90 additions and 62 deletions:

  1. common.h (+7, -0)
  2. kernel/power/dgemm_logic_16x4_power8.S (+37, -26)
  3. kernel/power/dgemm_ncopy_macros_4_power8.S (+7, -0)
  4. kernel/power/dgemm_tcopy_16_power8.S (+1, -1)
  5. kernel/power/dgemm_tcopy_logic_16_power8.S (+12, -12)
  6. kernel/power/dgemm_tcopy_macros_16_power8.S (+18, -22)
  7. lapack/getrf/getrf_parallel_omp.c (+8, -1)

common.h (+7, -0)

@@ -332,6 +332,13 @@ typedef int blasint;
 #endif
 #endif

+#ifdef POWER8
+#ifndef YIELDING
+#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n");
+#endif
+#endif
+
 /*
 #ifdef PILEDRIVER
 #ifndef YIELDING
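This defines the POWER8 backoff used by OpenBLAS's spin-wait loops: instead of the generic default (which falls back to sched_yield()), a waiting thread now burns a short burst of nops and re-polls, avoiding a kernel round trip. A minimal sketch of how a YIELDING-style macro is consumed (the wait_for_work loop is hypothetical, not the actual blas_server code):

    #include <stdatomic.h>

    #define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n")

    /* Spin until another thread publishes 'ready', backing off with
     * YIELDING between polls instead of a syscall-based yield. */
    static void wait_for_work(atomic_int *ready) {
        while (!atomic_load_explicit(ready, memory_order_acquire))
            YIELDING;
    }

On a heavily SMT core like POWER8, the nop burst keeps the waiter off the issue ports while staying responsive to the flag changing.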


kernel/power/dgemm_logic_16x4_power8.S (+37, -26)

@@ -33,6 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 * LAPACK-TEST : OK
 **************************************************************************************/

+#define MY_ALIGN .align 3

 srawi. J, N, 2
 ble LDGEMM_L4_END

@@ -53,7 +54,7 @@ LDGEMM_L4_BEGIN:
 srawi. I, M, 4
 ble LDGEMM_L4x16_END

-.align 4
+MY_ALIGN
 LDGEMM_L4x16_BEGIN_FIRST:

 li L, -128

@@ -90,7 +91,7 @@ LDGEMM_L4x16_BEGIN_FIRST:
 cmpwi cr0, L, 1
 ble LDGEMM_L4x16_SUB4_FIRST

-.align 4
+MY_ALIGN
 LDGEMM_L4x16_LOOP_START_FIRST:

 li T2, 512

@@ -115,7 +116,7 @@ LDGEMM_L4x16_LOOP_START_FIRST:
 ble LDGEMM_L4x16_LOOP_END_FIRST
 mtctr L

-.align 4
+MY_ALIGN

 LDGEMM_L4x16_LOOP_FIRST:

@@ -132,7 +133,7 @@ LDGEMM_L4x16_LOOP_FIRST:

 bdnz LDGEMM_L4x16_LOOP_FIRST

-.align 4
+MY_ALIGN

 LDGEMM_L4x16_LOOP_END_FIRST:

@@ -175,7 +176,7 @@ LDGEMM_L4x16_SUB2_FIRST:
 addic. L, L, -1
 bgt LDGEMM_L4x16_SUB2_FIRST

-.align 4
+MY_ALIGN
 LDGEMM_L4x16_SAVE_FIRST:

 SAVE4x16

@@ -185,7 +186,8 @@ LDGEMM_L4x16_SAVE_FIRST:

 LDGEMM_L4x16_END_FIRST:

-.align 4
+MY_ALIGN
+
 LDGEMM_L4x16_BEGIN:

 li L, -128

@@ -222,7 +224,8 @@ LDGEMM_L4x16_BEGIN:
 cmpwi cr0, L, 1
 ble- LDGEMM_L4x16_SUB4

-.align 4
+MY_ALIGN
+
 LDGEMM_L4x16_LOOP_START:

 li o40, 40

@@ -239,20 +242,19 @@ LDGEMM_L4x16_LOOP_START:
 ble- LDGEMM_L4x16_LOOP_END
 mtctr L

-.align 4
+MY_ALIGN

 LDGEMM_L4x16_LOOP:

 dcbt AO, PRE
 KERNEL4x16_L1
 dcbt AO, PRE
-// addic. L, L, -1
 KERNEL4x16_L2

 bdnz+ LDGEMM_L4x16_LOOP

-.align 4
-
+MY_ALIGN

 LDGEMM_L4x16_LOOP_END:

@@ -261,6 +263,8 @@ LDGEMM_L4x16_LOOP_END:

 b LDGEMM_L4x16_SUB1

+MY_ALIGN
+
 LDGEMM_L4x16_SUB4:

 KERNEL4x16_SUBI1

@@ -268,6 +272,8 @@ LDGEMM_L4x16_SUB4:

 b LDGEMM_L4x16_SUB1

+MY_ALIGN
+
 LDGEMM_L4x16_SUB0:

 andi. L, K, 1

@@ -278,11 +284,15 @@ LDGEMM_L4x16_SUB0:
 ble LDGEMM_L4x16_SAVE
 b LDGEMM_L4x16_SUB2

+MY_ALIGN
+
 LDGEMM_L4x16_SUB1:

 andi. L, K, 1
 ble LDGEMM_L4x16_SAVE

+MY_ALIGN
+
 LDGEMM_L4x16_SUB2:

 KERNEL4x16_SUB1

@@ -290,7 +300,8 @@ LDGEMM_L4x16_SUB2:
 addic. L, L, -1
 bgt LDGEMM_L4x16_SUB2

-.align 4
+MY_ALIGN
+
 LDGEMM_L4x16_SAVE:

 SAVE4x16

@@ -334,7 +345,7 @@ LDGEMM_L4x8_LOOP_START:
 addic. L, L, -2
 ble LDGEMM_L4x8_LOOP_END

-.align 5
+MY_ALIGN

 LDGEMM_L4x8_LOOP:

@@ -441,7 +452,7 @@ LDGEMM_L4x4_LOOP_START:
 addic. L, L, -2
 ble LDGEMM_L4x4_LOOP_END

-.align 5
+MY_ALIGN

 LDGEMM_L4x4_LOOP:

@@ -543,7 +554,7 @@ LDGEMM_L4x2_LOOP_START:
 addic. L, L, -2
 ble LDGEMM_L4x2_LOOP_END

-.align 5
+MY_ALIGN

 LDGEMM_L4x2_LOOP:

@@ -643,7 +654,7 @@ LDGEMM_L4x1_LOOP_START:
 addic. L, L, -2
 ble LDGEMM_L4x1_LOOP_END

-.align 5
+MY_ALIGN

 LDGEMM_L4x1_LOOP:

@@ -778,7 +789,7 @@ LDGEMM_L2x16_LOOP_START:
 addic. L, L, -2
 ble LDGEMM_L2x16_LOOP_END

-.align 5
+MY_ALIGN

 LDGEMM_L2x16_LOOP:

@@ -907,7 +918,7 @@ LDGEMM_L2x8_LOOP_START:
 addic. L, L, -2
 ble LDGEMM_L2x8_LOOP_END

-.align 5
+MY_ALIGN

 LDGEMM_L2x8_LOOP:

@@ -1011,7 +1022,7 @@ LDGEMM_L2x4_LOOP_START:
 addic. L, L, -2
 ble LDGEMM_L2x4_LOOP_END

-.align 5
+MY_ALIGN

 LDGEMM_L2x4_LOOP:

@@ -1111,7 +1122,7 @@ LDGEMM_L2x2_LOOP_START:
 addic. L, L, -2
 ble LDGEMM_L2x2_LOOP_END

-.align 5
+MY_ALIGN

 LDGEMM_L2x2_LOOP:

@@ -1211,7 +1222,7 @@ LDGEMM_L2x1_LOOP_START:
 addic. L, L, -2
 ble LDGEMM_L2x1_LOOP_END

-.align 5
+MY_ALIGN

 LDGEMM_L2x1_LOOP:

@@ -1331,7 +1342,7 @@ LDGEMM_L1x16_LOOP_START:
 addic. L, L, -2
 ble LDGEMM_L1x16_LOOP_END

-.align 5
+MY_ALIGN

 LDGEMM_L1x16_LOOP:

@@ -1460,7 +1471,7 @@ LDGEMM_L1x8_LOOP_START:
 addic. L, L, -2
 ble LDGEMM_L1x8_LOOP_END

-.align 5
+MY_ALIGN

 LDGEMM_L1x8_LOOP:

@@ -1564,7 +1575,7 @@ LDGEMM_L1x4_LOOP_START:
 addic. L, L, -2
 ble LDGEMM_L1x4_LOOP_END

-.align 5
+MY_ALIGN

 LDGEMM_L1x4_LOOP:

@@ -1664,7 +1675,7 @@ LDGEMM_L1x2_LOOP_START:
 addic. L, L, -2
 ble LDGEMM_L1x2_LOOP_END

-.align 5
+MY_ALIGN

 LDGEMM_L1x2_LOOP:

@@ -1764,7 +1775,7 @@ LDGEMM_L1x1_LOOP_START:
 addic. L, L, -2
 ble LDGEMM_L1x1_LOOP_END

-.align 5
+MY_ALIGN

 LDGEMM_L1x1_LOOP:
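Every .align 4 / .align 5 above becomes MY_ALIGN, i.e. .align 3. On PowerPC targets GAS treats ".align n" as alignment to a 2^n-byte boundary, so loop heads are now padded to 8 bytes instead of 16 or 32, shrinking the nop padding between kernels. A small C sketch of the padding arithmetic (align_up is illustrative, not part of this code):

    #include <stdint.h>
    #include <stdio.h>

    /* Round addr up to a 2^n-byte boundary, as the assembler does
     * when it pads for ".align n" on PowerPC targets. */
    static uint64_t align_up(uint64_t addr, unsigned n) {
        uint64_t mask = (1ULL << n) - 1;
        return (addr + mask) & ~mask;
    }

    int main(void) {
        /* .align 5 (old) pads to 32 bytes; .align 3 (MY_ALIGN) to 8. */
        printf("%llu\n", (unsigned long long)align_up(100, 5)); /* 128 */
        printf("%llu\n", (unsigned long long)align_up(100, 3)); /* 104 */
        return 0;
    }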




kernel/power/dgemm_ncopy_macros_4_power8.S (+7, -0)

@@ -127,6 +127,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 xxpermdi vs62, vs7, vs15, 3
 xxpermdi vs63, vs23, vs31, 3

+dcbt BO, PREB

 stxvd2x vs32, o0, BO
 stxvd2x vs33, o16, BO

@@ -138,6 +139,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 stxvd2x vs39, o112, BO
 addi BO, BO, 128

+dcbt BO, PREB
+
 stxvd2x vs40, o0, BO
 stxvd2x vs41, o16, BO
 stxvd2x vs42, o32, BO

@@ -148,6 +151,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 stxvd2x vs47, o112, BO
 addi BO, BO, 128

+dcbt BO, PREB
+
 stxvd2x vs48, o0, BO
 stxvd2x vs49, o16, BO
 stxvd2x vs50, o32, BO

@@ -158,6 +163,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 stxvd2x vs55, o112, BO
 addi BO, BO, 128

+dcbt BO, PREB
+
 stxvd2x vs56, o0, BO
 stxvd2x vs57, o16, BO
 stxvd2x vs58, o32, BO
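Each "dcbt BO, PREB" issues a data-cache-block-touch hint for the line PREB bytes past BO, warming the destination before the next 128-byte group of stxvd2x stores lands there. A rough C analogy using GCC's __builtin_prefetch (a sketch: copy_block and the one-group-ahead distance are made up, and GCC's write-prefetch typically lowers to dcbtst, whereas this commit uses the plain touch form):

    #include <string.h>

    /* Hypothetical packing loop: touch the destination one 128-byte
     * group ahead before each store group, mirroring "dcbt BO, PREB". */
    static void copy_block(double *dst, const double *src, int groups) {
        for (int g = 0; g < groups; g++) {
            __builtin_prefetch(dst + 16, 1, 3);  /* next 128 B of dst */
            memcpy(dst, src, 128);               /* 16 doubles = one group */
            dst += 16;
            src += 16;
        }
    }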


kernel/power/dgemm_tcopy_16_power8.S (+1, -1)

@@ -170,7 +170,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 add B2, B2, B
 add B1, B1, B

-li PREA, 256
+li PREA, 384
 addi PREB, M16, 128

 li o8, 8


kernel/power/dgemm_tcopy_logic_16_power8.S (+12, -12)

@@ -52,31 +52,31 @@ DCOPYT_L4_BEGIN:
 ble DCOPYT_L4x8_BEGIN

 mr BO, B16
+addi T2, M16, 384
+mtctr J

 .align 5

 DCOPYT_L4x16_LOOP:

-/*
-addi T1, PREB, 128
-addi T2, PREB, 256
-*/
+addi T1, M16, 256
+
 dcbt A0, PREA
 dcbt A1, PREA
 dcbt A2, PREA
 dcbt A3, PREA
-/*
-dcbtst BO, M16
-dcbtst BO, PREB
-dcbtst BO, T1
-dcbtst BO, T2
-*/
+dcbt BO, M16
+dcbt BO, PREB
+dcbt BO, T1
+dcbt BO, T2
 COPY_4x16

 add BO, BO, M16

-addic. J, J, -1
-bgt DCOPYT_L4x16_LOOP
+// addic. J, J, -1
+bdnz+ DCOPYT_L4x16_LOOP

 DCOPYT_L4x8_BEGIN:
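Besides enabling the destination prefetches (the old dcbtst block was commented out; four dcbt hints now cover the next output row block at offsets M16, PREB, M16+256, and M16+384), the loop control changes: instead of decrementing J in a general-purpose register (addic./bgt), J is loaded into the hardware count register once (mtctr J) and bdnz+ both decrements CTR and branches while it is nonzero. Functionally it is just a counted loop (sketch; copy_tile stands in for COPY_4x16 and the pointer bumps):

    /* Hypothetical equivalent of the copy loop. */
    static void copy_tile(void) { /* COPY_4x16 stand-in */ }

    static void copy_rows(long J) {
        /* mtctr J ... bdnz+ : run the body exactly J times; the
         * decrement-and-branch is a single instruction and neither
         * a GPR nor cr0 is consumed by the loop counter. */
        for (long j = J; j > 0; j--)
            copy_tile();
    }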




kernel/power/dgemm_tcopy_macros_16_power8.S (+18, -22)

@@ -46,52 +46,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 lxvd2x vs35, o48, A0
 addi A0, A0, 64

-lxvd2x vs36, o0, A0
-lxvd2x vs37, o16, A0
-lxvd2x vs38, o32, A0
-lxvd2x vs39, o48, A0
-addi A0, A0, 64
-
 lxvd2x vs40, o0, A1
 lxvd2x vs41, o16, A1
 lxvd2x vs42, o32, A1
 lxvd2x vs43, o48, A1
 addi A1, A1, 64

-lxvd2x vs44, o0, A1
-lxvd2x vs45, o16, A1
-lxvd2x vs46, o32, A1
-lxvd2x vs47, o48, A1
-addi A1, A1, 64
-
 lxvd2x vs48, o0, A2
 lxvd2x vs49, o16, A2
 lxvd2x vs50, o32, A2
 lxvd2x vs51, o48, A2
 addi A2, A2, 64

-lxvd2x vs52, o0, A2
-lxvd2x vs53, o16, A2
-lxvd2x vs54, o32, A2
-lxvd2x vs55, o48, A2
-addi A2, A2, 64
-
 lxvd2x vs56, o0, A3
 lxvd2x vs57, o16, A3
 lxvd2x vs58, o32, A3
 lxvd2x vs59, o48, A3
 addi A3, A3, 64

+lxvd2x vs36, o0, A0
+lxvd2x vs37, o16, A0
+lxvd2x vs38, o32, A0
+lxvd2x vs39, o48, A0
+addi A0, A0, 64
+
+lxvd2x vs44, o0, A1
+lxvd2x vs45, o16, A1
+lxvd2x vs46, o32, A1
+lxvd2x vs47, o48, A1
+addi A1, A1, 64
+
+lxvd2x vs52, o0, A2
+lxvd2x vs53, o16, A2
+lxvd2x vs54, o32, A2
+lxvd2x vs55, o48, A2
+addi A2, A2, 64
+
 lxvd2x vs60, o0, A3
 lxvd2x vs61, o16, A3
 lxvd2x vs62, o32, A3
 lxvd2x vs63, o48, A3
 addi A3, A3, 64

 mr T1, BO

 stxvd2x vs32, o0, T1


lapack/getrf/getrf_parallel_omp.c (+8, -1)

@@ -173,10 +173,17 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
 blocking = (mn / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);
 if (blocking > GEMM_Q) blocking = GEMM_Q;

-  if (blocking <= GEMM_UNROLL_N * 2) {
+#ifdef POWER8
+  if (blocking <= GEMM_UNROLL_N) {
   info = GETF2(args, NULL, range_n, sa, sb, 0);
   return info;
   }
+#else
+  if (blocking <= GEMM_UNROLL_N*2) {
+    info = GETF2(args, NULL, range_n, sa, sb, 0);
+    return info;
+  }
+#endif

 sbb = (FLOAT *)((((BLASULONG)(sb + blocking * blocking * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
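The blocking computation rounds mn / 2 up to the next multiple of GEMM_UNROLL_N (a power of two) and caps it at GEMM_Q; the POWER8 branch then lowers the cutoff below which the whole panel goes straight to the unblocked GETF2, so POWER8 uses the blocked path more aggressively. A standalone sketch of the round-up arithmetic (the macro values here are made up for illustration):

    #include <stdio.h>

    #define GEMM_UNROLL_N 4   /* illustrative; real value is per-target */
    #define GEMM_Q 256        /* illustrative blocking cap */

    int main(void) {
        long mn = 37;
        /* Round mn/2 up to a multiple of GEMM_UNROLL_N, then cap at GEMM_Q. */
        long blocking = (mn / 2 + GEMM_UNROLL_N - 1) & ~(long)(GEMM_UNROLL_N - 1);
        if (blocking > GEMM_Q) blocking = GEMM_Q;
        printf("blocking = %ld\n", blocking);  /* 18 rounds up to 20 */
        return 0;
    }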



