Browse Source

optimized dgemm for POWER8

tags/v0.2.19^2
Werner Saar 9 years ago
parent
commit
0d0c6f7d7d
9 changed files with 3996 additions and 492 deletions
  1. +6
    -6
      kernel/power/KERNEL.POWER8
  2. +4
    -1
      kernel/power/dgemm_kernel_16x4_power8.S
  3. +378
    -384
      kernel/power/dgemm_logic_16x4_power8.S
  4. +75
    -96
      kernel/power/dgemm_macros_16x4_power8.S
  5. +1
    -1
      kernel/power/dgemm_tcopy_16_power8.S
  6. +4
    -0
      kernel/power/dgemm_tcopy_logic_16_power8.S
  7. +1
    -1
      kernel/power/dtrmm_kernel_16x4_power8.S
  8. +3431
    -0
      kernel/power/dtrmm_macros_16x4_power8.S
  9. +96
    -3
      param.h

+ 6
- 6
kernel/power/KERNEL.POWER8 View File

@@ -21,12 +21,12 @@ SGEMMOTCOPYOBJ = sgemm_otcopy.o
DGEMMKERNEL = dgemm_kernel_16x4_power8.S
DGEMMINCOPY = ../generic/gemm_ncopy_16.c
DGEMMITCOPY = dgemm_tcopy_16_power8.S
DGEMMONCOPY = gemm_ncopy_4.S
DGEMMOTCOPY = gemm_tcopy_4.S
DGEMMINCOPYOBJ = dgemm_incopy.o
DGEMMITCOPYOBJ = dgemm_itcopy.o
DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
DGEMMINCOPYOBJ = dgemm_incopy.o
DGEMMITCOPYOBJ = dgemm_itcopy.o
DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o

CGEMMKERNEL = cgemm_kernel_8x4_power8.S
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c


+ 4
- 1
kernel/power/dgemm_kernel_16x4_power8.S View File

@@ -131,6 +131,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#define o0 0

#define T4 r12
#define T3 r11

#define o8 r15
#define o24 r16
#define ALPHA r17
@@ -265,7 +268,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi ALPHA, SP, 224
#endif

li PRE, 256
li PRE, 384
li o8 , 8
li o16, 16
li o24, 24


+ 378
- 384
kernel/power/dgemm_logic_16x4_power8.S
File diff suppressed because it is too large
View File


+ 75
- 96
kernel/power/dgemm_macros_16x4_power8.S View File

@@ -431,6 +431,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

mr T1, CO
addi T2, T1, 64
add T3, T1, LDC
addi T4, T3, 64

#ifndef TRMMKERNEL
lxvd2x vs0, 0, T1
@@ -442,6 +444,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lxvd2x vs5, o16, T2
lxvd2x vs6, o32, T2
lxvd2x vs7, o48, T2

lxvd2x vs8, 0, T3
lxvd2x vs9, o16, T3
lxvd2x vs10, o32, T3
lxvd2x vs11, o48, T3

lxvd2x vs12, 0, T4
lxvd2x vs13, o16, T4
lxvd2x vs14, o32, T4
lxvd2x vs15, o48, T4
#endif

#ifndef TRMMKERNEL
@@ -453,6 +465,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmaddadp vs5, vs37, alpha_r
xvmaddadp vs6, vs38, alpha_r
xvmaddadp vs7, vs39, alpha_r
xvmaddadp vs8, vs40, alpha_r
xvmaddadp vs9, vs41, alpha_r
xvmaddadp vs10, vs42, alpha_r
xvmaddadp vs11, vs43, alpha_r
xvmaddadp vs12, vs44, alpha_r
xvmaddadp vs13, vs45, alpha_r
xvmaddadp vs14, vs46, alpha_r
xvmaddadp vs15, vs47, alpha_r
#else
xvmuldp vs0, vs32, alpha_r
xvmuldp vs1, vs33, alpha_r
@@ -462,6 +482,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmuldp vs5, vs37, alpha_r
xvmuldp vs6, vs38, alpha_r
xvmuldp vs7, vs39, alpha_r
xvmuldp vs8, vs40, alpha_r
xvmuldp vs9, vs41, alpha_r
xvmuldp vs10, vs42, alpha_r
xvmuldp vs11, vs43, alpha_r
xvmuldp vs12, vs44, alpha_r
xvmuldp vs13, vs45, alpha_r
xvmuldp vs14, vs46, alpha_r
xvmuldp vs15, vs47, alpha_r
#endif

stxvd2x vs0, 0, T1
@@ -469,62 +497,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs2, o32, T1
stxvd2x vs3, o48, T1

dcbt T1, PRE

stxvd2x vs4, 0, T2
stxvd2x vs5, o16, T2
stxvd2x vs6, o32, T2
stxvd2x vs7, o48, T2

add T1, T1, LDC
add T2, T2, LDC

#ifndef TRMMKERNEL
lxvd2x vs8, 0, T1
lxvd2x vs9, o16, T1
lxvd2x vs10, o32, T1
lxvd2x vs11, o48, T1
stxvd2x vs8, 0, T3
stxvd2x vs9, o16, T3
stxvd2x vs10, o32, T3
stxvd2x vs11, o48, T3

lxvd2x vs12, 0, T2
lxvd2x vs13, o16, T2
lxvd2x vs14, o32, T2
lxvd2x vs15, o48, T2
#endif
stxvd2x vs12, 0, T4
stxvd2x vs13, o16, T4
stxvd2x vs14, o32, T4
stxvd2x vs15, o48, T4

#ifndef TRMMKERNEL
xvmaddadp vs8, vs40, alpha_r
xvmaddadp vs9, vs41, alpha_r
xvmaddadp vs10, vs42, alpha_r
xvmaddadp vs11, vs43, alpha_r
xvmaddadp vs12, vs44, alpha_r
xvmaddadp vs13, vs45, alpha_r
xvmaddadp vs14, vs46, alpha_r
xvmaddadp vs15, vs47, alpha_r
#else
xvmuldp vs8, vs40, alpha_r
xvmuldp vs9, vs41, alpha_r
xvmuldp vs10, vs42, alpha_r
xvmuldp vs11, vs43, alpha_r
xvmuldp vs12, vs44, alpha_r
xvmuldp vs13, vs45, alpha_r
xvmuldp vs14, vs46, alpha_r
xvmuldp vs15, vs47, alpha_r
#endif

stxvd2x vs8, 0, T1
stxvd2x vs9, o16, T1
stxvd2x vs10, o32, T1
stxvd2x vs11, o48, T1

dcbt T1, PRE

stxvd2x vs12, 0, T2
stxvd2x vs13, o16, T2
stxvd2x vs14, o32, T2
stxvd2x vs15, o48, T2

add T1, T1, LDC
add T2, T2, LDC
slwi T4, LDC, 1
add T1, T1, T4
add T3, T3, T4
addi T2, T1, 64
addi T4, T3, 64

#ifndef TRMMKERNEL
lxvd2x vs0, 0, T1
@@ -536,6 +528,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lxvd2x vs5, o16, T2
lxvd2x vs6, o32, T2
lxvd2x vs7, o48, T2

lxvd2x vs8, 0, T3
lxvd2x vs9, o16, T3
lxvd2x vs10, o32, T3
lxvd2x vs11, o48, T3

lxvd2x vs12, 0, T4
lxvd2x vs13, o16, T4
lxvd2x vs14, o32, T4
lxvd2x vs15, o48, T4
#endif

#ifndef TRMMKERNEL
@@ -547,6 +549,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmaddadp vs5, vs53, alpha_r
xvmaddadp vs6, vs54, alpha_r
xvmaddadp vs7, vs55, alpha_r
xvmaddadp vs8, vs56, alpha_r
xvmaddadp vs9, vs57, alpha_r
xvmaddadp vs10, vs58, alpha_r
xvmaddadp vs11, vs59, alpha_r
xvmaddadp vs12, vs60, alpha_r
xvmaddadp vs13, vs61, alpha_r
xvmaddadp vs14, vs62, alpha_r
xvmaddadp vs15, vs63, alpha_r
#else
xvmuldp vs0, vs48, alpha_r
xvmuldp vs1, vs49, alpha_r
@@ -556,6 +566,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmuldp vs5, vs53, alpha_r
xvmuldp vs6, vs54, alpha_r
xvmuldp vs7, vs55, alpha_r
xvmuldp vs8, vs56, alpha_r
xvmuldp vs9, vs57, alpha_r
xvmuldp vs10, vs58, alpha_r
xvmuldp vs11, vs59, alpha_r
xvmuldp vs12, vs60, alpha_r
xvmuldp vs13, vs61, alpha_r
xvmuldp vs14, vs62, alpha_r
xvmuldp vs15, vs63, alpha_r
#endif

stxvd2x vs0, 0, T1
@@ -563,59 +581,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs2, o32, T1
stxvd2x vs3, o48, T1

dcbt T1, PRE

stxvd2x vs4, 0, T2
stxvd2x vs5, o16, T2
stxvd2x vs6, o32, T2
stxvd2x vs7, o48, T2

add T1, T1, LDC
add T2, T2, LDC

#ifndef TRMMKERNEL
lxvd2x vs8, 0, T1
lxvd2x vs9, o16, T1
lxvd2x vs10, o32, T1
lxvd2x vs11, o48, T1

lxvd2x vs12, 0, T2
lxvd2x vs13, o16, T2
lxvd2x vs14, o32, T2
lxvd2x vs15, o48, T2
#endif

#ifndef TRMMKERNEL
xvmaddadp vs8, vs56, alpha_r
xvmaddadp vs9, vs57, alpha_r
xvmaddadp vs10, vs58, alpha_r
xvmaddadp vs11, vs59, alpha_r
xvmaddadp vs12, vs60, alpha_r
xvmaddadp vs13, vs61, alpha_r
xvmaddadp vs14, vs62, alpha_r
xvmaddadp vs15, vs63, alpha_r
#else
xvmuldp vs8, vs56, alpha_r
xvmuldp vs9, vs57, alpha_r
xvmuldp vs10, vs58, alpha_r
xvmuldp vs11, vs59, alpha_r
xvmuldp vs12, vs60, alpha_r
xvmuldp vs13, vs61, alpha_r
xvmuldp vs14, vs62, alpha_r
xvmuldp vs15, vs63, alpha_r
#endif

stxvd2x vs8, 0, T1
stxvd2x vs9, o16, T1
stxvd2x vs10, o32, T1
stxvd2x vs11, o48, T1

dcbt T1, PRE
stxvd2x vs8, 0, T3
stxvd2x vs9, o16, T3
stxvd2x vs10, o32, T3
stxvd2x vs11, o48, T3

stxvd2x vs12, 0, T2
stxvd2x vs13, o16, T2
stxvd2x vs14, o32, T2
stxvd2x vs15, o48, T2
stxvd2x vs12, 0, T4
stxvd2x vs13, o16, T4
stxvd2x vs14, o32, T4
stxvd2x vs15, o48, T4

addi CO, CO, 128



+ 1
- 1
kernel/power/dgemm_tcopy_16_power8.S View File

@@ -170,7 +170,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add B2, B2, B
add B1, B1, B

li PREA, 768
li PREA, 256
addi PREB, M16, 128

li o8, 8


+ 4
- 0
kernel/power/dgemm_tcopy_logic_16_power8.S View File

@@ -57,16 +57,20 @@ DCOPYT_L4_BEGIN:

DCOPYT_L4x16_LOOP:

/*
addi T1, PREB, 128
addi T2, PREB, 256
*/
dcbt A0, PREA
dcbt A1, PREA
dcbt A2, PREA
dcbt A3, PREA
/*
dcbtst BO, M16
dcbtst BO, PREB
dcbtst BO, T1
dcbtst BO, T2
*/
COPY_4x16

add BO, BO, M16


+ 1
- 1
kernel/power/dtrmm_kernel_16x4_power8.S View File

@@ -152,7 +152,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define PRE r30
#define T2 r31

#include "dgemm_macros_16x4_power8.S"
#include "dtrmm_macros_16x4_power8.S"


#ifndef NEEDPARAM


+ 3431
- 0
kernel/power/dtrmm_macros_16x4_power8.S
File diff suppressed because it is too large
View File


+ 96
- 3
param.h View File

@@ -410,7 +410,100 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#endif

#if defined(STEAMROLLER) || defined(EXCAVATOR)
#ifdef STEAMROLLER
#define SNUMOPT 8
#define DNUMOPT 4

#define GEMM_DEFAULT_OFFSET_A 64
#define GEMM_DEFAULT_OFFSET_B 832
#define GEMM_DEFAULT_ALIGN 0x0fffUL



#define QGEMM_DEFAULT_UNROLL_N 2
#define CGEMM_DEFAULT_UNROLL_N 2
#define ZGEMM_DEFAULT_UNROLL_N 2
#define XGEMM_DEFAULT_UNROLL_N 1

#ifdef ARCH_X86
#define SGEMM_DEFAULT_UNROLL_N 4
#define DGEMM_DEFAULT_UNROLL_N 4
#define SGEMM_DEFAULT_UNROLL_M 4
#define DGEMM_DEFAULT_UNROLL_M 2
#define QGEMM_DEFAULT_UNROLL_M 2
#define CGEMM_DEFAULT_UNROLL_M 2
#define ZGEMM_DEFAULT_UNROLL_M 1
#define XGEMM_DEFAULT_UNROLL_M 1
#else
#define SGEMM_DEFAULT_UNROLL_N 2
#define DGEMM_DEFAULT_UNROLL_N 2
#define SGEMM_DEFAULT_UNROLL_M 16
#define DGEMM_DEFAULT_UNROLL_M 8
#define QGEMM_DEFAULT_UNROLL_M 2
#define CGEMM_DEFAULT_UNROLL_M 4
#define ZGEMM_DEFAULT_UNROLL_M 2
#define XGEMM_DEFAULT_UNROLL_M 1
#define CGEMM3M_DEFAULT_UNROLL_N 4
#define CGEMM3M_DEFAULT_UNROLL_M 8
#define ZGEMM3M_DEFAULT_UNROLL_N 4
#define ZGEMM3M_DEFAULT_UNROLL_M 4
#define GEMV_UNROLL 8
#endif

#if defined(ARCH_X86_64)
#define SGEMM_DEFAULT_P 768
#define DGEMM_DEFAULT_P 576
#define ZGEMM_DEFAULT_P 288
#define CGEMM_DEFAULT_P 576
#else
#define SGEMM_DEFAULT_P 448
#define DGEMM_DEFAULT_P 480
#define ZGEMM_DEFAULT_P 112
#define CGEMM_DEFAULT_P 224
#endif
#define QGEMM_DEFAULT_P 112
#define XGEMM_DEFAULT_P 56

#if defined(ARCH_X86_64)
#define SGEMM_DEFAULT_Q 192
#define DGEMM_DEFAULT_Q 160
#define ZGEMM_DEFAULT_Q 160
#define CGEMM_DEFAULT_Q 160
#else
#define SGEMM_DEFAULT_Q 224
#define DGEMM_DEFAULT_Q 224
#define ZGEMM_DEFAULT_Q 224
#define CGEMM_DEFAULT_Q 224
#endif
#define QGEMM_DEFAULT_Q 224
#define XGEMM_DEFAULT_Q 224

#define CGEMM3M_DEFAULT_P 448
#define ZGEMM3M_DEFAULT_P 224
#define XGEMM3M_DEFAULT_P 112
#define CGEMM3M_DEFAULT_Q 224
#define ZGEMM3M_DEFAULT_Q 224
#define XGEMM3M_DEFAULT_Q 224
#define CGEMM3M_DEFAULT_R 12288
#define ZGEMM3M_DEFAULT_R 12288
#define XGEMM3M_DEFAULT_R 12288

#define SGEMM_DEFAULT_R 12288
#define QGEMM_DEFAULT_R qgemm_r
#define DGEMM_DEFAULT_R 12288
#define CGEMM_DEFAULT_R cgemm_r
#define ZGEMM_DEFAULT_R zgemm_r
#define XGEMM_DEFAULT_R xgemm_r

#define SYMV_P 16
#define HAVE_EXCLUSIVE_CACHE

#define GEMM_THREAD gemm_thread_mn

#endif


#ifdef EXCAVATOR
#define SNUMOPT 8
#define DNUMOPT 4

@@ -1885,12 +1978,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ZGEMM_DEFAULT_UNROLL_N 2

#define SGEMM_DEFAULT_P 1280
#define DGEMM_DEFAULT_P 640
#define DGEMM_DEFAULT_P 768
#define CGEMM_DEFAULT_P 640
#define ZGEMM_DEFAULT_P 320

#define SGEMM_DEFAULT_Q 640
#define DGEMM_DEFAULT_Q 640
#define DGEMM_DEFAULT_Q 768
#define CGEMM_DEFAULT_Q 640
#define ZGEMM_DEFAULT_Q 640



Loading…
Cancel
Save