@@ -13,10 +13,10 @@ endif
ifeq ($(CORE), POWER8)
ifeq ($(USE_OPENMP), 1)
COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -DALLOC_SHM -DUSE_OPENMP -fno-fast-math -fopenmp
COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
else
COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -DALLOC_SHM -fno-fast-math
COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -fno-fast-math
FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fast-math
endif
endif
@@ -803,7 +803,7 @@ Lmcount$lazy_ptr:
#elif defined(PPC440FP2)
#define BUFFER_SIZE ( 16 << 20)
#elif defined(POWER8)
#define BUFFER_SIZE ( 32 << 20)
#define BUFFER_SIZE ( 64 << 20)
#else
#define BUFFER_SIZE ( 16 << 20)
#endif
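
The POWER8 case above grows the per-thread working buffer from 32 MiB to 64 MiB; the shift expressions are plain byte counts. A minimal check of that arithmetic, assuming nothing beyond standard C:

    #include <stdio.h>

    int main(void) {
        /* 32 << 20 and 64 << 20 are byte counts: 32 MiB and 64 MiB. */
        printf("old POWER8 BUFFER_SIZE: %d bytes (%d MiB)\n", 32 << 20, (32 << 20) >> 20);
        printf("new POWER8 BUFFER_SIZE: %d bytes (%d MiB)\n", 64 << 20, (64 << 20) >> 20);
        return 0;
    }
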
@@ -39,13 +39,152 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
LDGEMM_L4_BEGIN:
mr CO, C
li T1, 128
li T2, 256
mr AO, A
slwi T1, LDC , 2
add C, C, T1
mr CO, C
slwi T3, LDC , 2
add C, C, T3
dcbt A, T1
dcbt A, T2
srawi. I, M, 4
ble LDGEMM_L4x16_END
.align 4
LDGEMM_L4x16_BEGIN_FIRST:
li L, -128
mr T1, CO
add T2, T1, LDC
add T3, T2, LDC
add T4, T3, LDC
and T1, T1, L
and T2, T2, L
and T3, T3, L
and T4, T4, L
dcbt T1, r0
dcbt T2, r0
dcbt T3, r0
dcbt T4, r0
mr BO, B
srawi. L, K, 2
addi T1, T1, 128
addi T2, T2, 128
addi T3, T3, 128
addi T4, T4, 128
dcbt T1, r0
dcbt T2, r0
dcbt T3, r0
dcbt T4, r0
ble LDGEMM_L4x16_SUB0_FIRST
cmpwi cr0, L, 1
ble LDGEMM_L4x16_SUB4_FIRST
.align 4
LDGEMM_L4x16_LOOP_START_FIRST:
li T2, 512
li o40, 40
li o56, 56
dcbt AO, PRE
dcbt BO, T2
LOAD4x16_1
dcbt AO, PRE
KERNEL4x16_I1
dcbt AO, PRE
addic. L, L, -2
KERNEL4x16_L2
dcbt AO, PRE
KERNEL4x16_L1
dcbt AO, PRE
dcbt BO, T2
KERNEL4x16_L2
ble LDGEMM_L4x16_LOOP_END_FIRST
mtctr L
.align 4
LDGEMM_L4x16_LOOP_FIRST:
dcbt AO, PRE
KERNEL4x16_L1
dcbt AO, PRE
KERNEL4x16_L2
dcbt AO, PRE
KERNEL4x16_L1
dcbt AO, PRE
dcbt BO, T2
KERNEL4x16_L2
bdnz LDGEMM_L4x16_LOOP_FIRST
.align 4
LDGEMM_L4x16_LOOP_END_FIRST:
KERNEL4x16_L1
KERNEL4x16_L2
KERNEL4x16_1
KERNEL4x16_E2
b LDGEMM_L4x16_SUB1_FIRST
LDGEMM_L4x16_SUB4_FIRST:
KERNEL4x16_SUBI1
KERNEL4x16_SUB1
KERNEL4x16_SUB1
KERNEL4x16_SUB1
b LDGEMM_L4x16_SUB1_FIRST
LDGEMM_L4x16_SUB0_FIRST:
andi. L, K, 3
KERNEL4x16_SUBI1
addic. L, L, -1
ble LDGEMM_L4x16_SAVE_FIRST
b LDGEMM_L4x16_SUB2_FIRST
LDGEMM_L4x16_SUB1_FIRST:
andi. L, K, 3
ble LDGEMM_L4x16_SAVE_FIRST
LDGEMM_L4x16_SUB2_FIRST:
KERNEL4x16_SUB1
addic. L, L, -1
bgt LDGEMM_L4x16_SUB2_FIRST
.align 4
LDGEMM_L4x16_SAVE_FIRST:
SAVE4x16
addic. I, I, -1
ble LDGEMM_L4x16_END
LDGEMM_L4x16_END_FIRST:
.align 4
LDGEMM_L4x16_BEGIN:
@@ -79,9 +218,9 @@ LDGEMM_L4x16_BEGIN:
dcbt T3, r0
dcbt T4, r0
ble LDGEMM_L4x16_SUB0
ble- LDGEMM_L4x16_SUB0
cmpwi cr0, L, 1
ble LDGEMM_L4x16_SUB4
ble- LDGEMM_L4x16_SUB4
.align 4
LDGEMM_L4x16_LOOP_START:
@@ -97,7 +236,8 @@ LDGEMM_L4x16_LOOP_START:
addic. L, L, -2
KERNEL4x16_L2
ble LDGEMM_L4x16_LOOP_END
ble- LDGEMM_L4x16_LOOP_END
mtctr L
.align 4
@@ -107,10 +247,10 @@ LDGEMM_L4x16_LOOP:
dcbt AO, PRE
KERNEL4x16_L1
dcbt AO, PRE
addic. L, L, -1
// addic. L, L, -1
KERNEL4x16_L2
bgt LDGEMM_L4x16_LOOP
bdnz+ LDGEMM_L4x16_LOOP
.align 4
@@ -156,7 +296,7 @@ LDGEMM_L4x16_SAVE:
SAVE4x16
addic. I, I, -1
bgt LDGEMM_L4x16_BEGIN
bgt+ LDGEMM_L4x16_BEGIN
LDGEMM_L4x16_END:
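
The LDGEMM_L4x16_BEGIN_FIRST block above masks the four C-row pointers with -128 and issues dcbt on the masked address and on the address 128 bytes later, i.e. it touches the two 128-byte cache lines that each 16-double row of C occupies. A rough C analogue of that idea, using GCC's __builtin_prefetch and assuming a 128-byte line as on POWER8 (illustrative only, not the kernel's code):

    #include <stdint.h>

    #define LINE 128  /* assumed POWER8 cache-line size */

    /* Prefetch the cache line containing p and the following line,
     * mirroring the "and Tn, Tn, -128 ; dcbt ; addi Tn, Tn, 128 ; dcbt" sequence. */
    static inline void prefetch_row(const double *p) {
        uintptr_t base = (uintptr_t)p & ~(uintptr_t)(LINE - 1);
        __builtin_prefetch((const void *)base, 0, 3);
        __builtin_prefetch((const void *)(base + LINE), 0, 3);
    }
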
@@ -559,10 +559,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro SAVE4x16
mr T1, CO
add T2, T1, LDC
add T3, T2, LDC
add T4, T3, LDC
add T2, CO, LDC
lxvd2x vs0, 0, CO
lxvd2x vs1, o16, CO
@@ -570,6 +567,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lxvd2x vs3, o48, CO
lxvd2x vs4, o64, CO
lxvd2x vs5, o80, CO
add T3, T2, LDC
lxvd2x vs6, o96, CO
lxvd2x vs7, o112, CO
@@ -579,6 +577,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lxvd2x vs11, o48, T2
lxvd2x vs12, o64, T2
lxvd2x vs13, o80, T2
add T4, T3, LDC
lxvd2x vs14, o96, T2
lxvd2x vs15, o112, T2
@@ -592,21 +591,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lxvd2x vs31, o112, T3
xvmaddadp vs0, vs32, alpha_r
xvmaddadp vs1, vs33, alpha_r
xvmaddadp vs2, vs34, alpha_r
xvmaddadp vs3, vs35, alpha_r
xvmaddadp vs4, vs36, alpha_r
xvmaddadp vs5, vs37, alpha_r
xvmaddadp vs6, vs38, alpha_r
xvmaddadp vs7, vs39, alpha_r
lxvd2x vs32, 0, T4
xvmaddadp vs1, vs33, alpha_r
lxvd2x vs33, o16, T4
xvmaddadp vs2, vs34, alpha_r
lxvd2x vs34, o32, T4
xvmaddadp vs3, vs35, alpha_r
lxvd2x vs35, o48, T4
xvmaddadp vs4, vs36, alpha_r
lxvd2x vs36, o64, T4
xvmaddadp vs5, vs37, alpha_r
lxvd2x vs37, o80, T4
xvmaddadp vs6, vs38, alpha_r
lxvd2x vs38, o96, T4
xvmaddadp vs7, vs39, alpha_r
lxvd2x vs39, o112, T4
xvmaddadp vs8, vs40, alpha_r
@@ -614,58 +612,60 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmaddadp vs10, vs42, alpha_r
xvmaddadp vs11, vs43, alpha_r
stxvd2x vs0, 0, T1
stxvd2x vs1, o16, T1
stxvd2x vs2, o32, T1
stxvd2x vs3, o48, T1
xvmaddadp vs12, vs44, alpha_r
xvmaddadp vs13, vs45, alpha_r
xvmaddadp vs14, vs46, alpha_r
xvmaddadp vs15, vs47, alpha_r
stxvd2x vs4, o64, T1
stxvd2x vs5, o80, T1
stxvd2x vs6, o96, T1
stxvd2x vs7, o112, T1
xvmaddadp vs24, vs48, alpha_r
xvmaddadp vs25, vs49, alpha_r
xvmaddadp vs26, vs50, alpha_r
xvmaddadp vs27, vs51, alpha_r
stxvd2x vs8, o0, T2
stxvd2x vs9, o16, T2
stxvd2x vs10, o32, T2
stxvd2x vs11, o48, T2
xvmaddadp vs28, vs52, alpha_r
xvmaddadp vs29, vs53, alpha_r
xvmaddadp vs30, vs54, alpha_r
xvmaddadp vs31, vs55, alpha_r
stxvd2x vs12, o64, T2
stxvd2x vs13, o80, T2
stxvd2x vs14, o96, T2
stxvd2x vs15, o112, T2
stxvd2x vs0, 0, CO
stxvd2x vs1, o16, CO
stxvd2x vs2, o32, CO
stxvd2x vs3, o48, CO
stxvd2x vs4, o64, CO
stxvd2x vs5, o80, CO
stxvd2x vs6, o96, CO
stxvd2x vs7, o112, CO
xvmaddadp vs32, vs56, alpha_r
xvmaddadp vs33, vs57, alpha_r
xvmaddadp vs34, vs58, alpha_r
xvmaddadp vs35, vs59, alpha_r
stxvd2x vs24, 0, T3
stxvd2x vs25, o16, T3
stxvd2x vs26, o32, T3
stxvd2x vs27, o48, T3
xvmaddadp vs36, vs60, alpha_r
xvmaddadp vs37, vs61, alpha_r
xvmaddadp vs38, vs62, alpha_r
xvmaddadp vs39, vs63, alpha_r
addi CO, CO, 128
stxvd2x vs8, o0, T2
stxvd2x vs9, o16, T2
stxvd2x vs10, o32, T2
stxvd2x vs11, o48, T2
stxvd2x vs12, o64, T2
stxvd2x vs13, o80, T2
stxvd2x vs14, o96, T2
stxvd2x vs15, o112, T2
stxvd2x vs24, 0, T3
stxvd2x vs25, o16, T3
stxvd2x vs28, o64, T3
stxvd2x vs29, o80, T3
stxvd2x vs26, o32, T3
stxvd2x vs27, o48, T3
stxvd2x vs30, o96, T3
stxvd2x vs31, o112, T3
@@ -674,8 +674,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs34, o32, T4
stxvd2x vs35, o48, T4
addi CO, CO, 128
stxvd2x vs36, o64, T4
stxvd2x vs37, o80, T4
stxvd2x vs38, o96, T4
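
The SAVE4x16 reordering above interleaves the T4-row loads and the stores with the xvmaddadp updates so that the VSX memory operations overlap the FMA latency; the tile update itself is the usual alpha-scaled accumulate. A plain scalar reference of that 16x4 update, with hypothetical names (column-major C, leading dimension ldc, acc holding the accumulated A*B block), illustrative only:

    /* C[0:16, 0:4] += alpha * acc, as SAVE4x16 does with lxvd2x / xvmaddadp / stxvd2x. */
    static void save_4x16_ref(double *c, long ldc, double alpha, const double acc[4][16]) {
        for (int j = 0; j < 4; j++)          /* the four columns CO, CO+LDC, ... */
            for (int i = 0; i < 16; i++)     /* the 16 rows of the unrolled tile */
                c[j * ldc + i] += alpha * acc[j][i];
    }
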
@@ -1965,8 +1965,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define DNUMOPT 8
#define GEMM_DEFAULT_OFFSET_A 0
#define GEMM_DEFAULT_OFFSET_B 4096
#define GEMM_DEFAULT_ALIGN 0x03fffUL
#define GEMM_DEFAULT_OFFSET_B 65536
#define GEMM_DEFAULT_ALIGN 0x0ffffUL
#define SGEMM_DEFAULT_UNROLL_M 16
#define SGEMM_DEFAULT_UNROLL_N 8
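
GEMM_DEFAULT_ALIGN is a low-bit mask: 0x03fff corresponds to a 16 KiB boundary, 0x0ffff to a 64 KiB boundary, matching the larger GEMM_DEFAULT_OFFSET_B of 65536. A minimal sketch of how such a mask is typically applied as an align-up, shown as standalone arithmetic rather than the library's buffer-setup code:

    #include <stdint.h>

    #define GEMM_ALIGN_OLD 0x03fffUL   /* 16 KiB - 1 */
    #define GEMM_ALIGN_NEW 0x0ffffUL   /* 64 KiB - 1 */

    /* Round addr up to the next (mask + 1)-byte boundary: (addr + mask) & ~mask. */
    static inline uintptr_t align_up(uintptr_t addr, uintptr_t mask) {
        return (addr + mask) & ~mask;
    }
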
@@ -1983,7 +1983,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ZGEMM_DEFAULT_P 320
#define SGEMM_DEFAULT_Q 640
#define DGEMM_DEFAULT_Q 640
#define DGEMM_DEFAULT_Q 720
#define CGEMM_DEFAULT_Q 640
#define ZGEMM_DEFAULT_Q 640
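
The P and Q values are cache-blocking sizes; assuming the usual convention that P bounds the packed M-panel and Q bounds the packed K-panel, raising DGEMM_DEFAULT_Q from 640 to 720 deepens each K-slice kept resident between micro-kernel calls. A generic blocked-GEMM skeleton showing where such P and Q enter, a sketch of the blocking scheme only and not the library's driver (inner_kernel is a hypothetical stand-in for the 16x4 assembly micro-kernel):

    /* Naive stand-in for the packed 16x4 micro-kernel: C(mb x n) += alpha * A(mb x kb) * B(kb x n). */
    static void inner_kernel(long mb, long n, long kb, double alpha,
                             const double *a, long lda,
                             const double *b, long ldb,
                             double *c, long ldc) {
        for (long j = 0; j < n; j++)
            for (long i = 0; i < mb; i++) {
                double s = 0.0;
                for (long l = 0; l < kb; l++)
                    s += a[l * lda + i] * b[j * ldb + l];
                c[j * ldc + i] += alpha * s;
            }
    }

    /* C(m x n) += alpha * A(m x k) * B(k x n), column-major; P blocks M, Q blocks K. */
    void dgemm_blocked(long m, long n, long k, double alpha,
                       const double *a, long lda, const double *b, long ldb,
                       double *c, long ldc, long P, long Q) {
        for (long ks = 0; ks < k; ks += Q) {
            long kb = (k - ks < Q) ? k - ks : Q;        /* current K-slice, <= Q */
            for (long ms = 0; ms < m; ms += P) {
                long mb = (m - ms < P) ? m - ms : P;    /* current M-slice, <= P */
                inner_kernel(mb, n, kb, alpha,
                             a + ks * lda + ms, lda, b + ks, ldb, c + ms, ldc);
            }
        }
    }
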