power9 cgemm/ctrmm. new sgemm 8x16 (tags/v0.3.7)
@@ -5,7 +5,7 @@
STRMMKERNEL = sgemm_kernel_power9.S
DTRMMKERNEL = dgemm_kernel_power9.S
CTRMMKERNEL = ctrmm_kernel_8x4_power8.S
CTRMMKERNEL = cgemm_kernel_power9.S
ZTRMMKERNEL = zgemm_kernel_power9.S
SGEMMKERNEL = sgemm_kernel_power9.S
@@ -28,9 +28,9 @@ DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMMKERNEL = cgemm_kernel_8x4_power8.S
CGEMMKERNEL = cgemm_kernel_power9.S
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
CGEMMITCOPY = cgemm_tcopy_8_power8.S
CGEMMITCOPY = ../generic/zgemm_tcopy_8.c
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
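# Note: the new power9 cgemm kernel pairs with the generic C copy routines;
# the power8-specific cgemm_tcopy_8 assembly is no longer referenced.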
@@ -0,0 +1,293 @@
/***************************************************************************
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* Abdelrauf (quickwritereader@gmail.com)
* BLASTEST    : OK
* CTEST       : OK
* TEST        : OK
* LAPACK-TEST : OK
**************************************************************************************/
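/* A minimal way to reproduce these checks on a POWER9 machine (a sketch
   assuming the standard OpenBLAS make targets; adjust to your tree):

       make TARGET=POWER9 -j8      # build with the POWER9 kernel set
       make lapack-test            # run the LAPACK test harness
*/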
#define ASSEMBLER
#include "common.h"
#include "def_vsx.h"

#define LOAD       ld
#define STACKSIZE  (512)
#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */
#define M      r3
#define N      r4
#define K      r5
#define A      r8
#define B      r9
#define C      r10
#define LDC    r6
#define OFFSET r7

#define alpha_r        vs19
#define alpha_i        vs20
#define save_permute_1 vs21
#define permute_mask   vs22
#define o0     0

#define T1     r11
#define T2     r12
#define T3     r14
#define T4     r15
#define T5     r16
#define T6     r17
#define L      r18
#define T7     r19
#define T8     r20
#define TEMP_REG r21
#define I      r22
#define J      r23
#define AO     r24
#define BO     r25
#define CO     r26
#define T9     r27
#define T10    r28
#define PRE    r29
#define T12    r30
#define T13    r31

#include "cgemm_macros_power9.S"

.equ perm_const1, 0x0405060700010203
.equ perm_const2, 0x0c0d0e0f08090a0b
.equ save_permute_12, 0x0c0d0e0f1c1d1e1f
.equ save_permute_11, 0x0405060714151617
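/* perm_const1/perm_const2 together form the byte pattern that swaps the
   real and imaginary words of each complex single pair; the save_permute
   constants select and interleave result words when storing back to C. */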
#ifndef NEEDPARAM

PROLOGUE
PROFCODE
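/* allocate the frame and save the callee-saved state: f14-f31, r14-r31
   and vs52-vs63 (the non-volatile v20-v31), plus the link register below. */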
    addi SP, SP, -STACKSIZE
    mflr r0

    stfd f14, 0(SP)
    stfd f15, 8(SP)
    stfd f16, 16(SP)
    stfd f17, 24(SP)
    stfd f18, 32(SP)
    stfd f19, 40(SP)
    stfd f20, 48(SP)
    stfd f21, 56(SP)
    stfd f22, 64(SP)
    stfd f23, 72(SP)
    stfd f24, 80(SP)
    stfd f25, 88(SP)
    stfd f26, 96(SP)
    stfd f27, 104(SP)
    stfd f28, 112(SP)
    stfd f29, 120(SP)
    stfd f30, 128(SP)
    stfd f31, 136(SP)
    std r31, 144(SP)
    std r30, 152(SP)
    std r29, 160(SP)
    std r28, 168(SP)
    std r27, 176(SP)
    std r26, 184(SP)
    std r25, 192(SP)
    std r24, 200(SP)
    std r23, 208(SP)
    std r22, 216(SP)
    std r21, 224(SP)
    std r20, 232(SP)
    std r19, 240(SP)
    std r18, 248(SP)
    std r17, 256(SP)
    std r16, 264(SP)
    std r15, 272(SP)
    std r14, 280(SP)
    stxv vs52, 288(SP)
    stxv vs53, 304(SP)
    stxv vs54, 320(SP)
    stxv vs55, 336(SP)
    stxv vs56, 352(SP)
    stxv vs57, 368(SP)
    stxv vs58, 384(SP)
    stxv vs59, 400(SP)
    stxv vs60, 416(SP)
    stxv vs61, 432(SP)
    stxv vs62, 448(SP)
    stxv vs63, 464(SP)
    std r0, FLINK_SAVE(SP)

    ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
#ifdef TRMMKERNEL
    ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
#endif
    slwi LDC, LDC, ZBASE_SHIFT
    /* alpha arrives in f1/f2 (real/imaginary); convert each to single precision and splat across the vector */
    xscvdpspn alpha_r, vs1
    xscvdpspn alpha_i, vs2
    xxspltw alpha_r, alpha_r, 0
    xxspltw alpha_i, alpha_i, 0
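    /* the non-signalling convert (xscvdpspn) leaves the single-precision
       image in word 0; xxspltw then replicates it to all four lanes. */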
    /* load reverse permute mask for big endian
       uint128 = 0x0c0d0e0f08090a0b0405060700010203
    */
    lis T2, perm_const2@highest
    lis T1, perm_const1@highest
    lis T3, save_permute_12@highest
    lis T4, save_permute_11@highest

    ori T2, T2, perm_const2@higher
    ori T1, T1, perm_const1@higher
    ori T3, T3, save_permute_12@higher
    ori T4, T4, save_permute_11@higher

    rldicr T2, T2, 32, 31
    rldicr T1, T1, 32, 31
    rldicr T3, T3, 32, 31
    rldicr T4, T4, 32, 31

    oris T2, T2, perm_const2@h
    oris T1, T1, perm_const1@h
    oris T3, T3, save_permute_12@h
    oris T4, T4, save_permute_11@h

    ori T2, T2, perm_const2@l
    ori T1, T1, perm_const1@l
    ori T3, T3, save_permute_12@l
    ori T4, T4, save_permute_11@l
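    /* each 64-bit .equ constant is materialized piecewise: lis/ori build
       the upper 32 bits, rldicr shifts them into the high half, and
       oris/ori fill in the lower 32 bits. */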
    li r0, 0
    li PRE, 512

#if defined(CC) || defined(CR) || defined(RC) || defined(RR)
    /* negate for these cases, as we will use addition with -1*(a+b) */
    xvnegsp alpha_r, alpha_r
    xvnegsp alpha_i, alpha_i
#endif
    mtvsrdd permute_mask, T2, T1
    mtvsrdd save_permute_1, T3, T4
    /* the mask is a reverse permute, so we have to turn it into an inner permute */
    xxpermdi permute_mask, permute_mask, permute_mask, 2
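    /* xxpermdi with control 2 swaps the two doublewords of the mask,
       turning the reverse permute into the inner (pairwise) form. */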
#include "cgemm_logic_power9.S" | |||||
.L999: | |||||
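    /* epilogue: restore every callee-saved FPR, GPR and VSR saved above,
       reload the link register from FLINK_SAVE, pop the frame and return. */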
    lfd f14, 0(SP)
    lfd f15, 8(SP)
    lfd f16, 16(SP)
    lfd f17, 24(SP)
    lfd f18, 32(SP)
    lfd f19, 40(SP)
    lfd f20, 48(SP)
    lfd f21, 56(SP)
    lfd f22, 64(SP)
    lfd f23, 72(SP)
    lfd f24, 80(SP)
    lfd f25, 88(SP)
    lfd f26, 96(SP)
    lfd f27, 104(SP)
    lfd f28, 112(SP)
    lfd f29, 120(SP)
    lfd f30, 128(SP)
    lfd f31, 136(SP)
    ld r31, 144(SP)
    ld r30, 152(SP)
    ld r29, 160(SP)
    ld r28, 168(SP)
    ld r27, 176(SP)
    ld r26, 184(SP)
    ld r25, 192(SP)
    ld r24, 200(SP)
    ld r23, 208(SP)
    ld r22, 216(SP)
    ld r21, 224(SP)
    ld r20, 232(SP)
    ld r19, 240(SP)
    ld r18, 248(SP)
    ld r17, 256(SP)
    ld r16, 264(SP)
    ld r15, 272(SP)
    ld r14, 280(SP)
    ld r0, FLINK_SAVE(SP)
    lxv vs52, 288(SP)
    lxv vs53, 304(SP)
    lxv vs54, 320(SP)
    lxv vs55, 336(SP)
    lxv vs56, 352(SP)
    lxv vs57, 368(SP)
    lxv vs58, 384(SP)
    lxv vs59, 400(SP)
    mtlr r0
    lxv vs60, 416(SP)
    lxv vs61, 432(SP)
    lxv vs62, 448(SP)
    lxv vs63, 464(SP)
    addi SP, SP, STACKSIZE
    blr

EPILOGUE
#endif
@@ -3,89 +3,89 @@ b L8
    MY_ALIGN
LSGEMM_L8x16_LMAIN_SUB:
    LOAD8x16_0
    mtctr L
    LOAD8x16_2
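    /* LOAD8x16_2 preloads two k-iterations ahead of the CTR loop, priming
       the two-stage software pipeline that KERNEL8x16_2 keeps full. */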
    MY_ALIGN
LSGEMM_L8x16_LOOP:
    KERNEL8x16_I1_L4_2 64,32, 0,0
    KERNEL8x16_I1_L4_2 64,32, 1,0
    KERNEL8x16_I1_L4_2 64,32, 2,0
    KERNEL8x16_I1_L4_2 64,32, 3,0
    KERNEL8x16_I1_L4_2 64,32, 4,0
    KERNEL8x16_I1_L4_2 64,32, 5,0
    KERNEL8x16_I1_L4_2 64,32, 6,0
    KERNEL8x16_I1_L4_2 64,32, 7,0
    KERNEL8x16_I1_L4_2 64,32, 8,0
    KERNEL8x16_I1_L4_2 64,32, 9,0
    KERNEL8x16_I1_L4_2 64,32, 10,0
    KERNEL8x16_I1_L4_2 64,32, 11,0
    KERNEL8x16_I1_L4_2 64,32, 12,0
    KERNEL8x16_I1_L4_2 64,32, 13,0
    KERNEL8x16_I1_L4_2 64,32, 14,0
    KERNEL8x16_I1_L4_2 64,32, 15,0
    KERNEL8x16_I1_L4_2 64,32, 16,0
    KERNEL8x16_I1_L4_2 64,32, 17,0
    KERNEL8x16_I1_L4_2 64,32, 18,0
    KERNEL8x16_I1_L4_2 64,32, 19,0
    KERNEL8x16_I1_L4_2 64,32, 20,0
    KERNEL8x16_I1_L4_2 64,32, 21,0
    KERNEL8x16_I1_L4_2 64,32, 22,0
    KERNEL8x16_I1_L4_2 64,32, 23,0
    KERNEL8x16_I1_L4_2 64,32, 24,0
    KERNEL8x16_I1_L4_2 64,32, 25,0
    KERNEL8x16_I1_L4_2 64,32, 26,0
    KERNEL8x16_I1_L4_2 64,32, 27,0
    KERNEL8x16_I1_L4_2 64,32, 28,0
    KERNEL8x16_I1_L4_2 64,32, 29,0
    KERNEL8x16_I1_L4_2 64,32, 30,0
    KERNEL8x16_I1_L4_2 64,32, 31,1
    KERNEL8x16_L2 128,64, 0,0
LSGEMM_L8x16_K128:
    KERNEL8x16_L2 128,64, 1,0
    KERNEL8x16_I1_L4_2 128,64, 1,0
    KERNEL8x16_I1_L4_2 128,64, 2,0
    KERNEL8x16_I1_L4_2 128,64, 3,0
    KERNEL8x16_I1_L4_2 128,64, 4,0
    KERNEL8x16_I1_L4_2 128,64, 5,0
    KERNEL8x16_I1_L4_2 128,64, 6,0
    KERNEL8x16_I1_L4_2 128,64, 7,0
    KERNEL8x16_I1_L4_2 128,64, 8,0
    KERNEL8x16_I1_L4_2 128,64, 9,0
    KERNEL8x16_I1_L4_2 128,64, 10,0
    KERNEL8x16_I1_L4_2 128,64, 11,0
    KERNEL8x16_I1_L4_2 128,64, 12,0
    KERNEL8x16_I1_L4_2 128,64, 13,0
    KERNEL8x16_I1_L4_2 128,64, 14,0
    KERNEL8x16_I1_L4_2 128,64, 15,0
    KERNEL8x16_I1_L4_2 128,64, 16,0
    KERNEL8x16_I1_L4_2 128,64, 17,0
    KERNEL8x16_I1_L4_2 128,64, 18,0
    KERNEL8x16_I1_L4_2 128,64, 19,0
    KERNEL8x16_I1_L4_2 128,64, 20,0
    KERNEL8x16_I1_L4_2 128,64, 21,0
    KERNEL8x16_I1_L4_2 128,64, 22,0
    KERNEL8x16_I1_L4_2 128,64, 23,0
    KERNEL8x16_I1_L4_2 128,64, 24,0
    KERNEL8x16_I1_L4_2 128,64, 25,0
    KERNEL8x16_I1_L4_2 128,64, 26,0
    KERNEL8x16_I1_L4_2 128,64, 27,0
    KERNEL8x16_I1_L4_2 128,64, 28,0
    KERNEL8x16_I1_L4_2 128,64, 29,0
    KERNEL8x16_I1_L4_2 128,64, 30,0
    KERNEL8x16_I1_L4_2 128,64, 31,1
    bdnz LSGEMM_L8x16_LOOP
    MY_ALIGN
LSGEMM_L8x16_LOOP_END:
    END8x16 0, AO, BO, 64, 32
    END8x16_2
    blr
    MY_ALIGN
LSGEMM_L8x16_L64_SUB:
    LOAD8x16_0
    KERNEL8x16_I1_L4_2 64,32, 0,0
    KERNEL8x16_I1_L4_2 64,32, 1,0
    KERNEL8x16_I1_L4_2 64,32, 2,0
    KERNEL8x16_I1_L4_2 64,32, 3,0
    KERNEL8x16_I1_L4_2 64,32, 4,0
    KERNEL8x16_I1_L4_2 64,32, 5,0
    KERNEL8x16_I1_L4_2 64,32, 6,0
    KERNEL8x16_I1_L4_2 64,32, 7,0
    KERNEL8x16_I1_L4_2 64,32, 8,0
    KERNEL8x16_I1_L4_2 64,32, 9,0
    KERNEL8x16_I1_L4_2 64,32, 10,0
    KERNEL8x16_I1_L4_2 64,32, 11,0
    KERNEL8x16_I1_L4_2 64,32, 12,0
    KERNEL8x16_I1_L4_2 64,32, 13,0
    KERNEL8x16_I1_L4_2 64,32, 14,0
    KERNEL8x16_I1_L4_3 64,32, 15,1
    LOAD8x16_2
    KERNEL8x16_I1_L4_2 128,64, 0,0
    KERNEL8x16_I1_L4_2 128,64, 1,0
    KERNEL8x16_I1_L4_2 128,64, 2,0
    KERNEL8x16_I1_L4_2 128,64, 3,0
    KERNEL8x16_I1_L4_2 128,64, 4,0
    KERNEL8x16_I1_L4_2 128,64, 5,0
    KERNEL8x16_I1_L4_2 128,64, 6,0
    KERNEL8x16_I1_L4_2 128,64, 7,0
    KERNEL8x16_I1_L4_2 128,64, 8,0
    KERNEL8x16_I1_L4_2 128,64, 9,0
    KERNEL8x16_I1_L4_2 128,64, 10,0
    KERNEL8x16_I1_L4_2 128,64, 11,0
    KERNEL8x16_I1_L4_2 128,64, 12,0
    KERNEL8x16_I1_L4_2 128,64, 13,0
    KERNEL8x16_I1_L4_2 128,64, 14,0
    KERNEL8x16_I1_L4_3 128,64, 15,1
    blr
LSGEMM_L8x16_L32_SUB:
    LOAD8x16_0
    KERNEL8x16_I1_L4_2 64,32, 0,0
    KERNEL8x16_I1_L4_2 64,32, 1,0
    KERNEL8x16_I1_L4_2 64,32, 2,0
    KERNEL8x16_I1_L4_2 64,32, 3,0
    KERNEL8x16_I1_L4_2 64,32, 4,0
    KERNEL8x16_I1_L4_2 64,32, 5,0
    KERNEL8x16_I1_L4_2 64,32, 6,0
    KERNEL8x16_I1_L4_3 64,32, 7,1
    LOAD8x16_2
    KERNEL8x16_I1_L4_2 128,64, 0,0
    KERNEL8x16_I1_L4_2 128,64, 1,0
    KERNEL8x16_I1_L4_2 128,64, 2,0
    KERNEL8x16_I1_L4_2 128,64, 3,0
    KERNEL8x16_I1_L4_2 128,64, 4,0
    KERNEL8x16_I1_L4_2 128,64, 5,0
    KERNEL8x16_I1_L4_2 128,64, 6,0
    KERNEL8x16_I1_L4_3 128,64, 7,1
    blr
LSGEMM_L8x16_L16_SUB:
    LOAD8x16_0
    KERNEL8x16_I1_L4_2 64,32, 0,0
    KERNEL8x16_I1_L4_2 64,32, 1,0
    KERNEL8x16_I1_L4_2 64,32, 2,0
    KERNEL8x16_I1_L4_3 64,32, 3,1
    LOAD8x16_2
    KERNEL8x16_I1_L4_2 128,64, 0,0
    KERNEL8x16_I1_L4_2 128,64, 1,0
    KERNEL8x16_I1_L4_2 128,64, 2,0
    KERNEL8x16_I1_L4_3 128,64, 3,1
    blr
L8:
@@ -127,15 +127,16 @@ LSGEMM_L8x16_BEGIN:
#if defined(TRMMKERNEL)
    REFRESH_TEMP_BK T11,K,TEMP_REG,16,8
    mr T12, T11
    addi T12,T12, -1
    srawi. L, T12, 7 /* (T11-1) / 128 */
    addi T12,T12, -2
    srawi. L, T12, 7 /* (T11-2) / 128 */
#else
    mr T12, K
    addi T12,T12, -1
    srawi. L, T12, 7 /* (K-1) / 128 */
    addi T12,T12, -2
    srawi. L, T12, 7 /* (K-2) / 128 */
#endif
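    /* the -2 accounts for the two k-iterations consumed by the LOAD8x16_2
       preload; srawi by 7 divides the rest by 128, the loop's unroll factor. */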
    ZERO8x16
    mtctr L
    ble LSGEMM_L8x16_SUB0
    bl LSGEMM_L8x16_LMAIN_SUB
    andi. L, T12, 127
@@ -148,15 +149,33 @@ LSGEMM_L8x16_SUB0:
    cmpwi T11,128
#else
    andi. L, K, 255
    cmpwi K,129
#endif
    li T10,1
    bne CMP8x16_128K
    addi BO,BO,-32
    addi AO,AO,-64
    LOAD8x16 64,32
    END8x16_WITHOUT_ADD
    LOAD8x16_2O AO,BO, 128, 64
    mtctr T10
    bl LSGEMM_L8x16_K128
    b LSGEMM_L8x16_SAVE
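    /* K==129 special case: rewind AO/BO, fold the odd iteration into
       LOAD8x16 + END8x16_WITHOUT_ADD, then run the unrolled 128-iteration
       body exactly once (mtctr T10 with T10=1). */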
CMP8x16_128K:
/*----------------------------------------*/
#if defined(TRMMKERNEL)
    cmpwi T11,128
#else
    cmpwi K,128
#endif
    bne LSGEMM_L8x16_SUB2
    MY_ALIGN
LSGEMM_L8x16_SUB2_128:
    bl LSGEMM_L8x16_L64_SUB
    bl LSGEMM_L8x16_L64_SUB
    b LSGEMM_L8x16_SAVE
#endif
    bne LSGEMM_L8x16_SUB2
    MY_ALIGN
    mtctr T10
    addi BO,BO,-64
    addi AO,AO,-128
    LOAD8x16_2O AO,BO, 128,64
    bl LSGEMM_L8x16_K128
    b LSGEMM_L8x16_SAVE
    MY_ALIGN
LSGEMM_L8x16_SUB2:
    andi. T10,L,64
@@ -176,21 +195,21 @@ LSGEMM_L8x16_SUB2_16:
LSGEMM_L8x16_SUB2_8:
    andi. T10,L, 8
    ble LSGEMM_L8x16_SUB2_4
    LOAD8x16_0
    KERNEL8x16_I1_L4_2 64,32, 0,0
    KERNEL8x16_I1_L4_3 64,32, 1,1
    LOAD8x16_2
    KERNEL8x16_I1_L4_2 128,64, 0,0
    KERNEL8x16_I1_L4_3 128,64, 1,1
    MY_ALIGN
LSGEMM_L8x16_SUB2_4:
    andi. T10,L, 4
    ble LSGEMM_L8x16_SUB2_2
    LOAD8x16_0
    KERNEL8x16_I1_L4_3 64,32, 0,1
    LOAD8x16_2
    KERNEL8x16_I1_L4_3 128,64, 0,1
    MY_ALIGN
LSGEMM_L8x16_SUB2_2:
    andi. T10,L, 2
    ble LSGEMM_L8x16_SUB2_1
    LOAD8x16_0
    KERNEL8x16_I1_L2_3 64,32, 0,1
    LOAD8x16_2
    KERNEL8x16_E2 128,64, 0,1
    MY_ALIGN
LSGEMM_L8x16_SUB2_1:
    andi. T10,L, 1
@@ -38,13 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* Macros for N=8 and M=16
**********************************************************************************************/
.macro LOAD8x16_1
    LOAD8x16 1
.endm
.macro LOAD8x16_0
    LOAD8x16 0
.endm
.macro KERNEL8x16_L1_L4 Index,IsLast
    KERNEL8x16_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0
@@ -61,10 +55,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL8x16_I1_L4_3 OffsetA,OffsetB, Index,IsLast
    KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1
.endm
.macro KERNEL8x16_I1_L2_3 OffsetA,OffsetB, Index,IsLast
    KERNEL8x16_L1_L2_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1
.endm
.macro KERNEL8x16_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast
    KERNEL8x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0
.endm
@@ -108,61 +99,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
    xxlxor vs63, vs63, vs63
.endm
.macro LOAD8x16 Zero
.macro LOAD8x16 OffsetA,OffsetB
    lxv vs24, 0(BO)
    lxv vs28, 16(BO)
    lxv vs24, (\OffsetB+0)(BO)
    lxv vs28, (\OffsetB+16)(BO)
    xxperm vs26, vs24, permute_mask
    xxperm vs30, vs28, permute_mask
    lxv vs0, 0(AO)
    lxv vs1, 16(AO)
    lxv vs0, (\OffsetA+0)(AO)
    lxv vs1, (\OffsetA+16)(AO)
    xxpermdi vs25, vs24, vs24,2
    xxpermdi vs29, vs28, vs28,2
    lxv vs2, 32(AO)
    lxv vs3, 48(AO)
    lxv vs2, (\OffsetA+32)(AO)
    lxv vs3, (\OffsetA+48)(AO)
    xxpermdi vs27, vs26, vs26,2
    xxpermdi vs31, vs30, vs30,2
    .if \Zero==1
    xxlxor vs32, vs32, vs32
    xxlxor vs33, vs33, vs33
    xxlxor vs34, vs34, vs34
    xxlxor vs35, vs35, vs35
    xxlxor vs36, vs36, vs36
    xxlxor vs37, vs37, vs37
    xxlxor vs38, vs38, vs38
    xxlxor vs39, vs39, vs39
    xxlxor vs40, vs40, vs40
    xxlxor vs41, vs41, vs41
    xxlxor vs42, vs42, vs42
    xxlxor vs43, vs43, vs43
    xxlxor vs44, vs44, vs44
    xxlxor vs45, vs45, vs45
    xxlxor vs46, vs46, vs46
    xxlxor vs47, vs47, vs47
    xxlxor vs48, vs48, vs48
    xxlxor vs49, vs49, vs49
    xxlxor vs50, vs50, vs50
    xxlxor vs51, vs51, vs51
    xxlxor vs52, vs52, vs52
    xxlxor vs53, vs53, vs53
    xxlxor vs54, vs54, vs54
    xxlxor vs55, vs55, vs55
    xxlxor vs56, vs56, vs56
    xxlxor vs57, vs57, vs57
    xxlxor vs58, vs58, vs58
    xxlxor vs59, vs59, vs59
    xxlxor vs60, vs60, vs60
    xxlxor vs61, vs61, vs61
    xxlxor vs62, vs62, vs62
    xxlxor vs63, vs63, vs63
    .endif
.endm
.macro END8x16_NORMAL
    END8x16 0, AO, BO, 64,32
.endm
.macro END8x16_WITHOUT_ADD
    END8x16 0, AO,BO,0,0
.endm
.macro END8x16 First, AREG, BREG, OffsetA, OffsetB
    .if \OffsetB != 0
@@ -258,145 +219,202 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL8x16_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
    KERNEL8x16_L1_L2_I \AREG,\BREG, \OffsetA,\OffsetB, (\Index*2),0 ,0
    KERNEL8x16_L1_L2_I \AREG,\BREG,\OffsetA,\OffsetB, (\Index*2+1),\IsLast ,\Complete
    KERNEL8x16_2 \AREG,\BREG, \OffsetA,\OffsetB, (\Index*2),0 ,0
    KERNEL8x16_2 \AREG,\BREG,\OffsetA,\OffsetB, (\Index*2+1),\IsLast ,\Complete
.endm
.macro KERNEL8x16 First
    LOAD8x16 0
    LOAD8x16 0,0
    END8x16 \First, AO, BO, 64,32
.endm
.macro KERNEL8x16_L1_L2_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
    lxv vs8, DISP16(\Index,\OffsetB)(\BREG)
    lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG)
.macro LOAD8x16_2
    LOAD8x16_2O AO,BO, 0,0
.endm
    xvmaddasp vs32, vs0,vs24
    xvmaddasp vs36, vs0,vs25
    lxv vs4, DISP32(\Index,0+\OffsetA)(\AREG)
    lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG)
    xxperm vs10, vs8, permute_mask
    xxperm vs14, vs12, permute_mask
    xvmaddasp vs40, vs0,vs26
    xvmaddasp vs44, vs0,vs27
    lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG)
    lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG)
    xvmaddasp vs48, vs0,vs28
    xvmaddasp vs52, vs0,vs29
.macro LOAD8x16_2O AREG,BREG, OffsetA,OffsetB
    lxv vs8, (\OffsetB)(\BREG)
    lxv vs12, (16+\OffsetB)(\BREG)
    lxv vs24, (32+\OffsetB)(\BREG)
    lxv vs28, (32+16+\OffsetB)(\BREG)
    lxv vs4, (0+\OffsetA)(\AREG)
    lxv vs5, (16+\OffsetA)(\AREG)
    xxperm vs10, vs8, permute_mask
    xxperm vs14, vs12, permute_mask
    lxv vs6, (32+\OffsetA)(\AREG)
    lxv vs7, (48+\OffsetA)(\AREG)
    xxpermdi vs9, vs8, vs8,2
    xxpermdi vs13, vs12, vs12,2
    lxv vs0, (64+\OffsetA)(\AREG)
    lxv vs1, (64+16+\OffsetA)(\AREG)
    xxpermdi vs11, vs10, vs10,2
    xxpermdi vs15, vs14, vs14,2
    lxv vs2, (64+32+\OffsetA)(\AREG)
    lxv vs3, (64+48+\OffsetA)(\AREG)
    xxpermdi vs9, vs8, vs8,2
    xxpermdi vs13, vs12, vs12,2
    xxperm vs26, vs24, permute_mask
    xxperm vs30, vs28, permute_mask
    xxpermdi vs25, vs24, vs24,2
    xxpermdi vs29, vs28, vs28,2
    xxpermdi vs27, vs26, vs26,2
    xxpermdi vs31, vs30, vs30,2
.endm
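/* LOAD8x16_2O preloads two k-iterations from explicit offsets: vs4-vs7 and
   vs8/vs12 carry the first iteration's A and B data, vs0-vs3 and vs24/vs28
   the second, each B vector accompanied by its xxperm/xxpermdi rotations. */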
    xvmaddasp vs56, vs0,vs30
    xvmaddasp vs60, vs0,vs31
.macro END8x16_2
    /* for LOAD8x16_2 the offsets will be 128 and 64 */
    KERNEL8x16_2 AO,BO, 128,64,0 ,1,1
.endm
    xxpermdi vs11, vs10, vs10,2
    xxpermdi vs15, vs14, vs14,2
.macro KERNEL8x16_E2 OffsetA,OffsetB, Index,IsLast
    KERNEL8x16_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
.endm
    xvmaddasp vs33, vs1,vs24
    xvmaddasp vs37, vs1,vs25
.macro KERNEL8x16_L2 OffsetA,OffsetB, Index,IsLast
    KERNEL8x16_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
.endm
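/* KERNEL8x16_L2 is the steady-state pipelined step (Complete=0 keeps
   reloading ahead); KERNEL8x16_E2 is the drain step (Complete=1 skips
   the reloads so the pipeline can finish). */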
    xvmaddasp vs41, vs1,vs26
    xvmaddasp vs45, vs1,vs27
    xvmaddasp vs49, vs1,vs28
    xvmaddasp vs53, vs1,vs29
    xvmaddasp vs57, vs1,vs30
    xvmaddasp vs61, vs1,vs31
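/* KERNEL8x16_2 performs one double k-iteration: the FMAs for the current
   data are interleaved with the loads and permutes of the next, and the
   A/B pointer bumps happen on the IsLast step. */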
.macro KERNEL8x16_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
    xvmaddasp vs32, vs4,vs8
    xvmaddasp vs33, vs5,vs8
    xvmaddasp vs48, vs4,vs12
    xvmaddasp vs49, vs5,vs12
    xvmaddasp vs40, vs4,vs10
    xvmaddasp vs41, vs5,vs10
    xvmaddasp vs56, vs4,vs14
    xvmaddasp vs57, vs5,vs14
    xvmaddasp vs36, vs4,vs9
    xvmaddasp vs37, vs5,vs9
    xvmaddasp vs52, vs4,vs13
    xvmaddasp vs53, vs5,vs13
    xvmaddasp vs44, vs4,vs11
    xvmaddasp vs45, vs5,vs11
    xvmaddasp vs60, vs4,vs15
    xvmaddasp vs61, vs5,vs15
    .if \Complete==0
    lxv vs4, DISP32(\Index,0+\OffsetA)(\AREG)
    lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG)
    .endif
    xvmaddasp vs34, vs6,vs8
    xvmaddasp vs35, vs7,vs8
    xvmaddasp vs50, vs6,vs12
    xvmaddasp vs51, vs7,vs12
    .if \Complete==0
    lxv vs8, DISP16(\Index,\OffsetB)(\BREG)
    lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG)
    .endif
    xvmaddasp vs42, vs6,vs10
    xvmaddasp vs43, vs7,vs10
    xvmaddasp vs58, vs6,vs14
    xvmaddasp vs59, vs7,vs14
    .if \Complete==0
    xxperm vs10, vs8, permute_mask
    xxperm vs14, vs12, permute_mask
    .endif
    xvmaddasp vs38, vs6,vs9
    xvmaddasp vs39, vs7,vs9
    xvmaddasp vs54, vs6,vs13
    xvmaddasp vs55, vs7,vs13
    .if \Complete==0
    lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG)
    lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG)
    xxpermdi vs9, vs8, vs8,2
    xxpermdi vs13, vs12, vs12,2
    .endif
    xvmaddasp vs46, vs6,vs11
    xvmaddasp vs47, vs7,vs11
    xvmaddasp vs62, vs6,vs15
    xvmaddasp vs63, vs7,vs15
    .if \Complete==0
    xxpermdi vs11, vs10, vs10,2
    xxpermdi vs15, vs14, vs14,2
    .endif
    .if \Complete==0
    lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG)
    lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG)
    .endif
    xvmaddasp vs32, vs0,vs24
    xvmaddasp vs33, vs1,vs24
    xvmaddasp vs48, vs0,vs28
    xvmaddasp vs49, vs1,vs28
    xvmaddasp vs40, vs0,vs26
    xvmaddasp vs41, vs1,vs26
    xvmaddasp vs56, vs0,vs30
    xvmaddasp vs57, vs1,vs30
    xvmaddasp vs36, vs0,vs25
    xvmaddasp vs37, vs1,vs25
    xvmaddasp vs52, vs0,vs29
    xvmaddasp vs53, vs1,vs29
    xvmaddasp vs44, vs0,vs27
    xvmaddasp vs45, vs1,vs27
    xvmaddasp vs60, vs0,vs31
    xvmaddasp vs61, vs1,vs31
    .if \Complete==0
    lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG)
    lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG)
    .endif
    xvmaddasp vs34, vs2,vs24
    xvmaddasp vs38, vs2,vs25
    xvmaddasp vs42, vs2,vs26
    xvmaddasp vs46, vs2,vs27
    xvmaddasp vs50, vs2,vs28
    xvmaddasp vs54, vs2,vs29
    xvmaddasp vs58, vs2,vs30
    xvmaddasp vs62, vs2,vs31
    xvmaddasp vs35, vs3,vs24
    xvmaddasp vs39, vs3,vs25
    xvmaddasp vs43, vs3,vs26
    xvmaddasp vs47, vs3,vs27
    xvmaddasp vs51, vs3,vs28
    xvmaddasp vs55, vs3,vs29
    xvmaddasp vs59, vs3,vs30
    xvmaddasp vs63, vs3,vs31
    .if \Complete==0
    lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG)
    lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG)
    xvmaddasp vs34, vs2,vs24
    xvmaddasp vs35, vs3,vs24
    xvmaddasp vs50, vs2,vs28
    xvmaddasp vs51, vs3,vs28
    .if \Complete==0
    lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG)
    lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG)
    .endif
    xvmaddasp vs42, vs2,vs26
    xvmaddasp vs43, vs3,vs26
    xvmaddasp vs58, vs2,vs30
    xvmaddasp vs59, vs3,vs30
    .if \Complete==0
    xxperm vs26, vs24, permute_mask
    xxperm vs30, vs28, permute_mask
    .endif
    xvmaddasp vs38, vs2,vs25
    xvmaddasp vs39, vs3,vs25
    xvmaddasp vs54, vs2,vs29
    xvmaddasp vs55, vs3,vs29
    .if \Complete==0
    xxpermdi vs25, vs24, vs24,2
    xxpermdi vs29, vs28, vs28,2
    .endif
    xvmaddasp vs46, vs2,vs27
    xvmaddasp vs47, vs3,vs27
    xvmaddasp vs62, vs2,vs31
    xvmaddasp vs63, vs3,vs31
    .if \Complete==0
    xxpermdi vs27, vs26, vs26,2
    xxpermdi vs31, vs30, vs30,2
    .endif
    xvmaddasp vs32, vs4,vs8
    xvmaddasp vs36, vs4,vs9
    .if \Complete==0
    lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG)
    lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG)
    lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG)
    lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG)
    .endif
    .if \IsLast==1
    .if \Complete==1
    addi \AREG, \AREG, DISP32(\Index,64+\OffsetA)
    addi \BREG, \BREG, DISP16(\Index,32+\OffsetB)
    addi \BREG, \BREG, DISP16(\Index,\OffsetB)
    addi \AREG, \AREG, DISP32(\Index,\OffsetA)
    .else
    addi \AREG, \AREG, DISP32(\Index,128)
    addi \BREG, \BREG, DISP16(\Index,64)
    addi \AREG, \AREG, DISP32(\Index,128)
    .endif
    .endif
    xvmaddasp vs40, vs4,vs10
    xvmaddasp vs44, vs4,vs11
    .if \Complete==0
    xxperm vs26, vs24, permute_mask
    xxperm vs30, vs28, permute_mask
    .endif
    xvmaddasp vs48, vs4,vs12
    xvmaddasp vs52, vs4,vs13
    .if \Complete==0
    xxpermdi vs25, vs24, vs24,2
    xxpermdi vs29, vs28, vs28,2
    .endif
    xvmaddasp vs56, vs4,vs14
    xvmaddasp vs60, vs4,vs15
    .if \Complete==0
    xxpermdi vs27, vs26, vs26,2
    xxpermdi vs31, vs30, vs30,2
    .endif
    xvmaddasp vs33, vs5,vs8
    xvmaddasp vs37, vs5,vs9
    xvmaddasp vs41, vs5,vs10
    xvmaddasp vs45, vs5,vs11
    xvmaddasp vs49, vs5,vs12
    xvmaddasp vs53, vs5,vs13
    xvmaddasp vs57, vs5,vs14
    xvmaddasp vs61, vs5,vs15
    xvmaddasp vs34, vs6,vs8
    xvmaddasp vs38, vs6,vs9
    xvmaddasp vs42, vs6,vs10
    xvmaddasp vs46, vs6,vs11
    xvmaddasp vs50, vs6,vs12
    xvmaddasp vs54, vs6,vs13
    xvmaddasp vs58, vs6,vs14
    xvmaddasp vs62, vs6,vs15
    xvmaddasp vs35, vs7,vs8
    xvmaddasp vs39, vs7,vs9
    xvmaddasp vs43, vs7,vs10
    xvmaddasp vs47, vs7,vs11
    xvmaddasp vs51, vs7,vs12
    xvmaddasp vs55, vs7,vs13
    xvmaddasp vs59, vs7,vs14
    xvmaddasp vs63, vs7,vs15
.endm
@@ -1353,7 +1353,7 @@ ZGEMM_L1:
ZGEMM_L1_BEGIN:
/*----------------------------------------*/
    mr CO, C
    slwi T1, LDC, 1
    add T2,C,LDC
    mr AO, A
    add C, C, T1
@@ -2250,12 +2250,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define SGEMM_DEFAULT_P 832
#define DGEMM_DEFAULT_P 128
#define CGEMM_DEFAULT_P 640
#define CGEMM_DEFAULT_P 512
#define ZGEMM_DEFAULT_P 256
#define SGEMM_DEFAULT_Q 1025
#define SGEMM_DEFAULT_Q 1026
#define DGEMM_DEFAULT_Q 384
#define CGEMM_DEFAULT_Q 640
#define CGEMM_DEFAULT_Q 1026
#define ZGEMM_DEFAULT_Q 1026
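/* blocking retuned for the new kernels: a smaller CGEMM P paired with a
   much deeper Q, presumably to match the power9 kernel's packing shape */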
#define SYMV_P 8