Power8 blas3 copy-pack routinestags/v0.3.0
@@ -110,7 +110,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#include "cgemm_tcopy_macros_8_power8.S" | #include "cgemm_tcopy_macros_8_power8.S" | ||||
#define STACKSIZE 576 | |||||
#define STACKSIZE 144 | |||||
PROLOGUE | PROLOGUE | ||||
@@ -119,49 +119,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
addi SP, SP, -STACKSIZE | addi SP, SP, -STACKSIZE | ||||
li r0, 0 | li r0, 0 | ||||
std r31, 144(SP) | |||||
std r30, 152(SP) | |||||
std r29, 160(SP) | |||||
std r28, 168(SP) | |||||
std r27, 176(SP) | |||||
std r26, 184(SP) | |||||
std r25, 192(SP) | |||||
std r24, 200(SP) | |||||
std r23, 208(SP) | |||||
std r22, 216(SP) | |||||
std r21, 224(SP) | |||||
std r20, 232(SP) | |||||
std r19, 240(SP) | |||||
std r18, 248(SP) | |||||
std r17, 256(SP) | |||||
std r16, 264(SP) | |||||
std r15, 272(SP) | |||||
std r14, 280(SP) | |||||
addi r11, SP, 288 | |||||
stvx v20, r11, r0 | |||||
addi r11, r11, 16 | |||||
stvx v21, r11, r0 | |||||
addi r11, r11, 16 | |||||
stvx v22, r11, r0 | |||||
addi r11, r11, 16 | |||||
stvx v23, r11, r0 | |||||
addi r11, r11, 16 | |||||
stvx v24, r11, r0 | |||||
addi r11, r11, 16 | |||||
stvx v25, r11, r0 | |||||
addi r11, r11, 16 | |||||
stvx v26, r11, r0 | |||||
addi r11, r11, 16 | |||||
stvx v27, r11, r0 | |||||
addi r11, r11, 16 | |||||
stvx v28, r11, r0 | |||||
addi r11, r11, 16 | |||||
stvx v29, r11, r0 | |||||
addi r11, r11, 16 | |||||
stvx v30, r11, r0 | |||||
addi r11, r11, 16 | |||||
stvx v31, r11, r0 | |||||
li r11, 0 | |||||
std r14, 0(SP) | |||||
std r15, 8(SP) | |||||
std r16, 16(SP) | |||||
std r17, 24(SP) | |||||
std r18, 32(SP) | |||||
std r19, 40(SP) | |||||
std r20, 48(SP) | |||||
std r21, 56(SP) | |||||
std r22, 64(SP) | |||||
std r23, 72(SP) | |||||
std r24, 80(SP) | |||||
std r25, 88(SP) | |||||
std r26, 96(SP) | |||||
std r27, 104(SP) | |||||
std r28, 112(SP) | |||||
std r29, 120(SP) | |||||
std r30, 128(SP) | |||||
std r31, 136(SP) | |||||
cmpwi cr0, M, 0 | cmpwi cr0, M, 0 | ||||
ble- L999 | ble- L999 | ||||
@@ -203,51 +178,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
L999: | L999: | ||||
li r3, 0 | |||||
ld r31, 144(SP) | |||||
ld r30, 152(SP) | |||||
ld r29, 160(SP) | |||||
ld r28, 168(SP) | |||||
ld r27, 176(SP) | |||||
ld r26, 184(SP) | |||||
ld r25, 192(SP) | |||||
ld r24, 200(SP) | |||||
ld r23, 208(SP) | |||||
ld r22, 216(SP) | |||||
ld r21, 224(SP) | |||||
ld r20, 232(SP) | |||||
ld r19, 240(SP) | |||||
ld r18, 248(SP) | |||||
ld r17, 256(SP) | |||||
ld r16, 264(SP) | |||||
ld r15, 272(SP) | |||||
ld r14, 280(SP) | |||||
addi r11, SP, 288 | |||||
lvx v20, r11, r3 | |||||
addi r11, r11, 16 | |||||
lvx v21, r11, r3 | |||||
addi r11, r11, 16 | |||||
lvx v22, r11, r3 | |||||
addi r11, r11, 16 | |||||
lvx v23, r11, r3 | |||||
addi r11, r11, 16 | |||||
lvx v24, r11, r3 | |||||
addi r11, r11, 16 | |||||
lvx v25, r11, r3 | |||||
addi r11, r11, 16 | |||||
lvx v26, r11, r3 | |||||
addi r11, r11, 16 | |||||
lvx v27, r11, r3 | |||||
addi r11, r11, 16 | |||||
lvx v28, r11, r3 | |||||
addi r11, r11, 16 | |||||
lvx v29, r11, r3 | |||||
addi r11, r11, 16 | |||||
lvx v30, r11, r3 | |||||
addi r11, r11, 16 | |||||
lvx v31, r11, r3 | |||||
li r11, 0 | |||||
ld r14, 0(SP) | |||||
ld r15, 8(SP) | |||||
ld r16, 16(SP) | |||||
ld r17, 24(SP) | |||||
ld r18, 32(SP) | |||||
ld r19, 40(SP) | |||||
ld r20, 48(SP) | |||||
ld r21, 56(SP) | |||||
ld r22, 64(SP) | |||||
ld r23, 72(SP) | |||||
ld r24, 80(SP) | |||||
ld r25, 88(SP) | |||||
ld r26, 96(SP) | |||||
ld r27, 104(SP) | |||||
ld r28, 112(SP) | |||||
ld r29, 120(SP) | |||||
ld r30, 128(SP) | |||||
ld r31, 136(SP) | |||||
addi SP, SP, STACKSIZE | addi SP, SP, STACKSIZE | ||||
blr | blr | ||||
@@ -109,80 +109,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#include "dgemm_ncopy_macros_4_power8.S" | #include "dgemm_ncopy_macros_4_power8.S" | ||||
#define STACKSIZE 384 | |||||
#define STACKSIZE 576 | |||||
#define STACKSIZE 144 | |||||
PROLOGUE | PROLOGUE | ||||
PROFCODE | PROFCODE | ||||
addi SP, SP, -STACKSIZE | addi SP, SP, -STACKSIZE | ||||
//addi SP, SP, -208 | |||||
li r0, 0 | li r0, 0 | ||||
stfd f14, 0(SP) | |||||
stfd f15, 8(SP) | |||||
stfd f16, 16(SP) | |||||
stfd f17, 24(SP) | |||||
stfd f18, 32(SP) | |||||
stfd f19, 40(SP) | |||||
stfd f20, 48(SP) | |||||
stfd f21, 56(SP) | |||||
stfd f22, 64(SP) | |||||
stfd f23, 72(SP) | |||||
stfd f24, 80(SP) | |||||
stfd f25, 88(SP) | |||||
stfd f26, 96(SP) | |||||
stfd f27, 104(SP) | |||||
stfd f28, 112(SP) | |||||
stfd f29, 120(SP) | |||||
stfd f30, 128(SP) | |||||
stfd f31, 136(SP) | |||||
std r31, 144(SP) | |||||
std r30, 152(SP) | |||||
std r29, 160(SP) | |||||
std r28, 168(SP) | |||||
std r27, 176(SP) | |||||
std r26, 184(SP) | |||||
std r25, 192(SP) | |||||
std r24, 200(SP) | |||||
std r23, 208(SP) | |||||
std r22, 216(SP) | |||||
std r21, 224(SP) | |||||
std r20, 232(SP) | |||||
std r19, 240(SP) | |||||
std r18, 248(SP) | |||||
std r17, 256(SP) | |||||
std r16, 264(SP) | |||||
std r15, 272(SP) | |||||
std r14, 280(SP) | |||||
std r14, 0(SP) | |||||
std r15, 8(SP) | |||||
std r16, 16(SP) | |||||
std r17, 24(SP) | |||||
std r18, 32(SP) | |||||
std r19, 40(SP) | |||||
std r20, 48(SP) | |||||
std r21, 56(SP) | |||||
std r22, 64(SP) | |||||
std r23, 72(SP) | |||||
std r24, 80(SP) | |||||
std r25, 88(SP) | |||||
std r26, 96(SP) | |||||
std r27, 104(SP) | |||||
std r28, 112(SP) | |||||
std r29, 120(SP) | |||||
std r30, 128(SP) | |||||
std r31, 136(SP) | |||||
addi r11,SP,288 | |||||
stvx v20, r11,r0 | |||||
addi r11,r11,16 | |||||
stvx v21, r11,r0 | |||||
addi r11,r11,16 | |||||
stvx v22, r11,r0 | |||||
addi r11,r11,16 | |||||
stvx v23, r11,r0 | |||||
addi r11,r11,16 | |||||
stvx v24, r11,r0 | |||||
addi r11,r11,16 | |||||
stvx v25, r11,r0 | |||||
addi r11,r11,16 | |||||
stvx v26, r11,r0 | |||||
addi r11,r11,16 | |||||
stvx v27, r11,r0 | |||||
addi r11,r11,16 | |||||
stvx v28, r11,r0 | |||||
addi r11,r11,16 | |||||
stvx v29, r11,r0 | |||||
addi r11,r11,16 | |||||
stvx v30, r11,r0 | |||||
addi r11,r11,16 | |||||
stvx v31, r11,r0 | |||||
li r11,0 | |||||
cmpwi cr0, M, 0 | cmpwi cr0, M, 0 | ||||
ble- L999 | ble- L999 | ||||
@@ -191,10 +146,8 @@ li r11,0 | |||||
slwi LDA, LDA, BASE_SHIFT | slwi LDA, LDA, BASE_SHIFT | ||||
//li PREA, 384 | |||||
//li PREB, 384 | |||||
li PREA, 576 | |||||
li PREB, 576 | |||||
li PREA, 384 | |||||
li PREB, 384 | |||||
li o8, 8 | li o8, 8 | ||||
@@ -210,70 +163,24 @@ li r11,0 | |||||
L999: | L999: | ||||
li r3, 0 | |||||
lfd f14, 0(SP) | |||||
lfd f15, 8(SP) | |||||
lfd f16, 16(SP) | |||||
lfd f17, 24(SP) | |||||
lfd f18, 32(SP) | |||||
lfd f19, 40(SP) | |||||
lfd f20, 48(SP) | |||||
lfd f21, 56(SP) | |||||
lfd f22, 64(SP) | |||||
lfd f23, 72(SP) | |||||
lfd f24, 80(SP) | |||||
lfd f25, 88(SP) | |||||
lfd f26, 96(SP) | |||||
lfd f27, 104(SP) | |||||
lfd f28, 112(SP) | |||||
lfd f29, 120(SP) | |||||
lfd f30, 128(SP) | |||||
lfd f31, 136(SP) | |||||
ld r31, 144(SP) | |||||
ld r30, 152(SP) | |||||
ld r29, 160(SP) | |||||
ld r28, 168(SP) | |||||
ld r27, 176(SP) | |||||
ld r26, 184(SP) | |||||
ld r25, 192(SP) | |||||
ld r24, 200(SP) | |||||
ld r23, 208(SP) | |||||
ld r22, 216(SP) | |||||
ld r21, 224(SP) | |||||
ld r20, 232(SP) | |||||
ld r19, 240(SP) | |||||
ld r18, 248(SP) | |||||
ld r17, 256(SP) | |||||
ld r16, 264(SP) | |||||
ld r15, 272(SP) | |||||
ld r14, 280(SP) | |||||
addi r11,SP,288 | |||||
lvx v20, r11,r3 | |||||
addi r11,r11,16 | |||||
lvx v21, r11,r3 | |||||
addi r11,r11,16 | |||||
lvx v22, r11,r3 | |||||
addi r11,r11,16 | |||||
lvx v23, r11,r3 | |||||
addi r11,r11,16 | |||||
lvx v24, r11,r3 | |||||
addi r11,r11,16 | |||||
lvx v25, r11,r3 | |||||
addi r11,r11,16 | |||||
lvx v26, r11,r3 | |||||
addi r11,r11,16 | |||||
lvx v27, r11,r3 | |||||
addi r11,r11,16 | |||||
lvx v28, r11,r3 | |||||
addi r11,r11,16 | |||||
lvx v29, r11,r3 | |||||
addi r11,r11,16 | |||||
lvx v30, r11,r3 | |||||
addi r11,r11,16 | |||||
lvx v31, r11,r3 | |||||
li r11,0 | |||||
ld r14, 0(SP) | |||||
ld r15, 8(SP) | |||||
ld r16, 16(SP) | |||||
ld r17, 24(SP) | |||||
ld r18, 32(SP) | |||||
ld r19, 40(SP) | |||||
ld r20, 48(SP) | |||||
ld r21, 56(SP) | |||||
ld r22, 64(SP) | |||||
ld r23, 72(SP) | |||||
ld r24, 80(SP) | |||||
ld r25, 88(SP) | |||||
ld r26, 96(SP) | |||||
ld r27, 104(SP) | |||||
ld r28, 112(SP) | |||||
ld r29, 120(SP) | |||||
ld r30, 128(SP) | |||||
ld r31, 136(SP) | |||||
addi SP, SP, STACKSIZE | addi SP, SP, STACKSIZE | ||||
//addi SP, SP, 208 | //addi SP, SP, 208 | ||||
@@ -41,94 +41,98 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
.macro COPY_4x16 | .macro COPY_4x16 | ||||
lxvd2x vs0, o0, A0 | lxvd2x vs0, o0, A0 | ||||
lxvd2x vs8, o0, A1 | |||||
lxvd2x vs24, o0, A3 | |||||
lxvd2x vs16, o0, A2 | |||||
lxvd2x vs1, o0, A1 | |||||
lxvd2x vs2, o0, A2 | |||||
lxvd2x vs3, o0, A3 | |||||
lxvd2x vs1, o16, A0 | |||||
lxvd2x vs9, o16, A1 | |||||
lxvd2x vs17, o16, A2 | |||||
lxvd2x vs25, o16, A3 | |||||
lxvd2x vs4, o16, A0 | |||||
lxvd2x vs5, o16, A1 | |||||
lxvd2x vs6, o16, A2 | |||||
lxvd2x vs7, o16, A3 | |||||
lxvd2x vs2, o32, A0 | |||||
lxvd2x vs10, o32, A1 | |||||
lxvd2x vs18, o32, A2 | |||||
lxvd2x vs26, o32, A3 | |||||
xxpermdi vs32, vs0, vs1, 0 | |||||
xxpermdi vs33, vs2, vs3, 0 | |||||
xxpermdi vs34, vs0, vs1, 3 | |||||
xxpermdi vs35, vs2, vs3, 3 | |||||
lxvd2x vs3, o48, A0 | |||||
lxvd2x vs11, o48, A1 | |||||
lxvd2x vs19, o48, A2 | |||||
lxvd2x vs27, o48, A3 | |||||
xxpermdi vs36, vs4, vs5, 0 | |||||
xxpermdi vs37, vs6, vs7, 0 | |||||
xxpermdi vs38, vs4, vs5, 3 | |||||
xxpermdi vs39, vs6, vs7, 3 | |||||
lxvd2x vs4, o64, A0 | |||||
lxvd2x vs12, o64, A1 | |||||
lxvd2x vs20, o64, A2 | |||||
lxvd2x vs28, o64, A3 | |||||
lxvd2x vs0, o32, A0 | |||||
lxvd2x vs1, o32, A1 | |||||
lxvd2x vs2, o32, A2 | |||||
lxvd2x vs3, o32, A3 | |||||
lxvd2x vs5, o80, A0 | |||||
lxvd2x vs13, o80, A1 | |||||
lxvd2x vs21, o80, A2 | |||||
lxvd2x vs29, o80, A3 | |||||
lxvd2x vs4, o48, A0 | |||||
lxvd2x vs5, o48, A1 | |||||
lxvd2x vs6, o48, A2 | |||||
lxvd2x vs7, o48, A3 | |||||
lxvd2x vs6, o96, A0 | |||||
lxvd2x vs14, o96, A1 | |||||
lxvd2x vs22, o96, A2 | |||||
lxvd2x vs30, o96, A3 | |||||
lxvd2x vs7, o112, A0 | |||||
lxvd2x vs15, o112, A1 | |||||
lxvd2x vs23, o112, A2 | |||||
lxvd2x vs31, o112, A3 | |||||
xxpermdi vs40, vs0, vs1, 0 | |||||
xxpermdi vs41, vs2, vs3, 0 | |||||
xxpermdi vs42, vs0, vs1, 3 | |||||
xxpermdi vs43, vs2, vs3, 3 | |||||
xxpermdi vs44, vs4, vs5, 0 | |||||
xxpermdi vs45, vs6, vs7, 0 | |||||
xxpermdi vs46, vs4, vs5, 3 | |||||
xxpermdi vs47, vs6, vs7, 3 | |||||
xxpermdi vs32, vs0, vs8, 0 | |||||
xxpermdi vs33, vs16, vs24, 0 | |||||
xxpermdi vs34, vs0, vs8, 3 | |||||
xxpermdi vs35, vs16, vs24, 3 | |||||
lxvd2x vs0, o64, A0 | |||||
lxvd2x vs1, o64, A1 | |||||
lxvd2x vs2, o64, A2 | |||||
lxvd2x vs3, o64, A3 | |||||
xxpermdi vs36, vs1, vs9, 0 | |||||
xxpermdi vs37, vs17, vs25, 0 | |||||
xxpermdi vs38, vs1, vs9, 3 | |||||
xxpermdi vs39, vs17, vs25, 3 | |||||
lxvd2x vs4, o80, A0 | |||||
lxvd2x vs5, o80, A1 | |||||
lxvd2x vs6, o80, A2 | |||||
lxvd2x vs7, o80, A3 | |||||
xxpermdi vs40, vs2, vs10, 0 | |||||
xxpermdi vs41, vs18, vs26, 0 | |||||
xxpermdi vs42, vs2, vs10, 3 | |||||
xxpermdi vs43, vs18, vs26, 3 | |||||
xxpermdi vs44, vs3, vs11, 0 | |||||
xxpermdi vs45, vs19, vs27, 0 | |||||
xxpermdi vs46, vs3, vs11, 3 | |||||
xxpermdi vs47, vs19, vs27, 3 | |||||
xxpermdi vs48, vs0, vs1, 0 | |||||
xxpermdi vs49, vs2, vs3, 0 | |||||
xxpermdi vs50, vs0, vs1, 3 | |||||
xxpermdi vs51, vs2, vs3, 3 | |||||
xxpermdi vs8, vs4, vs5, 0 | |||||
xxpermdi vs9, vs6, vs7, 0 | |||||
xxpermdi vs10, vs4, vs5, 3 | |||||
xxpermdi vs11, vs6, vs7, 3 | |||||
lxvd2x vs0, o96, A0 | |||||
lxvd2x vs1, o96, A1 | |||||
lxvd2x vs2, o96, A2 | |||||
lxvd2x vs3, o96, A3 | |||||
lxvd2x vs6, o112, A0 | |||||
lxvd2x vs7, o112, A1 | |||||
lxvd2x vs12, o112, A2 | |||||
lxvd2x vs13, o112, A3 | |||||
xxpermdi vs48, vs4, vs12, 0 | |||||
xxpermdi vs49, vs20, vs28, 0 | |||||
xxpermdi vs50, vs4, vs12, 3 | |||||
xxpermdi vs51, vs20, vs28, 3 | |||||
xxpermdi vs52, vs5, vs13, 0 | |||||
xxpermdi vs53, vs21, vs29, 0 | |||||
xxpermdi vs54, vs5, vs13, 3 | |||||
xxpermdi vs55, vs21, vs29, 3 | |||||
xxpermdi vs4, vs0, vs1, 0 | |||||
xxpermdi vs5, vs2, vs3, 0 | |||||
xxpermdi vs0, vs0, vs1, 3 | |||||
xxpermdi vs2, vs2, vs3, 3 | |||||
addi A0, A0, 128 | addi A0, A0, 128 | ||||
addi A1, A1, 128 | addi A1, A1, 128 | ||||
xxpermdi vs56, vs6, vs14, 0 | |||||
xxpermdi vs57, vs22, vs30, 0 | |||||
xxpermdi vs58, vs6, vs14, 3 | |||||
xxpermdi vs59, vs22, vs30, 3 | |||||
xxpermdi vs1, vs6, vs7, 0 | |||||
xxpermdi vs3, vs12, vs13, 0 | |||||
xxpermdi vs6, vs6, vs7, 3 | |||||
xxpermdi vs12, vs12, vs13, 3 | |||||
dcbt BO, PREB | |||||
addi A3, A3, 128 | addi A3, A3, 128 | ||||
addi A2, A2, 128 | addi A2, A2, 128 | ||||
xxpermdi vs60, vs7, vs15, 0 | |||||
xxpermdi vs61, vs23, vs31, 0 | |||||
xxpermdi vs62, vs7, vs15, 3 | |||||
xxpermdi vs63, vs23, vs31, 3 | |||||
dcbt BO, PREB | |||||
stxvd2x vs32, o0, BO | stxvd2x vs32, o0, BO | ||||
stxvd2x vs33, o16, BO | stxvd2x vs33, o16, BO | ||||
stxvd2x vs34, o32, BO | stxvd2x vs34, o32, BO | ||||
@@ -157,22 +161,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
stxvd2x vs49, o16, BO | stxvd2x vs49, o16, BO | ||||
stxvd2x vs50, o32, BO | stxvd2x vs50, o32, BO | ||||
stxvd2x vs51, o48, BO | stxvd2x vs51, o48, BO | ||||
stxvd2x vs52, o64, BO | |||||
stxvd2x vs53, o80, BO | |||||
stxvd2x vs54, o96, BO | |||||
stxvd2x vs55, o112, BO | |||||
stxvd2x vs8, o64, BO | |||||
stxvd2x vs9, o80, BO | |||||
stxvd2x vs10, o96, BO | |||||
stxvd2x vs11, o112, BO | |||||
addi BO, BO, 128 | addi BO, BO, 128 | ||||
dcbt BO, PREB | dcbt BO, PREB | ||||
stxvd2x vs56, o0, BO | |||||
stxvd2x vs57, o16, BO | |||||
stxvd2x vs58, o32, BO | |||||
stxvd2x vs59, o48, BO | |||||
stxvd2x vs60, o64, BO | |||||
stxvd2x vs61, o80, BO | |||||
stxvd2x vs62, o96, BO | |||||
stxvd2x vs63, o112, BO | |||||
stxvd2x vs4, o0, BO | |||||
stxvd2x vs5, o16, BO | |||||
stxvd2x vs0, o32, BO | |||||
stxvd2x vs2, o48, BO | |||||
stxvd2x vs1, o64, BO | |||||
stxvd2x vs3, o80, BO | |||||
stxvd2x vs6, o96, BO | |||||
stxvd2x vs12, o112, BO | |||||
addi BO, BO, 128 | addi BO, BO, 128 | ||||
@@ -199,39 +203,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
addi A1, A1, 64 | addi A1, A1, 64 | ||||
lxvd2x vs16, o0, A2 | |||||
lxvd2x vs17, o16, A2 | |||||
lxvd2x vs18, o32, A2 | |||||
lxvd2x vs19, o48, A2 | |||||
lxvd2x vs4, o0, A2 | |||||
lxvd2x vs5, o16, A2 | |||||
lxvd2x vs6, o32, A2 | |||||
lxvd2x vs7, o48, A2 | |||||
addi A2, A2, 64 | addi A2, A2, 64 | ||||
lxvd2x vs24, o0, A3 | |||||
lxvd2x vs25, o16, A3 | |||||
lxvd2x vs26, o32, A3 | |||||
lxvd2x vs27, o48, A3 | |||||
lxvd2x vs12, o0, A3 | |||||
lxvd2x vs13, o16, A3 | |||||
lxvd2x vs50, o32, A3 | |||||
lxvd2x vs51, o48, A3 | |||||
addi A3, A3, 64 | addi A3, A3, 64 | ||||
xxpermdi vs32, vs0, vs8, 0 | xxpermdi vs32, vs0, vs8, 0 | ||||
xxpermdi vs33, vs16, vs24, 0 | |||||
xxpermdi vs33, vs4, vs12, 0 | |||||
xxpermdi vs34, vs0, vs8, 3 | xxpermdi vs34, vs0, vs8, 3 | ||||
xxpermdi vs35, vs16, vs24, 3 | |||||
xxpermdi vs35, vs4, vs12, 3 | |||||
xxpermdi vs36, vs1, vs9, 0 | xxpermdi vs36, vs1, vs9, 0 | ||||
xxpermdi vs37, vs17, vs25, 0 | |||||
xxpermdi vs37, vs5, vs13, 0 | |||||
xxpermdi vs38, vs1, vs9, 3 | xxpermdi vs38, vs1, vs9, 3 | ||||
xxpermdi vs39, vs17, vs25, 3 | |||||
xxpermdi vs39, vs5, vs13, 3 | |||||
xxpermdi vs40, vs2, vs10, 0 | xxpermdi vs40, vs2, vs10, 0 | ||||
xxpermdi vs41, vs18, vs26, 0 | |||||
xxpermdi vs41, vs6, vs50, 0 | |||||
xxpermdi vs42, vs2, vs10, 3 | xxpermdi vs42, vs2, vs10, 3 | ||||
xxpermdi vs43, vs18, vs26, 3 | |||||
xxpermdi vs43, vs6, vs50, 3 | |||||
xxpermdi vs44, vs3, vs11, 0 | xxpermdi vs44, vs3, vs11, 0 | ||||
xxpermdi vs45, vs19, vs27, 0 | |||||
xxpermdi vs45, vs7, vs51, 0 | |||||
xxpermdi vs46, vs3, vs11, 3 | xxpermdi vs46, vs3, vs11, 3 | ||||
xxpermdi vs47, vs19, vs27, 3 | |||||
xxpermdi vs47, vs7, vs51, 3 | |||||
stxvd2x vs32, o0, BO | stxvd2x vs32, o0, BO | ||||
@@ -274,25 +278,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
addi A1, A1, 32 | addi A1, A1, 32 | ||||
lxvd2x vs16, o0, A2 | |||||
lxvd2x vs17, o16, A2 | |||||
lxvd2x vs10, o0, A2 | |||||
lxvd2x vs11, o16, A2 | |||||
addi A2, A2, 32 | addi A2, A2, 32 | ||||
lxvd2x vs24, o0, A3 | |||||
lxvd2x vs25, o16, A3 | |||||
lxvd2x vs12, o0, A3 | |||||
lxvd2x vs13, o16, A3 | |||||
addi A3, A3, 32 | addi A3, A3, 32 | ||||
xxpermdi vs32, vs0, vs8, 0 | xxpermdi vs32, vs0, vs8, 0 | ||||
xxpermdi vs33, vs16, vs24, 0 | |||||
xxpermdi vs33, vs10, vs12, 0 | |||||
xxpermdi vs34, vs0, vs8, 3 | xxpermdi vs34, vs0, vs8, 3 | ||||
xxpermdi vs35, vs16, vs24, 3 | |||||
xxpermdi vs35, vs10, vs12, 3 | |||||
xxpermdi vs36, vs1, vs9, 0 | xxpermdi vs36, vs1, vs9, 0 | ||||
xxpermdi vs37, vs17, vs25, 0 | |||||
xxpermdi vs37, vs11, vs13, 0 | |||||
xxpermdi vs38, vs1, vs9, 3 | xxpermdi vs38, vs1, vs9, 3 | ||||
xxpermdi vs39, vs17, vs25, 3 | |||||
xxpermdi vs39, vs11, vs13, 3 | |||||
stxvd2x vs32, o0, BO | stxvd2x vs32, o0, BO | ||||
@@ -323,18 +327,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
addi A1, A1, 16 | addi A1, A1, 16 | ||||
lxvd2x vs16, o0, A2 | |||||
lxvd2x vs9, o0, A2 | |||||
addi A2, A2, 16 | addi A2, A2, 16 | ||||
lxvd2x vs24, o0, A3 | |||||
lxvd2x vs10, o0, A3 | |||||
addi A3, A3, 16 | addi A3, A3, 16 | ||||
xxpermdi vs32, vs0, vs8, 0 | xxpermdi vs32, vs0, vs8, 0 | ||||
xxpermdi vs33, vs16, vs24, 0 | |||||
xxpermdi vs33, vs9, vs10, 0 | |||||
xxpermdi vs34, vs0, vs8, 3 | xxpermdi vs34, vs0, vs8, 3 | ||||
xxpermdi vs35, vs16, vs24, 3 | |||||
xxpermdi vs35, vs9, vs10, 3 | |||||
stxvd2x vs32, o0, BO | stxvd2x vs32, o0, BO | ||||
@@ -361,16 +365,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
addi A1, A1, 8 | addi A1, A1, 8 | ||||
lxsdx vs16, o0, A2 | |||||
lxsdx vs9, o0, A2 | |||||
addi A2, A2, 8 | addi A2, A2, 8 | ||||
lxsdx vs24, o0, A3 | |||||
lxsdx vs10, o0, A3 | |||||
addi A3, A3, 8 | addi A3, A3, 8 | ||||
xxpermdi vs32, vs0, vs8, 0 | xxpermdi vs32, vs0, vs8, 0 | ||||
xxpermdi vs33, vs16, vs24, 0 | |||||
xxpermdi vs33, vs9, vs10, 0 | |||||
stxvd2x vs32, o0, BO | stxvd2x vs32, o0, BO | ||||
@@ -404,8 +408,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
lxvd2x vs11, o48, A1 | lxvd2x vs11, o48, A1 | ||||
lxvd2x vs12, o64, A1 | lxvd2x vs12, o64, A1 | ||||
lxvd2x vs13, o80, A1 | lxvd2x vs13, o80, A1 | ||||
lxvd2x vs14, o96, A1 | |||||
lxvd2x vs15, o112, A1 | |||||
lxvd2x vs48, o96, A1 | |||||
lxvd2x vs49, o112, A1 | |||||
addi A1, A1, 128 | addi A1, A1, 128 | ||||
@@ -427,11 +431,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
xxpermdi vs42, vs5, vs13, 0 | xxpermdi vs42, vs5, vs13, 0 | ||||
xxpermdi vs43, vs5, vs13, 3 | xxpermdi vs43, vs5, vs13, 3 | ||||
xxpermdi vs44, vs6, vs14, 0 | |||||
xxpermdi vs45, vs6, vs14, 3 | |||||
xxpermdi vs44, vs6, vs48, 0 | |||||
xxpermdi vs45, vs6, vs48, 3 | |||||
xxpermdi vs46, vs7, vs15, 0 | |||||
xxpermdi vs47, vs7, vs15, 3 | |||||
xxpermdi vs46, vs7, vs49, 0 | |||||
xxpermdi vs47, vs7, vs49, 3 | |||||
stxvd2x vs32, o0, BO | stxvd2x vs32, o0, BO | ||||
@@ -109,61 +109,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#include "dgemm_tcopy_macros_16_power8.S" | #include "dgemm_tcopy_macros_16_power8.S" | ||||
#define STACKSIZE 384 | |||||
#define STACKSIZE 576 | |||||
#define STACKSIZE 144 | |||||
PROLOGUE | PROLOGUE | ||||
PROFCODE | PROFCODE | ||||
addi SP, SP, -STACKSIZE | addi SP, SP, -STACKSIZE | ||||
//addi SP, SP, -208 | |||||
li r0, 0 | li r0, 0 | ||||
std r31, 144(SP) | |||||
std r30, 152(SP) | |||||
std r29, 160(SP) | |||||
std r28, 168(SP) | |||||
std r27, 176(SP) | |||||
std r26, 184(SP) | |||||
std r25, 192(SP) | |||||
std r24, 200(SP) | |||||
std r23, 208(SP) | |||||
std r22, 216(SP) | |||||
std r21, 224(SP) | |||||
std r20, 232(SP) | |||||
std r19, 240(SP) | |||||
std r18, 248(SP) | |||||
std r17, 256(SP) | |||||
std r16, 264(SP) | |||||
std r15, 272(SP) | |||||
std r14, 280(SP) | |||||
addi r11,SP,288 | |||||
stvx v20, r11,r0 | |||||
addi r11,r11,16 | |||||
stvx v21, r11,r0 | |||||
addi r11,r11,16 | |||||
stvx v22, r11,r0 | |||||
addi r11,r11,16 | |||||
stvx v23, r11,r0 | |||||
addi r11,r11,16 | |||||
stvx v24, r11,r0 | |||||
addi r11,r11,16 | |||||
stvx v25, r11,r0 | |||||
addi r11,r11,16 | |||||
stvx v26, r11,r0 | |||||
addi r11,r11,16 | |||||
stvx v27, r11,r0 | |||||
addi r11,r11,16 | |||||
stvx v28, r11,r0 | |||||
addi r11,r11,16 | |||||
stvx v29, r11,r0 | |||||
addi r11,r11,16 | |||||
stvx v30, r11,r0 | |||||
addi r11,r11,16 | |||||
stvx v31, r11,r0 | |||||
li r11,0 | |||||
std r14,0(SP) | |||||
std r15,8(SP) | |||||
std r16,16(SP) | |||||
std r17,24(SP) | |||||
std r18,32(SP) | |||||
std r19,40(SP) | |||||
std r20,48(SP) | |||||
std r21,56(SP) | |||||
std r22,64(SP) | |||||
std r23,72(SP) | |||||
std r24,80(SP) | |||||
std r25,88(SP) | |||||
std r26,96(SP) | |||||
std r27,104(SP) | |||||
std r28,112(SP) | |||||
std r29,120(SP) | |||||
std r30,128(SP) | |||||
std r31,136(SP) | |||||
cmpwi cr0, M, 0 | cmpwi cr0, M, 0 | ||||
ble- L999 | ble- L999 | ||||
@@ -198,8 +172,7 @@ li r11,0 | |||||
add B2, B2, B | add B2, B2, B | ||||
add B1, B1, B | add B1, B1, B | ||||
//li PREA, 384 | |||||
li PREA, 576 | |||||
li PREA, 384 | |||||
addi PREB, M16, 128 | addi PREB, M16, 128 | ||||
li o8, 8 | li o8, 8 | ||||
@@ -213,52 +186,27 @@ L999: | |||||
li r3, 0 | li r3, 0 | ||||
ld r31, 144(SP) | |||||
ld r30, 152(SP) | |||||
ld r29, 160(SP) | |||||
ld r28, 168(SP) | |||||
ld r27, 176(SP) | |||||
ld r26, 184(SP) | |||||
ld r25, 192(SP) | |||||
ld r24, 200(SP) | |||||
ld r23, 208(SP) | |||||
ld r22, 216(SP) | |||||
ld r21, 224(SP) | |||||
ld r20, 232(SP) | |||||
ld r19, 240(SP) | |||||
ld r18, 248(SP) | |||||
ld r17, 256(SP) | |||||
ld r16, 264(SP) | |||||
ld r15, 272(SP) | |||||
ld r14, 280(SP) | |||||
addi r11,SP,288 | |||||
lvx v20, r11,r3 | |||||
addi r11,r11,16 | |||||
lvx v21, r11,r3 | |||||
addi r11,r11,16 | |||||
lvx v22, r11,r3 | |||||
addi r11,r11,16 | |||||
lvx v23, r11,r3 | |||||
addi r11,r11,16 | |||||
lvx v24, r11,r3 | |||||
addi r11,r11,16 | |||||
lvx v25, r11,r3 | |||||
addi r11,r11,16 | |||||
lvx v26, r11,r3 | |||||
addi r11,r11,16 | |||||
lvx v27, r11,r3 | |||||
addi r11,r11,16 | |||||
lvx v28, r11,r3 | |||||
addi r11,r11,16 | |||||
lvx v29, r11,r3 | |||||
addi r11,r11,16 | |||||
lvx v30, r11,r3 | |||||
addi r11,r11,16 | |||||
lvx v31, r11,r3 | |||||
li r11,0 | |||||
ld r14,0(SP) | |||||
ld r15,8(SP) | |||||
ld r16,16(SP) | |||||
ld r17,24(SP) | |||||
ld r18,32(SP) | |||||
ld r19,40(SP) | |||||
ld r20,48(SP) | |||||
ld r21,56(SP) | |||||
ld r22,64(SP) | |||||
ld r23,72(SP) | |||||
ld r24,80(SP) | |||||
ld r25,88(SP) | |||||
ld r26,96(SP) | |||||
ld r27,104(SP) | |||||
ld r28,112(SP) | |||||
ld r29,120(SP) | |||||
ld r30,128(SP) | |||||
ld r31,136(SP) | |||||
addi SP, SP, STACKSIZE | addi SP, SP, STACKSIZE | ||||
//addi SP, SP, 208 | |||||
blr | blr | ||||
EPILOGUE | EPILOGUE | ||||
@@ -58,10 +58,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
lxvd2x vs51, o48, A2 | lxvd2x vs51, o48, A2 | ||||
addi A2, A2, 64 | addi A2, A2, 64 | ||||
lxvd2x vs56, o0, A3 | |||||
lxvd2x vs57, o16, A3 | |||||
lxvd2x vs58, o32, A3 | |||||
lxvd2x vs59, o48, A3 | |||||
lxvd2x vs4, o0, A3 | |||||
lxvd2x vs5, o16, A3 | |||||
lxvd2x vs6, o32, A3 | |||||
lxvd2x vs7, o48, A3 | |||||
addi A3, A3, 64 | addi A3, A3, 64 | ||||
lxvd2x vs36, o0, A0 | lxvd2x vs36, o0, A0 | ||||
@@ -76,16 +76,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
lxvd2x vs47, o48, A1 | lxvd2x vs47, o48, A1 | ||||
addi A1, A1, 64 | addi A1, A1, 64 | ||||
lxvd2x vs52, o0, A2 | |||||
lxvd2x vs53, o16, A2 | |||||
lxvd2x vs54, o32, A2 | |||||
lxvd2x vs55, o48, A2 | |||||
lxvd2x vs12, o0, A2 | |||||
lxvd2x vs13, o16, A2 | |||||
lxvd2x vs2, o32, A2 | |||||
lxvd2x vs3, o48, A2 | |||||
addi A2, A2, 64 | addi A2, A2, 64 | ||||
lxvd2x vs60, o0, A3 | |||||
lxvd2x vs61, o16, A3 | |||||
lxvd2x vs62, o32, A3 | |||||
lxvd2x vs63, o48, A3 | |||||
lxvd2x vs8, o0, A3 | |||||
lxvd2x vs9, o16, A3 | |||||
lxvd2x vs10, o32, A3 | |||||
lxvd2x vs11, o48, A3 | |||||
addi A3, A3, 64 | addi A3, A3, 64 | ||||
mr T1, BO | mr T1, BO | ||||
@@ -122,23 +122,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
stxvd2x vs51, o48, T1 | stxvd2x vs51, o48, T1 | ||||
addi T1, T1, 64 | addi T1, T1, 64 | ||||
stxvd2x vs52, o0, T1 | |||||
stxvd2x vs53, o16, T1 | |||||
stxvd2x vs54, o32, T1 | |||||
stxvd2x vs55, o48, T1 | |||||
stxvd2x vs12, o0, T1 | |||||
stxvd2x vs13, o16, T1 | |||||
stxvd2x vs2, o32, T1 | |||||
stxvd2x vs3, o48, T1 | |||||
addi T1, T1, 64 | addi T1, T1, 64 | ||||
stxvd2x vs56, o0, T1 | |||||
stxvd2x vs57, o16, T1 | |||||
stxvd2x vs58, o32, T1 | |||||
stxvd2x vs59, o48, T1 | |||||
stxvd2x vs4, o0, T1 | |||||
stxvd2x vs5, o16, T1 | |||||
stxvd2x vs6, o32, T1 | |||||
stxvd2x vs7, o48, T1 | |||||
addi T1, T1, 64 | addi T1, T1, 64 | ||||
stxvd2x vs60, o0, T1 | |||||
stxvd2x vs61, o16, T1 | |||||
stxvd2x vs62, o32, T1 | |||||
stxvd2x vs63, o48, T1 | |||||
stxvd2x vs8, o0, T1 | |||||
stxvd2x vs9, o16, T1 | |||||
stxvd2x vs10, o32, T1 | |||||
stxvd2x vs11, o48, T1 | |||||
.endm | .endm | ||||
@@ -110,7 +110,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#include "sgemm_tcopy_macros_16_power8.S" | #include "sgemm_tcopy_macros_16_power8.S" | ||||
#define STACKSIZE 576 | |||||
#define STACKSIZE 144 | |||||
PROLOGUE | PROLOGUE | ||||
PROFCODE | PROFCODE | ||||
@@ -118,49 +118,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
addi SP, SP, -STACKSIZE | addi SP, SP, -STACKSIZE | ||||
li r0, 0 | li r0, 0 | ||||
std r31, 144(SP) | |||||
std r30, 152(SP) | |||||
std r29, 160(SP) | |||||
std r28, 168(SP) | |||||
std r27, 176(SP) | |||||
std r26, 184(SP) | |||||
std r25, 192(SP) | |||||
std r24, 200(SP) | |||||
std r23, 208(SP) | |||||
std r22, 216(SP) | |||||
std r21, 224(SP) | |||||
std r20, 232(SP) | |||||
std r19, 240(SP) | |||||
std r18, 248(SP) | |||||
std r17, 256(SP) | |||||
std r16, 264(SP) | |||||
std r15, 272(SP) | |||||
std r14, 280(SP) | |||||
addi r11 ,SP, 288 | |||||
stvx v20, r11, r0 | |||||
addi r11, r11, 16 | |||||
stvx v21, r11, r0 | |||||
addi r11, r11, 16 | |||||
stvx v22, r11, r0 | |||||
addi r11, r11, 16 | |||||
stvx v23, r11, r0 | |||||
addi r11, r11, 16 | |||||
stvx v24, r11, r0 | |||||
addi r11, r11, 16 | |||||
stvx v25, r11, r0 | |||||
addi r11, r11, 16 | |||||
stvx v26, r11, r0 | |||||
addi r11, r11, 16 | |||||
stvx v27, r11, r0 | |||||
addi r11, r11, 16 | |||||
stvx v28, r11, r0 | |||||
addi r11, r11, 16 | |||||
stvx v29, r11, r0 | |||||
addi r11, r11, 16 | |||||
stvx v30, r11, r0 | |||||
addi r11, r11, 16 | |||||
stvx v31, r11, r0 | |||||
li r11, 0 | |||||
std r14, 0(SP) | |||||
std r15, 8(SP) | |||||
std r16, 16(SP) | |||||
std r17, 24(SP) | |||||
std r18, 32(SP) | |||||
std r19, 40(SP) | |||||
std r20, 48(SP) | |||||
std r21, 56(SP) | |||||
std r22, 64(SP) | |||||
std r23, 72(SP) | |||||
std r24, 80(SP) | |||||
std r25, 88(SP) | |||||
std r26, 96(SP) | |||||
std r27, 104(SP) | |||||
std r28, 112(SP) | |||||
std r29, 120(SP) | |||||
std r30, 128(SP) | |||||
std r31, 136(SP) | |||||
cmpwi cr0, M, 0 | cmpwi cr0, M, 0 | ||||
ble- L999 | ble- L999 | ||||
@@ -207,51 +182,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
L999: | L999: | ||||
li r3, 0 | |||||
ld r31, 144(SP) | |||||
ld r30, 152(SP) | |||||
ld r29, 160(SP) | |||||
ld r28, 168(SP) | |||||
ld r27, 176(SP) | |||||
ld r26, 184(SP) | |||||
ld r25, 192(SP) | |||||
ld r24, 200(SP) | |||||
ld r23, 208(SP) | |||||
ld r22, 216(SP) | |||||
ld r21, 224(SP) | |||||
ld r20, 232(SP) | |||||
ld r19, 240(SP) | |||||
ld r18, 248(SP) | |||||
ld r17, 256(SP) | |||||
ld r16, 264(SP) | |||||
ld r15, 272(SP) | |||||
ld r14, 280(SP) | |||||
addi r11, SP, 288 | |||||
lvx v20, r11, r3 | |||||
addi r11, r11, 16 | |||||
lvx v21, r11, r3 | |||||
addi r11, r11, 16 | |||||
lvx v22, r11, r3 | |||||
addi r11, r11, 16 | |||||
lvx v23, r11, r3 | |||||
addi r11, r11, 16 | |||||
lvx v24, r11, r3 | |||||
addi r11, r11, 16 | |||||
lvx v25, r11, r3 | |||||
addi r11, r11, 16 | |||||
lvx v26, r11, r3 | |||||
addi r11, r11, 16 | |||||
lvx v27, r11, r3 | |||||
addi r11, r11, 16 | |||||
lvx v28, r11, r3 | |||||
addi r11, r11, 16 | |||||
lvx v29, r11, r3 | |||||
addi r11, r11, 16 | |||||
lvx v30, r11, r3 | |||||
addi r11, r11, 16 | |||||
lvx v31, r11, r3 | |||||
li r11, 0 | |||||
ld r14, 0(SP) | |||||
ld r15, 8(SP) | |||||
ld r16, 16(SP) | |||||
ld r17, 24(SP) | |||||
ld r18, 32(SP) | |||||
ld r19, 40(SP) | |||||
ld r20, 48(SP) | |||||
ld r21, 56(SP) | |||||
ld r22, 64(SP) | |||||
ld r23, 72(SP) | |||||
ld r24, 80(SP) | |||||
ld r25, 88(SP) | |||||
ld r26, 96(SP) | |||||
ld r27, 104(SP) | |||||
ld r28, 112(SP) | |||||
ld r29, 120(SP) | |||||
ld r30, 128(SP) | |||||
ld r31, 136(SP) | |||||
addi SP, SP, STACKSIZE | addi SP, SP, STACKSIZE | ||||
blr | blr | ||||
@@ -110,57 +110,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#include "sgemm_tcopy_macros_8_power8.S" | #include "sgemm_tcopy_macros_8_power8.S" | ||||
#define STACKSIZE 576 | |||||
#define STACKSIZE 144 | |||||
PROLOGUE | PROLOGUE | ||||
PROFCODE | PROFCODE | ||||
addi SP, SP, -STACKSIZE | addi SP, SP, -STACKSIZE | ||||
li r0, 0 | li r0, 0 | ||||
std r14, 0(SP) | |||||
std r15, 8(SP) | |||||
std r16, 16(SP) | |||||
std r17, 24(SP) | |||||
std r18, 32(SP) | |||||
std r19, 40(SP) | |||||
std r20, 48(SP) | |||||
std r21, 56(SP) | |||||
std r22, 64(SP) | |||||
std r23, 72(SP) | |||||
std r24, 80(SP) | |||||
std r25, 88(SP) | |||||
std r26, 96(SP) | |||||
std r27, 104(SP) | |||||
std r28, 112(SP) | |||||
std r29, 120(SP) | |||||
std r30, 128(SP) | |||||
std r31, 136(SP) | |||||
std r31, 144(SP) | |||||
std r30, 152(SP) | |||||
std r29, 160(SP) | |||||
std r28, 168(SP) | |||||
std r27, 176(SP) | |||||
std r26, 184(SP) | |||||
std r25, 192(SP) | |||||
std r24, 200(SP) | |||||
std r23, 208(SP) | |||||
std r22, 216(SP) | |||||
std r21, 224(SP) | |||||
std r20, 232(SP) | |||||
std r19, 240(SP) | |||||
std r18, 248(SP) | |||||
std r17, 256(SP) | |||||
std r16, 264(SP) | |||||
std r15, 272(SP) | |||||
std r14, 280(SP) | |||||
addi r11, SP, 288 | |||||
stvx v20, r11, r0 | |||||
addi r11, r11, 16 | |||||
stvx v21, r11, r0 | |||||
addi r11, r11, 16 | |||||
stvx v22, r11, r0 | |||||
addi r11, r11, 16 | |||||
stvx v23, r11, r0 | |||||
addi r11, r11, 16 | |||||
stvx v24, r11, r0 | |||||
addi r11, r11, 16 | |||||
stvx v25, r11, r0 | |||||
addi r11, r11, 16 | |||||
stvx v26, r11, r0 | |||||
addi r11, r11, 16 | |||||
stvx v27, r11, r0 | |||||
addi r11, r11, 16 | |||||
stvx v28, r11, r0 | |||||
addi r11, r11, 16 | |||||
stvx v29, r11, r0 | |||||
addi r11, r11, 16 | |||||
stvx v30, r11, r0 | |||||
addi r11, r11, 16 | |||||
stvx v31, r11, r0 | |||||
li r11, 0 | |||||
cmpwi cr0, M, 0 | cmpwi cr0, M, 0 | ||||
ble- L999 | ble- L999 | ||||
@@ -202,51 +177,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
L999: | L999: | ||||
li r3, 0 | |||||
ld r31, 144(SP) | |||||
ld r30, 152(SP) | |||||
ld r29, 160(SP) | |||||
ld r28, 168(SP) | |||||
ld r27, 176(SP) | |||||
ld r26, 184(SP) | |||||
ld r25, 192(SP) | |||||
ld r24, 200(SP) | |||||
ld r23, 208(SP) | |||||
ld r22, 216(SP) | |||||
ld r21, 224(SP) | |||||
ld r20, 232(SP) | |||||
ld r19, 240(SP) | |||||
ld r18, 248(SP) | |||||
ld r17, 256(SP) | |||||
ld r16, 264(SP) | |||||
ld r15, 272(SP) | |||||
ld r14, 280(SP) | |||||
addi r11,SP,288 | |||||
lvx v20, r11, r3 | |||||
addi r11, r11, 16 | |||||
lvx v21, r11, r3 | |||||
addi r11, r11, 16 | |||||
lvx v22, r11, r3 | |||||
addi r11, r11, 16 | |||||
lvx v23, r11, r3 | |||||
addi r11, r11, 16 | |||||
lvx v24, r11, r3 | |||||
addi r11, r11, 16 | |||||
lvx v25, r11, r3 | |||||
addi r11, r11, 16 | |||||
lvx v26, r11, r3 | |||||
addi r11, r11, 16 | |||||
lvx v27, r11, r3 | |||||
addi r11, r11, 16 | |||||
lvx v28, r11, r3 | |||||
addi r11, r11, 16 | |||||
lvx v29, r11, r3 | |||||
addi r11, r11, 16 | |||||
lvx v30, r11, r3 | |||||
addi r11, r11, 16 | |||||
lvx v31, r11, r3 | |||||
li r11, 0 | |||||
ld r14, 0(SP) | |||||
ld r15, 8(SP) | |||||
ld r16, 16(SP) | |||||
ld r17, 24(SP) | |||||
ld r18, 32(SP) | |||||
ld r19, 40(SP) | |||||
ld r20, 48(SP) | |||||
ld r21, 56(SP) | |||||
ld r22, 64(SP) | |||||
ld r23, 72(SP) | |||||
ld r24, 80(SP) | |||||
ld r25, 88(SP) | |||||
ld r26, 96(SP) | |||||
ld r27, 104(SP) | |||||
ld r28, 112(SP) | |||||
ld r29, 120(SP) | |||||
ld r30, 128(SP) | |||||
ld r31, 136(SP) | |||||
addi SP, SP, STACKSIZE | addi SP, SP, STACKSIZE | ||||
blr | blr | ||||
@@ -109,8 +109,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#include "zgemm_tcopy_macros_8_power8.S" | #include "zgemm_tcopy_macros_8_power8.S" | ||||
#define STACKSIZE 384 | |||||
#define STACKSIZE 576 | |||||
#define STACKSIZE 144 | |||||
PROLOGUE | PROLOGUE | ||||
@@ -119,49 +119,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
addi SP, SP, -STACKSIZE | addi SP, SP, -STACKSIZE | ||||
li r0, 0 | li r0, 0 | ||||
std r31, 144(SP) | |||||
std r30, 152(SP) | |||||
std r29, 160(SP) | |||||
std r28, 168(SP) | |||||
std r27, 176(SP) | |||||
std r26, 184(SP) | |||||
std r25, 192(SP) | |||||
std r24, 200(SP) | |||||
std r23, 208(SP) | |||||
std r22, 216(SP) | |||||
std r21, 224(SP) | |||||
std r20, 232(SP) | |||||
std r19, 240(SP) | |||||
std r18, 248(SP) | |||||
std r17, 256(SP) | |||||
std r16, 264(SP) | |||||
std r15, 272(SP) | |||||
std r14, 280(SP) | |||||
addi r11, SP ,288 | |||||
stvx v20, r11, r0 | |||||
addi r11, r11, 16 | |||||
stvx v21, r11, r0 | |||||
addi r11, r11, 16 | |||||
stvx v22, r11, r0 | |||||
addi r11, r11, 16 | |||||
stvx v23, r11, r0 | |||||
addi r11, r11, 16 | |||||
stvx v24, r11, r0 | |||||
addi r11, r11, 16 | |||||
stvx v25, r11, r0 | |||||
addi r11, r11, 16 | |||||
stvx v26, r11, r0 | |||||
addi r11, r11, 16 | |||||
stvx v27, r11, r0 | |||||
addi r11, r11, 16 | |||||
stvx v28, r11, r0 | |||||
addi r11, r11, 16 | |||||
stvx v29, r11, r0 | |||||
addi r11, r11, 16 | |||||
stvx v30, r11, r0 | |||||
addi r11, r11 ,16 | |||||
stvx v31, r11, r0 | |||||
li r11,0 | |||||
std r14, 0(SP) | |||||
std r15, 8(SP) | |||||
std r16, 16(SP) | |||||
std r17, 24(SP) | |||||
std r18, 32(SP) | |||||
std r19, 40(SP) | |||||
std r20, 48(SP) | |||||
std r21, 56(SP) | |||||
std r22, 64(SP) | |||||
std r23, 72(SP) | |||||
std r24, 80(SP) | |||||
std r25, 88(SP) | |||||
std r26, 96(SP) | |||||
std r27, 104(SP) | |||||
std r28, 112(SP) | |||||
std r29, 120(SP) | |||||
std r30, 128(SP) | |||||
std r31, 136(SP) | |||||
cmpwi cr0, M, 0 | cmpwi cr0, M, 0 | ||||
ble- L999 | ble- L999 | ||||
@@ -204,49 +180,24 @@ L999: | |||||
li r3, 0 | li r3, 0 | ||||
ld r31, 144(SP) | |||||
ld r30, 152(SP) | |||||
ld r29, 160(SP) | |||||
ld r28, 168(SP) | |||||
ld r27, 176(SP) | |||||
ld r26, 184(SP) | |||||
ld r25, 192(SP) | |||||
ld r24, 200(SP) | |||||
ld r23, 208(SP) | |||||
ld r22, 216(SP) | |||||
ld r21, 224(SP) | |||||
ld r20, 232(SP) | |||||
ld r19, 240(SP) | |||||
ld r18, 248(SP) | |||||
ld r17, 256(SP) | |||||
ld r16, 264(SP) | |||||
ld r15, 272(SP) | |||||
ld r14, 280(SP) | |||||
addi r11, SP, 288 | |||||
lvx v20, r11,r3 | |||||
addi r11, r11, 16 | |||||
lvx v21, r11, r3 | |||||
addi r11, r11, 16 | |||||
lvx v22, r11, r3 | |||||
addi r11, r11, 16 | |||||
lvx v23, r11, r3 | |||||
addi r11, r11, 16 | |||||
lvx v24, r11, r3 | |||||
addi r11, r11, 16 | |||||
lvx v25, r11, r3 | |||||
addi r11, r11, 16 | |||||
lvx v26, r11, r3 | |||||
addi r11, r11, 16 | |||||
lvx v27, r11, r3 | |||||
addi r11, r11, 16 | |||||
lvx v28, r11, r3 | |||||
addi r11, r11, 16 | |||||
lvx v29, r11, r3 | |||||
addi r11, r11, 16 | |||||
lvx v30, r11, r3 | |||||
addi r11, r11, 16 | |||||
lvx v31, r11, r3 | |||||
li r11,0 | |||||
ld r14, 0(SP) | |||||
ld r15, 8(SP) | |||||
ld r16, 16(SP) | |||||
ld r17, 24(SP) | |||||
ld r18, 32(SP) | |||||
ld r19, 40(SP) | |||||
ld r20, 48(SP) | |||||
ld r21, 56(SP) | |||||
ld r22, 64(SP) | |||||
ld r23, 72(SP) | |||||
ld r24, 80(SP) | |||||
ld r25, 88(SP) | |||||
ld r26, 96(SP) | |||||
ld r27, 104(SP) | |||||
ld r28, 112(SP) | |||||
ld r29, 120(SP) | |||||
ld r30, 128(SP) | |||||
ld r31, 136(SP) | |||||
addi SP, SP, STACKSIZE | addi SP, SP, STACKSIZE | ||||
blr | blr | ||||
@@ -72,23 +72,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
lxvd2x vs51, o48, A2 | lxvd2x vs51, o48, A2 | ||||
addi A2, A2, 64 | addi A2, A2, 64 | ||||
lxvd2x vs52, o0, A2 | |||||
lxvd2x vs53, o16, A2 | |||||
lxvd2x vs54, o32, A2 | |||||
lxvd2x vs55, o48, A2 | |||||
lxvd2x vs2, o0, A2 | |||||
lxvd2x vs3, o16, A2 | |||||
lxvd2x vs4, o32, A2 | |||||
lxvd2x vs5, o48, A2 | |||||
addi A2, A2, 64 | addi A2, A2, 64 | ||||
lxvd2x vs56, o0, A3 | |||||
lxvd2x vs57, o16, A3 | |||||
lxvd2x vs58, o32, A3 | |||||
lxvd2x vs59, o48, A3 | |||||
lxvd2x vs6, o0, A3 | |||||
lxvd2x vs7, o16, A3 | |||||
lxvd2x vs8, o32, A3 | |||||
lxvd2x vs9, o48, A3 | |||||
addi A3, A3, 64 | addi A3, A3, 64 | ||||
lxvd2x vs60, o0, A3 | |||||
lxvd2x vs61, o16, A3 | |||||
lxvd2x vs62, o32, A3 | |||||
lxvd2x vs63, o48, A3 | |||||
lxvd2x vs10, o0, A3 | |||||
lxvd2x vs11, o16, A3 | |||||
lxvd2x vs12, o32, A3 | |||||
lxvd2x vs13, o48, A3 | |||||
addi A3, A3, 64 | addi A3, A3, 64 | ||||
@@ -126,23 +126,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
stxvd2x vs51, o48, T1 | stxvd2x vs51, o48, T1 | ||||
addi T1, T1, 64 | addi T1, T1, 64 | ||||
stxvd2x vs52, o0, T1 | |||||
stxvd2x vs53, o16, T1 | |||||
stxvd2x vs54, o32, T1 | |||||
stxvd2x vs55, o48, T1 | |||||
stxvd2x vs2, o0, T1 | |||||
stxvd2x vs3, o16, T1 | |||||
stxvd2x vs4, o32, T1 | |||||
stxvd2x vs5, o48, T1 | |||||
addi T1, T1, 64 | addi T1, T1, 64 | ||||
stxvd2x vs56, o0, T1 | |||||
stxvd2x vs57, o16, T1 | |||||
stxvd2x vs58, o32, T1 | |||||
stxvd2x vs59, o48, T1 | |||||
stxvd2x vs6, o0, T1 | |||||
stxvd2x vs7, o16, T1 | |||||
stxvd2x vs8, o32, T1 | |||||
stxvd2x vs9, o48, T1 | |||||
addi T1, T1, 64 | addi T1, T1, 64 | ||||
stxvd2x vs60, o0, T1 | |||||
stxvd2x vs61, o16, T1 | |||||
stxvd2x vs62, o32, T1 | |||||
stxvd2x vs63, o48, T1 | |||||
stxvd2x vs10, o0, T1 | |||||
stxvd2x vs11, o16, T1 | |||||
stxvd2x vs12, o32, T1 | |||||
stxvd2x vs13, o48, T1 | |||||
.endm | .endm | ||||
@@ -27,8 +27,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#include "common.h" | #include "common.h" | ||||
#ifdef Z13_A | |||||
/*
 * Scale a contiguous vector in place: x[i] = da * x[i].
 *
 * The work is done in chunks of 128 bytes (eight vector registers, i.e.
 * 16 doubles) using a software-pipelined loop: while one chunk is being
 * multiplied, the next chunk is already being loaded.
 *
 * n  - element count; it is shifted right by 4 to get the number of
 *      128-byte chunks, so any remainder below 16 is ignored here.
 *      NOTE(review): the first chunk is loaded and the final chunk is
 *      stored unconditionally, so at least 16 elements are always touched;
 *      presumably callers pass a positive multiple of 32 - confirm.
 * da - scale factor, passed in a floating-point register.
 * x  - vector to scale; read and written through the asm.
 */
static void dscal_kernel_32( BLASLONG n, FLOAT da , FLOAT *x ) | |||||
{ | |||||
/* Prefetch hint for the start of x. */
__asm__ ("pfd 2, 0(%[x_ptr]) \n\t" | |||||
/* Move da into r0, then replicate it into both lanes of v0. */
"lgdr %%r0,%[alpha] \n\t" | |||||
"vlvgp %%v0,%%r0,%%r0 \n\t" | |||||
/* n = n / 16: number of 128-byte chunks. */
"srlg %[n],%[n],4 \n\t" | |||||
/* v1 is a second copy of the broadcast factor. */
"vlr %%v1,%%v0 \n\t" | |||||
/* Preload the first chunk into v16..v23 and step x_ptr past it. */
"vlm %%v16,%%v23, 0(%[x_ptr]) \n\t" | |||||
"la %[x_ptr], 128(%[x_ptr]) \n\t" | |||||
/* One chunk is handled by the tail at label 2, so loop n-1 times;
   skip the loop entirely when that count is not positive. */
"aghik %[n], %[n], -1 \n\t" | |||||
"jle 2f \n\t" | |||||
".align 16 \n\t" | |||||
"1: \n\t" | |||||
/* Multiply the first half of the current chunk into v24..v27 ... */
"vfmdb %%v24, %%v16, %%v0 \n\t" | |||||
"vfmdb %%v25, %%v17, %%v0 \n\t" | |||||
"vfmdb %%v26, %%v18, %%v0 \n\t" | |||||
"vfmdb %%v27, %%v19, %%v1 \n\t" | |||||
/* ... then reuse v16..v19 for the first half of the next chunk. */
"vlm %%v16,%%v19, 0(%[x_ptr]) \n\t" | |||||
/* Same for the second half: multiply into v28..v31, then reload. */
"vfmdb %%v28, %%v20, %%v0 \n\t" | |||||
"vfmdb %%v29, %%v21, %%v1 \n\t" | |||||
"vfmdb %%v30, %%v22, %%v0 \n\t" | |||||
"vfmdb %%v31, %%v23, %%v1 \n\t" | |||||
"vlm %%v20,%%v23, 64(%[x_ptr]) \n\t" | |||||
/* Step back to the chunk just computed and store the results there. */
"lay %[x_ptr], -128(%[x_ptr]) \n\t" | |||||
"vstm %%v24,%%v31, 0(%[x_ptr]) \n\t" | |||||
/* Net effect of -128 then +256: x_ptr advances by one chunk. */
"la %[x_ptr],256(%[x_ptr]) \n\t" | |||||
/* Decrement the chunk counter and loop while it is non-zero. */
"brctg %[n],1b \n\t" | |||||
"2: \n\t" | |||||
/* Tail: scale the last chunk already held in v16..v23 and store it
   at x_ptr - 128; no further loads are issued. */
"vfmdb %%v24, %%v16, %%v0 \n\t" | |||||
"vfmdb %%v25, %%v17, %%v1 \n\t" | |||||
"vfmdb %%v26, %%v18, %%v0 \n\t" | |||||
"vfmdb %%v27, %%v19, %%v1 \n\t" | |||||
"lay %[x_ptr] , -128(%[x_ptr]) \n\t" | |||||
"vfmdb %%v28, %%v20, %%v0 \n\t" | |||||
"vfmdb %%v29, %%v21, %%v1 \n\t" | |||||
"vfmdb %%v30, %%v22, %%v0 \n\t" | |||||
"vfmdb %%v31, %%v23, %%v1 \n\t" | |||||
"vstm %%v24,%%v31, 0(%[x_ptr]) \n\t" | |||||
/* Outputs: the n doubles at x are declared read/write memory; x and n
   are modified by the asm and marked early-clobber. */
: [mem] "+m" (*(double (*)[n])x) ,[x_ptr] "+&a"(x),[n] "+&r"(n) | |||||
/* Input: da in a floating-point register. */
: [alpha] "f"(da) | |||||
/* Clobbers: condition code, r0, and every vector register used above. */
:"cc" , "r0","v0","v1","v16","v17","v18","v19","v20","v21", | |||||
"v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" | |||||
); | |||||
} | |||||
#else | |||||
static void dscal_kernel_32( BLASLONG n, FLOAT da , FLOAT *x ) | static void dscal_kernel_32( BLASLONG n, FLOAT da , FLOAT *x ) | ||||
{ | { | ||||
@@ -71,7 +117,7 @@ static void dscal_kernel_32( BLASLONG n, FLOAT da , FLOAT *x ) | |||||
); | ); | ||||
} | } | ||||
#endif | |||||
static void dscal_kernel_32_zero( BLASLONG n, FLOAT *x ) | static void dscal_kernel_32_zero( BLASLONG n, FLOAT *x ) | ||||
{ | { | ||||
@@ -214,6 +260,4 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||||
} | } | ||||
return 0; | return 0; | ||||
} | |||||
} |