From bb2f52844bbcd5c786d7b37f8c4d88dbf7a3b89e Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Wed, 24 Jun 2020 14:50:12 -0500 Subject: [PATCH] powerpc: Optimized ZGEMM kernel for POWER10 This patch introduces new optimized version of ZGEMM kernel using power10 Matrix-Multiply Assist (MMA) feature introduced in POWER ISA v3.1. This patch makes use of new POWER10 compute instructions for matrix multiplication operation. Tested on simulator and there are no new test failures. Cycles count reduced by 30-50% compared to POWER9 version depending on M/N/K sizes. --- kernel/power/KERNEL.POWER10 | 4 +- kernel/power/zgemm_kernel_power10.S | 245 ++++ kernel/power/zgemm_logic_power10.S | 1735 +++++++++++++++++++++++++++ kernel/power/zgemm_macros_power10.S | 1138 ++++++++++++++++++ 4 files changed, 3120 insertions(+), 2 deletions(-) create mode 100644 kernel/power/zgemm_kernel_power10.S create mode 100644 kernel/power/zgemm_logic_power10.S create mode 100644 kernel/power/zgemm_macros_power10.S diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index 00d31f8b6..4fc7190b0 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -10,7 +10,7 @@ else STRMMKERNEL = sgemm_kernel_power10.c DTRMMKERNEL = dgemm_kernel_power10.c CTRMMKERNEL = cgemm_kernel_power10.S -ZTRMMKERNEL = zgemm_kernel_power9.S +ZTRMMKERNEL = zgemm_kernel_power10.S SGEMMKERNEL = sgemm_kernel_power10.c SGEMMINCOPY = ../generic/gemm_ncopy_16.c @@ -42,7 +42,7 @@ CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) -ZGEMMKERNEL = zgemm_kernel_power9.S +ZGEMMKERNEL = zgemm_kernel_power10.S ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c diff --git a/kernel/power/zgemm_kernel_power10.S b/kernel/power/zgemm_kernel_power10.S new file mode 100644 index 000000000..fca389e69 --- /dev/null +++ b/kernel/power/zgemm_kernel_power10.S @@ -0,0 +1,245 @@ +/*************************************************************************** +Copyright (c) 2013-2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#define LOAD ld + +#define STACKSIZE 512 + +#define FZERO 312+192(SP) + +#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ + +#define M r3 +#define N r4 +#define K r5 + + +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 + + + +#define o0 0 +#define alpha_r vs62 +#define alpha_i vs63 + +#define VECSAVE r11 + +#define FRAMEPOINTER r12 + +#define T10 r14 + +#define L r15 +#define T8 r16 +#define T5 r17 +#define T2 r19 +#define TEMP_REG r20 +#define T6 r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define T7 r27 +#define T3 r28 +#define T4 r29 + +#define PRE r30 +#define T1 r31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + mr FRAMEPOINTER, SP + addi SP, SP, -STACKSIZE + mflr r0 + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + xxspltd alpha_r,vs1,0 /*copy from register f1 */ + xxspltd alpha_i,vs2,0 /*copy from register f2 */ + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + + stxv vs20, 288(SP) + stxv vs21, 304(SP) + stxv vs22, 320(SP) + stxv vs23, 336(SP) + stxv vs24, 352(SP) + stxv vs25, 368(SP) + stxv vs26, 384(SP) + stxv vs27, 400(SP) + stxv vs28, 416(SP) + stxv vs29, 432(SP) + stxv vs30, 448(SP) + stxv vs31, 464(SP) + + std r0, FLINK_SAVE(SP) + + +#if defined(linux) || defined(__FreeBSD__) + ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) +#endif + + +#ifdef TRMMKERNEL +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) + ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) +#endif +#endif + + +#include "zgemm_macros_power10.S" + + + + slwi LDC, LDC, ZBASE_SHIFT + li PRE, 512 + li r0, 0 + + +#if defined(CC) || defined(CR) || defined(RC) || defined(RR) +/*negate for this case as we will use addition -1*(a+b) */ + xvnegdp alpha_r,alpha_r + xvnegdp alpha_i,alpha_i +#endif + .align 4 + +#include "zgemm_logic_power10.S" + +L999: + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 
192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + ld r0, FLINK_SAVE(SP) + + lxv vs20, 288(SP) + lxv vs21, 304(SP) + lxv vs22, 320(SP) + lxv vs23, 336(SP) + lxv vs24, 352(SP) + lxv vs25, 368(SP) + lxv vs26, 384(SP) + lxv vs27, 400(SP) + mtlr r0 + lxv vs28, 416(SP) + lxv vs29, 432(SP) + lxv vs30, 448(SP) + lxv vs31, 464(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE +#endif diff --git a/kernel/power/zgemm_logic_power10.S b/kernel/power/zgemm_logic_power10.S new file mode 100644 index 000000000..1143733e0 --- /dev/null +++ b/kernel/power/zgemm_logic_power10.S @@ -0,0 +1,1735 @@ +/*************************************************************************** +Copyright (c) 2013-2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ +#define MY_ALIGN .align 3 +b ZGEMM_L2 +/* MINI SUBROUTINES */ +/* 2x8 MAIN 128x+2 LOOP */ + + +ZGEMM_L2x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + MY_ALIGN +ZGEMM_L2x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_2 0, 0 +ZGEMM_L2x8_K128: +/*----------------------------------------*/ + KERNEL2x8_2 1, 0 + dcbt AO, T2 + KERNEL2x8_2 2, 0 + KERNEL2x8_2 3, 0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_2 4, 0 + KERNEL2x8_2 5, 0 + dcbt AO, T4 + KERNEL2x8_2 6, 0 + KERNEL2x8_2 7, 0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_2 8, 0 + KERNEL2x8_2 9, 0 + KERNEL2x8_2 10, 0 + KERNEL2x8_2 11, 0 + dcbt BO, T4 + KERNEL2x8_2 12, 0 + KERNEL2x8_2 13, 0 + KERNEL2x8_2 14, 0 + KERNEL2x8_2 15, 0 + KERNEL2x8_2 16, 0 + KERNEL2x8_2 17, 0 + KERNEL2x8_2 18, 0 + KERNEL2x8_2 19, 0 + KERNEL2x8_2 20, 0 + KERNEL2x8_2 21, 0 + KERNEL2x8_2 22, 0 + KERNEL2x8_2 23, 0 + KERNEL2x8_2 24, 0 + KERNEL2x8_2 25, 0 + KERNEL2x8_2 26, 0 + KERNEL2x8_2 27, 0 + KERNEL2x8_2 28, 0 + KERNEL2x8_2 29, 0 + KERNEL2x8_2 30, 0 + KERNEL2x8_2 31, 0 + KERNEL2x8_2 32, 0 + KERNEL2x8_2 33, 0 + KERNEL2x8_2 34, 0 + KERNEL2x8_2 35, 0 + KERNEL2x8_2 36, 0 + KERNEL2x8_2 37, 0 + KERNEL2x8_2 38, 0 + KERNEL2x8_2 39, 0 + KERNEL2x8_2 40, 0 + KERNEL2x8_2 41, 0 + KERNEL2x8_2 42, 0 + KERNEL2x8_2 43, 0 + KERNEL2x8_2 44, 0 + KERNEL2x8_2 45, 0 + KERNEL2x8_2 46, 0 + KERNEL2x8_2 47, 0 + KERNEL2x8_2 48, 0 + KERNEL2x8_2 49, 0 + KERNEL2x8_2 50, 0 + KERNEL2x8_2 51, 0 + KERNEL2x8_2 52, 0 + KERNEL2x8_2 53, 0 + KERNEL2x8_2 54, 0 + KERNEL2x8_2 55, 0 + KERNEL2x8_2 56, 0 + KERNEL2x8_2 57, 0 + KERNEL2x8_2 58, 0 + KERNEL2x8_2 59, 0 + KERNEL2x8_2 60, 0 + KERNEL2x8_2 61, 0 + KERNEL2x8_2 62, 0 + KERNEL2x8_2 63, 1 + bdz ZGEMM_L2x8_LOOP_END + b ZGEMM_L2x8_LOOP + MY_ALIGN + +ZGEMM_L2x8_LOOP_END: +/*----------------------------------------*/ + KERNEL2x8_2 0, 1 + blr + MY_ALIGN + + +ZGEMM_2x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + MY_ALIGN +ZGEMM_L2x4_LOOP: +/*----------------------------------------*/ + KERNEL2x4_2 0, 0 +ZGEMM_L2x4_K32: +/*----------------------------------------*/ + KERNEL2x4_2 1, 0 + KERNEL2x4_2 2, 0 + KERNEL2x4_2 3, 0 + KERNEL2x4_2 4, 0 + KERNEL2x4_2 5, 0 + KERNEL2x4_2 6, 0 + KERNEL2x4_2 7, 0 + KERNEL2x4_2 8, 0 + KERNEL2x4_2 9, 0 + KERNEL2x4_2 10, 0 + KERNEL2x4_2 11, 0 + KERNEL2x4_2 12, 0 + KERNEL2x4_2 13, 0 + KERNEL2x4_2 14, 0 + KERNEL2x4_2 15, 1 + bdnz ZGEMM_L2x4_LOOP + MY_ALIGN +ZGEMM_L2x4_LOOP_END: +/*----------------------------------------*/ + KERNEL2x4_2 0, 1 + blr + MY_ALIGN + + +ZGEMM_2x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + MY_ALIGN +ZGEMM_L2x2_LOOP: +/*----------------------------------------*/ + KERNEL2x2_2 0, 0 +ZGEMM_L2x2_K32: +/*----------------------------------------*/ + KERNEL2x2_2 1, 0 + KERNEL2x2_2 2, 0 + KERNEL2x2_2 3, 0 + KERNEL2x2_2 4, 0 + KERNEL2x2_2 5, 0 + KERNEL2x2_2 6, 0 + KERNEL2x2_2 7, 0 + KERNEL2x2_2 8, 0 + KERNEL2x2_2 9, 0 + KERNEL2x2_2 10, 0 + KERNEL2x2_2 11, 0 + KERNEL2x2_2 12, 0 + KERNEL2x2_2 13, 0 + KERNEL2x2_2 14, 0 + KERNEL2x2_2 15, 1 + bdnz ZGEMM_L2x2_LOOP + MY_ALIGN + + +ZGEMM_L2x2_LOOP_END: +/*----------------------------------------*/ + KERNEL2x2_2 0, 1 + blr + MY_ALIGN + +ZGEMM_2x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x1_2 + MY_ALIGN +ZGEMM_L2x1_LOOP: +/*----------------------------------------*/ + KERNEL2x1_L2 32, 64, 0, 0 +ZGEMM_L2x1_K32: 
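/* This M=1 column tail does not use the MMA accumulators: KERNEL2x1_L2
   (defined in zgemm_macros_power10.S) relies on plain VSX xvmaddadp
   multiply-adds.  Each KERNEL2x1_L2 step covers two k iterations, so A
   advances 2*16 = 32 bytes (one complex double per k) and B advances
   2*2*16 = 64 bytes (two columns per k); the 16 unrolled steps of this
   loop therefore consume 32 k iterations per CTR decrement, matching the
   srawi by 5 in the caller. */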
+/*----------------------------------------*/ + KERNEL2x1_L2 32, 64, 1, 0 + KERNEL2x1_L2 32, 64, 2, 0 + KERNEL2x1_L2 32, 64, 3, 0 + KERNEL2x1_L2 32, 64, 4, 0 + KERNEL2x1_L2 32, 64, 5, 0 + KERNEL2x1_L2 32, 64, 6, 0 + KERNEL2x1_L2 32, 64, 7, 0 + KERNEL2x1_L2 32, 64, 8, 0 + KERNEL2x1_L2 32, 64, 9, 0 + KERNEL2x1_L2 32, 64, 10, 0 + KERNEL2x1_L2 32, 64, 11, 0 + KERNEL2x1_L2 32, 64, 12, 0 + KERNEL2x1_L2 32, 64, 13, 0 + KERNEL2x1_L2 32, 64, 14, 0 + KERNEL2x1_L2 32, 64, 15, 1 + bdnz ZGEMM_L2x1_LOOP + MY_ALIGN +ZGEMM_L2x1_LOOP_END: +/*----------------------------------------*/ + END2x1_2 + blr + + MY_ALIGN + + +/* MAIN LOOP BEGINS */ + MY_ALIGN + + +ZGEMM_L2: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) && !defined(LEFT) + neg TEMP_REG, OFFSET +#endif + srawi. J, N, 1 + bgt ZGEMM_L2_BEGIN + b ZGEMM_L2_END + +ZGEMM_L2_BEGIN: +/*----------------------------------------*/ + mr CO, C + slwi T1, LDC, 1 + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + bgt ZGEMM_L2_BEGIN_CONTINUE + b ZGEMM_L2x8_END + +ZGEMM_L2_BEGIN_CONTINUE: + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +ZGEMM_L2x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO, BO,TEMP_REG, B, 8, 2 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG, 8, 2 + mr T1, T6 +#else + mr T1, K +#endif +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /* T8 <- T1 % 128 */ + + KERNEL2x8_PRELOAD + KERNEL2x8_ZERO_AND_PRIME_MMA + ble ZGEMM_L2x8_SUB0 + bl ZGEMM_L2x8_LMAIN_SUB + andi. L, T1, 127 + + bgt ZGEMM_L2x8_BEGIN_CONTINUE + b ZGEMM_L2x8_SAVE + +ZGEMM_L2x8_BEGIN_CONTINUE: + b ZGEMM_L2x8_SUB2 + + +ZGEMM_L2x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6, 129 +#else + andi. L, K, 255 + cmpwi K, 129 +#endif + li T8, 1 + bne CMP2x8_128K + LOAD_END_2x8 128, 32 + KERNEL2x8_PRELOAD + addi BO, BO, -64 + addi AO,AO, -256 + mtctr T8 + bl ZGEMM_L2x8_K128 + b ZGEMM_L2x8_SAVE + +CMP2x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6, 128 +#else + cmpwi K, 128 +#endif + bne ZGEMM_L2x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO, BO, -64 + addi AO,AO, -256 + bl ZGEMM_L2x8_K128 + b ZGEMM_L2x8_SAVE + MY_ALIGN + + +ZGEMM_L2x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble ZGEMM_L2x8_SUB2_32 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_2 0, 0 + KERNEL2x8_2 1, 0 + dcbt AO, T2 + KERNEL2x8_2 2, 0 + KERNEL2x8_2 3, 0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_2 4, 0 + KERNEL2x8_2 5, 0 + dcbt AO, T4 + KERNEL2x8_2 6, 0 + KERNEL2x8_2 7, 0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_2 8, 0 + KERNEL2x8_2 9, 0 + KERNEL2x8_2 10, 0 + KERNEL2x8_2 11, 0 + dcbt BO, T4 + KERNEL2x8_2 12, 0 + KERNEL2x8_2 13, 0 + KERNEL2x8_2 14, 0 + KERNEL2x8_2 15, 0 + KERNEL2x8_2 16, 0 + KERNEL2x8_2 17, 0 + KERNEL2x8_2 18, 0 + KERNEL2x8_2 19, 0 + KERNEL2x8_2 20, 0 + KERNEL2x8_2 21, 0 + KERNEL2x8_2 22, 0 + KERNEL2x8_2 23, 0 + KERNEL2x8_2 24, 0 + KERNEL2x8_2 25, 0 + KERNEL2x8_2 26, 0 + KERNEL2x8_2 27, 0 + KERNEL2x8_2 28, 0 + KERNEL2x8_2 29, 0 + KERNEL2x8_2 30, 0 + KERNEL2x8_2 31, 1 + MY_ALIGN + + +ZGEMM_L2x8_SUB2_32: +/*----------------------------------------*/ + andi. 
T1,L, 32 + ble ZGEMM_L2x8_SUB2_16 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_2 0, 0 + KERNEL2x8_2 1, 0 + dcbt AO, T2 + KERNEL2x8_2 2, 0 + KERNEL2x8_2 3, 0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_2 4, 0 + KERNEL2x8_2 5, 0 + dcbt AO, T4 + KERNEL2x8_2 6, 0 + KERNEL2x8_2 7, 0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_2 8, 0 + KERNEL2x8_2 9, 0 + KERNEL2x8_2 10, 0 + KERNEL2x8_2 11, 0 + dcbt BO, T4 + KERNEL2x8_2 12, 0 + KERNEL2x8_2 13, 0 + KERNEL2x8_2 14, 0 + KERNEL2x8_2 15, 1 + MY_ALIGN + + +ZGEMM_L2x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L2x8_SUB2_8 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_2 0, 0 + KERNEL2x8_2 1, 0 + dcbt AO, T2 + KERNEL2x8_2 2, 0 + KERNEL2x8_2 3, 0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_2 4, 0 + KERNEL2x8_2 5, 0 + dcbt AO, T4 + KERNEL2x8_2 6, 0 + KERNEL2x8_2 7, 1 + MY_ALIGN + + +ZGEMM_L2x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L2x8_SUB2_4 + KERNEL2x8_2 0, 0 + KERNEL2x8_2 1, 0 + KERNEL2x8_2 2, 0 + KERNEL2x8_2 3, 1 + MY_ALIGN + + +ZGEMM_L2x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L2x8_SUB2_2 + KERNEL2x8_2 0, 0 + KERNEL2x8_2 1, 1 + MY_ALIGN + + +ZGEMM_L2x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L2x8_SUB2_1 + KERNEL2x8_2 0, 1 + MY_ALIGN + + +ZGEMM_L2x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L2x8_SAVE + LOAD_END_2x8 128, 32 + + +ZGEMM_L2x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + KERNEL2x8_UNPRIME_MMA + SAVE2x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG, BO,AO, 8, 2 +#endif + + ble ZGEMM_L2x8_SAVE_CONTINUE + b ZGEMM_L2x8_BEGIN + +ZGEMM_L2x8_SAVE_CONTINUE: + andi. T2, M, 7 + ble ZGEMM_L2x1_END + andi. T1, M, 4 + ble ZGEMM_L2x4_END + b ZGEMM_L2x4_BEGIN + MY_ALIGN + + +ZGEMM_L2x8_END: +/*----------------------------------------*/ + + +ZGEMM_L2x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble ZGEMM_L2x1_END + andi. T1, M, 4 + ble ZGEMM_L2x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO, BO,TEMP_REG, B, 4, 2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG, 4, 2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + KERNEL2x4_PRELOAD + KERNEL2x4_ZERO_AND_PRIME_MMA + ble ZGEMM_L2x4_SUB0 + bl ZGEMM_2x4_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L2x4_SAVE + b ZGEMM_L2x4_SUB2 + + +ZGEMM_L2x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6, 33 +#else + andi. L, K, 63 + cmpwi K, 33 +#endif + li T8, 1 + bne CMP2x4_32K + LOAD_END_2x4 64, 32 + KERNEL2x4_PRELOAD + addi BO, BO, -64 + addi AO,AO, -128 + mtctr T8 + bl ZGEMM_L2x4_K32 + b ZGEMM_L2x4_SAVE + CMP2x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6, 32 +#else + cmpwi K, 32 +#endif + bne ZGEMM_L2x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO, BO, -64 + addi AO,AO, -128 + bl ZGEMM_L2x4_K32 + b ZGEMM_L2x4_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L2x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L2x4_SUB2_8 + KERNEL2x4_2 0, 0 + KERNEL2x4_2 1, 0 + KERNEL2x4_2 2, 0 + KERNEL2x4_2 3, 0 + KERNEL2x4_2 4, 0 + KERNEL2x4_2 5, 0 + KERNEL2x4_2 6, 0 + KERNEL2x4_2 7, 1 + MY_ALIGN + + +ZGEMM_L2x4_SUB2_8: +/*----------------------------------------*/ + andi. 
T1,L, 8 + ble ZGEMM_L2x4_SUB2_4 + KERNEL2x4_2 0, 0 + KERNEL2x4_2 1, 0 + KERNEL2x4_2 2, 0 + KERNEL2x4_2 3, 1 + MY_ALIGN + + +ZGEMM_L2x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L2x4_SUB2_2 + KERNEL2x4_2 0, 0 + KERNEL2x4_2 1, 1 + MY_ALIGN + + +ZGEMM_L2x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L2x4_SUB2_1 + KERNEL2x4_2 0, 1 + MY_ALIGN + + +ZGEMM_L2x4_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L2x4_SAVE + LOAD_END_2x4 64, 32 + + +ZGEMM_L2x4_SAVE: +/*----------------------------------------*/ + KERNEL2x4_UNPRIME_MMA + SAVE2x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG, BO,AO, 4, 2 +#endif + + +ZGEMM_L2x4_END: +/*----------------------------------------*/ + + +ZGEMM_L2x2_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 2 + ble ZGEMM_L2x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO, BO,TEMP_REG, B, 2, 2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG, 2, 2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + KERNEL2x2_PRELOAD + KERNEL2x2_ZERO_AND_PRIME_MMA + ble ZGEMM_L2x2_SUB0 + bl ZGEMM_2x2_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L2x2_SAVE + b ZGEMM_L2x2_SUB2 + + +ZGEMM_L2x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6, 33 +#else + andi. L, K, 63 + cmpwi K, 33 +#endif + li T8, 1 + bne CMP2x2_32K + LOAD_END_2x2 32, 32 + KERNEL2x2_PRELOAD + addi BO, BO, -64 + addi AO,AO, -64 + mtctr T8 + bl ZGEMM_L2x2_K32 + b ZGEMM_L2x2_SAVE + CMP2x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6, 32 +#else + cmpwi K, 32 +#endif + bne ZGEMM_L2x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO, BO, -64 + addi AO,AO, -64 + bl ZGEMM_L2x2_K32 + b ZGEMM_L2x2_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L2x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L2x2_SUB2_8 + KERNEL2x2_2 0, 0 + KERNEL2x2_2 1, 0 + KERNEL2x2_2 2, 0 + KERNEL2x2_2 3, 0 + KERNEL2x2_2 4, 0 + KERNEL2x2_2 5, 0 + KERNEL2x2_2 6, 0 + KERNEL2x2_2 7, 1 + MY_ALIGN + + +ZGEMM_L2x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L2x2_SUB2_4 + KERNEL2x2_2 0, 0 + KERNEL2x2_2 1, 0 + KERNEL2x2_2 2, 0 + KERNEL2x2_2 3, 1 + MY_ALIGN + + +ZGEMM_L2x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L2x2_SUB2_2 + KERNEL2x2_2 0, 0 + KERNEL2x2_2 1, 1 + MY_ALIGN + + +ZGEMM_L2x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L2x2_SUB2_1 + KERNEL2x2_2 0, 1 + MY_ALIGN + + +ZGEMM_L2x2_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L2x2_SAVE + LOAD_END_2x2 32, 32 + + +ZGEMM_L2x2_SAVE: +/*----------------------------------------*/ + KERNEL2x2_UNPRIME_MMA + SAVE2x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG, BO,AO, 2, 2 +#endif + + +ZGEMM_L2x2_END: +/*----------------------------------------*/ + + +ZGEMM_L2x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble ZGEMM_L2x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO, BO,TEMP_REG, B, 1, 2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG, 1, 2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. 
T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x1 + ble ZGEMM_L2x1_SUB0 + bl ZGEMM_2x1_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L2x1_SAVE + b ZGEMM_L2x1_SUB2 + + +ZGEMM_L2x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6, 33 +#else + andi. L, K, 63 + cmpwi K, 33 +#endif + li T8, 1 + bne CMP2x1_32K + addi BO, BO, -32 + addi AO,AO, -16 + LOAD2x1O 16, 32 + END2x1_WITHOUT_ADD + LOAD2x1_2O 32, 64 + mtctr T8 + bl ZGEMM_L2x1_K32 + b ZGEMM_L2x1_SAVE + CMP2x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6, 32 +#else + cmpwi K, 32 +#endif + bne ZGEMM_L2x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO, BO, -64 + addi AO,AO, -32 + LOAD2x1_2O 32, 64 + bl ZGEMM_L2x1_K32 + b ZGEMM_L2x1_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L2x1_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L2x1_SUB2_8 + LOAD2x1_2 + KERNEL2x1_L2 32, 64, 0, 0 + KERNEL2x1_L2 32, 64, 1, 0 + KERNEL2x1_L2 32, 64, 2, 0 + KERNEL2x1_L2 32, 64, 3, 0 + KERNEL2x1_L2 32, 64, 4, 0 + KERNEL2x1_L2 32, 64, 5, 0 + KERNEL2x1_L2 32, 64, 6, 0 + KERNEL2x1_E2 32, 64, 7, 1 + MY_ALIGN + + +ZGEMM_L2x1_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L2x1_SUB2_4 + LOAD2x1_2 + KERNEL2x1_L2 32, 64, 0, 0 + KERNEL2x1_L2 32, 64, 1, 0 + KERNEL2x1_L2 32, 64, 2, 0 + KERNEL2x1_E2 32, 64, 3, 1 + MY_ALIGN + + +ZGEMM_L2x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L2x1_SUB2_2 + LOAD2x1_2 + KERNEL2x1_L2 32, 64, 0, 0 + KERNEL2x1_E2 32, 64, 1, 1 + MY_ALIGN + + +ZGEMM_L2x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L2x1_SUB2_1 + LOAD2x1_2 + KERNEL2x1_E2 32, 64, 0, 1 + MY_ALIGN + + +ZGEMM_L2x1_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L2x1_SAVE + KERNEL2x1 + + +ZGEMM_L2x1_SAVE: +/*----------------------------------------*/ + SAVE2x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG, BO,AO, 1, 2 +#endif + + +ZGEMM_L2x1_END: +/*----------------------------------------*/ + slwi T1, K, 5 + addic. 
J, J, -1 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 2 +#endif + ble ZGEMM_L2_END + b ZGEMM_L2_BEGIN + +ZGEMM_L2_END: + +b ZGEMM_L1 +/* MINI SUBROUTINES */ +/* 1x8 MAIN 128x+2 LOOP */ + + +ZGEMM_L1x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + MY_ALIGN +ZGEMM_L1x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_2 0, 0 +ZGEMM_L1x8_K128: +/*----------------------------------------*/ + KERNEL1x8_2 1, 0 + dcbt AO, T2 + KERNEL1x8_2 2, 0 + KERNEL1x8_2 3, 0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_2 4, 0 + KERNEL1x8_2 5, 0 + dcbt AO, T4 + KERNEL1x8_2 6, 0 + KERNEL1x8_2 7, 0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_2 8, 0 + KERNEL1x8_2 9, 0 + KERNEL1x8_2 10, 0 + KERNEL1x8_2 11, 0 + dcbt BO, T4 + KERNEL1x8_2 12, 0 + KERNEL1x8_2 13, 0 + KERNEL1x8_2 14, 0 + KERNEL1x8_2 15, 0 + KERNEL1x8_2 16, 0 + KERNEL1x8_2 17, 0 + KERNEL1x8_2 18, 0 + KERNEL1x8_2 19, 0 + KERNEL1x8_2 20, 0 + KERNEL1x8_2 21, 0 + KERNEL1x8_2 22, 0 + KERNEL1x8_2 23, 0 + KERNEL1x8_2 24, 0 + KERNEL1x8_2 25, 0 + KERNEL1x8_2 26, 0 + KERNEL1x8_2 27, 0 + KERNEL1x8_2 28, 0 + KERNEL1x8_2 29, 0 + KERNEL1x8_2 30, 0 + KERNEL1x8_2 31, 0 + KERNEL1x8_2 32, 0 + KERNEL1x8_2 33, 0 + KERNEL1x8_2 34, 0 + KERNEL1x8_2 35, 0 + KERNEL1x8_2 36, 0 + KERNEL1x8_2 37, 0 + KERNEL1x8_2 38, 0 + KERNEL1x8_2 39, 0 + KERNEL1x8_2 40, 0 + KERNEL1x8_2 41, 0 + KERNEL1x8_2 42, 0 + KERNEL1x8_2 43, 0 + KERNEL1x8_2 44, 0 + KERNEL1x8_2 45, 0 + KERNEL1x8_2 46, 0 + KERNEL1x8_2 47, 0 + KERNEL1x8_2 48, 0 + KERNEL1x8_2 49, 0 + KERNEL1x8_2 50, 0 + KERNEL1x8_2 51, 0 + KERNEL1x8_2 52, 0 + KERNEL1x8_2 53, 0 + KERNEL1x8_2 54, 0 + KERNEL1x8_2 55, 0 + KERNEL1x8_2 56, 0 + KERNEL1x8_2 57, 0 + KERNEL1x8_2 58, 0 + KERNEL1x8_2 59, 0 + KERNEL1x8_2 60, 0 + KERNEL1x8_2 61, 0 + KERNEL1x8_2 62, 0 + KERNEL1x8_2 63, 1 + bdnz ZGEMM_L1x8_LOOP + MY_ALIGN +ZGEMM_L1x8_LOOP_END: +/*----------------------------------------*/ + KERNEL1x8_2 0, 1 + blr + MY_ALIGN + + +ZGEMM_1x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + MY_ALIGN + + +ZGEMM_L1x4_LOOP: +/*----------------------------------------*/ + KERNEL1x4_2 0, 0 + + +ZGEMM_L1x4_K32: +/*----------------------------------------*/ + KERNEL1x4_2 1, 0 + KERNEL1x4_2 2, 0 + KERNEL1x4_2 3, 0 + KERNEL1x4_2 4, 0 + KERNEL1x4_2 5, 0 + KERNEL1x4_2 6, 0 + KERNEL1x4_2 7, 0 + KERNEL1x4_2 8, 0 + KERNEL1x4_2 9, 0 + KERNEL1x4_2 10, 0 + KERNEL1x4_2 11, 0 + KERNEL1x4_2 12, 0 + KERNEL1x4_2 13, 0 + KERNEL1x4_2 14, 0 + KERNEL1x4_2 15, 1 + bdnz ZGEMM_L1x4_LOOP + MY_ALIGN + + +ZGEMM_L1x4_LOOP_END: +/*----------------------------------------*/ + KERNEL1x4_2 0, 1 + blr + MY_ALIGN + + +ZGEMM_1x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + MY_ALIGN + + +ZGEMM_L1x2_LOOP: +/*----------------------------------------*/ + KERNEL1x2_2 0, 0 + + +ZGEMM_L1x2_K32: +/*----------------------------------------*/ + KERNEL1x2_2 1, 0 + KERNEL1x2_2 2, 0 + KERNEL1x2_2 3, 0 + KERNEL1x2_2 4, 0 + KERNEL1x2_2 5, 0 + KERNEL1x2_2 6, 0 + KERNEL1x2_2 7, 0 + KERNEL1x2_2 8, 0 + KERNEL1x2_2 9, 0 + KERNEL1x2_2 10, 0 + KERNEL1x2_2 11, 0 + KERNEL1x2_2 12, 0 + KERNEL1x2_2 13, 0 + KERNEL1x2_2 14, 0 + KERNEL1x2_2 15, 1 + bdnz ZGEMM_L1x2_LOOP + MY_ALIGN + + +ZGEMM_L1x2_LOOP_END: +/*----------------------------------------*/ + KERNEL1x2_2 0, 1 + blr + MY_ALIGN + + +ZGEMM_1x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x1_2 + MY_ALIGN + + +ZGEMM_L1x1_LOOP: +/*----------------------------------------*/ + KERNEL1x1_L2 32, 32, 0, 0 + + 
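/* Like the 2x1 case, this N=1, M=1 tail works on plain VSX vector
   registers rather than MMA accumulators (ZERO1x1 instead of a
   *_ZERO_AND_PRIME_MMA macro).  Each KERNEL1x1_L2 step covers two k
   iterations, advancing both A and B by 2*16 = 32 bytes, so the 16
   unrolled steps of this loop consume 32 k iterations per CTR decrement. */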
+ZGEMM_L1x1_K32: +/*----------------------------------------*/ + KERNEL1x1_L2 32, 32, 1, 0 + KERNEL1x1_L2 32, 32, 2, 0 + KERNEL1x1_L2 32, 32, 3, 0 + KERNEL1x1_L2 32, 32, 4, 0 + KERNEL1x1_L2 32, 32, 5, 0 + KERNEL1x1_L2 32, 32, 6, 0 + KERNEL1x1_L2 32, 32, 7, 0 + KERNEL1x1_L2 32, 32, 8, 0 + KERNEL1x1_L2 32, 32, 9, 0 + KERNEL1x1_L2 32, 32, 10, 0 + KERNEL1x1_L2 32, 32, 11, 0 + KERNEL1x1_L2 32, 32, 12, 0 + KERNEL1x1_L2 32, 32, 13, 0 + KERNEL1x1_L2 32, 32, 14, 0 + KERNEL1x1_L2 32, 32, 15, 1 + bdnz ZGEMM_L1x1_LOOP + MY_ALIGN + + +ZGEMM_L1x1_LOOP_END: +/*----------------------------------------*/ + END1x1_2 + blr + MY_ALIGN + + +/*----------------------N1 BEGINS---------*/ +ZGEMM_L1: +/*----------------------------------------*/ + andi. T1, N, 1 + ble ZGEMM_L1_END + +ZGEMM_L1_BEGIN: +/*----------------------------------------*/ + mr CO, C + + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + ble ZGEMM_L1x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +ZGEMM_L1x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO, BO,TEMP_REG, B, 8, 1 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG, 8, 1 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(T11-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(K-2) % 128x */ +#endif + KERNEL1x8_ZERO_AND_PRIME_MMA + ble ZGEMM_L1x8_SUB0 + bl ZGEMM_L1x8_LMAIN_SUB + andi. L, T1, 127 + ble ZGEMM_L1x8_SAVE + b ZGEMM_L1x8_SUB2 + + +ZGEMM_L1x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6, 129 +#else + andi. L, K, 255 + cmpwi K, 129 +#endif + li T8, 1 + bne CMP1x8_128K + LOAD_END_1x8 -128, -16 + mtctr T8 + bl ZGEMM_L1x8_K128 + b ZGEMM_L1x8_SAVE + CMP1x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6, 128 +#else + cmpwi K, 128 +#endif + bne ZGEMM_L1x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO, BO, -32 + addi AO,AO, -256 + bl ZGEMM_L1x8_K128 + b ZGEMM_L1x8_SAVE + MY_ALIGN + + +ZGEMM_L1x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble ZGEMM_L1x8_SUB2_32 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_2 0, 0 + KERNEL1x8_2 1, 0 + dcbt AO, T2 + KERNEL1x8_2 2, 0 + KERNEL1x8_2 3, 0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_2 4, 0 + KERNEL1x8_2 5, 0 + dcbt AO, T4 + KERNEL1x8_2 6, 0 + KERNEL1x8_2 7, 0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_2 8, 0 + KERNEL1x8_2 9, 0 + KERNEL1x8_2 10, 0 + KERNEL1x8_2 11, 0 + dcbt BO, T4 + KERNEL1x8_2 12, 0 + KERNEL1x8_2 13, 0 + KERNEL1x8_2 14, 0 + KERNEL1x8_2 15, 0 + KERNEL1x8_2 16, 0 + KERNEL1x8_2 17, 0 + KERNEL1x8_2 18, 0 + KERNEL1x8_2 19, 0 + KERNEL1x8_2 20, 0 + KERNEL1x8_2 21, 0 + KERNEL1x8_2 22, 0 + KERNEL1x8_2 23, 0 + KERNEL1x8_2 24, 0 + KERNEL1x8_2 25, 0 + KERNEL1x8_2 26, 0 + KERNEL1x8_2 27, 0 + KERNEL1x8_2 28, 0 + KERNEL1x8_2 29, 0 + KERNEL1x8_2 30, 0 + KERNEL1x8_2 31, 1 + MY_ALIGN + + +ZGEMM_L1x8_SUB2_32: +/*----------------------------------------*/ + andi. 
T1,L, 32 + ble ZGEMM_L1x8_SUB2_16 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_2 0, 0 + KERNEL1x8_2 1, 0 + dcbt AO, T2 + KERNEL1x8_2 2, 0 + KERNEL1x8_2 3, 0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_2 4, 0 + KERNEL1x8_2 5, 0 + dcbt AO, T4 + KERNEL1x8_2 6, 0 + KERNEL1x8_2 7, 0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_2 8, 0 + KERNEL1x8_2 9, 0 + KERNEL1x8_2 10, 0 + KERNEL1x8_2 11, 0 + dcbt BO, T4 + KERNEL1x8_2 12, 0 + KERNEL1x8_2 13, 0 + KERNEL1x8_2 14, 0 + KERNEL1x8_2 15, 1 + MY_ALIGN + + +ZGEMM_L1x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L1x8_SUB2_8 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_2 0, 0 + KERNEL1x8_2 1, 0 + dcbt AO, T2 + KERNEL1x8_2 2, 0 + KERNEL1x8_2 3, 0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_2 4, 0 + KERNEL1x8_2 5, 0 + dcbt AO, T4 + KERNEL1x8_2 6, 0 + KERNEL1x8_2 7, 1 + MY_ALIGN + + +ZGEMM_L1x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L1x8_SUB2_4 + KERNEL1x8_2 0, 0 + KERNEL1x8_2 1, 0 + KERNEL1x8_2 2, 0 + KERNEL1x8_2 3, 1 + MY_ALIGN + + +ZGEMM_L1x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L1x8_SUB2_2 + KERNEL1x8_2 0, 0 + KERNEL1x8_2 1, 1 + MY_ALIGN + + +ZGEMM_L1x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L1x8_SUB2_1 + KERNEL1x8_2 0, 1 + MY_ALIGN + + +ZGEMM_L1x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L1x8_SAVE + LOAD_END_1x8 128, 16 + + +ZGEMM_L1x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + KERNEL1x8_UNPRIME_MMA + SAVE1x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG, BO,AO, 8, 1 +#endif + bgt ZGEMM_L1x8_BEGIN + andi. T2, M, 7 + ble ZGEMM_L1x1_END + andi. T1, M, 4 + ble ZGEMM_L1x4_END + b ZGEMM_L1x4_BEGIN + MY_ALIGN + + +ZGEMM_L1x8_END: +/*----------------------------------------*/ + + +ZGEMM_L1x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble ZGEMM_L1x1_END + andi. T1, M, 4 + ble ZGEMM_L1x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO, BO,TEMP_REG, B, 4, 1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG, 4, 1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + KERNEL1x4_ZERO_AND_PRIME_MMA + ble ZGEMM_L1x4_SUB0 + bl ZGEMM_1x4_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L1x4_SAVE + b ZGEMM_L1x4_SUB2 + + +ZGEMM_L1x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6, 33 +#else + andi. L, K, 63 + cmpwi K, 33 +#endif + li T8, 1 + bne CMP1x4_32K + LOAD_END_1x4 -64, -16 + mtctr T8 + bl ZGEMM_L1x4_K32 + b ZGEMM_L1x4_SAVE + CMP1x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6, 32 +#else + cmpwi K, 32 +#endif + bne ZGEMM_L1x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO, BO, -32 + addi AO,AO, -128 + bl ZGEMM_L1x4_K32 + b ZGEMM_L1x4_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L1x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L1x4_SUB2_8 + KERNEL1x4_2 0, 0 + KERNEL1x4_2 1, 0 + KERNEL1x4_2 2, 0 + KERNEL1x4_2 3, 0 + KERNEL1x4_2 4, 0 + KERNEL1x4_2 5, 0 + KERNEL1x4_2 6, 0 + KERNEL1x4_2 7, 1 + MY_ALIGN + + +ZGEMM_L1x4_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L1x4_SUB2_4 + KERNEL1x4_2 0, 0 + KERNEL1x4_2 1, 0 + KERNEL1x4_2 2, 0 + KERNEL1x4_2 3, 1 + MY_ALIGN + + +ZGEMM_L1x4_SUB2_4: +/*----------------------------------------*/ + andi. 
T1,L, 4 + ble ZGEMM_L1x4_SUB2_2 + KERNEL1x4_2 0, 0 + KERNEL1x4_2 1, 1 + MY_ALIGN + + +ZGEMM_L1x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L1x4_SUB2_1 + KERNEL1x4_2 0, 1 + MY_ALIGN + + +ZGEMM_L1x4_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L1x4_SAVE + LOAD_END_1x4 64,16 + + + +ZGEMM_L1x4_SAVE: +/*----------------------------------------*/ + KERNEL1x4_UNPRIME_MMA + SAVE1x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG, BO,AO, 4, 1 +#endif + + +ZGEMM_L1x4_END: +/*----------------------------------------*/ + + +ZGEMM_L1x2_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 2 + ble ZGEMM_L1x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO, BO,TEMP_REG, B, 2, 1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG, 2, 1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + KERNEL1x2_ZERO_AND_PRIME_MMA + ble ZGEMM_L1x2_SUB0 + bl ZGEMM_1x2_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L1x2_SAVE + b ZGEMM_L1x2_SUB2 + + +ZGEMM_L1x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6, 33 +#else + andi. L, K, 63 + cmpwi K, 33 +#endif + li T8, 1 + bne CMP1x2_32K + LOAD_END_1x2 -32, -16 + mtctr T8 + bl ZGEMM_L1x2_K32 + b ZGEMM_L1x2_SAVE + CMP1x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6, 32 +#else + cmpwi K, 32 +#endif + bne ZGEMM_L1x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO, BO, -32 + addi AO,AO, -64 + bl ZGEMM_L1x2_K32 + b ZGEMM_L1x2_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L1x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L1x2_SUB2_8 + KERNEL1x2_2 0, 0 + KERNEL1x2_2 1, 0 + KERNEL1x2_2 2, 0 + KERNEL1x2_2 3, 0 + KERNEL1x2_2 4, 0 + KERNEL1x2_2 5, 0 + KERNEL1x2_2 6, 0 + KERNEL1x2_2 7, 1 + MY_ALIGN + + +ZGEMM_L1x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L1x2_SUB2_4 + KERNEL1x2_2 0, 0 + KERNEL1x2_2 1, 0 + KERNEL1x2_2 2, 0 + KERNEL1x2_2 3, 1 + MY_ALIGN + + +ZGEMM_L1x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L1x2_SUB2_2 + KERNEL1x2_2 0, 0 + KERNEL1x2_2 1, 1 + MY_ALIGN + + +ZGEMM_L1x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L1x2_SUB2_1 + KERNEL1x2_2 0, 1 + MY_ALIGN + + +ZGEMM_L1x2_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L1x2_SAVE + LOAD_END_1x2 32,16 + + +ZGEMM_L1x2_SAVE: +/*----------------------------------------*/ + KERNEL1x2_UNPRIME_MMA + SAVE1x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG, BO,AO, 2, 1 +#endif + + +ZGEMM_L1x2_END: +/*----------------------------------------*/ + + +ZGEMM_L1x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble ZGEMM_L1x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO, BO,TEMP_REG, B, 1, 1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG, 1, 1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO1x1 + ble ZGEMM_L1x1_SUB0 + bl ZGEMM_1x1_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L1x1_SAVE + b ZGEMM_L1x1_SUB2 + + +ZGEMM_L1x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6, 33 +#else + andi. 
L, K, 63 + cmpwi K, 33 +#endif + li T8, 1 + bne CMP1x1_32K + addi BO, BO, -16 + addi AO,AO, -16 + LOAD1x1O 16, 16 + END1x1_WITHOUT_ADD + LOAD1x1_2O 32, 32 + mtctr T8 + bl ZGEMM_L1x1_K32 + b ZGEMM_L1x1_SAVE + CMP1x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6, 32 +#else + cmpwi K, 32 +#endif + bne ZGEMM_L1x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO, BO, -32 + addi AO,AO, -32 + LOAD1x1_2O 32, 32 + bl ZGEMM_L1x1_K32 + b ZGEMM_L1x1_SAVE + MY_ALIGN + + +ZGEMM_L1x1_SUB2: +/*----------------------------------------*/ + andi. T1, L, 16 + ble ZGEMM_L1x1_SUB2_8 + LOAD1x1_2 + KERNEL1x1_L2 32, 32, 0, 0 + KERNEL1x1_L2 32, 32, 1, 0 + KERNEL1x1_L2 32, 32, 2, 0 + KERNEL1x1_L2 32, 32, 3, 0 + KERNEL1x1_L2 32, 32, 4, 0 + KERNEL1x1_L2 32, 32, 5, 0 + KERNEL1x1_L2 32, 32, 6, 0 + KERNEL1x1_E2 32, 32, 7, 1 + MY_ALIGN + + +ZGEMM_L1x1_SUB2_8: +/*----------------------------------------*/ + andi. T1, L, 8 + ble ZGEMM_L1x1_SUB2_4 + LOAD1x1_2 + KERNEL1x1_L2 32, 32, 0, 0 + KERNEL1x1_L2 32, 32, 1, 0 + KERNEL1x1_L2 32, 32, 2, 0 + KERNEL1x1_E2 32, 32, 3, 1 + MY_ALIGN + + +ZGEMM_L1x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L1x1_SUB2_2 + LOAD1x1_2 + KERNEL1x1_L2 32, 32, 0, 0 + KERNEL1x1_E2 32, 32, 1, 1 + MY_ALIGN + + +ZGEMM_L1x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L1x1_SUB2_1 + LOAD1x1_2 + KERNEL1x1_E2 32, 32, 0, 1 + MY_ALIGN + + +ZGEMM_L1x1_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L1x1_SAVE + KERNEL1x1 + + +ZGEMM_L1x1_SAVE: +/*----------------------------------------*/ + SAVE1x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG, BO,AO, 1, 1 +#endif + + +ZGEMM_L1x1_END: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 1 +#endif + + +ZGEMM_L1_END: +/*----------------------------------------*/ diff --git a/kernel/power/zgemm_macros_power10.S b/kernel/power/zgemm_macros_power10.S new file mode 100644 index 000000000..42f9c5ad4 --- /dev/null +++ b/kernel/power/zgemm_macros_power10.S @@ -0,0 +1,1138 @@ +/*************************************************************************** +Copyright (c) 2013-2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define unit_size 16 +#define DISP32(ind,disp) (ind*unit_size*32+disp) +#define DISP16(ind,disp) (ind*unit_size*16+disp) +#define DISP8(ind,disp) (ind*unit_size*8+disp) +#define DISP4(ind,disp) (ind*unit_size*4+disp) +#define DISP2(ind,disp) (ind*unit_size*2+disp) +#define DISP1(ind,disp) (ind*unit_size+disp) +#define DISPX(disp) (disp) +/* HELPERS FOR SAVE */ +/* {r0,i0} and {r1,i1} into {r0,r1} {i0,i1} */ + + +.macro LOAD_COUPLE_AS_RR_II VS_OUT1,VS_OUT2,VS_TEMP1,VS_TEMP2,REG,LOFFSET +#ifndef TRMMKERNEL + lxv \VS_TEMP1, DISPX(\LOFFSET)(\REG) + lxv \VS_TEMP2, DISPX(\LOFFSET+16)(\REG) + xxmrgld \VS_OUT1,\VS_TEMP1,\VS_TEMP2 + xxmrghd \VS_OUT2,\VS_TEMP1,\VS_TEMP2 +#endif +.endm +/*from 2 result {a0r*br,a0i*bi} and {a1r*br,a1i*bi} pack into {a0r*br,a1r*br} and {a0i*bi,a1i*bi}*/ + + +.macro RESULT_INTO_REALREAL_IMAGEIMAGE VSIN1,VSIN2,VSOUT1,VSOUT2 + xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*real from 2 results*/ + xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*imag from 2 results*/ +.endm +/*from 2 result {a0r*bi,a0i*br} and {a1r*bi,a1i*br} pack into {a0r*bi,a1r*bi} and {a0i*br,a1i*br}*/ + + +.macro RESULT_INTO_REALIMAG_IMAGREAL VSIN1,VSIN2,VSOUT1,VSOUT2 + xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*imag */ + xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*real*/ +.endm +/* {a0r*br op a0i*bi ,a1r*br op a1i*bi} ~ {r0,r1}; {a0r*bi op a0i*br ,a1r*bi op a1i*br} ~ {i0,i1}*/ + + +.macro AGGREGATE_REALS_IMAGES VSINR_OUT1,VSINR,VSINI_OUT2,VSINI +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + xvsubdp \VSINR_OUT1,\VSINR_OUT1,\VSINR + xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR + xvsubdp \VSINI_OUT2,\VSINI_OUT2,\VSINI +#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) + xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR + xvsubdp \VSINI_OUT2,\VSINI,\VSINI_OUT2 +#else // CC || CR || RC || RR + /*we will assume {-alpha_r,-alpha_i} for this case */ + /*i1i2-r1r2 so we will negate alpha real instead to fix sign*/ + xvsubdp \VSINR_OUT1,\VSINR,\VSINR_OUT1 + /*we will negate alpha image instead instead to fix sign*/ + xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI +#endif +.endm +/* {i0,i1} * {alpha_i,alpha_i} - VSOUT1 ;VSOUT2 + {r0,r1}*{alpha_i,alpha_i} */ + + +.macro MULT_APLHA_PART1 VSINRR,VSINII,VSOUT1,VSOUT2 +#ifndef TRMMKERNEL + xvmsubadp \VSOUT1,\VSINII, alpha_i + xvmaddadp \VSOUT2,\VSINRR, alpha_i +#else + xvmuldp \VSOUT1,\VSINII, alpha_i + xvmuldp \VSOUT2,\VSINRR, alpha_i +#endif +.endm +/* {r0,r1} * {alpha_r,alpha_r} - VSOUT1 ;VSOUT2 + {i0,i1} * {alpha_r,alpha_r} */ + + +.macro MULT_APLHA_PART2 VSINRR,VSINII,VSOUT1,VSOUT2 + xvmsubadp \VSOUT1,\VSINRR, alpha_r + xvmaddadp \VSOUT2,\VSINII, alpha_r +.endm +/* unpack to store 2{r,r} {i,i} into {r,i} {r,i} (big endian because of stxv) */ + + +.macro UNPACK_FOR_STORE VSIN1,VSIN2,VSOUT1,VSOUT2 + xxmrghd \VSOUT1,\VSIN2,\VSIN1 + xxmrgld \VSOUT2,\VSIN2,\VSIN1 +.endm + + +.macro 
STORE_COUPLE REG,LOFFSET,VSIN1,VSIN2 + stxv \VSIN1, DISPX(\LOFFSET)(\REG) + stxv \VSIN2, DISPX(\LOFFSET+16)(\REG) +.endm + + +.macro SAVE8 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,VSRes9,VSRes10,VSRes11,VSRes12,VSRes13,VSRes14,VSRes15,VSRes16,BASE_REG,LOFFSET + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs34,vs35 + LOAD_COUPLE_AS_RR_II vs46,vs47,vs50,vs51,\BASE_REG,\LOFFSET + RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs36,vs37 + LOAD_COUPLE_AS_RR_II vs48,vs49,vs52,vs53,\BASE_REG,(\LOFFSET+32) + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs38,vs39 + LOAD_COUPLE_AS_RR_II vs56,vs57,vs50,vs51,\BASE_REG,(\LOFFSET +64) + RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs40,vs41 + LOAD_COUPLE_AS_RR_II vs58,vs59,vs52,vs53,\BASE_REG,(\LOFFSET+96) + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes9,\VSRes11,vs42,vs43 + AGGREGATE_REALS_IMAGES vs34,vs35,vs36,vs37 + RESULT_INTO_REALIMAG_IMAGREAL \VSRes10,\VSRes12,vs44,vs45 + AGGREGATE_REALS_IMAGES vs38,vs39,vs40,vs41 + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes13,\VSRes15,\VSRes1,\VSRes2 + MULT_APLHA_PART1 vs34,vs36, vs46,vs47 + RESULT_INTO_REALIMAG_IMAGREAL \VSRes14,\VSRes16,\VSRes3,\VSRes4 + MULT_APLHA_PART1 vs38,vs40,vs48,vs49 + MULT_APLHA_PART2 vs34,vs36,vs46,vs47 + AGGREGATE_REALS_IMAGES vs42,vs43,vs44,vs45 + MULT_APLHA_PART2 vs38,vs40,vs48,vs49 + AGGREGATE_REALS_IMAGES \VSRes1,\VSRes2,\VSRes3,\VSRes4 + UNPACK_FOR_STORE vs46,vs47,vs39,vs41 + MULT_APLHA_PART1 vs42,vs44, vs56,vs57 + UNPACK_FOR_STORE vs48,vs49,vs35,vs37 + MULT_APLHA_PART1 \VSRes1,\VSRes3, vs58,vs59 + STORE_COUPLE \BASE_REG,\LOFFSET,vs39,vs41 + MULT_APLHA_PART2 vs42,vs44,vs56,vs57 + STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs35,vs37 + MULT_APLHA_PART2 \VSRes1,\VSRes3, vs58,vs59 + UNPACK_FOR_STORE vs56,vs57,vs42,vs44 + UNPACK_FOR_STORE vs58,vs59,\VSRes1,\VSRes3 + STORE_COUPLE \BASE_REG,(\LOFFSET +64),vs42,vs44 + STORE_COUPLE \BASE_REG,(\LOFFSET+96),\VSRes1,\VSRes3 +.endm + + +.macro SAVE4 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,BASE_REG,LOFFSET + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs34,vs35 + LOAD_COUPLE_AS_RR_II vs46,vs47,vs50,vs51,\BASE_REG,\LOFFSET + RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs36,vs37 + LOAD_COUPLE_AS_RR_II vs48,vs49,vs52,vs53,\BASE_REG,(\LOFFSET+32) + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs38,vs39 + RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs40,vs41 + AGGREGATE_REALS_IMAGES vs34,vs35,vs36,vs37 + AGGREGATE_REALS_IMAGES vs38,vs39,vs40,vs41 + MULT_APLHA_PART1 vs34,vs36, vs46,vs47 + MULT_APLHA_PART1 vs38,vs40, vs48,vs49 + MULT_APLHA_PART2 vs34,vs36, vs46,vs47 + MULT_APLHA_PART2 vs38,vs40,vs48,vs49 + UNPACK_FOR_STORE vs46,vs47,vs39,vs41 + UNPACK_FOR_STORE vs48,vs49,vs35,vs37 + STORE_COUPLE \BASE_REG,\LOFFSET,vs39,vs41 + STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs35,vs37 +.endm + + +.macro SAVE2 VSRes1,VSRes2,VSRes3,VSRes4,BASE_REG,LOFFSET + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs34,vs35 + LOAD_COUPLE_AS_RR_II vs46,vs47,vs50,vs51,\BASE_REG,\LOFFSET + RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs36,vs37 + AGGREGATE_REALS_IMAGES vs34,vs35,vs36,vs37 + MULT_APLHA_PART1 vs34,vs36, vs46,vs47 + MULT_APLHA_PART2 vs34,vs36, vs46,vs47 + UNPACK_FOR_STORE vs46,vs47,vs39,vs41 + STORE_COUPLE \BASE_REG,\LOFFSET,vs39,vs41 +.endm + + +.macro SAVE1 VSRes1,VSRes2,BASE_REG,LOFFSET + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes1,vs34,vs35 +#ifndef TRMMKERNEL + lxv vs50, (\LOFFSET)(\BASE_REG) + xxmrgld vs46,vs50,vs50 + xxmrghd vs47,vs50,vs50 +#endif + RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes2,vs36,vs37 + 
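/* At this point vs34/vs35 hold the {real*real, imag*imag} and vs36/vs37
   the {real*imag, imag*real} partial products for the single output
   element.  AGGREGATE_REALS_IMAGES combines them into the real (rr -/+ ii)
   and imaginary (ri +/- ir) sums according to the conjugation variant, the
   two MULT_APLHA passes scale by alpha_r/alpha_i (accumulating the C value
   loaded above unless TRMMKERNEL), and the final xxmrghd repacks the
   result into one {real, imag} vector for the stxv below. */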
AGGREGATE_REALS_IMAGES vs34,vs35,vs36,vs37 + MULT_APLHA_PART1 vs34,vs36, vs46,vs47 + MULT_APLHA_PART2 vs34,vs36, vs46,vs47 + UNPACK_FOR_STORE vs46,vs47,vs39,vs41 + xxmrghd vs39,vs47,vs46 + stxv vs39, (\LOFFSET)(\BASE_REG) +.endm + +/********************************************************************************************** +* + +.macros for N=2 and M=8 +**********************************************************************************************/ + +.macro KERNEL2x8_ZERO_AND_PRIME_MMA + /* zero out and prime the MMA accumulators */ + xxsetaccz 0 + xxsetaccz 1 + xxsetaccz 2 + xxsetaccz 3 + xxsetaccz 4 + xxsetaccz 5 + xxsetaccz 6 + xxsetaccz 7 +.endm + + +.macro KERNEL2x8_PRELOAD + lxvp vs32, 0(AO) // load real,imag from A + lxvp vs34, 32(AO) // load real,imag from A + lxvp vs36, 64(AO) // load real,imag from A + lxvp vs38, 96(AO) // load real,imag from A + lxvp vs48, 0(BO) // load real imag from B +.endm + + +.macro KERNEL2x8_2 Index, IsLast + lxvp vs40, DISP16(\Index,128)(AO) // load real,imag from A + lxvp vs42, DISP16(\Index,160)(AO) // load real,imag from A + lxvp vs44, DISP16(\Index,192)(AO) // load real,imag from A + lxvp vs46, DISP16(\Index,224)(AO) // load real,imag from A + lxvp vs50, DISP4(\Index, 32)(BO) // load real,imag from B + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs34, vs49 + xvf64gerpp 2, vs36, vs49 + xvf64gerpp 3, vs38, vs49 + xvf64gerpp 4, vs32, vs48 + xvf64gerpp 5, vs34, vs48 + xvf64gerpp 6, vs36, vs48 + xvf64gerpp 7, vs38, vs48 + lxvp vs32, DISP16(\Index, 256)(AO) // load real,imag from A + lxvp vs34, DISP16(\Index, 288)(AO) // load real,imag from A + lxvp vs36, DISP16(\Index, 320)(AO) // load real,imag from A + lxvp vs38, DISP16(\Index, 352)(AO) // load real,imag from A + lxvp vs48, DISP4(\Index, 64)(BO) // load real imag from B + xvf64gerpp 0, vs40, vs51 + xvf64gerpp 1, vs42, vs51 + xvf64gerpp 2, vs44, vs51 + xvf64gerpp 3, vs46, vs51 + xvf64gerpp 4, vs40, vs50 + xvf64gerpp 5, vs42, vs50 + xvf64gerpp 6, vs44, vs50 + xvf64gerpp 7, vs46, vs50 +.if \IsLast==1 + addi AO, AO, DISP16(\Index,256) + addi BO, BO, DISP4(\Index,64) +.endif +.endm + + +.macro LOAD_END_2x8 OffsetA,OffsetB + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs34, vs49 + xvf64gerpp 2, vs36, vs49 + xvf64gerpp 3, vs38, vs49 + xvf64gerpp 4, vs32, vs48 + xvf64gerpp 5, vs34, vs48 + xvf64gerpp 6, vs36, vs48 + xvf64gerpp 7, vs38, vs48 + addi BO, BO, \OffsetB + addi AO, AO, \OffsetA +.endm + + +.macro KERNEL2x8_UNPRIME_MMA + /* "unprime" MMA accumulators */ + xxmfacc 0 + xxmfacc 1 + xxmfacc 2 + xxmfacc 3 + xxmfacc 4 + xxmfacc 5 + xxmfacc 6 + xxmfacc 7 +.endm + + +.macro SAVE2x8 + add T1, CO ,LDC + xxpermdi vs32, vs0, vs1, 0b01 + xxpermdi vs33, vs0, vs1, 0b10 + xxpermdi vs34, vs2, vs3, 0b01 + xxpermdi vs35, vs2, vs3, 0b10 + xxpermdi vs36, vs4, vs5, 0b01 + xxpermdi vs37, vs4, vs5, 0b10 + xxpermdi vs38, vs6, vs7, 0b01 + xxpermdi vs39, vs6, vs7, 0b10 + xxpermdi vs40, vs8, vs9, 0b01 + xxpermdi vs41, vs8, vs9, 0b10 + xxpermdi vs42, vs10, vs11, 0b01 + xxpermdi vs43, vs10, vs11, 0b10 + xxpermdi vs44, vs12, vs13, 0b01 + xxpermdi vs45, vs12, vs13, 0b10 + xxpermdi vs46, vs14, vs15, 0b01 + xxpermdi vs47, vs14, vs15, 0b10 + + xxlor vs2, vs32, vs32 + xxlor vs3, vs33, vs33 + xxlor vs0, vs34, vs34 + xxlor vs1, vs35, vs35 + xxlor vs6, vs36, vs36 + xxlor vs7, vs37, vs37 + xxlor vs4, vs38, vs38 + xxlor vs5, vs39, vs39 + xxlor vs10, vs40, vs40 + xxlor vs11, vs41, vs41 + xxlor vs8, vs42, vs42 + xxlor vs9, vs43, vs43 + xxlor vs14, vs44, vs44 + xxlor vs15, vs45, vs45 + xxlor vs12, vs46, vs46 + xxlor vs13, vs47, vs47 + + xxpermdi 
vs32, vs16, vs17, 0b01
+ xxpermdi vs33, vs16, vs17, 0b10
+ xxpermdi vs34, vs18, vs19, 0b01
+ xxpermdi vs35, vs18, vs19, 0b10
+ xxpermdi vs36, vs20, vs21, 0b01
+ xxpermdi vs37, vs20, vs21, 0b10
+ xxpermdi vs38, vs22, vs23, 0b01
+ xxpermdi vs39, vs22, vs23, 0b10
+ xxpermdi vs40, vs24, vs25, 0b01
+ xxpermdi vs41, vs24, vs25, 0b10
+ xxpermdi vs42, vs26, vs27, 0b01
+ xxpermdi vs43, vs26, vs27, 0b10
+ xxpermdi vs44, vs28, vs29, 0b01
+ xxpermdi vs45, vs28, vs29, 0b10
+ xxpermdi vs46, vs30, vs31, 0b01
+ xxpermdi vs47, vs30, vs31, 0b10
+
+ xxlor vs18, vs32, vs32
+ xxlor vs19, vs33, vs33
+ xxlor vs16, vs34, vs34
+ xxlor vs17, vs35, vs35
+ xxlor vs22, vs36, vs36
+ xxlor vs23, vs37, vs37
+ xxlor vs20, vs38, vs38
+ xxlor vs21, vs39, vs39
+ xxlor vs26, vs40, vs40
+ xxlor vs27, vs41, vs41
+ xxlor vs24, vs42, vs42
+ xxlor vs25, vs43, vs43
+ xxlor vs30, vs44, vs44
+ xxlor vs31, vs45, vs45
+ xxlor vs28, vs46, vs46
+ xxlor vs29, vs47, vs47
+
+ SAVE8 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,CO,0
+ SAVE8 vs16,vs17,vs18,vs19,vs20,vs21,vs22,vs23,vs24,vs25,vs26,vs27,vs28,vs29,vs30,vs31,T1,0
+ addi CO, CO, 128
+.endm
+
+/**********************************************************************************************
+* macros for N=2 and M=4
+**********************************************************************************************/
+
+.macro KERNEL2x4_ZERO_AND_PRIME_MMA
+ /* zero out and prime the MMA accumulators */
+ xxsetaccz 0
+ xxsetaccz 1
+ xxsetaccz 2
+ xxsetaccz 3
+.endm
+
+
+.macro KERNEL2x4_PRELOAD
+ lxvp vs32, 0(AO) // load real,imag from A
+ lxvp vs34, 32(AO) // load real,imag from A
+ lxvp vs48, 0(BO) // load real imag from B
+.endm
+
+
+.macro KERNEL2x4_2 Index, IsLast
+ /* two unrolled K iterations accumulated into accumulators 0-3 */
+ lxvp vs40, DISP8(\Index, 64)(AO) // load real,imag from A
+ lxvp vs42, DISP8(\Index, 96)(AO) // load real,imag from A
+ lxvp vs50, DISP4(\Index, 32)(BO) // load real,imag from B
+ xvf64gerpp 0, vs32, vs49
+ xvf64gerpp 1, vs34, vs49
+ xvf64gerpp 2, vs32, vs48
+ xvf64gerpp 3, vs34, vs48
+ lxvp vs32, DISP8(\Index, 128)(AO) // load real,imag from A
+ lxvp vs34, DISP8(\Index, 160)(AO) // load real,imag from A
+ lxvp vs48, DISP4(\Index, 64)(BO) // load real,imag from B
+ xvf64gerpp 0, vs40, vs51
+ xvf64gerpp 1, vs42, vs51
+ xvf64gerpp 2, vs40, vs50
+ xvf64gerpp 3, vs42, vs50
+.if \IsLast==1
+ addi AO, AO, DISP8(\Index,128)
+ addi BO, BO, DISP4(\Index,64)
+.endif
+.endm
+
+
+.macro LOAD_END_2x4 OffsetA, OffsetB
+ xvf64gerpp 0, vs32, vs49
+ xvf64gerpp 1, vs34, vs49
+ xvf64gerpp 2, vs32, vs48
+ xvf64gerpp 3, vs34, vs48
+ addi BO, BO, \OffsetB
+ addi AO, AO, \OffsetA
+.endm
+
+
+.macro KERNEL2x4_UNPRIME_MMA
+ /* "unprime" MMA accumulators */
+ xxmfacc 0
+ xxmfacc 1
+ xxmfacc 2
+ xxmfacc 3
+.endm
+
+
+.macro SAVE2x4
+ add T1, CO, LDC
+ /* rearrange the accumulator doublewords into the vs0..vs15 operand order expected by SAVE4 */
+ xxpermdi vs32, vs0, vs1, 0b01
+ xxpermdi vs33, vs0, vs1, 0b10
+ xxpermdi vs34, vs2, vs3, 0b01
+ xxpermdi vs35, vs2, vs3, 0b10
+ xxpermdi vs36, vs4, vs5, 0b01
+ xxpermdi vs37, vs4, vs5, 0b10
+ xxpermdi vs38, vs6, vs7, 0b01
+ xxpermdi vs39, vs6, vs7, 0b10
+ xxpermdi vs40, vs8, vs9, 0b01
+ xxpermdi vs41, vs8, vs9, 0b10
+ xxpermdi vs42, vs10, vs11, 0b01
+ xxpermdi vs43, vs10, vs11, 0b10
+ xxpermdi vs44, vs12, vs13, 0b01
+ xxpermdi vs45, vs12, vs13, 0b10
+ xxpermdi vs46, vs14, vs15, 0b01
+ xxpermdi vs47, vs14, vs15, 0b10
+
+ xxlor vs2, vs32, vs32
+ xxlor vs3, vs33, vs33
+ xxlor vs0, vs34, vs34
+ xxlor vs1, vs35, vs35
+ xxlor vs6, vs36, vs36
+ xxlor vs7, vs37, vs37
+ xxlor vs4, vs38, vs38
+ xxlor vs5, vs39, vs39
+ xxlor vs10, vs40, vs40
+ xxlor vs11, vs41, vs41
+ xxlor vs8, vs42, vs42
+ xxlor vs9, vs43, vs43
+ xxlor vs14, vs44, vs44
+ xxlor vs15, vs45, vs45
+ xxlor vs12, vs46, vs46
+ xxlor vs13, vs47, vs47
+
+ SAVE4 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,CO,0
+ SAVE4 vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,T1,0
+ addi CO, CO, 64
+.endm
+
+/**********************************************************************************************
+* macros for N=2 and M=2
+**********************************************************************************************/
+
+.macro KERNEL2x2_ZERO_AND_PRIME_MMA
+ /* zero out and prime the MMA accumulators */
+ xxsetaccz 0
+ xxsetaccz 1
+.endm
+
+
+.macro KERNEL2x2_PRELOAD
+ lxvp vs32, 0(AO) // load real,imag from A
+ lxvp vs48, 0(BO) // load real imag from B
+.endm
+
+
+.macro KERNEL2x2_2 Index, IsLast
+ lxvp vs40, DISP4(\Index, 32)(AO) // load real,imag from A
+ lxvp vs50, DISP4(\Index, 32)(BO) // load real,imag from B
+ xvf64gerpp 0, vs32, vs49
+ xvf64gerpp 1, vs32, vs48
+ lxvp vs32, DISP4(\Index, 64)(AO) // load real,imag from A
+ lxvp vs48, DISP4(\Index, 64)(BO) // load real imag from B
+ xvf64gerpp 0, vs40, vs51
+ xvf64gerpp 1, vs40, vs50
+.if \IsLast==1
+ addi AO, AO, DISP4(\Index,64)
+ addi BO, BO, DISP4(\Index,64)
+.endif
+.endm
+
+
+.macro LOAD_END_2x2 OffsetA,OffsetB
+ xvf64gerpp 0, vs32, vs49
+ xvf64gerpp 1, vs32, vs48
+ addi BO, BO, \OffsetB
+ addi AO, AO, \OffsetA
+.endm
+
+
+.macro KERNEL2x2_UNPRIME_MMA
+ /* "unprime" MMA accumulators */
+ xxmfacc 0
+ xxmfacc 1
+.endm
+
+
+.macro SAVE2x2
+ add T1, CO, LDC
+ xxpermdi vs32, vs0, vs1, 0b01
+ xxpermdi vs33, vs0, vs1, 0b10
+ xxpermdi vs34, vs2, vs3, 0b01
+ xxpermdi vs35, vs2, vs3, 0b10
+ xxpermdi vs36, vs4, vs5, 0b01
+ xxpermdi vs37, vs4, vs5, 0b10
+ xxpermdi vs38, vs6, vs7, 0b01
+ xxpermdi vs39, vs6, vs7, 0b10
+
+ xxlor vs2, vs32, vs32
+ xxlor vs3, vs33, vs33
+ xxlor vs0, vs34, vs34
+ xxlor vs1, vs35, vs35
+ xxlor vs6, vs36, vs36
+ xxlor vs7, vs37, vs37
+ xxlor vs4, vs38, vs38
+ xxlor vs5, vs39, vs39
+
+ SAVE2 vs0,vs1,vs2,vs3,CO,0
+ SAVE2 vs4,vs5,vs6,vs7,T1,0
+ addi CO, CO, 32
+.endm
+
+/**********************************************************************************************
+* macros for N=2 and M=1
+**********************************************************************************************/
+
+.macro ZERO2x1
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxlxor vs2, vs2, vs2
+ xxlxor vs3, vs3, vs3
+.endm
+
+
+.macro LOAD2x1
+ LOAD2x1O 0,0
+.endm
+
+
+.macro LOAD2x1O OffsetA,OffsetB
+ lxv vs48, (\OffsetB+0)(BO) // load real imag from B
+ lxv vs50, (\OffsetB+16)(BO) // load real,imag from B
+ xxswapd vs49, vs48
+ xxswapd vs51, vs50
+ lxv vs32, (0+\OffsetA)(AO) // load real,imag from A
+.endm
+
+
+.macro END2x1_WITHOUT_ADD
+ END2x1 AO,BO,0,0
+.endm
+
+
+.macro END2x1 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+ xvmaddadp vs0, vs32, vs48
+ xvmaddadp vs2, vs32, vs50
+ xvmaddadp vs1, vs32, vs49
+ xvmaddadp vs3, vs32, vs51
+.endm
+
+
+.macro LOAD2x1_2
+ LOAD2x1_2O 0,0
+.endm
+
+
+.macro LOAD2x1_2O OffsetA,OffsetB
+ lxv vs48, (\OffsetB+0)(BO) // load real imag from B
+ lxv vs50, (\OffsetB+16)(BO) // load real,imag from B
+ lxv vs52, (\OffsetB+32)(BO) // load real,imag from B
+ lxv vs54, (\OffsetB+48)(BO) // load real,imag from B
+ xxswapd vs49, vs48
+ xxswapd vs51, vs50
+ lxv vs32, (0+\OffsetA)(AO) // load real,imag from A
+ lxv vs40, (16+\OffsetA)(AO) // load real,imag from A
+.endm
+
+
+.macro END2x1_2
+ /* for load2 the offsets will be 32 and 64 */
+ KERNEL2x1_2 AO,BO, 32,64, 0,1,1
+.endm
+
+
+.macro KERNEL2x1_E2 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast, 1
+.endm
+
+
+.macro KERNEL2x1_L2 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast, 0
+.endm
+
+
+.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast, Complete
+ xxswapd vs53, vs52
+ xxswapd vs55, vs54
+ xvmaddadp vs0, vs32, vs48
+ xvmaddadp vs2, vs32, vs50
+ xvmaddadp vs1, vs32, vs49
+ xvmaddadp vs3, vs32, vs51
+.if \Complete==0
+ lxv vs32, DISP2(\Index, 0+\OffsetA)(\AREG) // load real,imag from A
+.endif
+.if \Complete==0
+ lxv vs48, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B
+ lxv vs50, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B
+.endif
+.if \Complete==0
+ xxswapd vs49, vs48
+ xxswapd vs51, vs50
+.endif
+ xvmaddadp vs0, vs40, vs52
+ xvmaddadp vs2, vs40, vs54
+ xvmaddadp vs1, vs40, vs53
+ xvmaddadp vs3, vs40, vs55
+.if \Complete==0
+ lxv vs40, DISP2(\Index, 16+\OffsetA)(\AREG) // load real,imag from A
+.endif
+
+.if \Complete==0
+ lxv vs52, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B
+ lxv vs54, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B
+.endif
+.if \IsLast==1
+.if \Complete==1
+ addi \AREG, \AREG, DISP2(\Index,\OffsetA)
+ addi \BREG, \BREG, DISP4(\Index,\OffsetB)
+.else
+ addi \AREG, \AREG, DISP2(\Index,32)
+ addi \BREG, \BREG, DISP4(\Index,64)
+.endif
+.endif
+.endm
+
+
+.macro KERNEL2x1
+ LOAD2x1
+ END2x1 AO, BO, 16,32
+.endm
+
+
+.macro SAVE2x1
+ add T1, CO, LDC
+ SAVE1 vs0,vs1,CO,0
+ SAVE1 vs2,vs3,T1,0
+ addi CO, CO, 16
+.endm
+
+/**********************************************************************************************
+* macros for N=1 and M=8
+**********************************************************************************************/
+
+.macro KERNEL1x8_ZERO_AND_PRIME_MMA
+ /* zero out and prime the MMA accumulators */
+ xxsetaccz 0
+ xxsetaccz 1
+ xxsetaccz 2
+ xxsetaccz 3
+.endm
+
+
+.macro KERNEL1x8_2 Index,IsLast
+ lxvp vs32, DISP16(\Index, 0)(AO) // load real,imag from A
+ lxvp vs34, DISP16(\Index, 32)(AO) // load real,imag from A
+ lxvp vs36, DISP16(\Index, 64)(AO) // load real,imag from A
+ lxvp vs38, DISP16(\Index, 96)(AO) // load real,imag from A
+ lxvp vs40, DISP16(\Index, 128)(AO) // load real,imag from A
+ lxvp vs42, DISP16(\Index, 160)(AO) // load real,imag from A
+ lxvp vs44, DISP16(\Index, 192)(AO) // load real,imag from A
+ lxvp vs46, DISP16(\Index, 224)(AO) // load real,imag from A
+ lxvp vs48, DISP2(\Index, 0)(BO) // load real imag from B
+ xvf64gerpp 0, vs32, vs49
+ xvf64gerpp 1, vs34, vs49
+ xvf64gerpp 2, vs36, vs49
+ xvf64gerpp 3, vs38, vs49
+ xvf64gerpp 0, vs40, vs48
+ xvf64gerpp 1, vs42, vs48
+ xvf64gerpp 2, vs44, vs48
+ xvf64gerpp 3, vs46, vs48
+.if \IsLast==1
+ addi AO, AO, DISP16(\Index,256)
+ addi BO, BO, DISP2(\Index,32)
+.endif
+.endm
+
+
+.macro LOAD_END_1x8 OffsetA,OffsetB
+ lxvp vs32, 0(AO) // load real,imag from A
+ lxvp vs34, 32(AO) // load real,imag from A
+ lxvp vs36, 64(AO) // load real,imag from A
+ lxvp vs38, 96(AO) // load real,imag from A
+ lxv vs48, 0(BO) // load real imag from B
+ xvf64gerpp 0, vs32, vs48
+ xvf64gerpp 1, vs34, vs48
+ xvf64gerpp 2, vs36, vs48
+ xvf64gerpp 3, vs38, vs48
+ addi BO, BO, \OffsetB
+ addi AO, AO, \OffsetA
+.endm
+
+
+.macro KERNEL1x8_UNPRIME_MMA
+ /* "unprime" MMA accumulators */
+ xxmfacc 0
+ xxmfacc 1
+ xxmfacc 2
+ xxmfacc 3
+.endm
+
+
+.macro SAVE1x8
+ /* rearrange the accumulator doublewords into the vs0..vs15 operand order expected by SAVE8 */
+ xxpermdi vs32, vs0, vs1, 0b01
+ xxpermdi vs33, vs0, vs1, 0b10
+ xxpermdi vs34, vs2, vs3, 0b01
+ xxpermdi vs35, vs2, vs3, 0b10
+ xxpermdi vs36, vs4, vs5, 0b01
+ xxpermdi vs37, vs4, vs5, 0b10
+ xxpermdi vs38, vs6, vs7, 0b01
+ xxpermdi vs39, vs6, vs7, 0b10
+ xxpermdi vs40, vs8, vs9, 0b01
+ xxpermdi vs41, vs8, vs9, 0b10
+ xxpermdi vs42, vs10, vs11, 0b01
+ xxpermdi vs43, vs10, vs11, 0b10
+ xxpermdi vs44, vs12, vs13, 0b01
+ xxpermdi vs45, vs12, vs13, 0b10
+ xxpermdi vs46, vs14, vs15, 0b01
+ xxpermdi vs47, vs14, vs15, 0b10
+
+ xxlor vs2, vs32, vs32
+ xxlor vs3, vs33, vs33
+ xxlor vs0, vs34, vs34
+ xxlor vs1, vs35, vs35
+ xxlor vs6, vs36, vs36
+ xxlor vs7, vs37, vs37
+ xxlor vs4, vs38, vs38
+ xxlor vs5, vs39, vs39
+ xxlor vs10, vs40, vs40
+ xxlor vs11, vs41, vs41
+ xxlor vs8, vs42, vs42
+ xxlor vs9, vs43, vs43
+ xxlor vs14, vs44, vs44
+ xxlor vs15, vs45, vs45
+ xxlor vs12, vs46, vs46
+ xxlor vs13, vs47, vs47
+
+ SAVE8 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,CO,0
+ addi CO, CO, 128
+.endm
+
+/**********************************************************************************************
+* macros for N=1 and M=4
+**********************************************************************************************/
+
+.macro KERNEL1x4_ZERO_AND_PRIME_MMA
+ /* zero out and prime the MMA accumulators */
+ xxsetaccz 0
+ xxsetaccz 1
+.endm
+
+
+.macro KERNEL1x4_2 Index,IsLast
+ lxvp vs32, DISP8(\Index, 0)(AO) // load real,imag from A
+ lxvp vs34, DISP8(\Index, 32)(AO) // load real,imag from A
+ lxvp vs40, DISP8(\Index, 64)(AO) // load real,imag from A
+ lxvp vs42, DISP8(\Index, 96)(AO) // load real,imag from A
+ lxvp vs48, DISP2(\Index, 0)(BO) // load real imag from B
+ xvf64gerpp 0, vs32, vs49
+ xvf64gerpp 1, vs34, vs49
+ xvf64gerpp 0, vs40, vs48
+ xvf64gerpp 1, vs42, vs48
+.if \IsLast==1
+ addi AO, AO, DISP8(\Index,128)
+ addi BO, BO, DISP2(\Index,32)
+.endif
+.endm
+
+
+.macro LOAD_END_1x4 OffsetA,OffsetB
+ lxvp vs32, 0(AO) // load real,imag from A
+ lxvp vs34, 32(AO) // load real,imag from A
+ lxv vs48, 0(BO) // load real imag from B
+ xvf64gerpp 0, vs32, vs48
+ xvf64gerpp 1, vs34, vs48
+ addi BO, BO, \OffsetB
+ addi AO, AO, \OffsetA
+.endm
+
+
+.macro KERNEL1x4_UNPRIME_MMA
+ /* "unprime" MMA accumulators */
+ xxmfacc 0
+ xxmfacc 1
+.endm
+
+
+.macro SAVE1x4
+ xxpermdi vs32, vs0, vs1, 0b01
+ xxpermdi vs33, vs0, vs1, 0b10
+ xxpermdi vs34, vs2, vs3, 0b01
+ xxpermdi vs35, vs2, vs3, 0b10
+ xxpermdi vs36, vs4, vs5, 0b01
+ xxpermdi vs37, vs4, vs5, 0b10
+ xxpermdi vs38, vs6, vs7, 0b01
+ xxpermdi vs39, vs6, vs7, 0b10
+
+ xxlor vs2, vs32, vs32
+ xxlor vs3, vs33, vs33
+ xxlor vs0, vs34, vs34
+ xxlor vs1, vs35, vs35
+ xxlor vs6, vs36, vs36
+ xxlor vs7, vs37, vs37
+ xxlor vs4, vs38, vs38
+ xxlor vs5, vs39, vs39
+
+ SAVE4 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,CO,0
+ addi CO, CO, 64
+.endm
+
+/**********************************************************************************************
+* macros for N=1 and M=2
+**********************************************************************************************/
+
+.macro KERNEL1x2_ZERO_AND_PRIME_MMA
+ /* zero out and prime the MMA accumulators */
+ xxsetaccz 0
+.endm
+
+
+.macro KERNEL1x2_2 Index,IsLast
+ lxvp vs32, DISP4(\Index, 0)(AO) // load real,imag from A
+ lxvp vs40, DISP4(\Index, 32)(AO) // load real,imag from A
+ lxvp vs48, DISP2(\Index, 0)(BO) // load real imag from B
+ xvf64gerpp 0, vs32, vs49
+ xvf64gerpp 0, vs40, vs48
+.if \IsLast==1
+ addi AO, AO, DISP4(\Index,64)
+ addi BO, BO, DISP2(\Index,32)
+.endif
+.endm
+
+
+.macro LOAD_END_1x2 OffsetA,OffsetB
+ lxvp vs32, 0(AO) // load real,imag from A
+ lxv vs48, 0(BO) // load real imag from B
+ xvf64gerpp 0, vs32, vs48
+ addi BO, BO, \OffsetB
+ addi AO, AO, \OffsetA
+.endm
+
+
+.macro KERNEL1x2_UNPRIME_MMA
+ /* "unprime" MMA accumulators */
+ xxmfacc 0
+.endm
+
+
+.macro SAVE1x2
+ xxpermdi vs32, vs0, vs1, 0b01
+ xxpermdi vs33, vs0, vs1, 0b10
+ xxpermdi vs34, vs2, vs3, 0b01
+ xxpermdi vs35, vs2, vs3, 0b10
+
+ xxlor vs2, vs32, vs32
+ xxlor vs3, vs33, vs33
+ xxlor vs0, vs34, vs34
+ xxlor vs1, vs35, vs35
+
+ SAVE2 vs0,vs1,vs2,vs3,CO,0
+ addi CO, CO, 32
+.endm
+
+/**********************************************************************************************
+* macros for N=1 and M=1
+**********************************************************************************************/
+
+.macro ZERO1x1
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+.endm
+
+
+.macro LOAD1x1
+ LOAD1x1O 0,0
+.endm
+
+
+.macro LOAD1x1O OffsetA,OffsetB
+ lxv vs48, (\OffsetB+0)(BO) // load real imag from B
+ lxv vs32, (0+\OffsetA)(AO) // load real,imag from A
+ xxswapd vs49, vs48
+.endm
+
+
+.macro END1x1_WITHOUT_ADD
+ END1x1 AO,BO,0,0
+.endm
+
+
+.macro END1x1 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+ xvmaddadp vs0, vs32, vs48
+ xvmaddadp vs1, vs32, vs49
+.endm
+
+
+.macro LOAD1x1_2
+ LOAD1x1_2O 0,0
+.endm
+
+
+.macro LOAD1x1_2O OffsetA,OffsetB
+ lxv vs48, (\OffsetB+0)(BO) // load real imag from B
+ lxv vs52, (\OffsetB+16)(BO) // load real,imag from B
+ xxswapd vs49, vs48
+
+ lxv vs32, (0+\OffsetA)(AO) // load real,imag from A
+ lxv vs40, (16+\OffsetA)(AO) // load real,imag from A
+.endm
+
+
+.macro END1x1_2
+ /* for load2 the offsets will be 32 and 32 */
+ KERNEL1x1_2 AO,BO, 32,32, 0,1,1
+.endm
+
+
+.macro KERNEL1x1_E2 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast, 1
+.endm
+
+
+.macro KERNEL1x1_L2 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast, 0
+.endm
+
+
+.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast, Complete
+ xxswapd vs53, vs52
+ xvmaddadp vs0, vs32, vs48
+ xvmaddadp vs1, vs32, vs49
+.if \Complete==0
+ lxv vs32, DISP2(\Index, 0+\OffsetA)(\AREG) // load real,imag from A
+.endif
+.if \Complete==0
+ lxv vs48, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B
+.endif
+.if \Complete==0
+ xxswapd vs49, vs48
+.endif
+ xvmaddadp vs0, vs40, vs52
+ xvmaddadp vs1, vs40, vs53
+.if \Complete==0
+ lxv vs40, DISP2(\Index, 16+\OffsetA)(\AREG) // load real,imag from A
+.endif
+
+.if \Complete==0
+ lxv vs52, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B
+.endif
+.if \IsLast==1
+.if \Complete==1
+ addi \AREG, \AREG, DISP2(\Index,\OffsetA)
+ addi \BREG, \BREG, DISP2(\Index,\OffsetB)
+.else
+ addi \AREG, \AREG, DISP2(\Index,32)
+ addi \BREG, \BREG, DISP2(\Index,32)
+.endif
+.endif
+.endm
+
+
+.macro KERNEL1x1
+ LOAD1x1
+ END1x1 AO, BO, 16,16
+.endm
+
+
+.macro SAVE1x1
+ SAVE1 vs0,vs1,CO,0
+ addi CO, CO, 16
+.endm
+
+/****************************TRMM POINTER REFRESH MACROS*************************/
+
+
+.macro SHIFT_REG REG1,REG2,SHIFT_VAL
+ .if \SHIFT_VAL==16
+ slwi \REG1, \REG2, 8
+ .elseif \SHIFT_VAL==8
+ slwi \REG1, \REG2, 7
+ .elseif \SHIFT_VAL==4
+ slwi \REG1, \REG2, 6
+ .elseif \SHIFT_VAL==2
+ slwi \REG1, \REG2, 5
+ .elseif \SHIFT_VAL==1
+ slwi \REG1, \REG2, 4
+ .endif
+.endm
+/*
+// #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+// ptrbb = bb;
+// #else
+// ptrba += off*16;
+// ptrbb = bb + off*2;
+// #endif
+*/
+
+
+.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B
+ #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ /* ptrbb = bb;*/
+ mr \PTR_B,\B_VAL /* refresh BPOINT */
+ #else
+ /*
+ // ptrba = ptrba + off*C_A;
+ // ptrbb = bb + off*C_B;
+ */
+ SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */
+ SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */
+ add \PTR_B, \B_VAL, T4 /* Add values to BO */
+ add \PTR_A, \PTR_A, T2 /* Add values to AO */
+ #endif
+.endm
+
+/*
+// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+// temp = bk-off;
+// #elif defined(LEFT)
+// temp = off+16; // number of values in A
+// #else
+// temp = off+2; // number of values in B
+// #endif
+*/
+
+
+.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B
+ #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ /* temp = bk-off;*/
+ sub \TEMP_BK,\BK_VAL,\OFF_VAL
+ #elif defined(LEFT)
+ /* temp = off+INCR_A; // number of values in A */
+ addi \TEMP_BK, \OFF_VAL, \INCR_A
+ #else
+ /* temp = off+INCR_B; // number of values in B */
+ addi \TEMP_BK,\OFF_VAL, \INCR_B
+ #endif
+.endm
+/*
+// #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+// temp = bk - off;
+// #ifdef LEFT
+// temp -= 16; // number of values in A
+// #else
+// temp -= 2; // number of values in B
+// #endif
+// ptrba += temp*16;
+// ptrbb += temp*2;
+// #endif
+// #ifdef LEFT
+// off += 16; // number of values in A
+// #endif
+*/
+
+
+.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B
+ #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ /* temp = bk - off; */
+ sub \TEMP_BK,\BK_VAL,\OFF_VAL
+ #ifdef LEFT
+ /* temp -= C_A; // number of values in A */
+ addi \TEMP_BK,\TEMP_BK,-\C_A
+ #else
+ /* temp -= C_B; // number of values in B */
+ addi \TEMP_BK,\TEMP_BK,-\C_B
+ #endif
+ /* ptrba += temp*C_A;
+ ptrbb += temp*C_B; */
+ SHIFT_REG T4,\TEMP_BK,\C_A
+ SHIFT_REG T2,\TEMP_BK,\C_B
+ add \PTR_A, \PTR_A, T4 /* ptrba + temp*C_A */
+ add \PTR_B, \PTR_B, T2
+ #endif
+ #ifdef LEFT
+ /* off += C_A; // number of values in A */
+ addi \OFF_VAL,\OFF_VAL,\C_A
+ #endif
+.endm
+
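+/*
+// Editor's note (not part of the original patch): a hedged, consolidated C-style sketch of
+// what the three TRMM refresh macros above compute together, in the same reference style
+// as the commented pseudocode preceding each macro. C_A/C_B (and INCR_A/INCR_B) are the
+// number of A and B values consumed per K step by the tile being processed; the concrete
+// values at the call sites (e.g. C_A=8, C_B=2 for the N=2, M=8 tile) are an assumption here.
+//
+// REFRESH_POINTERS(ptrba, ptrbb, off, bb, C_A, C_B):
+// #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+//   ptrbb = bb;
+// #else
+//   ptrba += off*C_A;  ptrbb = bb + off*C_B;
+// #endif
+//
+// REFRESH_TEMP_BK(temp, bk, off, INCR_A, INCR_B):
+// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+//   temp = bk - off;
+// #elif defined(LEFT)
+//   temp = off + INCR_A;
+// #else
+//   temp = off + INCR_B;
+// #endif
+//
+// REFRESH_AFTER_SAVE(temp, bk, off, ptrbb, ptrba, C_A, C_B):
+// #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+//   temp = bk - off;
+// #ifdef LEFT
+//   temp -= C_A;
+// #else
+//   temp -= C_B;
+// #endif
+//   ptrba += temp*C_A;  ptrbb += temp*C_B;
+// #endif
+// #ifdef LEFT
+//   off += C_A;
+// #endif
+*/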