diff --git a/kernel/arm64/sgemm_ncopy_4.S b/kernel/arm64/sgemm_ncopy_4.S index 30450cc7d..c819ee6fb 100644 --- a/kernel/arm64/sgemm_ncopy_4.S +++ b/kernel/arm64/sgemm_ncopy_4.S @@ -1,333 +1,333 @@ -/*************************************************************************** -Copyright (c) 2016, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A00 PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -#define ASSEMBLER -#include "common.h" - -#define M x0 -#define N x1 -#define A00 x2 -#define LDA x3 -#define B00 x4 - -#define A01 x5 -#define A02 x6 -#define A03 x7 -#define A04 x8 - -#define I x9 -#define J x10 - -#define TEMP1 x11 -#define TEMP2 x12 - -#define A_PREFETCH 2560 - -/************************************************************************************** -* Macro definitions -**************************************************************************************/ - -.macro SAVE_REGS - add sp, sp, #-(11 * 16) - stp d8, d9, [sp, #(0 * 16)] - stp d10, d11, [sp, #(1 * 16)] - stp d12, d13, [sp, #(2 * 16)] - stp d14, d15, [sp, #(3 * 16)] - stp d16, d17, [sp, #(4 * 16)] - stp x18, x19, [sp, #(5 * 16)] - stp x20, x21, [sp, #(6 * 16)] - stp x22, x23, [sp, #(7 * 16)] - stp x24, x25, [sp, #(8 * 16)] - stp x26, x27, [sp, #(9 * 16)] - str x28, [sp, #(10 * 16)] -.endm - -.macro RESTORE_REGS - ldp d8, d9, [sp, #(0 * 16)] - ldp d10, d11, [sp, #(1 * 16)] - ldp d12, d13, [sp, #(2 * 16)] - ldp d14, d15, [sp, #(3 * 16)] - ldp d16, d17, [sp, #(4 * 16)] - ldp x18, x19, [sp, #(5 * 16)] - ldp x20, x21, [sp, #(6 * 16)] - ldp x22, x23, [sp, #(7 * 16)] - ldp x24, x25, [sp, #(8 * 16)] - ldp x26, x27, [sp, #(9 * 16)] - ldr x28, [sp, #(10 * 16)] - add sp, sp, #(11*16) -.endm - -.macro COPY4x4 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - prfm PLDL1KEEP, [A02, #A_PREFETCH] - prfm PLDL1KEEP, [A03, #A_PREFETCH] - prfm PLDL1KEEP, [A04, #A_PREFETCH] - - ldr q0, [A01], #16 - ins v8.s[0], v0.s[0] - ins v9.s[0], v0.s[1] - ins v10.s[0], v0.s[2] - ins v11.s[0], v0.s[3] - - ldr q1, [A02], #16 - ins v8.s[1], v1.s[0] - ins v9.s[1], v1.s[1] - ins v10.s[1], v1.s[2] - ins v11.s[1], v1.s[3] - - ldr q2, [A03], #16 - ins v8.s[2], v2.s[0] - ins v9.s[2], v2.s[1] - ins v10.s[2], v2.s[2] - ins v11.s[2], v2.s[3] - - ldr q3, [A04], #16 - ins v8.s[3], v3.s[0] - ins v9.s[3], v3.s[1] - ins v10.s[3], v3.s[2] - ins v11.s[3], v3.s[3] - - st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B00] - add B00, B00, #64 - -.endm - -.macro COPY1x4 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - prfm PLDL1KEEP, [A02, #A_PREFETCH] - prfm PLDL1KEEP, [A03, #A_PREFETCH] - prfm PLDL1KEEP, [A04, #A_PREFETCH] - - ldr s0, [A01], #4 - ldr s1, [A02], #4 - ldr s2, [A03], #4 - ldr s3, [A04], #4 - - stp s0, s1, [B00] - add B00, B00, #8 - stp s2, s3, [B00] - add B00, B00, #8 -.endm - -.macro COPY4x2 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - prfm PLDL1KEEP, [A02, #A_PREFETCH] - - ldr q0, [A01], #16 - ins v8.s[0], v0.s[0] - ins v9.s[0], v0.s[1] - ins v10.s[0], v0.s[2] - ins v11.s[0], v0.s[3] - - ldr q1, [A02], #16 - ins v8.s[1], v1.s[0] - ins v9.s[1], v1.s[1] - ins v10.s[1], v1.s[2] - ins v11.s[1], v1.s[3] - - st1 {v8.2s, v9.2s, v10.2s, v11.2s}, [B00] - add B00, B00, #32 -.endm - - -.macro COPY1x2 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - prfm PLDL1KEEP, [A02, #A_PREFETCH] - - ldr s0, [A01], #4 - ldr s1, [A02], #4 - - stp s0, s1, [B00] - add B00, B00, #8 -.endm - -.macro COPY4x1 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - - ldr q0, [A01], #16 - str q0, [B00], #16 -.endm - - -.macro COPY1x1 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - - ldr s0, [A01], #4 - str s0, [B00], #4 -.endm - -/************************************************************************************** -* End of macro definitions -**************************************************************************************/ - - PROLOGUE - - .align 5 - - SAVE_REGS - - lsl LDA, LDA, #2 // LDA = LDA * SIZE - -.Ldgemm_ncopy_L4_BEGIN: - - asr J, N, #2 // J = N / 4 - cmp 
J, #0 - ble .Ldgemm_ncopy_L2_BEGIN - - .align 5 -.Ldgemm_ncopy_L4_M4_BEGIN: - - mov A01, A00 - add A02, A01, LDA - add A03, A02, LDA - add A04, A03, LDA - add A00, A04, LDA - - asr I, M, #2 // I = M / 4 - cmp I, #0 - ble .Ldgemm_ncopy_L4_M4_40 - - .align 5 -.Ldgemm_ncopy_L4_M4_20: - - COPY4x4 - - subs I , I , #1 - bne .Ldgemm_ncopy_L4_M4_20 - -.Ldgemm_ncopy_L4_M4_40: - - and I, M , #3 - cmp I, #0 - ble .Ldgemm_ncopy_L4_M4_END - - .align 5 -.Ldgemm_ncopy_L4_M4_60: - - COPY1x4 - - subs I , I , #1 - bne .Ldgemm_ncopy_L4_M4_60 - -.Ldgemm_ncopy_L4_M4_END: - - subs J , J, #1 // j-- - bne .Ldgemm_ncopy_L4_M4_BEGIN - -/*********************************************************************************************/ - -.Ldgemm_ncopy_L2_BEGIN: - - tst N, #3 - ble .Ldgemm_ncopy_L999 - - tst N, #2 - ble .Ldgemm_ncopy_L1_BEGIN - -.Ldgemm_ncopy_L2_M4_BEGIN: - mov A01, A00 - add A02, A01, LDA - add A00, A02, LDA - - asr I, M, #2 // I = M / 4 - cmp I, #0 - ble .Ldgemm_ncopy_L2_M4_40 - - .align 5 -.Ldgemm_ncopy_L2_M4_20: - - COPY4x2 - - subs I , I , #1 - bne .Ldgemm_ncopy_L2_M4_20 - -.Ldgemm_ncopy_L2_M4_40: - - and I, M , #3 - cmp I, #0 - ble .Ldgemm_ncopy_L2_M4_END - - .align 5 -.Ldgemm_ncopy_L2_M4_60: - - COPY1x2 - - subs I , I , #1 - bne .Ldgemm_ncopy_L2_M4_60 - -.Ldgemm_ncopy_L2_M4_END: - - -/*********************************************************************************************/ - -.Ldgemm_ncopy_L1_BEGIN: - - tst N, #1 - ble .Ldgemm_ncopy_L999 - -.Ldgemm_ncopy_L1_M4_BEGIN: - - mov A01, A00 - - asr I, M, #2 // I = M / 4 - cmp I, #0 - ble .Ldgemm_ncopy_L1_M4_40 - - .align 5 -.Ldgemm_ncopy_L1_M4_20: - - COPY4x1 - - subs I , I , #1 - bne .Ldgemm_ncopy_L1_M4_20 - - -.Ldgemm_ncopy_L1_M4_40: - - and I, M , #3 - cmp I, #0 - ble .Ldgemm_ncopy_L1_M4_END - - .align 5 -.Ldgemm_ncopy_L1_M4_60: - - COPY1x1 - - subs I , I , #1 - bne .Ldgemm_ncopy_L1_M4_60 - - -.Ldgemm_ncopy_L1_M4_END: - -.Ldgemm_ncopy_L999: - - mov x0, #0 - RESTORE_REGS - ret - - EPILOGUE - +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A00 PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M x0 +#define N x1 +#define A00 x2 +#define LDA x3 +#define B00 x4 + +#define A01 x5 +#define A02 x6 +#define A03 x7 +#define A04 x8 + +#define I x9 +#define J x10 + +#define TEMP1 x11 +#define TEMP2 x12 + +#define A_PREFETCH 2560 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro SAVE_REGS + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] +.endm + +.macro RESTORE_REGS + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) +.endm + +.macro COPY4x4 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + prfm PLDL1KEEP, [A03, #A_PREFETCH] + prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ldr q0, [A01], #16 + ins v8.s[0], v0.s[0] + ins v9.s[0], v0.s[1] + ins v10.s[0], v0.s[2] + ins v11.s[0], v0.s[3] + + ldr q1, [A02], #16 + ins v8.s[1], v1.s[0] + ins v9.s[1], v1.s[1] + ins v10.s[1], v1.s[2] + ins v11.s[1], v1.s[3] + + ldr q2, [A03], #16 + ins v8.s[2], v2.s[0] + ins v9.s[2], v2.s[1] + ins v10.s[2], v2.s[2] + ins v11.s[2], v2.s[3] + + ldr q3, [A04], #16 + ins v8.s[3], v3.s[0] + ins v9.s[3], v3.s[1] + ins v10.s[3], v3.s[2] + ins v11.s[3], v3.s[3] + + st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B00] + add B00, B00, #64 + +.endm + +.macro COPY1x4 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + prfm PLDL1KEEP, [A03, #A_PREFETCH] + prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ldr s0, [A01], #4 + ldr s1, [A02], #4 + ldr s2, [A03], #4 + ldr s3, [A04], #4 + + stp s0, s1, [B00] + add B00, B00, #8 + stp s2, s3, [B00] + add B00, B00, #8 +.endm + +.macro COPY4x2 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ldr q0, [A01], #16 + ins v8.s[0], v0.s[0] + ins v9.s[0], v0.s[1] + ins v10.s[0], v0.s[2] + ins v11.s[0], v0.s[3] + + ldr q1, [A02], #16 + ins v8.s[1], v1.s[0] + ins v9.s[1], v1.s[1] + ins v10.s[1], v1.s[2] + ins v11.s[1], v1.s[3] + + st1 {v8.2s, v9.2s, v10.2s, v11.2s}, [B00] + add B00, B00, #32 +.endm + + +.macro COPY1x2 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ldr s0, [A01], #4 + ldr s1, [A02], #4 + + stp s0, s1, [B00] + add B00, B00, #8 +.endm + +.macro COPY4x1 + prfm PLDL1KEEP, [A01, 
#A_PREFETCH] + + ldr q0, [A01], #16 + str q0, [B00], #16 +.endm + + +.macro COPY1x1 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + + ldr s0, [A01], #4 + str s0, [B00], #4 +.endm + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + SAVE_REGS + + lsl LDA, LDA, #2 // LDA = LDA * SIZE + +.Ldgemm_ncopy_L4_BEGIN: + + asr J, N, #2 // J = N / 4 + cmp J, #0 + ble .Ldgemm_ncopy_L2_BEGIN + + .align 5 +.Ldgemm_ncopy_L4_M4_BEGIN: + + mov A01, A00 + add A02, A01, LDA + add A03, A02, LDA + add A04, A03, LDA + add A00, A04, LDA + + asr I, M, #2 // I = M / 4 + cmp I, #0 + ble .Ldgemm_ncopy_L4_M4_40 + + .align 5 +.Ldgemm_ncopy_L4_M4_20: + + COPY4x4 + + subs I , I , #1 + bne .Ldgemm_ncopy_L4_M4_20 + +.Ldgemm_ncopy_L4_M4_40: + + and I, M , #3 + cmp I, #0 + ble .Ldgemm_ncopy_L4_M4_END + + .align 5 +.Ldgemm_ncopy_L4_M4_60: + + COPY1x4 + + subs I , I , #1 + bne .Ldgemm_ncopy_L4_M4_60 + +.Ldgemm_ncopy_L4_M4_END: + + subs J , J, #1 // j-- + bne .Ldgemm_ncopy_L4_M4_BEGIN + +/*********************************************************************************************/ + +.Ldgemm_ncopy_L2_BEGIN: + + tst N, #3 + ble .Ldgemm_ncopy_L999 + + tst N, #2 + ble .Ldgemm_ncopy_L1_BEGIN + +.Ldgemm_ncopy_L2_M4_BEGIN: + mov A01, A00 + add A02, A01, LDA + add A00, A02, LDA + + asr I, M, #2 // I = M / 4 + cmp I, #0 + ble .Ldgemm_ncopy_L2_M4_40 + + .align 5 +.Ldgemm_ncopy_L2_M4_20: + + COPY4x2 + + subs I , I , #1 + bne .Ldgemm_ncopy_L2_M4_20 + +.Ldgemm_ncopy_L2_M4_40: + + and I, M , #3 + cmp I, #0 + ble .Ldgemm_ncopy_L2_M4_END + + .align 5 +.Ldgemm_ncopy_L2_M4_60: + + COPY1x2 + + subs I , I , #1 + bne .Ldgemm_ncopy_L2_M4_60 + +.Ldgemm_ncopy_L2_M4_END: + + +/*********************************************************************************************/ + +.Ldgemm_ncopy_L1_BEGIN: + + tst N, #1 + ble .Ldgemm_ncopy_L999 + +.Ldgemm_ncopy_L1_M4_BEGIN: + + mov A01, A00 + + asr I, M, #2 // I = M / 4 + cmp I, #0 + ble .Ldgemm_ncopy_L1_M4_40 + + .align 5 +.Ldgemm_ncopy_L1_M4_20: + + COPY4x1 + + subs I , I , #1 + bne .Ldgemm_ncopy_L1_M4_20 + + +.Ldgemm_ncopy_L1_M4_40: + + and I, M , #3 + cmp I, #0 + ble .Ldgemm_ncopy_L1_M4_END + + .align 5 +.Ldgemm_ncopy_L1_M4_60: + + COPY1x1 + + subs I , I , #1 + bne .Ldgemm_ncopy_L1_M4_60 + + +.Ldgemm_ncopy_L1_M4_END: + +.Ldgemm_ncopy_L999: + + mov x0, #0 + RESTORE_REGS + ret + + EPILOGUE + diff --git a/kernel/arm64/sgemm_tcopy_16.S b/kernel/arm64/sgemm_tcopy_16.S index 431f1ae2a..3066421bb 100644 --- a/kernel/arm64/sgemm_tcopy_16.S +++ b/kernel/arm64/sgemm_tcopy_16.S @@ -1,814 +1,814 @@ -/*************************************************************************** -Copyright (c) 2019, The OpenBLAS Project -All rights reserved. 
-*****************************************************************************/ - -#define ASSEMBLER -#include "common.h" - -#define M x0 -#define N x1 -#define A x2 -#define LDA x3 -#define B x4 - -#define M8 x5 - -#define A01 x6 -#define A02 x7 -#define A03 x8 -#define A04 x9 -#define A05 x10 -#define A06 x11 -#define A07 x12 -#define A08 x13 - -#define B01 x14 -#define B02 x15 -#define B03 x16 -#define B04 x17 -#define B00 x22 - - -#define I x21 -#define J x19 - -#define TEMP1 x20 - -#define A_PREFETCH 256 - -/************************************************************************************** -* Macro definitions -**************************************************************************************/ -.macro SAVE_REGS - add sp, sp, #-(11 * 16) - stp d8, d9, [sp, #(0 * 16)] - stp d10, d11, [sp, #(1 * 16)] - stp d12, d13, [sp, #(2 * 16)] - stp d14, d15, [sp, #(3 * 16)] - stp d16, d17, [sp, #(4 * 16)] - stp x18, x19, [sp, #(5 * 16)] - stp x20, x21, [sp, #(6 * 16)] - stp x22, x23, [sp, #(7 * 16)] - stp x24, x25, [sp, #(8 * 16)] - stp x26, x27, [sp, #(9 * 16)] - str x28, [sp, #(10 * 16)] -.endm - -.macro RESTORE_REGS - ldp d8, d9, [sp, #(0 * 16)] - ldp d10, d11, [sp, #(1 * 16)] - ldp d12, d13, [sp, #(2 * 16)] - ldp d14, d15, [sp, #(3 * 16)] - ldp d16, d17, [sp, #(4 * 16)] - ldp x18, x19, [sp, #(5 * 16)] - ldp x20, x21, [sp, #(6 * 16)] - ldp x22, x23, [sp, #(7 * 16)] - ldp x24, x25, [sp, #(8 * 16)] - ldp x26, x27, [sp, #(9 * 16)] - ldr x28, [sp, #(10 * 16)] - add sp, sp, #(11*16) -.endm - -/*************************************************************************************************************************/ - -.macro COPY16x8 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - prfm PLDL1KEEP, [A02, #A_PREFETCH] - prfm PLDL1KEEP, [A03, #A_PREFETCH] - prfm PLDL1KEEP, [A04, #A_PREFETCH] - prfm PLDL1KEEP, [A05, #A_PREFETCH] - prfm PLDL1KEEP, [A06, #A_PREFETCH] - prfm PLDL1KEEP, [A07, #A_PREFETCH] - prfm PLDL1KEEP, [A08, #A_PREFETCH] - //prfm PSTL1KEEP, [B00, M8] - - ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [A01] - add A01, A01, #64 - - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] - add TEMP1, B00, #64 - - ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [A02] - add A02, A02, #64 - - st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1] - add TEMP1, TEMP1, #64 - - ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [A03] - add A03, A03, #64 - - st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [TEMP1] - add TEMP1, TEMP1, #64 - - ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [A04] - add A04, A04, #64 - - st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [TEMP1] - add TEMP1, TEMP1, #64 - - ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [A05] - add A05, A05, #64 - - st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [TEMP1] - add TEMP1, TEMP1, #64 - - ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [A06] - add A06, A06, #64 - - st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [TEMP1] - add TEMP1, TEMP1, #64 - - ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [A07] - add A07, A07, #64 - - st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [TEMP1] - add TEMP1, TEMP1, #64 - - ld1 {v28.4s, v29.4s, v30.4s, v31.4s}, [A08] - add A08, A08, #64 - - st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [TEMP1] - add TEMP1, TEMP1, #64 - - add B00, B00, M8 - -.endm - -.macro COPY8x8 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - prfm PLDL1KEEP, [A02, #A_PREFETCH] - prfm PLDL1KEEP, [A03, #A_PREFETCH] - prfm PLDL1KEEP, [A04, #A_PREFETCH] - prfm PLDL1KEEP, [A05, #A_PREFETCH] - prfm PLDL1KEEP, [A06, #A_PREFETCH] - prfm PLDL1KEEP, [A07, #A_PREFETCH] - prfm PLDL1KEEP, [A08, #A_PREFETCH] - - ldp q0, q1, [A01] - ldp q2, q3, [A02] - add A01, A01, #32 - add A02, A02, #32 - - st1 {v0.4s, v1.4s, v2.4s, 
v3.4s}, [B01] - add B01, B01, #64 - - ldp q4, q5, [A03] - ldp q6, q7, [A04] - add A03, A03, #32 - add A04, A04, #32 - - st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [B01] - add B01, B01, #64 - - ldp q8, q9, [A05] - ldp q10, q11, [A06] - add A05, A05, #32 - add A06, A06, #32 - - st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B01] - add B01, B01, #64 - - ldp q12, q13, [A07] - ldp q14, q15, [A08] - add A07, A07, #32 - add A08, A08, #32 - - st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [B01] - add B01, B01, #64 -.endm - -.macro COPY4x8 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - //prfm PLDL1KEEP, [A02, #A_PREFETCH] - //prfm PLDL1KEEP, [A03, #A_PREFETCH] - //prfm PLDL1KEEP, [A04, #A_PREFETCH] - //prfm PLDL1KEEP, [A05, #A_PREFETCH] - //prfm PLDL1KEEP, [A06, #A_PREFETCH] - //prfm PLDL1KEEP, [A07, #A_PREFETCH] - //prfm PLDL1KEEP, [A08, #A_PREFETCH] - - ldr q0, [A01] - ldr q1, [A02] - ldr q2, [A03] - ldr q3, [A04] - add A01, A01, #16 - add A02, A02, #16 - add A03, A03, #16 - add A04, A04, #16 - - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B02] - add B02, B02, #64 - - ldr q4, [A05] - ldr q5, [A06] - ldr q6, [A07] - ldr q7, [A08] - - add A05, A05, #16 - add A06, A06, #16 - add A07, A07, #16 - add A08, A08, #16 - - st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [B02] - add B02, B02, #64 -.endm - -.macro COPY2x8 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - //prfm PLDL1KEEP, [A02, #A_PREFETCH] - //prfm PLDL1KEEP, [A03, #A_PREFETCH] - //prfm PLDL1KEEP, [A04, #A_PREFETCH] - //prfm PLDL1KEEP, [A05, #A_PREFETCH] - //prfm PLDL1KEEP, [A06, #A_PREFETCH] - //prfm PLDL1KEEP, [A07, #A_PREFETCH] - //prfm PLDL1KEEP, [A08, #A_PREFETCH] - - ldr d0, [A01] - ldr d1, [A02] - ldr d2, [A03] - ldr d3, [A04] - - add A01, A01, #8 - add A02, A02, #8 - add A03, A03, #8 - add A04, A04, #8 - - stp d0, d1, [B03] - add B03, B03, #16 - stp d2, d3, [B03] - add B03, B03, #16 - - ldr d4, [A05] - ldr d5, [A06] - ldr d6, [A07] - ldr d7, [A08] - - add A05, A05, #8 - add A06, A06, #8 - add A07, A07, #8 - add A08, A08, #8 - - stp d4, d5, [B03] - add B03, B03, #16 - stp d6, d7, [B03] - add B03, B03, #16 - -.endm - -.macro COPY1x8 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - //prfm PLDL1KEEP, [A02, #A_PREFETCH] - //prfm PLDL1KEEP, [A03, #A_PREFETCH] - //prfm PLDL1KEEP, [A04, #A_PREFETCH] - //prfm PLDL1KEEP, [A05, #A_PREFETCH] - //prfm PLDL1KEEP, [A06, #A_PREFETCH] - //prfm PLDL1KEEP, [A07, #A_PREFETCH] - //prfm PLDL1KEEP, [A08, #A_PREFETCH] - - ldr s0, [A01] - ldr s1, [A02] - ldr s2, [A03] - ldr s3, [A04] - - stp s0, s1, [B04] - add B04, B04, #8 - stp s2, s3, [B04] - add B04, B04, #8 - - ldr s4, [A05] - ldr s5, [A06] - ldr s6, [A07] - ldr s7, [A08] - - stp s4, s5, [B04] - add B04, B04, #8 - stp s6, s7, [B04] - add B04, B04, #8 - -.endm - -/*************************************************************************************************************************/ -.macro COPY16x4 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - prfm PLDL1KEEP, [A02, #A_PREFETCH] - prfm PLDL1KEEP, [A03, #A_PREFETCH] - prfm PLDL1KEEP, [A04, #A_PREFETCH] - - ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [A01] - add A01, A01, #64 - - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] - add TEMP1, B00, #64 - - ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [A02] - add A02, A02, #64 - - st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1] - add TEMP1, TEMP1, #64 - - ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [A03] - add A03, A03, #64 - - st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [TEMP1] - add TEMP1, TEMP1, #64 - - ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [A04] - add A04, A04, #64 - - st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [TEMP1] - - add B00, B00, M8 -.endm - -.macro COPY8x4 - prfm PLDL1KEEP, [A01, 
#A_PREFETCH] - prfm PLDL1KEEP, [A02, #A_PREFETCH] - prfm PLDL1KEEP, [A03, #A_PREFETCH] - prfm PLDL1KEEP, [A04, #A_PREFETCH] - - ldp q0, q1, [A01] - ldp q2, q3, [A02] - add A01, A01, #32 - add A02, A02, #32 - - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B01] - add B01, B01, #64 - - ldp q4, q5, [A03] - ldp q6, q7, [A04] - add A03, A03, #32 - add A04, A04, #32 - - st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [B01] - add B01, B01, #64 -.endm - -.macro COPY4x4 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - //prfm PLDL1KEEP, [A02, #A_PREFETCH] - //prfm PLDL1KEEP, [A03, #A_PREFETCH] - //prfm PLDL1KEEP, [A04, #A_PREFETCH] - - ldr q0, [A01] - ldr q1, [A02] - ldr q2, [A03] - ldr q3, [A04] - add A01, A01, #16 - add A02, A02, #16 - add A03, A03, #16 - add A04, A04, #16 - - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B02] - - add B02, B02, #64 -.endm - -.macro COPY2x4 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - //prfm PLDL1KEEP, [A02, #A_PREFETCH] - //prfm PLDL1KEEP, [A03, #A_PREFETCH] - //prfm PLDL1KEEP, [A04, #A_PREFETCH] - - ldr d0, [A01] - ldr d1, [A02] - ldr d2, [A03] - ldr d3, [A04] - - add A01, A01, #8 - add A02, A02, #8 - add A03, A03, #8 - add A04, A04, #8 - - stp d0, d1, [B03] - add B03, B03, #16 - stp d2, d3, [B03] - - add B03, B03, #16 -.endm - -.macro COPY1x4 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - //prfm PLDL1KEEP, [A02, #A_PREFETCH] - //prfm PLDL1KEEP, [A03, #A_PREFETCH] - //prfm PLDL1KEEP, [A04, #A_PREFETCH] - - ldr s0, [A01] - ldr s1, [A02] - ldr s2, [A03] - ldr s3, [A04] - - add A01, A01, #4 - add A02, A02, #4 - add A03, A03, #4 - add A04, A04, #4 - - stp s0, s1, [B04] - add B04, B04, #8 - stp s2, s3, [B04] - add B04, B04, #8 - -.endm - -/*************************************************************************************************************************/ - -.macro COPY16x2 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - prfm PLDL1KEEP, [A02, #A_PREFETCH] - - ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [A01] - add A01, A01, #64 - - ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [A02] - add A02, A02, #64 - - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] - add TEMP1, B00, #64 - st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1] - add B00, B00, M8 -.endm - -.macro COPY8x2 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - prfm PLDL1KEEP, [A02, #A_PREFETCH] - - ld1 {v0.4s, v1.4s}, [A01] - ld1 {v2.4s, v3.4s}, [A02] - add A01, A01, #32 - add A02, A02, #32 - - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B01] - add B01, B01, #64 -.endm - -.macro COPY4x2 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - //prfm PLDL1KEEP, [A02, #A_PREFETCH] - - ldr q0, [A01] - ldr q1, [A02] - add A01, A01, #16 - add A02, A02, #16 - - stp q0, q1, [B02] - add B02, B02, #32 -.endm - -.macro COPY2x2 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - //prfm PLDL1KEEP, [A02, #A_PREFETCH] - - ldr d0, [A01] - ldr d1, [A02] - - add A01, A01, #8 - add A02, A02, #8 - - stp d0, d1, [B03] - add B03, B03, #16 -.endm - -.macro COPY1x2 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - //prfm PLDL1KEEP, [A02, #A_PREFETCH] - - ldr s0, [A01] - ldr s1, [A02] - - add A01, A01, #4 - add A02, A02, #4 - - stp s0, s1, [B04] - - add B04, B04, #8 -.endm - -/*************************************************************************************************************************/ - -.macro COPY16x1 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - - ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [A01] - add A01, A01, #64 - - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] - add B00, B00, M8 -.endm - -.macro COPY8x1 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - - ldp q0, q1, [A01] - add A01, A01, #32 - stp q0, q1, [B01] - - add B01, B01, #32 -.endm - -.macro COPY4x1 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] 
- - ldr q0, [A01] - add A01, A01, #16 - str q0, [B02] - - add B02, B02, #16 -.endm - -.macro COPY2x1 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - - ldr d0, [A01] - add A01, A01, #8 - str d0, [B03] - - add B03, B03, #8 -.endm - -.macro COPY1x1 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - - ldr s0, [A01] - add A01, A01, #4 - str s0, [B04] - - add B04, B04, #4 -.endm - -/************************************************************************************** -* End of macro definitions -**************************************************************************************/ - - PROLOGUE - - .align 5 - - SAVE_REGS - - lsl LDA, LDA, #2 // LDA = LDA * SIZE - - lsl TEMP1, M, #2 // TEMP1 = M * SIZE - - and B01 , N , #-16 - and B02 , N , #-8 - and B03 , N , #-4 - and B04 , N , #-2 - - mul B01, B01, TEMP1 - mul B02, B02, TEMP1 - mul B03, B03, TEMP1 - mul B04, B04, TEMP1 - - add B01 , B01, B - add B02 , B02, B - add B03 , B03, B - add B04 , B04, B - - lsl M8, M, #6 // M8 = M * 16 * SIZE - -.Lsgemm_tcopy_L8_BEGIN: - asr J, M, #3 // J = M / 8 - cmp J, #0 - ble .Lsgemm_tcopy_L4_BEGIN - - .align 5 -.Lsgemm_tcopy_L8_M16_BEGIN: - - mov A01, A - add A02, A01, LDA - add A03, A02, LDA - add A04, A03, LDA - add A05, A04, LDA - add A06, A05, LDA - add A07, A06, LDA - add A08, A07, LDA - add A, A08, LDA - - mov B00, B - add B, B00, #512 // B = B + 8 * 16 * SIZE - - asr I, N, #4 // I = N / 16 - cmp I, #0 - ble .Lsgemm_tcopy_L8_M16_40 - - .align 5 -.Lsgemm_tcopy_L8_M16_20: - - COPY16x8 - - subs I , I , #1 - bne .Lsgemm_tcopy_L8_M16_20 - -.Lsgemm_tcopy_L8_M16_40: - tst N , #8 - ble .Lsgemm_tcopy_L8_M16_60 - - COPY8x8 - -.Lsgemm_tcopy_L8_M16_60: - tst N , #4 - ble .Lsgemm_tcopy_L8_M16_80 - - COPY4x8 - -.Lsgemm_tcopy_L8_M16_80: - - tst N , #2 - ble .Lsgemm_tcopy_L8_M16_100 - - COPY2x8 - -.Lsgemm_tcopy_L8_M16_100: - - tst N, #1 - ble .Lsgemm_tcopy_L8_M16_END - - COPY1x8 - -.Lsgemm_tcopy_L8_M16_END: - - subs J , J, #1 // j-- - bne .Lsgemm_tcopy_L8_M16_BEGIN - -/*********************************************************************************************/ - -.Lsgemm_tcopy_L4_BEGIN: - tst M, #7 - ble .Lsgemm_tcopy_L999 - - tst M, #4 - ble .Lsgemm_tcopy_L2_BEGIN - -.Lsgemm_tcopy_L4_M16_BEGIN: - - mov A01, A - add A02, A01, LDA - add A03, A02, LDA - add A04, A03, LDA - add A, A04, LDA - - mov B00, B - add B, B00, #256 // B = B + 4 * 16 * SIZE - - asr I, N, #4 // I = N / 16 - cmp I, #0 - ble .Lsgemm_tcopy_L4_M16_40 - - .align 5 -.Lsgemm_tcopy_L4_M16_20: - - COPY16x4 - - subs I , I , #1 - bne .Lsgemm_tcopy_L4_M16_20 - -.Lsgemm_tcopy_L4_M16_40: - tst N , #8 - ble .Lsgemm_tcopy_L4_M16_60 - - COPY8x4 - -.Lsgemm_tcopy_L4_M16_60: - tst N , #4 - ble .Lsgemm_tcopy_L4_M16_80 - - COPY4x4 - -.Lsgemm_tcopy_L4_M16_80: - - tst N , #2 - ble .Lsgemm_tcopy_L4_M16_100 - - COPY2x4 - - -.Lsgemm_tcopy_L4_M16_100: - - tst N, #1 - ble .Lsgemm_tcopy_L4_M16_END - - COPY1x4 - - -.Lsgemm_tcopy_L4_M16_END: - -/*********************************************************************************************/ - -.Lsgemm_tcopy_L2_BEGIN: - - tst M, #3 - ble .Lsgemm_tcopy_L999 - - tst M, #2 - ble .Lsgemm_tcopy_L1_BEGIN - -.Lsgemm_tcopy_L2_M16_BEGIN: - mov A01, A - add A02, A01, LDA - add A, A02, LDA - - mov B00, B - add B, B00, #128 // B = B + 2 * 16 * SIZE - - asr I, N, #4 // I = N / 16 - cmp I, #0 - ble .Lsgemm_tcopy_L2_M16_40 - - .align 5 -.Lsgemm_tcopy_L2_M16_20: - - COPY16x2 - - subs I , I , #1 - bne .Lsgemm_tcopy_L2_M16_20 - -.Lsgemm_tcopy_L2_M16_40: - tst N , #8 - ble .Lsgemm_tcopy_L2_M16_60 - - COPY8x2 - -.Lsgemm_tcopy_L2_M16_60: - tst N , #4 - ble 
.Lsgemm_tcopy_L2_M16_80 - - COPY4x2 - -.Lsgemm_tcopy_L2_M16_80: - - tst N , #2 - ble .Lsgemm_tcopy_L2_M16_100 - - COPY2x2 - -.Lsgemm_tcopy_L2_M16_100: - - tst N , #1 - ble .Lsgemm_tcopy_L2_M16_END - - COPY1x2 - -.Lsgemm_tcopy_L2_M16_END: - -/*********************************************************************************************/ - -.Lsgemm_tcopy_L1_BEGIN: - - tst M, #1 - ble .Lsgemm_tcopy_L999 - - -.Lsgemm_tcopy_L1_M16_BEGIN: - - mov A01, A // A01 = A - mov B00, B - - asr I, N, #4 // I = M / 16 - cmp I, #0 - ble .Lsgemm_tcopy_L1_M16_40 - - .align 5 -.Lsgemm_tcopy_L1_M16_20: - - COPY16x1 - - subs I , I , #1 - bne .Lsgemm_tcopy_L1_M16_20 - -.Lsgemm_tcopy_L1_M16_40: - tst N , #8 - ble .Lsgemm_tcopy_L1_M16_60 - - COPY8x1 - -.Lsgemm_tcopy_L1_M16_60: - tst N , #4 - ble .Lsgemm_tcopy_L1_M16_80 - - COPY4x1 - -.Lsgemm_tcopy_L1_M16_80: - - tst N , #2 - ble .Lsgemm_tcopy_L1_M16_100 - - COPY2x1 - -.Lsgemm_tcopy_L1_M16_100: - - tst N , #1 - ble .Lsgemm_tcopy_L1_M16_END - - COPY1x1 - - -.Lsgemm_tcopy_L1_M16_END: - -.Lsgemm_tcopy_L999: - mov x0, #0 // set return value - RESTORE_REGS - ret - - EPILOGUE - - +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M x0 +#define N x1 +#define A x2 +#define LDA x3 +#define B x4 + +#define M8 x5 + +#define A01 x6 +#define A02 x7 +#define A03 x8 +#define A04 x9 +#define A05 x10 +#define A06 x11 +#define A07 x12 +#define A08 x13 + +#define B01 x14 +#define B02 x15 +#define B03 x16 +#define B04 x17 +#define B00 x22 + + +#define I x21 +#define J x19 + +#define TEMP1 x20 + +#define A_PREFETCH 256 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ +.macro SAVE_REGS + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] +.endm + +.macro RESTORE_REGS + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) +.endm + +/*************************************************************************************************************************/ + +.macro COPY16x8 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + prfm PLDL1KEEP, [A03, #A_PREFETCH] + prfm PLDL1KEEP, [A04, #A_PREFETCH] + prfm PLDL1KEEP, [A05, #A_PREFETCH] + prfm PLDL1KEEP, [A06, #A_PREFETCH] + prfm PLDL1KEEP, [A07, #A_PREFETCH] + prfm PLDL1KEEP, [A08, #A_PREFETCH] + //prfm PSTL1KEEP, [B00, M8] + + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [A01] + add A01, A01, #64 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] + add TEMP1, B00, #64 + + ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [A02] + add A02, A02, #64 + + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [A03] + 
add A03, A03, #64 + + st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [A04] + add A04, A04, #64 + + st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [A05] + add A05, A05, #64 + + st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [A06] + add A06, A06, #64 + + st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [A07] + add A07, A07, #64 + + st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + ld1 {v28.4s, v29.4s, v30.4s, v31.4s}, [A08] + add A08, A08, #64 + + st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + add B00, B00, M8 + +.endm + +.macro COPY8x8 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + prfm PLDL1KEEP, [A03, #A_PREFETCH] + prfm PLDL1KEEP, [A04, #A_PREFETCH] + prfm PLDL1KEEP, [A05, #A_PREFETCH] + prfm PLDL1KEEP, [A06, #A_PREFETCH] + prfm PLDL1KEEP, [A07, #A_PREFETCH] + prfm PLDL1KEEP, [A08, #A_PREFETCH] + + ldp q0, q1, [A01] + ldp q2, q3, [A02] + add A01, A01, #32 + add A02, A02, #32 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B01] + add B01, B01, #64 + + ldp q4, q5, [A03] + ldp q6, q7, [A04] + add A03, A03, #32 + add A04, A04, #32 + + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [B01] + add B01, B01, #64 + + ldp q8, q9, [A05] + ldp q10, q11, [A06] + add A05, A05, #32 + add A06, A06, #32 + + st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B01] + add B01, B01, #64 + + ldp q12, q13, [A07] + ldp q14, q15, [A08] + add A07, A07, #32 + add A08, A08, #32 + + st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [B01] + add B01, B01, #64 +.endm + +.macro COPY4x8 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + //prfm PLDL1KEEP, [A05, #A_PREFETCH] + //prfm PLDL1KEEP, [A06, #A_PREFETCH] + //prfm PLDL1KEEP, [A07, #A_PREFETCH] + //prfm PLDL1KEEP, [A08, #A_PREFETCH] + + ldr q0, [A01] + ldr q1, [A02] + ldr q2, [A03] + ldr q3, [A04] + add A01, A01, #16 + add A02, A02, #16 + add A03, A03, #16 + add A04, A04, #16 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B02] + add B02, B02, #64 + + ldr q4, [A05] + ldr q5, [A06] + ldr q6, [A07] + ldr q7, [A08] + + add A05, A05, #16 + add A06, A06, #16 + add A07, A07, #16 + add A08, A08, #16 + + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [B02] + add B02, B02, #64 +.endm + +.macro COPY2x8 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + //prfm PLDL1KEEP, [A05, #A_PREFETCH] + //prfm PLDL1KEEP, [A06, #A_PREFETCH] + //prfm PLDL1KEEP, [A07, #A_PREFETCH] + //prfm PLDL1KEEP, [A08, #A_PREFETCH] + + ldr d0, [A01] + ldr d1, [A02] + ldr d2, [A03] + ldr d3, [A04] + + add A01, A01, #8 + add A02, A02, #8 + add A03, A03, #8 + add A04, A04, #8 + + stp d0, d1, [B03] + add B03, B03, #16 + stp d2, d3, [B03] + add B03, B03, #16 + + ldr d4, [A05] + ldr d5, [A06] + ldr d6, [A07] + ldr d7, [A08] + + add A05, A05, #8 + add A06, A06, #8 + add A07, A07, #8 + add A08, A08, #8 + + stp d4, d5, [B03] + add B03, B03, #16 + stp d6, d7, [B03] + add B03, B03, #16 + +.endm + +.macro COPY1x8 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + //prfm PLDL1KEEP, [A05, #A_PREFETCH] + //prfm PLDL1KEEP, [A06, #A_PREFETCH] + 
//prfm PLDL1KEEP, [A07, #A_PREFETCH] + //prfm PLDL1KEEP, [A08, #A_PREFETCH] + + ldr s0, [A01] + ldr s1, [A02] + ldr s2, [A03] + ldr s3, [A04] + + stp s0, s1, [B04] + add B04, B04, #8 + stp s2, s3, [B04] + add B04, B04, #8 + + ldr s4, [A05] + ldr s5, [A06] + ldr s6, [A07] + ldr s7, [A08] + + stp s4, s5, [B04] + add B04, B04, #8 + stp s6, s7, [B04] + add B04, B04, #8 + +.endm + +/*************************************************************************************************************************/ +.macro COPY16x4 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + prfm PLDL1KEEP, [A03, #A_PREFETCH] + prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [A01] + add A01, A01, #64 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] + add TEMP1, B00, #64 + + ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [A02] + add A02, A02, #64 + + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [A03] + add A03, A03, #64 + + st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [A04] + add A04, A04, #64 + + st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [TEMP1] + + add B00, B00, M8 +.endm + +.macro COPY8x4 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + prfm PLDL1KEEP, [A03, #A_PREFETCH] + prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ldp q0, q1, [A01] + ldp q2, q3, [A02] + add A01, A01, #32 + add A02, A02, #32 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B01] + add B01, B01, #64 + + ldp q4, q5, [A03] + ldp q6, q7, [A04] + add A03, A03, #32 + add A04, A04, #32 + + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [B01] + add B01, B01, #64 +.endm + +.macro COPY4x4 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ldr q0, [A01] + ldr q1, [A02] + ldr q2, [A03] + ldr q3, [A04] + add A01, A01, #16 + add A02, A02, #16 + add A03, A03, #16 + add A04, A04, #16 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B02] + + add B02, B02, #64 +.endm + +.macro COPY2x4 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ldr d0, [A01] + ldr d1, [A02] + ldr d2, [A03] + ldr d3, [A04] + + add A01, A01, #8 + add A02, A02, #8 + add A03, A03, #8 + add A04, A04, #8 + + stp d0, d1, [B03] + add B03, B03, #16 + stp d2, d3, [B03] + + add B03, B03, #16 +.endm + +.macro COPY1x4 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ldr s0, [A01] + ldr s1, [A02] + ldr s2, [A03] + ldr s3, [A04] + + add A01, A01, #4 + add A02, A02, #4 + add A03, A03, #4 + add A04, A04, #4 + + stp s0, s1, [B04] + add B04, B04, #8 + stp s2, s3, [B04] + add B04, B04, #8 + +.endm + +/*************************************************************************************************************************/ + +.macro COPY16x2 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [A01] + add A01, A01, #64 + + ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [A02] + add A02, A02, #64 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] + add TEMP1, B00, #64 + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1] + add B00, B00, M8 +.endm + +.macro COPY8x2 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ld1 {v0.4s, v1.4s}, [A01] + ld1 {v2.4s, v3.4s}, [A02] + add A01, A01, #32 + add 
A02, A02, #32 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B01] + add B01, B01, #64 +.endm + +.macro COPY4x2 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ldr q0, [A01] + ldr q1, [A02] + add A01, A01, #16 + add A02, A02, #16 + + stp q0, q1, [B02] + add B02, B02, #32 +.endm + +.macro COPY2x2 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ldr d0, [A01] + ldr d1, [A02] + + add A01, A01, #8 + add A02, A02, #8 + + stp d0, d1, [B03] + add B03, B03, #16 +.endm + +.macro COPY1x2 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ldr s0, [A01] + ldr s1, [A02] + + add A01, A01, #4 + add A02, A02, #4 + + stp s0, s1, [B04] + + add B04, B04, #8 +.endm + +/*************************************************************************************************************************/ + +.macro COPY16x1 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [A01] + add A01, A01, #64 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] + add B00, B00, M8 +.endm + +.macro COPY8x1 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + + ldp q0, q1, [A01] + add A01, A01, #32 + stp q0, q1, [B01] + + add B01, B01, #32 +.endm + +.macro COPY4x1 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + + ldr q0, [A01] + add A01, A01, #16 + str q0, [B02] + + add B02, B02, #16 +.endm + +.macro COPY2x1 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + + ldr d0, [A01] + add A01, A01, #8 + str d0, [B03] + + add B03, B03, #8 +.endm + +.macro COPY1x1 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + + ldr s0, [A01] + add A01, A01, #4 + str s0, [B04] + + add B04, B04, #4 +.endm + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + SAVE_REGS + + lsl LDA, LDA, #2 // LDA = LDA * SIZE + + lsl TEMP1, M, #2 // TEMP1 = M * SIZE + + and B01 , N , #-16 + and B02 , N , #-8 + and B03 , N , #-4 + and B04 , N , #-2 + + mul B01, B01, TEMP1 + mul B02, B02, TEMP1 + mul B03, B03, TEMP1 + mul B04, B04, TEMP1 + + add B01 , B01, B + add B02 , B02, B + add B03 , B03, B + add B04 , B04, B + + lsl M8, M, #6 // M8 = M * 16 * SIZE + +.Lsgemm_tcopy_L8_BEGIN: + asr J, M, #3 // J = M / 8 + cmp J, #0 + ble .Lsgemm_tcopy_L4_BEGIN + + .align 5 +.Lsgemm_tcopy_L8_M16_BEGIN: + + mov A01, A + add A02, A01, LDA + add A03, A02, LDA + add A04, A03, LDA + add A05, A04, LDA + add A06, A05, LDA + add A07, A06, LDA + add A08, A07, LDA + add A, A08, LDA + + mov B00, B + add B, B00, #512 // B = B + 8 * 16 * SIZE + + asr I, N, #4 // I = N / 16 + cmp I, #0 + ble .Lsgemm_tcopy_L8_M16_40 + + .align 5 +.Lsgemm_tcopy_L8_M16_20: + + COPY16x8 + + subs I , I , #1 + bne .Lsgemm_tcopy_L8_M16_20 + +.Lsgemm_tcopy_L8_M16_40: + tst N , #8 + ble .Lsgemm_tcopy_L8_M16_60 + + COPY8x8 + +.Lsgemm_tcopy_L8_M16_60: + tst N , #4 + ble .Lsgemm_tcopy_L8_M16_80 + + COPY4x8 + +.Lsgemm_tcopy_L8_M16_80: + + tst N , #2 + ble .Lsgemm_tcopy_L8_M16_100 + + COPY2x8 + +.Lsgemm_tcopy_L8_M16_100: + + tst N, #1 + ble .Lsgemm_tcopy_L8_M16_END + + COPY1x8 + +.Lsgemm_tcopy_L8_M16_END: + + subs J , J, #1 // j-- + bne .Lsgemm_tcopy_L8_M16_BEGIN + +/*********************************************************************************************/ + +.Lsgemm_tcopy_L4_BEGIN: + tst M, #7 + ble .Lsgemm_tcopy_L999 + + tst M, #4 + ble .Lsgemm_tcopy_L2_BEGIN + +.Lsgemm_tcopy_L4_M16_BEGIN: + + mov A01, A + add A02, A01, LDA + add A03, A02, LDA + add A04, A03, LDA + add A, A04, LDA + + mov 
B00, B + add B, B00, #256 // B = B + 4 * 16 * SIZE + + asr I, N, #4 // I = N / 16 + cmp I, #0 + ble .Lsgemm_tcopy_L4_M16_40 + + .align 5 +.Lsgemm_tcopy_L4_M16_20: + + COPY16x4 + + subs I , I , #1 + bne .Lsgemm_tcopy_L4_M16_20 + +.Lsgemm_tcopy_L4_M16_40: + tst N , #8 + ble .Lsgemm_tcopy_L4_M16_60 + + COPY8x4 + +.Lsgemm_tcopy_L4_M16_60: + tst N , #4 + ble .Lsgemm_tcopy_L4_M16_80 + + COPY4x4 + +.Lsgemm_tcopy_L4_M16_80: + + tst N , #2 + ble .Lsgemm_tcopy_L4_M16_100 + + COPY2x4 + + +.Lsgemm_tcopy_L4_M16_100: + + tst N, #1 + ble .Lsgemm_tcopy_L4_M16_END + + COPY1x4 + + +.Lsgemm_tcopy_L4_M16_END: + +/*********************************************************************************************/ + +.Lsgemm_tcopy_L2_BEGIN: + + tst M, #3 + ble .Lsgemm_tcopy_L999 + + tst M, #2 + ble .Lsgemm_tcopy_L1_BEGIN + +.Lsgemm_tcopy_L2_M16_BEGIN: + mov A01, A + add A02, A01, LDA + add A, A02, LDA + + mov B00, B + add B, B00, #128 // B = B + 2 * 16 * SIZE + + asr I, N, #4 // I = N / 16 + cmp I, #0 + ble .Lsgemm_tcopy_L2_M16_40 + + .align 5 +.Lsgemm_tcopy_L2_M16_20: + + COPY16x2 + + subs I , I , #1 + bne .Lsgemm_tcopy_L2_M16_20 + +.Lsgemm_tcopy_L2_M16_40: + tst N , #8 + ble .Lsgemm_tcopy_L2_M16_60 + + COPY8x2 + +.Lsgemm_tcopy_L2_M16_60: + tst N , #4 + ble .Lsgemm_tcopy_L2_M16_80 + + COPY4x2 + +.Lsgemm_tcopy_L2_M16_80: + + tst N , #2 + ble .Lsgemm_tcopy_L2_M16_100 + + COPY2x2 + +.Lsgemm_tcopy_L2_M16_100: + + tst N , #1 + ble .Lsgemm_tcopy_L2_M16_END + + COPY1x2 + +.Lsgemm_tcopy_L2_M16_END: + +/*********************************************************************************************/ + +.Lsgemm_tcopy_L1_BEGIN: + + tst M, #1 + ble .Lsgemm_tcopy_L999 + + +.Lsgemm_tcopy_L1_M16_BEGIN: + + mov A01, A // A01 = A + mov B00, B + + asr I, N, #4 // I = M / 16 + cmp I, #0 + ble .Lsgemm_tcopy_L1_M16_40 + + .align 5 +.Lsgemm_tcopy_L1_M16_20: + + COPY16x1 + + subs I , I , #1 + bne .Lsgemm_tcopy_L1_M16_20 + +.Lsgemm_tcopy_L1_M16_40: + tst N , #8 + ble .Lsgemm_tcopy_L1_M16_60 + + COPY8x1 + +.Lsgemm_tcopy_L1_M16_60: + tst N , #4 + ble .Lsgemm_tcopy_L1_M16_80 + + COPY4x1 + +.Lsgemm_tcopy_L1_M16_80: + + tst N , #2 + ble .Lsgemm_tcopy_L1_M16_100 + + COPY2x1 + +.Lsgemm_tcopy_L1_M16_100: + + tst N , #1 + ble .Lsgemm_tcopy_L1_M16_END + + COPY1x1 + + +.Lsgemm_tcopy_L1_M16_END: + +.Lsgemm_tcopy_L999: + mov x0, #0 // set return value + RESTORE_REGS + ret + + EPILOGUE + + diff --git a/kernel/power/cgemm_kernel_power9.S b/kernel/power/cgemm_kernel_power9.S index 4b5c2fa31..dfe17f3ef 100644 --- a/kernel/power/cgemm_kernel_power9.S +++ b/kernel/power/cgemm_kernel_power9.S @@ -1,293 +1,293 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. 
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -/************************************************************************************** -* Abdelrauf(quickwritereader@gmail.com) -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* LAPACK-TEST : OK -**************************************************************************************/ -#define ASSEMBLER -#include "common.h" -#include "def_vsx.h" - - -#define LOAD ld -#define STACKSIZE (512 ) -#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ -#define M r3 -#define N r4 -#define K r5 - - -#define A r8 -#define B r9 -#define C r10 -#define LDC r6 -#define OFFSET r7 - - -#define alpha_r vs19 -#define alpha_i vs20 -#define save_permute_1 vs21 -#define permute_mask vs22 -#define o0 0 - - -#define T1 r11 -#define T2 r12 -#define T3 r14 -#define T4 r15 -#define T5 r16 -#define T6 r17 -#define L r18 -#define T7 r19 -#define T8 r20 -#define TEMP_REG r21 -#define I r22 -#define J r23 -#define AO r24 -#define BO r25 -#define CO r26 -#define T9 r27 -#define T10 r28 -#define PRE r29 - -#define T12 r30 -#define T13 r31 - -#include "cgemm_macros_power9.S" - -.equ perm_const1, 0x0405060700010203 -.equ perm_const2, 0x0c0d0e0f08090a0b -.equ save_permute_12, 0x0c0d0e0f1c1d1e1f -.equ save_permute_11, 0x0405060714151617 - - - -#ifndef NEEDPARAM - - PROLOGUE - PROFCODE - - - addi SP, SP, -STACKSIZE - mflr r0 - - - stfd f14, 0(SP) - stfd f15, 8(SP) - stfd f16, 16(SP) - stfd f17, 24(SP) - - stfd f18, 32(SP) - stfd f19, 40(SP) - stfd f20, 48(SP) - stfd f21, 56(SP) - - stfd f22, 64(SP) - stfd f23, 72(SP) - stfd f24, 80(SP) - stfd f25, 88(SP) - - stfd f26, 96(SP) - stfd f27, 104(SP) - stfd f28, 112(SP) - stfd f29, 120(SP) - - stfd f30, 128(SP) - stfd f31, 136(SP) - - - std r31, 144(SP) - std r30, 152(SP) - std r29, 160(SP) - std r28, 168(SP) - std r27, 176(SP) - std r26, 184(SP) - std r25, 192(SP) - std r24, 200(SP) - std r23, 208(SP) - std r22, 216(SP) - std r21, 224(SP) - std r20, 232(SP) - std r19, 240(SP) - std r18, 248(SP) - std r17, 256(SP) - std r16, 264(SP) - std r15, 272(SP) - std r14, 280(SP) - - - stxv vs52, 288(SP) - stxv vs53, 304(SP) - stxv vs54, 320(SP) - stxv vs55, 336(SP) - stxv vs56, 352(SP) - stxv vs57, 368(SP) - stxv vs58, 384(SP) - stxv vs59, 400(SP) - stxv vs60, 416(SP) - stxv vs61, 432(SP) - stxv vs62, 448(SP) - stxv vs63, 464(SP) - std r0, FLINK_SAVE(SP) - - - - ld LDC, FRAMESLOT(0) + STACKSIZE(SP) - - - -#ifdef TRMMKERNEL - ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) -#endif - slwi LDC, LDC, ZBASE_SHIFT - - - - /*alpha is stored in f1. 
convert to single and splat*/ - xscvdpspn alpha_r,vs1 - xscvdpspn alpha_i,vs2 - xxspltw alpha_r,alpha_r,0 - xxspltw alpha_i,alpha_i,0 -/*load reverse permute mask for big endian - uint128 = 0xc0d0e0f08090a0b0405060700010203 -*/ - - lis T2, perm_const2@highest - lis T1, perm_const1@highest - lis T3, save_permute_12@highest - lis T4, save_permute_11@highest - - - ori T2, T2, perm_const2@higher - ori T1, T1, perm_const1@higher - ori T3, T3, save_permute_12@higher - ori T4, T4, save_permute_11@higher - - - rldicr T2, T2, 32, 31 - rldicr T1, T1, 32, 31 - rldicr T3, T3, 32, 31 - rldicr T4, T4, 32, 31 - - oris T2, T2, perm_const2@h - oris T1, T1, perm_const1@h - oris T3, T3, save_permute_12@h - oris T4, T4, save_permute_11@h - - - ori T2, T2, perm_const2@l - ori T1, T1, perm_const1@l - ori T3, T3, save_permute_12@l - ori T4, T4, save_permute_11@l - - - li r0,0 - li PRE,512 - -#if defined(CC) || defined(CR) || defined(RC) || defined(RR) -/*negate for this case as we will use addition -1*(a+b) */ - xvnegsp alpha_r,alpha_r - xvnegsp alpha_i,alpha_i -#endif - - mtvsrdd permute_mask,T2,T1 - mtvsrdd save_permute_1,T3,T4 - - /*mask is reverse permute so we have to make it inner permute */ - xxpermdi permute_mask, permute_mask, permute_mask,2 - -#include "cgemm_logic_power9.S" - -.L999: - lfd f14, 0(SP) - lfd f15, 8(SP) - lfd f16, 16(SP) - lfd f17, 24(SP) - - lfd f18, 32(SP) - lfd f19, 40(SP) - lfd f20, 48(SP) - lfd f21, 56(SP) - - lfd f22, 64(SP) - lfd f23, 72(SP) - lfd f24, 80(SP) - lfd f25, 88(SP) - - lfd f26, 96(SP) - lfd f27, 104(SP) - lfd f28, 112(SP) - lfd f29, 120(SP) - - lfd f30, 128(SP) - lfd f31, 136(SP) - - ld r31, 144(SP) - ld r30, 152(SP) - ld r29, 160(SP) - ld r28, 168(SP) - ld r27, 176(SP) - ld r26, 184(SP) - ld r25, 192(SP) - ld r24, 200(SP) - ld r23, 208(SP) - ld r22, 216(SP) - ld r21, 224(SP) - ld r20, 232(SP) - ld r19, 240(SP) - ld r18, 248(SP) - ld r17, 256(SP) - ld r16, 264(SP) - ld r15, 272(SP) - ld r14, 280(SP) - - ld r0, FLINK_SAVE(SP) - - lxv vs52, 288(SP) - lxv vs53, 304(SP) - lxv vs54, 320(SP) - lxv vs55, 336(SP) - lxv vs56, 352(SP) - lxv vs57, 368(SP) - lxv vs58, 384(SP) - lxv vs59, 400(SP) - mtlr r0 - lxv vs60, 416(SP) - lxv vs61, 432(SP) - lxv vs62, 448(SP) - lxv vs63, 464(SP) - - addi SP, SP, STACKSIZE - blr - - - EPILOGUE -#endif +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* Abdelrauf(quickwritereader@gmail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + + +#define LOAD ld +#define STACKSIZE (512 ) +#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ +#define M r3 +#define N r4 +#define K r5 + + +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 + + +#define alpha_r vs19 +#define alpha_i vs20 +#define save_permute_1 vs21 +#define permute_mask vs22 +#define o0 0 + + +#define T1 r11 +#define T2 r12 +#define T3 r14 +#define T4 r15 +#define T5 r16 +#define T6 r17 +#define L r18 +#define T7 r19 +#define T8 r20 +#define TEMP_REG r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define T9 r27 +#define T10 r28 +#define PRE r29 + +#define T12 r30 +#define T13 r31 + +#include "cgemm_macros_power9.S" + +.equ perm_const1, 0x0405060700010203 +.equ perm_const2, 0x0c0d0e0f08090a0b +.equ save_permute_12, 0x0c0d0e0f1c1d1e1f +.equ save_permute_11, 0x0405060714151617 + + + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + + addi SP, SP, -STACKSIZE + mflr r0 + + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + + stxv vs52, 288(SP) + stxv vs53, 304(SP) + stxv vs54, 320(SP) + stxv vs55, 336(SP) + stxv vs56, 352(SP) + stxv vs57, 368(SP) + stxv vs58, 384(SP) + stxv vs59, 400(SP) + stxv vs60, 416(SP) + stxv vs61, 432(SP) + stxv vs62, 448(SP) + stxv vs63, 464(SP) + std r0, FLINK_SAVE(SP) + + + + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) + + + +#ifdef TRMMKERNEL + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#endif + slwi LDC, LDC, ZBASE_SHIFT + + + + /*alpha is stored in f1. 
convert to single and splat*/ + xscvdpspn alpha_r,vs1 + xscvdpspn alpha_i,vs2 + xxspltw alpha_r,alpha_r,0 + xxspltw alpha_i,alpha_i,0 +/*load reverse permute mask for big endian + uint128 = 0xc0d0e0f08090a0b0405060700010203 +*/ + + lis T2, perm_const2@highest + lis T1, perm_const1@highest + lis T3, save_permute_12@highest + lis T4, save_permute_11@highest + + + ori T2, T2, perm_const2@higher + ori T1, T1, perm_const1@higher + ori T3, T3, save_permute_12@higher + ori T4, T4, save_permute_11@higher + + + rldicr T2, T2, 32, 31 + rldicr T1, T1, 32, 31 + rldicr T3, T3, 32, 31 + rldicr T4, T4, 32, 31 + + oris T2, T2, perm_const2@h + oris T1, T1, perm_const1@h + oris T3, T3, save_permute_12@h + oris T4, T4, save_permute_11@h + + + ori T2, T2, perm_const2@l + ori T1, T1, perm_const1@l + ori T3, T3, save_permute_12@l + ori T4, T4, save_permute_11@l + + + li r0,0 + li PRE,512 + +#if defined(CC) || defined(CR) || defined(RC) || defined(RR) +/*negate for this case as we will use addition -1*(a+b) */ + xvnegsp alpha_r,alpha_r + xvnegsp alpha_i,alpha_i +#endif + + mtvsrdd permute_mask,T2,T1 + mtvsrdd save_permute_1,T3,T4 + + /*mask is reverse permute so we have to make it inner permute */ + xxpermdi permute_mask, permute_mask, permute_mask,2 + +#include "cgemm_logic_power9.S" + +.L999: + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + ld r0, FLINK_SAVE(SP) + + lxv vs52, 288(SP) + lxv vs53, 304(SP) + lxv vs54, 320(SP) + lxv vs55, 336(SP) + lxv vs56, 352(SP) + lxv vs57, 368(SP) + lxv vs58, 384(SP) + lxv vs59, 400(SP) + mtlr r0 + lxv vs60, 416(SP) + lxv vs61, 432(SP) + lxv vs62, 448(SP) + lxv vs63, 464(SP) + + addi SP, SP, STACKSIZE + blr + + + EPILOGUE +#endif diff --git a/kernel/power/cgemm_logic_power9.S b/kernel/power/cgemm_logic_power9.S index b4f937e90..a191219fa 100644 --- a/kernel/power/cgemm_logic_power9.S +++ b/kernel/power/cgemm_logic_power9.S @@ -1,2816 +1,2816 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. 
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -/************************************************************************************** -* Abdelrauf(quickwritereader@gmail.com) -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* LAPACK-TEST : OK -**************************************************************************************/ -#define MY_ALIGN .align 3 -b CGEMM_L4 -/* MINI SUBROUTINES */ -/* 4x8 MAIN 128x+2 LOOP */ - - -CGEMM_L4x8_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD4x8_2 - MY_ALIGN -CGEMM_L4x8_LOOP: -/*----------------------------------------*/ - dcbt AO, PRE - dcbt BO, PRE - KERNEL4x8_L2 128,64,0,0 -CGEMM_L4x8_K128: -/*----------------------------------------*/ - KERNEL4x8_L2 128,64,1,0 - dcbt AO, T2 - KERNEL4x8_L2 128,64,2,0 - KERNEL4x8_L2 128,64,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL4x8_L2 128,64,4,0 - KERNEL4x8_L2 128,64,5,0 - dcbt AO, T4 - KERNEL4x8_L2 128,64,6,0 - KERNEL4x8_L2 128,64,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL4x8_L2 128,64,8,0 - KERNEL4x8_L2 128,64,9,0 - KERNEL4x8_L2 128,64,10,0 - KERNEL4x8_L2 128,64,11,0 - dcbt BO, T4 - KERNEL4x8_L2 128,64,12,0 - KERNEL4x8_L2 128,64,13,0 - KERNEL4x8_L2 128,64,14,0 - KERNEL4x8_L2 128,64,15,0 - KERNEL4x8_L2 128,64,16,0 - KERNEL4x8_L2 128,64,17,0 - KERNEL4x8_L2 128,64,18,0 - KERNEL4x8_L2 128,64,19,0 - KERNEL4x8_L2 128,64,20,0 - KERNEL4x8_L2 128,64,21,0 - KERNEL4x8_L2 128,64,22,0 - KERNEL4x8_L2 128,64,23,0 - KERNEL4x8_L2 128,64,24,0 - KERNEL4x8_L2 128,64,25,0 - KERNEL4x8_L2 128,64,26,0 - KERNEL4x8_L2 128,64,27,0 - KERNEL4x8_L2 128,64,28,0 - KERNEL4x8_L2 128,64,29,0 - KERNEL4x8_L2 128,64,30,0 - KERNEL4x8_L2 128,64,31,0 - KERNEL4x8_L2 128,64,32,0 - KERNEL4x8_L2 128,64,33,0 - KERNEL4x8_L2 128,64,34,0 - KERNEL4x8_L2 128,64,35,0 - KERNEL4x8_L2 128,64,36,0 - KERNEL4x8_L2 128,64,37,0 - KERNEL4x8_L2 128,64,38,0 - KERNEL4x8_L2 128,64,39,0 - KERNEL4x8_L2 128,64,40,0 - KERNEL4x8_L2 128,64,41,0 - KERNEL4x8_L2 128,64,42,0 - KERNEL4x8_L2 128,64,43,0 - KERNEL4x8_L2 128,64,44,0 - KERNEL4x8_L2 128,64,45,0 - KERNEL4x8_L2 128,64,46,0 - KERNEL4x8_L2 128,64,47,0 - KERNEL4x8_L2 128,64,48,0 - KERNEL4x8_L2 128,64,49,0 - KERNEL4x8_L2 128,64,50,0 - KERNEL4x8_L2 128,64,51,0 - KERNEL4x8_L2 128,64,52,0 - KERNEL4x8_L2 128,64,53,0 - KERNEL4x8_L2 128,64,54,0 - KERNEL4x8_L2 128,64,55,0 - KERNEL4x8_L2 128,64,56,0 - KERNEL4x8_L2 128,64,57,0 - KERNEL4x8_L2 128,64,58,0 - KERNEL4x8_L2 128,64,59,0 - KERNEL4x8_L2 128,64,60,0 - KERNEL4x8_L2 128,64,61,0 - KERNEL4x8_L2 128,64,62,0 - KERNEL4x8_L2 128,64,63,1 - bdnz CGEMM_L4x8_LOOP - MY_ALIGN -CGEMM_L4x8_LOOP_END: -/*----------------------------------------*/ - END4x8_2 - blr - MY_ALIGN - - -CGEMM_4x8_L64_SUB: -/*----------------------------------------*/ - LOAD4x8_2 - dcbt AO, PRE - dcbt BO, PRE - 
KERNEL4x8_L2 128,64,0,0 - KERNEL4x8_L2 128,64,1,0 - dcbt AO, T2 - KERNEL4x8_L2 128,64,2,0 - KERNEL4x8_L2 128,64,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL4x8_L2 128,64,4,0 - KERNEL4x8_L2 128,64,5,0 - dcbt AO, T4 - KERNEL4x8_L2 128,64,6,0 - KERNEL4x8_L2 128,64,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL4x8_L2 128,64,8,0 - KERNEL4x8_L2 128,64,9,0 - KERNEL4x8_L2 128,64,10,0 - KERNEL4x8_L2 128,64,11,0 - dcbt BO, T4 - KERNEL4x8_L2 128,64,12,0 - KERNEL4x8_L2 128,64,13,0 - KERNEL4x8_L2 128,64,14,0 - KERNEL4x8_L2 128,64,15,0 - KERNEL4x8_L2 128,64,16,0 - KERNEL4x8_L2 128,64,17,0 - KERNEL4x8_L2 128,64,18,0 - KERNEL4x8_L2 128,64,19,0 - KERNEL4x8_L2 128,64,20,0 - KERNEL4x8_L2 128,64,21,0 - KERNEL4x8_L2 128,64,22,0 - KERNEL4x8_L2 128,64,23,0 - KERNEL4x8_L2 128,64,24,0 - KERNEL4x8_L2 128,64,25,0 - KERNEL4x8_L2 128,64,26,0 - KERNEL4x8_L2 128,64,27,0 - KERNEL4x8_L2 128,64,28,0 - KERNEL4x8_L2 128,64,29,0 - KERNEL4x8_L2 128,64,30,0 - KERNEL4x8_E2 128,64,31,1 - blr - MY_ALIGN - - -CGEMM_4x8_L32_SUB: -/*----------------------------------------*/ - LOAD4x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL4x8_L2 128,64,0,0 - KERNEL4x8_L2 128,64,1,0 - dcbt AO, T2 - KERNEL4x8_L2 128,64,2,0 - KERNEL4x8_L2 128,64,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL4x8_L2 128,64,4,0 - KERNEL4x8_L2 128,64,5,0 - dcbt AO, T4 - KERNEL4x8_L2 128,64,6,0 - KERNEL4x8_L2 128,64,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL4x8_L2 128,64,8,0 - KERNEL4x8_L2 128,64,9,0 - KERNEL4x8_L2 128,64,10,0 - KERNEL4x8_L2 128,64,11,0 - dcbt BO, T4 - KERNEL4x8_L2 128,64,12,0 - KERNEL4x8_L2 128,64,13,0 - KERNEL4x8_L2 128,64,14,0 - KERNEL4x8_E2 128,64,15,1 - blr - MY_ALIGN - - -CGEMM_4x8_L16_SUB: -/*----------------------------------------*/ - LOAD4x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL4x8_L2 128,64,0,0 - KERNEL4x8_L2 128,64,1,0 - dcbt AO, T2 - KERNEL4x8_L2 128,64,2,0 - KERNEL4x8_L2 128,64,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL4x8_L2 128,64,4,0 - KERNEL4x8_L2 128,64,5,0 - dcbt AO, T4 - KERNEL4x8_L2 128,64,6,0 - KERNEL4x8_E2 128,64,7,1 - blr - MY_ALIGN - - -CGEMM_4x4_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD4x4_2 - MY_ALIGN -CGEMM_L4x4_LOOP: -/*----------------------------------------*/ - KERNEL4x4_L2 64,64,0,0 -CGEMM_L4x4_K32: -/*----------------------------------------*/ - KERNEL4x4_L2 64,64,1,0 - KERNEL4x4_L2 64,64,2,0 - KERNEL4x4_L2 64,64,3,0 - KERNEL4x4_L2 64,64,4,0 - KERNEL4x4_L2 64,64,5,0 - KERNEL4x4_L2 64,64,6,0 - KERNEL4x4_L2 64,64,7,0 - KERNEL4x4_L2 64,64,8,0 - KERNEL4x4_L2 64,64,9,0 - KERNEL4x4_L2 64,64,10,0 - KERNEL4x4_L2 64,64,11,0 - KERNEL4x4_L2 64,64,12,0 - KERNEL4x4_L2 64,64,13,0 - KERNEL4x4_L2 64,64,14,0 - KERNEL4x4_L2 64,64,15,1 - bdnz CGEMM_L4x4_LOOP - MY_ALIGN -CGEMM_L4x4_LOOP_END: -/*----------------------------------------*/ - END4x4_2 - blr - MY_ALIGN - - -CGEMM_4x4_L16_SUB: -/*----------------------------------------*/ - LOAD4x4_2 - KERNEL4x4_L2 64,64,0,0 - KERNEL4x4_L2 64,64,1,0 - KERNEL4x4_L2 64,64,2,0 - KERNEL4x4_L2 64,64,3,0 - KERNEL4x4_L2 64,64,4,0 - KERNEL4x4_L2 64,64,5,0 - KERNEL4x4_L2 64,64,6,0 - KERNEL4x4_E2 64,64,7,1 - blr - MY_ALIGN - - -CGEMM_4x4_L8_SUB: -/*----------------------------------------*/ - LOAD4x4_2 - KERNEL4x4_L2 64,64,0,0 - KERNEL4x4_L2 64,64,1,0 - KERNEL4x4_L2 64,64,2,0 - KERNEL4x4_E2 64,64,3,1 - blr - - -CGEMM_4x2_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD4x2_2 - MY_ALIGN -CGEMM_L4x2_LOOP: -/*----------------------------------------*/ - KERNEL4x2_L2 32,64,0,0 -CGEMM_L4x2_K32: -/*----------------------------------------*/ - KERNEL4x2_L2 32,64,1,0 - 
KERNEL4x2_L2 32,64,2,0 - KERNEL4x2_L2 32,64,3,0 - KERNEL4x2_L2 32,64,4,0 - KERNEL4x2_L2 32,64,5,0 - KERNEL4x2_L2 32,64,6,0 - KERNEL4x2_L2 32,64,7,0 - KERNEL4x2_L2 32,64,8,0 - KERNEL4x2_L2 32,64,9,0 - KERNEL4x2_L2 32,64,10,0 - KERNEL4x2_L2 32,64,11,0 - KERNEL4x2_L2 32,64,12,0 - KERNEL4x2_L2 32,64,13,0 - KERNEL4x2_L2 32,64,14,0 - KERNEL4x2_L2 32,64,15,1 - bdnz CGEMM_L4x2_LOOP - MY_ALIGN - - -CGEMM_L4x2_LOOP_END: -/*----------------------------------------*/ - END4x2_2 - blr - MY_ALIGN -CGEMM_4x2_L16_SUB: -/*----------------------------------------*/ - LOAD4x2_2 - KERNEL4x2_L2 32,64,0,0 - KERNEL4x2_L2 32,64,1,0 - KERNEL4x2_L2 32,64,2,0 - KERNEL4x2_L2 32,64,3,0 - KERNEL4x2_L2 32,64,4,0 - KERNEL4x2_L2 32,64,5,0 - KERNEL4x2_L2 32,64,6,0 - KERNEL4x2_E2 32,64,7,1 - blr - MY_ALIGN -CGEMM_4x2_L8_SUB: -/*----------------------------------------*/ - LOAD4x2_2 - KERNEL4x2_L2 32,64,0,0 - KERNEL4x2_L2 32,64,1,0 - KERNEL4x2_L2 32,64,2,0 - KERNEL4x2_E2 32,64,3,1 - blr - - -CGEMM_4x1_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD4x1_2 - MY_ALIGN -CGEMM_L4x1_LOOP: -/*----------------------------------------*/ - KERNEL4x1_L2 16,64,0,0 -CGEMM_L4x1_K32: -/*----------------------------------------*/ - KERNEL4x1_L2 16,64,1,0 - KERNEL4x1_L2 16,64,2,0 - KERNEL4x1_L2 16,64,3,0 - KERNEL4x1_L2 16,64,4,0 - KERNEL4x1_L2 16,64,5,0 - KERNEL4x1_L2 16,64,6,0 - KERNEL4x1_L2 16,64,7,0 - KERNEL4x1_L2 16,64,8,0 - KERNEL4x1_L2 16,64,9,0 - KERNEL4x1_L2 16,64,10,0 - KERNEL4x1_L2 16,64,11,0 - KERNEL4x1_L2 16,64,12,0 - KERNEL4x1_L2 16,64,13,0 - KERNEL4x1_L2 16,64,14,0 - KERNEL4x1_L2 16,64,15,1 - bdnz CGEMM_L4x1_LOOP - MY_ALIGN -CGEMM_L4x1_LOOP_END: -/*----------------------------------------*/ - END4x1_2 - blr - - MY_ALIGN -CGEMM_4x1_L16_SUB: -/*----------------------------------------*/ - LOAD4x1_2 - KERNEL4x1_L2 16,64,0,0 - KERNEL4x1_L2 16,64,1,0 - KERNEL4x1_L2 16,64,2,0 - KERNEL4x1_L2 16,64,3,0 - KERNEL4x1_L2 16,64,4,0 - KERNEL4x1_L2 16,64,5,0 - KERNEL4x1_L2 16,64,6,0 - KERNEL4x1_E2 16,64,7,1 - blr - MY_ALIGN - - -CGEMM_4x1_L8_SUB: -/*----------------------------------------*/ - LOAD4x1_2 - KERNEL4x1_L2 16,64,0,0 - KERNEL4x1_L2 16,64,1,0 - KERNEL4x1_L2 16,64,2,0 - KERNEL4x1_E2 16,64,3,1 - blr - - - -/* MAIN LOOP BEGINS */ - MY_ALIGN - - -CGEMM_L4: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) && !defined(LEFT) - neg TEMP_REG, OFFSET -#endif - srawi. J, N, 2 - ble CGEMM_L4_END - - -CGEMM_L4_BEGIN: -/*----------------------------------------*/ - mr CO, C - slwi T1, LDC , 2 - add T2,C,LDC - mr AO, A - add C, C, T1 -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. I, M, 3 - ble CGEMM_L4x8_END - dcbt CO,r0 /*just prefetch*/ - dcbt T2,r0 - - -CGEMM_L4x8_BEGIN: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4 -#else - mr BO, B - dcbt B, r0 -#endif - dcbt AO, r0 -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,8,4 - mr T1, T6 -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(T1-2) % 128x */ -#else - mr T1, K -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(K-2) % 128x */ -#endif - ZERO4x8 - ble CGEMM_L4x8_SUB0 - bl CGEMM_L4x8_LMAIN_SUB - andi. 
L, T1, 127 - ble CGEMM_L4x8_SAVE - b CGEMM_L4x8_SUB2 - - -CGEMM_L4x8_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 255 - cmpwi T6,129 -#else - andi. L, K, 255 - cmpwi K,129 -#endif - li T8,1 - bne CMP4x8_128K - addi BO,BO,-32 - addi AO,AO,-64 - LOAD4x8O 64,32 - END4x8_WITHOUT_ADD - LOAD4x8_2O 128, 64 - mtctr T8 - bl CGEMM_L4x8_K128 - b CGEMM_L4x8_SAVE - CMP4x8_128K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,128 -#else - cmpwi K,128 -#endif - bne CGEMM_L4x8_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-64 - addi AO,AO,-128 - LOAD4x8_2O 128,64 - bl CGEMM_L4x8_K128 - b CGEMM_L4x8_SAVE - MY_ALIGN - - -CGEMM_L4x8_SUB2: -/*----------------------------------------*/ - andi. T1,L, 64 - ble CGEMM_L4x8_SUB2_32 - bl CGEMM_4x8_L64_SUB - MY_ALIGN - - -CGEMM_L4x8_SUB2_32: -/*----------------------------------------*/ - andi. T1,L, 32 - ble CGEMM_L4x8_SUB2_16 - bl CGEMM_4x8_L32_SUB - MY_ALIGN - - -CGEMM_L4x8_SUB2_16: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L4x8_SUB2_8 - bl CGEMM_4x8_L16_SUB - MY_ALIGN - - -CGEMM_L4x8_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L4x8_SUB2_4 - LOAD4x8_2 - KERNEL4x8_L2 128,64, 0,0 - KERNEL4x8_L2 128,64, 1,0 - KERNEL4x8_L2 128,64, 2,0 - KERNEL4x8_E2 128,64, 3,1 - MY_ALIGN - - -CGEMM_L4x8_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L4x8_SUB2_2 - LOAD4x8_2 - KERNEL4x8_L2 128,64, 0,0 - KERNEL4x8_E2 128,64, 1,1 - MY_ALIGN - - -CGEMM_L4x8_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L4x8_SUB2_1 - LOAD4x8_2 - KERNEL4x8_E2 128,64, 0,1 - MY_ALIGN - - -CGEMM_L4x8_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L4x8_SAVE - KERNEL4x8 - - MY_ALIGN -CGEMM_L4x8_SAVE: -/*----------------------------------------*/ - addic. I, I, -1 - MY_ALIGN - SAVE4x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,4 -#endif - bgt CGEMM_L4x8_BEGIN - andi. T2, M, 7 - ble CGEMM_L4x1_END - andi. T1, M, 4 - ble CGEMM_L4x4_END - b CGEMM_L4x4_BEGIN - MY_ALIGN - - -CGEMM_L4x8_END: -/*----------------------------------------*/ - - -CGEMM_L4x4_BEGIN: -/*----------------------------------------*/ - andi. T2, M, 7 - ble CGEMM_L4x1_END - andi. T1, M, 4 - ble CGEMM_L4x4_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,4,4 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T1-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO4x4 - ble CGEMM_L4x4_SUB0 - bl CGEMM_4x4_LMAIN_SUB - andi. L, T1, 31 - ble CGEMM_L4x4_SAVE - b CGEMM_L4x4_SUB2 - - -CGEMM_L4x4_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP4x4_32K - addi BO,BO,-32 - addi AO,AO,-32 - LOAD4x4O 32,32 - END4x4_WITHOUT_ADD - LOAD4x4_2O 64, 64 - mtctr T8 - bl CGEMM_L4x4_K32 - b CGEMM_L4x4_SAVE - CMP4x4_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne CGEMM_L4x4_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-64 - addi AO,AO,-64 - LOAD4x4_2O 64,64 - bl CGEMM_L4x4_K32 - b CGEMM_L4x4_SAVE - MY_ALIGN - MY_ALIGN - - -CGEMM_L4x4_SUB2: -/*----------------------------------------*/ - andi. 
T1,L, 16 - ble CGEMM_L4x4_SUB2_8 - bl CGEMM_4x4_L16_SUB - MY_ALIGN - - -CGEMM_L4x4_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L4x4_SUB2_4 - bl CGEMM_4x4_L8_SUB - MY_ALIGN - - -CGEMM_L4x4_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L4x4_SUB2_2 - LOAD4x4_2 - KERNEL4x4_L2 64,64, 0,0 - KERNEL4x4_E2 64,64, 1,1 - MY_ALIGN - - -CGEMM_L4x4_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L4x4_SUB2_1 - LOAD4x4_2 - KERNEL4x4_E2 64,64, 0,1 - MY_ALIGN - - -CGEMM_L4x4_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L4x4_SAVE - KERNEL4x4 - - -CGEMM_L4x4_SAVE: -/*----------------------------------------*/ - SAVE4x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,4 -#endif - - -CGEMM_L4x4_END: -/*----------------------------------------*/ - - -CGEMM_L4x2_BEGIN: -/*----------------------------------------*/ - andi. T1, M, 2 - ble CGEMM_L4x2_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,2,4 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T1-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO4x2 - ble CGEMM_L4x2_SUB0 - bl CGEMM_4x2_LMAIN_SUB - andi. L, T1, 31 - ble CGEMM_L4x2_SAVE - b CGEMM_L4x2_SUB2 - - -CGEMM_L4x2_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP4x2_32K - addi BO,BO,-32 - addi AO,AO,-16 - LOAD4x2O 16,32 - END4x2_WITHOUT_ADD - LOAD4x2_2O 32, 64 - mtctr T8 - bl CGEMM_L4x2_K32 - b CGEMM_L4x2_SAVE - CMP4x2_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne CGEMM_L4x2_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-64 - addi AO,AO,-32 - LOAD4x2_2O 32,64 - bl CGEMM_L4x2_K32 - b CGEMM_L4x2_SAVE - MY_ALIGN - MY_ALIGN - - -CGEMM_L4x2_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L4x2_SUB2_8 - bl CGEMM_4x2_L16_SUB - MY_ALIGN - - -CGEMM_L4x2_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L4x2_SUB2_4 - bl CGEMM_4x2_L8_SUB - MY_ALIGN - - -CGEMM_L4x2_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L4x2_SUB2_2 - LOAD4x2_2 - KERNEL4x2_L2 32,64, 0,0 - KERNEL4x2_E2 32,64, 1,1 - MY_ALIGN - - -CGEMM_L4x2_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L4x2_SUB2_1 - LOAD4x2_2 - KERNEL4x2_E2 32,64, 0,1 - MY_ALIGN - - -CGEMM_L4x2_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L4x2_SAVE - KERNEL4x2 - - MY_ALIGN -CGEMM_L4x2_SAVE: -/*----------------------------------------*/ - SAVE4x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,4 -#endif - - -CGEMM_L4x2_END: -/*----------------------------------------*/ - - -CGEMM_L4x1_BEGIN: -/*----------------------------------------*/ - andi. T1, M, 1 - ble CGEMM_L4x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,1,4 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T1-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO4x1 - ble CGEMM_L4x1_SUB0 - bl CGEMM_4x1_LMAIN_SUB - andi. 
L, T1, 31 - ble CGEMM_L4x1_SAVE - b CGEMM_L4x1_SUB2 - - -CGEMM_L4x1_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP4x1_32K - addi BO,BO,-32 - addi AO,AO,-8 - LOAD4x1O 8,32 - END4x1_WITHOUT_ADD - LOAD4x1_2O 16, 64 - mtctr T8 - bl CGEMM_L4x1_K32 - b CGEMM_L4x1_SAVE - CMP4x1_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne CGEMM_L4x1_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-64 - addi AO,AO,-16 - LOAD4x1_2O 16,64 - bl CGEMM_L4x1_K32 - b CGEMM_L4x1_SAVE - MY_ALIGN - MY_ALIGN - - -CGEMM_L4x1_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L4x1_SUB2_8 - bl CGEMM_4x1_L16_SUB - MY_ALIGN - - -CGEMM_L4x1_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L4x1_SUB2_4 - bl CGEMM_4x1_L8_SUB - MY_ALIGN - - -CGEMM_L4x1_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L4x1_SUB2_2 - LOAD4x1_2 - KERNEL4x1_L2 16,64, 0,0 - KERNEL4x1_E2 16,64, 1,1 - MY_ALIGN - - -CGEMM_L4x1_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L4x1_SUB2_1 - LOAD4x1_2 - KERNEL4x1_E2 16,64, 0,1 - MY_ALIGN - - -CGEMM_L4x1_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L4x1_SAVE - KERNEL4x1 - - MY_ALIGN -CGEMM_L4x1_SAVE: -/*----------------------------------------*/ - - SAVE4x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,4 -#endif - - -CGEMM_L4x1_END: -/*----------------------------------------*/ - slwi T1, K, 5 - addic. J, J, -1 - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 4 -#endif - bgt CGEMM_L4_BEGIN - - -CGEMM_L4_END: - -b CGEMM_L2 -/* MINI SUBROUTINES */ -/* 2x8 MAIN 128x+2 LOOP */ - - -CGEMM_L2x8_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD2x8_2 - MY_ALIGN -CGEMM_L2x8_LOOP: -/*----------------------------------------*/ - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L2 128,32,0,0 -CGEMM_L2x8_K128: -/*----------------------------------------*/ - KERNEL2x8_L2 128,32,1,0 - dcbt AO, T2 - KERNEL2x8_L2 128,32,2,0 - KERNEL2x8_L2 128,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L2 128,32,4,0 - KERNEL2x8_L2 128,32,5,0 - dcbt AO, T4 - KERNEL2x8_L2 128,32,6,0 - KERNEL2x8_L2 128,32,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL2x8_L2 128,32,8,0 - KERNEL2x8_L2 128,32,9,0 - KERNEL2x8_L2 128,32,10,0 - KERNEL2x8_L2 128,32,11,0 - dcbt BO, T4 - KERNEL2x8_L2 128,32,12,0 - KERNEL2x8_L2 128,32,13,0 - KERNEL2x8_L2 128,32,14,0 - KERNEL2x8_L2 128,32,15,0 - KERNEL2x8_L2 128,32,16,0 - KERNEL2x8_L2 128,32,17,0 - KERNEL2x8_L2 128,32,18,0 - KERNEL2x8_L2 128,32,19,0 - KERNEL2x8_L2 128,32,20,0 - KERNEL2x8_L2 128,32,21,0 - KERNEL2x8_L2 128,32,22,0 - KERNEL2x8_L2 128,32,23,0 - KERNEL2x8_L2 128,32,24,0 - KERNEL2x8_L2 128,32,25,0 - KERNEL2x8_L2 128,32,26,0 - KERNEL2x8_L2 128,32,27,0 - KERNEL2x8_L2 128,32,28,0 - KERNEL2x8_L2 128,32,29,0 - KERNEL2x8_L2 128,32,30,0 - KERNEL2x8_L2 128,32,31,0 - KERNEL2x8_L2 128,32,32,0 - KERNEL2x8_L2 128,32,33,0 - KERNEL2x8_L2 128,32,34,0 - KERNEL2x8_L2 128,32,35,0 - KERNEL2x8_L2 128,32,36,0 - KERNEL2x8_L2 128,32,37,0 - KERNEL2x8_L2 128,32,38,0 - KERNEL2x8_L2 128,32,39,0 - KERNEL2x8_L2 128,32,40,0 - KERNEL2x8_L2 128,32,41,0 - KERNEL2x8_L2 128,32,42,0 - KERNEL2x8_L2 128,32,43,0 - KERNEL2x8_L2 128,32,44,0 - KERNEL2x8_L2 128,32,45,0 - KERNEL2x8_L2 128,32,46,0 - KERNEL2x8_L2 128,32,47,0 - 
KERNEL2x8_L2 128,32,48,0 - KERNEL2x8_L2 128,32,49,0 - KERNEL2x8_L2 128,32,50,0 - KERNEL2x8_L2 128,32,51,0 - KERNEL2x8_L2 128,32,52,0 - KERNEL2x8_L2 128,32,53,0 - KERNEL2x8_L2 128,32,54,0 - KERNEL2x8_L2 128,32,55,0 - KERNEL2x8_L2 128,32,56,0 - KERNEL2x8_L2 128,32,57,0 - KERNEL2x8_L2 128,32,58,0 - KERNEL2x8_L2 128,32,59,0 - KERNEL2x8_L2 128,32,60,0 - KERNEL2x8_L2 128,32,61,0 - KERNEL2x8_L2 128,32,62,0 - KERNEL2x8_L2 128,32,63,1 - bdnz CGEMM_L2x8_LOOP - MY_ALIGN -CGEMM_L2x8_LOOP_END: -/*----------------------------------------*/ - END2x8_2 - blr - MY_ALIGN - - -CGEMM_2x8_L64_SUB: -/*----------------------------------------*/ - LOAD2x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L2 128,32,0,0 - KERNEL2x8_L2 128,32,1,0 - dcbt AO, T2 - KERNEL2x8_L2 128,32,2,0 - KERNEL2x8_L2 128,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L2 128,32,4,0 - KERNEL2x8_L2 128,32,5,0 - dcbt AO, T4 - KERNEL2x8_L2 128,32,6,0 - KERNEL2x8_L2 128,32,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL2x8_L2 128,32,8,0 - KERNEL2x8_L2 128,32,9,0 - KERNEL2x8_L2 128,32,10,0 - KERNEL2x8_L2 128,32,11,0 - dcbt BO, T4 - KERNEL2x8_L2 128,32,12,0 - KERNEL2x8_L2 128,32,13,0 - KERNEL2x8_L2 128,32,14,0 - KERNEL2x8_L2 128,32,15,0 - KERNEL2x8_L2 128,32,16,0 - KERNEL2x8_L2 128,32,17,0 - KERNEL2x8_L2 128,32,18,0 - KERNEL2x8_L2 128,32,19,0 - KERNEL2x8_L2 128,32,20,0 - KERNEL2x8_L2 128,32,21,0 - KERNEL2x8_L2 128,32,22,0 - KERNEL2x8_L2 128,32,23,0 - KERNEL2x8_L2 128,32,24,0 - KERNEL2x8_L2 128,32,25,0 - KERNEL2x8_L2 128,32,26,0 - KERNEL2x8_L2 128,32,27,0 - KERNEL2x8_L2 128,32,28,0 - KERNEL2x8_L2 128,32,29,0 - KERNEL2x8_L2 128,32,30,0 - KERNEL2x8_E2 128,32,31,1 - blr - MY_ALIGN - - -CGEMM_2x8_L32_SUB: -/*----------------------------------------*/ - LOAD2x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L2 128,32,0,0 - KERNEL2x8_L2 128,32,1,0 - dcbt AO, T2 - KERNEL2x8_L2 128,32,2,0 - KERNEL2x8_L2 128,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L2 128,32,4,0 - KERNEL2x8_L2 128,32,5,0 - dcbt AO, T4 - KERNEL2x8_L2 128,32,6,0 - KERNEL2x8_L2 128,32,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL2x8_L2 128,32,8,0 - KERNEL2x8_L2 128,32,9,0 - KERNEL2x8_L2 128,32,10,0 - KERNEL2x8_L2 128,32,11,0 - dcbt BO, T4 - KERNEL2x8_L2 128,32,12,0 - KERNEL2x8_L2 128,32,13,0 - KERNEL2x8_L2 128,32,14,0 - KERNEL2x8_E2 128,32,15,1 - blr - MY_ALIGN - - -CGEMM_2x8_L16_SUB: -/*----------------------------------------*/ - LOAD2x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L2 128,32,0,0 - KERNEL2x8_L2 128,32,1,0 - dcbt AO, T2 - KERNEL2x8_L2 128,32,2,0 - KERNEL2x8_L2 128,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L2 128,32,4,0 - KERNEL2x8_L2 128,32,5,0 - dcbt AO, T4 - KERNEL2x8_L2 128,32,6,0 - KERNEL2x8_E2 128,32,7,1 - blr - MY_ALIGN - - -CGEMM_2x4_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD2x4_2 - MY_ALIGN -CGEMM_L2x4_LOOP: -/*----------------------------------------*/ - KERNEL2x4_L2 64,32,0,0 -CGEMM_L2x4_K32: -/*----------------------------------------*/ - KERNEL2x4_L2 64,32,1,0 - KERNEL2x4_L2 64,32,2,0 - KERNEL2x4_L2 64,32,3,0 - KERNEL2x4_L2 64,32,4,0 - KERNEL2x4_L2 64,32,5,0 - KERNEL2x4_L2 64,32,6,0 - KERNEL2x4_L2 64,32,7,0 - KERNEL2x4_L2 64,32,8,0 - KERNEL2x4_L2 64,32,9,0 - KERNEL2x4_L2 64,32,10,0 - KERNEL2x4_L2 64,32,11,0 - KERNEL2x4_L2 64,32,12,0 - KERNEL2x4_L2 64,32,13,0 - KERNEL2x4_L2 64,32,14,0 - KERNEL2x4_L2 64,32,15,1 - bdnz CGEMM_L2x4_LOOP - MY_ALIGN -CGEMM_L2x4_LOOP_END: -/*----------------------------------------*/ - END2x4_2 - blr - MY_ALIGN - - -CGEMM_2x4_L16_SUB: -/*----------------------------------------*/ - LOAD2x4_2 - KERNEL2x4_L2 
64,32,0,0 - KERNEL2x4_L2 64,32,1,0 - KERNEL2x4_L2 64,32,2,0 - KERNEL2x4_L2 64,32,3,0 - KERNEL2x4_L2 64,32,4,0 - KERNEL2x4_L2 64,32,5,0 - KERNEL2x4_L2 64,32,6,0 - KERNEL2x4_E2 64,32,7,1 - blr - MY_ALIGN - - -CGEMM_2x4_L8_SUB: -/*----------------------------------------*/ - LOAD2x4_2 - KERNEL2x4_L2 64,32,0,0 - KERNEL2x4_L2 64,32,1,0 - KERNEL2x4_L2 64,32,2,0 - KERNEL2x4_E2 64,32,3,1 - blr - - -CGEMM_2x2_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD2x2_2 - MY_ALIGN -CGEMM_L2x2_LOOP: -/*----------------------------------------*/ - KERNEL2x2_L2 32,32,0,0 -CGEMM_L2x2_K32: -/*----------------------------------------*/ - KERNEL2x2_L2 32,32,1,0 - KERNEL2x2_L2 32,32,2,0 - KERNEL2x2_L2 32,32,3,0 - KERNEL2x2_L2 32,32,4,0 - KERNEL2x2_L2 32,32,5,0 - KERNEL2x2_L2 32,32,6,0 - KERNEL2x2_L2 32,32,7,0 - KERNEL2x2_L2 32,32,8,0 - KERNEL2x2_L2 32,32,9,0 - KERNEL2x2_L2 32,32,10,0 - KERNEL2x2_L2 32,32,11,0 - KERNEL2x2_L2 32,32,12,0 - KERNEL2x2_L2 32,32,13,0 - KERNEL2x2_L2 32,32,14,0 - KERNEL2x2_L2 32,32,15,1 - bdnz CGEMM_L2x2_LOOP - MY_ALIGN - - -CGEMM_L2x2_LOOP_END: -/*----------------------------------------*/ - END2x2_2 - blr - MY_ALIGN -CGEMM_2x2_L16_SUB: -/*----------------------------------------*/ - LOAD2x2_2 - KERNEL2x2_L2 32,32,0,0 - KERNEL2x2_L2 32,32,1,0 - KERNEL2x2_L2 32,32,2,0 - KERNEL2x2_L2 32,32,3,0 - KERNEL2x2_L2 32,32,4,0 - KERNEL2x2_L2 32,32,5,0 - KERNEL2x2_L2 32,32,6,0 - KERNEL2x2_E2 32,32,7,1 - blr - MY_ALIGN -CGEMM_2x2_L8_SUB: -/*----------------------------------------*/ - LOAD2x2_2 - KERNEL2x2_L2 32,32,0,0 - KERNEL2x2_L2 32,32,1,0 - KERNEL2x2_L2 32,32,2,0 - KERNEL2x2_E2 32,32,3,1 - blr - - -CGEMM_2x1_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD2x1_2 - MY_ALIGN -CGEMM_L2x1_LOOP: -/*----------------------------------------*/ - KERNEL2x1_L2 16,32,0,0 -CGEMM_L2x1_K32: -/*----------------------------------------*/ - KERNEL2x1_L2 16,32,1,0 - KERNEL2x1_L2 16,32,2,0 - KERNEL2x1_L2 16,32,3,0 - KERNEL2x1_L2 16,32,4,0 - KERNEL2x1_L2 16,32,5,0 - KERNEL2x1_L2 16,32,6,0 - KERNEL2x1_L2 16,32,7,0 - KERNEL2x1_L2 16,32,8,0 - KERNEL2x1_L2 16,32,9,0 - KERNEL2x1_L2 16,32,10,0 - KERNEL2x1_L2 16,32,11,0 - KERNEL2x1_L2 16,32,12,0 - KERNEL2x1_L2 16,32,13,0 - KERNEL2x1_L2 16,32,14,0 - KERNEL2x1_L2 16,32,15,1 - bdnz CGEMM_L2x1_LOOP - MY_ALIGN -CGEMM_L2x1_LOOP_END: -/*----------------------------------------*/ - END2x1_2 - blr - - MY_ALIGN -CGEMM_2x1_L16_SUB: -/*----------------------------------------*/ - LOAD2x1_2 - KERNEL2x1_L2 16,32,0,0 - KERNEL2x1_L2 16,32,1,0 - KERNEL2x1_L2 16,32,2,0 - KERNEL2x1_L2 16,32,3,0 - KERNEL2x1_L2 16,32,4,0 - KERNEL2x1_L2 16,32,5,0 - KERNEL2x1_L2 16,32,6,0 - KERNEL2x1_E2 16,32,7,1 - blr - MY_ALIGN - - -CGEMM_2x1_L8_SUB: -/*----------------------------------------*/ - LOAD2x1_2 - KERNEL2x1_L2 16,32,0,0 - KERNEL2x1_L2 16,32,1,0 - KERNEL2x1_L2 16,32,2,0 - KERNEL2x1_E2 16,32,3,1 - blr - - - -/* MAIN LOOP BEGINS */ - MY_ALIGN - - -CGEMM_L2: -/*----------------------------------------*/ - - andi. J, N, 2 - ble CGEMM_L2_END - - -CGEMM_L2_BEGIN: -/*----------------------------------------*/ - mr CO, C - slwi T1, LDC , 1 - add T2,C,LDC - mr AO, A - add C, C, T1 -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. 
I, M, 3 - ble CGEMM_L2x8_END - dcbt CO,r0 /*just prefetch*/ - dcbt T2,r0 - - -CGEMM_L2x8_BEGIN: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 -#else - mr BO, B - dcbt B, r0 -#endif - dcbt AO, r0 -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,8,2 - mr T1, T6 -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(T1-2) % 128x */ -#else - mr T1, K -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(K-2) % 128x */ -#endif - ZERO2x8 - ble CGEMM_L2x8_SUB0 - bl CGEMM_L2x8_LMAIN_SUB - andi. L, T1, 127 - ble CGEMM_L2x8_SAVE - b CGEMM_L2x8_SUB2 - - -CGEMM_L2x8_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 255 - cmpwi T6,129 -#else - andi. L, K, 255 - cmpwi K,129 -#endif - li T8,1 - bne CMP2x8_128K - addi BO,BO,-16 - addi AO,AO,-64 - LOAD2x8O 64,16 - END2x8_WITHOUT_ADD - LOAD2x8_2O 128, 32 - mtctr T8 - bl CGEMM_L2x8_K128 - b CGEMM_L2x8_SAVE - CMP2x8_128K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,128 -#else - cmpwi K,128 -#endif - bne CGEMM_L2x8_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-32 - addi AO,AO,-128 - LOAD2x8_2O 128,32 - bl CGEMM_L2x8_K128 - b CGEMM_L2x8_SAVE - MY_ALIGN - - -CGEMM_L2x8_SUB2: -/*----------------------------------------*/ - andi. T1,L, 64 - ble CGEMM_L2x8_SUB2_32 - bl CGEMM_2x8_L64_SUB - MY_ALIGN - - -CGEMM_L2x8_SUB2_32: -/*----------------------------------------*/ - andi. T1,L, 32 - ble CGEMM_L2x8_SUB2_16 - bl CGEMM_2x8_L32_SUB - MY_ALIGN - - -CGEMM_L2x8_SUB2_16: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L2x8_SUB2_8 - bl CGEMM_2x8_L16_SUB - MY_ALIGN - - -CGEMM_L2x8_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L2x8_SUB2_4 - LOAD2x8_2 - KERNEL2x8_L2 128,32, 0,0 - KERNEL2x8_L2 128,32, 1,0 - KERNEL2x8_L2 128,32, 2,0 - KERNEL2x8_E2 128,32, 3,1 - MY_ALIGN - - -CGEMM_L2x8_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L2x8_SUB2_2 - LOAD2x8_2 - KERNEL2x8_L2 128,32, 0,0 - KERNEL2x8_E2 128,32, 1,1 - MY_ALIGN - - -CGEMM_L2x8_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L2x8_SUB2_1 - LOAD2x8_2 - KERNEL2x8_E2 128,32, 0,1 - MY_ALIGN - - -CGEMM_L2x8_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L2x8_SAVE - KERNEL2x8 - - MY_ALIGN -CGEMM_L2x8_SAVE: -/*----------------------------------------*/ - addic. I, I, -1 - MY_ALIGN - SAVE2x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,2 -#endif - bgt CGEMM_L2x8_BEGIN - andi. T2, M, 7 - ble CGEMM_L2x1_END - andi. T1, M, 4 - ble CGEMM_L2x4_END - b CGEMM_L2x4_BEGIN - MY_ALIGN - - -CGEMM_L2x8_END: -/*----------------------------------------*/ - - -CGEMM_L2x4_BEGIN: -/*----------------------------------------*/ - andi. T2, M, 7 - ble CGEMM_L2x1_END - andi. T1, M, 4 - ble CGEMM_L2x4_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,4,2 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T1-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO2x4 - ble CGEMM_L2x4_SUB0 - bl CGEMM_2x4_LMAIN_SUB - andi. 
L, T1, 31 - ble CGEMM_L2x4_SAVE - b CGEMM_L2x4_SUB2 - - -CGEMM_L2x4_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP2x4_32K - addi BO,BO,-16 - addi AO,AO,-32 - LOAD2x4O 32,16 - END2x4_WITHOUT_ADD - LOAD2x4_2O 64, 32 - mtctr T8 - bl CGEMM_L2x4_K32 - b CGEMM_L2x4_SAVE - CMP2x4_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne CGEMM_L2x4_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-32 - addi AO,AO,-64 - LOAD2x4_2O 64,32 - bl CGEMM_L2x4_K32 - b CGEMM_L2x4_SAVE - MY_ALIGN - MY_ALIGN - - -CGEMM_L2x4_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L2x4_SUB2_8 - bl CGEMM_2x4_L16_SUB - MY_ALIGN - - -CGEMM_L2x4_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L2x4_SUB2_4 - bl CGEMM_2x4_L8_SUB - MY_ALIGN - - -CGEMM_L2x4_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L2x4_SUB2_2 - LOAD2x4_2 - KERNEL2x4_L2 64,32, 0,0 - KERNEL2x4_E2 64,32, 1,1 - MY_ALIGN - - -CGEMM_L2x4_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L2x4_SUB2_1 - LOAD2x4_2 - KERNEL2x4_E2 64,32, 0,1 - MY_ALIGN - - -CGEMM_L2x4_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L2x4_SAVE - KERNEL2x4 - - -CGEMM_L2x4_SAVE: -/*----------------------------------------*/ - SAVE2x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,2 -#endif - - -CGEMM_L2x4_END: -/*----------------------------------------*/ - - -CGEMM_L2x2_BEGIN: -/*----------------------------------------*/ - andi. T1, M, 2 - ble CGEMM_L2x2_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,2,2 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T1-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO2x2 - ble CGEMM_L2x2_SUB0 - bl CGEMM_2x2_LMAIN_SUB - andi. L, T1, 31 - ble CGEMM_L2x2_SAVE - b CGEMM_L2x2_SUB2 - - -CGEMM_L2x2_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP2x2_32K - addi BO,BO,-16 - addi AO,AO,-16 - LOAD2x2O 16,16 - END2x2_WITHOUT_ADD - LOAD2x2_2O 32, 32 - mtctr T8 - bl CGEMM_L2x2_K32 - b CGEMM_L2x2_SAVE - CMP2x2_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne CGEMM_L2x2_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-32 - addi AO,AO,-32 - LOAD2x2_2O 32,32 - bl CGEMM_L2x2_K32 - b CGEMM_L2x2_SAVE - MY_ALIGN - MY_ALIGN - - -CGEMM_L2x2_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L2x2_SUB2_8 - bl CGEMM_2x2_L16_SUB - MY_ALIGN - - -CGEMM_L2x2_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L2x2_SUB2_4 - bl CGEMM_2x2_L8_SUB - MY_ALIGN - - -CGEMM_L2x2_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L2x2_SUB2_2 - LOAD2x2_2 - KERNEL2x2_L2 32,32, 0,0 - KERNEL2x2_E2 32,32, 1,1 - MY_ALIGN - - -CGEMM_L2x2_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L2x2_SUB2_1 - LOAD2x2_2 - KERNEL2x2_E2 32,32, 0,1 - MY_ALIGN - - -CGEMM_L2x2_SUB2_1: -/*----------------------------------------*/ - andi. 
T1,L, 1 - ble CGEMM_L2x2_SAVE - KERNEL2x2 - - MY_ALIGN -CGEMM_L2x2_SAVE: -/*----------------------------------------*/ - SAVE2x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,2 -#endif - - -CGEMM_L2x2_END: -/*----------------------------------------*/ - - -CGEMM_L2x1_BEGIN: -/*----------------------------------------*/ - andi. T1, M, 1 - ble CGEMM_L2x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,1,2 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T1-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO2x1 - ble CGEMM_L2x1_SUB0 - bl CGEMM_2x1_LMAIN_SUB - andi. L, T1, 31 - ble CGEMM_L2x1_SAVE - b CGEMM_L2x1_SUB2 - - -CGEMM_L2x1_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP2x1_32K - addi BO,BO,-16 - addi AO,AO,-8 - LOAD2x1O 8,16 - END2x1_WITHOUT_ADD - LOAD2x1_2O 16, 32 - mtctr T8 - bl CGEMM_L2x1_K32 - b CGEMM_L2x1_SAVE - CMP2x1_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne CGEMM_L2x1_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-32 - addi AO,AO,-16 - LOAD2x1_2O 16,32 - bl CGEMM_L2x1_K32 - b CGEMM_L2x1_SAVE - MY_ALIGN - MY_ALIGN - - -CGEMM_L2x1_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L2x1_SUB2_8 - bl CGEMM_2x1_L16_SUB - MY_ALIGN - - -CGEMM_L2x1_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L2x1_SUB2_4 - bl CGEMM_2x1_L8_SUB - MY_ALIGN - - -CGEMM_L2x1_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L2x1_SUB2_2 - LOAD2x1_2 - KERNEL2x1_L2 16,32, 0,0 - KERNEL2x1_E2 16,32, 1,1 - MY_ALIGN - - -CGEMM_L2x1_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L2x1_SUB2_1 - LOAD2x1_2 - KERNEL2x1_E2 16,32, 0,1 - MY_ALIGN - - -CGEMM_L2x1_SUB2_1: -/*----------------------------------------*/ - andi. 
T1,L, 1 - ble CGEMM_L2x1_SAVE - KERNEL2x1 - - MY_ALIGN -CGEMM_L2x1_SAVE: -/*----------------------------------------*/ - - SAVE2x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,2 -#endif - - -CGEMM_L2x1_END: -/*----------------------------------------*/ - slwi T1, K, 4 - - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 2 -#endif - -CGEMM_L2_END: - - -b CGEMM_L1 -/* MINI SUBROUTINES */ -/* 1x8 MAIN 128x+2 LOOP */ - - -CGEMM_L1x8_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD1x8_2 - MY_ALIGN -CGEMM_L1x8_LOOP: -/*----------------------------------------*/ - dcbt AO, PRE - dcbt BO, PRE - KERNEL1x8_L2 128,16,0,0 -CGEMM_L1x8_K128: -/*----------------------------------------*/ - KERNEL1x8_L2 128,16,1,0 - dcbt AO, T2 - KERNEL1x8_L2 128,16,2,0 - KERNEL1x8_L2 128,16,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL1x8_L2 128,16,4,0 - KERNEL1x8_L2 128,16,5,0 - dcbt AO, T4 - KERNEL1x8_L2 128,16,6,0 - KERNEL1x8_L2 128,16,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL1x8_L2 128,16,8,0 - KERNEL1x8_L2 128,16,9,0 - KERNEL1x8_L2 128,16,10,0 - KERNEL1x8_L2 128,16,11,0 - dcbt BO, T4 - KERNEL1x8_L2 128,16,12,0 - KERNEL1x8_L2 128,16,13,0 - KERNEL1x8_L2 128,16,14,0 - KERNEL1x8_L2 128,16,15,0 - KERNEL1x8_L2 128,16,16,0 - KERNEL1x8_L2 128,16,17,0 - KERNEL1x8_L2 128,16,18,0 - KERNEL1x8_L2 128,16,19,0 - KERNEL1x8_L2 128,16,20,0 - KERNEL1x8_L2 128,16,21,0 - KERNEL1x8_L2 128,16,22,0 - KERNEL1x8_L2 128,16,23,0 - KERNEL1x8_L2 128,16,24,0 - KERNEL1x8_L2 128,16,25,0 - KERNEL1x8_L2 128,16,26,0 - KERNEL1x8_L2 128,16,27,0 - KERNEL1x8_L2 128,16,28,0 - KERNEL1x8_L2 128,16,29,0 - KERNEL1x8_L2 128,16,30,0 - KERNEL1x8_L2 128,16,31,0 - KERNEL1x8_L2 128,16,32,0 - KERNEL1x8_L2 128,16,33,0 - KERNEL1x8_L2 128,16,34,0 - KERNEL1x8_L2 128,16,35,0 - KERNEL1x8_L2 128,16,36,0 - KERNEL1x8_L2 128,16,37,0 - KERNEL1x8_L2 128,16,38,0 - KERNEL1x8_L2 128,16,39,0 - KERNEL1x8_L2 128,16,40,0 - KERNEL1x8_L2 128,16,41,0 - KERNEL1x8_L2 128,16,42,0 - KERNEL1x8_L2 128,16,43,0 - KERNEL1x8_L2 128,16,44,0 - KERNEL1x8_L2 128,16,45,0 - KERNEL1x8_L2 128,16,46,0 - KERNEL1x8_L2 128,16,47,0 - KERNEL1x8_L2 128,16,48,0 - KERNEL1x8_L2 128,16,49,0 - KERNEL1x8_L2 128,16,50,0 - KERNEL1x8_L2 128,16,51,0 - KERNEL1x8_L2 128,16,52,0 - KERNEL1x8_L2 128,16,53,0 - KERNEL1x8_L2 128,16,54,0 - KERNEL1x8_L2 128,16,55,0 - KERNEL1x8_L2 128,16,56,0 - KERNEL1x8_L2 128,16,57,0 - KERNEL1x8_L2 128,16,58,0 - KERNEL1x8_L2 128,16,59,0 - KERNEL1x8_L2 128,16,60,0 - KERNEL1x8_L2 128,16,61,0 - KERNEL1x8_L2 128,16,62,0 - KERNEL1x8_L2 128,16,63,1 - bdnz CGEMM_L1x8_LOOP - MY_ALIGN -CGEMM_L1x8_LOOP_END: -/*----------------------------------------*/ - END1x8_2 - blr - MY_ALIGN - - -CGEMM_1x8_L64_SUB: -/*----------------------------------------*/ - LOAD1x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL1x8_L2 128,16,0,0 - KERNEL1x8_L2 128,16,1,0 - dcbt AO, T2 - KERNEL1x8_L2 128,16,2,0 - KERNEL1x8_L2 128,16,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL1x8_L2 128,16,4,0 - KERNEL1x8_L2 128,16,5,0 - dcbt AO, T4 - KERNEL1x8_L2 128,16,6,0 - KERNEL1x8_L2 128,16,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL1x8_L2 128,16,8,0 - KERNEL1x8_L2 128,16,9,0 - KERNEL1x8_L2 128,16,10,0 - KERNEL1x8_L2 128,16,11,0 - dcbt BO, T4 - KERNEL1x8_L2 128,16,12,0 - KERNEL1x8_L2 128,16,13,0 - KERNEL1x8_L2 128,16,14,0 - KERNEL1x8_L2 128,16,15,0 - KERNEL1x8_L2 128,16,16,0 - KERNEL1x8_L2 128,16,17,0 - KERNEL1x8_L2 128,16,18,0 - KERNEL1x8_L2 128,16,19,0 - KERNEL1x8_L2 128,16,20,0 - KERNEL1x8_L2 128,16,21,0 - KERNEL1x8_L2 128,16,22,0 - KERNEL1x8_L2 128,16,23,0 - KERNEL1x8_L2 
128,16,24,0 - KERNEL1x8_L2 128,16,25,0 - KERNEL1x8_L2 128,16,26,0 - KERNEL1x8_L2 128,16,27,0 - KERNEL1x8_L2 128,16,28,0 - KERNEL1x8_L2 128,16,29,0 - KERNEL1x8_L2 128,16,30,0 - KERNEL1x8_E2 128,16,31,1 - blr - MY_ALIGN - - -CGEMM_1x8_L32_SUB: -/*----------------------------------------*/ - LOAD1x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL1x8_L2 128,16,0,0 - KERNEL1x8_L2 128,16,1,0 - dcbt AO, T2 - KERNEL1x8_L2 128,16,2,0 - KERNEL1x8_L2 128,16,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL1x8_L2 128,16,4,0 - KERNEL1x8_L2 128,16,5,0 - dcbt AO, T4 - KERNEL1x8_L2 128,16,6,0 - KERNEL1x8_L2 128,16,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL1x8_L2 128,16,8,0 - KERNEL1x8_L2 128,16,9,0 - KERNEL1x8_L2 128,16,10,0 - KERNEL1x8_L2 128,16,11,0 - dcbt BO, T4 - KERNEL1x8_L2 128,16,12,0 - KERNEL1x8_L2 128,16,13,0 - KERNEL1x8_L2 128,16,14,0 - KERNEL1x8_E2 128,16,15,1 - blr - MY_ALIGN - - -CGEMM_1x8_L16_SUB: -/*----------------------------------------*/ - LOAD1x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL1x8_L2 128,16,0,0 - KERNEL1x8_L2 128,16,1,0 - dcbt AO, T2 - KERNEL1x8_L2 128,16,2,0 - KERNEL1x8_L2 128,16,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL1x8_L2 128,16,4,0 - KERNEL1x8_L2 128,16,5,0 - dcbt AO, T4 - KERNEL1x8_L2 128,16,6,0 - KERNEL1x8_E2 128,16,7,1 - blr - MY_ALIGN - - -CGEMM_1x4_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD1x4_2 - MY_ALIGN -CGEMM_L1x4_LOOP: -/*----------------------------------------*/ - KERNEL1x4_L2 64,16,0,0 -CGEMM_L1x4_K32: -/*----------------------------------------*/ - KERNEL1x4_L2 64,16,1,0 - KERNEL1x4_L2 64,16,2,0 - KERNEL1x4_L2 64,16,3,0 - KERNEL1x4_L2 64,16,4,0 - KERNEL1x4_L2 64,16,5,0 - KERNEL1x4_L2 64,16,6,0 - KERNEL1x4_L2 64,16,7,0 - KERNEL1x4_L2 64,16,8,0 - KERNEL1x4_L2 64,16,9,0 - KERNEL1x4_L2 64,16,10,0 - KERNEL1x4_L2 64,16,11,0 - KERNEL1x4_L2 64,16,12,0 - KERNEL1x4_L2 64,16,13,0 - KERNEL1x4_L2 64,16,14,0 - KERNEL1x4_L2 64,16,15,1 - bdnz CGEMM_L1x4_LOOP - MY_ALIGN -CGEMM_L1x4_LOOP_END: -/*----------------------------------------*/ - END1x4_2 - blr - MY_ALIGN - - -CGEMM_1x4_L16_SUB: -/*----------------------------------------*/ - LOAD1x4_2 - KERNEL1x4_L2 64,16,0,0 - KERNEL1x4_L2 64,16,1,0 - KERNEL1x4_L2 64,16,2,0 - KERNEL1x4_L2 64,16,3,0 - KERNEL1x4_L2 64,16,4,0 - KERNEL1x4_L2 64,16,5,0 - KERNEL1x4_L2 64,16,6,0 - KERNEL1x4_E2 64,16,7,1 - blr - MY_ALIGN - - -CGEMM_1x4_L8_SUB: -/*----------------------------------------*/ - LOAD1x4_2 - KERNEL1x4_L2 64,16,0,0 - KERNEL1x4_L2 64,16,1,0 - KERNEL1x4_L2 64,16,2,0 - KERNEL1x4_E2 64,16,3,1 - blr - - -CGEMM_1x2_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD1x2_2 - MY_ALIGN -CGEMM_L1x2_LOOP: -/*----------------------------------------*/ - KERNEL1x2_L2 32,16,0,0 -CGEMM_L1x2_K32: -/*----------------------------------------*/ - KERNEL1x2_L2 32,16,1,0 - KERNEL1x2_L2 32,16,2,0 - KERNEL1x2_L2 32,16,3,0 - KERNEL1x2_L2 32,16,4,0 - KERNEL1x2_L2 32,16,5,0 - KERNEL1x2_L2 32,16,6,0 - KERNEL1x2_L2 32,16,7,0 - KERNEL1x2_L2 32,16,8,0 - KERNEL1x2_L2 32,16,9,0 - KERNEL1x2_L2 32,16,10,0 - KERNEL1x2_L2 32,16,11,0 - KERNEL1x2_L2 32,16,12,0 - KERNEL1x2_L2 32,16,13,0 - KERNEL1x2_L2 32,16,14,0 - KERNEL1x2_L2 32,16,15,1 - bdnz CGEMM_L1x2_LOOP - MY_ALIGN - - -CGEMM_L1x2_LOOP_END: -/*----------------------------------------*/ - END1x2_2 - blr - MY_ALIGN -CGEMM_1x2_L16_SUB: -/*----------------------------------------*/ - LOAD1x2_2 - KERNEL1x2_L2 32,16,0,0 - KERNEL1x2_L2 32,16,1,0 - KERNEL1x2_L2 32,16,2,0 - KERNEL1x2_L2 32,16,3,0 - KERNEL1x2_L2 32,16,4,0 - KERNEL1x2_L2 32,16,5,0 - KERNEL1x2_L2 32,16,6,0 - 
KERNEL1x2_E2 32,16,7,1 - blr - MY_ALIGN -CGEMM_1x2_L8_SUB: -/*----------------------------------------*/ - LOAD1x2_2 - KERNEL1x2_L2 32,16,0,0 - KERNEL1x2_L2 32,16,1,0 - KERNEL1x2_L2 32,16,2,0 - KERNEL1x2_E2 32,16,3,1 - blr - - -CGEMM_1x1_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD1x1_2 - MY_ALIGN -CGEMM_L1x1_LOOP: -/*----------------------------------------*/ - KERNEL1x1_L2 16,16,0,0 -CGEMM_L1x1_K32: -/*----------------------------------------*/ - KERNEL1x1_L2 16,16,1,0 - KERNEL1x1_L2 16,16,2,0 - KERNEL1x1_L2 16,16,3,0 - KERNEL1x1_L2 16,16,4,0 - KERNEL1x1_L2 16,16,5,0 - KERNEL1x1_L2 16,16,6,0 - KERNEL1x1_L2 16,16,7,0 - KERNEL1x1_L2 16,16,8,0 - KERNEL1x1_L2 16,16,9,0 - KERNEL1x1_L2 16,16,10,0 - KERNEL1x1_L2 16,16,11,0 - KERNEL1x1_L2 16,16,12,0 - KERNEL1x1_L2 16,16,13,0 - KERNEL1x1_L2 16,16,14,0 - KERNEL1x1_L2 16,16,15,1 - bdnz CGEMM_L1x1_LOOP - MY_ALIGN -CGEMM_L1x1_LOOP_END: -/*----------------------------------------*/ - END1x1_2 - blr - - MY_ALIGN -CGEMM_1x1_L16_SUB: -/*----------------------------------------*/ - LOAD1x1_2 - KERNEL1x1_L2 16,16,0,0 - KERNEL1x1_L2 16,16,1,0 - KERNEL1x1_L2 16,16,2,0 - KERNEL1x1_L2 16,16,3,0 - KERNEL1x1_L2 16,16,4,0 - KERNEL1x1_L2 16,16,5,0 - KERNEL1x1_L2 16,16,6,0 - KERNEL1x1_E2 16,16,7,1 - blr - MY_ALIGN - - -CGEMM_1x1_L8_SUB: -/*----------------------------------------*/ - LOAD1x1_2 - KERNEL1x1_L2 16,16,0,0 - KERNEL1x1_L2 16,16,1,0 - KERNEL1x1_L2 16,16,2,0 - KERNEL1x1_E2 16,16,3,1 - blr - - - -/* MAIN LOOP BEGINS */ - MY_ALIGN - - -CGEMM_L1: -/*----------------------------------------*/ - - andi. J, N, 1 - ble CGEMM_L1_END - -CGEMM_L1_BEGIN: -/*----------------------------------------*/ - mr CO, C - add T2,C,LDC - mr AO, A - add C, C, T1 -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. I, M, 3 - ble CGEMM_L1x8_END - dcbt CO,r0 /*just prefetch*/ - dcbt T2,r0 - - -CGEMM_L1x8_BEGIN: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 -#else - mr BO, B - dcbt B, r0 -#endif - dcbt AO, r0 -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,8,1 - mr T1, T6 -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(T1-2) % 128x */ -#else - mr T1, K -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(K-2) % 128x */ -#endif - ZERO1x8 - ble CGEMM_L1x8_SUB0 - bl CGEMM_L1x8_LMAIN_SUB - andi. L, T1, 127 - ble CGEMM_L1x8_SAVE - b CGEMM_L1x8_SUB2 - - -CGEMM_L1x8_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 255 - cmpwi T6,129 -#else - andi. L, K, 255 - cmpwi K,129 -#endif - li T8,1 - bne CMP1x8_128K - addi BO,BO,-8 - addi AO,AO,-64 - LOAD1x8O 64,8 - END1x8_WITHOUT_ADD - LOAD1x8_2O 128, 16 - mtctr T8 - bl CGEMM_L1x8_K128 - b CGEMM_L1x8_SAVE - CMP1x8_128K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,128 -#else - cmpwi K,128 -#endif - bne CGEMM_L1x8_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-16 - addi AO,AO,-128 - LOAD1x8_2O 128,16 - bl CGEMM_L1x8_K128 - b CGEMM_L1x8_SAVE - MY_ALIGN - - -CGEMM_L1x8_SUB2: -/*----------------------------------------*/ - andi. T1,L, 64 - ble CGEMM_L1x8_SUB2_32 - bl CGEMM_1x8_L64_SUB - MY_ALIGN - - -CGEMM_L1x8_SUB2_32: -/*----------------------------------------*/ - andi. 
T1,L, 32 - ble CGEMM_L1x8_SUB2_16 - bl CGEMM_1x8_L32_SUB - MY_ALIGN - - -CGEMM_L1x8_SUB2_16: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L1x8_SUB2_8 - bl CGEMM_1x8_L16_SUB - MY_ALIGN - - -CGEMM_L1x8_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L1x8_SUB2_4 - LOAD1x8_2 - KERNEL1x8_L2 128,16, 0,0 - KERNEL1x8_L2 128,16, 1,0 - KERNEL1x8_L2 128,16, 2,0 - KERNEL1x8_E2 128,16, 3,1 - MY_ALIGN - - -CGEMM_L1x8_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L1x8_SUB2_2 - LOAD1x8_2 - KERNEL1x8_L2 128,16, 0,0 - KERNEL1x8_E2 128,16, 1,1 - MY_ALIGN - - -CGEMM_L1x8_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L1x8_SUB2_1 - LOAD1x8_2 - KERNEL1x8_E2 128,16, 0,1 - MY_ALIGN - - -CGEMM_L1x8_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L1x8_SAVE - KERNEL1x8 - - MY_ALIGN -CGEMM_L1x8_SAVE: -/*----------------------------------------*/ - addic. I, I, -1 - MY_ALIGN - SAVE1x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,1 -#endif - bgt CGEMM_L1x8_BEGIN - andi. T2, M, 7 - ble CGEMM_L1x1_END - andi. T1, M, 4 - ble CGEMM_L1x4_END - b CGEMM_L1x4_BEGIN - MY_ALIGN - - -CGEMM_L1x8_END: -/*----------------------------------------*/ - - -CGEMM_L1x4_BEGIN: -/*----------------------------------------*/ - andi. T2, M, 7 - ble CGEMM_L1x1_END - andi. T1, M, 4 - ble CGEMM_L1x4_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,4,1 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T1-2) % 31x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 31x */ -#endif - ZERO1x4 - ble CGEMM_L1x4_SUB0 - bl CGEMM_1x4_LMAIN_SUB - andi. L, T1, 31 - ble CGEMM_L1x4_SAVE - b CGEMM_L1x4_SUB2 - - -CGEMM_L1x4_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP1x4_32K - addi BO,BO,-8 - addi AO,AO,-32 - LOAD1x4O 32,8 - END1x4_WITHOUT_ADD - LOAD1x4_2O 64, 16 - mtctr T8 - bl CGEMM_L1x4_K32 - b CGEMM_L1x4_SAVE - CMP1x4_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne CGEMM_L1x4_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-16 - addi AO,AO,-64 - LOAD1x4_2O 64,16 - bl CGEMM_L1x4_K32 - b CGEMM_L1x4_SAVE - MY_ALIGN - MY_ALIGN - - -CGEMM_L1x4_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L1x4_SUB2_8 - bl CGEMM_1x4_L16_SUB - MY_ALIGN - - -CGEMM_L1x4_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L1x4_SUB2_4 - bl CGEMM_1x4_L8_SUB - MY_ALIGN - - -CGEMM_L1x4_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L1x4_SUB2_2 - LOAD1x4_2 - KERNEL1x4_L2 64,16, 0,0 - KERNEL1x4_E2 64,16, 1,1 - MY_ALIGN - - -CGEMM_L1x4_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L1x4_SUB2_1 - LOAD1x4_2 - KERNEL1x4_E2 64,16, 0,1 - MY_ALIGN - - -CGEMM_L1x4_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L1x4_SAVE - KERNEL1x4 - - -CGEMM_L1x4_SAVE: -/*----------------------------------------*/ - SAVE1x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,1 -#endif - - -CGEMM_L1x4_END: -/*----------------------------------------*/ - - -CGEMM_L1x2_BEGIN: -/*----------------------------------------*/ - andi. 
T1, M, 2 - ble CGEMM_L1x2_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,2,1 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T1-2) % 31x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 31x */ -#endif - ZERO1x2 - ble CGEMM_L1x2_SUB0 - bl CGEMM_1x2_LMAIN_SUB - andi. L, T1, 31 - ble CGEMM_L1x2_SAVE - b CGEMM_L1x2_SUB2 - - -CGEMM_L1x2_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP1x2_32K - addi BO,BO,-8 - addi AO,AO,-16 - LOAD1x2O 16,8 - END1x2_WITHOUT_ADD - LOAD1x2_2O 32, 16 - mtctr T8 - bl CGEMM_L1x2_K32 - b CGEMM_L1x2_SAVE - CMP1x2_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne CGEMM_L1x2_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-16 - addi AO,AO,-32 - LOAD1x2_2O 32,16 - bl CGEMM_L1x2_K32 - b CGEMM_L1x2_SAVE - MY_ALIGN - MY_ALIGN - - -CGEMM_L1x2_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L1x2_SUB2_8 - bl CGEMM_1x2_L16_SUB - MY_ALIGN - - -CGEMM_L1x2_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L1x2_SUB2_4 - bl CGEMM_1x2_L8_SUB - MY_ALIGN - - -CGEMM_L1x2_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L1x2_SUB2_2 - LOAD1x2_2 - KERNEL1x2_L2 32,16, 0,0 - KERNEL1x2_E2 32,16, 1,1 - MY_ALIGN - - -CGEMM_L1x2_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L1x2_SUB2_1 - LOAD1x2_2 - KERNEL1x2_E2 32,16, 0,1 - MY_ALIGN - - -CGEMM_L1x2_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L1x2_SAVE - KERNEL1x2 - - MY_ALIGN -CGEMM_L1x2_SAVE: -/*----------------------------------------*/ - SAVE1x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,1 -#endif - - -CGEMM_L1x2_END: -/*----------------------------------------*/ - - -CGEMM_L1x1_BEGIN: -/*----------------------------------------*/ - andi. T1, M, 1 - ble CGEMM_L1x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,1,1 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T1-2) % 31x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 31x */ -#endif - ZERO1x1 - ble CGEMM_L1x1_SUB0 - bl CGEMM_1x1_LMAIN_SUB - andi. L, T1, 31 - ble CGEMM_L1x1_SAVE - b CGEMM_L1x1_SUB2 - - -CGEMM_L1x1_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP1x1_32K - addi BO,BO,-8 - addi AO,AO,-8 - LOAD1x1O 8,8 - END1x1_WITHOUT_ADD - LOAD1x1_2O 16, 16 - mtctr T8 - bl CGEMM_L1x1_K32 - b CGEMM_L1x1_SAVE - CMP1x1_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne CGEMM_L1x1_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-16 - addi AO,AO,-16 - LOAD1x1_2O 16,16 - bl CGEMM_L1x1_K32 - b CGEMM_L1x1_SAVE - MY_ALIGN - MY_ALIGN - - -CGEMM_L1x1_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L1x1_SUB2_8 - bl CGEMM_1x1_L16_SUB - MY_ALIGN - - -CGEMM_L1x1_SUB2_8: -/*----------------------------------------*/ - andi. 
T1,L, 8 - ble CGEMM_L1x1_SUB2_4 - bl CGEMM_1x1_L8_SUB - MY_ALIGN - - -CGEMM_L1x1_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L1x1_SUB2_2 - LOAD1x1_2 - KERNEL1x1_L2 16,16, 0,0 - KERNEL1x1_E2 16,16, 1,1 - MY_ALIGN - - -CGEMM_L1x1_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L1x1_SUB2_1 - LOAD1x1_2 - KERNEL1x1_E2 16,16, 0,1 - MY_ALIGN - - -CGEMM_L1x1_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L1x1_SAVE - KERNEL1x1 - - MY_ALIGN -CGEMM_L1x1_SAVE: -/*----------------------------------------*/ - - SAVE1x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,1 -#endif - - -CGEMM_L1x1_END: -/*----------------------------------------*/ - slwi T1, K, 3 - - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 1 -#endif - -CGEMM_L1_END: - - - - +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* Abdelrauf(quickwritereader@gmail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ +#define MY_ALIGN .align 3 +b CGEMM_L4 +/* MINI SUBROUTINES */ +/* 4x8 MAIN 128x+2 LOOP */ + + +CGEMM_L4x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD4x8_2 + MY_ALIGN +CGEMM_L4x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL4x8_L2 128,64,0,0 +CGEMM_L4x8_K128: +/*----------------------------------------*/ + KERNEL4x8_L2 128,64,1,0 + dcbt AO, T2 + KERNEL4x8_L2 128,64,2,0 + KERNEL4x8_L2 128,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL4x8_L2 128,64,4,0 + KERNEL4x8_L2 128,64,5,0 + dcbt AO, T4 + KERNEL4x8_L2 128,64,6,0 + KERNEL4x8_L2 128,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL4x8_L2 128,64,8,0 + KERNEL4x8_L2 128,64,9,0 + KERNEL4x8_L2 128,64,10,0 + KERNEL4x8_L2 128,64,11,0 + dcbt BO, T4 + KERNEL4x8_L2 128,64,12,0 + KERNEL4x8_L2 128,64,13,0 + KERNEL4x8_L2 128,64,14,0 + KERNEL4x8_L2 128,64,15,0 + KERNEL4x8_L2 128,64,16,0 + KERNEL4x8_L2 128,64,17,0 + KERNEL4x8_L2 128,64,18,0 + KERNEL4x8_L2 128,64,19,0 + KERNEL4x8_L2 128,64,20,0 + KERNEL4x8_L2 128,64,21,0 + KERNEL4x8_L2 128,64,22,0 + KERNEL4x8_L2 128,64,23,0 + KERNEL4x8_L2 128,64,24,0 + KERNEL4x8_L2 128,64,25,0 + KERNEL4x8_L2 128,64,26,0 + KERNEL4x8_L2 128,64,27,0 + KERNEL4x8_L2 128,64,28,0 + KERNEL4x8_L2 128,64,29,0 + KERNEL4x8_L2 128,64,30,0 + KERNEL4x8_L2 128,64,31,0 + KERNEL4x8_L2 128,64,32,0 + KERNEL4x8_L2 128,64,33,0 + KERNEL4x8_L2 128,64,34,0 + KERNEL4x8_L2 128,64,35,0 + KERNEL4x8_L2 128,64,36,0 + KERNEL4x8_L2 128,64,37,0 + KERNEL4x8_L2 128,64,38,0 + KERNEL4x8_L2 128,64,39,0 + KERNEL4x8_L2 128,64,40,0 + KERNEL4x8_L2 128,64,41,0 + KERNEL4x8_L2 128,64,42,0 + KERNEL4x8_L2 128,64,43,0 + KERNEL4x8_L2 128,64,44,0 + KERNEL4x8_L2 128,64,45,0 + KERNEL4x8_L2 128,64,46,0 + KERNEL4x8_L2 128,64,47,0 + KERNEL4x8_L2 128,64,48,0 + KERNEL4x8_L2 128,64,49,0 + KERNEL4x8_L2 128,64,50,0 + KERNEL4x8_L2 128,64,51,0 + KERNEL4x8_L2 128,64,52,0 + KERNEL4x8_L2 128,64,53,0 + KERNEL4x8_L2 128,64,54,0 + KERNEL4x8_L2 128,64,55,0 + KERNEL4x8_L2 128,64,56,0 + KERNEL4x8_L2 128,64,57,0 + KERNEL4x8_L2 128,64,58,0 + KERNEL4x8_L2 128,64,59,0 + KERNEL4x8_L2 128,64,60,0 + KERNEL4x8_L2 128,64,61,0 + KERNEL4x8_L2 128,64,62,0 + KERNEL4x8_L2 128,64,63,1 + bdnz CGEMM_L4x8_LOOP + MY_ALIGN +CGEMM_L4x8_LOOP_END: +/*----------------------------------------*/ + END4x8_2 + blr + MY_ALIGN + + +CGEMM_4x8_L64_SUB: +/*----------------------------------------*/ + LOAD4x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL4x8_L2 128,64,0,0 + KERNEL4x8_L2 128,64,1,0 + dcbt AO, T2 + KERNEL4x8_L2 128,64,2,0 + KERNEL4x8_L2 128,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL4x8_L2 128,64,4,0 + KERNEL4x8_L2 128,64,5,0 + dcbt AO, T4 + KERNEL4x8_L2 128,64,6,0 + KERNEL4x8_L2 128,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL4x8_L2 128,64,8,0 + KERNEL4x8_L2 128,64,9,0 + KERNEL4x8_L2 128,64,10,0 + KERNEL4x8_L2 128,64,11,0 + dcbt BO, T4 + KERNEL4x8_L2 128,64,12,0 + KERNEL4x8_L2 128,64,13,0 + KERNEL4x8_L2 128,64,14,0 + KERNEL4x8_L2 128,64,15,0 + KERNEL4x8_L2 128,64,16,0 + KERNEL4x8_L2 128,64,17,0 + KERNEL4x8_L2 128,64,18,0 + KERNEL4x8_L2 128,64,19,0 + KERNEL4x8_L2 128,64,20,0 + KERNEL4x8_L2 128,64,21,0 + KERNEL4x8_L2 128,64,22,0 + KERNEL4x8_L2 128,64,23,0 + KERNEL4x8_L2 128,64,24,0 + 
KERNEL4x8_L2 128,64,25,0 + KERNEL4x8_L2 128,64,26,0 + KERNEL4x8_L2 128,64,27,0 + KERNEL4x8_L2 128,64,28,0 + KERNEL4x8_L2 128,64,29,0 + KERNEL4x8_L2 128,64,30,0 + KERNEL4x8_E2 128,64,31,1 + blr + MY_ALIGN + + +CGEMM_4x8_L32_SUB: +/*----------------------------------------*/ + LOAD4x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL4x8_L2 128,64,0,0 + KERNEL4x8_L2 128,64,1,0 + dcbt AO, T2 + KERNEL4x8_L2 128,64,2,0 + KERNEL4x8_L2 128,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL4x8_L2 128,64,4,0 + KERNEL4x8_L2 128,64,5,0 + dcbt AO, T4 + KERNEL4x8_L2 128,64,6,0 + KERNEL4x8_L2 128,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL4x8_L2 128,64,8,0 + KERNEL4x8_L2 128,64,9,0 + KERNEL4x8_L2 128,64,10,0 + KERNEL4x8_L2 128,64,11,0 + dcbt BO, T4 + KERNEL4x8_L2 128,64,12,0 + KERNEL4x8_L2 128,64,13,0 + KERNEL4x8_L2 128,64,14,0 + KERNEL4x8_E2 128,64,15,1 + blr + MY_ALIGN + + +CGEMM_4x8_L16_SUB: +/*----------------------------------------*/ + LOAD4x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL4x8_L2 128,64,0,0 + KERNEL4x8_L2 128,64,1,0 + dcbt AO, T2 + KERNEL4x8_L2 128,64,2,0 + KERNEL4x8_L2 128,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL4x8_L2 128,64,4,0 + KERNEL4x8_L2 128,64,5,0 + dcbt AO, T4 + KERNEL4x8_L2 128,64,6,0 + KERNEL4x8_E2 128,64,7,1 + blr + MY_ALIGN + + +CGEMM_4x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD4x4_2 + MY_ALIGN +CGEMM_L4x4_LOOP: +/*----------------------------------------*/ + KERNEL4x4_L2 64,64,0,0 +CGEMM_L4x4_K32: +/*----------------------------------------*/ + KERNEL4x4_L2 64,64,1,0 + KERNEL4x4_L2 64,64,2,0 + KERNEL4x4_L2 64,64,3,0 + KERNEL4x4_L2 64,64,4,0 + KERNEL4x4_L2 64,64,5,0 + KERNEL4x4_L2 64,64,6,0 + KERNEL4x4_L2 64,64,7,0 + KERNEL4x4_L2 64,64,8,0 + KERNEL4x4_L2 64,64,9,0 + KERNEL4x4_L2 64,64,10,0 + KERNEL4x4_L2 64,64,11,0 + KERNEL4x4_L2 64,64,12,0 + KERNEL4x4_L2 64,64,13,0 + KERNEL4x4_L2 64,64,14,0 + KERNEL4x4_L2 64,64,15,1 + bdnz CGEMM_L4x4_LOOP + MY_ALIGN +CGEMM_L4x4_LOOP_END: +/*----------------------------------------*/ + END4x4_2 + blr + MY_ALIGN + + +CGEMM_4x4_L16_SUB: +/*----------------------------------------*/ + LOAD4x4_2 + KERNEL4x4_L2 64,64,0,0 + KERNEL4x4_L2 64,64,1,0 + KERNEL4x4_L2 64,64,2,0 + KERNEL4x4_L2 64,64,3,0 + KERNEL4x4_L2 64,64,4,0 + KERNEL4x4_L2 64,64,5,0 + KERNEL4x4_L2 64,64,6,0 + KERNEL4x4_E2 64,64,7,1 + blr + MY_ALIGN + + +CGEMM_4x4_L8_SUB: +/*----------------------------------------*/ + LOAD4x4_2 + KERNEL4x4_L2 64,64,0,0 + KERNEL4x4_L2 64,64,1,0 + KERNEL4x4_L2 64,64,2,0 + KERNEL4x4_E2 64,64,3,1 + blr + + +CGEMM_4x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD4x2_2 + MY_ALIGN +CGEMM_L4x2_LOOP: +/*----------------------------------------*/ + KERNEL4x2_L2 32,64,0,0 +CGEMM_L4x2_K32: +/*----------------------------------------*/ + KERNEL4x2_L2 32,64,1,0 + KERNEL4x2_L2 32,64,2,0 + KERNEL4x2_L2 32,64,3,0 + KERNEL4x2_L2 32,64,4,0 + KERNEL4x2_L2 32,64,5,0 + KERNEL4x2_L2 32,64,6,0 + KERNEL4x2_L2 32,64,7,0 + KERNEL4x2_L2 32,64,8,0 + KERNEL4x2_L2 32,64,9,0 + KERNEL4x2_L2 32,64,10,0 + KERNEL4x2_L2 32,64,11,0 + KERNEL4x2_L2 32,64,12,0 + KERNEL4x2_L2 32,64,13,0 + KERNEL4x2_L2 32,64,14,0 + KERNEL4x2_L2 32,64,15,1 + bdnz CGEMM_L4x2_LOOP + MY_ALIGN + + +CGEMM_L4x2_LOOP_END: +/*----------------------------------------*/ + END4x2_2 + blr + MY_ALIGN +CGEMM_4x2_L16_SUB: +/*----------------------------------------*/ + LOAD4x2_2 + KERNEL4x2_L2 32,64,0,0 + KERNEL4x2_L2 32,64,1,0 + KERNEL4x2_L2 32,64,2,0 + KERNEL4x2_L2 32,64,3,0 + KERNEL4x2_L2 32,64,4,0 + KERNEL4x2_L2 32,64,5,0 + KERNEL4x2_L2 32,64,6,0 + KERNEL4x2_E2 32,64,7,1 + 
blr + MY_ALIGN +CGEMM_4x2_L8_SUB: +/*----------------------------------------*/ + LOAD4x2_2 + KERNEL4x2_L2 32,64,0,0 + KERNEL4x2_L2 32,64,1,0 + KERNEL4x2_L2 32,64,2,0 + KERNEL4x2_E2 32,64,3,1 + blr + + +CGEMM_4x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD4x1_2 + MY_ALIGN +CGEMM_L4x1_LOOP: +/*----------------------------------------*/ + KERNEL4x1_L2 16,64,0,0 +CGEMM_L4x1_K32: +/*----------------------------------------*/ + KERNEL4x1_L2 16,64,1,0 + KERNEL4x1_L2 16,64,2,0 + KERNEL4x1_L2 16,64,3,0 + KERNEL4x1_L2 16,64,4,0 + KERNEL4x1_L2 16,64,5,0 + KERNEL4x1_L2 16,64,6,0 + KERNEL4x1_L2 16,64,7,0 + KERNEL4x1_L2 16,64,8,0 + KERNEL4x1_L2 16,64,9,0 + KERNEL4x1_L2 16,64,10,0 + KERNEL4x1_L2 16,64,11,0 + KERNEL4x1_L2 16,64,12,0 + KERNEL4x1_L2 16,64,13,0 + KERNEL4x1_L2 16,64,14,0 + KERNEL4x1_L2 16,64,15,1 + bdnz CGEMM_L4x1_LOOP + MY_ALIGN +CGEMM_L4x1_LOOP_END: +/*----------------------------------------*/ + END4x1_2 + blr + + MY_ALIGN +CGEMM_4x1_L16_SUB: +/*----------------------------------------*/ + LOAD4x1_2 + KERNEL4x1_L2 16,64,0,0 + KERNEL4x1_L2 16,64,1,0 + KERNEL4x1_L2 16,64,2,0 + KERNEL4x1_L2 16,64,3,0 + KERNEL4x1_L2 16,64,4,0 + KERNEL4x1_L2 16,64,5,0 + KERNEL4x1_L2 16,64,6,0 + KERNEL4x1_E2 16,64,7,1 + blr + MY_ALIGN + + +CGEMM_4x1_L8_SUB: +/*----------------------------------------*/ + LOAD4x1_2 + KERNEL4x1_L2 16,64,0,0 + KERNEL4x1_L2 16,64,1,0 + KERNEL4x1_L2 16,64,2,0 + KERNEL4x1_E2 16,64,3,1 + blr + + + +/* MAIN LOOP BEGINS */ + MY_ALIGN + + +CGEMM_L4: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) && !defined(LEFT) + neg TEMP_REG, OFFSET +#endif + srawi. J, N, 2 + ble CGEMM_L4_END + + +CGEMM_L4_BEGIN: +/*----------------------------------------*/ + mr CO, C + slwi T1, LDC , 2 + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + ble CGEMM_L4x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +CGEMM_L4x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,4 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(T1-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(K-2) % 128x */ +#endif + ZERO4x8 + ble CGEMM_L4x8_SUB0 + bl CGEMM_L4x8_LMAIN_SUB + andi. L, T1, 127 + ble CGEMM_L4x8_SAVE + b CGEMM_L4x8_SUB2 + + +CGEMM_L4x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi. L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP4x8_128K + addi BO,BO,-32 + addi AO,AO,-64 + LOAD4x8O 64,32 + END4x8_WITHOUT_ADD + LOAD4x8_2O 128, 64 + mtctr T8 + bl CGEMM_L4x8_K128 + b CGEMM_L4x8_SAVE + CMP4x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else + cmpwi K,128 +#endif + bne CGEMM_L4x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-128 + LOAD4x8_2O 128,64 + bl CGEMM_L4x8_K128 + b CGEMM_L4x8_SAVE + MY_ALIGN + + +CGEMM_L4x8_SUB2: +/*----------------------------------------*/ + andi. 
T1,L, 64 + ble CGEMM_L4x8_SUB2_32 + bl CGEMM_4x8_L64_SUB + MY_ALIGN + + +CGEMM_L4x8_SUB2_32: +/*----------------------------------------*/ + andi. T1,L, 32 + ble CGEMM_L4x8_SUB2_16 + bl CGEMM_4x8_L32_SUB + MY_ALIGN + + +CGEMM_L4x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L4x8_SUB2_8 + bl CGEMM_4x8_L16_SUB + MY_ALIGN + + +CGEMM_L4x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L4x8_SUB2_4 + LOAD4x8_2 + KERNEL4x8_L2 128,64, 0,0 + KERNEL4x8_L2 128,64, 1,0 + KERNEL4x8_L2 128,64, 2,0 + KERNEL4x8_E2 128,64, 3,1 + MY_ALIGN + + +CGEMM_L4x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L4x8_SUB2_2 + LOAD4x8_2 + KERNEL4x8_L2 128,64, 0,0 + KERNEL4x8_E2 128,64, 1,1 + MY_ALIGN + + +CGEMM_L4x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L4x8_SUB2_1 + LOAD4x8_2 + KERNEL4x8_E2 128,64, 0,1 + MY_ALIGN + + +CGEMM_L4x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L4x8_SAVE + KERNEL4x8 + + MY_ALIGN +CGEMM_L4x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + MY_ALIGN + SAVE4x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,4 +#endif + bgt CGEMM_L4x8_BEGIN + andi. T2, M, 7 + ble CGEMM_L4x1_END + andi. T1, M, 4 + ble CGEMM_L4x4_END + b CGEMM_L4x4_BEGIN + MY_ALIGN + + +CGEMM_L4x8_END: +/*----------------------------------------*/ + + +CGEMM_L4x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble CGEMM_L4x1_END + andi. T1, M, 4 + ble CGEMM_L4x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,4 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO4x4 + ble CGEMM_L4x4_SUB0 + bl CGEMM_4x4_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L4x4_SAVE + b CGEMM_L4x4_SUB2 + + +CGEMM_L4x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP4x4_32K + addi BO,BO,-32 + addi AO,AO,-32 + LOAD4x4O 32,32 + END4x4_WITHOUT_ADD + LOAD4x4_2O 64, 64 + mtctr T8 + bl CGEMM_L4x4_K32 + b CGEMM_L4x4_SAVE + CMP4x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L4x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-64 + LOAD4x4_2O 64,64 + bl CGEMM_L4x4_K32 + b CGEMM_L4x4_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L4x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L4x4_SUB2_8 + bl CGEMM_4x4_L16_SUB + MY_ALIGN + + +CGEMM_L4x4_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L4x4_SUB2_4 + bl CGEMM_4x4_L8_SUB + MY_ALIGN + + +CGEMM_L4x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L4x4_SUB2_2 + LOAD4x4_2 + KERNEL4x4_L2 64,64, 0,0 + KERNEL4x4_E2 64,64, 1,1 + MY_ALIGN + + +CGEMM_L4x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L4x4_SUB2_1 + LOAD4x4_2 + KERNEL4x4_E2 64,64, 0,1 + MY_ALIGN + + +CGEMM_L4x4_SUB2_1: +/*----------------------------------------*/ + andi. 
T1,L, 1 + ble CGEMM_L4x4_SAVE + KERNEL4x4 + + +CGEMM_L4x4_SAVE: +/*----------------------------------------*/ + SAVE4x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,4 +#endif + + +CGEMM_L4x4_END: +/*----------------------------------------*/ + + +CGEMM_L4x2_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 2 + ble CGEMM_L4x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,4 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO4x2 + ble CGEMM_L4x2_SUB0 + bl CGEMM_4x2_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L4x2_SAVE + b CGEMM_L4x2_SUB2 + + +CGEMM_L4x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP4x2_32K + addi BO,BO,-32 + addi AO,AO,-16 + LOAD4x2O 16,32 + END4x2_WITHOUT_ADD + LOAD4x2_2O 32, 64 + mtctr T8 + bl CGEMM_L4x2_K32 + b CGEMM_L4x2_SAVE + CMP4x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L4x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-32 + LOAD4x2_2O 32,64 + bl CGEMM_L4x2_K32 + b CGEMM_L4x2_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L4x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L4x2_SUB2_8 + bl CGEMM_4x2_L16_SUB + MY_ALIGN + + +CGEMM_L4x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L4x2_SUB2_4 + bl CGEMM_4x2_L8_SUB + MY_ALIGN + + +CGEMM_L4x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L4x2_SUB2_2 + LOAD4x2_2 + KERNEL4x2_L2 32,64, 0,0 + KERNEL4x2_E2 32,64, 1,1 + MY_ALIGN + + +CGEMM_L4x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L4x2_SUB2_1 + LOAD4x2_2 + KERNEL4x2_E2 32,64, 0,1 + MY_ALIGN + + +CGEMM_L4x2_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L4x2_SAVE + KERNEL4x2 + + MY_ALIGN +CGEMM_L4x2_SAVE: +/*----------------------------------------*/ + SAVE4x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,4 +#endif + + +CGEMM_L4x2_END: +/*----------------------------------------*/ + + +CGEMM_L4x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble CGEMM_L4x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,4 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO4x1 + ble CGEMM_L4x1_SUB0 + bl CGEMM_4x1_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L4x1_SAVE + b CGEMM_L4x1_SUB2 + + +CGEMM_L4x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. 
L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP4x1_32K + addi BO,BO,-32 + addi AO,AO,-8 + LOAD4x1O 8,32 + END4x1_WITHOUT_ADD + LOAD4x1_2O 16, 64 + mtctr T8 + bl CGEMM_L4x1_K32 + b CGEMM_L4x1_SAVE + CMP4x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L4x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-16 + LOAD4x1_2O 16,64 + bl CGEMM_L4x1_K32 + b CGEMM_L4x1_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L4x1_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L4x1_SUB2_8 + bl CGEMM_4x1_L16_SUB + MY_ALIGN + + +CGEMM_L4x1_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L4x1_SUB2_4 + bl CGEMM_4x1_L8_SUB + MY_ALIGN + + +CGEMM_L4x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L4x1_SUB2_2 + LOAD4x1_2 + KERNEL4x1_L2 16,64, 0,0 + KERNEL4x1_E2 16,64, 1,1 + MY_ALIGN + + +CGEMM_L4x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L4x1_SUB2_1 + LOAD4x1_2 + KERNEL4x1_E2 16,64, 0,1 + MY_ALIGN + + +CGEMM_L4x1_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L4x1_SAVE + KERNEL4x1 + + MY_ALIGN +CGEMM_L4x1_SAVE: +/*----------------------------------------*/ + + SAVE4x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,4 +#endif + + +CGEMM_L4x1_END: +/*----------------------------------------*/ + slwi T1, K, 5 + addic. J, J, -1 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 4 +#endif + bgt CGEMM_L4_BEGIN + + +CGEMM_L4_END: + +b CGEMM_L2 +/* MINI SUBROUTINES */ +/* 2x8 MAIN 128x+2 LOOP */ + + +CGEMM_L2x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x8_2 + MY_ALIGN +CGEMM_L2x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 128,32,0,0 +CGEMM_L2x8_K128: +/*----------------------------------------*/ + KERNEL2x8_L2 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L2 128,32,2,0 + KERNEL2x8_L2 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 128,32,4,0 + KERNEL2x8_L2 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L2 128,32,6,0 + KERNEL2x8_L2 128,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 128,32,8,0 + KERNEL2x8_L2 128,32,9,0 + KERNEL2x8_L2 128,32,10,0 + KERNEL2x8_L2 128,32,11,0 + dcbt BO, T4 + KERNEL2x8_L2 128,32,12,0 + KERNEL2x8_L2 128,32,13,0 + KERNEL2x8_L2 128,32,14,0 + KERNEL2x8_L2 128,32,15,0 + KERNEL2x8_L2 128,32,16,0 + KERNEL2x8_L2 128,32,17,0 + KERNEL2x8_L2 128,32,18,0 + KERNEL2x8_L2 128,32,19,0 + KERNEL2x8_L2 128,32,20,0 + KERNEL2x8_L2 128,32,21,0 + KERNEL2x8_L2 128,32,22,0 + KERNEL2x8_L2 128,32,23,0 + KERNEL2x8_L2 128,32,24,0 + KERNEL2x8_L2 128,32,25,0 + KERNEL2x8_L2 128,32,26,0 + KERNEL2x8_L2 128,32,27,0 + KERNEL2x8_L2 128,32,28,0 + KERNEL2x8_L2 128,32,29,0 + KERNEL2x8_L2 128,32,30,0 + KERNEL2x8_L2 128,32,31,0 + KERNEL2x8_L2 128,32,32,0 + KERNEL2x8_L2 128,32,33,0 + KERNEL2x8_L2 128,32,34,0 + KERNEL2x8_L2 128,32,35,0 + KERNEL2x8_L2 128,32,36,0 + KERNEL2x8_L2 128,32,37,0 + KERNEL2x8_L2 128,32,38,0 + KERNEL2x8_L2 128,32,39,0 + KERNEL2x8_L2 128,32,40,0 + KERNEL2x8_L2 128,32,41,0 + KERNEL2x8_L2 128,32,42,0 + KERNEL2x8_L2 128,32,43,0 + KERNEL2x8_L2 128,32,44,0 + KERNEL2x8_L2 128,32,45,0 + KERNEL2x8_L2 128,32,46,0 + KERNEL2x8_L2 128,32,47,0 + KERNEL2x8_L2 128,32,48,0 + KERNEL2x8_L2 128,32,49,0 + KERNEL2x8_L2 128,32,50,0 + KERNEL2x8_L2 128,32,51,0 + KERNEL2x8_L2 128,32,52,0 + KERNEL2x8_L2 128,32,53,0 + KERNEL2x8_L2 128,32,54,0 + 
KERNEL2x8_L2 128,32,55,0 + KERNEL2x8_L2 128,32,56,0 + KERNEL2x8_L2 128,32,57,0 + KERNEL2x8_L2 128,32,58,0 + KERNEL2x8_L2 128,32,59,0 + KERNEL2x8_L2 128,32,60,0 + KERNEL2x8_L2 128,32,61,0 + KERNEL2x8_L2 128,32,62,0 + KERNEL2x8_L2 128,32,63,1 + bdnz CGEMM_L2x8_LOOP + MY_ALIGN +CGEMM_L2x8_LOOP_END: +/*----------------------------------------*/ + END2x8_2 + blr + MY_ALIGN + + +CGEMM_2x8_L64_SUB: +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 128,32,0,0 + KERNEL2x8_L2 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L2 128,32,2,0 + KERNEL2x8_L2 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 128,32,4,0 + KERNEL2x8_L2 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L2 128,32,6,0 + KERNEL2x8_L2 128,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 128,32,8,0 + KERNEL2x8_L2 128,32,9,0 + KERNEL2x8_L2 128,32,10,0 + KERNEL2x8_L2 128,32,11,0 + dcbt BO, T4 + KERNEL2x8_L2 128,32,12,0 + KERNEL2x8_L2 128,32,13,0 + KERNEL2x8_L2 128,32,14,0 + KERNEL2x8_L2 128,32,15,0 + KERNEL2x8_L2 128,32,16,0 + KERNEL2x8_L2 128,32,17,0 + KERNEL2x8_L2 128,32,18,0 + KERNEL2x8_L2 128,32,19,0 + KERNEL2x8_L2 128,32,20,0 + KERNEL2x8_L2 128,32,21,0 + KERNEL2x8_L2 128,32,22,0 + KERNEL2x8_L2 128,32,23,0 + KERNEL2x8_L2 128,32,24,0 + KERNEL2x8_L2 128,32,25,0 + KERNEL2x8_L2 128,32,26,0 + KERNEL2x8_L2 128,32,27,0 + KERNEL2x8_L2 128,32,28,0 + KERNEL2x8_L2 128,32,29,0 + KERNEL2x8_L2 128,32,30,0 + KERNEL2x8_E2 128,32,31,1 + blr + MY_ALIGN + + +CGEMM_2x8_L32_SUB: +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 128,32,0,0 + KERNEL2x8_L2 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L2 128,32,2,0 + KERNEL2x8_L2 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 128,32,4,0 + KERNEL2x8_L2 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L2 128,32,6,0 + KERNEL2x8_L2 128,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 128,32,8,0 + KERNEL2x8_L2 128,32,9,0 + KERNEL2x8_L2 128,32,10,0 + KERNEL2x8_L2 128,32,11,0 + dcbt BO, T4 + KERNEL2x8_L2 128,32,12,0 + KERNEL2x8_L2 128,32,13,0 + KERNEL2x8_L2 128,32,14,0 + KERNEL2x8_E2 128,32,15,1 + blr + MY_ALIGN + + +CGEMM_2x8_L16_SUB: +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 128,32,0,0 + KERNEL2x8_L2 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L2 128,32,2,0 + KERNEL2x8_L2 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 128,32,4,0 + KERNEL2x8_L2 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L2 128,32,6,0 + KERNEL2x8_E2 128,32,7,1 + blr + MY_ALIGN + + +CGEMM_2x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x4_2 + MY_ALIGN +CGEMM_L2x4_LOOP: +/*----------------------------------------*/ + KERNEL2x4_L2 64,32,0,0 +CGEMM_L2x4_K32: +/*----------------------------------------*/ + KERNEL2x4_L2 64,32,1,0 + KERNEL2x4_L2 64,32,2,0 + KERNEL2x4_L2 64,32,3,0 + KERNEL2x4_L2 64,32,4,0 + KERNEL2x4_L2 64,32,5,0 + KERNEL2x4_L2 64,32,6,0 + KERNEL2x4_L2 64,32,7,0 + KERNEL2x4_L2 64,32,8,0 + KERNEL2x4_L2 64,32,9,0 + KERNEL2x4_L2 64,32,10,0 + KERNEL2x4_L2 64,32,11,0 + KERNEL2x4_L2 64,32,12,0 + KERNEL2x4_L2 64,32,13,0 + KERNEL2x4_L2 64,32,14,0 + KERNEL2x4_L2 64,32,15,1 + bdnz CGEMM_L2x4_LOOP + MY_ALIGN +CGEMM_L2x4_LOOP_END: +/*----------------------------------------*/ + END2x4_2 + blr + MY_ALIGN + + +CGEMM_2x4_L16_SUB: +/*----------------------------------------*/ + LOAD2x4_2 + KERNEL2x4_L2 64,32,0,0 + KERNEL2x4_L2 64,32,1,0 + KERNEL2x4_L2 64,32,2,0 + KERNEL2x4_L2 64,32,3,0 + KERNEL2x4_L2 64,32,4,0 + KERNEL2x4_L2 64,32,5,0 + KERNEL2x4_L2 64,32,6,0 + KERNEL2x4_E2 64,32,7,1 + blr + 
MY_ALIGN + + +CGEMM_2x4_L8_SUB: +/*----------------------------------------*/ + LOAD2x4_2 + KERNEL2x4_L2 64,32,0,0 + KERNEL2x4_L2 64,32,1,0 + KERNEL2x4_L2 64,32,2,0 + KERNEL2x4_E2 64,32,3,1 + blr + + +CGEMM_2x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x2_2 + MY_ALIGN +CGEMM_L2x2_LOOP: +/*----------------------------------------*/ + KERNEL2x2_L2 32,32,0,0 +CGEMM_L2x2_K32: +/*----------------------------------------*/ + KERNEL2x2_L2 32,32,1,0 + KERNEL2x2_L2 32,32,2,0 + KERNEL2x2_L2 32,32,3,0 + KERNEL2x2_L2 32,32,4,0 + KERNEL2x2_L2 32,32,5,0 + KERNEL2x2_L2 32,32,6,0 + KERNEL2x2_L2 32,32,7,0 + KERNEL2x2_L2 32,32,8,0 + KERNEL2x2_L2 32,32,9,0 + KERNEL2x2_L2 32,32,10,0 + KERNEL2x2_L2 32,32,11,0 + KERNEL2x2_L2 32,32,12,0 + KERNEL2x2_L2 32,32,13,0 + KERNEL2x2_L2 32,32,14,0 + KERNEL2x2_L2 32,32,15,1 + bdnz CGEMM_L2x2_LOOP + MY_ALIGN + + +CGEMM_L2x2_LOOP_END: +/*----------------------------------------*/ + END2x2_2 + blr + MY_ALIGN +CGEMM_2x2_L16_SUB: +/*----------------------------------------*/ + LOAD2x2_2 + KERNEL2x2_L2 32,32,0,0 + KERNEL2x2_L2 32,32,1,0 + KERNEL2x2_L2 32,32,2,0 + KERNEL2x2_L2 32,32,3,0 + KERNEL2x2_L2 32,32,4,0 + KERNEL2x2_L2 32,32,5,0 + KERNEL2x2_L2 32,32,6,0 + KERNEL2x2_E2 32,32,7,1 + blr + MY_ALIGN +CGEMM_2x2_L8_SUB: +/*----------------------------------------*/ + LOAD2x2_2 + KERNEL2x2_L2 32,32,0,0 + KERNEL2x2_L2 32,32,1,0 + KERNEL2x2_L2 32,32,2,0 + KERNEL2x2_E2 32,32,3,1 + blr + + +CGEMM_2x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x1_2 + MY_ALIGN +CGEMM_L2x1_LOOP: +/*----------------------------------------*/ + KERNEL2x1_L2 16,32,0,0 +CGEMM_L2x1_K32: +/*----------------------------------------*/ + KERNEL2x1_L2 16,32,1,0 + KERNEL2x1_L2 16,32,2,0 + KERNEL2x1_L2 16,32,3,0 + KERNEL2x1_L2 16,32,4,0 + KERNEL2x1_L2 16,32,5,0 + KERNEL2x1_L2 16,32,6,0 + KERNEL2x1_L2 16,32,7,0 + KERNEL2x1_L2 16,32,8,0 + KERNEL2x1_L2 16,32,9,0 + KERNEL2x1_L2 16,32,10,0 + KERNEL2x1_L2 16,32,11,0 + KERNEL2x1_L2 16,32,12,0 + KERNEL2x1_L2 16,32,13,0 + KERNEL2x1_L2 16,32,14,0 + KERNEL2x1_L2 16,32,15,1 + bdnz CGEMM_L2x1_LOOP + MY_ALIGN +CGEMM_L2x1_LOOP_END: +/*----------------------------------------*/ + END2x1_2 + blr + + MY_ALIGN +CGEMM_2x1_L16_SUB: +/*----------------------------------------*/ + LOAD2x1_2 + KERNEL2x1_L2 16,32,0,0 + KERNEL2x1_L2 16,32,1,0 + KERNEL2x1_L2 16,32,2,0 + KERNEL2x1_L2 16,32,3,0 + KERNEL2x1_L2 16,32,4,0 + KERNEL2x1_L2 16,32,5,0 + KERNEL2x1_L2 16,32,6,0 + KERNEL2x1_E2 16,32,7,1 + blr + MY_ALIGN + + +CGEMM_2x1_L8_SUB: +/*----------------------------------------*/ + LOAD2x1_2 + KERNEL2x1_L2 16,32,0,0 + KERNEL2x1_L2 16,32,1,0 + KERNEL2x1_L2 16,32,2,0 + KERNEL2x1_E2 16,32,3,1 + blr + + + +/* MAIN LOOP BEGINS */ + MY_ALIGN + + +CGEMM_L2: +/*----------------------------------------*/ + + andi. J, N, 2 + ble CGEMM_L2_END + + +CGEMM_L2_BEGIN: +/*----------------------------------------*/ + mr CO, C + slwi T1, LDC , 1 + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + ble CGEMM_L2x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +CGEMM_L2x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,2 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. 
T8, T1, 7 /**(T1-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(K-2) % 128x */ +#endif + ZERO2x8 + ble CGEMM_L2x8_SUB0 + bl CGEMM_L2x8_LMAIN_SUB + andi. L, T1, 127 + ble CGEMM_L2x8_SAVE + b CGEMM_L2x8_SUB2 + + +CGEMM_L2x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi. L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP2x8_128K + addi BO,BO,-16 + addi AO,AO,-64 + LOAD2x8O 64,16 + END2x8_WITHOUT_ADD + LOAD2x8_2O 128, 32 + mtctr T8 + bl CGEMM_L2x8_K128 + b CGEMM_L2x8_SAVE + CMP2x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else + cmpwi K,128 +#endif + bne CGEMM_L2x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-128 + LOAD2x8_2O 128,32 + bl CGEMM_L2x8_K128 + b CGEMM_L2x8_SAVE + MY_ALIGN + + +CGEMM_L2x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble CGEMM_L2x8_SUB2_32 + bl CGEMM_2x8_L64_SUB + MY_ALIGN + + +CGEMM_L2x8_SUB2_32: +/*----------------------------------------*/ + andi. T1,L, 32 + ble CGEMM_L2x8_SUB2_16 + bl CGEMM_2x8_L32_SUB + MY_ALIGN + + +CGEMM_L2x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L2x8_SUB2_8 + bl CGEMM_2x8_L16_SUB + MY_ALIGN + + +CGEMM_L2x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L2x8_SUB2_4 + LOAD2x8_2 + KERNEL2x8_L2 128,32, 0,0 + KERNEL2x8_L2 128,32, 1,0 + KERNEL2x8_L2 128,32, 2,0 + KERNEL2x8_E2 128,32, 3,1 + MY_ALIGN + + +CGEMM_L2x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L2x8_SUB2_2 + LOAD2x8_2 + KERNEL2x8_L2 128,32, 0,0 + KERNEL2x8_E2 128,32, 1,1 + MY_ALIGN + + +CGEMM_L2x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L2x8_SUB2_1 + LOAD2x8_2 + KERNEL2x8_E2 128,32, 0,1 + MY_ALIGN + + +CGEMM_L2x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L2x8_SAVE + KERNEL2x8 + + MY_ALIGN +CGEMM_L2x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + MY_ALIGN + SAVE2x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,2 +#endif + bgt CGEMM_L2x8_BEGIN + andi. T2, M, 7 + ble CGEMM_L2x1_END + andi. T1, M, 4 + ble CGEMM_L2x4_END + b CGEMM_L2x4_BEGIN + MY_ALIGN + + +CGEMM_L2x8_END: +/*----------------------------------------*/ + + +CGEMM_L2x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble CGEMM_L2x1_END + andi. T1, M, 4 + ble CGEMM_L2x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x4 + ble CGEMM_L2x4_SUB0 + bl CGEMM_2x4_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L2x4_SAVE + b CGEMM_L2x4_SUB2 + + +CGEMM_L2x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. 
L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x4_32K + addi BO,BO,-16 + addi AO,AO,-32 + LOAD2x4O 32,16 + END2x4_WITHOUT_ADD + LOAD2x4_2O 64, 32 + mtctr T8 + bl CGEMM_L2x4_K32 + b CGEMM_L2x4_SAVE + CMP2x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L2x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-64 + LOAD2x4_2O 64,32 + bl CGEMM_L2x4_K32 + b CGEMM_L2x4_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L2x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L2x4_SUB2_8 + bl CGEMM_2x4_L16_SUB + MY_ALIGN + + +CGEMM_L2x4_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L2x4_SUB2_4 + bl CGEMM_2x4_L8_SUB + MY_ALIGN + + +CGEMM_L2x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L2x4_SUB2_2 + LOAD2x4_2 + KERNEL2x4_L2 64,32, 0,0 + KERNEL2x4_E2 64,32, 1,1 + MY_ALIGN + + +CGEMM_L2x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L2x4_SUB2_1 + LOAD2x4_2 + KERNEL2x4_E2 64,32, 0,1 + MY_ALIGN + + +CGEMM_L2x4_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L2x4_SAVE + KERNEL2x4 + + +CGEMM_L2x4_SAVE: +/*----------------------------------------*/ + SAVE2x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,2 +#endif + + +CGEMM_L2x4_END: +/*----------------------------------------*/ + + +CGEMM_L2x2_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 2 + ble CGEMM_L2x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x2 + ble CGEMM_L2x2_SUB0 + bl CGEMM_2x2_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L2x2_SAVE + b CGEMM_L2x2_SUB2 + + +CGEMM_L2x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x2_32K + addi BO,BO,-16 + addi AO,AO,-16 + LOAD2x2O 16,16 + END2x2_WITHOUT_ADD + LOAD2x2_2O 32, 32 + mtctr T8 + bl CGEMM_L2x2_K32 + b CGEMM_L2x2_SAVE + CMP2x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L2x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-32 + LOAD2x2_2O 32,32 + bl CGEMM_L2x2_K32 + b CGEMM_L2x2_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L2x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L2x2_SUB2_8 + bl CGEMM_2x2_L16_SUB + MY_ALIGN + + +CGEMM_L2x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L2x2_SUB2_4 + bl CGEMM_2x2_L8_SUB + MY_ALIGN + + +CGEMM_L2x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L2x2_SUB2_2 + LOAD2x2_2 + KERNEL2x2_L2 32,32, 0,0 + KERNEL2x2_E2 32,32, 1,1 + MY_ALIGN + + +CGEMM_L2x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L2x2_SUB2_1 + LOAD2x2_2 + KERNEL2x2_E2 32,32, 0,1 + MY_ALIGN + + +CGEMM_L2x2_SUB2_1: +/*----------------------------------------*/ + andi. 
T1,L, 1 + ble CGEMM_L2x2_SAVE + KERNEL2x2 + + MY_ALIGN +CGEMM_L2x2_SAVE: +/*----------------------------------------*/ + SAVE2x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,2 +#endif + + +CGEMM_L2x2_END: +/*----------------------------------------*/ + + +CGEMM_L2x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble CGEMM_L2x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x1 + ble CGEMM_L2x1_SUB0 + bl CGEMM_2x1_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L2x1_SAVE + b CGEMM_L2x1_SUB2 + + +CGEMM_L2x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x1_32K + addi BO,BO,-16 + addi AO,AO,-8 + LOAD2x1O 8,16 + END2x1_WITHOUT_ADD + LOAD2x1_2O 16, 32 + mtctr T8 + bl CGEMM_L2x1_K32 + b CGEMM_L2x1_SAVE + CMP2x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L2x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-16 + LOAD2x1_2O 16,32 + bl CGEMM_L2x1_K32 + b CGEMM_L2x1_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L2x1_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L2x1_SUB2_8 + bl CGEMM_2x1_L16_SUB + MY_ALIGN + + +CGEMM_L2x1_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L2x1_SUB2_4 + bl CGEMM_2x1_L8_SUB + MY_ALIGN + + +CGEMM_L2x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L2x1_SUB2_2 + LOAD2x1_2 + KERNEL2x1_L2 16,32, 0,0 + KERNEL2x1_E2 16,32, 1,1 + MY_ALIGN + + +CGEMM_L2x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L2x1_SUB2_1 + LOAD2x1_2 + KERNEL2x1_E2 16,32, 0,1 + MY_ALIGN + + +CGEMM_L2x1_SUB2_1: +/*----------------------------------------*/ + andi. 
T1,L, 1 + ble CGEMM_L2x1_SAVE + KERNEL2x1 + + MY_ALIGN +CGEMM_L2x1_SAVE: +/*----------------------------------------*/ + + SAVE2x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,2 +#endif + + +CGEMM_L2x1_END: +/*----------------------------------------*/ + slwi T1, K, 4 + + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 2 +#endif + +CGEMM_L2_END: + + +b CGEMM_L1 +/* MINI SUBROUTINES */ +/* 1x8 MAIN 128x+2 LOOP */ + + +CGEMM_L1x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x8_2 + MY_ALIGN +CGEMM_L1x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 128,16,0,0 +CGEMM_L1x8_K128: +/*----------------------------------------*/ + KERNEL1x8_L2 128,16,1,0 + dcbt AO, T2 + KERNEL1x8_L2 128,16,2,0 + KERNEL1x8_L2 128,16,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 128,16,4,0 + KERNEL1x8_L2 128,16,5,0 + dcbt AO, T4 + KERNEL1x8_L2 128,16,6,0 + KERNEL1x8_L2 128,16,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 128,16,8,0 + KERNEL1x8_L2 128,16,9,0 + KERNEL1x8_L2 128,16,10,0 + KERNEL1x8_L2 128,16,11,0 + dcbt BO, T4 + KERNEL1x8_L2 128,16,12,0 + KERNEL1x8_L2 128,16,13,0 + KERNEL1x8_L2 128,16,14,0 + KERNEL1x8_L2 128,16,15,0 + KERNEL1x8_L2 128,16,16,0 + KERNEL1x8_L2 128,16,17,0 + KERNEL1x8_L2 128,16,18,0 + KERNEL1x8_L2 128,16,19,0 + KERNEL1x8_L2 128,16,20,0 + KERNEL1x8_L2 128,16,21,0 + KERNEL1x8_L2 128,16,22,0 + KERNEL1x8_L2 128,16,23,0 + KERNEL1x8_L2 128,16,24,0 + KERNEL1x8_L2 128,16,25,0 + KERNEL1x8_L2 128,16,26,0 + KERNEL1x8_L2 128,16,27,0 + KERNEL1x8_L2 128,16,28,0 + KERNEL1x8_L2 128,16,29,0 + KERNEL1x8_L2 128,16,30,0 + KERNEL1x8_L2 128,16,31,0 + KERNEL1x8_L2 128,16,32,0 + KERNEL1x8_L2 128,16,33,0 + KERNEL1x8_L2 128,16,34,0 + KERNEL1x8_L2 128,16,35,0 + KERNEL1x8_L2 128,16,36,0 + KERNEL1x8_L2 128,16,37,0 + KERNEL1x8_L2 128,16,38,0 + KERNEL1x8_L2 128,16,39,0 + KERNEL1x8_L2 128,16,40,0 + KERNEL1x8_L2 128,16,41,0 + KERNEL1x8_L2 128,16,42,0 + KERNEL1x8_L2 128,16,43,0 + KERNEL1x8_L2 128,16,44,0 + KERNEL1x8_L2 128,16,45,0 + KERNEL1x8_L2 128,16,46,0 + KERNEL1x8_L2 128,16,47,0 + KERNEL1x8_L2 128,16,48,0 + KERNEL1x8_L2 128,16,49,0 + KERNEL1x8_L2 128,16,50,0 + KERNEL1x8_L2 128,16,51,0 + KERNEL1x8_L2 128,16,52,0 + KERNEL1x8_L2 128,16,53,0 + KERNEL1x8_L2 128,16,54,0 + KERNEL1x8_L2 128,16,55,0 + KERNEL1x8_L2 128,16,56,0 + KERNEL1x8_L2 128,16,57,0 + KERNEL1x8_L2 128,16,58,0 + KERNEL1x8_L2 128,16,59,0 + KERNEL1x8_L2 128,16,60,0 + KERNEL1x8_L2 128,16,61,0 + KERNEL1x8_L2 128,16,62,0 + KERNEL1x8_L2 128,16,63,1 + bdnz CGEMM_L1x8_LOOP + MY_ALIGN +CGEMM_L1x8_LOOP_END: +/*----------------------------------------*/ + END1x8_2 + blr + MY_ALIGN + + +CGEMM_1x8_L64_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 128,16,0,0 + KERNEL1x8_L2 128,16,1,0 + dcbt AO, T2 + KERNEL1x8_L2 128,16,2,0 + KERNEL1x8_L2 128,16,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 128,16,4,0 + KERNEL1x8_L2 128,16,5,0 + dcbt AO, T4 + KERNEL1x8_L2 128,16,6,0 + KERNEL1x8_L2 128,16,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 128,16,8,0 + KERNEL1x8_L2 128,16,9,0 + KERNEL1x8_L2 128,16,10,0 + KERNEL1x8_L2 128,16,11,0 + dcbt BO, T4 + KERNEL1x8_L2 128,16,12,0 + KERNEL1x8_L2 128,16,13,0 + KERNEL1x8_L2 128,16,14,0 + KERNEL1x8_L2 128,16,15,0 + KERNEL1x8_L2 128,16,16,0 + KERNEL1x8_L2 128,16,17,0 + KERNEL1x8_L2 128,16,18,0 + KERNEL1x8_L2 128,16,19,0 + KERNEL1x8_L2 128,16,20,0 + KERNEL1x8_L2 128,16,21,0 + KERNEL1x8_L2 128,16,22,0 + KERNEL1x8_L2 128,16,23,0 + KERNEL1x8_L2 
128,16,24,0 + KERNEL1x8_L2 128,16,25,0 + KERNEL1x8_L2 128,16,26,0 + KERNEL1x8_L2 128,16,27,0 + KERNEL1x8_L2 128,16,28,0 + KERNEL1x8_L2 128,16,29,0 + KERNEL1x8_L2 128,16,30,0 + KERNEL1x8_E2 128,16,31,1 + blr + MY_ALIGN + + +CGEMM_1x8_L32_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 128,16,0,0 + KERNEL1x8_L2 128,16,1,0 + dcbt AO, T2 + KERNEL1x8_L2 128,16,2,0 + KERNEL1x8_L2 128,16,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 128,16,4,0 + KERNEL1x8_L2 128,16,5,0 + dcbt AO, T4 + KERNEL1x8_L2 128,16,6,0 + KERNEL1x8_L2 128,16,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 128,16,8,0 + KERNEL1x8_L2 128,16,9,0 + KERNEL1x8_L2 128,16,10,0 + KERNEL1x8_L2 128,16,11,0 + dcbt BO, T4 + KERNEL1x8_L2 128,16,12,0 + KERNEL1x8_L2 128,16,13,0 + KERNEL1x8_L2 128,16,14,0 + KERNEL1x8_E2 128,16,15,1 + blr + MY_ALIGN + + +CGEMM_1x8_L16_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 128,16,0,0 + KERNEL1x8_L2 128,16,1,0 + dcbt AO, T2 + KERNEL1x8_L2 128,16,2,0 + KERNEL1x8_L2 128,16,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 128,16,4,0 + KERNEL1x8_L2 128,16,5,0 + dcbt AO, T4 + KERNEL1x8_L2 128,16,6,0 + KERNEL1x8_E2 128,16,7,1 + blr + MY_ALIGN + + +CGEMM_1x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x4_2 + MY_ALIGN +CGEMM_L1x4_LOOP: +/*----------------------------------------*/ + KERNEL1x4_L2 64,16,0,0 +CGEMM_L1x4_K32: +/*----------------------------------------*/ + KERNEL1x4_L2 64,16,1,0 + KERNEL1x4_L2 64,16,2,0 + KERNEL1x4_L2 64,16,3,0 + KERNEL1x4_L2 64,16,4,0 + KERNEL1x4_L2 64,16,5,0 + KERNEL1x4_L2 64,16,6,0 + KERNEL1x4_L2 64,16,7,0 + KERNEL1x4_L2 64,16,8,0 + KERNEL1x4_L2 64,16,9,0 + KERNEL1x4_L2 64,16,10,0 + KERNEL1x4_L2 64,16,11,0 + KERNEL1x4_L2 64,16,12,0 + KERNEL1x4_L2 64,16,13,0 + KERNEL1x4_L2 64,16,14,0 + KERNEL1x4_L2 64,16,15,1 + bdnz CGEMM_L1x4_LOOP + MY_ALIGN +CGEMM_L1x4_LOOP_END: +/*----------------------------------------*/ + END1x4_2 + blr + MY_ALIGN + + +CGEMM_1x4_L16_SUB: +/*----------------------------------------*/ + LOAD1x4_2 + KERNEL1x4_L2 64,16,0,0 + KERNEL1x4_L2 64,16,1,0 + KERNEL1x4_L2 64,16,2,0 + KERNEL1x4_L2 64,16,3,0 + KERNEL1x4_L2 64,16,4,0 + KERNEL1x4_L2 64,16,5,0 + KERNEL1x4_L2 64,16,6,0 + KERNEL1x4_E2 64,16,7,1 + blr + MY_ALIGN + + +CGEMM_1x4_L8_SUB: +/*----------------------------------------*/ + LOAD1x4_2 + KERNEL1x4_L2 64,16,0,0 + KERNEL1x4_L2 64,16,1,0 + KERNEL1x4_L2 64,16,2,0 + KERNEL1x4_E2 64,16,3,1 + blr + + +CGEMM_1x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x2_2 + MY_ALIGN +CGEMM_L1x2_LOOP: +/*----------------------------------------*/ + KERNEL1x2_L2 32,16,0,0 +CGEMM_L1x2_K32: +/*----------------------------------------*/ + KERNEL1x2_L2 32,16,1,0 + KERNEL1x2_L2 32,16,2,0 + KERNEL1x2_L2 32,16,3,0 + KERNEL1x2_L2 32,16,4,0 + KERNEL1x2_L2 32,16,5,0 + KERNEL1x2_L2 32,16,6,0 + KERNEL1x2_L2 32,16,7,0 + KERNEL1x2_L2 32,16,8,0 + KERNEL1x2_L2 32,16,9,0 + KERNEL1x2_L2 32,16,10,0 + KERNEL1x2_L2 32,16,11,0 + KERNEL1x2_L2 32,16,12,0 + KERNEL1x2_L2 32,16,13,0 + KERNEL1x2_L2 32,16,14,0 + KERNEL1x2_L2 32,16,15,1 + bdnz CGEMM_L1x2_LOOP + MY_ALIGN + + +CGEMM_L1x2_LOOP_END: +/*----------------------------------------*/ + END1x2_2 + blr + MY_ALIGN +CGEMM_1x2_L16_SUB: +/*----------------------------------------*/ + LOAD1x2_2 + KERNEL1x2_L2 32,16,0,0 + KERNEL1x2_L2 32,16,1,0 + KERNEL1x2_L2 32,16,2,0 + KERNEL1x2_L2 32,16,3,0 + KERNEL1x2_L2 32,16,4,0 + KERNEL1x2_L2 32,16,5,0 + KERNEL1x2_L2 32,16,6,0 + 
KERNEL1x2_E2 32,16,7,1 + blr + MY_ALIGN +CGEMM_1x2_L8_SUB: +/*----------------------------------------*/ + LOAD1x2_2 + KERNEL1x2_L2 32,16,0,0 + KERNEL1x2_L2 32,16,1,0 + KERNEL1x2_L2 32,16,2,0 + KERNEL1x2_E2 32,16,3,1 + blr + + +CGEMM_1x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x1_2 + MY_ALIGN +CGEMM_L1x1_LOOP: +/*----------------------------------------*/ + KERNEL1x1_L2 16,16,0,0 +CGEMM_L1x1_K32: +/*----------------------------------------*/ + KERNEL1x1_L2 16,16,1,0 + KERNEL1x1_L2 16,16,2,0 + KERNEL1x1_L2 16,16,3,0 + KERNEL1x1_L2 16,16,4,0 + KERNEL1x1_L2 16,16,5,0 + KERNEL1x1_L2 16,16,6,0 + KERNEL1x1_L2 16,16,7,0 + KERNEL1x1_L2 16,16,8,0 + KERNEL1x1_L2 16,16,9,0 + KERNEL1x1_L2 16,16,10,0 + KERNEL1x1_L2 16,16,11,0 + KERNEL1x1_L2 16,16,12,0 + KERNEL1x1_L2 16,16,13,0 + KERNEL1x1_L2 16,16,14,0 + KERNEL1x1_L2 16,16,15,1 + bdnz CGEMM_L1x1_LOOP + MY_ALIGN +CGEMM_L1x1_LOOP_END: +/*----------------------------------------*/ + END1x1_2 + blr + + MY_ALIGN +CGEMM_1x1_L16_SUB: +/*----------------------------------------*/ + LOAD1x1_2 + KERNEL1x1_L2 16,16,0,0 + KERNEL1x1_L2 16,16,1,0 + KERNEL1x1_L2 16,16,2,0 + KERNEL1x1_L2 16,16,3,0 + KERNEL1x1_L2 16,16,4,0 + KERNEL1x1_L2 16,16,5,0 + KERNEL1x1_L2 16,16,6,0 + KERNEL1x1_E2 16,16,7,1 + blr + MY_ALIGN + + +CGEMM_1x1_L8_SUB: +/*----------------------------------------*/ + LOAD1x1_2 + KERNEL1x1_L2 16,16,0,0 + KERNEL1x1_L2 16,16,1,0 + KERNEL1x1_L2 16,16,2,0 + KERNEL1x1_E2 16,16,3,1 + blr + + + +/* MAIN LOOP BEGINS */ + MY_ALIGN + + +CGEMM_L1: +/*----------------------------------------*/ + + andi. J, N, 1 + ble CGEMM_L1_END + +CGEMM_L1_BEGIN: +/*----------------------------------------*/ + mr CO, C + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + ble CGEMM_L1x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +CGEMM_L1x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,1 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(T1-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(K-2) % 128x */ +#endif + ZERO1x8 + ble CGEMM_L1x8_SUB0 + bl CGEMM_L1x8_LMAIN_SUB + andi. L, T1, 127 + ble CGEMM_L1x8_SAVE + b CGEMM_L1x8_SUB2 + + +CGEMM_L1x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi. L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP1x8_128K + addi BO,BO,-8 + addi AO,AO,-64 + LOAD1x8O 64,8 + END1x8_WITHOUT_ADD + LOAD1x8_2O 128, 16 + mtctr T8 + bl CGEMM_L1x8_K128 + b CGEMM_L1x8_SAVE + CMP1x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else + cmpwi K,128 +#endif + bne CGEMM_L1x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-16 + addi AO,AO,-128 + LOAD1x8_2O 128,16 + bl CGEMM_L1x8_K128 + b CGEMM_L1x8_SAVE + MY_ALIGN + + +CGEMM_L1x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble CGEMM_L1x8_SUB2_32 + bl CGEMM_1x8_L64_SUB + MY_ALIGN + + +CGEMM_L1x8_SUB2_32: +/*----------------------------------------*/ + andi. 
T1,L, 32 + ble CGEMM_L1x8_SUB2_16 + bl CGEMM_1x8_L32_SUB + MY_ALIGN + + +CGEMM_L1x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L1x8_SUB2_8 + bl CGEMM_1x8_L16_SUB + MY_ALIGN + + +CGEMM_L1x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L1x8_SUB2_4 + LOAD1x8_2 + KERNEL1x8_L2 128,16, 0,0 + KERNEL1x8_L2 128,16, 1,0 + KERNEL1x8_L2 128,16, 2,0 + KERNEL1x8_E2 128,16, 3,1 + MY_ALIGN + + +CGEMM_L1x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L1x8_SUB2_2 + LOAD1x8_2 + KERNEL1x8_L2 128,16, 0,0 + KERNEL1x8_E2 128,16, 1,1 + MY_ALIGN + + +CGEMM_L1x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L1x8_SUB2_1 + LOAD1x8_2 + KERNEL1x8_E2 128,16, 0,1 + MY_ALIGN + + +CGEMM_L1x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L1x8_SAVE + KERNEL1x8 + + MY_ALIGN +CGEMM_L1x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + MY_ALIGN + SAVE1x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,1 +#endif + bgt CGEMM_L1x8_BEGIN + andi. T2, M, 7 + ble CGEMM_L1x1_END + andi. T1, M, 4 + ble CGEMM_L1x4_END + b CGEMM_L1x4_BEGIN + MY_ALIGN + + +CGEMM_L1x8_END: +/*----------------------------------------*/ + + +CGEMM_L1x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble CGEMM_L1x1_END + andi. T1, M, 4 + ble CGEMM_L1x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 31x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 31x */ +#endif + ZERO1x4 + ble CGEMM_L1x4_SUB0 + bl CGEMM_1x4_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L1x4_SAVE + b CGEMM_L1x4_SUB2 + + +CGEMM_L1x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x4_32K + addi BO,BO,-8 + addi AO,AO,-32 + LOAD1x4O 32,8 + END1x4_WITHOUT_ADD + LOAD1x4_2O 64, 16 + mtctr T8 + bl CGEMM_L1x4_K32 + b CGEMM_L1x4_SAVE + CMP1x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L1x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-16 + addi AO,AO,-64 + LOAD1x4_2O 64,16 + bl CGEMM_L1x4_K32 + b CGEMM_L1x4_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L1x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L1x4_SUB2_8 + bl CGEMM_1x4_L16_SUB + MY_ALIGN + + +CGEMM_L1x4_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L1x4_SUB2_4 + bl CGEMM_1x4_L8_SUB + MY_ALIGN + + +CGEMM_L1x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L1x4_SUB2_2 + LOAD1x4_2 + KERNEL1x4_L2 64,16, 0,0 + KERNEL1x4_E2 64,16, 1,1 + MY_ALIGN + + +CGEMM_L1x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L1x4_SUB2_1 + LOAD1x4_2 + KERNEL1x4_E2 64,16, 0,1 + MY_ALIGN + + +CGEMM_L1x4_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L1x4_SAVE + KERNEL1x4 + + +CGEMM_L1x4_SAVE: +/*----------------------------------------*/ + SAVE1x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,1 +#endif + + +CGEMM_L1x4_END: +/*----------------------------------------*/ + + +CGEMM_L1x2_BEGIN: +/*----------------------------------------*/ + andi. 
T1, M, 2 + ble CGEMM_L1x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 31x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 31x */ +#endif + ZERO1x2 + ble CGEMM_L1x2_SUB0 + bl CGEMM_1x2_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L1x2_SAVE + b CGEMM_L1x2_SUB2 + + +CGEMM_L1x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x2_32K + addi BO,BO,-8 + addi AO,AO,-16 + LOAD1x2O 16,8 + END1x2_WITHOUT_ADD + LOAD1x2_2O 32, 16 + mtctr T8 + bl CGEMM_L1x2_K32 + b CGEMM_L1x2_SAVE + CMP1x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L1x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-16 + addi AO,AO,-32 + LOAD1x2_2O 32,16 + bl CGEMM_L1x2_K32 + b CGEMM_L1x2_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L1x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L1x2_SUB2_8 + bl CGEMM_1x2_L16_SUB + MY_ALIGN + + +CGEMM_L1x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L1x2_SUB2_4 + bl CGEMM_1x2_L8_SUB + MY_ALIGN + + +CGEMM_L1x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L1x2_SUB2_2 + LOAD1x2_2 + KERNEL1x2_L2 32,16, 0,0 + KERNEL1x2_E2 32,16, 1,1 + MY_ALIGN + + +CGEMM_L1x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L1x2_SUB2_1 + LOAD1x2_2 + KERNEL1x2_E2 32,16, 0,1 + MY_ALIGN + + +CGEMM_L1x2_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L1x2_SAVE + KERNEL1x2 + + MY_ALIGN +CGEMM_L1x2_SAVE: +/*----------------------------------------*/ + SAVE1x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,1 +#endif + + +CGEMM_L1x2_END: +/*----------------------------------------*/ + + +CGEMM_L1x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble CGEMM_L1x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 31x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 31x */ +#endif + ZERO1x1 + ble CGEMM_L1x1_SUB0 + bl CGEMM_1x1_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L1x1_SAVE + b CGEMM_L1x1_SUB2 + + +CGEMM_L1x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x1_32K + addi BO,BO,-8 + addi AO,AO,-8 + LOAD1x1O 8,8 + END1x1_WITHOUT_ADD + LOAD1x1_2O 16, 16 + mtctr T8 + bl CGEMM_L1x1_K32 + b CGEMM_L1x1_SAVE + CMP1x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L1x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-16 + addi AO,AO,-16 + LOAD1x1_2O 16,16 + bl CGEMM_L1x1_K32 + b CGEMM_L1x1_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L1x1_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L1x1_SUB2_8 + bl CGEMM_1x1_L16_SUB + MY_ALIGN + + +CGEMM_L1x1_SUB2_8: +/*----------------------------------------*/ + andi. 
T1,L, 8 + ble CGEMM_L1x1_SUB2_4 + bl CGEMM_1x1_L8_SUB + MY_ALIGN + + +CGEMM_L1x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L1x1_SUB2_2 + LOAD1x1_2 + KERNEL1x1_L2 16,16, 0,0 + KERNEL1x1_E2 16,16, 1,1 + MY_ALIGN + + +CGEMM_L1x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L1x1_SUB2_1 + LOAD1x1_2 + KERNEL1x1_E2 16,16, 0,1 + MY_ALIGN + + +CGEMM_L1x1_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L1x1_SAVE + KERNEL1x1 + + MY_ALIGN +CGEMM_L1x1_SAVE: +/*----------------------------------------*/ + + SAVE1x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,1 +#endif + + +CGEMM_L1x1_END: +/*----------------------------------------*/ + slwi T1, K, 3 + + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 1 +#endif + +CGEMM_L1_END: + + + + diff --git a/kernel/power/cgemm_macros_power9.S b/kernel/power/cgemm_macros_power9.S index a256e1a01..be2b74f01 100644 --- a/kernel/power/cgemm_macros_power9.S +++ b/kernel/power/cgemm_macros_power9.S @@ -1,3019 +1,3019 @@ - -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -/************************************************************************************** -* Abdelrauf(quickwritereader@gmail.com) -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* LAPACK-TEST : OK -**************************************************************************************/ -#define unit_size 8 -#define DISP32(ind,disp) (ind*unit_size*32+disp) -#define DISP16(ind,disp) (ind*unit_size*16+disp) -#define DISP8(ind,disp) (ind*unit_size*8+disp) -#define DISP4(ind,disp) (ind*unit_size*4+disp) -#define DISP2(ind,disp) (ind*unit_size*2+disp) -#define DISP1(ind,disp) (ind*unit_size+disp) -#define DISPX(disp) (disp) - -.macro AGGREGATE_REALS_IMAGES VSINR_OUT1,VSINR,VSINI_OUT2,VSINI -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) - xvsubsp \VSINR_OUT1,\VSINR_OUT1,\VSINR - xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI -#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) - xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR - xvsubsp \VSINI_OUT2,\VSINI_OUT2,\VSINI -#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) - xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR - xvsubsp \VSINI_OUT2,\VSINI,\VSINI_OUT2 -#else // CC || CR || RC || RR - /*we will assume {-alpha_r,-alpha_i} for this case */ - /*i1i2-r1r2 so we will negate alpha real instead to fix sign*/ - xvsubsp \VSINR_OUT1,\VSINR,\VSINR_OUT1 - /*we will negate alpha image instead to fix sign*/ - xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI -#endif -.endm - - -.macro AGGREGATE_REALS_IMAGES_A_PERMUTE VSINR_OUT1,VSINR,VSINI_OUT2,VSINI -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) - xvsubsp \VSINR_OUT1,\VSINR_OUT1,\VSINR - xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI -#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) - xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR - xvsubsp \VSINI_OUT2,\VSINI,\VSINI_OUT2 -#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) - xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR - xvsubsp \VSINI_OUT2,\VSINI_OUT2,\VSINI -#else // CC || CR || RC || RR - /*we will assume {-alpha_r,-alpha_i} for this case */ - /*i1i2-r1r2 so we will negate alpha real instead to fix sign*/ - xvsubsp \VSINR_OUT1,\VSINR,\VSINR_OUT1 - /*we will negate alpha image instead to fix sign*/ - xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI -#endif -.endm - -/* {i0,i1} * {alpha_i,alpha_i} [- VSOUT1] ;[VSOUT2 +] {r0,r1}*{alpha_i,alpha_i} */ - -.macro MULT_APLHA_PART1 VSINRR,VSINII,VSOUT1,VSOUT2 - xvmulsp \VSOUT1,\VSINII, alpha_i - xvmulsp \VSOUT2,\VSINRR, alpha_i -.endm - -/* {r0,r1} * {alpha_r,alpha_r} - VSOUT1 ;VSOUT2 + {i0,i1} * {alpha_r,alpha_r} */ - -.macro MULT_APLHA_PART2 VSINRR,VSINII,VSOUT1,VSOUT2 - xvmsubasp \VSOUT1,\VSINRR, alpha_r - xvmaddasp \VSOUT2,\VSINII, alpha_r -.endm - -/* macros for N=4 and M=8 -**********************************************************************************************/ - -.macro Zero4x8 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - xxlxor vs50, vs50, vs50 - xxlxor vs51, vs51, vs51 - xxlxor vs52, vs52, vs52 - xxlxor vs53, vs53, vs53 - xxlxor vs54, vs54, vs54 - xxlxor vs55, vs55, vs55 - xxlxor vs56, vs56, 
vs56 - xxlxor vs57, vs57, vs57 - xxlxor vs58, vs58, vs58 - xxlxor vs59, vs59, vs59 - xxlxor vs60, vs60, vs60 - xxlxor vs61, vs61, vs61 - xxlxor vs62, vs62, vs62 - xxlxor vs63, vs63, vs63 -.endm - - -.macro LOAD4x8 - LOAD4x8O 0,0 -.endm - - -.macro LOAD4x8O OffsetA,OffsetB - lxv vs24, (\OffsetB+0)(BO) - lxv vs28, (\OffsetB+16)(BO) - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - lxv vs0, (\OffsetA+0)(AO) - lxv vs1, (\OffsetA+16)(AO) - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - lxv vs2, (\OffsetA+32)(AO) - lxv vs3, (\OffsetA+48)(AO) - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 -.endm - - -.macro END4x8_NORMAL - END4x8 AO,BO,64,32 -.endm - - -.macro END4x8_WITHOUT_ADD - END4x8 AO,BO,0,0 -.endm - - -.macro END4x8 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - xvmaddasp vs50, vs2,vs28 - xvmaddasp vs51, vs3,vs28 - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 - xvmaddasp vs54, vs2,vs29 - xvmaddasp vs55, vs3,vs29 - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 - xvmaddasp vs58, vs2,vs30 - xvmaddasp vs59, vs3,vs30 - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 - xvmaddasp vs62, vs2,vs31 - xvmaddasp vs63, vs3,vs31 -.endm - - -.macro LOAD4x8_2 - LOAD4x8_2O 0,0 -.endm - - -.macro LOAD4x8_2O OffsetA,OffsetB - lxv vs8, (\OffsetB)(BO) - lxv vs12, (16+\OffsetB)(BO) - lxv vs24, (32+\OffsetB)(BO) - lxv vs28, (32+16+\OffsetB)(BO) - lxv vs4, (0+\OffsetA)(AO) - lxv vs5, (16+\OffsetA)(AO) - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask - lxv vs6, (32+\OffsetA)(AO) - lxv vs7, (48+\OffsetA)(AO) - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 - lxv vs0, (64+\OffsetA)(AO) - lxv vs1, (64+16+\OffsetA)(AO) - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 - lxv vs2, (64+32+\OffsetA)(AO) - lxv vs3, (64+48+\OffsetA)(AO) - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 -.endm - - -.macro END4x8_2 - /*for load2 offset will be 128 and 64*/ - KERNEL4x8_2 AO,BO, 128,64,0 ,1,1 -.endm - - -.macro KERNEL4x8_E2 OffsetA,OffsetB, Index,IsLast - KERNEL4x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL4x8_L2 OffsetA,OffsetB, Index,IsLast - KERNEL4x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL4x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs48, vs4,vs12 - xvmaddasp vs49, vs5,vs12 - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - xvmaddasp vs56, vs4,vs14 - xvmaddasp vs57, vs5,vs14 - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs52, vs4,vs13 - xvmaddasp vs53, vs5,vs13 - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - xvmaddasp vs60, vs4,vs15 - xvmaddasp vs61, vs5,vs15 -.if \Complete==0 - lxv vs4, DISP16(\Index,0+\OffsetA)(\AREG) - lxv vs5, 
DISP16(\Index,16+\OffsetA)(\AREG) -.endif - - xvmaddasp vs34, vs6,vs8 - xvmaddasp vs35, vs7,vs8 - xvmaddasp vs50, vs6,vs12 - xvmaddasp vs51, vs7,vs12 -.if \Complete==0 - lxv vs8, DISP8(\Index,\OffsetB)(\BREG) - lxv vs12, DISP8(\Index,16+\OffsetB)(\BREG) -.endif - xvmaddasp vs42, vs6,vs10 - xvmaddasp vs43, vs7,vs10 - xvmaddasp vs58, vs6,vs14 - xvmaddasp vs59, vs7,vs14 -.if \Complete==0 - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask -.endif - xvmaddasp vs38, vs6,vs9 - xvmaddasp vs39, vs7,vs9 - xvmaddasp vs54, vs6,vs13 - xvmaddasp vs55, vs7,vs13 -.if \Complete==0 - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 -.endif - xvmaddasp vs46, vs6,vs11 - xvmaddasp vs47, vs7,vs11 - xvmaddasp vs62, vs6,vs15 - xvmaddasp vs63, vs7,vs15 -.if \Complete==0 - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 -.endif -.if \Complete==0 - lxv vs6, DISP16(\Index,32+\OffsetA)(\AREG) - lxv vs7, DISP16(\Index,48+\OffsetA)(\AREG) -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 -.if \Complete==0 - lxv vs0, DISP16(\Index,64+\OffsetA)(\AREG) - lxv vs1, DISP16(\Index,64+16+\OffsetA)(\AREG) -.endif - - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - xvmaddasp vs50, vs2,vs28 - xvmaddasp vs51, vs3,vs28 -.if \Complete==0 - lxv vs24, DISP8(\Index,32+\OffsetB)(\BREG) - lxv vs28, DISP8(\Index,32+16+\OffsetB)(\BREG) -.endif - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - xvmaddasp vs58, vs2,vs30 - xvmaddasp vs59, vs3,vs30 -.if \Complete==0 - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask -.endif - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 - xvmaddasp vs54, vs2,vs29 - xvmaddasp vs55, vs3,vs29 -.if \Complete==0 - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 -.endif - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 - xvmaddasp vs62, vs2,vs31 - xvmaddasp vs63, vs3,vs31 -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 -.endif - -.if \Complete==0 - lxv vs2, DISP16(\Index,64+32+\OffsetA)(\AREG) - lxv vs3, DISP16(\Index,64+48+\OffsetA)(\AREG) -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP8(\Index,\OffsetB) - addi \AREG, \AREG, DISP16(\Index,\OffsetA) -.else - addi \BREG, \BREG, DISP8(\Index,64) - addi \AREG, \AREG, DISP16(\Index,128) -.endif - -.endif -.endm - - -.macro KERNEL4x8 - LOAD4x8 - END4x8 AO, BO, 64,32 -.endm - - -.macro SAVE4x8 - add T4, LDC,LDC - add T1, CO ,LDC -#ifndef TRMMKERNEL - lxv vs24 , 0(CO) - lxv vs25 , 16(CO) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask -#ifndef TRMMKERNEL - lxv vs26 , 32(CO) - lxv vs27 , 48(CO) -#endif - xxperm vs1,vs33,permute_mask - xxperm vs5,vs41,permute_mask -#ifndef TRMMKERNEL - lxv vs28 , 0(T1) - lxv vs29 , 16(T1) -#endif - xxperm vs2,vs34,permute_mask - xxperm vs6,vs42,permute_mask -#ifndef TRMMKERNEL - lxv vs30 , 32(T1) - lxv vs31 , 48(T1) -#endif - xxperm vs3,vs35,permute_mask - xxperm vs7,vs43,permute_mask - add T2,CO,T4 - add T3,T1,T4 - AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 - xxperm vs8,vs36,permute_mask - xxperm vs12,vs44,permute_mask - AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 - xxperm vs9,vs37,permute_mask - xxperm 
vs13,vs45,permute_mask - AGGREGATE_REALS_IMAGES vs34,vs2,vs42,vs6 - xxperm vs10,vs38,permute_mask - xxperm vs14,vs46,permute_mask - AGGREGATE_REALS_IMAGES vs35,vs3,vs43,vs7 - xxperm vs11,vs39,permute_mask - xxperm vs15,vs47,permute_mask - AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12 - xxperm vs0,vs48,permute_mask - xxperm vs4,vs56,permute_mask - AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13 - xxperm vs1,vs49,permute_mask - xxperm vs5,vs57,permute_mask - AGGREGATE_REALS_IMAGES vs38,vs10,vs46,vs14 - xxperm vs2,vs50,permute_mask - xxperm vs6,vs58,permute_mask - AGGREGATE_REALS_IMAGES vs39,vs11,vs47,vs15 - xxperm vs3,vs51,permute_mask - xxperm vs7,vs59,permute_mask - AGGREGATE_REALS_IMAGES vs48,vs0,vs56,vs4 - xxperm vs8,vs52,permute_mask - xxperm vs12,vs60,permute_mask - AGGREGATE_REALS_IMAGES vs49,vs1,vs57,vs5 - xxperm vs9,vs53,permute_mask - xxperm vs13,vs61,permute_mask - AGGREGATE_REALS_IMAGES vs50,vs2,vs58,vs6 - xxperm vs10,vs54,permute_mask - xxperm vs14,vs62,permute_mask - AGGREGATE_REALS_IMAGES vs51,vs3,vs59,vs7 - xxperm vs11,vs55,permute_mask - xxperm vs15,vs63,permute_mask - AGGREGATE_REALS_IMAGES vs52,vs8,vs60,vs12 - AGGREGATE_REALS_IMAGES vs53,vs9,vs61,vs13 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - AGGREGATE_REALS_IMAGES vs54,vs10,vs62,vs14 - MULT_APLHA_PART1 vs33,vs41,vs2,vs3 - AGGREGATE_REALS_IMAGES vs55,vs11,vs63,vs15 - MULT_APLHA_PART1 vs34,vs42,vs4,vs5 - MULT_APLHA_PART1 vs35,vs43,vs6,vs7 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs33,vs41,vs2,vs3 - MULT_APLHA_PART2 vs34,vs42,vs4,vs5 - MULT_APLHA_PART2 vs35,vs43,vs6,vs7 - #ifndef TRMMKERNEL - lxv vs32 , 0(T2) - lxv vs40 , 16(T2) -#endif - MULT_APLHA_PART1 vs36,vs44,vs8,vs9 - MULT_APLHA_PART1 vs37,vs45,vs10,vs11 -#ifndef TRMMKERNEL - lxv vs33 , 32(T2) - lxv vs41 , 48(T2) -#endif - MULT_APLHA_PART1 vs38,vs46,vs12,vs13 - MULT_APLHA_PART1 vs39,vs47,vs14,vs15 -#ifndef TRMMKERNEL - lxv vs34 , 0(T3) - lxv vs42 , 16(T3) -#endif - MULT_APLHA_PART2 vs36,vs44,vs8,vs9 - MULT_APLHA_PART2 vs37,vs45,vs10,vs11 -#ifndef TRMMKERNEL - lxv vs35 , 32(T3) - lxv vs43 , 48(T3) -#endif - MULT_APLHA_PART2 vs38,vs46,vs12,vs13 - MULT_APLHA_PART2 vs39,vs47,vs14,vs15 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, save_permute_1 - xxperm vs2,vs3, save_permute_1 - xxperm vs4,vs5, save_permute_1 - xxperm vs6,vs7, save_permute_1 - xxperm vs8,vs9, save_permute_1 - xxperm vs10,vs11, save_permute_1 - xxperm vs12,vs13, save_permute_1 - xxperm vs14,vs15, save_permute_1 -#ifndef TRMMKERNEL - /* add */ - xxpermdi vs1,vs8,vs0,2 - xxpermdi vs3,vs10,vs2,2 - xxpermdi vs5,vs12,vs4,2 - xxpermdi vs7,vs14,vs6,2 - xxpermdi vs9,vs0,vs8,2 - xxpermdi vs11,vs2,vs10,2 - xvaddsp vs24,vs24,vs1 - xvaddsp vs25,vs25,vs3 - xxpermdi vs13,vs4,vs12,2 - xxpermdi vs15,vs6,vs14,2 - xvaddsp vs26,vs26,vs5 - xvaddsp vs27,vs27,vs7 - xvaddsp vs28,vs28,vs9 - xvaddsp vs29,vs29,vs11 - xvaddsp vs30,vs30,vs13 - xvaddsp vs31,vs31,vs15 -#else - xxpermdi vs24,vs8,vs0,2 - xxpermdi vs25,vs10,vs2,2 - xxpermdi vs26,vs12,vs4,2 - xxpermdi vs27,vs14,vs6,2 - xxpermdi vs28,vs0,vs8,2 - xxpermdi vs29,vs2,vs10,2 - xxpermdi vs30,vs4,vs12,2 - xxpermdi vs31,vs6,vs14,2 -#endif - stxv vs24 , 0(CO) - stxv vs25 , 16(CO) - MULT_APLHA_PART1 vs48,vs56,vs0,vs1 - MULT_APLHA_PART1 vs49,vs57,vs2,vs3 - stxv vs26 , 32(CO) - stxv vs27 , 48(CO) - MULT_APLHA_PART1 vs50,vs58,vs4,vs5 - MULT_APLHA_PART1 vs51,vs59,vs6,vs7 - stxv vs28 , 0(T1) - stxv vs29 , 16(T1) - MULT_APLHA_PART2 vs48,vs56,vs0,vs1 - MULT_APLHA_PART2 vs49,vs57,vs2,vs3 - stxv vs30 , 32(T1) - stxv vs31 , 48(T1) - MULT_APLHA_PART2 
vs50,vs58,vs4,vs5 - MULT_APLHA_PART2 vs51,vs59,vs6,vs7 - MULT_APLHA_PART1 vs52,vs60,vs8,vs9 - MULT_APLHA_PART1 vs53,vs61,vs10,vs11 - xxperm vs0,vs1, save_permute_1 - xxperm vs2,vs3, save_permute_1 - MULT_APLHA_PART1 vs54,vs62,vs12,vs13 - MULT_APLHA_PART1 vs55,vs63,vs14,vs15 - xxperm vs4,vs5, save_permute_1 - xxperm vs6,vs7, save_permute_1 - MULT_APLHA_PART2 vs52,vs60,vs8,vs9 - MULT_APLHA_PART2 vs53,vs61,vs10,vs11 - xxperm vs8,vs9, save_permute_1 - xxperm vs10,vs11, save_permute_1 - MULT_APLHA_PART2 vs54,vs62,vs12,vs13 - MULT_APLHA_PART2 vs55,vs63,vs14,vs15 - xxperm vs12,vs13, save_permute_1 - xxperm vs14,vs15, save_permute_1 -#ifndef TRMMKERNEL - /* add */ - xxpermdi vs1,vs8,vs0,2 - xxpermdi vs3,vs10,vs2,2 - xxpermdi vs5,vs12,vs4,2 - xxpermdi vs7,vs14,vs6,2 - xxpermdi vs9,vs0,vs8,2 - xxpermdi vs11,vs2,vs10,2 - xvaddsp vs32,vs32,vs1 - xvaddsp vs40,vs40,vs3 - xxpermdi vs13,vs4,vs12,2 - xxpermdi vs15,vs6,vs14,2 - xvaddsp vs33,vs33,vs5 - xvaddsp vs41,vs41,vs7 - xvaddsp vs34,vs34,vs9 - xvaddsp vs42,vs42,vs11 - xvaddsp vs35,vs35,vs13 - xvaddsp vs43,vs43,vs15 -#else - xxpermdi vs32,vs8,vs0,2 - xxpermdi vs40,vs10,vs2,2 - xxpermdi vs33,vs12,vs4,2 - xxpermdi vs41,vs14,vs6,2 - xxpermdi vs34,vs0,vs8,2 - xxpermdi vs42,vs2,vs10,2 - xxpermdi vs35,vs4,vs12,2 - xxpermdi vs43,vs6,vs14,2 -#endif - stxv vs32 , 0(T2) - stxv vs40 , 16(T2) - stxv vs33 , 32(T2) - stxv vs41 , 48(T2) - stxv vs34 , 0(T3) - stxv vs42 , 16(T3) - stxv vs35 , 32(T3) - stxv vs43 , 48(T3) - addi CO, CO, 64 -.endm - -/* macros for N=4 and M=4 -**********************************************************************************************/ - -.macro Zero4x4 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - xxlxor vs52, vs52, vs52 - xxlxor vs53, vs53, vs53 - xxlxor vs56, vs56, vs56 - xxlxor vs57, vs57, vs57 - xxlxor vs60, vs60, vs60 - xxlxor vs61, vs61, vs61 -.endm - - -.macro LOAD4x4 - LOAD4x4O 0,0 -.endm - - -.macro LOAD4x4O OffsetA,OffsetB - lxv vs24, (\OffsetB+0)(BO) - lxv vs28, (\OffsetB+16)(BO) - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - lxv vs0, (\OffsetA+0)(AO) - lxv vs1, (\OffsetA+16)(AO) - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 -.endm - - -.macro END4x4_NORMAL - END4x4 AO,BO,32,32 -.endm - - -.macro END4x4_WITHOUT_ADD - END4x4 AO,BO,0,0 -.endm - - -.macro END4x4 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 -.endm - - -.macro LOAD4x4_2 - LOAD4x4_2O 0,0 -.endm - - -.macro LOAD4x4_2O OffsetA,OffsetB - lxv vs8, (\OffsetB)(BO) - lxv vs12, (16+\OffsetB)(BO) - lxv vs24, (32+\OffsetB)(BO) - lxv vs28, (32+16+\OffsetB)(BO) - lxv vs4, (0+\OffsetA)(AO) - lxv vs5, (16+\OffsetA)(AO) - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 - lxv 
vs0, (32+\OffsetA)(AO) - lxv vs1, (32+16+\OffsetA)(AO) - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 -.endm - - -.macro END4x4_2 - /*for load2 offset will be 64 and 64*/ - KERNEL4x4_2 AO,BO, 64,64,0 ,1,1 -.endm - - -.macro KERNEL4x4_E2 OffsetA,OffsetB, Index,IsLast - KERNEL4x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL4x4_L2 OffsetA,OffsetB, Index,IsLast - KERNEL4x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL4x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs48, vs4,vs12 - xvmaddasp vs49, vs5,vs12 - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - xvmaddasp vs56, vs4,vs14 - xvmaddasp vs57, vs5,vs14 -.if \Complete==0 - lxv vs8, DISP8(\Index,\OffsetB)(\BREG) - lxv vs12, DISP8(\Index,16+\OffsetB)(\BREG) -.endif - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs52, vs4,vs13 - xvmaddasp vs53, vs5,vs13 -.if \Complete==0 - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask -.endif - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - xvmaddasp vs60, vs4,vs15 - xvmaddasp vs61, vs5,vs15 -.if \Complete==0 - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 -.endif -.if \Complete==0 - lxv vs4, DISP8(\Index,0+\OffsetA)(\AREG) - lxv vs5, DISP8(\Index,16+\OffsetA)(\AREG) -.endif - -.if \Complete==0 - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 -.if \Complete==0 - lxv vs24, DISP8(\Index,32+\OffsetB)(\BREG) - lxv vs28, DISP8(\Index,32+16+\OffsetB)(\BREG) -.endif - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 -.if \Complete==0 - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask -.endif - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 -.if \Complete==0 - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 -.endif -.if \Complete==0 - lxv vs0, DISP8(\Index,32+\OffsetA)(\AREG) - lxv vs1, DISP8(\Index,32+16+\OffsetA)(\AREG) -.endif - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP8(\Index,\OffsetB) - addi \AREG, \AREG, DISP8(\Index,\OffsetA) -.else - addi \BREG, \BREG, DISP8(\Index,64) - addi \AREG, \AREG, DISP8(\Index,64) -.endif - -.endif -.endm - - -.macro KERNEL4x4 - LOAD4x4 - END4x4 AO, BO, 32,32 -.endm - - -.macro SAVE4x4 - add T4, LDC,LDC - add T1, CO ,LDC -#ifndef TRMMKERNEL - lxv vs24 , 0(CO) - lxv vs25 , 16(CO) -#endif - add T2,CO,T4 - add T3,T1,T4 -#ifndef TRMMKERNEL - lxv vs26 , 0(T1) - lxv vs27 , 16(T1) -#endif - #ifndef TRMMKERNEL - lxv vs28 , 0(T2) - lxv vs29 , 16(T2) -#endif -#ifndef TRMMKERNEL - lxv vs30 , 0(T3) - lxv vs31 , 16(T3) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask - xxperm vs1,vs33,permute_mask - xxperm vs5,vs41,permute_mask - xxperm vs8,vs36,permute_mask - xxperm vs12,vs44,permute_mask - xxperm vs9,vs37,permute_mask - xxperm vs13,vs45,permute_mask - AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 - AGGREGATE_REALS_IMAGES 
vs33,vs1,vs41,vs5 - AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12 - AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13 - xxperm vs0,vs48,permute_mask - xxperm vs4,vs56,permute_mask - xxperm vs1,vs49,permute_mask - xxperm vs5,vs57,permute_mask - xxperm vs8,vs52,permute_mask - xxperm vs12,vs60,permute_mask - xxperm vs9,vs53,permute_mask - xxperm vs13,vs61,permute_mask - AGGREGATE_REALS_IMAGES vs48,vs0,vs56,vs4 - AGGREGATE_REALS_IMAGES vs49,vs1,vs57,vs5 - AGGREGATE_REALS_IMAGES vs52,vs8,vs60,vs12 - AGGREGATE_REALS_IMAGES vs53,vs9,vs61,vs13 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART1 vs33,vs41,vs2,vs3 - MULT_APLHA_PART1 vs36,vs44,vs8,vs9 - MULT_APLHA_PART1 vs37,vs45,vs10,vs11 - MULT_APLHA_PART1 vs48,vs56,vs4,vs5 - MULT_APLHA_PART1 vs49,vs57,vs6,vs7 - MULT_APLHA_PART1 vs52,vs60,vs12,vs13 - MULT_APLHA_PART1 vs53,vs61,vs14,vs15 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs33,vs41,vs2,vs3 - MULT_APLHA_PART2 vs36,vs44,vs8,vs9 - MULT_APLHA_PART2 vs37,vs45,vs10,vs11 - MULT_APLHA_PART2 vs48,vs56,vs4,vs5 - MULT_APLHA_PART2 vs49,vs57,vs6,vs7 - MULT_APLHA_PART2 vs52,vs60,vs12,vs13 - MULT_APLHA_PART2 vs53,vs61,vs14,vs15 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, save_permute_1 - xxperm vs2,vs3, save_permute_1 - xxperm vs8,vs9, save_permute_1 - xxperm vs10,vs11, save_permute_1 - xxperm vs4,vs5, save_permute_1 - xxperm vs6,vs7, save_permute_1 - xxperm vs12,vs13, save_permute_1 - xxperm vs14,vs15, save_permute_1 -#ifndef TRMMKERNEL - /* add */ - xxpermdi vs1,vs8,vs0,2 - xxpermdi vs3,vs10,vs2,2 - xxpermdi vs9,vs0,vs8,2 - xxpermdi vs11,vs2,vs10,2 - xxpermdi vs5,vs12,vs4,2 - xxpermdi vs7,vs14,vs6,2 - xxpermdi vs13,vs4,vs12,2 - xxpermdi vs15,vs6,vs14,2 - xvaddsp vs24,vs24,vs1 - xvaddsp vs25,vs25,vs3 - xvaddsp vs26,vs26,vs9 - xvaddsp vs27,vs27,vs11 - xvaddsp vs28,vs28,vs5 - xvaddsp vs29,vs29,vs7 - xvaddsp vs30,vs30,vs13 - xvaddsp vs31,vs31,vs15 -#else - xxpermdi vs24,vs8,vs0,2 - xxpermdi vs25,vs10,vs2,2 - xxpermdi vs26,vs0,vs8,2 - xxpermdi vs27,vs2,vs10,2 - xxpermdi vs28,vs12,vs4,2 - xxpermdi vs29,vs14,vs6,2 - xxpermdi vs30,vs4,vs12,2 - xxpermdi vs31,vs6,vs14,2 -#endif - stxv vs24 , 0(CO) - stxv vs25 , 16(CO) - stxv vs26 , 0(T1) - stxv vs27 , 16(T1) - stxv vs28 , 0(T2) - stxv vs29 , 16(T2) - stxv vs30 , 0(T3) - stxv vs31 , 16(T3) - addi CO, CO, 32 -.endm - -/* macros for N=4 and M=2 -**********************************************************************************************/ - -.macro Zero4x2 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 -.endm - - -.macro LOAD4x2 - LOAD4x2O 0,0 -.endm - - -.macro LOAD4x2O OffsetA,OffsetB - lxv vs24, (\OffsetA+0)(AO) - lxv vs0, (\OffsetB+0)(BO) - lxv vs1, (\OffsetB+16)(BO) - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 -.endm - - -.macro END4x2_NORMAL - END4x2 AO,BO,16,32 -.endm - - -.macro END4x2_WITHOUT_ADD - END4x2 AO,BO,0,0 -.endm - - -.macro END4x2 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 -.endm - - -.macro LOAD4x2_2 - LOAD4x2_2O 0,0 -.endm - - -.macro LOAD4x2_2O OffsetA,OffsetB - lxv vs8, 
(\OffsetA)(AO) - lxv vs24, (16+\OffsetA)(AO) - lxv vs4, (0+\OffsetB)(BO) - lxv vs5, (16+\OffsetB)(BO) - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - lxv vs0, (32+\OffsetB)(BO) - lxv vs1, (32+16+\OffsetB)(BO) - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs27, vs26, vs26,2 -.endm - - -.macro END4x2_2 - /*for load2 offset will be 32 and 64*/ - KERNEL4x2_2 AO,BO, 32,64,0 ,1,1 -.endm - - -.macro KERNEL4x2_E2 OffsetA,OffsetB, Index,IsLast - KERNEL4x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL4x2_L2 OffsetA,OffsetB, Index,IsLast - KERNEL4x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL4x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 -.if \Complete==0 - lxv vs8, DISP4(\Index,\OffsetA)(\AREG) -.endif - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 -.if \Complete==0 - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 -.endif -.if \Complete==0 - lxv vs4, DISP8(\Index,0+\OffsetB)(\BREG) - lxv vs5, DISP8(\Index,16+\OffsetB)(\BREG) -.endif - -.if \Complete==0 - xxpermdi vs11, vs10, vs10,2 -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 -.if \Complete==0 - lxv vs24, DISP4(\Index,16+\OffsetA)(\AREG) -.endif - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 -.if \Complete==0 - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 -.endif -.if \Complete==0 - lxv vs0, DISP8(\Index,32+\OffsetB)(\BREG) - lxv vs1, DISP8(\Index,32+16+\OffsetB)(\BREG) -.endif - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP4(\Index,\OffsetA) - addi \BREG, \BREG, DISP8(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP4(\Index,32) - addi \BREG, \BREG, DISP8(\Index,64) -.endif - -.endif -.endm - - -.macro KERNEL4x2 - LOAD4x2 - END4x2 AO, BO, 16,32 -.endm - - -.macro SAVE4x2 - add T4, LDC,LDC - add T1, CO ,LDC - add T2,CO,T4 - add T3,T1,T4 -#ifndef TRMMKERNEL - lxv vs24 , 0(CO) -#endif -#ifndef TRMMKERNEL - lxv vs25 , 0(T1) -#endif -#ifndef TRMMKERNEL - lxv vs26 , 0(T2) -#endif -#ifndef TRMMKERNEL - lxv vs27 , 0(T3) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask - xxperm vs1,vs33,permute_mask - xxperm vs5,vs41,permute_mask - xxperm vs8,vs36,permute_mask - xxperm vs12,vs44,permute_mask - xxperm vs9,vs37,permute_mask - xxperm vs13,vs45,permute_mask - AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4 - AGGREGATE_REALS_IMAGES_A_PERMUTE vs33,vs1,vs41,vs5 - AGGREGATE_REALS_IMAGES_A_PERMUTE vs36,vs8,vs44,vs12 - AGGREGATE_REALS_IMAGES_A_PERMUTE vs37,vs9,vs45,vs13 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART1 vs33,vs41,vs2,vs3 - MULT_APLHA_PART1 vs36,vs44,vs8,vs9 - MULT_APLHA_PART1 vs37,vs45,vs10,vs11 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs33,vs41,vs2,vs3 - MULT_APLHA_PART2 vs36,vs44,vs8,vs9 - MULT_APLHA_PART2 vs37,vs45,vs10,vs11 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, save_permute_1 - xxperm vs2,vs3, save_permute_1 - xxperm vs8,vs9, save_permute_1 - xxperm vs10,vs11, save_permute_1 -#ifndef TRMMKERNEL - /* add */ - xxpermdi vs1,vs8,vs0,0 - xxpermdi vs9,vs10,vs2,0 - xxpermdi vs3,vs0,vs8,3 - xxpermdi vs11,vs2,vs10,3 - xvaddsp vs24,vs24,vs1 - 
xvaddsp vs26,vs26,vs9 - xvaddsp vs25,vs25,vs3 - xvaddsp vs27,vs27,vs11 -#else - xxpermdi vs24,vs8,vs0,0 - xxpermdi vs26,vs10,vs2,0 - xxpermdi vs25,vs0,vs8,3 - xxpermdi vs27,vs2,vs10,3 -#endif - stxv vs24 , 0(CO) - stxv vs25 , 0(T1) - stxv vs26 , 0(T2) - stxv vs27 , 0(T3) - addi CO, CO, 16 -.endm - -/* macros for N=4 and M=2 -**********************************************************************************************/ - -.macro Zero4x1 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 -.endm - - -.macro LOAD4x1 - LOAD4x1O 0,0 -.endm - - -.macro LOAD4x1O OffsetA,OffsetB - lxsd v4, (\OffsetA+0)(AO) - lxv vs0, (\OffsetB+0)(BO) - lxv vs1, (\OffsetB+16)(BO) - xxspltd vs24,vs36,0 - xxperm vs26, vs24, permute_mask -.endm - - -.macro END4x1_NORMAL - END4x1 AO,BO,8,32 -.endm - - -.macro END4x1_WITHOUT_ADD - END4x1 AO,BO,0,0 -.endm - - -.macro END4x1 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 -.endm - - -.macro LOAD4x1_2 - LOAD4x1_2O 0,0 -.endm - - -.macro LOAD4x1_2O OffsetA,OffsetB - lxv vs27, (\OffsetA)(AO) - xxspltd vs8,vs27,1 - xxspltd vs24,vs27,0 - lxv vs4, (0+\OffsetB)(BO) - lxv vs5, (16+\OffsetB)(BO) - xxperm vs10, vs8, permute_mask - xxperm vs26, vs24, permute_mask - lxv vs0, (32+\OffsetB)(BO) - lxv vs1, (32+16+\OffsetB)(BO) -.endm - - -.macro END4x1_2 - /*for load2 offset will be 16 and 64*/ - KERNEL4x1_2 AO,BO, 16,64,0 ,1,1 -.endm - - -.macro KERNEL4x1_E2 OffsetA,OffsetB, Index,IsLast - KERNEL4x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL4x1_L2 OffsetA,OffsetB, Index,IsLast - KERNEL4x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL4x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 -.if \Complete==0 - lxv vs27, DISP2(\Index,\OffsetA)(\AREG) - xxspltd vs8,vs27,1 -.endif -.if \Complete==0 - lxv vs4, DISP8(\Index,0+\OffsetB)(\BREG) - lxv vs5, DISP8(\Index,16+\OffsetB)(\BREG) -.endif - -.if \Complete==0 - xxperm vs10, vs8, permute_mask -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 -.if \Complete==0 - xxspltd vs24,vs27,0 - xxperm vs26, vs24, permute_mask -.endif -.if \Complete==0 - lxv vs0, DISP8(\Index,32+\OffsetB)(\BREG) - lxv vs1, DISP8(\Index,32+16+\OffsetB)(\BREG) -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP2(\Index,\OffsetA) - addi \BREG, \BREG, DISP8(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP2(\Index,16) - addi \BREG, \BREG, DISP8(\Index,64) -.endif - -.endif -.endm - - -.macro KERNEL4x1 - LOAD4x1 - END4x1 AO, BO, 8,32 -.endm - - -.macro SAVE4x1 - add T4, LDC,LDC - add T1, CO ,LDC - add T2,CO,T4 - add T3,T1,T4 -#ifndef TRMMKERNEL - lxsd v4 , 0(CO) -#endif -#ifndef TRMMKERNEL - lxsd v5 , 0(T1) -#endif -#ifndef TRMMKERNEL - lxsd v6 , 0(T2) -#endif -#ifndef TRMMKERNEL - lxsd v7 , 0(T3) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask - xxperm vs1,vs33,permute_mask - xxperm vs5,vs41,permute_mask - AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4 - AGGREGATE_REALS_IMAGES_A_PERMUTE vs33,vs1,vs41,vs5 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART1 vs33,vs41,vs2,vs3 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 - 
MULT_APLHA_PART2 vs33,vs41,vs2,vs3 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, save_permute_1 - xxperm vs2,vs3, save_permute_1 -#ifndef TRMMKERNEL - /* add */ - xxspltd vs1,vs0,0 - xxspltd vs3,vs0,1 - xxspltd vs9,vs2,0 - xxspltd vs11,vs2,1 - /*--v4==vs36 v5==vs37 v6==vs38 v7==vs39---*/ - xvaddsp vs36,vs36,vs1 - xvaddsp vs37,vs37,vs3 - xvaddsp vs38,vs38,vs9 - xvaddsp vs39,vs39,vs11 -#else - /*--v4==vs36 v5==vs37 v6==vs38 v7==vs39---*/ - xxspltd vs36,vs0,0 - xxspltd vs37,vs0,1 - xxspltd vs38,vs2,0 - xxspltd vs39,vs2,1 -#endif - stxsd v4 , 0(CO) - stxsd v5 , 0(T1) - stxsd v6 , 0(T2) - stxsd v7 , 0(T3) - addi CO, CO, 8 -.endm - -/* macros for N=2 and M=8 -**********************************************************************************************/ - -.macro Zero2x8 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 -.endm - - -.macro LOAD2x8 - LOAD2x8O 0,0 -.endm - - -.macro LOAD2x8O OffsetA,OffsetB - lxv vs24, (\OffsetB+0)(BO) - xxperm vs26, vs24, permute_mask - lxv vs0, (\OffsetA+0)(AO) - lxv vs1, (\OffsetA+16)(AO) - lxv vs2, (\OffsetA+32)(AO) - lxv vs3, (\OffsetA+48)(AO) - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 -.endm - - -.macro END2x8_NORMAL - END2x8 AO,BO,64,16 -.endm - - -.macro END2x8_WITHOUT_ADD - END2x8 AO,BO,0,0 -.endm - - -.macro END2x8 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 -.endm - - -.macro LOAD2x8_2 - LOAD2x8_2O 0,0 -.endm - - -.macro LOAD2x8_2O OffsetA,OffsetB - lxv vs8, (\OffsetB)(BO) - lxv vs24, (16+\OffsetB)(BO) - lxv vs4, (0+\OffsetA)(AO) - lxv vs5, (16+\OffsetA)(AO) - xxperm vs10, vs8, permute_mask - xxperm vs26, vs24, permute_mask - lxv vs6, (32+\OffsetA)(AO) - lxv vs7, (48+\OffsetA)(AO) - lxv vs0, (64+\OffsetA)(AO) - lxv vs1, (64+16+\OffsetA)(AO) - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs25, vs24, vs24,2 - lxv vs2, (64+32+\OffsetA)(AO) - lxv vs3, (64+48+\OffsetA)(AO) - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs27, vs26, vs26,2 -.endm - - -.macro END2x8_2 - /*for load2 offset will be 128 and 32*/ - KERNEL2x8_2 AO,BO, 128,32,0 ,1,1 -.endm - - -.macro KERNEL2x8_E2 OffsetA,OffsetB, Index,IsLast - KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL2x8_L2 OffsetA,OffsetB, Index,IsLast - KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL2x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 -.if \Complete==0 - lxv vs4, DISP16(\Index,0+\OffsetA)(\AREG) - lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) 
-.endif - - xvmaddasp vs34, vs6,vs8 - xvmaddasp vs35, vs7,vs8 -.if \Complete==0 - lxv vs8, DISP4(\Index,\OffsetB)(\BREG) -.endif - xvmaddasp vs42, vs6,vs10 - xvmaddasp vs43, vs7,vs10 - xvmaddasp vs38, vs6,vs9 - xvmaddasp vs39, vs7,vs9 -.if \Complete==0 - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 -.endif - xvmaddasp vs46, vs6,vs11 - xvmaddasp vs47, vs7,vs11 -.if \Complete==0 - xxpermdi vs11, vs10, vs10,2 -.endif -.if \Complete==0 - lxv vs6, DISP16(\Index,32+\OffsetA)(\AREG) - lxv vs7, DISP16(\Index,48+\OffsetA)(\AREG) -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 -.if \Complete==0 - lxv vs0, DISP16(\Index,64+\OffsetA)(\AREG) - lxv vs1, DISP16(\Index,64+16+\OffsetA)(\AREG) -.endif - - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 -.if \Complete==0 - lxv vs24, DISP4(\Index,16+\OffsetB)(\BREG) -.endif - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 -.if \Complete==0 - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 -.endif - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 -.endif - -.if \Complete==0 - lxv vs2, DISP16(\Index,64+32+\OffsetA)(\AREG) - lxv vs3, DISP16(\Index,64+48+\OffsetA)(\AREG) -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP4(\Index,\OffsetB) - addi \AREG, \AREG, DISP16(\Index,\OffsetA) -.else - addi \BREG, \BREG, DISP4(\Index,32) - addi \AREG, \AREG, DISP16(\Index,128) -.endif - -.endif -.endm - - -.macro KERNEL2x8 - LOAD2x8 - END2x8 AO, BO, 64,16 -.endm - - -.macro SAVE2x8 - add T1, CO ,LDC -#ifndef TRMMKERNEL - lxv vs24 , 0(CO) - lxv vs25 , 16(CO) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask -#ifndef TRMMKERNEL - lxv vs26 , 32(CO) - lxv vs27 , 48(CO) -#endif - xxperm vs1,vs33,permute_mask - xxperm vs5,vs41,permute_mask -#ifndef TRMMKERNEL - lxv vs28 , 0(T1) - lxv vs29 , 16(T1) -#endif - xxperm vs2,vs34,permute_mask - xxperm vs6,vs42,permute_mask -#ifndef TRMMKERNEL - lxv vs30 , 32(T1) - lxv vs31 , 48(T1) -#endif - xxperm vs3,vs35,permute_mask - xxperm vs7,vs43,permute_mask - add T2,CO,T4 - add T3,T1,T4 - AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 - xxperm vs8,vs36,permute_mask - xxperm vs12,vs44,permute_mask - AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 - xxperm vs9,vs37,permute_mask - xxperm vs13,vs45,permute_mask - AGGREGATE_REALS_IMAGES vs34,vs2,vs42,vs6 - xxperm vs10,vs38,permute_mask - xxperm vs14,vs46,permute_mask - AGGREGATE_REALS_IMAGES vs35,vs3,vs43,vs7 - xxperm vs11,vs39,permute_mask - xxperm vs15,vs47,permute_mask - AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12 - AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13 - AGGREGATE_REALS_IMAGES vs38,vs10,vs46,vs14 - AGGREGATE_REALS_IMAGES vs39,vs11,vs47,vs15 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART1 vs33,vs41,vs2,vs3 - MULT_APLHA_PART1 vs34,vs42,vs4,vs5 - MULT_APLHA_PART1 vs35,vs43,vs6,vs7 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs33,vs41,vs2,vs3 - MULT_APLHA_PART2 vs34,vs42,vs4,vs5 - MULT_APLHA_PART2 vs35,vs43,vs6,vs7 - MULT_APLHA_PART1 vs36,vs44,vs8,vs9 - MULT_APLHA_PART1 vs37,vs45,vs10,vs11 - MULT_APLHA_PART1 vs38,vs46,vs12,vs13 - MULT_APLHA_PART1 vs39,vs47,vs14,vs15 - MULT_APLHA_PART2 vs36,vs44,vs8,vs9 - MULT_APLHA_PART2 vs37,vs45,vs10,vs11 - MULT_APLHA_PART2 vs38,vs46,vs12,vs13 - MULT_APLHA_PART2 
vs39,vs47,vs14,vs15 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, save_permute_1 - xxperm vs2,vs3, save_permute_1 - xxperm vs4,vs5, save_permute_1 - xxperm vs6,vs7, save_permute_1 - xxperm vs8,vs9, save_permute_1 - xxperm vs10,vs11, save_permute_1 - xxperm vs12,vs13, save_permute_1 - xxperm vs14,vs15, save_permute_1 -#ifndef TRMMKERNEL - /* add */ - xxpermdi vs1,vs8,vs0,2 - xxpermdi vs3,vs10,vs2,2 - xxpermdi vs5,vs12,vs4,2 - xxpermdi vs7,vs14,vs6,2 - xxpermdi vs9,vs0,vs8,2 - xxpermdi vs11,vs2,vs10,2 - xvaddsp vs24,vs24,vs1 - xvaddsp vs25,vs25,vs3 - xxpermdi vs13,vs4,vs12,2 - xxpermdi vs15,vs6,vs14,2 - xvaddsp vs26,vs26,vs5 - xvaddsp vs27,vs27,vs7 - xvaddsp vs28,vs28,vs9 - xvaddsp vs29,vs29,vs11 - xvaddsp vs30,vs30,vs13 - xvaddsp vs31,vs31,vs15 -#else - xxpermdi vs24,vs8,vs0,2 - xxpermdi vs25,vs10,vs2,2 - xxpermdi vs26,vs12,vs4,2 - xxpermdi vs27,vs14,vs6,2 - xxpermdi vs28,vs0,vs8,2 - xxpermdi vs29,vs2,vs10,2 - xxpermdi vs30,vs4,vs12,2 - xxpermdi vs31,vs6,vs14,2 -#endif - stxv vs24 , 0(CO) - stxv vs25 , 16(CO) - stxv vs26 , 32(CO) - stxv vs27 , 48(CO) - stxv vs28 , 0(T1) - stxv vs29 , 16(T1) - stxv vs30 , 32(T1) - stxv vs31 , 48(T1) - addi CO, CO, 64 -.endm - -/* macros for N=2 and M=4 -**********************************************************************************************/ - -.macro Zero2x4 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 -.endm - - -.macro LOAD2x4 - LOAD2x4O 0,0 -.endm - - -.macro LOAD2x4O OffsetA,OffsetB - lxv vs24, (\OffsetB+0)(BO) - lxv vs0, (\OffsetA+0)(AO) - lxv vs1, (\OffsetA+16)(AO) - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 -.endm - - -.macro END2x4_NORMAL - END2x4 AO,BO,32,16 -.endm - - -.macro END2x4_WITHOUT_ADD - END2x4 AO,BO,0,0 -.endm - - -.macro END2x4 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 -.endm - - -.macro LOAD2x4_2 - LOAD2x4_2O 0,0 -.endm - - -.macro LOAD2x4_2O OffsetA,OffsetB - lxv vs8, (\OffsetB)(BO) - lxv vs24, (16+\OffsetB)(BO) - lxv vs4, (0+\OffsetA)(AO) - lxv vs5, (16+\OffsetA)(AO) - xxperm vs10, vs8, permute_mask - xxperm vs26, vs24, permute_mask - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs25, vs24, vs24,2 - lxv vs0, (32+\OffsetA)(AO) - lxv vs1, (32+16+\OffsetA)(AO) - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs27, vs26, vs26,2 -.endm - - -.macro END2x4_2 - /*for load2 offset will be 64 and 32*/ - KERNEL2x4_2 AO,BO, 64,32,0 ,1,1 -.endm - - -.macro KERNEL2x4_E2 OffsetA,OffsetB, Index,IsLast - KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL2x4_L2 OffsetA,OffsetB, Index,IsLast - KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL2x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 -.if \Complete==0 - lxv vs8, DISP4(\Index,\OffsetB)(\BREG) -.endif - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 -.if \Complete==0 - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 -.endif -.if \Complete==0 - 
lxv vs4, DISP8(\Index,0+\OffsetA)(\AREG) - lxv vs5, DISP8(\Index,16+\OffsetA)(\AREG) -.endif - -.if \Complete==0 - xxpermdi vs11, vs10, vs10,2 -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 -.if \Complete==0 - lxv vs24, DISP4(\Index,16+\OffsetB)(\BREG) -.endif - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 -.if \Complete==0 - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 -.endif -.if \Complete==0 - lxv vs0, DISP8(\Index,32+\OffsetA)(\AREG) - lxv vs1, DISP8(\Index,32+16+\OffsetA)(\AREG) -.endif - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP4(\Index,\OffsetB) - addi \AREG, \AREG, DISP8(\Index,\OffsetA) -.else - addi \BREG, \BREG, DISP4(\Index,32) - addi \AREG, \AREG, DISP8(\Index,64) -.endif - -.endif -.endm - - -.macro KERNEL2x4 - LOAD2x4 - END2x4 AO, BO, 32,16 -.endm - - -.macro SAVE2x4 - add T1, CO ,LDC -#ifndef TRMMKERNEL - lxv vs24 , 0(CO) - lxv vs25 , 16(CO) -#endif -#ifndef TRMMKERNEL - lxv vs26 , 0(T1) - lxv vs27 , 16(T1) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask - xxperm vs1,vs33,permute_mask - xxperm vs5,vs41,permute_mask - xxperm vs8,vs36,permute_mask - xxperm vs12,vs44,permute_mask - xxperm vs9,vs37,permute_mask - xxperm vs13,vs45,permute_mask - AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 - AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 - AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12 - AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART1 vs33,vs41,vs2,vs3 - MULT_APLHA_PART1 vs36,vs44,vs8,vs9 - MULT_APLHA_PART1 vs37,vs45,vs10,vs11 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs33,vs41,vs2,vs3 - MULT_APLHA_PART2 vs36,vs44,vs8,vs9 - MULT_APLHA_PART2 vs37,vs45,vs10,vs11 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, save_permute_1 - xxperm vs2,vs3, save_permute_1 - xxperm vs8,vs9, save_permute_1 - xxperm vs10,vs11, save_permute_1 -#ifndef TRMMKERNEL - /* add */ - xxpermdi vs1,vs8,vs0,2 - xxpermdi vs3,vs10,vs2,2 - xxpermdi vs9,vs0,vs8,2 - xxpermdi vs11,vs2,vs10,2 - xvaddsp vs24,vs24,vs1 - xvaddsp vs25,vs25,vs3 - xvaddsp vs26,vs26,vs9 - xvaddsp vs27,vs27,vs11 -#else - xxpermdi vs24,vs8,vs0,2 - xxpermdi vs25,vs10,vs2,2 - xxpermdi vs26,vs0,vs8,2 - xxpermdi vs27,vs2,vs10,2 -#endif - stxv vs24 , 0(CO) - stxv vs25 , 16(CO) - stxv vs26 , 0(T1) - stxv vs27 , 16(T1) - addi CO, CO, 32 -.endm - -/* macros for N=2 and M=2 -**********************************************************************************************/ - -.macro Zero2x2 - xxlxor vs32, vs32, vs32 - xxlxor vs36, vs36, vs36 - xxlxor vs40, vs40, vs40 - xxlxor vs44, vs44, vs44 -.endm - - -.macro LOAD2x2 - LOAD2x2O 0,0 -.endm - - -.macro LOAD2x2O OffsetA,OffsetB - lxv vs24, (\OffsetA+0)(AO) - lxv vs0, (\OffsetB+0)(BO) - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 -.endm - - -.macro END2x2_NORMAL - END2x2 AO,BO,16,16 -.endm - - -.macro END2x2_WITHOUT_ADD - END2x2 AO,BO,0,0 -.endm - - -.macro END2x2 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs44, vs0,vs27 -.endm - - -.macro LOAD2x2_2 - LOAD2x2_2O 0,0 -.endm - - -.macro LOAD2x2_2O OffsetA,OffsetB - lxv vs8, (\OffsetA)(AO) - lxv 
vs24, (16+\OffsetA)(AO) - lxv vs4, (0+\OffsetB)(BO) - lxv vs0, (16+\OffsetB)(BO) - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs27, vs26, vs26,2 -.endm - - -.macro END2x2_2 - /*for load2 offset will be 32 and 32*/ - KERNEL2x2_2 AO,BO, 32,32,0 ,1,1 -.endm - - -.macro KERNEL2x2_E2 OffsetA,OffsetB, Index,IsLast - KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL2x2_L2 OffsetA,OffsetB, Index,IsLast - KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL2x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs40, vs4,vs10 -.if \Complete==0 - lxv vs8, DISP4(\Index,\OffsetA)(\AREG) -.endif - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs44, vs4,vs11 -.if \Complete==0 - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 -.endif -.if \Complete==0 - lxv vs4, DISP4(\Index,0+\OffsetB)(\BREG) -.endif - -.if \Complete==0 - xxpermdi vs11, vs10, vs10,2 -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs40, vs0,vs26 -.if \Complete==0 - lxv vs24, DISP4(\Index,16+\OffsetA)(\AREG) -.endif - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs44, vs0,vs27 -.if \Complete==0 - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 -.endif -.if \Complete==0 - lxv vs0, DISP4(\Index,16+\OffsetB)(\BREG) -.endif - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP4(\Index,\OffsetA) - addi \BREG, \BREG, DISP4(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP4(\Index,32) - addi \BREG, \BREG, DISP4(\Index,32) -.endif - -.endif -.endm - - -.macro KERNEL2x2 - LOAD2x2 - END2x2 AO, BO, 16,16 -.endm - - -.macro SAVE2x2 - add T1, CO ,LDC -#ifndef TRMMKERNEL - lxv vs24 , 0(CO) -#endif -#ifndef TRMMKERNEL - lxv vs26 , 0(T1) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask - xxperm vs8,vs36,permute_mask - xxperm vs12,vs44,permute_mask - AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4 - AGGREGATE_REALS_IMAGES_A_PERMUTE vs36,vs8,vs44,vs12 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART1 vs36,vs44,vs8,vs9 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs36,vs44,vs8,vs9 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, save_permute_1 - xxperm vs8,vs9, save_permute_1 -#ifndef TRMMKERNEL - /* add */ - xxpermdi vs1,vs8,vs0,0 - xxpermdi vs9,vs0,vs8,3 - xvaddsp vs24,vs24,vs1 - xvaddsp vs26,vs26,vs9 -#else - xxpermdi vs24,vs8,vs0,0 - xxpermdi vs26,vs0,vs8,3 -#endif - stxv vs24 , 0(CO) - stxv vs26 , 0(T1) - addi CO, CO, 16 -.endm - -/* macros for N=2 and M=1 -**********************************************************************************************/ - -.macro Zero2x1 - xxlxor vs32, vs32, vs32 - xxlxor vs40, vs40, vs40 -.endm - - -.macro LOAD2x1 - LOAD2x1O 0,0 -.endm - - -.macro LOAD2x1O OffsetA,OffsetB - lxsd v4, (\OffsetA+0)(AO) - lxv vs0, (\OffsetB+0)(BO) - xxspltd vs24,vs36,0 - xxperm vs26, vs24, permute_mask -.endm - - -.macro END2x1_NORMAL - END2x1 AO,BO,8,16 -.endm - - -.macro END2x1_WITHOUT_ADD - END2x1 AO,BO,0,0 -.endm - - -.macro END2x1 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs40, vs0,vs26 -.endm - - -.macro LOAD2x1_2 - LOAD2x1_2O 0,0 -.endm - - -.macro LOAD2x1_2O OffsetA,OffsetB - lxv vs27, (\OffsetA)(AO) - lxv vs4, (0+\OffsetB)(BO) - lxv 
vs0, (16+\OffsetB)(BO) - xxspltd vs8,vs27,1 - xxspltd vs24,vs27,0 - xxperm vs10, vs8, permute_mask - xxperm vs26, vs24, permute_mask -.endm - - -.macro END2x1_2 - /*for load2 offset will be 16 and 32*/ - KERNEL2x1_2 AO,BO, 16,32,0 ,1,1 -.endm - - -.macro KERNEL2x1_E2 OffsetA,OffsetB, Index,IsLast - KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL2x1_L2 OffsetA,OffsetB, Index,IsLast - KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs40, vs4,vs10 -.if \Complete==0 - lxv vs27, DISP2(\Index,\OffsetA)(\AREG) - xxspltd vs8,vs27,1 -.endif -.if \Complete==0 - lxv vs4, DISP4(\Index,0+\OffsetB)(\BREG) -.endif - -.if \Complete==0 - xxperm vs10, vs8, permute_mask -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs40, vs0,vs26 -.if \Complete==0 - xxspltd vs24,vs27,0 - xxperm vs26, vs24, permute_mask -.endif -.if \Complete==0 - lxv vs0, DISP4(\Index,16+\OffsetB)(\BREG) -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP2(\Index,\OffsetA) - addi \BREG, \BREG, DISP4(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP2(\Index,16) - addi \BREG, \BREG, DISP4(\Index,32) -.endif - -.endif -.endm - - -.macro KERNEL2x1 - LOAD2x1 - END2x1 AO, BO, 8,16 -.endm - - -.macro SAVE2x1 - add T1, CO ,LDC -#ifndef TRMMKERNEL - lxsd v4 , 0(CO) -#endif -#ifndef TRMMKERNEL - lxsd v5 , 0(T1) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask - AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4 - AGGREGATE_REALS_IMAGES_A_PERMUTE vs33,vs1,vs41,vs5 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, save_permute_1 -#ifndef TRMMKERNEL - /* add */ - xxspltd vs1,vs0,0 - xxspltd vs3,vs0,1 - /*--v4==vs36 v5==vs37---*/ - xvaddsp vs36,vs36,vs1 - xvaddsp vs37,vs37,vs3 -#else - /*--v4==vs36 v5==vs37---*/ - xxspltd vs36,vs0,0 - xxspltd vs37,vs0,1 -#endif - stxsd v4 , 0(CO) - stxsd v5 , 0(T1) - addi CO, CO, 8 -.endm - -/* macros for N=1 and M=8 -**********************************************************************************************/ - -.macro Zero1x8 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 -.endm - - -.macro LOAD1x8 - LOAD1x8O 0,0 -.endm - - -.macro LOAD1x8O OffsetA,OffsetB - lxsd vs4, (\OffsetB+0)(BO) - lxv vs0, (\OffsetA+0)(AO) - lxv vs1, (\OffsetA+16)(AO) - lxv vs2, (\OffsetA+32)(AO) - lxv vs3, (\OffsetA+48)(AO) - xxspltd vs24,vs36,0 - xxperm vs26, vs24, permute_mask -.endm - - -.macro END1x8_NORMAL - END1x8 AO,BO,64,8 -.endm - - -.macro END1x8_WITHOUT_ADD - END1x8 AO,BO,0,0 -.endm - - -.macro END1x8 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 -.endm - - -.macro LOAD1x8_2 - LOAD1x8_2O 0,0 -.endm - - -.macro LOAD1x8_2O OffsetA,OffsetB - lxv vs27, (\OffsetB)(BO) - lxv vs4, (0+\OffsetA)(AO) - lxv vs5, (16+\OffsetA)(AO) - xxspltd vs8,vs27,1 - xxspltd vs24,vs27,0 - lxv vs6, (32+\OffsetA)(AO) - lxv vs7, (48+\OffsetA)(AO) - lxv vs0, (64+\OffsetA)(AO) - lxv vs1, 
(64+16+\OffsetA)(AO) - lxv vs2, (64+32+\OffsetA)(AO) - lxv vs3, (64+48+\OffsetA)(AO) - xxperm vs10, vs8, permute_mask - xxperm vs26, vs24, permute_mask -.endm - - -.macro END1x8_2 - /*for load2 offset will be 128 and 16*/ - KERNEL1x8_2 AO,BO, 128,16,0 ,1,1 -.endm - - -.macro KERNEL1x8_E2 OffsetA,OffsetB, Index,IsLast - KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL1x8_L2 OffsetA,OffsetB, Index,IsLast - KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL1x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete -.if \Complete==0 - lxv vs27, DISP2(\Index,\OffsetB)(\BREG) -.endif - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 -.if \Complete==0 - lxv vs4, DISP16(\Index,0+\OffsetA)(\AREG) - lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) -.endif - - xvmaddasp vs34, vs6,vs8 - xvmaddasp vs35, vs7,vs8 - xvmaddasp vs42, vs6,vs10 - xvmaddasp vs43, vs7,vs10 -.if \Complete==0 - lxv vs6, DISP16(\Index,32+\OffsetA)(\AREG) - lxv vs7, DISP16(\Index,48+\OffsetA)(\AREG) -.endif -.if \Complete==0 - xxspltd vs8,vs27,1 - xxperm vs10, vs8, permute_mask -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 -.if \Complete==0 - lxv vs0, DISP16(\Index,64+\OffsetA)(\AREG) - lxv vs1, DISP16(\Index,64+16+\OffsetA)(\AREG) -.endif - - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 -.if \Complete==0 - xxspltd vs24,vs27,0 - xxperm vs26, vs24, permute_mask -.endif -.if \Complete==0 - lxv vs2, DISP16(\Index,64+32+\OffsetA)(\AREG) - lxv vs3, DISP16(\Index,64+48+\OffsetA)(\AREG) -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP2(\Index,\OffsetB) - addi \AREG, \AREG, DISP16(\Index,\OffsetA) -.else - addi \BREG, \BREG, DISP2(\Index,16) - addi \AREG, \AREG, DISP16(\Index,128) -.endif - -.endif -.endm - - -.macro KERNEL1x8 - LOAD1x8 - END1x8 AO, BO, 64,8 -.endm - - -.macro SAVE1x8 -#ifndef TRMMKERNEL - lxv vs24 , 0(CO) - lxv vs25 , 16(CO) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask -#ifndef TRMMKERNEL - lxv vs26 , 32(CO) - lxv vs27 , 48(CO) -#endif - xxperm vs1,vs33,permute_mask - xxperm vs5,vs41,permute_mask - xxperm vs2,vs34,permute_mask - xxperm vs6,vs42,permute_mask - xxperm vs3,vs35,permute_mask - xxperm vs7,vs43,permute_mask - AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 - AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 - AGGREGATE_REALS_IMAGES vs34,vs2,vs42,vs6 - AGGREGATE_REALS_IMAGES vs35,vs3,vs43,vs7 - /*inner reverse save_permute and store vs28 */ - xxpermdi vs28,save_permute_1,save_permute_1,2 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART1 vs33,vs41,vs2,vs3 - MULT_APLHA_PART1 vs34,vs42,vs4,vs5 - MULT_APLHA_PART1 vs35,vs43,vs6,vs7 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs33,vs41,vs2,vs3 - MULT_APLHA_PART2 vs34,vs42,vs4,vs5 - MULT_APLHA_PART2 vs35,vs43,vs6,vs7 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, vs28 - xxperm vs2,vs3, vs28 - xxperm vs4,vs5, vs28 - xxperm vs6,vs7, vs28 -#ifndef TRMMKERNEL - /* add */ - xvaddsp vs24,vs24,vs0 - xvaddsp vs25,vs25,vs2 - xvaddsp vs26,vs26,vs4 - xvaddsp vs27,vs27,vs6 - stxv vs24 , 0(CO) - stxv vs25 , 16(CO) - stxv vs26 , 32(CO) - stxv vs27 , 48(CO) -#else -/* reconstruct r,i pairs*/ - stxv vs0 , 0(CO) - stxv vs2 , 16(CO) - stxv vs4 , 32(CO) - stxv vs6 , 48(CO) -#endif - addi CO, CO, 64 -.endm - -/* macros for N=1 and M=4 
-**********************************************************************************************/ - -.macro Zero1x4 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 -.endm - - -.macro LOAD1x4 - LOAD1x4O 0,0 -.endm - - -.macro LOAD1x4O OffsetA,OffsetB - lxsd vs4, (\OffsetB+0)(BO) - lxv vs0, (\OffsetA+0)(AO) - lxv vs1, (\OffsetA+16)(AO) - xxspltd vs24,vs36,0 - xxperm vs26, vs24, permute_mask -.endm - - -.macro END1x4_NORMAL - END1x4 AO,BO,32,8 -.endm - - -.macro END1x4_WITHOUT_ADD - END1x4 AO,BO,0,0 -.endm - - -.macro END1x4 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 -.endm - - -.macro LOAD1x4_2 - LOAD1x4_2O 0,0 -.endm - - -.macro LOAD1x4_2O OffsetA,OffsetB - lxv vs27, (\OffsetB)(BO) - lxv vs4, (0+\OffsetA)(AO) - lxv vs5, (16+\OffsetA)(AO) - xxspltd vs8,vs27,1 - xxspltd vs24,vs27,0 - lxv vs0, (32+\OffsetA)(AO) - lxv vs1, (32+16+\OffsetA)(AO) - xxperm vs10, vs8, permute_mask - xxperm vs26, vs24, permute_mask -.endm - - -.macro END1x4_2 - /*for load2 offset will be 64 and 16*/ - KERNEL1x4_2 AO,BO, 64,16,0 ,1,1 -.endm - - -.macro KERNEL1x4_E2 OffsetA,OffsetB, Index,IsLast - KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL1x4_L2 OffsetA,OffsetB, Index,IsLast - KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL1x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete -.if \Complete==0 - lxv vs27, DISP2(\Index,\OffsetB)(\BREG) -.endif - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 -.if \Complete==0 - lxv vs4, DISP8(\Index,0+\OffsetA)(\AREG) - lxv vs5, DISP8(\Index,16+\OffsetA)(\AREG) -.endif - -.if \Complete==0 - xxspltd vs8,vs27,1 - xxperm vs10, vs8, permute_mask -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 -.if \Complete==0 - lxv vs0, DISP8(\Index,32+\OffsetA)(\AREG) - lxv vs1, DISP8(\Index,32+16+\OffsetA)(\AREG) -.endif - -.if \Complete==0 - xxspltd vs24,vs27,0 - xxperm vs26, vs24, permute_mask -.endif -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP2(\Index,\OffsetB) - addi \AREG, \AREG, DISP8(\Index,\OffsetA) -.else - addi \BREG, \BREG, DISP2(\Index,16) - addi \AREG, \AREG, DISP8(\Index,64) -.endif - -.endif -.endm - - -.macro KERNEL1x4 - LOAD1x4 - END1x4 AO, BO, 32,8 -.endm - - -.macro SAVE1x4 -#ifndef TRMMKERNEL - lxv vs24 , 0(CO) - lxv vs25 , 16(CO) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask - xxperm vs1,vs33,permute_mask - xxperm vs5,vs41,permute_mask - AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 - AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 - /*inner reverse save_permute and store vs28 */ - xxpermdi vs28,save_permute_1,save_permute_1,2 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART1 vs33,vs41,vs2,vs3 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs33,vs41,vs2,vs3 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, vs28 - xxperm vs2,vs3, vs28 -#ifndef TRMMKERNEL - /* add */ - xvaddsp vs24,vs24,vs0 - xvaddsp vs25,vs25,vs2 - stxv vs24 , 0(CO) - stxv vs25 , 16(CO) -#else -/* reconstruct r,i pairs*/ - stxv vs0 , 0(CO) - stxv vs2 , 16(CO) -#endif - addi CO, CO, 32 -.endm - -/* macros for N=1 and M=2 
-**********************************************************************************************/ - -.macro Zero1x2 - xxlxor vs32, vs32, vs32 - xxlxor vs40, vs40, vs40 -.endm - - -.macro LOAD1x2 - LOAD1x2O 0,0 -.endm - - -.macro LOAD1x2O OffsetA,OffsetB - lxsd vs4, (\OffsetB+0)(BO) - lxv vs0, (\OffsetA+0)(AO) - xxspltd vs24,vs36,0 - xxperm vs26, vs24, permute_mask -.endm - - -.macro END1x2_NORMAL - END1x2 AO,BO,16,8 -.endm - - -.macro END1x2_WITHOUT_ADD - END1x2 AO,BO,0,0 -.endm - - -.macro END1x2 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs40, vs0,vs26 -.endm - - -.macro LOAD1x2_2 - LOAD1x2_2O 0,0 -.endm - - -.macro LOAD1x2_2O OffsetA,OffsetB - lxv vs27, (\OffsetB)(BO) - lxv vs4, (0+\OffsetA)(AO) - lxv vs0, (16+\OffsetA)(AO) - xxspltd vs8,vs27,1 - xxspltd vs24,vs27,0 - xxperm vs10, vs8, permute_mask - xxperm vs26, vs24, permute_mask -.endm - - -.macro END1x2_2 - /*for load2 offset will be 32 and 16*/ - KERNEL1x2_2 AO,BO, 32,16,0 ,1,1 -.endm - - -.macro KERNEL1x2_E2 OffsetA,OffsetB, Index,IsLast - KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL1x2_L2 OffsetA,OffsetB, Index,IsLast - KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL1x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete -.if \Complete==0 - lxv vs27, DISP2(\Index,\OffsetB)(\BREG) -.endif - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs40, vs4,vs10 -.if \Complete==0 - lxv vs4, DISP4(\Index,0+\OffsetA)(\AREG) -.endif - -.if \Complete==0 - xxspltd vs8,vs27,1 - xxperm vs10, vs8, permute_mask -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs40, vs0,vs26 -.if \Complete==0 - lxv vs0, DISP4(\Index,16+\OffsetA)(\AREG) -.endif - -.if \Complete==0 - xxspltd vs24,vs27,0 - xxperm vs26, vs24, permute_mask -.endif -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP2(\Index,\OffsetB) - addi \AREG, \AREG, DISP4(\Index,\OffsetA) -.else - addi \BREG, \BREG, DISP2(\Index,16) - addi \AREG, \AREG, DISP4(\Index,32) -.endif - -.endif -.endm - - -.macro KERNEL1x2 - LOAD1x2 - END1x2 AO, BO, 16,8 -.endm - - -.macro SAVE1x2 -#ifndef TRMMKERNEL - lxv vs24 , 0(CO) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask - AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 - /*inner reverse save_permute and store vs28 */ - xxpermdi vs28,save_permute_1,save_permute_1,2 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, vs28 -#ifndef TRMMKERNEL - /* add */ - xvaddsp vs24,vs24,vs0 - stxv vs24 , 0(CO) -#else -/* reconstruct r,i pairs*/ - stxv vs0 , 0(CO) -#endif - addi CO, CO, 16 -.endm - -/* macros for N=1 and M=1 -**********************************************************************************************/ -.macro Zero1x1 - xxlxor vs32, vs32, vs32 - xxlxor vs40, vs40, vs40 -.endm - - -.macro LOAD1x1 - LOAD1x1O 0,0 -.endm - - -.macro LOAD1x1O OffsetA,OffsetB - lxsd v4, (\OffsetB+0)(BO) - lxsd v5, (\OffsetA+0)(AO) - xxperm vs38, vs36, permute_mask -.endm - - -.macro END1x1_NORMAL - END1x1 AO,BO,8,8 -.endm - - -.macro END1x1_WITHOUT_ADD - END1x1 AO,BO,0,0 -.endm - - -.macro END1x1 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs37,vs36 - xvmaddasp vs40, vs37,vs38 -.endm - - -.macro LOAD1x1_2 - LOAD1x1_2O 0,0 -.endm - - -.macro 
LOAD1x1_2O OffsetA,OffsetB - lxv vs8, (\OffsetB)(BO) - lxv vs4, (0+\OffsetA)(AO) - xxperm vs10, vs8, permute_mask -.endm - - -.macro END1x1_2 - /*for load2 offset will be 16 and 16*/ - KERNEL1x1_2 AO,BO, 16,16,0 ,1,1 -.endm - - -.macro KERNEL1x1_E2 OffsetA,OffsetB, Index,IsLast - KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL1x1_L2 OffsetA,OffsetB, Index,IsLast - KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs40, vs4,vs10 -.if \Complete==0 - lxv vs8, DISP2(\Index,\OffsetB)(\BREG) - lxv vs4, DISP2(\Index,\OffsetB)(\AREG) - xxperm vs10, vs8, permute_mask -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP2(\Index,\OffsetB) - addi \AREG, \AREG, DISP2(\Index,\OffsetA) -.else - addi \BREG, \BREG, DISP2(\Index,16) - addi \AREG, \AREG, DISP2(\Index,16) -.endif - -.endif -.endm - - -.macro KERNEL1x1 - LOAD1x1 - END1x1 AO, BO, 8,8 -.endm - - -.macro SAVE1x1 -#ifndef TRMMKERNEL - lxsd v4 , 0(CO) -#endif - /*aggregate x2*/ - xxpermdi vs33,vs32,vs32,2 - xxpermdi vs41,vs40,vs40,2 - xvaddsp vs32,vs32,vs33 - xvaddsp vs40,vs40,vs41 - - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask - AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 - /*inner reverse save_permute and store vs28 */ - xxpermdi vs28,save_permute_1,save_permute_1,2 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs37,vs1 - MULT_APLHA_PART2 vs32,vs40,vs37,vs1 - -/* reconstruct r,i pairs*/ - xxperm vs37,vs1, vs28 - -#ifndef TRMMKERNEL - /* add */ - xvaddsp vs36,vs36,vs37 - stxsd v4 , 0(CO) -#else - -/* vs37 is v5 */ - stxsd v5 , 0(CO) -#endif - addi CO, CO, 8 -.endm - - - - -/****************************TRMM POINTER REFRESH MACROSES*************************/ - - -.macro SHIFT_REG REG1,REG2,SHIFT_VAL - .if \SHIFT_VAL==16 - slwi \REG1, \REG2, 7 - .elseif \SHIFT_VAL==8 - slwi \REG1, \REG2, 6 - .elseif \SHIFT_VAL==4 - slwi \REG1, \REG2, 5 - .elseif \SHIFT_VAL==2 - slwi \REG1, \REG2, 4 - .elseif \SHIFT_VAL==1 - slwi \REG1, \REG2, 3 - .endif -.endm - -/* -//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -// ptrbb = bb; -// #else -// ptrba += off*8; -// ptrbb = bb + off*4; -// #endif -*/ -.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B - #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - /* ptrbb = bb;*/ - mr \PTR_B,\B_VAL /* refresh BPOINT */ - - #else - /* - // ptrba =ptrba+ off*C_A; - // ptrbb = bb + off*C_B; - */ - SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ - SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ - add \PTR_B, \B_VAL , T4 /* Add values to BO */ - add \PTR_A, \PTR_A, T2 /* Add values to AO */ - #endif -.endm - - -/* -// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -// temp = bk-off; -// #elif defined(LEFT) -// temp = off+8; // number of values in A -// #else -// temp = off+4; // number of values in B -// #endif -*/ -.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B - #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - /* temp = bk-off;*/ - sub \TEMP_BK,\BK_VAL,\OFF_VAL - - #elif defined(LEFT) - /* temp = off+INCR_A; // number of values in A */ - addi \TEMP_BK, \OFF_VAL, \INCR_A - #else - /* temp = off+INCR_B // number of values in B*/ - addi \TEMP_BK,\OFF_VAL, \INCR_B - #endif - -.endm -/* -// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && 
!defined(TRANSA)) -// temp = bk - off; -// #ifdef LEFT -// temp -= 8; // number of values in A -// #else -// temp -= 4; // number of values in B -// #endif -// ptrba += temp*8; -// ptrbb += temp*4; -// #endif - -// #ifdef LEFT -// off += 8; // number of values in A -// #endif -*/ - - -.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B - - #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - /*temp = bk - off;*/ - sub \TEMP_BK,\BK_VAL,\OFF_VAL - #ifdef LEFT - /*temp -= 8; // number of values in A*/ - addi \TEMP_BK,\TEMP_BK,-\C_A - #else - /*temp -= 4; // number of values in B*/ - addi \TEMP_BK,\TEMP_BK,-\C_B - #endif - /*ptrba += temp*C_A; - ptrbb += temp*C_B;*/ - SHIFT_REG T4,\TEMP_BK,\C_A - SHIFT_REG T2,\TEMP_BK,\C_B - add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ - add \PTR_B, \PTR_B,T2 - - #endif - - #ifdef LEFT - /*off += 8; // number of values in A*/ - addi \OFF_VAL,\OFF_VAL,\C_A - #endif + +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* Abdelrauf(quickwritereader@gmail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ +#define unit_size 8 +#define DISP32(ind,disp) (ind*unit_size*32+disp) +#define DISP16(ind,disp) (ind*unit_size*16+disp) +#define DISP8(ind,disp) (ind*unit_size*8+disp) +#define DISP4(ind,disp) (ind*unit_size*4+disp) +#define DISP2(ind,disp) (ind*unit_size*2+disp) +#define DISP1(ind,disp) (ind*unit_size+disp) +#define DISPX(disp) (disp) + +.macro AGGREGATE_REALS_IMAGES VSINR_OUT1,VSINR,VSINI_OUT2,VSINI +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + xvsubsp \VSINR_OUT1,\VSINR_OUT1,\VSINR + xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR + xvsubsp \VSINI_OUT2,\VSINI_OUT2,\VSINI +#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) + xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR + xvsubsp \VSINI_OUT2,\VSINI,\VSINI_OUT2 +#else // CC || CR || RC || RR + /*we will assume {-alpha_r,-alpha_i} for this case */ + /*i1i2-r1r2 so we will negate alpha real instead to fix sign*/ + xvsubsp \VSINR_OUT1,\VSINR,\VSINR_OUT1 + /*we will negate alpha image instead to fix sign*/ + xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI +#endif +.endm + + +.macro AGGREGATE_REALS_IMAGES_A_PERMUTE VSINR_OUT1,VSINR,VSINI_OUT2,VSINI +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + xvsubsp \VSINR_OUT1,\VSINR_OUT1,\VSINR + xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR + xvsubsp \VSINI_OUT2,\VSINI,\VSINI_OUT2 +#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) + xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR + xvsubsp \VSINI_OUT2,\VSINI_OUT2,\VSINI +#else // CC || CR || RC || RR + /*we will assume {-alpha_r,-alpha_i} for this case */ + /*i1i2-r1r2 so we will negate alpha real instead to fix sign*/ + xvsubsp \VSINR_OUT1,\VSINR,\VSINR_OUT1 + /*we will negate alpha image instead to fix sign*/ + xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI +#endif +.endm + +/* {i0,i1} * {alpha_i,alpha_i} [- VSOUT1] ;[VSOUT2 +] {r0,r1}*{alpha_i,alpha_i} */ + +.macro MULT_APLHA_PART1 VSINRR,VSINII,VSOUT1,VSOUT2 + xvmulsp \VSOUT1,\VSINII, alpha_i + xvmulsp \VSOUT2,\VSINRR, alpha_i +.endm + +/* {r0,r1} * {alpha_r,alpha_r} - VSOUT1 ;VSOUT2 + {i0,i1} * {alpha_r,alpha_r} */ + +.macro MULT_APLHA_PART2 VSINRR,VSINII,VSOUT1,VSOUT2 + xvmsubasp \VSOUT1,\VSINRR, alpha_r + xvmaddasp \VSOUT2,\VSINII, alpha_r +.endm + +/* macros for N=4 and M=8 +**********************************************************************************************/ + +.macro Zero4x8 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + xxlxor vs54, vs54, vs54 + xxlxor vs55, vs55, vs55 + xxlxor vs56, vs56, 
vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs58, vs58, vs58 + xxlxor vs59, vs59, vs59 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 + xxlxor vs62, vs62, vs62 + xxlxor vs63, vs63, vs63 +.endm + + +.macro LOAD4x8 + LOAD4x8O 0,0 +.endm + + +.macro LOAD4x8O OffsetA,OffsetB + lxv vs24, (\OffsetB+0)(BO) + lxv vs28, (\OffsetB+16)(BO) + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + lxv vs0, (\OffsetA+0)(AO) + lxv vs1, (\OffsetA+16)(AO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + lxv vs2, (\OffsetA+32)(AO) + lxv vs3, (\OffsetA+48)(AO) + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 +.endm + + +.macro END4x8_NORMAL + END4x8 AO,BO,64,32 +.endm + + +.macro END4x8_WITHOUT_ADD + END4x8 AO,BO,0,0 +.endm + + +.macro END4x8 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs50, vs2,vs28 + xvmaddasp vs51, vs3,vs28 + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + xvmaddasp vs54, vs2,vs29 + xvmaddasp vs55, vs3,vs29 + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + xvmaddasp vs58, vs2,vs30 + xvmaddasp vs59, vs3,vs30 + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + xvmaddasp vs62, vs2,vs31 + xvmaddasp vs63, vs3,vs31 +.endm + + +.macro LOAD4x8_2 + LOAD4x8_2O 0,0 +.endm + + +.macro LOAD4x8_2O OffsetA,OffsetB + lxv vs8, (\OffsetB)(BO) + lxv vs12, (16+\OffsetB)(BO) + lxv vs24, (32+\OffsetB)(BO) + lxv vs28, (32+16+\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + lxv vs5, (16+\OffsetA)(AO) + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + lxv vs6, (32+\OffsetA)(AO) + lxv vs7, (48+\OffsetA)(AO) + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 + lxv vs0, (64+\OffsetA)(AO) + lxv vs1, (64+16+\OffsetA)(AO) + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + lxv vs2, (64+32+\OffsetA)(AO) + lxv vs3, (64+48+\OffsetA)(AO) + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 +.endm + + +.macro END4x8_2 + /*for load2 offset will be 128 and 64*/ + KERNEL4x8_2 AO,BO, 128,64,0 ,1,1 +.endm + + +.macro KERNEL4x8_E2 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL4x8_L2 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL4x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 +.if \Complete==0 + lxv vs4, DISP16(\Index,0+\OffsetA)(\AREG) + lxv vs5, 
DISP16(\Index,16+\OffsetA)(\AREG) +.endif + + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + xvmaddasp vs50, vs6,vs12 + xvmaddasp vs51, vs7,vs12 +.if \Complete==0 + lxv vs8, DISP8(\Index,\OffsetB)(\BREG) + lxv vs12, DISP8(\Index,16+\OffsetB)(\BREG) +.endif + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + xvmaddasp vs58, vs6,vs14 + xvmaddasp vs59, vs7,vs14 +.if \Complete==0 + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask +.endif + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 + xvmaddasp vs54, vs6,vs13 + xvmaddasp vs55, vs7,vs13 +.if \Complete==0 + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 +.endif + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 + xvmaddasp vs62, vs6,vs15 + xvmaddasp vs63, vs7,vs15 +.if \Complete==0 + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 +.endif +.if \Complete==0 + lxv vs6, DISP16(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP16(\Index,48+\OffsetA)(\AREG) +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 +.if \Complete==0 + lxv vs0, DISP16(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP16(\Index,64+16+\OffsetA)(\AREG) +.endif + + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + xvmaddasp vs50, vs2,vs28 + xvmaddasp vs51, vs3,vs28 +.if \Complete==0 + lxv vs24, DISP8(\Index,32+\OffsetB)(\BREG) + lxv vs28, DISP8(\Index,32+16+\OffsetB)(\BREG) +.endif + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + xvmaddasp vs58, vs2,vs30 + xvmaddasp vs59, vs3,vs30 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask +.endif + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + xvmaddasp vs54, vs2,vs29 + xvmaddasp vs55, vs3,vs29 +.if \Complete==0 + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 +.endif + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + xvmaddasp vs62, vs2,vs31 + xvmaddasp vs63, vs3,vs31 +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 +.endif + +.if \Complete==0 + lxv vs2, DISP16(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP16(\Index,64+48+\OffsetA)(\AREG) +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP8(\Index,\OffsetB) + addi \AREG, \AREG, DISP16(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP8(\Index,64) + addi \AREG, \AREG, DISP16(\Index,128) +.endif + +.endif +.endm + + +.macro KERNEL4x8 + LOAD4x8 + END4x8 AO, BO, 64,32 +.endm + + +.macro SAVE4x8 + add T4, LDC,LDC + add T1, CO ,LDC +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) + lxv vs25 , 16(CO) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask +#ifndef TRMMKERNEL + lxv vs26 , 32(CO) + lxv vs27 , 48(CO) +#endif + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask +#ifndef TRMMKERNEL + lxv vs28 , 0(T1) + lxv vs29 , 16(T1) +#endif + xxperm vs2,vs34,permute_mask + xxperm vs6,vs42,permute_mask +#ifndef TRMMKERNEL + lxv vs30 , 32(T1) + lxv vs31 , 48(T1) +#endif + xxperm vs3,vs35,permute_mask + xxperm vs7,vs43,permute_mask + add T2,CO,T4 + add T3,T1,T4 + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + xxperm vs8,vs36,permute_mask + xxperm vs12,vs44,permute_mask + AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 + xxperm vs9,vs37,permute_mask + xxperm 
vs13,vs45,permute_mask + AGGREGATE_REALS_IMAGES vs34,vs2,vs42,vs6 + xxperm vs10,vs38,permute_mask + xxperm vs14,vs46,permute_mask + AGGREGATE_REALS_IMAGES vs35,vs3,vs43,vs7 + xxperm vs11,vs39,permute_mask + xxperm vs15,vs47,permute_mask + AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12 + xxperm vs0,vs48,permute_mask + xxperm vs4,vs56,permute_mask + AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13 + xxperm vs1,vs49,permute_mask + xxperm vs5,vs57,permute_mask + AGGREGATE_REALS_IMAGES vs38,vs10,vs46,vs14 + xxperm vs2,vs50,permute_mask + xxperm vs6,vs58,permute_mask + AGGREGATE_REALS_IMAGES vs39,vs11,vs47,vs15 + xxperm vs3,vs51,permute_mask + xxperm vs7,vs59,permute_mask + AGGREGATE_REALS_IMAGES vs48,vs0,vs56,vs4 + xxperm vs8,vs52,permute_mask + xxperm vs12,vs60,permute_mask + AGGREGATE_REALS_IMAGES vs49,vs1,vs57,vs5 + xxperm vs9,vs53,permute_mask + xxperm vs13,vs61,permute_mask + AGGREGATE_REALS_IMAGES vs50,vs2,vs58,vs6 + xxperm vs10,vs54,permute_mask + xxperm vs14,vs62,permute_mask + AGGREGATE_REALS_IMAGES vs51,vs3,vs59,vs7 + xxperm vs11,vs55,permute_mask + xxperm vs15,vs63,permute_mask + AGGREGATE_REALS_IMAGES vs52,vs8,vs60,vs12 + AGGREGATE_REALS_IMAGES vs53,vs9,vs61,vs13 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + AGGREGATE_REALS_IMAGES vs54,vs10,vs62,vs14 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + AGGREGATE_REALS_IMAGES vs55,vs11,vs63,vs15 + MULT_APLHA_PART1 vs34,vs42,vs4,vs5 + MULT_APLHA_PART1 vs35,vs43,vs6,vs7 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs34,vs42,vs4,vs5 + MULT_APLHA_PART2 vs35,vs43,vs6,vs7 + #ifndef TRMMKERNEL + lxv vs32 , 0(T2) + lxv vs40 , 16(T2) +#endif + MULT_APLHA_PART1 vs36,vs44,vs8,vs9 + MULT_APLHA_PART1 vs37,vs45,vs10,vs11 +#ifndef TRMMKERNEL + lxv vs33 , 32(T2) + lxv vs41 , 48(T2) +#endif + MULT_APLHA_PART1 vs38,vs46,vs12,vs13 + MULT_APLHA_PART1 vs39,vs47,vs14,vs15 +#ifndef TRMMKERNEL + lxv vs34 , 0(T3) + lxv vs42 , 16(T3) +#endif + MULT_APLHA_PART2 vs36,vs44,vs8,vs9 + MULT_APLHA_PART2 vs37,vs45,vs10,vs11 +#ifndef TRMMKERNEL + lxv vs35 , 32(T3) + lxv vs43 , 48(T3) +#endif + MULT_APLHA_PART2 vs38,vs46,vs12,vs13 + MULT_APLHA_PART2 vs39,vs47,vs14,vs15 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 + xxperm vs2,vs3, save_permute_1 + xxperm vs4,vs5, save_permute_1 + xxperm vs6,vs7, save_permute_1 + xxperm vs8,vs9, save_permute_1 + xxperm vs10,vs11, save_permute_1 + xxperm vs12,vs13, save_permute_1 + xxperm vs14,vs15, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1,vs8,vs0,2 + xxpermdi vs3,vs10,vs2,2 + xxpermdi vs5,vs12,vs4,2 + xxpermdi vs7,vs14,vs6,2 + xxpermdi vs9,vs0,vs8,2 + xxpermdi vs11,vs2,vs10,2 + xvaddsp vs24,vs24,vs1 + xvaddsp vs25,vs25,vs3 + xxpermdi vs13,vs4,vs12,2 + xxpermdi vs15,vs6,vs14,2 + xvaddsp vs26,vs26,vs5 + xvaddsp vs27,vs27,vs7 + xvaddsp vs28,vs28,vs9 + xvaddsp vs29,vs29,vs11 + xvaddsp vs30,vs30,vs13 + xvaddsp vs31,vs31,vs15 +#else + xxpermdi vs24,vs8,vs0,2 + xxpermdi vs25,vs10,vs2,2 + xxpermdi vs26,vs12,vs4,2 + xxpermdi vs27,vs14,vs6,2 + xxpermdi vs28,vs0,vs8,2 + xxpermdi vs29,vs2,vs10,2 + xxpermdi vs30,vs4,vs12,2 + xxpermdi vs31,vs6,vs14,2 +#endif + stxv vs24 , 0(CO) + stxv vs25 , 16(CO) + MULT_APLHA_PART1 vs48,vs56,vs0,vs1 + MULT_APLHA_PART1 vs49,vs57,vs2,vs3 + stxv vs26 , 32(CO) + stxv vs27 , 48(CO) + MULT_APLHA_PART1 vs50,vs58,vs4,vs5 + MULT_APLHA_PART1 vs51,vs59,vs6,vs7 + stxv vs28 , 0(T1) + stxv vs29 , 16(T1) + MULT_APLHA_PART2 vs48,vs56,vs0,vs1 + MULT_APLHA_PART2 vs49,vs57,vs2,vs3 + stxv vs30 , 32(T1) + stxv vs31 , 48(T1) + MULT_APLHA_PART2 
vs50,vs58,vs4,vs5 + MULT_APLHA_PART2 vs51,vs59,vs6,vs7 + MULT_APLHA_PART1 vs52,vs60,vs8,vs9 + MULT_APLHA_PART1 vs53,vs61,vs10,vs11 + xxperm vs0,vs1, save_permute_1 + xxperm vs2,vs3, save_permute_1 + MULT_APLHA_PART1 vs54,vs62,vs12,vs13 + MULT_APLHA_PART1 vs55,vs63,vs14,vs15 + xxperm vs4,vs5, save_permute_1 + xxperm vs6,vs7, save_permute_1 + MULT_APLHA_PART2 vs52,vs60,vs8,vs9 + MULT_APLHA_PART2 vs53,vs61,vs10,vs11 + xxperm vs8,vs9, save_permute_1 + xxperm vs10,vs11, save_permute_1 + MULT_APLHA_PART2 vs54,vs62,vs12,vs13 + MULT_APLHA_PART2 vs55,vs63,vs14,vs15 + xxperm vs12,vs13, save_permute_1 + xxperm vs14,vs15, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1,vs8,vs0,2 + xxpermdi vs3,vs10,vs2,2 + xxpermdi vs5,vs12,vs4,2 + xxpermdi vs7,vs14,vs6,2 + xxpermdi vs9,vs0,vs8,2 + xxpermdi vs11,vs2,vs10,2 + xvaddsp vs32,vs32,vs1 + xvaddsp vs40,vs40,vs3 + xxpermdi vs13,vs4,vs12,2 + xxpermdi vs15,vs6,vs14,2 + xvaddsp vs33,vs33,vs5 + xvaddsp vs41,vs41,vs7 + xvaddsp vs34,vs34,vs9 + xvaddsp vs42,vs42,vs11 + xvaddsp vs35,vs35,vs13 + xvaddsp vs43,vs43,vs15 +#else + xxpermdi vs32,vs8,vs0,2 + xxpermdi vs40,vs10,vs2,2 + xxpermdi vs33,vs12,vs4,2 + xxpermdi vs41,vs14,vs6,2 + xxpermdi vs34,vs0,vs8,2 + xxpermdi vs42,vs2,vs10,2 + xxpermdi vs35,vs4,vs12,2 + xxpermdi vs43,vs6,vs14,2 +#endif + stxv vs32 , 0(T2) + stxv vs40 , 16(T2) + stxv vs33 , 32(T2) + stxv vs41 , 48(T2) + stxv vs34 , 0(T3) + stxv vs42 , 16(T3) + stxv vs35 , 32(T3) + stxv vs43 , 48(T3) + addi CO, CO, 64 +.endm + +/* macros for N=4 and M=4 +**********************************************************************************************/ + +.macro Zero4x4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 +.endm + + +.macro LOAD4x4 + LOAD4x4O 0,0 +.endm + + +.macro LOAD4x4O OffsetA,OffsetB + lxv vs24, (\OffsetB+0)(BO) + lxv vs28, (\OffsetB+16)(BO) + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + lxv vs0, (\OffsetA+0)(AO) + lxv vs1, (\OffsetA+16)(AO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 +.endm + + +.macro END4x4_NORMAL + END4x4 AO,BO,32,32 +.endm + + +.macro END4x4_WITHOUT_ADD + END4x4 AO,BO,0,0 +.endm + + +.macro END4x4 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 +.endm + + +.macro LOAD4x4_2 + LOAD4x4_2O 0,0 +.endm + + +.macro LOAD4x4_2O OffsetA,OffsetB + lxv vs8, (\OffsetB)(BO) + lxv vs12, (16+\OffsetB)(BO) + lxv vs24, (32+\OffsetB)(BO) + lxv vs28, (32+16+\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + lxv vs5, (16+\OffsetA)(AO) + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 + lxv 
vs0, (32+\OffsetA)(AO) + lxv vs1, (32+16+\OffsetA)(AO) + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 +.endm + + +.macro END4x4_2 + /*for load2 offset will be 64 and 64*/ + KERNEL4x4_2 AO,BO, 64,64,0 ,1,1 +.endm + + +.macro KERNEL4x4_E2 OffsetA,OffsetB, Index,IsLast + KERNEL4x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL4x4_L2 OffsetA,OffsetB, Index,IsLast + KERNEL4x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL4x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 +.if \Complete==0 + lxv vs8, DISP8(\Index,\OffsetB)(\BREG) + lxv vs12, DISP8(\Index,16+\OffsetB)(\BREG) +.endif + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 +.if \Complete==0 + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask +.endif + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 +.if \Complete==0 + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 +.endif +.if \Complete==0 + lxv vs4, DISP8(\Index,0+\OffsetA)(\AREG) + lxv vs5, DISP8(\Index,16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 +.if \Complete==0 + lxv vs24, DISP8(\Index,32+\OffsetB)(\BREG) + lxv vs28, DISP8(\Index,32+16+\OffsetB)(\BREG) +.endif + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask +.endif + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 +.if \Complete==0 + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 +.endif +.if \Complete==0 + lxv vs0, DISP8(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP8(\Index,32+16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP8(\Index,\OffsetB) + addi \AREG, \AREG, DISP8(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP8(\Index,64) + addi \AREG, \AREG, DISP8(\Index,64) +.endif + +.endif +.endm + + +.macro KERNEL4x4 + LOAD4x4 + END4x4 AO, BO, 32,32 +.endm + + +.macro SAVE4x4 + add T4, LDC,LDC + add T1, CO ,LDC +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) + lxv vs25 , 16(CO) +#endif + add T2,CO,T4 + add T3,T1,T4 +#ifndef TRMMKERNEL + lxv vs26 , 0(T1) + lxv vs27 , 16(T1) +#endif + #ifndef TRMMKERNEL + lxv vs28 , 0(T2) + lxv vs29 , 16(T2) +#endif +#ifndef TRMMKERNEL + lxv vs30 , 0(T3) + lxv vs31 , 16(T3) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask + xxperm vs8,vs36,permute_mask + xxperm vs12,vs44,permute_mask + xxperm vs9,vs37,permute_mask + xxperm vs13,vs45,permute_mask + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES 
vs33,vs1,vs41,vs5 + AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12 + AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13 + xxperm vs0,vs48,permute_mask + xxperm vs4,vs56,permute_mask + xxperm vs1,vs49,permute_mask + xxperm vs5,vs57,permute_mask + xxperm vs8,vs52,permute_mask + xxperm vs12,vs60,permute_mask + xxperm vs9,vs53,permute_mask + xxperm vs13,vs61,permute_mask + AGGREGATE_REALS_IMAGES vs48,vs0,vs56,vs4 + AGGREGATE_REALS_IMAGES vs49,vs1,vs57,vs5 + AGGREGATE_REALS_IMAGES vs52,vs8,vs60,vs12 + AGGREGATE_REALS_IMAGES vs53,vs9,vs61,vs13 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + MULT_APLHA_PART1 vs36,vs44,vs8,vs9 + MULT_APLHA_PART1 vs37,vs45,vs10,vs11 + MULT_APLHA_PART1 vs48,vs56,vs4,vs5 + MULT_APLHA_PART1 vs49,vs57,vs6,vs7 + MULT_APLHA_PART1 vs52,vs60,vs12,vs13 + MULT_APLHA_PART1 vs53,vs61,vs14,vs15 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs36,vs44,vs8,vs9 + MULT_APLHA_PART2 vs37,vs45,vs10,vs11 + MULT_APLHA_PART2 vs48,vs56,vs4,vs5 + MULT_APLHA_PART2 vs49,vs57,vs6,vs7 + MULT_APLHA_PART2 vs52,vs60,vs12,vs13 + MULT_APLHA_PART2 vs53,vs61,vs14,vs15 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 + xxperm vs2,vs3, save_permute_1 + xxperm vs8,vs9, save_permute_1 + xxperm vs10,vs11, save_permute_1 + xxperm vs4,vs5, save_permute_1 + xxperm vs6,vs7, save_permute_1 + xxperm vs12,vs13, save_permute_1 + xxperm vs14,vs15, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1,vs8,vs0,2 + xxpermdi vs3,vs10,vs2,2 + xxpermdi vs9,vs0,vs8,2 + xxpermdi vs11,vs2,vs10,2 + xxpermdi vs5,vs12,vs4,2 + xxpermdi vs7,vs14,vs6,2 + xxpermdi vs13,vs4,vs12,2 + xxpermdi vs15,vs6,vs14,2 + xvaddsp vs24,vs24,vs1 + xvaddsp vs25,vs25,vs3 + xvaddsp vs26,vs26,vs9 + xvaddsp vs27,vs27,vs11 + xvaddsp vs28,vs28,vs5 + xvaddsp vs29,vs29,vs7 + xvaddsp vs30,vs30,vs13 + xvaddsp vs31,vs31,vs15 +#else + xxpermdi vs24,vs8,vs0,2 + xxpermdi vs25,vs10,vs2,2 + xxpermdi vs26,vs0,vs8,2 + xxpermdi vs27,vs2,vs10,2 + xxpermdi vs28,vs12,vs4,2 + xxpermdi vs29,vs14,vs6,2 + xxpermdi vs30,vs4,vs12,2 + xxpermdi vs31,vs6,vs14,2 +#endif + stxv vs24 , 0(CO) + stxv vs25 , 16(CO) + stxv vs26 , 0(T1) + stxv vs27 , 16(T1) + stxv vs28 , 0(T2) + stxv vs29 , 16(T2) + stxv vs30 , 0(T3) + stxv vs31 , 16(T3) + addi CO, CO, 32 +.endm + +/* macros for N=4 and M=2 +**********************************************************************************************/ + +.macro Zero4x2 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 +.endm + + +.macro LOAD4x2 + LOAD4x2O 0,0 +.endm + + +.macro LOAD4x2O OffsetA,OffsetB + lxv vs24, (\OffsetA+0)(AO) + lxv vs0, (\OffsetB+0)(BO) + lxv vs1, (\OffsetB+16)(BO) + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END4x2_NORMAL + END4x2 AO,BO,16,32 +.endm + + +.macro END4x2_WITHOUT_ADD + END4x2 AO,BO,0,0 +.endm + + +.macro END4x2 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 +.endm + + +.macro LOAD4x2_2 + LOAD4x2_2O 0,0 +.endm + + +.macro LOAD4x2_2O OffsetA,OffsetB + lxv vs8, 
(\OffsetA)(AO) + lxv vs24, (16+\OffsetA)(AO) + lxv vs4, (0+\OffsetB)(BO) + lxv vs5, (16+\OffsetB)(BO) + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + lxv vs0, (32+\OffsetB)(BO) + lxv vs1, (32+16+\OffsetB)(BO) + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END4x2_2 + /*for load2 offset will be 32 and 64*/ + KERNEL4x2_2 AO,BO, 32,64,0 ,1,1 +.endm + + +.macro KERNEL4x2_E2 OffsetA,OffsetB, Index,IsLast + KERNEL4x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL4x2_L2 OffsetA,OffsetB, Index,IsLast + KERNEL4x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL4x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 +.if \Complete==0 + lxv vs8, DISP4(\Index,\OffsetA)(\AREG) +.endif + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 +.if \Complete==0 + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 +.endif +.if \Complete==0 + lxv vs4, DISP8(\Index,0+\OffsetB)(\BREG) + lxv vs5, DISP8(\Index,16+\OffsetB)(\BREG) +.endif + +.if \Complete==0 + xxpermdi vs11, vs10, vs10,2 +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 +.if \Complete==0 + lxv vs24, DISP4(\Index,16+\OffsetA)(\AREG) +.endif + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 +.endif +.if \Complete==0 + lxv vs0, DISP8(\Index,32+\OffsetB)(\BREG) + lxv vs1, DISP8(\Index,32+16+\OffsetB)(\BREG) +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP4(\Index,\OffsetA) + addi \BREG, \BREG, DISP8(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP4(\Index,32) + addi \BREG, \BREG, DISP8(\Index,64) +.endif + +.endif +.endm + + +.macro KERNEL4x2 + LOAD4x2 + END4x2 AO, BO, 16,32 +.endm + + +.macro SAVE4x2 + add T4, LDC,LDC + add T1, CO ,LDC + add T2,CO,T4 + add T3,T1,T4 +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) +#endif +#ifndef TRMMKERNEL + lxv vs25 , 0(T1) +#endif +#ifndef TRMMKERNEL + lxv vs26 , 0(T2) +#endif +#ifndef TRMMKERNEL + lxv vs27 , 0(T3) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask + xxperm vs8,vs36,permute_mask + xxperm vs12,vs44,permute_mask + xxperm vs9,vs37,permute_mask + xxperm vs13,vs45,permute_mask + AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs33,vs1,vs41,vs5 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs36,vs8,vs44,vs12 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs37,vs9,vs45,vs13 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + MULT_APLHA_PART1 vs36,vs44,vs8,vs9 + MULT_APLHA_PART1 vs37,vs45,vs10,vs11 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs36,vs44,vs8,vs9 + MULT_APLHA_PART2 vs37,vs45,vs10,vs11 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 + xxperm vs2,vs3, save_permute_1 + xxperm vs8,vs9, save_permute_1 + xxperm vs10,vs11, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1,vs8,vs0,0 + xxpermdi vs9,vs10,vs2,0 + xxpermdi vs3,vs0,vs8,3 + xxpermdi vs11,vs2,vs10,3 + xvaddsp vs24,vs24,vs1 + 
xvaddsp vs26,vs26,vs9
+ xvaddsp vs25,vs25,vs3
+ xvaddsp vs27,vs27,vs11
+#else
+ xxpermdi vs24,vs8,vs0,0
+ xxpermdi vs26,vs10,vs2,0
+ xxpermdi vs25,vs0,vs8,3
+ xxpermdi vs27,vs2,vs10,3
+#endif
+ stxv vs24 , 0(CO)
+ stxv vs25 , 0(T1)
+ stxv vs26 , 0(T2)
+ stxv vs27 , 0(T3)
+ addi CO, CO, 16
+.endm
+
+/* macros for N=4 and M=1
+**********************************************************************************************/
+
+.macro Zero4x1
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs40, vs40, vs40
+ xxlxor vs41, vs41, vs41
+.endm
+
+
+.macro LOAD4x1
+ LOAD4x1O 0,0
+.endm
+
+
+.macro LOAD4x1O OffsetA,OffsetB
+ lxsd v4, (\OffsetA+0)(AO)
+ lxv vs0, (\OffsetB+0)(BO)
+ lxv vs1, (\OffsetB+16)(BO)
+ xxspltd vs24,vs36,0
+ xxperm vs26, vs24, permute_mask
+.endm
+
+
+.macro END4x1_NORMAL
+ END4x1 AO,BO,8,32
+.endm
+
+
+.macro END4x1_WITHOUT_ADD
+ END4x1 AO,BO,0,0
+.endm
+
+
+.macro END4x1 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+.endm
+
+
+.macro LOAD4x1_2
+ LOAD4x1_2O 0,0
+.endm
+
+
+.macro LOAD4x1_2O OffsetA,OffsetB
+ lxv vs27, (\OffsetA)(AO)
+ xxspltd vs8,vs27,1
+ xxspltd vs24,vs27,0
+ lxv vs4, (0+\OffsetB)(BO)
+ lxv vs5, (16+\OffsetB)(BO)
+ xxperm vs10, vs8, permute_mask
+ xxperm vs26, vs24, permute_mask
+ lxv vs0, (32+\OffsetB)(BO)
+ lxv vs1, (32+16+\OffsetB)(BO)
+.endm
+
+
+.macro END4x1_2
+ /*for load2 offset will be 16 and 64*/
+ KERNEL4x1_2 AO,BO, 16,64,0 ,1,1
+.endm
+
+
+.macro KERNEL4x1_E2 OffsetA,OffsetB, Index,IsLast
+ KERNEL4x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
+
+
+.macro KERNEL4x1_L2 OffsetA,OffsetB, Index,IsLast
+ KERNEL4x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
+
+
+.macro KERNEL4x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+ xvmaddasp vs32, vs4,vs8
+ xvmaddasp vs33, vs5,vs8
+ xvmaddasp vs40, vs4,vs10
+ xvmaddasp vs41, vs5,vs10
+.if \Complete==0
+ lxv vs27, DISP2(\Index,\OffsetA)(\AREG)
+ xxspltd vs8,vs27,1
+.endif
+.if \Complete==0
+ lxv vs4, DISP8(\Index,0+\OffsetB)(\BREG)
+ lxv vs5, DISP8(\Index,16+\OffsetB)(\BREG)
+.endif
+
+.if \Complete==0
+ xxperm vs10, vs8, permute_mask
+.endif
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+.if \Complete==0
+ xxspltd vs24,vs27,0
+ xxperm vs26, vs24, permute_mask
+.endif
+.if \Complete==0
+ lxv vs0, DISP8(\Index,32+\OffsetB)(\BREG)
+ lxv vs1, DISP8(\Index,32+16+\OffsetB)(\BREG)
+.endif
+
+.if \IsLast==1
+.if \Complete==1
+ addi \AREG, \AREG, DISP2(\Index,\OffsetA)
+ addi \BREG, \BREG, DISP8(\Index,\OffsetB)
+.else
+ addi \AREG, \AREG, DISP2(\Index,16)
+ addi \BREG, \BREG, DISP8(\Index,64)
+.endif
+
+.endif
+.endm
+
+
+.macro KERNEL4x1
+ LOAD4x1
+ END4x1 AO, BO, 8,32
+.endm
+
+
+.macro SAVE4x1
+ add T4, LDC,LDC
+ add T1, CO ,LDC
+ add T2,CO,T4
+ add T3,T1,T4
+#ifndef TRMMKERNEL
+ lxsd v4 , 0(CO)
+#endif
+#ifndef TRMMKERNEL
+ lxsd v5 , 0(T1)
+#endif
+#ifndef TRMMKERNEL
+ lxsd v6 , 0(T2)
+#endif
+#ifndef TRMMKERNEL
+ lxsd v7 , 0(T3)
+#endif
+ xxperm vs0,vs32,permute_mask
+ xxperm vs4,vs40,permute_mask
+ xxperm vs1,vs33,permute_mask
+ xxperm vs5,vs41,permute_mask
+ AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4
+ AGGREGATE_REALS_IMAGES_A_PERMUTE vs33,vs1,vs41,vs5
+ /*VSINRR,VSINII,VSOUT1,VSOUT2*/
+ MULT_APLHA_PART1 vs32,vs40,vs0,vs1
+ MULT_APLHA_PART1 vs33,vs41,vs2,vs3
+ MULT_APLHA_PART2 vs32,vs40,vs0,vs1
+
MULT_APLHA_PART2 vs33,vs41,vs2,vs3 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 + xxperm vs2,vs3, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxspltd vs1,vs0,0 + xxspltd vs3,vs0,1 + xxspltd vs9,vs2,0 + xxspltd vs11,vs2,1 + /*--v4==vs36 v5==vs37 v6==vs38 v7==vs39---*/ + xvaddsp vs36,vs36,vs1 + xvaddsp vs37,vs37,vs3 + xvaddsp vs38,vs38,vs9 + xvaddsp vs39,vs39,vs11 +#else + /*--v4==vs36 v5==vs37 v6==vs38 v7==vs39---*/ + xxspltd vs36,vs0,0 + xxspltd vs37,vs0,1 + xxspltd vs38,vs2,0 + xxspltd vs39,vs2,1 +#endif + stxsd v4 , 0(CO) + stxsd v5 , 0(T1) + stxsd v6 , 0(T2) + stxsd v7 , 0(T3) + addi CO, CO, 8 +.endm + +/* macros for N=2 and M=8 +**********************************************************************************************/ + +.macro Zero2x8 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 +.endm + + +.macro LOAD2x8 + LOAD2x8O 0,0 +.endm + + +.macro LOAD2x8O OffsetA,OffsetB + lxv vs24, (\OffsetB+0)(BO) + xxperm vs26, vs24, permute_mask + lxv vs0, (\OffsetA+0)(AO) + lxv vs1, (\OffsetA+16)(AO) + lxv vs2, (\OffsetA+32)(AO) + lxv vs3, (\OffsetA+48)(AO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END2x8_NORMAL + END2x8 AO,BO,64,16 +.endm + + +.macro END2x8_WITHOUT_ADD + END2x8 AO,BO,0,0 +.endm + + +.macro END2x8 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 +.endm + + +.macro LOAD2x8_2 + LOAD2x8_2O 0,0 +.endm + + +.macro LOAD2x8_2O OffsetA,OffsetB + lxv vs8, (\OffsetB)(BO) + lxv vs24, (16+\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + lxv vs5, (16+\OffsetA)(AO) + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask + lxv vs6, (32+\OffsetA)(AO) + lxv vs7, (48+\OffsetA)(AO) + lxv vs0, (64+\OffsetA)(AO) + lxv vs1, (64+16+\OffsetA)(AO) + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs25, vs24, vs24,2 + lxv vs2, (64+32+\OffsetA)(AO) + lxv vs3, (64+48+\OffsetA)(AO) + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END2x8_2 + /*for load2 offset will be 128 and 32*/ + KERNEL2x8_2 AO,BO, 128,32,0 ,1,1 +.endm + + +.macro KERNEL2x8_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x8_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 +.if \Complete==0 + lxv vs4, DISP16(\Index,0+\OffsetA)(\AREG) + lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) 
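+ /* note: the vs4/vs5 just reloaded hold the A data consumed only by the next invocation of this macro; the FMAs below still read vs6/vs7 and vs0..vs3, so these loads overlap with arithmetic */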
+.endif + + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 +.if \Complete==0 + lxv vs8, DISP4(\Index,\OffsetB)(\BREG) +.endif + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 +.if \Complete==0 + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 +.endif + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 +.if \Complete==0 + xxpermdi vs11, vs10, vs10,2 +.endif +.if \Complete==0 + lxv vs6, DISP16(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP16(\Index,48+\OffsetA)(\AREG) +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 +.if \Complete==0 + lxv vs0, DISP16(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP16(\Index,64+16+\OffsetA)(\AREG) +.endif + + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 +.if \Complete==0 + lxv vs24, DISP4(\Index,16+\OffsetB)(\BREG) +.endif + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 +.endif + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 +.endif + +.if \Complete==0 + lxv vs2, DISP16(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP16(\Index,64+48+\OffsetA)(\AREG) +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP4(\Index,\OffsetB) + addi \AREG, \AREG, DISP16(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP4(\Index,32) + addi \AREG, \AREG, DISP16(\Index,128) +.endif + +.endif +.endm + + +.macro KERNEL2x8 + LOAD2x8 + END2x8 AO, BO, 64,16 +.endm + + +.macro SAVE2x8 + add T1, CO ,LDC +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) + lxv vs25 , 16(CO) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask +#ifndef TRMMKERNEL + lxv vs26 , 32(CO) + lxv vs27 , 48(CO) +#endif + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask +#ifndef TRMMKERNEL + lxv vs28 , 0(T1) + lxv vs29 , 16(T1) +#endif + xxperm vs2,vs34,permute_mask + xxperm vs6,vs42,permute_mask +#ifndef TRMMKERNEL + lxv vs30 , 32(T1) + lxv vs31 , 48(T1) +#endif + xxperm vs3,vs35,permute_mask + xxperm vs7,vs43,permute_mask + add T2,CO,T4 + add T3,T1,T4 + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + xxperm vs8,vs36,permute_mask + xxperm vs12,vs44,permute_mask + AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 + xxperm vs9,vs37,permute_mask + xxperm vs13,vs45,permute_mask + AGGREGATE_REALS_IMAGES vs34,vs2,vs42,vs6 + xxperm vs10,vs38,permute_mask + xxperm vs14,vs46,permute_mask + AGGREGATE_REALS_IMAGES vs35,vs3,vs43,vs7 + xxperm vs11,vs39,permute_mask + xxperm vs15,vs47,permute_mask + AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12 + AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13 + AGGREGATE_REALS_IMAGES vs38,vs10,vs46,vs14 + AGGREGATE_REALS_IMAGES vs39,vs11,vs47,vs15 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + MULT_APLHA_PART1 vs34,vs42,vs4,vs5 + MULT_APLHA_PART1 vs35,vs43,vs6,vs7 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs34,vs42,vs4,vs5 + MULT_APLHA_PART2 vs35,vs43,vs6,vs7 + MULT_APLHA_PART1 vs36,vs44,vs8,vs9 + MULT_APLHA_PART1 vs37,vs45,vs10,vs11 + MULT_APLHA_PART1 vs38,vs46,vs12,vs13 + MULT_APLHA_PART1 vs39,vs47,vs14,vs15 + MULT_APLHA_PART2 vs36,vs44,vs8,vs9 + MULT_APLHA_PART2 vs37,vs45,vs10,vs11 + MULT_APLHA_PART2 vs38,vs46,vs12,vs13 + MULT_APLHA_PART2 
vs39,vs47,vs14,vs15 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 + xxperm vs2,vs3, save_permute_1 + xxperm vs4,vs5, save_permute_1 + xxperm vs6,vs7, save_permute_1 + xxperm vs8,vs9, save_permute_1 + xxperm vs10,vs11, save_permute_1 + xxperm vs12,vs13, save_permute_1 + xxperm vs14,vs15, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1,vs8,vs0,2 + xxpermdi vs3,vs10,vs2,2 + xxpermdi vs5,vs12,vs4,2 + xxpermdi vs7,vs14,vs6,2 + xxpermdi vs9,vs0,vs8,2 + xxpermdi vs11,vs2,vs10,2 + xvaddsp vs24,vs24,vs1 + xvaddsp vs25,vs25,vs3 + xxpermdi vs13,vs4,vs12,2 + xxpermdi vs15,vs6,vs14,2 + xvaddsp vs26,vs26,vs5 + xvaddsp vs27,vs27,vs7 + xvaddsp vs28,vs28,vs9 + xvaddsp vs29,vs29,vs11 + xvaddsp vs30,vs30,vs13 + xvaddsp vs31,vs31,vs15 +#else + xxpermdi vs24,vs8,vs0,2 + xxpermdi vs25,vs10,vs2,2 + xxpermdi vs26,vs12,vs4,2 + xxpermdi vs27,vs14,vs6,2 + xxpermdi vs28,vs0,vs8,2 + xxpermdi vs29,vs2,vs10,2 + xxpermdi vs30,vs4,vs12,2 + xxpermdi vs31,vs6,vs14,2 +#endif + stxv vs24 , 0(CO) + stxv vs25 , 16(CO) + stxv vs26 , 32(CO) + stxv vs27 , 48(CO) + stxv vs28 , 0(T1) + stxv vs29 , 16(T1) + stxv vs30 , 32(T1) + stxv vs31 , 48(T1) + addi CO, CO, 64 +.endm + +/* macros for N=2 and M=4 +**********************************************************************************************/ + +.macro Zero2x4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 +.endm + + +.macro LOAD2x4 + LOAD2x4O 0,0 +.endm + + +.macro LOAD2x4O OffsetA,OffsetB + lxv vs24, (\OffsetB+0)(BO) + lxv vs0, (\OffsetA+0)(AO) + lxv vs1, (\OffsetA+16)(AO) + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END2x4_NORMAL + END2x4 AO,BO,32,16 +.endm + + +.macro END2x4_WITHOUT_ADD + END2x4 AO,BO,0,0 +.endm + + +.macro END2x4 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 +.endm + + +.macro LOAD2x4_2 + LOAD2x4_2O 0,0 +.endm + + +.macro LOAD2x4_2O OffsetA,OffsetB + lxv vs8, (\OffsetB)(BO) + lxv vs24, (16+\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + lxv vs5, (16+\OffsetA)(AO) + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs25, vs24, vs24,2 + lxv vs0, (32+\OffsetA)(AO) + lxv vs1, (32+16+\OffsetA)(AO) + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END2x4_2 + /*for load2 offset will be 64 and 32*/ + KERNEL2x4_2 AO,BO, 64,32,0 ,1,1 +.endm + + +.macro KERNEL2x4_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x4_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 +.if \Complete==0 + lxv vs8, DISP4(\Index,\OffsetB)(\BREG) +.endif + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 +.if \Complete==0 + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 +.endif +.if \Complete==0 + 
lxv vs4, DISP8(\Index,0+\OffsetA)(\AREG) + lxv vs5, DISP8(\Index,16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxpermdi vs11, vs10, vs10,2 +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 +.if \Complete==0 + lxv vs24, DISP4(\Index,16+\OffsetB)(\BREG) +.endif + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 +.endif +.if \Complete==0 + lxv vs0, DISP8(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP8(\Index,32+16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP4(\Index,\OffsetB) + addi \AREG, \AREG, DISP8(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP4(\Index,32) + addi \AREG, \AREG, DISP8(\Index,64) +.endif + +.endif +.endm + + +.macro KERNEL2x4 + LOAD2x4 + END2x4 AO, BO, 32,16 +.endm + + +.macro SAVE2x4 + add T1, CO ,LDC +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) + lxv vs25 , 16(CO) +#endif +#ifndef TRMMKERNEL + lxv vs26 , 0(T1) + lxv vs27 , 16(T1) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask + xxperm vs8,vs36,permute_mask + xxperm vs12,vs44,permute_mask + xxperm vs9,vs37,permute_mask + xxperm vs13,vs45,permute_mask + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 + AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12 + AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + MULT_APLHA_PART1 vs36,vs44,vs8,vs9 + MULT_APLHA_PART1 vs37,vs45,vs10,vs11 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs36,vs44,vs8,vs9 + MULT_APLHA_PART2 vs37,vs45,vs10,vs11 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 + xxperm vs2,vs3, save_permute_1 + xxperm vs8,vs9, save_permute_1 + xxperm vs10,vs11, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1,vs8,vs0,2 + xxpermdi vs3,vs10,vs2,2 + xxpermdi vs9,vs0,vs8,2 + xxpermdi vs11,vs2,vs10,2 + xvaddsp vs24,vs24,vs1 + xvaddsp vs25,vs25,vs3 + xvaddsp vs26,vs26,vs9 + xvaddsp vs27,vs27,vs11 +#else + xxpermdi vs24,vs8,vs0,2 + xxpermdi vs25,vs10,vs2,2 + xxpermdi vs26,vs0,vs8,2 + xxpermdi vs27,vs2,vs10,2 +#endif + stxv vs24 , 0(CO) + stxv vs25 , 16(CO) + stxv vs26 , 0(T1) + stxv vs27 , 16(T1) + addi CO, CO, 32 +.endm + +/* macros for N=2 and M=2 +**********************************************************************************************/ + +.macro Zero2x2 + xxlxor vs32, vs32, vs32 + xxlxor vs36, vs36, vs36 + xxlxor vs40, vs40, vs40 + xxlxor vs44, vs44, vs44 +.endm + + +.macro LOAD2x2 + LOAD2x2O 0,0 +.endm + + +.macro LOAD2x2O OffsetA,OffsetB + lxv vs24, (\OffsetA+0)(AO) + lxv vs0, (\OffsetB+0)(BO) + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END2x2_NORMAL + END2x2 AO,BO,16,16 +.endm + + +.macro END2x2_WITHOUT_ADD + END2x2 AO,BO,0,0 +.endm + + +.macro END2x2 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs44, vs0,vs27 +.endm + + +.macro LOAD2x2_2 + LOAD2x2_2O 0,0 +.endm + + +.macro LOAD2x2_2O OffsetA,OffsetB + lxv vs8, (\OffsetA)(AO) + lxv 
vs24, (16+\OffsetA)(AO) + lxv vs4, (0+\OffsetB)(BO) + lxv vs0, (16+\OffsetB)(BO) + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END2x2_2 + /*for load2 offset will be 32 and 32*/ + KERNEL2x2_2 AO,BO, 32,32,0 ,1,1 +.endm + + +.macro KERNEL2x2_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x2_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs40, vs4,vs10 +.if \Complete==0 + lxv vs8, DISP4(\Index,\OffsetA)(\AREG) +.endif + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs44, vs4,vs11 +.if \Complete==0 + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 +.endif +.if \Complete==0 + lxv vs4, DISP4(\Index,0+\OffsetB)(\BREG) +.endif + +.if \Complete==0 + xxpermdi vs11, vs10, vs10,2 +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs40, vs0,vs26 +.if \Complete==0 + lxv vs24, DISP4(\Index,16+\OffsetA)(\AREG) +.endif + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs44, vs0,vs27 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 +.endif +.if \Complete==0 + lxv vs0, DISP4(\Index,16+\OffsetB)(\BREG) +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP4(\Index,\OffsetA) + addi \BREG, \BREG, DISP4(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP4(\Index,32) + addi \BREG, \BREG, DISP4(\Index,32) +.endif + +.endif +.endm + + +.macro KERNEL2x2 + LOAD2x2 + END2x2 AO, BO, 16,16 +.endm + + +.macro SAVE2x2 + add T1, CO ,LDC +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) +#endif +#ifndef TRMMKERNEL + lxv vs26 , 0(T1) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + xxperm vs8,vs36,permute_mask + xxperm vs12,vs44,permute_mask + AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs36,vs8,vs44,vs12 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs36,vs44,vs8,vs9 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs36,vs44,vs8,vs9 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 + xxperm vs8,vs9, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1,vs8,vs0,0 + xxpermdi vs9,vs0,vs8,3 + xvaddsp vs24,vs24,vs1 + xvaddsp vs26,vs26,vs9 +#else + xxpermdi vs24,vs8,vs0,0 + xxpermdi vs26,vs0,vs8,3 +#endif + stxv vs24 , 0(CO) + stxv vs26 , 0(T1) + addi CO, CO, 16 +.endm + +/* macros for N=2 and M=1 +**********************************************************************************************/ + +.macro Zero2x1 + xxlxor vs32, vs32, vs32 + xxlxor vs40, vs40, vs40 +.endm + + +.macro LOAD2x1 + LOAD2x1O 0,0 +.endm + + +.macro LOAD2x1O OffsetA,OffsetB + lxsd v4, (\OffsetA+0)(AO) + lxv vs0, (\OffsetB+0)(BO) + xxspltd vs24,vs36,0 + xxperm vs26, vs24, permute_mask +.endm + + +.macro END2x1_NORMAL + END2x1 AO,BO,8,16 +.endm + + +.macro END2x1_WITHOUT_ADD + END2x1 AO,BO,0,0 +.endm + + +.macro END2x1 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs40, vs0,vs26 +.endm + + +.macro LOAD2x1_2 + LOAD2x1_2O 0,0 +.endm + + +.macro LOAD2x1_2O OffsetA,OffsetB + lxv vs27, (\OffsetA)(AO) + lxv vs4, (0+\OffsetB)(BO) + lxv 
vs0, (16+\OffsetB)(BO) + xxspltd vs8,vs27,1 + xxspltd vs24,vs27,0 + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask +.endm + + +.macro END2x1_2 + /*for load2 offset will be 16 and 32*/ + KERNEL2x1_2 AO,BO, 16,32,0 ,1,1 +.endm + + +.macro KERNEL2x1_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs40, vs4,vs10 +.if \Complete==0 + lxv vs27, DISP2(\Index,\OffsetA)(\AREG) + xxspltd vs8,vs27,1 +.endif +.if \Complete==0 + lxv vs4, DISP4(\Index,0+\OffsetB)(\BREG) +.endif + +.if \Complete==0 + xxperm vs10, vs8, permute_mask +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs40, vs0,vs26 +.if \Complete==0 + xxspltd vs24,vs27,0 + xxperm vs26, vs24, permute_mask +.endif +.if \Complete==0 + lxv vs0, DISP4(\Index,16+\OffsetB)(\BREG) +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP2(\Index,\OffsetA) + addi \BREG, \BREG, DISP4(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP2(\Index,16) + addi \BREG, \BREG, DISP4(\Index,32) +.endif + +.endif +.endm + + +.macro KERNEL2x1 + LOAD2x1 + END2x1 AO, BO, 8,16 +.endm + + +.macro SAVE2x1 + add T1, CO ,LDC +#ifndef TRMMKERNEL + lxsd v4 , 0(CO) +#endif +#ifndef TRMMKERNEL + lxsd v5 , 0(T1) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs33,vs1,vs41,vs5 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxspltd vs1,vs0,0 + xxspltd vs3,vs0,1 + /*--v4==vs36 v5==vs37---*/ + xvaddsp vs36,vs36,vs1 + xvaddsp vs37,vs37,vs3 +#else + /*--v4==vs36 v5==vs37---*/ + xxspltd vs36,vs0,0 + xxspltd vs37,vs0,1 +#endif + stxsd v4 , 0(CO) + stxsd v5 , 0(T1) + addi CO, CO, 8 +.endm + +/* macros for N=1 and M=8 +**********************************************************************************************/ + +.macro Zero1x8 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 +.endm + + +.macro LOAD1x8 + LOAD1x8O 0,0 +.endm + + +.macro LOAD1x8O OffsetA,OffsetB + lxsd vs4, (\OffsetB+0)(BO) + lxv vs0, (\OffsetA+0)(AO) + lxv vs1, (\OffsetA+16)(AO) + lxv vs2, (\OffsetA+32)(AO) + lxv vs3, (\OffsetA+48)(AO) + xxspltd vs24,vs36,0 + xxperm vs26, vs24, permute_mask +.endm + + +.macro END1x8_NORMAL + END1x8 AO,BO,64,8 +.endm + + +.macro END1x8_WITHOUT_ADD + END1x8 AO,BO,0,0 +.endm + + +.macro END1x8 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 +.endm + + +.macro LOAD1x8_2 + LOAD1x8_2O 0,0 +.endm + + +.macro LOAD1x8_2O OffsetA,OffsetB + lxv vs27, (\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + lxv vs5, (16+\OffsetA)(AO) + xxspltd vs8,vs27,1 + xxspltd vs24,vs27,0 + lxv vs6, (32+\OffsetA)(AO) + lxv vs7, (48+\OffsetA)(AO) + lxv vs0, (64+\OffsetA)(AO) + lxv vs1, 
(64+16+\OffsetA)(AO) + lxv vs2, (64+32+\OffsetA)(AO) + lxv vs3, (64+48+\OffsetA)(AO) + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask +.endm + + +.macro END1x8_2 + /*for load2 offset will be 128 and 16*/ + KERNEL1x8_2 AO,BO, 128,16,0 ,1,1 +.endm + + +.macro KERNEL1x8_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL1x8_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL1x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete +.if \Complete==0 + lxv vs27, DISP2(\Index,\OffsetB)(\BREG) +.endif + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 +.if \Complete==0 + lxv vs4, DISP16(\Index,0+\OffsetA)(\AREG) + lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) +.endif + + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 +.if \Complete==0 + lxv vs6, DISP16(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP16(\Index,48+\OffsetA)(\AREG) +.endif +.if \Complete==0 + xxspltd vs8,vs27,1 + xxperm vs10, vs8, permute_mask +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 +.if \Complete==0 + lxv vs0, DISP16(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP16(\Index,64+16+\OffsetA)(\AREG) +.endif + + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 +.if \Complete==0 + xxspltd vs24,vs27,0 + xxperm vs26, vs24, permute_mask +.endif +.if \Complete==0 + lxv vs2, DISP16(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP16(\Index,64+48+\OffsetA)(\AREG) +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP2(\Index,\OffsetB) + addi \AREG, \AREG, DISP16(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP2(\Index,16) + addi \AREG, \AREG, DISP16(\Index,128) +.endif + +.endif +.endm + + +.macro KERNEL1x8 + LOAD1x8 + END1x8 AO, BO, 64,8 +.endm + + +.macro SAVE1x8 +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) + lxv vs25 , 16(CO) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask +#ifndef TRMMKERNEL + lxv vs26 , 32(CO) + lxv vs27 , 48(CO) +#endif + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask + xxperm vs2,vs34,permute_mask + xxperm vs6,vs42,permute_mask + xxperm vs3,vs35,permute_mask + xxperm vs7,vs43,permute_mask + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 + AGGREGATE_REALS_IMAGES vs34,vs2,vs42,vs6 + AGGREGATE_REALS_IMAGES vs35,vs3,vs43,vs7 + /*inner reverse save_permute and store vs28 */ + xxpermdi vs28,save_permute_1,save_permute_1,2 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + MULT_APLHA_PART1 vs34,vs42,vs4,vs5 + MULT_APLHA_PART1 vs35,vs43,vs6,vs7 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs34,vs42,vs4,vs5 + MULT_APLHA_PART2 vs35,vs43,vs6,vs7 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, vs28 + xxperm vs2,vs3, vs28 + xxperm vs4,vs5, vs28 + xxperm vs6,vs7, vs28 +#ifndef TRMMKERNEL + /* add */ + xvaddsp vs24,vs24,vs0 + xvaddsp vs25,vs25,vs2 + xvaddsp vs26,vs26,vs4 + xvaddsp vs27,vs27,vs6 + stxv vs24 , 0(CO) + stxv vs25 , 16(CO) + stxv vs26 , 32(CO) + stxv vs27 , 48(CO) +#else +/* reconstruct r,i pairs*/ + stxv vs0 , 0(CO) + stxv vs2 , 16(CO) + stxv vs4 , 32(CO) + stxv vs6 , 48(CO) +#endif + addi CO, CO, 64 +.endm + +/* macros for N=1 and M=4 
+**********************************************************************************************/ + +.macro Zero1x4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 +.endm + + +.macro LOAD1x4 + LOAD1x4O 0,0 +.endm + + +.macro LOAD1x4O OffsetA,OffsetB + lxsd vs4, (\OffsetB+0)(BO) + lxv vs0, (\OffsetA+0)(AO) + lxv vs1, (\OffsetA+16)(AO) + xxspltd vs24,vs36,0 + xxperm vs26, vs24, permute_mask +.endm + + +.macro END1x4_NORMAL + END1x4 AO,BO,32,8 +.endm + + +.macro END1x4_WITHOUT_ADD + END1x4 AO,BO,0,0 +.endm + + +.macro END1x4 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 +.endm + + +.macro LOAD1x4_2 + LOAD1x4_2O 0,0 +.endm + + +.macro LOAD1x4_2O OffsetA,OffsetB + lxv vs27, (\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + lxv vs5, (16+\OffsetA)(AO) + xxspltd vs8,vs27,1 + xxspltd vs24,vs27,0 + lxv vs0, (32+\OffsetA)(AO) + lxv vs1, (32+16+\OffsetA)(AO) + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask +.endm + + +.macro END1x4_2 + /*for load2 offset will be 64 and 16*/ + KERNEL1x4_2 AO,BO, 64,16,0 ,1,1 +.endm + + +.macro KERNEL1x4_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL1x4_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL1x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete +.if \Complete==0 + lxv vs27, DISP2(\Index,\OffsetB)(\BREG) +.endif + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 +.if \Complete==0 + lxv vs4, DISP8(\Index,0+\OffsetA)(\AREG) + lxv vs5, DISP8(\Index,16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxspltd vs8,vs27,1 + xxperm vs10, vs8, permute_mask +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 +.if \Complete==0 + lxv vs0, DISP8(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP8(\Index,32+16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxspltd vs24,vs27,0 + xxperm vs26, vs24, permute_mask +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP2(\Index,\OffsetB) + addi \AREG, \AREG, DISP8(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP2(\Index,16) + addi \AREG, \AREG, DISP8(\Index,64) +.endif + +.endif +.endm + + +.macro KERNEL1x4 + LOAD1x4 + END1x4 AO, BO, 32,8 +.endm + + +.macro SAVE1x4 +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) + lxv vs25 , 16(CO) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 + /*inner reverse save_permute and store vs28 */ + xxpermdi vs28,save_permute_1,save_permute_1,2 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs33,vs41,vs2,vs3 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, vs28 + xxperm vs2,vs3, vs28 +#ifndef TRMMKERNEL + /* add */ + xvaddsp vs24,vs24,vs0 + xvaddsp vs25,vs25,vs2 + stxv vs24 , 0(CO) + stxv vs25 , 16(CO) +#else +/* reconstruct r,i pairs*/ + stxv vs0 , 0(CO) + stxv vs2 , 16(CO) +#endif + addi CO, CO, 32 +.endm + +/* macros for N=1 and M=2 
+**********************************************************************************************/ + +.macro Zero1x2 + xxlxor vs32, vs32, vs32 + xxlxor vs40, vs40, vs40 +.endm + + +.macro LOAD1x2 + LOAD1x2O 0,0 +.endm + + +.macro LOAD1x2O OffsetA,OffsetB + lxsd vs4, (\OffsetB+0)(BO) + lxv vs0, (\OffsetA+0)(AO) + xxspltd vs24,vs36,0 + xxperm vs26, vs24, permute_mask +.endm + + +.macro END1x2_NORMAL + END1x2 AO,BO,16,8 +.endm + + +.macro END1x2_WITHOUT_ADD + END1x2 AO,BO,0,0 +.endm + + +.macro END1x2 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs40, vs0,vs26 +.endm + + +.macro LOAD1x2_2 + LOAD1x2_2O 0,0 +.endm + + +.macro LOAD1x2_2O OffsetA,OffsetB + lxv vs27, (\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + lxv vs0, (16+\OffsetA)(AO) + xxspltd vs8,vs27,1 + xxspltd vs24,vs27,0 + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask +.endm + + +.macro END1x2_2 + /*for load2 offset will be 32 and 16*/ + KERNEL1x2_2 AO,BO, 32,16,0 ,1,1 +.endm + + +.macro KERNEL1x2_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL1x2_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL1x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete +.if \Complete==0 + lxv vs27, DISP2(\Index,\OffsetB)(\BREG) +.endif + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs40, vs4,vs10 +.if \Complete==0 + lxv vs4, DISP4(\Index,0+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxspltd vs8,vs27,1 + xxperm vs10, vs8, permute_mask +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs40, vs0,vs26 +.if \Complete==0 + lxv vs0, DISP4(\Index,16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxspltd vs24,vs27,0 + xxperm vs26, vs24, permute_mask +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP2(\Index,\OffsetB) + addi \AREG, \AREG, DISP4(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP2(\Index,16) + addi \AREG, \AREG, DISP4(\Index,32) +.endif + +.endif +.endm + + +.macro KERNEL1x2 + LOAD1x2 + END1x2 AO, BO, 16,8 +.endm + + +.macro SAVE1x2 +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + /*inner reverse save_permute and store vs28 */ + xxpermdi vs28,save_permute_1,save_permute_1,2 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, vs28 +#ifndef TRMMKERNEL + /* add */ + xvaddsp vs24,vs24,vs0 + stxv vs24 , 0(CO) +#else +/* reconstruct r,i pairs*/ + stxv vs0 , 0(CO) +#endif + addi CO, CO, 16 +.endm + +/* macros for N=1 and M=1 +**********************************************************************************************/ +.macro Zero1x1 + xxlxor vs32, vs32, vs32 + xxlxor vs40, vs40, vs40 +.endm + + +.macro LOAD1x1 + LOAD1x1O 0,0 +.endm + + +.macro LOAD1x1O OffsetA,OffsetB + lxsd v4, (\OffsetB+0)(BO) + lxsd v5, (\OffsetA+0)(AO) + xxperm vs38, vs36, permute_mask +.endm + + +.macro END1x1_NORMAL + END1x1 AO,BO,8,8 +.endm + + +.macro END1x1_WITHOUT_ADD + END1x1 AO,BO,0,0 +.endm + + +.macro END1x1 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs37,vs36 + xvmaddasp vs40, vs37,vs38 +.endm + + +.macro LOAD1x1_2 + LOAD1x1_2O 0,0 +.endm + + +.macro 
LOAD1x1_2O OffsetA,OffsetB + lxv vs8, (\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + xxperm vs10, vs8, permute_mask +.endm + + +.macro END1x1_2 + /*for load2 offset will be 16 and 16*/ + KERNEL1x1_2 AO,BO, 16,16,0 ,1,1 +.endm + + +.macro KERNEL1x1_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL1x1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs40, vs4,vs10 +.if \Complete==0 + lxv vs8, DISP2(\Index,\OffsetB)(\BREG) + lxv vs4, DISP2(\Index,\OffsetB)(\AREG) + xxperm vs10, vs8, permute_mask +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP2(\Index,\OffsetB) + addi \AREG, \AREG, DISP2(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP2(\Index,16) + addi \AREG, \AREG, DISP2(\Index,16) +.endif + +.endif +.endm + + +.macro KERNEL1x1 + LOAD1x1 + END1x1 AO, BO, 8,8 +.endm + + +.macro SAVE1x1 +#ifndef TRMMKERNEL + lxsd v4 , 0(CO) +#endif + /*aggregate x2*/ + xxpermdi vs33,vs32,vs32,2 + xxpermdi vs41,vs40,vs40,2 + xvaddsp vs32,vs32,vs33 + xvaddsp vs40,vs40,vs41 + + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + /*inner reverse save_permute and store vs28 */ + xxpermdi vs28,save_permute_1,save_permute_1,2 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs37,vs1 + MULT_APLHA_PART2 vs32,vs40,vs37,vs1 + +/* reconstruct r,i pairs*/ + xxperm vs37,vs1, vs28 + +#ifndef TRMMKERNEL + /* add */ + xvaddsp vs36,vs36,vs37 + stxsd v4 , 0(CO) +#else + +/* vs37 is v5 */ + stxsd v5 , 0(CO) +#endif + addi CO, CO, 8 +.endm + + + + +/****************************TRMM POINTER REFRESH MACROSES*************************/ + + +.macro SHIFT_REG REG1,REG2,SHIFT_VAL + .if \SHIFT_VAL==16 + slwi \REG1, \REG2, 7 + .elseif \SHIFT_VAL==8 + slwi \REG1, \REG2, 6 + .elseif \SHIFT_VAL==4 + slwi \REG1, \REG2, 5 + .elseif \SHIFT_VAL==2 + slwi \REG1, \REG2, 4 + .elseif \SHIFT_VAL==1 + slwi \REG1, \REG2, 3 + .endif +.endm + +/* +//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// ptrbb = bb; +// #else +// ptrba += off*8; +// ptrbb = bb + off*4; +// #endif +*/ +.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B + #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /* ptrbb = bb;*/ + mr \PTR_B,\B_VAL /* refresh BPOINT */ + + #else + /* + // ptrba =ptrba+ off*C_A; + // ptrbb = bb + off*C_B; + */ + SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ + SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ + add \PTR_B, \B_VAL , T4 /* Add values to BO */ + add \PTR_A, \PTR_A, T2 /* Add values to AO */ + #endif +.endm + + +/* +// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +// temp = bk-off; +// #elif defined(LEFT) +// temp = off+8; // number of values in A +// #else +// temp = off+4; // number of values in B +// #endif +*/ +.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B + #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + /* temp = bk-off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + + #elif defined(LEFT) + /* temp = off+INCR_A; // number of values in A */ + addi \TEMP_BK, \OFF_VAL, \INCR_A + #else + /* temp = off+INCR_B // number of values in B*/ + addi \TEMP_BK,\OFF_VAL, \INCR_B + #endif + +.endm +/* +// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && 
!defined(TRANSA)) +// temp = bk - off; +// #ifdef LEFT +// temp -= 8; // number of values in A +// #else +// temp -= 4; // number of values in B +// #endif +// ptrba += temp*8; +// ptrbb += temp*4; +// #endif + +// #ifdef LEFT +// off += 8; // number of values in A +// #endif +*/ + + +.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B + + #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /*temp = bk - off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + #ifdef LEFT + /*temp -= 8; // number of values in A*/ + addi \TEMP_BK,\TEMP_BK,-\C_A + #else + /*temp -= 4; // number of values in B*/ + addi \TEMP_BK,\TEMP_BK,-\C_B + #endif + /*ptrba += temp*C_A; + ptrbb += temp*C_B;*/ + SHIFT_REG T4,\TEMP_BK,\C_A + SHIFT_REG T2,\TEMP_BK,\C_B + add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ + add \PTR_B, \PTR_B,T2 + + #endif + + #ifdef LEFT + /*off += 8; // number of values in A*/ + addi \OFF_VAL,\OFF_VAL,\C_A + #endif .endm \ No newline at end of file diff --git a/kernel/power/cgemv_n.c b/kernel/power/cgemv_n.c index 8663039c5..575847da2 100644 --- a/kernel/power/cgemv_n.c +++ b/kernel/power/cgemv_n.c @@ -1,597 +1,597 @@ -/*************************************************************************** -Copyright (c) 2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ -#if !defined(__VEC__) || !defined(__ALTIVEC__) -#include "../arm/zgemv_n.c" -#else - -#include -#include -#include "common.h" -#include -#define NBMAX 1024 - - -static const unsigned char __attribute__((aligned(16))) swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11}; - - -static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { - - FLOAT *a0, *a1, *a2, *a3; - a0 = ap; - a1 = ap + lda; - a2 = a1 + lda; - a3 = a2 + lda; - __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - register __vector float vx0_r = {x[0], x[0],x[0], x[0]}; - register __vector float vx0_i = {-x[1], x[1],-x[1], x[1]}; - register __vector float vx1_r = {x[2], x[2],x[2], x[2]}; - register __vector float vx1_i = {-x[3], x[3],-x[3], x[3]}; - register __vector float vx2_r = {x[4], x[4],x[4], x[4]}; - register __vector float vx2_i = {-x[5], x[5],-x[5], x[5]}; - register __vector float vx3_r = {x[6], x[6],x[6], x[6]}; - register __vector float vx3_i = {-x[7], x[7],-x[7], x[7]}; -#else - register __vector float vx0_r = {x[0], -x[0],x[0], -x[0]}; - register __vector float vx0_i = {x[1], x[1],x[1], x[1]}; - register __vector float vx1_r = {x[2], -x[2],x[2], -x[2]}; - register __vector float vx1_i = {x[3], x[3],x[3], x[3]}; - register __vector float vx2_r = {x[4], -x[4],x[4], -x[4]}; - register __vector float vx2_i = {x[5], x[5],x[5], x[5]}; - register __vector float vx3_r = {x[6], -x[6],x[6], -x[6]}; - register __vector float vx3_i = {x[7], x[7],x[7], x[7]}; -#endif - register __vector float *vptr_y = (__vector float *) y; - register __vector float *vptr_a0 = (__vector float *) a0; - register __vector float *vptr_a1 = (__vector float *) a1; - register __vector float *vptr_a2 = (__vector float *) a2; - register __vector float *vptr_a3 = (__vector float *) a3; - BLASLONG i = 0; - BLASLONG i2=16; - for (;i< n * 8; i+=32,i2+=32) { - register __vector float vy_0 = vec_vsx_ld(i,vptr_y); - register __vector float vy_1 = vec_vsx_ld(i2,vptr_y); - register __vector float va0 = vec_vsx_ld(i,vptr_a0); - register __vector float va1 = vec_vsx_ld(i, vptr_a1); - register __vector float va2 = vec_vsx_ld(i ,vptr_a2); - register __vector float va3 = vec_vsx_ld(i ,vptr_a3); - register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); - register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1); - register __vector float va2_1 = vec_vsx_ld(i2 ,vptr_a2); - register __vector float va3_1 = vec_vsx_ld(i2 ,vptr_a3); - - vy_0 += va0*vx0_r + va1*vx1_r + va2*vx2_r + va3*vx3_r; - vy_1 += va0_1*vx0_r + va1_1*vx1_r + va2_1*vx2_r + va3_1*vx3_r; - va0 = vec_perm(va0, va0,swap_mask); - va0_1 = vec_perm(va0_1, va0_1,swap_mask); - va1 = vec_perm(va1, va1,swap_mask); - va1_1 = vec_perm(va1_1, va1_1,swap_mask); - va2 = vec_perm(va2, va2,swap_mask); - va2_1 = vec_perm(va2_1, va2_1,swap_mask); - va3 = vec_perm(va3, va3,swap_mask); - va3_1 = vec_perm(va3_1, va3_1,swap_mask); - vy_0 += va0*vx0_i + va1*vx1_i + va2*vx2_i + va3*vx3_i; - vy_1 += va0_1*vx0_i + va1_1*vx1_i + va2_1*vx2_i + va3_1*vx3_i; - - vec_vsx_st(vy_0 ,i, vptr_y); - vec_vsx_st(vy_1,i2,vptr_y); - } - -} - - - -static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { - - FLOAT *a0, *a1; - a0 = ap; - a1 = ap + lda; - __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && 
defined(XCONJ) ) - register __vector float vx0_r = {x[0], x[0],x[0], x[0]}; - register __vector float vx0_i = {-x[1], x[1],-x[1], x[1]}; - register __vector float vx1_r = {x[2], x[2],x[2], x[2]}; - register __vector float vx1_i = {-x[3], x[3],-x[3], x[3]}; -#else - register __vector float vx0_r = {x[0], -x[0],x[0], -x[0]}; - register __vector float vx0_i = {x[1], x[1],x[1], x[1]}; - register __vector float vx1_r = {x[2], -x[2],x[2], -x[2]}; - register __vector float vx1_i = {x[3], x[3],x[3], x[3]}; -#endif - register __vector float *vptr_y = (__vector float *) y; - register __vector float *vptr_a0 = (__vector float *) a0; - register __vector float *vptr_a1 = (__vector float *) a1; - BLASLONG i = 0; - BLASLONG i2 = 16; - for (;i< n * 8; i+=32, i2+=32) { - register __vector float vy_0 = vec_vsx_ld(i,vptr_y); - register __vector float vy_1 = vec_vsx_ld(i2,vptr_y); - register __vector float va0 = vec_vsx_ld(i,vptr_a0); - register __vector float va1 = vec_vsx_ld(i, vptr_a1); - register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); - register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1); - - register __vector float va0x = vec_perm(va0, va0,swap_mask); - register __vector float va0x_1 = vec_perm(va0_1, va0_1,swap_mask); - register __vector float va1x = vec_perm(va1, va1,swap_mask); - register __vector float va1x_1 = vec_perm(va1_1, va1_1,swap_mask); - vy_0 += va0*vx0_r + va1*vx1_r + va0x*vx0_i + va1x*vx1_i; - vy_1 += va0_1*vx0_r + va1_1*vx1_r + va0x_1*vx0_i + va1x_1*vx1_i; - - vec_vsx_st(vy_0 ,i, vptr_y); - vec_vsx_st(vy_1,i2,vptr_y); - } - -} - - - -static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { - - __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - register __vector float vx0_r = {x[0], x[0],x[0], x[0]}; - register __vector float vx0_i = {-x[1], x[1],-x[1], x[1]}; -#else - register __vector float vx0_r = {x[0], -x[0],x[0], -x[0]}; - register __vector float vx0_i = {x[1], x[1],x[1], x[1]}; -#endif - register __vector float *vptr_y = (__vector float *) y; - register __vector float *vptr_a0 = (__vector float *) ap; - BLASLONG i = 0; - BLASLONG i2 = 16; - for (;i< n * 8; i+=32, i2+=32) { - register __vector float vy_0 = vec_vsx_ld(i,vptr_y); - register __vector float vy_1 = vec_vsx_ld(i2,vptr_y); - register __vector float va0 = vec_vsx_ld(i,vptr_a0); - register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); - - register __vector float va0x = vec_perm(va0, va0,swap_mask); - register __vector float va0x_1 = vec_perm(va0_1, va0_1,swap_mask); - vy_0 += va0*vx0_r + va0x*vx0_i; - vy_1 += va0_1*vx0_r + va0x_1*vx0_i; - - vec_vsx_st(vy_0 ,i, vptr_y); - vec_vsx_st(vy_1,i2,vptr_y); - } -} - - - - -static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) { - BLASLONG i=0; - - - if (inc_dest != 2) { - FLOAT temp_r; - FLOAT temp_i; - for ( i=0; i +#include +#include "common.h" +#include +#define NBMAX 1024 + + +static const unsigned char __attribute__((aligned(16))) swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11}; + + +static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { + + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + register __vector float vx0_r = {x[0], x[0],x[0], x[0]}; + 
register __vector float vx0_i = {-x[1], x[1],-x[1], x[1]}; + register __vector float vx1_r = {x[2], x[2],x[2], x[2]}; + register __vector float vx1_i = {-x[3], x[3],-x[3], x[3]}; + register __vector float vx2_r = {x[4], x[4],x[4], x[4]}; + register __vector float vx2_i = {-x[5], x[5],-x[5], x[5]}; + register __vector float vx3_r = {x[6], x[6],x[6], x[6]}; + register __vector float vx3_i = {-x[7], x[7],-x[7], x[7]}; +#else + register __vector float vx0_r = {x[0], -x[0],x[0], -x[0]}; + register __vector float vx0_i = {x[1], x[1],x[1], x[1]}; + register __vector float vx1_r = {x[2], -x[2],x[2], -x[2]}; + register __vector float vx1_i = {x[3], x[3],x[3], x[3]}; + register __vector float vx2_r = {x[4], -x[4],x[4], -x[4]}; + register __vector float vx2_i = {x[5], x[5],x[5], x[5]}; + register __vector float vx3_r = {x[6], -x[6],x[6], -x[6]}; + register __vector float vx3_i = {x[7], x[7],x[7], x[7]}; +#endif + register __vector float *vptr_y = (__vector float *) y; + register __vector float *vptr_a0 = (__vector float *) a0; + register __vector float *vptr_a1 = (__vector float *) a1; + register __vector float *vptr_a2 = (__vector float *) a2; + register __vector float *vptr_a3 = (__vector float *) a3; + BLASLONG i = 0; + BLASLONG i2=16; + for (;i< n * 8; i+=32,i2+=32) { + register __vector float vy_0 = vec_vsx_ld(i,vptr_y); + register __vector float vy_1 = vec_vsx_ld(i2,vptr_y); + register __vector float va0 = vec_vsx_ld(i,vptr_a0); + register __vector float va1 = vec_vsx_ld(i, vptr_a1); + register __vector float va2 = vec_vsx_ld(i ,vptr_a2); + register __vector float va3 = vec_vsx_ld(i ,vptr_a3); + register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); + register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1); + register __vector float va2_1 = vec_vsx_ld(i2 ,vptr_a2); + register __vector float va3_1 = vec_vsx_ld(i2 ,vptr_a3); + + vy_0 += va0*vx0_r + va1*vx1_r + va2*vx2_r + va3*vx3_r; + vy_1 += va0_1*vx0_r + va1_1*vx1_r + va2_1*vx2_r + va3_1*vx3_r; + va0 = vec_perm(va0, va0,swap_mask); + va0_1 = vec_perm(va0_1, va0_1,swap_mask); + va1 = vec_perm(va1, va1,swap_mask); + va1_1 = vec_perm(va1_1, va1_1,swap_mask); + va2 = vec_perm(va2, va2,swap_mask); + va2_1 = vec_perm(va2_1, va2_1,swap_mask); + va3 = vec_perm(va3, va3,swap_mask); + va3_1 = vec_perm(va3_1, va3_1,swap_mask); + vy_0 += va0*vx0_i + va1*vx1_i + va2*vx2_i + va3*vx3_i; + vy_1 += va0_1*vx0_i + va1_1*vx1_i + va2_1*vx2_i + va3_1*vx3_i; + + vec_vsx_st(vy_0 ,i, vptr_y); + vec_vsx_st(vy_1,i2,vptr_y); + } + +} + + + +static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { + + FLOAT *a0, *a1; + a0 = ap; + a1 = ap + lda; + __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + register __vector float vx0_r = {x[0], x[0],x[0], x[0]}; + register __vector float vx0_i = {-x[1], x[1],-x[1], x[1]}; + register __vector float vx1_r = {x[2], x[2],x[2], x[2]}; + register __vector float vx1_i = {-x[3], x[3],-x[3], x[3]}; +#else + register __vector float vx0_r = {x[0], -x[0],x[0], -x[0]}; + register __vector float vx0_i = {x[1], x[1],x[1], x[1]}; + register __vector float vx1_r = {x[2], -x[2],x[2], -x[2]}; + register __vector float vx1_i = {x[3], x[3],x[3], x[3]}; +#endif + register __vector float *vptr_y = (__vector float *) y; + register __vector float *vptr_a0 = (__vector float *) a0; + register __vector float *vptr_a1 = (__vector float *) a1; + BLASLONG i = 0; + BLASLONG i2 = 16; + for (;i< n * 8; i+=32, i2+=32) { + 
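/* Editor's note -- annotation only, not part of the patch. The vector loop below
   computes, four complex elements at a time, the scalar recurrence sketched here
   (the helper and its names are illustrative, assuming interleaved (re,im) storage):

       static void cgemv_n_ref(BLASLONG n, const FLOAT *a, FLOAT xr, FLOAT xi, FLOAT *y) {
           for (BLASLONG k = 0; k < n; k++) {             // one column entry, one x element
               y[2*k]   += a[2*k]   * xr - a[2*k+1] * xi; // real part
               y[2*k+1] += a[2*k+1] * xr + a[2*k]   * xi; // imaginary part
           }
       }

   vx*_r broadcasts the real part of x; vx*_i carries the imaginary part with
   alternating signs ({-xi, xi, ...} in the non-conjugated branch); vec_perm with
   swap_mask swaps each (re,im) pair of the matrix data, so va*vx_r + swap(va)*vx_i
   reproduces the two update lines above. */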
register __vector float vy_0 = vec_vsx_ld(i,vptr_y); + register __vector float vy_1 = vec_vsx_ld(i2,vptr_y); + register __vector float va0 = vec_vsx_ld(i,vptr_a0); + register __vector float va1 = vec_vsx_ld(i, vptr_a1); + register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); + register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1); + + register __vector float va0x = vec_perm(va0, va0,swap_mask); + register __vector float va0x_1 = vec_perm(va0_1, va0_1,swap_mask); + register __vector float va1x = vec_perm(va1, va1,swap_mask); + register __vector float va1x_1 = vec_perm(va1_1, va1_1,swap_mask); + vy_0 += va0*vx0_r + va1*vx1_r + va0x*vx0_i + va1x*vx1_i; + vy_1 += va0_1*vx0_r + va1_1*vx1_r + va0x_1*vx0_i + va1x_1*vx1_i; + + vec_vsx_st(vy_0 ,i, vptr_y); + vec_vsx_st(vy_1,i2,vptr_y); + } + +} + + + +static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { + + __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + register __vector float vx0_r = {x[0], x[0],x[0], x[0]}; + register __vector float vx0_i = {-x[1], x[1],-x[1], x[1]}; +#else + register __vector float vx0_r = {x[0], -x[0],x[0], -x[0]}; + register __vector float vx0_i = {x[1], x[1],x[1], x[1]}; +#endif + register __vector float *vptr_y = (__vector float *) y; + register __vector float *vptr_a0 = (__vector float *) ap; + BLASLONG i = 0; + BLASLONG i2 = 16; + for (;i< n * 8; i+=32, i2+=32) { + register __vector float vy_0 = vec_vsx_ld(i,vptr_y); + register __vector float vy_1 = vec_vsx_ld(i2,vptr_y); + register __vector float va0 = vec_vsx_ld(i,vptr_a0); + register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); + + register __vector float va0x = vec_perm(va0, va0,swap_mask); + register __vector float va0x_1 = vec_perm(va0_1, va0_1,swap_mask); + vy_0 += va0*vx0_r + va0x*vx0_i; + vy_1 += va0_1*vx0_r + va0x_1*vx0_i; + + vec_vsx_st(vy_0 ,i, vptr_y); + vec_vsx_st(vy_1,i2,vptr_y); + } +} + + + + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) { + BLASLONG i=0; + + + if (inc_dest != 2) { + FLOAT temp_r; + FLOAT temp_i; + for ( i=0; i -static const unsigned char __attribute__((aligned(16))) swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11}; - -static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { - - FLOAT *a0, *a1, *a2, *a3; - a0 = ap; - a1 = ap + lda; - a2 = a1 + lda; - a3 = a2 + lda; - __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); - //p for positive(real*real,image*image,real*real,image*image) r for image (real*image,image*real,real*image,image*real) - register __vector float vtemp0_p = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp1_p = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp1_r = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp2_p = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp2_r = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp3_p = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp3_r = {0.0, 0.0,0.0,0.0}; - __vector float* vptr_a0 = (__vector float*) a0; - __vector float* vptr_a1 = (__vector float*) a1; - __vector float* vptr_a2 = (__vector float*) a2; - __vector float* vptr_a3 = (__vector float*) a3; - __vector float* v_x = (__vector float*) x; - - BLASLONG i = 0; - BLASLONG i2 = 16; - for (;i< n * 8; i+=32, i2+=32) { - register __vector float vx_0 = vec_vsx_ld( 
i,v_x) ; - register __vector float vx_1 = vec_vsx_ld(i2, v_x); - - register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask); - register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask); - - register __vector float va0 = vec_vsx_ld(i,vptr_a0); - register __vector float va1 = vec_vsx_ld(i, vptr_a1); - register __vector float va2 = vec_vsx_ld(i ,vptr_a2); - register __vector float va3 = vec_vsx_ld(i ,vptr_a3); - register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); - register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1); - register __vector float va2_1 = vec_vsx_ld(i2 ,vptr_a2); - register __vector float va3_1 = vec_vsx_ld(i2 ,vptr_a3); - - - vtemp0_p += vx_0*va0 + vx_1*va0_1 ; - vtemp0_r += vxr_0*va0 + vxr_1*va0_1; - vtemp1_p += vx_0*va1 + vx_1*va1_1; - vtemp1_r += vxr_0*va1 + vxr_1*va1_1; - vtemp2_p += vx_0*va2 + vx_1*va2_1; - vtemp2_r += vxr_0*va2 + vxr_1*va2_1; - vtemp3_p += vx_0*va3 + vx_1*va3_1; - vtemp3_r += vxr_0*va3 + vxr_1*va3_1; - - } - -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - - register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1] + vtemp0_p[2] - vtemp0_p[3]; - register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1] + vtemp0_r[2] + vtemp0_r[3]; - - register FLOAT temp_r1 = vtemp1_p[0] - vtemp1_p[1] + vtemp1_p[2] - vtemp1_p[3]; - register FLOAT temp_i1 = vtemp1_r[0] + vtemp1_r[1] + vtemp1_r[2] + vtemp1_r[3]; - - register FLOAT temp_r2 = vtemp2_p[0] - vtemp2_p[1] + vtemp2_p[2] - vtemp2_p[3]; - register FLOAT temp_i2 = vtemp2_r[0] + vtemp2_r[1] + vtemp2_r[2] + vtemp2_r[3]; - - register FLOAT temp_r3 = vtemp3_p[0] - vtemp3_p[1] + vtemp3_p[2] - vtemp3_p[3]; - register FLOAT temp_i3 = vtemp3_r[0] + vtemp3_r[1] + vtemp3_r[2] + vtemp3_r[3]; - -#else - register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1] + vtemp0_p[2] + vtemp0_p[3]; - register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1] + vtemp0_r[2] - vtemp0_r[3]; - - register FLOAT temp_r1 = vtemp1_p[0] + vtemp1_p[1] + vtemp1_p[2] + vtemp1_p[3]; - register FLOAT temp_i1 = vtemp1_r[0] - vtemp1_r[1] + vtemp1_r[2] - vtemp1_r[3]; - - register FLOAT temp_r2 = vtemp2_p[0] + vtemp2_p[1] + vtemp2_p[2] + vtemp2_p[3]; - register FLOAT temp_i2 = vtemp2_r[0] - vtemp2_r[1] + vtemp2_r[2] - vtemp2_r[3]; - - register FLOAT temp_r3 = vtemp3_p[0] + vtemp3_p[1] + vtemp3_p[2] + vtemp3_p[3]; - register FLOAT temp_i3 = vtemp3_r[0] - vtemp3_r[1] + vtemp3_r[2] - vtemp3_r[3]; - -#endif - -#if !defined(XCONJ) - - y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; - y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; - y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; - y[4] += alpha_r * temp_r2 - alpha_i * temp_i2; - y[5] += alpha_r * temp_i2 + alpha_i * temp_r2; - y[6] += alpha_r * temp_r3 - alpha_i * temp_i3; - y[7] += alpha_r * temp_i3 + alpha_i * temp_r3; - -#else - - y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; - y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; - y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; - y[4] += alpha_r * temp_r2 + alpha_i * temp_i2; - y[5] -= alpha_r * temp_i2 - alpha_i * temp_r2; - y[6] += alpha_r * temp_r3 + alpha_i * temp_i3; - y[7] -= alpha_r * temp_i3 - alpha_i * temp_r3; - -#endif - -} - - -static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { - - FLOAT *a0, *a1; - a0 = ap; - a1 = ap + lda; - __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); - //p for positive(real*real,image*image,real*real,image*image) 
r for image (real*image,image*real,real*image,image*real) - register __vector float vtemp0_p = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp1_p = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp1_r = {0.0, 0.0,0.0,0.0}; - - - __vector float* vptr_a0 = (__vector float*) a0; - __vector float* vptr_a1 = (__vector float*) a1; - __vector float* v_x = (__vector float*) x; - - BLASLONG i = 0; - BLASLONG i2 = 16; - for (;i< n * 8; i+=32, i2+=32) { - register __vector float vx_0 = vec_vsx_ld( i,v_x) ; - register __vector float vx_1 = vec_vsx_ld(i2, v_x); - - register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask); - register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask); - - register __vector float va0 = vec_vsx_ld(i,vptr_a0); - register __vector float va1 = vec_vsx_ld(i, vptr_a1); - register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); - register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1); - - - vtemp0_p += vx_0*va0 + vx_1*va0_1 ; - vtemp0_r += vxr_0*va0 + vxr_1*va0_1; - vtemp1_p += vx_0*va1 + vx_1*va1_1; - vtemp1_r += vxr_0*va1 + vxr_1*va1_1; - - } -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - - register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1] + vtemp0_p[2] - vtemp0_p[3]; - register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1] + vtemp0_r[2] + vtemp0_r[3]; - - register FLOAT temp_r1 = vtemp1_p[0] - vtemp1_p[1] + vtemp1_p[2] - vtemp1_p[3]; - register FLOAT temp_i1 = vtemp1_r[0] + vtemp1_r[1] + vtemp1_r[2] + vtemp1_r[3]; - - -#else - register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1] + vtemp0_p[2] + vtemp0_p[3]; - register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1] + vtemp0_r[2] - vtemp0_r[3]; - - register FLOAT temp_r1 = vtemp1_p[0] + vtemp1_p[1] + vtemp1_p[2] + vtemp1_p[3]; - register FLOAT temp_i1 = vtemp1_r[0] - vtemp1_r[1] + vtemp1_r[2] - vtemp1_r[3]; - -#endif - -#if !defined(XCONJ) - - y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; - y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; - y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; - -#else - - y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; - y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; - y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; - -#endif - -} - - -static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { - - __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); - //p for positive(real*real,image*image,real*real,image*image) r for image (real*image,image*real,real*image,image*real) - register __vector float vtemp0_p = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0}; - __vector float* vptr_a0 = (__vector float*) ap; - __vector float* v_x = (__vector float*) x; - BLASLONG i = 0; - BLASLONG i2 = 16; - for (;i< n * 8; i+=32, i2+=32) { - register __vector float vx_0 = vec_vsx_ld( i,v_x) ; - register __vector float vx_1 = vec_vsx_ld(i2, v_x); - - register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask); - register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask); - - register __vector float va0 = vec_vsx_ld(i,vptr_a0); - register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); - - vtemp0_p += vx_0*va0 + vx_1*va0_1 ; - vtemp0_r += vxr_0*va0 + vxr_1*va0_1; - } - -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - - register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1] + vtemp0_p[2] - 
vtemp0_p[3]; - register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1] + vtemp0_r[2] + vtemp0_r[3]; - -#else - register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1] + vtemp0_p[2] + vtemp0_p[3]; - register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1] + vtemp0_r[2] - vtemp0_r[3]; - -#endif - -#if !defined(XCONJ) - - y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; - y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; - -#else - - y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; - y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; - -#endif - - -} - -static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { - BLASLONG i; - for (i = 0; i < n; i++) { - *dest = *src; - *(dest + 1) = *(src + 1); - dest += 2; - src += inc_src; - } -} - -int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { - BLASLONG i=0; - BLASLONG j=0; - FLOAT *a_ptr; - FLOAT *x_ptr; - FLOAT *y_ptr; - - BLASLONG n1; - BLASLONG m1; - BLASLONG m2; - BLASLONG m3; - BLASLONG n2; - FLOAT ybuffer[8] __attribute__((aligned(16))); - FLOAT *xbuffer; - - if (m < 1) return (0); - if (n < 1) return (0); - - inc_x <<= 1; - inc_y <<= 1; - lda <<= 1; - - xbuffer = buffer; - - n1 = n >> 2; - n2 = n & 3; - - m3 = m & 3; - m1 = m - m3; - m2 = (m & (NBMAX - 1)) - m3; - - BLASLONG NB = NBMAX; - - while (NB == NBMAX) { - - m1 -= NB; - if (m1 < 0) { - if (m2 == 0) break; - NB = m2; - } - - y_ptr = y; - a_ptr = a; - x_ptr = x; - - if (inc_x != 2) - copy_x(NB, x_ptr, xbuffer, inc_x); - else - xbuffer = x_ptr; - - if (inc_y == 2) { - - for (i = 0; i < n1; i++) { - cgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); - a_ptr += lda << 2; - y_ptr += 8; - - } - - if (n2 & 2) { - cgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); - a_ptr += lda << 1; - y_ptr += 4; - - } - - if (n2 & 1) { - cgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); - a_ptr += lda; - y_ptr += 2; - - } - - } else { - - for (i = 0; i < n1; i++) { - memset(ybuffer, 0, sizeof (ybuffer)); - cgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha_r, alpha_i); - - a_ptr += lda << 2; - - y_ptr[0] += ybuffer[0]; - y_ptr[1] += ybuffer[1]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[2]; - y_ptr[1] += ybuffer[3]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[4]; - y_ptr[1] += ybuffer[5]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[6]; - y_ptr[1] += ybuffer[7]; - y_ptr += inc_y; - - } - - for (i = 0; i < n2; i++) { - memset(ybuffer, 0, sizeof (ybuffer)); - cgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, alpha_r, alpha_i); - a_ptr += lda; - y_ptr[0] += ybuffer[0]; - y_ptr[1] += ybuffer[1]; - y_ptr += inc_y; - - } - - } - a += 2 * NB; - x += NB * inc_x; - } - - if (m3 == 0) return (0); - - x_ptr = x; - j = 0; - a_ptr = a; - y_ptr = y; - - if (m3 == 3) { - - FLOAT temp_r; - FLOAT temp_i; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x2 = x_ptr[0]; - FLOAT x3 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x4 = x_ptr[0]; - FLOAT x5 = x_ptr[1]; - while (j < n) { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; - temp_r += a_ptr[4] * x4 - a_ptr[5] * x5; - temp_i += a_ptr[4] * x5 + a_ptr[5] * x4; -#else - - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += 
a_ptr[2] * x3 - a_ptr[3] * x2; - temp_r += a_ptr[4] * x4 + a_ptr[5] * x5; - temp_i += a_ptr[4] * x5 - a_ptr[5] * x4; -#endif - -#if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; -#else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; -#endif - - a_ptr += lda; - y_ptr += inc_y; - j++; - } - return (0); - } - - if (m3 == 2) { - - FLOAT temp_r; - FLOAT temp_i; - FLOAT temp_r1; - FLOAT temp_i1; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x2 = x_ptr[0]; - FLOAT x3 = x_ptr[1]; - - while (j < (n & -2)) { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2; -#else - - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2; -#endif - -#if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; - y_ptr += inc_y; - y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; - y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; -#else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; - y_ptr += inc_y; - y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; - y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; -#endif - - a_ptr += lda; - y_ptr += inc_y; - j += 2; - } - - while (j < n) { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; -#else - - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; -#endif - -#if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; -#else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; -#endif - - a_ptr += lda; - y_ptr += inc_y; - j++; - } - - return (0); - } - - if (m3 == 1) { - - FLOAT temp_r; - FLOAT temp_i; - FLOAT temp_r1; - FLOAT temp_i1; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; - - while (j < (n & -2)) { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; -#else - - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; -#endif - -#if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; - y_ptr += 
inc_y; - y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; - y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; -#else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; - y_ptr += inc_y; - y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; - y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; -#endif - - a_ptr += lda; - y_ptr += inc_y; - j += 2; - } - - while (j < n) { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; -#else - - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; -#endif - -#if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; -#else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; -#endif - - a_ptr += lda; - y_ptr += inc_y; - j++; - } - return (0); - } - - return (0); - -} -#endif +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ +#if !defined(__VEC__) || !defined(__ALTIVEC__) +#include "../arm/zgemv_t.c" +#else + +#include "common.h" + +#define NBMAX 1024 +#include +static const unsigned char __attribute__((aligned(16))) swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11}; + +static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { + + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); + //p for positive(real*real,image*image,real*real,image*image) r for image (real*image,image*real,real*image,image*real) + register __vector float vtemp0_p = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp1_p = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp1_r = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp2_p = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp2_r = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp3_p = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp3_r = {0.0, 0.0,0.0,0.0}; + __vector float* vptr_a0 = (__vector float*) a0; + __vector float* vptr_a1 = (__vector float*) a1; + __vector float* vptr_a2 = (__vector float*) a2; + __vector float* vptr_a3 = (__vector float*) a3; + __vector float* v_x = (__vector float*) x; + + BLASLONG i = 0; + BLASLONG i2 = 16; + for (;i< n * 8; i+=32, i2+=32) { + register __vector float vx_0 = vec_vsx_ld( i,v_x) ; + register __vector float vx_1 = vec_vsx_ld(i2, v_x); + + register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask); + register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask); + + register __vector float va0 = vec_vsx_ld(i,vptr_a0); + register __vector float va1 = vec_vsx_ld(i, vptr_a1); + register __vector float va2 = vec_vsx_ld(i ,vptr_a2); + register __vector float va3 = vec_vsx_ld(i ,vptr_a3); + register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); + register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1); + register __vector float va2_1 = vec_vsx_ld(i2 ,vptr_a2); + register __vector float va3_1 = vec_vsx_ld(i2 ,vptr_a3); + + + vtemp0_p += vx_0*va0 + vx_1*va0_1 ; + vtemp0_r += vxr_0*va0 + vxr_1*va0_1; + vtemp1_p += vx_0*va1 + vx_1*va1_1; + vtemp1_r += vxr_0*va1 + vxr_1*va1_1; + vtemp2_p += vx_0*va2 + vx_1*va2_1; + vtemp2_r += vxr_0*va2 + vxr_1*va2_1; + vtemp3_p += vx_0*va3 + vx_1*va3_1; + vtemp3_r += vxr_0*va3 + vxr_1*va3_1; + + } + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + + register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1] + vtemp0_p[2] - vtemp0_p[3]; + register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1] + vtemp0_r[2] + vtemp0_r[3]; + + register FLOAT temp_r1 = vtemp1_p[0] - vtemp1_p[1] + vtemp1_p[2] - vtemp1_p[3]; + register FLOAT temp_i1 = vtemp1_r[0] + vtemp1_r[1] + vtemp1_r[2] + vtemp1_r[3]; + + register FLOAT temp_r2 = vtemp2_p[0] - vtemp2_p[1] + vtemp2_p[2] - vtemp2_p[3]; + register FLOAT temp_i2 = vtemp2_r[0] + vtemp2_r[1] + vtemp2_r[2] + vtemp2_r[3]; + + register FLOAT temp_r3 = vtemp3_p[0] - vtemp3_p[1] + vtemp3_p[2] - vtemp3_p[3]; + register FLOAT temp_i3 = vtemp3_r[0] + vtemp3_r[1] + vtemp3_r[2] + vtemp3_r[3]; + +#else + register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1] + vtemp0_p[2] + vtemp0_p[3]; + register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1] + vtemp0_r[2] - vtemp0_r[3]; + + register FLOAT temp_r1 = vtemp1_p[0] + vtemp1_p[1] + vtemp1_p[2] + vtemp1_p[3]; + 
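/* Editor's note -- annotation only, not part of the patch. With the lane numbering
   used by this code, vtempN_p accumulates a_re*x_re and a_im*x_im in alternating
   lanes, while vtempN_r (built from the swap_mask permute of x) accumulates
   a_re*x_im and a_im*x_re. The horizontal sums select the sign pattern for the
   requested conjugation; for the non-conjugated dot product (first branch above):

       // re = sum_k (a_re*x_re - a_im*x_im)  ->  p[0] - p[1] + p[2] - p[3]
       // im = sum_k (a_re*x_im + a_im*x_re)  ->  r[0] + r[1] + r[2] + r[3]

   whereas this branch sums the p[] lanes without alternating signs and alternates
   the signs of the r[] lanes instead, which effectively conjugates one of the two
   factors. */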
register FLOAT temp_i1 = vtemp1_r[0] - vtemp1_r[1] + vtemp1_r[2] - vtemp1_r[3]; + + register FLOAT temp_r2 = vtemp2_p[0] + vtemp2_p[1] + vtemp2_p[2] + vtemp2_p[3]; + register FLOAT temp_i2 = vtemp2_r[0] - vtemp2_r[1] + vtemp2_r[2] - vtemp2_r[3]; + + register FLOAT temp_r3 = vtemp3_p[0] + vtemp3_p[1] + vtemp3_p[2] + vtemp3_p[3]; + register FLOAT temp_i3 = vtemp3_r[0] - vtemp3_r[1] + vtemp3_r[2] - vtemp3_r[3]; + +#endif + +#if !defined(XCONJ) + + y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; + y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; + y[4] += alpha_r * temp_r2 - alpha_i * temp_i2; + y[5] += alpha_r * temp_i2 + alpha_i * temp_r2; + y[6] += alpha_r * temp_r3 - alpha_i * temp_i3; + y[7] += alpha_r * temp_i3 + alpha_i * temp_r3; + +#else + + y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; + y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; + y[4] += alpha_r * temp_r2 + alpha_i * temp_i2; + y[5] -= alpha_r * temp_i2 - alpha_i * temp_r2; + y[6] += alpha_r * temp_r3 + alpha_i * temp_i3; + y[7] -= alpha_r * temp_i3 - alpha_i * temp_r3; + +#endif + +} + + +static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { + + FLOAT *a0, *a1; + a0 = ap; + a1 = ap + lda; + __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); + //p for positive(real*real,image*image,real*real,image*image) r for image (real*image,image*real,real*image,image*real) + register __vector float vtemp0_p = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp1_p = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp1_r = {0.0, 0.0,0.0,0.0}; + + + __vector float* vptr_a0 = (__vector float*) a0; + __vector float* vptr_a1 = (__vector float*) a1; + __vector float* v_x = (__vector float*) x; + + BLASLONG i = 0; + BLASLONG i2 = 16; + for (;i< n * 8; i+=32, i2+=32) { + register __vector float vx_0 = vec_vsx_ld( i,v_x) ; + register __vector float vx_1 = vec_vsx_ld(i2, v_x); + + register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask); + register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask); + + register __vector float va0 = vec_vsx_ld(i,vptr_a0); + register __vector float va1 = vec_vsx_ld(i, vptr_a1); + register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); + register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1); + + + vtemp0_p += vx_0*va0 + vx_1*va0_1 ; + vtemp0_r += vxr_0*va0 + vxr_1*va0_1; + vtemp1_p += vx_0*va1 + vx_1*va1_1; + vtemp1_r += vxr_0*va1 + vxr_1*va1_1; + + } +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + + register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1] + vtemp0_p[2] - vtemp0_p[3]; + register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1] + vtemp0_r[2] + vtemp0_r[3]; + + register FLOAT temp_r1 = vtemp1_p[0] - vtemp1_p[1] + vtemp1_p[2] - vtemp1_p[3]; + register FLOAT temp_i1 = vtemp1_r[0] + vtemp1_r[1] + vtemp1_r[2] + vtemp1_r[3]; + + +#else + register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1] + vtemp0_p[2] + vtemp0_p[3]; + register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1] + vtemp0_r[2] - vtemp0_r[3]; + + register FLOAT temp_r1 = vtemp1_p[0] + vtemp1_p[1] + vtemp1_p[2] + vtemp1_p[3]; + register FLOAT temp_i1 = vtemp1_r[0] - vtemp1_r[1] + vtemp1_r[2] - vtemp1_r[3]; + +#endif + +#if !defined(XCONJ) + + y[0] += alpha_r * temp_r0 - alpha_i * 
temp_i0; + y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; + y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; + +#else + + y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; + y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; + +#endif + +} + + +static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { + + __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); + //p for positive(real*real,image*image,real*real,image*image) r for image (real*image,image*real,real*image,image*real) + register __vector float vtemp0_p = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0}; + __vector float* vptr_a0 = (__vector float*) ap; + __vector float* v_x = (__vector float*) x; + BLASLONG i = 0; + BLASLONG i2 = 16; + for (;i< n * 8; i+=32, i2+=32) { + register __vector float vx_0 = vec_vsx_ld( i,v_x) ; + register __vector float vx_1 = vec_vsx_ld(i2, v_x); + + register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask); + register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask); + + register __vector float va0 = vec_vsx_ld(i,vptr_a0); + register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); + + vtemp0_p += vx_0*va0 + vx_1*va0_1 ; + vtemp0_r += vxr_0*va0 + vxr_1*va0_1; + } + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + + register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1] + vtemp0_p[2] - vtemp0_p[3]; + register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1] + vtemp0_r[2] + vtemp0_r[3]; + +#else + register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1] + vtemp0_p[2] + vtemp0_p[3]; + register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1] + vtemp0_r[2] - vtemp0_r[3]; + +#endif + +#if !defined(XCONJ) + + y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + +#else + + y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + +#endif + + +} + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { + BLASLONG i; + for (i = 0; i < n; i++) { + *dest = *src; + *(dest + 1) = *(src + 1); + dest += 2; + src += inc_src; + } +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { + BLASLONG i=0; + BLASLONG j=0; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + FLOAT ybuffer[8] __attribute__((aligned(16))); + FLOAT *xbuffer; + + if (m < 1) return (0); + if (n < 1) return (0); + + inc_x <<= 1; + inc_y <<= 1; + lda <<= 1; + + xbuffer = buffer; + + n1 = n >> 2; + n2 = n & 3; + + m3 = m & 3; + m1 = m - m3; + m2 = (m & (NBMAX - 1)) - m3; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + + if (inc_x != 2) + copy_x(NB, x_ptr, xbuffer, inc_x); + else + xbuffer = x_ptr; + + if (inc_y == 2) { + + for (i = 0; i < n1; i++) { + cgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); + a_ptr += lda << 2; + y_ptr += 8; + + } + + if (n2 & 2) { + cgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); + a_ptr += lda << 1; + y_ptr += 4; + + } + + if (n2 & 1) { + cgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); + a_ptr += lda; + 
y_ptr += 2; + + } + + } else { + + for (i = 0; i < n1; i++) { + memset(ybuffer, 0, sizeof (ybuffer)); + cgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha_r, alpha_i); + + a_ptr += lda << 2; + + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[2]; + y_ptr[1] += ybuffer[3]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[4]; + y_ptr[1] += ybuffer[5]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[6]; + y_ptr[1] += ybuffer[7]; + y_ptr += inc_y; + + } + + for (i = 0; i < n2; i++) { + memset(ybuffer, 0, sizeof (ybuffer)); + cgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, alpha_r, alpha_i); + a_ptr += lda; + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + + } + + } + a += 2 * NB; + x += NB * inc_x; + } + + if (m3 == 0) return (0); + + x_ptr = x; + j = 0; + a_ptr = a; + y_ptr = y; + + if (m3 == 3) { + + FLOAT temp_r; + FLOAT temp_i; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x2 = x_ptr[0]; + FLOAT x3 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x4 = x_ptr[0]; + FLOAT x5 = x_ptr[1]; + while (j < n) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + temp_r += a_ptr[4] * x4 - a_ptr[5] * x5; + temp_i += a_ptr[4] * x5 + a_ptr[5] * x4; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + temp_r += a_ptr[4] * x4 + a_ptr[5] * x5; + temp_i += a_ptr[4] * x5 - a_ptr[5] * x4; +#endif + +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; +#else + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return (0); + } + + if (m3 == 2) { + + FLOAT temp_r; + FLOAT temp_i; + FLOAT temp_r1; + FLOAT temp_i1; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x2 = x_ptr[0]; + FLOAT x3 = x_ptr[1]; + + while (j < (n & -2)) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2; +#endif + +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; + y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; +#else + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; + y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; +#endif + + a_ptr += lda; + 
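
For reference (this note and the sketch are illustrative additions, not part of the patch): the reductions in the cgemv kernels above rely on the lane layout of the interleaved complex data. Each __vector float holds two complex elements (r0,i0,r1,i1), so the element-wise product vx*va leaves r*r and i*i terms in alternating lanes ("p" in the kernel comments), while the product with the swap_mask-permuted x (swap_mask presumably exchanges the two lanes of each complex pair; swap_mask_arr is defined earlier in this file) leaves r*i and i*r terms ("r"). The alternating-sign horizontal sums then recover the real and imaginary parts, and the sign pattern flips when exactly one of CONJ/XCONJ is defined, which is the same rule the scalar m3 tail code in this routine spells out per element. A minimal scalar sketch of that rule, using a made-up helper name cmac_ref:

/* Illustrative only -- a scalar view of the complex multiply-accumulate
 * that both the VSX kernels and the scalar m3 tail code implement.
 * conj_a selects the "exactly one of CONJ/XCONJ defined" variant.       */
static void cmac_ref(float a_r, float a_i, float x_r, float x_i,
                     float *acc_r, float *acc_i, int conj_a)
{
    if (!conj_a) {                       /* (a_r + i*a_i) * (x_r + i*x_i) */
        *acc_r += a_r * x_r - a_i * x_i;
        *acc_i += a_r * x_i + a_i * x_r;
    } else {                             /* (a_r - i*a_i) * (x_r + i*x_i) */
        *acc_r += a_r * x_r + a_i * x_i;
        *acc_i += a_r * x_i - a_i * x_r;
    }
}

Accumulating cmac_ref down one column and then applying alpha exactly as in the y[0]/y[1] updates above reproduces what cgemv_kernel_4x1 computes for that column.
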
y_ptr += inc_y; + j += 2; + } + + while (j < n) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; +#endif + +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; +#else + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j++; + } + + return (0); + } + + if (m3 == 1) { + + FLOAT temp_r; + FLOAT temp_i; + FLOAT temp_r1; + FLOAT temp_i1; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + + while (j < (n & -2)) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; +#endif + +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; + y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; +#else + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; + y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j += 2; + } + + while (j < n) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; +#endif + +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; +#else + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return (0); + } + + return (0); + +} +#endif diff --git a/kernel/power/crot.c b/kernel/power/crot.c index 84ba5d913..dbd7e3482 100644 --- a/kernel/power/crot.c +++ b/kernel/power/crot.c @@ -1,233 +1,233 @@ -/*************************************************************************** -Copyright (c) 2013-2018, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. 
Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" - -#if defined(POWER8) || defined(POWER9) || defined(POWER10) -#if defined(__VEC__) || defined(__ALTIVEC__) - -static void crot_kernel_8 (long n, float *x, float *y, float c, float s) -{ - __vector float t0; - __vector float t1; - __vector float t2; - __vector float t3; - __vector float t4; - __vector float t5; - __vector float t6; - __vector float t7; - __asm__ - ( - "xscvdpspn 36, %x[cos] \n\t" // load c to all words - "xxspltw 36, 36, 0 \n\t" - "xscvdpspn 37, %x[sin] \n\t" // load s to all words - "xxspltw 37, 37, 0 \n\t" - "lxvd2x 32, 0, %[x_ptr] \n\t" // load x - "lxvd2x 33, %[i16], %[x_ptr] \n\t" - "lxvd2x 34, %[i32], %[x_ptr] \n\t" - "lxvd2x 35, %[i48], %[x_ptr] \n\t" - "lxvd2x 48, 0, %[y_ptr] \n\t" // load y - "lxvd2x 49, %[i16], %[y_ptr] \n\t" - "lxvd2x 50, %[i32], %[y_ptr] \n\t" - "lxvd2x 51, %[i48], %[y_ptr] \n\t" - "addi %[x_ptr], %[x_ptr], 64 \n\t" - "addi %[y_ptr], %[y_ptr], 64 \n\t" - "addic. 
%[temp_n], %[temp_n], -8 \n\t" - "ble two%= \n\t" - ".align 5 \n\t" - "one%=: \n\t" - "xvmulsp 40, 32, 36 \n\t" // c * x - "xvmulsp 41, 33, 36 \n\t" - "xvmulsp 42, 34, 36 \n\t" - "xvmulsp 43, 35, 36 \n\t" - "xvmulsp %x[x0], 48, 36 \n\t" // c * y - "xvmulsp %x[x2], 49, 36 \n\t" - "xvmulsp %x[x1], 50, 36 \n\t" - "xvmulsp %x[x3], 51, 36 \n\t" - "xvmulsp 44, 32, 37 \n\t" // s * x - "xvmulsp 45, 33, 37 \n\t" - "lxvd2x 32, 0, %[x_ptr] \n\t" // load x - "lxvd2x 33, %[i16], %[x_ptr] \n\t" - "xvmulsp 46, 34, 37 \n\t" - "xvmulsp 47, 35, 37 \n\t" - "lxvd2x 34, %[i32], %[x_ptr] \n\t" - "lxvd2x 35, %[i48], %[x_ptr] \n\t" - "xvmulsp %x[x4], 48, 37 \n\t" // s * y - "xvmulsp %x[x5], 49, 37 \n\t" - "lxvd2x 48, 0, %[y_ptr] \n\t" // load y - "lxvd2x 49, %[i16], %[y_ptr] \n\t" - "xvmulsp %x[x6], 50, 37 \n\t" - "xvmulsp %x[x7], 51, 37 \n\t" - "lxvd2x 50, %[i32], %[y_ptr] \n\t" - "lxvd2x 51, %[i48], %[y_ptr] \n\t" - "xvaddsp 40, 40, %x[x4] \n\t" // c * x + s * y - "xvaddsp 41, 41, %x[x5] \n\t" // c * x + s * y - "addi %[x_ptr], %[x_ptr], -64 \n\t" - "addi %[y_ptr], %[y_ptr], -64 \n\t" - "xvaddsp 42, 42, %x[x6] \n\t" // c * x + s * y - "xvaddsp 43, 43, %x[x7] \n\t" // c * x + s * y - "xvsubsp %x[x0], %x[x0], 44 \n\t" // c * y - s * x - "xvsubsp %x[x2], %x[x2], 45 \n\t" // c * y - s * x - "xvsubsp %x[x1], %x[x1], 46 \n\t" // c * y - s * x - "xvsubsp %x[x3], %x[x3], 47 \n\t" // c * y - s * x - "stxvd2x 40, 0, %[x_ptr] \n\t" // store x - "stxvd2x 41, %[i16], %[x_ptr] \n\t" - "stxvd2x 42, %[i32], %[x_ptr] \n\t" - "stxvd2x 43, %[i48], %[x_ptr] \n\t" - "stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y - "stxvd2x %x[x2], %[i16], %[y_ptr] \n\t" - "stxvd2x %x[x1], %[i32], %[y_ptr] \n\t" - "stxvd2x %x[x3], %[i48], %[y_ptr] \n\t" - "addi %[x_ptr], %[x_ptr], 128 \n\t" - "addi %[y_ptr], %[y_ptr], 128 \n\t" - "addic. 
%[temp_n], %[temp_n], -8 \n\t" - "bgt one%= \n\t" - "two%=: \n\t" - "xvmulsp 40, 32, 36 \n\t" // c * x - "xvmulsp 41, 33, 36 \n\t" - "xvmulsp 42, 34, 36 \n\t" - "xvmulsp 43, 35, 36 \n\t" - "xvmulsp %x[x0], 48, 36 \n\t" // c * y - "xvmulsp %x[x2], 49, 36 \n\t" - "xvmulsp %x[x1], 50, 36 \n\t" - "xvmulsp %x[x3], 51, 36 \n\t" - "xvmulsp 44, 32, 37 \n\t" // s * x - "xvmulsp 45, 33, 37 \n\t" - "xvmulsp 46, 34, 37 \n\t" - "xvmulsp 47, 35, 37 \n\t" - "xvmulsp %x[x4], 48, 37 \n\t" // s * y - "xvmulsp %x[x5], 49, 37 \n\t" - "xvmulsp %x[x6], 50, 37 \n\t" - "xvmulsp %x[x7], 51, 37 \n\t" - "addi %[x_ptr], %[x_ptr], -64 \n\t" - "addi %[y_ptr], %[y_ptr], -64 \n\t" - "xvaddsp 40, 40, %x[x4] \n\t" // c * x + s * y - "xvaddsp 41, 41, %x[x5] \n\t" // c * x + s * y - "xvaddsp 42, 42, %x[x6] \n\t" // c * x + s * y - "xvaddsp 43, 43, %x[x7] \n\t" // c * x + s * y - "xvsubsp %x[x0], %x[x0], 44 \n\t" // c * y - s * x - "xvsubsp %x[x2], %x[x2], 45 \n\t" // c * y - s * x - "xvsubsp %x[x1], %x[x1], 46 \n\t" // c * y - s * x - "xvsubsp %x[x3], %x[x3], 47 \n\t" // c * y - s * x - "stxvd2x 40, 0, %[x_ptr] \n\t" // store x - "stxvd2x 41, %[i16], %[x_ptr] \n\t" - "stxvd2x 42, %[i32], %[x_ptr] \n\t" - "stxvd2x 43, %[i48], %[x_ptr] \n\t" - "stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y - "stxvd2x %x[x2], %[i16], %[y_ptr] \n\t" - "stxvd2x %x[x1], %[i32], %[y_ptr] \n\t" - "stxvd2x %x[x3], %[i48], %[y_ptr] " - : - [mem_x] "+m" (*(float (*)[2*n])x), - [mem_y] "+m" (*(float (*)[2*n])y), - [temp_n] "+r" (n), - [x_ptr] "+&b" (x), - [y_ptr] "+&b" (y), - [x0] "=wa" (t0), - [x1] "=wa" (t2), - [x2] "=wa" (t1), - [x3] "=wa" (t3), - [x4] "=wa" (t4), - [x5] "=wa" (t5), - [x6] "=wa" (t6), - [x7] "=wa" (t7) - : - [cos] "f" (c), - [sin] "f" (s), - [i16] "b" (16), - [i32] "b" (32), - [i48] "b" (48) - : - "cr0", - "vs32","vs33","vs34","vs35","vs36","vs37", - "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", - "vs48","vs49","vs50","vs51" - ); -} - -#endif -#endif - - -int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - FLOAT temp[2]; - BLASLONG inc_x2; - BLASLONG inc_y2; - - if ( n <= 0 ) return(0); - - if ( (inc_x == 1) && (inc_y == 1) ) - { -#if defined(__VEC__) || defined(__ALTIVEC__) - BLASLONG n1 = n & -8; - if ( n1 > 0 ) - { - crot_kernel_8(n1, x, y, c, s); - i=n1; - ix=2*n1; - } -#endif - while(i < n) - { - temp[0] = c*x[ix] + s*y[ix] ; - temp[1] = c*x[ix+1] + s*y[ix+1] ; - y[ix] = c*y[ix] - s*x[ix] ; - y[ix+1] = c*y[ix+1] - s*x[ix+1] ; - x[ix] = temp[0] ; - x[ix+1] = temp[1] ; - - ix += 2 ; - i++ ; - - } - - } - else - { - inc_x2 = 2 * inc_x ; - inc_y2 = 2 * inc_y ; - while(i < n) - { - temp[0] = c*x[ix] + s*y[iy] ; - temp[1] = c*x[ix+1] + s*y[iy+1] ; - y[iy] = c*y[iy] - s*x[ix] ; - y[iy+1] = c*y[iy+1] - s*x[ix+1] ; - x[ix] = temp[0] ; - x[ix+1] = temp[1] ; - - ix += inc_x2 ; - iy += inc_y2 ; - i++ ; - - } - } - return(0); -} - +/*************************************************************************** +Copyright (c) 2013-2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
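
For reference (illustrative addition, not part of the patch): the crot_kernel_8 inline assembly above first broadcasts the scalar c and s into all four lanes of a VSX register (xscvdpspn followed by xxspltw) and then applies the plane rotation to 8 complex elements, i.e. 16 consecutive floats, per iteration, overlapping the loads for the next block with the arithmetic of the current one. Because the data is stored as interleaved (re,im) pairs and the same c and s are applied to every lane, one unrolled block is equivalent to the following plain-C sketch (crot_block8_ref is a made-up name):

/* Illustrative only: what one 8-complex block of crot_kernel_8 computes,
 * ignoring the software pipelining of the loads.                        */
static void crot_block8_ref(float *x, float *y, float c, float s)
{
    for (int k = 0; k < 16; k++) {   /* 8 complex elements = 16 floats   */
        float tx = c * x[k] + s * y[k];
        y[k]     = c * y[k] - s * x[k];
        x[k]     = tx;
    }
}

The surrounding C driver hands the kernel only the multiple-of-8 prefix (n1 = n & -8) and finishes the remainder with its scalar loop.
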
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) + +static void crot_kernel_8 (long n, float *x, float *y, float c, float s) +{ + __vector float t0; + __vector float t1; + __vector float t2; + __vector float t3; + __vector float t4; + __vector float t5; + __vector float t6; + __vector float t7; + __asm__ + ( + "xscvdpspn 36, %x[cos] \n\t" // load c to all words + "xxspltw 36, 36, 0 \n\t" + "xscvdpspn 37, %x[sin] \n\t" // load s to all words + "xxspltw 37, 37, 0 \n\t" + "lxvd2x 32, 0, %[x_ptr] \n\t" // load x + "lxvd2x 33, %[i16], %[x_ptr] \n\t" + "lxvd2x 34, %[i32], %[x_ptr] \n\t" + "lxvd2x 35, %[i48], %[x_ptr] \n\t" + "lxvd2x 48, 0, %[y_ptr] \n\t" // load y + "lxvd2x 49, %[i16], %[y_ptr] \n\t" + "lxvd2x 50, %[i32], %[y_ptr] \n\t" + "lxvd2x 51, %[i48], %[y_ptr] \n\t" + "addi %[x_ptr], %[x_ptr], 64 \n\t" + "addi %[y_ptr], %[y_ptr], 64 \n\t" + "addic. 
%[temp_n], %[temp_n], -8 \n\t" + "ble two%= \n\t" + ".align 5 \n\t" + "one%=: \n\t" + "xvmulsp 40, 32, 36 \n\t" // c * x + "xvmulsp 41, 33, 36 \n\t" + "xvmulsp 42, 34, 36 \n\t" + "xvmulsp 43, 35, 36 \n\t" + "xvmulsp %x[x0], 48, 36 \n\t" // c * y + "xvmulsp %x[x2], 49, 36 \n\t" + "xvmulsp %x[x1], 50, 36 \n\t" + "xvmulsp %x[x3], 51, 36 \n\t" + "xvmulsp 44, 32, 37 \n\t" // s * x + "xvmulsp 45, 33, 37 \n\t" + "lxvd2x 32, 0, %[x_ptr] \n\t" // load x + "lxvd2x 33, %[i16], %[x_ptr] \n\t" + "xvmulsp 46, 34, 37 \n\t" + "xvmulsp 47, 35, 37 \n\t" + "lxvd2x 34, %[i32], %[x_ptr] \n\t" + "lxvd2x 35, %[i48], %[x_ptr] \n\t" + "xvmulsp %x[x4], 48, 37 \n\t" // s * y + "xvmulsp %x[x5], 49, 37 \n\t" + "lxvd2x 48, 0, %[y_ptr] \n\t" // load y + "lxvd2x 49, %[i16], %[y_ptr] \n\t" + "xvmulsp %x[x6], 50, 37 \n\t" + "xvmulsp %x[x7], 51, 37 \n\t" + "lxvd2x 50, %[i32], %[y_ptr] \n\t" + "lxvd2x 51, %[i48], %[y_ptr] \n\t" + "xvaddsp 40, 40, %x[x4] \n\t" // c * x + s * y + "xvaddsp 41, 41, %x[x5] \n\t" // c * x + s * y + "addi %[x_ptr], %[x_ptr], -64 \n\t" + "addi %[y_ptr], %[y_ptr], -64 \n\t" + "xvaddsp 42, 42, %x[x6] \n\t" // c * x + s * y + "xvaddsp 43, 43, %x[x7] \n\t" // c * x + s * y + "xvsubsp %x[x0], %x[x0], 44 \n\t" // c * y - s * x + "xvsubsp %x[x2], %x[x2], 45 \n\t" // c * y - s * x + "xvsubsp %x[x1], %x[x1], 46 \n\t" // c * y - s * x + "xvsubsp %x[x3], %x[x3], 47 \n\t" // c * y - s * x + "stxvd2x 40, 0, %[x_ptr] \n\t" // store x + "stxvd2x 41, %[i16], %[x_ptr] \n\t" + "stxvd2x 42, %[i32], %[x_ptr] \n\t" + "stxvd2x 43, %[i48], %[x_ptr] \n\t" + "stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y + "stxvd2x %x[x2], %[i16], %[y_ptr] \n\t" + "stxvd2x %x[x1], %[i32], %[y_ptr] \n\t" + "stxvd2x %x[x3], %[i48], %[y_ptr] \n\t" + "addi %[x_ptr], %[x_ptr], 128 \n\t" + "addi %[y_ptr], %[y_ptr], 128 \n\t" + "addic. 
%[temp_n], %[temp_n], -8 \n\t" + "bgt one%= \n\t" + "two%=: \n\t" + "xvmulsp 40, 32, 36 \n\t" // c * x + "xvmulsp 41, 33, 36 \n\t" + "xvmulsp 42, 34, 36 \n\t" + "xvmulsp 43, 35, 36 \n\t" + "xvmulsp %x[x0], 48, 36 \n\t" // c * y + "xvmulsp %x[x2], 49, 36 \n\t" + "xvmulsp %x[x1], 50, 36 \n\t" + "xvmulsp %x[x3], 51, 36 \n\t" + "xvmulsp 44, 32, 37 \n\t" // s * x + "xvmulsp 45, 33, 37 \n\t" + "xvmulsp 46, 34, 37 \n\t" + "xvmulsp 47, 35, 37 \n\t" + "xvmulsp %x[x4], 48, 37 \n\t" // s * y + "xvmulsp %x[x5], 49, 37 \n\t" + "xvmulsp %x[x6], 50, 37 \n\t" + "xvmulsp %x[x7], 51, 37 \n\t" + "addi %[x_ptr], %[x_ptr], -64 \n\t" + "addi %[y_ptr], %[y_ptr], -64 \n\t" + "xvaddsp 40, 40, %x[x4] \n\t" // c * x + s * y + "xvaddsp 41, 41, %x[x5] \n\t" // c * x + s * y + "xvaddsp 42, 42, %x[x6] \n\t" // c * x + s * y + "xvaddsp 43, 43, %x[x7] \n\t" // c * x + s * y + "xvsubsp %x[x0], %x[x0], 44 \n\t" // c * y - s * x + "xvsubsp %x[x2], %x[x2], 45 \n\t" // c * y - s * x + "xvsubsp %x[x1], %x[x1], 46 \n\t" // c * y - s * x + "xvsubsp %x[x3], %x[x3], 47 \n\t" // c * y - s * x + "stxvd2x 40, 0, %[x_ptr] \n\t" // store x + "stxvd2x 41, %[i16], %[x_ptr] \n\t" + "stxvd2x 42, %[i32], %[x_ptr] \n\t" + "stxvd2x 43, %[i48], %[x_ptr] \n\t" + "stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y + "stxvd2x %x[x2], %[i16], %[y_ptr] \n\t" + "stxvd2x %x[x1], %[i32], %[y_ptr] \n\t" + "stxvd2x %x[x3], %[i48], %[y_ptr] " + : + [mem_x] "+m" (*(float (*)[2*n])x), + [mem_y] "+m" (*(float (*)[2*n])y), + [temp_n] "+r" (n), + [x_ptr] "+&b" (x), + [y_ptr] "+&b" (y), + [x0] "=wa" (t0), + [x1] "=wa" (t2), + [x2] "=wa" (t1), + [x3] "=wa" (t3), + [x4] "=wa" (t4), + [x5] "=wa" (t5), + [x6] "=wa" (t6), + [x7] "=wa" (t7) + : + [cos] "f" (c), + [sin] "f" (s), + [i16] "b" (16), + [i32] "b" (32), + [i48] "b" (48) + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51" + ); +} + +#endif +#endif + + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp[2]; + BLASLONG inc_x2; + BLASLONG inc_y2; + + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1) ) + { +#if defined(__VEC__) || defined(__ALTIVEC__) + BLASLONG n1 = n & -8; + if ( n1 > 0 ) + { + crot_kernel_8(n1, x, y, c, s); + i=n1; + ix=2*n1; + } +#endif + while(i < n) + { + temp[0] = c*x[ix] + s*y[ix] ; + temp[1] = c*x[ix+1] + s*y[ix+1] ; + y[ix] = c*y[ix] - s*x[ix] ; + y[ix+1] = c*y[ix+1] - s*x[ix+1] ; + x[ix] = temp[0] ; + x[ix+1] = temp[1] ; + + ix += 2 ; + i++ ; + + } + + } + else + { + inc_x2 = 2 * inc_x ; + inc_y2 = 2 * inc_y ; + while(i < n) + { + temp[0] = c*x[ix] + s*y[iy] ; + temp[1] = c*x[ix+1] + s*y[iy+1] ; + y[iy] = c*y[iy] - s*x[ix] ; + y[iy+1] = c*y[iy+1] - s*x[ix+1] ; + x[ix] = temp[0] ; + x[ix+1] = temp[1] ; + + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + } + return(0); +} + diff --git a/kernel/power/dgemm_kernel_power9.S b/kernel/power/dgemm_kernel_power9.S index 2fb1b27ef..86108f20c 100644 --- a/kernel/power/dgemm_kernel_power9.S +++ b/kernel/power/dgemm_kernel_power9.S @@ -1,249 +1,249 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. 
Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#define ASSEMBLER -#include "common.h" -#include "def_vsx.h" - - -#define LOAD ld - - - - -#define STACKSIZE (512 ) -#define ALPHA_SP (296+192)(SP) -#define FZERO (304+192)(SP) - - - -#define M r3 -#define N r4 -#define K r5 - -#define A r7 -#define B r8 -#define C r9 -#define LDC r10 -#define OFFSET r6 - - - -#define alpha_r vs18 - -#define o0 0 - - -#define T4 r12 -#define T3 r11 -#define C4 r14 -#define o8 r15 -#define o24 r16 -#define C2 r17 -#define L r18 -#define T1 r19 -#define C3 r20 -#define TEMP_REG r21 -#define I r22 -#define J r23 -#define AO r24 -#define BO r25 -#define CO r26 -#define o16 r27 -#define o32 r28 -#define o48 r29 - -#define PRE r30 -#define T2 r31 - -#include "dgemm_macros_power9.S" - - -#ifndef NEEDPARAM - - PROLOGUE - PROFCODE - - addi SP, SP, -STACKSIZE - li r0, 0 - - stfd f14, 0(SP) - stfd f15, 8(SP) - stfd f16, 16(SP) - stfd f17, 24(SP) - - stfd f18, 32(SP) - stfd f19, 40(SP) - stfd f20, 48(SP) - stfd f21, 56(SP) - - stfd f22, 64(SP) - stfd f23, 72(SP) - stfd f24, 80(SP) - stfd f25, 88(SP) - - stfd f26, 96(SP) - stfd f27, 104(SP) - stfd f28, 112(SP) - stfd f29, 120(SP) - - stfd f30, 128(SP) - stfd f31, 136(SP) - - - std r31, 144(SP) - std r30, 152(SP) - std r29, 160(SP) - std r28, 168(SP) - std r27, 176(SP) - std r26, 184(SP) - std r25, 192(SP) - std r24, 200(SP) - std r23, 208(SP) - std r22, 216(SP) - std r21, 224(SP) - std r20, 232(SP) - std r19, 240(SP) - std r18, 248(SP) - std r17, 256(SP) - std r16, 264(SP) - std r15, 272(SP) - std r14, 280(SP) - - - stxv vs52, 288(SP) - stxv vs53, 304(SP) - stxv vs54, 320(SP) - stxv vs55, 336(SP) - stxv vs56, 352(SP) - stxv vs57, 368(SP) - stxv vs58, 384(SP) - stxv vs59, 400(SP) - stxv vs60, 416(SP) - stxv vs61, 432(SP) - stxv vs62, 448(SP) - stxv vs63, 464(SP) - - - stfd f1, ALPHA_SP - stw r0, FZERO - - slwi LDC, LDC, BASE_SHIFT - -#if defined(TRMMKERNEL) - ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) -#endif - - - cmpwi cr0, M, 0 - ble .L999_H1 - cmpwi cr0, N, 0 - ble .L999_H1 - cmpwi cr0, K, 0 - ble .L999_H1 - - - - addi T1, SP, 296+192 - - - li PRE, 384 - li o8 , 8 - li o16, 16 - li o24, 24 - li o32, 32 - li o48, 48 - - - lxvdsx alpha_r, 0, T1 - -#include 
"dgemm_logic_power9.S" - -.L999: - addi r3, 0, 0 - - lfd f14, 0(SP) - lfd f15, 8(SP) - lfd f16, 16(SP) - lfd f17, 24(SP) - - lfd f18, 32(SP) - lfd f19, 40(SP) - lfd f20, 48(SP) - lfd f21, 56(SP) - - lfd f22, 64(SP) - lfd f23, 72(SP) - lfd f24, 80(SP) - lfd f25, 88(SP) - - lfd f26, 96(SP) - lfd f27, 104(SP) - lfd f28, 112(SP) - lfd f29, 120(SP) - - lfd f30, 128(SP) - lfd f31, 136(SP) - - - ld r31, 144(SP) - ld r30, 152(SP) - ld r29, 160(SP) - ld r28, 168(SP) - ld r27, 176(SP) - ld r26, 184(SP) - ld r25, 192(SP) - ld r24, 200(SP) - ld r23, 208(SP) - ld r22, 216(SP) - ld r21, 224(SP) - ld r20, 232(SP) - ld r19, 240(SP) - ld r18, 248(SP) - ld r17, 256(SP) - ld r16, 264(SP) - ld r15, 272(SP) - ld r14, 280(SP) - - lxv vs52, 288(SP) - lxv vs53, 304(SP) - lxv vs54, 320(SP) - lxv vs55, 336(SP) - lxv vs56, 352(SP) - lxv vs57, 368(SP) - lxv vs58, 384(SP) - lxv vs59, 400(SP) - lxv vs60, 416(SP) - lxv vs61, 432(SP) - lxv vs62, 448(SP) - lxv vs63, 464(SP) - - addi SP, SP, STACKSIZE - blr - - EPILOGUE -#endif +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + + +#define LOAD ld + + + + +#define STACKSIZE (512 ) +#define ALPHA_SP (296+192)(SP) +#define FZERO (304+192)(SP) + + + +#define M r3 +#define N r4 +#define K r5 + +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 + + + +#define alpha_r vs18 + +#define o0 0 + + +#define T4 r12 +#define T3 r11 +#define C4 r14 +#define o8 r15 +#define o24 r16 +#define C2 r17 +#define L r18 +#define T1 r19 +#define C3 r20 +#define TEMP_REG r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define o16 r27 +#define o32 r28 +#define o48 r29 + +#define PRE r30 +#define T2 r31 + +#include "dgemm_macros_power9.S" + + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + + stxv vs52, 288(SP) + stxv vs53, 304(SP) + stxv vs54, 320(SP) + stxv vs55, 336(SP) + stxv vs56, 352(SP) + stxv vs57, 368(SP) + stxv vs58, 384(SP) + stxv vs59, 400(SP) + stxv vs60, 416(SP) + stxv vs61, 432(SP) + stxv vs62, 448(SP) + stxv vs63, 464(SP) + + + stfd f1, ALPHA_SP + stw r0, FZERO + + slwi LDC, LDC, BASE_SHIFT + +#if defined(TRMMKERNEL) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) +#endif + + + cmpwi cr0, M, 0 + ble .L999_H1 + cmpwi cr0, N, 0 + ble .L999_H1 + cmpwi cr0, K, 0 + ble .L999_H1 + + + + addi T1, SP, 296+192 + + + li PRE, 384 + li o8 , 8 + li o16, 16 + li o24, 24 + li o32, 32 + li o48, 48 + + + lxvdsx alpha_r, 0, T1 + +#include "dgemm_logic_power9.S" + +.L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + lxv vs52, 288(SP) + lxv vs53, 304(SP) + lxv vs54, 320(SP) + lxv vs55, 336(SP) + lxv vs56, 352(SP) + lxv vs57, 368(SP) + lxv vs58, 384(SP) + lxv vs59, 400(SP) + lxv vs60, 416(SP) + lxv vs61, 432(SP) + lxv vs62, 448(SP) + lxv vs63, 464(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE +#endif diff --git a/kernel/power/dgemm_logic_power9.S b/kernel/power/dgemm_logic_power9.S index 251839d19..a48bc685a 100644 --- a/kernel/power/dgemm_logic_power9.S +++ b/kernel/power/dgemm_logic_power9.S @@ -1,1981 +1,1981 @@ -/*************************************************************************** 
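
For reference (illustrative addition, not part of the patch): the dgemm_kernel_power9.S prologue above saves the non-volatile FPRs f14-f31, GPRs r14-r31 and VSRs vs52-vs63 to the stack frame, spills alpha to ALPHA_SP and reloads it with lxvdsx so the same double sits in both lanes of vs18, and scales LDC from elements to bytes with slwi LDC, LDC, BASE_SHIFT. In C terms, and assuming BASE_SHIFT is 3 here (double precision, 8-byte elements), the two scalar set-up steps amount to:

/* Illustrative only: the prologue's alpha broadcast and LDC scaling,
 * assuming BASE_SHIFT == 3 for double precision.                        */
#include <altivec.h>

typedef struct { __vector double valpha; long ldc_bytes; } dgemm_setup_ref;

static dgemm_setup_ref dgemm_prologue_ref(double alpha, long ldc_elems)
{
    dgemm_setup_ref s;
    s.valpha    = vec_splats(alpha);   /* lxvdsx: alpha in both doubleword lanes */
    s.ldc_bytes = ldc_elems << 3;      /* slwi LDC, LDC, BASE_SHIFT              */
    return s;
}
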
-Copyright (c) 2013-2019 The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - - -#define MY_ALIGN .align 3 - -#if defined(TRMMKERNEL) && !defined(LEFT) - neg TEMP_REG, OFFSET -#endif - - srawi. J, N, 2 - ble LDGEMM_L4_END - -LDGEMM_L4_BEGIN: - - - li T1, 128 - li T2, 256 - - mr AO, A - mr CO, C - slwi T3, LDC , 2 - add C, C, T3 - - - dcbt A, T1 - dcbt A, T2 -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. I, M, 4 - ble LDGEMM_L4x16_END - - MY_ALIGN -LDGEMM_L4x16_BEGIN: - - li L, -128 - - - SAVE4x16_REGS - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,16,4 -#else - mr BO, B -#endif - - and T1, CO, L - and T2, C2, L - and T3, C3, L - and T4, C4, L - - dcbt T1, r0 - dcbt T2, r0 - dcbt T3, r0 - dcbt T4, r0 - - - addi T1, T1, 128 - addi T2, T2, 128 - addi T3, T3, 128 - addi T4, T4, 128 - - dcbt T1, r0 - dcbt T2, r0 - dcbt T3, r0 - dcbt T4, r0 - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T3,K,TEMP_REG,16,4 - srawi. L, T3, 5 -#else - srawi. L, K, 5 -#endif - - ble LDGEMM_L4x16_SUB0 - - - MY_ALIGN -LDGEMM_L4x16_LOOP_START: - - li T2, 512 - - - LOAD4x16_1 - ##OffsetA=128 OffsetB=32 - addi AO,AO,2176 - # addi BO,BO,32 - addic. 
L, L, -1 - - ble LDGEMM_L4x16_LOOP_END - - - mtctr L - - MY_ALIGN - -LDGEMM_L4x16_LOOP: - - #dcbt AO, PRE - KERNEL4x16_I1_L2_2 -2048,32, 0,0 - KERNEL4x16_I1_L2_2 -2048,32, 1,0 - KERNEL4x16_I1_L2_2 -2048,32, 2,0 - KERNEL4x16_I1_L2_2 -2048,32, 3,0 - KERNEL4x16_I1_L2_2 -2048,32, 4,0 - KERNEL4x16_I1_L2_2 -2048,32, 5,0 - KERNEL4x16_I1_L2_2 -2048,32, 6,0 - KERNEL4x16_I1_L2_2 -2048,32, 7,0 - KERNEL4x16_I1_L2_2 -2048,32, 8,0 - KERNEL4x16_I1_L2_2 -2048,32, 9,0 - KERNEL4x16_I1_L2_2 -2048,32, 10,0 - KERNEL4x16_I1_L2_2 -2048,32, 11,0 - KERNEL4x16_I1_L2_2 -2048,32, 12,0 - KERNEL4x16_I1_L2_2 -2048,32, 13,0 - KERNEL4x16_I1_L2_2 -2048,32, 14,0 - KERNEL4x16_I1_L2_2 -2048,32, 15,1 - - - bdnz LDGEMM_L4x16_LOOP - - MY_ALIGN - MY_ALIGN -LDGEMM_L4x16_LOOP_END: - - KERNEL4x16_I1_L2_2 -2048,32, 0,0 - KERNEL4x16_I1_L2_2 -2048,32, 1,0 - KERNEL4x16_I1_L2_2 -2048,32, 2,0 - KERNEL4x16_I1_L2_2 -2048,32, 3,0 - KERNEL4x16_I1_L2_2 -2048,32, 4,0 - KERNEL4x16_I1_L2_2 -2048,32, 5,0 - KERNEL4x16_I1_L2_2 -2048,32, 6,0 - KERNEL4x16_I1_L2_2 -2048,32, 7,0 - KERNEL4x16_I1_L2_2 -2048,32, 8,0 - KERNEL4x16_I1_L2_2 -2048,32, 9,0 - KERNEL4x16_I1_L2_2 -2048,32, 10,0 - KERNEL4x16_I1_L2_2 -2048,32, 11,0 - KERNEL4x16_I1_L2_2 -2048,32, 12,0 - KERNEL4x16_I1_L2_2 -2048,32, 13,0 - KERNEL4x16_I1_L2_2 -2048,32, 14,0 - KERNEL4x16_I1_L2_3 -2048,32, 15,1 - b LDGEMM_L4x16_SUB1 - - - MY_ALIGN -LDGEMM_L4x16_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 31 -#else - andi. L, K, 31 -#endif - KERNEL4x16 1 - - addic. L, L, -1 - ble LDGEMM_L4x16_SAVE - b LDGEMM_L4x16_SUB2 - MY_ALIGN -LDGEMM_L4x16_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 31 -#else - andi. L, K, 31 -#endif - ble LDGEMM_L4x16_SAVE - MY_ALIGN -LDGEMM_L4x16_SUB2: - - andi. T1,L, 16 - ble LDGEMM_L4x16_SUB2_8 - LOAD4x16_0 - KERNEL4x16_I1_L2_2 128,32, 0,0 - KERNEL4x16_I1_L2_2 128,32, 1,0 - KERNEL4x16_I1_L2_2 128,32, 2,0 - KERNEL4x16_I1_L2_2 128,32, 3,0 - KERNEL4x16_I1_L2_2 128,32, 4,0 - KERNEL4x16_I1_L2_2 128,32, 5,0 - KERNEL4x16_I1_L2_2 128,32, 6,0 - KERNEL4x16_I1_L2_3 128,32, 7,1 - MY_ALIGN -LDGEMM_L4x16_SUB2_8: - andi. T1,L, 8 - ble LDGEMM_L4x16_SUB2_4 - LOAD4x16_0 - KERNEL4x16_I1_L2_2 128,32, 0,0 - KERNEL4x16_I1_L2_2 128,32, 1,0 - KERNEL4x16_I1_L2_2 128,32, 2,0 - KERNEL4x16_I1_L2_3 128,32, 3,1 - MY_ALIGN -LDGEMM_L4x16_SUB2_4: - andi. T1,L, 4 - ble LDGEMM_L4x16_SUB2_2 - LOAD4x16_0 - KERNEL4x16_I1_L2_2 128,32, 0,0 - KERNEL4x16_I1_L2_3 128,32, 1,1 - MY_ALIGN -LDGEMM_L4x16_SUB2_2: - andi. T1,L, 2 - ble LDGEMM_L4x16_SUB2_1 - LOAD4x16_0 - KERNEL4x16_I1_L2_3 128,32, 0,1 - MY_ALIGN -LDGEMM_L4x16_SUB2_1: - andi. T1,L, 1 - ble LDGEMM_L4x16_SAVE - KERNEL4x16 0 -# addic. L, L, -1 -# bgt LDGEMM_L4x16_SUB2 - - MY_ALIGN -LDGEMM_L4x16_SAVE: - SAVE4x16 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,16,4 -#endif - addic. I, I, -1 - bgt+ LDGEMM_L4x16_BEGIN - -LDGEMM_L4x16_END: - -LDGEMM_L4x8_BEGIN: - - andi. T2, M, 15 - ble LDGEMM_L4x1_END - - andi. T1, M, 8 - ble LDGEMM_L4x8_END - - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4 - REFRESH_TEMP_BK T3,K,TEMP_REG,8,4 - srawi. L, T3, 4 -#else - mr BO, B - srawi. L, K, 4 -#endif - - - ble LDGEMM_L4x8_SUB0 - -LDGEMM_L4x8_LOOP_START: - - - LOAD4x8_1 - ##OffsetA=64 OffsetB=32 - - - addic. 
L, L, -1 - - ble LDGEMM_L4x8_LOOP_END - - mtctr L - MY_ALIGN - -LDGEMM_L4x8_LOOP: - - KERNEL4x8_I1_L2_2 64,32, 0,0 - KERNEL4x8_I1_L2_2 64,32, 1,0 - KERNEL4x8_I1_L2_2 64,32, 2,0 - KERNEL4x8_I1_L2_2 64,32, 3,0 - KERNEL4x8_I1_L2_2 64,32, 4,0 - KERNEL4x8_I1_L2_2 64,32, 5,0 - KERNEL4x8_I1_L2_2 64,32, 6,0 - KERNEL4x8_I1_L2_2 64,32, 7,1 - - bdnz LDGEMM_L4x8_LOOP - MY_ALIGN -LDGEMM_L4x8_LOOP_END: - - KERNEL4x8_I1_L2_2 64,32, 0,0 - KERNEL4x8_I1_L2_2 64,32, 1,0 - KERNEL4x8_I1_L2_2 64,32, 2,0 - KERNEL4x8_I1_L2_2 64,32, 3,0 - KERNEL4x8_I1_L2_2 64,32, 4,0 - KERNEL4x8_I1_L2_2 64,32, 5,0 - KERNEL4x8_I1_L2_2 64,32, 6,0 - KERNEL4x8_I1_L2_3 64,32, 7,1 - - b LDGEMM_L4x8_SUB1 - MY_ALIGN -LDGEMM_L4x8_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 15 -#else - andi. L, K, 15 -#endif - KERNEL4x8 1 - - addic. L, L, -1 - ble LDGEMM_L4x8_SAVE - b LDGEMM_L4x8_SUB2 - MY_ALIGN -LDGEMM_L4x8_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 15 -#else - andi. L, K, 15 -#endif - ble LDGEMM_L4x8_SAVE - MY_ALIGN -LDGEMM_L4x8_SUB2: - - andi. T1,L, 8 - ble LDGEMM_L4x8_SUB2_4 - LOAD4x8_0 - KERNEL4x8_I1_L2_2 64,32, 0,0 - KERNEL4x8_I1_L2_2 64,32, 1,0 - KERNEL4x8_I1_L2_2 64,32, 2,0 - KERNEL4x8_I1_L2_3 64,32, 3,1 - MY_ALIGN -LDGEMM_L4x8_SUB2_4: - andi. T1,L, 4 - ble LDGEMM_L4x8_SUB2_2 - LOAD4x8_0 - KERNEL4x8_I1_L2_2 64,32, 0,0 - KERNEL4x8_I1_L2_3 64,32, 1,1 - MY_ALIGN -LDGEMM_L4x8_SUB2_2: - andi. T1,L, 2 - ble LDGEMM_L4x8_SUB2_1 - LOAD4x8_0 - KERNEL4x8_I1_L2_3 64,32, 0,1 - MY_ALIGN -LDGEMM_L4x8_SUB2_1: - andi. T1,L, 1 - ble LDGEMM_L4x8_SAVE - KERNEL4x8 0 - - MY_ALIGN -LDGEMM_L4x8_SAVE: - SAVE4x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,8,4 -#endif -LDGEMM_L4x8_END: - -LDGEMM_L4x4_BEGIN: - - - andi. T1, M, 4 - ble LDGEMM_L4x4_END - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4 - REFRESH_TEMP_BK T3,K,TEMP_REG,4,4 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L4x4_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L4x4_SUB4 - -LDGEMM_L4x4_LOOP_START: - - #dcbt AO, PRE - LOAD4x4_1 - KERNEL4x4_I1 - KERNEL4x4_2 - KERNEL4x4_1 - #dcbt AO, PRE - KERNEL4x4_2 - - KERNEL4x4_1 - KERNEL4x4_2 - KERNEL4x4_1 - #dcbt AO, PRE - KERNEL4x4_2 - - addic. L, L, -2 - ble LDGEMM_L4x4_LOOP_END - - MY_ALIGN - -LDGEMM_L4x4_LOOP: - - KERNEL4x4_1 - KERNEL4x4_2 - KERNEL4x4_1 - #dcbt AO, PRE - KERNEL4x4_2 - - KERNEL4x4_1 - KERNEL4x4_2 - KERNEL4x4_1 - #dcbt AO, PRE - KERNEL4x4_2 - - addic. L, L, -1 - bgt LDGEMM_L4x4_LOOP - -LDGEMM_L4x4_LOOP_END: - - KERNEL4x4_1 - KERNEL4x4_2 - KERNEL4x4_1 - KERNEL4x4_2 - - KERNEL4x4_1 - KERNEL4x4_2 - KERNEL4x4_1 - KERNEL4x4_E2 - - b LDGEMM_L4x4_SUB1 - -LDGEMM_L4x4_SUB4: - - KERNEL4x4_SUBI1 - KERNEL4x4_SUB1 - KERNEL4x4_SUB1 - KERNEL4x4_SUB1 - - KERNEL4x4_SUB1 - KERNEL4x4_SUB1 - KERNEL4x4_SUB1 - KERNEL4x4_SUB1 - - b LDGEMM_L4x4_SUB1 - -LDGEMM_L4x4_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL4x4_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L4x4_SAVE - b LDGEMM_L4x4_SUB2 - -LDGEMM_L4x4_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L4x4_SAVE - -LDGEMM_L4x4_SUB2: - - KERNEL4x4_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L4x4_SUB2 - -LDGEMM_L4x4_SAVE: - - SAVE4x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,4,4 -#endif -LDGEMM_L4x4_END: - -LDGEMM_L4x2_BEGIN: - - - andi. T1, M, 2 - ble LDGEMM_L4x2_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4 - REFRESH_TEMP_BK T3,K,TEMP_REG,2,4 - srawi. L, T3, 3 -#else - mr BO, B - srawi. 
L, K, 3 -#endif - ble LDGEMM_L4x2_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L4x2_SUB4 - -LDGEMM_L4x2_LOOP_START: - - LOAD4x2_1 - KERNEL4x2_I1 - KERNEL4x2_2 - KERNEL4x2_1 - KERNEL4x2_2 - - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_1 - KERNEL4x2_2 - - addic. L, L, -2 - ble LDGEMM_L4x2_LOOP_END - - MY_ALIGN - -LDGEMM_L4x2_LOOP: - - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_1 - KERNEL4x2_2 - - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_1 - KERNEL4x2_2 - - addic. L, L, -1 - bgt LDGEMM_L4x2_LOOP - -LDGEMM_L4x2_LOOP_END: - - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_1 - KERNEL4x2_2 - - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_1 - KERNEL4x2_E2 - - b LDGEMM_L4x2_SUB1 - -LDGEMM_L4x2_SUB4: - - KERNEL4x2_SUBI1 - KERNEL4x2_SUB1 - KERNEL4x2_SUB1 - KERNEL4x2_SUB1 - - KERNEL4x2_SUB1 - KERNEL4x2_SUB1 - KERNEL4x2_SUB1 - KERNEL4x2_SUB1 - - b LDGEMM_L4x2_SUB1 - -LDGEMM_L4x2_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL4x2_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L4x2_SAVE - b LDGEMM_L4x2_SUB2 - -LDGEMM_L4x2_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L4x2_SAVE - -LDGEMM_L4x2_SUB2: - - KERNEL4x2_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L4x2_SUB2 - -LDGEMM_L4x2_SAVE: - - SAVE4x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,2,4 -#endif -LDGEMM_L4x2_END: - -LDGEMM_L4x1_BEGIN: - - - andi. T1, M, 1 - ble LDGEMM_L4x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4 - REFRESH_TEMP_BK T3,K,TEMP_REG,1,4 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L4x1_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L4x1_SUB4 - -LDGEMM_L4x1_LOOP_START: - - LOAD4x1_1 - KERNEL4x1_I1 - KERNEL4x1_2 - KERNEL4x1_1 - KERNEL4x1_2 - - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_1 - KERNEL4x1_2 - - addic. L, L, -2 - ble LDGEMM_L4x1_LOOP_END - - MY_ALIGN - -LDGEMM_L4x1_LOOP: - - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_1 - KERNEL4x1_2 - - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_1 - KERNEL4x1_2 - - addic. L, L, -1 - bgt LDGEMM_L4x1_LOOP - -LDGEMM_L4x1_LOOP_END: - - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_1 - KERNEL4x1_2 - - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_1 - KERNEL4x1_E2 - - b LDGEMM_L4x1_SUB1 - -LDGEMM_L4x1_SUB4: - - KERNEL4x1_SUBI1 - KERNEL4x1_SUB1 - KERNEL4x1_SUB1 - KERNEL4x1_SUB1 - - KERNEL4x1_SUB1 - KERNEL4x1_SUB1 - KERNEL4x1_SUB1 - KERNEL4x1_SUB1 - - b LDGEMM_L4x1_SUB1 - -LDGEMM_L4x1_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL4x1_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L4x1_SAVE - b LDGEMM_L4x1_SUB2 - -LDGEMM_L4x1_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L4x1_SAVE - -LDGEMM_L4x1_SUB2: - - KERNEL4x1_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L4x1_SUB2 - -LDGEMM_L4x1_SAVE: - - SAVE4x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,1,4 -#endif -LDGEMM_L4x1_END: - - slwi T1, K, 5 - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 4 -#endif - addic. J, J, -1 - bgt LDGEMM_L4_BEGIN - - andi. T2, N, 3 - ble .L999 - -LDGEMM_L4_END: - - b LDGEMM_L2_BEGIN - -.L999_H1: - - b .L999 - -LDGEMM_L2_BEGIN: - -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - andi. T1, N, 2 - ble LDGEMM_L2_END - mr CO, C - mr AO, A - slwi T1, LDC , 1 - add C, C, T1 - srawi. I, M, 4 - ble LDGEMM_L2x16_END - -LDGEMM_L2x16_BEGIN: - - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,16,2 - REFRESH_TEMP_BK T3,K,TEMP_REG,16,2 - srawi. L, T3, 3 -#else - mr BO, B - srawi. 
L, K, 3 -#endif - ble LDGEMM_L2x16_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L2x16_SUB4 - -LDGEMM_L2x16_LOOP_START: - - #dcbt AO, PRE - LOAD2x16_1 - #dcbt AO, PRE - KERNEL2x16_I1 - #dcbt AO, PRE - KERNEL2x16_2 - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - - addic. L, L, -2 - ble LDGEMM_L2x16_LOOP_END - - MY_ALIGN - -LDGEMM_L2x16_LOOP: - - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - - addic. L, L, -1 - bgt LDGEMM_L2x16_LOOP - -LDGEMM_L2x16_LOOP_END: - - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - #dcbt AO, PRE - KERNEL2x16_1 - KERNEL2x16_E2 - - b LDGEMM_L2x16_SUB1 - -LDGEMM_L2x16_SUB4: - - #dcbt AO, PRE - KERNEL2x16_SUBI1 - #dcbt AO, PRE - KERNEL2x16_SUB1 - #dcbt AO, PRE - KERNEL2x16_SUB1 - #dcbt AO, PRE - KERNEL2x16_SUB1 - - KERNEL2x16_SUB1 - KERNEL2x16_SUB1 - KERNEL2x16_SUB1 - KERNEL2x16_SUB1 - - b LDGEMM_L2x16_SUB1 - -LDGEMM_L2x16_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL2x16_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L2x16_SAVE - b LDGEMM_L2x16_SUB2 - -LDGEMM_L2x16_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L2x16_SAVE - -LDGEMM_L2x16_SUB2: - - KERNEL2x16_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L2x16_SUB2 - -LDGEMM_L2x16_SAVE: - - SAVE2x16 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,16,2 -#endif - addic. I, I, -1 - bgt LDGEMM_L2x16_BEGIN - -LDGEMM_L2x16_END: - -LDGEMM_L2x8_BEGIN: - - andi. T2, M, 15 - ble LDGEMM_L2x1_END - - andi. T1, M, 8 - ble LDGEMM_L2x8_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 - REFRESH_TEMP_BK T3,K,TEMP_REG,8,2 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L2x8_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L2x8_SUB4 - -LDGEMM_L2x8_LOOP_START: - - #dcbt AO, PRE - LOAD2x8_1 - KERNEL2x8_I1 - #dcbt AO, PRE - KERNEL2x8_2 - KERNEL2x8_1 - #dcbt AO, PRE - KERNEL2x8_2 - - KERNEL2x8_1 - #dcbt AO, PRE - KERNEL2x8_2 - KERNEL2x8_1 - #dcbt AO, PRE - KERNEL2x8_2 - - addic. L, L, -2 - ble LDGEMM_L2x8_LOOP_END - - MY_ALIGN - -LDGEMM_L2x8_LOOP: - - KERNEL2x8_1 - #dcbt AO, PRE - KERNEL2x8_2 - KERNEL2x8_1 - #dcbt AO, PRE - KERNEL2x8_2 - - KERNEL2x8_1 - #dcbt AO, PRE - KERNEL2x8_2 - KERNEL2x8_1 - #dcbt AO, PRE - KERNEL2x8_2 - - addic. L, L, -1 - bgt LDGEMM_L2x8_LOOP - -LDGEMM_L2x8_LOOP_END: - - KERNEL2x8_1 - KERNEL2x8_2 - KERNEL2x8_1 - KERNEL2x8_2 - - KERNEL2x8_1 - KERNEL2x8_2 - KERNEL2x8_1 - KERNEL2x8_E2 - - b LDGEMM_L2x8_SUB1 - -LDGEMM_L2x8_SUB4: - - KERNEL2x8_SUBI1 - KERNEL2x8_SUB1 - KERNEL2x8_SUB1 - KERNEL2x8_SUB1 - - KERNEL2x8_SUB1 - KERNEL2x8_SUB1 - KERNEL2x8_SUB1 - KERNEL2x8_SUB1 - - b LDGEMM_L2x8_SUB1 - -LDGEMM_L2x8_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL2x8_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L2x8_SAVE - b LDGEMM_L2x8_SUB2 - -LDGEMM_L2x8_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L2x8_SAVE - -LDGEMM_L2x8_SUB2: - - KERNEL2x8_SUB1 - - addic. 
L, L, -1 - bgt LDGEMM_L2x8_SUB2 - -LDGEMM_L2x8_SAVE: - - SAVE2x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,8,2 -#endif -LDGEMM_L2x8_END: - -LDGEMM_L2x4_BEGIN: - - - andi. T1, M, 4 - ble LDGEMM_L2x4_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 - REFRESH_TEMP_BK T3,K,TEMP_REG,4,2 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L2x4_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L2x4_SUB4 - -LDGEMM_L2x4_LOOP_START: - - LOAD2x4_1 - KERNEL2x4_I1 - KERNEL2x4_2 - KERNEL2x4_1 - KERNEL2x4_2 - - KERNEL2x4_1 - KERNEL2x4_2 - KERNEL2x4_1 - KERNEL2x4_2 - - addic. L, L, -2 - ble LDGEMM_L2x4_LOOP_END - - MY_ALIGN - -LDGEMM_L2x4_LOOP: - - KERNEL2x4_1 - KERNEL2x4_2 - KERNEL2x4_1 - KERNEL2x4_2 - - KERNEL2x4_1 - KERNEL2x4_2 - KERNEL2x4_1 - KERNEL2x4_2 - - addic. L, L, -1 - bgt LDGEMM_L2x4_LOOP - -LDGEMM_L2x4_LOOP_END: - - KERNEL2x4_1 - KERNEL2x4_2 - KERNEL2x4_1 - KERNEL2x4_2 - - KERNEL2x4_1 - KERNEL2x4_2 - KERNEL2x4_1 - KERNEL2x4_E2 - - b LDGEMM_L2x4_SUB1 - -LDGEMM_L2x4_SUB4: - - KERNEL2x4_SUBI1 - KERNEL2x4_SUB1 - KERNEL2x4_SUB1 - KERNEL2x4_SUB1 - - KERNEL2x4_SUB1 - KERNEL2x4_SUB1 - KERNEL2x4_SUB1 - KERNEL2x4_SUB1 - - b LDGEMM_L2x4_SUB1 - -LDGEMM_L2x4_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL2x4_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L2x4_SAVE - b LDGEMM_L2x4_SUB2 - -LDGEMM_L2x4_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L2x4_SAVE - -LDGEMM_L2x4_SUB2: - - KERNEL2x4_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L2x4_SUB2 - -LDGEMM_L2x4_SAVE: - - SAVE2x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,4,2 -#endif -LDGEMM_L2x4_END: - -LDGEMM_L2x2_BEGIN: - - - andi. T1, M, 2 - ble LDGEMM_L2x2_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 - REFRESH_TEMP_BK T3,K,TEMP_REG,2,2 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L2x2_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L2x2_SUB4 - -LDGEMM_L2x2_LOOP_START: - - LOAD2x2_1 - KERNEL2x2_I1 - KERNEL2x2_2 - KERNEL2x2_1 - KERNEL2x2_2 - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_1 - KERNEL2x2_2 - - addic. L, L, -2 - ble LDGEMM_L2x2_LOOP_END - - MY_ALIGN - -LDGEMM_L2x2_LOOP: - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_1 - KERNEL2x2_2 - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_1 - KERNEL2x2_2 - - addic. L, L, -1 - bgt LDGEMM_L2x2_LOOP - -LDGEMM_L2x2_LOOP_END: - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_1 - KERNEL2x2_2 - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_1 - KERNEL2x2_E2 - - b LDGEMM_L2x2_SUB1 - -LDGEMM_L2x2_SUB4: - - KERNEL2x2_SUBI1 - KERNEL2x2_SUB1 - KERNEL2x2_SUB1 - KERNEL2x2_SUB1 - - KERNEL2x2_SUB1 - KERNEL2x2_SUB1 - KERNEL2x2_SUB1 - KERNEL2x2_SUB1 - - b LDGEMM_L2x2_SUB1 - -LDGEMM_L2x2_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL2x2_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L2x2_SAVE - b LDGEMM_L2x2_SUB2 - -LDGEMM_L2x2_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L2x2_SAVE - -LDGEMM_L2x2_SUB2: - - KERNEL2x2_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L2x2_SUB2 - -LDGEMM_L2x2_SAVE: - - SAVE2x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,2,2 -#endif -LDGEMM_L2x2_END: - -LDGEMM_L2x1_BEGIN: - - - andi. T1, M, 1 - ble LDGEMM_L2x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 - REFRESH_TEMP_BK T3,K,TEMP_REG,1,2 - srawi. L, T3, 3 -#else - mr BO, B - srawi. 
L, K, 3 -#endif - ble LDGEMM_L2x1_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L2x1_SUB4 - -LDGEMM_L2x1_LOOP_START: - - LOAD2x1_1 - KERNEL2x1_I1 - KERNEL2x1_2 - KERNEL2x1_1 - KERNEL2x1_2 - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_1 - KERNEL2x1_2 - - addic. L, L, -2 - ble LDGEMM_L2x1_LOOP_END - - MY_ALIGN - -LDGEMM_L2x1_LOOP: - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_1 - KERNEL2x1_2 - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_1 - KERNEL2x1_2 - - addic. L, L, -1 - bgt LDGEMM_L2x1_LOOP - -LDGEMM_L2x1_LOOP_END: - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_1 - KERNEL2x1_2 - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_1 - KERNEL2x1_E2 - - b LDGEMM_L2x1_SUB1 - -LDGEMM_L2x1_SUB4: - - KERNEL2x1_SUBI1 - KERNEL2x1_SUB1 - KERNEL2x1_SUB1 - KERNEL2x1_SUB1 - - KERNEL2x1_SUB1 - KERNEL2x1_SUB1 - KERNEL2x1_SUB1 - KERNEL2x1_SUB1 - - b LDGEMM_L2x1_SUB1 - -LDGEMM_L2x1_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL2x1_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L2x1_SAVE - b LDGEMM_L2x1_SUB2 - -LDGEMM_L2x1_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L2x1_SAVE - -LDGEMM_L2x1_SUB2: - - KERNEL2x1_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L2x1_SUB2 - -LDGEMM_L2x1_SAVE: - - SAVE2x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,1,2 -#endif -LDGEMM_L2x1_END: - - slwi T1, K, 4 - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 2 -#endif -LDGEMM_L2_END: -LDGEMM_L1_BEGIN: - -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - andi. T1, N, 1 - ble LDGEMM_L1_END - mr CO, C - mr AO, A - srawi. I, M, 4 - ble LDGEMM_L1x16_END - -LDGEMM_L1x16_BEGIN: - - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,16,1 - REFRESH_TEMP_BK T3,K,TEMP_REG,16,1 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L1x16_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L1x16_SUB4 - -LDGEMM_L1x16_LOOP_START: - - #dcbt AO, PRE - LOAD1x16_1 - #dcbt AO, PRE - KERNEL1x16_I1 - #dcbt AO, PRE - KERNEL1x16_2 - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - - addic. L, L, -2 - ble LDGEMM_L1x16_LOOP_END - - MY_ALIGN - -LDGEMM_L1x16_LOOP: - - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - - addic. L, L, -1 - bgt LDGEMM_L1x16_LOOP - -LDGEMM_L1x16_LOOP_END: - - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - #dcbt AO, PRE - KERNEL1x16_1 - KERNEL1x16_E2 - - b LDGEMM_L1x16_SUB1 - -LDGEMM_L1x16_SUB4: - - #dcbt AO, PRE - KERNEL1x16_SUBI1 - #dcbt AO, PRE - KERNEL1x16_SUB1 - #dcbt AO, PRE - KERNEL1x16_SUB1 - #dcbt AO, PRE - KERNEL1x16_SUB1 - - KERNEL1x16_SUB1 - KERNEL1x16_SUB1 - KERNEL1x16_SUB1 - KERNEL1x16_SUB1 - - b LDGEMM_L1x16_SUB1 - -LDGEMM_L1x16_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL1x16_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L1x16_SAVE - b LDGEMM_L1x16_SUB2 - -LDGEMM_L1x16_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L1x16_SAVE - -LDGEMM_L1x16_SUB2: - - KERNEL1x16_SUB1 - - addic. 
L, L, -1 - bgt LDGEMM_L1x16_SUB2 - -LDGEMM_L1x16_SAVE: - - SAVE1x16 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,16,1 -#endif - addic. I, I, -1 - bgt LDGEMM_L1x16_BEGIN - -LDGEMM_L1x16_END: - -LDGEMM_L1x8_BEGIN: - - andi. T2, M, 15 - ble LDGEMM_L1x1_END - - andi. T1, M, 8 - ble LDGEMM_L1x8_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 - REFRESH_TEMP_BK T3,K,TEMP_REG,8,1 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L1x8_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L1x8_SUB4 - -LDGEMM_L1x8_LOOP_START: - - #dcbt AO, PRE - LOAD1x8_1 - KERNEL1x8_I1 - #dcbt AO, PRE - KERNEL1x8_2 - KERNEL1x8_1 - #dcbt AO, PRE - KERNEL1x8_2 - - KERNEL1x8_1 - #dcbt AO, PRE - KERNEL1x8_2 - KERNEL1x8_1 - #dcbt AO, PRE - KERNEL1x8_2 - - addic. L, L, -2 - ble LDGEMM_L1x8_LOOP_END - - MY_ALIGN - -LDGEMM_L1x8_LOOP: - - KERNEL1x8_1 - #dcbt AO, PRE - KERNEL1x8_2 - KERNEL1x8_1 - #dcbt AO, PRE - KERNEL1x8_2 - - KERNEL1x8_1 - #dcbt AO, PRE - KERNEL1x8_2 - KERNEL1x8_1 - #dcbt AO, PRE - KERNEL1x8_2 - - addic. L, L, -1 - bgt LDGEMM_L1x8_LOOP - -LDGEMM_L1x8_LOOP_END: - - KERNEL1x8_1 - KERNEL1x8_2 - KERNEL1x8_1 - KERNEL1x8_2 - - KERNEL1x8_1 - KERNEL1x8_2 - KERNEL1x8_1 - KERNEL1x8_E2 - - b LDGEMM_L1x8_SUB1 - -LDGEMM_L1x8_SUB4: - - KERNEL1x8_SUBI1 - KERNEL1x8_SUB1 - KERNEL1x8_SUB1 - KERNEL1x8_SUB1 - - KERNEL1x8_SUB1 - KERNEL1x8_SUB1 - KERNEL1x8_SUB1 - KERNEL1x8_SUB1 - - b LDGEMM_L1x8_SUB1 - -LDGEMM_L1x8_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL1x8_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L1x8_SAVE - b LDGEMM_L1x8_SUB2 - -LDGEMM_L1x8_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L1x8_SAVE - -LDGEMM_L1x8_SUB2: - - KERNEL1x8_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L1x8_SUB2 - -LDGEMM_L1x8_SAVE: - - SAVE1x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,8,1 -#endif -LDGEMM_L1x8_END: - -LDGEMM_L1x4_BEGIN: - - - andi. T1, M, 4 - ble LDGEMM_L1x4_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 - REFRESH_TEMP_BK T3,K,TEMP_REG,4,1 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L1x4_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L1x4_SUB4 - -LDGEMM_L1x4_LOOP_START: - - LOAD1x4_1 - KERNEL1x4_I1 - KERNEL1x4_2 - KERNEL1x4_1 - KERNEL1x4_2 - - KERNEL1x4_1 - KERNEL1x4_2 - KERNEL1x4_1 - KERNEL1x4_2 - - addic. L, L, -2 - ble LDGEMM_L1x4_LOOP_END - - MY_ALIGN - -LDGEMM_L1x4_LOOP: - - KERNEL1x4_1 - KERNEL1x4_2 - KERNEL1x4_1 - KERNEL1x4_2 - - KERNEL1x4_1 - KERNEL1x4_2 - KERNEL1x4_1 - KERNEL1x4_2 - - addic. L, L, -1 - bgt LDGEMM_L1x4_LOOP - -LDGEMM_L1x4_LOOP_END: - - KERNEL1x4_1 - KERNEL1x4_2 - KERNEL1x4_1 - KERNEL1x4_2 - - KERNEL1x4_1 - KERNEL1x4_2 - KERNEL1x4_1 - KERNEL1x4_E2 - - b LDGEMM_L1x4_SUB1 - -LDGEMM_L1x4_SUB4: - - KERNEL1x4_SUBI1 - KERNEL1x4_SUB1 - KERNEL1x4_SUB1 - KERNEL1x4_SUB1 - - KERNEL1x4_SUB1 - KERNEL1x4_SUB1 - KERNEL1x4_SUB1 - KERNEL1x4_SUB1 - - b LDGEMM_L1x4_SUB1 - -LDGEMM_L1x4_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL1x4_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L1x4_SAVE - b LDGEMM_L1x4_SUB2 - -LDGEMM_L1x4_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L1x4_SAVE - -LDGEMM_L1x4_SUB2: - - KERNEL1x4_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L1x4_SUB2 - -LDGEMM_L1x4_SAVE: - - SAVE1x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,4,1 -#endif -LDGEMM_L1x4_END: - -LDGEMM_L1x2_BEGIN: - - - andi. 
T1, M, 2 - ble LDGEMM_L1x2_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 - REFRESH_TEMP_BK T3,K,TEMP_REG,2,1 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L1x2_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L1x2_SUB4 - -LDGEMM_L1x2_LOOP_START: - - LOAD1x2_1 - KERNEL1x2_I1 - KERNEL1x2_2 - KERNEL1x2_1 - KERNEL1x2_2 - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_1 - KERNEL1x2_2 - - addic. L, L, -2 - ble LDGEMM_L1x2_LOOP_END - - MY_ALIGN - -LDGEMM_L1x2_LOOP: - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_1 - KERNEL1x2_2 - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_1 - KERNEL1x2_2 - - addic. L, L, -1 - bgt LDGEMM_L1x2_LOOP - -LDGEMM_L1x2_LOOP_END: - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_1 - KERNEL1x2_2 - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_1 - KERNEL1x2_E2 - - b LDGEMM_L1x2_SUB1 - -LDGEMM_L1x2_SUB4: - - KERNEL1x2_SUBI1 - KERNEL1x2_SUB1 - KERNEL1x2_SUB1 - KERNEL1x2_SUB1 - - KERNEL1x2_SUB1 - KERNEL1x2_SUB1 - KERNEL1x2_SUB1 - KERNEL1x2_SUB1 - - b LDGEMM_L1x2_SUB1 - -LDGEMM_L1x2_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL1x2_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L1x2_SAVE - b LDGEMM_L1x2_SUB2 - -LDGEMM_L1x2_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L1x2_SAVE - -LDGEMM_L1x2_SUB2: - - KERNEL1x2_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L1x2_SUB2 - -LDGEMM_L1x2_SAVE: - - SAVE1x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,2,1 -#endif -LDGEMM_L1x2_END: - -LDGEMM_L1x1_BEGIN: - - - andi. T1, M, 1 - ble LDGEMM_L1x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 - REFRESH_TEMP_BK T3,K,TEMP_REG,1,1 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L1x1_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L1x1_SUB4 - -LDGEMM_L1x1_LOOP_START: - - LOAD1x1_1 - KERNEL1x1_I1 - KERNEL1x1_2 - KERNEL1x1_1 - KERNEL1x1_2 - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_1 - KERNEL1x1_2 - - addic. L, L, -2 - ble LDGEMM_L1x1_LOOP_END - - MY_ALIGN - -LDGEMM_L1x1_LOOP: - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_1 - KERNEL1x1_2 - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_1 - KERNEL1x1_2 - - addic. L, L, -1 - bgt LDGEMM_L1x1_LOOP - -LDGEMM_L1x1_LOOP_END: - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_1 - KERNEL1x1_2 - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_1 - KERNEL1x1_E2 - - b LDGEMM_L1x1_SUB1 - -LDGEMM_L1x1_SUB4: - - KERNEL1x1_SUBI1 - KERNEL1x1_SUB1 - KERNEL1x1_SUB1 - KERNEL1x1_SUB1 - - KERNEL1x1_SUB1 - KERNEL1x1_SUB1 - KERNEL1x1_SUB1 - KERNEL1x1_SUB1 - - b LDGEMM_L1x1_SUB1 - -LDGEMM_L1x1_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL1x1_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L1x1_SAVE - b LDGEMM_L1x1_SUB2 - -LDGEMM_L1x1_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L1x1_SAVE - -LDGEMM_L1x1_SUB2: - - KERNEL1x1_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L1x1_SUB2 - -LDGEMM_L1x1_SAVE: - - SAVE1x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,1,1 -#endif -LDGEMM_L1x1_END: -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 1 -#endif -LDGEMM_L1_END: +/*************************************************************************** +Copyright (c) 2013-2019 The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#define MY_ALIGN .align 3 + +#if defined(TRMMKERNEL) && !defined(LEFT) + neg TEMP_REG, OFFSET +#endif + + srawi. J, N, 2 + ble LDGEMM_L4_END + +LDGEMM_L4_BEGIN: + + + li T1, 128 + li T2, 256 + + mr AO, A + mr CO, C + slwi T3, LDC , 2 + add C, C, T3 + + + dcbt A, T1 + dcbt A, T2 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 4 + ble LDGEMM_L4x16_END + + MY_ALIGN +LDGEMM_L4x16_BEGIN: + + li L, -128 + + + SAVE4x16_REGS + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,4 +#else + mr BO, B +#endif + + and T1, CO, L + and T2, C2, L + and T3, C3, L + and T4, C4, L + + dcbt T1, r0 + dcbt T2, r0 + dcbt T3, r0 + dcbt T4, r0 + + + addi T1, T1, 128 + addi T2, T2, 128 + addi T3, T3, 128 + addi T4, T4, 128 + + dcbt T1, r0 + dcbt T2, r0 + dcbt T3, r0 + dcbt T4, r0 + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T3,K,TEMP_REG,16,4 + srawi. L, T3, 5 +#else + srawi. L, K, 5 +#endif + + ble LDGEMM_L4x16_SUB0 + + + MY_ALIGN +LDGEMM_L4x16_LOOP_START: + + li T2, 512 + + + LOAD4x16_1 + ##OffsetA=128 OffsetB=32 + addi AO,AO,2176 + # addi BO,BO,32 + addic. 
L, L, -1 + + ble LDGEMM_L4x16_LOOP_END + + + mtctr L + + MY_ALIGN + +LDGEMM_L4x16_LOOP: + + #dcbt AO, PRE + KERNEL4x16_I1_L2_2 -2048,32, 0,0 + KERNEL4x16_I1_L2_2 -2048,32, 1,0 + KERNEL4x16_I1_L2_2 -2048,32, 2,0 + KERNEL4x16_I1_L2_2 -2048,32, 3,0 + KERNEL4x16_I1_L2_2 -2048,32, 4,0 + KERNEL4x16_I1_L2_2 -2048,32, 5,0 + KERNEL4x16_I1_L2_2 -2048,32, 6,0 + KERNEL4x16_I1_L2_2 -2048,32, 7,0 + KERNEL4x16_I1_L2_2 -2048,32, 8,0 + KERNEL4x16_I1_L2_2 -2048,32, 9,0 + KERNEL4x16_I1_L2_2 -2048,32, 10,0 + KERNEL4x16_I1_L2_2 -2048,32, 11,0 + KERNEL4x16_I1_L2_2 -2048,32, 12,0 + KERNEL4x16_I1_L2_2 -2048,32, 13,0 + KERNEL4x16_I1_L2_2 -2048,32, 14,0 + KERNEL4x16_I1_L2_2 -2048,32, 15,1 + + + bdnz LDGEMM_L4x16_LOOP + + MY_ALIGN + MY_ALIGN +LDGEMM_L4x16_LOOP_END: + + KERNEL4x16_I1_L2_2 -2048,32, 0,0 + KERNEL4x16_I1_L2_2 -2048,32, 1,0 + KERNEL4x16_I1_L2_2 -2048,32, 2,0 + KERNEL4x16_I1_L2_2 -2048,32, 3,0 + KERNEL4x16_I1_L2_2 -2048,32, 4,0 + KERNEL4x16_I1_L2_2 -2048,32, 5,0 + KERNEL4x16_I1_L2_2 -2048,32, 6,0 + KERNEL4x16_I1_L2_2 -2048,32, 7,0 + KERNEL4x16_I1_L2_2 -2048,32, 8,0 + KERNEL4x16_I1_L2_2 -2048,32, 9,0 + KERNEL4x16_I1_L2_2 -2048,32, 10,0 + KERNEL4x16_I1_L2_2 -2048,32, 11,0 + KERNEL4x16_I1_L2_2 -2048,32, 12,0 + KERNEL4x16_I1_L2_2 -2048,32, 13,0 + KERNEL4x16_I1_L2_2 -2048,32, 14,0 + KERNEL4x16_I1_L2_3 -2048,32, 15,1 + b LDGEMM_L4x16_SUB1 + + + MY_ALIGN +LDGEMM_L4x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 31 +#else + andi. L, K, 31 +#endif + KERNEL4x16 1 + + addic. L, L, -1 + ble LDGEMM_L4x16_SAVE + b LDGEMM_L4x16_SUB2 + MY_ALIGN +LDGEMM_L4x16_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 31 +#else + andi. L, K, 31 +#endif + ble LDGEMM_L4x16_SAVE + MY_ALIGN +LDGEMM_L4x16_SUB2: + + andi. T1,L, 16 + ble LDGEMM_L4x16_SUB2_8 + LOAD4x16_0 + KERNEL4x16_I1_L2_2 128,32, 0,0 + KERNEL4x16_I1_L2_2 128,32, 1,0 + KERNEL4x16_I1_L2_2 128,32, 2,0 + KERNEL4x16_I1_L2_2 128,32, 3,0 + KERNEL4x16_I1_L2_2 128,32, 4,0 + KERNEL4x16_I1_L2_2 128,32, 5,0 + KERNEL4x16_I1_L2_2 128,32, 6,0 + KERNEL4x16_I1_L2_3 128,32, 7,1 + MY_ALIGN +LDGEMM_L4x16_SUB2_8: + andi. T1,L, 8 + ble LDGEMM_L4x16_SUB2_4 + LOAD4x16_0 + KERNEL4x16_I1_L2_2 128,32, 0,0 + KERNEL4x16_I1_L2_2 128,32, 1,0 + KERNEL4x16_I1_L2_2 128,32, 2,0 + KERNEL4x16_I1_L2_3 128,32, 3,1 + MY_ALIGN +LDGEMM_L4x16_SUB2_4: + andi. T1,L, 4 + ble LDGEMM_L4x16_SUB2_2 + LOAD4x16_0 + KERNEL4x16_I1_L2_2 128,32, 0,0 + KERNEL4x16_I1_L2_3 128,32, 1,1 + MY_ALIGN +LDGEMM_L4x16_SUB2_2: + andi. T1,L, 2 + ble LDGEMM_L4x16_SUB2_1 + LOAD4x16_0 + KERNEL4x16_I1_L2_3 128,32, 0,1 + MY_ALIGN +LDGEMM_L4x16_SUB2_1: + andi. T1,L, 1 + ble LDGEMM_L4x16_SAVE + KERNEL4x16 0 +# addic. L, L, -1 +# bgt LDGEMM_L4x16_SUB2 + + MY_ALIGN +LDGEMM_L4x16_SAVE: + SAVE4x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,16,4 +#endif + addic. I, I, -1 + bgt+ LDGEMM_L4x16_BEGIN + +LDGEMM_L4x16_END: + +LDGEMM_L4x8_BEGIN: + + andi. T2, M, 15 + ble LDGEMM_L4x1_END + + andi. T1, M, 8 + ble LDGEMM_L4x8_END + + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4 + REFRESH_TEMP_BK T3,K,TEMP_REG,8,4 + srawi. L, T3, 4 +#else + mr BO, B + srawi. L, K, 4 +#endif + + + ble LDGEMM_L4x8_SUB0 + +LDGEMM_L4x8_LOOP_START: + + + LOAD4x8_1 + ##OffsetA=64 OffsetB=32 + + + addic. 
L, L, -1 + + ble LDGEMM_L4x8_LOOP_END + + mtctr L + MY_ALIGN + +LDGEMM_L4x8_LOOP: + + KERNEL4x8_I1_L2_2 64,32, 0,0 + KERNEL4x8_I1_L2_2 64,32, 1,0 + KERNEL4x8_I1_L2_2 64,32, 2,0 + KERNEL4x8_I1_L2_2 64,32, 3,0 + KERNEL4x8_I1_L2_2 64,32, 4,0 + KERNEL4x8_I1_L2_2 64,32, 5,0 + KERNEL4x8_I1_L2_2 64,32, 6,0 + KERNEL4x8_I1_L2_2 64,32, 7,1 + + bdnz LDGEMM_L4x8_LOOP + MY_ALIGN +LDGEMM_L4x8_LOOP_END: + + KERNEL4x8_I1_L2_2 64,32, 0,0 + KERNEL4x8_I1_L2_2 64,32, 1,0 + KERNEL4x8_I1_L2_2 64,32, 2,0 + KERNEL4x8_I1_L2_2 64,32, 3,0 + KERNEL4x8_I1_L2_2 64,32, 4,0 + KERNEL4x8_I1_L2_2 64,32, 5,0 + KERNEL4x8_I1_L2_2 64,32, 6,0 + KERNEL4x8_I1_L2_3 64,32, 7,1 + + b LDGEMM_L4x8_SUB1 + MY_ALIGN +LDGEMM_L4x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 15 +#else + andi. L, K, 15 +#endif + KERNEL4x8 1 + + addic. L, L, -1 + ble LDGEMM_L4x8_SAVE + b LDGEMM_L4x8_SUB2 + MY_ALIGN +LDGEMM_L4x8_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 15 +#else + andi. L, K, 15 +#endif + ble LDGEMM_L4x8_SAVE + MY_ALIGN +LDGEMM_L4x8_SUB2: + + andi. T1,L, 8 + ble LDGEMM_L4x8_SUB2_4 + LOAD4x8_0 + KERNEL4x8_I1_L2_2 64,32, 0,0 + KERNEL4x8_I1_L2_2 64,32, 1,0 + KERNEL4x8_I1_L2_2 64,32, 2,0 + KERNEL4x8_I1_L2_3 64,32, 3,1 + MY_ALIGN +LDGEMM_L4x8_SUB2_4: + andi. T1,L, 4 + ble LDGEMM_L4x8_SUB2_2 + LOAD4x8_0 + KERNEL4x8_I1_L2_2 64,32, 0,0 + KERNEL4x8_I1_L2_3 64,32, 1,1 + MY_ALIGN +LDGEMM_L4x8_SUB2_2: + andi. T1,L, 2 + ble LDGEMM_L4x8_SUB2_1 + LOAD4x8_0 + KERNEL4x8_I1_L2_3 64,32, 0,1 + MY_ALIGN +LDGEMM_L4x8_SUB2_1: + andi. T1,L, 1 + ble LDGEMM_L4x8_SAVE + KERNEL4x8 0 + + MY_ALIGN +LDGEMM_L4x8_SAVE: + SAVE4x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,8,4 +#endif +LDGEMM_L4x8_END: + +LDGEMM_L4x4_BEGIN: + + + andi. T1, M, 4 + ble LDGEMM_L4x4_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4 + REFRESH_TEMP_BK T3,K,TEMP_REG,4,4 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L4x4_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L4x4_SUB4 + +LDGEMM_L4x4_LOOP_START: + + #dcbt AO, PRE + LOAD4x4_1 + KERNEL4x4_I1 + KERNEL4x4_2 + KERNEL4x4_1 + #dcbt AO, PRE + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + #dcbt AO, PRE + KERNEL4x4_2 + + addic. L, L, -2 + ble LDGEMM_L4x4_LOOP_END + + MY_ALIGN + +LDGEMM_L4x4_LOOP: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + #dcbt AO, PRE + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + #dcbt AO, PRE + KERNEL4x4_2 + + addic. L, L, -1 + bgt LDGEMM_L4x4_LOOP + +LDGEMM_L4x4_LOOP_END: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_E2 + + b LDGEMM_L4x4_SUB1 + +LDGEMM_L4x4_SUB4: + + KERNEL4x4_SUBI1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + b LDGEMM_L4x4_SUB1 + +LDGEMM_L4x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL4x4_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L4x4_SAVE + b LDGEMM_L4x4_SUB2 + +LDGEMM_L4x4_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L4x4_SAVE + +LDGEMM_L4x4_SUB2: + + KERNEL4x4_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L4x4_SUB2 + +LDGEMM_L4x4_SAVE: + + SAVE4x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,4,4 +#endif +LDGEMM_L4x4_END: + +LDGEMM_L4x2_BEGIN: + + + andi. T1, M, 2 + ble LDGEMM_L4x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4 + REFRESH_TEMP_BK T3,K,TEMP_REG,2,4 + srawi. L, T3, 3 +#else + mr BO, B + srawi. 
L, K, 3 +#endif + ble LDGEMM_L4x2_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L4x2_SUB4 + +LDGEMM_L4x2_LOOP_START: + + LOAD4x2_1 + KERNEL4x2_I1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. L, L, -2 + ble LDGEMM_L4x2_LOOP_END + + MY_ALIGN + +LDGEMM_L4x2_LOOP: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. L, L, -1 + bgt LDGEMM_L4x2_LOOP + +LDGEMM_L4x2_LOOP_END: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_E2 + + b LDGEMM_L4x2_SUB1 + +LDGEMM_L4x2_SUB4: + + KERNEL4x2_SUBI1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + b LDGEMM_L4x2_SUB1 + +LDGEMM_L4x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL4x2_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L4x2_SAVE + b LDGEMM_L4x2_SUB2 + +LDGEMM_L4x2_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L4x2_SAVE + +LDGEMM_L4x2_SUB2: + + KERNEL4x2_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L4x2_SUB2 + +LDGEMM_L4x2_SAVE: + + SAVE4x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,2,4 +#endif +LDGEMM_L4x2_END: + +LDGEMM_L4x1_BEGIN: + + + andi. T1, M, 1 + ble LDGEMM_L4x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4 + REFRESH_TEMP_BK T3,K,TEMP_REG,1,4 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L4x1_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L4x1_SUB4 + +LDGEMM_L4x1_LOOP_START: + + LOAD4x1_1 + KERNEL4x1_I1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. L, L, -2 + ble LDGEMM_L4x1_LOOP_END + + MY_ALIGN + +LDGEMM_L4x1_LOOP: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. L, L, -1 + bgt LDGEMM_L4x1_LOOP + +LDGEMM_L4x1_LOOP_END: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_E2 + + b LDGEMM_L4x1_SUB1 + +LDGEMM_L4x1_SUB4: + + KERNEL4x1_SUBI1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + b LDGEMM_L4x1_SUB1 + +LDGEMM_L4x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL4x1_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L4x1_SAVE + b LDGEMM_L4x1_SUB2 + +LDGEMM_L4x1_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L4x1_SAVE + +LDGEMM_L4x1_SUB2: + + KERNEL4x1_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L4x1_SUB2 + +LDGEMM_L4x1_SAVE: + + SAVE4x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,1,4 +#endif +LDGEMM_L4x1_END: + + slwi T1, K, 5 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 4 +#endif + addic. J, J, -1 + bgt LDGEMM_L4_BEGIN + + andi. T2, N, 3 + ble .L999 + +LDGEMM_L4_END: + + b LDGEMM_L2_BEGIN + +.L999_H1: + + b .L999 + +LDGEMM_L2_BEGIN: + +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + andi. T1, N, 2 + ble LDGEMM_L2_END + mr CO, C + mr AO, A + slwi T1, LDC , 1 + add C, C, T1 + srawi. I, M, 4 + ble LDGEMM_L2x16_END + +LDGEMM_L2x16_BEGIN: + + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,2 + REFRESH_TEMP_BK T3,K,TEMP_REG,16,2 + srawi. L, T3, 3 +#else + mr BO, B + srawi. 
L, K, 3 +#endif + ble LDGEMM_L2x16_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L2x16_SUB4 + +LDGEMM_L2x16_LOOP_START: + + #dcbt AO, PRE + LOAD2x16_1 + #dcbt AO, PRE + KERNEL2x16_I1 + #dcbt AO, PRE + KERNEL2x16_2 + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + + addic. L, L, -2 + ble LDGEMM_L2x16_LOOP_END + + MY_ALIGN + +LDGEMM_L2x16_LOOP: + + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + + addic. L, L, -1 + bgt LDGEMM_L2x16_LOOP + +LDGEMM_L2x16_LOOP_END: + + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + #dcbt AO, PRE + KERNEL2x16_1 + KERNEL2x16_E2 + + b LDGEMM_L2x16_SUB1 + +LDGEMM_L2x16_SUB4: + + #dcbt AO, PRE + KERNEL2x16_SUBI1 + #dcbt AO, PRE + KERNEL2x16_SUB1 + #dcbt AO, PRE + KERNEL2x16_SUB1 + #dcbt AO, PRE + KERNEL2x16_SUB1 + + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + + b LDGEMM_L2x16_SUB1 + +LDGEMM_L2x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL2x16_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L2x16_SAVE + b LDGEMM_L2x16_SUB2 + +LDGEMM_L2x16_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L2x16_SAVE + +LDGEMM_L2x16_SUB2: + + KERNEL2x16_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L2x16_SUB2 + +LDGEMM_L2x16_SAVE: + + SAVE2x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,16,2 +#endif + addic. I, I, -1 + bgt LDGEMM_L2x16_BEGIN + +LDGEMM_L2x16_END: + +LDGEMM_L2x8_BEGIN: + + andi. T2, M, 15 + ble LDGEMM_L2x1_END + + andi. T1, M, 8 + ble LDGEMM_L2x8_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 + REFRESH_TEMP_BK T3,K,TEMP_REG,8,2 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L2x8_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L2x8_SUB4 + +LDGEMM_L2x8_LOOP_START: + + #dcbt AO, PRE + LOAD2x8_1 + KERNEL2x8_I1 + #dcbt AO, PRE + KERNEL2x8_2 + KERNEL2x8_1 + #dcbt AO, PRE + KERNEL2x8_2 + + KERNEL2x8_1 + #dcbt AO, PRE + KERNEL2x8_2 + KERNEL2x8_1 + #dcbt AO, PRE + KERNEL2x8_2 + + addic. L, L, -2 + ble LDGEMM_L2x8_LOOP_END + + MY_ALIGN + +LDGEMM_L2x8_LOOP: + + KERNEL2x8_1 + #dcbt AO, PRE + KERNEL2x8_2 + KERNEL2x8_1 + #dcbt AO, PRE + KERNEL2x8_2 + + KERNEL2x8_1 + #dcbt AO, PRE + KERNEL2x8_2 + KERNEL2x8_1 + #dcbt AO, PRE + KERNEL2x8_2 + + addic. L, L, -1 + bgt LDGEMM_L2x8_LOOP + +LDGEMM_L2x8_LOOP_END: + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_E2 + + b LDGEMM_L2x8_SUB1 + +LDGEMM_L2x8_SUB4: + + KERNEL2x8_SUBI1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + b LDGEMM_L2x8_SUB1 + +LDGEMM_L2x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL2x8_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L2x8_SAVE + b LDGEMM_L2x8_SUB2 + +LDGEMM_L2x8_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L2x8_SAVE + +LDGEMM_L2x8_SUB2: + + KERNEL2x8_SUB1 + + addic. 
L, L, -1 + bgt LDGEMM_L2x8_SUB2 + +LDGEMM_L2x8_SAVE: + + SAVE2x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,8,2 +#endif +LDGEMM_L2x8_END: + +LDGEMM_L2x4_BEGIN: + + + andi. T1, M, 4 + ble LDGEMM_L2x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 + REFRESH_TEMP_BK T3,K,TEMP_REG,4,2 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L2x4_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L2x4_SUB4 + +LDGEMM_L2x4_LOOP_START: + + LOAD2x4_1 + KERNEL2x4_I1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -2 + ble LDGEMM_L2x4_LOOP_END + + MY_ALIGN + +LDGEMM_L2x4_LOOP: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -1 + bgt LDGEMM_L2x4_LOOP + +LDGEMM_L2x4_LOOP_END: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_E2 + + b LDGEMM_L2x4_SUB1 + +LDGEMM_L2x4_SUB4: + + KERNEL2x4_SUBI1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + b LDGEMM_L2x4_SUB1 + +LDGEMM_L2x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL2x4_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L2x4_SAVE + b LDGEMM_L2x4_SUB2 + +LDGEMM_L2x4_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L2x4_SAVE + +LDGEMM_L2x4_SUB2: + + KERNEL2x4_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L2x4_SUB2 + +LDGEMM_L2x4_SAVE: + + SAVE2x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,4,2 +#endif +LDGEMM_L2x4_END: + +LDGEMM_L2x2_BEGIN: + + + andi. T1, M, 2 + ble LDGEMM_L2x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 + REFRESH_TEMP_BK T3,K,TEMP_REG,2,2 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L2x2_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L2x2_SUB4 + +LDGEMM_L2x2_LOOP_START: + + LOAD2x2_1 + KERNEL2x2_I1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -2 + ble LDGEMM_L2x2_LOOP_END + + MY_ALIGN + +LDGEMM_L2x2_LOOP: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -1 + bgt LDGEMM_L2x2_LOOP + +LDGEMM_L2x2_LOOP_END: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_E2 + + b LDGEMM_L2x2_SUB1 + +LDGEMM_L2x2_SUB4: + + KERNEL2x2_SUBI1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + b LDGEMM_L2x2_SUB1 + +LDGEMM_L2x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL2x2_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L2x2_SAVE + b LDGEMM_L2x2_SUB2 + +LDGEMM_L2x2_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L2x2_SAVE + +LDGEMM_L2x2_SUB2: + + KERNEL2x2_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L2x2_SUB2 + +LDGEMM_L2x2_SAVE: + + SAVE2x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,2,2 +#endif +LDGEMM_L2x2_END: + +LDGEMM_L2x1_BEGIN: + + + andi. T1, M, 1 + ble LDGEMM_L2x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 + REFRESH_TEMP_BK T3,K,TEMP_REG,1,2 + srawi. L, T3, 3 +#else + mr BO, B + srawi. 
L, K, 3 +#endif + ble LDGEMM_L2x1_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L2x1_SUB4 + +LDGEMM_L2x1_LOOP_START: + + LOAD2x1_1 + KERNEL2x1_I1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -2 + ble LDGEMM_L2x1_LOOP_END + + MY_ALIGN + +LDGEMM_L2x1_LOOP: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -1 + bgt LDGEMM_L2x1_LOOP + +LDGEMM_L2x1_LOOP_END: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_E2 + + b LDGEMM_L2x1_SUB1 + +LDGEMM_L2x1_SUB4: + + KERNEL2x1_SUBI1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + b LDGEMM_L2x1_SUB1 + +LDGEMM_L2x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL2x1_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L2x1_SAVE + b LDGEMM_L2x1_SUB2 + +LDGEMM_L2x1_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L2x1_SAVE + +LDGEMM_L2x1_SUB2: + + KERNEL2x1_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L2x1_SUB2 + +LDGEMM_L2x1_SAVE: + + SAVE2x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,1,2 +#endif +LDGEMM_L2x1_END: + + slwi T1, K, 4 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 2 +#endif +LDGEMM_L2_END: +LDGEMM_L1_BEGIN: + +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + andi. T1, N, 1 + ble LDGEMM_L1_END + mr CO, C + mr AO, A + srawi. I, M, 4 + ble LDGEMM_L1x16_END + +LDGEMM_L1x16_BEGIN: + + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,1 + REFRESH_TEMP_BK T3,K,TEMP_REG,16,1 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L1x16_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L1x16_SUB4 + +LDGEMM_L1x16_LOOP_START: + + #dcbt AO, PRE + LOAD1x16_1 + #dcbt AO, PRE + KERNEL1x16_I1 + #dcbt AO, PRE + KERNEL1x16_2 + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + + addic. L, L, -2 + ble LDGEMM_L1x16_LOOP_END + + MY_ALIGN + +LDGEMM_L1x16_LOOP: + + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + + addic. L, L, -1 + bgt LDGEMM_L1x16_LOOP + +LDGEMM_L1x16_LOOP_END: + + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + #dcbt AO, PRE + KERNEL1x16_1 + KERNEL1x16_E2 + + b LDGEMM_L1x16_SUB1 + +LDGEMM_L1x16_SUB4: + + #dcbt AO, PRE + KERNEL1x16_SUBI1 + #dcbt AO, PRE + KERNEL1x16_SUB1 + #dcbt AO, PRE + KERNEL1x16_SUB1 + #dcbt AO, PRE + KERNEL1x16_SUB1 + + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + + b LDGEMM_L1x16_SUB1 + +LDGEMM_L1x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL1x16_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L1x16_SAVE + b LDGEMM_L1x16_SUB2 + +LDGEMM_L1x16_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L1x16_SAVE + +LDGEMM_L1x16_SUB2: + + KERNEL1x16_SUB1 + + addic. 
L, L, -1 + bgt LDGEMM_L1x16_SUB2 + +LDGEMM_L1x16_SAVE: + + SAVE1x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,16,1 +#endif + addic. I, I, -1 + bgt LDGEMM_L1x16_BEGIN + +LDGEMM_L1x16_END: + +LDGEMM_L1x8_BEGIN: + + andi. T2, M, 15 + ble LDGEMM_L1x1_END + + andi. T1, M, 8 + ble LDGEMM_L1x8_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 + REFRESH_TEMP_BK T3,K,TEMP_REG,8,1 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L1x8_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L1x8_SUB4 + +LDGEMM_L1x8_LOOP_START: + + #dcbt AO, PRE + LOAD1x8_1 + KERNEL1x8_I1 + #dcbt AO, PRE + KERNEL1x8_2 + KERNEL1x8_1 + #dcbt AO, PRE + KERNEL1x8_2 + + KERNEL1x8_1 + #dcbt AO, PRE + KERNEL1x8_2 + KERNEL1x8_1 + #dcbt AO, PRE + KERNEL1x8_2 + + addic. L, L, -2 + ble LDGEMM_L1x8_LOOP_END + + MY_ALIGN + +LDGEMM_L1x8_LOOP: + + KERNEL1x8_1 + #dcbt AO, PRE + KERNEL1x8_2 + KERNEL1x8_1 + #dcbt AO, PRE + KERNEL1x8_2 + + KERNEL1x8_1 + #dcbt AO, PRE + KERNEL1x8_2 + KERNEL1x8_1 + #dcbt AO, PRE + KERNEL1x8_2 + + addic. L, L, -1 + bgt LDGEMM_L1x8_LOOP + +LDGEMM_L1x8_LOOP_END: + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_E2 + + b LDGEMM_L1x8_SUB1 + +LDGEMM_L1x8_SUB4: + + KERNEL1x8_SUBI1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + b LDGEMM_L1x8_SUB1 + +LDGEMM_L1x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL1x8_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L1x8_SAVE + b LDGEMM_L1x8_SUB2 + +LDGEMM_L1x8_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L1x8_SAVE + +LDGEMM_L1x8_SUB2: + + KERNEL1x8_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L1x8_SUB2 + +LDGEMM_L1x8_SAVE: + + SAVE1x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,8,1 +#endif +LDGEMM_L1x8_END: + +LDGEMM_L1x4_BEGIN: + + + andi. T1, M, 4 + ble LDGEMM_L1x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 + REFRESH_TEMP_BK T3,K,TEMP_REG,4,1 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L1x4_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L1x4_SUB4 + +LDGEMM_L1x4_LOOP_START: + + LOAD1x4_1 + KERNEL1x4_I1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -2 + ble LDGEMM_L1x4_LOOP_END + + MY_ALIGN + +LDGEMM_L1x4_LOOP: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -1 + bgt LDGEMM_L1x4_LOOP + +LDGEMM_L1x4_LOOP_END: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_E2 + + b LDGEMM_L1x4_SUB1 + +LDGEMM_L1x4_SUB4: + + KERNEL1x4_SUBI1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + b LDGEMM_L1x4_SUB1 + +LDGEMM_L1x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL1x4_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L1x4_SAVE + b LDGEMM_L1x4_SUB2 + +LDGEMM_L1x4_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L1x4_SAVE + +LDGEMM_L1x4_SUB2: + + KERNEL1x4_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L1x4_SUB2 + +LDGEMM_L1x4_SAVE: + + SAVE1x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,4,1 +#endif +LDGEMM_L1x4_END: + +LDGEMM_L1x2_BEGIN: + + + andi. 
T1, M, 2 + ble LDGEMM_L1x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 + REFRESH_TEMP_BK T3,K,TEMP_REG,2,1 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L1x2_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L1x2_SUB4 + +LDGEMM_L1x2_LOOP_START: + + LOAD1x2_1 + KERNEL1x2_I1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -2 + ble LDGEMM_L1x2_LOOP_END + + MY_ALIGN + +LDGEMM_L1x2_LOOP: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -1 + bgt LDGEMM_L1x2_LOOP + +LDGEMM_L1x2_LOOP_END: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_E2 + + b LDGEMM_L1x2_SUB1 + +LDGEMM_L1x2_SUB4: + + KERNEL1x2_SUBI1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + b LDGEMM_L1x2_SUB1 + +LDGEMM_L1x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL1x2_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L1x2_SAVE + b LDGEMM_L1x2_SUB2 + +LDGEMM_L1x2_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L1x2_SAVE + +LDGEMM_L1x2_SUB2: + + KERNEL1x2_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L1x2_SUB2 + +LDGEMM_L1x2_SAVE: + + SAVE1x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,2,1 +#endif +LDGEMM_L1x2_END: + +LDGEMM_L1x1_BEGIN: + + + andi. T1, M, 1 + ble LDGEMM_L1x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 + REFRESH_TEMP_BK T3,K,TEMP_REG,1,1 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L1x1_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L1x1_SUB4 + +LDGEMM_L1x1_LOOP_START: + + LOAD1x1_1 + KERNEL1x1_I1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -2 + ble LDGEMM_L1x1_LOOP_END + + MY_ALIGN + +LDGEMM_L1x1_LOOP: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -1 + bgt LDGEMM_L1x1_LOOP + +LDGEMM_L1x1_LOOP_END: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_E2 + + b LDGEMM_L1x1_SUB1 + +LDGEMM_L1x1_SUB4: + + KERNEL1x1_SUBI1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + b LDGEMM_L1x1_SUB1 + +LDGEMM_L1x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL1x1_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L1x1_SAVE + b LDGEMM_L1x1_SUB2 + +LDGEMM_L1x1_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L1x1_SAVE + +LDGEMM_L1x1_SUB2: + + KERNEL1x1_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L1x1_SUB2 + +LDGEMM_L1x1_SAVE: + + SAVE1x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,1,1 +#endif +LDGEMM_L1x1_END: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 1 +#endif +LDGEMM_L1_END: diff --git a/kernel/power/dgemm_macros_power9.S b/kernel/power/dgemm_macros_power9.S index c4b8270b8..4eddab24f 100644 --- a/kernel/power/dgemm_macros_power9.S +++ b/kernel/power/dgemm_macros_power9.S @@ -1,3623 +1,3623 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. 
-Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -/************************************************************************************** -* Abdelrauf(quickwritereader@googlemail.com) -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* LAPACK-TEST : OK -**************************************************************************************/ - -/********************************************************************* -* Macros for N=4, M=16 * -*********************************************************************/ -.macro LOAD4x16_1 - LOAD4x16 1 -.endm - -.macro LOAD4x16_0 - LOAD4x16 0 -.endm -.macro LOAD4x16 Zero - - lxv vs24, 0(BO) - lxv vs26, 16(BO) - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 - - lxv vs0, 0(AO) - lxv vs1, 16(AO) - lxv vs2, 32(AO) - lxv vs3, 48(AO) - - - lxv vs4, 64(AO) - lxv vs5, 80(AO) - lxv vs6, 96(AO) - lxv vs7, 112(AO) -.if \Zero==1 - xxlxor vs32,vs32,vs32 - xxlxor vs33,vs33,vs33 - xxlxor vs34,vs34,vs34 - xxlxor vs35,vs35,vs35 - xxlxor vs36,vs36,vs36 - xxlxor vs37,vs37,vs37 - xxlxor vs38,vs38,vs38 - xxlxor vs39,vs39,vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - xxlxor vs50, vs50, vs50 - xxlxor vs51, vs51, vs51 - xxlxor vs52, vs52, vs52 - xxlxor vs53, vs53, vs53 - xxlxor vs54, vs54, vs54 - xxlxor vs55, vs55, vs55 - xxlxor vs56, vs56, vs56 - xxlxor vs57, vs57, vs57 - xxlxor vs58, vs58, vs58 - xxlxor vs59, vs59, vs59 - xxlxor vs60, vs60, vs60 - xxlxor vs61, vs61, vs61 - xxlxor vs62, vs62, vs62 - xxlxor vs63, vs63, vs63 -.endif -.endm - - -#define unit_size 8 -#define DISP32(ind,disp) (ind*unit_size*32+disp) -#define DISP16(ind,disp) (ind*unit_size*16+disp) -#define DISP8(ind,disp) (ind*unit_size*8+disp) -#define DISP4(ind,disp) (ind*unit_size*4+disp) -#define DISP2(ind,disp) (ind*unit_size*2+disp) -#define DISP1(ind,disp) (ind*unit_size+disp) - -.macro KERNEL4x16_L1_L2 Index,IsLast - 
KERNEL4x16_L1_L2_I AO,BO, 0,0,0, \Index,\IsLast,0 -.endm - - - -.macro KERNEL4x16_I1_L2 OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L2_I AO,BO,1,\OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x16_I1_L2_2 OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L2_I AO,BO, 0,\OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x16_I1_L2_3 OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L2_I AO,BO, 0,\OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro KERNEL4x16_I2_L2 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L2_I \AREG,\BREG,1,\OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x16_I2_L2_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L2_I \AREG,\BREG, 0,\OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x16_I2_L2_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L2_I \AREG,\BREG, 0,\OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro KERNEL4x16_L1_L2_I AREG,BREG, First, OffsetA,OffsetB, Index,IsLast ,Complete - -.if \First ==1 - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 -.else - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 -.endif - lxv vs8, DISP32(\Index,0+\OffsetA)(\AREG) - lxv vs9, DISP32(\Index,16+\OffsetA)(\AREG) - lxv vs10, DISP32(\Index,32+\OffsetA)(\AREG) - lxv vs11, DISP32(\Index,48+\OffsetA)(\AREG) -.if \First ==1 - xvmuldp vs36, vs4, vs24 - xvmuldp vs37, vs5, vs24 - xvmuldp vs38, vs6, vs24 - xvmuldp vs39, vs7, vs24 -.else - xvmaddadp vs36, vs4, vs24 - xvmaddadp vs37, vs5, vs24 - xvmaddadp vs38, vs6, vs24 - xvmaddadp vs39, vs7, vs24 -.endif - lxv vs28, DISP8(\Index,0 +\OffsetB)(\BREG) - lxv vs30, DISP8(\Index,16 +\OffsetB)(\BREG) - xxpermdi vs29, vs28, vs28,2 - xxpermdi vs31, vs30, vs30,2 -.if \First ==1 - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - xvmuldp vs42, vs2, vs25 - xvmuldp vs43, vs3, vs25 - - - xvmuldp vs44, vs4, vs25 - xvmuldp vs45, vs5, vs25 - xvmuldp vs46, vs6, vs25 - xvmuldp vs47, vs7, vs25 - - - xvmuldp vs48, vs0, vs26 - xvmuldp vs49, vs1, vs26 - xvmuldp vs50, vs2, vs26 - xvmuldp vs51, vs3, vs26 - - -.else - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - xvmaddadp vs42, vs2, vs25 - xvmaddadp vs43, vs3, vs25 - - - xvmaddadp vs44, vs4, vs25 - xvmaddadp vs45, vs5, vs25 - xvmaddadp vs46, vs6, vs25 - xvmaddadp vs47, vs7, vs25 - - - xvmaddadp vs48, vs0, vs26 - xvmaddadp vs49, vs1, vs26 - xvmaddadp vs50, vs2, vs26 - xvmaddadp vs51, vs3, vs26 - -.endif - lxv vs12, DISP32(\Index,64+\OffsetA)(\AREG) - lxv vs13, DISP32(\Index,80+\OffsetA)(\AREG) -.if \First ==1 - xvmuldp vs52, vs4, vs26 - xvmuldp vs53, vs5, vs26 - xvmuldp vs54, vs6, vs26 - xvmuldp vs55, vs7, vs26 - -.else - xvmaddadp vs52, vs4, vs26 - xvmaddadp vs53, vs5, vs26 - xvmaddadp vs54, vs6, vs26 - xvmaddadp vs55, vs7, vs26 -.endif - lxv vs14, DISP32(\Index,96+\OffsetA)(\AREG) - lxv vs15, DISP32(\Index,112+\OffsetA)(\AREG) -.if \First ==1 - xvmuldp vs56, vs0, vs27 - xvmuldp vs57, vs1, vs27 - xvmuldp vs58, vs2, vs27 - xvmuldp vs59, vs3, vs27 - - - - xvmuldp vs60, vs4, vs27 - xvmuldp vs61, vs5, vs27 - xvmuldp vs62, vs6, vs27 - xvmuldp vs63, vs7, vs27 - -.else - xvmaddadp vs56, vs0, vs27 - xvmaddadp vs57, vs1, vs27 - xvmaddadp vs58, vs2, vs27 - xvmaddadp vs59, vs3, vs27 - - - - xvmaddadp vs60, vs4, vs27 - xvmaddadp vs61, vs5, vs27 - xvmaddadp vs62, vs6, vs27 - xvmaddadp vs63, vs7, vs27 -.endif - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 -.if \Complete==0 - 
lxv vs0, DISP32(\Index,128+\OffsetA)(\AREG) - lxv vs1, DISP32(\Index,144+\OffsetA)(\AREG) -.endif - xvmaddadp vs36, vs12, vs28 - xvmaddadp vs37, vs13, vs28 - xvmaddadp vs38, vs14, vs28 - xvmaddadp vs39, vs15, vs28 -.if \Complete==0 - lxv vs24, DISP8(\Index,32 +\OffsetB)(\BREG) - lxv vs26, DISP8(\Index,48 +\OffsetB)(\BREG) - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 -.endif - xvmaddadp vs40, vs8, vs29 - xvmaddadp vs41, vs9, vs29 - xvmaddadp vs42, vs10, vs29 - xvmaddadp vs43, vs11, vs29 -.if \Complete==0 - lxv vs2, DISP32(\Index,160+\OffsetA)(\AREG) - lxv vs3, DISP32(\Index,176+\OffsetA)(\AREG) -.endif - xvmaddadp vs44, vs12, vs29 - xvmaddadp vs45, vs13, vs29 - xvmaddadp vs46, vs14, vs29 - xvmaddadp vs47, vs15, vs29 - - - xvmaddadp vs48, vs8, vs30 - xvmaddadp vs49, vs9, vs30 - xvmaddadp vs50, vs10, vs30 - xvmaddadp vs51, vs11, vs30 -.if \Complete==0 - lxv vs4, DISP32(\Index,192+\OffsetA)(\AREG) - lxv vs5, DISP32(\Index,208+\OffsetA)(\AREG) -.endif - xvmaddadp vs52, vs12, vs30 - xvmaddadp vs53, vs13, vs30 - xvmaddadp vs54, vs14, vs30 - xvmaddadp vs55, vs15, vs30 -.if \Complete==0 - lxv vs6, DISP32(\Index,224+\OffsetA)(\AREG) - lxv vs7, DISP32(\Index,240+\OffsetA)(\AREG) -.endif - xvmaddadp vs56, vs8, vs31 - xvmaddadp vs57, vs9, vs31 - xvmaddadp vs58, vs10, vs31 - xvmaddadp vs59, vs11, vs31 - - - xvmaddadp vs60, vs12, vs31 - - xvmaddadp vs61, vs13, vs31 - xvmaddadp vs62, vs14, vs31 - - xvmaddadp vs63, vs15, vs31 - .if \IsLast==1 - .if \Complete==1 - addi \AREG, \AREG, DISP32(\Index,128+\OffsetA) - addi \BREG, \BREG, DISP8(\Index,32+\OffsetB) - .else - addi \AREG, \AREG, DISP32(\Index,256) - addi \BREG, \BREG, DISP8(\Index,64) - .endif - .endif - - -.endm - - - -.macro KERNEL4x16 First - - lxv vs24, 0(BO) - lxv vs26, 16(BO) - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 - - lxv vs0, 0(AO) - lxv vs1, 16(AO) - lxv vs2, 32(AO) - lxv vs3, 48(AO) - - lxv vs4, 64(AO) - lxv vs5, 80(AO) - lxv vs6, 96(AO) - lxv vs7, 112(AO) - - - - addi BO, BO, 32 - addi AO, AO, 128 - -.if \First==1 - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - xvmuldp vs36, vs4, vs24 - xvmuldp vs37, vs5, vs24 - xvmuldp vs38, vs6, vs24 - xvmuldp vs39, vs7, vs24 - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - xvmuldp vs42, vs2, vs25 - xvmuldp vs43, vs3, vs25 - xvmuldp vs44, vs4, vs25 - xvmuldp vs45, vs5, vs25 - xvmuldp vs46, vs6, vs25 - xvmuldp vs47, vs7, vs25 - - xvmuldp vs48, vs0, vs26 - xvmuldp vs49, vs1, vs26 - xvmuldp vs50, vs2, vs26 - xvmuldp vs51, vs3, vs26 - xvmuldp vs52, vs4, vs26 - xvmuldp vs53, vs5, vs26 - xvmuldp vs54, vs6, vs26 - xvmuldp vs55, vs7, vs26 - - xvmuldp vs56, vs0, vs27 - xvmuldp vs57, vs1, vs27 - xvmuldp vs58, vs2, vs27 - xvmuldp vs59, vs3, vs27 - xvmuldp vs60, vs4, vs27 - xvmuldp vs61, vs5, vs27 - xvmuldp vs62, vs6, vs27 - xvmuldp vs63, vs7, vs27 -.else - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - xvmaddadp vs36, vs4, vs24 - xvmaddadp vs37, vs5, vs24 - xvmaddadp vs38, vs6, vs24 - xvmaddadp vs39, vs7, vs24 - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - xvmaddadp vs42, vs2, vs25 - xvmaddadp vs43, vs3, vs25 - - xvmaddadp vs44, vs4, vs25 - xvmaddadp vs45, vs5, vs25 - xvmaddadp vs46, vs6, vs25 - xvmaddadp vs47, vs7, vs25 - - xvmaddadp vs48, vs0, vs26 - xvmaddadp vs49, vs1, vs26 - xvmaddadp vs50, vs2, vs26 - xvmaddadp vs51, vs3, vs26 - - xvmaddadp vs52, vs4, vs26 - xvmaddadp vs53, vs5, vs26 - xvmaddadp vs54, vs6, vs26 - xvmaddadp vs55, vs7, 
vs26 - - xvmaddadp vs56, vs0, vs27 - xvmaddadp vs57, vs1, vs27 - xvmaddadp vs58, vs2, vs27 - xvmaddadp vs59, vs3, vs27 - xvmaddadp vs60, vs4, vs27 - xvmaddadp vs61, vs5, vs27 - xvmaddadp vs62, vs6, vs27 - xvmaddadp vs63, vs7, vs27 - -.endif -.endm - -.macro SAVE4x16_REGS - add C2, CO, LDC - add C3, C2, LDC - add C4, C3, LDC -.endm - -.macro SAVE4x16 -#ifndef TRMMKERNEL - lxv vs0, 0(CO) - lxv vs2, 16(CO) - lxv vs4, 32(CO) - lxv vs6, 48(CO) -#endif - xxpermdi vs8, vs40,vs32,1 - xxpermdi vs9 ,vs32,vs40,1 -#ifndef TRMMKERNEL - lxv vs24, 64(CO) - lxv vs26, 80(CO) - lxv vs28, 96(CO) - lxv vs30, 112(CO) -#endif - xxpermdi vs10, vs41,vs33,1 - xxpermdi vs11 ,vs33,vs41,1 -#ifndef TRMMKERNEL - lxv vs1, 0(C2) - lxv vs3, 16(C2) - lxv vs5, 32(C2) - lxv vs7, 48(C2) -#endif - xxpermdi vs12, vs42,vs34,1 - xxpermdi vs13 ,vs34,vs42,1 -#ifndef TRMMKERNEL - lxv vs25, 64(C2) - lxv vs27, 80(C2) -#endif - xxpermdi vs14, vs43,vs35,1 - xxpermdi vs15 ,vs35,vs43,1 -#ifndef TRMMKERNEL - lxv vs29, 96(C2) - lxv vs31, 112(C2) -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs8, alpha_r - xvmaddadp vs1, vs9, alpha_r - xvmaddadp vs2, vs10, alpha_r - xvmaddadp vs3, vs11, alpha_r -#else - xvmuldp vs0, vs8, alpha_r - xvmuldp vs1, vs9, alpha_r - xvmuldp vs2, vs10, alpha_r - xvmuldp vs3, vs11, alpha_r - -#endif - xxpermdi vs8, vs44,vs36,1 - xxpermdi vs9 ,vs36,vs44,1 - xxpermdi vs10, vs45,vs37,1 - xxpermdi vs11 ,vs37,vs45,1 -#ifndef TRMMKERNEL - xvmaddadp vs4, vs12, alpha_r - xvmaddadp vs5, vs13, alpha_r - xvmaddadp vs6, vs14, alpha_r - xvmaddadp vs7, vs15, alpha_r -#else - xvmuldp vs4, vs12, alpha_r - xvmuldp vs5, vs13, alpha_r - xvmuldp vs6, vs14, alpha_r - xvmuldp vs7, vs15, alpha_r -#endif - xxpermdi vs12, vs46,vs38,1 - xxpermdi vs13 ,vs38,vs46,1 - xxpermdi vs14, vs47,vs39,1 - xxpermdi vs15 ,vs39,vs47,1 - -#ifndef TRMMKERNEL - xvmaddadp vs24, vs8, alpha_r - xvmaddadp vs25, vs9, alpha_r - xvmaddadp vs26, vs10, alpha_r - xvmaddadp vs27, vs11, alpha_r - - xvmaddadp vs28, vs12, alpha_r - xvmaddadp vs29, vs13, alpha_r - xvmaddadp vs30, vs14, alpha_r - xvmaddadp vs31, vs15, alpha_r -#else - xvmuldp vs24, vs8, alpha_r - xvmuldp vs25, vs9, alpha_r - xvmuldp vs26, vs10, alpha_r - xvmuldp vs27, vs11, alpha_r - - xvmuldp vs28, vs12, alpha_r - xvmuldp vs29, vs13, alpha_r - xvmuldp vs30, vs14, alpha_r - xvmuldp vs31, vs15, alpha_r - -#endif - stxv vs0, 0(CO) - stxv vs2, 16(CO) - stxv vs4, 32(CO) - stxv vs6, 48(CO) - - stxv vs24, 64(CO) - stxv vs26, 80(CO) - stxv vs28, 96(CO) - stxv vs30, 112(CO) - - stxv vs1, 0(C2) - stxv vs3, 16(C2) - stxv vs5, 32(C2) - stxv vs7, 48(C2) - - stxv vs25, 64(C2) - stxv vs27, 80(C2) - stxv vs29, 96(C2) - stxv vs31, 112(C2) -#ifndef TRMMKERNEL - lxv vs0, 0(C3) - lxv vs2, 16(C3) - lxv vs4, 32(C3) - lxv vs6, 48(C3) -#endif - xxpermdi vs8, vs56,vs48,1 - xxpermdi vs9 ,vs48,vs56,1 -#ifndef TRMMKERNEL - lxv vs24, 64(C3) - lxv vs26, 80(C3) -#endif - xxpermdi vs10, vs57,vs49,1 - xxpermdi vs11 ,vs49,vs57,1 -#ifndef TRMMKERNEL - lxv vs28, 96(C3) - lxv vs30, 112(C3) -#endif - xxpermdi vs12, vs58,vs50,1 - xxpermdi vs13 ,vs50,vs58,1 -#ifndef TRMMKERNEL - lxv vs1, 0(C4) - lxv vs3, 16(C4) -#endif - xxpermdi vs14, vs59,vs51,1 - xxpermdi vs15 ,vs51,vs59,1 -#ifndef TRMMKERNEL - lxv vs5, 32(C4) - lxv vs7, 48(C4) - - lxv vs25, 64(C4) - lxv vs27, 80(C4) - lxv vs29, 96(C4) - lxv vs31, 112(C4) -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs8, alpha_r - xvmaddadp vs1, vs9, alpha_r - xvmaddadp vs2, vs10, alpha_r - xvmaddadp vs3, vs11, alpha_r -#else - xvmuldp vs0, vs8, alpha_r - xvmuldp vs1, vs9, alpha_r - xvmuldp vs2, vs10, alpha_r 
- xvmuldp vs3, vs11, alpha_r - -#endif - - xxpermdi vs8, vs60,vs52,1 - xxpermdi vs9 ,vs52,vs60,1 - xxpermdi vs10, vs61,vs53,1 - xxpermdi vs11 ,vs53,vs61,1 -#ifndef TRMMKERNEL - xvmaddadp vs4, vs12, alpha_r - xvmaddadp vs5, vs13, alpha_r - xvmaddadp vs6, vs14, alpha_r - xvmaddadp vs7, vs15, alpha_r -#else - xvmuldp vs4, vs12, alpha_r - xvmuldp vs5, vs13, alpha_r - xvmuldp vs6, vs14, alpha_r - xvmuldp vs7, vs15, alpha_r -#endif - - - xxpermdi vs12, vs62,vs54,1 - xxpermdi vs13 ,vs54,vs62,1 - xxpermdi vs14, vs63,vs55,1 - xxpermdi vs15 ,vs55,vs63,1 -#ifndef TRMMKERNEL - xvmaddadp vs24, vs8, alpha_r - xvmaddadp vs25, vs9, alpha_r - xvmaddadp vs26, vs10, alpha_r - xvmaddadp vs27, vs11, alpha_r - - xvmaddadp vs28, vs12, alpha_r - xvmaddadp vs29, vs13, alpha_r - xvmaddadp vs30, vs14, alpha_r - xvmaddadp vs31, vs15, alpha_r -#else - xvmuldp vs24, vs8, alpha_r - xvmuldp vs25, vs9, alpha_r - xvmuldp vs26, vs10, alpha_r - xvmuldp vs27, vs11, alpha_r - - xvmuldp vs28, vs12, alpha_r - xvmuldp vs29, vs13, alpha_r - xvmuldp vs30, vs14, alpha_r - xvmuldp vs31, vs15, alpha_r -#endif - stxv vs0, 0(C3) - stxv vs2, 16(C3) - stxv vs4, 32(C3) - stxv vs6, 48(C3) - - stxv vs24, 64(C3) - stxv vs26, 80(C3) - stxv vs28, 96(C3) - stxv vs30, 112(C3) - - stxv vs1, 0(C4) - stxv vs3, 16(C4) - stxv vs5, 32(C4) - stxv vs7, 48(C4) - - stxv vs25, 64(C4) - stxv vs27, 80(C4) - stxv vs29, 96(C4) - stxv vs31, 112(C4) - - addi CO, CO, 128 -.endm - -/********************************************************************* -* Macros for N=4, M=8 * -*********************************************************************/ - -.macro LOAD4x8_1 - LOAD4x8 1 -.endm - -.macro LOAD4x8_0 - LOAD4x8 0 -.endm -.macro LOAD4x8 Zero - - lxv vs24, 0(BO) - lxv vs26, 16(BO) - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 - - lxv vs0, 0(AO) - lxv vs1, 16(AO) - lxv vs2, 32(AO) - lxv vs3, 48(AO) - - - -.if \Zero==1 - xxlxor vs32,vs32,vs32 - xxlxor vs33,vs33,vs33 - xxlxor vs34,vs34,vs34 - xxlxor vs35,vs35,vs35 - - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - xxlxor vs50, vs50, vs50 - xxlxor vs51, vs51, vs51 - - xxlxor vs56, vs56, vs56 - xxlxor vs57, vs57, vs57 - xxlxor vs58, vs58, vs58 - xxlxor vs59, vs59, vs59 - -.endif -.endm - - - -.macro KERNEL4x8_L1_L2 Index,IsLast - KERNEL4x8_L1_L2_I 0,0,0, \Index,\IsLast,0 -.endm - - - -.macro KERNEL4x8_I1_L2 OffsetA,OffsetB, Index,IsLast - KERNEL4x8_L1_L2_I 1,\OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x8_I1_L2_2 OffsetA,OffsetB, Index,IsLast - KERNEL4x8_L1_L2_I 0,\OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x8_I1_L2_3 OffsetA,OffsetB, Index,IsLast - KERNEL4x8_L1_L2_I 0,\OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro KERNEL4x8_L1_L2_I First, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs8, DISP16(\Index,0+\OffsetA)(AO) - lxv vs9, DISP16(\Index,16+\OffsetA)(AO) -.if \First ==1 - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 -.else - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 -.endif - - lxv vs10, DISP16(\Index,32+\OffsetA)(AO) - lxv vs11, DISP16(\Index,48+\OffsetA)(AO) - - - -.if \First ==1 - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - xvmuldp vs42, vs2, vs25 - xvmuldp vs43, vs3, vs25 - - - xvmuldp vs48, vs0, vs26 - xvmuldp vs49, vs1, vs26 - xvmuldp vs50, vs2, vs26 - xvmuldp vs51, vs3, vs26 - - -.else - - lxv vs28, DISP8(\Index,0 
+\OffsetB)(BO) - lxv vs30, DISP8(\Index,16 +\OffsetB)(BO) - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - xvmaddadp vs42, vs2, vs25 - xvmaddadp vs43, vs3, vs25 - - - xvmaddadp vs48, vs0, vs26 - xvmaddadp vs49, vs1, vs26 - xvmaddadp vs50, vs2, vs26 - xvmaddadp vs51, vs3, vs26 - -.endif - xxpermdi vs29, vs28, vs28,2 - xxpermdi vs31, vs30, vs30,2 -.if \First ==1 - xvmuldp vs56, vs0, vs27 - xvmuldp vs57, vs1, vs27 - xvmuldp vs58, vs2, vs27 - xvmuldp vs59, vs3, vs27 - -.else - xvmaddadp vs56, vs0, vs27 - xvmaddadp vs57, vs1, vs27 - xvmaddadp vs58, vs2, vs27 - xvmaddadp vs59, vs3, vs27 - -.endif - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 -.if \Complete==0 - lxv vs0, DISP16(\Index,64+\OffsetA)(AO) - lxv vs1, DISP16(\Index,80+\OffsetA)(AO) -.endif - - - xvmaddadp vs40, vs8, vs29 - xvmaddadp vs41, vs9, vs29 - xvmaddadp vs42, vs10, vs29 - xvmaddadp vs43, vs11, vs29 - -.if \Complete==0 - lxv vs2, DISP16(\Index,96+\OffsetA)(AO) - lxv vs3, DISP16(\Index,112+\OffsetA)(AO) -.endif - - - xvmaddadp vs48, vs8, vs30 - xvmaddadp vs49, vs9, vs30 - xvmaddadp vs50, vs10, vs30 - xvmaddadp vs51, vs11, vs30 -.if \Complete==0 - lxv vs24, DISP8(\Index,32 +\OffsetB)(BO) - lxv vs26, DISP8(\Index,48 +\OffsetB)(BO) -.endif - - xvmaddadp vs56, vs8, vs31 - xvmaddadp vs57, vs9, vs31 - xvmaddadp vs58, vs10, vs31 - xvmaddadp vs59, vs11, vs31 -.if \Complete==0 - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 -.endif - - .if \IsLast==1 - .if \Complete==1 - addi AO, AO, DISP16(\Index,64+\OffsetA) - addi BO, BO, DISP8(\Index,32+\OffsetB) - .else - addi AO, AO, DISP16(\Index,128) - addi BO, BO, DISP8(\Index,64) - .endif - .endif - - -.endm - - - -.macro KERNEL4x8 First - - lxv vs24, 0(BO) - lxv vs26, 16(BO) - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 - - lxv vs0, 0(AO) - lxv vs1, 16(AO) - lxv vs2, 32(AO) - lxv vs3, 48(AO) - - - - - addi BO, BO, 32 - addi AO, AO, 64 - -.if \First==1 - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - xvmuldp vs42, vs2, vs25 - xvmuldp vs43, vs3, vs25 - - - xvmuldp vs48, vs0, vs26 - xvmuldp vs49, vs1, vs26 - xvmuldp vs50, vs2, vs26 - xvmuldp vs51, vs3, vs26 - - - xvmuldp vs56, vs0, vs27 - xvmuldp vs57, vs1, vs27 - xvmuldp vs58, vs2, vs27 - xvmuldp vs59, vs3, vs27 - -.else - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - xvmaddadp vs42, vs2, vs25 - xvmaddadp vs43, vs3, vs25 - - - - xvmaddadp vs48, vs0, vs26 - xvmaddadp vs49, vs1, vs26 - xvmaddadp vs50, vs2, vs26 - xvmaddadp vs51, vs3, vs26 - - - - xvmaddadp vs56, vs0, vs27 - xvmaddadp vs57, vs1, vs27 - xvmaddadp vs58, vs2, vs27 - xvmaddadp vs59, vs3, vs27 - - -.endif -.endm - - - -.macro SAVE4x8 - add T2, CO, LDC - add T3, T2, LDC - add T4, T3, LDC -#ifndef TRMMKERNEL - lxv vs0, 0(CO) - lxv vs2, 16(CO) -#endif - xxpermdi vs8, vs40,vs32,1 - xxpermdi vs9 ,vs32,vs40,1 -#ifndef TRMMKERNEL - lxv vs4, 32(CO) - lxv vs6, 48(CO) -#endif - xxpermdi vs10, vs41,vs33,1 - xxpermdi vs11 ,vs33,vs41,1 -#ifndef TRMMKERNEL - lxv vs1, 0(T2) - lxv vs3, 16(T2) -#endif - xxpermdi vs12, vs42,vs34,1 - xxpermdi vs13 ,vs34,vs42,1 -#ifndef TRMMKERNEL - lxv vs5, 32(T2) - lxv vs7, 48(T2) -#endif - xxpermdi vs14, vs43,vs35,1 - xxpermdi vs15 ,vs35,vs43,1 - - - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs8, alpha_r - xvmaddadp vs1, vs9, alpha_r - 
xvmaddadp vs2, vs10, alpha_r - xvmaddadp vs3, vs11, alpha_r - - xvmaddadp vs4, vs12, alpha_r - xvmaddadp vs5, vs13, alpha_r - xvmaddadp vs6, vs14, alpha_r - xvmaddadp vs7, vs15, alpha_r -#else - xvmuldp vs0, vs8, alpha_r - xvmuldp vs1, vs9, alpha_r - xvmuldp vs2, vs10, alpha_r - xvmuldp vs3, vs11, alpha_r - - xvmuldp vs4, vs12, alpha_r - xvmuldp vs5, vs13, alpha_r - xvmuldp vs6, vs14, alpha_r - xvmuldp vs7, vs15, alpha_r - -#endif - - - stxv vs0, 0(CO) - stxv vs2, 16(CO) - stxv vs4, 32(CO) - stxv vs6, 48(CO) - - - stxv vs1, 0(T2) - stxv vs3, 16(T2) - stxv vs5, 32(T2) - stxv vs7, 48(T2) - - - xxpermdi vs8, vs56,vs48,1 - xxpermdi vs9 ,vs48,vs56,1 -#ifndef TRMMKERNEL - lxv vs0, 0(T3) - lxv vs2, 16(T3) -#endif - xxpermdi vs10, vs57,vs49,1 - xxpermdi vs11 ,vs49,vs57,1 -#ifndef TRMMKERNEL - lxv vs4, 32(T3) - lxv vs6, 48(T3) -#endif - xxpermdi vs12, vs58,vs50,1 - xxpermdi vs13 ,vs50,vs58,1 -#ifndef TRMMKERNEL - lxv vs1, 0(T4) - lxv vs3, 16(T4) -#endif - xxpermdi vs14, vs59,vs51,1 - xxpermdi vs15 ,vs51,vs59,1 -#ifndef TRMMKERNEL - lxv vs5, 32(T4) - lxv vs7, 48(T4) - - - xvmaddadp vs0, vs8, alpha_r - xvmaddadp vs1, vs9, alpha_r - xvmaddadp vs2, vs10, alpha_r - xvmaddadp vs3, vs11, alpha_r - - - - xvmaddadp vs4, vs12, alpha_r - xvmaddadp vs5, vs13, alpha_r - xvmaddadp vs6, vs14, alpha_r - xvmaddadp vs7, vs15, alpha_r -#else - xvmuldp vs0, vs8, alpha_r - xvmuldp vs1, vs9, alpha_r - xvmuldp vs2, vs10, alpha_r - xvmuldp vs3, vs11, alpha_r - - - - xvmuldp vs4, vs12, alpha_r - xvmuldp vs5, vs13, alpha_r - xvmuldp vs6, vs14, alpha_r - xvmuldp vs7, vs15, alpha_r - -#endif - - - stxv vs0, 0(T3) - stxv vs2, 16(T3) - stxv vs4, 32(T3) - stxv vs6, 48(T3) - - - stxv vs1, 0(T4) - stxv vs3, 16(T4) - stxv vs5, 32(T4) - stxv vs7, 48(T4) - - - - addi CO, CO, 64 -.endm - - -/********************************************************************* -* Macros for N=4, M=4 * -*********************************************************************/ - -.macro LOAD4x4_1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - lxvdsx vs26, o16, BO - lxvdsx vs27, o24, BO - - addi AO, AO, 32 - addi BO, BO, 32 - -.endm - -.macro KERNEL4x4_I1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - lxvdsx vs30, o16, BO - lxvdsx vs31, o24, BO - - addi AO, AO, 32 - addi BO, BO, 32 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - - xvmuldp vs48, vs0, vs26 - xvmuldp vs49, vs1, vs26 - - xvmuldp vs56, vs0, vs27 - xvmuldp vs57, vs1, vs27 - -.endm - -.macro KERNEL4x4_1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - lxvdsx vs30, o16, BO - lxvdsx vs31, o24, BO - - addi AO, AO, 32 - addi BO, BO, 32 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - - xvmaddadp vs48, vs0, vs26 - xvmaddadp vs49, vs1, vs26 - - xvmaddadp vs56, vs0, vs27 - xvmaddadp vs57, vs1, vs27 - -.endm - -.macro KERNEL4x4_2 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - lxvdsx vs26, o16, BO - lxvdsx vs27, o24, BO - - addi AO, AO, 32 - addi BO, BO, 32 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - - xvmaddadp vs40, vs8, vs29 - xvmaddadp vs41, vs9, vs29 - - xvmaddadp vs48, vs8, vs30 - xvmaddadp vs49, vs9, vs30 - - xvmaddadp vs56, vs8, vs31 - xvmaddadp vs57, vs9, vs31 - -.endm - -.macro KERNEL4x4_E2 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - - xvmaddadp vs40, vs8, 
vs29 - xvmaddadp vs41, vs9, vs29 - - xvmaddadp vs48, vs8, vs30 - xvmaddadp vs49, vs9, vs30 - - xvmaddadp vs56, vs8, vs31 - xvmaddadp vs57, vs9, vs31 - -.endm - -.macro KERNEL4x4_SUBI1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - lxvdsx vs26, o16, BO - lxvdsx vs27, o24, BO - - addi AO, AO, 32 - addi BO, BO, 32 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - - xvmuldp vs48, vs0, vs26 - xvmuldp vs49, vs1, vs26 - - xvmuldp vs56, vs0, vs27 - xvmuldp vs57, vs1, vs27 - -.endm - -.macro KERNEL4x4_SUB1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - lxvdsx vs26, o16, BO - lxvdsx vs27, o24, BO - - addi AO, AO, 32 - addi BO, BO, 32 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - - xvmaddadp vs48, vs0, vs26 - xvmaddadp vs49, vs1, vs26 - - xvmaddadp vs56, vs0, vs27 - xvmaddadp vs57, vs1, vs27 - -.endm - -.macro SAVE4x4 - - mr T1, CO - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 - lxvd2x vs1, o16, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r - xvmaddadp vs1, vs33, alpha_r -#else - xvmuldp vs0, vs32, alpha_r - xvmuldp vs1, vs33, alpha_r -#endif - - stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxvd2x vs8, 0, T1 - lxvd2x vs9, o16, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs8, vs40, alpha_r - xvmaddadp vs9, vs41, alpha_r -#else - xvmuldp vs8, vs40, alpha_r - xvmuldp vs9, vs41, alpha_r -#endif - - stxvd2x vs8, 0, T1 - stxvd2x vs9, o16, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 - lxvd2x vs1, o16, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs48, alpha_r - xvmaddadp vs1, vs49, alpha_r -#else - xvmuldp vs0, vs48, alpha_r - xvmuldp vs1, vs49, alpha_r -#endif - - stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxvd2x vs8, 0, T1 - lxvd2x vs9, o16, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs8, vs56, alpha_r - xvmaddadp vs9, vs57, alpha_r -#else - xvmuldp vs8, vs56, alpha_r - xvmuldp vs9, vs57, alpha_r -#endif - - stxvd2x vs8, 0, T1 - stxvd2x vs9, o16, T1 - - addi CO, CO, 32 - -.endm - -/********************************************************************* -* Macros for N=4, M=2 * -*********************************************************************/ - -.macro LOAD4x2_1 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - lxvdsx vs26, o16, BO - lxvdsx vs27, o24, BO - - addi AO, AO, 16 - addi BO, BO, 32 - -.endm - -.macro KERNEL4x2_I1 - - lxvd2x vs8, 0, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - lxvdsx vs30, o16, BO - lxvdsx vs31, o24, BO - - addi AO, AO, 16 - addi BO, BO, 32 - - - xvmuldp vs32, vs0, vs24 - - xvmuldp vs40, vs0, vs25 - - xvmuldp vs48, vs0, vs26 - - xvmuldp vs56, vs0, vs27 - -.endm - -.macro KERNEL4x2_1 - - lxvd2x vs8, 0, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - lxvdsx vs30, o16, BO - lxvdsx vs31, o24, BO - - addi AO, AO, 16 - addi BO, BO, 32 - - - xvmaddadp vs32, vs0, vs24 - - xvmaddadp vs40, vs0, vs25 - - xvmaddadp vs48, vs0, vs26 - - xvmaddadp vs56, vs0, vs27 - -.endm - -.macro KERNEL4x2_2 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - lxvdsx vs26, o16, BO - lxvdsx vs27, o24, BO - - addi AO, AO, 16 - addi BO, BO, 32 - - - xvmaddadp vs32, vs8, vs28 - - xvmaddadp vs40, vs8, vs29 - - xvmaddadp vs48, vs8, vs30 - - xvmaddadp vs56, vs8, vs31 - -.endm - -.macro KERNEL4x2_E2 - - - xvmaddadp vs32, vs8, 
vs28 - - xvmaddadp vs40, vs8, vs29 - - xvmaddadp vs48, vs8, vs30 - - xvmaddadp vs56, vs8, vs31 - -.endm - -.macro KERNEL4x2_SUBI1 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - lxvdsx vs26, o16, BO - lxvdsx vs27, o24, BO - - addi AO, AO, 16 - addi BO, BO, 32 - - - xvmuldp vs32, vs0, vs24 - - xvmuldp vs40, vs0, vs25 - - xvmuldp vs48, vs0, vs26 - - xvmuldp vs56, vs0, vs27 - -.endm - -.macro KERNEL4x2_SUB1 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - lxvdsx vs26, o16, BO - lxvdsx vs27, o24, BO - - addi AO, AO, 16 - addi BO, BO, 32 - - - xvmaddadp vs32, vs0, vs24 - - xvmaddadp vs40, vs0, vs25 - - xvmaddadp vs48, vs0, vs26 - - xvmaddadp vs56, vs0, vs27 - -.endm - -.macro SAVE4x2 - - mr T1, CO - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r -#else - xvmuldp vs0, vs32, alpha_r -#endif - - stxvd2x vs0, 0, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxvd2x vs8, 0, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs8, vs40, alpha_r -#else - xvmuldp vs8, vs40, alpha_r -#endif - - stxvd2x vs8, 0, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs48, alpha_r -#else - xvmuldp vs0, vs48, alpha_r -#endif - - stxvd2x vs0, 0, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxvd2x vs8, 0, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs8, vs56, alpha_r -#else - xvmuldp vs8, vs56, alpha_r -#endif - - stxvd2x vs8, 0, T1 - - addi CO, CO, 16 - -.endm - -/********************************************************************* -* Macros for N=4, M=1 * -*********************************************************************/ - -.macro LOAD4x1_1 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - lxsdx vs25, o8, BO - lxsdx vs26, o16, BO - lxsdx vs27, o24, BO - - addi AO, AO, 8 - addi BO, BO, 32 - -.endm - -.macro KERNEL4x1_I1 - - lxsdx vs8, 0, AO - - lxsdx vs28, 0, BO - lxsdx vs29, o8, BO - lxsdx vs30, o16, BO - lxsdx vs31, o24, BO - - addi AO, AO, 8 - addi BO, BO, 32 - - - xsmuldp vs32, vs0, vs24 - - xsmuldp vs40, vs0, vs25 - - xsmuldp vs48, vs0, vs26 - - xsmuldp vs56, vs0, vs27 - -.endm - -.macro KERNEL4x1_1 - - lxsdx vs8, 0, AO - - lxsdx vs28, 0, BO - lxsdx vs29, o8, BO - lxsdx vs30, o16, BO - lxsdx vs31, o24, BO - - addi AO, AO, 8 - addi BO, BO, 32 - - - xsmaddadp vs32, vs0, vs24 - - xsmaddadp vs40, vs0, vs25 - - xsmaddadp vs48, vs0, vs26 - - xsmaddadp vs56, vs0, vs27 - -.endm - -.macro KERNEL4x1_2 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - lxsdx vs25, o8, BO - lxsdx vs26, o16, BO - lxsdx vs27, o24, BO - - addi AO, AO, 8 - addi BO, BO, 32 - - - xsmaddadp vs32, vs8, vs28 - - xsmaddadp vs40, vs8, vs29 - - xsmaddadp vs48, vs8, vs30 - - xsmaddadp vs56, vs8, vs31 - -.endm - -.macro KERNEL4x1_E2 - - - xsmaddadp vs32, vs8, vs28 - - xsmaddadp vs40, vs8, vs29 - - xsmaddadp vs48, vs8, vs30 - - xsmaddadp vs56, vs8, vs31 - -.endm - -.macro KERNEL4x1_SUBI1 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - lxsdx vs25, o8, BO - lxsdx vs26, o16, BO - lxsdx vs27, o24, BO - - addi AO, AO, 8 - addi BO, BO, 32 - - - xsmuldp vs32, vs0, vs24 - - xsmuldp vs40, vs0, vs25 - - xsmuldp vs48, vs0, vs26 - - xsmuldp vs56, vs0, vs27 - -.endm - -.macro KERNEL4x1_SUB1 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - lxsdx vs25, o8, BO - lxsdx vs26, o16, BO - lxsdx vs27, o24, BO - - addi AO, AO, 8 - addi BO, BO, 32 - - - xsmaddadp vs32, vs0, vs24 - - xsmaddadp vs40, vs0, vs25 - - xsmaddadp vs48, vs0, vs26 - - xsmaddadp vs56, vs0, vs27 - -.endm - -.macro SAVE4x1 - - mr T1, CO - -#ifndef TRMMKERNEL - lxsdx 
vs0, 0, T1 -#endif - -#ifndef TRMMKERNEL - xsmaddadp vs0, vs32, alpha_r -#else - xsmuldp vs0, vs32, alpha_r -#endif - - stxsdx vs0, 0, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxsdx vs8, 0, T1 -#endif - -#ifndef TRMMKERNEL - xsmaddadp vs8, vs40, alpha_r -#else - xsmuldp vs8, vs40, alpha_r -#endif - - stxsdx vs8, 0, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxsdx vs0, 0, T1 -#endif - -#ifndef TRMMKERNEL - xsmaddadp vs0, vs48, alpha_r -#else - xsmuldp vs0, vs48, alpha_r -#endif - - stxsdx vs0, 0, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxsdx vs8, 0, T1 -#endif - -#ifndef TRMMKERNEL - xsmaddadp vs8, vs56, alpha_r -#else - xsmuldp vs8, vs56, alpha_r -#endif - - stxsdx vs8, 0, T1 - - addi CO, CO, 8 - -.endm - -/********************************************************************* -* Macros for N=2, M=16 * -*********************************************************************/ - -.macro LOAD2x16_1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO - lxvd2x vs6, o32, AO - lxvd2x vs7, o48, AO - - addi AO, AO, 64 - -.endm - -.macro KERNEL2x16_I1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - lxvd2x vs10, o32, AO - lxvd2x vs11, o48, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - lxvd2x vs12, 0, AO - lxvd2x vs13, o16, AO - lxvd2x vs14, o32, AO - lxvd2x vs15, o48, AO - - addi AO, AO, 64 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - xvmuldp vs36, vs4, vs24 - xvmuldp vs37, vs5, vs24 - xvmuldp vs38, vs6, vs24 - xvmuldp vs39, vs7, vs24 - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - xvmuldp vs42, vs2, vs25 - xvmuldp vs43, vs3, vs25 - xvmuldp vs44, vs4, vs25 - xvmuldp vs45, vs5, vs25 - xvmuldp vs46, vs6, vs25 - xvmuldp vs47, vs7, vs25 - -.endm - -.macro KERNEL2x16_1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - lxvd2x vs10, o32, AO - lxvd2x vs11, o48, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - lxvd2x vs12, 0, AO - lxvd2x vs13, o16, AO - lxvd2x vs14, o32, AO - lxvd2x vs15, o48, AO - - addi AO, AO, 64 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - xvmaddadp vs36, vs4, vs24 - xvmaddadp vs37, vs5, vs24 - xvmaddadp vs38, vs6, vs24 - xvmaddadp vs39, vs7, vs24 - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - xvmaddadp vs42, vs2, vs25 - xvmaddadp vs43, vs3, vs25 - xvmaddadp vs44, vs4, vs25 - xvmaddadp vs45, vs5, vs25 - xvmaddadp vs46, vs6, vs25 - xvmaddadp vs47, vs7, vs25 - -.endm - -.macro KERNEL2x16_2 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO - lxvd2x vs6, o32, AO - lxvd2x vs7, o48, AO - - addi AO, AO, 64 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 - xvmaddadp vs36, vs12, vs28 - xvmaddadp vs37, vs13, vs28 - xvmaddadp vs38, vs14, vs28 - xvmaddadp vs39, vs15, vs28 - - xvmaddadp vs40, vs8, vs29 - xvmaddadp vs41, vs9, vs29 - xvmaddadp vs42, vs10, vs29 - xvmaddadp vs43, vs11, vs29 - xvmaddadp vs44, vs12, vs29 - xvmaddadp vs45, vs13, vs29 - xvmaddadp vs46, vs14, vs29 - xvmaddadp vs47, vs15, vs29 - -.endm - -.macro KERNEL2x16_E2 - - - xvmaddadp vs32, vs8, vs28 
- xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 - xvmaddadp vs36, vs12, vs28 - xvmaddadp vs37, vs13, vs28 - xvmaddadp vs38, vs14, vs28 - xvmaddadp vs39, vs15, vs28 - - xvmaddadp vs40, vs8, vs29 - xvmaddadp vs41, vs9, vs29 - xvmaddadp vs42, vs10, vs29 - xvmaddadp vs43, vs11, vs29 - xvmaddadp vs44, vs12, vs29 - xvmaddadp vs45, vs13, vs29 - xvmaddadp vs46, vs14, vs29 - xvmaddadp vs47, vs15, vs29 - -.endm - -.macro KERNEL2x16_SUBI1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO - lxvd2x vs6, o32, AO - lxvd2x vs7, o48, AO - - addi AO, AO, 64 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - xvmuldp vs36, vs4, vs24 - xvmuldp vs37, vs5, vs24 - xvmuldp vs38, vs6, vs24 - xvmuldp vs39, vs7, vs24 - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - xvmuldp vs42, vs2, vs25 - xvmuldp vs43, vs3, vs25 - xvmuldp vs44, vs4, vs25 - xvmuldp vs45, vs5, vs25 - xvmuldp vs46, vs6, vs25 - xvmuldp vs47, vs7, vs25 - -.endm - -.macro KERNEL2x16_SUB1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO - lxvd2x vs6, o32, AO - lxvd2x vs7, o48, AO - - addi AO, AO, 64 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - xvmaddadp vs36, vs4, vs24 - xvmaddadp vs37, vs5, vs24 - xvmaddadp vs38, vs6, vs24 - xvmaddadp vs39, vs7, vs24 - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - xvmaddadp vs42, vs2, vs25 - xvmaddadp vs43, vs3, vs25 - xvmaddadp vs44, vs4, vs25 - xvmaddadp vs45, vs5, vs25 - xvmaddadp vs46, vs6, vs25 - xvmaddadp vs47, vs7, vs25 - -.endm - -.macro SAVE2x16 - - mr T1, CO - addi T2, T1, 64 - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 - lxvd2x vs1, o16, T1 - lxvd2x vs2, o32, T1 - lxvd2x vs3, o48, T1 - - lxvd2x vs4, 0, T2 - lxvd2x vs5, o16, T2 - lxvd2x vs6, o32, T2 - lxvd2x vs7, o48, T2 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r - xvmaddadp vs1, vs33, alpha_r - xvmaddadp vs2, vs34, alpha_r - xvmaddadp vs3, vs35, alpha_r - xvmaddadp vs4, vs36, alpha_r - xvmaddadp vs5, vs37, alpha_r - xvmaddadp vs6, vs38, alpha_r - xvmaddadp vs7, vs39, alpha_r -#else - xvmuldp vs0, vs32, alpha_r - xvmuldp vs1, vs33, alpha_r - xvmuldp vs2, vs34, alpha_r - xvmuldp vs3, vs35, alpha_r - xvmuldp vs4, vs36, alpha_r - xvmuldp vs5, vs37, alpha_r - xvmuldp vs6, vs38, alpha_r - xvmuldp vs7, vs39, alpha_r -#endif - - stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - stxvd2x vs2, o32, T1 - stxvd2x vs3, o48, T1 - - stxvd2x vs4, 0, T2 - stxvd2x vs5, o16, T2 - stxvd2x vs6, o32, T2 - stxvd2x vs7, o48, T2 - - add T1, T1, LDC - add T2, T2, LDC - -#ifndef TRMMKERNEL - lxvd2x vs8, 0, T1 - lxvd2x vs9, o16, T1 - lxvd2x vs10, o32, T1 - lxvd2x vs11, o48, T1 - - lxvd2x vs12, 0, T2 - lxvd2x vs13, o16, T2 - lxvd2x vs14, o32, T2 - lxvd2x vs15, o48, T2 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs8, vs40, alpha_r - xvmaddadp vs9, vs41, alpha_r - xvmaddadp vs10, vs42, alpha_r - xvmaddadp vs11, vs43, alpha_r - xvmaddadp vs12, vs44, alpha_r - xvmaddadp vs13, vs45, alpha_r - xvmaddadp vs14, vs46, alpha_r - xvmaddadp vs15, vs47, alpha_r -#else - xvmuldp vs8, vs40, alpha_r - xvmuldp vs9, vs41, alpha_r - xvmuldp vs10, vs42, alpha_r - xvmuldp vs11, vs43, alpha_r - 
xvmuldp vs12, vs44, alpha_r - xvmuldp vs13, vs45, alpha_r - xvmuldp vs14, vs46, alpha_r - xvmuldp vs15, vs47, alpha_r -#endif - - stxvd2x vs8, 0, T1 - stxvd2x vs9, o16, T1 - stxvd2x vs10, o32, T1 - stxvd2x vs11, o48, T1 - - stxvd2x vs12, 0, T2 - stxvd2x vs13, o16, T2 - stxvd2x vs14, o32, T2 - stxvd2x vs15, o48, T2 - - addi CO, CO, 128 - -.endm - -/********************************************************************* -* Macros for N=4, M=8 * -*********************************************************************/ - -.macro LOAD2x8_1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - -.endm - -.macro KERNEL2x8_I1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - lxvd2x vs10, o32, AO - lxvd2x vs11, o48, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - xvmuldp vs42, vs2, vs25 - xvmuldp vs43, vs3, vs25 - -.endm - -.macro KERNEL2x8_1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - lxvd2x vs10, o32, AO - lxvd2x vs11, o48, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - xvmaddadp vs42, vs2, vs25 - xvmaddadp vs43, vs3, vs25 - -.endm - -.macro KERNEL2x8_2 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 - - xvmaddadp vs40, vs8, vs29 - xvmaddadp vs41, vs9, vs29 - xvmaddadp vs42, vs10, vs29 - xvmaddadp vs43, vs11, vs29 - -.endm - -.macro KERNEL2x8_E2 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 - - xvmaddadp vs40, vs8, vs29 - xvmaddadp vs41, vs9, vs29 - xvmaddadp vs42, vs10, vs29 - xvmaddadp vs43, vs11, vs29 - -.endm - -.macro KERNEL2x8_SUBI1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - xvmuldp vs42, vs2, vs25 - xvmuldp vs43, vs3, vs25 - -.endm - -.macro KERNEL2x8_SUB1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - xvmaddadp vs42, vs2, vs25 - xvmaddadp vs43, vs3, vs25 - -.endm - -.macro SAVE2x8 - - mr T1, CO - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 - lxvd2x vs1, o16, T1 - lxvd2x vs2, o32, T1 - lxvd2x vs3, o48, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r - xvmaddadp vs1, vs33, alpha_r - xvmaddadp vs2, vs34, alpha_r - xvmaddadp vs3, vs35, alpha_r -#else - xvmuldp vs0, vs32, alpha_r - xvmuldp vs1, vs33, alpha_r - xvmuldp vs2, vs34, alpha_r - xvmuldp vs3, vs35, alpha_r -#endif - 
- stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - stxvd2x vs2, o32, T1 - stxvd2x vs3, o48, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxvd2x vs8, 0, T1 - lxvd2x vs9, o16, T1 - lxvd2x vs10, o32, T1 - lxvd2x vs11, o48, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs8, vs40, alpha_r - xvmaddadp vs9, vs41, alpha_r - xvmaddadp vs10, vs42, alpha_r - xvmaddadp vs11, vs43, alpha_r -#else - xvmuldp vs8, vs40, alpha_r - xvmuldp vs9, vs41, alpha_r - xvmuldp vs10, vs42, alpha_r - xvmuldp vs11, vs43, alpha_r -#endif - - stxvd2x vs8, 0, T1 - stxvd2x vs9, o16, T1 - stxvd2x vs10, o32, T1 - stxvd2x vs11, o48, T1 - - addi CO, CO, 64 - -.endm - -/********************************************************************* -* Macros for N=2, M=4 * -*********************************************************************/ - -.macro LOAD2x4_1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 32 - addi BO, BO, 16 - -.endm - -.macro KERNEL2x4_I1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - - addi AO, AO, 32 - addi BO, BO, 16 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - -.endm - -.macro KERNEL2x4_1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - - addi AO, AO, 32 - addi BO, BO, 16 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - -.endm - -.macro KERNEL2x4_2 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 32 - addi BO, BO, 16 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - - xvmaddadp vs40, vs8, vs29 - xvmaddadp vs41, vs9, vs29 - -.endm - -.macro KERNEL2x4_E2 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - - xvmaddadp vs40, vs8, vs29 - xvmaddadp vs41, vs9, vs29 - -.endm - -.macro KERNEL2x4_SUBI1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 32 - addi BO, BO, 16 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - -.endm - -.macro KERNEL2x4_SUB1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 32 - addi BO, BO, 16 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - -.endm - -.macro SAVE2x4 - - mr T1, CO - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 - lxvd2x vs1, o16, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r - xvmaddadp vs1, vs33, alpha_r -#else - xvmuldp vs0, vs32, alpha_r - xvmuldp vs1, vs33, alpha_r -#endif - - stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxvd2x vs8, 0, T1 - lxvd2x vs9, o16, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs8, vs40, alpha_r - xvmaddadp vs9, vs41, alpha_r -#else - xvmuldp vs8, vs40, alpha_r - xvmuldp vs9, vs41, alpha_r -#endif - - stxvd2x vs8, 0, T1 - stxvd2x vs9, o16, T1 - - addi CO, CO, 32 - -.endm - -/********************************************************************* -* Macros for N=2, M=2 * -*********************************************************************/ - -.macro LOAD2x2_1 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 16 - addi BO, BO, 16 - -.endm - -.macro KERNEL2x2_I1 - - lxvd2x vs8, 0, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - - addi AO, AO, 16 - addi BO, BO, 16 - - - 
xvmuldp vs32, vs0, vs24 - - xvmuldp vs40, vs0, vs25 - -.endm - -.macro KERNEL2x2_1 - - lxvd2x vs8, 0, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - - addi AO, AO, 16 - addi BO, BO, 16 - - - xvmaddadp vs32, vs0, vs24 - - xvmaddadp vs40, vs0, vs25 - -.endm - -.macro KERNEL2x2_2 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 16 - addi BO, BO, 16 - - - xvmaddadp vs32, vs8, vs28 - - xvmaddadp vs40, vs8, vs29 - -.endm - -.macro KERNEL2x2_E2 - - - xvmaddadp vs32, vs8, vs28 - - xvmaddadp vs40, vs8, vs29 - -.endm - -.macro KERNEL2x2_SUBI1 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 16 - addi BO, BO, 16 - - - xvmuldp vs32, vs0, vs24 - - xvmuldp vs40, vs0, vs25 - -.endm - -.macro KERNEL2x2_SUB1 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 16 - addi BO, BO, 16 - - - xvmaddadp vs32, vs0, vs24 - - xvmaddadp vs40, vs0, vs25 - -.endm - -.macro SAVE2x2 - - mr T1, CO - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r -#else - xvmuldp vs0, vs32, alpha_r -#endif - - stxvd2x vs0, 0, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxvd2x vs8, 0, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs8, vs40, alpha_r -#else - xvmuldp vs8, vs40, alpha_r -#endif - - stxvd2x vs8, 0, T1 - - addi CO, CO, 16 - -.endm - -/********************************************************************* -* Macros for N=2, M=1 * -*********************************************************************/ - -.macro LOAD2x1_1 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - lxsdx vs25, o8, BO - - addi AO, AO, 8 - addi BO, BO, 16 - -.endm - -.macro KERNEL2x1_I1 - - lxsdx vs8, 0, AO - - lxsdx vs28, 0, BO - lxsdx vs29, o8, BO - - addi AO, AO, 8 - addi BO, BO, 16 - - - xsmuldp vs32, vs0, vs24 - - xsmuldp vs40, vs0, vs25 - -.endm - -.macro KERNEL2x1_1 - - lxsdx vs8, 0, AO - - lxsdx vs28, 0, BO - lxsdx vs29, o8, BO - - addi AO, AO, 8 - addi BO, BO, 16 - - - xsmaddadp vs32, vs0, vs24 - - xsmaddadp vs40, vs0, vs25 - -.endm - -.macro KERNEL2x1_2 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - lxsdx vs25, o8, BO - - addi AO, AO, 8 - addi BO, BO, 16 - - - xsmaddadp vs32, vs8, vs28 - - xsmaddadp vs40, vs8, vs29 - -.endm - -.macro KERNEL2x1_E2 - - - xsmaddadp vs32, vs8, vs28 - - xsmaddadp vs40, vs8, vs29 - -.endm - -.macro KERNEL2x1_SUBI1 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - lxsdx vs25, o8, BO - - addi AO, AO, 8 - addi BO, BO, 16 - - - xsmuldp vs32, vs0, vs24 - - xsmuldp vs40, vs0, vs25 - -.endm - -.macro KERNEL2x1_SUB1 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - lxsdx vs25, o8, BO - - addi AO, AO, 8 - addi BO, BO, 16 - - - xsmaddadp vs32, vs0, vs24 - - xsmaddadp vs40, vs0, vs25 - -.endm - -.macro SAVE2x1 - - mr T1, CO - -#ifndef TRMMKERNEL - lxsdx vs0, 0, T1 -#endif - -#ifndef TRMMKERNEL - xsmaddadp vs0, vs32, alpha_r -#else - xsmuldp vs0, vs32, alpha_r -#endif - - stxsdx vs0, 0, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxsdx vs8, 0, T1 -#endif - -#ifndef TRMMKERNEL - xsmaddadp vs8, vs40, alpha_r -#else - xsmuldp vs8, vs40, alpha_r -#endif - - stxsdx vs8, 0, T1 - - addi CO, CO, 8 - -.endm - -/********************************************************************* -* Macros for N=1, M=16 * -*********************************************************************/ - -.macro LOAD1x16_1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO - lxvd2x vs6, 
o32, AO - lxvd2x vs7, o48, AO - - addi AO, AO, 64 - -.endm - -.macro KERNEL1x16_I1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - lxvd2x vs10, o32, AO - lxvd2x vs11, o48, AO - - lxvdsx vs28, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - lxvd2x vs12, 0, AO - lxvd2x vs13, o16, AO - lxvd2x vs14, o32, AO - lxvd2x vs15, o48, AO - - addi AO, AO, 64 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - xvmuldp vs36, vs4, vs24 - xvmuldp vs37, vs5, vs24 - xvmuldp vs38, vs6, vs24 - xvmuldp vs39, vs7, vs24 - -.endm - -.macro KERNEL1x16_1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - lxvd2x vs10, o32, AO - lxvd2x vs11, o48, AO - - lxvdsx vs28, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - lxvd2x vs12, 0, AO - lxvd2x vs13, o16, AO - lxvd2x vs14, o32, AO - lxvd2x vs15, o48, AO - - addi AO, AO, 64 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - xvmaddadp vs36, vs4, vs24 - xvmaddadp vs37, vs5, vs24 - xvmaddadp vs38, vs6, vs24 - xvmaddadp vs39, vs7, vs24 - -.endm - -.macro KERNEL1x16_2 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO - lxvd2x vs6, o32, AO - lxvd2x vs7, o48, AO - - addi AO, AO, 64 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 - xvmaddadp vs36, vs12, vs28 - xvmaddadp vs37, vs13, vs28 - xvmaddadp vs38, vs14, vs28 - xvmaddadp vs39, vs15, vs28 - -.endm - -.macro KERNEL1x16_E2 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 - xvmaddadp vs36, vs12, vs28 - xvmaddadp vs37, vs13, vs28 - xvmaddadp vs38, vs14, vs28 - xvmaddadp vs39, vs15, vs28 - -.endm - -.macro KERNEL1x16_SUBI1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO - lxvd2x vs6, o32, AO - lxvd2x vs7, o48, AO - - addi AO, AO, 64 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - xvmuldp vs36, vs4, vs24 - xvmuldp vs37, vs5, vs24 - xvmuldp vs38, vs6, vs24 - xvmuldp vs39, vs7, vs24 - -.endm - -.macro KERNEL1x16_SUB1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO - lxvd2x vs6, o32, AO - lxvd2x vs7, o48, AO - - addi AO, AO, 64 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - xvmaddadp vs36, vs4, vs24 - xvmaddadp vs37, vs5, vs24 - xvmaddadp vs38, vs6, vs24 - xvmaddadp vs39, vs7, vs24 - -.endm - -.macro SAVE1x16 - - mr T1, CO - addi T2, T1, 64 - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 - lxvd2x vs1, o16, T1 - lxvd2x vs2, o32, T1 - lxvd2x vs3, o48, T1 - - lxvd2x vs4, 0, T2 - lxvd2x vs5, o16, T2 - lxvd2x vs6, o32, T2 - lxvd2x vs7, o48, T2 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r - xvmaddadp vs1, vs33, alpha_r - xvmaddadp vs2, vs34, alpha_r - xvmaddadp vs3, vs35, alpha_r - xvmaddadp vs4, vs36, alpha_r - xvmaddadp vs5, vs37, alpha_r - xvmaddadp vs6, vs38, alpha_r - xvmaddadp vs7, vs39, alpha_r -#else - xvmuldp vs0, vs32, alpha_r - xvmuldp vs1, vs33, alpha_r - xvmuldp vs2, vs34, alpha_r - xvmuldp vs3, vs35, alpha_r - 
xvmuldp vs4, vs36, alpha_r - xvmuldp vs5, vs37, alpha_r - xvmuldp vs6, vs38, alpha_r - xvmuldp vs7, vs39, alpha_r -#endif - - stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - stxvd2x vs2, o32, T1 - stxvd2x vs3, o48, T1 - - stxvd2x vs4, 0, T2 - stxvd2x vs5, o16, T2 - stxvd2x vs6, o32, T2 - stxvd2x vs7, o48, T2 - - addi CO, CO, 128 - -.endm - -/********************************************************************* -* Macros for N=4, M=8 * -*********************************************************************/ - -.macro LOAD1x8_1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - -.endm - -.macro KERNEL1x8_I1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - lxvd2x vs10, o32, AO - lxvd2x vs11, o48, AO - - lxvdsx vs28, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - -.endm - -.macro KERNEL1x8_1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - lxvd2x vs10, o32, AO - lxvd2x vs11, o48, AO - - lxvdsx vs28, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - -.endm - -.macro KERNEL1x8_2 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 - -.endm - -.macro KERNEL1x8_E2 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 - -.endm - -.macro KERNEL1x8_SUBI1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - -.endm - -.macro KERNEL1x8_SUB1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - -.endm - -.macro SAVE1x8 - - mr T1, CO - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 - lxvd2x vs1, o16, T1 - lxvd2x vs2, o32, T1 - lxvd2x vs3, o48, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r - xvmaddadp vs1, vs33, alpha_r - xvmaddadp vs2, vs34, alpha_r - xvmaddadp vs3, vs35, alpha_r -#else - xvmuldp vs0, vs32, alpha_r - xvmuldp vs1, vs33, alpha_r - xvmuldp vs2, vs34, alpha_r - xvmuldp vs3, vs35, alpha_r -#endif - - stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - stxvd2x vs2, o32, T1 - stxvd2x vs3, o48, T1 - - addi CO, CO, 64 - -.endm - -/********************************************************************* -* Macros for N=1, M=4 * -*********************************************************************/ - -.macro LOAD1x4_1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 32 - addi BO, BO, 8 - -.endm - -.macro KERNEL1x4_I1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - - lxvdsx vs28, 0, BO - - addi AO, AO, 32 - addi BO, BO, 8 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - -.endm - -.macro KERNEL1x4_1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - - lxvdsx vs28, 0, BO - - addi AO, AO, 32 - addi BO, BO, 8 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - -.endm - -.macro 
KERNEL1x4_2 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 32 - addi BO, BO, 8 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - -.endm - -.macro KERNEL1x4_E2 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - -.endm - -.macro KERNEL1x4_SUBI1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 32 - addi BO, BO, 8 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - -.endm - -.macro KERNEL1x4_SUB1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 32 - addi BO, BO, 8 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - -.endm - -.macro SAVE1x4 - - mr T1, CO - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 - lxvd2x vs1, o16, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r - xvmaddadp vs1, vs33, alpha_r -#else - xvmuldp vs0, vs32, alpha_r - xvmuldp vs1, vs33, alpha_r -#endif - - stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - - addi CO, CO, 32 - -.endm - -/********************************************************************* -* Macros for N=1, M=2 * -*********************************************************************/ - -.macro LOAD1x2_1 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 16 - addi BO, BO, 8 - -.endm - -.macro KERNEL1x2_I1 - - lxvd2x vs8, 0, AO - - lxvdsx vs28, 0, BO - - addi AO, AO, 16 - addi BO, BO, 8 - - - xvmuldp vs32, vs0, vs24 - -.endm - -.macro KERNEL1x2_1 - - lxvd2x vs8, 0, AO - - lxvdsx vs28, 0, BO - - addi AO, AO, 16 - addi BO, BO, 8 - - - xvmaddadp vs32, vs0, vs24 - -.endm - -.macro KERNEL1x2_2 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 16 - addi BO, BO, 8 - - - xvmaddadp vs32, vs8, vs28 - -.endm - -.macro KERNEL1x2_E2 - - - xvmaddadp vs32, vs8, vs28 - -.endm - -.macro KERNEL1x2_SUBI1 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 16 - addi BO, BO, 8 - - - xvmuldp vs32, vs0, vs24 - -.endm - -.macro KERNEL1x2_SUB1 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 16 - addi BO, BO, 8 - - - xvmaddadp vs32, vs0, vs24 - -.endm - -.macro SAVE1x2 - - mr T1, CO - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r -#else - xvmuldp vs0, vs32, alpha_r -#endif - - stxvd2x vs0, 0, T1 - - addi CO, CO, 16 - -.endm - -/********************************************************************* -* Macros for N=1, M=1 * -*********************************************************************/ - -.macro LOAD1x1_1 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - - addi AO, AO, 8 - addi BO, BO, 8 - -.endm - -.macro KERNEL1x1_I1 - - lxsdx vs8, 0, AO - - lxsdx vs28, 0, BO - - addi AO, AO, 8 - addi BO, BO, 8 - - - xsmuldp vs32, vs0, vs24 - -.endm - -.macro KERNEL1x1_1 - - lxsdx vs8, 0, AO - - lxsdx vs28, 0, BO - - addi AO, AO, 8 - addi BO, BO, 8 - - - xsmaddadp vs32, vs0, vs24 - -.endm - -.macro KERNEL1x1_2 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - - addi AO, AO, 8 - addi BO, BO, 8 - - - xsmaddadp vs32, vs8, vs28 - -.endm - -.macro KERNEL1x1_E2 - - - xsmaddadp vs32, vs8, vs28 - -.endm - -.macro KERNEL1x1_SUBI1 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - - addi AO, AO, 8 - addi BO, BO, 8 - - - xsmuldp vs32, vs0, vs24 - -.endm - -.macro KERNEL1x1_SUB1 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - - addi AO, AO, 8 - addi BO, BO, 8 - - - xsmaddadp vs32, vs0, vs24 - -.endm - -.macro SAVE1x1 - - mr T1, CO - -#ifndef TRMMKERNEL - lxsdx vs0, 0, T1 -#endif - -#ifndef TRMMKERNEL - xsmaddadp vs0, vs32, alpha_r -#else - xsmuldp vs0, vs32, alpha_r -#endif - - 
stxsdx vs0, 0, T1 - - addi CO, CO, 8 - -.endm - - - - -/****************************TRMM POINTER REFRESH MACROSES*************************/ - -.macro SHIFT_REG REG1,REG2,SHIFT_VAL - .if \SHIFT_VAL==16 - slwi \REG1, \REG2, 7 - .elseif \SHIFT_VAL==8 - slwi \REG1, \REG2, 6 - .elseif \SHIFT_VAL==4 - slwi \REG1, \REG2, 5 - .elseif \SHIFT_VAL==2 - slwi \REG1, \REG2, 4 - .elseif \SHIFT_VAL==1 - slwi \REG1, \REG2, 3 - .endif -.endm - -/* -//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -// ptrbb = bb; -// #else -// ptrba += off*16; -// ptrbb = bb + off*2; -// #endif -*/ -.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B - #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - /* ptrbb = bb;*/ - mr \PTR_B,\B_VAL /* refresh BPOINT */ - - #else - /* - // ptrba =ptrba+ off*C_A; - // ptrbb = bb + off*C_B; - */ - SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ - SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ - add \PTR_B, \B_VAL , T4 /* Add values to BO */ - add \PTR_A, \PTR_A, T2 /* Add values to AO */ - #endif -.endm - - -/* -// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -// temp = bk-off; -// #elif defined(LEFT) -// temp = off+16; // number of values in A -// #else -// temp = off+2; // number of values in B -// #endif -*/ -.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B - #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - /* temp = bk-off;*/ - sub \TEMP_BK,\BK_VAL,\OFF_VAL - - #elif defined(LEFT) - /* temp = off+INCR_A; // number of values in A */ - addi \TEMP_BK, \OFF_VAL, \INCR_A - #else - /* temp = off+INCR_B // number of values in B*/ - addi \TEMP_BK,\OFF_VAL, \INCR_B - #endif - -.endm -/* -// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -// temp = bk - off; -// #ifdef LEFT -// temp -= 16; // number of values in A -// #else -// temp -= 2; // number of values in B -// #endif -// ptrba += temp*16; -// ptrbb += temp*2; -// #endif - -// #ifdef LEFT -// off += 16; // number of values in A -// #endif -*/ - - -.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B - - #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - /*temp = bk - off;*/ - sub \TEMP_BK,\BK_VAL,\OFF_VAL - #ifdef LEFT - /*temp -= 8; // number of values in A*/ - addi \TEMP_BK,\TEMP_BK,-\C_A - #else - /*temp -= 4; // number of values in B*/ - addi \TEMP_BK,\TEMP_BK,-\C_B - #endif - /*ptrba += temp*C_A; - ptrbb += temp*C_B;*/ - SHIFT_REG T4,\TEMP_BK,\C_A - SHIFT_REG T2,\TEMP_BK,\C_B - add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ - add \PTR_B, \PTR_B,T2 - - #endif - - #ifdef LEFT - /*off += 8; // number of values in A*/ - addi \OFF_VAL,\OFF_VAL,\C_A - #endif +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* Abdelrauf(quickwritereader@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +/********************************************************************* +* Macros for N=4, M=16 * +*********************************************************************/ +.macro LOAD4x16_1 + LOAD4x16 1 +.endm + +.macro LOAD4x16_0 + LOAD4x16 0 +.endm +.macro LOAD4x16 Zero + + lxv vs24, 0(BO) + lxv vs26, 16(BO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 + + lxv vs0, 0(AO) + lxv vs1, 16(AO) + lxv vs2, 32(AO) + lxv vs3, 48(AO) + + + lxv vs4, 64(AO) + lxv vs5, 80(AO) + lxv vs6, 96(AO) + lxv vs7, 112(AO) +.if \Zero==1 + xxlxor vs32,vs32,vs32 + xxlxor vs33,vs33,vs33 + xxlxor vs34,vs34,vs34 + xxlxor vs35,vs35,vs35 + xxlxor vs36,vs36,vs36 + xxlxor vs37,vs37,vs37 + xxlxor vs38,vs38,vs38 + xxlxor vs39,vs39,vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + xxlxor vs54, vs54, vs54 + xxlxor vs55, vs55, vs55 + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs58, vs58, vs58 + xxlxor vs59, vs59, vs59 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 + xxlxor vs62, vs62, vs62 + xxlxor vs63, vs63, vs63 +.endif +.endm + + +#define unit_size 8 +#define DISP32(ind,disp) (ind*unit_size*32+disp) +#define DISP16(ind,disp) (ind*unit_size*16+disp) +#define DISP8(ind,disp) (ind*unit_size*8+disp) +#define DISP4(ind,disp) (ind*unit_size*4+disp) +#define DISP2(ind,disp) (ind*unit_size*2+disp) +#define DISP1(ind,disp) (ind*unit_size+disp) + +.macro KERNEL4x16_L1_L2 Index,IsLast + KERNEL4x16_L1_L2_I AO,BO, 0,0,0, \Index,\IsLast,0 +.endm + + + +.macro KERNEL4x16_I1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I AO,BO,1,\OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I1_L2_2 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I AO,BO, 0,\OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I AO,BO, 0,\OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL4x16_I2_L2 
AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I \AREG,\BREG,1,\OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I2_L2_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I \AREG,\BREG, 0,\OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I2_L2_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I \AREG,\BREG, 0,\OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL4x16_L1_L2_I AREG,BREG, First, OffsetA,OffsetB, Index,IsLast ,Complete + +.if \First ==1 + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 +.else + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 +.endif + lxv vs8, DISP32(\Index,0+\OffsetA)(\AREG) + lxv vs9, DISP32(\Index,16+\OffsetA)(\AREG) + lxv vs10, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs11, DISP32(\Index,48+\OffsetA)(\AREG) +.if \First ==1 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 +.else + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 +.endif + lxv vs28, DISP8(\Index,0 +\OffsetB)(\BREG) + lxv vs30, DISP8(\Index,16 +\OffsetB)(\BREG) + xxpermdi vs29, vs28, vs28,2 + xxpermdi vs31, vs30, vs30,2 +.if \First ==1 + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + + + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 + + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 + + +.else + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + + + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + +.endif + lxv vs12, DISP32(\Index,64+\OffsetA)(\AREG) + lxv vs13, DISP32(\Index,80+\OffsetA)(\AREG) +.if \First ==1 + xvmuldp vs52, vs4, vs26 + xvmuldp vs53, vs5, vs26 + xvmuldp vs54, vs6, vs26 + xvmuldp vs55, vs7, vs26 + +.else + xvmaddadp vs52, vs4, vs26 + xvmaddadp vs53, vs5, vs26 + xvmaddadp vs54, vs6, vs26 + xvmaddadp vs55, vs7, vs26 +.endif + lxv vs14, DISP32(\Index,96+\OffsetA)(\AREG) + lxv vs15, DISP32(\Index,112+\OffsetA)(\AREG) +.if \First ==1 + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 + + + + xvmuldp vs60, vs4, vs27 + xvmuldp vs61, vs5, vs27 + xvmuldp vs62, vs6, vs27 + xvmuldp vs63, vs7, vs27 + +.else + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + + + + xvmaddadp vs60, vs4, vs27 + xvmaddadp vs61, vs5, vs27 + xvmaddadp vs62, vs6, vs27 + xvmaddadp vs63, vs7, vs27 +.endif + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 +.if \Complete==0 + lxv vs0, DISP32(\Index,128+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,144+\OffsetA)(\AREG) +.endif + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 +.if \Complete==0 + lxv vs24, DISP8(\Index,32 +\OffsetB)(\BREG) + lxv vs26, DISP8(\Index,48 +\OffsetB)(\BREG) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 +.endif + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + 
xvmaddadp vs43, vs11, vs29 +.if \Complete==0 + lxv vs2, DISP32(\Index,160+\OffsetA)(\AREG) + lxv vs3, DISP32(\Index,176+\OffsetA)(\AREG) +.endif + xvmaddadp vs44, vs12, vs29 + xvmaddadp vs45, vs13, vs29 + xvmaddadp vs46, vs14, vs29 + xvmaddadp vs47, vs15, vs29 + + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + xvmaddadp vs50, vs10, vs30 + xvmaddadp vs51, vs11, vs30 +.if \Complete==0 + lxv vs4, DISP32(\Index,192+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,208+\OffsetA)(\AREG) +.endif + xvmaddadp vs52, vs12, vs30 + xvmaddadp vs53, vs13, vs30 + xvmaddadp vs54, vs14, vs30 + xvmaddadp vs55, vs15, vs30 +.if \Complete==0 + lxv vs6, DISP32(\Index,224+\OffsetA)(\AREG) + lxv vs7, DISP32(\Index,240+\OffsetA)(\AREG) +.endif + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + xvmaddadp vs58, vs10, vs31 + xvmaddadp vs59, vs11, vs31 + + + xvmaddadp vs60, vs12, vs31 + + xvmaddadp vs61, vs13, vs31 + xvmaddadp vs62, vs14, vs31 + + xvmaddadp vs63, vs15, vs31 + .if \IsLast==1 + .if \Complete==1 + addi \AREG, \AREG, DISP32(\Index,128+\OffsetA) + addi \BREG, \BREG, DISP8(\Index,32+\OffsetB) + .else + addi \AREG, \AREG, DISP32(\Index,256) + addi \BREG, \BREG, DISP8(\Index,64) + .endif + .endif + + +.endm + + + +.macro KERNEL4x16 First + + lxv vs24, 0(BO) + lxv vs26, 16(BO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 + + lxv vs0, 0(AO) + lxv vs1, 16(AO) + lxv vs2, 32(AO) + lxv vs3, 48(AO) + + lxv vs4, 64(AO) + lxv vs5, 80(AO) + lxv vs6, 96(AO) + lxv vs7, 112(AO) + + + + addi BO, BO, 32 + addi AO, AO, 128 + +.if \First==1 + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 + xvmuldp vs52, vs4, vs26 + xvmuldp vs53, vs5, vs26 + xvmuldp vs54, vs6, vs26 + xvmuldp vs55, vs7, vs26 + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 + xvmuldp vs60, vs4, vs27 + xvmuldp vs61, vs5, vs27 + xvmuldp vs62, vs6, vs27 + xvmuldp vs63, vs7, vs27 +.else + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + + xvmaddadp vs52, vs4, vs26 + xvmaddadp vs53, vs5, vs26 + xvmaddadp vs54, vs6, vs26 + xvmaddadp vs55, vs7, vs26 + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + xvmaddadp vs60, vs4, vs27 + xvmaddadp vs61, vs5, vs27 + xvmaddadp vs62, vs6, vs27 + xvmaddadp vs63, vs7, vs27 + +.endif +.endm + +.macro SAVE4x16_REGS + add C2, CO, LDC + add C3, C2, LDC + add C4, C3, LDC +.endm + +.macro SAVE4x16 +#ifndef TRMMKERNEL + lxv vs0, 0(CO) + lxv vs2, 16(CO) + lxv vs4, 32(CO) + lxv vs6, 48(CO) +#endif + xxpermdi vs8, vs40,vs32,1 + 
xxpermdi vs9 ,vs32,vs40,1 +#ifndef TRMMKERNEL + lxv vs24, 64(CO) + lxv vs26, 80(CO) + lxv vs28, 96(CO) + lxv vs30, 112(CO) +#endif + xxpermdi vs10, vs41,vs33,1 + xxpermdi vs11 ,vs33,vs41,1 +#ifndef TRMMKERNEL + lxv vs1, 0(C2) + lxv vs3, 16(C2) + lxv vs5, 32(C2) + lxv vs7, 48(C2) +#endif + xxpermdi vs12, vs42,vs34,1 + xxpermdi vs13 ,vs34,vs42,1 +#ifndef TRMMKERNEL + lxv vs25, 64(C2) + lxv vs27, 80(C2) +#endif + xxpermdi vs14, vs43,vs35,1 + xxpermdi vs15 ,vs35,vs43,1 +#ifndef TRMMKERNEL + lxv vs29, 96(C2) + lxv vs31, 112(C2) +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs8, alpha_r + xvmaddadp vs1, vs9, alpha_r + xvmaddadp vs2, vs10, alpha_r + xvmaddadp vs3, vs11, alpha_r +#else + xvmuldp vs0, vs8, alpha_r + xvmuldp vs1, vs9, alpha_r + xvmuldp vs2, vs10, alpha_r + xvmuldp vs3, vs11, alpha_r + +#endif + xxpermdi vs8, vs44,vs36,1 + xxpermdi vs9 ,vs36,vs44,1 + xxpermdi vs10, vs45,vs37,1 + xxpermdi vs11 ,vs37,vs45,1 +#ifndef TRMMKERNEL + xvmaddadp vs4, vs12, alpha_r + xvmaddadp vs5, vs13, alpha_r + xvmaddadp vs6, vs14, alpha_r + xvmaddadp vs7, vs15, alpha_r +#else + xvmuldp vs4, vs12, alpha_r + xvmuldp vs5, vs13, alpha_r + xvmuldp vs6, vs14, alpha_r + xvmuldp vs7, vs15, alpha_r +#endif + xxpermdi vs12, vs46,vs38,1 + xxpermdi vs13 ,vs38,vs46,1 + xxpermdi vs14, vs47,vs39,1 + xxpermdi vs15 ,vs39,vs47,1 + +#ifndef TRMMKERNEL + xvmaddadp vs24, vs8, alpha_r + xvmaddadp vs25, vs9, alpha_r + xvmaddadp vs26, vs10, alpha_r + xvmaddadp vs27, vs11, alpha_r + + xvmaddadp vs28, vs12, alpha_r + xvmaddadp vs29, vs13, alpha_r + xvmaddadp vs30, vs14, alpha_r + xvmaddadp vs31, vs15, alpha_r +#else + xvmuldp vs24, vs8, alpha_r + xvmuldp vs25, vs9, alpha_r + xvmuldp vs26, vs10, alpha_r + xvmuldp vs27, vs11, alpha_r + + xvmuldp vs28, vs12, alpha_r + xvmuldp vs29, vs13, alpha_r + xvmuldp vs30, vs14, alpha_r + xvmuldp vs31, vs15, alpha_r + +#endif + stxv vs0, 0(CO) + stxv vs2, 16(CO) + stxv vs4, 32(CO) + stxv vs6, 48(CO) + + stxv vs24, 64(CO) + stxv vs26, 80(CO) + stxv vs28, 96(CO) + stxv vs30, 112(CO) + + stxv vs1, 0(C2) + stxv vs3, 16(C2) + stxv vs5, 32(C2) + stxv vs7, 48(C2) + + stxv vs25, 64(C2) + stxv vs27, 80(C2) + stxv vs29, 96(C2) + stxv vs31, 112(C2) +#ifndef TRMMKERNEL + lxv vs0, 0(C3) + lxv vs2, 16(C3) + lxv vs4, 32(C3) + lxv vs6, 48(C3) +#endif + xxpermdi vs8, vs56,vs48,1 + xxpermdi vs9 ,vs48,vs56,1 +#ifndef TRMMKERNEL + lxv vs24, 64(C3) + lxv vs26, 80(C3) +#endif + xxpermdi vs10, vs57,vs49,1 + xxpermdi vs11 ,vs49,vs57,1 +#ifndef TRMMKERNEL + lxv vs28, 96(C3) + lxv vs30, 112(C3) +#endif + xxpermdi vs12, vs58,vs50,1 + xxpermdi vs13 ,vs50,vs58,1 +#ifndef TRMMKERNEL + lxv vs1, 0(C4) + lxv vs3, 16(C4) +#endif + xxpermdi vs14, vs59,vs51,1 + xxpermdi vs15 ,vs51,vs59,1 +#ifndef TRMMKERNEL + lxv vs5, 32(C4) + lxv vs7, 48(C4) + + lxv vs25, 64(C4) + lxv vs27, 80(C4) + lxv vs29, 96(C4) + lxv vs31, 112(C4) +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs8, alpha_r + xvmaddadp vs1, vs9, alpha_r + xvmaddadp vs2, vs10, alpha_r + xvmaddadp vs3, vs11, alpha_r +#else + xvmuldp vs0, vs8, alpha_r + xvmuldp vs1, vs9, alpha_r + xvmuldp vs2, vs10, alpha_r + xvmuldp vs3, vs11, alpha_r + +#endif + + xxpermdi vs8, vs60,vs52,1 + xxpermdi vs9 ,vs52,vs60,1 + xxpermdi vs10, vs61,vs53,1 + xxpermdi vs11 ,vs53,vs61,1 +#ifndef TRMMKERNEL + xvmaddadp vs4, vs12, alpha_r + xvmaddadp vs5, vs13, alpha_r + xvmaddadp vs6, vs14, alpha_r + xvmaddadp vs7, vs15, alpha_r +#else + xvmuldp vs4, vs12, alpha_r + xvmuldp vs5, vs13, alpha_r + xvmuldp vs6, vs14, alpha_r + xvmuldp vs7, vs15, alpha_r +#endif + + + xxpermdi vs12, vs62,vs54,1 + xxpermdi vs13 
,vs54,vs62,1 + xxpermdi vs14, vs63,vs55,1 + xxpermdi vs15 ,vs55,vs63,1 +#ifndef TRMMKERNEL + xvmaddadp vs24, vs8, alpha_r + xvmaddadp vs25, vs9, alpha_r + xvmaddadp vs26, vs10, alpha_r + xvmaddadp vs27, vs11, alpha_r + + xvmaddadp vs28, vs12, alpha_r + xvmaddadp vs29, vs13, alpha_r + xvmaddadp vs30, vs14, alpha_r + xvmaddadp vs31, vs15, alpha_r +#else + xvmuldp vs24, vs8, alpha_r + xvmuldp vs25, vs9, alpha_r + xvmuldp vs26, vs10, alpha_r + xvmuldp vs27, vs11, alpha_r + + xvmuldp vs28, vs12, alpha_r + xvmuldp vs29, vs13, alpha_r + xvmuldp vs30, vs14, alpha_r + xvmuldp vs31, vs15, alpha_r +#endif + stxv vs0, 0(C3) + stxv vs2, 16(C3) + stxv vs4, 32(C3) + stxv vs6, 48(C3) + + stxv vs24, 64(C3) + stxv vs26, 80(C3) + stxv vs28, 96(C3) + stxv vs30, 112(C3) + + stxv vs1, 0(C4) + stxv vs3, 16(C4) + stxv vs5, 32(C4) + stxv vs7, 48(C4) + + stxv vs25, 64(C4) + stxv vs27, 80(C4) + stxv vs29, 96(C4) + stxv vs31, 112(C4) + + addi CO, CO, 128 +.endm + +/********************************************************************* +* Macros for N=4, M=8 * +*********************************************************************/ + +.macro LOAD4x8_1 + LOAD4x8 1 +.endm + +.macro LOAD4x8_0 + LOAD4x8 0 +.endm +.macro LOAD4x8 Zero + + lxv vs24, 0(BO) + lxv vs26, 16(BO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 + + lxv vs0, 0(AO) + lxv vs1, 16(AO) + lxv vs2, 32(AO) + lxv vs3, 48(AO) + + + +.if \Zero==1 + xxlxor vs32,vs32,vs32 + xxlxor vs33,vs33,vs33 + xxlxor vs34,vs34,vs34 + xxlxor vs35,vs35,vs35 + + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 + + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs58, vs58, vs58 + xxlxor vs59, vs59, vs59 + +.endif +.endm + + + +.macro KERNEL4x8_L1_L2 Index,IsLast + KERNEL4x8_L1_L2_I 0,0,0, \Index,\IsLast,0 +.endm + + + +.macro KERNEL4x8_I1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L2_I 1,\OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x8_I1_L2_2 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L2_I 0,\OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x8_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L2_I 0,\OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL4x8_L1_L2_I First, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP16(\Index,0+\OffsetA)(AO) + lxv vs9, DISP16(\Index,16+\OffsetA)(AO) +.if \First ==1 + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 +.else + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 +.endif + + lxv vs10, DISP16(\Index,32+\OffsetA)(AO) + lxv vs11, DISP16(\Index,48+\OffsetA)(AO) + + + +.if \First ==1 + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 + + +.else + + lxv vs28, DISP8(\Index,0 +\OffsetB)(BO) + lxv vs30, DISP8(\Index,16 +\OffsetB)(BO) + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + +.endif + xxpermdi vs29, vs28, vs28,2 + xxpermdi vs31, vs30, vs30,2 +.if \First ==1 + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 + 
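+/* First==1: the xvmuldp block above seeds the vs56-vs59 accumulators on the first K iteration; the .else path below accumulates into them with xvmaddadp */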
+.else + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + +.endif + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 +.if \Complete==0 + lxv vs0, DISP16(\Index,64+\OffsetA)(AO) + lxv vs1, DISP16(\Index,80+\OffsetA)(AO) +.endif + + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + +.if \Complete==0 + lxv vs2, DISP16(\Index,96+\OffsetA)(AO) + lxv vs3, DISP16(\Index,112+\OffsetA)(AO) +.endif + + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + xvmaddadp vs50, vs10, vs30 + xvmaddadp vs51, vs11, vs30 +.if \Complete==0 + lxv vs24, DISP8(\Index,32 +\OffsetB)(BO) + lxv vs26, DISP8(\Index,48 +\OffsetB)(BO) +.endif + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + xvmaddadp vs58, vs10, vs31 + xvmaddadp vs59, vs11, vs31 +.if \Complete==0 + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 +.endif + + .if \IsLast==1 + .if \Complete==1 + addi AO, AO, DISP16(\Index,64+\OffsetA) + addi BO, BO, DISP8(\Index,32+\OffsetB) + .else + addi AO, AO, DISP16(\Index,128) + addi BO, BO, DISP8(\Index,64) + .endif + .endif + + +.endm + + + +.macro KERNEL4x8 First + + lxv vs24, 0(BO) + lxv vs26, 16(BO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 + + lxv vs0, 0(AO) + lxv vs1, 16(AO) + lxv vs2, 32(AO) + lxv vs3, 48(AO) + + + + + addi BO, BO, 32 + addi AO, AO, 64 + +.if \First==1 + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 + + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 + +.else + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + + + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + + + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + + +.endif +.endm + + + +.macro SAVE4x8 + add T2, CO, LDC + add T3, T2, LDC + add T4, T3, LDC +#ifndef TRMMKERNEL + lxv vs0, 0(CO) + lxv vs2, 16(CO) +#endif + xxpermdi vs8, vs40,vs32,1 + xxpermdi vs9 ,vs32,vs40,1 +#ifndef TRMMKERNEL + lxv vs4, 32(CO) + lxv vs6, 48(CO) +#endif + xxpermdi vs10, vs41,vs33,1 + xxpermdi vs11 ,vs33,vs41,1 +#ifndef TRMMKERNEL + lxv vs1, 0(T2) + lxv vs3, 16(T2) +#endif + xxpermdi vs12, vs42,vs34,1 + xxpermdi vs13 ,vs34,vs42,1 +#ifndef TRMMKERNEL + lxv vs5, 32(T2) + lxv vs7, 48(T2) +#endif + xxpermdi vs14, vs43,vs35,1 + xxpermdi vs15 ,vs35,vs43,1 + + + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs8, alpha_r + xvmaddadp vs1, vs9, alpha_r + xvmaddadp vs2, vs10, alpha_r + xvmaddadp vs3, vs11, alpha_r + + xvmaddadp vs4, vs12, alpha_r + xvmaddadp vs5, vs13, alpha_r + xvmaddadp vs6, vs14, alpha_r + xvmaddadp vs7, vs15, alpha_r +#else + xvmuldp vs0, vs8, alpha_r + xvmuldp vs1, vs9, alpha_r + xvmuldp vs2, vs10, alpha_r + xvmuldp vs3, vs11, alpha_r + + xvmuldp vs4, vs12, alpha_r + xvmuldp vs5, vs13, alpha_r + xvmuldp vs6, vs14, alpha_r + xvmuldp vs7, vs15, alpha_r + +#endif + + + stxv vs0, 0(CO) + stxv vs2, 16(CO) + 
stxv vs4, 32(CO) + stxv vs6, 48(CO) + + + stxv vs1, 0(T2) + stxv vs3, 16(T2) + stxv vs5, 32(T2) + stxv vs7, 48(T2) + + + xxpermdi vs8, vs56,vs48,1 + xxpermdi vs9 ,vs48,vs56,1 +#ifndef TRMMKERNEL + lxv vs0, 0(T3) + lxv vs2, 16(T3) +#endif + xxpermdi vs10, vs57,vs49,1 + xxpermdi vs11 ,vs49,vs57,1 +#ifndef TRMMKERNEL + lxv vs4, 32(T3) + lxv vs6, 48(T3) +#endif + xxpermdi vs12, vs58,vs50,1 + xxpermdi vs13 ,vs50,vs58,1 +#ifndef TRMMKERNEL + lxv vs1, 0(T4) + lxv vs3, 16(T4) +#endif + xxpermdi vs14, vs59,vs51,1 + xxpermdi vs15 ,vs51,vs59,1 +#ifndef TRMMKERNEL + lxv vs5, 32(T4) + lxv vs7, 48(T4) + + + xvmaddadp vs0, vs8, alpha_r + xvmaddadp vs1, vs9, alpha_r + xvmaddadp vs2, vs10, alpha_r + xvmaddadp vs3, vs11, alpha_r + + + + xvmaddadp vs4, vs12, alpha_r + xvmaddadp vs5, vs13, alpha_r + xvmaddadp vs6, vs14, alpha_r + xvmaddadp vs7, vs15, alpha_r +#else + xvmuldp vs0, vs8, alpha_r + xvmuldp vs1, vs9, alpha_r + xvmuldp vs2, vs10, alpha_r + xvmuldp vs3, vs11, alpha_r + + + + xvmuldp vs4, vs12, alpha_r + xvmuldp vs5, vs13, alpha_r + xvmuldp vs6, vs14, alpha_r + xvmuldp vs7, vs15, alpha_r + +#endif + + + stxv vs0, 0(T3) + stxv vs2, 16(T3) + stxv vs4, 32(T3) + stxv vs6, 48(T3) + + + stxv vs1, 0(T4) + stxv vs3, 16(T4) + stxv vs5, 32(T4) + stxv vs7, 48(T4) + + + + addi CO, CO, 64 +.endm + + +/********************************************************************* +* Macros for N=4, M=4 * +*********************************************************************/ + +.macro LOAD4x4_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x4_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + +.endm + +.macro KERNEL4x4_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + +.endm + +.macro KERNEL4x4_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + +.endm + +.macro KERNEL4x4_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + +.endm + +.macro KERNEL4x4_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + + xvmuldp 
vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + +.endm + +.macro KERNEL4x4_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + +.endm + +.macro SAVE4x4 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs48, alpha_r + xvmaddadp vs1, vs49, alpha_r +#else + xvmuldp vs0, vs48, alpha_r + xvmuldp vs1, vs49, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs56, alpha_r + xvmaddadp vs9, vs57, alpha_r +#else + xvmuldp vs8, vs56, alpha_r + xvmuldp vs9, vs57, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + + addi CO, CO, 32 + +.endm + +/********************************************************************* +* Macros for N=4, M=2 * +*********************************************************************/ + +.macro LOAD4x2_1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x2_I1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + + xvmuldp vs40, vs0, vs25 + + xvmuldp vs48, vs0, vs26 + + xvmuldp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x2_1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + + xvmaddadp vs40, vs0, vs25 + + xvmaddadp vs48, vs0, vs26 + + xvmaddadp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x2_2 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmaddadp vs32, vs8, vs28 + + xvmaddadp vs40, vs8, vs29 + + xvmaddadp vs48, vs8, vs30 + + xvmaddadp vs56, vs8, vs31 + +.endm + +.macro KERNEL4x2_E2 + + + xvmaddadp vs32, vs8, vs28 + + xvmaddadp vs40, vs8, vs29 + + xvmaddadp vs48, vs8, vs30 + + xvmaddadp vs56, vs8, vs31 + +.endm + +.macro KERNEL4x2_SUBI1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + + xvmuldp vs40, vs0, vs25 + + xvmuldp vs48, vs0, vs26 + + xvmuldp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x2_SUB1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + 
lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + + xvmaddadp vs40, vs0, vs25 + + xvmaddadp vs48, vs0, vs26 + + xvmaddadp vs56, vs0, vs27 + +.endm + +.macro SAVE4x2 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r +#else + xvmuldp vs0, vs32, alpha_r +#endif + + stxvd2x vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r +#else + xvmuldp vs8, vs40, alpha_r +#endif + + stxvd2x vs8, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs48, alpha_r +#else + xvmuldp vs0, vs48, alpha_r +#endif + + stxvd2x vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs56, alpha_r +#else + xvmuldp vs8, vs56, alpha_r +#endif + + stxvd2x vs8, 0, T1 + + addi CO, CO, 16 + +.endm + +/********************************************************************* +* Macros for N=4, M=1 * +*********************************************************************/ + +.macro LOAD4x1_1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + lxsdx vs26, o16, BO + lxsdx vs27, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x1_I1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + lxsdx vs29, o8, BO + lxsdx vs30, o16, BO + lxsdx vs31, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs24 + + xsmuldp vs40, vs0, vs25 + + xsmuldp vs48, vs0, vs26 + + xsmuldp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x1_1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + lxsdx vs29, o8, BO + lxsdx vs30, o16, BO + lxsdx vs31, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs24 + + xsmaddadp vs40, vs0, vs25 + + xsmaddadp vs48, vs0, vs26 + + xsmaddadp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x1_2 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + lxsdx vs26, o16, BO + lxsdx vs27, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmaddadp vs32, vs8, vs28 + + xsmaddadp vs40, vs8, vs29 + + xsmaddadp vs48, vs8, vs30 + + xsmaddadp vs56, vs8, vs31 + +.endm + +.macro KERNEL4x1_E2 + + + xsmaddadp vs32, vs8, vs28 + + xsmaddadp vs40, vs8, vs29 + + xsmaddadp vs48, vs8, vs30 + + xsmaddadp vs56, vs8, vs31 + +.endm + +.macro KERNEL4x1_SUBI1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + lxsdx vs26, o16, BO + lxsdx vs27, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs24 + + xsmuldp vs40, vs0, vs25 + + xsmuldp vs48, vs0, vs26 + + xsmuldp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x1_SUB1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + lxsdx vs26, o16, BO + lxsdx vs27, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs24 + + xsmaddadp vs40, vs0, vs25 + + xsmaddadp vs48, vs0, vs26 + + xsmaddadp vs56, vs0, vs27 + +.endm + +.macro SAVE4x1 + + mr T1, CO + +#ifndef TRMMKERNEL + lxsdx vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs0, vs32, alpha_r +#else + xsmuldp vs0, vs32, alpha_r +#endif + + stxsdx vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxsdx vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs8, vs40, alpha_r +#else + xsmuldp vs8, vs40, alpha_r +#endif + + stxsdx vs8, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxsdx vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs0, vs48, alpha_r +#else + xsmuldp vs0, vs48, 
alpha_r +#endif + + stxsdx vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxsdx vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs8, vs56, alpha_r +#else + xsmuldp vs8, vs56, alpha_r +#endif + + stxsdx vs8, 0, T1 + + addi CO, CO, 8 + +.endm + +/********************************************************************* +* Macros for N=2, M=16 * +*********************************************************************/ + +.macro LOAD2x16_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + +.endm + +.macro KERNEL2x16_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 + +.endm + +.macro KERNEL2x16_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + +.endm + +.macro KERNEL2x16_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + xvmaddadp vs44, vs12, vs29 + xvmaddadp vs45, vs13, vs29 + xvmaddadp vs46, vs14, vs29 + xvmaddadp vs47, vs15, vs29 + +.endm + +.macro KERNEL2x16_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + xvmaddadp vs44, vs12, vs29 + xvmaddadp vs45, vs13, vs29 + xvmaddadp vs46, vs14, vs29 + xvmaddadp vs47, vs15, vs29 + +.endm + +.macro KERNEL2x16_SUBI1 + + 
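+	/* KERNEL2x16_SUBI1: load 16 doubles of A (vs0-vs7) and two splatted B values
+	   (vs24, vs25), then start the accumulators vs32-vs47 with plain multiplies
+	   (xvmuldp); the matching KERNEL2x16_SUB1 accumulates with xvmaddadp instead. */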
lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 + +.endm + +.macro KERNEL2x16_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + +.endm + +.macro SAVE2x16 + + mr T1, CO + addi T2, T1, 64 + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 + + lxvd2x vs4, 0, T2 + lxvd2x vs5, o16, T2 + lxvd2x vs6, o32, T2 + lxvd2x vs7, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r + xvmaddadp vs4, vs36, alpha_r + xvmaddadp vs5, vs37, alpha_r + xvmaddadp vs6, vs38, alpha_r + xvmaddadp vs7, vs39, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r + xvmuldp vs4, vs36, alpha_r + xvmuldp vs5, vs37, alpha_r + xvmuldp vs6, vs38, alpha_r + xvmuldp vs7, vs39, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + stxvd2x vs4, 0, T2 + stxvd2x vs5, o16, T2 + stxvd2x vs6, o32, T2 + stxvd2x vs7, o48, T2 + + add T1, T1, LDC + add T2, T2, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 + lxvd2x vs10, o32, T1 + lxvd2x vs11, o48, T1 + + lxvd2x vs12, 0, T2 + lxvd2x vs13, o16, T2 + lxvd2x vs14, o32, T2 + lxvd2x vs15, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r + xvmaddadp vs10, vs42, alpha_r + xvmaddadp vs11, vs43, alpha_r + xvmaddadp vs12, vs44, alpha_r + xvmaddadp vs13, vs45, alpha_r + xvmaddadp vs14, vs46, alpha_r + xvmaddadp vs15, vs47, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r + xvmuldp vs10, vs42, alpha_r + xvmuldp vs11, vs43, alpha_r + xvmuldp vs12, vs44, alpha_r + xvmuldp vs13, vs45, alpha_r + xvmuldp vs14, vs46, alpha_r + xvmuldp vs15, vs47, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + stxvd2x vs12, 0, T2 + stxvd2x vs13, o16, T2 + stxvd2x vs14, o32, T2 + stxvd2x vs15, o48, T2 + + addi CO, CO, 128 + +.endm + +/********************************************************************* +* Macros for N=4, M=8 * 
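+* (the LOAD2x8_1 / KERNEL2x8_* / SAVE2x8 macros that follow handle the N=2, M=8 tile) *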
+*********************************************************************/ + +.macro LOAD2x8_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + +.endm + +.macro KERNEL2x8_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + +.endm + +.macro KERNEL2x8_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + +.endm + +.macro KERNEL2x8_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + +.endm + +.macro KERNEL2x8_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + +.endm + +.macro KERNEL2x8_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + +.endm + +.macro KERNEL2x8_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + +.endm + +.macro SAVE2x8 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 + lxvd2x vs10, o32, T1 + lxvd2x vs11, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r + xvmaddadp vs10, vs42, alpha_r + xvmaddadp vs11, vs43, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r + 
xvmuldp vs10, vs42, alpha_r + xvmuldp vs11, vs43, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + addi CO, CO, 64 + +.endm + +/********************************************************************* +* Macros for N=2, M=4 * +*********************************************************************/ + +.macro LOAD2x4_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + +.endm + +.macro KERNEL2x4_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + +.endm + +.macro KERNEL2x4_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + +.endm + +.macro KERNEL2x4_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + +.endm + +.macro KERNEL2x4_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + +.endm + +.macro KERNEL2x4_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + +.endm + +.macro KERNEL2x4_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + +.endm + +.macro SAVE2x4 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + + addi CO, CO, 32 + +.endm + +/********************************************************************* +* Macros for N=2, M=2 * +*********************************************************************/ + +.macro LOAD2x2_1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + +.endm + +.macro KERNEL2x2_I1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + + xvmuldp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x2_1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + + xvmaddadp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x2_2 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmaddadp vs32, vs8, vs28 + + xvmaddadp vs40, vs8, 
vs29 + +.endm + +.macro KERNEL2x2_E2 + + + xvmaddadp vs32, vs8, vs28 + + xvmaddadp vs40, vs8, vs29 + +.endm + +.macro KERNEL2x2_SUBI1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + + xvmuldp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x2_SUB1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + + xvmaddadp vs40, vs0, vs25 + +.endm + +.macro SAVE2x2 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r +#else + xvmuldp vs0, vs32, alpha_r +#endif + + stxvd2x vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r +#else + xvmuldp vs8, vs40, alpha_r +#endif + + stxvd2x vs8, 0, T1 + + addi CO, CO, 16 + +.endm + +/********************************************************************* +* Macros for N=2, M=1 * +*********************************************************************/ + +.macro LOAD2x1_1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + +.endm + +.macro KERNEL2x1_I1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + lxsdx vs29, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmuldp vs32, vs0, vs24 + + xsmuldp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x1_1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + lxsdx vs29, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmaddadp vs32, vs0, vs24 + + xsmaddadp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x1_2 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmaddadp vs32, vs8, vs28 + + xsmaddadp vs40, vs8, vs29 + +.endm + +.macro KERNEL2x1_E2 + + + xsmaddadp vs32, vs8, vs28 + + xsmaddadp vs40, vs8, vs29 + +.endm + +.macro KERNEL2x1_SUBI1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmuldp vs32, vs0, vs24 + + xsmuldp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x1_SUB1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmaddadp vs32, vs0, vs24 + + xsmaddadp vs40, vs0, vs25 + +.endm + +.macro SAVE2x1 + + mr T1, CO + +#ifndef TRMMKERNEL + lxsdx vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs0, vs32, alpha_r +#else + xsmuldp vs0, vs32, alpha_r +#endif + + stxsdx vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxsdx vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs8, vs40, alpha_r +#else + xsmuldp vs8, vs40, alpha_r +#endif + + stxsdx vs8, 0, T1 + + addi CO, CO, 8 + +.endm + +/********************************************************************* +* Macros for N=1, M=16 * +*********************************************************************/ + +.macro LOAD1x16_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + +.endm + +.macro KERNEL1x16_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, 
vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + +.endm + +.macro KERNEL1x16_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + +.endm + +.macro KERNEL1x16_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + +.endm + +.macro KERNEL1x16_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + +.endm + +.macro KERNEL1x16_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + +.endm + +.macro KERNEL1x16_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + +.endm + +.macro SAVE1x16 + + mr T1, CO + addi T2, T1, 64 + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 + + lxvd2x vs4, 0, T2 + lxvd2x vs5, o16, T2 + lxvd2x vs6, o32, T2 + lxvd2x vs7, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r + xvmaddadp vs4, vs36, alpha_r + xvmaddadp vs5, vs37, alpha_r + xvmaddadp vs6, vs38, alpha_r + xvmaddadp vs7, vs39, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r + xvmuldp vs4, vs36, alpha_r + xvmuldp vs5, vs37, alpha_r + xvmuldp vs6, vs38, alpha_r + xvmuldp vs7, vs39, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + stxvd2x vs4, 0, T2 + stxvd2x vs5, o16, T2 + stxvd2x vs6, o32, T2 + stxvd2x vs7, o48, T2 + + addi CO, CO, 128 + +.endm + +/********************************************************************* +* Macros for N=4, M=8 * 
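+* (the LOAD1x8_1 / KERNEL1x8_* / SAVE1x8 macros that follow handle the N=1, M=8 tile) *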
+*********************************************************************/ + +.macro LOAD1x8_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + +.endm + +.macro KERNEL1x8_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + +.endm + +.macro KERNEL1x8_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + +.endm + +.macro KERNEL1x8_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + +.endm + +.macro KERNEL1x8_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + +.endm + +.macro KERNEL1x8_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + +.endm + +.macro KERNEL1x8_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + +.endm + +.macro SAVE1x8 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + addi CO, CO, 64 + +.endm + +/********************************************************************* +* Macros for N=1, M=4 * +*********************************************************************/ + +.macro LOAD1x4_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + +.endm + +.macro KERNEL1x4_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + +.endm + +.macro KERNEL1x4_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + +.endm + +.macro KERNEL1x4_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + +.endm + +.macro KERNEL1x4_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + +.endm + +.macro KERNEL1x4_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, 
vs24 + xvmuldp vs33, vs1, vs24 + +.endm + +.macro KERNEL1x4_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + +.endm + +.macro SAVE1x4 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + + addi CO, CO, 32 + +.endm + +/********************************************************************* +* Macros for N=1, M=2 * +*********************************************************************/ + +.macro LOAD1x2_1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + +.endm + +.macro KERNEL1x2_I1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x2_1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x2_2 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmaddadp vs32, vs8, vs28 + +.endm + +.macro KERNEL1x2_E2 + + + xvmaddadp vs32, vs8, vs28 + +.endm + +.macro KERNEL1x2_SUBI1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x2_SUB1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + +.endm + +.macro SAVE1x2 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r +#else + xvmuldp vs0, vs32, alpha_r +#endif + + stxvd2x vs0, 0, T1 + + addi CO, CO, 16 + +.endm + +/********************************************************************* +* Macros for N=1, M=1 * +*********************************************************************/ + +.macro LOAD1x1_1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + +.endm + +.macro KERNEL1x1_I1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmuldp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x1_1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmaddadp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x1_2 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmaddadp vs32, vs8, vs28 + +.endm + +.macro KERNEL1x1_E2 + + + xsmaddadp vs32, vs8, vs28 + +.endm + +.macro KERNEL1x1_SUBI1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmuldp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x1_SUB1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmaddadp vs32, vs0, vs24 + +.endm + +.macro SAVE1x1 + + mr T1, CO + +#ifndef TRMMKERNEL + lxsdx vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs0, vs32, alpha_r +#else + xsmuldp vs0, vs32, alpha_r +#endif + + stxsdx vs0, 0, T1 + + addi CO, CO, 8 + +.endm + + + + +/****************************TRMM POINTER REFRESH MACROSES*************************/ + +.macro SHIFT_REG REG1,REG2,SHIFT_VAL + .if \SHIFT_VAL==16 + slwi \REG1, \REG2, 7 + .elseif \SHIFT_VAL==8 + slwi \REG1, \REG2, 6 + .elseif \SHIFT_VAL==4 + slwi \REG1, \REG2, 5 + .elseif \SHIFT_VAL==2 + slwi \REG1, \REG2, 4 + .elseif \SHIFT_VAL==1 + slwi \REG1, \REG2, 3 + .endif +.endm + +/* 
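+// (SHIFT_REG REG1,REG2,C above turns an element count into a byte offset for 8-byte
+//  doubles, i.e. REG1 = REG2 * C * 8, implemented as a left shift: C==16 -> slwi by 7,
+//  C==8 -> by 6, ... C==1 -> by 3.)
+// The pointer refresh below is roughly the following C, with 'off' the current TRMM offset: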
+//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// ptrbb = bb; +// #else +// ptrba += off*16; +// ptrbb = bb + off*2; +// #endif +*/ +.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B + #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /* ptrbb = bb;*/ + mr \PTR_B,\B_VAL /* refresh BPOINT */ + + #else + /* + // ptrba =ptrba+ off*C_A; + // ptrbb = bb + off*C_B; + */ + SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ + SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ + add \PTR_B, \B_VAL , T4 /* Add values to BO */ + add \PTR_A, \PTR_A, T2 /* Add values to AO */ + #endif +.endm + + +/* +// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +// temp = bk-off; +// #elif defined(LEFT) +// temp = off+16; // number of values in A +// #else +// temp = off+2; // number of values in B +// #endif +*/ +.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B + #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + /* temp = bk-off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + + #elif defined(LEFT) + /* temp = off+INCR_A; // number of values in A */ + addi \TEMP_BK, \OFF_VAL, \INCR_A + #else + /* temp = off+INCR_B // number of values in B*/ + addi \TEMP_BK,\OFF_VAL, \INCR_B + #endif + +.endm +/* +// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// temp = bk - off; +// #ifdef LEFT +// temp -= 16; // number of values in A +// #else +// temp -= 2; // number of values in B +// #endif +// ptrba += temp*16; +// ptrbb += temp*2; +// #endif + +// #ifdef LEFT +// off += 16; // number of values in A +// #endif +*/ + + +.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B + + #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /*temp = bk - off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + #ifdef LEFT + /*temp -= 8; // number of values in A*/ + addi \TEMP_BK,\TEMP_BK,-\C_A + #else + /*temp -= 4; // number of values in B*/ + addi \TEMP_BK,\TEMP_BK,-\C_B + #endif + /*ptrba += temp*C_A; + ptrbb += temp*C_B;*/ + SHIFT_REG T4,\TEMP_BK,\C_A + SHIFT_REG T2,\TEMP_BK,\C_B + add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ + add \PTR_B, \PTR_B,T2 + + #endif + + #ifdef LEFT + /*off += 8; // number of values in A*/ + addi \OFF_VAL,\OFF_VAL,\C_A + #endif .endm \ No newline at end of file diff --git a/kernel/power/icamax.c b/kernel/power/icamax.c index bd74d20e5..58dcdec5a 100644 --- a/kernel/power/icamax.c +++ b/kernel/power/icamax.c @@ -1,328 +1,328 @@ -/*************************************************************************** -Copyright (c) 2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. 
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - - -#include "common.h" -#include -#include -#if defined(DOUBLE) - #define ABS fabs -#else - #define ABS fabsf -#endif -#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) - -#define USE_MASK_PERMUTATIONS 1 //with this type of permutation gcc output a little faster code - -#if !defined(USE_MASK_PERMUTATIONS) - -static inline __attribute__((always_inline)) __vector float mvec_mergee(__vector float a,__vector float b ){ - __vector float result; - __asm__ ( - "vmrgew %0,%1,%2;\n" - : "=v" (result) - : "v" (a), - "v" (b) - : ); - return result; -} - -static inline __attribute__((always_inline)) __vector float mvec_mergeo(__vector float a,__vector float b ){ - __vector float result; - __asm__ ( - "vmrgow %0,%1,%2;\n" - : "=v" (result) - : "v" (a), - "v" (b) - : ); - return result; -} - -#endif - -/** - * Find maximum index - * Warning: requirements n>0 and n % 32 == 0 - * @param n - * @param x pointer to the vector - * @param maxf (out) maximum absolute value .( only for output ) - * @return index - */ -static BLASLONG ciamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { - - BLASLONG index; - BLASLONG i=0; -#if defined(USE_MASK_PERMUTATIONS) - register __vector unsigned int static_index0 = {0,1,2,3}; -#else - register __vector unsigned int static_index0 = {2,0,3,1}; -#endif - register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register - register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} - register __vector unsigned int static_index1=static_index0 +temp0; - register __vector unsigned int static_index2=static_index0 +temp1; - register __vector unsigned int static_index3=static_index1 +temp1; - temp0=vec_xor(temp0,temp0); - temp1=temp1 <<1 ; //{16,16,16,16} - register __vector unsigned int temp_add=temp1 <<1; //{32,32,32,32} - register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} - register __vector float quadruple_values={0,0,0,0}; - - register __vector float * v_ptrx=(__vector float *)x; -#if defined(USE_MASK_PERMUTATIONS) - register __vector unsigned char real_pack_mask = { 0,1,2,3,8,9,10,11,16,17,18,19, 24,25,26,27}; - register __vector unsigned char image_pack_mask= {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}; -#endif - for(; i31 - - //find final quadruple from 32 elements - r2=vec_cmpgt(vv0,vf0); - ind2 = vec_sel( indf0,indv0,r2); - vv0= vec_sel(vf0,vv0,r2); - //get asbolute index - ind2+=temp0; - //compare with old quadruple and update - r1=vec_cmpgt(vv0,quadruple_values); - quadruple_indices = vec_sel( quadruple_indices,ind2,r1); - quadruple_values= vec_sel(quadruple_values,vv0,r1); - - temp0+=temp_add; - } - - //now we have to chose from 4 values and 4 different indices - // we will compare pairwise if pairs 
are exactly the same we will choose minimum between index - // otherwise we will assign index of the maximum value - float a1,a2,a3,a4; - unsigned int i1,i2,i3,i4; - a1=vec_extract(quadruple_values,0); - a2=vec_extract(quadruple_values,1); - a3=vec_extract(quadruple_values,2); - a4=vec_extract(quadruple_values,3); - i1=vec_extract(quadruple_indices,0); - i2=vec_extract(quadruple_indices,1); - i3=vec_extract(quadruple_indices,2); - i4=vec_extract(quadruple_indices,3); - if(a1==a2){ - index=i1>i2?i2:i1; - }else if(a2>a1){ - index=i2; - a1=a2; - }else{ - index= i1; - } - - if(a4==a3){ - i1=i3>i4?i4:i3; - }else if(a4>a3){ - i1=i4; - a3=a4; - }else{ - i1= i3; - } - - if(a1==a3){ - index=i1>index?index:i1; - *maxf=a1; - }else if(a3>a1){ - index=i1; - *maxf=a3; - }else{ - *maxf=a1; - } - return index; - -} - - - - - - -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i = 0; - BLASLONG ix = 0; - FLOAT maxf = 0; - BLASLONG max = 0; - BLASLONG inc_x2; - - if (n <= 0 || inc_x <= 0) return(max); - - if (inc_x == 1) { - - BLASLONG n1 = n & -32; - if (n1 > 0) { - - max = ciamax_kernel_32(n1, x, &maxf); - i = n1; - ix = n1 << 1; - } - - while(i < n) - { - if( CABS1(x,ix) > maxf ) - { - max = i; - maxf = CABS1(x,ix); - } - ix += 2; - i++; - } - return (max + 1); - - } else { - - inc_x2 = 2 * inc_x; - - maxf = CABS1(x,0); - ix += inc_x2; - i++; - - while(i < n) - { - if( CABS1(x,ix) > maxf ) - { - max = i; - maxf = CABS1(x,ix); - } - ix += inc_x2; - i++; - } - return (max + 1); - } - -} - - +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +#include "common.h" +#include +#include +#if defined(DOUBLE) + #define ABS fabs +#else + #define ABS fabsf +#endif +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + +#define USE_MASK_PERMUTATIONS 1 //with this type of permutation gcc output a little faster code + +#if !defined(USE_MASK_PERMUTATIONS) + +static inline __attribute__((always_inline)) __vector float mvec_mergee(__vector float a,__vector float b ){ + __vector float result; + __asm__ ( + "vmrgew %0,%1,%2;\n" + : "=v" (result) + : "v" (a), + "v" (b) + : ); + return result; +} + +static inline __attribute__((always_inline)) __vector float mvec_mergeo(__vector float a,__vector float b ){ + __vector float result; + __asm__ ( + "vmrgow %0,%1,%2;\n" + : "=v" (result) + : "v" (a), + "v" (b) + : ); + return result; +} + +#endif + +/** + * Find maximum index + * Warning: requirements n>0 and n % 32 == 0 + * @param n + * @param x pointer to the vector + * @param maxf (out) maximum absolute value .( only for output ) + * @return index + */ +static BLASLONG ciamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { + + BLASLONG index; + BLASLONG i=0; +#if defined(USE_MASK_PERMUTATIONS) + register __vector unsigned int static_index0 = {0,1,2,3}; +#else + register __vector unsigned int static_index0 = {2,0,3,1}; +#endif + register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register + register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} + register __vector unsigned int static_index1=static_index0 +temp0; + register __vector unsigned int static_index2=static_index0 +temp1; + register __vector unsigned int static_index3=static_index1 +temp1; + temp0=vec_xor(temp0,temp0); + temp1=temp1 <<1 ; //{16,16,16,16} + register __vector unsigned int temp_add=temp1 <<1; //{32,32,32,32} + register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} + register __vector float quadruple_values={0,0,0,0}; + + register __vector float * v_ptrx=(__vector float *)x; +#if defined(USE_MASK_PERMUTATIONS) + register __vector unsigned char real_pack_mask = { 0,1,2,3,8,9,10,11,16,17,18,19, 24,25,26,27}; + register __vector unsigned char image_pack_mask= {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}; +#endif + for(; i31 + + //find final quadruple from 32 elements + r2=vec_cmpgt(vv0,vf0); + ind2 = vec_sel( indf0,indv0,r2); + vv0= vec_sel(vf0,vv0,r2); + //get asbolute index + ind2+=temp0; + //compare with old quadruple and update + r1=vec_cmpgt(vv0,quadruple_values); + quadruple_indices = vec_sel( quadruple_indices,ind2,r1); + quadruple_values= vec_sel(quadruple_values,vv0,r1); + + temp0+=temp_add; + } + + //now we have to chose from 4 values and 4 different indices + // we will compare pairwise if pairs are exactly the same we will choose minimum between index + // otherwise we will assign index of the maximum value + float a1,a2,a3,a4; + unsigned int i1,i2,i3,i4; + a1=vec_extract(quadruple_values,0); + a2=vec_extract(quadruple_values,1); + a3=vec_extract(quadruple_values,2); + a4=vec_extract(quadruple_values,3); + i1=vec_extract(quadruple_indices,0); + i2=vec_extract(quadruple_indices,1); + i3=vec_extract(quadruple_indices,2); + i4=vec_extract(quadruple_indices,3); + if(a1==a2){ + index=i1>i2?i2:i1; + }else if(a2>a1){ + index=i2; + a1=a2; + }else{ + index= i1; + } + + if(a4==a3){ + i1=i3>i4?i4:i3; + }else if(a4>a3){ + i1=i4; + a3=a4; + }else{ + i1= i3; + } + + if(a1==a3){ + index=i1>index?index:i1; + *maxf=a1; + }else if(a3>a1){ + index=i1; + *maxf=a3; + 
}else{ + *maxf=a1; + } + return index; + +} + + + + + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT maxf = 0; + BLASLONG max = 0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(max); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + max = ciamax_kernel_32(n1, x, &maxf); + i = n1; + ix = n1 << 1; + } + + while(i < n) + { + if( CABS1(x,ix) > maxf ) + { + max = i; + maxf = CABS1(x,ix); + } + ix += 2; + i++; + } + return (max + 1); + + } else { + + inc_x2 = 2 * inc_x; + + maxf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) > maxf ) + { + max = i; + maxf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return (max + 1); + } + +} + + diff --git a/kernel/power/icamin.c b/kernel/power/icamin.c index 336766245..843370c6c 100644 --- a/kernel/power/icamin.c +++ b/kernel/power/icamin.c @@ -1,266 +1,266 @@ -/*************************************************************************** -Copyright (c) 2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - - -#include "common.h" -#include -#include -#if defined(DOUBLE) - #define ABS fabs -#else - #define ABS fabsf -#endif -#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) - - - - -/** - * Find minimum index - * Warning: requirements n>0 and n % 32 == 0 - * @param n - * @param x pointer to the vector - * @param minf (out) minimum absolute value .( only for output ) - * @return index - */ -static BLASLONG ciamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { - - BLASLONG index; - BLASLONG i=0; - register __vector unsigned int static_index0 = {0,1,2,3}; - register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register - register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} - register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; - register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; - register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; - temp0=vec_xor(temp0,temp0); - temp1=temp1 <<1 ; //{16,16,16,16} - register __vector unsigned int temp_add=temp1 <<1; //{32,32,32,32} - register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} - float first_min=CABS1(x,0); - register __vector float quadruple_values={first_min,first_min,first_min,first_min}; - - register __vector float * v_ptrx=(__vector float *)x; - register __vector unsigned char real_pack_mask = { 0,1,2,3,8,9,10,11,16,17,18,19, 24,25,26,27}; - register __vector unsigned char image_pack_mask= {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}; - for(; i31 - - //find final quadruple from 32 elements - r2=vec_cmpgt(vf0,vv0); - ind2 = vec_sel( indf0,indv0,r2); - vv0= vec_sel(vf0,vv0,r2); - //get asbolute index - ind2+=temp0; - //compare with old quadruple and update - r1=vec_cmpgt(quadruple_values,vv0); - quadruple_indices = vec_sel( quadruple_indices,ind2,r1); - quadruple_values= vec_sel(quadruple_values,vv0,r1); - - temp0+=temp_add; - } - - //now we have to chose from 4 values and 4 different indices - // we will compare pairwise if pairs are exactly the same we will choose minimum between index - // otherwise we will assign index of the minimum value - float a1,a2,a3,a4; - unsigned int i1,i2,i3,i4; - a1=vec_extract(quadruple_values,0); - a2=vec_extract(quadruple_values,1); - a3=vec_extract(quadruple_values,2); - a4=vec_extract(quadruple_values,3); - i1=vec_extract(quadruple_indices,0); - i2=vec_extract(quadruple_indices,1); - i3=vec_extract(quadruple_indices,2); - i4=vec_extract(quadruple_indices,3); - if(a1==a2){ - index=i1>i2?i2:i1; - }else if(a2i4?i4:i3; - }else if(a4index?index:i1; - *minf=a1; - }else if(a3 0) { - - min = ciamin_kernel_32(n1, x, &minf); - i = n1; - ix = n1 << 1; - } - - - while(i < n) - { - if( CABS1(x,ix) < minf ) - { - min = i; - minf = CABS1(x,ix); - } - ix += 2; - i++; - } - return (min + 1); - - } else { - - inc_x2 = 2 * inc_x; - - minf = CABS1(x,0); - ix += inc_x2; - i++; - - while(i < n) - { - if( CABS1(x,ix) < minf ) - { - min = i; - minf = CABS1(x,ix); - } - ix += inc_x2; - i++; - } - return (min + 1); - } - -} - - +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" +#include +#include +#if defined(DOUBLE) + #define ABS fabs +#else + #define ABS fabsf +#endif +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + + + + +/** + * Find minimum index + * Warning: requirements n>0 and n % 32 == 0 + * @param n + * @param x pointer to the vector + * @param minf (out) minimum absolute value .( only for output ) + * @return index + */ +static BLASLONG ciamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { + + BLASLONG index; + BLASLONG i=0; + register __vector unsigned int static_index0 = {0,1,2,3}; + register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register + register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} + register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; + register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; + register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; + temp0=vec_xor(temp0,temp0); + temp1=temp1 <<1 ; //{16,16,16,16} + register __vector unsigned int temp_add=temp1 <<1; //{32,32,32,32} + register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} + float first_min=CABS1(x,0); + register __vector float quadruple_values={first_min,first_min,first_min,first_min}; + + register __vector float * v_ptrx=(__vector float *)x; + register __vector unsigned char real_pack_mask = { 0,1,2,3,8,9,10,11,16,17,18,19, 24,25,26,27}; + register __vector unsigned char image_pack_mask= {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}; + for(; i31 + + //find final quadruple from 32 elements + r2=vec_cmpgt(vf0,vv0); + ind2 = vec_sel( indf0,indv0,r2); + vv0= vec_sel(vf0,vv0,r2); + //get asbolute index + ind2+=temp0; + //compare with old quadruple and update + r1=vec_cmpgt(quadruple_values,vv0); + quadruple_indices = vec_sel( quadruple_indices,ind2,r1); + quadruple_values= vec_sel(quadruple_values,vv0,r1); + + temp0+=temp_add; + } + + //now we have to chose from 4 values and 4 different indices + // we will compare pairwise if pairs are exactly the same we will choose minimum between index + // otherwise we will assign index of the minimum value + float a1,a2,a3,a4; + 
unsigned int i1,i2,i3,i4; + a1=vec_extract(quadruple_values,0); + a2=vec_extract(quadruple_values,1); + a3=vec_extract(quadruple_values,2); + a4=vec_extract(quadruple_values,3); + i1=vec_extract(quadruple_indices,0); + i2=vec_extract(quadruple_indices,1); + i3=vec_extract(quadruple_indices,2); + i4=vec_extract(quadruple_indices,3); + if(a1==a2){ + index=i1>i2?i2:i1; + }else if(a2i4?i4:i3; + }else if(a4index?index:i1; + *minf=a1; + }else if(a3 0) { + + min = ciamin_kernel_32(n1, x, &minf); + i = n1; + ix = n1 << 1; + } + + + while(i < n) + { + if( CABS1(x,ix) < minf ) + { + min = i; + minf = CABS1(x,ix); + } + ix += 2; + i++; + } + return (min + 1); + + } else { + + inc_x2 = 2 * inc_x; + + minf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) < minf ) + { + min = i; + minf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return (min + 1); + } + +} + + diff --git a/kernel/power/isamax.c b/kernel/power/isamax.c index bf1af78d6..fb2dafec0 100644 --- a/kernel/power/isamax.c +++ b/kernel/power/isamax.c @@ -1,288 +1,288 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
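The scalar epilogue above (shared by the icamin, isamax and isamin kernels in this patch) reduces the four per-lane (value, index) candidates left by the vector loop pairwise: when two values tie, the smaller index wins; otherwise the index follows the winning value. A small C sketch of that reduction for the minimum case, with hypothetical helper names and assuming the same tie rule:

    /* Reduce two (value, index) candidates for a minimum search:
       equal values keep the smaller index, otherwise the smaller value wins. */
    static void min_pair(float *val, unsigned int *idx,
                         float v2, unsigned int i2) {
        if (*val == v2) {
            if (i2 < *idx) *idx = i2;
        } else if (v2 < *val) {
            *val = v2;
            *idx = i2;
        }
    }

    /* Collapse the four SIMD lanes (a1..a4, i1..i4) to one result,
       mirroring the pairwise if/else chain in the kernel epilogue. */
    static unsigned int reduce4_min(float a1, float a2, float a3, float a4,
                                    unsigned int i1, unsigned int i2,
                                    unsigned int i3, unsigned int i4,
                                    float *minf) {
        min_pair(&a1, &i1, a2, i2);   /* lanes 0 and 1 */
        min_pair(&a3, &i3, a4, i4);   /* lanes 2 and 3 */
        min_pair(&a1, &i1, a3, i3);   /* winners of each pair */
        *minf = a1;
        return i1;
    }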
- *****************************************************************************/ -#include "common.h" -#include -#include - - -#if defined(DOUBLE) - #define ABS fabs -#else - #define ABS fabsf -#endif - -/** - * Find maximum index - * Warning: requirements n>0 and n % 64 == 0 - * @param n - * @param x pointer to the vector - * @param maxf (out) maximum absolute value .( only for output ) - * @return index - */ -static BLASLONG siamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *maxf) { - BLASLONG index; - BLASLONG i=0; - register __vector unsigned int static_index0 = {0,1,2,3}; - register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register - register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} - register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; - register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; - register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; - temp0=vec_xor(temp0,temp0); - temp1=temp1 <<1 ; //{16,16,16,16} - register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} - register __vector float quadruple_values={0,0,0,0}; - register __vector float * v_ptrx=(__vector float *)x; - for(; ii2?i2:i1; - }else if(a2>a1){ - index=i2; - a1=a2; - }else{ - index= i1; - } - - if(a4==a3){ - i1=i3>i4?i4:i3; - }else if(a4>a3){ - i1=i4; - a3=a4; - }else{ - i1= i3; - } - - if(a1==a3){ - index=i1>index?index:i1; - *maxf=a1; - }else if(a3>a1){ - index=i1; - *maxf=a3; - }else{ - *maxf=a1; - } - return index; - -} - -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT maxf = 0.0; - BLASLONG max = 0; - - if (n <= 0 || inc_x <= 0) return (max); - - if (inc_x == 1) { - - BLASLONG n1 = n & -64; - if (n1 > 0) { - - max = siamax_kernel_64(n1, x, &maxf); - - i = n1; - } - - while (i < n) { - if (ABS(x[i]) > maxf) { - max = i; - maxf = ABS(x[i]); - } - i++; - } - return (max + 1); - - } else { - - BLASLONG n1 = n & -4; - while (j < n1) { - - if (ABS(x[i]) > maxf) { - max = j; - maxf = ABS(x[i]); - } - if (ABS(x[i + inc_x]) > maxf) { - max = j + 1; - maxf = ABS(x[i + inc_x]); - } - if (ABS(x[i + 2 * inc_x]) > maxf) { - max = j + 2; - maxf = ABS(x[i + 2 * inc_x]); - } - if (ABS(x[i + 3 * inc_x]) > maxf) { - max = j + 3; - maxf = ABS(x[i + 3 * inc_x]); - } - - i += inc_x * 4; - - j += 4; - - } - - - while (j < n) { - if (ABS(x[i]) > maxf) { - max = j; - maxf = ABS(x[i]); - } - i += inc_x; - j++; - } - return (max + 1); - } -} +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ +#include "common.h" +#include +#include + + +#if defined(DOUBLE) + #define ABS fabs +#else + #define ABS fabsf +#endif + +/** + * Find maximum index + * Warning: requirements n>0 and n % 64 == 0 + * @param n + * @param x pointer to the vector + * @param maxf (out) maximum absolute value .( only for output ) + * @return index + */ +static BLASLONG siamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *maxf) { + BLASLONG index; + BLASLONG i=0; + register __vector unsigned int static_index0 = {0,1,2,3}; + register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register + register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} + register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; + register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; + register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; + temp0=vec_xor(temp0,temp0); + temp1=temp1 <<1 ; //{16,16,16,16} + register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} + register __vector float quadruple_values={0,0,0,0}; + register __vector float * v_ptrx=(__vector float *)x; + for(; ii2?i2:i1; + }else if(a2>a1){ + index=i2; + a1=a2; + }else{ + index= i1; + } + + if(a4==a3){ + i1=i3>i4?i4:i3; + }else if(a4>a3){ + i1=i4; + a3=a4; + }else{ + i1= i3; + } + + if(a1==a3){ + index=i1>index?index:i1; + *maxf=a1; + }else if(a3>a1){ + index=i1; + *maxf=a3; + }else{ + *maxf=a1; + } + return index; + +} + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + BLASLONG max = 0; + + if (n <= 0 || inc_x <= 0) return (max); + + if (inc_x == 1) { + + BLASLONG n1 = n & -64; + if (n1 > 0) { + + max = siamax_kernel_64(n1, x, &maxf); + + i = n1; + } + + while (i < n) { + if (ABS(x[i]) > maxf) { + max = i; + maxf = ABS(x[i]); + } + i++; + } + return (max + 1); + + } else { + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (ABS(x[i]) > maxf) { + max = j; + maxf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) > maxf) { + max = j + 1; + maxf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) > maxf) { + max = j + 2; + maxf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) > maxf) { + max = j + 3; + maxf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (ABS(x[i]) > maxf) { + max = j; + maxf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (max + 1); + } +} diff --git a/kernel/power/isamin.c b/kernel/power/isamin.c index 1c1f0ad78..60c843f58 100644 --- a/kernel/power/isamin.c +++ b/kernel/power/isamin.c @@ -1,288 +1,288 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The 
OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - *****************************************************************************/ -#include "common.h" -#include -#include -#if defined(DOUBLE) - #define ABS fabs -#else - #define ABS fabsf -#endif -/** - * Find minimum index - * Warning: requirements n>0 and n % 64 == 0 - * @param n - * @param x pointer to the vector - * @param minf (out) minimum absolute value .( only for output ) - * @return index - */ -static BLASLONG siamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *minf) { - BLASLONG index; - BLASLONG i=0; - register __vector unsigned int static_index0 = {0,1,2,3}; - register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register - register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} - register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; - register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; - register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; - temp0=vec_xor(temp0,temp0); - temp1=temp1 <<1 ; //{16,16,16,16} - register __vector unsigned int quadruple_indices=static_index0;//{0,1,2,3}; - register __vector float * v_ptrx=(__vector float *)x; - register __vector float quadruple_values=vec_abs(v_ptrx[0]); - for(; ii2?i2:i1; - }else if(a2i4?i4:i3; - }else if(a4index?index:i1; - *minf=a1; - }else if(a3 0) { - - min = siamin_kernel_64(n1, x, &minf); - i = n1; - } - - while (i < n) { - if (ABS(x[i]) < minf) { - min = i; - minf = ABS(x[i]); - } - i++; - } - return (min + 1); - - } else { - - BLASLONG n1 = n & -4; - while (j < n1) { - - if (ABS(x[i]) < minf) { - min = j; - minf = ABS(x[i]); - } - if (ABS(x[i + inc_x]) < minf) { - min = j + 1; - minf = ABS(x[i + inc_x]); - } - if (ABS(x[i + 2 * inc_x]) < minf) { - min = j + 2; - minf = ABS(x[i + 2 * inc_x]); - } - if (ABS(x[i + 3 * inc_x]) < minf) { - min = j + 3; - minf = ABS(x[i + 3 * inc_x]); - } - - i += inc_x * 4; - - j += 4; - - } - - - while (j < n) { - if (ABS(x[i]) < minf) { - min = j; - minf = ABS(x[i]); - } - i += inc_x; - j++; - } - return (min + 1); - } -} 
+/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ +#include "common.h" +#include +#include +#if defined(DOUBLE) + #define ABS fabs +#else + #define ABS fabsf +#endif +/** + * Find minimum index + * Warning: requirements n>0 and n % 64 == 0 + * @param n + * @param x pointer to the vector + * @param minf (out) minimum absolute value .( only for output ) + * @return index + */ +static BLASLONG siamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *minf) { + BLASLONG index; + BLASLONG i=0; + register __vector unsigned int static_index0 = {0,1,2,3}; + register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register + register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} + register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; + register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; + register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; + temp0=vec_xor(temp0,temp0); + temp1=temp1 <<1 ; //{16,16,16,16} + register __vector unsigned int quadruple_indices=static_index0;//{0,1,2,3}; + register __vector float * v_ptrx=(__vector float *)x; + register __vector float quadruple_values=vec_abs(v_ptrx[0]); + for(; ii2?i2:i1; + }else if(a2i4?i4:i3; + }else if(a4index?index:i1; + *minf=a1; + }else if(a3 0) { + + min = siamin_kernel_64(n1, x, &minf); + i = n1; + } + + while (i < n) { + if (ABS(x[i]) < minf) { + min = i; + minf = ABS(x[i]); + } + i++; + } + return (min + 1); + + } else { + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (ABS(x[i]) < minf) { + min = j; + minf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) < minf) { + min = j + 1; + minf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) < minf) { + min = j + 2; + minf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) < minf) { + min = j + 3; + minf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (ABS(x[i]) < 
minf) { + min = j; + minf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (min + 1); + } +} diff --git a/kernel/power/sgemm_kernel_power9.S b/kernel/power/sgemm_kernel_power9.S index 7a0f3143e..5cdc83d87 100644 --- a/kernel/power/sgemm_kernel_power9.S +++ b/kernel/power/sgemm_kernel_power9.S @@ -1,272 +1,272 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
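For inc_x != 1 these isamin/isamax kernels fall back to a plain strided scan, unrolled by four: the main loop covers n & -4 logical elements with four compares per iteration, and a short tail loop finishes the rest. A compact C reference sketch of that pattern for the minimum case (hypothetical name isamin_strided_ref, same 1-based return convention; the initial-value choice here is only illustrative):

    #include <math.h>
    #include <stdint.h>

    typedef int64_t BLASLONG;

    /* Strided isamin reference: scan n elements spaced inc_x apart,
       4 compares per main-loop iteration, scalar tail for the remainder. */
    static BLASLONG isamin_strided_ref(BLASLONG n, const float *x, BLASLONG inc_x) {
        if (n <= 0 || inc_x <= 0) return 0;
        BLASLONG i = 0, j = 0, min = 0;
        float minf = fabsf(x[0]);
        BLASLONG n1 = n & -4;                 /* largest multiple of 4 <= n */
        while (j < n1) {
            for (int k = 0; k < 4; k++) {     /* the kernel writes this out by hand */
                float v = fabsf(x[i + k * inc_x]);
                if (v < minf) { minf = v; min = j + k; }
            }
            i += 4 * inc_x;
            j += 4;
        }
        while (j < n) {                       /* remainder */
            float v = fabsf(x[i]);
            if (v < minf) { minf = v; min = j; }
            i += inc_x;
            j++;
        }
        return min + 1;                       /* 1-based BLAS index */
    }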
-*****************************************************************************/ - -#define ASSEMBLER -#include "common.h" -#include "def_vsx.h" - - -#define LOAD ld -#define STACKSIZE (512 ) -#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ -#define M r3 -#define N r4 -#define K r5 - - -#define A r7 -#define B r8 -#define C r9 -#define LDC r10 -#define OFFSET r6 - - - -#define alpha_r vs20 -#define save_permute_1 vs21 -#define save_permute_2 vs22 -#define permute_mask vs23 -#define o0 0 - - -#define T1 r11 -#define T2 r12 -#define T3 r14 -#define T4 r15 -#define T5 r16 -#define T6 r17 -#define L r18 -#define T7 r19 -#define T8 r20 -#define TEMP_REG r21 -#define I r22 -#define J r23 -#define AO r24 -#define BO r25 -#define CO r26 -#define T9 r27 -#define T10 r28 -#define T11 r29 - -#define T12 r30 -#define T13 r31 - -#include "sgemm_macros_power9.S" - -.equ perm_const1, 0x0405060700010203 -.equ perm_const2, 0x0c0d0e0f08090a0b -.equ save_permute_11, 0x1415161718191a1b -.equ save_permute_12, 0x0405060708090a0b -.equ save_permute_21, 0x101112131c1d1e1f -.equ save_permute_22, 0x000102030c0d0e0f - - -#ifndef NEEDPARAM - - PROLOGUE - PROFCODE - - addi SP, SP, -STACKSIZE - mflr r0 - - - stfd f14, 0(SP) - stfd f15, 8(SP) - stfd f16, 16(SP) - stfd f17, 24(SP) - - stfd f18, 32(SP) - stfd f19, 40(SP) - stfd f20, 48(SP) - stfd f21, 56(SP) - - stfd f22, 64(SP) - stfd f23, 72(SP) - stfd f24, 80(SP) - stfd f25, 88(SP) - - stfd f26, 96(SP) - stfd f27, 104(SP) - stfd f28, 112(SP) - stfd f29, 120(SP) - - stfd f30, 128(SP) - stfd f31, 136(SP) - - - std r31, 144(SP) - std r30, 152(SP) - std r29, 160(SP) - std r28, 168(SP) - std r27, 176(SP) - std r26, 184(SP) - std r25, 192(SP) - std r24, 200(SP) - std r23, 208(SP) - std r22, 216(SP) - std r21, 224(SP) - std r20, 232(SP) - std r19, 240(SP) - std r18, 248(SP) - std r17, 256(SP) - std r16, 264(SP) - std r15, 272(SP) - std r14, 280(SP) - - - stxv vs52, 288(SP) - stxv vs53, 304(SP) - stxv vs54, 320(SP) - stxv vs55, 336(SP) - stxv vs56, 352(SP) - stxv vs57, 368(SP) - stxv vs58, 384(SP) - stxv vs59, 400(SP) - stxv vs60, 416(SP) - stxv vs61, 432(SP) - stxv vs62, 448(SP) - stxv vs63, 464(SP) - std r0, FLINK_SAVE(SP) - - -#if defined(TRMMKERNEL) - ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) -#endif - slwi LDC, LDC, 2 - - - - /*alpha is stored in f1. 
convert to single and splat*/ - xscvdpspn alpha_r,vs1 - xxspltw alpha_r,alpha_r,0 - -/*load reverse permute mask for big endian - uint128 = 0xc0d0e0f08090a0b0405060700010203 -*/ - - lis T2, perm_const2@highest - lis T1, perm_const1@highest - lis T3, save_permute_12@highest - lis T4, save_permute_11@highest - lis T5, save_permute_22@highest - lis T6, save_permute_21@highest - ori T2, T2, perm_const2@higher - ori T1, T1, perm_const1@higher - ori T3, T3, save_permute_12@higher - ori T4, T4, save_permute_11@higher - ori T5, T5, save_permute_22@higher - ori T6, T6, save_permute_21@higher - rldicr T2, T2, 32, 31 - rldicr T1, T1, 32, 31 - rldicr T3, T3, 32, 31 - rldicr T4, T4, 32, 31 - rldicr T5, T5, 32, 31 - rldicr T6, T6, 32, 31 - oris T2, T2, perm_const2@h - oris T1, T1, perm_const1@h - oris T3, T3, save_permute_12@h - oris T4, T4, save_permute_11@h - oris T5, T5, save_permute_22@h - oris T6, T6, save_permute_21@h - ori T2, T2, perm_const2@l - ori T1, T1, perm_const1@l - ori T3, T3, save_permute_12@l - ori T4, T4, save_permute_11@l - ori T5, T5, save_permute_22@l - ori T6, T6, save_permute_21@l - li r0,0 - mtvsrdd permute_mask,T2,T1 - mtvsrdd save_permute_1,T3,T4 - mtvsrdd save_permute_2,T5,T6 - -#include "sgemm_logic_power9.S" - -.L999: - lfd f14, 0(SP) - lfd f15, 8(SP) - lfd f16, 16(SP) - lfd f17, 24(SP) - - lfd f18, 32(SP) - lfd f19, 40(SP) - lfd f20, 48(SP) - lfd f21, 56(SP) - - lfd f22, 64(SP) - lfd f23, 72(SP) - lfd f24, 80(SP) - lfd f25, 88(SP) - - lfd f26, 96(SP) - lfd f27, 104(SP) - lfd f28, 112(SP) - lfd f29, 120(SP) - - lfd f30, 128(SP) - lfd f31, 136(SP) - - ld r31, 144(SP) - ld r30, 152(SP) - ld r29, 160(SP) - ld r28, 168(SP) - ld r27, 176(SP) - ld r26, 184(SP) - ld r25, 192(SP) - ld r24, 200(SP) - ld r23, 208(SP) - ld r22, 216(SP) - ld r21, 224(SP) - ld r20, 232(SP) - ld r19, 240(SP) - ld r18, 248(SP) - ld r17, 256(SP) - ld r16, 264(SP) - ld r15, 272(SP) - ld r14, 280(SP) - - ld r0, FLINK_SAVE(SP) - - lxv vs52, 288(SP) - lxv vs53, 304(SP) - lxv vs54, 320(SP) - lxv vs55, 336(SP) - lxv vs56, 352(SP) - lxv vs57, 368(SP) - lxv vs58, 384(SP) - lxv vs59, 400(SP) - mtlr r0 - lxv vs60, 416(SP) - lxv vs61, 432(SP) - lxv vs62, 448(SP) - lxv vs63, 464(SP) - - addi SP, SP, STACKSIZE - blr - - - EPILOGUE -#endif +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
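The prologue above materializes each 64-bit permute constant from four 16-bit pieces: lis/ori build the upper 32 bits, rldicr shifts them into place, and oris/ori fill in the lower 32 bits before mtvsrdd moves the pair into a VSX register. A C sketch of what that lis/ori/rldicr/oris/ori chain computes (hypothetical function name, for illustration only):

    #include <stdint.h>

    /* Compose a 64-bit constant from its four 16-bit pieces the way the
       lis/ori/rldicr/oris/ori sequence in the prologue does. */
    static uint64_t build64(uint16_t highest,  /* bits 63..48  (@highest) */
                            uint16_t higher,   /* bits 47..32  (@higher)  */
                            uint16_t high,     /* bits 31..16  (@h)       */
                            uint16_t low) {    /* bits 15..0   (@l)       */
        uint64_t r = (uint64_t)highest << 16;  /* lis    r, c@highest      */
        r |= higher;                           /* ori    r, r, c@higher    */
        r <<= 32;                              /* rldicr r, r, 32, 31      */
        r |= (uint64_t)high << 16;             /* oris   r, r, c@h         */
        r |= low;                              /* ori    r, r, c@l         */
        return r;
    }

    /* Example with the .equ above:
       build64(0x0405, 0x0607, 0x0001, 0x0203) == 0x0405060700010203 (perm_const1) */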
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + + +#define LOAD ld +#define STACKSIZE (512 ) +#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ +#define M r3 +#define N r4 +#define K r5 + + +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 + + + +#define alpha_r vs20 +#define save_permute_1 vs21 +#define save_permute_2 vs22 +#define permute_mask vs23 +#define o0 0 + + +#define T1 r11 +#define T2 r12 +#define T3 r14 +#define T4 r15 +#define T5 r16 +#define T6 r17 +#define L r18 +#define T7 r19 +#define T8 r20 +#define TEMP_REG r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define T9 r27 +#define T10 r28 +#define T11 r29 + +#define T12 r30 +#define T13 r31 + +#include "sgemm_macros_power9.S" + +.equ perm_const1, 0x0405060700010203 +.equ perm_const2, 0x0c0d0e0f08090a0b +.equ save_permute_11, 0x1415161718191a1b +.equ save_permute_12, 0x0405060708090a0b +.equ save_permute_21, 0x101112131c1d1e1f +.equ save_permute_22, 0x000102030c0d0e0f + + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + mflr r0 + + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + + stxv vs52, 288(SP) + stxv vs53, 304(SP) + stxv vs54, 320(SP) + stxv vs55, 336(SP) + stxv vs56, 352(SP) + stxv vs57, 368(SP) + stxv vs58, 384(SP) + stxv vs59, 400(SP) + stxv vs60, 416(SP) + stxv vs61, 432(SP) + stxv vs62, 448(SP) + stxv vs63, 464(SP) + std r0, FLINK_SAVE(SP) + + +#if defined(TRMMKERNEL) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) +#endif + slwi LDC, LDC, 2 + + + + /*alpha is stored in f1. 
convert to single and splat*/ + xscvdpspn alpha_r,vs1 + xxspltw alpha_r,alpha_r,0 + +/*load reverse permute mask for big endian + uint128 = 0xc0d0e0f08090a0b0405060700010203 +*/ + + lis T2, perm_const2@highest + lis T1, perm_const1@highest + lis T3, save_permute_12@highest + lis T4, save_permute_11@highest + lis T5, save_permute_22@highest + lis T6, save_permute_21@highest + ori T2, T2, perm_const2@higher + ori T1, T1, perm_const1@higher + ori T3, T3, save_permute_12@higher + ori T4, T4, save_permute_11@higher + ori T5, T5, save_permute_22@higher + ori T6, T6, save_permute_21@higher + rldicr T2, T2, 32, 31 + rldicr T1, T1, 32, 31 + rldicr T3, T3, 32, 31 + rldicr T4, T4, 32, 31 + rldicr T5, T5, 32, 31 + rldicr T6, T6, 32, 31 + oris T2, T2, perm_const2@h + oris T1, T1, perm_const1@h + oris T3, T3, save_permute_12@h + oris T4, T4, save_permute_11@h + oris T5, T5, save_permute_22@h + oris T6, T6, save_permute_21@h + ori T2, T2, perm_const2@l + ori T1, T1, perm_const1@l + ori T3, T3, save_permute_12@l + ori T4, T4, save_permute_11@l + ori T5, T5, save_permute_22@l + ori T6, T6, save_permute_21@l + li r0,0 + mtvsrdd permute_mask,T2,T1 + mtvsrdd save_permute_1,T3,T4 + mtvsrdd save_permute_2,T5,T6 + +#include "sgemm_logic_power9.S" + +.L999: + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + ld r0, FLINK_SAVE(SP) + + lxv vs52, 288(SP) + lxv vs53, 304(SP) + lxv vs54, 320(SP) + lxv vs55, 336(SP) + lxv vs56, 352(SP) + lxv vs57, 368(SP) + lxv vs58, 384(SP) + lxv vs59, 400(SP) + mtlr r0 + lxv vs60, 416(SP) + lxv vs61, 432(SP) + lxv vs62, 448(SP) + lxv vs63, 464(SP) + + addi SP, SP, STACKSIZE + blr + + + EPILOGUE +#endif diff --git a/kernel/power/sgemm_logic_power9.S b/kernel/power/sgemm_logic_power9.S index a34ed32b8..4022959e2 100644 --- a/kernel/power/sgemm_logic_power9.S +++ b/kernel/power/sgemm_logic_power9.S @@ -1,2192 +1,2192 @@ -#define MY_ALIGN .align 3 -b L8 - - MY_ALIGN -LSGEMM_L8x16_LMAIN_SUB: - LOAD8x16_2 - MY_ALIGN - -LSGEMM_L8x16_LOOP: - KERNEL8x16_L2 128,64,0,0 -LSGEMM_L8x16_K128: - KERNEL8x16_L2 128,64,1,0 - KERNEL8x16_I1_L4_2 128,64, 1,0 - KERNEL8x16_I1_L4_2 128,64, 2,0 - KERNEL8x16_I1_L4_2 128,64, 3,0 - KERNEL8x16_I1_L4_2 128,64, 4,0 - KERNEL8x16_I1_L4_2 128,64, 5,0 - KERNEL8x16_I1_L4_2 128,64, 6,0 - KERNEL8x16_I1_L4_2 128,64, 7,0 - KERNEL8x16_I1_L4_2 128,64, 8,0 - KERNEL8x16_I1_L4_2 128,64, 9,0 - KERNEL8x16_I1_L4_2 128,64, 10,0 - KERNEL8x16_I1_L4_2 128,64, 11,0 - KERNEL8x16_I1_L4_2 128,64, 12,0 - KERNEL8x16_I1_L4_2 128,64, 13,0 - KERNEL8x16_I1_L4_2 128,64, 14,0 - KERNEL8x16_I1_L4_2 128,64, 15,0 - KERNEL8x16_I1_L4_2 128,64, 16,0 - KERNEL8x16_I1_L4_2 128,64, 17,0 - KERNEL8x16_I1_L4_2 128,64, 18,0 - KERNEL8x16_I1_L4_2 128,64, 19,0 - KERNEL8x16_I1_L4_2 128,64, 20,0 - KERNEL8x16_I1_L4_2 128,64, 21,0 - KERNEL8x16_I1_L4_2 128,64, 22,0 - KERNEL8x16_I1_L4_2 128,64, 23,0 - KERNEL8x16_I1_L4_2 128,64, 24,0 - KERNEL8x16_I1_L4_2 128,64, 25,0 - KERNEL8x16_I1_L4_2 128,64, 26,0 - KERNEL8x16_I1_L4_2 
128,64, 27,0 - KERNEL8x16_I1_L4_2 128,64, 28,0 - KERNEL8x16_I1_L4_2 128,64, 29,0 - KERNEL8x16_I1_L4_2 128,64, 30,0 - KERNEL8x16_I1_L4_2 128,64, 31,1 - bdnz LSGEMM_L8x16_LOOP - - MY_ALIGN -LSGEMM_L8x16_LOOP_END: - END8x16_2 - blr - - MY_ALIGN -LSGEMM_L8x16_L64_SUB: - LOAD8x16_2 - KERNEL8x16_I1_L4_2 128,64, 0,0 - KERNEL8x16_I1_L4_2 128,64, 1,0 - KERNEL8x16_I1_L4_2 128,64, 2,0 - KERNEL8x16_I1_L4_2 128,64,3,0 - KERNEL8x16_I1_L4_2 128,64,4,0 - KERNEL8x16_I1_L4_2 128,64,5,0 - KERNEL8x16_I1_L4_2 128,64,6,0 - KERNEL8x16_I1_L4_2 128,64,7,0 - KERNEL8x16_I1_L4_2 128,64,8,0 - KERNEL8x16_I1_L4_2 128,64,9,0 - KERNEL8x16_I1_L4_2 128,64,10,0 - KERNEL8x16_I1_L4_2 128,64,11,0 - KERNEL8x16_I1_L4_2 128,64,12,0 - KERNEL8x16_I1_L4_2 128,64,13,0 - KERNEL8x16_I1_L4_2 128,64,14,0 - KERNEL8x16_I1_L4_3 128,64,15,1 - blr -LSGEMM_L8x16_L32_SUB: - LOAD8x16_2 - KERNEL8x16_I1_L4_2 128,64,0,0 - KERNEL8x16_I1_L4_2 128,64,1,0 - KERNEL8x16_I1_L4_2 128,64,2,0 - KERNEL8x16_I1_L4_2 128,64,3,0 - KERNEL8x16_I1_L4_2 128,64,4,0 - KERNEL8x16_I1_L4_2 128,64,5,0 - KERNEL8x16_I1_L4_2 128,64,6,0 - KERNEL8x16_I1_L4_3 128,64,7,1 - blr - -LSGEMM_L8x16_L16_SUB: - LOAD8x16_2 - KERNEL8x16_I1_L4_2 128,64,0,0 - KERNEL8x16_I1_L4_2 128,64,1,0 - KERNEL8x16_I1_L4_2 128,64,2,0 - KERNEL8x16_I1_L4_3 128,64,3,1 - blr - -L8: -#if defined(TRMMKERNEL) && !defined(LEFT) - neg TEMP_REG, OFFSET -#endif - - srawi. J, N, 3 - - ble LSGEMM_L8_END - -LSGEMM_L8_BEGIN: - - li T1, 128 - li T2, 256 - - mr AO, A - mr CO, C - slwi T3, LDC , 3 - add C, C, T3 - - dcbt A, T1 - dcbt A, T2 -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. I, M, 4 - ble LSGEMM_L8x16_END - - MY_ALIGN -LSGEMM_L8x16_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,16,8 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,16,8 - mr T12, T11 - addi T12,T12, -2 - srawi. L, T12, 7 /**(T11-2) % 128x */ -#else - mr T12, K - addi T12,T12, -2 - srawi. L, T12, 7 /**(K-2) % 128x */ -#endif - - ZERO8x16 - ble LSGEMM_L8x16_SUB0 - mtctr L - bl LSGEMM_L8x16_LMAIN_SUB - andi. L, T12, 127 - ble LSGEMM_L8x16_SAVE - b LSGEMM_L8x16_SUB2 - MY_ALIGN -LSGEMM_L8x16_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 255 - cmpwi T11,129 -#else - andi. L, K, 255 - cmpwi K,129 -#endif - li T10,1 - bne CMP8x16_128K - addi BO,BO,-32 - addi AO,AO,-64 - LOAD8x16 64,32 - END8x16_WITHOUT_ADD - LOAD8x16_2O AO,BO, 128, 64 - mtctr T10 - bl LSGEMM_L8x16_K128 - b LSGEMM_L8x16_SAVE -CMP8x16_128K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T11,128 -#else - cmpwi K,128 -#endif - bne LSGEMM_L8x16_SUB2 - MY_ALIGN - mtctr T10 - addi BO,BO,-64 - addi AO,AO,-128 - LOAD8x16_2O AO,BO, 128,64 - bl LSGEMM_L8x16_K128 - b LSGEMM_L8x16_SAVE - MY_ALIGN -LSGEMM_L8x16_SUB2: - andi. T10,L,64 - ble LSGEMM_L8x16_SUB2_32 - bl LSGEMM_L8x16_L64_SUB - MY_ALIGN -LSGEMM_L8x16_SUB2_32: - andi. T10,L, 32 - ble LSGEMM_L8x16_SUB2_16 - bl LSGEMM_L8x16_L32_SUB - MY_ALIGN -LSGEMM_L8x16_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_L8x16_SUB2_8 - bl LSGEMM_L8x16_L16_SUB - MY_ALIGN -LSGEMM_L8x16_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_L8x16_SUB2_4 - LOAD8x16_2 - KERNEL8x16_I1_L4_2 128,64, 0,0 - KERNEL8x16_I1_L4_3 128,64, 1,1 - MY_ALIGN -LSGEMM_L8x16_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_L8x16_SUB2_2 - LOAD8x16_2 - KERNEL8x16_I1_L4_3 128,64, 0,1 - MY_ALIGN -LSGEMM_L8x16_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_L8x16_SUB2_1 - LOAD8x16_2 - KERNEL8x16_E2 128,64, 0,1 - MY_ALIGN -LSGEMM_L8x16_SUB2_1: - andi. 
T10,L, 1 - ble LSGEMM_L8x16_SAVE - KERNEL8x16 0 - - - MY_ALIGN -LSGEMM_L8x16_SAVE: - SAVE8x16 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,8 -#endif - addic. I, I, -1 - bgt+ LSGEMM_L8x16_BEGIN - MY_ALIGN -LSGEMM_L8x16_END: -LSGEMM_L8x8_BEGIN: - andi. T2, M, 15 - ble LSGEMM_L8x1_END - - andi. T1, M, 8 - ble LSGEMM_L8x8_END - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,8 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,8,8 - mr T12, T11 - addi T12,T12, -1 - srawi. L, T12, 4 /**(T11-1) % 16x */ -#else - mr T12, K - addi T12,T12, -1 - srawi. L, T12, 4 /**(K-1) % 16x */ -#endif - - ZERO8x8 - ble LSGEMM_L8x8_SUB0 - - MY_ALIGN -LSGEMM_L8x8_LOOP_START: - - LOAD8x8_0 /*we already zeroed */ - mtctr L - - MY_ALIGN - -LSGEMM_L8x8_LOOP: - - KERNEL8x8_I1_L4_2 32,32, 0,0 - KERNEL8x8_I1_L4_2 32,32, 1,0 - KERNEL8x8_I1_L4_2 32,32, 2,0 - KERNEL8x8_I1_L4_2 32,32, 3,1 - - bdnz LSGEMM_L8x8_LOOP - - MY_ALIGN -LSGEMM_L8x8_LOOP_END: - - END8x8 0, AO, BO, 32, 32 - - b LSGEMM_L8x8_SUB1 - MY_ALIGN -LSGEMM_L8x8_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 31 -#else - andi. L, K, 31 -#endif - b LSGEMM_L8x8_SUB2 - MY_ALIGN -LSGEMM_L8x8_SUB1: -#if defined(TRMMKERNEL) - andi. L, T12, 15 -#else - andi. L, T12, 15 -#endif - ble LSGEMM_L8x8_SAVE - MY_ALIGN -LSGEMM_L8x8_SUB2: - - srawi. T1,L, 3 - ble LSGEMM_L8x8_SUB2_4 - mtctr T1 - MY_ALIGN -LSGEMM_L8x8_SUB2_LOOP: - LOAD8x8_0 - KERNEL8x8_I1_L4_2 32,32, 0,0 - KERNEL8x8_I1_L4_3 32,32, 1,1 - bdnz LSGEMM_L8x8_SUB2_LOOP - MY_ALIGN -LSGEMM_L8x8_SUB2_4: - andi. T1,L, 4 - ble LSGEMM_L8x8_SUB2_2 - LOAD8x8_0 - KERNEL8x8_I1_L4_3 32,32, 0,1 - MY_ALIGN -LSGEMM_L8x8_SUB2_2: - andi. T1,L, 2 - ble LSGEMM_L8x8_SUB2_1 - LOAD8x8_0 - KERNEL8x8_I1_L2_3 32,32, 0,1 - MY_ALIGN -LSGEMM_L8x8_SUB2_1: - andi. T1,L, 1 - ble LSGEMM_L8x8_SAVE - KERNEL8x8 0 - - - MY_ALIGN -LSGEMM_L8x8_SAVE: - SAVE8x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,8 -#endif - MY_ALIGN -LSGEMM_L8x8_END: -LSGEMM_L8x4_BEGIN: - andi. T2, M, 15 - ble LSGEMM_L8x1_END - - andi. T1, M, 4 - ble LSGEMM_L8x4_END - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,8 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,4,8 - mr T12, T11 - addi T12,T12, -1 - srawi. L, T12, 4 /**(T11-1) % 16x */ -#else - mr T12, K - addi T12,T12, -1 - srawi. L, T12, 4 /**(K-1) % 16x */ -#endif - - ZERO8x4 - ble LSGEMM_L8x4_SUB0 - - MY_ALIGN -LSGEMM_L8x4_LOOP_START: - - LOAD8x4_0 /*we already zeroed */ - mtctr L - - MY_ALIGN - -LSGEMM_L8x4_LOOP: - - KERNEL8x4_I1_L4_2 16,32, 0,0 - KERNEL8x4_I1_L4_2 16,32, 1,0 - KERNEL8x4_I1_L4_2 16,32, 2,0 - KERNEL8x4_I1_L4_2 16,32, 3,1 - - bdnz LSGEMM_L8x4_LOOP - - MY_ALIGN -LSGEMM_L8x4_LOOP_END: - - END8x4 0, AO, BO, 16, 32 - - b LSGEMM_L8x4_SUB1 - MY_ALIGN -LSGEMM_L8x4_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 31 -#else - andi. L, K, 31 -#endif - b LSGEMM_L8x4_SUB2 - MY_ALIGN -LSGEMM_L8x4_SUB1: -#if defined(TRMMKERNEL) - andi. L, T12, 15 -#else - andi. L, T12, 15 -#endif - ble LSGEMM_L8x4_SAVE - MY_ALIGN -LSGEMM_L8x4_SUB2: - - srawi. T1,L, 3 - ble LSGEMM_L8x4_SUB2_4 - mtctr T1 - MY_ALIGN -LSGEMM_L8x4_SUB2_LOOP: - LOAD8x4_0 - KERNEL8x4_I1_L4_2 16,32, 0,0 - KERNEL8x4_I1_L4_3 16,32, 1,1 - bdnz LSGEMM_L8x4_SUB2_LOOP - MY_ALIGN -LSGEMM_L8x4_SUB2_4: - andi. T1,L, 4 - ble LSGEMM_L8x4_SUB2_2 - LOAD8x4_0 - KERNEL8x4_I1_L4_3 16,32, 0,1 - MY_ALIGN -LSGEMM_L8x4_SUB2_2: - andi. 
T1,L, 2 - ble LSGEMM_L8x4_SUB2_1 - LOAD8x4_0 - KERNEL8x4_I1_L2_3 16,32, 0,1 - MY_ALIGN -LSGEMM_L8x4_SUB2_1: - andi. T1,L, 1 - ble LSGEMM_L8x4_SAVE - KERNEL8x4 0 - - - MY_ALIGN -LSGEMM_L8x4_SAVE: - SAVE8x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,8 -#endif - MY_ALIGN -LSGEMM_L8x4_END: -LSGEMM_L8x2_BEGIN: - andi. T1, M, 2 - ble LSGEMM_L8x2_END - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,8 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,2,8 - srawi. L, T11, 3 /**(T11) % 8x */ -#else - srawi. L, K, 3 /**(K) % 8x */ -#endif - - ZERO8x2 - ble LSGEMM_L8x2_SUB0 - - MY_ALIGN -LSGEMM_L8x2_LOOP_START: - mtctr L - - MY_ALIGN - -LSGEMM_L8x2_LOOP: - - KERNEL8x2_2 0,0, 0,0 - KERNEL8x2_2 0,0, 1,0 - KERNEL8x2_2 0,0, 2,0 - KERNEL8x2_2 0,0, 3,1 - - bdnz LSGEMM_L8x2_LOOP - - MY_ALIGN -LSGEMM_L8x2_LOOP_END: - -LSGEMM_L8x2_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 7 -#else - andi. L, K, 7 -#endif - ble LSGEMM_L8x2_SAVE - MY_ALIGN -LSGEMM_L8x2_SUB2: - andi. T1,L, 4 - ble LSGEMM_L8x2_SUB2_2 - KERNEL8x2_2 0,0, 0,0 - KERNEL8x2_2 0,0, 1,1 - MY_ALIGN -LSGEMM_L8x2_SUB2_2: - andi. T1,L, 2 - ble LSGEMM_L8x2_SUB2_1 - KERNEL8x2_2 0,0, 0,1 - MY_ALIGN -LSGEMM_L8x2_SUB2_1: - andi. T1,L, 1 - ble LSGEMM_L8x2_SAVE - KERNEL8x2 - - MY_ALIGN -LSGEMM_L8x2_SAVE: - SAVE8x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,8 -#endif - MY_ALIGN -LSGEMM_L8x2_END: -LSGEMM_L8x1_BEGIN: - andi. T1, M, 1 - ble LSGEMM_L8x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,8 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,1,8 - srawi. L, T11, 3 /**(T11) % 8x */ -#else - srawi. L, K, 3 /**(K) % 8x */ -#endif - - ZERO8x1 - ble LSGEMM_L8x1_SUB0 - - MY_ALIGN -LSGEMM_L8x1_LOOP_START: - mtctr L - - MY_ALIGN - -LSGEMM_L8x1_LOOP: - - KERNEL8x1_4 0,0, 0,0 - KERNEL8x1_4 0,0, 1,1 - - bdnz LSGEMM_L8x1_LOOP - - MY_ALIGN -LSGEMM_L8x1_LOOP_END: - -LSGEMM_L8x1_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 7 -#else - andi. L, K, 7 -#endif - ble LSGEMM_L8x1_SAVE - MY_ALIGN -LSGEMM_L8x1_SUB2: - andi. T1,L, 4 - ble LSGEMM_L8x1_SUB2_2 - KERNEL8x1_4 0,0, 0,1 - MY_ALIGN -LSGEMM_L8x1_SUB2_2: - andi. T1,L, 2 - ble LSGEMM_L8x1_SUB2_1 - KERNEL8x1_2 - MY_ALIGN -LSGEMM_L8x1_SUB2_1: - andi. T1,L, 1 - ble LSGEMM_L8x1_SAVE - KERNEL8x1 - - MY_ALIGN -LSGEMM_L8x1_SAVE: - SAVE8x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,8 -#endif - MY_ALIGN -LSGEMM_L8x1_END: - - slwi T1, K, 5 - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 8 -#endif - addic. J, J, -1 - bgt LSGEMM_L8_BEGIN - - -LSGEMM_L8_END: - -/* b LSGEMM_L4_BEGIN*/ - andi. T1, N, 4 - ble LSGEMM_L4_END -LSGEMM_L4_BEGIN: - - - mr AO, A - mr CO, C - slwi T3, LDC , 2 - add C, C, T3 - -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. I, M, 4 - ble LSGEMM_L4x16_END - - MY_ALIGN -LSGEMM_L4x16_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,16,4 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,16,4 - mr T12, T11 - addi T12,T12, -1 - srawi. L, T12, 6 /**(T11-1) % 64x */ -#else - mr T12, K - addi T12,T12, -1 - srawi. 
L, T12, 6 /**(K-1) % 64x */ -#endif - - ZERO4x16 - ble LSGEMM_L4x16_SUB0 - - MY_ALIGN -LSGEMM_L4x16_LOOP_START: - - LOAD4x16_0 /*we already zeroed */ - ##OffsetA=64 OffsetB=16 - addi AO,AO,2112 - addi BO,BO,16 - - mtctr L - - MY_ALIGN - -LSGEMM_L4x16_LOOP: - - KERNEL4x16_I1_L4_2 -2048,0, 0,0 - KERNEL4x16_I1_L4_2 -2048,0, 1,0 - KERNEL4x16_I1_L4_2 -2048,0, 2,0 - KERNEL4x16_I1_L4_2 -2048,0, 3,0 - KERNEL4x16_I1_L4_2 -2048,0, 4,0 - KERNEL4x16_I1_L4_2 -2048,0, 5,0 - KERNEL4x16_I1_L4_2 -2048,0, 6,0 - KERNEL4x16_I1_L4_2 -2048,0, 7,0 - KERNEL4x16_I1_L4_2 -2048,0, 8,0 - KERNEL4x16_I1_L4_2 -2048,0, 9,0 - KERNEL4x16_I1_L4_2 -2048,0, 10,0 - KERNEL4x16_I1_L4_2 -2048,0, 11,0 - KERNEL4x16_I1_L4_2 -2048,0, 12,0 - KERNEL4x16_I1_L4_2 -2048,0, 13,0 - KERNEL4x16_I1_L4_2 -2048,0, 14,0 - KERNEL4x16_I1_L4_2 -2048,0, 15,1 - - bdnz LSGEMM_L4x16_LOOP - - MY_ALIGN -LSGEMM_L4x16_LOOP_END: - - END4x16 0, AO, BO, -2048, 0 - - b LSGEMM_L4x16_SUB1 - MY_ALIGN -LSGEMM_L4x16_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 127 -#else - andi. L, K, 127 -#endif - b LSGEMM_L4x16_SUB2 - MY_ALIGN -LSGEMM_L4x16_SUB1: -#if defined(TRMMKERNEL) - andi. L, T12, 63 -#else - andi. L, T12, 63 -#endif - ble LSGEMM_L4x16_SAVE - MY_ALIGN -LSGEMM_L4x16_SUB2: - - srawi. T10,L, 5 - ble LSGEMM_L4x16_SUB2_16 - mtctr T10 - MY_ALIGN -LSGEMM_L4x16_SUB2_LOOP: - LOAD4x16_0 - KERNEL4x16_I1_L4_2 64,16, 0,0 - KERNEL4x16_I1_L4_2 64,16, 1,0 - KERNEL4x16_I1_L4_2 64,16, 2,0 - KERNEL4x16_I1_L4_2 64,16, 3,0 - KERNEL4x16_I1_L4_2 64,16, 4,0 - KERNEL4x16_I1_L4_2 64,16, 5,0 - KERNEL4x16_I1_L4_2 64,16, 6,0 - KERNEL4x16_I1_L4_3 64,16, 7,1 - bdnz LSGEMM_L4x16_SUB2_LOOP - MY_ALIGN -LSGEMM_L4x16_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_L4x16_SUB2_8 - LOAD4x16_0 - KERNEL4x16_I1_L4_2 64,16, 0,0 - KERNEL4x16_I1_L4_2 64,16, 1,0 - KERNEL4x16_I1_L4_2 64,16, 2,0 - KERNEL4x16_I1_L4_3 64,16, 3,1 - MY_ALIGN -LSGEMM_L4x16_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_L4x16_SUB2_4 - LOAD4x16_0 - KERNEL4x16_I1_L4_2 64,16, 0,0 - KERNEL4x16_I1_L4_3 64,16, 1,1 - MY_ALIGN -LSGEMM_L4x16_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_L4x16_SUB2_2 - LOAD4x16_0 - KERNEL4x16_I1_L4_3 64,16, 0,1 - MY_ALIGN -LSGEMM_L4x16_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_L4x16_SUB2_1 - LOAD4x16_0 - KERNEL4x16_I1_L2_3 64,16, 0,1 - MY_ALIGN -LSGEMM_L4x16_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_L4x16_SAVE - KERNEL4x16 0 -# addic. L, L, -1 -# bgt LSGEMM_L4x16_SUB2 - - MY_ALIGN -LSGEMM_L4x16_SAVE: - SAVE4x16 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,4 -#endif - addic. I, I, -1 - bgt+ LSGEMM_L4x16_BEGIN - MY_ALIGN -LSGEMM_L4x16_END: -LSGEMM_L4x8_BEGIN: - andi. T2, M, 15 - ble LSGEMM_L4x1_END - - andi. T1, M, 8 - ble LSGEMM_L4x8_END - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,8,4 - mr T12, T11 - addi T12,T12, -1 - srawi. L, T12, 4 /**(T11-1) % 16x */ -#else - mr T12, K - addi T12,T12, -1 - srawi. L, T12, 4 /**(K-1) % 16x */ -#endif - - ZERO4x8 - ble LSGEMM_L4x8_SUB0 - - MY_ALIGN -LSGEMM_L4x8_LOOP_START: - - LOAD4x8_0 /*we already zeroed */ - mtctr L - - MY_ALIGN - -LSGEMM_L4x8_LOOP: - - KERNEL4x8_I1_L4_2 32,16, 0,0 - KERNEL4x8_I1_L4_2 32,16, 1,0 - KERNEL4x8_I1_L4_2 32,16, 2,0 - KERNEL4x8_I1_L4_2 32,16, 3,1 - - bdnz LSGEMM_L4x8_LOOP - - MY_ALIGN -LSGEMM_L4x8_LOOP_END: - - END4x8 0, AO, BO, 32, 16 - - b LSGEMM_L4x8_SUB1 - MY_ALIGN -LSGEMM_L4x8_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 31 -#else - andi. 
L, K, 31 -#endif - b LSGEMM_L4x8_SUB2 - MY_ALIGN -LSGEMM_L4x8_SUB1: -#if defined(TRMMKERNEL) - andi. L, T12, 15 -#else - andi. L, T12, 15 -#endif - ble LSGEMM_L4x8_SAVE - MY_ALIGN -LSGEMM_L4x8_SUB2: - - srawi. T1,L, 3 - ble LSGEMM_L4x8_SUB2_4 - mtctr T1 - MY_ALIGN -LSGEMM_L4x8_SUB2_LOOP: - LOAD4x8_0 - KERNEL4x8_I1_L4_2 32,16, 0,0 - KERNEL4x8_I1_L4_3 32,16, 1,1 - bdnz LSGEMM_L4x8_SUB2_LOOP - MY_ALIGN -LSGEMM_L4x8_SUB2_4: - andi. T1,L, 4 - ble LSGEMM_L4x8_SUB2_2 - LOAD4x8_0 - KERNEL4x8_I1_L4_3 32,16, 0,1 - MY_ALIGN -LSGEMM_L4x8_SUB2_2: - andi. T1,L, 2 - ble LSGEMM_L4x8_SUB2_1 - LOAD4x8_0 - KERNEL4x8_I1_L2_3 32,16, 0,1 - MY_ALIGN -LSGEMM_L4x8_SUB2_1: - andi. T1,L, 1 - ble LSGEMM_L4x8_SAVE - KERNEL4x8 0 - - - MY_ALIGN -LSGEMM_L4x8_SAVE: - SAVE4x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,4 -#endif - MY_ALIGN -LSGEMM_L4x8_END: -LSGEMM_L4x4_BEGIN: - andi. T2, M, 15 - ble LSGEMM_L4x1_END - - andi. T1, M, 4 - ble LSGEMM_L4x4_END - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,4,4 - mr T12, T11 - addi T12,T12, -1 - srawi. L, T12, 4 /**(T11-1) % 16x */ -#else - mr T12, K - addi T12,T12, -1 - srawi. L, T12, 4 /**(K-1) % 16x */ -#endif - - ZERO4x4 - ble LSGEMM_L4x4_SUB0 - - MY_ALIGN -LSGEMM_L4x4_LOOP_START: - - LOAD4x4_0 /*we already zeroed */ - mtctr L - - MY_ALIGN - -LSGEMM_L4x4_LOOP: - - KERNEL4x4_I1_L4_2 16,16, 0,0 - KERNEL4x4_I1_L4_2 16,16, 1,0 - KERNEL4x4_I1_L4_2 16,16, 2,0 - KERNEL4x4_I1_L4_2 16,16, 3,1 - - bdnz LSGEMM_L4x4_LOOP - - MY_ALIGN -LSGEMM_L4x4_LOOP_END: - - END4x4 0, AO, BO, 16, 16 - - b LSGEMM_L4x4_SUB1 - MY_ALIGN -LSGEMM_L4x4_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 31 -#else - andi. L, K, 31 -#endif - b LSGEMM_L4x4_SUB2 - MY_ALIGN -LSGEMM_L4x4_SUB1: -#if defined(TRMMKERNEL) - andi. L, T12, 15 -#else - andi. L, T12, 15 -#endif - ble LSGEMM_L4x4_SAVE - MY_ALIGN -LSGEMM_L4x4_SUB2: - - srawi. T1,L, 3 - ble LSGEMM_L4x4_SUB2_4 - mtctr T1 - MY_ALIGN -LSGEMM_L4x4_SUB2_LOOP: - LOAD4x4_0 - KERNEL4x4_I1_L4_2 16,16, 0,0 - KERNEL4x4_I1_L4_3 16,16, 1,1 - bdnz LSGEMM_L4x4_SUB2_LOOP - MY_ALIGN -LSGEMM_L4x4_SUB2_4: - andi. T1,L, 4 - ble LSGEMM_L4x4_SUB2_2 - LOAD4x4_0 - KERNEL4x4_I1_L4_3 16,16, 0,1 - MY_ALIGN -LSGEMM_L4x4_SUB2_2: - andi. T1,L, 2 - ble LSGEMM_L4x4_SUB2_1 - LOAD4x4_0 - KERNEL4x4_I1_L2_3 16,16, 0,1 - MY_ALIGN -LSGEMM_L4x4_SUB2_1: - andi. T1,L, 1 - ble LSGEMM_L4x4_SAVE - KERNEL4x4 0 - - - MY_ALIGN -LSGEMM_L4x4_SAVE: - SAVE4x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,4 -#endif - MY_ALIGN -LSGEMM_L4x4_END: -LSGEMM_L4x2_BEGIN: - andi. T1, M, 2 - ble LSGEMM_L4x2_END - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,2,4 - srawi. L, T11, 3 /**(T11) % 8x */ -#else - srawi. L, K, 3 /**(K) % 8x */ -#endif - - ZERO4x2 - ble LSGEMM_L4x2_SUB0 - - MY_ALIGN -LSGEMM_L4x2_LOOP_START: - mtctr L - - MY_ALIGN - -LSGEMM_L4x2_LOOP: - - KERNEL4x2_2 0,0, 0,0 - KERNEL4x2_2 0,0, 1,0 - KERNEL4x2_2 0,0, 2,0 - KERNEL4x2_2 0,0, 3,1 - - bdnz LSGEMM_L4x2_LOOP - - MY_ALIGN -LSGEMM_L4x2_LOOP_END: - -LSGEMM_L4x2_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 7 -#else - andi. L, K, 7 -#endif - ble LSGEMM_L4x2_SAVE - MY_ALIGN -LSGEMM_L4x2_SUB2: - andi. T1,L, 4 - ble LSGEMM_L4x2_SUB2_2 - KERNEL4x2_2 0,0, 0,0 - KERNEL4x2_2 0,0, 1,1 - MY_ALIGN -LSGEMM_L4x2_SUB2_2: - andi. 
T1,L, 2 - ble LSGEMM_L4x2_SUB2_1 - KERNEL4x2_2 0,0, 0,1 - MY_ALIGN -LSGEMM_L4x2_SUB2_1: - andi. T1,L, 1 - ble LSGEMM_L4x2_SAVE - KERNEL4x2 - - MY_ALIGN -LSGEMM_L4x2_SAVE: - SAVE4x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,4 -#endif - MY_ALIGN -LSGEMM_L4x2_END: -LSGEMM_L4x1_BEGIN: - andi. T1, M, 1 - ble LSGEMM_L4x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,1,4 - srawi. L, T11, 3 /**(T11) % 8x */ -#else - srawi. L, K, 3 /**(K) % 8x */ -#endif - - ZERO4x1 - ble LSGEMM_L4x1_SUB0 - - MY_ALIGN -LSGEMM_L4x1_LOOP_START: - mtctr L - - MY_ALIGN - -LSGEMM_L4x1_LOOP: - - KERNEL4x1_4 0,0, 0,0 - KERNEL4x1_4 0,0, 1,1 - - bdnz LSGEMM_L4x1_LOOP - - MY_ALIGN -LSGEMM_L4x1_LOOP_END: - -LSGEMM_L4x1_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 7 -#else - andi. L, K, 7 -#endif - ble LSGEMM_L4x1_SAVE - MY_ALIGN -LSGEMM_L4x1_SUB2: - andi. T1,L, 4 - ble LSGEMM_L4x1_SUB2_2 - KERNEL4x1_4 0,0, 0,1 - MY_ALIGN -LSGEMM_L4x1_SUB2_2: - andi. T1,L, 2 - ble LSGEMM_L4x1_SUB2_1 - KERNEL4x1_2 - MY_ALIGN -LSGEMM_L4x1_SUB2_1: - andi. T1,L, 1 - ble LSGEMM_L4x1_SAVE - KERNEL4x1 - - MY_ALIGN -LSGEMM_L4x1_SAVE: - SAVE4x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,4 -#endif - MY_ALIGN -LSGEMM_L4x1_END: - - slwi T1, K, 4 - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 4 -#endif - - andi. T2, N, 3 - ble .L999 - -LSGEMM_L4_END: - andi. T1, N, 2 - ble LSGEMM_L2_END -LSGEMM_L2_BEGIN: - - - mr AO, A - mr CO, C - slwi T3, LDC , 1 - add C, C, T3 - -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. I, M, 4 - ble LSGEMM_L2x16_END - - MY_ALIGN -LSGEMM_L2x16_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,16,2 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,16,2 - srawi. L, T11, 6 /**(T11 ) % 64x */ -#else - srawi. L, K, 6 /**(K ) % 64x */ -#endif - - ZERO2x16 - ble LSGEMM_L2x16_SUB0 - addi AO,AO,2048 - - mtctr L - - MY_ALIGN - -LSGEMM_L2x16_LOOP: - - KERNEL2x16_4 -2048,0, 0,0 - KERNEL2x16_4 -2048,0, 1,0 - KERNEL2x16_4 -2048,0, 2,0 - KERNEL2x16_4 -2048,0, 3,0 - KERNEL2x16_4 -2048,0, 4,0 - KERNEL2x16_4 -2048,0, 5,0 - KERNEL2x16_4 -2048,0, 6,0 - KERNEL2x16_4 -2048,0, 7,0 - KERNEL2x16_4 -2048,0, 8,0 - KERNEL2x16_4 -2048,0, 9,0 - KERNEL2x16_4 -2048,0, 10,0 - KERNEL2x16_4 -2048,0, 11,0 - KERNEL2x16_4 -2048,0, 12,0 - KERNEL2x16_4 -2048,0, 13,0 - KERNEL2x16_4 -2048,0, 14,0 - KERNEL2x16_4 -2048,0, 15,1 - - bdnz LSGEMM_L2x16_LOOP - MY_ALIGN - addi AO,AO, -2048 - MY_ALIGN -LSGEMM_L2x16_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 63 -#else - andi. L, K, 63 -#endif - ble LSGEMM_L2x16_SAVE - MY_ALIGN -LSGEMM_L2x16_SUB2: - andi. T10,L, 32 - ble LSGEMM_L2x16_SUB2_16 - KERNEL2x16_4 0,0, 0,0 - KERNEL2x16_4 0,0, 1,0 - KERNEL2x16_4 0,0, 2,0 - KERNEL2x16_4 0,0, 3,0 - KERNEL2x16_4 0,0, 4,0 - KERNEL2x16_4 0,0, 5,0 - KERNEL2x16_4 0,0, 6,0 - KERNEL2x16_4 0,0, 7,1 - MY_ALIGN -LSGEMM_L2x16_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_L2x16_SUB2_8 - KERNEL2x16_4 0,0, 0,0 - KERNEL2x16_4 0,0, 1,0 - KERNEL2x16_4 0,0, 2,0 - KERNEL2x16_4 0,0, 3,1 - MY_ALIGN -LSGEMM_L2x16_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_L2x16_SUB2_4 - KERNEL2x16_4 0,0, 0,0 - KERNEL2x16_4 0,0, 1,1 - MY_ALIGN -LSGEMM_L2x16_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_L2x16_SUB2_2 - KERNEL2x16_4 0,0, 0,1 - MY_ALIGN -LSGEMM_L2x16_SUB2_2: - andi. 
T10,L, 2 - ble LSGEMM_L2x16_SUB2_1 - KERNEL2x16_2 0,0, 0,1 - MY_ALIGN -LSGEMM_L2x16_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_L2x16_SAVE - KERNEL2x16 - - MY_ALIGN -LSGEMM_L2x16_SAVE: - SAVE2x16 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,2 -#endif - addic. I, I, -1 - bgt+ LSGEMM_L2x16_BEGIN - MY_ALIGN -LSGEMM_L2x16_END: - andi. I, M, 8 - ble LSGEMM_L2x8_END - - MY_ALIGN -LSGEMM_L2x8_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,8,2 - srawi. L, T11, 6 /**(T11 ) % 64x */ -#else - srawi. L, K, 6 /**(K ) % 64x */ -#endif - - ZERO2x8 - ble LSGEMM_L2x8_SUB0 - addi AO,AO,2048 - - mtctr L - - MY_ALIGN - -LSGEMM_L2x8_LOOP: - - KERNEL2x8_4 -2048,0, 0,0 - KERNEL2x8_4 -2048,0, 1,0 - KERNEL2x8_4 -2048,0, 2,0 - KERNEL2x8_4 -2048,0, 3,0 - KERNEL2x8_4 -2048,0, 4,0 - KERNEL2x8_4 -2048,0, 5,0 - KERNEL2x8_4 -2048,0, 6,0 - KERNEL2x8_4 -2048,0, 7,0 - KERNEL2x8_4 -2048,0, 8,0 - KERNEL2x8_4 -2048,0, 9,0 - KERNEL2x8_4 -2048,0, 10,0 - KERNEL2x8_4 -2048,0, 11,0 - KERNEL2x8_4 -2048,0, 12,0 - KERNEL2x8_4 -2048,0, 13,0 - KERNEL2x8_4 -2048,0, 14,0 - KERNEL2x8_4 -2048,0, 15,1 - - bdnz LSGEMM_L2x8_LOOP - MY_ALIGN - addi AO,AO, -2048 - MY_ALIGN -LSGEMM_L2x8_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 63 -#else - andi. L, K, 63 -#endif - ble LSGEMM_L2x8_SAVE - MY_ALIGN -LSGEMM_L2x8_SUB2: - andi. T10,L, 32 - ble LSGEMM_L2x8_SUB2_16 - KERNEL2x8_4 0,0, 0,0 - KERNEL2x8_4 0,0, 1,0 - KERNEL2x8_4 0,0, 2,0 - KERNEL2x8_4 0,0, 3,0 - KERNEL2x8_4 0,0, 4,0 - KERNEL2x8_4 0,0, 5,0 - KERNEL2x8_4 0,0, 6,0 - KERNEL2x8_4 0,0, 7,1 - MY_ALIGN -LSGEMM_L2x8_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_L2x8_SUB2_8 - KERNEL2x8_4 0,0, 0,0 - KERNEL2x8_4 0,0, 1,0 - KERNEL2x8_4 0,0, 2,0 - KERNEL2x8_4 0,0, 3,1 - MY_ALIGN -LSGEMM_L2x8_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_L2x8_SUB2_4 - KERNEL2x8_4 0,0, 0,0 - KERNEL2x8_4 0,0, 1,1 - MY_ALIGN -LSGEMM_L2x8_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_L2x8_SUB2_2 - KERNEL2x8_4 0,0, 0,1 - MY_ALIGN -LSGEMM_L2x8_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_L2x8_SUB2_1 - KERNEL2x8_2 0,0, 0,1 - MY_ALIGN -LSGEMM_L2x8_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_L2x8_SAVE - KERNEL2x8 - - MY_ALIGN -LSGEMM_L2x8_SAVE: - SAVE2x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,2 -#endif - MY_ALIGN -LSGEMM_L2x8_END: - andi. I, M, 4 - ble LSGEMM_L2x4_END - - MY_ALIGN -LSGEMM_L2x4_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,4,2 - srawi. L, T11, 6 /**(T11 ) % 64x */ -#else - srawi. L, K, 6 /**(K ) % 64x */ -#endif - - ZERO2x4 - ble LSGEMM_L2x4_SUB0 - - - mtctr L - - MY_ALIGN - -LSGEMM_L2x4_LOOP: - - KERNEL2x4_4 0,0, 0,0 - KERNEL2x4_4 0,0, 1,0 - KERNEL2x4_4 0,0, 2,0 - KERNEL2x4_4 0,0, 3,0 - KERNEL2x4_4 0,0, 4,0 - KERNEL2x4_4 0,0, 5,0 - KERNEL2x4_4 0,0, 6,0 - KERNEL2x4_4 0,0, 7,0 - KERNEL2x4_4 0,0, 8,0 - KERNEL2x4_4 0,0, 9,0 - KERNEL2x4_4 0,0, 10,0 - KERNEL2x4_4 0,0, 11,0 - KERNEL2x4_4 0,0, 12,0 - KERNEL2x4_4 0,0, 13,0 - KERNEL2x4_4 0,0, 14,0 - KERNEL2x4_4 0,0, 15,1 - - bdnz LSGEMM_L2x4_LOOP - MY_ALIGN - - MY_ALIGN -LSGEMM_L2x4_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 63 -#else - andi. L, K, 63 -#endif - ble LSGEMM_L2x4_SAVE - MY_ALIGN -LSGEMM_L2x4_SUB2: - andi. 
T10,L, 32 - ble LSGEMM_L2x4_SUB2_16 - KERNEL2x4_4 0,0, 0,0 - KERNEL2x4_4 0,0, 1,0 - KERNEL2x4_4 0,0, 2,0 - KERNEL2x4_4 0,0, 3,0 - KERNEL2x4_4 0,0, 4,0 - KERNEL2x4_4 0,0, 5,0 - KERNEL2x4_4 0,0, 6,0 - KERNEL2x4_4 0,0, 7,1 - MY_ALIGN -LSGEMM_L2x4_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_L2x4_SUB2_8 - KERNEL2x4_4 0,0, 0,0 - KERNEL2x4_4 0,0, 1,0 - KERNEL2x4_4 0,0, 2,0 - KERNEL2x4_4 0,0, 3,1 - MY_ALIGN -LSGEMM_L2x4_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_L2x4_SUB2_4 - KERNEL2x4_4 0,0, 0,0 - KERNEL2x4_4 0,0, 1,1 - MY_ALIGN -LSGEMM_L2x4_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_L2x4_SUB2_2 - KERNEL2x4_4 0,0, 0,1 - MY_ALIGN -LSGEMM_L2x4_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_L2x4_SUB2_1 - KERNEL2x4_2 0,0, 0,1 - MY_ALIGN -LSGEMM_L2x4_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_L2x4_SAVE - KERNEL2x4 - - MY_ALIGN -LSGEMM_L2x4_SAVE: - SAVE2x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,2 -#endif - MY_ALIGN -LSGEMM_L2x4_END: - andi. I, M, 2 - ble LSGEMM_L2x2_END - - MY_ALIGN -LSGEMM_L2x2_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,2,2 - srawi. L, T11, 6 /**(T11 ) % 64x */ -#else - srawi. L, K, 6 /**(K ) % 64x */ -#endif - - ZERO2x2 - ble LSGEMM_L2x2_SUB0 - - - mtctr L - - MY_ALIGN - -LSGEMM_L2x2_LOOP: - - KERNEL2x2_4 0,0, 0,0 - KERNEL2x2_4 0,0, 1,0 - KERNEL2x2_4 0,0, 2,0 - KERNEL2x2_4 0,0, 3,0 - KERNEL2x2_4 0,0, 4,0 - KERNEL2x2_4 0,0, 5,0 - KERNEL2x2_4 0,0, 6,0 - KERNEL2x2_4 0,0, 7,0 - KERNEL2x2_4 0,0, 8,0 - KERNEL2x2_4 0,0, 9,0 - KERNEL2x2_4 0,0, 10,0 - KERNEL2x2_4 0,0, 11,0 - KERNEL2x2_4 0,0, 12,0 - KERNEL2x2_4 0,0, 13,0 - KERNEL2x2_4 0,0, 14,0 - KERNEL2x2_4 0,0, 15,1 - - bdnz LSGEMM_L2x2_LOOP - MY_ALIGN - - MY_ALIGN -LSGEMM_L2x2_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 63 -#else - andi. L, K, 63 -#endif - ble LSGEMM_L2x2_SAVE - MY_ALIGN -LSGEMM_L2x2_SUB2: - andi. T10,L, 32 - ble LSGEMM_L2x2_SUB2_16 - KERNEL2x2_4 0,0, 0,0 - KERNEL2x2_4 0,0, 1,0 - KERNEL2x2_4 0,0, 2,0 - KERNEL2x2_4 0,0, 3,0 - KERNEL2x2_4 0,0, 4,0 - KERNEL2x2_4 0,0, 5,0 - KERNEL2x2_4 0,0, 6,0 - KERNEL2x2_4 0,0, 7,1 - MY_ALIGN -LSGEMM_L2x2_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_L2x2_SUB2_8 - KERNEL2x2_4 0,0, 0,0 - KERNEL2x2_4 0,0, 1,0 - KERNEL2x2_4 0,0, 2,0 - KERNEL2x2_4 0,0, 3,1 - MY_ALIGN -LSGEMM_L2x2_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_L2x2_SUB2_4 - KERNEL2x2_4 0,0, 0,0 - KERNEL2x2_4 0,0, 1,1 - MY_ALIGN -LSGEMM_L2x2_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_L2x2_SUB2_2 - KERNEL2x2_4 0,0, 0,1 - MY_ALIGN -LSGEMM_L2x2_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_L2x2_SUB2_1 - KERNEL2x2_2 0,0, 0,1 - MY_ALIGN -LSGEMM_L2x2_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_L2x2_SAVE - KERNEL2x2 - - MY_ALIGN -LSGEMM_L2x2_SAVE: - SAVE2x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,2 -#endif - MY_ALIGN -LSGEMM_L2x2_END: - andi. I, M, 1 - ble LSGEMM_L2x1_END - - MY_ALIGN -LSGEMM_L2x1_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,1,2 - srawi. L, T11, 6 /**(T11 ) % 64x */ -#else - srawi. 
L, K, 6 /**(K ) % 64x */ -#endif - - ZERO2x1 - ble LSGEMM_L2x1_SUB0 - - - mtctr L - - MY_ALIGN - -LSGEMM_L2x1_LOOP: - - KERNEL2x1_4 0,0, 0,0 - KERNEL2x1_4 0,0, 1,0 - KERNEL2x1_4 0,0, 2,0 - KERNEL2x1_4 0,0, 3,0 - KERNEL2x1_4 0,0, 4,0 - KERNEL2x1_4 0,0, 5,0 - KERNEL2x1_4 0,0, 6,0 - KERNEL2x1_4 0,0, 7,0 - KERNEL2x1_4 0,0, 8,0 - KERNEL2x1_4 0,0, 9,0 - KERNEL2x1_4 0,0, 10,0 - KERNEL2x1_4 0,0, 11,0 - KERNEL2x1_4 0,0, 12,0 - KERNEL2x1_4 0,0, 13,0 - KERNEL2x1_4 0,0, 14,0 - KERNEL2x1_4 0,0, 15,1 - - bdnz LSGEMM_L2x1_LOOP - MY_ALIGN - - MY_ALIGN -LSGEMM_L2x1_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 63 -#else - andi. L, K, 63 -#endif - ble LSGEMM_L2x1_SAVE - MY_ALIGN -LSGEMM_L2x1_SUB2: - andi. T10,L, 32 - ble LSGEMM_L2x1_SUB2_16 - KERNEL2x1_4 0,0, 0,0 - KERNEL2x1_4 0,0, 1,0 - KERNEL2x1_4 0,0, 2,0 - KERNEL2x1_4 0,0, 3,0 - KERNEL2x1_4 0,0, 4,0 - KERNEL2x1_4 0,0, 5,0 - KERNEL2x1_4 0,0, 6,0 - KERNEL2x1_4 0,0, 7,1 - MY_ALIGN -LSGEMM_L2x1_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_L2x1_SUB2_8 - KERNEL2x1_4 0,0, 0,0 - KERNEL2x1_4 0,0, 1,0 - KERNEL2x1_4 0,0, 2,0 - KERNEL2x1_4 0,0, 3,1 - MY_ALIGN -LSGEMM_L2x1_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_L2x1_SUB2_4 - KERNEL2x1_4 0,0, 0,0 - KERNEL2x1_4 0,0, 1,1 - MY_ALIGN -LSGEMM_L2x1_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_L2x1_SUB2_2 - KERNEL2x1_4 0,0, 0,1 - MY_ALIGN -LSGEMM_L2x1_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_L2x1_SUB2_1 - KERNEL2x1_2 0,0, 0,1 - MY_ALIGN -LSGEMM_L2x1_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_L2x1_SAVE - KERNEL2x1 - - MY_ALIGN -LSGEMM_L2x1_SAVE: - SAVE2x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,2 -#endif - MY_ALIGN -LSGEMM_L2x1_END: - slwi T1, K, 3 - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 2 -#endif -LSGEMM_L2_END: - andi. T1, N, 1 - ble LSGEMM_END -LSGEMM_1_BEGIN: - - - mr AO, A - mr CO, C - add C, C, LDC - -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. I, M, 4 - ble LSGEMM_1x16_END - - MY_ALIGN -LSGEMM_1x16_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,16,1 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,16,1 - srawi. L, T11, 6 /**(T11 ) % 64x */ -#else - srawi. L, K, 6 /**(K ) % 64x */ -#endif - - ZERO1x16 - ble LSGEMM_1x16_SUB0 - addi AO,AO,2048 - - mtctr L - - MY_ALIGN - -LSGEMM_1x16_LOOP: - - KERNEL1x16_4 -2048,0, 0,0 - KERNEL1x16_4 -2048,0, 1,0 - KERNEL1x16_4 -2048,0, 2,0 - KERNEL1x16_4 -2048,0, 3,0 - KERNEL1x16_4 -2048,0, 4,0 - KERNEL1x16_4 -2048,0, 5,0 - KERNEL1x16_4 -2048,0, 6,0 - KERNEL1x16_4 -2048,0, 7,0 - KERNEL1x16_4 -2048,0, 8,0 - KERNEL1x16_4 -2048,0, 9,0 - KERNEL1x16_4 -2048,0, 10,0 - KERNEL1x16_4 -2048,0, 11,0 - KERNEL1x16_4 -2048,0, 12,0 - KERNEL1x16_4 -2048,0, 13,0 - KERNEL1x16_4 -2048,0, 14,0 - KERNEL1x16_4 -2048,0, 15,1 - - bdnz LSGEMM_1x16_LOOP - MY_ALIGN - addi AO,AO, -2048 - MY_ALIGN -LSGEMM_1x16_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 63 -#else - andi. L, K, 63 -#endif - ble LSGEMM_1x16_SAVE - MY_ALIGN -LSGEMM_1x16_SUB2: - andi. T10,L, 32 - ble LSGEMM_1x16_SUB2_16 - KERNEL1x16_4 0,0, 0,0 - KERNEL1x16_4 0,0, 1,0 - KERNEL1x16_4 0,0, 2,0 - KERNEL1x16_4 0,0, 3,0 - KERNEL1x16_4 0,0, 4,0 - KERNEL1x16_4 0,0, 5,0 - KERNEL1x16_4 0,0, 6,0 - KERNEL1x16_4 0,0, 7,1 - MY_ALIGN -LSGEMM_1x16_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_1x16_SUB2_8 - KERNEL1x16_4 0,0, 0,0 - KERNEL1x16_4 0,0, 1,0 - KERNEL1x16_4 0,0, 2,0 - KERNEL1x16_4 0,0, 3,1 - MY_ALIGN -LSGEMM_1x16_SUB2_8: - andi. 
T10,L, 8 - ble LSGEMM_1x16_SUB2_4 - KERNEL1x16_4 0,0, 0,0 - KERNEL1x16_4 0,0, 1,1 - MY_ALIGN -LSGEMM_1x16_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_1x16_SUB2_2 - KERNEL1x16_4 0,0, 0,1 - MY_ALIGN -LSGEMM_1x16_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_1x16_SUB2_1 - KERNEL1x16_2 0,0, 0,1 - MY_ALIGN -LSGEMM_1x16_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_1x16_SAVE - KERNEL1x16 - - MY_ALIGN -LSGEMM_1x16_SAVE: - SAVE1x16 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,1 -#endif - addic. I, I, -1 - bgt+ LSGEMM_1x16_BEGIN - MY_ALIGN -LSGEMM_1x16_END: - andi. I, M, 8 - ble LSGEMM_1x8_END - - MY_ALIGN -LSGEMM_1x8_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,8,1 - srawi. L, T11, 6 /**(T11 ) % 64x */ -#else - srawi. L, K, 6 /**(K ) % 64x */ -#endif - - ZERO1x8 - ble LSGEMM_1x8_SUB0 - addi AO,AO,2048 - - mtctr L - - MY_ALIGN - -LSGEMM_1x8_LOOP: - - KERNEL1x8_4 -2048,0, 0,0 - KERNEL1x8_4 -2048,0, 1,0 - KERNEL1x8_4 -2048,0, 2,0 - KERNEL1x8_4 -2048,0, 3,0 - KERNEL1x8_4 -2048,0, 4,0 - KERNEL1x8_4 -2048,0, 5,0 - KERNEL1x8_4 -2048,0, 6,0 - KERNEL1x8_4 -2048,0, 7,0 - KERNEL1x8_4 -2048,0, 8,0 - KERNEL1x8_4 -2048,0, 9,0 - KERNEL1x8_4 -2048,0, 10,0 - KERNEL1x8_4 -2048,0, 11,0 - KERNEL1x8_4 -2048,0, 12,0 - KERNEL1x8_4 -2048,0, 13,0 - KERNEL1x8_4 -2048,0, 14,0 - KERNEL1x8_4 -2048,0, 15,1 - - bdnz LSGEMM_1x8_LOOP - MY_ALIGN - addi AO,AO, -2048 - MY_ALIGN -LSGEMM_1x8_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 63 -#else - andi. L, K, 63 -#endif - ble LSGEMM_1x8_SAVE - MY_ALIGN -LSGEMM_1x8_SUB2: - andi. T10,L, 32 - ble LSGEMM_1x8_SUB2_16 - KERNEL1x8_4 0,0, 0,0 - KERNEL1x8_4 0,0, 1,0 - KERNEL1x8_4 0,0, 2,0 - KERNEL1x8_4 0,0, 3,0 - KERNEL1x8_4 0,0, 4,0 - KERNEL1x8_4 0,0, 5,0 - KERNEL1x8_4 0,0, 6,0 - KERNEL1x8_4 0,0, 7,1 - MY_ALIGN -LSGEMM_1x8_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_1x8_SUB2_8 - KERNEL1x8_4 0,0, 0,0 - KERNEL1x8_4 0,0, 1,0 - KERNEL1x8_4 0,0, 2,0 - KERNEL1x8_4 0,0, 3,1 - MY_ALIGN -LSGEMM_1x8_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_1x8_SUB2_4 - KERNEL1x8_4 0,0, 0,0 - KERNEL1x8_4 0,0, 1,1 - MY_ALIGN -LSGEMM_1x8_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_1x8_SUB2_2 - KERNEL1x8_4 0,0, 0,1 - MY_ALIGN -LSGEMM_1x8_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_1x8_SUB2_1 - KERNEL1x8_2 0,0, 0,1 - MY_ALIGN -LSGEMM_1x8_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_1x8_SAVE - KERNEL1x8 - - MY_ALIGN -LSGEMM_1x8_SAVE: - SAVE1x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,1 -#endif - MY_ALIGN -LSGEMM_1x8_END: - andi. I, M, 4 - ble LSGEMM_1x4_END - - MY_ALIGN -LSGEMM_1x4_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,4,1 - srawi. L, T11, 6 /**(T11 ) % 64x */ -#else - srawi. L, K, 6 /**(K ) % 64x */ -#endif - - ZERO1x4 - ble LSGEMM_1x4_SUB0 - - - mtctr L - - MY_ALIGN - -LSGEMM_1x4_LOOP: - - KERNEL1x4_4 0,0, 0,0 - KERNEL1x4_4 0,0, 1,0 - KERNEL1x4_4 0,0, 2,0 - KERNEL1x4_4 0,0, 3,0 - KERNEL1x4_4 0,0, 4,0 - KERNEL1x4_4 0,0, 5,0 - KERNEL1x4_4 0,0, 6,0 - KERNEL1x4_4 0,0, 7,0 - KERNEL1x4_4 0,0, 8,0 - KERNEL1x4_4 0,0, 9,0 - KERNEL1x4_4 0,0, 10,0 - KERNEL1x4_4 0,0, 11,0 - KERNEL1x4_4 0,0, 12,0 - KERNEL1x4_4 0,0, 13,0 - KERNEL1x4_4 0,0, 14,0 - KERNEL1x4_4 0,0, 15,1 - - bdnz LSGEMM_1x4_LOOP - MY_ALIGN - - MY_ALIGN -LSGEMM_1x4_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 63 -#else - andi. L, K, 63 -#endif - ble LSGEMM_1x4_SAVE - MY_ALIGN -LSGEMM_1x4_SUB2: - andi. 
T10,L, 32 - ble LSGEMM_1x4_SUB2_16 - KERNEL1x4_4 0,0, 0,0 - KERNEL1x4_4 0,0, 1,0 - KERNEL1x4_4 0,0, 2,0 - KERNEL1x4_4 0,0, 3,0 - KERNEL1x4_4 0,0, 4,0 - KERNEL1x4_4 0,0, 5,0 - KERNEL1x4_4 0,0, 6,0 - KERNEL1x4_4 0,0, 7,1 - MY_ALIGN -LSGEMM_1x4_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_1x4_SUB2_8 - KERNEL1x4_4 0,0, 0,0 - KERNEL1x4_4 0,0, 1,0 - KERNEL1x4_4 0,0, 2,0 - KERNEL1x4_4 0,0, 3,1 - MY_ALIGN -LSGEMM_1x4_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_1x4_SUB2_4 - KERNEL1x4_4 0,0, 0,0 - KERNEL1x4_4 0,0, 1,1 - MY_ALIGN -LSGEMM_1x4_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_1x4_SUB2_2 - KERNEL1x4_4 0,0, 0,1 - MY_ALIGN -LSGEMM_1x4_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_1x4_SUB2_1 - KERNEL1x4_2 0,0, 0,1 - MY_ALIGN -LSGEMM_1x4_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_1x4_SAVE - KERNEL1x4 - - MY_ALIGN -LSGEMM_1x4_SAVE: - SAVE1x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,1 -#endif - MY_ALIGN -LSGEMM_1x4_END: - andi. I, M, 2 - ble LSGEMM_1x2_END - - MY_ALIGN -LSGEMM_1x2_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,2,1 - srawi. L, T11, 6 /**(T11 ) % 64x */ -#else - srawi. L, K, 6 /**(K ) % 64x */ -#endif - - ZERO1x2 - ble LSGEMM_1x2_SUB0 - - - mtctr L - - MY_ALIGN - -LSGEMM_1x2_LOOP: - - KERNEL1x2_4 0,0, 0,0 - KERNEL1x2_4 0,0, 1,0 - KERNEL1x2_4 0,0, 2,0 - KERNEL1x2_4 0,0, 3,0 - KERNEL1x2_4 0,0, 4,0 - KERNEL1x2_4 0,0, 5,0 - KERNEL1x2_4 0,0, 6,0 - KERNEL1x2_4 0,0, 7,0 - KERNEL1x2_4 0,0, 8,0 - KERNEL1x2_4 0,0, 9,0 - KERNEL1x2_4 0,0, 10,0 - KERNEL1x2_4 0,0, 11,0 - KERNEL1x2_4 0,0, 12,0 - KERNEL1x2_4 0,0, 13,0 - KERNEL1x2_4 0,0, 14,0 - KERNEL1x2_4 0,0, 15,1 - - bdnz LSGEMM_1x2_LOOP - MY_ALIGN - - MY_ALIGN -LSGEMM_1x2_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 63 -#else - andi. L, K, 63 -#endif - ble LSGEMM_1x2_SAVE - MY_ALIGN -LSGEMM_1x2_SUB2: - andi. T10,L, 32 - ble LSGEMM_1x2_SUB2_16 - KERNEL1x2_4 0,0, 0,0 - KERNEL1x2_4 0,0, 1,0 - KERNEL1x2_4 0,0, 2,0 - KERNEL1x2_4 0,0, 3,0 - KERNEL1x2_4 0,0, 4,0 - KERNEL1x2_4 0,0, 5,0 - KERNEL1x2_4 0,0, 6,0 - KERNEL1x2_4 0,0, 7,1 - MY_ALIGN -LSGEMM_1x2_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_1x2_SUB2_8 - KERNEL1x2_4 0,0, 0,0 - KERNEL1x2_4 0,0, 1,0 - KERNEL1x2_4 0,0, 2,0 - KERNEL1x2_4 0,0, 3,1 - MY_ALIGN -LSGEMM_1x2_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_1x2_SUB2_4 - KERNEL1x2_4 0,0, 0,0 - KERNEL1x2_4 0,0, 1,1 - MY_ALIGN -LSGEMM_1x2_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_1x2_SUB2_2 - KERNEL1x2_4 0,0, 0,1 - MY_ALIGN -LSGEMM_1x2_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_1x2_SUB2_1 - KERNEL1x2_2 0,0, 0,1 - MY_ALIGN -LSGEMM_1x2_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_1x2_SAVE - KERNEL1x2 - - MY_ALIGN -LSGEMM_1x2_SAVE: - SAVE1x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,1 -#endif - MY_ALIGN -LSGEMM_1x2_END: - andi. I, M, 1 - ble LSGEMM_1x1_END - - MY_ALIGN -LSGEMM_1x1_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,1,1 - srawi. L, T11, 6 /**(T11 ) % 64x */ -#else - srawi. L, K, 6 /**(K ) % 64x */ -#endif - - ZERO1x1 - ble LSGEMM_1x1_SUB0 - - - mtctr L - - MY_ALIGN - -LSGEMM_1x1_LOOP: - - KERNEL1x1_16 0,0, 0,0 - KERNEL1x1_16 0,0, 1,0 - KERNEL1x1_16 0,0, 2,0 - KERNEL1x1_16 0,0, 3,1 - - bdnz LSGEMM_1x1_LOOP - MY_ALIGN - - MY_ALIGN -LSGEMM_1x1_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 63 -#else - andi. L, K, 63 -#endif - ble LSGEMM_1x1_SAVE - MY_ALIGN -LSGEMM_1x1_SUB2: - andi. 
T10,L, 32 - ble LSGEMM_1x1_SUB2_16 - KERNEL1x1_16 0,0, 0,0 - KERNEL1x1_16 0,0, 1,1 - MY_ALIGN -LSGEMM_1x1_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_1x1_SUB2_8 - KERNEL1x1_16 0,0, 0,1 - MY_ALIGN -LSGEMM_1x1_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_1x1_SUB2_4 - KERNEL1x1_8 0,0, 0,1 - MY_ALIGN -LSGEMM_1x1_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_1x1_SUB2_2 - KERNEL1x1_4 0,0, 0,1 - MY_ALIGN -LSGEMM_1x1_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_1x1_SUB2_1 - KERNEL1x1_2 0,0, 0,1 - MY_ALIGN -LSGEMM_1x1_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_1x1_SAVE - KERNEL1x1 - - MY_ALIGN -LSGEMM_1x1_SAVE: - SAVE1x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,1 -#endif - MY_ALIGN -LSGEMM_1x1_END: - slwi T1, K, 2 - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 1 -#endif +#define MY_ALIGN .align 3 +b L8 + + MY_ALIGN +LSGEMM_L8x16_LMAIN_SUB: + LOAD8x16_2 + MY_ALIGN + +LSGEMM_L8x16_LOOP: + KERNEL8x16_L2 128,64,0,0 +LSGEMM_L8x16_K128: + KERNEL8x16_L2 128,64,1,0 + KERNEL8x16_I1_L4_2 128,64, 1,0 + KERNEL8x16_I1_L4_2 128,64, 2,0 + KERNEL8x16_I1_L4_2 128,64, 3,0 + KERNEL8x16_I1_L4_2 128,64, 4,0 + KERNEL8x16_I1_L4_2 128,64, 5,0 + KERNEL8x16_I1_L4_2 128,64, 6,0 + KERNEL8x16_I1_L4_2 128,64, 7,0 + KERNEL8x16_I1_L4_2 128,64, 8,0 + KERNEL8x16_I1_L4_2 128,64, 9,0 + KERNEL8x16_I1_L4_2 128,64, 10,0 + KERNEL8x16_I1_L4_2 128,64, 11,0 + KERNEL8x16_I1_L4_2 128,64, 12,0 + KERNEL8x16_I1_L4_2 128,64, 13,0 + KERNEL8x16_I1_L4_2 128,64, 14,0 + KERNEL8x16_I1_L4_2 128,64, 15,0 + KERNEL8x16_I1_L4_2 128,64, 16,0 + KERNEL8x16_I1_L4_2 128,64, 17,0 + KERNEL8x16_I1_L4_2 128,64, 18,0 + KERNEL8x16_I1_L4_2 128,64, 19,0 + KERNEL8x16_I1_L4_2 128,64, 20,0 + KERNEL8x16_I1_L4_2 128,64, 21,0 + KERNEL8x16_I1_L4_2 128,64, 22,0 + KERNEL8x16_I1_L4_2 128,64, 23,0 + KERNEL8x16_I1_L4_2 128,64, 24,0 + KERNEL8x16_I1_L4_2 128,64, 25,0 + KERNEL8x16_I1_L4_2 128,64, 26,0 + KERNEL8x16_I1_L4_2 128,64, 27,0 + KERNEL8x16_I1_L4_2 128,64, 28,0 + KERNEL8x16_I1_L4_2 128,64, 29,0 + KERNEL8x16_I1_L4_2 128,64, 30,0 + KERNEL8x16_I1_L4_2 128,64, 31,1 + bdnz LSGEMM_L8x16_LOOP + + MY_ALIGN +LSGEMM_L8x16_LOOP_END: + END8x16_2 + blr + + MY_ALIGN +LSGEMM_L8x16_L64_SUB: + LOAD8x16_2 + KERNEL8x16_I1_L4_2 128,64, 0,0 + KERNEL8x16_I1_L4_2 128,64, 1,0 + KERNEL8x16_I1_L4_2 128,64, 2,0 + KERNEL8x16_I1_L4_2 128,64,3,0 + KERNEL8x16_I1_L4_2 128,64,4,0 + KERNEL8x16_I1_L4_2 128,64,5,0 + KERNEL8x16_I1_L4_2 128,64,6,0 + KERNEL8x16_I1_L4_2 128,64,7,0 + KERNEL8x16_I1_L4_2 128,64,8,0 + KERNEL8x16_I1_L4_2 128,64,9,0 + KERNEL8x16_I1_L4_2 128,64,10,0 + KERNEL8x16_I1_L4_2 128,64,11,0 + KERNEL8x16_I1_L4_2 128,64,12,0 + KERNEL8x16_I1_L4_2 128,64,13,0 + KERNEL8x16_I1_L4_2 128,64,14,0 + KERNEL8x16_I1_L4_3 128,64,15,1 + blr +LSGEMM_L8x16_L32_SUB: + LOAD8x16_2 + KERNEL8x16_I1_L4_2 128,64,0,0 + KERNEL8x16_I1_L4_2 128,64,1,0 + KERNEL8x16_I1_L4_2 128,64,2,0 + KERNEL8x16_I1_L4_2 128,64,3,0 + KERNEL8x16_I1_L4_2 128,64,4,0 + KERNEL8x16_I1_L4_2 128,64,5,0 + KERNEL8x16_I1_L4_2 128,64,6,0 + KERNEL8x16_I1_L4_3 128,64,7,1 + blr + +LSGEMM_L8x16_L16_SUB: + LOAD8x16_2 + KERNEL8x16_I1_L4_2 128,64,0,0 + KERNEL8x16_I1_L4_2 128,64,1,0 + KERNEL8x16_I1_L4_2 128,64,2,0 + KERNEL8x16_I1_L4_3 128,64,3,1 + blr + +L8: +#if defined(TRMMKERNEL) && !defined(LEFT) + neg TEMP_REG, OFFSET +#endif + + srawi. J, N, 3 + + ble LSGEMM_L8_END + +LSGEMM_L8_BEGIN: + + li T1, 128 + li T2, 256 + + mr AO, A + mr CO, C + slwi T3, LDC , 3 + add C, C, T3 + + dcbt A, T1 + dcbt A, T2 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. 
I, M, 4 + ble LSGEMM_L8x16_END + + MY_ALIGN +LSGEMM_L8x16_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,8 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,16,8 + mr T12, T11 + addi T12,T12, -2 + srawi. L, T12, 7 /**(T11-2) % 128x */ +#else + mr T12, K + addi T12,T12, -2 + srawi. L, T12, 7 /**(K-2) % 128x */ +#endif + + ZERO8x16 + ble LSGEMM_L8x16_SUB0 + mtctr L + bl LSGEMM_L8x16_LMAIN_SUB + andi. L, T12, 127 + ble LSGEMM_L8x16_SAVE + b LSGEMM_L8x16_SUB2 + MY_ALIGN +LSGEMM_L8x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 255 + cmpwi T11,129 +#else + andi. L, K, 255 + cmpwi K,129 +#endif + li T10,1 + bne CMP8x16_128K + addi BO,BO,-32 + addi AO,AO,-64 + LOAD8x16 64,32 + END8x16_WITHOUT_ADD + LOAD8x16_2O AO,BO, 128, 64 + mtctr T10 + bl LSGEMM_L8x16_K128 + b LSGEMM_L8x16_SAVE +CMP8x16_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T11,128 +#else + cmpwi K,128 +#endif + bne LSGEMM_L8x16_SUB2 + MY_ALIGN + mtctr T10 + addi BO,BO,-64 + addi AO,AO,-128 + LOAD8x16_2O AO,BO, 128,64 + bl LSGEMM_L8x16_K128 + b LSGEMM_L8x16_SAVE + MY_ALIGN +LSGEMM_L8x16_SUB2: + andi. T10,L,64 + ble LSGEMM_L8x16_SUB2_32 + bl LSGEMM_L8x16_L64_SUB + MY_ALIGN +LSGEMM_L8x16_SUB2_32: + andi. T10,L, 32 + ble LSGEMM_L8x16_SUB2_16 + bl LSGEMM_L8x16_L32_SUB + MY_ALIGN +LSGEMM_L8x16_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_L8x16_SUB2_8 + bl LSGEMM_L8x16_L16_SUB + MY_ALIGN +LSGEMM_L8x16_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_L8x16_SUB2_4 + LOAD8x16_2 + KERNEL8x16_I1_L4_2 128,64, 0,0 + KERNEL8x16_I1_L4_3 128,64, 1,1 + MY_ALIGN +LSGEMM_L8x16_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L8x16_SUB2_2 + LOAD8x16_2 + KERNEL8x16_I1_L4_3 128,64, 0,1 + MY_ALIGN +LSGEMM_L8x16_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L8x16_SUB2_1 + LOAD8x16_2 + KERNEL8x16_E2 128,64, 0,1 + MY_ALIGN +LSGEMM_L8x16_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L8x16_SAVE + KERNEL8x16 0 + + + MY_ALIGN +LSGEMM_L8x16_SAVE: + SAVE8x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,8 +#endif + addic. I, I, -1 + bgt+ LSGEMM_L8x16_BEGIN + MY_ALIGN +LSGEMM_L8x16_END: +LSGEMM_L8x8_BEGIN: + andi. T2, M, 15 + ble LSGEMM_L8x1_END + + andi. T1, M, 8 + ble LSGEMM_L8x8_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,8 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,8,8 + mr T12, T11 + addi T12,T12, -1 + srawi. L, T12, 4 /**(T11-1) % 16x */ +#else + mr T12, K + addi T12,T12, -1 + srawi. L, T12, 4 /**(K-1) % 16x */ +#endif + + ZERO8x8 + ble LSGEMM_L8x8_SUB0 + + MY_ALIGN +LSGEMM_L8x8_LOOP_START: + + LOAD8x8_0 /*we already zeroed */ + mtctr L + + MY_ALIGN + +LSGEMM_L8x8_LOOP: + + KERNEL8x8_I1_L4_2 32,32, 0,0 + KERNEL8x8_I1_L4_2 32,32, 1,0 + KERNEL8x8_I1_L4_2 32,32, 2,0 + KERNEL8x8_I1_L4_2 32,32, 3,1 + + bdnz LSGEMM_L8x8_LOOP + + MY_ALIGN +LSGEMM_L8x8_LOOP_END: + + END8x8 0, AO, BO, 32, 32 + + b LSGEMM_L8x8_SUB1 + MY_ALIGN +LSGEMM_L8x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 31 +#else + andi. L, K, 31 +#endif + b LSGEMM_L8x8_SUB2 + MY_ALIGN +LSGEMM_L8x8_SUB1: +#if defined(TRMMKERNEL) + andi. L, T12, 15 +#else + andi. L, T12, 15 +#endif + ble LSGEMM_L8x8_SAVE + MY_ALIGN +LSGEMM_L8x8_SUB2: + + srawi. T1,L, 3 + ble LSGEMM_L8x8_SUB2_4 + mtctr T1 + MY_ALIGN +LSGEMM_L8x8_SUB2_LOOP: + LOAD8x8_0 + KERNEL8x8_I1_L4_2 32,32, 0,0 + KERNEL8x8_I1_L4_3 32,32, 1,1 + bdnz LSGEMM_L8x8_SUB2_LOOP + MY_ALIGN +LSGEMM_L8x8_SUB2_4: + andi. 
T1,L, 4 + ble LSGEMM_L8x8_SUB2_2 + LOAD8x8_0 + KERNEL8x8_I1_L4_3 32,32, 0,1 + MY_ALIGN +LSGEMM_L8x8_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L8x8_SUB2_1 + LOAD8x8_0 + KERNEL8x8_I1_L2_3 32,32, 0,1 + MY_ALIGN +LSGEMM_L8x8_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L8x8_SAVE + KERNEL8x8 0 + + + MY_ALIGN +LSGEMM_L8x8_SAVE: + SAVE8x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,8 +#endif + MY_ALIGN +LSGEMM_L8x8_END: +LSGEMM_L8x4_BEGIN: + andi. T2, M, 15 + ble LSGEMM_L8x1_END + + andi. T1, M, 4 + ble LSGEMM_L8x4_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,8 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,4,8 + mr T12, T11 + addi T12,T12, -1 + srawi. L, T12, 4 /**(T11-1) % 16x */ +#else + mr T12, K + addi T12,T12, -1 + srawi. L, T12, 4 /**(K-1) % 16x */ +#endif + + ZERO8x4 + ble LSGEMM_L8x4_SUB0 + + MY_ALIGN +LSGEMM_L8x4_LOOP_START: + + LOAD8x4_0 /*we already zeroed */ + mtctr L + + MY_ALIGN + +LSGEMM_L8x4_LOOP: + + KERNEL8x4_I1_L4_2 16,32, 0,0 + KERNEL8x4_I1_L4_2 16,32, 1,0 + KERNEL8x4_I1_L4_2 16,32, 2,0 + KERNEL8x4_I1_L4_2 16,32, 3,1 + + bdnz LSGEMM_L8x4_LOOP + + MY_ALIGN +LSGEMM_L8x4_LOOP_END: + + END8x4 0, AO, BO, 16, 32 + + b LSGEMM_L8x4_SUB1 + MY_ALIGN +LSGEMM_L8x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 31 +#else + andi. L, K, 31 +#endif + b LSGEMM_L8x4_SUB2 + MY_ALIGN +LSGEMM_L8x4_SUB1: +#if defined(TRMMKERNEL) + andi. L, T12, 15 +#else + andi. L, T12, 15 +#endif + ble LSGEMM_L8x4_SAVE + MY_ALIGN +LSGEMM_L8x4_SUB2: + + srawi. T1,L, 3 + ble LSGEMM_L8x4_SUB2_4 + mtctr T1 + MY_ALIGN +LSGEMM_L8x4_SUB2_LOOP: + LOAD8x4_0 + KERNEL8x4_I1_L4_2 16,32, 0,0 + KERNEL8x4_I1_L4_3 16,32, 1,1 + bdnz LSGEMM_L8x4_SUB2_LOOP + MY_ALIGN +LSGEMM_L8x4_SUB2_4: + andi. T1,L, 4 + ble LSGEMM_L8x4_SUB2_2 + LOAD8x4_0 + KERNEL8x4_I1_L4_3 16,32, 0,1 + MY_ALIGN +LSGEMM_L8x4_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L8x4_SUB2_1 + LOAD8x4_0 + KERNEL8x4_I1_L2_3 16,32, 0,1 + MY_ALIGN +LSGEMM_L8x4_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L8x4_SAVE + KERNEL8x4 0 + + + MY_ALIGN +LSGEMM_L8x4_SAVE: + SAVE8x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,8 +#endif + MY_ALIGN +LSGEMM_L8x4_END: +LSGEMM_L8x2_BEGIN: + andi. T1, M, 2 + ble LSGEMM_L8x2_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,8 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,2,8 + srawi. L, T11, 3 /**(T11) % 8x */ +#else + srawi. L, K, 3 /**(K) % 8x */ +#endif + + ZERO8x2 + ble LSGEMM_L8x2_SUB0 + + MY_ALIGN +LSGEMM_L8x2_LOOP_START: + mtctr L + + MY_ALIGN + +LSGEMM_L8x2_LOOP: + + KERNEL8x2_2 0,0, 0,0 + KERNEL8x2_2 0,0, 1,0 + KERNEL8x2_2 0,0, 2,0 + KERNEL8x2_2 0,0, 3,1 + + bdnz LSGEMM_L8x2_LOOP + + MY_ALIGN +LSGEMM_L8x2_LOOP_END: + +LSGEMM_L8x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 7 +#else + andi. L, K, 7 +#endif + ble LSGEMM_L8x2_SAVE + MY_ALIGN +LSGEMM_L8x2_SUB2: + andi. T1,L, 4 + ble LSGEMM_L8x2_SUB2_2 + KERNEL8x2_2 0,0, 0,0 + KERNEL8x2_2 0,0, 1,1 + MY_ALIGN +LSGEMM_L8x2_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L8x2_SUB2_1 + KERNEL8x2_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L8x2_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L8x2_SAVE + KERNEL8x2 + + MY_ALIGN +LSGEMM_L8x2_SAVE: + SAVE8x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,8 +#endif + MY_ALIGN +LSGEMM_L8x2_END: +LSGEMM_L8x1_BEGIN: + andi. T1, M, 1 + ble LSGEMM_L8x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,8 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,1,8 + srawi. 
L, T11, 3 /**(T11) % 8x */ +#else + srawi. L, K, 3 /**(K) % 8x */ +#endif + + ZERO8x1 + ble LSGEMM_L8x1_SUB0 + + MY_ALIGN +LSGEMM_L8x1_LOOP_START: + mtctr L + + MY_ALIGN + +LSGEMM_L8x1_LOOP: + + KERNEL8x1_4 0,0, 0,0 + KERNEL8x1_4 0,0, 1,1 + + bdnz LSGEMM_L8x1_LOOP + + MY_ALIGN +LSGEMM_L8x1_LOOP_END: + +LSGEMM_L8x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 7 +#else + andi. L, K, 7 +#endif + ble LSGEMM_L8x1_SAVE + MY_ALIGN +LSGEMM_L8x1_SUB2: + andi. T1,L, 4 + ble LSGEMM_L8x1_SUB2_2 + KERNEL8x1_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L8x1_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L8x1_SUB2_1 + KERNEL8x1_2 + MY_ALIGN +LSGEMM_L8x1_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L8x1_SAVE + KERNEL8x1 + + MY_ALIGN +LSGEMM_L8x1_SAVE: + SAVE8x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,8 +#endif + MY_ALIGN +LSGEMM_L8x1_END: + + slwi T1, K, 5 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 8 +#endif + addic. J, J, -1 + bgt LSGEMM_L8_BEGIN + + +LSGEMM_L8_END: + +/* b LSGEMM_L4_BEGIN*/ + andi. T1, N, 4 + ble LSGEMM_L4_END +LSGEMM_L4_BEGIN: + + + mr AO, A + mr CO, C + slwi T3, LDC , 2 + add C, C, T3 + +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 4 + ble LSGEMM_L4x16_END + + MY_ALIGN +LSGEMM_L4x16_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,4 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,16,4 + mr T12, T11 + addi T12,T12, -1 + srawi. L, T12, 6 /**(T11-1) % 64x */ +#else + mr T12, K + addi T12,T12, -1 + srawi. L, T12, 6 /**(K-1) % 64x */ +#endif + + ZERO4x16 + ble LSGEMM_L4x16_SUB0 + + MY_ALIGN +LSGEMM_L4x16_LOOP_START: + + LOAD4x16_0 /*we already zeroed */ + ##OffsetA=64 OffsetB=16 + addi AO,AO,2112 + addi BO,BO,16 + + mtctr L + + MY_ALIGN + +LSGEMM_L4x16_LOOP: + + KERNEL4x16_I1_L4_2 -2048,0, 0,0 + KERNEL4x16_I1_L4_2 -2048,0, 1,0 + KERNEL4x16_I1_L4_2 -2048,0, 2,0 + KERNEL4x16_I1_L4_2 -2048,0, 3,0 + KERNEL4x16_I1_L4_2 -2048,0, 4,0 + KERNEL4x16_I1_L4_2 -2048,0, 5,0 + KERNEL4x16_I1_L4_2 -2048,0, 6,0 + KERNEL4x16_I1_L4_2 -2048,0, 7,0 + KERNEL4x16_I1_L4_2 -2048,0, 8,0 + KERNEL4x16_I1_L4_2 -2048,0, 9,0 + KERNEL4x16_I1_L4_2 -2048,0, 10,0 + KERNEL4x16_I1_L4_2 -2048,0, 11,0 + KERNEL4x16_I1_L4_2 -2048,0, 12,0 + KERNEL4x16_I1_L4_2 -2048,0, 13,0 + KERNEL4x16_I1_L4_2 -2048,0, 14,0 + KERNEL4x16_I1_L4_2 -2048,0, 15,1 + + bdnz LSGEMM_L4x16_LOOP + + MY_ALIGN +LSGEMM_L4x16_LOOP_END: + + END4x16 0, AO, BO, -2048, 0 + + b LSGEMM_L4x16_SUB1 + MY_ALIGN +LSGEMM_L4x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 127 +#else + andi. L, K, 127 +#endif + b LSGEMM_L4x16_SUB2 + MY_ALIGN +LSGEMM_L4x16_SUB1: +#if defined(TRMMKERNEL) + andi. L, T12, 63 +#else + andi. L, T12, 63 +#endif + ble LSGEMM_L4x16_SAVE + MY_ALIGN +LSGEMM_L4x16_SUB2: + + srawi. T10,L, 5 + ble LSGEMM_L4x16_SUB2_16 + mtctr T10 + MY_ALIGN +LSGEMM_L4x16_SUB2_LOOP: + LOAD4x16_0 + KERNEL4x16_I1_L4_2 64,16, 0,0 + KERNEL4x16_I1_L4_2 64,16, 1,0 + KERNEL4x16_I1_L4_2 64,16, 2,0 + KERNEL4x16_I1_L4_2 64,16, 3,0 + KERNEL4x16_I1_L4_2 64,16, 4,0 + KERNEL4x16_I1_L4_2 64,16, 5,0 + KERNEL4x16_I1_L4_2 64,16, 6,0 + KERNEL4x16_I1_L4_3 64,16, 7,1 + bdnz LSGEMM_L4x16_SUB2_LOOP + MY_ALIGN +LSGEMM_L4x16_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_L4x16_SUB2_8 + LOAD4x16_0 + KERNEL4x16_I1_L4_2 64,16, 0,0 + KERNEL4x16_I1_L4_2 64,16, 1,0 + KERNEL4x16_I1_L4_2 64,16, 2,0 + KERNEL4x16_I1_L4_3 64,16, 3,1 + MY_ALIGN +LSGEMM_L4x16_SUB2_8: + andi. 
T10,L, 8 + ble LSGEMM_L4x16_SUB2_4 + LOAD4x16_0 + KERNEL4x16_I1_L4_2 64,16, 0,0 + KERNEL4x16_I1_L4_3 64,16, 1,1 + MY_ALIGN +LSGEMM_L4x16_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L4x16_SUB2_2 + LOAD4x16_0 + KERNEL4x16_I1_L4_3 64,16, 0,1 + MY_ALIGN +LSGEMM_L4x16_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L4x16_SUB2_1 + LOAD4x16_0 + KERNEL4x16_I1_L2_3 64,16, 0,1 + MY_ALIGN +LSGEMM_L4x16_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L4x16_SAVE + KERNEL4x16 0 +# addic. L, L, -1 +# bgt LSGEMM_L4x16_SUB2 + + MY_ALIGN +LSGEMM_L4x16_SAVE: + SAVE4x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,4 +#endif + addic. I, I, -1 + bgt+ LSGEMM_L4x16_BEGIN + MY_ALIGN +LSGEMM_L4x16_END: +LSGEMM_L4x8_BEGIN: + andi. T2, M, 15 + ble LSGEMM_L4x1_END + + andi. T1, M, 8 + ble LSGEMM_L4x8_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,8,4 + mr T12, T11 + addi T12,T12, -1 + srawi. L, T12, 4 /**(T11-1) % 16x */ +#else + mr T12, K + addi T12,T12, -1 + srawi. L, T12, 4 /**(K-1) % 16x */ +#endif + + ZERO4x8 + ble LSGEMM_L4x8_SUB0 + + MY_ALIGN +LSGEMM_L4x8_LOOP_START: + + LOAD4x8_0 /*we already zeroed */ + mtctr L + + MY_ALIGN + +LSGEMM_L4x8_LOOP: + + KERNEL4x8_I1_L4_2 32,16, 0,0 + KERNEL4x8_I1_L4_2 32,16, 1,0 + KERNEL4x8_I1_L4_2 32,16, 2,0 + KERNEL4x8_I1_L4_2 32,16, 3,1 + + bdnz LSGEMM_L4x8_LOOP + + MY_ALIGN +LSGEMM_L4x8_LOOP_END: + + END4x8 0, AO, BO, 32, 16 + + b LSGEMM_L4x8_SUB1 + MY_ALIGN +LSGEMM_L4x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 31 +#else + andi. L, K, 31 +#endif + b LSGEMM_L4x8_SUB2 + MY_ALIGN +LSGEMM_L4x8_SUB1: +#if defined(TRMMKERNEL) + andi. L, T12, 15 +#else + andi. L, T12, 15 +#endif + ble LSGEMM_L4x8_SAVE + MY_ALIGN +LSGEMM_L4x8_SUB2: + + srawi. T1,L, 3 + ble LSGEMM_L4x8_SUB2_4 + mtctr T1 + MY_ALIGN +LSGEMM_L4x8_SUB2_LOOP: + LOAD4x8_0 + KERNEL4x8_I1_L4_2 32,16, 0,0 + KERNEL4x8_I1_L4_3 32,16, 1,1 + bdnz LSGEMM_L4x8_SUB2_LOOP + MY_ALIGN +LSGEMM_L4x8_SUB2_4: + andi. T1,L, 4 + ble LSGEMM_L4x8_SUB2_2 + LOAD4x8_0 + KERNEL4x8_I1_L4_3 32,16, 0,1 + MY_ALIGN +LSGEMM_L4x8_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L4x8_SUB2_1 + LOAD4x8_0 + KERNEL4x8_I1_L2_3 32,16, 0,1 + MY_ALIGN +LSGEMM_L4x8_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L4x8_SAVE + KERNEL4x8 0 + + + MY_ALIGN +LSGEMM_L4x8_SAVE: + SAVE4x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,4 +#endif + MY_ALIGN +LSGEMM_L4x8_END: +LSGEMM_L4x4_BEGIN: + andi. T2, M, 15 + ble LSGEMM_L4x1_END + + andi. T1, M, 4 + ble LSGEMM_L4x4_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,4,4 + mr T12, T11 + addi T12,T12, -1 + srawi. L, T12, 4 /**(T11-1) % 16x */ +#else + mr T12, K + addi T12,T12, -1 + srawi. L, T12, 4 /**(K-1) % 16x */ +#endif + + ZERO4x4 + ble LSGEMM_L4x4_SUB0 + + MY_ALIGN +LSGEMM_L4x4_LOOP_START: + + LOAD4x4_0 /*we already zeroed */ + mtctr L + + MY_ALIGN + +LSGEMM_L4x4_LOOP: + + KERNEL4x4_I1_L4_2 16,16, 0,0 + KERNEL4x4_I1_L4_2 16,16, 1,0 + KERNEL4x4_I1_L4_2 16,16, 2,0 + KERNEL4x4_I1_L4_2 16,16, 3,1 + + bdnz LSGEMM_L4x4_LOOP + + MY_ALIGN +LSGEMM_L4x4_LOOP_END: + + END4x4 0, AO, BO, 16, 16 + + b LSGEMM_L4x4_SUB1 + MY_ALIGN +LSGEMM_L4x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 31 +#else + andi. L, K, 31 +#endif + b LSGEMM_L4x4_SUB2 + MY_ALIGN +LSGEMM_L4x4_SUB1: +#if defined(TRMMKERNEL) + andi. L, T12, 15 +#else + andi. L, T12, 15 +#endif + ble LSGEMM_L4x4_SAVE + MY_ALIGN +LSGEMM_L4x4_SUB2: + + srawi. 
T1,L, 3 + ble LSGEMM_L4x4_SUB2_4 + mtctr T1 + MY_ALIGN +LSGEMM_L4x4_SUB2_LOOP: + LOAD4x4_0 + KERNEL4x4_I1_L4_2 16,16, 0,0 + KERNEL4x4_I1_L4_3 16,16, 1,1 + bdnz LSGEMM_L4x4_SUB2_LOOP + MY_ALIGN +LSGEMM_L4x4_SUB2_4: + andi. T1,L, 4 + ble LSGEMM_L4x4_SUB2_2 + LOAD4x4_0 + KERNEL4x4_I1_L4_3 16,16, 0,1 + MY_ALIGN +LSGEMM_L4x4_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L4x4_SUB2_1 + LOAD4x4_0 + KERNEL4x4_I1_L2_3 16,16, 0,1 + MY_ALIGN +LSGEMM_L4x4_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L4x4_SAVE + KERNEL4x4 0 + + + MY_ALIGN +LSGEMM_L4x4_SAVE: + SAVE4x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,4 +#endif + MY_ALIGN +LSGEMM_L4x4_END: +LSGEMM_L4x2_BEGIN: + andi. T1, M, 2 + ble LSGEMM_L4x2_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,2,4 + srawi. L, T11, 3 /**(T11) % 8x */ +#else + srawi. L, K, 3 /**(K) % 8x */ +#endif + + ZERO4x2 + ble LSGEMM_L4x2_SUB0 + + MY_ALIGN +LSGEMM_L4x2_LOOP_START: + mtctr L + + MY_ALIGN + +LSGEMM_L4x2_LOOP: + + KERNEL4x2_2 0,0, 0,0 + KERNEL4x2_2 0,0, 1,0 + KERNEL4x2_2 0,0, 2,0 + KERNEL4x2_2 0,0, 3,1 + + bdnz LSGEMM_L4x2_LOOP + + MY_ALIGN +LSGEMM_L4x2_LOOP_END: + +LSGEMM_L4x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 7 +#else + andi. L, K, 7 +#endif + ble LSGEMM_L4x2_SAVE + MY_ALIGN +LSGEMM_L4x2_SUB2: + andi. T1,L, 4 + ble LSGEMM_L4x2_SUB2_2 + KERNEL4x2_2 0,0, 0,0 + KERNEL4x2_2 0,0, 1,1 + MY_ALIGN +LSGEMM_L4x2_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L4x2_SUB2_1 + KERNEL4x2_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L4x2_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L4x2_SAVE + KERNEL4x2 + + MY_ALIGN +LSGEMM_L4x2_SAVE: + SAVE4x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,4 +#endif + MY_ALIGN +LSGEMM_L4x2_END: +LSGEMM_L4x1_BEGIN: + andi. T1, M, 1 + ble LSGEMM_L4x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,1,4 + srawi. L, T11, 3 /**(T11) % 8x */ +#else + srawi. L, K, 3 /**(K) % 8x */ +#endif + + ZERO4x1 + ble LSGEMM_L4x1_SUB0 + + MY_ALIGN +LSGEMM_L4x1_LOOP_START: + mtctr L + + MY_ALIGN + +LSGEMM_L4x1_LOOP: + + KERNEL4x1_4 0,0, 0,0 + KERNEL4x1_4 0,0, 1,1 + + bdnz LSGEMM_L4x1_LOOP + + MY_ALIGN +LSGEMM_L4x1_LOOP_END: + +LSGEMM_L4x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 7 +#else + andi. L, K, 7 +#endif + ble LSGEMM_L4x1_SAVE + MY_ALIGN +LSGEMM_L4x1_SUB2: + andi. T1,L, 4 + ble LSGEMM_L4x1_SUB2_2 + KERNEL4x1_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L4x1_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L4x1_SUB2_1 + KERNEL4x1_2 + MY_ALIGN +LSGEMM_L4x1_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L4x1_SAVE + KERNEL4x1 + + MY_ALIGN +LSGEMM_L4x1_SAVE: + SAVE4x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,4 +#endif + MY_ALIGN +LSGEMM_L4x1_END: + + slwi T1, K, 4 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 4 +#endif + + andi. T2, N, 3 + ble .L999 + +LSGEMM_L4_END: + andi. T1, N, 2 + ble LSGEMM_L2_END +LSGEMM_L2_BEGIN: + + + mr AO, A + mr CO, C + slwi T3, LDC , 1 + add C, C, T3 + +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 4 + ble LSGEMM_L2x16_END + + MY_ALIGN +LSGEMM_L2x16_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,2 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,16,2 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. 
L, K, 6 /**(K ) % 64x */ +#endif + + ZERO2x16 + ble LSGEMM_L2x16_SUB0 + addi AO,AO,2048 + + mtctr L + + MY_ALIGN + +LSGEMM_L2x16_LOOP: + + KERNEL2x16_4 -2048,0, 0,0 + KERNEL2x16_4 -2048,0, 1,0 + KERNEL2x16_4 -2048,0, 2,0 + KERNEL2x16_4 -2048,0, 3,0 + KERNEL2x16_4 -2048,0, 4,0 + KERNEL2x16_4 -2048,0, 5,0 + KERNEL2x16_4 -2048,0, 6,0 + KERNEL2x16_4 -2048,0, 7,0 + KERNEL2x16_4 -2048,0, 8,0 + KERNEL2x16_4 -2048,0, 9,0 + KERNEL2x16_4 -2048,0, 10,0 + KERNEL2x16_4 -2048,0, 11,0 + KERNEL2x16_4 -2048,0, 12,0 + KERNEL2x16_4 -2048,0, 13,0 + KERNEL2x16_4 -2048,0, 14,0 + KERNEL2x16_4 -2048,0, 15,1 + + bdnz LSGEMM_L2x16_LOOP + MY_ALIGN + addi AO,AO, -2048 + MY_ALIGN +LSGEMM_L2x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_L2x16_SAVE + MY_ALIGN +LSGEMM_L2x16_SUB2: + andi. T10,L, 32 + ble LSGEMM_L2x16_SUB2_16 + KERNEL2x16_4 0,0, 0,0 + KERNEL2x16_4 0,0, 1,0 + KERNEL2x16_4 0,0, 2,0 + KERNEL2x16_4 0,0, 3,0 + KERNEL2x16_4 0,0, 4,0 + KERNEL2x16_4 0,0, 5,0 + KERNEL2x16_4 0,0, 6,0 + KERNEL2x16_4 0,0, 7,1 + MY_ALIGN +LSGEMM_L2x16_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_L2x16_SUB2_8 + KERNEL2x16_4 0,0, 0,0 + KERNEL2x16_4 0,0, 1,0 + KERNEL2x16_4 0,0, 2,0 + KERNEL2x16_4 0,0, 3,1 + MY_ALIGN +LSGEMM_L2x16_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_L2x16_SUB2_4 + KERNEL2x16_4 0,0, 0,0 + KERNEL2x16_4 0,0, 1,1 + MY_ALIGN +LSGEMM_L2x16_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L2x16_SUB2_2 + KERNEL2x16_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x16_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L2x16_SUB2_1 + KERNEL2x16_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x16_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L2x16_SAVE + KERNEL2x16 + + MY_ALIGN +LSGEMM_L2x16_SAVE: + SAVE2x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,2 +#endif + addic. I, I, -1 + bgt+ LSGEMM_L2x16_BEGIN + MY_ALIGN +LSGEMM_L2x16_END: + andi. I, M, 8 + ble LSGEMM_L2x8_END + + MY_ALIGN +LSGEMM_L2x8_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,8,2 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO2x8 + ble LSGEMM_L2x8_SUB0 + addi AO,AO,2048 + + mtctr L + + MY_ALIGN + +LSGEMM_L2x8_LOOP: + + KERNEL2x8_4 -2048,0, 0,0 + KERNEL2x8_4 -2048,0, 1,0 + KERNEL2x8_4 -2048,0, 2,0 + KERNEL2x8_4 -2048,0, 3,0 + KERNEL2x8_4 -2048,0, 4,0 + KERNEL2x8_4 -2048,0, 5,0 + KERNEL2x8_4 -2048,0, 6,0 + KERNEL2x8_4 -2048,0, 7,0 + KERNEL2x8_4 -2048,0, 8,0 + KERNEL2x8_4 -2048,0, 9,0 + KERNEL2x8_4 -2048,0, 10,0 + KERNEL2x8_4 -2048,0, 11,0 + KERNEL2x8_4 -2048,0, 12,0 + KERNEL2x8_4 -2048,0, 13,0 + KERNEL2x8_4 -2048,0, 14,0 + KERNEL2x8_4 -2048,0, 15,1 + + bdnz LSGEMM_L2x8_LOOP + MY_ALIGN + addi AO,AO, -2048 + MY_ALIGN +LSGEMM_L2x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_L2x8_SAVE + MY_ALIGN +LSGEMM_L2x8_SUB2: + andi. T10,L, 32 + ble LSGEMM_L2x8_SUB2_16 + KERNEL2x8_4 0,0, 0,0 + KERNEL2x8_4 0,0, 1,0 + KERNEL2x8_4 0,0, 2,0 + KERNEL2x8_4 0,0, 3,0 + KERNEL2x8_4 0,0, 4,0 + KERNEL2x8_4 0,0, 5,0 + KERNEL2x8_4 0,0, 6,0 + KERNEL2x8_4 0,0, 7,1 + MY_ALIGN +LSGEMM_L2x8_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_L2x8_SUB2_8 + KERNEL2x8_4 0,0, 0,0 + KERNEL2x8_4 0,0, 1,0 + KERNEL2x8_4 0,0, 2,0 + KERNEL2x8_4 0,0, 3,1 + MY_ALIGN +LSGEMM_L2x8_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_L2x8_SUB2_4 + KERNEL2x8_4 0,0, 0,0 + KERNEL2x8_4 0,0, 1,1 + MY_ALIGN +LSGEMM_L2x8_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L2x8_SUB2_2 + KERNEL2x8_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x8_SUB2_2: + andi. 
T10,L, 2 + ble LSGEMM_L2x8_SUB2_1 + KERNEL2x8_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x8_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L2x8_SAVE + KERNEL2x8 + + MY_ALIGN +LSGEMM_L2x8_SAVE: + SAVE2x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,2 +#endif + MY_ALIGN +LSGEMM_L2x8_END: + andi. I, M, 4 + ble LSGEMM_L2x4_END + + MY_ALIGN +LSGEMM_L2x4_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,4,2 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO2x4 + ble LSGEMM_L2x4_SUB0 + + + mtctr L + + MY_ALIGN + +LSGEMM_L2x4_LOOP: + + KERNEL2x4_4 0,0, 0,0 + KERNEL2x4_4 0,0, 1,0 + KERNEL2x4_4 0,0, 2,0 + KERNEL2x4_4 0,0, 3,0 + KERNEL2x4_4 0,0, 4,0 + KERNEL2x4_4 0,0, 5,0 + KERNEL2x4_4 0,0, 6,0 + KERNEL2x4_4 0,0, 7,0 + KERNEL2x4_4 0,0, 8,0 + KERNEL2x4_4 0,0, 9,0 + KERNEL2x4_4 0,0, 10,0 + KERNEL2x4_4 0,0, 11,0 + KERNEL2x4_4 0,0, 12,0 + KERNEL2x4_4 0,0, 13,0 + KERNEL2x4_4 0,0, 14,0 + KERNEL2x4_4 0,0, 15,1 + + bdnz LSGEMM_L2x4_LOOP + MY_ALIGN + + MY_ALIGN +LSGEMM_L2x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_L2x4_SAVE + MY_ALIGN +LSGEMM_L2x4_SUB2: + andi. T10,L, 32 + ble LSGEMM_L2x4_SUB2_16 + KERNEL2x4_4 0,0, 0,0 + KERNEL2x4_4 0,0, 1,0 + KERNEL2x4_4 0,0, 2,0 + KERNEL2x4_4 0,0, 3,0 + KERNEL2x4_4 0,0, 4,0 + KERNEL2x4_4 0,0, 5,0 + KERNEL2x4_4 0,0, 6,0 + KERNEL2x4_4 0,0, 7,1 + MY_ALIGN +LSGEMM_L2x4_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_L2x4_SUB2_8 + KERNEL2x4_4 0,0, 0,0 + KERNEL2x4_4 0,0, 1,0 + KERNEL2x4_4 0,0, 2,0 + KERNEL2x4_4 0,0, 3,1 + MY_ALIGN +LSGEMM_L2x4_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_L2x4_SUB2_4 + KERNEL2x4_4 0,0, 0,0 + KERNEL2x4_4 0,0, 1,1 + MY_ALIGN +LSGEMM_L2x4_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L2x4_SUB2_2 + KERNEL2x4_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x4_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L2x4_SUB2_1 + KERNEL2x4_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x4_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L2x4_SAVE + KERNEL2x4 + + MY_ALIGN +LSGEMM_L2x4_SAVE: + SAVE2x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,2 +#endif + MY_ALIGN +LSGEMM_L2x4_END: + andi. I, M, 2 + ble LSGEMM_L2x2_END + + MY_ALIGN +LSGEMM_L2x2_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,2,2 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO2x2 + ble LSGEMM_L2x2_SUB0 + + + mtctr L + + MY_ALIGN + +LSGEMM_L2x2_LOOP: + + KERNEL2x2_4 0,0, 0,0 + KERNEL2x2_4 0,0, 1,0 + KERNEL2x2_4 0,0, 2,0 + KERNEL2x2_4 0,0, 3,0 + KERNEL2x2_4 0,0, 4,0 + KERNEL2x2_4 0,0, 5,0 + KERNEL2x2_4 0,0, 6,0 + KERNEL2x2_4 0,0, 7,0 + KERNEL2x2_4 0,0, 8,0 + KERNEL2x2_4 0,0, 9,0 + KERNEL2x2_4 0,0, 10,0 + KERNEL2x2_4 0,0, 11,0 + KERNEL2x2_4 0,0, 12,0 + KERNEL2x2_4 0,0, 13,0 + KERNEL2x2_4 0,0, 14,0 + KERNEL2x2_4 0,0, 15,1 + + bdnz LSGEMM_L2x2_LOOP + MY_ALIGN + + MY_ALIGN +LSGEMM_L2x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_L2x2_SAVE + MY_ALIGN +LSGEMM_L2x2_SUB2: + andi. T10,L, 32 + ble LSGEMM_L2x2_SUB2_16 + KERNEL2x2_4 0,0, 0,0 + KERNEL2x2_4 0,0, 1,0 + KERNEL2x2_4 0,0, 2,0 + KERNEL2x2_4 0,0, 3,0 + KERNEL2x2_4 0,0, 4,0 + KERNEL2x2_4 0,0, 5,0 + KERNEL2x2_4 0,0, 6,0 + KERNEL2x2_4 0,0, 7,1 + MY_ALIGN +LSGEMM_L2x2_SUB2_16: + andi. 
T10,L, 16 + ble LSGEMM_L2x2_SUB2_8 + KERNEL2x2_4 0,0, 0,0 + KERNEL2x2_4 0,0, 1,0 + KERNEL2x2_4 0,0, 2,0 + KERNEL2x2_4 0,0, 3,1 + MY_ALIGN +LSGEMM_L2x2_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_L2x2_SUB2_4 + KERNEL2x2_4 0,0, 0,0 + KERNEL2x2_4 0,0, 1,1 + MY_ALIGN +LSGEMM_L2x2_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L2x2_SUB2_2 + KERNEL2x2_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x2_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L2x2_SUB2_1 + KERNEL2x2_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x2_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L2x2_SAVE + KERNEL2x2 + + MY_ALIGN +LSGEMM_L2x2_SAVE: + SAVE2x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,2 +#endif + MY_ALIGN +LSGEMM_L2x2_END: + andi. I, M, 1 + ble LSGEMM_L2x1_END + + MY_ALIGN +LSGEMM_L2x1_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,1,2 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO2x1 + ble LSGEMM_L2x1_SUB0 + + + mtctr L + + MY_ALIGN + +LSGEMM_L2x1_LOOP: + + KERNEL2x1_4 0,0, 0,0 + KERNEL2x1_4 0,0, 1,0 + KERNEL2x1_4 0,0, 2,0 + KERNEL2x1_4 0,0, 3,0 + KERNEL2x1_4 0,0, 4,0 + KERNEL2x1_4 0,0, 5,0 + KERNEL2x1_4 0,0, 6,0 + KERNEL2x1_4 0,0, 7,0 + KERNEL2x1_4 0,0, 8,0 + KERNEL2x1_4 0,0, 9,0 + KERNEL2x1_4 0,0, 10,0 + KERNEL2x1_4 0,0, 11,0 + KERNEL2x1_4 0,0, 12,0 + KERNEL2x1_4 0,0, 13,0 + KERNEL2x1_4 0,0, 14,0 + KERNEL2x1_4 0,0, 15,1 + + bdnz LSGEMM_L2x1_LOOP + MY_ALIGN + + MY_ALIGN +LSGEMM_L2x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_L2x1_SAVE + MY_ALIGN +LSGEMM_L2x1_SUB2: + andi. T10,L, 32 + ble LSGEMM_L2x1_SUB2_16 + KERNEL2x1_4 0,0, 0,0 + KERNEL2x1_4 0,0, 1,0 + KERNEL2x1_4 0,0, 2,0 + KERNEL2x1_4 0,0, 3,0 + KERNEL2x1_4 0,0, 4,0 + KERNEL2x1_4 0,0, 5,0 + KERNEL2x1_4 0,0, 6,0 + KERNEL2x1_4 0,0, 7,1 + MY_ALIGN +LSGEMM_L2x1_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_L2x1_SUB2_8 + KERNEL2x1_4 0,0, 0,0 + KERNEL2x1_4 0,0, 1,0 + KERNEL2x1_4 0,0, 2,0 + KERNEL2x1_4 0,0, 3,1 + MY_ALIGN +LSGEMM_L2x1_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_L2x1_SUB2_4 + KERNEL2x1_4 0,0, 0,0 + KERNEL2x1_4 0,0, 1,1 + MY_ALIGN +LSGEMM_L2x1_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L2x1_SUB2_2 + KERNEL2x1_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x1_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L2x1_SUB2_1 + KERNEL2x1_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x1_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L2x1_SAVE + KERNEL2x1 + + MY_ALIGN +LSGEMM_L2x1_SAVE: + SAVE2x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,2 +#endif + MY_ALIGN +LSGEMM_L2x1_END: + slwi T1, K, 3 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 2 +#endif +LSGEMM_L2_END: + andi. T1, N, 1 + ble LSGEMM_END +LSGEMM_1_BEGIN: + + + mr AO, A + mr CO, C + add C, C, LDC + +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 4 + ble LSGEMM_1x16_END + + MY_ALIGN +LSGEMM_1x16_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,1 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,16,1 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. 
L, K, 6 /**(K ) % 64x */ +#endif + + ZERO1x16 + ble LSGEMM_1x16_SUB0 + addi AO,AO,2048 + + mtctr L + + MY_ALIGN + +LSGEMM_1x16_LOOP: + + KERNEL1x16_4 -2048,0, 0,0 + KERNEL1x16_4 -2048,0, 1,0 + KERNEL1x16_4 -2048,0, 2,0 + KERNEL1x16_4 -2048,0, 3,0 + KERNEL1x16_4 -2048,0, 4,0 + KERNEL1x16_4 -2048,0, 5,0 + KERNEL1x16_4 -2048,0, 6,0 + KERNEL1x16_4 -2048,0, 7,0 + KERNEL1x16_4 -2048,0, 8,0 + KERNEL1x16_4 -2048,0, 9,0 + KERNEL1x16_4 -2048,0, 10,0 + KERNEL1x16_4 -2048,0, 11,0 + KERNEL1x16_4 -2048,0, 12,0 + KERNEL1x16_4 -2048,0, 13,0 + KERNEL1x16_4 -2048,0, 14,0 + KERNEL1x16_4 -2048,0, 15,1 + + bdnz LSGEMM_1x16_LOOP + MY_ALIGN + addi AO,AO, -2048 + MY_ALIGN +LSGEMM_1x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_1x16_SAVE + MY_ALIGN +LSGEMM_1x16_SUB2: + andi. T10,L, 32 + ble LSGEMM_1x16_SUB2_16 + KERNEL1x16_4 0,0, 0,0 + KERNEL1x16_4 0,0, 1,0 + KERNEL1x16_4 0,0, 2,0 + KERNEL1x16_4 0,0, 3,0 + KERNEL1x16_4 0,0, 4,0 + KERNEL1x16_4 0,0, 5,0 + KERNEL1x16_4 0,0, 6,0 + KERNEL1x16_4 0,0, 7,1 + MY_ALIGN +LSGEMM_1x16_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_1x16_SUB2_8 + KERNEL1x16_4 0,0, 0,0 + KERNEL1x16_4 0,0, 1,0 + KERNEL1x16_4 0,0, 2,0 + KERNEL1x16_4 0,0, 3,1 + MY_ALIGN +LSGEMM_1x16_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_1x16_SUB2_4 + KERNEL1x16_4 0,0, 0,0 + KERNEL1x16_4 0,0, 1,1 + MY_ALIGN +LSGEMM_1x16_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_1x16_SUB2_2 + KERNEL1x16_4 0,0, 0,1 + MY_ALIGN +LSGEMM_1x16_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_1x16_SUB2_1 + KERNEL1x16_2 0,0, 0,1 + MY_ALIGN +LSGEMM_1x16_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_1x16_SAVE + KERNEL1x16 + + MY_ALIGN +LSGEMM_1x16_SAVE: + SAVE1x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,1 +#endif + addic. I, I, -1 + bgt+ LSGEMM_1x16_BEGIN + MY_ALIGN +LSGEMM_1x16_END: + andi. I, M, 8 + ble LSGEMM_1x8_END + + MY_ALIGN +LSGEMM_1x8_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,8,1 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO1x8 + ble LSGEMM_1x8_SUB0 + addi AO,AO,2048 + + mtctr L + + MY_ALIGN + +LSGEMM_1x8_LOOP: + + KERNEL1x8_4 -2048,0, 0,0 + KERNEL1x8_4 -2048,0, 1,0 + KERNEL1x8_4 -2048,0, 2,0 + KERNEL1x8_4 -2048,0, 3,0 + KERNEL1x8_4 -2048,0, 4,0 + KERNEL1x8_4 -2048,0, 5,0 + KERNEL1x8_4 -2048,0, 6,0 + KERNEL1x8_4 -2048,0, 7,0 + KERNEL1x8_4 -2048,0, 8,0 + KERNEL1x8_4 -2048,0, 9,0 + KERNEL1x8_4 -2048,0, 10,0 + KERNEL1x8_4 -2048,0, 11,0 + KERNEL1x8_4 -2048,0, 12,0 + KERNEL1x8_4 -2048,0, 13,0 + KERNEL1x8_4 -2048,0, 14,0 + KERNEL1x8_4 -2048,0, 15,1 + + bdnz LSGEMM_1x8_LOOP + MY_ALIGN + addi AO,AO, -2048 + MY_ALIGN +LSGEMM_1x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_1x8_SAVE + MY_ALIGN +LSGEMM_1x8_SUB2: + andi. T10,L, 32 + ble LSGEMM_1x8_SUB2_16 + KERNEL1x8_4 0,0, 0,0 + KERNEL1x8_4 0,0, 1,0 + KERNEL1x8_4 0,0, 2,0 + KERNEL1x8_4 0,0, 3,0 + KERNEL1x8_4 0,0, 4,0 + KERNEL1x8_4 0,0, 5,0 + KERNEL1x8_4 0,0, 6,0 + KERNEL1x8_4 0,0, 7,1 + MY_ALIGN +LSGEMM_1x8_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_1x8_SUB2_8 + KERNEL1x8_4 0,0, 0,0 + KERNEL1x8_4 0,0, 1,0 + KERNEL1x8_4 0,0, 2,0 + KERNEL1x8_4 0,0, 3,1 + MY_ALIGN +LSGEMM_1x8_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_1x8_SUB2_4 + KERNEL1x8_4 0,0, 0,0 + KERNEL1x8_4 0,0, 1,1 + MY_ALIGN +LSGEMM_1x8_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_1x8_SUB2_2 + KERNEL1x8_4 0,0, 0,1 + MY_ALIGN +LSGEMM_1x8_SUB2_2: + andi. 
T10,L, 2 + ble LSGEMM_1x8_SUB2_1 + KERNEL1x8_2 0,0, 0,1 + MY_ALIGN +LSGEMM_1x8_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_1x8_SAVE + KERNEL1x8 + + MY_ALIGN +LSGEMM_1x8_SAVE: + SAVE1x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,1 +#endif + MY_ALIGN +LSGEMM_1x8_END: + andi. I, M, 4 + ble LSGEMM_1x4_END + + MY_ALIGN +LSGEMM_1x4_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,4,1 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO1x4 + ble LSGEMM_1x4_SUB0 + + + mtctr L + + MY_ALIGN + +LSGEMM_1x4_LOOP: + + KERNEL1x4_4 0,0, 0,0 + KERNEL1x4_4 0,0, 1,0 + KERNEL1x4_4 0,0, 2,0 + KERNEL1x4_4 0,0, 3,0 + KERNEL1x4_4 0,0, 4,0 + KERNEL1x4_4 0,0, 5,0 + KERNEL1x4_4 0,0, 6,0 + KERNEL1x4_4 0,0, 7,0 + KERNEL1x4_4 0,0, 8,0 + KERNEL1x4_4 0,0, 9,0 + KERNEL1x4_4 0,0, 10,0 + KERNEL1x4_4 0,0, 11,0 + KERNEL1x4_4 0,0, 12,0 + KERNEL1x4_4 0,0, 13,0 + KERNEL1x4_4 0,0, 14,0 + KERNEL1x4_4 0,0, 15,1 + + bdnz LSGEMM_1x4_LOOP + MY_ALIGN + + MY_ALIGN +LSGEMM_1x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_1x4_SAVE + MY_ALIGN +LSGEMM_1x4_SUB2: + andi. T10,L, 32 + ble LSGEMM_1x4_SUB2_16 + KERNEL1x4_4 0,0, 0,0 + KERNEL1x4_4 0,0, 1,0 + KERNEL1x4_4 0,0, 2,0 + KERNEL1x4_4 0,0, 3,0 + KERNEL1x4_4 0,0, 4,0 + KERNEL1x4_4 0,0, 5,0 + KERNEL1x4_4 0,0, 6,0 + KERNEL1x4_4 0,0, 7,1 + MY_ALIGN +LSGEMM_1x4_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_1x4_SUB2_8 + KERNEL1x4_4 0,0, 0,0 + KERNEL1x4_4 0,0, 1,0 + KERNEL1x4_4 0,0, 2,0 + KERNEL1x4_4 0,0, 3,1 + MY_ALIGN +LSGEMM_1x4_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_1x4_SUB2_4 + KERNEL1x4_4 0,0, 0,0 + KERNEL1x4_4 0,0, 1,1 + MY_ALIGN +LSGEMM_1x4_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_1x4_SUB2_2 + KERNEL1x4_4 0,0, 0,1 + MY_ALIGN +LSGEMM_1x4_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_1x4_SUB2_1 + KERNEL1x4_2 0,0, 0,1 + MY_ALIGN +LSGEMM_1x4_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_1x4_SAVE + KERNEL1x4 + + MY_ALIGN +LSGEMM_1x4_SAVE: + SAVE1x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,1 +#endif + MY_ALIGN +LSGEMM_1x4_END: + andi. I, M, 2 + ble LSGEMM_1x2_END + + MY_ALIGN +LSGEMM_1x2_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,2,1 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO1x2 + ble LSGEMM_1x2_SUB0 + + + mtctr L + + MY_ALIGN + +LSGEMM_1x2_LOOP: + + KERNEL1x2_4 0,0, 0,0 + KERNEL1x2_4 0,0, 1,0 + KERNEL1x2_4 0,0, 2,0 + KERNEL1x2_4 0,0, 3,0 + KERNEL1x2_4 0,0, 4,0 + KERNEL1x2_4 0,0, 5,0 + KERNEL1x2_4 0,0, 6,0 + KERNEL1x2_4 0,0, 7,0 + KERNEL1x2_4 0,0, 8,0 + KERNEL1x2_4 0,0, 9,0 + KERNEL1x2_4 0,0, 10,0 + KERNEL1x2_4 0,0, 11,0 + KERNEL1x2_4 0,0, 12,0 + KERNEL1x2_4 0,0, 13,0 + KERNEL1x2_4 0,0, 14,0 + KERNEL1x2_4 0,0, 15,1 + + bdnz LSGEMM_1x2_LOOP + MY_ALIGN + + MY_ALIGN +LSGEMM_1x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_1x2_SAVE + MY_ALIGN +LSGEMM_1x2_SUB2: + andi. T10,L, 32 + ble LSGEMM_1x2_SUB2_16 + KERNEL1x2_4 0,0, 0,0 + KERNEL1x2_4 0,0, 1,0 + KERNEL1x2_4 0,0, 2,0 + KERNEL1x2_4 0,0, 3,0 + KERNEL1x2_4 0,0, 4,0 + KERNEL1x2_4 0,0, 5,0 + KERNEL1x2_4 0,0, 6,0 + KERNEL1x2_4 0,0, 7,1 + MY_ALIGN +LSGEMM_1x2_SUB2_16: + andi. 
T10,L, 16 + ble LSGEMM_1x2_SUB2_8 + KERNEL1x2_4 0,0, 0,0 + KERNEL1x2_4 0,0, 1,0 + KERNEL1x2_4 0,0, 2,0 + KERNEL1x2_4 0,0, 3,1 + MY_ALIGN +LSGEMM_1x2_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_1x2_SUB2_4 + KERNEL1x2_4 0,0, 0,0 + KERNEL1x2_4 0,0, 1,1 + MY_ALIGN +LSGEMM_1x2_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_1x2_SUB2_2 + KERNEL1x2_4 0,0, 0,1 + MY_ALIGN +LSGEMM_1x2_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_1x2_SUB2_1 + KERNEL1x2_2 0,0, 0,1 + MY_ALIGN +LSGEMM_1x2_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_1x2_SAVE + KERNEL1x2 + + MY_ALIGN +LSGEMM_1x2_SAVE: + SAVE1x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,1 +#endif + MY_ALIGN +LSGEMM_1x2_END: + andi. I, M, 1 + ble LSGEMM_1x1_END + + MY_ALIGN +LSGEMM_1x1_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,1,1 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO1x1 + ble LSGEMM_1x1_SUB0 + + + mtctr L + + MY_ALIGN + +LSGEMM_1x1_LOOP: + + KERNEL1x1_16 0,0, 0,0 + KERNEL1x1_16 0,0, 1,0 + KERNEL1x1_16 0,0, 2,0 + KERNEL1x1_16 0,0, 3,1 + + bdnz LSGEMM_1x1_LOOP + MY_ALIGN + + MY_ALIGN +LSGEMM_1x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_1x1_SAVE + MY_ALIGN +LSGEMM_1x1_SUB2: + andi. T10,L, 32 + ble LSGEMM_1x1_SUB2_16 + KERNEL1x1_16 0,0, 0,0 + KERNEL1x1_16 0,0, 1,1 + MY_ALIGN +LSGEMM_1x1_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_1x1_SUB2_8 + KERNEL1x1_16 0,0, 0,1 + MY_ALIGN +LSGEMM_1x1_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_1x1_SUB2_4 + KERNEL1x1_8 0,0, 0,1 + MY_ALIGN +LSGEMM_1x1_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_1x1_SUB2_2 + KERNEL1x1_4 0,0, 0,1 + MY_ALIGN +LSGEMM_1x1_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_1x1_SUB2_1 + KERNEL1x1_2 0,0, 0,1 + MY_ALIGN +LSGEMM_1x1_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_1x1_SAVE + KERNEL1x1 + + MY_ALIGN +LSGEMM_1x1_SAVE: + SAVE1x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,1 +#endif + MY_ALIGN +LSGEMM_1x1_END: + slwi T1, K, 2 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 1 +#endif LSGEMM_END: \ No newline at end of file diff --git a/kernel/power/sgemm_macros_power9.S b/kernel/power/sgemm_macros_power9.S index 2c9e537c7..3750d338d 100644 --- a/kernel/power/sgemm_macros_power9.S +++ b/kernel/power/sgemm_macros_power9.S @@ -1,5575 +1,5575 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#define unit_size 4 -#define DISP64(ind,disp) (ind*unit_size*64+disp) -#define DISP32(ind,disp) (ind*unit_size*32+disp) -#define DISP16(ind,disp) (ind*unit_size*16+disp) -#define DISP8(ind,disp) (ind*unit_size*8+disp) -#define DISP4(ind,disp) (ind*unit_size*4+disp) -#define DISP2(ind,disp) (ind*unit_size*2+disp) -#define DISP1(ind,disp) (ind*unit_size+disp) - -/********************************************************************************************** -* Macros for N=8 and M=16 -**********************************************************************************************/ - - - -.macro KERNEL8x16_L1_L4 Index,IsLast - KERNEL8x16_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 -.endm - -.macro KERNEL8x16_I1_L4 OffsetA,OffsetB, Index,IsLast - KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL8x16_I1_L4_2 OffsetA,OffsetB, Index,IsLast - KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL8x16_I1_L4_3 OffsetA,OffsetB, Index,IsLast - KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro KERNEL8x16_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL8x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL8x16_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL8x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro Zero8X16 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - xxlxor vs50, vs50, vs50 - xxlxor vs51, vs51, vs51 - xxlxor vs52, vs52, vs52 - xxlxor vs53, vs53, vs53 - xxlxor vs54, vs54, vs54 - xxlxor vs55, vs55, vs55 - xxlxor vs56, vs56, vs56 - xxlxor vs57, vs57, vs57 - xxlxor vs58, vs58, vs58 - xxlxor vs59, vs59, vs59 - xxlxor vs60, vs60, vs60 - xxlxor vs61, vs61, vs61 - xxlxor vs62, vs62, vs62 - xxlxor vs63, vs63, vs63 -.endm - -.macro LOAD8x16 OffsetA,OffsetB - - lxv vs24, (\OffsetB+0)(BO) - lxv vs28, (\OffsetB+16)(BO) - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - lxv vs0, (\OffsetA+0)(AO) - lxv vs1, (\OffsetA+16)(AO) - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - lxv vs2, (\OffsetA+32)(AO) - lxv vs3, (\OffsetA+48)(AO) - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 - -.endm - -.macro END8x16_NORMAL - END8x16 0, AO, BO, 64,32 -.endm - -.macro END8x16_WITHOUT_ADD - END8x16 0, AO,BO,0,0 -.endm - -.macro END8x16 First, AREG, BREG, OffsetA, OffsetB - -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - -.if \First==1 - xvmulsp vs32, 
vs0,vs24 - xvmulsp vs33, vs1,vs24 - xvmulsp vs34, vs2,vs24 - xvmulsp vs35, vs3,vs24 - - xvmulsp vs36, vs0,vs25 - xvmulsp vs37, vs1,vs25 - xvmulsp vs38, vs2,vs25 - xvmulsp vs39, vs3,vs25 - - xvmulsp vs40, vs0,vs26 - xvmulsp vs41, vs1,vs26 - xvmulsp vs42, vs2,vs26 - xvmulsp vs43, vs3,vs26 - - xvmulsp vs44, vs0,vs27 - xvmulsp vs45, vs1,vs27 - xvmulsp vs46, vs2,vs27 - xvmulsp vs47, vs3,vs27 - - xvmulsp vs48, vs0,vs28 - xvmulsp vs49, vs1,vs28 - xvmulsp vs50, vs2,vs28 - xvmulsp vs51, vs3,vs28 - - xvmulsp vs52, vs0,vs29 - xvmulsp vs53, vs1,vs29 - xvmulsp vs54, vs2,vs29 - xvmulsp vs55, vs3,vs29 - - xvmulsp vs56, vs0,vs30 - xvmulsp vs57, vs1,vs30 - xvmulsp vs58, vs2,vs30 - xvmulsp vs59, vs3,vs30 - - xvmulsp vs60, vs0,vs31 - xvmulsp vs61, vs1,vs31 - xvmulsp vs62, vs2,vs31 - xvmulsp vs63, vs3,vs31 - -.else - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 - - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - xvmaddasp vs50, vs2,vs28 - xvmaddasp vs51, vs3,vs28 - - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 - xvmaddasp vs54, vs2,vs29 - xvmaddasp vs55, vs3,vs29 - - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 - xvmaddasp vs58, vs2,vs30 - xvmaddasp vs59, vs3,vs30 - - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 - xvmaddasp vs62, vs2,vs31 - xvmaddasp vs63, vs3,vs31 - -.endif -.endm - -.macro KERNEL8x16_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - -KERNEL8x16_2 \AREG,\BREG, \OffsetA,\OffsetB, (\Index*2),0 ,0 -KERNEL8x16_2 \AREG,\BREG,\OffsetA,\OffsetB, (\Index*2+1),\IsLast ,\Complete - -.endm - -.macro KERNEL8x16 First - - LOAD8x16 0,0 - END8x16 \First, AO, BO, 64,32 -.endm - -.macro LOAD8x16_2 - LOAD8x16_2O AO,BO, 0,0 -.endm - -.macro LOAD8x16_2O AREG,BREG, OffsetA,OffsetB - lxv vs8, (\OffsetB)(\BREG) - lxv vs12, (16+\OffsetB)(\BREG) - lxv vs24, (32+\OffsetB)(\BREG) - lxv vs28, (32+16+\OffsetB)(\BREG) - lxv vs4, (0+\OffsetA)(\AREG) - lxv vs5, (16+\OffsetA)(\AREG) - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask - lxv vs6, (32+\OffsetA)(\AREG) - lxv vs7, (48+\OffsetA)(\AREG) - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 - lxv vs0, (64+\OffsetA)(\AREG) - lxv vs1, (64+16+\OffsetA)(\AREG) - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 - lxv vs2, (64+32+\OffsetA)(\AREG) - lxv vs3, (64+48+\OffsetA)(\AREG) - - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 -.endm - -.macro END8x16_2 - /*for load2 offset will be 128 and 64*/ - KERNEL8x16_2 AO,BO, 128,64,0 ,1,1 -.endm - - - -.macro KERNEL8x16_E2 OffsetA,OffsetB, Index,IsLast - KERNEL8x16_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL8x16_L2 OffsetA,OffsetB, Index,IsLast - KERNEL8x16_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL8x16_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs48, vs4,vs12 - xvmaddasp vs49, vs5,vs12 - - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - xvmaddasp vs56, vs4,vs14 - xvmaddasp vs57, vs5,vs14 - - xvmaddasp vs36, 
vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs52, vs4,vs13 - xvmaddasp vs53, vs5,vs13 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - xvmaddasp vs60, vs4,vs15 - xvmaddasp vs61, vs5,vs15 - -.if \Complete==0 - lxv vs4, DISP32(\Index,0+\OffsetA)(\AREG) - lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) -.endif - - xvmaddasp vs34, vs6,vs8 - xvmaddasp vs35, vs7,vs8 - xvmaddasp vs50, vs6,vs12 - xvmaddasp vs51, vs7,vs12 -.if \Complete==0 - lxv vs8, DISP16(\Index,\OffsetB)(\BREG) - lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG) -.endif - xvmaddasp vs42, vs6,vs10 - xvmaddasp vs43, vs7,vs10 - xvmaddasp vs58, vs6,vs14 - xvmaddasp vs59, vs7,vs14 -.if \Complete==0 - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask -.endif - xvmaddasp vs38, vs6,vs9 - xvmaddasp vs39, vs7,vs9 - xvmaddasp vs54, vs6,vs13 - xvmaddasp vs55, vs7,vs13 -.if \Complete==0 - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 -.endif - xvmaddasp vs46, vs6,vs11 - xvmaddasp vs47, vs7,vs11 - xvmaddasp vs62, vs6,vs15 - xvmaddasp vs63, vs7,vs15 -.if \Complete==0 - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 -.endif - -.if \Complete==0 - lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG) - lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG) -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 -.if \Complete==0 - lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG) - lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG) -.endif - - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - xvmaddasp vs50, vs2,vs28 - xvmaddasp vs51, vs3,vs28 -.if \Complete==0 - lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG) - lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG) -.endif - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - xvmaddasp vs58, vs2,vs30 - xvmaddasp vs59, vs3,vs30 -.if \Complete==0 - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask -.endif - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 - xvmaddasp vs54, vs2,vs29 - xvmaddasp vs55, vs3,vs29 -.if \Complete==0 - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 -.endif - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 - xvmaddasp vs62, vs2,vs31 - xvmaddasp vs63, vs3,vs31 -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 -.endif -.if \Complete==0 - lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG) - lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG) -.endif - - -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP16(\Index,\OffsetB) - addi \AREG, \AREG, DISP32(\Index,\OffsetA) - -.else - addi \BREG, \BREG, DISP16(\Index,64) - addi \AREG, \AREG, DISP32(\Index,128) - -.endif -.endif - - -.endm - - -.macro SAVE8x16 - - slwi T10, LDC , 1 - add T1, CO, LDC - - add T2, CO, T10 - add T3, T1, T10 - - add T4, T2, T10 - add T5, T3, T10 - - add T6, T4, T10 - add T7, T5, T10 - - - - /* permute to restore butterfly rank 1 updateto normal promoted one */ - /* permute 16 vs8 MEM(CO) vs9 MEM(CO+LDC) vs10 MEM(CO+2*LDC) vs11 MEM(CO+3*LDC) */ - /* permute 16 vs12 MEM(16+CO) vs13 MEM(16+CO+LDC) vs14 MEM(16+CO+2*LDC) vs15 MEM(16+CO+3*LDC) */ - /* permute 16 vs16 MEM(32+CO) vs17 MEM(32+CO+LDC) vs18 MEM(32+CO+2*LDC) vs19 MEM(32+CO+3*LDC) */ - /* permute 16 vs24 MEM(32+CO) vs25 
MEM(32+CO+LDC) vs26 MEM(32+CO+2*LDC) vs27 MEM(32+CO+3*LDC) */ - - xxmrglw vs8, vs32, vs44 - xxmrglw vs10, vs36, vs40 - - xxmrghw vs1, vs32, vs44 - xxmrghw vs0, vs36, vs40 - - xxmrglw vs12, vs33, vs45 - xxmrglw vs14, vs37, vs41 - - xxmrghw vs2, vs37, vs41 - xxmrghw vs3, vs33, vs45 -#ifndef TRMMKERNEL - lxv vs32, 0(CO) - lxv vs33, 16(CO) -#endif - xxmrglw vs16, vs34, vs46 - xxmrglw vs18, vs38, vs42 - - xxlor vs9, vs8, vs8 - xxlor vs11, vs10, vs10 - - xxmrghw vs4, vs38, vs42 - xxmrghw vs5, vs34, vs46 - - xxlor vs13, vs12, vs12 - xxlor vs15, vs14, vs14 - - xxmrglw vs24, vs35, vs47 - xxmrglw vs26, vs39, vs43 - - xxlor vs17, vs16, vs16 - xxlor vs19, vs18, vs18 - - xxmrghw vs30, vs39, vs43 - xxmrghw vs31, vs35, vs47 -#ifndef TRMMKERNEL - lxv vs34, 32(CO) - lxv vs35, 48(CO) -#endif - xxperm vs8, vs0, save_permute_1 - xxperm vs10, vs1, save_permute_1 -#ifndef TRMMKERNEL - lxv vs36, 0(T1) - lxv vs37, 16(T1) -#endif - xxperm vs9, vs0, save_permute_2 - xxperm vs11, vs1, save_permute_2 - -#ifndef TRMMKERNEL - lxv vs38, 32(T1) - lxv vs39, 48(T1) -#endif - - xxlor vs25, vs24, vs24 - xxlor vs27, vs26, vs26 - - - -#ifndef TRMMKERNEL - lxv vs40, 0(T2) - lxv vs41, 16(T2) -#endif - - xxperm vs12, vs2, save_permute_1 - xxperm vs14, vs3, save_permute_1 -#ifndef TRMMKERNEL - lxv vs42, 32(T2) - lxv vs43, 48(T2) -#endif - - xxperm vs13, vs2, save_permute_2 - xxperm vs15, vs3, save_permute_2 -#ifndef TRMMKERNEL - lxv vs44, 0(T3) - lxv vs45, 16(T3) -#endif - xxperm vs16, vs4, save_permute_1 - xxperm vs18, vs5, save_permute_1 -#ifndef TRMMKERNEL - lxv vs46, 32(T3) - lxv vs47, 48(T3) -#endif - - - - - - xxperm vs17, vs4, save_permute_2 - xxperm vs19, vs5, save_permute_2 -#ifdef TRMMKERNEL - xvmulsp vs32, vs8, alpha_r - xvmulsp vs33, vs12, alpha_r -#else - xvmaddasp vs32, vs8, alpha_r - xvmaddasp vs33, vs12, alpha_r -#endif - xxperm vs24, vs30, save_permute_1 - xxperm vs26, vs31, save_permute_1 - - - stxv vs32, 0(CO) - stxv vs33, 16(CO) -#ifdef TRMMKERNEL - xvmulsp vs34, vs16, alpha_r - xvmulsp vs35, vs24, alpha_r -#else - xvmaddasp vs34, vs16, alpha_r - xvmaddasp vs35, vs24, alpha_r -#endif - - xxperm vs25, vs30, save_permute_2 - xxperm vs27, vs31, save_permute_2 - - - stxv vs34, 32(CO) - stxv vs35, 48(CO) -#ifdef TRMMKERNEL - xvmulsp vs36, vs9, alpha_r - xvmulsp vs37, vs13, alpha_r -#else - xvmaddasp vs36, vs9, alpha_r - xvmaddasp vs37, vs13, alpha_r -#endif - stxv vs36, 0(T1) - stxv vs37, 16(T1) -#ifdef TRMMKERNEL - xvmulsp vs38, vs17, alpha_r - xvmulsp vs39, vs25, alpha_r -#else - xvmaddasp vs38, vs17, alpha_r - xvmaddasp vs39, vs25, alpha_r -#endif - stxv vs38, 32(T1) - stxv vs39, 48(T1) - -#ifdef TRMMKERNEL - xvmulsp vs40, vs10, alpha_r - xvmulsp vs41, vs14, alpha_r -#else - xvmaddasp vs40, vs10, alpha_r - xvmaddasp vs41, vs14, alpha_r -#endif - - stxv vs40, 0(T2) - stxv vs41, 16(T2) -#ifdef TRMMKERNEL - xvmulsp vs42, vs18, alpha_r - xvmulsp vs43, vs26, alpha_r -#else - xvmaddasp vs42, vs18, alpha_r - xvmaddasp vs43, vs26, alpha_r -#endif - stxv vs42, 32(T2) - stxv vs43, 48(T2) -#ifdef TRMMKERNEL - xvmulsp vs44, vs11, alpha_r - xvmulsp vs45, vs15, alpha_r -#else - xvmaddasp vs44, vs11, alpha_r - xvmaddasp vs45, vs15, alpha_r -#endif - stxv vs44, 0(T3) - stxv vs45, 16(T3) -#ifdef TRMMKERNEL - xvmulsp vs46, vs19, alpha_r - xvmulsp vs47, vs27, alpha_r -#else - xvmaddasp vs46, vs19, alpha_r - xvmaddasp vs47, vs27, alpha_r -#endif - stxv vs46, 32(T3) - stxv vs47, 48(T3) - - /*****the same with the second 8X8 ****/ - #ifndef TRMMKERNEL - lxv vs32, 0(T4) - lxv vs33, 16(T4) -#endif - xxmrglw vs8, vs48, vs60 - xxmrglw 
vs10, vs52, vs56 -#ifndef TRMMKERNEL - lxv vs34, 32(T4) - lxv vs35, 48(T4) -#endif - xxmrghw vs1, vs48, vs60 - xxmrghw vs0, vs52, vs56 -#ifndef TRMMKERNEL - lxv vs36, 0(T5) - lxv vs37, 16(T5) -#endif - xxmrglw vs12, vs49, vs61 - xxmrglw vs14, vs53, vs57 -#ifndef TRMMKERNEL - lxv vs38,32(T5) - lxv vs39, 48(T5) -#endif - - xxmrghw vs2, vs53, vs57 - xxmrghw vs3, vs49, vs61 -#ifndef TRMMKERNEL - lxv vs40, 0(T6) - lxv vs41, 16(T6) -#endif - xxmrglw vs16, vs50, vs62 - xxmrglw vs18, vs54, vs58 -#ifndef TRMMKERNEL - lxv vs42, 32(T6) - lxv vs43, 48(T6) -#endif - xxlor vs9, vs8, vs8 - xxlor vs11, vs10, vs10 - xxmrghw vs4, vs54, vs58 - xxmrghw vs5, vs50, vs62 -#ifndef TRMMKERNEL - lxv vs44, 0(T7) - lxv vs45, 16(T7) -#endif - xxlor vs13, vs12, vs12 - xxlor vs15, vs14, vs14 - - xxmrglw vs24, vs51, vs63 - xxmrglw vs26, vs55, vs59 -#ifndef TRMMKERNEL - lxv vs46, 32(T7) - lxv vs47, 48(T7) -#endif - xxlor vs17, vs16, vs16 - xxlor vs19, vs18, vs18 - xxmrghw vs30, vs55, vs59 - xxmrghw vs31, vs51, vs63 - - - - xxperm vs8, vs0, save_permute_1 - xxperm vs10, vs1, save_permute_1 - - xxperm vs9, vs0, save_permute_2 - xxperm vs11, vs1, save_permute_2 - - xxlor vs25, vs24, vs24 - xxlor vs27, vs26, vs26 - xxperm vs12, vs2, save_permute_1 - xxperm vs14, vs3, save_permute_1 - - xxperm vs13, vs2, save_permute_2 - xxperm vs15, vs3, save_permute_2 - #ifdef TRMMKERNEL - xvmulsp vs32, vs8, alpha_r - xvmulsp vs33, vs12, alpha_r -#else - xvmaddasp vs32, vs8, alpha_r - xvmaddasp vs33, vs12, alpha_r -#endif - xxperm vs16, vs4, save_permute_1 - xxperm vs18, vs5, save_permute_1 - stxv vs32, 0(T4) - stxv vs33, 16(T4) - xxperm vs17, vs4, save_permute_2 - xxperm vs19, vs5, save_permute_2 - xxperm vs24, vs30, save_permute_1 - xxperm vs26, vs31, save_permute_1 - xxperm vs25, vs30, save_permute_2 - xxperm vs27, vs31, save_permute_2 - -#ifdef TRMMKERNEL - xvmulsp vs34, vs16, alpha_r - xvmulsp vs35, vs24, alpha_r -#else - xvmaddasp vs34, vs16, alpha_r - xvmaddasp vs35, vs24, alpha_r -#endif - stxv vs34, 32(T4) - stxv vs35, 48(T4) - -#ifdef TRMMKERNEL - xvmulsp vs36, vs9, alpha_r - xvmulsp vs37, vs13, alpha_r -#else - xvmaddasp vs36, vs9, alpha_r - xvmaddasp vs37, vs13, alpha_r -#endif - stxv vs36, 0(T5) - stxv vs37, 16(T5) - -#ifdef TRMMKERNEL - xvmulsp vs38, vs17, alpha_r - xvmulsp vs39, vs25, alpha_r -#else - xvmaddasp vs38, vs17, alpha_r - xvmaddasp vs39, vs25, alpha_r -#endif - - - - - stxv vs38, 32(T5) - stxv vs39, 48(T5) - - -#ifdef TRMMKERNEL - xvmulsp vs40, vs10, alpha_r - xvmulsp vs41, vs14, alpha_r -#else - xvmaddasp vs40, vs10, alpha_r - xvmaddasp vs41, vs14, alpha_r -#endif - stxv vs40, 0(T6) - stxv vs41, 16(T6) -#ifdef TRMMKERNEL - xvmulsp vs42, vs18, alpha_r - xvmulsp vs43, vs26, alpha_r -#else - xvmaddasp vs42, vs18, alpha_r - xvmaddasp vs43, vs26, alpha_r -#endif - stxv vs42, 32(T6) - stxv vs43, 48(T6) -#ifdef TRMMKERNEL - xvmulsp vs44, vs11, alpha_r - xvmulsp vs45, vs15, alpha_r -#else - xvmaddasp vs44, vs11, alpha_r - xvmaddasp vs45, vs15, alpha_r -#endif - - stxv vs44, 0(T7) - stxv vs45, 16(T7) -#ifdef TRMMKERNEL - xvmulsp vs46, vs19, alpha_r - xvmulsp vs47, vs27, alpha_r -#else - xvmaddasp vs46, vs19, alpha_r - xvmaddasp vs47, vs27, alpha_r -#endif - - stxv vs46, 32(T7) - stxv vs47, 48(T7) - - - addi CO,CO,64 - - -.endm - - - -/********************************************************************************************** -* Macros for N=8 and M=8 -**********************************************************************************************/ - -.macro LOAD8x8_1 - LOAD8x8 1 -.endm - -.macro LOAD8x8_0 - LOAD8x8 0 
-.endm - -.macro KERNEL8x8_L1_L4 Index,IsLast - KERNEL8x8_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 -.endm - -.macro KERNEL8x8_I1_L4 OffsetA,OffsetB, Index,IsLast - KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL8x8_I1_L4_2 OffsetA,OffsetB, Index,IsLast - KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL8x8_I1_L4_3 OffsetA,OffsetB, Index,IsLast - KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm -.macro KERNEL8x8_I1_L2_3 OffsetA,OffsetB, Index,IsLast - KERNEL8x8_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro KERNEL8x8_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL8x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL8x8_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL8x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro END8x8_NORMAL - END8x8 0, AO, BO, 32,32 -.endm - -.macro Zero8X8 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - - xxlxor vs52, vs52, vs52 - xxlxor vs53, vs53, vs53 - - xxlxor vs56, vs56, vs56 - xxlxor vs57, vs57, vs57 - - xxlxor vs60, vs60, vs60 - xxlxor vs61, vs61, vs61 - -.endm - -.macro LOAD8x8 Zero - - lxv vs24, 0(BO) - lxv vs28, 16(BO) - lxv vs0, 0(AO) - lxv vs1, 16(AO) - - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 - -.if \Zero==1 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - xxlxor vs52, vs52, vs52 - xxlxor vs53, vs53, vs53 - xxlxor vs56, vs56, vs56 - xxlxor vs57, vs57, vs57 - xxlxor vs60, vs60, vs60 - xxlxor vs61, vs61, vs61 -.endif -.endm - - -.macro END8x8 First, AREG, BREG, OffsetA, OffsetB - -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - -.if \First==1 - xvmulsp vs32, vs0,vs24 - xvmulsp vs33, vs1,vs24 - - xvmulsp vs36, vs0,vs25 - xvmulsp vs37, vs1,vs25 - - xvmulsp vs40, vs0,vs26 - xvmulsp vs41, vs1,vs26 - - xvmulsp vs44, vs0,vs27 - xvmulsp vs45, vs1,vs27 - - xvmulsp vs48, vs0,vs28 - xvmulsp vs49, vs1,vs28 - - xvmulsp vs52, vs0,vs29 - xvmulsp vs53, vs1,vs29 - - xvmulsp vs56, vs0,vs30 - xvmulsp vs57, vs1,vs30 - - xvmulsp vs60, vs0,vs31 - xvmulsp vs61, vs1,vs31 - -.else - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 - - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 - - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 - -.endif -.endm - -.macro KERNEL8x8_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs8, DISP32(\Index, 0+\OffsetB)(\BREG) - lxv vs12, DISP32(\Index,16+\OffsetB)(\BREG) - - lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG) - lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) - - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask - - 
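The lxv/xxperm/xxpermdi group above loads the next eight packed B values and eight packed A values and broadcast-rotates the B lanes into vs9-vs11 and vs13-vs15, while the xvmaddasp wall that follows consumes the values loaded on the previous pass. Each pass of this macro therefore performs one k-step of an outer-product (rank-1) update on the 8x8 accumulator tile spread across vs32-vs61, held in a permuted "butterfly" layout that the SAVE macros untangle later. A minimal C sketch of the arithmetic only, assuming an MR=8 by NR=8 tile; the names acc, a_panel and b_panel are illustrative and not taken from this file:

/* One k-step of the micro-kernel: acc[j][i] += a_panel[i] * b_panel[j],
   i.e. an outer-product (rank-1) update of the accumulator tile.  The assembly
   performs the same update with vector FMAs on a permuted register layout and
   steps a_panel/b_panel through the packed panels via the DISPn(Index,...) offsets. */
enum { MR = 8, NR = 8 };

static void kernel_rank1_step(float acc[NR][MR],
                              const float *a_panel,  /* MR packed floats for this k */
                              const float *b_panel)  /* NR packed floats for this k */
{
    for (int j = 0; j < NR; j++)
        for (int i = 0; i < MR; i++)
            acc[j][i] += a_panel[i] * b_panel[j];
}

The L1_L4 variants repeat this step four times per invocation, which is what lets the loads for the next k overlap the FMAs of the current one.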
xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 - - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 - - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 - lxv vs24, DISP32(\Index,32+\OffsetB)(\BREG) - lxv vs28, DISP32(\Index,32+16+\OffsetB)(\BREG) - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 - - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 - - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - - lxv vs0, DISP32(\Index,32+\OffsetA)(\AREG) - lxv vs1, DISP32(\Index,32+16+\OffsetA)(\AREG) - - - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 - - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - - xvmaddasp vs48, vs4,vs12 - xvmaddasp vs49, vs5,vs12 - - xvmaddasp vs52, vs4,vs13 - xvmaddasp vs53, vs5,vs13 - lxv vs8, DISP32(\Index,64+\OffsetB)(\BREG) - lxv vs12, DISP32(\Index,64+16+\OffsetB)(\BREG) - xvmaddasp vs56, vs4,vs14 - xvmaddasp vs57, vs5,vs14 - - xvmaddasp vs60, vs4,vs15 - xvmaddasp vs61, vs5,vs15 - - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask - - - lxv vs4, DISP32(\Index,64+0+\OffsetA)(\AREG) - lxv vs5, DISP32(\Index,64+16+\OffsetA)(\AREG) - - - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 - - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 -.if \Complete==0 - lxv vs24, DISP32(\Index,96+\OffsetB)(\BREG) - lxv vs28, DISP32(\Index,96+16+\OffsetB)(\BREG) -.endif - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 -.if \Complete==0 - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask -.endif - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 - - -.if \Complete==0 - lxv vs0, DISP32(\Index,96+\OffsetA)(\AREG) - lxv vs1, DISP32(\Index,96+16+\OffsetA)(\AREG) -.endif - -.if \Complete==0 - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - -.endif -.if \IsLast==1 -.if \Complete==1 - - addi \BREG, \BREG, DISP32(\Index,32*3+\OffsetB) - addi \AREG, \AREG, DISP32(\Index,32*3+\OffsetA) -.else - - addi \BREG, \BREG, DISP32(\Index,128) - addi \AREG, \AREG, DISP32(\Index,128) -.endif -.endif - - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 - -.endif - - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - - xvmaddasp vs48, vs4,vs12 - xvmaddasp vs49, vs5,vs12 - - xvmaddasp vs52, vs4,vs13 - xvmaddasp vs53, vs5,vs13 - - xvmaddasp vs56, vs4,vs14 - xvmaddasp vs57, vs5,vs14 - - xvmaddasp vs60, vs4,vs15 - xvmaddasp vs61, vs5,vs15 - -.endm - -.macro KERNEL8x8 First - - LOAD8x8 0 - END8x8 \First, AO, BO, 32,32 -.endm - -.macro KERNEL8x8_L1_L2_I 
AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG) - lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG) - - lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) - lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) - - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 -.if \First==1 - xvmulsp vs32, vs0,vs24 - xvmulsp vs33, vs1,vs24 - - xvmulsp vs36, vs0,vs25 - xvmulsp vs37, vs1,vs25 - -.else - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - -.endif - - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 - -.if \First==1 - xvmulsp vs40, vs0,vs26 - xvmulsp vs41, vs1,vs26 - - xvmulsp vs44, vs0,vs27 - xvmulsp vs45, vs1,vs27 - - xvmulsp vs48, vs0,vs28 - xvmulsp vs49, vs1,vs28 - - xvmulsp vs52, vs0,vs29 - xvmulsp vs53, vs1,vs29 - - xvmulsp vs56, vs0,vs30 - xvmulsp vs57, vs1,vs30 - - xvmulsp vs60, vs0,vs31 - xvmulsp vs61, vs1,vs31 - -.else - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 - - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 - - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 - -.endif -.if \Complete==0 - lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG) - lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG) - - lxv vs0, DISP16(\Index,32+\OffsetA)(\AREG) - lxv vs1, DISP16(\Index,32+16+\OffsetA)(\AREG) - - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 -.endif -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP16(\Index,32+\OffsetB) - addi \AREG, \AREG, DISP16(\Index,32+\OffsetA) - -.else - addi \BREG, \BREG, DISP16(\Index,64) - addi \AREG, \AREG, DISP16(\Index,64) -.endif -.endif - -.if \First==1 - xvmulsp vs32, vs4,vs8 - xvmulsp vs33, vs5,vs8 - - xvmulsp vs36, vs4,vs9 - xvmulsp vs37, vs5,vs9 - -.else - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - -.endif - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 - -.endif -.if \First==1 - xvmulsp vs40, vs4,vs10 - xvmulsp vs41, vs5,vs10 - - xvmulsp vs44, vs4,vs11 - xvmulsp vs45, vs5,vs11 - - xvmulsp vs48, vs4,vs12 - xvmulsp vs49, vs5,vs12 - - xvmulsp vs52, vs4,vs13 - xvmulsp vs53, vs5,vs13 - - xvmulsp vs56, vs4,vs14 - xvmulsp vs57, vs5,vs14 - - xvmulsp vs60, vs4,vs15 - xvmulsp vs61, vs5,vs15 - -.else - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - - xvmaddasp vs48, vs4,vs12 - xvmaddasp vs49, vs5,vs12 - - xvmaddasp vs52, vs4,vs13 - xvmaddasp vs53, vs5,vs13 - - xvmaddasp vs56, vs4,vs14 - xvmaddasp vs57, vs5,vs14 - - xvmaddasp vs60, vs4,vs15 - xvmaddasp vs61, vs5,vs15 - -.endif - -.endm - - -.macro SAVE8x8 - - slwi T10, LDC , 1 - add T1, CO, LDC - - add T2, CO, T10 - add T3, T1, T10 - - add T4, T2, T10 - add T5, T3, T10 - - add T6, T4, T10 - add T7, T5, T10 - -#ifndef TRMMKERNEL - lxv vs34, 0(CO) - lxv vs35, 16(CO) - lxv vs38, 0(T1) - lxv vs39, 16(T1) - lxv vs42, 0(T2) - lxv vs43, 16(T2) - lxv vs46, 0(T3) - lxv vs47, 16(T3) - - lxv vs50, 0(T4) - lxv vs51, 16(T4) - lxv vs54, 0(T5) - lxv vs55, 16(T5) - lxv vs58, 0(T6) - lxv vs59, 16(T6) - lxv vs62, 0(T7) - lxv vs63, 16(T7) -#endif - - xxmrglw vs8, vs32, vs44 - xxmrglw vs10, vs36, vs40 - - xxmrghw vs1, vs32, vs44 - xxmrghw vs0, 
vs36, vs40 - - xxmrglw vs12, vs33, vs45 - xxmrglw vs14, vs37, vs41 - - xxmrghw vs2, vs37, vs41 - xxmrghw vs3, vs33, vs45 - - xxlor vs9, vs8, vs8 - xxlor vs11, vs10, vs10 - - xxlor vs13, vs12, vs12 - xxlor vs15, vs14, vs14 - - xxperm vs8, vs0, save_permute_1 - xxperm vs10, vs1, save_permute_1 - xxperm vs9, vs0, save_permute_2 - xxperm vs11, vs1, save_permute_2 - - xxperm vs12, vs2, save_permute_1 - xxperm vs14, vs3, save_permute_1 - - xxperm vs13, vs2, save_permute_2 - xxperm vs15, vs3, save_permute_2 - - - /* multiply add normal way */ - -#ifdef TRMMKERNEL - xvmulsp vs34, vs8, alpha_r - xvmulsp vs35, vs12, alpha_r - xvmulsp vs38, vs9, alpha_r - xvmulsp vs39, vs13, alpha_r - xvmulsp vs42, vs10, alpha_r - xvmulsp vs43, vs14, alpha_r - xvmulsp vs46, vs11, alpha_r - xvmulsp vs47, vs15, alpha_r -#else - xvmaddasp vs34, vs8, alpha_r - xvmaddasp vs35, vs12, alpha_r - xvmaddasp vs38, vs9, alpha_r - xvmaddasp vs39, vs13, alpha_r - xvmaddasp vs42, vs10, alpha_r - xvmaddasp vs43, vs14, alpha_r - xvmaddasp vs46, vs11, alpha_r - xvmaddasp vs47, vs15, alpha_r -#endif - - - xxmrglw vs8, vs48, vs60 - xxmrglw vs10, vs52, vs56 - - xxmrghw vs1, vs48, vs60 - xxmrghw vs0, vs52, vs56 - stxv vs34, 0(CO) - stxv vs35, 16(CO) - xxmrglw vs12, vs49, vs61 - xxmrglw vs14, vs53, vs57 - stxv vs38, 0(T1) - stxv vs39, 16(T1) - xxmrghw vs2, vs53, vs57 - xxmrghw vs3, vs49, vs61 - stxv vs42, 0(T2) - stxv vs43, 16(T2) - xxlor vs9, vs8, vs8 - xxlor vs11, vs10, vs10 - stxv vs46, 0(T3) - stxv vs47, 16(T3) - xxlor vs13, vs12, vs12 - xxlor vs15, vs14, vs14 - - xxperm vs8, vs0, save_permute_1 - xxperm vs10, vs1, save_permute_1 - - - xxperm vs9, vs0, save_permute_2 - xxperm vs11, vs1, save_permute_2 - - xxperm vs12, vs2, save_permute_1 - xxperm vs14, vs3, save_permute_1 - xxperm vs13, vs2, save_permute_2 - xxperm vs15, vs3, save_permute_2 - - #ifdef TRMMKERNEL - xvmulsp vs50, vs8, alpha_r - xvmulsp vs51, vs12, alpha_r - xvmulsp vs54, vs9, alpha_r - xvmulsp vs55, vs13, alpha_r - xvmulsp vs58, vs10, alpha_r - xvmulsp vs59, vs14, alpha_r - xvmulsp vs62, vs11, alpha_r - xvmulsp vs63, vs15, alpha_r -#else - xvmaddasp vs50, vs8, alpha_r - xvmaddasp vs51, vs12, alpha_r - xvmaddasp vs54, vs9, alpha_r - xvmaddasp vs55, vs13, alpha_r - xvmaddasp vs58, vs10, alpha_r - xvmaddasp vs59, vs14, alpha_r - xvmaddasp vs62, vs11, alpha_r - xvmaddasp vs63, vs15, alpha_r -#endif - - stxv vs50, 0(T4) - stxv vs51, 16(T4) - stxv vs54, 0(T5) - stxv vs55, 16(T5) - stxv vs58, 0(T6) - stxv vs59, 16(T6) - stxv vs62, 0(T7) - stxv vs63, 16(T7) - - addi CO,CO,32 - -.endm - - -/********************************************************************************************** -* Macros for N=8 and M=4 -**********************************************************************************************/ - -.macro LOAD8x4_1 - LOAD8x4 1 -.endm - -.macro LOAD8x4_0 - LOAD8x4 0 -.endm - -.macro KERNEL8x4_L1_L4 Index,IsLast - KERNEL8x4_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 -.endm - -.macro KERNEL8x4_I1_L4 OffsetA,OffsetB, Index,IsLast - KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL8x4_I1_L4_2 OffsetA,OffsetB, Index,IsLast - KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL8x4_I1_L4_3 OffsetA,OffsetB, Index,IsLast - KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm -.macro KERNEL8x4_I1_L2_3 OffsetA,OffsetB, Index,IsLast - KERNEL8x4_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro KERNEL8x4_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL8x4_L1_L4_I \AREG,\BREG, 
\OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL8x4_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL8x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro Zero8X4 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - xxlxor vs50, vs50, vs50 - xxlxor vs51, vs51, vs51 - -.endm - -.macro LOAD8x4 Zero - - lxv vs0, 0(AO) - lxv vs24, 0(BO) - lxv vs25, 16(BO) - - - - xxperm vs2, vs0, permute_mask - xxpermdi vs1, vs0, vs0,2 - xxpermdi vs3, vs2, vs2,2 - -.if \Zero==1 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - xxlxor vs50, vs50, vs50 - xxlxor vs51, vs51, vs51 -.endif -.endm - -.macro END8x4_NORMAL - END8x4 0, AO, BO, 16,32 -.endm - -.macro END8x4 First, AREG, BREG, OffsetA, OffsetB - -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - -.if \First==1 - xvmulsp vs32, vs24, vs0 - xvmulsp vs33, vs24, vs1 - xvmulsp vs34, vs24, vs2 - xvmulsp vs35, vs24, vs3 - - xvmulsp vs48, vs25, vs0 - xvmulsp vs49, vs25, vs1 - xvmulsp vs50, vs25, vs2 - xvmulsp vs51, vs25, vs3 -.else - xvmaddasp vs32, vs24, vs0 - xvmaddasp vs33, vs24, vs1 - xvmaddasp vs34, vs24, vs2 - xvmaddasp vs35, vs24, vs3 - - xvmaddasp vs48, vs25, vs0 - xvmaddasp vs49, vs25, vs1 - xvmaddasp vs50, vs25, vs2 - xvmaddasp vs51, vs25, vs3 - -.endif -.endm - -.macro KERNEL8x4_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) - lxv vs26, DISP32(\Index, 0+\OffsetB)(\BREG) - lxv vs27, DISP32(\Index,16+\OffsetB)(\BREG) - - xxperm vs6, vs4, permute_mask - xxpermdi vs5, vs4, vs4,2 - xxpermdi vs7, vs6, vs6,2 - - xvmaddasp vs32, vs24, vs0 - xvmaddasp vs33, vs24, vs1 - xvmaddasp vs34, vs24, vs2 - xvmaddasp vs35, vs24, vs3 - - xvmaddasp vs48, vs25, vs0 - xvmaddasp vs49, vs25, vs1 - xvmaddasp vs50, vs25, vs2 - xvmaddasp vs51, vs25, vs3 - - lxv vs0, DISP16(\Index, 16+\OffsetA)(\AREG) - lxv vs24, DISP32(\Index, 32+\OffsetB)(\BREG) - lxv vs25, DISP32(\Index, 48+\OffsetB)(\BREG) - - xxperm vs2, vs0, permute_mask - xxpermdi vs1, vs0, vs0,2 - xxpermdi vs3, vs2, vs2,2 - - xvmaddasp vs32, vs26, vs4 - xvmaddasp vs33, vs26, vs5 - xvmaddasp vs34, vs26, vs6 - xvmaddasp vs35, vs26, vs7 - - xvmaddasp vs48, vs27, vs4 - xvmaddasp vs49, vs27, vs5 - xvmaddasp vs50, vs27, vs6 - xvmaddasp vs51, vs27, vs7 - - - lxv vs4, DISP16(\Index, 32+\OffsetA)(\AREG) - lxv vs26, DISP32(\Index, 64+\OffsetB)(\BREG) - lxv vs27, DISP32(\Index, 80+\OffsetB)(\BREG) - - xxperm vs6, vs4, permute_mask - xxpermdi vs5, vs4, vs4,2 - xxpermdi vs7, vs6, vs6,2 - - xvmaddasp vs32, vs24, vs0 - xvmaddasp vs33, vs24, vs1 - xvmaddasp vs34, vs24, vs2 - xvmaddasp vs35, vs24, vs3 - - xvmaddasp vs48, vs25, vs0 - xvmaddasp vs49, vs25, vs1 - xvmaddasp vs50, vs25, vs2 - xvmaddasp vs51, vs25, vs3 - -.if \Complete==0 - - lxv vs0, DISP16(\Index, 48+\OffsetA)(\AREG) - lxv vs24, DISP32(\Index, 96+\OffsetB)(\BREG) - lxv vs25, DISP32(\Index, 96+16+\OffsetB)(\BREG) - - xxperm vs2, vs0, permute_mask - xxpermdi vs1, vs0, vs0,2 - xxpermdi vs3, vs2, vs2,2 -.endif - xvmaddasp vs32, vs26, vs4 - xvmaddasp vs33, vs26, vs5 - xvmaddasp vs34, vs26, vs6 - xvmaddasp vs35, vs26, vs7 - - xvmaddasp vs48, vs27, vs4 - xvmaddasp vs49, vs27, vs5 - xvmaddasp vs50, vs27, vs6 - xvmaddasp vs51, vs27, vs7 - - - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, 
DISP16(\Index,16*3+\OffsetA) - addi \BREG, \BREG, DISP32(\Index,32*3+\OffsetB) - -.else - addi \AREG, \AREG, DISP16(\Index,64) - addi \BREG, \BREG, DISP32(\Index,128) - -.endif -.endif - - -.endm - -.macro KERNEL8x4 First - LOAD8x4 0 - END8x4 \First, AO, BO, 16,32 -.endm - -.macro KERNEL8x4_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs4, DISP8(\Index, 0+\OffsetA)(\AREG) - lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) - lxv vs27, DISP16(\Index,16+\OffsetB)(\BREG) - - xxperm vs6, vs4, permute_mask - xxpermdi vs5, vs4, vs4,2 - xxpermdi vs7, vs6, vs6,2 -.if \First==1 - xvmulsp vs32, vs24, vs0 - xvmulsp vs33, vs24, vs1 - xvmulsp vs34, vs24, vs2 - xvmulsp vs35, vs24, vs3 - - xvmulsp vs48, vs25, vs0 - xvmulsp vs49, vs25, vs1 - xvmulsp vs50, vs25, vs2 - xvmulsp vs51, vs25, vs3 -.else - xvmaddasp vs32, vs24, vs0 - xvmaddasp vs33, vs24, vs1 - xvmaddasp vs34, vs24, vs2 - xvmaddasp vs35, vs24, vs3 - - xvmaddasp vs48, vs25, vs0 - xvmaddasp vs49, vs25, vs1 - xvmaddasp vs50, vs25, vs2 - xvmaddasp vs51, vs25, vs3 -.endif - -.if \Complete==0 - - lxv vs0, DISP8(\Index, 16+\OffsetA)(\AREG) - lxv vs24, DISP16(\Index, 32+\OffsetB)(\BREG) - lxv vs25, DISP16(\Index, 48+\OffsetB)(\BREG) - - xxperm vs2, vs0, permute_mask - xxpermdi vs1, vs0, vs0,2 - xxpermdi vs3, vs2, vs2,2 -.endif - -.if \First==1 - xvmulsp vs32, vs26, vs4 - xvmulsp vs33, vs26, vs5 - xvmulsp vs34, vs26, vs6 - xvmulsp vs35, vs26, vs7 - - xvmulsp vs48, vs27, vs4 - xvmulsp vs49, vs27, vs5 - xvmulsp vs50, vs27, vs6 - xvmulsp vs51, vs27, vs7 - - -.else - xvmaddasp vs32, vs26, vs4 - xvmaddasp vs33, vs26, vs5 - xvmaddasp vs34, vs26, vs6 - xvmaddasp vs35, vs26, vs7 - - xvmaddasp vs48, vs27, vs4 - xvmaddasp vs49, vs27, vs5 - xvmaddasp vs50, vs27, vs6 - xvmaddasp vs51, vs27, vs7 -.endif - - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP8(\Index,16+\OffsetA) - addi \BREG, \BREG, DISP16(\Index,32+\OffsetB) - -.else - addi \AREG, \AREG, DISP8(\Index,32) - addi \BREG, \BREG, DISP16(\Index,64) - -.endif -.endif - - -.endm - - -.macro SAVE8x4 - slwi T10, LDC , 1 - add T1, CO, LDC -#if !defined(TRMMKERNEL) - lxv vs36, 0(CO) - lxv vs37, 0(T1) -#endif - add T2, CO, T10 - add T3, T1, T10 -#if !defined(TRMMKERNEL) - lxv vs38, 0(T2) - lxv vs39, 0(T3) -#endif - add T4, T2, T10 - add T5, T3, T10 -#if !defined(TRMMKERNEL) - lxv vs40, 0(T4) - lxv vs41, 0(T5) -#endif - add T6, T4, T10 - add T7, T5, T10 -#if !defined(TRMMKERNEL) - lxv vs42, 0(T6) - lxv vs43, 0(T7) -#endif - xxmrglw vs0, vs35,vs32 - xxmrglw vs1, vs34,vs33 - xxmrglw vs4, vs32,vs35 - xxmrglw vs5, vs33,vs34 - - - xxmrghw vs2, vs35,vs32 - xxmrghw vs3, vs34,vs33 - xxmrghw vs6, vs32,vs35 - xxmrghw vs7, vs33,vs34 - - xxmrgld vs24, vs1, vs0 - xxmrghd vs25,vs5,vs4 - - xxmrgld vs26, vs2, vs3 - xxmrghd vs27,vs6,vs7 - - - xxmrglw vs0, vs51,vs48 - xxmrglw vs1, vs50,vs49 - xxmrglw vs4, vs48,vs51 - xxmrglw vs5, vs49,vs50 - - xxmrghw vs2, vs51,vs48 - xxmrghw vs3, vs50,vs49 - xxmrghw vs6, vs48,vs51 - xxmrghw vs7, vs49,vs50 - - xxmrgld vs28, vs1, vs0 - xxmrghd vs29,vs5,vs4 - - xxmrgld vs30, vs2, vs3 - xxmrghd vs31,vs6,vs7 -#if defined(TRMMKERNEL) - - xvmulsp vs36, vs24, alpha_r - xvmulsp vs37, vs25, alpha_r - xvmulsp vs38, vs26, alpha_r - xvmulsp vs39, vs27, alpha_r - xvmulsp vs40, vs28, alpha_r - xvmulsp vs41, vs29, alpha_r - xvmulsp vs42, vs30, alpha_r - xvmulsp vs43, vs31, alpha_r -#else - xvmaddasp vs36, vs24, alpha_r - xvmaddasp vs37, vs25, alpha_r - xvmaddasp vs38, vs26, alpha_r - xvmaddasp vs39, vs27, alpha_r - xvmaddasp vs40, vs28, alpha_r - xvmaddasp vs41, vs29, alpha_r - 
xvmaddasp vs42, vs30, alpha_r - xvmaddasp vs43, vs31, alpha_r -#endif - - stxv vs36, 0(CO) - stxv vs37, 0(T1) - stxv vs38, 0(T2) - stxv vs39, 0(T3) - stxv vs40, 0(T4) - stxv vs41, 0(T5) - stxv vs42, 0(T6) - stxv vs43, 0(T7) - - - addi CO,CO,16 -.endm - - -/********************************************************************************************** -* Macros for N=8 and M=2 -**********************************************************************************************/ - - -.macro KERNEL8x2_2 OffsetA,OffsetB, Index,IsLast - KERNEL8x2_I_2 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast -.endm - - - -.macro Zero8x2 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - xxlxor vs2, vs2, vs2 - xxlxor vs3, vs3, vs3 - -.endm - -.macro KERNEL8x2 - KERNEL8x2_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL8x2_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxsd v4, DISP2(\Index, 0+\OffsetA)(\AREG) - lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) - lxv vs27, DISP8(\Index,16+\OffsetB)(\BREG) - xxspltw vs8, vs36, 0 - xxspltw vs9, vs36, 1 - -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs1, vs27, vs8 - xvmulsp vs2, vs26, vs9 - xvmulsp vs3, vs27, vs9 - -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs2, vs26, vs9 - xvmaddasp vs3, vs27, vs9 - - .endif - - addi \AREG, \AREG, DISP2(\Index,8) - addi \BREG, \BREG, DISP8(\Index,32) - -.endm - -.macro KERNEL8x2_I_2 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast - - lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) - lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) - lxv vs27, DISP16(\Index,16+\OffsetB)(\BREG) - lxv vs28, DISP16(\Index,32+\OffsetB)(\BREG) - lxv vs29, DISP16(\Index,48+\OffsetB)(\BREG) - xxspltw vs8, vs4, 2 - xxspltw vs9, vs4, 3 - xxspltw vs10, vs4, 0 - xxspltw vs11, vs4, 1 - -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs1, vs27, vs8 - xvmulsp vs2, vs26, vs9 - xvmulsp vs3, vs27, vs9 - - xvmulsp vs0, vs28, vs10 - xvmulsp vs1, vs29, vs10 - xvmulsp vs2, vs28, vs11 - xvmulsp vs3, vs29, vs11 -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs2, vs26, vs9 - xvmaddasp vs3, vs27, vs9 - - xvmaddasp vs0, vs28, vs10 - xvmaddasp vs1, vs29, vs10 - xvmaddasp vs2, vs28, vs11 - xvmaddasp vs3, vs29, vs11 - .endif - - -.if \IsLast==1 - addi \AREG, \AREG, DISP4(\Index,16) - addi \BREG, \BREG, DISP16(\Index,64) -.endif - -.endm - - -.macro SAVE8x2 - slwi T10, LDC , 1 - add T1, CO, LDC - add T2, CO, T10 - add T3, T1, T10 - add T4, T2, T10 - add T5, T3, T10 - add T6, T4, T10 - add T7, T5, T10 - /*convert alpha_r for multiply*/ - xscvspdp vs4,alpha_r -/* v0 corresponds to vs32, do not forget*/ -#if !defined(TRMMKERNEL) - lxssp v0,0(CO) - lxssp v1,4(CO) - - lxssp v2,0(T1) - lxssp v3,4(T1) - - lxssp v4,0(T2) - lxssp v5,4(T2) - - lxssp v6,0(T3) - lxssp v7,4(T3) - - lxssp v8,0(T4) - lxssp v9,4(T4) - - lxssp v10,0(T5) - lxssp v11,4(T5) - - lxssp v12,0(T6) - lxssp v13,4(T6) - - lxssp v14,0(T7) - lxssp v15,4(T7) -#endif - xscvspdp vs5, vs2 - xxspltw vs6, vs2, 1 - xxspltw vs7, vs2, 2 - xxspltw vs8, vs2, 3 - xscvspdp vs6,vs6 - xscvspdp vs7,vs7 - xscvspdp vs8,vs8 - - xscvspdp vs24, vs0 - xxspltw vs25, vs0, 1 - xxspltw vs26, vs0, 2 - xxspltw vs27, vs0, 3 - xscvspdp vs25,vs25 - xscvspdp vs26,vs26 - xscvspdp vs27,vs27 - - xscvspdp vs9, vs3 - xxspltw vs10, vs3, 1 - xxspltw vs11, vs3, 2 - xxspltw vs12, vs3, 3 - xscvspdp vs10,vs10 - xscvspdp vs11,vs11 - xscvspdp vs12,vs12 - - xscvspdp vs28, vs1 - xxspltw vs29, vs1, 1 - xxspltw vs30, vs1, 2 - xxspltw vs31, vs1, 3 - xscvspdp vs29,vs29 - xscvspdp vs30,vs30 - xscvspdp vs31,vs31 - - - - -#if defined(TRMMKERNEL) - 
xsmuldp vs32,vs8, vs4 - xsmuldp vs33,vs27, vs4 - - xsmuldp vs34,vs7, vs4 - xsmuldp vs35,vs26, vs4 - - xsmuldp vs36,vs6, vs4 - xsmuldp vs37,vs25, vs4 - - xsmuldp vs38,vs5, vs4 - xsmuldp vs39,vs24, vs4 - - xsmuldp vs40,vs12, vs4 - xsmuldp vs41,vs31, vs4 - - xsmuldp vs42,vs11, vs4 - xsmuldp vs43,vs30, vs4 - - xsmuldp vs44,vs10, vs4 - xsmuldp vs45,vs29, vs4 - - xsmuldp vs46,vs9, vs4 - xsmuldp vs47,vs28, vs4 -#else - xsmaddadp vs32,vs8, vs4 - xsmaddadp vs33,vs27, vs4 - - xsmaddadp vs34,vs7, vs4 - xsmaddadp vs35,vs26, vs4 - - xsmaddadp vs36,vs6, vs4 - xsmaddadp vs37,vs25, vs4 - - xsmaddadp vs38,vs5, vs4 - xsmaddadp vs39,vs24, vs4 - - xsmaddadp vs40,vs12, vs4 - xsmaddadp vs41,vs31, vs4 - - xsmaddadp vs42,vs11, vs4 - xsmaddadp vs43,vs30, vs4 - - xsmaddadp vs44,vs10, vs4 - xsmaddadp vs45,vs29, vs4 - - xsmaddadp vs46,vs9, vs4 - xsmaddadp vs47,vs28, vs4 -#endif - - stxssp v0,0(CO) - stxssp v1,4(CO) - - stxssp v2,0(T1) - stxssp v3,4(T1) - - stxssp v4,0(T2) - stxssp v5,4(T2) - - stxssp v6,0(T3) - stxssp v7,4(T3) - - stxssp v8,0(T4) - stxssp v9,4(T4) - - stxssp v10,0(T5) - stxssp v11,4(T5) - - stxssp v12,0(T6) - stxssp v13,4(T6) - - stxssp v14,0(T7) - stxssp v15,4(T7) - - - addi CO,CO,8 -.endm - - -/********************************************************************************************** -* Macros for N=8 and M=1 -**********************************************************************************************/ -.macro KERNEL8x1_4 OffsetA,OffsetB, Index,IsLast - KERNEL8x1_I_4 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro Zero8x1 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 -.endm - -.macro KERNEL8x1 - KERNEL8x1_1 AO,BO, 0 -.endm - -.macro KERNEL8x1_2 - KERNEL8x1_2_1 AO,BO, 0 -.endm - -.macro KERNEL8x1_1 AREG,BREG,First - lxvwsx vs8, 0, \AREG - lxv vs26, 0(\BREG) - lxv vs27, 16(\BREG) -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs1, vs27, vs8 -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - .endif - addi \AREG, \AREG, 4 - addi \BREG, \BREG, 32 -.endm - -.macro KERNEL8x1_2_1 AREG,BREG,First - lxsd v4, 0(\AREG) - lxv vs26, 0(\BREG) - lxv vs27, 16(\BREG) - lxv vs28, 32(\BREG) - lxv vs29, 48(\BREG) - xxspltw vs8, vs36, 1 - xxspltw vs9, vs36, 0 -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs1, vs27, vs8 - xvmulsp vs0, vs28, vs9 - xvmulsp vs1, vs29, vs9 -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs0, vs28, vs9 - xvmaddasp vs1, vs29, vs9 - .endif - addi \AREG, \AREG, 8 - addi \BREG, \BREG, 64 -.endm - -.macro KERNEL8x1_I_4 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast - lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) - xxspltw vs8, vs4, 3 - xxspltw vs9, vs4, 2 - xxspltw vs10, vs4, 1 - xxspltw vs11, vs4, 0 - lxv vs26, DISP32(\Index, 0+\OffsetB)(\BREG) - lxv vs27, DISP32(\Index,16+\OffsetB)(\BREG) - lxv vs28, DISP32(\Index,32+\OffsetB)(\BREG) - lxv vs29, DISP32(\Index,48+\OffsetB)(\BREG) - lxv vs30, DISP32(\Index,64+ 0+\OffsetB)(\BREG) - lxv vs31, DISP32(\Index,64+16+\OffsetB)(\BREG) - lxv vs32, DISP32(\Index,64+32+\OffsetB)(\BREG) - lxv vs33, DISP32(\Index,64+48+\OffsetB)(\BREG) -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs1, vs27, vs8 - xvmulsp vs0, vs28, vs9 - xvmulsp vs1, vs29, vs9 - xvmulsp vs0, vs30, vs10 - xvmulsp vs1, vs31, vs10 - xvmulsp vs0, vs32, vs11 - xvmulsp vs1, vs33, vs11 -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs0, vs28, vs9 - xvmaddasp vs1, vs29, vs9 - xvmaddasp vs0, vs30, vs10 - xvmaddasp vs1, vs31, vs10 - xvmaddasp vs0, vs32, vs11 - xvmaddasp vs1, vs33, vs11 - .endif -.if \IsLast==1 - addi 
\AREG, \AREG, DISP4(\Index,16) - addi \BREG, \BREG, DISP32(\Index,128) -.endif -.endm - -.macro SAVE8x1 - slwi T10, LDC , 1 - add T1, CO, LDC - add T2, CO, T10 - add T3, T1, T10 - add T4, T2, T10 - add T5, T3, T10 - add T6, T4, T10 - add T7, T5, T10 - /*convert alpha_r for multiply*/ - xscvspdp vs4,alpha_r -/* v0 corresponds to vs32, do not forget*/ -#if !defined(TRMMKERNEL) - lxssp v0,0(CO) - lxssp v2,0(T1) - lxssp v4,0(T2) - lxssp v6,0(T3) - lxssp v8,0(T4) - lxssp v10,0(T5) - lxssp v12,0(T6) - lxssp v14,0(T7) -#endif - xscvspdp vs24, vs0 - xxspltw vs25, vs0, 1 - xxspltw vs26, vs0, 2 - xxspltw vs27, vs0, 3 - xscvspdp vs25,vs25 - xscvspdp vs26,vs26 - xscvspdp vs27,vs27 - xscvspdp vs28, vs1 - xxspltw vs29, vs1, 1 - xxspltw vs30, vs1, 2 - xxspltw vs31, vs1, 3 - xscvspdp vs29,vs29 - xscvspdp vs30,vs30 - xscvspdp vs31,vs31 -#if defined(TRMMKERNEL) - xsmuldp vs32,vs27, vs4 - xsmuldp vs34,vs26, vs4 - xsmuldp vs36,vs25, vs4 - xsmuldp vs38,vs24, vs4 - xsmuldp vs40,vs31, vs4 - xsmuldp vs42,vs30, vs4 - xsmuldp vs44,vs29, vs4 - xsmuldp vs46,vs28, vs4 -#else - xsmaddadp vs32,vs27, vs4 - xsmaddadp vs34,vs26, vs4 - xsmaddadp vs36,vs25, vs4 - xsmaddadp vs38,vs24, vs4 - xsmaddadp vs40,vs31, vs4 - xsmaddadp vs42,vs30, vs4 - xsmaddadp vs44,vs29, vs4 - xsmaddadp vs46,vs28, vs4 -#endif - stxssp v0,0(CO) - stxssp v2,0(T1) - stxssp v4,0(T2) - stxssp v6,0(T3) - stxssp v8,0(T4) - stxssp v10,0(T5) - stxssp v12,0(T6) - stxssp v14,0(T7) - addi CO,CO,4 -.endm - - - -/********************************************************************************************** -* Macros for N=4 and M=16 -**********************************************************************************************/ - -.macro LOAD4x16_1 - LOAD4x16 1 -.endm - -.macro LOAD4x16_0 - LOAD4x16 0 -.endm - -.macro KERNEL4x16_L1_L4 Index,IsLast - KERNEL4x16_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 -.endm - -.macro KERNEL4x16_I1_L4 OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x16_I1_L4_2 OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x16_I1_L4_3 OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm -.macro KERNEL4x16_I1_L2_3 OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro KERNEL4x16_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x16_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro Zero4X16 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 -.endm - -.macro LOAD4x16 Zero - - lxv vs24, 0(BO) - lxv vs0, 0(AO) - lxv vs1, 16(AO) - lxv vs2, 32(AO) - lxv vs3, 48(AO) - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 - -.if \Zero==1 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor 
vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 - -.endif -.endm - -.macro END4x16_NORMAL - END4x16 0, AO, BO, 64,16 -.endm - -.macro END4x16 First, AREG, BREG, OffsetA, OffsetB - -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - -.if \First==1 - xvmulsp vs32, vs0,vs24 - xvmulsp vs33, vs1,vs24 - xvmulsp vs34, vs2,vs24 - xvmulsp vs35, vs3,vs24 - - xvmulsp vs36, vs0,vs25 - xvmulsp vs37, vs1,vs25 - xvmulsp vs38, vs2,vs25 - xvmulsp vs39, vs3,vs25 - - xvmulsp vs40, vs0,vs26 - xvmulsp vs41, vs1,vs26 - xvmulsp vs42, vs2,vs26 - xvmulsp vs43, vs3,vs26 - - xvmulsp vs44, vs0,vs27 - xvmulsp vs45, vs1,vs27 - xvmulsp vs46, vs2,vs27 - xvmulsp vs47, vs3,vs27 - -.else - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 - -.endif -.endm - -.macro KERNEL4x16_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG) - - lxv vs4, DISP64(\Index, 0+\OffsetA)(\AREG) - lxv vs5, DISP64(\Index,16+\OffsetA)(\AREG) - lxv vs6, DISP64(\Index,32+\OffsetA)(\AREG) - lxv vs7, DISP64(\Index,48+\OffsetA)(\AREG) - - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 - - xxpermdi vs11, vs10, vs10,2 - - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 - - - - lxv vs24, DISP16(\Index,16+\OffsetB)(\BREG) - - lxv vs0, DISP64(\Index,64+\OffsetA)(\AREG) - lxv vs1, DISP64(\Index,64+16+\OffsetA)(\AREG) - lxv vs2, DISP64(\Index,64+32+\OffsetA)(\AREG) - lxv vs3, DISP64(\Index,64+48+\OffsetA)(\AREG) - - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - - - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs34, vs6,vs8 - xvmaddasp vs35, vs7,vs8 - - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs38, vs6,vs9 - xvmaddasp vs39, vs7,vs9 - - xxpermdi vs27, vs26, vs26,2 - - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - xvmaddasp vs42, vs6,vs10 - xvmaddasp vs43, vs7,vs10 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - xvmaddasp vs46, vs6,vs11 - xvmaddasp vs47, vs7,vs11 - - - lxv vs8, DISP16(\Index,32+\OffsetB)(\BREG) - - lxv vs4, DISP64(\Index,128+0+\OffsetA)(\AREG) - lxv vs5, DISP64(\Index,128+16+\OffsetA)(\AREG) - lxv vs6, DISP64(\Index,128+32+\OffsetA)(\AREG) - lxv vs7, DISP64(\Index,128+48+\OffsetA)(\AREG) - - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 - - xxpermdi vs11, vs10, vs10,2 - - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, 
vs1,vs26 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 - - - -.if \Complete==0 - lxv vs24, DISP16(\Index,48+\OffsetB)(\BREG) - - lxv vs0, DISP64(\Index,192+\OffsetA)(\AREG) - lxv vs1, DISP64(\Index,192+16+\OffsetA)(\AREG) - lxv vs2, DISP64(\Index,192+32+\OffsetA)(\AREG) - lxv vs3, DISP64(\Index,192+48+\OffsetA)(\AREG) - - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - -.endif -.if \IsLast==1 -.if \Complete==1 - - addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB) - addi \AREG, \AREG, DISP64(\Index,64*3+\OffsetA) -.else - - addi \BREG, \BREG, DISP16(\Index,64) - addi \AREG, \AREG, DISP64(\Index,256) -.endif -.endif - - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs34, vs6,vs8 - xvmaddasp vs35, vs7,vs8 - - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs38, vs6,vs9 - xvmaddasp vs39, vs7,vs9 - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 - -.endif - - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - xvmaddasp vs42, vs6,vs10 - xvmaddasp vs43, vs7,vs10 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - xvmaddasp vs46, vs6,vs11 - xvmaddasp vs47, vs7,vs11 - - - -.endm - -.macro KERNEL4x16 First - - LOAD4x16 0 - END4x16 \First, AO, BO, 64,16 -.endm - -.macro KERNEL4x16_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) - lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG) - lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) - lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG) - lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG) - - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 -.if \First==1 - xvmulsp vs32, vs0,vs24 - xvmulsp vs33, vs1,vs24 - xvmulsp vs34, vs2,vs24 - xvmulsp vs35, vs3,vs24 - - xvmulsp vs36, vs0,vs25 - xvmulsp vs37, vs1,vs25 - xvmulsp vs38, vs2,vs25 - xvmulsp vs39, vs3,vs25 -.else - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 -.endif - - xxpermdi vs11, vs10, vs10,2 - -.if \First==1 - xvmulsp vs40, vs0,vs26 - xvmulsp vs41, vs1,vs26 - xvmulsp vs42, vs2,vs26 - xvmulsp vs43, vs3,vs26 - - xvmulsp vs44, vs0,vs27 - xvmulsp vs45, vs1,vs27 - xvmulsp vs46, vs2,vs27 - xvmulsp vs47, vs3,vs27 - - -.else - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 - - -.endif -.if \Complete==0 - lxv vs24, DISP8(\Index,16+\OffsetB)(\BREG) - lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG) - lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG) - lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG) - lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG) - - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 -.endif -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP8(\Index,16+\OffsetB) - addi \AREG, \AREG, DISP32(\Index,64+\OffsetA) - -.else - addi \BREG, \BREG, DISP8(\Index,32) - addi \AREG, \AREG, DISP32(\Index,128) -.endif -.endif - -.if \First==1 - xvmulsp vs32, vs4,vs8 - xvmulsp vs33, vs5,vs8 - xvmulsp vs34, vs6,vs8 - xvmulsp vs35, vs7,vs8 - - xvmulsp vs36, vs4,vs9 - xvmulsp vs37, vs5,vs9 - xvmulsp vs38, vs6,vs9 - xvmulsp vs39, vs7,vs9 -.else - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs34, vs6,vs8 - xvmaddasp vs35, vs7,vs8 - - xvmaddasp vs36, 
vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs38, vs6,vs9 - xvmaddasp vs39, vs7,vs9 -.endif - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 - -.endif -.if \First==1 - xvmulsp vs40, vs4,vs10 - xvmulsp vs41, vs5,vs10 - xvmulsp vs42, vs6,vs10 - xvmulsp vs43, vs7,vs10 - - xvmulsp vs44, vs4,vs11 - xvmulsp vs45, vs5,vs11 - xvmulsp vs46, vs6,vs11 - xvmulsp vs47, vs7,vs11 - - - -.else - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - xvmaddasp vs42, vs6,vs10 - xvmaddasp vs43, vs7,vs10 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - xvmaddasp vs46, vs6,vs11 - xvmaddasp vs47, vs7,vs11 - - - -.endif - -.endm - - -.macro SAVE4x16 - - slwi T10, LDC , 1 - add T1, CO, LDC - - add T2, CO, T10 - add T3, T1, T10 - - - - xxmrglw vs8, vs32, vs44 - xxmrglw vs10, vs36, vs40 - - xxmrghw vs1, vs32, vs44 - xxmrghw vs0, vs36, vs40 - - xxmrglw vs12, vs33, vs45 - xxmrglw vs14, vs37, vs41 - - xxmrghw vs2, vs37, vs41 - xxmrghw vs3, vs33, vs45 - - xxmrglw vs16, vs34, vs46 - xxmrglw vs18, vs38, vs42 - - xxlor vs9, vs8, vs8 - xxlor vs11, vs10, vs10 - - xxmrghw vs4, vs38, vs42 - xxmrghw vs5, vs34, vs46 - - xxlor vs13, vs12, vs12 - xxlor vs15, vs14, vs14 - - xxmrglw vs24, vs35, vs47 - xxmrglw vs26, vs39, vs43 - - xxlor vs17, vs16, vs16 - xxlor vs19, vs18, vs18 - - xxmrghw vs30, vs39, vs43 - xxmrghw vs31, vs35, vs47 - - xxperm vs8, vs0, save_permute_1 - xxperm vs10, vs1, save_permute_1 - xxperm vs9, vs0, save_permute_2 - xxperm vs11, vs1, save_permute_2 - -#ifndef TRMMKERNEL - lxv vs32, 0(CO) - lxv vs33, 16(CO) - lxv vs34, 32(CO) - lxv vs35, 48(CO) -#endif - xxlor vs25, vs24, vs24 - xxlor vs27, vs26, vs26 - -#ifndef TRMMKERNEL - lxv vs36, 0(T1) - lxv vs37, 16(T1) - lxv vs38, 32(T1) - lxv vs39, 48(T1) -#endif -#ifndef TRMMKERNEL - lxv vs40, 0(T2) - lxv vs41, 16(T2) - lxv vs42, 32(T2) - lxv vs43, 48(T2) -#endif -#ifndef TRMMKERNEL - lxv vs44, 0(T3) - lxv vs45, 16(T3) - lxv vs46, 32(T3) - lxv vs47, 48(T3) -#endif - - xxperm vs12, vs2, save_permute_1 - xxperm vs14, vs3, save_permute_1 - - xxperm vs13, vs2, save_permute_2 - xxperm vs15, vs3, save_permute_2 - - xxperm vs16, vs4, save_permute_1 - xxperm vs18, vs5, save_permute_1 - - xxperm vs17, vs4, save_permute_2 - xxperm vs19, vs5, save_permute_2 - - xxperm vs24, vs30, save_permute_1 - xxperm vs26, vs31, save_permute_1 - - xxperm vs25, vs30, save_permute_2 - xxperm vs27, vs31, save_permute_2 - - - /* multiply add normal way */ - -#ifdef TRMMKERNEL - xvmulsp vs32, vs8, alpha_r - xvmulsp vs33, vs12, alpha_r - xvmulsp vs34, vs16, alpha_r - xvmulsp vs35, vs24, alpha_r - xvmulsp vs36, vs9, alpha_r - xvmulsp vs37, vs13, alpha_r - xvmulsp vs38, vs17, alpha_r - xvmulsp vs39, vs25, alpha_r -#else - xvmaddasp vs32, vs8, alpha_r - xvmaddasp vs33, vs12, alpha_r - xvmaddasp vs34, vs16, alpha_r - xvmaddasp vs35, vs24, alpha_r - xvmaddasp vs36, vs9, alpha_r - xvmaddasp vs37, vs13, alpha_r - xvmaddasp vs38, vs17, alpha_r - xvmaddasp vs39, vs25, alpha_r -#endif - - - -#ifdef TRMMKERNEL - xvmulsp vs40, vs10, alpha_r - xvmulsp vs41, vs14, alpha_r - xvmulsp vs42, vs18, alpha_r - xvmulsp vs43, vs26, alpha_r - xvmulsp vs44, vs11, alpha_r - xvmulsp vs45, vs15, alpha_r - xvmulsp vs46, vs19, alpha_r - xvmulsp vs47, vs27, alpha_r -#else - - xvmaddasp vs40, vs10, alpha_r - xvmaddasp vs41, vs14, alpha_r - xvmaddasp vs42, vs18, alpha_r - xvmaddasp vs43, vs26, alpha_r - xvmaddasp vs44, vs11, alpha_r - xvmaddasp vs45, vs15, alpha_r - xvmaddasp vs46, vs19, alpha_r - xvmaddasp vs47, vs27, alpha_r - -#endif - - stxv vs32, 0(CO) - stxv vs33, 16(CO) - stxv vs34, 32(CO) - stxv vs35, 48(CO) - - 
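The TRMMKERNEL conditionals above select between the two save policies used by all of these SAVE macros: the TRMM path owns the output block and overwrites it with alpha times the accumulators (xvmulsp), while the GEMM path loads the existing C tile first and accumulates alpha times the result into it (xvmaddasp); any beta scaling is assumed to have been applied to C before this kernel runs. A rough C equivalent of the policy, with hypothetical names (acc, tile_m, tile_n, is_trmm) that do not appear in this file:

/* Write an MxN accumulator tile to C (leading dimension ldc).
   TRMM path: C = alpha * acc        (C is not read)
   GEMM path: C = C + alpha * acc    (beta assumed handled elsewhere) */
static void save_tile(float *c, int ldc, const float *acc,
                      int tile_m, int tile_n, float alpha, int is_trmm)
{
    for (int j = 0; j < tile_n; j++)
        for (int i = 0; i < tile_m; i++) {
            float v = alpha * acc[j * tile_m + i];
            c[j * ldc + i] = is_trmm ? v : c[j * ldc + i] + v;
        }
}

The stores that follow write rows 1 through 3 (T1-T3) in the same way row 0 was just written to CO.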
stxv vs36, 0(T1) - stxv vs37, 16(T1) - stxv vs38, 32(T1) - stxv vs39, 48(T1) - - stxv vs40, 0(T2) - stxv vs41, 16(T2) - stxv vs42, 32(T2) - stxv vs43, 48(T2) - stxv vs44, 0(T3) - stxv vs45, 16(T3) - stxv vs46, 32(T3) - stxv vs47, 48(T3) - - addi CO,CO,64 - - -.endm - - - -/********************************************************************************************** -* Macros for N=4 and M=8 -**********************************************************************************************/ - -.macro LOAD4x8_1 - LOAD4x8 1 -.endm - -.macro LOAD4x8_0 - LOAD4x8 0 -.endm - -.macro KERNEL4x8_L1_L4 Index,IsLast - KERNEL4x8_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 -.endm - -.macro KERNEL4x8_I1_L4 OffsetA,OffsetB, Index,IsLast - KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x8_I1_L4_2 OffsetA,OffsetB, Index,IsLast - KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x8_I1_L4_3 OffsetA,OffsetB, Index,IsLast - KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm -.macro KERNEL4x8_I1_L2_3 OffsetA,OffsetB, Index,IsLast - KERNEL4x8_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro KERNEL4x8_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL4x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x8_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL4x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro END4x8_NORMAL - END4x8 0, AO, BO, 32,16 -.endm - -.macro Zero4X8 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - -.endm - -.macro LOAD4x8 Zero - - lxv vs24, 0(BO) - lxv vs0, 0(AO) - lxv vs1, 16(AO) - - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - - xxpermdi vs27, vs26, vs26,2 - -.if \Zero==1 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - -.endif -.endm - - -.macro END4x8 First, AREG, BREG, OffsetA, OffsetB - -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - -.if \First==1 - xvmulsp vs32, vs0,vs24 - xvmulsp vs33, vs1,vs24 - - xvmulsp vs36, vs0,vs25 - xvmulsp vs37, vs1,vs25 - - xvmulsp vs40, vs0,vs26 - xvmulsp vs41, vs1,vs26 - - xvmulsp vs44, vs0,vs27 - xvmulsp vs45, vs1,vs27 - - -.else - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - - -.endif -.endm - -.macro KERNEL4x8_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG) - - lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG) - lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) - - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - - xxpermdi vs11, vs10, vs10,2 - - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - - - - lxv vs24, DISP16(\Index,16+\OffsetB)(\BREG) - - lxv vs0, DISP32(\Index,32+\OffsetA)(\AREG) - lxv vs1, DISP32(\Index,32+16+\OffsetA)(\AREG) - - xxperm vs26, vs24, permute_mask - xxpermdi vs25, 
vs24, vs24,2 - - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - - xxpermdi vs27, vs26, vs26,2 - - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - - - - lxv vs8, DISP16(\Index,32+\OffsetB)(\BREG) - - lxv vs4, DISP32(\Index,64+0+\OffsetA)(\AREG) - lxv vs5, DISP32(\Index,64+16+\OffsetA)(\AREG) - - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - - xxpermdi vs11, vs10, vs10,2 - - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - - - -.if \Complete==0 - lxv vs24, DISP16(\Index,48+\OffsetB)(\BREG) - - lxv vs0, DISP32(\Index,96+\OffsetA)(\AREG) - lxv vs1, DISP32(\Index,96+16+\OffsetA)(\AREG) - - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - -.endif -.if \IsLast==1 -.if \Complete==1 - - addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB) - addi \AREG, \AREG, DISP32(\Index,32*3+\OffsetA) -.else - - addi \BREG, \BREG, DISP16(\Index,64) - addi \AREG, \AREG, DISP32(\Index,128) -.endif -.endif - - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 - -.endif - - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - - - -.endm - -.macro KERNEL4x8 First - - LOAD4x8 0 - END4x8 \First, AO, BO, 32,16 -.endm - -.macro KERNEL4x8_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) - lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) - lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) - - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 -.if \First==1 - xvmulsp vs32, vs0,vs24 - xvmulsp vs33, vs1,vs24 - - xvmulsp vs36, vs0,vs25 - xvmulsp vs37, vs1,vs25 - -.else - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - -.endif - - xxpermdi vs11, vs10, vs10,2 - -.if \First==1 - xvmulsp vs40, vs0,vs26 - xvmulsp vs41, vs1,vs26 - - xvmulsp vs44, vs0,vs27 - xvmulsp vs45, vs1,vs27 - - -.else - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - - -.endif -.if \Complete==0 - lxv vs24, DISP8(\Index,16+\OffsetB)(\BREG) - - lxv vs0, DISP16(\Index,32+\OffsetA)(\AREG) - lxv vs1, DISP16(\Index,32+16+\OffsetA)(\AREG) - - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 -.endif -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP8(\Index,16+\OffsetB) - addi \AREG, \AREG, DISP16(\Index,32+\OffsetA) - -.else - addi \BREG, \BREG, DISP8(\Index,32) - addi \AREG, \AREG, DISP16(\Index,64) -.endif -.endif - -.if \First==1 - xvmulsp vs32, vs4,vs8 - xvmulsp vs33, vs5,vs8 - - xvmulsp vs36, vs4,vs9 - xvmulsp vs37, vs5,vs9 - -.else - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - -.endif - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 - -.endif -.if \First==1 - xvmulsp vs40, vs4,vs10 - xvmulsp vs41, vs5,vs10 - - xvmulsp vs44, vs4,vs11 - xvmulsp vs45, vs5,vs11 - -.else - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - -.endif - -.endm - - -.macro SAVE4x8 - - slwi T10, LDC , 1 - add T1, CO, LDC - - add T2, CO, T10 - add T3, T1, T10 - - - -#ifndef TRMMKERNEL - lxv vs34, 0(CO) 
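[Editor's note - reference sketch, not part of the patch.] The KERNEL4x8 / END4x8 macros above consume eight packed floats of A (vs0/vs1) and four packed floats of B (vs24) per k step and accumulate a 4x8 tile in vs32/33, vs36/37, vs40/41, vs44/45, with the B factors kept as permuted vector copies rather than scalar broadcasts; SAVE4x8 later unscrambles that layout and applies alpha (xvmaddasp into the loaded C for GEMM, xvmulsp overwrite when TRMMKERNEL is defined). The C below is only a rough scalar model of that arithmetic under the usual packed-panel, column-major assumptions; the function name and argument order are invented for illustration and do not correspond to a symbol in this file.

    /* Scalar reference for the 4x8 micro-tile computed by KERNEL4x8/SAVE4x8.
       A: packed panel, 8 floats per k.  B: packed panel, 4 floats per k.
       C: column-major, leading dimension ldc.  trmm != 0 models TRMMKERNEL. */
    void kernel4x8_ref(long K, const float *A, const float *B,
                       float *C, long ldc, float alpha, int trmm)
    {
        float acc[4][8] = {{0.0f}};
        for (long k = 0; k < K; k++)
            for (int j = 0; j < 4; j++)          /* one B value per column j  */
                for (int i = 0; i < 8; i++)      /* eight A values per row i  */
                    acc[j][i] += A[8*k + i] * B[4*k + j];
        for (int j = 0; j < 4; j++)
            for (int i = 0; i < 8; i++)
                C[j*ldc + i] = trmm ? alpha * acc[j][i]
                                    : C[j*ldc + i] + alpha * acc[j][i];
    }

The vector code reaches the same result, but each accumulator register holds lanes belonging to different (i, j) pairs, which is why SAVE4x8 needs the xxmrglw/xxmrghw merges and the save_permute_1/2 shuffles before storing.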
- lxv vs35, 16(CO) - lxv vs38, 0(T1) - lxv vs39, 16(T1) - lxv vs42, 0(T2) - lxv vs43, 16(T2) - lxv vs46, 0(T3) - lxv vs47, 16(T3) - - -#endif - - xxmrglw vs8, vs32, vs44 - xxmrglw vs10, vs36, vs40 - - xxmrghw vs1, vs32, vs44 - xxmrghw vs0, vs36, vs40 - - xxmrglw vs12, vs33, vs45 - xxmrglw vs14, vs37, vs41 - - xxmrghw vs2, vs37, vs41 - xxmrghw vs3, vs33, vs45 - - xxlor vs9, vs8, vs8 - xxlor vs11, vs10, vs10 - - xxlor vs13, vs12, vs12 - xxlor vs15, vs14, vs14 - - xxperm vs8, vs0, save_permute_1 - xxperm vs10, vs1, save_permute_1 - xxperm vs9, vs0, save_permute_2 - xxperm vs11, vs1, save_permute_2 - - xxperm vs12, vs2, save_permute_1 - xxperm vs14, vs3, save_permute_1 - - xxperm vs13, vs2, save_permute_2 - xxperm vs15, vs3, save_permute_2 - - - /* multiply add normal way */ - -#ifdef TRMMKERNEL - xvmulsp vs34, vs8, alpha_r - xvmulsp vs35, vs12, alpha_r - xvmulsp vs38, vs9, alpha_r - xvmulsp vs39, vs13, alpha_r - xvmulsp vs42, vs10, alpha_r - xvmulsp vs43, vs14, alpha_r - xvmulsp vs46, vs11, alpha_r - xvmulsp vs47, vs15, alpha_r -#else - xvmaddasp vs34, vs8, alpha_r - xvmaddasp vs35, vs12, alpha_r - xvmaddasp vs38, vs9, alpha_r - xvmaddasp vs39, vs13, alpha_r - xvmaddasp vs42, vs10, alpha_r - xvmaddasp vs43, vs14, alpha_r - xvmaddasp vs46, vs11, alpha_r - xvmaddasp vs47, vs15, alpha_r -#endif - - - stxv vs34, 0(CO) - stxv vs35, 16(CO) - stxv vs38, 0(T1) - stxv vs39, 16(T1) - stxv vs42, 0(T2) - stxv vs43, 16(T2) - stxv vs46, 0(T3) - stxv vs47, 16(T3) - - - addi CO,CO,32 - -.endm - - -/********************************************************************************************** -* Macros for N=4 and M=4 -**********************************************************************************************/ - -.macro LOAD4x4_1 - LOAD4x4 1 -.endm - -.macro LOAD4x4_0 - LOAD4x4 0 -.endm - -.macro KERNEL4x4_L1_L4 Index,IsLast - KERNEL4x4_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 -.endm - -.macro KERNEL4x4_I1_L4 OffsetA,OffsetB, Index,IsLast - KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x4_I1_L4_2 OffsetA,OffsetB, Index,IsLast - KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x4_I1_L4_3 OffsetA,OffsetB, Index,IsLast - KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm -.macro KERNEL4x4_I1_L2_3 OffsetA,OffsetB, Index,IsLast - KERNEL4x4_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro KERNEL4x4_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL4x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x4_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL4x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro Zero4X4 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - -.endm - -.macro LOAD4x4 Zero - - lxv vs0, 0(AO) - lxv vs24, 0(BO) - - - - xxperm vs2, vs0, permute_mask - xxpermdi vs1, vs0, vs0,2 - xxpermdi vs3, vs2, vs2,2 - -.if \Zero==1 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - -.endif -.endm - -.macro END4x4_NORMAL - END4x4 0, AO, BO, 16,16 -.endm - -.macro END4x4 First, AREG, BREG, OffsetA, OffsetB - -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - -.if \First==1 - xvmulsp vs32, vs24, vs0 - xvmulsp vs33, vs24, vs1 - xvmulsp vs34, vs24, vs2 - xvmulsp vs35, vs24, vs3 -.else - xvmaddasp vs32, vs24, vs0 - xvmaddasp vs33, vs24, vs1 - xvmaddasp vs34, vs24, vs2 - 
xvmaddasp vs35, vs24, vs3 - - -.endif -.endm - -.macro KERNEL4x4_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) - lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) - - xxperm vs6, vs4, permute_mask - xxpermdi vs5, vs4, vs4,2 - xxpermdi vs7, vs6, vs6,2 - - xvmaddasp vs32, vs24, vs0 - xvmaddasp vs33, vs24, vs1 - xvmaddasp vs34, vs24, vs2 - xvmaddasp vs35, vs24, vs3 - - - lxv vs0, DISP16(\Index, 16+\OffsetA)(\AREG) - lxv vs24, DISP16(\Index, 16+\OffsetB)(\BREG) - - xxperm vs2, vs0, permute_mask - xxpermdi vs1, vs0, vs0,2 - xxpermdi vs3, vs2, vs2,2 - - xvmaddasp vs32, vs26, vs4 - xvmaddasp vs33, vs26, vs5 - xvmaddasp vs34, vs26, vs6 - xvmaddasp vs35, vs26, vs7 - - - - lxv vs4, DISP16(\Index, 32+\OffsetA)(\AREG) - lxv vs26, DISP16(\Index, 32+\OffsetB)(\BREG) - - xxperm vs6, vs4, permute_mask - xxpermdi vs5, vs4, vs4,2 - xxpermdi vs7, vs6, vs6,2 - - xvmaddasp vs32, vs24, vs0 - xvmaddasp vs33, vs24, vs1 - xvmaddasp vs34, vs24, vs2 - xvmaddasp vs35, vs24, vs3 - - -.if \Complete==0 - - lxv vs0, DISP16(\Index, 48+\OffsetA)(\AREG) - lxv vs24, DISP16(\Index, 48+\OffsetB)(\BREG) - - xxperm vs2, vs0, permute_mask - xxpermdi vs1, vs0, vs0,2 - xxpermdi vs3, vs2, vs2,2 -.endif - xvmaddasp vs32, vs26, vs4 - xvmaddasp vs33, vs26, vs5 - xvmaddasp vs34, vs26, vs6 - xvmaddasp vs35, vs26, vs7 - - - - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP16(\Index,16*3+\OffsetA) - addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB) - -.else - addi \AREG, \AREG, DISP16(\Index,64) - addi \BREG, \BREG, DISP16(\Index,64) - -.endif -.endif - - -.endm - -.macro KERNEL4x4 First - LOAD4x4 0 - END4x4 \First, AO, BO, 16,16 -.endm - -.macro KERNEL4x4_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs4, DISP8(\Index, 0+\OffsetA)(\AREG) - lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) - - xxperm vs6, vs4, permute_mask - xxpermdi vs5, vs4, vs4,2 - xxpermdi vs7, vs6, vs6,2 -.if \First==1 - xvmulsp vs32, vs24, vs0 - xvmulsp vs33, vs24, vs1 - xvmulsp vs34, vs24, vs2 - xvmulsp vs35, vs24, vs3 - -.else - xvmaddasp vs32, vs24, vs0 - xvmaddasp vs33, vs24, vs1 - xvmaddasp vs34, vs24, vs2 - xvmaddasp vs35, vs24, vs3 - -.endif - -.if \Complete==0 - - lxv vs0, DISP8(\Index, 16+\OffsetA)(\AREG) - lxv vs24, DISP8(\Index, 16+\OffsetB)(\BREG) - - xxperm vs2, vs0, permute_mask - xxpermdi vs1, vs0, vs0,2 - xxpermdi vs3, vs2, vs2,2 -.endif - -.if \First==1 - xvmulsp vs32, vs26, vs4 - xvmulsp vs33, vs26, vs5 - xvmulsp vs34, vs26, vs6 - xvmulsp vs35, vs26, vs7 - - -.else - xvmaddasp vs32, vs26, vs4 - xvmaddasp vs33, vs26, vs5 - xvmaddasp vs34, vs26, vs6 - xvmaddasp vs35, vs26, vs7 - -.endif - - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP8(\Index,16+\OffsetA) - addi \BREG, \BREG, DISP8(\Index,16+\OffsetB) - -.else - addi \AREG, \AREG, DISP8(\Index,32) - addi \BREG, \BREG, DISP8(\Index,32) - -.endif -.endif - - -.endm - - -.macro SAVE4x4 - slwi T10, LDC , 1 - add T1, CO, LDC -#if !defined(TRMMKERNEL) - lxv vs36, 0(CO) - lxv vs37, 0(T1) -#endif - add T2, CO, T10 - add T3, T1, T10 -#if !defined(TRMMKERNEL) - lxv vs38, 0(T2) - lxv vs39, 0(T3) -#endif - - xxmrglw vs0, vs35,vs32 - xxmrglw vs1, vs34,vs33 - xxmrglw vs4, vs32,vs35 - xxmrglw vs5, vs33,vs34 - - - xxmrghw vs2, vs35,vs32 - xxmrghw vs3, vs34,vs33 - xxmrghw vs6, vs32,vs35 - xxmrghw vs7, vs33,vs34 - - xxmrgld vs24, vs1, vs0 - xxmrghd vs25,vs5,vs4 - - xxmrgld vs26, vs2, vs3 - xxmrghd vs27,vs6,vs7 - - #if defined(TRMMKERNEL) - xvmulsp vs36, vs24, alpha_r - xvmulsp vs37, vs25, alpha_r - xvmulsp vs38, vs26, 
alpha_r - xvmulsp vs39, vs27, alpha_r -#else - xvmaddasp vs36, vs24, alpha_r - xvmaddasp vs37, vs25, alpha_r - xvmaddasp vs38, vs26, alpha_r - xvmaddasp vs39, vs27, alpha_r - #endif - stxv vs36, 0(CO) - stxv vs37, 0(T1) - stxv vs38, 0(T2) - stxv vs39, 0(T3) - - - - addi CO,CO,16 -.endm - - -/********************************************************************************************** -* Macros for N=4 and M=2 -**********************************************************************************************/ - - -.macro KERNEL4x2_2 OffsetA,OffsetB, Index,IsLast - KERNEL4x2_I_2 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast -.endm - - - -.macro Zero4x2 - xxlxor vs0, vs0, vs0 - xxlxor vs2, vs2, vs2 - -.endm - -.macro KERNEL4x2 - KERNEL4x2_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL4x2_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxsd v4, DISP2(\Index, 0+\OffsetA)(\AREG) - lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) - xxspltw vs8, vs36, 0 - xxspltw vs9, vs36, 1 - -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs2, vs26, vs9 - -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs2, vs26, vs9 - - .endif - - addi \AREG, \AREG, DISP2(\Index,8) - addi \BREG, \BREG, DISP4(\Index,16) - -.endm - -.macro KERNEL4x2_I_2 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast - - lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) - lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) - lxv vs28, DISP8(\Index,16+\OffsetB)(\BREG) - xxspltw vs8, vs4, 2 - xxspltw vs9, vs4, 3 - xxspltw vs10, vs4, 0 - xxspltw vs11, vs4, 1 - -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs2, vs26, vs9 - - xvmulsp vs0, vs28, vs10 - xvmulsp vs2, vs28, vs11 -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs2, vs26, vs9 - - xvmaddasp vs0, vs28, vs10 - xvmaddasp vs2, vs28, vs11 - .endif - - -.if \IsLast==1 - addi \AREG, \AREG, DISP4(\Index,16) - addi \BREG, \BREG, DISP8(\Index,32) -.endif - -.endm - - -.macro SAVE4x2 - slwi T10, LDC , 1 - add T1, CO, LDC - add T2, CO, T10 - add T3, T1, T10 - /*convert alpha_r for multiply*/ - xscvspdp vs4,alpha_r -/* v0 corresponds to vs32, do not forget*/ -#if !defined(TRMMKERNEL) - lxssp v0,0(CO) - lxssp v1,4(CO) - - lxssp v2,0(T1) - lxssp v3,4(T1) - - lxssp v4,0(T2) - lxssp v5,4(T2) - - lxssp v6,0(T3) - lxssp v7,4(T3) - - -#endif - xscvspdp vs5, vs2 - xxspltw vs6, vs2, 1 - xxspltw vs7, vs2, 2 - xxspltw vs8, vs2, 3 - xscvspdp vs6,vs6 - xscvspdp vs7,vs7 - xscvspdp vs8,vs8 - - xscvspdp vs24, vs0 - xxspltw vs25, vs0, 1 - xxspltw vs26, vs0, 2 - xxspltw vs27, vs0, 3 - xscvspdp vs25,vs25 - xscvspdp vs26,vs26 - xscvspdp vs27,vs27 - - -#if defined(TRMMKERNEL) - xsmuldp vs32,vs8, vs4 - xsmuldp vs33,vs27, vs4 - - xsmuldp vs34,vs7, vs4 - xsmuldp vs35,vs26, vs4 - - xsmuldp vs36,vs6, vs4 - xsmuldp vs37,vs25, vs4 - - xsmuldp vs38,vs5, vs4 - xsmuldp vs39,vs24, vs4 - - -#else - xsmaddadp vs32,vs8, vs4 - xsmaddadp vs33,vs27, vs4 - - xsmaddadp vs34,vs7, vs4 - xsmaddadp vs35,vs26, vs4 - - xsmaddadp vs36,vs6, vs4 - xsmaddadp vs37,vs25, vs4 - - xsmaddadp vs38,vs5, vs4 - xsmaddadp vs39,vs24, vs4 - - -#endif - - stxssp v0,0(CO) - stxssp v1,4(CO) - - stxssp v2,0(T1) - stxssp v3,4(T1) - - stxssp v4,0(T2) - stxssp v5,4(T2) - - stxssp v6,0(T3) - stxssp v7,4(T3) - - - - - addi CO,CO,8 -.endm - - -/********************************************************************************************** -* Macros for N=4 and M=1 -**********************************************************************************************/ -.macro KERNEL4x1_4 OffsetA,OffsetB, Index,IsLast - KERNEL4x1_I_4 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro Zero4x1 - xxlxor vs0, vs0, 
vs0 -.endm - -.macro KERNEL4x1 - KERNEL4x1_1 AO,BO, 0 -.endm - -.macro KERNEL4x1_2 - KERNEL4x1_2_1 AO,BO, 0 -.endm - -.macro KERNEL4x1_1 AREG,BREG,First - lxvwsx vs8, 0, \AREG - lxv vs26, 0(\BREG) -.if \First==1 - xvmulsp vs0, vs26, vs8 -.else - xvmaddasp vs0, vs26, vs8 - .endif - addi \AREG, \AREG, 4 - addi \BREG, \BREG, 16 -.endm - -.macro KERNEL4x1_2_1 AREG,BREG,First - lxsd v4, 0(\AREG) - lxv vs26, 0(\BREG) - lxv vs28, 16(\BREG) - xxspltw vs8, vs36, 1 - xxspltw vs9, vs36, 0 -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs0, vs28, vs9 -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs0, vs28, vs9 - .endif - addi \AREG, \AREG, 8 - addi \BREG, \BREG, 32 -.endm - -.macro KERNEL4x1_I_4 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast - lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) - xxspltw vs8, vs4, 3 - xxspltw vs9, vs4, 2 - xxspltw vs10, vs4, 1 - xxspltw vs11, vs4, 0 - lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) - lxv vs28, DISP16(\Index,16+\OffsetB)(\BREG) - lxv vs30, DISP16(\Index,32+\OffsetB)(\BREG) - lxv vs32, DISP16(\Index,48+\OffsetB)(\BREG) -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs0, vs28, vs9 - xvmulsp vs0, vs30, vs10 - xvmulsp vs0, vs32, vs11 -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs0, vs28, vs9 - xvmaddasp vs0, vs30, vs10 - xvmaddasp vs0, vs32, vs11 - .endif -.if \IsLast==1 - addi \AREG, \AREG, DISP4(\Index,16) - addi \BREG, \BREG, DISP16(\Index,64) -.endif -.endm - -.macro SAVE4x1 - slwi T10, LDC , 1 - add T1, CO, LDC - add T2, CO, T10 - add T3, T1, T10 - /*convert alpha_r for multiply*/ - xscvspdp vs4,alpha_r -/* v0 corresponds to vs32, do not forget*/ -#if !defined(TRMMKERNEL) - lxssp v0,0(CO) - lxssp v2,0(T1) - lxssp v4,0(T2) - lxssp v6,0(T3) -#endif - xscvspdp vs24, vs0 - xxspltw vs25, vs0, 1 - xxspltw vs26, vs0, 2 - xxspltw vs27, vs0, 3 - xscvspdp vs25,vs25 - xscvspdp vs26,vs26 - xscvspdp vs27,vs27 - -#if defined(TRMMKERNEL) - xsmuldp vs32,vs27, vs4 - xsmuldp vs34,vs26, vs4 - xsmuldp vs36,vs25, vs4 - xsmuldp vs38,vs24, vs4 -#else - xsmaddadp vs32,vs27, vs4 - xsmaddadp vs34,vs26, vs4 - xsmaddadp vs36,vs25, vs4 - xsmaddadp vs38,vs24, vs4 -#endif - stxssp v0,0(CO) - stxssp v2,0(T1) - stxssp v4,0(T2) - stxssp v6,0(T3) - addi CO,CO,4 -.endm - -/****************************N=2 section*****************/ - -.macro KERNEL2x16_2 OffsetA,OffsetB, Index,IsLast - KERNEL2x16_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - - -.macro Zero2x16 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - xxlxor vs2, vs2, vs2 - xxlxor vs3, vs3, vs3 - xxlxor vs4, vs4, vs4 - xxlxor vs5, vs5, vs5 - xxlxor vs6, vs6, vs6 - xxlxor vs7, vs7, vs7 -.endm - -.macro KERNEL2x16 - KERNEL2x16_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL2x16_4 OffsetA,OffsetB, Index,IsLast - KERNEL2x16_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL2x16_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) - xxspltw vs8, vs36, 1 - xxspltw vs9, vs36, 0 - lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) - lxv vs28, DISP16(\Index, 32+\OffsetA)(\AREG) - lxv vs29, DISP16(\Index,48+\OffsetA)(\AREG) - - -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs1, vs27, vs8 - xvmulsp vs2, vs28, vs8 - xvmulsp vs3, vs29, vs8 - - xvmulsp vs4, vs26, vs9 - xvmulsp vs5, vs27, vs9 - xvmulsp vs6, vs28, vs9 - xvmulsp vs7, vs29, vs9 - -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs2, vs28, vs8 - xvmaddasp vs3, vs29, vs8 - - xvmaddasp vs4, vs26, vs9 - xvmaddasp vs5, vs27, vs9 - xvmaddasp vs6, vs28, vs9 - xvmaddasp vs7, vs29, 
vs9 - - .endif - - addi \BREG, \BREG, DISP2(\Index,8) - addi \AREG, \AREG, DISP16(\Index,64) - -.endm - - - - -.macro KERNEL2x16_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG) - lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG) - - lxv vs26, DISP64(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP64(\Index,16+\OffsetA)(\AREG) - lxv vs28, DISP64(\Index,32+\OffsetA)(\AREG) - lxv vs29, DISP64(\Index,48+\OffsetA)(\AREG) - - lxv vs16, DISP64(\Index,64+ 0+\OffsetA)(\AREG) - lxv vs17, DISP64(\Index,64+ 16+\OffsetA)(\AREG) - lxv vs18, DISP64(\Index,64+ 32+\OffsetA)(\AREG) - lxv vs19, DISP64(\Index,64+ 48+\OffsetA)(\AREG) - - lxv vs30, DISP64(\Index,128+ 0+\OffsetA)(\AREG) - lxv vs31, DISP64(\Index,128+ 16+\OffsetA)(\AREG) - lxv vs32, DISP64(\Index,128+ 32+\OffsetA)(\AREG) - lxv vs33, DISP64(\Index,128+ 48+\OffsetA)(\AREG) - - lxv vs34, DISP64(\Index,128+ 64+ 0+\OffsetA)(\AREG) - lxv vs35, DISP64(\Index,128+ 64+ 16+\OffsetA)(\AREG) - lxv vs36, DISP64(\Index,128+ 64+ 32+\OffsetA)(\AREG) - lxv vs37, DISP64(\Index,128+ 64+ 48+\OffsetA)(\AREG) - - xxspltw vs8, vs38, 3 - xxspltw vs9, vs38, 2 - xxspltw vs10, vs38, 1 - xxspltw vs11, vs38, 0 - - xxspltw vs12, vs39, 3 - xxspltw vs13, vs39, 2 - xxspltw vs14, vs39, 1 - xxspltw vs15, vs39, 0 - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs2, vs28, vs8 - xvmaddasp vs3, vs29, vs8 - - xvmaddasp vs4, vs26, vs9 - xvmaddasp vs5, vs27, vs9 - xvmaddasp vs6, vs28, vs9 - xvmaddasp vs7, vs29, vs9 - - xvmaddasp vs0, vs16, vs10 - xvmaddasp vs1, vs17, vs10 - xvmaddasp vs2, vs18, vs10 - xvmaddasp vs3, vs19, vs10 - - xvmaddasp vs4, vs16, vs11 - xvmaddasp vs5, vs17, vs11 - xvmaddasp vs6, vs18, vs11 - xvmaddasp vs7, vs19, vs11 - - xvmaddasp vs0, vs30, vs12 - xvmaddasp vs1, vs31, vs12 - xvmaddasp vs2, vs32, vs12 - xvmaddasp vs3, vs33, vs12 - - xvmaddasp vs4, vs30, vs13 - xvmaddasp vs5, vs31, vs13 - xvmaddasp vs6, vs32, vs13 - xvmaddasp vs7, vs33, vs13 - - xvmaddasp vs0, vs34, vs14 - xvmaddasp vs1, vs35, vs14 - xvmaddasp vs2, vs36, vs14 - xvmaddasp vs3, vs37, vs14 - - xvmaddasp vs4, vs34, vs15 - xvmaddasp vs5, vs35, vs15 - xvmaddasp vs6, vs36, vs15 - xvmaddasp vs7, vs37, vs15 - - -.if \IsLast==1 - addi \BREG, \BREG, DISP8(\Index,32) - addi \AREG, \AREG, DISP64(\Index,256) -.endif - -.endm - -.macro KERNEL2x16_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG) - xxspltw vs8, vs36, 3 - xxspltw vs9, vs36, 2 - xxspltw vs10, vs36, 1 - xxspltw vs11, vs36, 0 - lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) - lxv vs28, DISP32(\Index,32+\OffsetA)(\AREG) - lxv vs29, DISP32(\Index,48+\OffsetA)(\AREG) - lxv vs16, DISP32(\Index,64+ 0+\OffsetA)(\AREG) - lxv vs17, DISP32(\Index,64+ 16+\OffsetA)(\AREG) - lxv vs18, DISP32(\Index,64+ 32+\OffsetA)(\AREG) - lxv vs19, DISP32(\Index,64+ 48+\OffsetA)(\AREG) - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs2, vs28, vs8 - xvmaddasp vs3, vs29, vs8 - - xvmaddasp vs4, vs26, vs9 - xvmaddasp vs5, vs27, vs9 - xvmaddasp vs6, vs28, vs9 - xvmaddasp vs7, vs29, vs9 - - xvmaddasp vs0, vs16, vs10 - xvmaddasp vs1, vs17, vs10 - xvmaddasp vs2, vs18, vs10 - xvmaddasp vs3, vs19, vs10 - - xvmaddasp vs4, vs16, vs11 - xvmaddasp vs5, vs17, vs11 - xvmaddasp vs6, vs18, vs11 - xvmaddasp vs7, vs19, vs11 - -.if \IsLast==1 - addi \BREG, \BREG, DISP4(\Index,16) - addi \AREG, \AREG, DISP32(\Index,128) -.endif - -.endm - - -.macro SAVE2x16 - -#ifndef TRMMKERNEL - lxv vs16, 0(CO) - lxv vs17, 16(CO) - lxv vs18, 32(CO) - lxv 
vs19, 48(CO) -#endif - add T1, CO, LDC -#ifndef TRMMKERNEL - lxv vs26, 0(T1) - lxv vs27, 16(T1) - lxv vs28, 32(T1) - lxv vs29, 48(T1) -#endif - -#if defined(TRMMKERNEL) - xvmulsp vs16, vs0, alpha_r - xvmulsp vs17, vs1, alpha_r - xvmulsp vs18, vs2, alpha_r - xvmulsp vs19, vs3, alpha_r - xvmulsp vs26, vs4, alpha_r - xvmulsp vs27, vs5, alpha_r - xvmulsp vs28, vs6, alpha_r - xvmulsp vs29, vs7, alpha_r -#else - xvmaddasp vs16, vs0, alpha_r - xvmaddasp vs17, vs1, alpha_r - xvmaddasp vs18, vs2, alpha_r - xvmaddasp vs19, vs3, alpha_r - xvmaddasp vs26, vs4, alpha_r - xvmaddasp vs27, vs5, alpha_r - xvmaddasp vs28, vs6, alpha_r - xvmaddasp vs29, vs7, alpha_r -#endif - stxv vs16, 0(CO) - stxv vs17, 16(CO) - stxv vs18, 32(CO) - stxv vs19, 48(CO) - - stxv vs26, 0(T1) - stxv vs27, 16(T1) - stxv vs28, 32(T1) - stxv vs29, 48(T1) - - addi CO,CO,64 - -.endm - -/* M=8 N=2 */ - -.macro KERNEL2x8_2 OffsetA,OffsetB, Index,IsLast - KERNEL2x8_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - - -.macro Zero2x8 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - - xxlxor vs4, vs4, vs4 - xxlxor vs5, vs5, vs5 - -.endm - -.macro KERNEL2x8 - KERNEL2x8_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL2x8_4 OffsetA,OffsetB, Index,IsLast - KERNEL2x8_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL2x8_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) - xxspltw vs8, vs36, 1 - xxspltw vs9, vs36, 0 - lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP8(\Index,16+\OffsetA)(\AREG) - - -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs1, vs27, vs8 - - xvmulsp vs4, vs26, vs9 - xvmulsp vs5, vs27, vs9 - -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - - xvmaddasp vs4, vs26, vs9 - xvmaddasp vs5, vs27, vs9 - - .endif - - addi \BREG, \BREG, DISP2(\Index,8) - addi \AREG, \AREG, DISP8(\Index,32) - -.endm - - - - -.macro KERNEL2x8_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG) - lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG) - - lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) - - lxv vs16, DISP32(\Index,32+ 0+\OffsetA)(\AREG) - lxv vs17, DISP32(\Index,32+ 16+\OffsetA)(\AREG) - - lxv vs30, DISP32(\Index,64+ 0+\OffsetA)(\AREG) - lxv vs31, DISP32(\Index,64+ 16+\OffsetA)(\AREG) - - lxv vs34, DISP32(\Index, 96+ 0+\OffsetA)(\AREG) - lxv vs35, DISP32(\Index, 96+ 16+\OffsetA)(\AREG) - - xxspltw vs8, vs38, 3 - xxspltw vs9, vs38, 2 - xxspltw vs10, vs38, 1 - xxspltw vs11, vs38, 0 - - xxspltw vs12, vs39, 3 - xxspltw vs13, vs39, 2 - xxspltw vs14, vs39, 1 - xxspltw vs15, vs39, 0 - - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs4, vs26, vs9 - xvmaddasp vs5, vs27, vs9 - - - xvmaddasp vs0, vs16, vs10 - xvmaddasp vs1, vs17, vs10 - xvmaddasp vs4, vs16, vs11 - xvmaddasp vs5, vs17, vs11 - - - xvmaddasp vs0, vs30, vs12 - xvmaddasp vs1, vs31, vs12 - xvmaddasp vs4, vs30, vs13 - xvmaddasp vs5, vs31, vs13 - - xvmaddasp vs0, vs34, vs14 - xvmaddasp vs1, vs35, vs14 - xvmaddasp vs4, vs34, vs15 - xvmaddasp vs5, vs35, vs15 - - - -.if \IsLast==1 - addi \BREG, \BREG, DISP8(\Index,32) - addi \AREG, \AREG, DISP32(\Index,128) -.endif - -.endm - -.macro KERNEL2x8_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG) - xxspltw vs8, vs36, 3 - xxspltw vs9, vs36, 2 - xxspltw vs10, vs36, 1 - xxspltw vs11, vs36, 0 - lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) - lxv vs16, DISP16(\Index,32+\OffsetA)(\AREG) - lxv vs17, 
DISP16(\Index,48+\OffsetA)(\AREG) - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - - xvmaddasp vs4, vs26, vs9 - xvmaddasp vs5, vs27, vs9 - - xvmaddasp vs0, vs16, vs10 - xvmaddasp vs1, vs17, vs10 - - xvmaddasp vs4, vs16, vs11 - xvmaddasp vs5, vs17, vs11 - -.if \IsLast==1 - addi \BREG, \BREG, DISP4(\Index,16) - addi \AREG, \AREG, DISP16(\Index,64) -.endif - -.endm - - -.macro SAVE2x8 - -#ifndef TRMMKERNEL - lxv vs16, 0(CO) - lxv vs17, 16(CO) -#endif - add T1, CO, LDC -#ifndef TRMMKERNEL - lxv vs26, 0(T1) - lxv vs27, 16(T1) - -#endif - -#if defined(TRMMKERNEL) - xvmulsp vs16, vs0, alpha_r - xvmulsp vs17, vs1, alpha_r - xvmulsp vs26, vs4, alpha_r - xvmulsp vs27, vs5, alpha_r -#else - xvmaddasp vs16, vs0, alpha_r - xvmaddasp vs17, vs1, alpha_r - xvmaddasp vs26, vs4, alpha_r - xvmaddasp vs27, vs5, alpha_r -#endif - - stxv vs16, 0(CO) - stxv vs17, 16(CO) - - - stxv vs26, 0(T1) - stxv vs27, 16(T1) - - addi CO,CO,32 - -.endm - - -/*M=4*/ - - -.macro KERNEL2x4_2 OffsetA,OffsetB, Index,IsLast - KERNEL2x4_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - - /* we will aggregate on save vs0 +vs4 vs11+vs5 */ -.macro Zero2x4 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - - xxlxor vs4, vs4, vs4 - xxlxor vs5, vs5, vs5 - -.endm - -.macro KERNEL2x4 - KERNEL2x4_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL2x4_4 OffsetA,OffsetB, Index,IsLast - KERNEL2x4_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL2x4_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) - xxspltw vs8, vs36, 1 - xxspltw vs9, vs36, 0 - lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) - - -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs1, vs26, vs9 - -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs26, vs9 - .endif - - addi \BREG, \BREG, DISP2(\Index,8) - addi \AREG, \AREG, DISP4(\Index,16) - -.endm - - - - -.macro KERNEL2x4_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG) - lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG) - - lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) - lxv vs16, DISP16(\Index,16+\OffsetA)(\AREG) - - lxv vs30, DISP16(\Index,32+ 0+\OffsetA)(\AREG) - lxv vs34, DISP16(\Index,32+ 16+\OffsetA)(\AREG) - - - xxspltw vs8, vs38, 3 - xxspltw vs9, vs38, 2 - xxspltw vs10, vs38, 1 - xxspltw vs11, vs38, 0 - - xxspltw vs12, vs39, 3 - xxspltw vs13, vs39, 2 - xxspltw vs14, vs39, 1 - xxspltw vs15, vs39, 0 - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs26, vs9 - xvmaddasp vs4, vs16, vs10 - xvmaddasp vs5, vs16, vs11 - - - xvmaddasp vs0, vs30, vs12 - xvmaddasp vs1, vs30, vs13 - xvmaddasp vs4, vs34, vs14 - xvmaddasp vs5, vs34, vs15 - - - - -.if \IsLast==1 - addi \BREG, \BREG, DISP8(\Index,32) - addi \AREG, \AREG, DISP16(\Index,64) -.endif - -.endm - -.macro KERNEL2x4_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG) - xxspltw vs8, vs36, 3 - xxspltw vs9, vs36, 2 - xxspltw vs10, vs36, 1 - xxspltw vs11, vs36, 0 - lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) - lxv vs16, DISP8(\Index, 16+\OffsetA)(\AREG) - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs26, vs9 - xvmaddasp vs4, vs16, vs10 - xvmaddasp vs5, vs16, vs11 - -.if \IsLast==1 - addi \BREG, \BREG, DISP4(\Index,16) - addi \AREG, \AREG, DISP8(\Index,32) -.endif - -.endm - - -.macro SAVE2x4 - -#ifndef TRMMKERNEL - lxv vs16, 0(CO) -#endif - add T1, CO, LDC -#ifndef TRMMKERNEL - lxv vs26, 0(T1) - -#endif - /*aggregate vectors*/ - xvaddsp vs0,vs0,vs4 - xvaddsp vs1,vs1,vs5 -#if defined(TRMMKERNEL) - xvmulsp vs16, vs0, alpha_r - xvmulsp vs26, vs1, alpha_r -#else - 
xvmaddasp vs16, vs0, alpha_r - xvmaddasp vs26, vs1, alpha_r -#endif - - stxv vs16, 0(CO) - stxv vs26, 0(T1) - - addi CO,CO,16 - -.endm - - -/* M=2 N=2 we will have inner pemrute action before permute was revrsing 3,2,1,0 not iw 2ill inner reverse 1,0,3,2 */ -.macro SWITCH_PERMUTE_INNER - xxpermdi permute_mask, permute_mask, permute_mask,2 -.endm - -.macro Zero2x2 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - SWITCH_PERMUTE_INNER -.endm - -.macro KERNEL2x2 - KERNEL2x2_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL2x2_4 OffsetA,OffsetB, Index,IsLast - KERNEL2x2_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL2x2_2 OffsetA,OffsetB, Index,IsLast - KERNEL2x2_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL2x2_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) - xxperm vs9, vs36, permute_mask - lxsd v5, DISP2(\Index, 0+\OffsetA)(\AREG) - - -.if \First==1 - xvmulsp vs0, vs37, vs36 - xvmulsp vs1, vs37, vs9 - -.else - xvmaddasp vs0, vs37, vs36 - xvmaddasp vs1, vs37, vs9 - .endif - - addi \BREG, \BREG, DISP2(\Index,8) - addi \AREG, \AREG, DISP2(\Index,8) - -.endm - - - - -.macro KERNEL2x2_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) - lxv vs10, DISP8(\Index, 16+\OffsetB)(\BREG) - - lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) - lxv vs16, DISP8(\Index,16+\OffsetA)(\AREG) - - - xxperm vs9, vs8, permute_mask - xxperm vs11, vs10, permute_mask - - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs26, vs9 - xvmaddasp vs0, vs16, vs10 - xvmaddasp vs1, vs16, vs11 - - - -.if \IsLast==1 - addi \BREG, \BREG, DISP8(\Index,32) - addi \AREG, \AREG, DISP8(\Index,32) -.endif - -.endm - -.macro KERNEL2x2_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs8, DISP4(\Index, 0+\OffsetB)(\BREG) - lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) - - - xxperm vs9, vs8, permute_mask - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs26, vs9 - -.if \IsLast==1 - addi \BREG, \BREG, DISP4(\Index,16) - addi \AREG, \AREG, DISP4(\Index,16) -.endif -.endm - - -.macro SAVE2x2 - -#ifndef TRMMKERNEL - lxsd v4 , 0(CO) -#endif - add T1, CO, LDC -#ifndef TRMMKERNEL - lxsd v5 , 0(T1) - -#endif - /*aggregate vectors*/ - xxpermdi vs4,vs0,vs0,2 - xxpermdi vs5,vs1,vs1,2 - xvaddsp vs0,vs0,vs4 - xvaddsp vs1,vs1,vs5 - /* */ - /* lets correct the order to 00 10 and 10 ,11 from {00,11} {01,10} */ - xxperm vs1,vs1, permute_mask - - - xxmrghw vs2 ,vs1,vs0 - xxpermdi vs2,vs2,vs2,2 - xxmrghw vs3 ,vs0,vs1 -#if defined(TRMMKERNEL) - xvmulsp vs36, vs2, alpha_r - xvmulsp vs37, vs3, alpha_r -#else - xvmaddasp vs36, vs2, alpha_r - xvmaddasp vs37, vs3, alpha_r -#endif - /**** store last two words*/ - - - stxsd v4, 0(CO) - stxsd v5, 0(T1) - - addi CO,CO,8 - -.endm - -/*--------------------------- M=1 N=2 */ -.macro Zero2x1 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - xxlxor vs2,vs2,vs2 - xxlxor vs3,vs3,vs3 -.endm - -.macro KERNEL2x1 - KERNEL2x1_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL2x1_4 OffsetA,OffsetB, Index,IsLast - KERNEL2x1_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL2x1_2 OffsetA,OffsetB, Index,IsLast - KERNEL2x1_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - /* - we will calculate 1 alone then will add it to batched ones - */ -.macro KERNEL2x1_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxssp v3, DISP2(\Index, 0+\OffsetB)(\BREG) - lxssp v4, DISP2(\Index, 4+\OffsetB)(\BREG) - lxssp v5, DISP1(\Index, 0+\OffsetA)(\AREG) - - -.if \First==1 - xvmulsp vs2, vs37, vs35 - xvmulsp vs3, vs37, vs36 - -.else - xsmaddadp vs2, 
vs37, vs35 - xsmaddadp vs3, vs37, vs36 - .endif - - addi \BREG, \BREG, DISP2(\Index,8) - addi \AREG, \AREG, DISP1(\Index,4) - -.endm - - - - -.macro KERNEL2x1_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) - lxv vs10, DISP8(\Index, 16+\OffsetB)(\BREG) - - lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) - - xxmrglw vs5, vs26,vs26 - xxmrghw vs6, vs26,vs26 - - xvmaddasp vs0, vs8, vs5 - xvmaddasp vs1, vs10, vs6 - - -.if \IsLast==1 - addi \BREG, \BREG, DISP8(\Index,32) - addi \AREG, \AREG, DISP4(\Index,16) -.endif - -.endm - -.macro KERNEL2x1_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxssp v3, DISP4(\Index, 0+\OffsetB)(\BREG) - lxssp v4, DISP4(\Index, 4+\OffsetB)(\BREG) - lxssp v7, DISP4(\Index, 8+\OffsetB)(\BREG) - lxssp v8, DISP4(\Index, 12+\OffsetB)(\BREG) - lxssp v5, DISP2(\Index, 0+\OffsetA)(\AREG) - lxssp v6, DISP2(\Index, 4+\OffsetA)(\AREG) - - - xsmaddadp vs2, vs37, vs35 - xsmaddadp vs3, vs37, vs36 - - xsmaddadp vs2, vs38, vs39 - xsmaddadp vs3, vs38, vs40 - - - addi \BREG, \BREG, DISP4(\Index,16) - addi \AREG, \AREG, DISP2(\Index,8) -.endm - - -.macro SAVE2x1 - -#ifndef TRMMKERNEL - lxssp v4 , 0(CO) -#endif - add T1, CO, LDC -#ifndef TRMMKERNEL - lxssp v5 , 0(T1) - -#endif - - /*convert alpha_r for multiply*/ - xscvspdp vs16,alpha_r - - /*aggregate vectors 2x2_4 */ - xxpermdi vs4,vs0,vs0,2 - xxpermdi vs5,vs1,vs1,2 - xvaddsp vs0,vs0,vs4 - xvaddsp vs1,vs1,vs5 - xvaddsp vs0,vs0,vs1 -/*aggregate vectors 2x1_2 and 2x1_1 into 2x2_4*/ - xscvspdp vs5, vs0 - xxspltw vs6, vs0, 1 - xscvspdp vs6,vs6 - xsadddp vs2,vs2,vs6 - xsadddp vs3,vs3,vs5 - - /**** store last two words*/ -#if defined(TRMMKERNEL) - xsmuldp vs36,vs2, vs16 - xsmuldp vs37,vs3, vs16 - -#else - xsmaddadp vs36,vs2, vs16 - xsmaddadp vs37,vs3, vs16 -#endif - - stxssp v4, 0(CO) - stxssp v5, 0(T1) - - addi CO,CO,4 - -.endm - - - -/****************************N=1 section*****************/ - -.macro KERNEL1x16_2 OffsetA,OffsetB, Index,IsLast - KERNEL1x16_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - - -.macro Zero1x16 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - xxlxor vs2, vs2, vs2 - xxlxor vs3, vs3, vs3 -.endm - -.macro KERNEL1x16 - KERNEL1x16_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL1x16_4 OffsetA,OffsetB, Index,IsLast - KERNEL1x16_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL1x16_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG) - xscvdpspn vs36,vs36 - xxspltw vs8, vs36, 0 - lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) - lxv vs28, DISP16(\Index, 32+\OffsetA)(\AREG) - lxv vs29, DISP16(\Index,48+\OffsetA)(\AREG) - - -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs1, vs27, vs8 - xvmulsp vs2, vs28, vs8 - xvmulsp vs3, vs29, vs8 - - -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs2, vs28, vs8 - xvmaddasp vs3, vs29, vs8 - - .endif - - addi \BREG, \BREG, DISP1(\Index,4) - addi \AREG, \AREG, DISP16(\Index,64) - -.endm - - - - -.macro KERNEL1x16_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG) - - lxv vs26, DISP64(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP64(\Index,16+\OffsetA)(\AREG) - lxv vs28, DISP64(\Index,32+\OffsetA)(\AREG) - lxv vs29, DISP64(\Index,48+\OffsetA)(\AREG) - - lxv vs16, DISP64(\Index,64+ 0+\OffsetA)(\AREG) - lxv vs17, DISP64(\Index,64+ 16+\OffsetA)(\AREG) - lxv vs18, DISP64(\Index,64+ 32+\OffsetA)(\AREG) - lxv vs19, DISP64(\Index,64+ 48+\OffsetA)(\AREG) - - xxspltw vs8, vs38, 3 - xxspltw vs9, 
vs38, 2 - - lxv vs30, DISP64(\Index,128+ 0+\OffsetA)(\AREG) - lxv vs31, DISP64(\Index,128+ 16+\OffsetA)(\AREG) - lxv vs32, DISP64(\Index,128+ 32+\OffsetA)(\AREG) - lxv vs33, DISP64(\Index,128+ 48+\OffsetA)(\AREG) - - lxv vs34, DISP64(\Index,128+ 64+ 0+\OffsetA)(\AREG) - lxv vs35, DISP64(\Index,128+ 64+ 16+\OffsetA)(\AREG) - lxv vs36, DISP64(\Index,128+ 64+ 32+\OffsetA)(\AREG) - lxv vs37, DISP64(\Index,128+ 64+ 48+\OffsetA)(\AREG) - - xxspltw vs10, vs38, 1 - xxspltw vs11, vs38, 0 - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs2, vs28, vs8 - xvmaddasp vs3, vs29, vs8 - - - xvmaddasp vs0, vs16, vs9 - xvmaddasp vs1, vs17, vs9 - xvmaddasp vs2, vs18, vs9 - xvmaddasp vs3, vs19, vs9 - - - xvmaddasp vs0, vs30, vs10 - xvmaddasp vs1, vs31, vs10 - xvmaddasp vs2, vs32, vs10 - xvmaddasp vs3, vs33, vs10 - - - xvmaddasp vs0, vs34, vs11 - xvmaddasp vs1, vs35, vs11 - xvmaddasp vs2, vs36, vs11 - xvmaddasp vs3, vs37, vs11 - - - - -.if \IsLast==1 - addi \BREG, \BREG, DISP4(\Index,16) - addi \AREG, \AREG, DISP64(\Index,256) -.endif - -.endm - -.macro KERNEL1x16_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) - xxspltw vs8, vs36, 1 - xxspltw vs9, vs36, 0 - lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) - lxv vs28, DISP32(\Index,32+\OffsetA)(\AREG) - lxv vs29, DISP32(\Index,48+\OffsetA)(\AREG) - lxv vs16, DISP32(\Index,64+ 0+\OffsetA)(\AREG) - lxv vs17, DISP32(\Index,64+ 16+\OffsetA)(\AREG) - lxv vs18, DISP32(\Index,64+ 32+\OffsetA)(\AREG) - lxv vs19, DISP32(\Index,64+ 48+\OffsetA)(\AREG) - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs2, vs28, vs8 - xvmaddasp vs3, vs29, vs8 - - - xvmaddasp vs0, vs16, vs9 - xvmaddasp vs1, vs17, vs9 - xvmaddasp vs2, vs18, vs9 - xvmaddasp vs3, vs19, vs9 - - -.if \IsLast==1 - addi \BREG, \BREG, DISP2(\Index,8) - addi \AREG, \AREG, DISP32(\Index,128) -.endif - -.endm - - -.macro SAVE1x16 - -#ifndef TRMMKERNEL - lxv vs16, 0(CO) - lxv vs17, 16(CO) - lxv vs18, 32(CO) - lxv vs19, 48(CO) -#endif - - -#if defined(TRMMKERNEL) - xvmulsp vs16, vs0, alpha_r - xvmulsp vs17, vs1, alpha_r - xvmulsp vs18, vs2, alpha_r - xvmulsp vs19, vs3, alpha_r -#else - xvmaddasp vs16, vs0, alpha_r - xvmaddasp vs17, vs1, alpha_r - xvmaddasp vs18, vs2, alpha_r - xvmaddasp vs19, vs3, alpha_r -#endif - stxv vs16, 0(CO) - stxv vs17, 16(CO) - stxv vs18, 32(CO) - stxv vs19, 48(CO) - - addi CO,CO,64 - -.endm - -/* M=8 N=1 */ - -.macro KERNEL1x8_2 OffsetA,OffsetB, Index,IsLast - KERNEL1x8_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - - -.macro Zero1x8 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - xxlxor vs2, vs2, vs2 - xxlxor vs3, vs3, vs3 -.endm - -.macro KERNEL1x8 - KERNEL1x8_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL1x8_4 OffsetA,OffsetB, Index,IsLast - KERNEL1x8_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL1x8_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG) - xscvdpspn vs36,vs36 - xxspltw vs8, vs36, 0 - lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP8(\Index,16+\OffsetA)(\AREG) - - -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs1, vs27, vs8 - - -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - - .endif - - addi \BREG, \BREG, DISP1(\Index,4) - addi \AREG, \AREG, DISP8(\Index,32) - -.endm - - - - -.macro KERNEL1x8_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG) - - lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) - lxv vs27, 
DISP32(\Index,16+\OffsetA)(\AREG) - - lxv vs16, DISP32(\Index,32+ 0+\OffsetA)(\AREG) - lxv vs17, DISP32(\Index,32+ 16+\OffsetA)(\AREG) - - xxspltw vs8, vs38, 3 - xxspltw vs9, vs38, 2 - - lxv vs30, DISP32(\Index,64+ 0+\OffsetA)(\AREG) - lxv vs31, DISP32(\Index,64+ 16+\OffsetA)(\AREG) - - lxv vs34, DISP32(\Index,64+ 32+ 0+\OffsetA)(\AREG) - lxv vs35, DISP32(\Index,64+ 32+ 16+\OffsetA)(\AREG) - - xxspltw vs10, vs38, 1 - xxspltw vs11, vs38, 0 - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - - - xvmaddasp vs2, vs16, vs9 - xvmaddasp vs3, vs17, vs9 - - - xvmaddasp vs0, vs30, vs10 - xvmaddasp vs1, vs31, vs10 - - - xvmaddasp vs2, vs34, vs11 - xvmaddasp vs3, vs35, vs11 - - - - -.if \IsLast==1 - addi \BREG, \BREG, DISP4(\Index,16) - addi \AREG, \AREG, DISP32(\Index,128) -.endif - -.endm - -.macro KERNEL1x8_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) - xxspltw vs8, vs36, 1 - xxspltw vs9, vs36, 0 - lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) - lxv vs16, DISP16(\Index,32+ 0+\OffsetA)(\AREG) - lxv vs17, DISP16(\Index,32+ 16+\OffsetA)(\AREG) - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - - - xvmaddasp vs2, vs16, vs9 - xvmaddasp vs3, vs17, vs9 - - -.if \IsLast==1 - addi \BREG, \BREG, DISP2(\Index,8) - addi \AREG, \AREG, DISP16(\Index,64) -.endif - -.endm - - -.macro SAVE1x8 - -#ifndef TRMMKERNEL - lxv vs16, 0(CO) - lxv vs17, 16(CO) -#endif - /* aggregate vs0 vs2 and vs1 vs3*/ - xvaddsp vs0,vs0,vs2 - xvaddsp vs1,vs1,vs3 -#if defined(TRMMKERNEL) - xvmulsp vs16, vs0, alpha_r - xvmulsp vs17, vs1, alpha_r -#else - xvmaddasp vs16, vs0, alpha_r - xvmaddasp vs17, vs1, alpha_r -#endif - stxv vs16, 0(CO) - stxv vs17, 16(CO) - - addi CO,CO,32 - -.endm -/*M=4*/ - -.macro KERNEL1x4_2 OffsetA,OffsetB, Index,IsLast - KERNEL1x4_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - - -.macro Zero1x4 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - xxlxor vs2, vs2, vs2 - xxlxor vs3, vs3, vs3 -.endm - -.macro KERNEL1x4 - KERNEL1x4_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL1x4_4 OffsetA,OffsetB, Index,IsLast - KERNEL1x4_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL1x4_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG) - xscvdpspn vs36,vs36 - xxspltw vs8, vs36, 0 - lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) - - -.if \First==1 - xvmulsp vs0, vs26, vs8 -.else - xvmaddasp vs0, vs26, vs8 - - .endif - - addi \BREG, \BREG, DISP1(\Index,4) - addi \AREG, \AREG, DISP4(\Index,16) - -.endm - - - - -.macro KERNEL1x4_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG) - - lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) - - - xxspltw vs8, vs38, 3 - xxspltw vs9, vs38, 2 - - lxv vs30, DISP16(\Index,32+ 0+\OffsetA)(\AREG) - lxv vs31, DISP16(\Index,32+ 16+\OffsetA)(\AREG) - - - xxspltw vs10, vs38, 1 - xxspltw vs11, vs38, 0 - - - xvmaddasp vs0, vs26, vs8 - - xvmaddasp vs1, vs27, vs9 - - xvmaddasp vs2, vs30, vs10 - - - xvmaddasp vs3, vs31, vs11 - - - - -.if \IsLast==1 - addi \BREG, \BREG, DISP4(\Index,16) - addi \AREG, \AREG, DISP16(\Index,64) -.endif - -.endm - -.macro KERNEL1x4_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) - xxspltw vs8, vs36, 1 - xxspltw vs9, vs36, 0 - lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP8(\Index,16+\OffsetA)(\AREG) - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs9 - - -.if \IsLast==1 - addi \BREG, 
\BREG, DISP2(\Index,8) - addi \AREG, \AREG, DISP8(\Index,32) -.endif - -.endm - - -.macro SAVE1x4 - -#ifndef TRMMKERNEL - lxv vs16, 0(CO) -#endif - /* aggregate */ - xvaddsp vs0,vs0,vs2 - xvaddsp vs1,vs1,vs3 - xvaddsp vs0,vs1,vs0 -#if defined(TRMMKERNEL) - xvmulsp vs16, vs0, alpha_r -#else - xvmaddasp vs16, vs0, alpha_r -#endif - stxv vs16, 0(CO) - - addi CO,CO,16 - -.endm - -/* M=2 N=1*/ -.macro Zero1x2 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - xxlxor vs2,vs2,vs2 - xxlxor vs3,vs3,vs3 -.endm - -.macro KERNEL1x2 - KERNEL1x2_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL1x2_4 OffsetA,OffsetB, Index,IsLast - KERNEL1x2_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL1x2_2 OffsetA,OffsetB, Index,IsLast - KERNEL1x2_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - /* - we will calculate 1 alone then will add it to batched ones - */ -.macro KERNEL1x2_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxssp v3, DISP2(\Index, 0+\OffsetB)(\AREG) - lxssp v4, DISP2(\Index, 4+\OffsetB)(\AREG) - lxssp v5, DISP1(\Index, 0+\OffsetA)(\BREG) - - -.if \First==1 - xvmuldp vs2, vs37, vs35 - xvmuldp vs3, vs37, vs36 - -.else - xsmaddadp vs2, vs37, vs35 - xsmaddadp vs3, vs37, vs36 - .endif - - addi \AREG, \AREG, DISP2(\Index,8) - addi \BREG, \BREG, DISP1(\Index,4) - -.endm - - - - -.macro KERNEL1x2_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs8, DISP8(\Index, 0+\OffsetB)(\AREG) - lxv vs10, DISP8(\Index, 16+\OffsetB)(\AREG) - - lxv vs26, DISP4(\Index, 0+\OffsetA)(\BREG) - - xxmrglw vs5, vs26,vs26 - xxmrghw vs6, vs26,vs26 - - xvmaddasp vs0, vs8, vs5 - xvmaddasp vs1, vs10, vs6 - - -.if \IsLast==1 - addi \AREG, \AREG, DISP8(\Index,32) - addi \BREG, \BREG, DISP4(\Index,16) -.endif - -.endm - -.macro KERNEL1x2_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxssp v3, DISP4(\Index, 0+\OffsetB)(\AREG) - lxssp v4, DISP4(\Index, 4+\OffsetB)(\AREG) - lxssp v7, DISP4(\Index, 8+\OffsetB)(\AREG) - lxssp v8, DISP4(\Index, 12+\OffsetB)(\AREG) - lxssp v5, DISP2(\Index, 0+\OffsetA)(\BREG) - lxssp v6, DISP2(\Index, 4+\OffsetA)(\BREG) - - - xsmaddadp vs2, vs37, vs35 - xsmaddadp vs3, vs37, vs36 - - xsmaddadp vs2, vs38, vs39 - xsmaddadp vs3, vs38, vs40 - - - addi \AREG, \AREG, DISP4(\Index,16) - addi \BREG, \BREG, DISP2(\Index,8) -.endm - - -.macro SAVE1x2 - -#ifndef TRMMKERNEL - lxssp v4 , 0(CO) - lxssp v5 , 4(CO) - -#endif - - /*convert alpha_r for multiply*/ - xscvspdp vs16,alpha_r - - /*aggregate vectors 1x2_4 */ - xxpermdi vs4,vs0,vs0,2 - xxpermdi vs5,vs1,vs1,2 - xvaddsp vs0,vs0,vs4 - xvaddsp vs1,vs1,vs5 - xvaddsp vs0,vs0,vs1 -/*aggregate vectors 1x1_2 and 1x1_1 into 1x2_4*/ - xscvspdp vs5, vs0 - xxspltw vs6, vs0, 1 - xscvspdp vs6,vs6 - xsadddp vs2,vs2,vs6 - xsadddp vs3,vs3,vs5 - - /**** store last two words*/ -#if defined(TRMMKERNEL) - xsmuldp vs36,vs2, vs16 - xsmuldp vs37,vs3, vs16 - -#else - xsmaddadp vs36,vs2, vs16 - xsmaddadp vs37,vs3, vs16 -#endif - - stxssp v4, 0(CO) - stxssp v5, 4(CO) - - addi CO,CO,8 - -.endm -/*///////////////// N=1 M=1 //////////////////*/ -.macro Zero1x1 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - xxlxor vs2, vs2,vs2 - xxlxor vs3,vs3,vs3 - xxlxor vs4,vs4,vs4 -.endm - -.macro KERNEL1x1 - KERNEL1x1_1 AO,BO, 1, 0,0,0 -.endm - -.macro KERNEL1x1_16 OffsetA,OffsetB, Index,IsLast - KERNEL1x1_I_16 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL1x1_8 OffsetA,OffsetB, Index,IsLast - KERNEL1x1_I_8 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL1x1_4 OffsetA,OffsetB, Index,IsLast - KERNEL1x1_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro 
KERNEL1x1_2 OffsetA,OffsetB, Index,IsLast - KERNEL1x1_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - /* - we will calculate 1 alone ( FIRST==1 to zero vs4) - */ -.macro KERNEL1x1_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxssp v3, DISP1(\Index, 0+\OffsetB)(\AREG) - lxssp v5, DISP1(\Index, 0+\OffsetA)(\BREG) - - -.if \First==1 - xvmuldp vs4, vs37, vs35 - -.else - xsmaddadp vs4, vs37, vs35 - .endif - - addi \AREG, \AREG, DISP1(\Index,4) - addi \BREG, \BREG, DISP1(\Index,4) - -.endm - - -.macro KERNEL1x1_I_16 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs8, DISP16(\Index, 0+\OffsetB)(\AREG) - lxv vs9, DISP16(\Index, 16+\OffsetB)(\AREG) - lxv vs10, DISP16(\Index, 32+0+\OffsetB)(\AREG) - lxv vs11, DISP16(\Index, 32+ 16+\OffsetB)(\AREG) - lxv vs26, DISP16(\Index, 0+\OffsetA)(\BREG) - lxv vs16, DISP16(\Index, 16+\OffsetA)(\BREG) - lxv vs17, DISP16(\Index, 32+0+\OffsetA)(\BREG) - lxv vs18, DISP16(\Index, 32+16+\OffsetA)(\BREG) - xvmaddasp vs0, vs8, vs26 - xvmaddasp vs1, vs9, vs16 - xvmaddasp vs2, vs10, vs17 - xvmaddasp vs3, vs11, vs18 -.if \IsLast==1 - addi \AREG, \AREG, DISP16(\Index,64) - addi \BREG, \BREG, DISP16(\Index,64) -.endif - -.endm - -.macro KERNEL1x1_I_8 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs8, DISP8(\Index, 0+\OffsetB)(\AREG) - lxv vs9, DISP8(\Index, 16+\OffsetB)(\AREG) - lxv vs26, DISP8(\Index, 0+\OffsetA)(\BREG) - lxv vs16, DISP8(\Index, 16+\OffsetA)(\BREG) - xvmaddasp vs0, vs8, vs26 - xvmaddasp vs1, vs9, vs16 - -.if \IsLast==1 - addi \AREG, \AREG, DISP8(\Index,32) - addi \BREG, \BREG, DISP8(\Index,32) -.endif - -.endm - - -.macro KERNEL1x1_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs8, DISP4(\Index, 0+\OffsetB)(\AREG) - lxv vs26, DISP4(\Index, 0+\OffsetA)(\BREG) - - xvmaddasp vs0, vs8, vs26 - - -.if \IsLast==1 - addi \AREG, \AREG, DISP4(\Index,16) - addi \BREG, \BREG, DISP4(\Index,16) -.endif - -.endm - -.macro KERNEL1x1_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxsd v4, DISP2(\Index, 0+\OffsetB)(\AREG) - lxsd v5, DISP2(\Index, 0+\OffsetA)(\BREG) - - xvmaddasp vs0, vs36, vs37 - - addi \AREG, \AREG, DISP2(\Index,8) - addi \BREG, \BREG, DISP2(\Index,8) -.endm - - -.macro SAVE1x1 - -#ifndef TRMMKERNEL - lxssp v4 , 0(CO) - -#endif - - /*convert alpha_r for multiply*/ - xscvspdp vs16,alpha_r - - /*aggregate vectors */ - xvaddsp vs0,vs0,vs1 - xvaddsp vs2,vs2,vs3 - xvaddsp vs0,vs0,vs2 - - xxpermdi vs7,vs0,vs0,2 - xvaddsp vs0,vs0,vs7 -/*aggregate vectors 1x1_2 and 1x1_1 into 1x1_4*/ - xscvspdp vs5, vs0 - xxspltw vs6, vs0, 1 - xscvspdp vs6,vs6 - xsadddp vs7,vs5,vs6 - xsadddp vs4,vs4,vs7 - - /**** store last two words*/ -#if defined(TRMMKERNEL) - xsmuldp vs36,vs4, vs16 - -#else - xsmaddadp vs36,vs4, vs16 -#endif - - stxssp v4, 0(CO) - - addi CO,CO,4 - -.endm - - - - -/****************************TRMM POINTER REFRESH MACROSES*************************/ - -.macro SHIFT_REG REG1,REG2,SHIFT_VAL - .if \SHIFT_VAL==16 - slwi \REG1, \REG2, 6 - .elseif \SHIFT_VAL==8 - slwi \REG1, \REG2, 5 - .elseif \SHIFT_VAL==4 - slwi \REG1, \REG2, 4 - .elseif \SHIFT_VAL==2 - slwi \REG1, \REG2, 3 - .elseif \SHIFT_VAL==1 - slwi \REG1, \REG2, 2 - .endif -.endm - -/* -//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -// ptrbb = bb; -// #else -// ptrba += off*16; -// ptrbb = bb + off*2; -// #endif -*/ -.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B - #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - /* ptrbb = bb;*/ - mr \PTR_B,\B_VAL /* refresh BPOINT */ - - #else - /* - // ptrba =ptrba+ off*C_A; - // 
ptrbb = bb + off*C_B; - */ - SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ - SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ - add \PTR_B, \B_VAL , T4 /* Add values to BO */ - add \PTR_A, \PTR_A, T2 /* Add values to AO */ - #endif -.endm - - -/* -// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -// temp = bk-off; -// #elif defined(LEFT) -// temp = off+16; // number of values in A -// #else -// temp = off+2; // number of values in B -// #endif -*/ -.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B - #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - /* temp = bk-off;*/ - sub \TEMP_BK,\BK_VAL,\OFF_VAL - - #elif defined(LEFT) - /* temp = off+INCR_A; // number of values in A */ - addi \TEMP_BK, \OFF_VAL, \INCR_A - #else - /* temp = off+INCR_B // number of values in B*/ - addi \TEMP_BK,\OFF_VAL, \INCR_B - #endif - -.endm -/* -// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -// temp = bk - off; -// #ifdef LEFT -// temp -= 16; // number of values in A -// #else -// temp -= 2; // number of values in B -// #endif -// ptrba += temp*16; -// ptrbb += temp*2; -// #endif - -// #ifdef LEFT -// off += 16; // number of values in A -// #endif -*/ - - -.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B - - #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - /*temp = bk - off;*/ - sub \TEMP_BK,\BK_VAL,\OFF_VAL - #ifdef LEFT - /*temp -= 8; // number of values in A*/ - addi \TEMP_BK,\TEMP_BK,-\C_A - #else - /*temp -= 4; // number of values in B*/ - addi \TEMP_BK,\TEMP_BK,-\C_B - #endif - /*ptrba += temp*C_A; - ptrbb += temp*C_B;*/ - SHIFT_REG T4,\TEMP_BK,\C_A - SHIFT_REG T2,\TEMP_BK,\C_B - add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ - add \PTR_B, \PTR_B,T2 - - #endif - - #ifdef LEFT - /*off += 8; // number of values in A*/ - addi \OFF_VAL,\OFF_VAL,\C_A - #endif +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define unit_size 4 +#define DISP64(ind,disp) (ind*unit_size*64+disp) +#define DISP32(ind,disp) (ind*unit_size*32+disp) +#define DISP16(ind,disp) (ind*unit_size*16+disp) +#define DISP8(ind,disp) (ind*unit_size*8+disp) +#define DISP4(ind,disp) (ind*unit_size*4+disp) +#define DISP2(ind,disp) (ind*unit_size*2+disp) +#define DISP1(ind,disp) (ind*unit_size+disp) + +/********************************************************************************************** +* Macros for N=8 and M=16 +**********************************************************************************************/ + + + +.macro KERNEL8x16_L1_L4 Index,IsLast + KERNEL8x16_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL8x16_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x16_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x16_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL8x16_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x16_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro Zero8X16 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + xxlxor vs54, vs54, vs54 + xxlxor vs55, vs55, vs55 + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs58, vs58, vs58 + xxlxor vs59, vs59, vs59 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 + xxlxor vs62, vs62, vs62 + xxlxor vs63, vs63, vs63 +.endm + +.macro LOAD8x16 OffsetA,OffsetB + + lxv vs24, (\OffsetB+0)(BO) + lxv vs28, (\OffsetB+16)(BO) + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + lxv vs0, (\OffsetA+0)(AO) + lxv vs1, (\OffsetA+16)(AO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + lxv vs2, (\OffsetA+32)(AO) + lxv vs3, (\OffsetA+48)(AO) + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + +.endm + +.macro END8x16_NORMAL + END8x16 0, AO, BO, 64,32 +.endm + +.macro END8x16_WITHOUT_ADD + END8x16 0, AO,BO,0,0 +.endm + +.macro END8x16 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + xvmulsp vs34, vs2,vs24 + xvmulsp vs35, vs3,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + xvmulsp vs38, vs2,vs25 + xvmulsp vs39, vs3,vs25 + + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + xvmulsp vs42, vs2,vs26 + xvmulsp vs43, vs3,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + xvmulsp vs46, vs2,vs27 + xvmulsp vs47, vs3,vs27 + + xvmulsp vs48, vs0,vs28 + xvmulsp vs49, vs1,vs28 + xvmulsp vs50, vs2,vs28 + xvmulsp vs51, vs3,vs28 + + xvmulsp vs52, vs0,vs29 + 
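[Editor's note - reference sketch, not part of the patch.] The DISPn(ind,disp) helpers defined at the top of the new file turn an unroll index into a byte displacement: unit_size is sizeof(float) and n is the number of floats one unroll step consumes, so the unrolled kernels can address several iterations ahead and bump AO/BO only once when IsLast is set. A small C model follows; the loop bound and printed values are illustrative assumptions, only the macro arithmetic is taken from the patch.

    #include <stdio.h>

    #define unit_size 4
    #define DISP32(ind,disp) ((ind)*unit_size*32+(disp))   /* 32 floats per step */
    #define DISP16(ind,disp) ((ind)*unit_size*16+(disp))   /* 16 floats per step */

    int main(void)
    {
        /* KERNEL8x16_2 reads A through DISP32 and B through DISP16, so each
           unroll index advances 128 bytes into the packed A panel and 64 bytes
           into the packed B panel - matching the final
           addi AO,AO,DISP32(Index,128) / addi BO,BO,DISP16(Index,64). */
        for (int index = 0; index < 4; index++)
            printf("Index=%d  A byte offset=%d  B byte offset=%d\n",
                   index, DISP32(index, 0), DISP16(index, 0));
        return 0;
    }
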
xvmulsp vs53, vs1,vs29 + xvmulsp vs54, vs2,vs29 + xvmulsp vs55, vs3,vs29 + + xvmulsp vs56, vs0,vs30 + xvmulsp vs57, vs1,vs30 + xvmulsp vs58, vs2,vs30 + xvmulsp vs59, vs3,vs30 + + xvmulsp vs60, vs0,vs31 + xvmulsp vs61, vs1,vs31 + xvmulsp vs62, vs2,vs31 + xvmulsp vs63, vs3,vs31 + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs50, vs2,vs28 + xvmaddasp vs51, vs3,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + xvmaddasp vs54, vs2,vs29 + xvmaddasp vs55, vs3,vs29 + + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + xvmaddasp vs58, vs2,vs30 + xvmaddasp vs59, vs3,vs30 + + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + xvmaddasp vs62, vs2,vs31 + xvmaddasp vs63, vs3,vs31 + +.endif +.endm + +.macro KERNEL8x16_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + +KERNEL8x16_2 \AREG,\BREG, \OffsetA,\OffsetB, (\Index*2),0 ,0 +KERNEL8x16_2 \AREG,\BREG,\OffsetA,\OffsetB, (\Index*2+1),\IsLast ,\Complete + +.endm + +.macro KERNEL8x16 First + + LOAD8x16 0,0 + END8x16 \First, AO, BO, 64,32 +.endm + +.macro LOAD8x16_2 + LOAD8x16_2O AO,BO, 0,0 +.endm + +.macro LOAD8x16_2O AREG,BREG, OffsetA,OffsetB + lxv vs8, (\OffsetB)(\BREG) + lxv vs12, (16+\OffsetB)(\BREG) + lxv vs24, (32+\OffsetB)(\BREG) + lxv vs28, (32+16+\OffsetB)(\BREG) + lxv vs4, (0+\OffsetA)(\AREG) + lxv vs5, (16+\OffsetA)(\AREG) + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + lxv vs6, (32+\OffsetA)(\AREG) + lxv vs7, (48+\OffsetA)(\AREG) + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 + lxv vs0, (64+\OffsetA)(\AREG) + lxv vs1, (64+16+\OffsetA)(\AREG) + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + lxv vs2, (64+32+\OffsetA)(\AREG) + lxv vs3, (64+48+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 +.endm + +.macro END8x16_2 + /*for load2 offset will be 128 and 64*/ + KERNEL8x16_2 AO,BO, 128,64,0 ,1,1 +.endm + + + +.macro KERNEL8x16_E2 OffsetA,OffsetB, Index,IsLast + KERNEL8x16_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL8x16_L2 OffsetA,OffsetB, Index,IsLast + KERNEL8x16_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL8x16_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 + +.if \Complete==0 + lxv vs4, DISP32(\Index,0+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) +.endif + + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + xvmaddasp vs50, vs6,vs12 + xvmaddasp vs51, vs7,vs12 +.if \Complete==0 + lxv vs8, DISP16(\Index,\OffsetB)(\BREG) + lxv vs12, 
DISP16(\Index,16+\OffsetB)(\BREG) +.endif + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + xvmaddasp vs58, vs6,vs14 + xvmaddasp vs59, vs7,vs14 +.if \Complete==0 + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask +.endif + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 + xvmaddasp vs54, vs6,vs13 + xvmaddasp vs55, vs7,vs13 +.if \Complete==0 + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 +.endif + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 + xvmaddasp vs62, vs6,vs15 + xvmaddasp vs63, vs7,vs15 +.if \Complete==0 + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 +.endif + +.if \Complete==0 + lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG) +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 +.if \Complete==0 + lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG) +.endif + + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + xvmaddasp vs50, vs2,vs28 + xvmaddasp vs51, vs3,vs28 +.if \Complete==0 + lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG) + lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG) +.endif + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + xvmaddasp vs58, vs2,vs30 + xvmaddasp vs59, vs3,vs30 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask +.endif + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + xvmaddasp vs54, vs2,vs29 + xvmaddasp vs55, vs3,vs29 +.if \Complete==0 + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 +.endif + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + xvmaddasp vs62, vs2,vs31 + xvmaddasp vs63, vs3,vs31 +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 +.endif +.if \Complete==0 + lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG) +.endif + + +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP16(\Index,\OffsetB) + addi \AREG, \AREG, DISP32(\Index,\OffsetA) + +.else + addi \BREG, \BREG, DISP16(\Index,64) + addi \AREG, \AREG, DISP32(\Index,128) + +.endif +.endif + + +.endm + + +.macro SAVE8x16 + + slwi T10, LDC , 1 + add T1, CO, LDC + + add T2, CO, T10 + add T3, T1, T10 + + add T4, T2, T10 + add T5, T3, T10 + + add T6, T4, T10 + add T7, T5, T10 + + + + /* permute to restore the butterfly rank 1 update to the normal promoted one */ + /* permute 16 vs8 MEM(CO) vs9 MEM(CO+LDC) vs10 MEM(CO+2*LDC) vs11 MEM(CO+3*LDC) */ + /* permute 16 vs12 MEM(16+CO) vs13 MEM(16+CO+LDC) vs14 MEM(16+CO+2*LDC) vs15 MEM(16+CO+3*LDC) */ + /* permute 16 vs16 MEM(32+CO) vs17 MEM(32+CO+LDC) vs18 MEM(32+CO+2*LDC) vs19 MEM(32+CO+3*LDC) */ + /* permute 16 vs24 MEM(48+CO) vs25 MEM(48+CO+LDC) vs26 MEM(48+CO+2*LDC) vs27 MEM(48+CO+3*LDC) */ + + xxmrglw vs8, vs32, vs44 + xxmrglw vs10, vs36, vs40 + + xxmrghw vs1, vs32, vs44 + xxmrghw vs0, vs36, vs40 + + xxmrglw vs12, vs33, vs45 + xxmrglw vs14, vs37, vs41 + + xxmrghw vs2, vs37, vs41 + xxmrghw vs3, vs33, vs45 +#ifndef TRMMKERNEL + lxv vs32, 0(CO) + lxv vs33, 16(CO) +#endif + xxmrglw vs16, vs34, vs46 + xxmrglw vs18, vs38, vs42 + + xxlor vs9, vs8, vs8 + xxlor vs11, vs10, vs10 + + xxmrghw vs4, vs38, vs42 + xxmrghw vs5,
vs34, vs46 + + xxlor vs13, vs12, vs12 + xxlor vs15, vs14, vs14 + + xxmrglw vs24, vs35, vs47 + xxmrglw vs26, vs39, vs43 + + xxlor vs17, vs16, vs16 + xxlor vs19, vs18, vs18 + + xxmrghw vs30, vs39, vs43 + xxmrghw vs31, vs35, vs47 +#ifndef TRMMKERNEL + lxv vs34, 32(CO) + lxv vs35, 48(CO) +#endif + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 +#ifndef TRMMKERNEL + lxv vs36, 0(T1) + lxv vs37, 16(T1) +#endif + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + +#ifndef TRMMKERNEL + lxv vs38, 32(T1) + lxv vs39, 48(T1) +#endif + + xxlor vs25, vs24, vs24 + xxlor vs27, vs26, vs26 + + + +#ifndef TRMMKERNEL + lxv vs40, 0(T2) + lxv vs41, 16(T2) +#endif + + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 +#ifndef TRMMKERNEL + lxv vs42, 32(T2) + lxv vs43, 48(T2) +#endif + + xxperm vs13, vs2, save_permute_2 + xxperm vs15, vs3, save_permute_2 +#ifndef TRMMKERNEL + lxv vs44, 0(T3) + lxv vs45, 16(T3) +#endif + xxperm vs16, vs4, save_permute_1 + xxperm vs18, vs5, save_permute_1 +#ifndef TRMMKERNEL + lxv vs46, 32(T3) + lxv vs47, 48(T3) +#endif + + + + + + xxperm vs17, vs4, save_permute_2 + xxperm vs19, vs5, save_permute_2 +#ifdef TRMMKERNEL + xvmulsp vs32, vs8, alpha_r + xvmulsp vs33, vs12, alpha_r +#else + xvmaddasp vs32, vs8, alpha_r + xvmaddasp vs33, vs12, alpha_r +#endif + xxperm vs24, vs30, save_permute_1 + xxperm vs26, vs31, save_permute_1 + + + stxv vs32, 0(CO) + stxv vs33, 16(CO) +#ifdef TRMMKERNEL + xvmulsp vs34, vs16, alpha_r + xvmulsp vs35, vs24, alpha_r +#else + xvmaddasp vs34, vs16, alpha_r + xvmaddasp vs35, vs24, alpha_r +#endif + + xxperm vs25, vs30, save_permute_2 + xxperm vs27, vs31, save_permute_2 + + + stxv vs34, 32(CO) + stxv vs35, 48(CO) +#ifdef TRMMKERNEL + xvmulsp vs36, vs9, alpha_r + xvmulsp vs37, vs13, alpha_r +#else + xvmaddasp vs36, vs9, alpha_r + xvmaddasp vs37, vs13, alpha_r +#endif + stxv vs36, 0(T1) + stxv vs37, 16(T1) +#ifdef TRMMKERNEL + xvmulsp vs38, vs17, alpha_r + xvmulsp vs39, vs25, alpha_r +#else + xvmaddasp vs38, vs17, alpha_r + xvmaddasp vs39, vs25, alpha_r +#endif + stxv vs38, 32(T1) + stxv vs39, 48(T1) + +#ifdef TRMMKERNEL + xvmulsp vs40, vs10, alpha_r + xvmulsp vs41, vs14, alpha_r +#else + xvmaddasp vs40, vs10, alpha_r + xvmaddasp vs41, vs14, alpha_r +#endif + + stxv vs40, 0(T2) + stxv vs41, 16(T2) +#ifdef TRMMKERNEL + xvmulsp vs42, vs18, alpha_r + xvmulsp vs43, vs26, alpha_r +#else + xvmaddasp vs42, vs18, alpha_r + xvmaddasp vs43, vs26, alpha_r +#endif + stxv vs42, 32(T2) + stxv vs43, 48(T2) +#ifdef TRMMKERNEL + xvmulsp vs44, vs11, alpha_r + xvmulsp vs45, vs15, alpha_r +#else + xvmaddasp vs44, vs11, alpha_r + xvmaddasp vs45, vs15, alpha_r +#endif + stxv vs44, 0(T3) + stxv vs45, 16(T3) +#ifdef TRMMKERNEL + xvmulsp vs46, vs19, alpha_r + xvmulsp vs47, vs27, alpha_r +#else + xvmaddasp vs46, vs19, alpha_r + xvmaddasp vs47, vs27, alpha_r +#endif + stxv vs46, 32(T3) + stxv vs47, 48(T3) + + /*****the same with the second 8X8 ****/ + #ifndef TRMMKERNEL + lxv vs32, 0(T4) + lxv vs33, 16(T4) +#endif + xxmrglw vs8, vs48, vs60 + xxmrglw vs10, vs52, vs56 +#ifndef TRMMKERNEL + lxv vs34, 32(T4) + lxv vs35, 48(T4) +#endif + xxmrghw vs1, vs48, vs60 + xxmrghw vs0, vs52, vs56 +#ifndef TRMMKERNEL + lxv vs36, 0(T5) + lxv vs37, 16(T5) +#endif + xxmrglw vs12, vs49, vs61 + xxmrglw vs14, vs53, vs57 +#ifndef TRMMKERNEL + lxv vs38,32(T5) + lxv vs39, 48(T5) +#endif + + xxmrghw vs2, vs53, vs57 + xxmrghw vs3, vs49, vs61 +#ifndef TRMMKERNEL + lxv vs40, 0(T6) + lxv vs41, 16(T6) +#endif + xxmrglw vs16, vs50, vs62 + xxmrglw vs18, vs54, vs58 
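+/* vs48-vs63 carry the results for the last four output columns (T4-T7); they
+   go through the same merge/permute sequence as vs32-vs47 above, interleaved
+   with the loads of the C tiles they will be added to */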
+#ifndef TRMMKERNEL + lxv vs42, 32(T6) + lxv vs43, 48(T6) +#endif + xxlor vs9, vs8, vs8 + xxlor vs11, vs10, vs10 + xxmrghw vs4, vs54, vs58 + xxmrghw vs5, vs50, vs62 +#ifndef TRMMKERNEL + lxv vs44, 0(T7) + lxv vs45, 16(T7) +#endif + xxlor vs13, vs12, vs12 + xxlor vs15, vs14, vs14 + + xxmrglw vs24, vs51, vs63 + xxmrglw vs26, vs55, vs59 +#ifndef TRMMKERNEL + lxv vs46, 32(T7) + lxv vs47, 48(T7) +#endif + xxlor vs17, vs16, vs16 + xxlor vs19, vs18, vs18 + xxmrghw vs30, vs55, vs59 + xxmrghw vs31, vs51, vs63 + + + + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 + + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + + xxlor vs25, vs24, vs24 + xxlor vs27, vs26, vs26 + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 + + xxperm vs13, vs2, save_permute_2 + xxperm vs15, vs3, save_permute_2 + #ifdef TRMMKERNEL + xvmulsp vs32, vs8, alpha_r + xvmulsp vs33, vs12, alpha_r +#else + xvmaddasp vs32, vs8, alpha_r + xvmaddasp vs33, vs12, alpha_r +#endif + xxperm vs16, vs4, save_permute_1 + xxperm vs18, vs5, save_permute_1 + stxv vs32, 0(T4) + stxv vs33, 16(T4) + xxperm vs17, vs4, save_permute_2 + xxperm vs19, vs5, save_permute_2 + xxperm vs24, vs30, save_permute_1 + xxperm vs26, vs31, save_permute_1 + xxperm vs25, vs30, save_permute_2 + xxperm vs27, vs31, save_permute_2 + +#ifdef TRMMKERNEL + xvmulsp vs34, vs16, alpha_r + xvmulsp vs35, vs24, alpha_r +#else + xvmaddasp vs34, vs16, alpha_r + xvmaddasp vs35, vs24, alpha_r +#endif + stxv vs34, 32(T4) + stxv vs35, 48(T4) + +#ifdef TRMMKERNEL + xvmulsp vs36, vs9, alpha_r + xvmulsp vs37, vs13, alpha_r +#else + xvmaddasp vs36, vs9, alpha_r + xvmaddasp vs37, vs13, alpha_r +#endif + stxv vs36, 0(T5) + stxv vs37, 16(T5) + +#ifdef TRMMKERNEL + xvmulsp vs38, vs17, alpha_r + xvmulsp vs39, vs25, alpha_r +#else + xvmaddasp vs38, vs17, alpha_r + xvmaddasp vs39, vs25, alpha_r +#endif + + + + + stxv vs38, 32(T5) + stxv vs39, 48(T5) + + +#ifdef TRMMKERNEL + xvmulsp vs40, vs10, alpha_r + xvmulsp vs41, vs14, alpha_r +#else + xvmaddasp vs40, vs10, alpha_r + xvmaddasp vs41, vs14, alpha_r +#endif + stxv vs40, 0(T6) + stxv vs41, 16(T6) +#ifdef TRMMKERNEL + xvmulsp vs42, vs18, alpha_r + xvmulsp vs43, vs26, alpha_r +#else + xvmaddasp vs42, vs18, alpha_r + xvmaddasp vs43, vs26, alpha_r +#endif + stxv vs42, 32(T6) + stxv vs43, 48(T6) +#ifdef TRMMKERNEL + xvmulsp vs44, vs11, alpha_r + xvmulsp vs45, vs15, alpha_r +#else + xvmaddasp vs44, vs11, alpha_r + xvmaddasp vs45, vs15, alpha_r +#endif + + stxv vs44, 0(T7) + stxv vs45, 16(T7) +#ifdef TRMMKERNEL + xvmulsp vs46, vs19, alpha_r + xvmulsp vs47, vs27, alpha_r +#else + xvmaddasp vs46, vs19, alpha_r + xvmaddasp vs47, vs27, alpha_r +#endif + + stxv vs46, 32(T7) + stxv vs47, 48(T7) + + + addi CO,CO,64 + + +.endm + + + +/********************************************************************************************** +* Macros for N=8 and M=8 +**********************************************************************************************/ + +.macro LOAD8x8_1 + LOAD8x8 1 +.endm + +.macro LOAD8x8_0 + LOAD8x8 0 +.endm + +.macro KERNEL8x8_L1_L4 Index,IsLast + KERNEL8x8_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL8x8_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x8_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x8_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm +.macro 
KERNEL8x8_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL8x8_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x8_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro END8x8_NORMAL + END8x8 0, AO, BO, 32,32 +.endm + +.macro Zero8X8 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 + +.endm + +.macro LOAD8x8 Zero + + lxv vs24, 0(BO) + lxv vs28, 16(BO) + lxv vs0, 0(AO) + lxv vs1, 16(AO) + + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + +.if \Zero==1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 +.endif +.endm + + +.macro END8x8 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + + xvmulsp vs48, vs0,vs28 + xvmulsp vs49, vs1,vs28 + + xvmulsp vs52, vs0,vs29 + xvmulsp vs53, vs1,vs29 + + xvmulsp vs56, vs0,vs30 + xvmulsp vs57, vs1,vs30 + + xvmulsp vs60, vs0,vs31 + xvmulsp vs61, vs1,vs31 + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + +.endif +.endm + +.macro KERNEL8x8_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP32(\Index, 0+\OffsetB)(\BREG) + lxv vs12, DISP32(\Index,16+\OffsetB)(\BREG) + + lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 + + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + lxv vs24, 
DISP32(\Index,32+\OffsetB)(\BREG) + lxv vs28, DISP32(\Index,32+16+\OffsetB)(\BREG) + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + + lxv vs0, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,32+16+\OffsetA)(\AREG) + + + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 + lxv vs8, DISP32(\Index,64+\OffsetB)(\BREG) + lxv vs12, DISP32(\Index,64+16+\OffsetB)(\BREG) + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 + + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 + + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + + + lxv vs4, DISP32(\Index,64+0+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,64+16+\OffsetA)(\AREG) + + + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 +.if \Complete==0 + lxv vs24, DISP32(\Index,96+\OffsetB)(\BREG) + lxv vs28, DISP32(\Index,96+16+\OffsetB)(\BREG) +.endif + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask +.endif + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + + +.if \Complete==0 + lxv vs0, DISP32(\Index,96+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,96+16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + +.endif +.if \IsLast==1 +.if \Complete==1 + + addi \BREG, \BREG, DISP32(\Index,32*3+\OffsetB) + addi \AREG, \AREG, DISP32(\Index,32*3+\OffsetA) +.else + + addi \BREG, \BREG, DISP32(\Index,128) + addi \AREG, \AREG, DISP32(\Index,128) +.endif +.endif + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + +.endif + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 + + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 + + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 + +.endm + +.macro KERNEL8x8 First + + LOAD8x8 0 + END8x8 \First, AO, BO, 32,32 +.endm + +.macro KERNEL8x8_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG) + lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG) + + lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, 
vs1,vs25 + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + +.endif + + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + +.if \First==1 + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + + xvmulsp vs48, vs0,vs28 + xvmulsp vs49, vs1,vs28 + + xvmulsp vs52, vs0,vs29 + xvmulsp vs53, vs1,vs29 + + xvmulsp vs56, vs0,vs30 + xvmulsp vs57, vs1,vs30 + + xvmulsp vs60, vs0,vs31 + xvmulsp vs61, vs1,vs31 + +.else + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + +.endif +.if \Complete==0 + lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG) + lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG) + + lxv vs0, DISP16(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP16(\Index,32+16+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP16(\Index,32+\OffsetB) + addi \AREG, \AREG, DISP16(\Index,32+\OffsetA) + +.else + addi \BREG, \BREG, DISP16(\Index,64) + addi \AREG, \AREG, DISP16(\Index,64) +.endif +.endif + +.if \First==1 + xvmulsp vs32, vs4,vs8 + xvmulsp vs33, vs5,vs8 + + xvmulsp vs36, vs4,vs9 + xvmulsp vs37, vs5,vs9 + +.else + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + +.endif +.if \First==1 + xvmulsp vs40, vs4,vs10 + xvmulsp vs41, vs5,vs10 + + xvmulsp vs44, vs4,vs11 + xvmulsp vs45, vs5,vs11 + + xvmulsp vs48, vs4,vs12 + xvmulsp vs49, vs5,vs12 + + xvmulsp vs52, vs4,vs13 + xvmulsp vs53, vs5,vs13 + + xvmulsp vs56, vs4,vs14 + xvmulsp vs57, vs5,vs14 + + xvmulsp vs60, vs4,vs15 + xvmulsp vs61, vs5,vs15 + +.else + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 + + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 + + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 + +.endif + +.endm + + +.macro SAVE8x8 + + slwi T10, LDC , 1 + add T1, CO, LDC + + add T2, CO, T10 + add T3, T1, T10 + + add T4, T2, T10 + add T5, T3, T10 + + add T6, T4, T10 + add T7, T5, T10 + +#ifndef TRMMKERNEL + lxv vs34, 0(CO) + lxv vs35, 16(CO) + lxv vs38, 0(T1) + lxv vs39, 16(T1) + lxv vs42, 0(T2) + lxv vs43, 16(T2) + lxv vs46, 0(T3) + lxv vs47, 16(T3) + + lxv vs50, 0(T4) + lxv vs51, 16(T4) + lxv vs54, 0(T5) + lxv vs55, 16(T5) + lxv vs58, 0(T6) + lxv vs59, 16(T6) + lxv vs62, 0(T7) + lxv vs63, 16(T7) +#endif + + xxmrglw vs8, vs32, vs44 + xxmrglw vs10, vs36, vs40 + + xxmrghw vs1, vs32, vs44 + xxmrghw vs0, vs36, vs40 + + xxmrglw vs12, vs33, vs45 + xxmrglw vs14, vs37, vs41 + + xxmrghw vs2, vs37, vs41 + xxmrghw vs3, vs33, vs45 + + xxlor vs9, vs8, vs8 + xxlor vs11, vs10, vs10 + + xxlor vs13, vs12, vs12 + xxlor vs15, vs14, vs14 + + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 + + xxperm vs13, vs2, save_permute_2 + 
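+/* the save_permute_1/save_permute_2 masks rearrange the merged words so that
+   the results match the memory layout: vs8/vs12 form the column stored at CO,
+   vs9/vs13 the one at T1, vs10/vs14 at T2 and vs11/vs15 at T3 */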
xxperm vs15, vs3, save_permute_2 + + + /* multiply add normal way */ + +#ifdef TRMMKERNEL + xvmulsp vs34, vs8, alpha_r + xvmulsp vs35, vs12, alpha_r + xvmulsp vs38, vs9, alpha_r + xvmulsp vs39, vs13, alpha_r + xvmulsp vs42, vs10, alpha_r + xvmulsp vs43, vs14, alpha_r + xvmulsp vs46, vs11, alpha_r + xvmulsp vs47, vs15, alpha_r +#else + xvmaddasp vs34, vs8, alpha_r + xvmaddasp vs35, vs12, alpha_r + xvmaddasp vs38, vs9, alpha_r + xvmaddasp vs39, vs13, alpha_r + xvmaddasp vs42, vs10, alpha_r + xvmaddasp vs43, vs14, alpha_r + xvmaddasp vs46, vs11, alpha_r + xvmaddasp vs47, vs15, alpha_r +#endif + + + xxmrglw vs8, vs48, vs60 + xxmrglw vs10, vs52, vs56 + + xxmrghw vs1, vs48, vs60 + xxmrghw vs0, vs52, vs56 + stxv vs34, 0(CO) + stxv vs35, 16(CO) + xxmrglw vs12, vs49, vs61 + xxmrglw vs14, vs53, vs57 + stxv vs38, 0(T1) + stxv vs39, 16(T1) + xxmrghw vs2, vs53, vs57 + xxmrghw vs3, vs49, vs61 + stxv vs42, 0(T2) + stxv vs43, 16(T2) + xxlor vs9, vs8, vs8 + xxlor vs11, vs10, vs10 + stxv vs46, 0(T3) + stxv vs47, 16(T3) + xxlor vs13, vs12, vs12 + xxlor vs15, vs14, vs14 + + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 + + + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 + xxperm vs13, vs2, save_permute_2 + xxperm vs15, vs3, save_permute_2 + + #ifdef TRMMKERNEL + xvmulsp vs50, vs8, alpha_r + xvmulsp vs51, vs12, alpha_r + xvmulsp vs54, vs9, alpha_r + xvmulsp vs55, vs13, alpha_r + xvmulsp vs58, vs10, alpha_r + xvmulsp vs59, vs14, alpha_r + xvmulsp vs62, vs11, alpha_r + xvmulsp vs63, vs15, alpha_r +#else + xvmaddasp vs50, vs8, alpha_r + xvmaddasp vs51, vs12, alpha_r + xvmaddasp vs54, vs9, alpha_r + xvmaddasp vs55, vs13, alpha_r + xvmaddasp vs58, vs10, alpha_r + xvmaddasp vs59, vs14, alpha_r + xvmaddasp vs62, vs11, alpha_r + xvmaddasp vs63, vs15, alpha_r +#endif + + stxv vs50, 0(T4) + stxv vs51, 16(T4) + stxv vs54, 0(T5) + stxv vs55, 16(T5) + stxv vs58, 0(T6) + stxv vs59, 16(T6) + stxv vs62, 0(T7) + stxv vs63, 16(T7) + + addi CO,CO,32 + +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=4 +**********************************************************************************************/ + +.macro LOAD8x4_1 + LOAD8x4 1 +.endm + +.macro LOAD8x4_0 + LOAD8x4 0 +.endm + +.macro KERNEL8x4_L1_L4 Index,IsLast + KERNEL8x4_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL8x4_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x4_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x4_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm +.macro KERNEL8x4_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL8x4_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x4_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro Zero8X4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 + +.endm + +.macro LOAD8x4 Zero + + lxv vs0, 0(AO) + lxv vs24, 0(BO) + 
lxv vs25, 16(BO) + + + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 + +.if \Zero==1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 +.endif +.endm + +.macro END8x4_NORMAL + END8x4 0, AO, BO, 16,32 +.endm + +.macro END8x4 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs24, vs0 + xvmulsp vs33, vs24, vs1 + xvmulsp vs34, vs24, vs2 + xvmulsp vs35, vs24, vs3 + + xvmulsp vs48, vs25, vs0 + xvmulsp vs49, vs25, vs1 + xvmulsp vs50, vs25, vs2 + xvmulsp vs51, vs25, vs3 +.else + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + xvmaddasp vs48, vs25, vs0 + xvmaddasp vs49, vs25, vs1 + xvmaddasp vs50, vs25, vs2 + xvmaddasp vs51, vs25, vs3 + +.endif +.endm + +.macro KERNEL8x4_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP32(\Index, 0+\OffsetB)(\BREG) + lxv vs27, DISP32(\Index,16+\OffsetB)(\BREG) + + xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 + + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + xvmaddasp vs48, vs25, vs0 + xvmaddasp vs49, vs25, vs1 + xvmaddasp vs50, vs25, vs2 + xvmaddasp vs51, vs25, vs3 + + lxv vs0, DISP16(\Index, 16+\OffsetA)(\AREG) + lxv vs24, DISP32(\Index, 32+\OffsetB)(\BREG) + lxv vs25, DISP32(\Index, 48+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 + + xvmaddasp vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + + xvmaddasp vs48, vs27, vs4 + xvmaddasp vs49, vs27, vs5 + xvmaddasp vs50, vs27, vs6 + xvmaddasp vs51, vs27, vs7 + + + lxv vs4, DISP16(\Index, 32+\OffsetA)(\AREG) + lxv vs26, DISP32(\Index, 64+\OffsetB)(\BREG) + lxv vs27, DISP32(\Index, 80+\OffsetB)(\BREG) + + xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 + + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + xvmaddasp vs48, vs25, vs0 + xvmaddasp vs49, vs25, vs1 + xvmaddasp vs50, vs25, vs2 + xvmaddasp vs51, vs25, vs3 + +.if \Complete==0 + + lxv vs0, DISP16(\Index, 48+\OffsetA)(\AREG) + lxv vs24, DISP32(\Index, 96+\OffsetB)(\BREG) + lxv vs25, DISP32(\Index, 96+16+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 +.endif + xvmaddasp vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + + xvmaddasp vs48, vs27, vs4 + xvmaddasp vs49, vs27, vs5 + xvmaddasp vs50, vs27, vs6 + xvmaddasp vs51, vs27, vs7 + + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP16(\Index,16*3+\OffsetA) + addi \BREG, \BREG, DISP32(\Index,32*3+\OffsetB) + +.else + addi \AREG, \AREG, DISP16(\Index,64) + addi \BREG, \BREG, DISP32(\Index,128) + +.endif +.endif + + +.endm + +.macro KERNEL8x4 First + LOAD8x4 0 + END8x4 \First, AO, BO, 16,32 +.endm + +.macro KERNEL8x4_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs4, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) + lxv vs27, DISP16(\Index,16+\OffsetB)(\BREG) + + 
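+/* vs4 holds the next four packed A values and vs26/vs27 the matching eight B
+   values; vs5-vs7 built below are lane permutations of vs4, so across the
+   product registers every A value gets multiplied with every B lane */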
xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 +.if \First==1 + xvmulsp vs32, vs24, vs0 + xvmulsp vs33, vs24, vs1 + xvmulsp vs34, vs24, vs2 + xvmulsp vs35, vs24, vs3 + + xvmulsp vs48, vs25, vs0 + xvmulsp vs49, vs25, vs1 + xvmulsp vs50, vs25, vs2 + xvmulsp vs51, vs25, vs3 +.else + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + xvmaddasp vs48, vs25, vs0 + xvmaddasp vs49, vs25, vs1 + xvmaddasp vs50, vs25, vs2 + xvmaddasp vs51, vs25, vs3 +.endif + +.if \Complete==0 + + lxv vs0, DISP8(\Index, 16+\OffsetA)(\AREG) + lxv vs24, DISP16(\Index, 32+\OffsetB)(\BREG) + lxv vs25, DISP16(\Index, 48+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 +.endif + +.if \First==1 + xvmulsp vs32, vs26, vs4 + xvmulsp vs33, vs26, vs5 + xvmulsp vs34, vs26, vs6 + xvmulsp vs35, vs26, vs7 + + xvmulsp vs48, vs27, vs4 + xvmulsp vs49, vs27, vs5 + xvmulsp vs50, vs27, vs6 + xvmulsp vs51, vs27, vs7 + + +.else + xvmaddasp vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + + xvmaddasp vs48, vs27, vs4 + xvmaddasp vs49, vs27, vs5 + xvmaddasp vs50, vs27, vs6 + xvmaddasp vs51, vs27, vs7 +.endif + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP8(\Index,16+\OffsetA) + addi \BREG, \BREG, DISP16(\Index,32+\OffsetB) + +.else + addi \AREG, \AREG, DISP8(\Index,32) + addi \BREG, \BREG, DISP16(\Index,64) + +.endif +.endif + + +.endm + + +.macro SAVE8x4 + slwi T10, LDC , 1 + add T1, CO, LDC +#if !defined(TRMMKERNEL) + lxv vs36, 0(CO) + lxv vs37, 0(T1) +#endif + add T2, CO, T10 + add T3, T1, T10 +#if !defined(TRMMKERNEL) + lxv vs38, 0(T2) + lxv vs39, 0(T3) +#endif + add T4, T2, T10 + add T5, T3, T10 +#if !defined(TRMMKERNEL) + lxv vs40, 0(T4) + lxv vs41, 0(T5) +#endif + add T6, T4, T10 + add T7, T5, T10 +#if !defined(TRMMKERNEL) + lxv vs42, 0(T6) + lxv vs43, 0(T7) +#endif + xxmrglw vs0, vs35,vs32 + xxmrglw vs1, vs34,vs33 + xxmrglw vs4, vs32,vs35 + xxmrglw vs5, vs33,vs34 + + + xxmrghw vs2, vs35,vs32 + xxmrghw vs3, vs34,vs33 + xxmrghw vs6, vs32,vs35 + xxmrghw vs7, vs33,vs34 + + xxmrgld vs24, vs1, vs0 + xxmrghd vs25,vs5,vs4 + + xxmrgld vs26, vs2, vs3 + xxmrghd vs27,vs6,vs7 + + + xxmrglw vs0, vs51,vs48 + xxmrglw vs1, vs50,vs49 + xxmrglw vs4, vs48,vs51 + xxmrglw vs5, vs49,vs50 + + xxmrghw vs2, vs51,vs48 + xxmrghw vs3, vs50,vs49 + xxmrghw vs6, vs48,vs51 + xxmrghw vs7, vs49,vs50 + + xxmrgld vs28, vs1, vs0 + xxmrghd vs29,vs5,vs4 + + xxmrgld vs30, vs2, vs3 + xxmrghd vs31,vs6,vs7 +#if defined(TRMMKERNEL) + + xvmulsp vs36, vs24, alpha_r + xvmulsp vs37, vs25, alpha_r + xvmulsp vs38, vs26, alpha_r + xvmulsp vs39, vs27, alpha_r + xvmulsp vs40, vs28, alpha_r + xvmulsp vs41, vs29, alpha_r + xvmulsp vs42, vs30, alpha_r + xvmulsp vs43, vs31, alpha_r +#else + xvmaddasp vs36, vs24, alpha_r + xvmaddasp vs37, vs25, alpha_r + xvmaddasp vs38, vs26, alpha_r + xvmaddasp vs39, vs27, alpha_r + xvmaddasp vs40, vs28, alpha_r + xvmaddasp vs41, vs29, alpha_r + xvmaddasp vs42, vs30, alpha_r + xvmaddasp vs43, vs31, alpha_r +#endif + + stxv vs36, 0(CO) + stxv vs37, 0(T1) + stxv vs38, 0(T2) + stxv vs39, 0(T3) + stxv vs40, 0(T4) + stxv vs41, 0(T5) + stxv vs42, 0(T6) + stxv vs43, 0(T7) + + + addi CO,CO,16 +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=2 +**********************************************************************************************/ + + +.macro KERNEL8x2_2 
OffsetA,OffsetB, Index,IsLast + KERNEL8x2_I_2 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + + +.macro Zero8x2 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2, vs2 + xxlxor vs3, vs3, vs3 + +.endm + +.macro KERNEL8x2 + KERNEL8x2_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL8x2_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, DISP2(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs27, DISP8(\Index,16+\OffsetB)(\BREG) + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs2, vs26, vs9 + xvmulsp vs3, vs27, vs9 + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs26, vs9 + xvmaddasp vs3, vs27, vs9 + + .endif + + addi \AREG, \AREG, DISP2(\Index,8) + addi \BREG, \BREG, DISP8(\Index,32) + +.endm + +.macro KERNEL8x2_I_2 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast + + lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) + lxv vs27, DISP16(\Index,16+\OffsetB)(\BREG) + lxv vs28, DISP16(\Index,32+\OffsetB)(\BREG) + lxv vs29, DISP16(\Index,48+\OffsetB)(\BREG) + xxspltw vs8, vs4, 2 + xxspltw vs9, vs4, 3 + xxspltw vs10, vs4, 0 + xxspltw vs11, vs4, 1 + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs2, vs26, vs9 + xvmulsp vs3, vs27, vs9 + + xvmulsp vs0, vs28, vs10 + xvmulsp vs1, vs29, vs10 + xvmulsp vs2, vs28, vs11 + xvmulsp vs3, vs29, vs11 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs26, vs9 + xvmaddasp vs3, vs27, vs9 + + xvmaddasp vs0, vs28, vs10 + xvmaddasp vs1, vs29, vs10 + xvmaddasp vs2, vs28, vs11 + xvmaddasp vs3, vs29, vs11 + .endif + + +.if \IsLast==1 + addi \AREG, \AREG, DISP4(\Index,16) + addi \BREG, \BREG, DISP16(\Index,64) +.endif + +.endm + + +.macro SAVE8x2 + slwi T10, LDC , 1 + add T1, CO, LDC + add T2, CO, T10 + add T3, T1, T10 + add T4, T2, T10 + add T5, T3, T10 + add T6, T4, T10 + add T7, T5, T10 + /*convert alpha_r for multiply*/ + xscvspdp vs4,alpha_r +/* v0 corresponds to vs32, do not forget*/ +#if !defined(TRMMKERNEL) + lxssp v0,0(CO) + lxssp v1,4(CO) + + lxssp v2,0(T1) + lxssp v3,4(T1) + + lxssp v4,0(T2) + lxssp v5,4(T2) + + lxssp v6,0(T3) + lxssp v7,4(T3) + + lxssp v8,0(T4) + lxssp v9,4(T4) + + lxssp v10,0(T5) + lxssp v11,4(T5) + + lxssp v12,0(T6) + lxssp v13,4(T6) + + lxssp v14,0(T7) + lxssp v15,4(T7) +#endif + xscvspdp vs5, vs2 + xxspltw vs6, vs2, 1 + xxspltw vs7, vs2, 2 + xxspltw vs8, vs2, 3 + xscvspdp vs6,vs6 + xscvspdp vs7,vs7 + xscvspdp vs8,vs8 + + xscvspdp vs24, vs0 + xxspltw vs25, vs0, 1 + xxspltw vs26, vs0, 2 + xxspltw vs27, vs0, 3 + xscvspdp vs25,vs25 + xscvspdp vs26,vs26 + xscvspdp vs27,vs27 + + xscvspdp vs9, vs3 + xxspltw vs10, vs3, 1 + xxspltw vs11, vs3, 2 + xxspltw vs12, vs3, 3 + xscvspdp vs10,vs10 + xscvspdp vs11,vs11 + xscvspdp vs12,vs12 + + xscvspdp vs28, vs1 + xxspltw vs29, vs1, 1 + xxspltw vs30, vs1, 2 + xxspltw vs31, vs1, 3 + xscvspdp vs29,vs29 + xscvspdp vs30,vs30 + xscvspdp vs31,vs31 + + + + +#if defined(TRMMKERNEL) + xsmuldp vs32,vs8, vs4 + xsmuldp vs33,vs27, vs4 + + xsmuldp vs34,vs7, vs4 + xsmuldp vs35,vs26, vs4 + + xsmuldp vs36,vs6, vs4 + xsmuldp vs37,vs25, vs4 + + xsmuldp vs38,vs5, vs4 + xsmuldp vs39,vs24, vs4 + + xsmuldp vs40,vs12, vs4 + xsmuldp vs41,vs31, vs4 + + xsmuldp vs42,vs11, vs4 + xsmuldp vs43,vs30, vs4 + + xsmuldp vs44,vs10, vs4 + xsmuldp vs45,vs29, vs4 + + xsmuldp vs46,vs9, vs4 + xsmuldp vs47,vs28, vs4 +#else + xsmaddadp vs32,vs8, vs4 + xsmaddadp vs33,vs27, vs4 + + xsmaddadp vs34,vs7, vs4 + 
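+/* the 8x2 tile is finished element by element: each accumulator lane was
+   converted to double precision above and is combined here with the converted
+   alpha (vs4) before being stored back with stxssp */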
xsmaddadp vs35,vs26, vs4 + + xsmaddadp vs36,vs6, vs4 + xsmaddadp vs37,vs25, vs4 + + xsmaddadp vs38,vs5, vs4 + xsmaddadp vs39,vs24, vs4 + + xsmaddadp vs40,vs12, vs4 + xsmaddadp vs41,vs31, vs4 + + xsmaddadp vs42,vs11, vs4 + xsmaddadp vs43,vs30, vs4 + + xsmaddadp vs44,vs10, vs4 + xsmaddadp vs45,vs29, vs4 + + xsmaddadp vs46,vs9, vs4 + xsmaddadp vs47,vs28, vs4 +#endif + + stxssp v0,0(CO) + stxssp v1,4(CO) + + stxssp v2,0(T1) + stxssp v3,4(T1) + + stxssp v4,0(T2) + stxssp v5,4(T2) + + stxssp v6,0(T3) + stxssp v7,4(T3) + + stxssp v8,0(T4) + stxssp v9,4(T4) + + stxssp v10,0(T5) + stxssp v11,4(T5) + + stxssp v12,0(T6) + stxssp v13,4(T6) + + stxssp v14,0(T7) + stxssp v15,4(T7) + + + addi CO,CO,8 +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=1 +**********************************************************************************************/ +.macro KERNEL8x1_4 OffsetA,OffsetB, Index,IsLast + KERNEL8x1_I_4 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro Zero8x1 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +.endm + +.macro KERNEL8x1 + KERNEL8x1_1 AO,BO, 0 +.endm + +.macro KERNEL8x1_2 + KERNEL8x1_2_1 AO,BO, 0 +.endm + +.macro KERNEL8x1_1 AREG,BREG,First + lxvwsx vs8, 0, \AREG + lxv vs26, 0(\BREG) + lxv vs27, 16(\BREG) +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + .endif + addi \AREG, \AREG, 4 + addi \BREG, \BREG, 32 +.endm + +.macro KERNEL8x1_2_1 AREG,BREG,First + lxsd v4, 0(\AREG) + lxv vs26, 0(\BREG) + lxv vs27, 16(\BREG) + lxv vs28, 32(\BREG) + lxv vs29, 48(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs0, vs28, vs9 + xvmulsp vs1, vs29, vs9 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs0, vs28, vs9 + xvmaddasp vs1, vs29, vs9 + .endif + addi \AREG, \AREG, 8 + addi \BREG, \BREG, 64 +.endm + +.macro KERNEL8x1_I_4 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast + lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) + xxspltw vs8, vs4, 3 + xxspltw vs9, vs4, 2 + xxspltw vs10, vs4, 1 + xxspltw vs11, vs4, 0 + lxv vs26, DISP32(\Index, 0+\OffsetB)(\BREG) + lxv vs27, DISP32(\Index,16+\OffsetB)(\BREG) + lxv vs28, DISP32(\Index,32+\OffsetB)(\BREG) + lxv vs29, DISP32(\Index,48+\OffsetB)(\BREG) + lxv vs30, DISP32(\Index,64+ 0+\OffsetB)(\BREG) + lxv vs31, DISP32(\Index,64+16+\OffsetB)(\BREG) + lxv vs32, DISP32(\Index,64+32+\OffsetB)(\BREG) + lxv vs33, DISP32(\Index,64+48+\OffsetB)(\BREG) +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs0, vs28, vs9 + xvmulsp vs1, vs29, vs9 + xvmulsp vs0, vs30, vs10 + xvmulsp vs1, vs31, vs10 + xvmulsp vs0, vs32, vs11 + xvmulsp vs1, vs33, vs11 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs0, vs28, vs9 + xvmaddasp vs1, vs29, vs9 + xvmaddasp vs0, vs30, vs10 + xvmaddasp vs1, vs31, vs10 + xvmaddasp vs0, vs32, vs11 + xvmaddasp vs1, vs33, vs11 + .endif +.if \IsLast==1 + addi \AREG, \AREG, DISP4(\Index,16) + addi \BREG, \BREG, DISP32(\Index,128) +.endif +.endm + +.macro SAVE8x1 + slwi T10, LDC , 1 + add T1, CO, LDC + add T2, CO, T10 + add T3, T1, T10 + add T4, T2, T10 + add T5, T3, T10 + add T6, T4, T10 + add T7, T5, T10 + /*convert alpha_r for multiply*/ + xscvspdp vs4,alpha_r +/* v0 corresponds to vs32, do not forget*/ +#if !defined(TRMMKERNEL) + lxssp v0,0(CO) + lxssp v2,0(T1) + lxssp v4,0(T2) + lxssp v6,0(T3) + lxssp v8,0(T4) + lxssp v10,0(T5) + lxssp 
v12,0(T6) + lxssp v14,0(T7) +#endif + xscvspdp vs24, vs0 + xxspltw vs25, vs0, 1 + xxspltw vs26, vs0, 2 + xxspltw vs27, vs0, 3 + xscvspdp vs25,vs25 + xscvspdp vs26,vs26 + xscvspdp vs27,vs27 + xscvspdp vs28, vs1 + xxspltw vs29, vs1, 1 + xxspltw vs30, vs1, 2 + xxspltw vs31, vs1, 3 + xscvspdp vs29,vs29 + xscvspdp vs30,vs30 + xscvspdp vs31,vs31 +#if defined(TRMMKERNEL) + xsmuldp vs32,vs27, vs4 + xsmuldp vs34,vs26, vs4 + xsmuldp vs36,vs25, vs4 + xsmuldp vs38,vs24, vs4 + xsmuldp vs40,vs31, vs4 + xsmuldp vs42,vs30, vs4 + xsmuldp vs44,vs29, vs4 + xsmuldp vs46,vs28, vs4 +#else + xsmaddadp vs32,vs27, vs4 + xsmaddadp vs34,vs26, vs4 + xsmaddadp vs36,vs25, vs4 + xsmaddadp vs38,vs24, vs4 + xsmaddadp vs40,vs31, vs4 + xsmaddadp vs42,vs30, vs4 + xsmaddadp vs44,vs29, vs4 + xsmaddadp vs46,vs28, vs4 +#endif + stxssp v0,0(CO) + stxssp v2,0(T1) + stxssp v4,0(T2) + stxssp v6,0(T3) + stxssp v8,0(T4) + stxssp v10,0(T5) + stxssp v12,0(T6) + stxssp v14,0(T7) + addi CO,CO,4 +.endm + + + +/********************************************************************************************** +* Macros for N=4 and M=16 +**********************************************************************************************/ + +.macro LOAD4x16_1 + LOAD4x16 1 +.endm + +.macro LOAD4x16_0 + LOAD4x16 0 +.endm + +.macro KERNEL4x16_L1_L4 Index,IsLast + KERNEL4x16_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm +.macro KERNEL4x16_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL4x16_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro Zero4X16 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 +.endm + +.macro LOAD4x16 Zero + + lxv vs24, 0(BO) + lxv vs0, 0(AO) + lxv vs1, 16(AO) + lxv vs2, 32(AO) + lxv vs3, 48(AO) + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 + +.if \Zero==1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 + +.endif +.endm + +.macro END4x16_NORMAL + END4x16 0, AO, BO, 64,16 +.endm + +.macro END4x16 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs0,vs24 + 
xvmulsp vs33, vs1,vs24 + xvmulsp vs34, vs2,vs24 + xvmulsp vs35, vs3,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + xvmulsp vs38, vs2,vs25 + xvmulsp vs39, vs3,vs25 + + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + xvmulsp vs42, vs2,vs26 + xvmulsp vs43, vs3,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + xvmulsp vs46, vs2,vs27 + xvmulsp vs47, vs3,vs27 + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + +.endif +.endm + +.macro KERNEL4x16_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG) + + lxv vs4, DISP64(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP64(\Index,16+\OffsetA)(\AREG) + lxv vs6, DISP64(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP64(\Index,48+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + + xxpermdi vs11, vs10, vs10,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + + + + lxv vs24, DISP16(\Index,16+\OffsetB)(\BREG) + + lxv vs0, DISP64(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP64(\Index,64+16+\OffsetA)(\AREG) + lxv vs2, DISP64(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP64(\Index,64+48+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 + + xxpermdi vs27, vs26, vs26,2 + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 + + + lxv vs8, DISP16(\Index,32+\OffsetB)(\BREG) + + lxv vs4, DISP64(\Index,128+0+\OffsetA)(\AREG) + lxv vs5, DISP64(\Index,128+16+\OffsetA)(\AREG) + lxv vs6, DISP64(\Index,128+32+\OffsetA)(\AREG) + lxv vs7, DISP64(\Index,128+48+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + + xxpermdi vs11, vs10, vs10,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + + + +.if \Complete==0 + lxv vs24, DISP16(\Index,48+\OffsetB)(\BREG) + + lxv vs0, DISP64(\Index,192+\OffsetA)(\AREG) + lxv vs1, DISP64(\Index,192+16+\OffsetA)(\AREG) + lxv vs2, DISP64(\Index,192+32+\OffsetA)(\AREG) + lxv vs3, DISP64(\Index,192+48+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi 
vs25, vs24, vs24,2 + +.endif +.if \IsLast==1 +.if \Complete==1 + + addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB) + addi \AREG, \AREG, DISP64(\Index,64*3+\OffsetA) +.else + + addi \BREG, \BREG, DISP16(\Index,64) + addi \AREG, \AREG, DISP64(\Index,256) +.endif +.endif + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + +.endif + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 + + + +.endm + +.macro KERNEL4x16 First + + LOAD4x16 0 + END4x16 \First, AO, BO, 64,16 +.endm + +.macro KERNEL4x16_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) + lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + xvmulsp vs34, vs2,vs24 + xvmulsp vs35, vs3,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + xvmulsp vs38, vs2,vs25 + xvmulsp vs39, vs3,vs25 +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 +.endif + + xxpermdi vs11, vs10, vs10,2 + +.if \First==1 + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + xvmulsp vs42, vs2,vs26 + xvmulsp vs43, vs3,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + xvmulsp vs46, vs2,vs27 + xvmulsp vs47, vs3,vs27 + + +.else + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + + +.endif +.if \Complete==0 + lxv vs24, DISP8(\Index,16+\OffsetB)(\BREG) + lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG) + lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP8(\Index,16+\OffsetB) + addi \AREG, \AREG, DISP32(\Index,64+\OffsetA) + +.else + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP32(\Index,128) +.endif +.endif + +.if \First==1 + xvmulsp vs32, vs4,vs8 + xvmulsp vs33, vs5,vs8 + xvmulsp vs34, vs6,vs8 + xvmulsp vs35, vs7,vs8 + + xvmulsp vs36, vs4,vs9 + xvmulsp vs37, vs5,vs9 + xvmulsp vs38, vs6,vs9 + xvmulsp vs39, vs7,vs9 +.else + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + +.endif +.if \First==1 + xvmulsp vs40, vs4,vs10 + xvmulsp vs41, vs5,vs10 + xvmulsp vs42, vs6,vs10 + xvmulsp vs43, vs7,vs10 + + xvmulsp vs44, vs4,vs11 + xvmulsp vs45, vs5,vs11 + xvmulsp vs46, vs6,vs11 + xvmulsp vs47, vs7,vs11 + + + +.else + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, 
vs7,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 + + + +.endif + +.endm + + +.macro SAVE4x16 + + slwi T10, LDC , 1 + add T1, CO, LDC + + add T2, CO, T10 + add T3, T1, T10 + + + + xxmrglw vs8, vs32, vs44 + xxmrglw vs10, vs36, vs40 + + xxmrghw vs1, vs32, vs44 + xxmrghw vs0, vs36, vs40 + + xxmrglw vs12, vs33, vs45 + xxmrglw vs14, vs37, vs41 + + xxmrghw vs2, vs37, vs41 + xxmrghw vs3, vs33, vs45 + + xxmrglw vs16, vs34, vs46 + xxmrglw vs18, vs38, vs42 + + xxlor vs9, vs8, vs8 + xxlor vs11, vs10, vs10 + + xxmrghw vs4, vs38, vs42 + xxmrghw vs5, vs34, vs46 + + xxlor vs13, vs12, vs12 + xxlor vs15, vs14, vs14 + + xxmrglw vs24, vs35, vs47 + xxmrglw vs26, vs39, vs43 + + xxlor vs17, vs16, vs16 + xxlor vs19, vs18, vs18 + + xxmrghw vs30, vs39, vs43 + xxmrghw vs31, vs35, vs47 + + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + +#ifndef TRMMKERNEL + lxv vs32, 0(CO) + lxv vs33, 16(CO) + lxv vs34, 32(CO) + lxv vs35, 48(CO) +#endif + xxlor vs25, vs24, vs24 + xxlor vs27, vs26, vs26 + +#ifndef TRMMKERNEL + lxv vs36, 0(T1) + lxv vs37, 16(T1) + lxv vs38, 32(T1) + lxv vs39, 48(T1) +#endif +#ifndef TRMMKERNEL + lxv vs40, 0(T2) + lxv vs41, 16(T2) + lxv vs42, 32(T2) + lxv vs43, 48(T2) +#endif +#ifndef TRMMKERNEL + lxv vs44, 0(T3) + lxv vs45, 16(T3) + lxv vs46, 32(T3) + lxv vs47, 48(T3) +#endif + + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 + + xxperm vs13, vs2, save_permute_2 + xxperm vs15, vs3, save_permute_2 + + xxperm vs16, vs4, save_permute_1 + xxperm vs18, vs5, save_permute_1 + + xxperm vs17, vs4, save_permute_2 + xxperm vs19, vs5, save_permute_2 + + xxperm vs24, vs30, save_permute_1 + xxperm vs26, vs31, save_permute_1 + + xxperm vs25, vs30, save_permute_2 + xxperm vs27, vs31, save_permute_2 + + + /* multiply add normal way */ + +#ifdef TRMMKERNEL + xvmulsp vs32, vs8, alpha_r + xvmulsp vs33, vs12, alpha_r + xvmulsp vs34, vs16, alpha_r + xvmulsp vs35, vs24, alpha_r + xvmulsp vs36, vs9, alpha_r + xvmulsp vs37, vs13, alpha_r + xvmulsp vs38, vs17, alpha_r + xvmulsp vs39, vs25, alpha_r +#else + xvmaddasp vs32, vs8, alpha_r + xvmaddasp vs33, vs12, alpha_r + xvmaddasp vs34, vs16, alpha_r + xvmaddasp vs35, vs24, alpha_r + xvmaddasp vs36, vs9, alpha_r + xvmaddasp vs37, vs13, alpha_r + xvmaddasp vs38, vs17, alpha_r + xvmaddasp vs39, vs25, alpha_r +#endif + + + +#ifdef TRMMKERNEL + xvmulsp vs40, vs10, alpha_r + xvmulsp vs41, vs14, alpha_r + xvmulsp vs42, vs18, alpha_r + xvmulsp vs43, vs26, alpha_r + xvmulsp vs44, vs11, alpha_r + xvmulsp vs45, vs15, alpha_r + xvmulsp vs46, vs19, alpha_r + xvmulsp vs47, vs27, alpha_r +#else + + xvmaddasp vs40, vs10, alpha_r + xvmaddasp vs41, vs14, alpha_r + xvmaddasp vs42, vs18, alpha_r + xvmaddasp vs43, vs26, alpha_r + xvmaddasp vs44, vs11, alpha_r + xvmaddasp vs45, vs15, alpha_r + xvmaddasp vs46, vs19, alpha_r + xvmaddasp vs47, vs27, alpha_r + +#endif + + stxv vs32, 0(CO) + stxv vs33, 16(CO) + stxv vs34, 32(CO) + stxv vs35, 48(CO) + + stxv vs36, 0(T1) + stxv vs37, 16(T1) + stxv vs38, 32(T1) + stxv vs39, 48(T1) + + stxv vs40, 0(T2) + stxv vs41, 16(T2) + stxv vs42, 32(T2) + stxv vs43, 48(T2) + stxv vs44, 0(T3) + stxv vs45, 16(T3) + stxv vs46, 32(T3) + stxv vs47, 48(T3) + + addi CO,CO,64 + + +.endm + + + +/********************************************************************************************** +* Macros for N=4 and M=8 
+**********************************************************************************************/ + +.macro LOAD4x8_1 + LOAD4x8 1 +.endm + +.macro LOAD4x8_0 + LOAD4x8 0 +.endm + +.macro KERNEL4x8_L1_L4 Index,IsLast + KERNEL4x8_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL4x8_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x8_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x8_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm +.macro KERNEL4x8_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL4x8_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x8_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro END4x8_NORMAL + END4x8 0, AO, BO, 32,16 +.endm + +.macro Zero4X8 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + +.endm + +.macro LOAD4x8 Zero + + lxv vs24, 0(BO) + lxv vs0, 0(AO) + lxv vs1, 16(AO) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + + xxpermdi vs27, vs26, vs26,2 + +.if \Zero==1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + +.endif +.endm + + +.macro END4x8 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + +.endif +.endm + +.macro KERNEL4x8_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG) + + lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xxpermdi vs11, vs10, vs10,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + + + lxv vs24, DISP16(\Index,16+\OffsetB)(\BREG) + + lxv vs0, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,32+16+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + + xxpermdi vs27, vs26, vs26,2 + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + + + + lxv vs8, DISP16(\Index,32+\OffsetB)(\BREG) + + lxv vs4, DISP32(\Index,64+0+\OffsetA)(\AREG) + lxv vs5, 
DISP32(\Index,64+16+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xxpermdi vs11, vs10, vs10,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + + +.if \Complete==0 + lxv vs24, DISP16(\Index,48+\OffsetB)(\BREG) + + lxv vs0, DISP32(\Index,96+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,96+16+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + +.endif +.if \IsLast==1 +.if \Complete==1 + + addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB) + addi \AREG, \AREG, DISP32(\Index,32*3+\OffsetA) +.else + + addi \BREG, \BREG, DISP16(\Index,64) + addi \AREG, \AREG, DISP32(\Index,128) +.endif +.endif + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + +.endif + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + + + +.endm + +.macro KERNEL4x8 First + + LOAD4x8 0 + END4x8 \First, AO, BO, 32,16 +.endm + +.macro KERNEL4x8_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + +.endif + + xxpermdi vs11, vs10, vs10,2 + +.if \First==1 + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + + +.else + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + +.endif +.if \Complete==0 + lxv vs24, DISP8(\Index,16+\OffsetB)(\BREG) + + lxv vs0, DISP16(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP16(\Index,32+16+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP8(\Index,16+\OffsetB) + addi \AREG, \AREG, DISP16(\Index,32+\OffsetA) + +.else + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP16(\Index,64) +.endif +.endif + +.if \First==1 + xvmulsp vs32, vs4,vs8 + xvmulsp vs33, vs5,vs8 + + xvmulsp vs36, vs4,vs9 + xvmulsp vs37, vs5,vs9 + +.else + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + +.endif +.if \First==1 + xvmulsp vs40, vs4,vs10 + xvmulsp vs41, vs5,vs10 + + xvmulsp vs44, vs4,vs11 + xvmulsp vs45, vs5,vs11 + +.else + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + +.endif + +.endm + + +.macro SAVE4x8 + + slwi T10, LDC , 1 + add T1, CO, LDC + + add T2, CO, T10 + add T3, T1, T10 + + + +#ifndef TRMMKERNEL + lxv vs34, 0(CO) + lxv vs35, 16(CO) + lxv vs38, 0(T1) + lxv vs39, 16(T1) + lxv vs42, 0(T2) + lxv vs43, 16(T2) + lxv vs46, 0(T3) + lxv vs47, 16(T3) + + +#endif + + xxmrglw vs8, vs32, vs44 + xxmrglw vs10, vs36, vs40 + + xxmrghw vs1, vs32, vs44 + xxmrghw vs0, vs36, vs40 + + xxmrglw vs12, vs33, vs45 + xxmrglw vs14, vs37, vs41 + + xxmrghw vs2, vs37, vs41 + xxmrghw vs3, vs33, vs45 + + xxlor vs9, 
vs8, vs8 + xxlor vs11, vs10, vs10 + + xxlor vs13, vs12, vs12 + xxlor vs15, vs14, vs14 + + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 + + xxperm vs13, vs2, save_permute_2 + xxperm vs15, vs3, save_permute_2 + + + /* multiply add normal way */ + +#ifdef TRMMKERNEL + xvmulsp vs34, vs8, alpha_r + xvmulsp vs35, vs12, alpha_r + xvmulsp vs38, vs9, alpha_r + xvmulsp vs39, vs13, alpha_r + xvmulsp vs42, vs10, alpha_r + xvmulsp vs43, vs14, alpha_r + xvmulsp vs46, vs11, alpha_r + xvmulsp vs47, vs15, alpha_r +#else + xvmaddasp vs34, vs8, alpha_r + xvmaddasp vs35, vs12, alpha_r + xvmaddasp vs38, vs9, alpha_r + xvmaddasp vs39, vs13, alpha_r + xvmaddasp vs42, vs10, alpha_r + xvmaddasp vs43, vs14, alpha_r + xvmaddasp vs46, vs11, alpha_r + xvmaddasp vs47, vs15, alpha_r +#endif + + + stxv vs34, 0(CO) + stxv vs35, 16(CO) + stxv vs38, 0(T1) + stxv vs39, 16(T1) + stxv vs42, 0(T2) + stxv vs43, 16(T2) + stxv vs46, 0(T3) + stxv vs47, 16(T3) + + + addi CO,CO,32 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=4 +**********************************************************************************************/ + +.macro LOAD4x4_1 + LOAD4x4 1 +.endm + +.macro LOAD4x4_0 + LOAD4x4 0 +.endm + +.macro KERNEL4x4_L1_L4 Index,IsLast + KERNEL4x4_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL4x4_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x4_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x4_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm +.macro KERNEL4x4_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL4x4_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x4_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro Zero4X4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + +.endm + +.macro LOAD4x4 Zero + + lxv vs0, 0(AO) + lxv vs24, 0(BO) + + + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 + +.if \Zero==1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + +.endif +.endm + +.macro END4x4_NORMAL + END4x4 0, AO, BO, 16,16 +.endm + +.macro END4x4 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs24, vs0 + xvmulsp vs33, vs24, vs1 + xvmulsp vs34, vs24, vs2 + xvmulsp vs35, vs24, vs3 +.else + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + +.endif +.endm + +.macro KERNEL4x4_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) + + xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 + + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, 
vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + + lxv vs0, DISP16(\Index, 16+\OffsetA)(\AREG) + lxv vs24, DISP16(\Index, 16+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 + + xvmaddasp vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + + + + lxv vs4, DISP16(\Index, 32+\OffsetA)(\AREG) + lxv vs26, DISP16(\Index, 32+\OffsetB)(\BREG) + + xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 + + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + +.if \Complete==0 + + lxv vs0, DISP16(\Index, 48+\OffsetA)(\AREG) + lxv vs24, DISP16(\Index, 48+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 +.endif + xvmaddasp vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + + + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP16(\Index,16*3+\OffsetA) + addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB) + +.else + addi \AREG, \AREG, DISP16(\Index,64) + addi \BREG, \BREG, DISP16(\Index,64) + +.endif +.endif + + +.endm + +.macro KERNEL4x4 First + LOAD4x4 0 + END4x4 \First, AO, BO, 16,16 +.endm + +.macro KERNEL4x4_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs4, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) + + xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 +.if \First==1 + xvmulsp vs32, vs24, vs0 + xvmulsp vs33, vs24, vs1 + xvmulsp vs34, vs24, vs2 + xvmulsp vs35, vs24, vs3 + +.else + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + +.endif + +.if \Complete==0 + + lxv vs0, DISP8(\Index, 16+\OffsetA)(\AREG) + lxv vs24, DISP8(\Index, 16+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 +.endif + +.if \First==1 + xvmulsp vs32, vs26, vs4 + xvmulsp vs33, vs26, vs5 + xvmulsp vs34, vs26, vs6 + xvmulsp vs35, vs26, vs7 + + +.else + xvmaddasp vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + +.endif + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP8(\Index,16+\OffsetA) + addi \BREG, \BREG, DISP8(\Index,16+\OffsetB) + +.else + addi \AREG, \AREG, DISP8(\Index,32) + addi \BREG, \BREG, DISP8(\Index,32) + +.endif +.endif + + +.endm + + +.macro SAVE4x4 + slwi T10, LDC , 1 + add T1, CO, LDC +#if !defined(TRMMKERNEL) + lxv vs36, 0(CO) + lxv vs37, 0(T1) +#endif + add T2, CO, T10 + add T3, T1, T10 +#if !defined(TRMMKERNEL) + lxv vs38, 0(T2) + lxv vs39, 0(T3) +#endif + + xxmrglw vs0, vs35,vs32 + xxmrglw vs1, vs34,vs33 + xxmrglw vs4, vs32,vs35 + xxmrglw vs5, vs33,vs34 + + + xxmrghw vs2, vs35,vs32 + xxmrghw vs3, vs34,vs33 + xxmrghw vs6, vs32,vs35 + xxmrghw vs7, vs33,vs34 + + xxmrgld vs24, vs1, vs0 + xxmrghd vs25,vs5,vs4 + + xxmrgld vs26, vs2, vs3 + xxmrghd vs27,vs6,vs7 + + #if defined(TRMMKERNEL) + xvmulsp vs36, vs24, alpha_r + xvmulsp vs37, vs25, alpha_r + xvmulsp vs38, vs26, alpha_r + xvmulsp vs39, vs27, alpha_r +#else + xvmaddasp vs36, vs24, alpha_r + xvmaddasp vs37, vs25, alpha_r + xvmaddasp vs38, vs26, alpha_r + xvmaddasp vs39, vs27, alpha_r + #endif + stxv vs36, 0(CO) + stxv vs37, 0(T1) + stxv vs38, 0(T2) + stxv vs39, 0(T3) + + + + addi CO,CO,16 +.endm + + 
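(For orientation while reading the N=4/M=4 hunk above.) A minimal scalar sketch of what the KERNEL4x4/SAVE4x4 pair is expected to compute: each k step multiplies four packed A values by four packed B values into a 4x4 accumulator, and the save step scales by alpha and either adds into C (GEMM) or overwrites it (the TRMMKERNEL branch). This is an illustrative reference model only, assuming simple column-packed ap/bp buffers and a hypothetical kernel4x4_ref helper; it does not mirror the VSX register permutes used by the real code.

#include <stdio.h>

/* Reference model (hypothetical helper, not part of the kernel):
 * ap: packed A, 4 floats per k step; bp: packed B, 4 floats per k step;
 * c:  column-major 4x4 block of C with leading dimension ldc.           */
static void kernel4x4_ref(int k, const float *ap, const float *bp,
                          float *c, int ldc, float alpha, int trmm)
{
    float acc[4][4] = {{0.0f}};

    for (int l = 0; l < k; l++)              /* K loop                    */
        for (int i = 0; i < 4; i++)          /* 4 packed A values (rows)  */
            for (int j = 0; j < 4; j++)      /* 4 packed B values (cols)  */
                acc[i][j] += ap[4 * l + i] * bp[4 * l + j];

    for (int j = 0; j < 4; j++)              /* columns CO, T1, T2, T3    */
        for (int i = 0; i < 4; i++)
            if (trmm)
                c[j * ldc + i] = alpha * acc[i][j];   /* TRMMKERNEL: overwrite   */
            else
                c[j * ldc + i] += alpha * acc[i][j];  /* GEMM: accumulate into C */
}

int main(void)
{
    /* tiny smoke test: k = 2 */
    float ap[8] = {1, 0, 0, 0, 0, 1, 0, 0};
    float bp[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    float c[16] = {0};

    kernel4x4_ref(2, ap, bp, c, 4, 1.0f, 0);
    printf("c[0]=%g c[4]=%g\n", c[0], c[4]);  /* expect 1 and 2 */
    return 0;
}

The trmm flag here corresponds to the #if defined(TRMMKERNEL) branch in SAVE4x4: the freshly computed block is written out as alpha*AB rather than accumulated into the existing C values. Row/column assignment within the packed buffers is a simplifying assumption; the real packing is done by the copy kernels elsewhere in the tree.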
+/********************************************************************************************** +* Macros for N=4 and M=2 +**********************************************************************************************/ + + +.macro KERNEL4x2_2 OffsetA,OffsetB, Index,IsLast + KERNEL4x2_I_2 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + + +.macro Zero4x2 + xxlxor vs0, vs0, vs0 + xxlxor vs2, vs2, vs2 + +.endm + +.macro KERNEL4x2 + KERNEL4x2_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL4x2_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, DISP2(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs2, vs26, vs9 + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs2, vs26, vs9 + + .endif + + addi \AREG, \AREG, DISP2(\Index,8) + addi \BREG, \BREG, DISP4(\Index,16) + +.endm + +.macro KERNEL4x2_I_2 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast + + lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs28, DISP8(\Index,16+\OffsetB)(\BREG) + xxspltw vs8, vs4, 2 + xxspltw vs9, vs4, 3 + xxspltw vs10, vs4, 0 + xxspltw vs11, vs4, 1 + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs2, vs26, vs9 + + xvmulsp vs0, vs28, vs10 + xvmulsp vs2, vs28, vs11 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs2, vs26, vs9 + + xvmaddasp vs0, vs28, vs10 + xvmaddasp vs2, vs28, vs11 + .endif + + +.if \IsLast==1 + addi \AREG, \AREG, DISP4(\Index,16) + addi \BREG, \BREG, DISP8(\Index,32) +.endif + +.endm + + +.macro SAVE4x2 + slwi T10, LDC , 1 + add T1, CO, LDC + add T2, CO, T10 + add T3, T1, T10 + /*convert alpha_r for multiply*/ + xscvspdp vs4,alpha_r +/* v0 corresponds to vs32, do not forget*/ +#if !defined(TRMMKERNEL) + lxssp v0,0(CO) + lxssp v1,4(CO) + + lxssp v2,0(T1) + lxssp v3,4(T1) + + lxssp v4,0(T2) + lxssp v5,4(T2) + + lxssp v6,0(T3) + lxssp v7,4(T3) + + +#endif + xscvspdp vs5, vs2 + xxspltw vs6, vs2, 1 + xxspltw vs7, vs2, 2 + xxspltw vs8, vs2, 3 + xscvspdp vs6,vs6 + xscvspdp vs7,vs7 + xscvspdp vs8,vs8 + + xscvspdp vs24, vs0 + xxspltw vs25, vs0, 1 + xxspltw vs26, vs0, 2 + xxspltw vs27, vs0, 3 + xscvspdp vs25,vs25 + xscvspdp vs26,vs26 + xscvspdp vs27,vs27 + + +#if defined(TRMMKERNEL) + xsmuldp vs32,vs8, vs4 + xsmuldp vs33,vs27, vs4 + + xsmuldp vs34,vs7, vs4 + xsmuldp vs35,vs26, vs4 + + xsmuldp vs36,vs6, vs4 + xsmuldp vs37,vs25, vs4 + + xsmuldp vs38,vs5, vs4 + xsmuldp vs39,vs24, vs4 + + +#else + xsmaddadp vs32,vs8, vs4 + xsmaddadp vs33,vs27, vs4 + + xsmaddadp vs34,vs7, vs4 + xsmaddadp vs35,vs26, vs4 + + xsmaddadp vs36,vs6, vs4 + xsmaddadp vs37,vs25, vs4 + + xsmaddadp vs38,vs5, vs4 + xsmaddadp vs39,vs24, vs4 + + +#endif + + stxssp v0,0(CO) + stxssp v1,4(CO) + + stxssp v2,0(T1) + stxssp v3,4(T1) + + stxssp v4,0(T2) + stxssp v5,4(T2) + + stxssp v6,0(T3) + stxssp v7,4(T3) + + + + + addi CO,CO,8 +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=1 +**********************************************************************************************/ +.macro KERNEL4x1_4 OffsetA,OffsetB, Index,IsLast + KERNEL4x1_I_4 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro Zero4x1 + xxlxor vs0, vs0, vs0 +.endm + +.macro KERNEL4x1 + KERNEL4x1_1 AO,BO, 0 +.endm + +.macro KERNEL4x1_2 + KERNEL4x1_2_1 AO,BO, 0 +.endm + +.macro KERNEL4x1_1 AREG,BREG,First + lxvwsx vs8, 0, \AREG + lxv vs26, 0(\BREG) +.if \First==1 + xvmulsp vs0, vs26, vs8 +.else + xvmaddasp vs0, vs26, vs8 + .endif + addi \AREG, 
\AREG, 4 + addi \BREG, \BREG, 16 +.endm + +.macro KERNEL4x1_2_1 AREG,BREG,First + lxsd v4, 0(\AREG) + lxv vs26, 0(\BREG) + lxv vs28, 16(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs0, vs28, vs9 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs0, vs28, vs9 + .endif + addi \AREG, \AREG, 8 + addi \BREG, \BREG, 32 +.endm + +.macro KERNEL4x1_I_4 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast + lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) + xxspltw vs8, vs4, 3 + xxspltw vs9, vs4, 2 + xxspltw vs10, vs4, 1 + xxspltw vs11, vs4, 0 + lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) + lxv vs28, DISP16(\Index,16+\OffsetB)(\BREG) + lxv vs30, DISP16(\Index,32+\OffsetB)(\BREG) + lxv vs32, DISP16(\Index,48+\OffsetB)(\BREG) +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs0, vs28, vs9 + xvmulsp vs0, vs30, vs10 + xvmulsp vs0, vs32, vs11 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs0, vs28, vs9 + xvmaddasp vs0, vs30, vs10 + xvmaddasp vs0, vs32, vs11 + .endif +.if \IsLast==1 + addi \AREG, \AREG, DISP4(\Index,16) + addi \BREG, \BREG, DISP16(\Index,64) +.endif +.endm + +.macro SAVE4x1 + slwi T10, LDC , 1 + add T1, CO, LDC + add T2, CO, T10 + add T3, T1, T10 + /*convert alpha_r for multiply*/ + xscvspdp vs4,alpha_r +/* v0 corresponds to vs32, do not forget*/ +#if !defined(TRMMKERNEL) + lxssp v0,0(CO) + lxssp v2,0(T1) + lxssp v4,0(T2) + lxssp v6,0(T3) +#endif + xscvspdp vs24, vs0 + xxspltw vs25, vs0, 1 + xxspltw vs26, vs0, 2 + xxspltw vs27, vs0, 3 + xscvspdp vs25,vs25 + xscvspdp vs26,vs26 + xscvspdp vs27,vs27 + +#if defined(TRMMKERNEL) + xsmuldp vs32,vs27, vs4 + xsmuldp vs34,vs26, vs4 + xsmuldp vs36,vs25, vs4 + xsmuldp vs38,vs24, vs4 +#else + xsmaddadp vs32,vs27, vs4 + xsmaddadp vs34,vs26, vs4 + xsmaddadp vs36,vs25, vs4 + xsmaddadp vs38,vs24, vs4 +#endif + stxssp v0,0(CO) + stxssp v2,0(T1) + stxssp v4,0(T2) + stxssp v6,0(T3) + addi CO,CO,4 +.endm + +/****************************N=2 section*****************/ + +.macro KERNEL2x16_2 OffsetA,OffsetB, Index,IsLast + KERNEL2x16_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + +.macro Zero2x16 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2, vs2 + xxlxor vs3, vs3, vs3 + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 +.endm + +.macro KERNEL2x16 + KERNEL2x16_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL2x16_4 OffsetA,OffsetB, Index,IsLast + KERNEL2x16_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL2x16_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP16(\Index, 32+\OffsetA)(\AREG) + lxv vs29, DISP16(\Index,48+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs2, vs28, vs8 + xvmulsp vs3, vs29, vs8 + + xvmulsp vs4, vs26, vs9 + xvmulsp vs5, vs27, vs9 + xvmulsp vs6, vs28, vs9 + xvmulsp vs7, vs29, vs9 + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + xvmaddasp vs6, vs28, vs9 + xvmaddasp vs7, vs29, vs9 + + .endif + + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP16(\Index,64) + +.endm + + + + +.macro KERNEL2x16_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG) + + lxv vs26, DISP64(\Index, 
0+\OffsetA)(\AREG) + lxv vs27, DISP64(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP64(\Index,32+\OffsetA)(\AREG) + lxv vs29, DISP64(\Index,48+\OffsetA)(\AREG) + + lxv vs16, DISP64(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs17, DISP64(\Index,64+ 16+\OffsetA)(\AREG) + lxv vs18, DISP64(\Index,64+ 32+\OffsetA)(\AREG) + lxv vs19, DISP64(\Index,64+ 48+\OffsetA)(\AREG) + + lxv vs30, DISP64(\Index,128+ 0+\OffsetA)(\AREG) + lxv vs31, DISP64(\Index,128+ 16+\OffsetA)(\AREG) + lxv vs32, DISP64(\Index,128+ 32+\OffsetA)(\AREG) + lxv vs33, DISP64(\Index,128+ 48+\OffsetA)(\AREG) + + lxv vs34, DISP64(\Index,128+ 64+ 0+\OffsetA)(\AREG) + lxv vs35, DISP64(\Index,128+ 64+ 16+\OffsetA)(\AREG) + lxv vs36, DISP64(\Index,128+ 64+ 32+\OffsetA)(\AREG) + lxv vs37, DISP64(\Index,128+ 64+ 48+\OffsetA)(\AREG) + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + xxspltw vs12, vs39, 3 + xxspltw vs13, vs39, 2 + xxspltw vs14, vs39, 1 + xxspltw vs15, vs39, 0 + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + xvmaddasp vs6, vs28, vs9 + xvmaddasp vs7, vs29, vs9 + + xvmaddasp vs0, vs16, vs10 + xvmaddasp vs1, vs17, vs10 + xvmaddasp vs2, vs18, vs10 + xvmaddasp vs3, vs19, vs10 + + xvmaddasp vs4, vs16, vs11 + xvmaddasp vs5, vs17, vs11 + xvmaddasp vs6, vs18, vs11 + xvmaddasp vs7, vs19, vs11 + + xvmaddasp vs0, vs30, vs12 + xvmaddasp vs1, vs31, vs12 + xvmaddasp vs2, vs32, vs12 + xvmaddasp vs3, vs33, vs12 + + xvmaddasp vs4, vs30, vs13 + xvmaddasp vs5, vs31, vs13 + xvmaddasp vs6, vs32, vs13 + xvmaddasp vs7, vs33, vs13 + + xvmaddasp vs0, vs34, vs14 + xvmaddasp vs1, vs35, vs14 + xvmaddasp vs2, vs36, vs14 + xvmaddasp vs3, vs37, vs14 + + xvmaddasp vs4, vs34, vs15 + xvmaddasp vs5, vs35, vs15 + xvmaddasp vs6, vs36, vs15 + xvmaddasp vs7, vs37, vs15 + + +.if \IsLast==1 + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP64(\Index,256) +.endif + +.endm + +.macro KERNEL2x16_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 3 + xxspltw vs9, vs36, 2 + xxspltw vs10, vs36, 1 + xxspltw vs11, vs36, 0 + lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs29, DISP32(\Index,48+\OffsetA)(\AREG) + lxv vs16, DISP32(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs17, DISP32(\Index,64+ 16+\OffsetA)(\AREG) + lxv vs18, DISP32(\Index,64+ 32+\OffsetA)(\AREG) + lxv vs19, DISP32(\Index,64+ 48+\OffsetA)(\AREG) + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + xvmaddasp vs6, vs28, vs9 + xvmaddasp vs7, vs29, vs9 + + xvmaddasp vs0, vs16, vs10 + xvmaddasp vs1, vs17, vs10 + xvmaddasp vs2, vs18, vs10 + xvmaddasp vs3, vs19, vs10 + + xvmaddasp vs4, vs16, vs11 + xvmaddasp vs5, vs17, vs11 + xvmaddasp vs6, vs18, vs11 + xvmaddasp vs7, vs19, vs11 + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP32(\Index,128) +.endif + +.endm + + +.macro SAVE2x16 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) + lxv vs17, 16(CO) + lxv vs18, 32(CO) + lxv vs19, 48(CO) +#endif + add T1, CO, LDC +#ifndef TRMMKERNEL + lxv vs26, 0(T1) + lxv vs27, 16(T1) + lxv vs28, 32(T1) + lxv vs29, 48(T1) +#endif + +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r + xvmulsp vs17, vs1, alpha_r + xvmulsp vs18, vs2, alpha_r + xvmulsp vs19, vs3, alpha_r + xvmulsp 
vs26, vs4, alpha_r + xvmulsp vs27, vs5, alpha_r + xvmulsp vs28, vs6, alpha_r + xvmulsp vs29, vs7, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r + xvmaddasp vs17, vs1, alpha_r + xvmaddasp vs18, vs2, alpha_r + xvmaddasp vs19, vs3, alpha_r + xvmaddasp vs26, vs4, alpha_r + xvmaddasp vs27, vs5, alpha_r + xvmaddasp vs28, vs6, alpha_r + xvmaddasp vs29, vs7, alpha_r +#endif + stxv vs16, 0(CO) + stxv vs17, 16(CO) + stxv vs18, 32(CO) + stxv vs19, 48(CO) + + stxv vs26, 0(T1) + stxv vs27, 16(T1) + stxv vs28, 32(T1) + stxv vs29, 48(T1) + + addi CO,CO,64 + +.endm + +/* M=8 N=2 */ + +.macro KERNEL2x8_2 OffsetA,OffsetB, Index,IsLast + KERNEL2x8_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + +.macro Zero2x8 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +.endm + +.macro KERNEL2x8 + KERNEL2x8_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL2x8_4 OffsetA,OffsetB, Index,IsLast + KERNEL2x8_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL2x8_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP8(\Index,16+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + + xvmulsp vs4, vs26, vs9 + xvmulsp vs5, vs27, vs9 + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + + .endif + + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP8(\Index,32) + +.endm + + + + +.macro KERNEL2x8_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG) + + lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) + + lxv vs16, DISP32(\Index,32+ 0+\OffsetA)(\AREG) + lxv vs17, DISP32(\Index,32+ 16+\OffsetA)(\AREG) + + lxv vs30, DISP32(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs31, DISP32(\Index,64+ 16+\OffsetA)(\AREG) + + lxv vs34, DISP32(\Index, 96+ 0+\OffsetA)(\AREG) + lxv vs35, DISP32(\Index, 96+ 16+\OffsetA)(\AREG) + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + xxspltw vs12, vs39, 3 + xxspltw vs13, vs39, 2 + xxspltw vs14, vs39, 1 + xxspltw vs15, vs39, 0 + + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + + + xvmaddasp vs0, vs16, vs10 + xvmaddasp vs1, vs17, vs10 + xvmaddasp vs4, vs16, vs11 + xvmaddasp vs5, vs17, vs11 + + + xvmaddasp vs0, vs30, vs12 + xvmaddasp vs1, vs31, vs12 + xvmaddasp vs4, vs30, vs13 + xvmaddasp vs5, vs31, vs13 + + xvmaddasp vs0, vs34, vs14 + xvmaddasp vs1, vs35, vs14 + xvmaddasp vs4, vs34, vs15 + xvmaddasp vs5, vs35, vs15 + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP32(\Index,128) +.endif + +.endm + +.macro KERNEL2x8_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 3 + xxspltw vs9, vs36, 2 + xxspltw vs10, vs36, 1 + xxspltw vs11, vs36, 0 + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) + lxv vs16, DISP16(\Index,32+\OffsetA)(\AREG) + lxv vs17, DISP16(\Index,48+\OffsetA)(\AREG) + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + + xvmaddasp vs0, vs16, vs10 + xvmaddasp vs1, vs17, vs10 + + xvmaddasp vs4, vs16, vs11 + xvmaddasp vs5, vs17, vs11 + +.if \IsLast==1 + addi \BREG, \BREG, 
DISP4(\Index,16) + addi \AREG, \AREG, DISP16(\Index,64) +.endif + +.endm + + +.macro SAVE2x8 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) + lxv vs17, 16(CO) +#endif + add T1, CO, LDC +#ifndef TRMMKERNEL + lxv vs26, 0(T1) + lxv vs27, 16(T1) + +#endif + +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r + xvmulsp vs17, vs1, alpha_r + xvmulsp vs26, vs4, alpha_r + xvmulsp vs27, vs5, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r + xvmaddasp vs17, vs1, alpha_r + xvmaddasp vs26, vs4, alpha_r + xvmaddasp vs27, vs5, alpha_r +#endif + + stxv vs16, 0(CO) + stxv vs17, 16(CO) + + + stxv vs26, 0(T1) + stxv vs27, 16(T1) + + addi CO,CO,32 + +.endm + + +/*M=4*/ + + +.macro KERNEL2x4_2 OffsetA,OffsetB, Index,IsLast + KERNEL2x4_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + /* we will aggregate on save vs0 +vs4 vs11+vs5 */ +.macro Zero2x4 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +.endm + +.macro KERNEL2x4 + KERNEL2x4_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL2x4_4 OffsetA,OffsetB, Index,IsLast + KERNEL2x4_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL2x4_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs26, vs9 + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs26, vs9 + .endif + + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP4(\Index,16) + +.endm + + + + +.macro KERNEL2x4_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG) + + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs16, DISP16(\Index,16+\OffsetA)(\AREG) + + lxv vs30, DISP16(\Index,32+ 0+\OffsetA)(\AREG) + lxv vs34, DISP16(\Index,32+ 16+\OffsetA)(\AREG) + + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + xxspltw vs12, vs39, 3 + xxspltw vs13, vs39, 2 + xxspltw vs14, vs39, 1 + xxspltw vs15, vs39, 0 + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs26, vs9 + xvmaddasp vs4, vs16, vs10 + xvmaddasp vs5, vs16, vs11 + + + xvmaddasp vs0, vs30, vs12 + xvmaddasp vs1, vs30, vs13 + xvmaddasp vs4, vs34, vs14 + xvmaddasp vs5, vs34, vs15 + + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP16(\Index,64) +.endif + +.endm + +.macro KERNEL2x4_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 3 + xxspltw vs9, vs36, 2 + xxspltw vs10, vs36, 1 + xxspltw vs11, vs36, 0 + lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs16, DISP8(\Index, 16+\OffsetA)(\AREG) + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs26, vs9 + xvmaddasp vs4, vs16, vs10 + xvmaddasp vs5, vs16, vs11 + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP8(\Index,32) +.endif + +.endm + + +.macro SAVE2x4 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) +#endif + add T1, CO, LDC +#ifndef TRMMKERNEL + lxv vs26, 0(T1) + +#endif + /*aggregate vectors*/ + xvaddsp vs0,vs0,vs4 + xvaddsp vs1,vs1,vs5 +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r + xvmulsp vs26, vs1, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r + xvmaddasp vs26, vs1, alpha_r +#endif + + stxv vs16, 0(CO) + stxv vs26, 0(T1) + + addi CO,CO,16 + +.endm + + +/* M=2 N=2 we will have inner pemrute action before permute was revrsing 3,2,1,0 not iw 2ill inner reverse 1,0,3,2 */ +.macro SWITCH_PERMUTE_INNER + xxpermdi 
permute_mask, permute_mask, permute_mask,2 +.endm + +.macro Zero2x2 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + SWITCH_PERMUTE_INNER +.endm + +.macro KERNEL2x2 + KERNEL2x2_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL2x2_4 OffsetA,OffsetB, Index,IsLast + KERNEL2x2_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL2x2_2 OffsetA,OffsetB, Index,IsLast + KERNEL2x2_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL2x2_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxperm vs9, vs36, permute_mask + lxsd v5, DISP2(\Index, 0+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs37, vs36 + xvmulsp vs1, vs37, vs9 + +.else + xvmaddasp vs0, vs37, vs36 + xvmaddasp vs1, vs37, vs9 + .endif + + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP2(\Index,8) + +.endm + + + + +.macro KERNEL2x2_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs10, DISP8(\Index, 16+\OffsetB)(\BREG) + + lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs16, DISP8(\Index,16+\OffsetA)(\AREG) + + + xxperm vs9, vs8, permute_mask + xxperm vs11, vs10, permute_mask + + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs26, vs9 + xvmaddasp vs0, vs16, vs10 + xvmaddasp vs1, vs16, vs11 + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP8(\Index,32) +.endif + +.endm + +.macro KERNEL2x2_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP4(\Index, 0+\OffsetB)(\BREG) + lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) + + + xxperm vs9, vs8, permute_mask + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs26, vs9 + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP4(\Index,16) +.endif +.endm + + +.macro SAVE2x2 + +#ifndef TRMMKERNEL + lxsd v4 , 0(CO) +#endif + add T1, CO, LDC +#ifndef TRMMKERNEL + lxsd v5 , 0(T1) + +#endif + /*aggregate vectors*/ + xxpermdi vs4,vs0,vs0,2 + xxpermdi vs5,vs1,vs1,2 + xvaddsp vs0,vs0,vs4 + xvaddsp vs1,vs1,vs5 + /* */ + /* lets correct the order to 00 10 and 10 ,11 from {00,11} {01,10} */ + xxperm vs1,vs1, permute_mask + + + xxmrghw vs2 ,vs1,vs0 + xxpermdi vs2,vs2,vs2,2 + xxmrghw vs3 ,vs0,vs1 +#if defined(TRMMKERNEL) + xvmulsp vs36, vs2, alpha_r + xvmulsp vs37, vs3, alpha_r +#else + xvmaddasp vs36, vs2, alpha_r + xvmaddasp vs37, vs3, alpha_r +#endif + /**** store last two words*/ + + + stxsd v4, 0(CO) + stxsd v5, 0(T1) + + addi CO,CO,8 + +.endm + +/*--------------------------- M=1 N=2 */ +.macro Zero2x1 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2,vs2,vs2 + xxlxor vs3,vs3,vs3 +.endm + +.macro KERNEL2x1 + KERNEL2x1_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL2x1_4 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL2x1_2 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + /* + we will calculate 1 alone then will add it to batched ones + */ +.macro KERNEL2x1_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v3, DISP2(\Index, 0+\OffsetB)(\BREG) + lxssp v4, DISP2(\Index, 4+\OffsetB)(\BREG) + lxssp v5, DISP1(\Index, 0+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs2, vs37, vs35 + xvmulsp vs3, vs37, vs36 + +.else + xsmaddadp vs2, vs37, vs35 + xsmaddadp vs3, vs37, vs36 + .endif + + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP1(\Index,4) + +.endm + + + + +.macro KERNEL2x1_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs10, DISP8(\Index, 16+\OffsetB)(\BREG) + 
+ lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) + + xxmrglw vs5, vs26,vs26 + xxmrghw vs6, vs26,vs26 + + xvmaddasp vs0, vs8, vs5 + xvmaddasp vs1, vs10, vs6 + + +.if \IsLast==1 + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP4(\Index,16) +.endif + +.endm + +.macro KERNEL2x1_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxssp v3, DISP4(\Index, 0+\OffsetB)(\BREG) + lxssp v4, DISP4(\Index, 4+\OffsetB)(\BREG) + lxssp v7, DISP4(\Index, 8+\OffsetB)(\BREG) + lxssp v8, DISP4(\Index, 12+\OffsetB)(\BREG) + lxssp v5, DISP2(\Index, 0+\OffsetA)(\AREG) + lxssp v6, DISP2(\Index, 4+\OffsetA)(\AREG) + + + xsmaddadp vs2, vs37, vs35 + xsmaddadp vs3, vs37, vs36 + + xsmaddadp vs2, vs38, vs39 + xsmaddadp vs3, vs38, vs40 + + + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP2(\Index,8) +.endm + + +.macro SAVE2x1 + +#ifndef TRMMKERNEL + lxssp v4 , 0(CO) +#endif + add T1, CO, LDC +#ifndef TRMMKERNEL + lxssp v5 , 0(T1) + +#endif + + /*convert alpha_r for multiply*/ + xscvspdp vs16,alpha_r + + /*aggregate vectors 2x2_4 */ + xxpermdi vs4,vs0,vs0,2 + xxpermdi vs5,vs1,vs1,2 + xvaddsp vs0,vs0,vs4 + xvaddsp vs1,vs1,vs5 + xvaddsp vs0,vs0,vs1 +/*aggregate vectors 2x1_2 and 2x1_1 into 2x2_4*/ + xscvspdp vs5, vs0 + xxspltw vs6, vs0, 1 + xscvspdp vs6,vs6 + xsadddp vs2,vs2,vs6 + xsadddp vs3,vs3,vs5 + + /**** store last two words*/ +#if defined(TRMMKERNEL) + xsmuldp vs36,vs2, vs16 + xsmuldp vs37,vs3, vs16 + +#else + xsmaddadp vs36,vs2, vs16 + xsmaddadp vs37,vs3, vs16 +#endif + + stxssp v4, 0(CO) + stxssp v5, 0(T1) + + addi CO,CO,4 + +.endm + + + +/****************************N=1 section*****************/ + +.macro KERNEL1x16_2 OffsetA,OffsetB, Index,IsLast + KERNEL1x16_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + +.macro Zero1x16 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2, vs2 + xxlxor vs3, vs3, vs3 +.endm + +.macro KERNEL1x16 + KERNEL1x16_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL1x16_4 OffsetA,OffsetB, Index,IsLast + KERNEL1x16_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x16_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG) + xscvdpspn vs36,vs36 + xxspltw vs8, vs36, 0 + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP16(\Index, 32+\OffsetA)(\AREG) + lxv vs29, DISP16(\Index,48+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs2, vs28, vs8 + xvmulsp vs3, vs29, vs8 + + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + .endif + + addi \BREG, \BREG, DISP1(\Index,4) + addi \AREG, \AREG, DISP16(\Index,64) + +.endm + + + + +.macro KERNEL1x16_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG) + + lxv vs26, DISP64(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP64(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP64(\Index,32+\OffsetA)(\AREG) + lxv vs29, DISP64(\Index,48+\OffsetA)(\AREG) + + lxv vs16, DISP64(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs17, DISP64(\Index,64+ 16+\OffsetA)(\AREG) + lxv vs18, DISP64(\Index,64+ 32+\OffsetA)(\AREG) + lxv vs19, DISP64(\Index,64+ 48+\OffsetA)(\AREG) + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + + lxv vs30, DISP64(\Index,128+ 0+\OffsetA)(\AREG) + lxv vs31, DISP64(\Index,128+ 16+\OffsetA)(\AREG) + lxv vs32, DISP64(\Index,128+ 32+\OffsetA)(\AREG) + lxv vs33, DISP64(\Index,128+ 48+\OffsetA)(\AREG) + + lxv vs34, DISP64(\Index,128+ 64+ 0+\OffsetA)(\AREG) + lxv vs35, DISP64(\Index,128+ 
64+ 16+\OffsetA)(\AREG) + lxv vs36, DISP64(\Index,128+ 64+ 32+\OffsetA)(\AREG) + lxv vs37, DISP64(\Index,128+ 64+ 48+\OffsetA)(\AREG) + + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + + xvmaddasp vs0, vs16, vs9 + xvmaddasp vs1, vs17, vs9 + xvmaddasp vs2, vs18, vs9 + xvmaddasp vs3, vs19, vs9 + + + xvmaddasp vs0, vs30, vs10 + xvmaddasp vs1, vs31, vs10 + xvmaddasp vs2, vs32, vs10 + xvmaddasp vs3, vs33, vs10 + + + xvmaddasp vs0, vs34, vs11 + xvmaddasp vs1, vs35, vs11 + xvmaddasp vs2, vs36, vs11 + xvmaddasp vs3, vs37, vs11 + + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP64(\Index,256) +.endif + +.endm + +.macro KERNEL1x16_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs29, DISP32(\Index,48+\OffsetA)(\AREG) + lxv vs16, DISP32(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs17, DISP32(\Index,64+ 16+\OffsetA)(\AREG) + lxv vs18, DISP32(\Index,64+ 32+\OffsetA)(\AREG) + lxv vs19, DISP32(\Index,64+ 48+\OffsetA)(\AREG) + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + + xvmaddasp vs0, vs16, vs9 + xvmaddasp vs1, vs17, vs9 + xvmaddasp vs2, vs18, vs9 + xvmaddasp vs3, vs19, vs9 + + +.if \IsLast==1 + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP32(\Index,128) +.endif + +.endm + + +.macro SAVE1x16 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) + lxv vs17, 16(CO) + lxv vs18, 32(CO) + lxv vs19, 48(CO) +#endif + + +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r + xvmulsp vs17, vs1, alpha_r + xvmulsp vs18, vs2, alpha_r + xvmulsp vs19, vs3, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r + xvmaddasp vs17, vs1, alpha_r + xvmaddasp vs18, vs2, alpha_r + xvmaddasp vs19, vs3, alpha_r +#endif + stxv vs16, 0(CO) + stxv vs17, 16(CO) + stxv vs18, 32(CO) + stxv vs19, 48(CO) + + addi CO,CO,64 + +.endm + +/* M=8 N=1 */ + +.macro KERNEL1x8_2 OffsetA,OffsetB, Index,IsLast + KERNEL1x8_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + +.macro Zero1x8 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2, vs2 + xxlxor vs3, vs3, vs3 +.endm + +.macro KERNEL1x8 + KERNEL1x8_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL1x8_4 OffsetA,OffsetB, Index,IsLast + KERNEL1x8_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x8_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG) + xscvdpspn vs36,vs36 + xxspltw vs8, vs36, 0 + lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP8(\Index,16+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + + .endif + + addi \BREG, \BREG, DISP1(\Index,4) + addi \AREG, \AREG, DISP8(\Index,32) + +.endm + + + + +.macro KERNEL1x8_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG) + + lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) + + lxv vs16, DISP32(\Index,32+ 0+\OffsetA)(\AREG) + lxv vs17, DISP32(\Index,32+ 16+\OffsetA)(\AREG) + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + + lxv vs30, DISP32(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs31, DISP32(\Index,64+ 16+\OffsetA)(\AREG) + + lxv vs34, DISP32(\Index,64+ 32+ 
0+\OffsetA)(\AREG) + lxv vs35, DISP32(\Index,64+ 32+ 16+\OffsetA)(\AREG) + + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + + + xvmaddasp vs2, vs16, vs9 + xvmaddasp vs3, vs17, vs9 + + + xvmaddasp vs0, vs30, vs10 + xvmaddasp vs1, vs31, vs10 + + + xvmaddasp vs2, vs34, vs11 + xvmaddasp vs3, vs35, vs11 + + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP32(\Index,128) +.endif + +.endm + +.macro KERNEL1x8_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) + lxv vs16, DISP16(\Index,32+ 0+\OffsetA)(\AREG) + lxv vs17, DISP16(\Index,32+ 16+\OffsetA)(\AREG) + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + + + xvmaddasp vs2, vs16, vs9 + xvmaddasp vs3, vs17, vs9 + + +.if \IsLast==1 + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP16(\Index,64) +.endif + +.endm + + +.macro SAVE1x8 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) + lxv vs17, 16(CO) +#endif + /* aggregate vs0 vs2 and vs1 vs3*/ + xvaddsp vs0,vs0,vs2 + xvaddsp vs1,vs1,vs3 +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r + xvmulsp vs17, vs1, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r + xvmaddasp vs17, vs1, alpha_r +#endif + stxv vs16, 0(CO) + stxv vs17, 16(CO) + + addi CO,CO,32 + +.endm +/*M=4*/ + +.macro KERNEL1x4_2 OffsetA,OffsetB, Index,IsLast + KERNEL1x4_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + +.macro Zero1x4 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2, vs2 + xxlxor vs3, vs3, vs3 +.endm + +.macro KERNEL1x4 + KERNEL1x4_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL1x4_4 OffsetA,OffsetB, Index,IsLast + KERNEL1x4_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x4_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG) + xscvdpspn vs36,vs36 + xxspltw vs8, vs36, 0 + lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 +.else + xvmaddasp vs0, vs26, vs8 + + .endif + + addi \BREG, \BREG, DISP1(\Index,4) + addi \AREG, \AREG, DISP4(\Index,16) + +.endm + + + + +.macro KERNEL1x4_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG) + + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) + + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + + lxv vs30, DISP16(\Index,32+ 0+\OffsetA)(\AREG) + lxv vs31, DISP16(\Index,32+ 16+\OffsetA)(\AREG) + + + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + + xvmaddasp vs0, vs26, vs8 + + xvmaddasp vs1, vs27, vs9 + + xvmaddasp vs2, vs30, vs10 + + + xvmaddasp vs3, vs31, vs11 + + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP16(\Index,64) +.endif + +.endm + +.macro KERNEL1x4_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP8(\Index,16+\OffsetA)(\AREG) + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs9 + + +.if \IsLast==1 + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP8(\Index,32) +.endif + +.endm + + +.macro SAVE1x4 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) +#endif + /* aggregate */ + xvaddsp vs0,vs0,vs2 + xvaddsp vs1,vs1,vs3 + xvaddsp vs0,vs1,vs0 +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r 
+#endif + stxv vs16, 0(CO) + + addi CO,CO,16 + +.endm + +/* M=2 N=1*/ +.macro Zero1x2 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2,vs2,vs2 + xxlxor vs3,vs3,vs3 +.endm + +.macro KERNEL1x2 + KERNEL1x2_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL1x2_4 OffsetA,OffsetB, Index,IsLast + KERNEL1x2_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x2_2 OffsetA,OffsetB, Index,IsLast + KERNEL1x2_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + /* + we will calculate 1 alone then will add it to batched ones + */ +.macro KERNEL1x2_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v3, DISP2(\Index, 0+\OffsetB)(\AREG) + lxssp v4, DISP2(\Index, 4+\OffsetB)(\AREG) + lxssp v5, DISP1(\Index, 0+\OffsetA)(\BREG) + + +.if \First==1 + xvmuldp vs2, vs37, vs35 + xvmuldp vs3, vs37, vs36 + +.else + xsmaddadp vs2, vs37, vs35 + xsmaddadp vs3, vs37, vs36 + .endif + + addi \AREG, \AREG, DISP2(\Index,8) + addi \BREG, \BREG, DISP1(\Index,4) + +.endm + + + + +.macro KERNEL1x2_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\AREG) + lxv vs10, DISP8(\Index, 16+\OffsetB)(\AREG) + + lxv vs26, DISP4(\Index, 0+\OffsetA)(\BREG) + + xxmrglw vs5, vs26,vs26 + xxmrghw vs6, vs26,vs26 + + xvmaddasp vs0, vs8, vs5 + xvmaddasp vs1, vs10, vs6 + + +.if \IsLast==1 + addi \AREG, \AREG, DISP8(\Index,32) + addi \BREG, \BREG, DISP4(\Index,16) +.endif + +.endm + +.macro KERNEL1x2_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxssp v3, DISP4(\Index, 0+\OffsetB)(\AREG) + lxssp v4, DISP4(\Index, 4+\OffsetB)(\AREG) + lxssp v7, DISP4(\Index, 8+\OffsetB)(\AREG) + lxssp v8, DISP4(\Index, 12+\OffsetB)(\AREG) + lxssp v5, DISP2(\Index, 0+\OffsetA)(\BREG) + lxssp v6, DISP2(\Index, 4+\OffsetA)(\BREG) + + + xsmaddadp vs2, vs37, vs35 + xsmaddadp vs3, vs37, vs36 + + xsmaddadp vs2, vs38, vs39 + xsmaddadp vs3, vs38, vs40 + + + addi \AREG, \AREG, DISP4(\Index,16) + addi \BREG, \BREG, DISP2(\Index,8) +.endm + + +.macro SAVE1x2 + +#ifndef TRMMKERNEL + lxssp v4 , 0(CO) + lxssp v5 , 4(CO) + +#endif + + /*convert alpha_r for multiply*/ + xscvspdp vs16,alpha_r + + /*aggregate vectors 1x2_4 */ + xxpermdi vs4,vs0,vs0,2 + xxpermdi vs5,vs1,vs1,2 + xvaddsp vs0,vs0,vs4 + xvaddsp vs1,vs1,vs5 + xvaddsp vs0,vs0,vs1 +/*aggregate vectors 1x1_2 and 1x1_1 into 1x2_4*/ + xscvspdp vs5, vs0 + xxspltw vs6, vs0, 1 + xscvspdp vs6,vs6 + xsadddp vs2,vs2,vs6 + xsadddp vs3,vs3,vs5 + + /**** store last two words*/ +#if defined(TRMMKERNEL) + xsmuldp vs36,vs2, vs16 + xsmuldp vs37,vs3, vs16 + +#else + xsmaddadp vs36,vs2, vs16 + xsmaddadp vs37,vs3, vs16 +#endif + + stxssp v4, 0(CO) + stxssp v5, 4(CO) + + addi CO,CO,8 + +.endm +/*///////////////// N=1 M=1 //////////////////*/ +.macro Zero1x1 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2,vs2 + xxlxor vs3,vs3,vs3 + xxlxor vs4,vs4,vs4 +.endm + +.macro KERNEL1x1 + KERNEL1x1_1 AO,BO, 1, 0,0,0 +.endm + +.macro KERNEL1x1_16 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_I_16 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x1_8 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_I_8 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x1_4 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x1_2 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + /* + we will calculate 1 alone ( FIRST==1 to zero vs4) + */ +.macro KERNEL1x1_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v3, DISP1(\Index, 0+\OffsetB)(\AREG) + lxssp v5, DISP1(\Index, 0+\OffsetA)(\BREG) + 
+ +.if \First==1 + xvmuldp vs4, vs37, vs35 + +.else + xsmaddadp vs4, vs37, vs35 + .endif + + addi \AREG, \AREG, DISP1(\Index,4) + addi \BREG, \BREG, DISP1(\Index,4) + +.endm + + +.macro KERNEL1x1_I_16 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP16(\Index, 0+\OffsetB)(\AREG) + lxv vs9, DISP16(\Index, 16+\OffsetB)(\AREG) + lxv vs10, DISP16(\Index, 32+0+\OffsetB)(\AREG) + lxv vs11, DISP16(\Index, 32+ 16+\OffsetB)(\AREG) + lxv vs26, DISP16(\Index, 0+\OffsetA)(\BREG) + lxv vs16, DISP16(\Index, 16+\OffsetA)(\BREG) + lxv vs17, DISP16(\Index, 32+0+\OffsetA)(\BREG) + lxv vs18, DISP16(\Index, 32+16+\OffsetA)(\BREG) + xvmaddasp vs0, vs8, vs26 + xvmaddasp vs1, vs9, vs16 + xvmaddasp vs2, vs10, vs17 + xvmaddasp vs3, vs11, vs18 +.if \IsLast==1 + addi \AREG, \AREG, DISP16(\Index,64) + addi \BREG, \BREG, DISP16(\Index,64) +.endif + +.endm + +.macro KERNEL1x1_I_8 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\AREG) + lxv vs9, DISP8(\Index, 16+\OffsetB)(\AREG) + lxv vs26, DISP8(\Index, 0+\OffsetA)(\BREG) + lxv vs16, DISP8(\Index, 16+\OffsetA)(\BREG) + xvmaddasp vs0, vs8, vs26 + xvmaddasp vs1, vs9, vs16 + +.if \IsLast==1 + addi \AREG, \AREG, DISP8(\Index,32) + addi \BREG, \BREG, DISP8(\Index,32) +.endif + +.endm + + +.macro KERNEL1x1_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP4(\Index, 0+\OffsetB)(\AREG) + lxv vs26, DISP4(\Index, 0+\OffsetA)(\BREG) + + xvmaddasp vs0, vs8, vs26 + + +.if \IsLast==1 + addi \AREG, \AREG, DISP4(\Index,16) + addi \BREG, \BREG, DISP4(\Index,16) +.endif + +.endm + +.macro KERNEL1x1_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\AREG) + lxsd v5, DISP2(\Index, 0+\OffsetA)(\BREG) + + xvmaddasp vs0, vs36, vs37 + + addi \AREG, \AREG, DISP2(\Index,8) + addi \BREG, \BREG, DISP2(\Index,8) +.endm + + +.macro SAVE1x1 + +#ifndef TRMMKERNEL + lxssp v4 , 0(CO) + +#endif + + /*convert alpha_r for multiply*/ + xscvspdp vs16,alpha_r + + /*aggregate vectors */ + xvaddsp vs0,vs0,vs1 + xvaddsp vs2,vs2,vs3 + xvaddsp vs0,vs0,vs2 + + xxpermdi vs7,vs0,vs0,2 + xvaddsp vs0,vs0,vs7 +/*aggregate vectors 1x1_2 and 1x1_1 into 1x1_4*/ + xscvspdp vs5, vs0 + xxspltw vs6, vs0, 1 + xscvspdp vs6,vs6 + xsadddp vs7,vs5,vs6 + xsadddp vs4,vs4,vs7 + + /**** store last two words*/ +#if defined(TRMMKERNEL) + xsmuldp vs36,vs4, vs16 + +#else + xsmaddadp vs36,vs4, vs16 +#endif + + stxssp v4, 0(CO) + + addi CO,CO,4 + +.endm + + + + +/****************************TRMM POINTER REFRESH MACROSES*************************/ + +.macro SHIFT_REG REG1,REG2,SHIFT_VAL + .if \SHIFT_VAL==16 + slwi \REG1, \REG2, 6 + .elseif \SHIFT_VAL==8 + slwi \REG1, \REG2, 5 + .elseif \SHIFT_VAL==4 + slwi \REG1, \REG2, 4 + .elseif \SHIFT_VAL==2 + slwi \REG1, \REG2, 3 + .elseif \SHIFT_VAL==1 + slwi \REG1, \REG2, 2 + .endif +.endm + +/* +//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// ptrbb = bb; +// #else +// ptrba += off*16; +// ptrbb = bb + off*2; +// #endif +*/ +.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B + #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /* ptrbb = bb;*/ + mr \PTR_B,\B_VAL /* refresh BPOINT */ + + #else + /* + // ptrba =ptrba+ off*C_A; + // ptrbb = bb + off*C_B; + */ + SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ + SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ + add \PTR_B, \B_VAL , T4 /* Add values to BO */ + add \PTR_A, \PTR_A, T2 /* Add values to AO */ + #endif +.endm + + +/* +// #if (defined(LEFT) && 
!defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +// temp = bk-off; +// #elif defined(LEFT) +// temp = off+16; // number of values in A +// #else +// temp = off+2; // number of values in B +// #endif +*/ +.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B + #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + /* temp = bk-off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + + #elif defined(LEFT) + /* temp = off+INCR_A; // number of values in A */ + addi \TEMP_BK, \OFF_VAL, \INCR_A + #else + /* temp = off+INCR_B // number of values in B*/ + addi \TEMP_BK,\OFF_VAL, \INCR_B + #endif + +.endm +/* +// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// temp = bk - off; +// #ifdef LEFT +// temp -= 16; // number of values in A +// #else +// temp -= 2; // number of values in B +// #endif +// ptrba += temp*16; +// ptrbb += temp*2; +// #endif + +// #ifdef LEFT +// off += 16; // number of values in A +// #endif +*/ + + +.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B + + #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /*temp = bk - off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + #ifdef LEFT + /*temp -= 8; // number of values in A*/ + addi \TEMP_BK,\TEMP_BK,-\C_A + #else + /*temp -= 4; // number of values in B*/ + addi \TEMP_BK,\TEMP_BK,-\C_B + #endif + /*ptrba += temp*C_A; + ptrbb += temp*C_B;*/ + SHIFT_REG T4,\TEMP_BK,\C_A + SHIFT_REG T2,\TEMP_BK,\C_B + add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ + add \PTR_B, \PTR_B,T2 + + #endif + + #ifdef LEFT + /*off += 8; // number of values in A*/ + addi \OFF_VAL,\OFF_VAL,\C_A + #endif .endm \ No newline at end of file diff --git a/kernel/power/sgemv_n.c b/kernel/power/sgemv_n.c index 5dfb18f5b..f5c1ba729 100644 --- a/kernel/power/sgemv_n.c +++ b/kernel/power/sgemv_n.c @@ -1,470 +1,470 @@ -/*************************************************************************** -Copyright (c) 2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ -#if !defined(__VEC__) || !defined(__ALTIVEC__) -#include "../arm/gemv_n.c" - -#else - -#include "common.h" - -#define NBMAX 4096 - -static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha) -{ - - BLASLONG i; - FLOAT *a0,*a1,*a2,*a3,*b0,*b1,*b2,*b3; - FLOAT x0,x1,x2,x3,x4,x5,x6,x7; - a0 = ap[0]; - a1 = ap[1]; - a2 = ap[2]; - a3 = ap[3]; - b0 = a0 + lda4 ; - b1 = a1 + lda4 ; - b2 = a2 + lda4 ; - b3 = a3 + lda4 ; - x0 = xo[0] * *alpha; - x1 = xo[1] * *alpha; - x2 = xo[2] * *alpha; - x3 = xo[3] * *alpha; - x4 = xo[4] * *alpha; - x5 = xo[5] * *alpha; - x6 = xo[6] * *alpha; - x7 = xo[7] * *alpha; - __vector float* va0 = (__vector float*)a0; - __vector float* va1 = (__vector float*)a1; - __vector float* va2 = (__vector float*)a2; - __vector float* va3 = (__vector float*)a3; - __vector float* vb0 = (__vector float*)b0; - __vector float* vb1 = (__vector float*)b1; - __vector float* vb2 = (__vector float*)b2; - __vector float* vb3 = (__vector float*)b3; - - __vector float v_x0 = {x0,x0,x0,x0}; - __vector float v_x1 = {x1,x1,x1,x1}; - __vector float v_x2 = {x2,x2,x2,x2}; - __vector float v_x3 = {x3,x3,x3,x3}; - __vector float v_x4 = {x4,x4,x4,x4}; - __vector float v_x5 = {x5,x5,x5,x5}; - __vector float v_x6 = {x6,x6,x6,x6}; - __vector float v_x7 = {x7,x7,x7,x7}; - __vector float* v_y =(__vector float*)y; - - for ( i=0; i< n/4; i++) - { - register __vector float vy=v_y[i]; - vy += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ; - vy += v_x4 * vb0[i] + v_x5 * vb1[i] + v_x6 * vb2[i] + v_x7 * vb3[i] ; - v_y[i] =vy; - } - -} - -static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) -{ - BLASLONG i; - FLOAT x0,x1,x2,x3; - x0 = xo[0] * *alpha; - x1 = xo[1] * *alpha; - x2 = xo[2] * *alpha; - x3 = xo[3] * *alpha; - __vector float v_x0 = {x0,x0,x0,x0}; - __vector float v_x1 = {x1,x1,x1,x1}; - __vector float v_x2 = {x2,x2,x2,x2}; - __vector float v_x3 = {x3,x3,x3,x3}; - __vector float* v_y =(__vector float*)y; - __vector float* va0 = (__vector float*)ap[0]; - __vector float* va1 = (__vector float*)ap[1]; - __vector float* va2 = (__vector float*)ap[2]; - __vector float* va3 = (__vector float*)ap[3]; - - for ( i=0; i< n/4; i++ ) - { - register __vector float vy=v_y[i]; - vy += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ; - v_y[i] =vy; - } - -} - -static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) -{ - - BLASLONG i; - FLOAT x0,x1; - x0 = x[0] * *alpha; - x1 = x[1] * *alpha; - __vector float v_x0 = {x0,x0,x0,x0}; - __vector float v_x1 = {x1,x1,x1,x1}; - __vector float* v_y =(__vector float*)y; - __vector float* va0 = (__vector float*)ap[0]; - __vector float* va1 = (__vector float*)ap[1]; - - for ( i=0; i< n/4; i++ ) - { - v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] ; - } - -} - - -static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) -{ - - BLASLONG i; - FLOAT x0 ; - x0 = x[0] * *alpha; - __vector float v_x0 = {x0,x0,x0,x0}; - __vector float* v_y =(__vector float*)y; - __vector float* va0 = (__vector float*)ap; - - for ( i=0; i< n/4; i++ ) - { - v_y[i] += v_x0 * va0[i] ; - } - -} - -static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) -{ - BLASLONG i; - - for ( i=0; i> 3 ; - n2 = n & 7 ; - } - else - { - n1 = n >> 2 ; - n2 = n & 3 ; - - } - - m3 = m & 3 ; - m1 = m & -4 ; - m2 = (m & (NBMAX-1)) - m3 ; - - - y_ptr = y; - - BLASLONG NB = 
NBMAX; - - while ( NB == NBMAX ) - { - - m1 -= NB; - if ( m1 < 0) - { - if ( m2 == 0 ) break; - NB = m2; - } - - a_ptr = a; - x_ptr = x; - - ap[0] = a_ptr; - ap[1] = a_ptr + lda; - ap[2] = ap[1] + lda; - ap[3] = ap[2] + lda; - - if ( inc_y != 1 ) - memset(ybuffer,0,NB*4); - else - ybuffer = y_ptr; - - if ( inc_x == 1 ) - { - - - for( i = 0; i < n1 ; i++) - { - sgemv_kernel_4x8(NB,ap,x_ptr,ybuffer,lda4,&alpha); - ap[0] += lda8; - ap[1] += lda8; - ap[2] += lda8; - ap[3] += lda8; - a_ptr += lda8; - x_ptr += 8; - } - - - if ( n2 & 4 ) - { - sgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - x_ptr += 4; - } - - if ( n2 & 2 ) - { - sgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha); - a_ptr += lda*2; - x_ptr += 2; - } - - - if ( n2 & 1 ) - { - sgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); - a_ptr += lda; - x_ptr += 1; - } - - - } - else - { - - for( i = 0; i < n1 ; i++) - { - xbuffer[0] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[1] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[2] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[3] = x_ptr[0]; - x_ptr += inc_x; - sgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - } - - for( i = 0; i < n2 ; i++) - { - xbuffer[0] = x_ptr[0]; - x_ptr += inc_x; - sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha); - a_ptr += lda; - - } - - } - - a += NB; - if ( inc_y != 1 ) - { - add_y(NB,ybuffer,y_ptr,inc_y); - y_ptr += NB * inc_y; - } - else - y_ptr += NB ; - - } - - if ( m3 == 0 ) return(0); - - if ( m3 == 3 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - FLOAT temp2 = 0.0; - if ( lda == 3 && inc_x ==1 ) - { - - for( i = 0; i < ( n & -4 ); i+=4 ) - { - - temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; - temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; - temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; - - temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; - temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; - temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; - - a_ptr += 12; - x_ptr += 4; - } - - for( ; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - temp2 += a_ptr[2] * x_ptr[0]; - a_ptr += 3; - x_ptr ++; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - temp2 += a_ptr[2] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - - - } - - } - y_ptr[0] += alpha * temp0; - y_ptr += inc_y; - y_ptr[0] += alpha * temp1; - y_ptr += inc_y; - y_ptr[0] += alpha * temp2; - return(0); - } - - - if ( m3 == 2 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - if ( lda == 2 && inc_x ==1 ) - { - - for( i = 0; i < (n & -4) ; i+=4 ) - { - temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; - temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; - temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; - temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; - a_ptr += 8; - x_ptr += 4; - - } - - - for( ; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - a_ptr += 2; - x_ptr ++; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - - - } - - } - y_ptr[0] += alpha * temp0; - y_ptr += inc_y; - y_ptr[0] += alpha * temp1; - return(0); - } - - if ( m3 == 1 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp = 0.0; - if ( lda == 1 && inc_x ==1 ) - { - - for( i = 0; i < (n 
& -4); i+=4 ) - { - temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; - - } - - for( ; i < n; i++ ) - { - temp += a_ptr[i] * x_ptr[i]; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp += a_ptr[0] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - } - - } - y_ptr[0] += alpha * temp; - return(0); - } - - - return(0); -} - -#endif - +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ +#if !defined(__VEC__) || !defined(__ALTIVEC__) +#include "../arm/gemv_n.c" + +#else + +#include "common.h" + +#define NBMAX 4096 + +static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha) +{ + + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3,*b0,*b1,*b2,*b3; + FLOAT x0,x1,x2,x3,x4,x5,x6,x7; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + b0 = a0 + lda4 ; + b1 = a1 + lda4 ; + b2 = a2 + lda4 ; + b3 = a3 + lda4 ; + x0 = xo[0] * *alpha; + x1 = xo[1] * *alpha; + x2 = xo[2] * *alpha; + x3 = xo[3] * *alpha; + x4 = xo[4] * *alpha; + x5 = xo[5] * *alpha; + x6 = xo[6] * *alpha; + x7 = xo[7] * *alpha; + __vector float* va0 = (__vector float*)a0; + __vector float* va1 = (__vector float*)a1; + __vector float* va2 = (__vector float*)a2; + __vector float* va3 = (__vector float*)a3; + __vector float* vb0 = (__vector float*)b0; + __vector float* vb1 = (__vector float*)b1; + __vector float* vb2 = (__vector float*)b2; + __vector float* vb3 = (__vector float*)b3; + + __vector float v_x0 = {x0,x0,x0,x0}; + __vector float v_x1 = {x1,x1,x1,x1}; + __vector float v_x2 = {x2,x2,x2,x2}; + __vector float v_x3 = {x3,x3,x3,x3}; + __vector float v_x4 = {x4,x4,x4,x4}; + __vector float v_x5 = {x5,x5,x5,x5}; + __vector float v_x6 = {x6,x6,x6,x6}; + __vector float v_x7 = {x7,x7,x7,x7}; + __vector float* v_y =(__vector float*)y; + + for ( i=0; i< n/4; i++) + { + register __vector float vy=v_y[i]; + vy += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ; + vy += v_x4 * vb0[i] + v_x5 * vb1[i] + v_x6 * vb2[i] + v_x7 * vb3[i] ; + v_y[i] =vy; + } + +} + +static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) +{ + BLASLONG i; + FLOAT x0,x1,x2,x3; + x0 = xo[0] * *alpha; + x1 = xo[1] * *alpha; + x2 = xo[2] * *alpha; + x3 = xo[3] * *alpha; + __vector float v_x0 = {x0,x0,x0,x0}; + __vector float v_x1 = {x1,x1,x1,x1}; + __vector float v_x2 = {x2,x2,x2,x2}; + __vector float v_x3 = {x3,x3,x3,x3}; + __vector float* v_y =(__vector float*)y; + __vector float* va0 = (__vector float*)ap[0]; + __vector float* va1 = (__vector float*)ap[1]; + __vector float* va2 = (__vector float*)ap[2]; + __vector float* va3 = (__vector float*)ap[3]; + + for ( i=0; i< n/4; i++ ) + { + register __vector float vy=v_y[i]; + vy += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ; + v_y[i] =vy; + } + +} + +static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG i; + FLOAT x0,x1; + x0 = x[0] * *alpha; + x1 = x[1] * *alpha; + __vector float v_x0 = {x0,x0,x0,x0}; + __vector float v_x1 = {x1,x1,x1,x1}; + __vector float* v_y =(__vector float*)y; + __vector float* va0 = (__vector float*)ap[0]; + __vector float* va1 = (__vector float*)ap[1]; + + for ( i=0; i< n/4; i++ ) + { + v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] ; + } + +} + + +static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG i; + FLOAT x0 ; + x0 = x[0] * *alpha; + __vector float v_x0 = {x0,x0,x0,x0}; + __vector float* v_y =(__vector float*)y; + __vector float* va0 = (__vector float*)ap; + + for ( i=0; i< n/4; i++ ) + { + v_y[i] += v_x0 * va0[i] ; + } + +} + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) +{ + BLASLONG i; + + for ( i=0; i> 3 ; + n2 = n & 7 ; + } + else + { + n1 = n >> 2 ; + n2 = n & 3 ; + + } + + m3 = m & 3 ; + m1 = m & -4 ; + m2 = (m & (NBMAX-1)) - m3 ; + + + y_ptr = y; + + BLASLONG NB = 
NBMAX; + + while ( NB == NBMAX ) + { + + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + a_ptr = a; + x_ptr = x; + + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + + if ( inc_y != 1 ) + memset(ybuffer,0,NB*4); + else + ybuffer = y_ptr; + + if ( inc_x == 1 ) + { + + + for( i = 0; i < n1 ; i++) + { + sgemv_kernel_4x8(NB,ap,x_ptr,ybuffer,lda4,&alpha); + ap[0] += lda8; + ap[1] += lda8; + ap[2] += lda8; + ap[3] += lda8; + a_ptr += lda8; + x_ptr += 8; + } + + + if ( n2 & 4 ) + { + sgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + x_ptr += 4; + } + + if ( n2 & 2 ) + { + sgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha); + a_ptr += lda*2; + x_ptr += 2; + } + + + if ( n2 & 1 ) + { + sgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); + a_ptr += lda; + x_ptr += 1; + } + + + } + else + { + + for( i = 0; i < n1 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[1] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[2] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[3] = x_ptr[0]; + x_ptr += inc_x; + sgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + } + + for( i = 0; i < n2 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha); + a_ptr += lda; + + } + + } + + a += NB; + if ( inc_y != 1 ) + { + add_y(NB,ybuffer,y_ptr,inc_y); + y_ptr += NB * inc_y; + } + else + y_ptr += NB ; + + } + + if ( m3 == 0 ) return(0); + + if ( m3 == 3 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + if ( lda == 3 && inc_x ==1 ) + { + + for( i = 0; i < ( n & -4 ); i+=4 ) + { + + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; + temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + + temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; + temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; + temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; + + a_ptr += 12; + x_ptr += 4; + } + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += 3; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + y_ptr += inc_y; + y_ptr[0] += alpha * temp2; + return(0); + } + + + if ( m3 == 2 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + if ( lda == 2 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4) ; i+=4 ) + { + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; + temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; + a_ptr += 8; + x_ptr += 4; + + } + + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += 2; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + return(0); + } + + if ( m3 == 1 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp = 0.0; + if ( lda == 1 && inc_x ==1 ) + { + + for( i = 0; i < (n 
& -4); i+=4 ) + { + temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; + + } + + for( ; i < n; i++ ) + { + temp += a_ptr[i] * x_ptr[i]; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp += a_ptr[0] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + } + + } + y_ptr[0] += alpha * temp; + return(0); + } + + + return(0); +} + +#endif + diff --git a/kernel/power/sgemv_n_8.c b/kernel/power/sgemv_n_8.c index 64696236a..0edb79129 100644 --- a/kernel/power/sgemv_n_8.c +++ b/kernel/power/sgemv_n_8.c @@ -1,514 +1,514 @@ -/*************************************************************************** -Copyright (c) 2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - - -/****Note*** -UnUsed kernel -This kernel works. 
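Both the 4x file above and the unused 8x variant share the same NBMAX = 4096 blocking driver: y is updated at most NBMAX elements at a time so the active slice stays cache resident, with a temporary ybuffer used when inc_y != 1. A simplified sketch of that blocking, with hypothetical names (the real code folds the remainder into the m1/m2/m3 arithmetic and handles the final m % 4 rows separately), is:

#include <stddef.h>

#define NBMAX 4096   /* block size used by these sgemv_n kernels */

/* Hypothetical sketch of the row-blocking loop: process y in chunks of at
 * most NBMAX rows; the last, shorter chunk simply shrinks NB.  Matrix and
 * vector arguments are elided, only the block bookkeeping is shown. */
static void gemv_n_blocked(size_t m)
{
    size_t done = 0;
    while (done < m) {
        size_t NB = (m - done > NBMAX) ? NBMAX : (m - done);
        /* ... run the 4x8 / 4x4 / 4x2 / 4x1 kernels on rows [done, done+NB) ... */
        done += NB;
    }
}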
But it was not competitive enough to be added in production -It could be used and tested in future or could provide barebone for switching to inline assembly -*/ - -#include "common.h" - -#define NBMAX 4096 - -static void sgemv_kernel_8x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha) -{ - - BLASLONG i; - FLOAT *a0,*a1,*a2,*a3,*b0,*b1,*b2,*b3; - FLOAT x0,x1,x2,x3,x4,x5,x6,x7; - a0 = ap[0]; - a1 = ap[1]; - a2 = ap[2]; - a3 = ap[3]; - b0 = a0 + lda4 ; - b1 = a1 + lda4 ; - b2 = a2 + lda4 ; - b3 = a3 + lda4 ; - x0 = xo[0] * *alpha; - x1 = xo[1] * *alpha; - x2 = xo[2] * *alpha; - x3 = xo[3] * *alpha; - x4 = xo[4] * *alpha; - x5 = xo[5] * *alpha; - x6 = xo[6] * *alpha; - x7 = xo[7] * *alpha; - __vector float* va0 = (__vector float*)a0; - __vector float* va1 = (__vector float*)a1; - __vector float* va2 = (__vector float*)a2; - __vector float* va3 = (__vector float*)a3; - __vector float* vb0 = (__vector float*)b0; - __vector float* vb1 = (__vector float*)b1; - __vector float* vb2 = (__vector float*)b2; - __vector float* vb3 = (__vector float*)b3; - - register __vector float v_x0 = {x0,x0,x0,x0}; - register __vector float v_x1 = {x1,x1,x1,x1}; - register __vector float v_x2 = {x2,x2,x2,x2}; - register __vector float v_x3 = {x3,x3,x3,x3}; - register __vector float v_x4 = {x4,x4,x4,x4}; - register __vector float v_x5 = {x5,x5,x5,x5}; - register __vector float v_x6 = {x6,x6,x6,x6}; - register __vector float v_x7 = {x7,x7,x7,x7}; - __vector float* v_y =(__vector float*)y; - - for ( i=0; i< n/4; i+=2) - { - register __vector float vy_1=v_y[i]; - register __vector float vy_2=v_y[i+1]; - register __vector float va0_1=va0[i] ; - register __vector float va0_2=va0[i+1] ; - register __vector float va1_1=va1[i] ; - register __vector float va1_2=va1[i+1] ; - register __vector float va2_1=va2[i] ; - register __vector float va2_2=va2[i+1] ; - register __vector float va3_1=va3[i] ; - register __vector float va3_2=va3[i+1] ; - register __vector float vb0_1=vb0[i] ; - register __vector float vb0_2=vb0[i+1] ; - register __vector float vb1_1=vb1[i] ; - register __vector float vb1_2=vb1[i+1] ; - register __vector float vb2_1=vb2[i] ; - register __vector float vb2_2=vb2[i+1] ; - register __vector float vb3_1=vb3[i] ; - register __vector float vb3_2=vb3[i+1] ; - vy_1 += v_x0 * va0_1 + v_x1 * va1_1 + v_x2 * va2_1 + v_x3 * va3_1 ; - vy_1 += v_x4 * vb0_1 + v_x5 * vb1_1 + v_x6 * vb2_1 + v_x7 * vb3_1 ; - vy_2 += v_x0 * va0_2 + v_x1 * va1_2 + v_x2 * va2_2 + v_x3 * va3_2 ; - vy_2 += v_x4 * vb0_2 + v_x5 * vb1_2 + v_x6 * vb2_2 + v_x7 * vb3_2 ; - v_y[i] =vy_1; - v_y[i+1] =vy_2; - } - -} - -static void sgemv_kernel_8x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) -{ - BLASLONG i; - FLOAT x0,x1,x2,x3; - x0 = xo[0] * *alpha; - x1 = xo[1] * *alpha; - x2 = xo[2] * *alpha; - x3 = xo[3] * *alpha; - __vector float v_x0 = {x0,x0,x0,x0}; - __vector float v_x1 = {x1,x1,x1,x1}; - __vector float v_x2 = {x2,x2,x2,x2}; - __vector float v_x3 = {x3,x3,x3,x3}; - __vector float* v_y =(__vector float*)y; - __vector float* va0 = (__vector float*)ap[0]; - __vector float* va1 = (__vector float*)ap[1]; - __vector float* va2 = (__vector float*)ap[2]; - __vector float* va3 = (__vector float*)ap[3]; - - for ( i=0; i< n/4; i+=2 ) - { - register __vector float vy_1=v_y[i]; - register __vector float vy_2=v_y[i+1]; - register __vector float va0_1=va0[i] ; - register __vector float va0_2=va0[i+1] ; - register __vector float va1_1=va1[i] ; - register __vector float va1_2=va1[i+1] ; - register __vector float va2_1=va2[i] ; 
- register __vector float va2_2=va2[i+1] ; - register __vector float va3_1=va3[i] ; - register __vector float va3_2=va3[i+1] ; - vy_1 += v_x0 * va0_1 + v_x1 * va1_1 + v_x2 * va2_1 + v_x3 * va3_1 ; - vy_2 += v_x0 * va0_2 + v_x1 * va1_2 + v_x2 * va2_2 + v_x3 * va3_2 ; - v_y[i] =vy_1; - v_y[i+1] =vy_2; - } - -} - -static void sgemv_kernel_8x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) -{ - - BLASLONG i; - FLOAT x0,x1; - x0 = x[0] * *alpha; - x1 = x[1] * *alpha; - __vector float v_x0 = {x0,x0,x0,x0}; - __vector float v_x1 = {x1,x1,x1,x1}; - __vector float* v_y =(__vector float*)y; - __vector float* va0 = (__vector float*)ap[0]; - __vector float* va1 = (__vector float*)ap[1]; - - for ( i=0; i< n/4; i+=2 ) - { - v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] ; - v_y[i+1] += v_x0 * va0[i+1] + v_x1 * va1[i+1] ; - } - -} - - -static void sgemv_kernel_8x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) -{ - - BLASLONG i; - FLOAT x0 ; - x0 = x[0] * *alpha; - __vector float v_x0 = {x0,x0,x0,x0}; - __vector float* v_y =(__vector float*)y; - __vector float* va0 = (__vector float*)ap; - - for ( i=0; i< n/4; i+=2 ) - { - v_y[i] += v_x0 * va0[i] ; - v_y[i+1] += v_x0 * va0[i+1] ; - } - -} - -static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) -{ - BLASLONG i; - - for ( i=0; i> 3 ; - n2 = n & 7 ; - } - else - { - n1 = n >> 2 ; - n2 = n & 3 ; - - } - - m3 = m & 7 ; - m1 = m - m3; - m2 = (m & (NBMAX-1)) - m3 ; - - - y_ptr = y; - - BLASLONG NB = NBMAX; - - while ( NB == NBMAX ) - { - - m1 -= NB; - if ( m1 < 0) - { - if ( m2 == 0 ) break; - NB = m2; - } - - a_ptr = a; - x_ptr = x; - - ap[0] = a_ptr; - ap[1] = a_ptr + lda; - ap[2] = ap[1] + lda; - ap[3] = ap[2] + lda; - - if ( inc_y != 1 ) - memset(ybuffer,0,NB*4); - else - ybuffer = y_ptr; - - if ( inc_x == 1 ) - { - - - for( i = 0; i < n1 ; i++) - { - sgemv_kernel_8x8(NB,ap,x_ptr,ybuffer,lda4,&alpha); - ap[0] += lda8; - ap[1] += lda8; - ap[2] += lda8; - ap[3] += lda8; - a_ptr += lda8; - x_ptr += 8; - } - - - if ( n2 & 4 ) - { - sgemv_kernel_8x4(NB,ap,x_ptr,ybuffer,&alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - x_ptr += 4; - } - - if ( n2 & 2 ) - { - sgemv_kernel_8x2(NB,ap,x_ptr,ybuffer,&alpha); - a_ptr += lda*2; - x_ptr += 2; - } - - - if ( n2 & 1 ) - { - sgemv_kernel_8x1(NB,a_ptr,x_ptr,ybuffer,&alpha); - a_ptr += lda; - x_ptr += 1; - } - - - } - else - { - - for( i = 0; i < n1 ; i++) - { - xbuffer[0] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[1] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[2] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[3] = x_ptr[0]; - x_ptr += inc_x; - sgemv_kernel_8x4(NB,ap,xbuffer,ybuffer,&alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - } - - for( i = 0; i < n2 ; i++) - { - xbuffer[0] = x_ptr[0]; - x_ptr += inc_x; - sgemv_kernel_8x1(NB,a_ptr,xbuffer,ybuffer,&alpha); - a_ptr += lda; - - } - - } - - a += NB; - if ( inc_y != 1 ) - { - add_y(NB,ybuffer,y_ptr,inc_y); - y_ptr += NB * inc_y; - } - else - y_ptr += NB ; - - } - - - if ( m3 & 4 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - FLOAT temp2 = 0.0; - FLOAT temp3 = 0.0; - if ( lda == 4 && inc_x ==1 ) - { - - for( i = 0; i < ( n & -4 ); i+=4 ) - { - - temp0 += a_ptr[0] * x_ptr[0] + a_ptr[4] * x_ptr[1]; - temp1 += a_ptr[1] * x_ptr[0] + a_ptr[5] * x_ptr[1]; - temp2 += a_ptr[2] * x_ptr[0] + a_ptr[6] * x_ptr[1]; - temp3 += a_ptr[3] * x_ptr[0] + a_ptr[7] * x_ptr[1]; - - temp0 += a_ptr[8] * x_ptr[2] + a_ptr[12] * x_ptr[3]; - temp1 += a_ptr[9] * 
x_ptr[2] + a_ptr[13] * x_ptr[3]; - temp2 += a_ptr[10] * x_ptr[2] + a_ptr[14] * x_ptr[3]; - temp3 += a_ptr[11] * x_ptr[2] + a_ptr[15] * x_ptr[3]; - - a_ptr += 16; - x_ptr += 4; - } - - for( ; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - temp2 += a_ptr[2] * x_ptr[0]; - temp3 += a_ptr[3] * x_ptr[0] ; - a_ptr +=4; - x_ptr ++; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - temp2 += a_ptr[2] * x_ptr[0]; - temp3 += a_ptr[3] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - - - } - - } - y_ptr[0] += alpha * temp0; - y_ptr += inc_y; - y_ptr[0] += alpha * temp1; - y_ptr += inc_y; - y_ptr[0] += alpha * temp2; - y_ptr += inc_y; - y_ptr[0] += alpha * temp3; - y_ptr += inc_y; - a += 4; - } - - - if ( m3 & 2 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - if ( lda == 2 && inc_x ==1 ) - { - - for( i = 0; i < (n & -4) ; i+=4 ) - { - temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; - temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; - temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; - temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; - a_ptr += 8; - x_ptr += 4; - - } - - - for( ; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - a_ptr += 2; - x_ptr ++; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - - - } - - } - y_ptr[0] += alpha * temp0; - y_ptr += inc_y; - y_ptr[0] += alpha * temp1; - y_ptr += inc_y; - a += 2; - } - - if ( m3 & 1 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp = 0.0; - if ( lda == 1 && inc_x ==1 ) - { - - for( i = 0; i < (n & -4); i+=4 ) - { - temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; - - } - - for( ; i < n; i++ ) - { - temp += a_ptr[i] * x_ptr[i]; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp += a_ptr[0] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - } - - } - y_ptr[0] += alpha * temp; - - - } - - - return(0); -} - - +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
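The 8x kernels that follow differ from the 4x file mainly in unrolling the vector loop by two, so each iteration carries two independent accumulators per column and keeps more loads in flight. A portable sketch of that pattern using GCC vector extensions (hypothetical helper, assuming 16-byte aligned pointers and n a multiple of 8) is:

#include <stddef.h>

typedef float v4sf __attribute__((vector_size(16)));

/* Two-way unrolled axpy on one column: y[0..n) += xk * a[0..n).
 * Assumes 16-byte alignment and n % 8 == 0; the real kernels apply this
 * simultaneously to four (or eight) columns per call. */
static void saxpy_unroll2(size_t n, const float *a, float xk, float *y)
{
    v4sf vx = {xk, xk, xk, xk};              /* splat alpha*x[k] */
    const v4sf *va = (const v4sf *)a;
    v4sf *vy = (v4sf *)y;
    for (size_t i = 0; i < n / 4; i += 2) {  /* 8 floats per iteration */
        v4sf y0 = vy[i]     + vx * va[i];
        v4sf y1 = vy[i + 1] + vx * va[i + 1];
        vy[i]     = y0;
        vy[i + 1] = y1;
    }
}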
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +/****Note*** +UnUsed kernel +This kernel works. But it was not competitive enough to be added in production +It could be used and tested in future or could provide barebone for switching to inline assembly +*/ + +#include "common.h" + +#define NBMAX 4096 + +static void sgemv_kernel_8x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha) +{ + + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3,*b0,*b1,*b2,*b3; + FLOAT x0,x1,x2,x3,x4,x5,x6,x7; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + b0 = a0 + lda4 ; + b1 = a1 + lda4 ; + b2 = a2 + lda4 ; + b3 = a3 + lda4 ; + x0 = xo[0] * *alpha; + x1 = xo[1] * *alpha; + x2 = xo[2] * *alpha; + x3 = xo[3] * *alpha; + x4 = xo[4] * *alpha; + x5 = xo[5] * *alpha; + x6 = xo[6] * *alpha; + x7 = xo[7] * *alpha; + __vector float* va0 = (__vector float*)a0; + __vector float* va1 = (__vector float*)a1; + __vector float* va2 = (__vector float*)a2; + __vector float* va3 = (__vector float*)a3; + __vector float* vb0 = (__vector float*)b0; + __vector float* vb1 = (__vector float*)b1; + __vector float* vb2 = (__vector float*)b2; + __vector float* vb3 = (__vector float*)b3; + + register __vector float v_x0 = {x0,x0,x0,x0}; + register __vector float v_x1 = {x1,x1,x1,x1}; + register __vector float v_x2 = {x2,x2,x2,x2}; + register __vector float v_x3 = {x3,x3,x3,x3}; + register __vector float v_x4 = {x4,x4,x4,x4}; + register __vector float v_x5 = {x5,x5,x5,x5}; + register __vector float v_x6 = {x6,x6,x6,x6}; + register __vector float v_x7 = {x7,x7,x7,x7}; + __vector float* v_y =(__vector float*)y; + + for ( i=0; i< n/4; i+=2) + { + register __vector float vy_1=v_y[i]; + register __vector float vy_2=v_y[i+1]; + register __vector float va0_1=va0[i] ; + register __vector float va0_2=va0[i+1] ; + register __vector float va1_1=va1[i] ; + register __vector float va1_2=va1[i+1] ; + register __vector float va2_1=va2[i] ; + register __vector float va2_2=va2[i+1] ; + register __vector float va3_1=va3[i] ; + register __vector float va3_2=va3[i+1] ; + register __vector float vb0_1=vb0[i] ; + register __vector float vb0_2=vb0[i+1] ; + register __vector float vb1_1=vb1[i] ; + register __vector float vb1_2=vb1[i+1] ; + register __vector float vb2_1=vb2[i] ; + register __vector float vb2_2=vb2[i+1] ; + register __vector float vb3_1=vb3[i] ; + register __vector float vb3_2=vb3[i+1] ; + vy_1 += v_x0 * va0_1 + v_x1 * va1_1 + v_x2 * va2_1 + v_x3 * va3_1 ; + vy_1 += v_x4 * vb0_1 + v_x5 * vb1_1 + v_x6 * vb2_1 + v_x7 * vb3_1 ; + vy_2 += v_x0 * va0_2 + v_x1 * va1_2 + v_x2 * va2_2 + v_x3 * va3_2 ; + vy_2 += v_x4 * vb0_2 + v_x5 * vb1_2 + v_x6 * vb2_2 + v_x7 * vb3_2 ; + v_y[i] =vy_1; + v_y[i+1] =vy_2; + } + +} + +static void sgemv_kernel_8x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) +{ + BLASLONG i; + FLOAT x0,x1,x2,x3; + x0 = xo[0] * *alpha; + x1 = xo[1] * *alpha; + x2 = xo[2] * *alpha; + x3 = xo[3] * *alpha; + __vector float v_x0 = {x0,x0,x0,x0}; + __vector float v_x1 
= {x1,x1,x1,x1}; + __vector float v_x2 = {x2,x2,x2,x2}; + __vector float v_x3 = {x3,x3,x3,x3}; + __vector float* v_y =(__vector float*)y; + __vector float* va0 = (__vector float*)ap[0]; + __vector float* va1 = (__vector float*)ap[1]; + __vector float* va2 = (__vector float*)ap[2]; + __vector float* va3 = (__vector float*)ap[3]; + + for ( i=0; i< n/4; i+=2 ) + { + register __vector float vy_1=v_y[i]; + register __vector float vy_2=v_y[i+1]; + register __vector float va0_1=va0[i] ; + register __vector float va0_2=va0[i+1] ; + register __vector float va1_1=va1[i] ; + register __vector float va1_2=va1[i+1] ; + register __vector float va2_1=va2[i] ; + register __vector float va2_2=va2[i+1] ; + register __vector float va3_1=va3[i] ; + register __vector float va3_2=va3[i+1] ; + vy_1 += v_x0 * va0_1 + v_x1 * va1_1 + v_x2 * va2_1 + v_x3 * va3_1 ; + vy_2 += v_x0 * va0_2 + v_x1 * va1_2 + v_x2 * va2_2 + v_x3 * va3_2 ; + v_y[i] =vy_1; + v_y[i+1] =vy_2; + } + +} + +static void sgemv_kernel_8x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG i; + FLOAT x0,x1; + x0 = x[0] * *alpha; + x1 = x[1] * *alpha; + __vector float v_x0 = {x0,x0,x0,x0}; + __vector float v_x1 = {x1,x1,x1,x1}; + __vector float* v_y =(__vector float*)y; + __vector float* va0 = (__vector float*)ap[0]; + __vector float* va1 = (__vector float*)ap[1]; + + for ( i=0; i< n/4; i+=2 ) + { + v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] ; + v_y[i+1] += v_x0 * va0[i+1] + v_x1 * va1[i+1] ; + } + +} + + +static void sgemv_kernel_8x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG i; + FLOAT x0 ; + x0 = x[0] * *alpha; + __vector float v_x0 = {x0,x0,x0,x0}; + __vector float* v_y =(__vector float*)y; + __vector float* va0 = (__vector float*)ap; + + for ( i=0; i< n/4; i+=2 ) + { + v_y[i] += v_x0 * va0[i] ; + v_y[i+1] += v_x0 * va0[i+1] ; + } + +} + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) +{ + BLASLONG i; + + for ( i=0; i> 3 ; + n2 = n & 7 ; + } + else + { + n1 = n >> 2 ; + n2 = n & 3 ; + + } + + m3 = m & 7 ; + m1 = m - m3; + m2 = (m & (NBMAX-1)) - m3 ; + + + y_ptr = y; + + BLASLONG NB = NBMAX; + + while ( NB == NBMAX ) + { + + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + a_ptr = a; + x_ptr = x; + + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + + if ( inc_y != 1 ) + memset(ybuffer,0,NB*4); + else + ybuffer = y_ptr; + + if ( inc_x == 1 ) + { + + + for( i = 0; i < n1 ; i++) + { + sgemv_kernel_8x8(NB,ap,x_ptr,ybuffer,lda4,&alpha); + ap[0] += lda8; + ap[1] += lda8; + ap[2] += lda8; + ap[3] += lda8; + a_ptr += lda8; + x_ptr += 8; + } + + + if ( n2 & 4 ) + { + sgemv_kernel_8x4(NB,ap,x_ptr,ybuffer,&alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + x_ptr += 4; + } + + if ( n2 & 2 ) + { + sgemv_kernel_8x2(NB,ap,x_ptr,ybuffer,&alpha); + a_ptr += lda*2; + x_ptr += 2; + } + + + if ( n2 & 1 ) + { + sgemv_kernel_8x1(NB,a_ptr,x_ptr,ybuffer,&alpha); + a_ptr += lda; + x_ptr += 1; + } + + + } + else + { + + for( i = 0; i < n1 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[1] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[2] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[3] = x_ptr[0]; + x_ptr += inc_x; + sgemv_kernel_8x4(NB,ap,xbuffer,ybuffer,&alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + } + + for( i = 0; i < n2 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + sgemv_kernel_8x1(NB,a_ptr,xbuffer,ybuffer,&alpha); + a_ptr += 
lda; + + } + + } + + a += NB; + if ( inc_y != 1 ) + { + add_y(NB,ybuffer,y_ptr,inc_y); + y_ptr += NB * inc_y; + } + else + y_ptr += NB ; + + } + + + if ( m3 & 4 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + FLOAT temp3 = 0.0; + if ( lda == 4 && inc_x ==1 ) + { + + for( i = 0; i < ( n & -4 ); i+=4 ) + { + + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[4] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + temp2 += a_ptr[2] * x_ptr[0] + a_ptr[6] * x_ptr[1]; + temp3 += a_ptr[3] * x_ptr[0] + a_ptr[7] * x_ptr[1]; + + temp0 += a_ptr[8] * x_ptr[2] + a_ptr[12] * x_ptr[3]; + temp1 += a_ptr[9] * x_ptr[2] + a_ptr[13] * x_ptr[3]; + temp2 += a_ptr[10] * x_ptr[2] + a_ptr[14] * x_ptr[3]; + temp3 += a_ptr[11] * x_ptr[2] + a_ptr[15] * x_ptr[3]; + + a_ptr += 16; + x_ptr += 4; + } + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + temp3 += a_ptr[3] * x_ptr[0] ; + a_ptr +=4; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + temp3 += a_ptr[3] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + y_ptr += inc_y; + y_ptr[0] += alpha * temp2; + y_ptr += inc_y; + y_ptr[0] += alpha * temp3; + y_ptr += inc_y; + a += 4; + } + + + if ( m3 & 2 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + if ( lda == 2 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4) ; i+=4 ) + { + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; + temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; + a_ptr += 8; + x_ptr += 4; + + } + + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += 2; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + y_ptr += inc_y; + a += 2; + } + + if ( m3 & 1 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp = 0.0; + if ( lda == 1 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4); i+=4 ) + { + temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; + + } + + for( ; i < n; i++ ) + { + temp += a_ptr[i] * x_ptr[i]; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp += a_ptr[0] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + } + + } + y_ptr[0] += alpha * temp; + + + } + + + return(0); +} + + diff --git a/kernel/power/sgemv_t.c b/kernel/power/sgemv_t.c index 62c517a9d..c3fc8e77a 100644 --- a/kernel/power/sgemv_t.c +++ b/kernel/power/sgemv_t.c @@ -1,484 +1,484 @@ -/*************************************************************************** -Copyright (c) 2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. 
Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - *****************************************************************************/ -#if !defined(__VEC__) || !defined(__ALTIVEC__) -#include "../arm/gemv_t.c" - -#else - -#include "common.h" - -#define NBMAX 2048 - -#include - -static void sgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { - BLASLONG i; - FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; - __vector float *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x; - register __vector float temp0 = {0,0,0,0}; - register __vector float temp1 = {0,0,0,0}; - register __vector float temp2 = {0,0,0,0}; - register __vector float temp3 = {0,0,0,0}; - register __vector float temp4 = {0,0,0,0}; - register __vector float temp5 = {0,0,0,0}; - register __vector float temp6 = {0,0,0,0}; - register __vector float temp7 = {0,0,0,0}; - - a0 = ap; - a1 = ap + lda; - a2 = a1 + lda; - a3 = a2 + lda; - a4 = a3 + lda; - a5 = a4 + lda; - a6 = a5 + lda; - a7 = a6 + lda; - va0 = (__vector float*) a0; - va1 = (__vector float*) a1; - va2 = (__vector float*) a2; - va3 = (__vector float*) a3; - va4 = (__vector float*) a4; - va5 = (__vector float*) a5; - va6 = (__vector float*) a6; - va7 = (__vector float*) a7; - v_x = (__vector float*) x; - - - for (i = 0; i < n/4; i ++) { - temp0 += v_x[i] * va0[i]; - temp1 += v_x[i] * va1[i]; - temp2 += v_x[i] * va2[i]; - temp3 += v_x[i] * va3[i]; - temp4 += v_x[i] * va4[i]; - temp5 += v_x[i] * va5[i]; - temp6 += v_x[i] * va6[i]; - temp7 += v_x[i] * va7[i]; - } - - - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); - y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); - y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); - - y[4] += alpha * (temp4[0] + temp4[1]+temp4[2] + temp4[3]); - y[5] += alpha * (temp5[0] + temp5[1]+temp5[2] + temp5[3]); - y[6] += alpha * (temp6[0] + temp6[1]+temp6[2] + temp6[3]); - y[7] += alpha * (temp7[0] + temp7[1]+temp7[2] + temp7[3]); - -} - - -static void sgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { - BLASLONG i = 0; - FLOAT *a0, *a1, *a2, *a3; - a0 = ap; - a1 = ap + lda; - a2 = a1 + lda; - a3 = a2 + lda; - __vector float* va0 = (__vector float*) a0; - __vector float* va1 = (__vector float*) a1; - __vector float* va2 = (__vector float*) a2; - __vector float* va3 = (__vector float*) a3; - 
__vector float* v_x = (__vector float*) x; - register __vector float temp0 = {0,0,0,0}; - register __vector float temp1 = {0,0,0,0}; - register __vector float temp2 = {0,0,0,0}; - register __vector float temp3 = {0,0,0,0}; - - for (i = 0; i < n / 4; i ++) { - temp0 += v_x[i] * va0[i]; - temp1 += v_x[i] * va1[i]; - temp2 += v_x[i] * va2[i]; - temp3 += v_x[i] * va3[i]; - } - - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); - y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); - y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); - -} - - -static void sgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) { - - BLASLONG i; - FLOAT *a0, *a1; - a0 = ap; - a1 = ap + lda; - __vector float* va0 = (__vector float*) a0; - __vector float* va1 = (__vector float*) a1; - __vector float* v_x = (__vector float*) x; - __vector float temp0 = {0,0,0,0}; - __vector float temp1 = {0,0,0,0}; - for (i = 0; i < n / 4; i ++) { - temp0 += v_x[i] * va0[i]; - temp1 += v_x[i] * va1[i]; - } - - - - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - y[inc_y] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); -} - -static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { - - BLASLONG i; - FLOAT *a0; - a0 = ap; - __vector float* va0 = (__vector float*) a0; - __vector float* v_x = (__vector float*) x; - __vector float temp0 = {0,0,0,0}; - for (i = 0; i < n / 4; i ++) { - temp0 += v_x[i] * va0[i] ; - } - - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - -} - -static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { - BLASLONG i; - for (i = 0; i < n; i++) { - *dest++ = *src; - src += inc_src; - } -} - -int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { - BLASLONG i; - BLASLONG j; - FLOAT *a_ptr; - FLOAT *x_ptr; - FLOAT *y_ptr; - - BLASLONG n1; - BLASLONG m1; - BLASLONG m2; - BLASLONG m3; - BLASLONG n2; - FLOAT ybuffer[8] __attribute__((aligned(16))); - FLOAT *xbuffer; - if (m < 1) return (0); - if (n < 1) return (0); - - xbuffer = buffer; - - n1 = n >> 3; - n2 = n & 7; - - m3 = m & 3; - m1 = m - m3; - m2 = (m & (NBMAX - 1)) - m3; - - BLASLONG NB = NBMAX; - - while (NB == NBMAX) { - - m1 -= NB; - if (m1 < 0) { - if (m2 == 0) break; - NB = m2; - } - - y_ptr = y; - a_ptr = a; - x_ptr = x; - - if (inc_x != 1) - copy_x(NB, x_ptr, xbuffer, inc_x); - else - xbuffer = x_ptr; - - BLASLONG lda8 = lda << 3; - - - if (inc_y == 1) { - - for (i = 0; i < n1; i++) { - - sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha); - - y_ptr += 8; - a_ptr += lda8; - - } - - } else { - - for (i = 0; i < n1; i++) { - ybuffer[0] = 0; - ybuffer[1] = 0; - ybuffer[2] = 0; - ybuffer[3] = 0; - ybuffer[4] = 0; - ybuffer[5] = 0; - ybuffer[6] = 0; - ybuffer[7] = 0; - sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha); - - - - *y_ptr += ybuffer[0]; - y_ptr += inc_y; - *y_ptr += ybuffer[1]; - y_ptr += inc_y; - *y_ptr += ybuffer[2]; - y_ptr += inc_y; - *y_ptr += ybuffer[3]; - y_ptr += inc_y; - - *y_ptr += ybuffer[4]; - y_ptr += inc_y; - *y_ptr += ybuffer[5]; - y_ptr += inc_y; - *y_ptr += ybuffer[6]; - y_ptr += inc_y; - *y_ptr += ybuffer[7]; - y_ptr += inc_y; - - a_ptr += lda8; - } - - } - - - if (n2 & 4) { - ybuffer[0] = 0; - ybuffer[1] = 0; - ybuffer[2] = 0; - ybuffer[3] = 0; - sgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha); - - 
a_ptr += lda<<2; - - *y_ptr += ybuffer[0]; - y_ptr += inc_y; - *y_ptr += ybuffer[1]; - y_ptr += inc_y; - *y_ptr += ybuffer[2]; - y_ptr += inc_y; - *y_ptr += ybuffer[3]; - y_ptr += inc_y; - } - - if (n2 & 2) { - sgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha, inc_y); - a_ptr += lda << 1; - y_ptr += 2 * inc_y; - - } - - if (n2 & 1) { - sgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha); - a_ptr += lda; - y_ptr += inc_y; - - } - - a += NB; - x += NB * inc_x; - - - } - - if (m3 == 0) return (0); - - x_ptr = x; - a_ptr = a; - if (m3 == 3) { - FLOAT xtemp0 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp1 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp2 = *x_ptr * alpha; - - FLOAT *aj = a_ptr; - y_ptr = y; - - if (lda == 3 && inc_y == 1) { - - for (j = 0; j < (n & -4); j += 4) { - - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; - y_ptr[j + 1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; - y_ptr[j + 2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; - y_ptr[j + 3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; - aj += 12; - } - - for (; j < n; j++) { - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; - aj += 3; - } - - } else { - - if (inc_y == 1) { - - BLASLONG register lda2 = lda << 1; - BLASLONG register lda4 = lda << 2; - BLASLONG register lda3 = lda2 + lda; - - for (j = 0; j < (n & -4); j += 4) { - - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; - y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + 2) * xtemp2; - y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + 2) * xtemp2; - y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + 2) * xtemp2; - aj += lda4; - } - - for (; j < n; j++) { - - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; - aj += lda; - } - - } else { - - for (j = 0; j < n; j++) { - *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; - y_ptr += inc_y; - aj += lda; - } - - } - - } - return (0); - } - - if (m3 == 2) { - FLOAT xtemp0 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp1 = *x_ptr * alpha; - - FLOAT *aj = a_ptr; - y_ptr = y; - - if (lda == 2 && inc_y == 1) { - - for (j = 0; j < (n & -4); j += 4) { - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; - y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1; - y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1; - y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1; - aj += 8; - - } - - for (; j < n; j++) { - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; - aj += 2; - } - - } else { - if (inc_y == 1) { - - BLASLONG register lda2 = lda << 1; - BLASLONG register lda4 = lda << 2; - BLASLONG register lda3 = lda2 + lda; - - for (j = 0; j < (n & -4); j += 4) { - - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; - y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1; - y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1; - y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1; - aj += lda4; - } - - for (; j < n; j++) { - - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; - aj += lda; - } - - } else { - for (j = 0; j < n; j++) { - *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1; - y_ptr += inc_y; - aj += lda; - } - } - - } - return (0); - - } - - FLOAT xtemp = *x_ptr * alpha; - FLOAT *aj = a_ptr; - y_ptr = y; - if (lda == 1 && inc_y == 1) { - for (j = 0; j < (n & -4); j += 4) { - y_ptr[j] += aj[j] * xtemp; - y_ptr[j + 1] += aj[j + 1] * xtemp; - y_ptr[j + 2] += aj[j + 2] * xtemp; - y_ptr[j + 3] += aj[j + 3] * xtemp; - } - 
for (; j < n; j++) { - y_ptr[j] += aj[j] * xtemp; - } - - - } else { - if (inc_y == 1) { - - BLASLONG register lda2 = lda << 1; - BLASLONG register lda4 = lda << 2; - BLASLONG register lda3 = lda2 + lda; - for (j = 0; j < (n & -4); j += 4) { - y_ptr[j] += *aj * xtemp; - y_ptr[j + 1] += *(aj + lda) * xtemp; - y_ptr[j + 2] += *(aj + lda2) * xtemp; - y_ptr[j + 3] += *(aj + lda3) * xtemp; - aj += lda4; - } - - for (; j < n; j++) { - y_ptr[j] += *aj * xtemp; - aj += lda; - } - - } else { - for (j = 0; j < n; j++) { - *y_ptr += *aj * xtemp; - y_ptr += inc_y; - aj += lda; - } - - } - } - - return (0); - -} - -#endif +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
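The transposed kernels added below take the opposite shape: each column of A contributes one dot product with x, accumulated in a 4-lane vector and reduced lane-wise before being scaled by alpha and added to y[j]. A self-contained sketch of that reduction (hypothetical helper, same alignment and n % 4 == 0 assumptions as above) is:

#include <stddef.h>

typedef float v4sf __attribute__((vector_size(16)));

/* Vector dot product with a final horizontal (lane-wise) reduction,
 * mirroring the temp0[0] + temp0[1] + temp0[2] + temp0[3] sums in the
 * T kernels.  Assumes 16-byte alignment and n % 4 == 0. */
static float sdot_v4(size_t n, const float *a, const float *x)
{
    v4sf acc = {0.0f, 0.0f, 0.0f, 0.0f};
    const v4sf *va = (const v4sf *)a;
    const v4sf *vx = (const v4sf *)x;
    for (size_t i = 0; i < n / 4; i++)
        acc += vx[i] * va[i];
    return acc[0] + acc[1] + acc[2] + acc[3];
}

/* Used as: y[j] += alpha * sdot_v4(n, column_j, x); */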
+ *****************************************************************************/ +#if !defined(__VEC__) || !defined(__ALTIVEC__) +#include "../arm/gemv_t.c" + +#else + +#include "common.h" + +#define NBMAX 2048 + +#include + +static void sgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + BLASLONG i; + FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; + __vector float *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x; + register __vector float temp0 = {0,0,0,0}; + register __vector float temp1 = {0,0,0,0}; + register __vector float temp2 = {0,0,0,0}; + register __vector float temp3 = {0,0,0,0}; + register __vector float temp4 = {0,0,0,0}; + register __vector float temp5 = {0,0,0,0}; + register __vector float temp6 = {0,0,0,0}; + register __vector float temp7 = {0,0,0,0}; + + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + a4 = a3 + lda; + a5 = a4 + lda; + a6 = a5 + lda; + a7 = a6 + lda; + va0 = (__vector float*) a0; + va1 = (__vector float*) a1; + va2 = (__vector float*) a2; + va3 = (__vector float*) a3; + va4 = (__vector float*) a4; + va5 = (__vector float*) a5; + va6 = (__vector float*) a6; + va7 = (__vector float*) a7; + v_x = (__vector float*) x; + + + for (i = 0; i < n/4; i ++) { + temp0 += v_x[i] * va0[i]; + temp1 += v_x[i] * va1[i]; + temp2 += v_x[i] * va2[i]; + temp3 += v_x[i] * va3[i]; + temp4 += v_x[i] * va4[i]; + temp5 += v_x[i] * va5[i]; + temp6 += v_x[i] * va6[i]; + temp7 += v_x[i] * va7[i]; + } + + + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); + y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); + y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); + + y[4] += alpha * (temp4[0] + temp4[1]+temp4[2] + temp4[3]); + y[5] += alpha * (temp5[0] + temp5[1]+temp5[2] + temp5[3]); + y[6] += alpha * (temp6[0] + temp6[1]+temp6[2] + temp6[3]); + y[7] += alpha * (temp7[0] + temp7[1]+temp7[2] + temp7[3]); + +} + + +static void sgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + BLASLONG i = 0; + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + __vector float* va0 = (__vector float*) a0; + __vector float* va1 = (__vector float*) a1; + __vector float* va2 = (__vector float*) a2; + __vector float* va3 = (__vector float*) a3; + __vector float* v_x = (__vector float*) x; + register __vector float temp0 = {0,0,0,0}; + register __vector float temp1 = {0,0,0,0}; + register __vector float temp2 = {0,0,0,0}; + register __vector float temp3 = {0,0,0,0}; + + for (i = 0; i < n / 4; i ++) { + temp0 += v_x[i] * va0[i]; + temp1 += v_x[i] * va1[i]; + temp2 += v_x[i] * va2[i]; + temp3 += v_x[i] * va3[i]; + } + + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); + y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); + y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); + +} + + +static void sgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) { + + BLASLONG i; + FLOAT *a0, *a1; + a0 = ap; + a1 = ap + lda; + __vector float* va0 = (__vector float*) a0; + __vector float* va1 = (__vector float*) a1; + __vector float* v_x = (__vector float*) x; + __vector float temp0 = {0,0,0,0}; + __vector float temp1 = {0,0,0,0}; + for (i = 0; i < n / 4; i ++) { + temp0 += v_x[i] * va0[i]; + temp1 += v_x[i] * va1[i]; + } + + + + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + 
temp0[3]); + y[inc_y] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); +} + +static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + + BLASLONG i; + FLOAT *a0; + a0 = ap; + __vector float* va0 = (__vector float*) a0; + __vector float* v_x = (__vector float*) x; + __vector float temp0 = {0,0,0,0}; + for (i = 0; i < n / 4; i ++) { + temp0 += v_x[i] * va0[i] ; + } + + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + +} + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { + BLASLONG i; + for (i = 0; i < n; i++) { + *dest++ = *src; + src += inc_src; + } +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { + BLASLONG i; + BLASLONG j; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + FLOAT ybuffer[8] __attribute__((aligned(16))); + FLOAT *xbuffer; + if (m < 1) return (0); + if (n < 1) return (0); + + xbuffer = buffer; + + n1 = n >> 3; + n2 = n & 7; + + m3 = m & 3; + m1 = m - m3; + m2 = (m & (NBMAX - 1)) - m3; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + + if (inc_x != 1) + copy_x(NB, x_ptr, xbuffer, inc_x); + else + xbuffer = x_ptr; + + BLASLONG lda8 = lda << 3; + + + if (inc_y == 1) { + + for (i = 0; i < n1; i++) { + + sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha); + + y_ptr += 8; + a_ptr += lda8; + + } + + } else { + + for (i = 0; i < n1; i++) { + ybuffer[0] = 0; + ybuffer[1] = 0; + ybuffer[2] = 0; + ybuffer[3] = 0; + ybuffer[4] = 0; + ybuffer[5] = 0; + ybuffer[6] = 0; + ybuffer[7] = 0; + sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha); + + + + *y_ptr += ybuffer[0]; + y_ptr += inc_y; + *y_ptr += ybuffer[1]; + y_ptr += inc_y; + *y_ptr += ybuffer[2]; + y_ptr += inc_y; + *y_ptr += ybuffer[3]; + y_ptr += inc_y; + + *y_ptr += ybuffer[4]; + y_ptr += inc_y; + *y_ptr += ybuffer[5]; + y_ptr += inc_y; + *y_ptr += ybuffer[6]; + y_ptr += inc_y; + *y_ptr += ybuffer[7]; + y_ptr += inc_y; + + a_ptr += lda8; + } + + } + + + if (n2 & 4) { + ybuffer[0] = 0; + ybuffer[1] = 0; + ybuffer[2] = 0; + ybuffer[3] = 0; + sgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha); + + a_ptr += lda<<2; + + *y_ptr += ybuffer[0]; + y_ptr += inc_y; + *y_ptr += ybuffer[1]; + y_ptr += inc_y; + *y_ptr += ybuffer[2]; + y_ptr += inc_y; + *y_ptr += ybuffer[3]; + y_ptr += inc_y; + } + + if (n2 & 2) { + sgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha, inc_y); + a_ptr += lda << 1; + y_ptr += 2 * inc_y; + + } + + if (n2 & 1) { + sgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha); + a_ptr += lda; + y_ptr += inc_y; + + } + + a += NB; + x += NB * inc_x; + + + } + + if (m3 == 0) return (0); + + x_ptr = x; + a_ptr = a; + if (m3 == 3) { + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp2 = *x_ptr * alpha; + + FLOAT *aj = a_ptr; + y_ptr = y; + + if (lda == 3 && inc_y == 1) { + + for (j = 0; j < (n & -4); j += 4) { + + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; + y_ptr[j + 1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; + y_ptr[j + 2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; + y_ptr[j + 3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; + aj += 12; + } + + for (; j < n; j++) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * 
xtemp2; + aj += 3; + } + + } else { + + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + + for (j = 0; j < (n & -4); j += 4) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; + y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + 2) * xtemp2; + y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + 2) * xtemp2; + y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + 2) * xtemp2; + aj += lda4; + } + + for (; j < n; j++) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; + aj += lda; + } + + } else { + + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; + y_ptr += inc_y; + aj += lda; + } + + } + + } + return (0); + } + + if (m3 == 2) { + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + + FLOAT *aj = a_ptr; + y_ptr = y; + + if (lda == 2 && inc_y == 1) { + + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; + y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1; + y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1; + y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1; + aj += 8; + + } + + for (; j < n; j++) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; + aj += 2; + } + + } else { + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + + for (j = 0; j < (n & -4); j += 4) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; + y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1; + y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1; + y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1; + aj += lda4; + } + + for (; j < n; j++) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; + aj += lda; + } + + } else { + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1; + y_ptr += inc_y; + aj += lda; + } + } + + } + return (0); + + } + + FLOAT xtemp = *x_ptr * alpha; + FLOAT *aj = a_ptr; + y_ptr = y; + if (lda == 1 && inc_y == 1) { + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += aj[j] * xtemp; + y_ptr[j + 1] += aj[j + 1] * xtemp; + y_ptr[j + 2] += aj[j + 2] * xtemp; + y_ptr[j + 3] += aj[j + 3] * xtemp; + } + for (; j < n; j++) { + y_ptr[j] += aj[j] * xtemp; + } + + + } else { + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += *aj * xtemp; + y_ptr[j + 1] += *(aj + lda) * xtemp; + y_ptr[j + 2] += *(aj + lda2) * xtemp; + y_ptr[j + 3] += *(aj + lda3) * xtemp; + aj += lda4; + } + + for (; j < n; j++) { + y_ptr[j] += *aj * xtemp; + aj += lda; + } + + } else { + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp; + y_ptr += inc_y; + aj += lda; + } + + } + } + + return (0); + +} + +#endif diff --git a/kernel/power/sgemv_t_8.c b/kernel/power/sgemv_t_8.c index b90512162..1ee7c8aeb 100644 --- a/kernel/power/sgemv_t_8.c +++ b/kernel/power/sgemv_t_8.c @@ -1,508 +1,508 @@ -/*************************************************************************** -Copyright (c) 2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. 
Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - *****************************************************************************/ - - -/****Note*** -UnUsed kernel -This kernel works. But it was not competitive enough to be added in production -It could be used and tested in future or could be used as base for switching to inline assembly -*/ - -#include "common.h" -#include -#define NBMAX 4096 - -#include - -static void sgemv_kernel_8x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { - BLASLONG i; - FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; - __vector float *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x; - register __vector float temp0 = {0,0,0,0}; - register __vector float temp1 = {0,0,0,0}; - register __vector float temp2 = {0,0,0,0}; - register __vector float temp3 = {0,0,0,0}; - register __vector float temp4 = {0,0,0,0}; - register __vector float temp5 = {0,0,0,0}; - register __vector float temp6 = {0,0,0,0}; - register __vector float temp7 = {0,0,0,0}; - - a0 = ap; - a1 = ap + lda; - a2 = a1 + lda; - a3 = a2 + lda; - a4 = a3 + lda; - a5 = a4 + lda; - a6 = a5 + lda; - a7 = a6 + lda; - va0 = (__vector float*) a0; - va1 = (__vector float*) a1; - va2 = (__vector float*) a2; - va3 = (__vector float*) a3; - va4 = (__vector float*) a4; - va5 = (__vector float*) a5; - va6 = (__vector float*) a6; - va7 = (__vector float*) a7; - v_x = (__vector float*) x; - - - for (i = 0; i < n/4; i +=2) { - register __vector float vx1=v_x[i] ; - register __vector float vx2=v_x[i+1] ; - register __vector float va0_1=va0[i] ; - register __vector float va0_2=va0[i+1] ; - register __vector float va1_1=va1[i] ; - register __vector float va1_2=va1[i+1] ; - register __vector float va2_1=va2[i] ; - register __vector float va2_2=va2[i+1] ; - register __vector float va3_1=va3[i] ; - register __vector float va3_2=va3[i+1] ; - register __vector float va4_1=va4[i] ; - register __vector float va4_2=va4[i+1] ; - register __vector float va5_1=va5[i] ; - register __vector float va5_2=va5[i+1] ; - register __vector float va6_1=va6[i] ; - register __vector float va6_2=va6[i+1] ; - register __vector float va7_1=va7[i] ; - register __vector float va7_2=va7[i+1] ; - temp0 += vx1* va0_1 + vx2 * va0_2; - temp1 += vx1* va1_1 + vx2 * va1_2; - temp2 += vx1* va2_1 + vx2 * va2_2; - 
temp3 += vx1* va3_1 + vx2 * va3_2; - temp4 += vx1* va4_1 + vx2 * va4_2; - temp5 += vx1* va5_1 + vx2 * va5_2; - temp6 += vx1* va6_1 + vx2 * va6_2; - temp7 += vx1* va7_1 + vx2 * va7_2; - } - - - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); - y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); - y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); - - y[4] += alpha * (temp4[0] + temp4[1]+temp4[2] + temp4[3]); - y[5] += alpha * (temp5[0] + temp5[1]+temp5[2] + temp5[3]); - y[6] += alpha * (temp6[0] + temp6[1]+temp6[2] + temp6[3]); - y[7] += alpha * (temp7[0] + temp7[1]+temp7[2] + temp7[3]); - -} - - -static void sgemv_kernel_8x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { - BLASLONG i = 0; - FLOAT *a0, *a1, *a2, *a3; - a0 = ap; - a1 = ap + lda; - a2 = a1 + lda; - a3 = a2 + lda; - __vector float* va0 = (__vector float*) a0; - __vector float* va1 = (__vector float*) a1; - __vector float* va2 = (__vector float*) a2; - __vector float* va3 = (__vector float*) a3; - __vector float* v_x = (__vector float*) x; - register __vector float temp0 = {0,0,0,0}; - register __vector float temp1 = {0,0,0,0}; - register __vector float temp2 = {0,0,0,0}; - register __vector float temp3 = {0,0,0,0}; - - for (i = 0; i < n / 4; i +=2) { - temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1]; - temp1 += v_x[i] * va1[i] + v_x[i+1] * va1[i+1]; - temp2 += v_x[i] * va2[i] + v_x[i+1] * va2[i+1]; - temp3 += v_x[i] * va3[i] + v_x[i+1] * va3[i+1]; - } - - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); - y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); - y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); - -} - - -static void sgemv_kernel_8x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) { - - BLASLONG i; - FLOAT *a0, *a1; - a0 = ap; - a1 = ap + lda; - __vector float* va0 = (__vector float*) a0; - __vector float* va1 = (__vector float*) a1; - __vector float* v_x = (__vector float*) x; - __vector float temp0 = {0,0,0,0}; - __vector float temp1 = {0,0,0,0}; - for (i = 0; i < n / 4; i +=2) { - temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1]; - temp1 += v_x[i] * va1[i] + v_x[i+1] * va1[i+1]; - } - - - - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - y[inc_y] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); -} - -static void sgemv_kernel_8x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { - - BLASLONG i; - FLOAT *a0; - a0 = ap; - __vector float* va0 = (__vector float*) a0; - __vector float* v_x = (__vector float*) x; - __vector float temp0 = {0,0,0,0}; - for (i = 0; i < n / 4; i +=2) { - temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1]; - } - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - -} - - -static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { - BLASLONG i; - for (i = 0; i < n; i++) { - *dest++ = *src; - src += inc_src; - } -} - -int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { - BLASLONG i; - BLASLONG j; - FLOAT *a_ptr; - FLOAT *x_ptr; - FLOAT *y_ptr; - - BLASLONG n1; - BLASLONG m1; - BLASLONG m2; - BLASLONG m3; - BLASLONG n2; - - FLOAT ybuffer[8] __attribute__((aligned(16))); - FLOAT *xbuffer; - if (m < 1) return (0); - if (n < 1) return (0); - - xbuffer = buffer; - - n1 = n >> 3; - n2 = n & 7; - - m3 = m & 7; - m1 = m - 
m3; - m2 = (m & (NBMAX - 1)) - m3; - - BLASLONG NB = NBMAX; - - while (NB == NBMAX) { - - m1 -= NB; - if (m1 < 0) { - if (m2 == 0) break; - NB = m2; - } - - y_ptr = y; - a_ptr = a; - x_ptr = x; - - if (inc_x != 1) - copy_x(NB, x_ptr, xbuffer, inc_x); - else - xbuffer = x_ptr; - - BLASLONG lda8 = lda << 3; - - - if (inc_y == 1) { - - for (i = 0; i < n1; i++) { - - sgemv_kernel_8x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha); - - y_ptr += 8; - a_ptr += lda8; - - } - - } else { - - for (i = 0; i < n1; i++) { - ybuffer[0] = 0; - ybuffer[1] = 0; - ybuffer[2] = 0; - ybuffer[3] = 0; - ybuffer[4] = 0; - ybuffer[5] = 0; - ybuffer[6] = 0; - ybuffer[7] = 0; - sgemv_kernel_8x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha); - - - - *y_ptr += ybuffer[0]; - y_ptr += inc_y; - *y_ptr += ybuffer[1]; - y_ptr += inc_y; - *y_ptr += ybuffer[2]; - y_ptr += inc_y; - *y_ptr += ybuffer[3]; - y_ptr += inc_y; - - *y_ptr += ybuffer[4]; - y_ptr += inc_y; - *y_ptr += ybuffer[5]; - y_ptr += inc_y; - *y_ptr += ybuffer[6]; - y_ptr += inc_y; - *y_ptr += ybuffer[7]; - y_ptr += inc_y; - - a_ptr += lda8; - } - - } - - - if (n2 & 4) { - ybuffer[0] = 0; - ybuffer[1] = 0; - ybuffer[2] = 0; - ybuffer[3] = 0; - sgemv_kernel_8x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha); - - a_ptr += lda<<2; - - *y_ptr += ybuffer[0]; - y_ptr += inc_y; - *y_ptr += ybuffer[1]; - y_ptr += inc_y; - *y_ptr += ybuffer[2]; - y_ptr += inc_y; - *y_ptr += ybuffer[3]; - y_ptr += inc_y; - } - - if (n2 & 2) { - sgemv_kernel_8x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha, inc_y); - a_ptr += lda << 1; - y_ptr += 2 * inc_y; - - } - - if (n2 & 1) { - sgemv_kernel_8x1(NB, a_ptr, xbuffer, y_ptr, alpha); - a_ptr += lda; - y_ptr += inc_y; - - } - - a += NB; - x += NB * inc_x; - - - } - - if (m3 == 0) return (0); - - x_ptr = x; - a_ptr = a; - if (m3 & 4) { - FLOAT xtemp0 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp1 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp2 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp3 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT *aj = a_ptr; - y_ptr = y; - if (lda == 4 && inc_y == 1) { - - for (j = 0; j < (n & -4); j += 4) { - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2 + aj[3] * xtemp3; - y_ptr[j + 1] += aj[4] * xtemp0 + aj[5] * xtemp1 + aj[6] * xtemp2 + aj[7] * xtemp3; - y_ptr[j + 2] += aj[8] * xtemp0 + aj[9] * xtemp1 + aj[10] * xtemp2 + aj[11] * xtemp3; - y_ptr[j + 3] += aj[12] * xtemp0 + aj[13] * xtemp1 + aj[14] * xtemp2 + aj[15] * xtemp3; - aj += 16; - - } - - for (; j < n; j++) { - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2 + aj[3] * xtemp3; - aj += 4; - } - - } else if (inc_y == 1) { - - BLASLONG register lda2 = lda << 1; - BLASLONG register lda4 = lda << 2; - BLASLONG register lda3 = lda2 + lda; - - for (j = 0; j < (n & -4); j += 4) { - - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2 + *(aj + 3) * xtemp3; - y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + 2) * xtemp2 + *(aj + lda +3) * xtemp3; - y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + 2) * xtemp2 + *(aj + lda2 +3) * xtemp3; - y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + 2) * xtemp2 + *(aj + lda3+3) * xtemp3; - aj += lda4; - } - - for (; j < n; j++) { - - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2+*(aj + 3) * xtemp3; - aj += lda; - } - - } else { - - for (j = 0; j < n; j++) { - *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2+ *(aj + 3) * xtemp3; - y_ptr += inc_y; - aj += lda; - } - - } - if 
(m3==4) return (0); - a_ptr += 4; - } - - if (m3 & 2 ) { - - FLOAT xtemp0 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp1 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT *aj = a_ptr; - y_ptr = y; - - if (lda == 2 && inc_y == 1) { - - for (j = 0; j < (n & -4); j += 4) { - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; - y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1; - y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1; - y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1; - aj += 8; - - } - - for (; j < n; j++) { - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; - aj += 2; - } - - } else { - if (inc_y == 1) { - - BLASLONG register lda2 = lda << 1; - BLASLONG register lda4 = lda << 2; - BLASLONG register lda3 = lda2 + lda; - - for (j = 0; j < (n & -4); j += 4) { - - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; - y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1; - y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1; - y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1; - aj += lda4; - } - - for (; j < n; j++) { - - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; - aj += lda; - } - - } else { - for (j = 0; j < n; j++) { - *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1; - y_ptr += inc_y; - aj += lda; - } - } - - } - if (m3==2) return (0); - a_ptr += 2; - } - if (m3 & 1) { - - FLOAT xtemp = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT *aj = a_ptr; - y_ptr = y; - if (lda == 1 && inc_y == 1) { - for (j = 0; j < (n & -4); j += 4) { - y_ptr[j] += aj[j] * xtemp; - y_ptr[j + 1] += aj[j + 1] * xtemp; - y_ptr[j + 2] += aj[j + 2] * xtemp; - y_ptr[j + 3] += aj[j + 3] * xtemp; - } - for (; j < n; j++) { - y_ptr[j] += aj[j] * xtemp; - } - - - } else { - if (inc_y == 1) { - - BLASLONG register lda2 = lda << 1; - BLASLONG register lda4 = lda << 2; - BLASLONG register lda3 = lda2 + lda; - for (j = 0; j < (n & -4); j += 4) { - y_ptr[j] += *aj * xtemp; - y_ptr[j + 1] += *(aj + lda) * xtemp; - y_ptr[j + 2] += *(aj + lda2) * xtemp; - y_ptr[j + 3] += *(aj + lda3) * xtemp; - aj += lda4; - } - - for (; j < n; j++) { - y_ptr[j] += *aj * xtemp; - aj += lda; - } - - } else { - for (j = 0; j < n; j++) { - *y_ptr += *aj * xtemp; - y_ptr += inc_y; - aj += lda; - } - - } - - } - a_ptr += 1; - } - return (0); - -} - +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ + + +/****Note*** +UnUsed kernel +This kernel works. But it was not competitive enough to be added in production +It could be used and tested in future or could be used as base for switching to inline assembly +*/ + +#include "common.h" +#include +#define NBMAX 4096 + +#include + +static void sgemv_kernel_8x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + BLASLONG i; + FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; + __vector float *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x; + register __vector float temp0 = {0,0,0,0}; + register __vector float temp1 = {0,0,0,0}; + register __vector float temp2 = {0,0,0,0}; + register __vector float temp3 = {0,0,0,0}; + register __vector float temp4 = {0,0,0,0}; + register __vector float temp5 = {0,0,0,0}; + register __vector float temp6 = {0,0,0,0}; + register __vector float temp7 = {0,0,0,0}; + + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + a4 = a3 + lda; + a5 = a4 + lda; + a6 = a5 + lda; + a7 = a6 + lda; + va0 = (__vector float*) a0; + va1 = (__vector float*) a1; + va2 = (__vector float*) a2; + va3 = (__vector float*) a3; + va4 = (__vector float*) a4; + va5 = (__vector float*) a5; + va6 = (__vector float*) a6; + va7 = (__vector float*) a7; + v_x = (__vector float*) x; + + + for (i = 0; i < n/4; i +=2) { + register __vector float vx1=v_x[i] ; + register __vector float vx2=v_x[i+1] ; + register __vector float va0_1=va0[i] ; + register __vector float va0_2=va0[i+1] ; + register __vector float va1_1=va1[i] ; + register __vector float va1_2=va1[i+1] ; + register __vector float va2_1=va2[i] ; + register __vector float va2_2=va2[i+1] ; + register __vector float va3_1=va3[i] ; + register __vector float va3_2=va3[i+1] ; + register __vector float va4_1=va4[i] ; + register __vector float va4_2=va4[i+1] ; + register __vector float va5_1=va5[i] ; + register __vector float va5_2=va5[i+1] ; + register __vector float va6_1=va6[i] ; + register __vector float va6_2=va6[i+1] ; + register __vector float va7_1=va7[i] ; + register __vector float va7_2=va7[i+1] ; + temp0 += vx1* va0_1 + vx2 * va0_2; + temp1 += vx1* va1_1 + vx2 * va1_2; + temp2 += vx1* va2_1 + vx2 * va2_2; + temp3 += vx1* va3_1 + vx2 * va3_2; + temp4 += vx1* va4_1 + vx2 * va4_2; + temp5 += vx1* va5_1 + vx2 * va5_2; + temp6 += vx1* va6_1 + vx2 * va6_2; + temp7 += vx1* va7_1 + vx2 * va7_2; + } + + + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); + y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); + y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); + + y[4] += alpha * (temp4[0] + temp4[1]+temp4[2] + temp4[3]); + y[5] += alpha * (temp5[0] + temp5[1]+temp5[2] + temp5[3]); + y[6] += alpha * (temp6[0] + temp6[1]+temp6[2] + temp6[3]); + y[7] += alpha * (temp7[0] + temp7[1]+temp7[2] + temp7[3]); + +} + + +static void sgemv_kernel_8x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT 
*y, FLOAT alpha) { + BLASLONG i = 0; + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + __vector float* va0 = (__vector float*) a0; + __vector float* va1 = (__vector float*) a1; + __vector float* va2 = (__vector float*) a2; + __vector float* va3 = (__vector float*) a3; + __vector float* v_x = (__vector float*) x; + register __vector float temp0 = {0,0,0,0}; + register __vector float temp1 = {0,0,0,0}; + register __vector float temp2 = {0,0,0,0}; + register __vector float temp3 = {0,0,0,0}; + + for (i = 0; i < n / 4; i +=2) { + temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1]; + temp1 += v_x[i] * va1[i] + v_x[i+1] * va1[i+1]; + temp2 += v_x[i] * va2[i] + v_x[i+1] * va2[i+1]; + temp3 += v_x[i] * va3[i] + v_x[i+1] * va3[i+1]; + } + + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); + y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); + y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); + +} + + +static void sgemv_kernel_8x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) { + + BLASLONG i; + FLOAT *a0, *a1; + a0 = ap; + a1 = ap + lda; + __vector float* va0 = (__vector float*) a0; + __vector float* va1 = (__vector float*) a1; + __vector float* v_x = (__vector float*) x; + __vector float temp0 = {0,0,0,0}; + __vector float temp1 = {0,0,0,0}; + for (i = 0; i < n / 4; i +=2) { + temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1]; + temp1 += v_x[i] * va1[i] + v_x[i+1] * va1[i+1]; + } + + + + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + y[inc_y] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); +} + +static void sgemv_kernel_8x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + + BLASLONG i; + FLOAT *a0; + a0 = ap; + __vector float* va0 = (__vector float*) a0; + __vector float* v_x = (__vector float*) x; + __vector float temp0 = {0,0,0,0}; + for (i = 0; i < n / 4; i +=2) { + temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1]; + } + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + +} + + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { + BLASLONG i; + for (i = 0; i < n; i++) { + *dest++ = *src; + src += inc_src; + } +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { + BLASLONG i; + BLASLONG j; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + + FLOAT ybuffer[8] __attribute__((aligned(16))); + FLOAT *xbuffer; + if (m < 1) return (0); + if (n < 1) return (0); + + xbuffer = buffer; + + n1 = n >> 3; + n2 = n & 7; + + m3 = m & 7; + m1 = m - m3; + m2 = (m & (NBMAX - 1)) - m3; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + + if (inc_x != 1) + copy_x(NB, x_ptr, xbuffer, inc_x); + else + xbuffer = x_ptr; + + BLASLONG lda8 = lda << 3; + + + if (inc_y == 1) { + + for (i = 0; i < n1; i++) { + + sgemv_kernel_8x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha); + + y_ptr += 8; + a_ptr += lda8; + + } + + } else { + + for (i = 0; i < n1; i++) { + ybuffer[0] = 0; + ybuffer[1] = 0; + ybuffer[2] = 0; + ybuffer[3] = 0; + ybuffer[4] = 0; + ybuffer[5] = 0; + ybuffer[6] = 0; + ybuffer[7] = 0; + sgemv_kernel_8x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha); + + + + *y_ptr += ybuffer[0]; + y_ptr += inc_y; + *y_ptr += 
ybuffer[1]; + y_ptr += inc_y; + *y_ptr += ybuffer[2]; + y_ptr += inc_y; + *y_ptr += ybuffer[3]; + y_ptr += inc_y; + + *y_ptr += ybuffer[4]; + y_ptr += inc_y; + *y_ptr += ybuffer[5]; + y_ptr += inc_y; + *y_ptr += ybuffer[6]; + y_ptr += inc_y; + *y_ptr += ybuffer[7]; + y_ptr += inc_y; + + a_ptr += lda8; + } + + } + + + if (n2 & 4) { + ybuffer[0] = 0; + ybuffer[1] = 0; + ybuffer[2] = 0; + ybuffer[3] = 0; + sgemv_kernel_8x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha); + + a_ptr += lda<<2; + + *y_ptr += ybuffer[0]; + y_ptr += inc_y; + *y_ptr += ybuffer[1]; + y_ptr += inc_y; + *y_ptr += ybuffer[2]; + y_ptr += inc_y; + *y_ptr += ybuffer[3]; + y_ptr += inc_y; + } + + if (n2 & 2) { + sgemv_kernel_8x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha, inc_y); + a_ptr += lda << 1; + y_ptr += 2 * inc_y; + + } + + if (n2 & 1) { + sgemv_kernel_8x1(NB, a_ptr, xbuffer, y_ptr, alpha); + a_ptr += lda; + y_ptr += inc_y; + + } + + a += NB; + x += NB * inc_x; + + + } + + if (m3 == 0) return (0); + + x_ptr = x; + a_ptr = a; + if (m3 & 4) { + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp2 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp3 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT *aj = a_ptr; + y_ptr = y; + if (lda == 4 && inc_y == 1) { + + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2 + aj[3] * xtemp3; + y_ptr[j + 1] += aj[4] * xtemp0 + aj[5] * xtemp1 + aj[6] * xtemp2 + aj[7] * xtemp3; + y_ptr[j + 2] += aj[8] * xtemp0 + aj[9] * xtemp1 + aj[10] * xtemp2 + aj[11] * xtemp3; + y_ptr[j + 3] += aj[12] * xtemp0 + aj[13] * xtemp1 + aj[14] * xtemp2 + aj[15] * xtemp3; + aj += 16; + + } + + for (; j < n; j++) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2 + aj[3] * xtemp3; + aj += 4; + } + + } else if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + + for (j = 0; j < (n & -4); j += 4) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2 + *(aj + 3) * xtemp3; + y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + 2) * xtemp2 + *(aj + lda +3) * xtemp3; + y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + 2) * xtemp2 + *(aj + lda2 +3) * xtemp3; + y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + 2) * xtemp2 + *(aj + lda3+3) * xtemp3; + aj += lda4; + } + + for (; j < n; j++) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2+*(aj + 3) * xtemp3; + aj += lda; + } + + } else { + + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2+ *(aj + 3) * xtemp3; + y_ptr += inc_y; + aj += lda; + } + + } + if (m3==4) return (0); + a_ptr += 4; + } + + if (m3 & 2 ) { + + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT *aj = a_ptr; + y_ptr = y; + + if (lda == 2 && inc_y == 1) { + + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; + y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1; + y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1; + y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1; + aj += 8; + + } + + for (; j < n; j++) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; + aj += 2; + } + + } else { + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + + for (j = 0; j < (n & -4); j += 4) { + + 
y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; + y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1; + y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1; + y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1; + aj += lda4; + } + + for (; j < n; j++) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; + aj += lda; + } + + } else { + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1; + y_ptr += inc_y; + aj += lda; + } + } + + } + if (m3==2) return (0); + a_ptr += 2; + } + if (m3 & 1) { + + FLOAT xtemp = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT *aj = a_ptr; + y_ptr = y; + if (lda == 1 && inc_y == 1) { + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += aj[j] * xtemp; + y_ptr[j + 1] += aj[j + 1] * xtemp; + y_ptr[j + 2] += aj[j + 2] * xtemp; + y_ptr[j + 3] += aj[j + 3] * xtemp; + } + for (; j < n; j++) { + y_ptr[j] += aj[j] * xtemp; + } + + + } else { + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += *aj * xtemp; + y_ptr[j + 1] += *(aj + lda) * xtemp; + y_ptr[j + 2] += *(aj + lda2) * xtemp; + y_ptr[j + 3] += *(aj + lda3) * xtemp; + aj += lda4; + } + + for (; j < n; j++) { + y_ptr[j] += *aj * xtemp; + aj += lda; + } + + } else { + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp; + y_ptr += inc_y; + aj += lda; + } + + } + + } + a_ptr += 1; + } + return (0); + +} + diff --git a/kernel/power/zgemm_kernel_power9.S b/kernel/power/zgemm_kernel_power9.S index d1e60da6c..f9320d516 100644 --- a/kernel/power/zgemm_kernel_power9.S +++ b/kernel/power/zgemm_kernel_power9.S @@ -1,245 +1,245 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ -#define ASSEMBLER -#include "common.h" -#include "def_vsx.h" - -#define LOAD ld - -#define STACKSIZE 512 - -#define FZERO 312+192(SP) - -#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ - -#define M r3 -#define N r4 -#define K r5 - - -#define A r8 -#define B r9 -#define C r10 -#define LDC r6 -#define OFFSET r7 - - - -#define o0 0 -#define alpha_r vs30 -#define alpha_i vs31 - -#define VECSAVE r11 - -#define FRAMEPOINTER r12 - -#define T10 r14 - -#define L r15 -#define T8 r16 -#define T5 r17 -#define T2 r19 -#define TEMP_REG r20 -#define T6 r21 -#define I r22 -#define J r23 -#define AO r24 -#define BO r25 -#define CO r26 -#define T7 r27 -#define T3 r28 -#define T4 r29 - -#define PRE r30 -#define T1 r31 - -#ifndef NEEDPARAM - - PROLOGUE - PROFCODE - - mr FRAMEPOINTER, SP - addi SP, SP, -STACKSIZE - mflr r0 - stfd f14, 0(SP) - stfd f15, 8(SP) - stfd f16, 16(SP) - stfd f17, 24(SP) - - stfd f18, 32(SP) - stfd f19, 40(SP) - stfd f20, 48(SP) - stfd f21, 56(SP) - - stfd f22, 64(SP) - stfd f23, 72(SP) - stfd f24, 80(SP) - stfd f25, 88(SP) - - stfd f26, 96(SP) - stfd f27, 104(SP) - stfd f28, 112(SP) - stfd f29, 120(SP) - - stfd f30, 128(SP) - stfd f31, 136(SP) - - xxspltd alpha_r,vs1,0 /*copy from register f1 */ - xxspltd alpha_i,vs2,0 /*copy from register f2 */ - - std r31, 144(SP) - std r30, 152(SP) - std r29, 160(SP) - std r28, 168(SP) - std r27, 176(SP) - std r26, 184(SP) - std r25, 192(SP) - std r24, 200(SP) - std r23, 208(SP) - std r22, 216(SP) - std r21, 224(SP) - std r20, 232(SP) - std r19, 240(SP) - std r18, 248(SP) - std r17, 256(SP) - std r16, 264(SP) - std r15, 272(SP) - std r14, 280(SP) - - - stxv vs52, 288(SP) - stxv vs53, 304(SP) - stxv vs54, 320(SP) - stxv vs55, 336(SP) - stxv vs56, 352(SP) - stxv vs57, 368(SP) - stxv vs58, 384(SP) - stxv vs59, 400(SP) - stxv vs60, 416(SP) - stxv vs61, 432(SP) - stxv vs62, 448(SP) - stxv vs63, 464(SP) - - std r0, FLINK_SAVE(SP) - - -#if defined(linux) || defined(__FreeBSD__) - ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) -#endif - - -#ifdef TRMMKERNEL -#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) - ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) -#endif -#endif - - -#include "zgemm_macros_power9.S" - - - - slwi LDC, LDC, ZBASE_SHIFT - li PRE, 512 - li r0, 0 - - -#if defined(CC) || defined(CR) || defined(RC) || defined(RR) -/*negate for this case as we will use addition -1*(a+b) */ - xvnegdp alpha_r,alpha_r - xvnegdp alpha_i,alpha_i -#endif - .align 4 - -#include "zgemm_logic_power9.S" - -L999: - - lfd f14, 0(SP) - lfd f15, 8(SP) - lfd f16, 16(SP) - lfd f17, 24(SP) - - lfd f18, 32(SP) - lfd f19, 40(SP) - lfd f20, 48(SP) - lfd f21, 56(SP) - - lfd f22, 64(SP) - lfd f23, 72(SP) - lfd f24, 80(SP) - lfd f25, 88(SP) - - lfd f26, 96(SP) - lfd f27, 104(SP) - lfd f28, 112(SP) - lfd f29, 120(SP) - - lfd f30, 128(SP) - lfd f31, 136(SP) - - - ld r31, 144(SP) - ld r30, 152(SP) - ld r29, 160(SP) - ld r28, 168(SP) - ld r27, 176(SP) - ld r26, 184(SP) - ld r25, 192(SP) - ld r24, 200(SP) - ld r23, 208(SP) - ld r22, 216(SP) - ld r21, 224(SP) - ld r20, 232(SP) - ld r19, 240(SP) - ld r18, 248(SP) - ld r17, 256(SP) - ld r16, 264(SP) - ld r15, 272(SP) - ld r14, 280(SP) - - ld r0, FLINK_SAVE(SP) - - lxv vs52, 288(SP) - lxv vs53, 304(SP) - lxv vs54, 320(SP) - lxv vs55, 336(SP) - lxv vs56, 352(SP) - lxv vs57, 368(SP) - lxv vs58, 384(SP) - lxv vs59, 400(SP) - mtlr r0 - lxv vs60, 416(SP) - lxv vs61, 432(SP) - lxv vs62, 448(SP) - lxv vs63, 464(SP) - - addi SP, SP, STACKSIZE - blr - - 
EPILOGUE +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#define LOAD ld + +#define STACKSIZE 512 + +#define FZERO 312+192(SP) + +#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ + +#define M r3 +#define N r4 +#define K r5 + + +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 + + + +#define o0 0 +#define alpha_r vs30 +#define alpha_i vs31 + +#define VECSAVE r11 + +#define FRAMEPOINTER r12 + +#define T10 r14 + +#define L r15 +#define T8 r16 +#define T5 r17 +#define T2 r19 +#define TEMP_REG r20 +#define T6 r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define T7 r27 +#define T3 r28 +#define T4 r29 + +#define PRE r30 +#define T1 r31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + mr FRAMEPOINTER, SP + addi SP, SP, -STACKSIZE + mflr r0 + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + xxspltd alpha_r,vs1,0 /*copy from register f1 */ + xxspltd alpha_i,vs2,0 /*copy from register f2 */ + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + + stxv vs52, 288(SP) + stxv vs53, 304(SP) + stxv vs54, 320(SP) + stxv vs55, 336(SP) + stxv vs56, 352(SP) + stxv vs57, 368(SP) + stxv vs58, 384(SP) + stxv vs59, 400(SP) + stxv vs60, 416(SP) + stxv vs61, 432(SP) + stxv vs62, 448(SP) + stxv vs63, 464(SP) 
+ + std r0, FLINK_SAVE(SP) + + +#if defined(linux) || defined(__FreeBSD__) + ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) +#endif + + +#ifdef TRMMKERNEL +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) + ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) +#endif +#endif + + +#include "zgemm_macros_power9.S" + + + + slwi LDC, LDC, ZBASE_SHIFT + li PRE, 512 + li r0, 0 + + +#if defined(CC) || defined(CR) || defined(RC) || defined(RR) +/*negate for this case as we will use addition -1*(a+b) */ + xvnegdp alpha_r,alpha_r + xvnegdp alpha_i,alpha_i +#endif + .align 4 + +#include "zgemm_logic_power9.S" + +L999: + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + ld r0, FLINK_SAVE(SP) + + lxv vs52, 288(SP) + lxv vs53, 304(SP) + lxv vs54, 320(SP) + lxv vs55, 336(SP) + lxv vs56, 352(SP) + lxv vs57, 368(SP) + lxv vs58, 384(SP) + lxv vs59, 400(SP) + mtlr r0 + lxv vs60, 416(SP) + lxv vs61, 432(SP) + lxv vs62, 448(SP) + lxv vs63, 464(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE #endif \ No newline at end of file diff --git a/kernel/power/zgemm_logic_power9.S b/kernel/power/zgemm_logic_power9.S index fe5d8ade2..850b41aff 100644 --- a/kernel/power/zgemm_logic_power9.S +++ b/kernel/power/zgemm_logic_power9.S @@ -1,1891 +1,1891 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ -#define MY_ALIGN .align 3 -b ZGEMM_L2 -/* MINI SUBROUTINES */ -/* 2x8 MAIN 128x+2 LOOP */ - - -ZGEMM_L2x8_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD2x8_2 - MY_ALIGN -ZGEMM_L2x8_LOOP: -/*----------------------------------------*/ - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L2 256,64,0,0 -ZGEMM_L2x8_K128: -/*----------------------------------------*/ - KERNEL2x8_L2 256,64,1,0 - dcbt AO, T2 - KERNEL2x8_L2 256,64,2,0 - KERNEL2x8_L2 256,64,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L2 256,64,4,0 - KERNEL2x8_L2 256,64,5,0 - dcbt AO, T4 - KERNEL2x8_L2 256,64,6,0 - KERNEL2x8_L2 256,64,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL2x8_L2 256,64,8,0 - KERNEL2x8_L2 256,64,9,0 - KERNEL2x8_L2 256,64,10,0 - KERNEL2x8_L2 256,64,11,0 - dcbt BO, T4 - KERNEL2x8_L2 256,64,12,0 - KERNEL2x8_L2 256,64,13,0 - KERNEL2x8_L2 256,64,14,0 - KERNEL2x8_L2 256,64,15,0 - KERNEL2x8_L2 256,64,16,0 - KERNEL2x8_L2 256,64,17,0 - KERNEL2x8_L2 256,64,18,0 - KERNEL2x8_L2 256,64,19,0 - KERNEL2x8_L2 256,64,20,0 - KERNEL2x8_L2 256,64,21,0 - KERNEL2x8_L2 256,64,22,0 - KERNEL2x8_L2 256,64,23,0 - KERNEL2x8_L2 256,64,24,0 - KERNEL2x8_L2 256,64,25,0 - KERNEL2x8_L2 256,64,26,0 - KERNEL2x8_L2 256,64,27,0 - KERNEL2x8_L2 256,64,28,0 - KERNEL2x8_L2 256,64,29,0 - KERNEL2x8_L2 256,64,30,0 - KERNEL2x8_L2 256,64,31,0 - KERNEL2x8_L2 256,64,32,0 - KERNEL2x8_L2 256,64,33,0 - KERNEL2x8_L2 256,64,34,0 - KERNEL2x8_L2 256,64,35,0 - KERNEL2x8_L2 256,64,36,0 - KERNEL2x8_L2 256,64,37,0 - KERNEL2x8_L2 256,64,38,0 - KERNEL2x8_L2 256,64,39,0 - KERNEL2x8_L2 256,64,40,0 - KERNEL2x8_L2 256,64,41,0 - KERNEL2x8_L2 256,64,42,0 - KERNEL2x8_L2 256,64,43,0 - KERNEL2x8_L2 256,64,44,0 - KERNEL2x8_L2 256,64,45,0 - KERNEL2x8_L2 256,64,46,0 - KERNEL2x8_L2 256,64,47,0 - KERNEL2x8_L2 256,64,48,0 - KERNEL2x8_L2 256,64,49,0 - KERNEL2x8_L2 256,64,50,0 - KERNEL2x8_L2 256,64,51,0 - KERNEL2x8_L2 256,64,52,0 - KERNEL2x8_L2 256,64,53,0 - KERNEL2x8_L2 256,64,54,0 - KERNEL2x8_L2 256,64,55,0 - KERNEL2x8_L2 256,64,56,0 - KERNEL2x8_L2 256,64,57,0 - KERNEL2x8_L2 256,64,58,0 - KERNEL2x8_L2 256,64,59,0 - KERNEL2x8_L2 256,64,60,0 - KERNEL2x8_L2 256,64,61,0 - KERNEL2x8_L2 256,64,62,0 - KERNEL2x8_L2 256,64,63,1 - bdnz ZGEMM_L2x8_LOOP - MY_ALIGN -ZGEMM_L2x8_LOOP_END: -/*----------------------------------------*/ - END2x8_2 - blr - MY_ALIGN - - -ZGEMM_2x8_L64_SUB: -/*----------------------------------------*/ - LOAD2x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L2 256,64,0,0 - KERNEL2x8_L2 256,64,1,0 - dcbt AO, T2 - KERNEL2x8_L2 256,64,2,0 - KERNEL2x8_L2 256,64,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L2 256,64,4,0 - KERNEL2x8_L2 256,64,5,0 - dcbt AO, T4 - KERNEL2x8_L2 256,64,6,0 - KERNEL2x8_L2 256,64,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL2x8_L2 256,64,8,0 - KERNEL2x8_L2 256,64,9,0 - KERNEL2x8_L2 256,64,10,0 - KERNEL2x8_L2 256,64,11,0 - dcbt BO, T4 - KERNEL2x8_L2 256,64,12,0 - KERNEL2x8_L2 256,64,13,0 - KERNEL2x8_L2 256,64,14,0 - KERNEL2x8_L2 256,64,15,0 - KERNEL2x8_L2 256,64,16,0 - KERNEL2x8_L2 256,64,17,0 - KERNEL2x8_L2 256,64,18,0 - KERNEL2x8_L2 256,64,19,0 - KERNEL2x8_L2 256,64,20,0 - KERNEL2x8_L2 256,64,21,0 - KERNEL2x8_L2 256,64,22,0 - KERNEL2x8_L2 256,64,23,0 - KERNEL2x8_L2 256,64,24,0 - KERNEL2x8_L2 256,64,25,0 - KERNEL2x8_L2 256,64,26,0 - KERNEL2x8_L2 256,64,27,0 - KERNEL2x8_L2 256,64,28,0 - KERNEL2x8_L2 256,64,29,0 - KERNEL2x8_L2 256,64,30,0 - KERNEL2x8_E2 256,64,31,1 - blr - MY_ALIGN - - -ZGEMM_2x8_L32_SUB: -/*----------------------------------------*/ - LOAD2x8_2 - dcbt 
AO, PRE - dcbt BO, PRE - KERNEL2x8_L2 256,64,0,0 - KERNEL2x8_L2 256,64,1,0 - dcbt AO, T2 - KERNEL2x8_L2 256,64,2,0 - KERNEL2x8_L2 256,64,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L2 256,64,4,0 - KERNEL2x8_L2 256,64,5,0 - dcbt AO, T4 - KERNEL2x8_L2 256,64,6,0 - KERNEL2x8_L2 256,64,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL2x8_L2 256,64,8,0 - KERNEL2x8_L2 256,64,9,0 - KERNEL2x8_L2 256,64,10,0 - KERNEL2x8_L2 256,64,11,0 - dcbt BO, T4 - KERNEL2x8_L2 256,64,12,0 - KERNEL2x8_L2 256,64,13,0 - KERNEL2x8_L2 256,64,14,0 - KERNEL2x8_E2 256,64,15,1 - blr - MY_ALIGN - - -ZGEMM_2x8_L16_SUB: -/*----------------------------------------*/ - LOAD2x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L2 256,64,0,0 - KERNEL2x8_L2 256,64,1,0 - dcbt AO, T2 - KERNEL2x8_L2 256,64,2,0 - KERNEL2x8_L2 256,64,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L2 256,64,4,0 - KERNEL2x8_L2 256,64,5,0 - dcbt AO, T4 - KERNEL2x8_L2 256,64,6,0 - KERNEL2x8_E2 256,64,7,1 - blr - MY_ALIGN - - -ZGEMM_2x4_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD2x4_2 - MY_ALIGN -ZGEMM_L2x4_LOOP: -/*----------------------------------------*/ - KERNEL2x4_L2 128,64,0,0 -ZGEMM_L2x4_K32: -/*----------------------------------------*/ - KERNEL2x4_L2 128,64,1,0 - KERNEL2x4_L2 128,64,2,0 - KERNEL2x4_L2 128,64,3,0 - KERNEL2x4_L2 128,64,4,0 - KERNEL2x4_L2 128,64,5,0 - KERNEL2x4_L2 128,64,6,0 - KERNEL2x4_L2 128,64,7,0 - KERNEL2x4_L2 128,64,8,0 - KERNEL2x4_L2 128,64,9,0 - KERNEL2x4_L2 128,64,10,0 - KERNEL2x4_L2 128,64,11,0 - KERNEL2x4_L2 128,64,12,0 - KERNEL2x4_L2 128,64,13,0 - KERNEL2x4_L2 128,64,14,0 - KERNEL2x4_L2 128,64,15,1 - bdnz ZGEMM_L2x4_LOOP - MY_ALIGN -ZGEMM_L2x4_LOOP_END: -/*----------------------------------------*/ - END2x4_2 - blr - MY_ALIGN - - -ZGEMM_2x4_L16_SUB: -/*----------------------------------------*/ - LOAD2x4_2 - KERNEL2x4_L2 128,64,0,0 - KERNEL2x4_L2 128,64,1,0 - KERNEL2x4_L2 128,64,2,0 - KERNEL2x4_L2 128,64,3,0 - KERNEL2x4_L2 128,64,4,0 - KERNEL2x4_L2 128,64,5,0 - KERNEL2x4_L2 128,64,6,0 - KERNEL2x4_E2 128,64,7,1 - blr - MY_ALIGN - - -ZGEMM_2x4_L8_SUB: -/*----------------------------------------*/ - LOAD2x4_2 - KERNEL2x4_L2 128,64,0,0 - KERNEL2x4_L2 128,64,1,0 - KERNEL2x4_L2 128,64,2,0 - KERNEL2x4_E2 128,64,3,1 - blr - - -ZGEMM_2x2_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD2x2_2 - MY_ALIGN -ZGEMM_L2x2_LOOP: -/*----------------------------------------*/ - KERNEL2x2_L2 64,64,0,0 -ZGEMM_L2x2_K32: -/*----------------------------------------*/ - KERNEL2x2_L2 64,64,1,0 - KERNEL2x2_L2 64,64,2,0 - KERNEL2x2_L2 64,64,3,0 - KERNEL2x2_L2 64,64,4,0 - KERNEL2x2_L2 64,64,5,0 - KERNEL2x2_L2 64,64,6,0 - KERNEL2x2_L2 64,64,7,0 - KERNEL2x2_L2 64,64,8,0 - KERNEL2x2_L2 64,64,9,0 - KERNEL2x2_L2 64,64,10,0 - KERNEL2x2_L2 64,64,11,0 - KERNEL2x2_L2 64,64,12,0 - KERNEL2x2_L2 64,64,13,0 - KERNEL2x2_L2 64,64,14,0 - KERNEL2x2_L2 64,64,15,1 - bdnz ZGEMM_L2x2_LOOP - MY_ALIGN - - -ZGEMM_L2x2_LOOP_END: -/*----------------------------------------*/ - END2x2_2 - blr - MY_ALIGN -ZGEMM_2x2_L16_SUB: -/*----------------------------------------*/ - LOAD2x2_2 - KERNEL2x2_L2 64,64,0,0 - KERNEL2x2_L2 64,64,1,0 - KERNEL2x2_L2 64,64,2,0 - KERNEL2x2_L2 64,64,3,0 - KERNEL2x2_L2 64,64,4,0 - KERNEL2x2_L2 64,64,5,0 - KERNEL2x2_L2 64,64,6,0 - KERNEL2x2_E2 64,64,7,1 - blr - MY_ALIGN -ZGEMM_2x2_L8_SUB: -/*----------------------------------------*/ - LOAD2x2_2 - KERNEL2x2_L2 64,64,0,0 - KERNEL2x2_L2 64,64,1,0 - KERNEL2x2_L2 64,64,2,0 - KERNEL2x2_E2 64,64,3,1 - blr - - -ZGEMM_2x1_LMAIN_SUB: 
-/*----------------------------------------*/ - mtctr T8 - LOAD2x1_2 - MY_ALIGN -ZGEMM_L2x1_LOOP: -/*----------------------------------------*/ - KERNEL2x1_L2 32,64,0,0 -ZGEMM_L2x1_K32: -/*----------------------------------------*/ - KERNEL2x1_L2 32,64,1,0 - KERNEL2x1_L2 32,64,2,0 - KERNEL2x1_L2 32,64,3,0 - KERNEL2x1_L2 32,64,4,0 - KERNEL2x1_L2 32,64,5,0 - KERNEL2x1_L2 32,64,6,0 - KERNEL2x1_L2 32,64,7,0 - KERNEL2x1_L2 32,64,8,0 - KERNEL2x1_L2 32,64,9,0 - KERNEL2x1_L2 32,64,10,0 - KERNEL2x1_L2 32,64,11,0 - KERNEL2x1_L2 32,64,12,0 - KERNEL2x1_L2 32,64,13,0 - KERNEL2x1_L2 32,64,14,0 - KERNEL2x1_L2 32,64,15,1 - bdnz ZGEMM_L2x1_LOOP - MY_ALIGN -ZGEMM_L2x1_LOOP_END: -/*----------------------------------------*/ - END2x1_2 - blr - - MY_ALIGN -ZGEMM_2x1_L16_SUB: -/*----------------------------------------*/ - LOAD2x1_2 - KERNEL2x1_L2 32,64,0,0 - KERNEL2x1_L2 32,64,1,0 - KERNEL2x1_L2 32,64,2,0 - KERNEL2x1_L2 32,64,3,0 - KERNEL2x1_L2 32,64,4,0 - KERNEL2x1_L2 32,64,5,0 - KERNEL2x1_L2 32,64,6,0 - KERNEL2x1_E2 32,64,7,1 - blr - MY_ALIGN - - -ZGEMM_2x1_L8_SUB: -/*----------------------------------------*/ - LOAD2x1_2 - KERNEL2x1_L2 32,64,0,0 - KERNEL2x1_L2 32,64,1,0 - KERNEL2x1_L2 32,64,2,0 - KERNEL2x1_E2 32,64,3,1 - blr - - - -/* MAIN LOOP BEGINS */ - MY_ALIGN - - -ZGEMM_L2: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) && !defined(LEFT) - neg TEMP_REG, OFFSET -#endif - srawi. J, N, 1 - ble ZGEMM_L2_END - - -ZGEMM_L2_BEGIN: -/*----------------------------------------*/ - mr CO, C - slwi T1, LDC , 1 - add T2,C,LDC - mr AO, A - add C, C, T1 -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. I, M, 3 - ble ZGEMM_L2x8_END - dcbt CO,r0 /*just prefetch*/ - dcbt T2,r0 - - -ZGEMM_L2x8_BEGIN: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 -#else - mr BO, B - dcbt B, r0 -#endif - dcbt AO, r0 -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,8,2 - mr T1, T6 -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(T11-2) % 128x */ -#else - mr T1, K -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(K-2) % 128x */ -#endif - ZERO2x8 - ble ZGEMM_L2x8_SUB0 - bl ZGEMM_L2x8_LMAIN_SUB - andi. L, T1, 127 - ble ZGEMM_L2x8_SAVE - b ZGEMM_L2x8_SUB2 - - -ZGEMM_L2x8_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 255 - cmpwi T6,129 -#else - andi. L, K, 255 - cmpwi K,129 -#endif - li T8,1 - bne CMP2x8_128K - addi BO,BO,-32 - addi AO,AO,-128 - LOAD2x8O 128,32 - END2x8_WITHOUT_ADD - LOAD2x8_2O 256, 64 - mtctr T8 - bl ZGEMM_L2x8_K128 - b ZGEMM_L2x8_SAVE - CMP2x8_128K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,128 -#else - cmpwi K,128 -#endif - bne ZGEMM_L2x8_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-64 - addi AO,AO,-256 - LOAD2x8_2O 256,64 - bl ZGEMM_L2x8_K128 - b ZGEMM_L2x8_SAVE - MY_ALIGN - - -ZGEMM_L2x8_SUB2: -/*----------------------------------------*/ - andi. T1,L, 64 - ble ZGEMM_L2x8_SUB2_32 - bl ZGEMM_2x8_L64_SUB - MY_ALIGN - - -ZGEMM_L2x8_SUB2_32: -/*----------------------------------------*/ - andi. T1,L, 32 - ble ZGEMM_L2x8_SUB2_16 - bl ZGEMM_2x8_L32_SUB - MY_ALIGN - - -ZGEMM_L2x8_SUB2_16: -/*----------------------------------------*/ - andi. 
T1,L, 16 - ble ZGEMM_L2x8_SUB2_8 - bl ZGEMM_2x8_L16_SUB - MY_ALIGN - - -ZGEMM_L2x8_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble ZGEMM_L2x8_SUB2_4 - LOAD2x8_2 - KERNEL2x8_L2 256,64, 0,0 - KERNEL2x8_L2 256,64, 1,0 - KERNEL2x8_L2 256,64, 2,0 - KERNEL2x8_E2 256,64, 3,1 - MY_ALIGN - - -ZGEMM_L2x8_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble ZGEMM_L2x8_SUB2_2 - LOAD2x8_2 - KERNEL2x8_L2 256,64, 0,0 - KERNEL2x8_E2 256,64, 1,1 - MY_ALIGN - - -ZGEMM_L2x8_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble ZGEMM_L2x8_SUB2_1 - LOAD2x8_2 - KERNEL2x8_E2 256,64, 0,1 - MY_ALIGN - - -ZGEMM_L2x8_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble ZGEMM_L2x8_SAVE - KERNEL2x8 - - -ZGEMM_L2x8_SAVE: -/*----------------------------------------*/ - addic. I, I, -1 - SAVE2x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,2 -#endif - bgt ZGEMM_L2x8_BEGIN - andi. T2, M, 7 - ble ZGEMM_L2x1_END - andi. T1, M, 4 - ble ZGEMM_L2x4_END - b ZGEMM_L2x4_BEGIN - MY_ALIGN - - -ZGEMM_L2x8_END: -/*----------------------------------------*/ - - -ZGEMM_L2x4_BEGIN: -/*----------------------------------------*/ - andi. T2, M, 7 - ble ZGEMM_L2x1_END - andi. T1, M, 4 - ble ZGEMM_L2x4_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,4,2 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T11-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO2x4 - ble ZGEMM_L2x4_SUB0 - bl ZGEMM_2x4_LMAIN_SUB - andi. L, T1, 31 - ble ZGEMM_L2x4_SAVE - b ZGEMM_L2x4_SUB2 - - -ZGEMM_L2x4_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP2x4_32K - addi BO,BO,-32 - addi AO,AO,-64 - LOAD2x4O 64,32 - END2x4_WITHOUT_ADD - LOAD2x4_2O 128, 64 - mtctr T8 - bl ZGEMM_L2x4_K32 - b ZGEMM_L2x4_SAVE - CMP2x4_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne ZGEMM_L2x4_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-64 - addi AO,AO,-128 - LOAD2x4_2O 128,64 - bl ZGEMM_L2x4_K32 - b ZGEMM_L2x4_SAVE - MY_ALIGN - MY_ALIGN - - -ZGEMM_L2x4_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble ZGEMM_L2x4_SUB2_8 - bl ZGEMM_2x4_L16_SUB - MY_ALIGN - - -ZGEMM_L2x4_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble ZGEMM_L2x4_SUB2_4 - bl ZGEMM_2x4_L8_SUB - MY_ALIGN - - -ZGEMM_L2x4_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble ZGEMM_L2x4_SUB2_2 - LOAD2x4_2 - KERNEL2x4_L2 128,64, 0,0 - KERNEL2x4_E2 128,64, 1,1 - MY_ALIGN - - -ZGEMM_L2x4_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble ZGEMM_L2x4_SUB2_1 - LOAD2x4_2 - KERNEL2x4_E2 128,64, 0,1 - MY_ALIGN - - -ZGEMM_L2x4_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble ZGEMM_L2x4_SAVE - KERNEL2x4 - - -ZGEMM_L2x4_SAVE: -/*----------------------------------------*/ - SAVE2x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,2 -#endif - - -ZGEMM_L2x4_END: -/*----------------------------------------*/ - - -ZGEMM_L2x2_BEGIN: -/*----------------------------------------*/ - andi. 
T1, M, 2 - ble ZGEMM_L2x2_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,2,2 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T11-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO2x2 - ble ZGEMM_L2x2_SUB0 - bl ZGEMM_2x2_LMAIN_SUB - andi. L, T1, 31 - ble ZGEMM_L2x2_SAVE - b ZGEMM_L2x2_SUB2 - - -ZGEMM_L2x2_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP2x2_32K - addi BO,BO,-32 - addi AO,AO,-32 - LOAD2x2O 32,32 - END2x2_WITHOUT_ADD - LOAD2x2_2O 64, 64 - mtctr T8 - bl ZGEMM_L2x2_K32 - b ZGEMM_L2x2_SAVE - CMP2x2_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne ZGEMM_L2x2_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-64 - addi AO,AO,-64 - LOAD2x2_2O 64,64 - bl ZGEMM_L2x2_K32 - b ZGEMM_L2x2_SAVE - MY_ALIGN - MY_ALIGN - - -ZGEMM_L2x2_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble ZGEMM_L2x2_SUB2_8 - bl ZGEMM_2x2_L16_SUB - MY_ALIGN - - -ZGEMM_L2x2_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble ZGEMM_L2x2_SUB2_4 - bl ZGEMM_2x2_L8_SUB - MY_ALIGN - - -ZGEMM_L2x2_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble ZGEMM_L2x2_SUB2_2 - LOAD2x2_2 - KERNEL2x2_L2 64,64, 0,0 - KERNEL2x2_E2 64,64, 1,1 - MY_ALIGN - - -ZGEMM_L2x2_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble ZGEMM_L2x2_SUB2_1 - LOAD2x2_2 - KERNEL2x2_E2 64,64, 0,1 - MY_ALIGN - - -ZGEMM_L2x2_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble ZGEMM_L2x2_SAVE - KERNEL2x2 - - -ZGEMM_L2x2_SAVE: -/*----------------------------------------*/ - SAVE2x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,2 -#endif - - -ZGEMM_L2x2_END: -/*----------------------------------------*/ - - -ZGEMM_L2x1_BEGIN: -/*----------------------------------------*/ - andi. T1, M, 1 - ble ZGEMM_L2x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,1,2 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T11-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO2x1 - ble ZGEMM_L2x1_SUB0 - bl ZGEMM_2x1_LMAIN_SUB - andi. L, T1, 31 - ble ZGEMM_L2x1_SAVE - b ZGEMM_L2x1_SUB2 - - -ZGEMM_L2x1_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP2x1_32K - addi BO,BO,-32 - addi AO,AO,-16 - LOAD2x1O 16,32 - END2x1_WITHOUT_ADD - LOAD2x1_2O 32, 64 - mtctr T8 - bl ZGEMM_L2x1_K32 - b ZGEMM_L2x1_SAVE - CMP2x1_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne ZGEMM_L2x1_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-64 - addi AO,AO,-32 - LOAD2x1_2O 32,64 - bl ZGEMM_L2x1_K32 - b ZGEMM_L2x1_SAVE - MY_ALIGN - MY_ALIGN - - -ZGEMM_L2x1_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble ZGEMM_L2x1_SUB2_8 - bl ZGEMM_2x1_L16_SUB - MY_ALIGN - - -ZGEMM_L2x1_SUB2_8: -/*----------------------------------------*/ - andi. 
T1,L, 8 - ble ZGEMM_L2x1_SUB2_4 - bl ZGEMM_2x1_L8_SUB - MY_ALIGN - - -ZGEMM_L2x1_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble ZGEMM_L2x1_SUB2_2 - LOAD2x1_2 - KERNEL2x1_L2 32,64, 0,0 - KERNEL2x1_E2 32,64, 1,1 - MY_ALIGN - - -ZGEMM_L2x1_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble ZGEMM_L2x1_SUB2_1 - LOAD2x1_2 - KERNEL2x1_E2 32,64, 0,1 - MY_ALIGN - - -ZGEMM_L2x1_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble ZGEMM_L2x1_SAVE - KERNEL2x1 - - -ZGEMM_L2x1_SAVE: -/*----------------------------------------*/ - SAVE2x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,2 -#endif - - -ZGEMM_L2x1_END: -/*----------------------------------------*/ - slwi T1, K, 5 - addic. J, J, -1 - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 2 -#endif - bgt ZGEMM_L2_BEGIN - - -ZGEMM_L2_END: - -b ZGEMM_L1 -/* MINI SUBROUTINES */ -/* 1x8 MAIN 128x+2 LOOP */ - - -ZGEMM_L1x8_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD1x8_2 - MY_ALIGN -ZGEMM_L1x8_LOOP: -/*----------------------------------------*/ - dcbt AO, PRE - dcbt BO, PRE - KERNEL1x8_L2 256,32,0,0 -ZGEMM_L1x8_K128: -/*----------------------------------------*/ - KERNEL1x8_L2 256,32,1,0 - dcbt AO, T2 - KERNEL1x8_L2 256,32,2,0 - KERNEL1x8_L2 256,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL1x8_L2 256,32,4,0 - KERNEL1x8_L2 256,32,5,0 - dcbt AO, T4 - KERNEL1x8_L2 256,32,6,0 - KERNEL1x8_L2 256,32,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL1x8_L2 256,32,8,0 - KERNEL1x8_L2 256,32,9,0 - KERNEL1x8_L2 256,32,10,0 - KERNEL1x8_L2 256,32,11,0 - dcbt BO, T4 - KERNEL1x8_L2 256,32,12,0 - KERNEL1x8_L2 256,32,13,0 - KERNEL1x8_L2 256,32,14,0 - KERNEL1x8_L2 256,32,15,0 - KERNEL1x8_L2 256,32,16,0 - KERNEL1x8_L2 256,32,17,0 - KERNEL1x8_L2 256,32,18,0 - KERNEL1x8_L2 256,32,19,0 - KERNEL1x8_L2 256,32,20,0 - KERNEL1x8_L2 256,32,21,0 - KERNEL1x8_L2 256,32,22,0 - KERNEL1x8_L2 256,32,23,0 - KERNEL1x8_L2 256,32,24,0 - KERNEL1x8_L2 256,32,25,0 - KERNEL1x8_L2 256,32,26,0 - KERNEL1x8_L2 256,32,27,0 - KERNEL1x8_L2 256,32,28,0 - KERNEL1x8_L2 256,32,29,0 - KERNEL1x8_L2 256,32,30,0 - KERNEL1x8_L2 256,32,31,0 - KERNEL1x8_L2 256,32,32,0 - KERNEL1x8_L2 256,32,33,0 - KERNEL1x8_L2 256,32,34,0 - KERNEL1x8_L2 256,32,35,0 - KERNEL1x8_L2 256,32,36,0 - KERNEL1x8_L2 256,32,37,0 - KERNEL1x8_L2 256,32,38,0 - KERNEL1x8_L2 256,32,39,0 - KERNEL1x8_L2 256,32,40,0 - KERNEL1x8_L2 256,32,41,0 - KERNEL1x8_L2 256,32,42,0 - KERNEL1x8_L2 256,32,43,0 - KERNEL1x8_L2 256,32,44,0 - KERNEL1x8_L2 256,32,45,0 - KERNEL1x8_L2 256,32,46,0 - KERNEL1x8_L2 256,32,47,0 - KERNEL1x8_L2 256,32,48,0 - KERNEL1x8_L2 256,32,49,0 - KERNEL1x8_L2 256,32,50,0 - KERNEL1x8_L2 256,32,51,0 - KERNEL1x8_L2 256,32,52,0 - KERNEL1x8_L2 256,32,53,0 - KERNEL1x8_L2 256,32,54,0 - KERNEL1x8_L2 256,32,55,0 - KERNEL1x8_L2 256,32,56,0 - KERNEL1x8_L2 256,32,57,0 - KERNEL1x8_L2 256,32,58,0 - KERNEL1x8_L2 256,32,59,0 - KERNEL1x8_L2 256,32,60,0 - KERNEL1x8_L2 256,32,61,0 - KERNEL1x8_L2 256,32,62,0 - KERNEL1x8_L2 256,32,63,1 - bdnz ZGEMM_L1x8_LOOP - MY_ALIGN -ZGEMM_L1x8_LOOP_END: -/*----------------------------------------*/ - END1x8_2 - blr - MY_ALIGN - - -ZGEMM_1x8_L64_SUB: -/*----------------------------------------*/ - LOAD1x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL1x8_L2 256,32,0,0 - KERNEL1x8_L2 256,32,1,0 - dcbt AO, T2 - KERNEL1x8_L2 256,32,2,0 - KERNEL1x8_L2 256,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL1x8_L2 256,32,4,0 - KERNEL1x8_L2 256,32,5,0 - dcbt AO, T4 - KERNEL1x8_L2 256,32,6,0 - 
KERNEL1x8_L2 256,32,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL1x8_L2 256,32,8,0 - KERNEL1x8_L2 256,32,9,0 - KERNEL1x8_L2 256,32,10,0 - KERNEL1x8_L2 256,32,11,0 - dcbt BO, T4 - KERNEL1x8_L2 256,32,12,0 - KERNEL1x8_L2 256,32,13,0 - KERNEL1x8_L2 256,32,14,0 - KERNEL1x8_L2 256,32,15,0 - KERNEL1x8_L2 256,32,16,0 - KERNEL1x8_L2 256,32,17,0 - KERNEL1x8_L2 256,32,18,0 - KERNEL1x8_L2 256,32,19,0 - KERNEL1x8_L2 256,32,20,0 - KERNEL1x8_L2 256,32,21,0 - KERNEL1x8_L2 256,32,22,0 - KERNEL1x8_L2 256,32,23,0 - KERNEL1x8_L2 256,32,24,0 - KERNEL1x8_L2 256,32,25,0 - KERNEL1x8_L2 256,32,26,0 - KERNEL1x8_L2 256,32,27,0 - KERNEL1x8_L2 256,32,28,0 - KERNEL1x8_L2 256,32,29,0 - KERNEL1x8_L2 256,32,30,0 - KERNEL1x8_E2 256,32,31,1 - blr - MY_ALIGN - - -ZGEMM_1x8_L32_SUB: -/*----------------------------------------*/ - LOAD1x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL1x8_L2 256,32,0,0 - KERNEL1x8_L2 256,32,1,0 - dcbt AO, T2 - KERNEL1x8_L2 256,32,2,0 - KERNEL1x8_L2 256,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL1x8_L2 256,32,4,0 - KERNEL1x8_L2 256,32,5,0 - dcbt AO, T4 - KERNEL1x8_L2 256,32,6,0 - KERNEL1x8_L2 256,32,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL1x8_L2 256,32,8,0 - KERNEL1x8_L2 256,32,9,0 - KERNEL1x8_L2 256,32,10,0 - KERNEL1x8_L2 256,32,11,0 - dcbt BO, T4 - KERNEL1x8_L2 256,32,12,0 - KERNEL1x8_L2 256,32,13,0 - KERNEL1x8_L2 256,32,14,0 - KERNEL1x8_E2 256,32,15,1 - blr - MY_ALIGN - - -ZGEMM_1x8_L16_SUB: -/*----------------------------------------*/ - LOAD1x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL1x8_L2 256,32,0,0 - KERNEL1x8_L2 256,32,1,0 - dcbt AO, T2 - KERNEL1x8_L2 256,32,2,0 - KERNEL1x8_L2 256,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL1x8_L2 256,32,4,0 - KERNEL1x8_L2 256,32,5,0 - dcbt AO, T4 - KERNEL1x8_L2 256,32,6,0 - KERNEL1x8_E2 256,32,7,1 - blr - MY_ALIGN - - -ZGEMM_1x4_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD1x4_2 - MY_ALIGN - - -ZGEMM_L1x4_LOOP: -/*----------------------------------------*/ - KERNEL1x4_L2 128,32,0,0 - - -ZGEMM_L1x4_K32: -/*----------------------------------------*/ - KERNEL1x4_L2 128,32,1,0 - KERNEL1x4_L2 128,32,2,0 - KERNEL1x4_L2 128,32,3,0 - KERNEL1x4_L2 128,32,4,0 - KERNEL1x4_L2 128,32,5,0 - KERNEL1x4_L2 128,32,6,0 - KERNEL1x4_L2 128,32,7,0 - KERNEL1x4_L2 128,32,8,0 - KERNEL1x4_L2 128,32,9,0 - KERNEL1x4_L2 128,32,10,0 - KERNEL1x4_L2 128,32,11,0 - KERNEL1x4_L2 128,32,12,0 - KERNEL1x4_L2 128,32,13,0 - KERNEL1x4_L2 128,32,14,0 - KERNEL1x4_L2 128,32,15,1 - bdnz ZGEMM_L1x4_LOOP - MY_ALIGN - - -ZGEMM_L1x4_LOOP_END: -/*----------------------------------------*/ - END1x4_2 - blr - MY_ALIGN - - -ZGEMM_1x4_L16_SUB: -/*----------------------------------------*/ - LOAD1x4_2 - KERNEL1x4_L2 128,32,0,0 - KERNEL1x4_L2 128,32,1,0 - KERNEL1x4_L2 128,32,2,0 - KERNEL1x4_L2 128,32,3,0 - KERNEL1x4_L2 128,32,4,0 - KERNEL1x4_L2 128,32,5,0 - KERNEL1x4_L2 128,32,6,0 - KERNEL1x4_E2 128,32,7,1 - blr - MY_ALIGN - - -ZGEMM_1x4_L8_SUB: -/*----------------------------------------*/ - LOAD1x4_2 - KERNEL1x4_L2 128,32,0,0 - KERNEL1x4_L2 128,32,1,0 - KERNEL1x4_L2 128,32,2,0 - KERNEL1x4_E2 128,32,3,1 - blr - - -ZGEMM_1x2_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD1x2_2 - MY_ALIGN - - -ZGEMM_L1x2_LOOP: -/*----------------------------------------*/ - KERNEL1x2_L2 64,32,0,0 - - -ZGEMM_L1x2_K32: -/*----------------------------------------*/ - KERNEL1x2_L2 64,32,1,0 - KERNEL1x2_L2 64,32,2,0 - KERNEL1x2_L2 64,32,3,0 - KERNEL1x2_L2 64,32,4,0 - KERNEL1x2_L2 64,32,5,0 - KERNEL1x2_L2 64,32,6,0 - KERNEL1x2_L2 64,32,7,0 - KERNEL1x2_L2 64,32,8,0 - KERNEL1x2_L2 
64,32,9,0 - KERNEL1x2_L2 64,32,10,0 - KERNEL1x2_L2 64,32,11,0 - KERNEL1x2_L2 64,32,12,0 - KERNEL1x2_L2 64,32,13,0 - KERNEL1x2_L2 64,32,14,0 - KERNEL1x2_L2 64,32,15,1 - bdnz ZGEMM_L1x2_LOOP - MY_ALIGN - - -ZGEMM_L1x2_LOOP_END: -/*----------------------------------------*/ - END1x2_2 - blr - MY_ALIGN - - -ZGEMM_1x2_L16_SUB: -/*----------------------------------------*/ - LOAD1x2_2 - KERNEL1x2_L2 64,32,0,0 - KERNEL1x2_L2 64,32,1,0 - KERNEL1x2_L2 64,32,2,0 - KERNEL1x2_L2 64,32,3,0 - KERNEL1x2_L2 64,32,4,0 - KERNEL1x2_L2 64,32,5,0 - KERNEL1x2_L2 64,32,6,0 - KERNEL1x2_E2 64,32,7,1 - blr - MY_ALIGN - - -ZGEMM_1x2_L8_SUB: -/*----------------------------------------*/ - LOAD1x2_2 - KERNEL1x2_L2 64,32,0,0 - KERNEL1x2_L2 64,32,1,0 - KERNEL1x2_L2 64,32,2,0 - KERNEL1x2_E2 64,32,3,1 - blr - - -ZGEMM_1x1_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD1x1_2 - MY_ALIGN - - -ZGEMM_L1x1_LOOP: -/*----------------------------------------*/ - KERNEL1x1_L2 32,32,0,0 - - -ZGEMM_L1x1_K32: -/*----------------------------------------*/ - KERNEL1x1_L2 32,32,1,0 - KERNEL1x1_L2 32,32,2,0 - KERNEL1x1_L2 32,32,3,0 - KERNEL1x1_L2 32,32,4,0 - KERNEL1x1_L2 32,32,5,0 - KERNEL1x1_L2 32,32,6,0 - KERNEL1x1_L2 32,32,7,0 - KERNEL1x1_L2 32,32,8,0 - KERNEL1x1_L2 32,32,9,0 - KERNEL1x1_L2 32,32,10,0 - KERNEL1x1_L2 32,32,11,0 - KERNEL1x1_L2 32,32,12,0 - KERNEL1x1_L2 32,32,13,0 - KERNEL1x1_L2 32,32,14,0 - KERNEL1x1_L2 32,32,15,1 - bdnz ZGEMM_L1x1_LOOP - MY_ALIGN - - -ZGEMM_L1x1_LOOP_END: -/*----------------------------------------*/ - END1x1_2 - blr - MY_ALIGN - - -ZGEMM_1x1_L16_SUB: -/*----------------------------------------*/ - LOAD1x1_2 - KERNEL1x1_L2 32,32,0,0 - KERNEL1x1_L2 32,32,1,0 - KERNEL1x1_L2 32,32,2,0 - KERNEL1x1_L2 32,32,3,0 - KERNEL1x1_L2 32,32,4,0 - KERNEL1x1_L2 32,32,5,0 - KERNEL1x1_L2 32,32,6,0 - KERNEL1x1_E2 32,32,7,1 - blr - MY_ALIGN - - -ZGEMM_1x1_L8_SUB: -/*----------------------------------------*/ - LOAD1x1_2 - KERNEL1x1_L2 32,32,0,0 - KERNEL1x1_L2 32,32,1,0 - KERNEL1x1_L2 32,32,2,0 - KERNEL1x1_E2 32,32,3,1 - blr - - -/*----------------------N1 BEGINS---------*/ -ZGEMM_L1: -/*----------------------------------------*/ - andi. T1, N, 1 - ble ZGEMM_L1_END - -ZGEMM_L1_BEGIN: -/*----------------------------------------*/ - mr CO, C - - add T2,C,LDC - mr AO, A - add C, C, T1 -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. I, M, 3 - ble ZGEMM_L1x8_END - dcbt CO,r0 /*just prefetch*/ - dcbt T2,r0 - - -ZGEMM_L1x8_BEGIN: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 -#else - mr BO, B - dcbt B, r0 -#endif - dcbt AO, r0 -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,8,1 - mr T1, T6 -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(T11-2) % 128x */ -#else - mr T1, K -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(K-2) % 128x */ -#endif - ZERO1x8 - ble ZGEMM_L1x8_SUB0 - bl ZGEMM_L1x8_LMAIN_SUB - andi. L, T1, 127 - ble ZGEMM_L1x8_SAVE - b ZGEMM_L1x8_SUB2 - - -ZGEMM_L1x8_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 255 - cmpwi T6,129 -#else - andi. 
L, K, 255 - cmpwi K,129 -#endif - li T8,1 - bne CMP1x8_128K - addi BO,BO,-16 - addi AO,AO,-128 - LOAD1x8O 128,16 - END1x8_WITHOUT_ADD - LOAD1x8_2O 256, 32 - mtctr T8 - bl ZGEMM_L1x8_K128 - b ZGEMM_L1x8_SAVE - CMP1x8_128K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,128 -#else - cmpwi K,128 -#endif - bne ZGEMM_L1x8_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-32 - addi AO,AO,-256 - LOAD1x8_2O 256,32 - bl ZGEMM_L1x8_K128 - b ZGEMM_L1x8_SAVE - MY_ALIGN - - -ZGEMM_L1x8_SUB2: -/*----------------------------------------*/ - andi. T1,L, 64 - ble ZGEMM_L1x8_SUB2_32 - bl ZGEMM_1x8_L64_SUB - MY_ALIGN - - -ZGEMM_L1x8_SUB2_32: -/*----------------------------------------*/ - andi. T1,L, 32 - ble ZGEMM_L1x8_SUB2_16 - bl ZGEMM_1x8_L32_SUB - MY_ALIGN - - -ZGEMM_L1x8_SUB2_16: -/*----------------------------------------*/ - andi. T1,L, 16 - ble ZGEMM_L1x8_SUB2_8 - bl ZGEMM_1x8_L16_SUB - MY_ALIGN - - -ZGEMM_L1x8_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble ZGEMM_L1x8_SUB2_4 - LOAD1x8_2 - KERNEL1x8_L2 256,32, 0,0 - KERNEL1x8_L2 256,32, 1,0 - KERNEL1x8_L2 256,32, 2,0 - KERNEL1x8_E2 256,32, 3,1 - MY_ALIGN - - -ZGEMM_L1x8_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble ZGEMM_L1x8_SUB2_2 - LOAD1x8_2 - KERNEL1x8_L2 256,32, 0,0 - KERNEL1x8_E2 256,32, 1,1 - MY_ALIGN - - -ZGEMM_L1x8_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble ZGEMM_L1x8_SUB2_1 - LOAD1x8_2 - KERNEL1x8_E2 256,32, 0,1 - MY_ALIGN - - -ZGEMM_L1x8_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble ZGEMM_L1x8_SAVE - KERNEL1x8 - - -ZGEMM_L1x8_SAVE: -/*----------------------------------------*/ - addic. I, I, -1 - SAVE1x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,1 -#endif - bgt ZGEMM_L1x8_BEGIN - andi. T2, M, 7 - ble ZGEMM_L1x1_END - andi. T1, M, 4 - ble ZGEMM_L1x4_END - b ZGEMM_L1x4_BEGIN - MY_ALIGN - - -ZGEMM_L1x8_END: -/*----------------------------------------*/ - - -ZGEMM_L1x4_BEGIN: -/*----------------------------------------*/ - andi. T2, M, 7 - ble ZGEMM_L1x1_END - andi. T1, M, 4 - ble ZGEMM_L1x4_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,4,1 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T11-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO1x4 - ble ZGEMM_L1x4_SUB0 - bl ZGEMM_1x4_LMAIN_SUB - andi. L, T1, 31 - ble ZGEMM_L1x4_SAVE - b ZGEMM_L1x4_SUB2 - - -ZGEMM_L1x4_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP1x4_32K - addi BO,BO,-16 - addi AO,AO,-64 - LOAD1x4O 64,16 - END1x4_WITHOUT_ADD - LOAD1x4_2O 128, 32 - mtctr T8 - bl ZGEMM_L1x4_K32 - b ZGEMM_L1x4_SAVE - CMP1x4_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne ZGEMM_L1x4_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-32 - addi AO,AO,-128 - LOAD1x4_2O 128,32 - bl ZGEMM_L1x4_K32 - b ZGEMM_L1x4_SAVE - MY_ALIGN - MY_ALIGN - - -ZGEMM_L1x4_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble ZGEMM_L1x4_SUB2_8 - bl ZGEMM_1x4_L16_SUB - MY_ALIGN - - -ZGEMM_L1x4_SUB2_8: -/*----------------------------------------*/ - andi. 
T1,L, 8 - ble ZGEMM_L1x4_SUB2_4 - bl ZGEMM_1x4_L8_SUB - MY_ALIGN - - -ZGEMM_L1x4_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble ZGEMM_L1x4_SUB2_2 - LOAD1x4_2 - KERNEL1x4_L2 128,32, 0,0 - KERNEL1x4_E2 128,32, 1,1 - MY_ALIGN - - -ZGEMM_L1x4_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble ZGEMM_L1x4_SUB2_1 - LOAD1x4_2 - KERNEL1x4_E2 128,32, 0,1 - MY_ALIGN - - -ZGEMM_L1x4_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble ZGEMM_L1x4_SAVE - KERNEL1x4 - - -ZGEMM_L1x4_SAVE: -/*----------------------------------------*/ - SAVE1x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,1 -#endif - - -ZGEMM_L1x4_END: -/*----------------------------------------*/ - - -ZGEMM_L1x2_BEGIN: -/*----------------------------------------*/ - andi. T1, M, 2 - ble ZGEMM_L1x2_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,2,1 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T11-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO1x2 - ble ZGEMM_L1x2_SUB0 - bl ZGEMM_1x2_LMAIN_SUB - andi. L, T1, 31 - ble ZGEMM_L1x2_SAVE - b ZGEMM_L1x2_SUB2 - - -ZGEMM_L1x2_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP1x2_32K - addi BO,BO,-16 - addi AO,AO,-32 - LOAD1x2O 32,16 - END1x2_WITHOUT_ADD - LOAD1x2_2O 64, 32 - mtctr T8 - bl ZGEMM_L1x2_K32 - b ZGEMM_L1x2_SAVE - CMP1x2_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne ZGEMM_L1x2_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-32 - addi AO,AO,-64 - LOAD1x2_2O 64,32 - bl ZGEMM_L1x2_K32 - b ZGEMM_L1x2_SAVE - MY_ALIGN - MY_ALIGN - - -ZGEMM_L1x2_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble ZGEMM_L1x2_SUB2_8 - bl ZGEMM_1x2_L16_SUB - MY_ALIGN - - -ZGEMM_L1x2_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble ZGEMM_L1x2_SUB2_4 - bl ZGEMM_1x2_L8_SUB - MY_ALIGN - - -ZGEMM_L1x2_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble ZGEMM_L1x2_SUB2_2 - LOAD1x2_2 - KERNEL1x2_L2 64,32, 0,0 - KERNEL1x2_E2 64,32, 1,1 - MY_ALIGN - - -ZGEMM_L1x2_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble ZGEMM_L1x2_SUB2_1 - LOAD1x2_2 - KERNEL1x2_E2 64,32, 0,1 - MY_ALIGN - - -ZGEMM_L1x2_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble ZGEMM_L1x2_SAVE - KERNEL1x2 - - -ZGEMM_L1x2_SAVE: -/*----------------------------------------*/ - SAVE1x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,1 -#endif - - -ZGEMM_L1x2_END: -/*----------------------------------------*/ - - -ZGEMM_L1x1_BEGIN: -/*----------------------------------------*/ - andi. T1, M, 1 - ble ZGEMM_L1x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,1,1 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T11-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO1x1 - ble ZGEMM_L1x1_SUB0 - bl ZGEMM_1x1_LMAIN_SUB - andi. L, T1, 31 - ble ZGEMM_L1x1_SAVE - b ZGEMM_L1x1_SUB2 - - -ZGEMM_L1x1_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. 
L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP1x1_32K - addi BO,BO,-16 - addi AO,AO,-16 - LOAD1x1O 16,16 - END1x1_WITHOUT_ADD - LOAD1x1_2O 32, 32 - mtctr T8 - bl ZGEMM_L1x1_K32 - b ZGEMM_L1x1_SAVE - CMP1x1_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne ZGEMM_L1x1_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-32 - addi AO,AO,-32 - LOAD1x1_2O 32,32 - bl ZGEMM_L1x1_K32 - b ZGEMM_L1x1_SAVE - MY_ALIGN - MY_ALIGN - - -ZGEMM_L1x1_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble ZGEMM_L1x1_SUB2_8 - bl ZGEMM_1x1_L16_SUB - MY_ALIGN - - -ZGEMM_L1x1_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble ZGEMM_L1x1_SUB2_4 - bl ZGEMM_1x1_L8_SUB - MY_ALIGN - - -ZGEMM_L1x1_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble ZGEMM_L1x1_SUB2_2 - LOAD1x1_2 - KERNEL1x1_L2 32,32, 0,0 - KERNEL1x1_E2 32,32, 1,1 - MY_ALIGN - - -ZGEMM_L1x1_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble ZGEMM_L1x1_SUB2_1 - LOAD1x1_2 - KERNEL1x1_E2 32,32, 0,1 - MY_ALIGN - - -ZGEMM_L1x1_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble ZGEMM_L1x1_SAVE - KERNEL1x1 - - -ZGEMM_L1x1_SAVE: -/*----------------------------------------*/ - SAVE1x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,1 -#endif - - -ZGEMM_L1x1_END: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 1 -#endif - - -ZGEMM_L1_END: -/*----------------------------------------*/ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ +#define MY_ALIGN .align 3 +b ZGEMM_L2 +/* MINI SUBROUTINES */ +/* 2x8 MAIN 128x+2 LOOP */ + + +ZGEMM_L2x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x8_2 + MY_ALIGN +ZGEMM_L2x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 256,64,0,0 +ZGEMM_L2x8_K128: +/*----------------------------------------*/ + KERNEL2x8_L2 256,64,1,0 + dcbt AO, T2 + KERNEL2x8_L2 256,64,2,0 + KERNEL2x8_L2 256,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 256,64,4,0 + KERNEL2x8_L2 256,64,5,0 + dcbt AO, T4 + KERNEL2x8_L2 256,64,6,0 + KERNEL2x8_L2 256,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 256,64,8,0 + KERNEL2x8_L2 256,64,9,0 + KERNEL2x8_L2 256,64,10,0 + KERNEL2x8_L2 256,64,11,0 + dcbt BO, T4 + KERNEL2x8_L2 256,64,12,0 + KERNEL2x8_L2 256,64,13,0 + KERNEL2x8_L2 256,64,14,0 + KERNEL2x8_L2 256,64,15,0 + KERNEL2x8_L2 256,64,16,0 + KERNEL2x8_L2 256,64,17,0 + KERNEL2x8_L2 256,64,18,0 + KERNEL2x8_L2 256,64,19,0 + KERNEL2x8_L2 256,64,20,0 + KERNEL2x8_L2 256,64,21,0 + KERNEL2x8_L2 256,64,22,0 + KERNEL2x8_L2 256,64,23,0 + KERNEL2x8_L2 256,64,24,0 + KERNEL2x8_L2 256,64,25,0 + KERNEL2x8_L2 256,64,26,0 + KERNEL2x8_L2 256,64,27,0 + KERNEL2x8_L2 256,64,28,0 + KERNEL2x8_L2 256,64,29,0 + KERNEL2x8_L2 256,64,30,0 + KERNEL2x8_L2 256,64,31,0 + KERNEL2x8_L2 256,64,32,0 + KERNEL2x8_L2 256,64,33,0 + KERNEL2x8_L2 256,64,34,0 + KERNEL2x8_L2 256,64,35,0 + KERNEL2x8_L2 256,64,36,0 + KERNEL2x8_L2 256,64,37,0 + KERNEL2x8_L2 256,64,38,0 + KERNEL2x8_L2 256,64,39,0 + KERNEL2x8_L2 256,64,40,0 + KERNEL2x8_L2 256,64,41,0 + KERNEL2x8_L2 256,64,42,0 + KERNEL2x8_L2 256,64,43,0 + KERNEL2x8_L2 256,64,44,0 + KERNEL2x8_L2 256,64,45,0 + KERNEL2x8_L2 256,64,46,0 + KERNEL2x8_L2 256,64,47,0 + KERNEL2x8_L2 256,64,48,0 + KERNEL2x8_L2 256,64,49,0 + KERNEL2x8_L2 256,64,50,0 + KERNEL2x8_L2 256,64,51,0 + KERNEL2x8_L2 256,64,52,0 + KERNEL2x8_L2 256,64,53,0 + KERNEL2x8_L2 256,64,54,0 + KERNEL2x8_L2 256,64,55,0 + KERNEL2x8_L2 256,64,56,0 + KERNEL2x8_L2 256,64,57,0 + KERNEL2x8_L2 256,64,58,0 + KERNEL2x8_L2 256,64,59,0 + KERNEL2x8_L2 256,64,60,0 + KERNEL2x8_L2 256,64,61,0 + KERNEL2x8_L2 256,64,62,0 + KERNEL2x8_L2 256,64,63,1 + bdnz ZGEMM_L2x8_LOOP + MY_ALIGN +ZGEMM_L2x8_LOOP_END: +/*----------------------------------------*/ + END2x8_2 + blr + MY_ALIGN + + +ZGEMM_2x8_L64_SUB: +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 256,64,0,0 + KERNEL2x8_L2 256,64,1,0 + dcbt AO, T2 + KERNEL2x8_L2 256,64,2,0 + KERNEL2x8_L2 256,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 256,64,4,0 + KERNEL2x8_L2 256,64,5,0 + dcbt AO, T4 + KERNEL2x8_L2 256,64,6,0 + KERNEL2x8_L2 256,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 256,64,8,0 + KERNEL2x8_L2 256,64,9,0 + KERNEL2x8_L2 256,64,10,0 + KERNEL2x8_L2 256,64,11,0 + dcbt BO, T4 + KERNEL2x8_L2 256,64,12,0 + KERNEL2x8_L2 256,64,13,0 + KERNEL2x8_L2 256,64,14,0 + KERNEL2x8_L2 256,64,15,0 + KERNEL2x8_L2 256,64,16,0 + KERNEL2x8_L2 256,64,17,0 + KERNEL2x8_L2 256,64,18,0 + KERNEL2x8_L2 256,64,19,0 + KERNEL2x8_L2 256,64,20,0 + KERNEL2x8_L2 256,64,21,0 + KERNEL2x8_L2 256,64,22,0 + KERNEL2x8_L2 256,64,23,0 + KERNEL2x8_L2 256,64,24,0 + KERNEL2x8_L2 256,64,25,0 + KERNEL2x8_L2 256,64,26,0 + KERNEL2x8_L2 256,64,27,0 + KERNEL2x8_L2 256,64,28,0 + KERNEL2x8_L2 256,64,29,0 + KERNEL2x8_L2 256,64,30,0 + KERNEL2x8_E2 256,64,31,1 + blr + MY_ALIGN + + +ZGEMM_2x8_L32_SUB: +/*----------------------------------------*/ + LOAD2x8_2 + dcbt 
AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 256,64,0,0 + KERNEL2x8_L2 256,64,1,0 + dcbt AO, T2 + KERNEL2x8_L2 256,64,2,0 + KERNEL2x8_L2 256,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 256,64,4,0 + KERNEL2x8_L2 256,64,5,0 + dcbt AO, T4 + KERNEL2x8_L2 256,64,6,0 + KERNEL2x8_L2 256,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 256,64,8,0 + KERNEL2x8_L2 256,64,9,0 + KERNEL2x8_L2 256,64,10,0 + KERNEL2x8_L2 256,64,11,0 + dcbt BO, T4 + KERNEL2x8_L2 256,64,12,0 + KERNEL2x8_L2 256,64,13,0 + KERNEL2x8_L2 256,64,14,0 + KERNEL2x8_E2 256,64,15,1 + blr + MY_ALIGN + + +ZGEMM_2x8_L16_SUB: +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 256,64,0,0 + KERNEL2x8_L2 256,64,1,0 + dcbt AO, T2 + KERNEL2x8_L2 256,64,2,0 + KERNEL2x8_L2 256,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 256,64,4,0 + KERNEL2x8_L2 256,64,5,0 + dcbt AO, T4 + KERNEL2x8_L2 256,64,6,0 + KERNEL2x8_E2 256,64,7,1 + blr + MY_ALIGN + + +ZGEMM_2x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x4_2 + MY_ALIGN +ZGEMM_L2x4_LOOP: +/*----------------------------------------*/ + KERNEL2x4_L2 128,64,0,0 +ZGEMM_L2x4_K32: +/*----------------------------------------*/ + KERNEL2x4_L2 128,64,1,0 + KERNEL2x4_L2 128,64,2,0 + KERNEL2x4_L2 128,64,3,0 + KERNEL2x4_L2 128,64,4,0 + KERNEL2x4_L2 128,64,5,0 + KERNEL2x4_L2 128,64,6,0 + KERNEL2x4_L2 128,64,7,0 + KERNEL2x4_L2 128,64,8,0 + KERNEL2x4_L2 128,64,9,0 + KERNEL2x4_L2 128,64,10,0 + KERNEL2x4_L2 128,64,11,0 + KERNEL2x4_L2 128,64,12,0 + KERNEL2x4_L2 128,64,13,0 + KERNEL2x4_L2 128,64,14,0 + KERNEL2x4_L2 128,64,15,1 + bdnz ZGEMM_L2x4_LOOP + MY_ALIGN +ZGEMM_L2x4_LOOP_END: +/*----------------------------------------*/ + END2x4_2 + blr + MY_ALIGN + + +ZGEMM_2x4_L16_SUB: +/*----------------------------------------*/ + LOAD2x4_2 + KERNEL2x4_L2 128,64,0,0 + KERNEL2x4_L2 128,64,1,0 + KERNEL2x4_L2 128,64,2,0 + KERNEL2x4_L2 128,64,3,0 + KERNEL2x4_L2 128,64,4,0 + KERNEL2x4_L2 128,64,5,0 + KERNEL2x4_L2 128,64,6,0 + KERNEL2x4_E2 128,64,7,1 + blr + MY_ALIGN + + +ZGEMM_2x4_L8_SUB: +/*----------------------------------------*/ + LOAD2x4_2 + KERNEL2x4_L2 128,64,0,0 + KERNEL2x4_L2 128,64,1,0 + KERNEL2x4_L2 128,64,2,0 + KERNEL2x4_E2 128,64,3,1 + blr + + +ZGEMM_2x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x2_2 + MY_ALIGN +ZGEMM_L2x2_LOOP: +/*----------------------------------------*/ + KERNEL2x2_L2 64,64,0,0 +ZGEMM_L2x2_K32: +/*----------------------------------------*/ + KERNEL2x2_L2 64,64,1,0 + KERNEL2x2_L2 64,64,2,0 + KERNEL2x2_L2 64,64,3,0 + KERNEL2x2_L2 64,64,4,0 + KERNEL2x2_L2 64,64,5,0 + KERNEL2x2_L2 64,64,6,0 + KERNEL2x2_L2 64,64,7,0 + KERNEL2x2_L2 64,64,8,0 + KERNEL2x2_L2 64,64,9,0 + KERNEL2x2_L2 64,64,10,0 + KERNEL2x2_L2 64,64,11,0 + KERNEL2x2_L2 64,64,12,0 + KERNEL2x2_L2 64,64,13,0 + KERNEL2x2_L2 64,64,14,0 + KERNEL2x2_L2 64,64,15,1 + bdnz ZGEMM_L2x2_LOOP + MY_ALIGN + + +ZGEMM_L2x2_LOOP_END: +/*----------------------------------------*/ + END2x2_2 + blr + MY_ALIGN +ZGEMM_2x2_L16_SUB: +/*----------------------------------------*/ + LOAD2x2_2 + KERNEL2x2_L2 64,64,0,0 + KERNEL2x2_L2 64,64,1,0 + KERNEL2x2_L2 64,64,2,0 + KERNEL2x2_L2 64,64,3,0 + KERNEL2x2_L2 64,64,4,0 + KERNEL2x2_L2 64,64,5,0 + KERNEL2x2_L2 64,64,6,0 + KERNEL2x2_E2 64,64,7,1 + blr + MY_ALIGN +ZGEMM_2x2_L8_SUB: +/*----------------------------------------*/ + LOAD2x2_2 + KERNEL2x2_L2 64,64,0,0 + KERNEL2x2_L2 64,64,1,0 + KERNEL2x2_L2 64,64,2,0 + KERNEL2x2_E2 64,64,3,1 + blr + + +ZGEMM_2x1_LMAIN_SUB: 
+/*----------------------------------------*/ + mtctr T8 + LOAD2x1_2 + MY_ALIGN +ZGEMM_L2x1_LOOP: +/*----------------------------------------*/ + KERNEL2x1_L2 32,64,0,0 +ZGEMM_L2x1_K32: +/*----------------------------------------*/ + KERNEL2x1_L2 32,64,1,0 + KERNEL2x1_L2 32,64,2,0 + KERNEL2x1_L2 32,64,3,0 + KERNEL2x1_L2 32,64,4,0 + KERNEL2x1_L2 32,64,5,0 + KERNEL2x1_L2 32,64,6,0 + KERNEL2x1_L2 32,64,7,0 + KERNEL2x1_L2 32,64,8,0 + KERNEL2x1_L2 32,64,9,0 + KERNEL2x1_L2 32,64,10,0 + KERNEL2x1_L2 32,64,11,0 + KERNEL2x1_L2 32,64,12,0 + KERNEL2x1_L2 32,64,13,0 + KERNEL2x1_L2 32,64,14,0 + KERNEL2x1_L2 32,64,15,1 + bdnz ZGEMM_L2x1_LOOP + MY_ALIGN +ZGEMM_L2x1_LOOP_END: +/*----------------------------------------*/ + END2x1_2 + blr + + MY_ALIGN +ZGEMM_2x1_L16_SUB: +/*----------------------------------------*/ + LOAD2x1_2 + KERNEL2x1_L2 32,64,0,0 + KERNEL2x1_L2 32,64,1,0 + KERNEL2x1_L2 32,64,2,0 + KERNEL2x1_L2 32,64,3,0 + KERNEL2x1_L2 32,64,4,0 + KERNEL2x1_L2 32,64,5,0 + KERNEL2x1_L2 32,64,6,0 + KERNEL2x1_E2 32,64,7,1 + blr + MY_ALIGN + + +ZGEMM_2x1_L8_SUB: +/*----------------------------------------*/ + LOAD2x1_2 + KERNEL2x1_L2 32,64,0,0 + KERNEL2x1_L2 32,64,1,0 + KERNEL2x1_L2 32,64,2,0 + KERNEL2x1_E2 32,64,3,1 + blr + + + +/* MAIN LOOP BEGINS */ + MY_ALIGN + + +ZGEMM_L2: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) && !defined(LEFT) + neg TEMP_REG, OFFSET +#endif + srawi. J, N, 1 + ble ZGEMM_L2_END + + +ZGEMM_L2_BEGIN: +/*----------------------------------------*/ + mr CO, C + slwi T1, LDC , 1 + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + ble ZGEMM_L2x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +ZGEMM_L2x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,2 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(T11-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(K-2) % 128x */ +#endif + ZERO2x8 + ble ZGEMM_L2x8_SUB0 + bl ZGEMM_L2x8_LMAIN_SUB + andi. L, T1, 127 + ble ZGEMM_L2x8_SAVE + b ZGEMM_L2x8_SUB2 + + +ZGEMM_L2x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi. L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP2x8_128K + addi BO,BO,-32 + addi AO,AO,-128 + LOAD2x8O 128,32 + END2x8_WITHOUT_ADD + LOAD2x8_2O 256, 64 + mtctr T8 + bl ZGEMM_L2x8_K128 + b ZGEMM_L2x8_SAVE + CMP2x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else + cmpwi K,128 +#endif + bne ZGEMM_L2x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-256 + LOAD2x8_2O 256,64 + bl ZGEMM_L2x8_K128 + b ZGEMM_L2x8_SAVE + MY_ALIGN + + +ZGEMM_L2x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble ZGEMM_L2x8_SUB2_32 + bl ZGEMM_2x8_L64_SUB + MY_ALIGN + + +ZGEMM_L2x8_SUB2_32: +/*----------------------------------------*/ + andi. T1,L, 32 + ble ZGEMM_L2x8_SUB2_16 + bl ZGEMM_2x8_L32_SUB + MY_ALIGN + + +ZGEMM_L2x8_SUB2_16: +/*----------------------------------------*/ + andi. 
T1,L, 16 + ble ZGEMM_L2x8_SUB2_8 + bl ZGEMM_2x8_L16_SUB + MY_ALIGN + + +ZGEMM_L2x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L2x8_SUB2_4 + LOAD2x8_2 + KERNEL2x8_L2 256,64, 0,0 + KERNEL2x8_L2 256,64, 1,0 + KERNEL2x8_L2 256,64, 2,0 + KERNEL2x8_E2 256,64, 3,1 + MY_ALIGN + + +ZGEMM_L2x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L2x8_SUB2_2 + LOAD2x8_2 + KERNEL2x8_L2 256,64, 0,0 + KERNEL2x8_E2 256,64, 1,1 + MY_ALIGN + + +ZGEMM_L2x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L2x8_SUB2_1 + LOAD2x8_2 + KERNEL2x8_E2 256,64, 0,1 + MY_ALIGN + + +ZGEMM_L2x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L2x8_SAVE + KERNEL2x8 + + +ZGEMM_L2x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + SAVE2x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,2 +#endif + bgt ZGEMM_L2x8_BEGIN + andi. T2, M, 7 + ble ZGEMM_L2x1_END + andi. T1, M, 4 + ble ZGEMM_L2x4_END + b ZGEMM_L2x4_BEGIN + MY_ALIGN + + +ZGEMM_L2x8_END: +/*----------------------------------------*/ + + +ZGEMM_L2x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble ZGEMM_L2x1_END + andi. T1, M, 4 + ble ZGEMM_L2x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x4 + ble ZGEMM_L2x4_SUB0 + bl ZGEMM_2x4_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L2x4_SAVE + b ZGEMM_L2x4_SUB2 + + +ZGEMM_L2x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x4_32K + addi BO,BO,-32 + addi AO,AO,-64 + LOAD2x4O 64,32 + END2x4_WITHOUT_ADD + LOAD2x4_2O 128, 64 + mtctr T8 + bl ZGEMM_L2x4_K32 + b ZGEMM_L2x4_SAVE + CMP2x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne ZGEMM_L2x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-128 + LOAD2x4_2O 128,64 + bl ZGEMM_L2x4_K32 + b ZGEMM_L2x4_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L2x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L2x4_SUB2_8 + bl ZGEMM_2x4_L16_SUB + MY_ALIGN + + +ZGEMM_L2x4_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L2x4_SUB2_4 + bl ZGEMM_2x4_L8_SUB + MY_ALIGN + + +ZGEMM_L2x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L2x4_SUB2_2 + LOAD2x4_2 + KERNEL2x4_L2 128,64, 0,0 + KERNEL2x4_E2 128,64, 1,1 + MY_ALIGN + + +ZGEMM_L2x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L2x4_SUB2_1 + LOAD2x4_2 + KERNEL2x4_E2 128,64, 0,1 + MY_ALIGN + + +ZGEMM_L2x4_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L2x4_SAVE + KERNEL2x4 + + +ZGEMM_L2x4_SAVE: +/*----------------------------------------*/ + SAVE2x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,2 +#endif + + +ZGEMM_L2x4_END: +/*----------------------------------------*/ + + +ZGEMM_L2x2_BEGIN: +/*----------------------------------------*/ + andi. 
T1, M, 2 + ble ZGEMM_L2x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x2 + ble ZGEMM_L2x2_SUB0 + bl ZGEMM_2x2_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L2x2_SAVE + b ZGEMM_L2x2_SUB2 + + +ZGEMM_L2x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x2_32K + addi BO,BO,-32 + addi AO,AO,-32 + LOAD2x2O 32,32 + END2x2_WITHOUT_ADD + LOAD2x2_2O 64, 64 + mtctr T8 + bl ZGEMM_L2x2_K32 + b ZGEMM_L2x2_SAVE + CMP2x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne ZGEMM_L2x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-64 + LOAD2x2_2O 64,64 + bl ZGEMM_L2x2_K32 + b ZGEMM_L2x2_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L2x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L2x2_SUB2_8 + bl ZGEMM_2x2_L16_SUB + MY_ALIGN + + +ZGEMM_L2x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L2x2_SUB2_4 + bl ZGEMM_2x2_L8_SUB + MY_ALIGN + + +ZGEMM_L2x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L2x2_SUB2_2 + LOAD2x2_2 + KERNEL2x2_L2 64,64, 0,0 + KERNEL2x2_E2 64,64, 1,1 + MY_ALIGN + + +ZGEMM_L2x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L2x2_SUB2_1 + LOAD2x2_2 + KERNEL2x2_E2 64,64, 0,1 + MY_ALIGN + + +ZGEMM_L2x2_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L2x2_SAVE + KERNEL2x2 + + +ZGEMM_L2x2_SAVE: +/*----------------------------------------*/ + SAVE2x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,2 +#endif + + +ZGEMM_L2x2_END: +/*----------------------------------------*/ + + +ZGEMM_L2x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble ZGEMM_L2x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x1 + ble ZGEMM_L2x1_SUB0 + bl ZGEMM_2x1_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L2x1_SAVE + b ZGEMM_L2x1_SUB2 + + +ZGEMM_L2x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x1_32K + addi BO,BO,-32 + addi AO,AO,-16 + LOAD2x1O 16,32 + END2x1_WITHOUT_ADD + LOAD2x1_2O 32, 64 + mtctr T8 + bl ZGEMM_L2x1_K32 + b ZGEMM_L2x1_SAVE + CMP2x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne ZGEMM_L2x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-32 + LOAD2x1_2O 32,64 + bl ZGEMM_L2x1_K32 + b ZGEMM_L2x1_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L2x1_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L2x1_SUB2_8 + bl ZGEMM_2x1_L16_SUB + MY_ALIGN + + +ZGEMM_L2x1_SUB2_8: +/*----------------------------------------*/ + andi. 
T1,L, 8 + ble ZGEMM_L2x1_SUB2_4 + bl ZGEMM_2x1_L8_SUB + MY_ALIGN + + +ZGEMM_L2x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L2x1_SUB2_2 + LOAD2x1_2 + KERNEL2x1_L2 32,64, 0,0 + KERNEL2x1_E2 32,64, 1,1 + MY_ALIGN + + +ZGEMM_L2x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L2x1_SUB2_1 + LOAD2x1_2 + KERNEL2x1_E2 32,64, 0,1 + MY_ALIGN + + +ZGEMM_L2x1_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L2x1_SAVE + KERNEL2x1 + + +ZGEMM_L2x1_SAVE: +/*----------------------------------------*/ + SAVE2x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,2 +#endif + + +ZGEMM_L2x1_END: +/*----------------------------------------*/ + slwi T1, K, 5 + addic. J, J, -1 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 2 +#endif + bgt ZGEMM_L2_BEGIN + + +ZGEMM_L2_END: + +b ZGEMM_L1 +/* MINI SUBROUTINES */ +/* 1x8 MAIN 128x+2 LOOP */ + + +ZGEMM_L1x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x8_2 + MY_ALIGN +ZGEMM_L1x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 256,32,0,0 +ZGEMM_L1x8_K128: +/*----------------------------------------*/ + KERNEL1x8_L2 256,32,1,0 + dcbt AO, T2 + KERNEL1x8_L2 256,32,2,0 + KERNEL1x8_L2 256,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 256,32,4,0 + KERNEL1x8_L2 256,32,5,0 + dcbt AO, T4 + KERNEL1x8_L2 256,32,6,0 + KERNEL1x8_L2 256,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 256,32,8,0 + KERNEL1x8_L2 256,32,9,0 + KERNEL1x8_L2 256,32,10,0 + KERNEL1x8_L2 256,32,11,0 + dcbt BO, T4 + KERNEL1x8_L2 256,32,12,0 + KERNEL1x8_L2 256,32,13,0 + KERNEL1x8_L2 256,32,14,0 + KERNEL1x8_L2 256,32,15,0 + KERNEL1x8_L2 256,32,16,0 + KERNEL1x8_L2 256,32,17,0 + KERNEL1x8_L2 256,32,18,0 + KERNEL1x8_L2 256,32,19,0 + KERNEL1x8_L2 256,32,20,0 + KERNEL1x8_L2 256,32,21,0 + KERNEL1x8_L2 256,32,22,0 + KERNEL1x8_L2 256,32,23,0 + KERNEL1x8_L2 256,32,24,0 + KERNEL1x8_L2 256,32,25,0 + KERNEL1x8_L2 256,32,26,0 + KERNEL1x8_L2 256,32,27,0 + KERNEL1x8_L2 256,32,28,0 + KERNEL1x8_L2 256,32,29,0 + KERNEL1x8_L2 256,32,30,0 + KERNEL1x8_L2 256,32,31,0 + KERNEL1x8_L2 256,32,32,0 + KERNEL1x8_L2 256,32,33,0 + KERNEL1x8_L2 256,32,34,0 + KERNEL1x8_L2 256,32,35,0 + KERNEL1x8_L2 256,32,36,0 + KERNEL1x8_L2 256,32,37,0 + KERNEL1x8_L2 256,32,38,0 + KERNEL1x8_L2 256,32,39,0 + KERNEL1x8_L2 256,32,40,0 + KERNEL1x8_L2 256,32,41,0 + KERNEL1x8_L2 256,32,42,0 + KERNEL1x8_L2 256,32,43,0 + KERNEL1x8_L2 256,32,44,0 + KERNEL1x8_L2 256,32,45,0 + KERNEL1x8_L2 256,32,46,0 + KERNEL1x8_L2 256,32,47,0 + KERNEL1x8_L2 256,32,48,0 + KERNEL1x8_L2 256,32,49,0 + KERNEL1x8_L2 256,32,50,0 + KERNEL1x8_L2 256,32,51,0 + KERNEL1x8_L2 256,32,52,0 + KERNEL1x8_L2 256,32,53,0 + KERNEL1x8_L2 256,32,54,0 + KERNEL1x8_L2 256,32,55,0 + KERNEL1x8_L2 256,32,56,0 + KERNEL1x8_L2 256,32,57,0 + KERNEL1x8_L2 256,32,58,0 + KERNEL1x8_L2 256,32,59,0 + KERNEL1x8_L2 256,32,60,0 + KERNEL1x8_L2 256,32,61,0 + KERNEL1x8_L2 256,32,62,0 + KERNEL1x8_L2 256,32,63,1 + bdnz ZGEMM_L1x8_LOOP + MY_ALIGN +ZGEMM_L1x8_LOOP_END: +/*----------------------------------------*/ + END1x8_2 + blr + MY_ALIGN + + +ZGEMM_1x8_L64_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 256,32,0,0 + KERNEL1x8_L2 256,32,1,0 + dcbt AO, T2 + KERNEL1x8_L2 256,32,2,0 + KERNEL1x8_L2 256,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 256,32,4,0 + KERNEL1x8_L2 256,32,5,0 + dcbt AO, T4 + KERNEL1x8_L2 256,32,6,0 + 
KERNEL1x8_L2 256,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 256,32,8,0 + KERNEL1x8_L2 256,32,9,0 + KERNEL1x8_L2 256,32,10,0 + KERNEL1x8_L2 256,32,11,0 + dcbt BO, T4 + KERNEL1x8_L2 256,32,12,0 + KERNEL1x8_L2 256,32,13,0 + KERNEL1x8_L2 256,32,14,0 + KERNEL1x8_L2 256,32,15,0 + KERNEL1x8_L2 256,32,16,0 + KERNEL1x8_L2 256,32,17,0 + KERNEL1x8_L2 256,32,18,0 + KERNEL1x8_L2 256,32,19,0 + KERNEL1x8_L2 256,32,20,0 + KERNEL1x8_L2 256,32,21,0 + KERNEL1x8_L2 256,32,22,0 + KERNEL1x8_L2 256,32,23,0 + KERNEL1x8_L2 256,32,24,0 + KERNEL1x8_L2 256,32,25,0 + KERNEL1x8_L2 256,32,26,0 + KERNEL1x8_L2 256,32,27,0 + KERNEL1x8_L2 256,32,28,0 + KERNEL1x8_L2 256,32,29,0 + KERNEL1x8_L2 256,32,30,0 + KERNEL1x8_E2 256,32,31,1 + blr + MY_ALIGN + + +ZGEMM_1x8_L32_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 256,32,0,0 + KERNEL1x8_L2 256,32,1,0 + dcbt AO, T2 + KERNEL1x8_L2 256,32,2,0 + KERNEL1x8_L2 256,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 256,32,4,0 + KERNEL1x8_L2 256,32,5,0 + dcbt AO, T4 + KERNEL1x8_L2 256,32,6,0 + KERNEL1x8_L2 256,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 256,32,8,0 + KERNEL1x8_L2 256,32,9,0 + KERNEL1x8_L2 256,32,10,0 + KERNEL1x8_L2 256,32,11,0 + dcbt BO, T4 + KERNEL1x8_L2 256,32,12,0 + KERNEL1x8_L2 256,32,13,0 + KERNEL1x8_L2 256,32,14,0 + KERNEL1x8_E2 256,32,15,1 + blr + MY_ALIGN + + +ZGEMM_1x8_L16_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 256,32,0,0 + KERNEL1x8_L2 256,32,1,0 + dcbt AO, T2 + KERNEL1x8_L2 256,32,2,0 + KERNEL1x8_L2 256,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 256,32,4,0 + KERNEL1x8_L2 256,32,5,0 + dcbt AO, T4 + KERNEL1x8_L2 256,32,6,0 + KERNEL1x8_E2 256,32,7,1 + blr + MY_ALIGN + + +ZGEMM_1x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x4_2 + MY_ALIGN + + +ZGEMM_L1x4_LOOP: +/*----------------------------------------*/ + KERNEL1x4_L2 128,32,0,0 + + +ZGEMM_L1x4_K32: +/*----------------------------------------*/ + KERNEL1x4_L2 128,32,1,0 + KERNEL1x4_L2 128,32,2,0 + KERNEL1x4_L2 128,32,3,0 + KERNEL1x4_L2 128,32,4,0 + KERNEL1x4_L2 128,32,5,0 + KERNEL1x4_L2 128,32,6,0 + KERNEL1x4_L2 128,32,7,0 + KERNEL1x4_L2 128,32,8,0 + KERNEL1x4_L2 128,32,9,0 + KERNEL1x4_L2 128,32,10,0 + KERNEL1x4_L2 128,32,11,0 + KERNEL1x4_L2 128,32,12,0 + KERNEL1x4_L2 128,32,13,0 + KERNEL1x4_L2 128,32,14,0 + KERNEL1x4_L2 128,32,15,1 + bdnz ZGEMM_L1x4_LOOP + MY_ALIGN + + +ZGEMM_L1x4_LOOP_END: +/*----------------------------------------*/ + END1x4_2 + blr + MY_ALIGN + + +ZGEMM_1x4_L16_SUB: +/*----------------------------------------*/ + LOAD1x4_2 + KERNEL1x4_L2 128,32,0,0 + KERNEL1x4_L2 128,32,1,0 + KERNEL1x4_L2 128,32,2,0 + KERNEL1x4_L2 128,32,3,0 + KERNEL1x4_L2 128,32,4,0 + KERNEL1x4_L2 128,32,5,0 + KERNEL1x4_L2 128,32,6,0 + KERNEL1x4_E2 128,32,7,1 + blr + MY_ALIGN + + +ZGEMM_1x4_L8_SUB: +/*----------------------------------------*/ + LOAD1x4_2 + KERNEL1x4_L2 128,32,0,0 + KERNEL1x4_L2 128,32,1,0 + KERNEL1x4_L2 128,32,2,0 + KERNEL1x4_E2 128,32,3,1 + blr + + +ZGEMM_1x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x2_2 + MY_ALIGN + + +ZGEMM_L1x2_LOOP: +/*----------------------------------------*/ + KERNEL1x2_L2 64,32,0,0 + + +ZGEMM_L1x2_K32: +/*----------------------------------------*/ + KERNEL1x2_L2 64,32,1,0 + KERNEL1x2_L2 64,32,2,0 + KERNEL1x2_L2 64,32,3,0 + KERNEL1x2_L2 64,32,4,0 + KERNEL1x2_L2 64,32,5,0 + KERNEL1x2_L2 64,32,6,0 + KERNEL1x2_L2 64,32,7,0 + KERNEL1x2_L2 64,32,8,0 + KERNEL1x2_L2 
64,32,9,0 + KERNEL1x2_L2 64,32,10,0 + KERNEL1x2_L2 64,32,11,0 + KERNEL1x2_L2 64,32,12,0 + KERNEL1x2_L2 64,32,13,0 + KERNEL1x2_L2 64,32,14,0 + KERNEL1x2_L2 64,32,15,1 + bdnz ZGEMM_L1x2_LOOP + MY_ALIGN + + +ZGEMM_L1x2_LOOP_END: +/*----------------------------------------*/ + END1x2_2 + blr + MY_ALIGN + + +ZGEMM_1x2_L16_SUB: +/*----------------------------------------*/ + LOAD1x2_2 + KERNEL1x2_L2 64,32,0,0 + KERNEL1x2_L2 64,32,1,0 + KERNEL1x2_L2 64,32,2,0 + KERNEL1x2_L2 64,32,3,0 + KERNEL1x2_L2 64,32,4,0 + KERNEL1x2_L2 64,32,5,0 + KERNEL1x2_L2 64,32,6,0 + KERNEL1x2_E2 64,32,7,1 + blr + MY_ALIGN + + +ZGEMM_1x2_L8_SUB: +/*----------------------------------------*/ + LOAD1x2_2 + KERNEL1x2_L2 64,32,0,0 + KERNEL1x2_L2 64,32,1,0 + KERNEL1x2_L2 64,32,2,0 + KERNEL1x2_E2 64,32,3,1 + blr + + +ZGEMM_1x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x1_2 + MY_ALIGN + + +ZGEMM_L1x1_LOOP: +/*----------------------------------------*/ + KERNEL1x1_L2 32,32,0,0 + + +ZGEMM_L1x1_K32: +/*----------------------------------------*/ + KERNEL1x1_L2 32,32,1,0 + KERNEL1x1_L2 32,32,2,0 + KERNEL1x1_L2 32,32,3,0 + KERNEL1x1_L2 32,32,4,0 + KERNEL1x1_L2 32,32,5,0 + KERNEL1x1_L2 32,32,6,0 + KERNEL1x1_L2 32,32,7,0 + KERNEL1x1_L2 32,32,8,0 + KERNEL1x1_L2 32,32,9,0 + KERNEL1x1_L2 32,32,10,0 + KERNEL1x1_L2 32,32,11,0 + KERNEL1x1_L2 32,32,12,0 + KERNEL1x1_L2 32,32,13,0 + KERNEL1x1_L2 32,32,14,0 + KERNEL1x1_L2 32,32,15,1 + bdnz ZGEMM_L1x1_LOOP + MY_ALIGN + + +ZGEMM_L1x1_LOOP_END: +/*----------------------------------------*/ + END1x1_2 + blr + MY_ALIGN + + +ZGEMM_1x1_L16_SUB: +/*----------------------------------------*/ + LOAD1x1_2 + KERNEL1x1_L2 32,32,0,0 + KERNEL1x1_L2 32,32,1,0 + KERNEL1x1_L2 32,32,2,0 + KERNEL1x1_L2 32,32,3,0 + KERNEL1x1_L2 32,32,4,0 + KERNEL1x1_L2 32,32,5,0 + KERNEL1x1_L2 32,32,6,0 + KERNEL1x1_E2 32,32,7,1 + blr + MY_ALIGN + + +ZGEMM_1x1_L8_SUB: +/*----------------------------------------*/ + LOAD1x1_2 + KERNEL1x1_L2 32,32,0,0 + KERNEL1x1_L2 32,32,1,0 + KERNEL1x1_L2 32,32,2,0 + KERNEL1x1_E2 32,32,3,1 + blr + + +/*----------------------N1 BEGINS---------*/ +ZGEMM_L1: +/*----------------------------------------*/ + andi. T1, N, 1 + ble ZGEMM_L1_END + +ZGEMM_L1_BEGIN: +/*----------------------------------------*/ + mr CO, C + + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + ble ZGEMM_L1x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +ZGEMM_L1x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,1 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(T11-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(K-2) % 128x */ +#endif + ZERO1x8 + ble ZGEMM_L1x8_SUB0 + bl ZGEMM_L1x8_LMAIN_SUB + andi. L, T1, 127 + ble ZGEMM_L1x8_SAVE + b ZGEMM_L1x8_SUB2 + + +ZGEMM_L1x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi. 
L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP1x8_128K + addi BO,BO,-16 + addi AO,AO,-128 + LOAD1x8O 128,16 + END1x8_WITHOUT_ADD + LOAD1x8_2O 256, 32 + mtctr T8 + bl ZGEMM_L1x8_K128 + b ZGEMM_L1x8_SAVE + CMP1x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else + cmpwi K,128 +#endif + bne ZGEMM_L1x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-256 + LOAD1x8_2O 256,32 + bl ZGEMM_L1x8_K128 + b ZGEMM_L1x8_SAVE + MY_ALIGN + + +ZGEMM_L1x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble ZGEMM_L1x8_SUB2_32 + bl ZGEMM_1x8_L64_SUB + MY_ALIGN + + +ZGEMM_L1x8_SUB2_32: +/*----------------------------------------*/ + andi. T1,L, 32 + ble ZGEMM_L1x8_SUB2_16 + bl ZGEMM_1x8_L32_SUB + MY_ALIGN + + +ZGEMM_L1x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L1x8_SUB2_8 + bl ZGEMM_1x8_L16_SUB + MY_ALIGN + + +ZGEMM_L1x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L1x8_SUB2_4 + LOAD1x8_2 + KERNEL1x8_L2 256,32, 0,0 + KERNEL1x8_L2 256,32, 1,0 + KERNEL1x8_L2 256,32, 2,0 + KERNEL1x8_E2 256,32, 3,1 + MY_ALIGN + + +ZGEMM_L1x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L1x8_SUB2_2 + LOAD1x8_2 + KERNEL1x8_L2 256,32, 0,0 + KERNEL1x8_E2 256,32, 1,1 + MY_ALIGN + + +ZGEMM_L1x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L1x8_SUB2_1 + LOAD1x8_2 + KERNEL1x8_E2 256,32, 0,1 + MY_ALIGN + + +ZGEMM_L1x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L1x8_SAVE + KERNEL1x8 + + +ZGEMM_L1x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + SAVE1x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,1 +#endif + bgt ZGEMM_L1x8_BEGIN + andi. T2, M, 7 + ble ZGEMM_L1x1_END + andi. T1, M, 4 + ble ZGEMM_L1x4_END + b ZGEMM_L1x4_BEGIN + MY_ALIGN + + +ZGEMM_L1x8_END: +/*----------------------------------------*/ + + +ZGEMM_L1x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble ZGEMM_L1x1_END + andi. T1, M, 4 + ble ZGEMM_L1x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO1x4 + ble ZGEMM_L1x4_SUB0 + bl ZGEMM_1x4_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L1x4_SAVE + b ZGEMM_L1x4_SUB2 + + +ZGEMM_L1x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x4_32K + addi BO,BO,-16 + addi AO,AO,-64 + LOAD1x4O 64,16 + END1x4_WITHOUT_ADD + LOAD1x4_2O 128, 32 + mtctr T8 + bl ZGEMM_L1x4_K32 + b ZGEMM_L1x4_SAVE + CMP1x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne ZGEMM_L1x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-128 + LOAD1x4_2O 128,32 + bl ZGEMM_L1x4_K32 + b ZGEMM_L1x4_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L1x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L1x4_SUB2_8 + bl ZGEMM_1x4_L16_SUB + MY_ALIGN + + +ZGEMM_L1x4_SUB2_8: +/*----------------------------------------*/ + andi. 
T1,L, 8 + ble ZGEMM_L1x4_SUB2_4 + bl ZGEMM_1x4_L8_SUB + MY_ALIGN + + +ZGEMM_L1x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L1x4_SUB2_2 + LOAD1x4_2 + KERNEL1x4_L2 128,32, 0,0 + KERNEL1x4_E2 128,32, 1,1 + MY_ALIGN + + +ZGEMM_L1x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L1x4_SUB2_1 + LOAD1x4_2 + KERNEL1x4_E2 128,32, 0,1 + MY_ALIGN + + +ZGEMM_L1x4_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L1x4_SAVE + KERNEL1x4 + + +ZGEMM_L1x4_SAVE: +/*----------------------------------------*/ + SAVE1x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,1 +#endif + + +ZGEMM_L1x4_END: +/*----------------------------------------*/ + + +ZGEMM_L1x2_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 2 + ble ZGEMM_L1x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO1x2 + ble ZGEMM_L1x2_SUB0 + bl ZGEMM_1x2_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L1x2_SAVE + b ZGEMM_L1x2_SUB2 + + +ZGEMM_L1x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x2_32K + addi BO,BO,-16 + addi AO,AO,-32 + LOAD1x2O 32,16 + END1x2_WITHOUT_ADD + LOAD1x2_2O 64, 32 + mtctr T8 + bl ZGEMM_L1x2_K32 + b ZGEMM_L1x2_SAVE + CMP1x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne ZGEMM_L1x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-64 + LOAD1x2_2O 64,32 + bl ZGEMM_L1x2_K32 + b ZGEMM_L1x2_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L1x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L1x2_SUB2_8 + bl ZGEMM_1x2_L16_SUB + MY_ALIGN + + +ZGEMM_L1x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L1x2_SUB2_4 + bl ZGEMM_1x2_L8_SUB + MY_ALIGN + + +ZGEMM_L1x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L1x2_SUB2_2 + LOAD1x2_2 + KERNEL1x2_L2 64,32, 0,0 + KERNEL1x2_E2 64,32, 1,1 + MY_ALIGN + + +ZGEMM_L1x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L1x2_SUB2_1 + LOAD1x2_2 + KERNEL1x2_E2 64,32, 0,1 + MY_ALIGN + + +ZGEMM_L1x2_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L1x2_SAVE + KERNEL1x2 + + +ZGEMM_L1x2_SAVE: +/*----------------------------------------*/ + SAVE1x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,1 +#endif + + +ZGEMM_L1x2_END: +/*----------------------------------------*/ + + +ZGEMM_L1x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble ZGEMM_L1x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO1x1 + ble ZGEMM_L1x1_SUB0 + bl ZGEMM_1x1_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L1x1_SAVE + b ZGEMM_L1x1_SUB2 + + +ZGEMM_L1x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. 
L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x1_32K + addi BO,BO,-16 + addi AO,AO,-16 + LOAD1x1O 16,16 + END1x1_WITHOUT_ADD + LOAD1x1_2O 32, 32 + mtctr T8 + bl ZGEMM_L1x1_K32 + b ZGEMM_L1x1_SAVE + CMP1x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne ZGEMM_L1x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-32 + LOAD1x1_2O 32,32 + bl ZGEMM_L1x1_K32 + b ZGEMM_L1x1_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L1x1_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L1x1_SUB2_8 + bl ZGEMM_1x1_L16_SUB + MY_ALIGN + + +ZGEMM_L1x1_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L1x1_SUB2_4 + bl ZGEMM_1x1_L8_SUB + MY_ALIGN + + +ZGEMM_L1x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L1x1_SUB2_2 + LOAD1x1_2 + KERNEL1x1_L2 32,32, 0,0 + KERNEL1x1_E2 32,32, 1,1 + MY_ALIGN + + +ZGEMM_L1x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L1x1_SUB2_1 + LOAD1x1_2 + KERNEL1x1_E2 32,32, 0,1 + MY_ALIGN + + +ZGEMM_L1x1_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L1x1_SAVE + KERNEL1x1 + + +ZGEMM_L1x1_SAVE: +/*----------------------------------------*/ + SAVE1x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,1 +#endif + + +ZGEMM_L1x1_END: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 1 +#endif + + +ZGEMM_L1_END: +/*----------------------------------------*/ \ No newline at end of file diff --git a/kernel/power/zgemm_macros_power9.S b/kernel/power/zgemm_macros_power9.S index 8670e9574..68024b826 100644 --- a/kernel/power/zgemm_macros_power9.S +++ b/kernel/power/zgemm_macros_power9.S @@ -1,1825 +1,1825 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/
-
-#define unit_size 16
-#define DISP32(ind,disp) (ind*unit_size*32+disp)
-#define DISP16(ind,disp) (ind*unit_size*16+disp)
-#define DISP8(ind,disp) (ind*unit_size*8+disp)
-#define DISP4(ind,disp) (ind*unit_size*4+disp)
-#define DISP2(ind,disp) (ind*unit_size*2+disp)
-#define DISP1(ind,disp) (ind*unit_size+disp)
-#define DISPX(disp) (disp)
-/* HELPERS FOR SAVE */
-/* {r0,i0} and {r1,i1} into {r0,r1} {i0,i1} */
-
-
-.macro LOAD_COUPLE_AS_RR_II VS_OUT1,VS_OUT2,VS_TEMP1,VS_TEMP2,REG,LOFFSET
-#ifndef TRMMKERNEL
-    lxv \VS_TEMP1, DISPX(\LOFFSET)(\REG)
-    lxv \VS_TEMP2, DISPX(\LOFFSET+16)(\REG)
-    xxmrgld \VS_OUT1,\VS_TEMP1,\VS_TEMP2
-    xxmrghd \VS_OUT2,\VS_TEMP1,\VS_TEMP2
-#endif
-.endm
-/*from 2 results {a0r*br,a0i*bi} and {a1r*br,a1i*bi} pack into {a0r*br,a1r*br} and {a0i*bi,a1i*bi}*/
-
-
-.macro RESULT_INTO_REALREAL_IMAGEIMAGE VSIN1,VSIN2,VSOUT1,VSOUT2
-    xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*real from 2 results*/
-    xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*imag from 2 results*/
-.endm
-/*from 2 results {a0r*bi,a0i*br} and {a1r*bi,a1i*br} pack into {a0r*bi,a1r*bi} and {a0i*br,a1i*br}*/
-
-
-.macro RESULT_INTO_REALIMAG_IMAGREAL VSIN1,VSIN2,VSOUT1,VSOUT2
-    xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*imag */
-    xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*real */
-.endm
-/* {a0r*br op a0i*bi ,a1r*br op a1i*bi} ~ {r0,r1}; {a0r*bi op a0i*br ,a1r*bi op a1i*br} ~ {i0,i1}*/
-
-
-.macro AGGREGATE_REALS_IMAGES VSINR_OUT1,VSINR,VSINI_OUT2,VSINI
-#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
-    xvsubdp \VSINR_OUT1,\VSINR_OUT1,\VSINR
-    xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI
-#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
-    xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR
-    xvsubdp \VSINI_OUT2,\VSINI_OUT2,\VSINI
-#elif defined(NC) || defined(TC) || defined(NR) || defined(TR)
-    xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR
-    xvsubdp \VSINI_OUT2,\VSINI,\VSINI_OUT2
-#else // CC || CR || RC || RR
-    /*we will assume {-alpha_r,-alpha_i} for this case */
-    /*i1i2-r1r2 so we will negate alpha real instead to fix sign*/
-    xvsubdp \VSINR_OUT1,\VSINR,\VSINR_OUT1
-    /*we will negate alpha imag instead to fix sign*/
-    xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI
-#endif
-.endm
-/* {i0,i1} * {alpha_i,alpha_i} - VSOUT1 ;VSOUT2 + {r0,r1}*{alpha_i,alpha_i} */
-
-
-.macro MULT_APLHA_PART1 VSINRR,VSINII,VSOUT1,VSOUT2
-#ifndef TRMMKERNEL
-    xvmsubadp \VSOUT1,\VSINII, alpha_i
-    xvmaddadp \VSOUT2,\VSINRR, alpha_i
-#else
-    xvmuldp \VSOUT1,\VSINII, alpha_i
-    xvmuldp \VSOUT2,\VSINRR, alpha_i
-#endif
-.endm
-/* {r0,r1} * {alpha_r,alpha_r} - VSOUT1 ;VSOUT2 + {i0,i1} * {alpha_r,alpha_r} */
-
-
-.macro MULT_APLHA_PART2 VSINRR,VSINII,VSOUT1,VSOUT2
-    xvmsubadp \VSOUT1,\VSINRR, alpha_r
-    xvmaddadp \VSOUT2,\VSINII, alpha_r
-.endm
-/* unpack to store 2{r,r} {i,i} into {r,i} {r,i} (big endian because of stxv) */
-
-
-.macro UNPACK_FOR_STORE VSIN1,VSIN2,VSOUT1,VSOUT2
-    xxmrghd \VSOUT1,\VSIN2,\VSIN1
-    xxmrgld \VSOUT2,\VSIN2,\VSIN1
-.endm
-
-
-.macro STORE_COUPLE REG,LOFFSET,VSIN1,VSIN2
-    stxv \VSIN1, DISPX(\LOFFSET)(\REG)
-    stxv \VSIN2, DISPX(\LOFFSET+16)(\REG)
-.endm
-
-
-.macro SAVE8 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,VSRes9,VSRes10,VSRes11,VSRes12,VSRes13,VSRes14,VSRes15,VSRes16,BASE_REG,LOFFSET
-    RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3
-    LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET
-    RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5
-    LOAD_COUPLE_AS_RR_II vs16,vs17,vs20,vs21,\BASE_REG,(\LOFFSET+32)
-    RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs6,vs7
-    LOAD_COUPLE_AS_RR_II vs24,vs25,vs18,vs19,\BASE_REG,(\LOFFSET +64)
-    RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs8,vs9
-    LOAD_COUPLE_AS_RR_II vs26,vs27,vs20,vs21,\BASE_REG,(\LOFFSET+96)
-    RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes9,\VSRes11,vs10,vs11
-    AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5
-    RESULT_INTO_REALIMAG_IMAGREAL \VSRes10,\VSRes12,vs12,vs13
-    AGGREGATE_REALS_IMAGES vs6,vs7,vs8,vs9
-    RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes13,\VSRes15,\VSRes1,\VSRes2
-    MULT_APLHA_PART1 vs2,vs4, vs14,vs15
-    RESULT_INTO_REALIMAG_IMAGREAL \VSRes14,\VSRes16,\VSRes3,\VSRes4
-    MULT_APLHA_PART1 vs6,vs8,vs16,vs17
-    MULT_APLHA_PART2 vs2,vs4,vs14,vs15
-    AGGREGATE_REALS_IMAGES vs10,vs11,vs12,vs13
-    MULT_APLHA_PART2 vs6,vs8,vs16,vs17
-    AGGREGATE_REALS_IMAGES \VSRes1,\VSRes2,\VSRes3,\VSRes4
-    UNPACK_FOR_STORE vs14,vs15,vs7,vs9
-    MULT_APLHA_PART1 vs10,vs12, vs24,vs25
-    UNPACK_FOR_STORE vs16,vs17,vs3,vs5
-    MULT_APLHA_PART1 \VSRes1,\VSRes3, vs26,vs27
-    STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9
-    MULT_APLHA_PART2 vs10,vs12,vs24,vs25
-    STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs3,vs5
-    MULT_APLHA_PART2 \VSRes1,\VSRes3, vs26,vs27
-    UNPACK_FOR_STORE vs24,vs25,vs10,vs12
-    UNPACK_FOR_STORE vs26,vs27,\VSRes1,\VSRes3
-    STORE_COUPLE \BASE_REG,(\LOFFSET +64),vs10,vs12
-    STORE_COUPLE \BASE_REG,(\LOFFSET+96),\VSRes1,\VSRes3
-.endm
-
-
-.macro SAVE4 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,BASE_REG,LOFFSET
-    RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3
-    LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET
-    RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5
-    LOAD_COUPLE_AS_RR_II vs16,vs17,vs20,vs21,\BASE_REG,(\LOFFSET+32)
-    RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs6,vs7
-    RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs8,vs9
-    AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5
-    AGGREGATE_REALS_IMAGES vs6,vs7,vs8,vs9
-    MULT_APLHA_PART1 vs2,vs4, vs14,vs15
-    MULT_APLHA_PART1 vs6,vs8, vs16,vs17
-    MULT_APLHA_PART2 vs2,vs4, vs14,vs15
-    MULT_APLHA_PART2 vs6,vs8,vs16,vs17
-    UNPACK_FOR_STORE vs14,vs15,vs7,vs9
-    UNPACK_FOR_STORE vs16,vs17,vs3,vs5
-    STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9
-    STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs3,vs5
-.endm
-
-
-
-.macro SAVE2 VSRes1,VSRes2,VSRes3,VSRes4,BASE_REG,LOFFSET
-    RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3
-    LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET
-    RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5
-    AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5
-    MULT_APLHA_PART1 vs2,vs4, vs14,vs15
-    MULT_APLHA_PART2 vs2,vs4, vs14,vs15
-    UNPACK_FOR_STORE vs14,vs15,vs7,vs9
-    STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9
-.endm
-
-
-
-.macro SAVE1 VSRes1,VSRes2,BASE_REG,LOFFSET
-    RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes1,vs2,vs3
-#ifndef TRMMKERNEL
-    lxv vs18, (\LOFFSET)(\BASE_REG)
-    xxmrgld vs14,vs18,vs18
-    xxmrghd vs15,vs18,vs18
-#endif
-    RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes2,vs4,vs5
-    AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5
-    MULT_APLHA_PART1 vs2,vs4, vs14,vs15
-    MULT_APLHA_PART2 vs2,vs4, vs14,vs15
-    UNPACK_FOR_STORE vs14,vs15,vs7,vs9
-    xxmrghd vs7,vs15,vs14
-    stxv vs7, (\LOFFSET)(\BASE_REG)
-.endm
-/**********************************************************************************************
-*
-
-.macros for N=2 and M=8
-**********************************************************************************************/
-
-.macro Zero2x8
-    xxlxor vs32, vs32, vs32
-    xxlxor vs33, vs33, vs33
-    xxlxor vs34, vs34, vs34
-    xxlxor vs35, vs35, vs35
-    xxlxor vs36, vs36, vs36
-    xxlxor vs37, vs37, vs37
- xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - xxlxor vs50, vs50, vs50 - xxlxor vs51, vs51, vs51 - xxlxor vs52, vs52, vs52 - xxlxor vs53, vs53, vs53 - xxlxor vs54, vs54, vs54 - xxlxor vs55, vs55, vs55 - xxlxor vs56, vs56, vs56 - xxlxor vs57, vs57, vs57 - xxlxor vs58, vs58, vs58 - xxlxor vs59, vs59, vs59 - xxlxor vs60, vs60, vs60 - xxlxor vs61, vs61, vs61 - xxlxor vs62, vs62, vs62 - xxlxor vs63, vs63, vs63 -.endm - - -.macro LOAD2x8 - LOAD2x8O 0,0 -.endm - - -.macro LOAD2x8O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs18, (\OffsetB+16)(BO) // load real,imag from B - xxswapd vs17, vs16 - xxswapd vs19, vs18 - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs2, (32+\OffsetA)(AO) // load real,imag from A - lxv vs3, (48+\OffsetA)(AO) // load real,imag from A - lxv vs4, (64+\OffsetA)(AO) // load real,imag from A - lxv vs5, (80+\OffsetA)(AO) // load real,imag from A - lxv vs6, (96+\OffsetA)(AO) // load real,imag from A - lxv vs7, (112+\OffsetA)(AO) // load real,imag from A - -.endm - - -.macro END2x8_NORMAL - END2x8 AO,BO,128,32 -.endm - - -.macro END2x8_WITHOUT_ADD - END2x8 AO,BO,0,0 -.endm - - -.macro END2x8 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs48, vs0, vs18 - xvmaddadp vs33, vs0, vs17 - xvmaddadp vs49, vs0, vs19 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs50, vs1, vs18 - xvmaddadp vs35, vs1, vs17 - xvmaddadp vs51, vs1, vs19 - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs52, vs2, vs18 - xvmaddadp vs37, vs2, vs17 - xvmaddadp vs53, vs2, vs19 - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs54, vs3, vs18 - xvmaddadp vs39, vs3, vs17 - xvmaddadp vs55, vs3, vs19 - xvmaddadp vs40, vs4, vs16 - xvmaddadp vs56, vs4, vs18 - xvmaddadp vs41, vs4, vs17 - xvmaddadp vs57, vs4, vs19 - xvmaddadp vs42, vs5, vs16 - xvmaddadp vs58, vs5, vs18 - xvmaddadp vs43, vs5, vs17 - xvmaddadp vs59, vs5, vs19 - xvmaddadp vs44, vs6, vs16 - xvmaddadp vs60, vs6, vs18 - xvmaddadp vs45, vs6, vs17 - xvmaddadp vs61, vs6, vs19 - xvmaddadp vs46, vs7, vs16 - xvmaddadp vs62, vs7, vs18 - xvmaddadp vs47, vs7, vs17 - xvmaddadp vs63, vs7, vs19 -.endm - - -.macro LOAD2x8_2 - LOAD2x8_2O 0,0 -.endm - - -.macro LOAD2x8_2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs18, (\OffsetB+16)(BO) // load real,imag from B - lxv vs20, (\OffsetB+32)(BO) // load real,imag from B - lxv vs22, (\OffsetB+48)(BO) // load real,imag from B - xxswapd vs17, vs16 - xxswapd vs19, vs18 - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs2, (32+\OffsetA)(AO) // load real,imag from A - lxv vs3, (48+\OffsetA)(AO) // load real,imag from A - lxv vs4, (64+\OffsetA)(AO) // load real,imag from A - lxv vs5, (80+\OffsetA)(AO) // load real,imag from A - lxv vs6, (96+\OffsetA)(AO) // load real,imag from A - lxv vs7, (112+\OffsetA)(AO) // load real,imag from A - lxv vs8, (128+0+\OffsetA)(AO) // load real,imag from A - lxv vs9, (128+16+\OffsetA)(AO) // load real,imag from A - lxv vs10, (128+32+\OffsetA)(AO) // load real,imag from A - lxv vs11, (128+48+\OffsetA)(AO) // load real,imag from A - lxv vs12, 
(128+64+\OffsetA)(AO) // load real,imag from A - lxv vs13, (128+80+\OffsetA)(AO) // load real,imag from A - lxv vs14, (128+96+\OffsetA)(AO) // load real,imag from A - lxv vs15, (128+112+\OffsetA)(AO) // load real,imag from A -.endm - - -.macro END2x8_2 - /*for load2 offset will be 256 and 64*/ - KERNEL2x8_2 AO,BO, 256,64,0 ,1,1 -.endm - - - -.macro KERNEL2x8_E2 OffsetA,OffsetB, Index,IsLast - KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL2x8_L2 OffsetA,OffsetB, Index,IsLast - KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL2x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs48, vs0, vs18 - xvmaddadp vs33, vs0, vs17 - xvmaddadp vs49, vs0, vs19 - xxswapd vs21, vs20 - xxswapd vs23, vs22 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs50, vs1, vs18 - xvmaddadp vs35, vs1, vs17 - xvmaddadp vs51, vs1, vs19 -.if \Complete==0 - lxv vs0, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs52, vs2, vs18 - xvmaddadp vs37, vs2, vs17 - xvmaddadp vs53, vs2, vs19 - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs54, vs3, vs18 - xvmaddadp vs39, vs3, vs17 - xvmaddadp vs55, vs3, vs19 -.if \Complete==0 - lxv vs2, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs3, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs40, vs4, vs16 - xvmaddadp vs56, vs4, vs18 - xvmaddadp vs41, vs4, vs17 - xvmaddadp vs57, vs4, vs19 - xvmaddadp vs42, vs5, vs16 - xvmaddadp vs58, vs5, vs18 - xvmaddadp vs43, vs5, vs17 - xvmaddadp vs59, vs5, vs19 -.if \Complete==0 - lxv vs4, DISP16(\Index,64+ \OffsetA)(\AREG) // load real,imag from A - lxv vs5, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs44, vs6, vs16 - xvmaddadp vs60, vs6, vs18 - xvmaddadp vs45, vs6, vs17 - xvmaddadp vs61, vs6, vs19 - xvmaddadp vs46, vs7, vs16 - xvmaddadp vs62, vs7, vs18 - xvmaddadp vs47, vs7, vs17 - xvmaddadp vs63, vs7, vs19 -.if \Complete==0 - lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B - lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B -.endif - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs48, vs8, vs22 -.if \Complete==0 - lxv vs6, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs7, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs33, vs8, vs21 - xvmaddadp vs49, vs8, vs23 -.if \Complete==0 - xxswapd vs17, vs16 - xxswapd vs19, vs18 -.endif - xvmaddadp vs34, vs9, vs20 - xvmaddadp vs50, vs9, vs22 - xvmaddadp vs35, vs9, vs21 - xvmaddadp vs51, vs9, vs23 -.if \Complete==0 - lxv vs8, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs36, vs10, vs20 - xvmaddadp vs52, vs10, vs22 - xvmaddadp vs37, vs10, vs21 - xvmaddadp vs53, vs10, vs23 - xvmaddadp vs38, vs11, vs20 - xvmaddadp vs54, vs11, vs22 - xvmaddadp vs39, vs11, vs21 - xvmaddadp vs55, vs11, vs23 -.if \Complete==0 - lxv vs10, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs11, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs40, vs12, vs20 - xvmaddadp vs56, vs12, vs22 - xvmaddadp vs41, vs12, vs21 - xvmaddadp vs57, vs12, vs23 - xvmaddadp vs42, vs13, vs20 - xvmaddadp vs58, vs13, vs22 - xvmaddadp vs43, vs13, vs21 - xvmaddadp vs59, vs13, 
vs23 -.if \Complete==0 - lxv vs12, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A - lxv vs13, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs44, vs14, vs20 - xvmaddadp vs60, vs14, vs22 - xvmaddadp vs45, vs14, vs21 - xvmaddadp vs61, vs14, vs23 - xvmaddadp vs46, vs15, vs20 - xvmaddadp vs62, vs15, vs22 - xvmaddadp vs47, vs15, vs21 - xvmaddadp vs63, vs15, vs23 -.if \Complete==0 - lxv vs14, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs15, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A - lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B - lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP16(\Index,\OffsetA) - addi \BREG, \BREG, DISP4(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP16(\Index,256) - addi \BREG, \BREG, DISP4(\Index,64) -.endif -.endif -.endm - - - - - -.macro KERNEL2x8 - LOAD2x8 - END2x8 AO, BO, 128,32 -.endm - - -.macro SAVE2x8 - add T1, CO ,LDC - SAVE8 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,CO,0 - SAVE8 vs48,vs49,vs50,vs51,vs52,vs53,vs54,vs55,vs56,vs57,vs58,vs59,vs60,vs61,vs62,vs63,T1,0 - addi CO, CO, 128 -.endm -/********************************************************************************************** -* - -.macros for N=2 and M=4 -**********************************************************************************************/ - - -.macro Zero2x4 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 -.endm - - -.macro LOAD2x4 - LOAD2x4O 0,0 -.endm - - -.macro LOAD2x4O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs18, (\OffsetB+16)(BO) // load real,imag from B - xxswapd vs17, vs16 - xxswapd vs19, vs18 - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs2, (32+\OffsetA)(AO) // load real,imag from A - lxv vs3, (48+\OffsetA)(AO) // load real,imag from A -.endm - - -.macro END2x4_NORMAL - END2x4 AO,BO,64,32 -.endm - - -.macro END2x4_WITHOUT_ADD - END2x4 AO,BO,0,0 -.endm - - -.macro END2x4 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs40, vs0, vs18 - xvmaddadp vs33, vs0, vs17 - xvmaddadp vs41, vs0, vs19 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs42, vs1, vs18 - xvmaddadp vs35, vs1, vs17 - xvmaddadp vs43, vs1, vs19 - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs44, vs2, vs18 - xvmaddadp vs37, vs2, vs17 - xvmaddadp vs45, vs2, vs19 - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs46, vs3, vs18 - xvmaddadp vs39, vs3, vs17 - xvmaddadp vs47, vs3, vs19 - -.endm - - -.macro LOAD2x4_2 - LOAD2x4_2O 0,0 -.endm - - -.macro LOAD2x4_2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs18, (\OffsetB+16)(BO) // load real,imag from B - lxv vs20, (\OffsetB+32)(BO) // load real,imag from B - lxv vs22, (\OffsetB+48)(BO) // load real,imag from B - xxswapd vs17, vs16 - xxswapd vs19, vs18 - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, 
(16+\OffsetA)(AO) // load real,imag from A - lxv vs2, (32+\OffsetA)(AO) // load real,imag from A - lxv vs3, (48+\OffsetA)(AO) // load real,imag from A - lxv vs8, (64+\OffsetA)(AO) // load real,imag from A - lxv vs9, (80+\OffsetA)(AO) // load real,imag from A - lxv vs10, (96+\OffsetA)(AO) // load real,imag from A - lxv vs11, (112+\OffsetA)(AO) // load real,imag from A -.endm - - -.macro END2x4_2 - /*for load2 offset will be 128 and 64*/ - KERNEL2x4_2 AO,BO, 128,64,0 ,1,1 -.endm - - - -.macro KERNEL2x4_E2 OffsetA,OffsetB, Index,IsLast - KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL2x4_L2 OffsetA,OffsetB, Index,IsLast - KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL2x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs40, vs0, vs18 - xvmaddadp vs33, vs0, vs17 - xvmaddadp vs41, vs0, vs19 - xxswapd vs21, vs20 - xxswapd vs23, vs22 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs42, vs1, vs18 - xvmaddadp vs35, vs1, vs17 - xvmaddadp vs43, vs1, vs19 -.if \Complete==0 - lxv vs0, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs44, vs2, vs18 - xvmaddadp vs37, vs2, vs17 - xvmaddadp vs45, vs2, vs19 - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs46, vs3, vs18 - xvmaddadp vs39, vs3, vs17 - xvmaddadp vs47, vs3, vs19 -.if \Complete==0 - lxv vs2, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs3, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A -.endif - -.if \Complete==0 - lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B - lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B -.endif - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs40, vs8, vs22 - xvmaddadp vs33, vs8, vs21 - xvmaddadp vs41, vs8, vs23 -.if \Complete==0 - xxswapd vs17, vs16 - xxswapd vs19, vs18 -.endif - xvmaddadp vs34, vs9, vs20 - xvmaddadp vs42, vs9, vs22 - xvmaddadp vs35, vs9, vs21 - xvmaddadp vs43, vs9, vs23 -.if \Complete==0 - lxv vs8, DISP8(\Index,64+0+ \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs36, vs10, vs20 - xvmaddadp vs44, vs10, vs22 - xvmaddadp vs37, vs10, vs21 - xvmaddadp vs45, vs10, vs23 - xvmaddadp vs38, vs11, vs20 - xvmaddadp vs46, vs11, vs22 - xvmaddadp vs39, vs11, vs21 - xvmaddadp vs47, vs11, vs23 -.if \Complete==0 - lxv vs10, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs11, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A -.endif - -.if \Complete==0 - lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B - lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP8(\Index,\OffsetA) - addi \BREG, \BREG, DISP4(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP8(\Index,128) - addi \BREG, \BREG, DISP4(\Index,64) -.endif -.endif -.endm - - - -.macro KERNEL2x4 - LOAD2x4 - END2x4 AO, BO, 64,32 -.endm - - - -.macro SAVE2x4 - add T1, CO ,LDC - SAVE4 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,CO,0 - SAVE4 vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,T1,0 - addi CO, CO, 64 -.endm -/********************************************************************************************** -* - -.macros for N=2 and M=2 -**********************************************************************************************/ - - -.macro Zero2x2 - xxlxor 
vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - -.endm - - -.macro LOAD2x2 - LOAD2x2O 0,0 -.endm - - -.macro LOAD2x2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs18, (\OffsetB+16)(BO) // load real,imag from B - xxswapd vs17, vs16 - xxswapd vs19, vs18 - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - -.endm - - -.macro END2x2_NORMAL - END2x2 AO,BO,32,32 -.endm - - -.macro END2x2_WITHOUT_ADD - END2x2 AO,BO,0,0 -.endm - - -.macro END2x2 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs36, vs0, vs18 - xvmaddadp vs33, vs0, vs17 - xvmaddadp vs37, vs0, vs19 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs38, vs1, vs18 - xvmaddadp vs35, vs1, vs17 - xvmaddadp vs39, vs1, vs19 - -.endm - - -.macro LOAD2x2_2 - LOAD2x2_2O 0,0 -.endm - - -.macro LOAD2x2_2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs18, (\OffsetB+16)(BO) // load real,imag from B - lxv vs20, (\OffsetB+32)(BO) // load real,imag from B - lxv vs22, (\OffsetB+48)(BO) // load real,imag from B - xxswapd vs17, vs16 - xxswapd vs19, vs18 - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs8, (32+\OffsetA)(AO) // load real,imag from A - lxv vs9, (48+\OffsetA)(AO) // load real,imag from A - -.endm - - -.macro END2x2_2 - /*for load2 offset will be 64 and 64*/ - KERNEL2x2_2 AO,BO, 64,64,0 ,1,1 -.endm - - - -.macro KERNEL2x2_E2 OffsetA,OffsetB, Index,IsLast - KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL2x2_L2 OffsetA,OffsetB, Index,IsLast - KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL2x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs36, vs0, vs18 - xvmaddadp vs33, vs0, vs17 - xvmaddadp vs37, vs0, vs19 - xxswapd vs21, vs20 - xxswapd vs23, vs22 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs38, vs1, vs18 - xvmaddadp vs35, vs1, vs17 - xvmaddadp vs39, vs1, vs19 -.if \Complete==0 - lxv vs0, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A -.endif -.if \Complete==0 - lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B - lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B -.endif - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs36, vs8, vs22 - xvmaddadp vs33, vs8, vs21 - xvmaddadp vs37, vs8, vs23 -.if \Complete==0 - xxswapd vs17, vs16 - xxswapd vs19, vs18 -.endif - xvmaddadp vs34, vs9, vs20 - xvmaddadp vs38, vs9, vs22 - xvmaddadp vs35, vs9, vs21 - xvmaddadp vs39, vs9, vs23 -.if \Complete==0 - lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B - lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \Complete==0 - lxv vs8, DISP4(\Index,32+0+ \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP4(\Index,32+16 + \OffsetA)(\AREG) // load real,imag from A -.endif - - - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP4(\Index,\OffsetA) - addi \BREG, \BREG, DISP4(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP4(\Index,64) - addi \BREG, \BREG, DISP4(\Index,64) -.endif -.endif -.endm - - - -.macro KERNEL2x2 - LOAD2x2 - 
END2x2 AO, BO, 32,32 -.endm - - - -.macro SAVE2x2 - add T1, CO ,LDC - SAVE2 vs32,vs33,vs34,vs35,CO,0 - SAVE2 vs36,vs37,vs38,vs39,T1,0 - addi CO, CO, 32 -.endm -/********************************************************************************************** -* - -.macros for N=2 and M=1 -**********************************************************************************************/ - - - -.macro Zero2x1 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - -.endm - - -.macro LOAD2x1 - LOAD2x1O 0,0 -.endm - - -.macro LOAD2x1O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs18, (\OffsetB+16)(BO) // load real,imag from B - xxswapd vs17, vs16 - xxswapd vs19, vs18 - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A -.endm - - -.macro END2x1_NORMAL - END2x1 AO,BO,16,32 -.endm - - -.macro END2x1_WITHOUT_ADD - END2x1 AO,BO,0,0 -.endm - - -.macro END2x1 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs34, vs0, vs18 - xvmaddadp vs33, vs0, vs17 - xvmaddadp vs35, vs0, vs19 -.endm - - -.macro LOAD2x1_2 - LOAD2x1_2O 0,0 -.endm - - -.macro LOAD2x1_2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs18, (\OffsetB+16)(BO) // load real,imag from B - lxv vs20, (\OffsetB+32)(BO) // load real,imag from B - lxv vs22, (\OffsetB+48)(BO) // load real,imag from B - xxswapd vs17, vs16 - xxswapd vs19, vs18 - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs8, (16+\OffsetA)(AO) // load real,imag from A -.endm - - -.macro END2x1_2 - /*for load2 offset will be 32 and 64*/ - KERNEL2x1_2 AO,BO, 32,64,0 ,1,1 -.endm - - - -.macro KERNEL2x1_E2 OffsetA,OffsetB, Index,IsLast - KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL2x1_L2 OffsetA,OffsetB, Index,IsLast - KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xxswapd vs21, vs20 - xxswapd vs23, vs22 - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs34, vs0, vs18 - xvmaddadp vs33, vs0, vs17 - xvmaddadp vs35, vs0, vs19 -.if \Complete==0 - lxv vs0, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A -.endif -.if \Complete==0 - lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B - lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \Complete==0 - xxswapd vs17, vs16 - xxswapd vs19, vs18 -.endif - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs34, vs8, vs22 - xvmaddadp vs33, vs8, vs21 - xvmaddadp vs35, vs8, vs23 -.if \Complete==0 - lxv vs8, DISP2(\Index,16+0+ \OffsetA)(\AREG) // load real,imag from A -.endif - -.if \Complete==0 - lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B - lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP2(\Index,\OffsetA) - addi \BREG, \BREG, DISP4(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP2(\Index,32) - addi \BREG, \BREG, DISP4(\Index,64) -.endif -.endif -.endm - - - -.macro KERNEL2x1 - LOAD2x1 - END2x1 AO, BO, 16,32 -.endm - - - -.macro SAVE2x1 - add T1, CO ,LDC - SAVE1 vs32,vs33,CO,0 - SAVE1 vs34,vs35,T1,0 - addi CO, CO, 16 -.endm - -/********************************************************************************************** -* - -.macros for N=1 and M=8 
-**********************************************************************************************/ - - -.macro Zero1x8 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 - xxlxor vs48, vs48, vs48 -.endm - - -.macro LOAD1x8 - LOAD1x8O 0,0 -.endm - - -.macro LOAD1x8O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - xxswapd vs17, vs16 - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs2, (32+\OffsetA)(AO) // load real,imag from A - lxv vs3, (48+\OffsetA)(AO) // load real,imag from A - lxv vs4, (64+\OffsetA)(AO) // load real,imag from A - lxv vs5, (80+\OffsetA)(AO) // load real,imag from A - lxv vs6, (96+\OffsetA)(AO) // load real,imag from A - lxv vs7, (112+\OffsetA)(AO) // load real,imag from A - -.endm - - -.macro END1x8_NORMAL - END1x8 AO,BO,128,16 -.endm - - -.macro END1x8_WITHOUT_ADD - END1x8 AO,BO,0,0 -.endm - - -.macro END1x8 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 - - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs35, vs1, vs17 - - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs37, vs2, vs17 - - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs39, vs3, vs17 - - xvmaddadp vs40, vs4, vs16 - xvmaddadp vs41, vs4, vs17 - - xvmaddadp vs42, vs5, vs16 - xvmaddadp vs43, vs5, vs17 - - xvmaddadp vs44, vs6, vs16 - xvmaddadp vs45, vs6, vs17 - - xvmaddadp vs46, vs7, vs16 - xvmaddadp vs47, vs7, vs17 - -.endm - - -.macro LOAD1x8_2 - LOAD1x8_2O 0,0 -.endm - - -.macro LOAD1x8_2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs20, (\OffsetB+16)(BO) // load real,imag from B - xxswapd vs17, vs16 - - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs2, (32+\OffsetA)(AO) // load real,imag from A - lxv vs3, (48+\OffsetA)(AO) // load real,imag from A - lxv vs4, (64+\OffsetA)(AO) // load real,imag from A - lxv vs5, (80+\OffsetA)(AO) // load real,imag from A - lxv vs6, (96+\OffsetA)(AO) // load real,imag from A - lxv vs7, (112+\OffsetA)(AO) // load real,imag from A - lxv vs8, (128+0+\OffsetA)(AO) // load real,imag from A - lxv vs9, (128+16+\OffsetA)(AO) // load real,imag from A - lxv vs10, (128+32+\OffsetA)(AO) // load real,imag from A - lxv vs11, (128+48+\OffsetA)(AO) // load real,imag from A - lxv vs12, (128+64+\OffsetA)(AO) // load real,imag from A - lxv vs13, (128+80+\OffsetA)(AO) // load real,imag from A - lxv vs14, (128+96+\OffsetA)(AO) // load real,imag from A - lxv vs15, (128+112+\OffsetA)(AO) // load real,imag from A -.endm - - -.macro END1x8_2 - /*for load2 offset will be 256 and 32*/ - KERNEL1x8_2 AO,BO, 256,32,0 ,1,1 -.endm - - - -.macro KERNEL1x8_E2 OffsetA,OffsetB, Index,IsLast - KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL1x8_L2 OffsetA,OffsetB, Index,IsLast - KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL1x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 - xxswapd vs21, vs20 - xvmaddadp vs34, vs1, 
vs16 - xvmaddadp vs35, vs1, vs17 -.if \Complete==0 - lxv vs0, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs37, vs2, vs17 - - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs39, vs3, vs17 -.if \Complete==0 - lxv vs2, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs3, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs40, vs4, vs16 - xvmaddadp vs41, vs4, vs17 - - xvmaddadp vs42, vs5, vs16 - xvmaddadp vs43, vs5, vs17 -.if \Complete==0 - lxv vs4, DISP16(\Index,64+ \OffsetA)(\AREG) // load real,imag from A - lxv vs5, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs44, vs6, vs16 - xvmaddadp vs45, vs6, vs17 - - xvmaddadp vs46, vs7, vs16 - xvmaddadp vs47, vs7, vs17 -.if \Complete==0 - lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B -.endif -.if \Complete==0 - xxswapd vs17, vs16 -.endif - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs33, vs8, vs21 -.if \Complete==0 - lxv vs6, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs7, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs34, vs9, vs20 - xvmaddadp vs35, vs9, vs21 -.if \Complete==0 - lxv vs8, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs36, vs10, vs20 - xvmaddadp vs37, vs10, vs21 - xvmaddadp vs38, vs11, vs20 - xvmaddadp vs39, vs11, vs21 -.if \Complete==0 - lxv vs10, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs11, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs40, vs12, vs20 - xvmaddadp vs41, vs12, vs21 - xvmaddadp vs42, vs13, vs20 - xvmaddadp vs43, vs13, vs21 -.if \Complete==0 - lxv vs12, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A - lxv vs13, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs44, vs14, vs20 - xvmaddadp vs45, vs14, vs21 - xvmaddadp vs46, vs15, vs20 - xvmaddadp vs47, vs15, vs21 -.if \Complete==0 - lxv vs14, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs15, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A - lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP16(\Index,\OffsetA) - addi \BREG, \BREG, DISP2(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP16(\Index,256) - addi \BREG, \BREG, DISP2(\Index,32) -.endif -.endif -.endm - - - - - -.macro KERNEL1x8 - LOAD1x8 - END1x8 AO, BO, 128,16 -.endm - - -.macro SAVE1x8 - SAVE8 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,CO,0 - addi CO, CO, 128 -.endm -/********************************************************************************************** -* - -.macros for N=2 and M=4 -**********************************************************************************************/ - - -.macro Zero1x4 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 -.endm - - -.macro LOAD1x4 - LOAD1x4O 0,0 -.endm - - -.macro LOAD1x4O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - xxswapd vs17, vs16 - - lxv vs0, (0+\OffsetA)(AO) // load real,imag 
from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs2, (32+\OffsetA)(AO) // load real,imag from A - lxv vs3, (48+\OffsetA)(AO) // load real,imag from A - -.endm - - -.macro END1x4_NORMAL - END1x4 AO,BO,64,16 -.endm - - -.macro END1x4_WITHOUT_ADD - END1x4 AO,BO,0,0 -.endm - - -.macro END1x4 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 - - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs35, vs1, vs17 - - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs37, vs2, vs17 - - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs39, vs3, vs17 - -.endm - - -.macro LOAD1x4_2 - LOAD1x4_2O 0,0 -.endm - - -.macro LOAD1x4_2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs20, (\OffsetB+16)(BO) // load real,imag from B - xxswapd vs17, vs16 - - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs2, (32+\OffsetA)(AO) // load real,imag from A - lxv vs3, (48+\OffsetA)(AO) // load real,imag from A - lxv vs8, (64+\OffsetA)(AO) // load real,imag from A - lxv vs9, (80+\OffsetA)(AO) // load real,imag from A - lxv vs10, (96+\OffsetA)(AO) // load real,imag from A - lxv vs11, (112+\OffsetA)(AO) // load real,imag from A -.endm - - -.macro END1x4_2 - /*for load2 offset will be 128 and 32*/ - KERNEL1x4_2 AO,BO, 128,32,0 ,1,1 -.endm - - - -.macro KERNEL1x4_E2 OffsetA,OffsetB, Index,IsLast - KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL1x4_L2 OffsetA,OffsetB, Index,IsLast - KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL1x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 - xxswapd vs21, vs20 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs35, vs1, vs17 -.if \Complete==0 - lxv vs0, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs37, vs2, vs17 - - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs39, vs3, vs17 -.if \Complete==0 - lxv vs2, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs3, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A -.endif - -.if \Complete==0 - lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B -.endif - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs33, vs8, vs21 -.if \Complete==0 - xxswapd vs17, vs16 -.endif - xvmaddadp vs34, vs9, vs20 - xvmaddadp vs35, vs9, vs21 -.if \Complete==0 - lxv vs8, DISP8(\Index,64+0+ \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs36, vs10, vs20 - xvmaddadp vs37, vs10, vs21 - xvmaddadp vs38, vs11, vs20 - xvmaddadp vs39, vs11, vs21 -.if \Complete==0 - lxv vs10, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs11, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A -.endif - -.if \Complete==0 - lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP8(\Index,\OffsetA) - addi \BREG, \BREG, DISP2(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP8(\Index,128) - addi \BREG, \BREG, DISP2(\Index,32) -.endif -.endif -.endm - - - -.macro KERNEL1x4 - LOAD1x4 - END1x4 AO, BO, 64,16 -.endm - - - -.macro SAVE1x4 - SAVE4 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,CO,0 - addi CO, 
CO, 64 -.endm -/********************************************************************************************** -* - -.macros for N=2 and M=2 -**********************************************************************************************/ - - -.macro Zero1x2 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - -.endm - - -.macro LOAD1x2 - LOAD1x2O 0,0 -.endm - - -.macro LOAD1x2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - xxswapd vs17, vs16 - - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - -.endm - - -.macro END1x2_NORMAL - END1x2 AO,BO,32,16 -.endm - - -.macro END1x2_WITHOUT_ADD - END1x2 AO,BO,0,0 -.endm - - -.macro END1x2 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 - - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs35, vs1, vs17 - -.endm - - -.macro LOAD1x2_2 - LOAD1x2_2O 0,0 -.endm - - -.macro LOAD1x2_2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs20, (\OffsetB+16)(BO) // load real,imag from B - xxswapd vs17, vs16 - - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs8, (32+\OffsetA)(AO) // load real,imag from A - lxv vs9, (48+\OffsetA)(AO) // load real,imag from A -.endm - - -.macro END1x2_2 - /*for load2 offset will be 64 and 32*/ - KERNEL1x2_2 AO,BO, 64,32,0 ,1,1 -.endm - - - -.macro KERNEL1x2_E2 OffsetA,OffsetB, Index,IsLast - KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL1x2_L2 OffsetA,OffsetB, Index,IsLast - KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL1x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 - xxswapd vs21, vs20 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs35, vs1, vs17 -.if \Complete==0 - lxv vs0, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A -.endif -.if \Complete==0 - lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B -.endif - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs33, vs8, vs21 -.if \Complete==0 - xxswapd vs17, vs16 -.endif - xvmaddadp vs34, vs9, vs20 - xvmaddadp vs35, vs9, vs21 -.if \Complete==0 - lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \Complete==0 - lxv vs8, DISP4(\Index,32+0+ \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP4(\Index,32+16 + \OffsetA)(\AREG) // load real,imag from A -.endif - - - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP4(\Index,\OffsetA) - addi \BREG, \BREG, DISP2(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP4(\Index,64) - addi \BREG, \BREG, DISP2(\Index,32) -.endif -.endif -.endm - - - -.macro KERNEL1x2 - LOAD1x2 - END1x2 AO, BO, 32,16 -.endm - - - -.macro SAVE1x2 - SAVE2 vs32,vs33,vs34,vs35,CO,0 - addi CO, CO, 32 -.endm -/********************************************************************************************** -* - -.macros for N=2 and M=1 -**********************************************************************************************/ - - - -.macro Zero1x1 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 -.endm - - -.macro LOAD1x1 - LOAD1x1O 0,0 -.endm - - -.macro LOAD1x1O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs0, 
(0+\OffsetA)(AO) // load real,imag from A - xxswapd vs17, vs16 - -.endm - - -.macro END1x1_NORMAL - END1x1 AO,BO,16,16 -.endm - - -.macro END1x1_WITHOUT_ADD - END1x1 AO,BO,0,0 -.endm - - -.macro END1x1 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 -.endm - - -.macro LOAD1x1_2 - LOAD1x1_2O 0,0 -.endm - - -.macro LOAD1x1_2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs20, (\OffsetB+16)(BO) // load real,imag from B - xxswapd vs17, vs16 - - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs8, (16+\OffsetA)(AO) // load real,imag from A -.endm - - -.macro END1x1_2 - /*for load2 offset will be 32 and 32*/ - KERNEL1x1_2 AO,BO, 32,32,0 ,1,1 -.endm - - - -.macro KERNEL1x1_E2 OffsetA,OffsetB, Index,IsLast - KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL1x1_L2 OffsetA,OffsetB, Index,IsLast - KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xxswapd vs21, vs20 - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 -.if \Complete==0 - lxv vs0, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A -.endif -.if \Complete==0 - lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B -.endif -.if \Complete==0 - xxswapd vs17, vs16 -.endif - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs33, vs8, vs21 -.if \Complete==0 - lxv vs8, DISP2(\Index,16+0+ \OffsetA)(\AREG) // load real,imag from A -.endif - -.if \Complete==0 - lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP2(\Index,\OffsetA) - addi \BREG, \BREG, DISP2(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP2(\Index,32) - addi \BREG, \BREG, DISP2(\Index,32) -.endif -.endif -.endm - - - -.macro KERNEL1x1 - LOAD1x1 - END1x1 AO, BO, 16,16 -.endm - - - -.macro SAVE1x1 - SAVE1 vs32,vs33,CO,0 - addi CO, CO, 16 -.endm - -/****************************TRMM POINTER REFRESH - -.macroSES*************************/ - - -.macro SHIFT_REG REG1,REG2,SHIFT_VAL - .if \SHIFT_VAL==16 - slwi \REG1, \REG2, 8 - .elseif \SHIFT_VAL==8 - slwi \REG1, \REG2, 7 - .elseif \SHIFT_VAL==4 - slwi \REG1, \REG2, 6 - .elseif \SHIFT_VAL==2 - slwi \REG1, \REG2, 5 - .elseif \SHIFT_VAL==1 - slwi \REG1, \REG2, 4 - .endif -.endm -/* -//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -// ptrbb = bb; -// #else -// ptrba += off*16; -// ptrbb = bb + off*2; -// #endif -*/ - - -.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B - #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - /* ptrbb = bb;*/ - mr \PTR_B,\B_VAL /* refresh BPOINT */ - #else - /* - // ptrba =ptrba+ off*C_A; - // ptrbb = bb + off*C_B; - */ - SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ - SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ - add \PTR_B, \B_VAL , T4 /* Add values to BO */ - add \PTR_A, \PTR_A, T2 /* Add values to AO */ - #endif -.endm - -/* -// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -// temp = bk-off; -// #elif defined(LEFT) -// temp = off+16; // number of values in A -// #else -// temp = off+2; // number of values in B -// #endif -*/ - - -.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B - #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 
- /* temp = bk-off;*/ - sub \TEMP_BK,\BK_VAL,\OFF_VAL - #elif defined(LEFT) - /* temp = off+INCR_A; // number of values in A */ - addi \TEMP_BK, \OFF_VAL, \INCR_A - #else - /* temp = off+INCR_B // number of values in B*/ - addi \TEMP_BK,\OFF_VAL, \INCR_B - #endif -.endm -/* -// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -// temp = bk - off; -// #ifdef LEFT -// temp -= 16; // number of values in A -// #else -// temp -= 2; // number of values in B -// #endif -// ptrba += temp*16; -// ptrbb += temp*2; -// #endif -// #ifdef LEFT -// off += 16; // number of values in A -// #endif -*/ - - - -.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B - #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - /*temp = bk - off;*/ - sub \TEMP_BK,\BK_VAL,\OFF_VAL - #ifdef LEFT - /*temp -= 8; // number of values in A*/ - addi \TEMP_BK,\TEMP_BK,-\C_A - #else - /*temp -= 4; // number of values in B*/ - addi \TEMP_BK,\TEMP_BK,-\C_B - #endif - /*ptrba += temp*C_A; - ptrbb += temp*C_B;*/ - SHIFT_REG T4,\TEMP_BK,\C_A - SHIFT_REG T2,\TEMP_BK,\C_B - add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ - add \PTR_B, \PTR_B,T2 - #endif - #ifdef LEFT - /*off += 8; // number of values in A*/ - addi \OFF_VAL,\OFF_VAL,\C_A - #endif +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define unit_size 16 +#define DISP32(ind,disp) (ind*unit_size*32+disp) +#define DISP16(ind,disp) (ind*unit_size*16+disp) +#define DISP8(ind,disp) (ind*unit_size*8+disp) +#define DISP4(ind,disp) (ind*unit_size*4+disp) +#define DISP2(ind,disp) (ind*unit_size*2+disp) +#define DISP1(ind,disp) (ind*unit_size+disp) +#define DISPX(disp) (disp) +/* HELPERS FOR SAVE */ +/* {r0,i0} and {r1,i1} into {r0,r1} {i0,i1} */ + + +.macro LOAD_COUPLE_AS_RR_II VS_OUT1,VS_OUT2,VS_TEMP1,VS_TEMP2,REG,LOFFSET +#ifndef TRMMKERNEL + lxv \VS_TEMP1, DISPX(\LOFFSET)(\REG) + lxv \VS_TEMP2, DISPX(\LOFFSET+16)(\REG) + xxmrgld \VS_OUT1,\VS_TEMP1,\VS_TEMP2 + xxmrghd \VS_OUT2,\VS_TEMP1,\VS_TEMP2 +#endif +.endm +/*from 2 result {a0r*br,a0i*bi} and {a1r*br,a1i*bi} pack into {a0r*br,a1r*br} and {a0i*bi,a1i*bi}*/ + + +.macro RESULT_INTO_REALREAL_IMAGEIMAGE VSIN1,VSIN2,VSOUT1,VSOUT2 + xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*real from 2 results*/ + xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*imag from 2 results*/ +.endm +/*from 2 result {a0r*bi,a0i*br} and {a1r*bi,a1i*br} pack into {a0r*bi,a1r*bi} and {a0i*br,a1i*br}*/ + + +.macro RESULT_INTO_REALIMAG_IMAGREAL VSIN1,VSIN2,VSOUT1,VSOUT2 + xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*imag */ + xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*real*/ +.endm +/* {a0r*br op a0i*bi ,a1r*br op a1i*bi} ~ {r0,r1}; {a0r*bi op a0i*br ,a1r*bi op a1i*br} ~ {i0,i1}*/ + + +.macro AGGREGATE_REALS_IMAGES VSINR_OUT1,VSINR,VSINI_OUT2,VSINI +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + xvsubdp \VSINR_OUT1,\VSINR_OUT1,\VSINR + xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR + xvsubdp \VSINI_OUT2,\VSINI_OUT2,\VSINI +#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) + xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR + xvsubdp \VSINI_OUT2,\VSINI,\VSINI_OUT2 +#else // CC || CR || RC || RR + /*we will assume {-alpha_r,-alpha_i} for this case */ + /*i1i2-r1r2 so we will negate alpha real instead to fix sign*/ + xvsubdp \VSINR_OUT1,\VSINR,\VSINR_OUT1 + /*we will negate alpha image instead instead to fix sign*/ + xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI +#endif +.endm +/* {i0,i1} * {alpha_i,alpha_i} - VSOUT1 ;VSOUT2 + {r0,r1}*{alpha_i,alpha_i} */ + + +.macro MULT_APLHA_PART1 VSINRR,VSINII,VSOUT1,VSOUT2 +#ifndef TRMMKERNEL + xvmsubadp \VSOUT1,\VSINII, alpha_i + xvmaddadp \VSOUT2,\VSINRR, alpha_i +#else + xvmuldp \VSOUT1,\VSINII, alpha_i + xvmuldp \VSOUT2,\VSINRR, alpha_i +#endif +.endm +/* {r0,r1} * {alpha_r,alpha_r} - VSOUT1 ;VSOUT2 + {i0,i1} * {alpha_r,alpha_r} */ + + +.macro MULT_APLHA_PART2 VSINRR,VSINII,VSOUT1,VSOUT2 + xvmsubadp \VSOUT1,\VSINRR, alpha_r + xvmaddadp \VSOUT2,\VSINII, alpha_r +.endm +/* unpack to store 2{r,r} {i,i} into {r,i} {r,i} (big endian because of stxv) */ + + +.macro UNPACK_FOR_STORE VSIN1,VSIN2,VSOUT1,VSOUT2 + xxmrghd \VSOUT1,\VSIN2,\VSIN1 + xxmrgld \VSOUT2,\VSIN2,\VSIN1 +.endm + + +.macro STORE_COUPLE REG,LOFFSET,VSIN1,VSIN2 + stxv \VSIN1, DISPX(\LOFFSET)(\REG) + stxv \VSIN2, DISPX(\LOFFSET+16)(\REG) +.endm + + +.macro SAVE8 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,VSRes9,VSRes10,VSRes11,VSRes12,VSRes13,VSRes14,VSRes15,VSRes16,BASE_REG,LOFFSET + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3 + LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET + RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5 + LOAD_COUPLE_AS_RR_II vs16,vs17,vs20,vs21,\BASE_REG,(\LOFFSET+32) + 
RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs6,vs7 + LOAD_COUPLE_AS_RR_II vs24,vs25,vs18,vs19,\BASE_REG,(\LOFFSET +64) + RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs8,vs9 + LOAD_COUPLE_AS_RR_II vs26,vs27,vs20,vs21,\BASE_REG,(\LOFFSET+96) + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes9,\VSRes11,vs10,vs11 + AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5 + RESULT_INTO_REALIMAG_IMAGREAL \VSRes10,\VSRes12,vs12,vs13 + AGGREGATE_REALS_IMAGES vs6,vs7,vs8,vs9 + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes13,\VSRes15,\VSRes1,\VSRes2 + MULT_APLHA_PART1 vs2,vs4, vs14,vs15 + RESULT_INTO_REALIMAG_IMAGREAL \VSRes14,\VSRes16,\VSRes3,\VSRes4 + MULT_APLHA_PART1 vs6,vs8,vs16,vs17 + MULT_APLHA_PART2 vs2,vs4,vs14,vs15 + AGGREGATE_REALS_IMAGES vs10,vs11,vs12,vs13 + MULT_APLHA_PART2 vs6,vs8,vs16,vs17 + AGGREGATE_REALS_IMAGES \VSRes1,\VSRes2,\VSRes3,\VSRes4 + UNPACK_FOR_STORE vs14,vs15,vs7,vs9 + MULT_APLHA_PART1 vs10,vs12, vs24,vs25 + UNPACK_FOR_STORE vs16,vs17,vs3,vs5 + MULT_APLHA_PART1 \VSRes1,\VSRes3, vs26,vs27 + STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9 + MULT_APLHA_PART2 vs10,vs12,vs24,vs25 + STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs3,vs5 + MULT_APLHA_PART2 \VSRes1,\VSRes3, vs26,vs27 + UNPACK_FOR_STORE vs24,vs25,vs10,vs12 + UNPACK_FOR_STORE vs26,vs27,\VSRes1,\VSRes3 + STORE_COUPLE \BASE_REG,(\LOFFSET +64),vs10,vs12 + STORE_COUPLE \BASE_REG,(\LOFFSET+96),\VSRes1,\VSRes3 +.endm + + +.macro SAVE4 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,BASE_REG,LOFFSET + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3 + LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET + RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5 + LOAD_COUPLE_AS_RR_II vs16,vs17,vs20,vs21,\BASE_REG,(\LOFFSET+32) + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs6,vs7 + RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs8,vs9 + AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5 + AGGREGATE_REALS_IMAGES vs6,vs7,vs8,vs9 + MULT_APLHA_PART1 vs2,vs4, vs14,vs15 + MULT_APLHA_PART1 vs6,vs8, vs16,vs17 + MULT_APLHA_PART2 vs2,vs4, vs14,vs15 + MULT_APLHA_PART2 vs6,vs8,vs16,vs17 + UNPACK_FOR_STORE vs14,vs15,vs7,vs9 + UNPACK_FOR_STORE vs16,vs17,vs3,vs5 + STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9 + STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs3,vs5 +.endm + + + +.macro SAVE2 VSRes1,VSRes2,VSRes3,VSRes4,BASE_REG,LOFFSET + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3 + LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET + RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5 + AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5 + MULT_APLHA_PART1 vs2,vs4, vs14,vs15 + MULT_APLHA_PART2 vs2,vs4, vs14,vs15 + UNPACK_FOR_STORE vs14,vs15,vs7,vs9 + STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9 +.endm + + + +.macro SAVE1 VSRes1,VSRes2,BASE_REG,LOFFSET + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes1,vs2,vs3 +#ifndef TRMMKERNEL + lxv vs18, (\LOFFSET)(\BASE_REG) + xxmrgld vs14,vs18,vs18 + xxmrghd vs15,vs18,vs18 +#endif + RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes2,vs4,vs5 + AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5 + MULT_APLHA_PART1 vs2,vs4, vs14,vs15 + MULT_APLHA_PART2 vs2,vs4, vs14,vs15 + UNPACK_FOR_STORE vs14,vs15,vs7,vs9 + xxmrghd vs7,vs15,vs14 + stxv vs7, (\LOFFSET)(\BASE_REG) +.endm +/********************************************************************************************** +* + +.macros for N=2 and M=8 +**********************************************************************************************/ + +.macro Zero2x8 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 
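/* Illustrative scalar model (a sketch, assuming the NN case and TRMMKERNEL not
   defined) of the update SAVE1 performs on one complex element of C.  Its two
   inputs hold {ar*br, ai*bi} (VSRes1) and {ar*bi, ai*br} (VSRes2) as accumulated
   by the kernel:

       double rr = ar*br - ai*bi;                 // AGGREGATE_REALS_IMAGES
       double ii = ar*bi + ai*br;
       C_re = C_re + rr*alpha_r - ii*alpha_i;     // MULT_APLHA_PART1 + _PART2
       C_im = C_im + rr*alpha_i + ii*alpha_r;

   SAVE2/SAVE4/SAVE8 apply the same per-element update to 2/4/8 results; the
   other conjugation cases (CN/NC/CC families) flip the signs inside
   AGGREGATE_REALS_IMAGES as shown above. */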
+ xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + xxlxor vs54, vs54, vs54 + xxlxor vs55, vs55, vs55 + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs58, vs58, vs58 + xxlxor vs59, vs59, vs59 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 + xxlxor vs62, vs62, vs62 + xxlxor vs63, vs63, vs63 +.endm + + +.macro LOAD2x8 + LOAD2x8O 0,0 +.endm + + +.macro LOAD2x8O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + lxv vs4, (64+\OffsetA)(AO) // load real,imag from A + lxv vs5, (80+\OffsetA)(AO) // load real,imag from A + lxv vs6, (96+\OffsetA)(AO) // load real,imag from A + lxv vs7, (112+\OffsetA)(AO) // load real,imag from A + +.endm + + +.macro END2x8_NORMAL + END2x8 AO,BO,128,32 +.endm + + +.macro END2x8_WITHOUT_ADD + END2x8 AO,BO,0,0 +.endm + + +.macro END2x8 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs48, vs0, vs18 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs49, vs0, vs19 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs50, vs1, vs18 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs51, vs1, vs19 + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs52, vs2, vs18 + xvmaddadp vs37, vs2, vs17 + xvmaddadp vs53, vs2, vs19 + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs54, vs3, vs18 + xvmaddadp vs39, vs3, vs17 + xvmaddadp vs55, vs3, vs19 + xvmaddadp vs40, vs4, vs16 + xvmaddadp vs56, vs4, vs18 + xvmaddadp vs41, vs4, vs17 + xvmaddadp vs57, vs4, vs19 + xvmaddadp vs42, vs5, vs16 + xvmaddadp vs58, vs5, vs18 + xvmaddadp vs43, vs5, vs17 + xvmaddadp vs59, vs5, vs19 + xvmaddadp vs44, vs6, vs16 + xvmaddadp vs60, vs6, vs18 + xvmaddadp vs45, vs6, vs17 + xvmaddadp vs61, vs6, vs19 + xvmaddadp vs46, vs7, vs16 + xvmaddadp vs62, vs7, vs18 + xvmaddadp vs47, vs7, vs17 + xvmaddadp vs63, vs7, vs19 +.endm + + +.macro LOAD2x8_2 + LOAD2x8_2O 0,0 +.endm + + +.macro LOAD2x8_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + lxv vs20, (\OffsetB+32)(BO) // load real,imag from B + lxv vs22, (\OffsetB+48)(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + lxv vs4, (64+\OffsetA)(AO) // load real,imag from A + lxv vs5, (80+\OffsetA)(AO) // load real,imag from A + lxv vs6, (96+\OffsetA)(AO) // load real,imag from A + lxv vs7, (112+\OffsetA)(AO) // load real,imag from A + lxv vs8, (128+0+\OffsetA)(AO) // load real,imag from A + lxv vs9, (128+16+\OffsetA)(AO) // load real,imag from A + lxv vs10, (128+32+\OffsetA)(AO) // load real,imag from A + lxv vs11, (128+48+\OffsetA)(AO) // load real,imag from A + lxv vs12, 
(128+64+\OffsetA)(AO) // load real,imag from A + lxv vs13, (128+80+\OffsetA)(AO) // load real,imag from A + lxv vs14, (128+96+\OffsetA)(AO) // load real,imag from A + lxv vs15, (128+112+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END2x8_2 + /*for load2 offset will be 256 and 64*/ + KERNEL2x8_2 AO,BO, 256,64,0 ,1,1 +.endm + + + +.macro KERNEL2x8_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x8_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs48, vs0, vs18 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs49, vs0, vs19 + xxswapd vs21, vs20 + xxswapd vs23, vs22 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs50, vs1, vs18 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs51, vs1, vs19 +.if \Complete==0 + lxv vs0, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs52, vs2, vs18 + xvmaddadp vs37, vs2, vs17 + xvmaddadp vs53, vs2, vs19 + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs54, vs3, vs18 + xvmaddadp vs39, vs3, vs17 + xvmaddadp vs55, vs3, vs19 +.if \Complete==0 + lxv vs2, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs40, vs4, vs16 + xvmaddadp vs56, vs4, vs18 + xvmaddadp vs41, vs4, vs17 + xvmaddadp vs57, vs4, vs19 + xvmaddadp vs42, vs5, vs16 + xvmaddadp vs58, vs5, vs18 + xvmaddadp vs43, vs5, vs17 + xvmaddadp vs59, vs5, vs19 +.if \Complete==0 + lxv vs4, DISP16(\Index,64+ \OffsetA)(\AREG) // load real,imag from A + lxv vs5, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs44, vs6, vs16 + xvmaddadp vs60, vs6, vs18 + xvmaddadp vs45, vs6, vs17 + xvmaddadp vs61, vs6, vs19 + xvmaddadp vs46, vs7, vs16 + xvmaddadp vs62, vs7, vs18 + xvmaddadp vs47, vs7, vs17 + xvmaddadp vs63, vs7, vs19 +.if \Complete==0 + lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B + lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs48, vs8, vs22 +.if \Complete==0 + lxv vs6, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs7, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs33, vs8, vs21 + xvmaddadp vs49, vs8, vs23 +.if \Complete==0 + xxswapd vs17, vs16 + xxswapd vs19, vs18 +.endif + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs50, vs9, vs22 + xvmaddadp vs35, vs9, vs21 + xvmaddadp vs51, vs9, vs23 +.if \Complete==0 + lxv vs8, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs36, vs10, vs20 + xvmaddadp vs52, vs10, vs22 + xvmaddadp vs37, vs10, vs21 + xvmaddadp vs53, vs10, vs23 + xvmaddadp vs38, vs11, vs20 + xvmaddadp vs54, vs11, vs22 + xvmaddadp vs39, vs11, vs21 + xvmaddadp vs55, vs11, vs23 +.if \Complete==0 + lxv vs10, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs40, vs12, vs20 + xvmaddadp vs56, vs12, vs22 + xvmaddadp vs41, vs12, vs21 + xvmaddadp vs57, vs12, vs23 + xvmaddadp vs42, vs13, vs20 + xvmaddadp vs58, vs13, vs22 + xvmaddadp vs43, vs13, vs21 + xvmaddadp vs59, vs13, 
vs23 +.if \Complete==0 + lxv vs12, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A + lxv vs13, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs44, vs14, vs20 + xvmaddadp vs60, vs14, vs22 + xvmaddadp vs45, vs14, vs21 + xvmaddadp vs61, vs14, vs23 + xvmaddadp vs46, vs15, vs20 + xvmaddadp vs62, vs15, vs22 + xvmaddadp vs47, vs15, vs21 + xvmaddadp vs63, vs15, vs23 +.if \Complete==0 + lxv vs14, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs15, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A + lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B + lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP16(\Index,\OffsetA) + addi \BREG, \BREG, DISP4(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP16(\Index,256) + addi \BREG, \BREG, DISP4(\Index,64) +.endif +.endif +.endm + + + + + +.macro KERNEL2x8 + LOAD2x8 + END2x8 AO, BO, 128,32 +.endm + + +.macro SAVE2x8 + add T1, CO ,LDC + SAVE8 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,CO,0 + SAVE8 vs48,vs49,vs50,vs51,vs52,vs53,vs54,vs55,vs56,vs57,vs58,vs59,vs60,vs61,vs62,vs63,T1,0 + addi CO, CO, 128 +.endm +/********************************************************************************************** +* + +.macros for N=2 and M=4 +**********************************************************************************************/ + + +.macro Zero2x4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 +.endm + + +.macro LOAD2x4 + LOAD2x4O 0,0 +.endm + + +.macro LOAD2x4O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END2x4_NORMAL + END2x4 AO,BO,64,32 +.endm + + +.macro END2x4_WITHOUT_ADD + END2x4 AO,BO,0,0 +.endm + + +.macro END2x4 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs40, vs0, vs18 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs41, vs0, vs19 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs42, vs1, vs18 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs43, vs1, vs19 + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs44, vs2, vs18 + xvmaddadp vs37, vs2, vs17 + xvmaddadp vs45, vs2, vs19 + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs46, vs3, vs18 + xvmaddadp vs39, vs3, vs17 + xvmaddadp vs47, vs3, vs19 + +.endm + + +.macro LOAD2x4_2 + LOAD2x4_2O 0,0 +.endm + + +.macro LOAD2x4_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + lxv vs20, (\OffsetB+32)(BO) // load real,imag from B + lxv vs22, (\OffsetB+48)(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, 
(16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + lxv vs8, (64+\OffsetA)(AO) // load real,imag from A + lxv vs9, (80+\OffsetA)(AO) // load real,imag from A + lxv vs10, (96+\OffsetA)(AO) // load real,imag from A + lxv vs11, (112+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END2x4_2 + /*for load2 offset will be 128 and 64*/ + KERNEL2x4_2 AO,BO, 128,64,0 ,1,1 +.endm + + + +.macro KERNEL2x4_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x4_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs40, vs0, vs18 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs41, vs0, vs19 + xxswapd vs21, vs20 + xxswapd vs23, vs22 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs42, vs1, vs18 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs43, vs1, vs19 +.if \Complete==0 + lxv vs0, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs44, vs2, vs18 + xvmaddadp vs37, vs2, vs17 + xvmaddadp vs45, vs2, vs19 + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs46, vs3, vs18 + xvmaddadp vs39, vs3, vs17 + xvmaddadp vs47, vs3, vs19 +.if \Complete==0 + lxv vs2, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A +.endif + +.if \Complete==0 + lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B + lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs40, vs8, vs22 + xvmaddadp vs33, vs8, vs21 + xvmaddadp vs41, vs8, vs23 +.if \Complete==0 + xxswapd vs17, vs16 + xxswapd vs19, vs18 +.endif + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs42, vs9, vs22 + xvmaddadp vs35, vs9, vs21 + xvmaddadp vs43, vs9, vs23 +.if \Complete==0 + lxv vs8, DISP8(\Index,64+0+ \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs36, vs10, vs20 + xvmaddadp vs44, vs10, vs22 + xvmaddadp vs37, vs10, vs21 + xvmaddadp vs45, vs10, vs23 + xvmaddadp vs38, vs11, vs20 + xvmaddadp vs46, vs11, vs22 + xvmaddadp vs39, vs11, vs21 + xvmaddadp vs47, vs11, vs23 +.if \Complete==0 + lxv vs10, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A +.endif + +.if \Complete==0 + lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B + lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP8(\Index,\OffsetA) + addi \BREG, \BREG, DISP4(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP8(\Index,128) + addi \BREG, \BREG, DISP4(\Index,64) +.endif +.endif +.endm + + + +.macro KERNEL2x4 + LOAD2x4 + END2x4 AO, BO, 64,32 +.endm + + + +.macro SAVE2x4 + add T1, CO ,LDC + SAVE4 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,CO,0 + SAVE4 vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,T1,0 + addi CO, CO, 64 +.endm +/********************************************************************************************** +* + +.macros for N=2 and M=2 +**********************************************************************************************/ + + +.macro Zero2x2 + xxlxor 
vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + +.endm + + +.macro LOAD2x2 + LOAD2x2O 0,0 +.endm + + +.macro LOAD2x2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + +.endm + + +.macro END2x2_NORMAL + END2x2 AO,BO,32,32 +.endm + + +.macro END2x2_WITHOUT_ADD + END2x2 AO,BO,0,0 +.endm + + +.macro END2x2 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs36, vs0, vs18 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs37, vs0, vs19 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs38, vs1, vs18 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs39, vs1, vs19 + +.endm + + +.macro LOAD2x2_2 + LOAD2x2_2O 0,0 +.endm + + +.macro LOAD2x2_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + lxv vs20, (\OffsetB+32)(BO) // load real,imag from B + lxv vs22, (\OffsetB+48)(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs8, (32+\OffsetA)(AO) // load real,imag from A + lxv vs9, (48+\OffsetA)(AO) // load real,imag from A + +.endm + + +.macro END2x2_2 + /*for load2 offset will be 64 and 64*/ + KERNEL2x2_2 AO,BO, 64,64,0 ,1,1 +.endm + + + +.macro KERNEL2x2_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x2_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs36, vs0, vs18 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs37, vs0, vs19 + xxswapd vs21, vs20 + xxswapd vs23, vs22 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs38, vs1, vs18 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs39, vs1, vs19 +.if \Complete==0 + lxv vs0, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.endif +.if \Complete==0 + lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B + lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs36, vs8, vs22 + xvmaddadp vs33, vs8, vs21 + xvmaddadp vs37, vs8, vs23 +.if \Complete==0 + xxswapd vs17, vs16 + xxswapd vs19, vs18 +.endif + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs38, vs9, vs22 + xvmaddadp vs35, vs9, vs21 + xvmaddadp vs39, vs9, vs23 +.if \Complete==0 + lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B + lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \Complete==0 + lxv vs8, DISP4(\Index,32+0+ \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP4(\Index,32+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP4(\Index,\OffsetA) + addi \BREG, \BREG, DISP4(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP4(\Index,64) + addi \BREG, \BREG, DISP4(\Index,64) +.endif +.endif +.endm + + + +.macro KERNEL2x2 + LOAD2x2 + 
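/*
 * What LOAD2x2 + END2x2 accumulate, sketched in C.  vs16/vs18 hold the
 * two B values as loaded and vs17/vs19 hold their xxswapd copies (the
 * two doublewords exchanged), so every A element feeds a pair of
 * accumulators: one for a*b lane by lane, one for a*swap(b).  The two
 * lanes of each accumulator are only combined (subtract for the real
 * part, add for the imaginary part in the plain case) by the SAVEn
 * macros defined earlier in this file, which also apply alpha.  Rough
 * C model of one pair (names are illustrative, not OpenBLAS
 * identifiers):
 *
 *   typedef struct { double x, y; } lanes2;        // one VSX register
 *
 *   static void cmadd_split(lanes2 *acc, lanes2 *acc_sw,
 *                           lanes2 a, lanes2 b)
 *   {
 *       acc->x    += a.x * b.x;     // xvmaddadp acc,    a, b
 *       acc->y    += a.y * b.y;
 *       acc_sw->x += a.x * b.y;     // xvmaddadp acc_sw, a, swap(b)
 *       acc_sw->y += a.y * b.x;
 *   }
 */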
END2x2 AO, BO, 32,32 +.endm + + + +.macro SAVE2x2 + add T1, CO ,LDC + SAVE2 vs32,vs33,vs34,vs35,CO,0 + SAVE2 vs36,vs37,vs38,vs39,T1,0 + addi CO, CO, 32 +.endm +/********************************************************************************************** +* + +.macros for N=2 and M=1 +**********************************************************************************************/ + + + +.macro Zero2x1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + +.endm + + +.macro LOAD2x1 + LOAD2x1O 0,0 +.endm + + +.macro LOAD2x1O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END2x1_NORMAL + END2x1 AO,BO,16,32 +.endm + + +.macro END2x1_WITHOUT_ADD + END2x1 AO,BO,0,0 +.endm + + +.macro END2x1 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs34, vs0, vs18 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs35, vs0, vs19 +.endm + + +.macro LOAD2x1_2 + LOAD2x1_2O 0,0 +.endm + + +.macro LOAD2x1_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + lxv vs20, (\OffsetB+32)(BO) // load real,imag from B + lxv vs22, (\OffsetB+48)(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs8, (16+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END2x1_2 + /*for load2 offset will be 32 and 64*/ + KERNEL2x1_2 AO,BO, 32,64,0 ,1,1 +.endm + + + +.macro KERNEL2x1_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xxswapd vs21, vs20 + xxswapd vs23, vs22 + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs34, vs0, vs18 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs35, vs0, vs19 +.if \Complete==0 + lxv vs0, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A +.endif +.if \Complete==0 + lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B + lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \Complete==0 + xxswapd vs17, vs16 + xxswapd vs19, vs18 +.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs34, vs8, vs22 + xvmaddadp vs33, vs8, vs21 + xvmaddadp vs35, vs8, vs23 +.if \Complete==0 + lxv vs8, DISP2(\Index,16+0+ \OffsetA)(\AREG) // load real,imag from A +.endif + +.if \Complete==0 + lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B + lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP2(\Index,\OffsetA) + addi \BREG, \BREG, DISP4(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP2(\Index,32) + addi \BREG, \BREG, DISP4(\Index,64) +.endif +.endif +.endm + + + +.macro KERNEL2x1 + LOAD2x1 + END2x1 AO, BO, 16,32 +.endm + + + +.macro SAVE2x1 + add T1, CO ,LDC + SAVE1 vs32,vs33,CO,0 + SAVE1 vs34,vs35,T1,0 + addi CO, CO, 16 +.endm + +/********************************************************************************************** +* + +.macros for N=1 and M=8 
+**********************************************************************************************/ + + +.macro Zero1x8 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 + xxlxor vs48, vs48, vs48 +.endm + + +.macro LOAD1x8 + LOAD1x8O 0,0 +.endm + + +.macro LOAD1x8O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + xxswapd vs17, vs16 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + lxv vs4, (64+\OffsetA)(AO) // load real,imag from A + lxv vs5, (80+\OffsetA)(AO) // load real,imag from A + lxv vs6, (96+\OffsetA)(AO) // load real,imag from A + lxv vs7, (112+\OffsetA)(AO) // load real,imag from A + +.endm + + +.macro END1x8_NORMAL + END1x8 AO,BO,128,16 +.endm + + +.macro END1x8_WITHOUT_ADD + END1x8 AO,BO,0,0 +.endm + + +.macro END1x8 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 + + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs37, vs2, vs17 + + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs39, vs3, vs17 + + xvmaddadp vs40, vs4, vs16 + xvmaddadp vs41, vs4, vs17 + + xvmaddadp vs42, vs5, vs16 + xvmaddadp vs43, vs5, vs17 + + xvmaddadp vs44, vs6, vs16 + xvmaddadp vs45, vs6, vs17 + + xvmaddadp vs46, vs7, vs16 + xvmaddadp vs47, vs7, vs17 + +.endm + + +.macro LOAD1x8_2 + LOAD1x8_2O 0,0 +.endm + + +.macro LOAD1x8_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs20, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + lxv vs4, (64+\OffsetA)(AO) // load real,imag from A + lxv vs5, (80+\OffsetA)(AO) // load real,imag from A + lxv vs6, (96+\OffsetA)(AO) // load real,imag from A + lxv vs7, (112+\OffsetA)(AO) // load real,imag from A + lxv vs8, (128+0+\OffsetA)(AO) // load real,imag from A + lxv vs9, (128+16+\OffsetA)(AO) // load real,imag from A + lxv vs10, (128+32+\OffsetA)(AO) // load real,imag from A + lxv vs11, (128+48+\OffsetA)(AO) // load real,imag from A + lxv vs12, (128+64+\OffsetA)(AO) // load real,imag from A + lxv vs13, (128+80+\OffsetA)(AO) // load real,imag from A + lxv vs14, (128+96+\OffsetA)(AO) // load real,imag from A + lxv vs15, (128+112+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END1x8_2 + /*for load2 offset will be 256 and 32*/ + KERNEL1x8_2 AO,BO, 256,32,0 ,1,1 +.endm + + + +.macro KERNEL1x8_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL1x8_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL1x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + xxswapd vs21, vs20 + xvmaddadp vs34, vs1, 
vs16 + xvmaddadp vs35, vs1, vs17 +.if \Complete==0 + lxv vs0, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs37, vs2, vs17 + + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs39, vs3, vs17 +.if \Complete==0 + lxv vs2, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs40, vs4, vs16 + xvmaddadp vs41, vs4, vs17 + + xvmaddadp vs42, vs5, vs16 + xvmaddadp vs43, vs5, vs17 +.if \Complete==0 + lxv vs4, DISP16(\Index,64+ \OffsetA)(\AREG) // load real,imag from A + lxv vs5, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs44, vs6, vs16 + xvmaddadp vs45, vs6, vs17 + + xvmaddadp vs46, vs7, vs16 + xvmaddadp vs47, vs7, vs17 +.if \Complete==0 + lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B +.endif +.if \Complete==0 + xxswapd vs17, vs16 +.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs33, vs8, vs21 +.if \Complete==0 + lxv vs6, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs7, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs35, vs9, vs21 +.if \Complete==0 + lxv vs8, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs36, vs10, vs20 + xvmaddadp vs37, vs10, vs21 + xvmaddadp vs38, vs11, vs20 + xvmaddadp vs39, vs11, vs21 +.if \Complete==0 + lxv vs10, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs40, vs12, vs20 + xvmaddadp vs41, vs12, vs21 + xvmaddadp vs42, vs13, vs20 + xvmaddadp vs43, vs13, vs21 +.if \Complete==0 + lxv vs12, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A + lxv vs13, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs44, vs14, vs20 + xvmaddadp vs45, vs14, vs21 + xvmaddadp vs46, vs15, vs20 + xvmaddadp vs47, vs15, vs21 +.if \Complete==0 + lxv vs14, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs15, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A + lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP16(\Index,\OffsetA) + addi \BREG, \BREG, DISP2(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP16(\Index,256) + addi \BREG, \BREG, DISP2(\Index,32) +.endif +.endif +.endm + + + + + +.macro KERNEL1x8 + LOAD1x8 + END1x8 AO, BO, 128,16 +.endm + + +.macro SAVE1x8 + SAVE8 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,CO,0 + addi CO, CO, 128 +.endm +/********************************************************************************************** +* + +.macros for N=2 and M=4 +**********************************************************************************************/ + + +.macro Zero1x4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 +.endm + + +.macro LOAD1x4 + LOAD1x4O 0,0 +.endm + + +.macro LOAD1x4O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + xxswapd vs17, vs16 + + lxv vs0, (0+\OffsetA)(AO) // load real,imag 
from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + +.endm + + +.macro END1x4_NORMAL + END1x4 AO,BO,64,16 +.endm + + +.macro END1x4_WITHOUT_ADD + END1x4 AO,BO,0,0 +.endm + + +.macro END1x4 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 + + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs37, vs2, vs17 + + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs39, vs3, vs17 + +.endm + + +.macro LOAD1x4_2 + LOAD1x4_2O 0,0 +.endm + + +.macro LOAD1x4_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs20, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + lxv vs8, (64+\OffsetA)(AO) // load real,imag from A + lxv vs9, (80+\OffsetA)(AO) // load real,imag from A + lxv vs10, (96+\OffsetA)(AO) // load real,imag from A + lxv vs11, (112+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END1x4_2 + /*for load2 offset will be 128 and 32*/ + KERNEL1x4_2 AO,BO, 128,32,0 ,1,1 +.endm + + + +.macro KERNEL1x4_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL1x4_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL1x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + xxswapd vs21, vs20 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 +.if \Complete==0 + lxv vs0, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs37, vs2, vs17 + + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs39, vs3, vs17 +.if \Complete==0 + lxv vs2, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A +.endif + +.if \Complete==0 + lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B +.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs33, vs8, vs21 +.if \Complete==0 + xxswapd vs17, vs16 +.endif + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs35, vs9, vs21 +.if \Complete==0 + lxv vs8, DISP8(\Index,64+0+ \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs36, vs10, vs20 + xvmaddadp vs37, vs10, vs21 + xvmaddadp vs38, vs11, vs20 + xvmaddadp vs39, vs11, vs21 +.if \Complete==0 + lxv vs10, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A +.endif + +.if \Complete==0 + lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP8(\Index,\OffsetA) + addi \BREG, \BREG, DISP2(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP8(\Index,128) + addi \BREG, \BREG, DISP2(\Index,32) +.endif +.endif +.endm + + + +.macro KERNEL1x4 + LOAD1x4 + END1x4 AO, BO, 64,16 +.endm + + + +.macro SAVE1x4 + SAVE4 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,CO,0 + addi CO, 
CO, 64 +.endm +/********************************************************************************************** +* + +.macros for N=2 and M=2 +**********************************************************************************************/ + + +.macro Zero1x2 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + +.endm + + +.macro LOAD1x2 + LOAD1x2O 0,0 +.endm + + +.macro LOAD1x2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + xxswapd vs17, vs16 + + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + +.endm + + +.macro END1x2_NORMAL + END1x2 AO,BO,32,16 +.endm + + +.macro END1x2_WITHOUT_ADD + END1x2 AO,BO,0,0 +.endm + + +.macro END1x2 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 + +.endm + + +.macro LOAD1x2_2 + LOAD1x2_2O 0,0 +.endm + + +.macro LOAD1x2_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs20, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs8, (32+\OffsetA)(AO) // load real,imag from A + lxv vs9, (48+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END1x2_2 + /*for load2 offset will be 64 and 32*/ + KERNEL1x2_2 AO,BO, 64,32,0 ,1,1 +.endm + + + +.macro KERNEL1x2_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL1x2_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL1x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + xxswapd vs21, vs20 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 +.if \Complete==0 + lxv vs0, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.endif +.if \Complete==0 + lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B +.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs33, vs8, vs21 +.if \Complete==0 + xxswapd vs17, vs16 +.endif + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs35, vs9, vs21 +.if \Complete==0 + lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \Complete==0 + lxv vs8, DISP4(\Index,32+0+ \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP4(\Index,32+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP4(\Index,\OffsetA) + addi \BREG, \BREG, DISP2(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP4(\Index,64) + addi \BREG, \BREG, DISP2(\Index,32) +.endif +.endif +.endm + + + +.macro KERNEL1x2 + LOAD1x2 + END1x2 AO, BO, 32,16 +.endm + + + +.macro SAVE1x2 + SAVE2 vs32,vs33,vs34,vs35,CO,0 + addi CO, CO, 32 +.endm +/********************************************************************************************** +* + +.macros for N=2 and M=1 +**********************************************************************************************/ + + + +.macro Zero1x1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 +.endm + + +.macro LOAD1x1 + LOAD1x1O 0,0 +.endm + + +.macro LOAD1x1O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs0, 
(0+\OffsetA)(AO) // load real,imag from A + xxswapd vs17, vs16 + +.endm + + +.macro END1x1_NORMAL + END1x1 AO,BO,16,16 +.endm + + +.macro END1x1_WITHOUT_ADD + END1x1 AO,BO,0,0 +.endm + + +.macro END1x1 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 +.endm + + +.macro LOAD1x1_2 + LOAD1x1_2O 0,0 +.endm + + +.macro LOAD1x1_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs20, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs8, (16+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END1x1_2 + /*for load2 offset will be 32 and 32*/ + KERNEL1x1_2 AO,BO, 32,32,0 ,1,1 +.endm + + + +.macro KERNEL1x1_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL1x1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xxswapd vs21, vs20 + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 +.if \Complete==0 + lxv vs0, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A +.endif +.if \Complete==0 + lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B +.endif +.if \Complete==0 + xxswapd vs17, vs16 +.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs33, vs8, vs21 +.if \Complete==0 + lxv vs8, DISP2(\Index,16+0+ \OffsetA)(\AREG) // load real,imag from A +.endif + +.if \Complete==0 + lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP2(\Index,\OffsetA) + addi \BREG, \BREG, DISP2(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP2(\Index,32) + addi \BREG, \BREG, DISP2(\Index,32) +.endif +.endif +.endm + + + +.macro KERNEL1x1 + LOAD1x1 + END1x1 AO, BO, 16,16 +.endm + + + +.macro SAVE1x1 + SAVE1 vs32,vs33,CO,0 + addi CO, CO, 16 +.endm + +/****************************TRMM POINTER REFRESH + +.macroSES*************************/ + + +.macro SHIFT_REG REG1,REG2,SHIFT_VAL + .if \SHIFT_VAL==16 + slwi \REG1, \REG2, 8 + .elseif \SHIFT_VAL==8 + slwi \REG1, \REG2, 7 + .elseif \SHIFT_VAL==4 + slwi \REG1, \REG2, 6 + .elseif \SHIFT_VAL==2 + slwi \REG1, \REG2, 5 + .elseif \SHIFT_VAL==1 + slwi \REG1, \REG2, 4 + .endif +.endm +/* +//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// ptrbb = bb; +// #else +// ptrba += off*16; +// ptrbb = bb + off*2; +// #endif +*/ + + +.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B + #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /* ptrbb = bb;*/ + mr \PTR_B,\B_VAL /* refresh BPOINT */ + #else + /* + // ptrba =ptrba+ off*C_A; + // ptrbb = bb + off*C_B; + */ + SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ + SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ + add \PTR_B, \B_VAL , T4 /* Add values to BO */ + add \PTR_A, \PTR_A, T2 /* Add values to AO */ + #endif +.endm + +/* +// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +// temp = bk-off; +// #elif defined(LEFT) +// temp = off+16; // number of values in A +// #else +// temp = off+2; // number of values in B +// #endif +*/ + + +.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B + #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 
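/*
 * Byte arithmetic behind SHIFT_REG and the REFRESH_* macros above and
 * below: one double-complex element is 16 bytes, so an element count
 * of n per K step turns the offset "off" into off * n * 16 bytes, which
 * SHIFT_REG does with a single slwi (16 -> <<8, 8 -> <<7, 4 -> <<6,
 * 2 -> <<5, 1 -> <<4).  Rough C rendering (helper name is illustrative
 * only):
 *
 *   static long shift_bytes(long off, int n)      // n = 16, 8, 4, 2, 1
 *   {
 *       return off * n * 16;                      // the slwi above
 *   }
 *
 *   // REFRESH_POINTERS for an m x n tile (C_A = m, C_B = n):
 *   // #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
 *   //     ptrbb = bb;
 *   // #else
 *   //     ptrba += shift_bytes(off, C_A);
 *   //     ptrbb  = bb + shift_bytes(off, C_B);
 *   // #endif
 */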
+ /* temp = bk-off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + #elif defined(LEFT) + /* temp = off+INCR_A; // number of values in A */ + addi \TEMP_BK, \OFF_VAL, \INCR_A + #else + /* temp = off+INCR_B // number of values in B*/ + addi \TEMP_BK,\OFF_VAL, \INCR_B + #endif +.endm +/* +// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// temp = bk - off; +// #ifdef LEFT +// temp -= 16; // number of values in A +// #else +// temp -= 2; // number of values in B +// #endif +// ptrba += temp*16; +// ptrbb += temp*2; +// #endif +// #ifdef LEFT +// off += 16; // number of values in A +// #endif +*/ + + + +.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B + #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /*temp = bk - off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + #ifdef LEFT + /*temp -= 8; // number of values in A*/ + addi \TEMP_BK,\TEMP_BK,-\C_A + #else + /*temp -= 4; // number of values in B*/ + addi \TEMP_BK,\TEMP_BK,-\C_B + #endif + /*ptrba += temp*C_A; + ptrbb += temp*C_B;*/ + SHIFT_REG T4,\TEMP_BK,\C_A + SHIFT_REG T2,\TEMP_BK,\C_B + add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ + add \PTR_B, \PTR_B,T2 + #endif + #ifdef LEFT + /*off += 8; // number of values in A*/ + addi \OFF_VAL,\OFF_VAL,\C_A + #endif .endm \ No newline at end of file diff --git a/kernel/x86_64/sgemm_kernel_16x4_haswell.S b/kernel/x86_64/sgemm_kernel_16x4_haswell.S index ef156fd27..76ea12fee 100644 --- a/kernel/x86_64/sgemm_kernel_16x4_haswell.S +++ b/kernel/x86_64/sgemm_kernel_16x4_haswell.S @@ -1,6806 +1,6806 @@ -/********************************************************************************* -Copyright (c) 2013, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-**********************************************************************************/ - -/********************************************************************* -* 2014/07/28 Saar -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* -* 2013/10/28 Saar -* Parameter: -* SGEMM_DEFAULT_UNROLL_N 4 -* SGEMM_DEFAULT_UNROLL_M 16 -* SGEMM_DEFAULT_P 768 -* SGEMM_DEFAULT_Q 384 -* A_PR1 512 -* B_PR1 512 -* -* -* 2014/07/28 Saar -* Performance at 9216x9216x9216: -* 1 thread: 102 GFLOPS (SANDYBRIDGE: 59) (MKL: 83) -* 2 threads: 195 GFLOPS (SANDYBRIDGE: 116) (MKL: 155) -* 3 threads: 281 GFLOPS (SANDYBRIDGE: 165) (MKL: 230) -* 4 threads: 366 GFLOPS (SANDYBRIDGE: 223) (MKL: 267) -* -*********************************************************************/ - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define BI %rbp -#define BO2 %rbp -#define SP %rbx - -#define BO1 %rdi -#define CO2 %rdx - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 - -#else - -#define STACKSIZE 256 - -#define OLD_A 40 + STACKSIZE(%rsp) -#define OLD_B 48 + STACKSIZE(%rsp) -#define OLD_C 56 + STACKSIZE(%rsp) -#define OLD_LDC 64 + STACKSIZE(%rsp) -#define OLD_OFFSET 72 + STACKSIZE(%rsp) - -#endif - -#if defined(OS_WINDOWS) -#define L_BUFFER_SIZE 8192 -#else -#define L_BUFFER_SIZE 12288 -#endif - -#define Ndiv6 24(%rsp) -#define Nmod6 32(%rsp) -#define N 40(%rsp) -#define ALPHA 48(%rsp) -#define OFFSET 56(%rsp) -#define KK 64(%rsp) -#define KKK 72(%rsp) -#define BUFFER1 128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $0, 4096 * 4(%rsp);\ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - -#if defined(BULLDOZER) - -#define VFMADD231PS_( y0,y1,y2 ) vfmaddps y0,y1,y2,y0 - -#define VFMADD231SS_( x0,x1,x2 ) vfmaddss x0,x1,x2,x0 - -#else - -#define VFMADD231PS_( y0,y1,y2 ) vfmadd231ps y1,y2,y0 - -#define VFMADD231SS_( x0,x1,x2 ) vfmadd231ss x1,x2,x0 - -#endif - - -#define A_PR1 512 -#define B_PR1 512 - -/******************************************************************************************* -* 6 lines of N -*******************************************************************************************/ - -.macro KERNEL16x6_SUB - vmovups -16 * SIZE(AO), %ymm0 - vmovups -8 * SIZE(AO), %ymm1 - vbroadcastss -4 * SIZE(BO), %ymm2 - vbroadcastss -3 * SIZE(BO), %ymm3 - prefetcht0 A_PR1(AO) - - VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) - VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) - VFMADD231PS_( %ymm7,%ymm3,%ymm1 ) - - vbroadcastss -2 * SIZE(BO), %ymm2 - vbroadcastss -1 * SIZE(BO), %ymm3 - VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm9,%ymm2,%ymm1 ) - VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) - VFMADD231PS_( %ymm11,%ymm3,%ymm1 ) - - vbroadcastss 0 * SIZE(BO), %ymm2 - vbroadcastss 1 * SIZE(BO), %ymm3 - VFMADD231PS_( %ymm12,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm13,%ymm2,%ymm1 ) - VFMADD231PS_( %ymm14,%ymm3,%ymm0 ) - VFMADD231PS_( %ymm15,%ymm3,%ymm1 ) - 
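/*
 * VFMADD231PS_ above hides the FMA4 vs FMA3 operand order (Bulldozer
 * vfmaddps vs vfmadd231ps); both forms compute acc += a_vec * b_bcast.
 * One KERNEL16x6_SUB is therefore a single rank-1 update of a 16 x 6
 * tile of C kept in %ymm4-%ymm15, two 8-wide FMAs per column.  Rough C
 * equivalent of one k step (function and array names are illustrative):
 *
 *   static void kernel16x6_step(float acc[6][16],
 *                               const float *a,     // 16 floats of A
 *                               const float *b)     // 6 floats of B
 *   {
 *       for (int j = 0; j < 6; j++)
 *           for (int i = 0; i < 16; i++)
 *               acc[j][i] += a[i] * b[j];
 *   }
 */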
- addq $ 6*SIZE, BO - addq $ 16*SIZE, AO - decq %rax -.endm - -.macro SAVE16x6 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm5 , %ymm5 - vmulps %ymm0 , %ymm6 , %ymm6 - vmulps %ymm0 , %ymm7 , %ymm7 - vmulps %ymm0 , %ymm8 , %ymm8 - vmulps %ymm0 , %ymm9 , %ymm9 - vmulps %ymm0 , %ymm10, %ymm10 - vmulps %ymm0 , %ymm11, %ymm11 - vmulps %ymm0 , %ymm12, %ymm12 - vmulps %ymm0 , %ymm13, %ymm13 - vmulps %ymm0 , %ymm14, %ymm14 - vmulps %ymm0 , %ymm15, %ymm15 - - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps 8 * SIZE(CO1), %ymm5,%ymm5 - - vaddps (CO1, LDC), %ymm6,%ymm6 - vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7 - - vaddps (CO1, LDC,2), %ymm8,%ymm8 - vaddps 8 * SIZE(CO1, LDC,2), %ymm9,%ymm9 - - vaddps (CO2), %ymm10,%ymm10 - vaddps 8 * SIZE(CO2), %ymm11,%ymm11 - - vaddps (CO2, LDC), %ymm12,%ymm12 - vaddps 8 * SIZE(CO2, LDC), %ymm13,%ymm13 - - vaddps (CO2, LDC,2), %ymm14,%ymm14 - vaddps 8 * SIZE(CO2, LDC,2), %ymm15,%ymm15 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , 8 * SIZE(CO1) - - vmovups %ymm6 , (CO1, LDC) - vmovups %ymm7 , 8 * SIZE(CO1, LDC) - - vmovups %ymm8 , (CO1, LDC,2) - vmovups %ymm9 , 8 * SIZE(CO1, LDC,2) - - vmovups %ymm10, (CO2) - vmovups %ymm11, 8 * SIZE(CO2) - - vmovups %ymm12, (CO2, LDC) - vmovups %ymm13, 8 * SIZE(CO2, LDC) - - vmovups %ymm14, (CO2, LDC,2) - vmovups %ymm15, 8 * SIZE(CO2, LDC,2) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL8x6_SUB - vmovups -16 * SIZE(AO), %ymm0 - vbroadcastss -4 * SIZE(BO), %ymm2 - vbroadcastss -3 * SIZE(BO), %ymm3 - - VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) - - vbroadcastss -2 * SIZE(BO), %ymm2 - vbroadcastss -1 * SIZE(BO), %ymm3 - VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) - - vbroadcastss 0 * SIZE(BO), %ymm2 - vbroadcastss 1 * SIZE(BO), %ymm3 - VFMADD231PS_( %ymm12,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm14,%ymm3,%ymm0 ) - - addq $ 6*SIZE, BO - addq $ 8*SIZE, AO - decq %rax -.endm - -.macro SAVE8x6 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm6 , %ymm6 - vmulps %ymm0 , %ymm8 , %ymm8 - vmulps %ymm0 , %ymm10, %ymm10 - vmulps %ymm0 , %ymm12, %ymm12 - vmulps %ymm0 , %ymm14, %ymm14 - - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps (CO1, LDC), %ymm6,%ymm6 - vaddps (CO1, LDC,2), %ymm8,%ymm8 - vaddps (CO2), %ymm10,%ymm10 - vaddps (CO2, LDC), %ymm12,%ymm12 - vaddps (CO2, LDC,2), %ymm14,%ymm14 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm6 , (CO1, LDC) - vmovups %ymm8 , (CO1, LDC,2) - vmovups %ymm10, (CO2) - vmovups %ymm12, (CO2, LDC) - vmovups %ymm14, (CO2, LDC,2) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL4x6_SUB - vmovups -16 * SIZE(AO), %xmm0 - vbroadcastss -4 * SIZE(BO), %xmm2 - vbroadcastss -3 * SIZE(BO), %xmm3 - - VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) - VFMADD231PS_( %xmm6,%xmm3,%xmm0 ) - - vbroadcastss -2 * SIZE(BO), %xmm2 - vbroadcastss -1 * SIZE(BO), %xmm3 - VFMADD231PS_( %xmm8,%xmm2,%xmm0 ) - VFMADD231PS_( %xmm10,%xmm3,%xmm0 ) - - vbroadcastss 0 * SIZE(BO), %xmm2 - vbroadcastss 1 * SIZE(BO), %xmm3 - VFMADD231PS_( %xmm12,%xmm2,%xmm0 ) - VFMADD231PS_( %xmm14,%xmm3,%xmm0 ) - - addq $ 6*SIZE, BO - addq $ 4*SIZE, AO - decq %rax -.endm - -.macro SAVE4x6 - - vbroadcastss ALPHA, %xmm0 - - vmulps %xmm0 , %xmm4 , %xmm4 - vmulps %xmm0 , %xmm6 , %xmm6 - vmulps %xmm0 , %xmm8 , %xmm8 - vmulps %xmm0 , %xmm10, %xmm10 - vmulps %xmm0 , 
%xmm12, %xmm12 - vmulps %xmm0 , %xmm14, %xmm14 - - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %xmm4,%xmm4 - vaddps (CO1, LDC), %xmm6,%xmm6 - vaddps (CO1, LDC,2), %xmm8,%xmm8 - vaddps (CO2), %xmm10,%xmm10 - vaddps (CO2, LDC), %xmm12,%xmm12 - vaddps (CO2, LDC,2), %xmm14,%xmm14 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm6 , (CO1, LDC) - vmovups %xmm8 , (CO1, LDC,2) - vmovups %xmm10, (CO2) - vmovups %xmm12, (CO2, LDC) - vmovups %xmm14, (CO2, LDC,2) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL2x6_SUB - vmovss -16 * SIZE(AO), %xmm0 - vmovss -15 * SIZE(AO), %xmm1 - vmovss -4 * SIZE(BO), %xmm2 - vmovss -3 * SIZE(BO), %xmm3 - - VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) - VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) - VFMADD231SS_( %xmm7,%xmm3,%xmm1 ) - - vmovss -2 * SIZE(BO), %xmm2 - vmovss -1 * SIZE(BO), %xmm3 - VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm9,%xmm2,%xmm1 ) - VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) - VFMADD231SS_( %xmm11,%xmm3,%xmm1 ) - - vmovss 0 * SIZE(BO), %xmm2 - vmovss 1 * SIZE(BO), %xmm3 - VFMADD231SS_( %xmm12,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm13,%xmm2,%xmm1 ) - VFMADD231SS_( %xmm14,%xmm3,%xmm0 ) - VFMADD231SS_( %xmm15,%xmm3,%xmm1 ) - - addq $ 6*SIZE, BO - addq $ 2*SIZE, AO - decq %rax -.endm - -.macro SAVE2x6 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm5 , %xmm5 - vmulss %xmm0 , %xmm6 , %xmm6 - vmulss %xmm0 , %xmm7 , %xmm7 - vmulss %xmm0 , %xmm8 , %xmm8 - vmulss %xmm0 , %xmm9 , %xmm9 - vmulss %xmm0 , %xmm10, %xmm10 - vmulss %xmm0 , %xmm11, %xmm11 - vmulss %xmm0 , %xmm12, %xmm12 - vmulss %xmm0 , %xmm13, %xmm13 - vmulss %xmm0 , %xmm14, %xmm14 - vmulss %xmm0 , %xmm15, %xmm15 - - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - vaddss 1 * SIZE(CO1), %xmm5,%xmm5 - - vaddss (CO1, LDC), %xmm6,%xmm6 - vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 - - vaddss (CO1, LDC,2), %xmm8,%xmm8 - vaddss 1 * SIZE(CO1, LDC,2), %xmm9,%xmm9 - - vaddss (CO2), %xmm10,%xmm10 - vaddss 1 * SIZE(CO2), %xmm11,%xmm11 - - vaddss (CO2, LDC), %xmm12,%xmm12 - vaddss 1 * SIZE(CO2, LDC), %xmm13,%xmm13 - - vaddss (CO2, LDC,2), %xmm14,%xmm14 - vaddss 1 * SIZE(CO2, LDC,2), %xmm15,%xmm15 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , 1 * SIZE(CO1) - - vmovss %xmm6 , (CO1, LDC) - vmovss %xmm7 , 1 * SIZE(CO1, LDC) - - vmovss %xmm8 , (CO1, LDC,2) - vmovss %xmm9 , 1 * SIZE(CO1, LDC,2) - - vmovss %xmm10, (CO2) - vmovss %xmm11, 1 * SIZE(CO2) - - vmovss %xmm12, (CO2, LDC) - vmovss %xmm13, 1 * SIZE(CO2, LDC) - - vmovss %xmm14, (CO2, LDC,2) - vmovss %xmm15, 1 * SIZE(CO2, LDC,2) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL1x6_SUB - vmovss -16 * SIZE(AO), %xmm0 - vmovss -4 * SIZE(BO), %xmm2 - vmovss -3 * SIZE(BO), %xmm3 - - VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) - - vmovss -2 * SIZE(BO), %xmm2 - vmovss -1 * SIZE(BO), %xmm3 - VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) - - vmovss 0 * SIZE(BO), %xmm2 - vmovss 1 * SIZE(BO), %xmm3 - VFMADD231SS_( %xmm12,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm14,%xmm3,%xmm0 ) - - addq $ 6*SIZE, BO - addq $ 1*SIZE, AO - decq %rax -.endm - -.macro SAVE1x6 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm6 , %xmm6 - vmulss %xmm0 , %xmm8 , %xmm8 - vmulss %xmm0 , %xmm10, %xmm10 - vmulss %xmm0 , %xmm12, %xmm12 - vmulss %xmm0 , %xmm14, %xmm14 - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 
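/*
 * Every SAVEmxn macro in this file follows the pattern being executed
 * here: scale the accumulators by ALPHA, add the existing C tile when
 * TRMMKERNEL is not defined (plain GEMM), then store; a TRMM build
 * skips the add and overwrites C.  Per element, in C (sketch only):
 *
 *   float out = alpha * acc;
 *   #if !defined(TRMMKERNEL)
 *   out += C[i + j * ldc];       // the vaddss/vaddps block
 *   #endif
 *   C[i + j * ldc] = out;        // the vmovss/vmovups block
 */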
- vaddss (CO1, LDC), %xmm6,%xmm6 - vaddss (CO1, LDC,2), %xmm8,%xmm8 - vaddss (CO2), %xmm10,%xmm10 - vaddss (CO2, LDC), %xmm12,%xmm12 - vaddss (CO2, LDC,2), %xmm14,%xmm14 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm6 , (CO1, LDC) - vmovss %xmm8 , (CO1, LDC,2) - vmovss %xmm10, (CO2) - vmovss %xmm12, (CO2, LDC) - vmovss %xmm14, (CO2, LDC,2) - -.endm - - -/*******************************************************************************************/ - - -/******************************************************************************************* -* 4 lines of N -*******************************************************************************************/ - -.macro KERNEL16x4_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) - VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) - VFMADD231PS_( %ymm7,%ymm3,%ymm1 ) - vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm9,%ymm2,%ymm1 ) - VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) - VFMADD231PS_( %ymm11,%ymm3,%ymm1 ) - addq $ 4 , BI - addq $ 16, %rax -.endm - -.macro SAVE16x4 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm5 , %ymm5 - vmulps %ymm0 , %ymm6 , %ymm6 - vmulps %ymm0 , %ymm7 , %ymm7 - vmulps %ymm0 , %ymm8 , %ymm8 - vmulps %ymm0 , %ymm9 , %ymm9 - vmulps %ymm0 , %ymm10, %ymm10 - vmulps %ymm0 , %ymm11, %ymm11 - - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps 8 * SIZE(CO1), %ymm5,%ymm5 - - vaddps (CO1, LDC), %ymm6,%ymm6 - vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7 - - vaddps (CO2), %ymm8,%ymm8 - vaddps 8 * SIZE(CO2), %ymm9,%ymm9 - - vaddps (CO2, LDC), %ymm10,%ymm10 - vaddps 8 * SIZE(CO2, LDC), %ymm11,%ymm11 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , 8 * SIZE(CO1) - - vmovups %ymm6 , (CO1, LDC) - vmovups %ymm7 , 8 * SIZE(CO1, LDC) - - vmovups %ymm8 , (CO2) - vmovups %ymm9 , 8 * SIZE(CO2) - - vmovups %ymm10, (CO2, LDC) - vmovups %ymm11, 8 * SIZE(CO2, LDC) - - prefetcht0 64(CO1) - prefetcht0 64(CO1, LDC) - prefetcht0 64(CO2) - prefetcht0 64(CO2, LDC) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL8x4_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) - vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) - addq $ 4 , BI - addq $ 8 , %rax -.endm - -.macro SAVE8x4 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm6 , %ymm6 - vmulps %ymm0 , %ymm8 , %ymm8 - vmulps %ymm0 , %ymm10, %ymm10 - - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps (CO1, LDC), %ymm6,%ymm6 - vaddps (CO2), %ymm8,%ymm8 - vaddps (CO2, LDC), %ymm10,%ymm10 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm6 , (CO1, LDC) - vmovups %ymm8 , (CO2) - vmovups %ymm10, (CO2, LDC) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL4x4_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231PS_( 
%xmm4,%xmm2,%xmm0 ) - VFMADD231PS_( %xmm6,%xmm3,%xmm0 ) - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231PS_( %xmm8,%xmm2,%xmm0 ) - VFMADD231PS_( %xmm10,%xmm3,%xmm0 ) - addq $ 4 , BI - addq $ 4 , %rax -.endm - -.macro SAVE4x4 - - vbroadcastss ALPHA, %xmm0 - - vmulps %xmm0 , %xmm4 , %xmm4 - vmulps %xmm0 , %xmm6 , %xmm6 - vmulps %xmm0 , %xmm8 , %xmm8 - vmulps %xmm0 , %xmm10, %xmm10 - - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %xmm4,%xmm4 - vaddps (CO1, LDC), %xmm6,%xmm6 - vaddps (CO2), %xmm8,%xmm8 - vaddps (CO2, LDC), %xmm10,%xmm10 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm6 , (CO1, LDC) - vmovups %xmm8 , (CO2) - vmovups %xmm10, (CO2, LDC) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL2x4_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) - VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) - VFMADD231SS_( %xmm7,%xmm3,%xmm1 ) - vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm9,%xmm2,%xmm1 ) - VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) - VFMADD231SS_( %xmm11,%xmm3,%xmm1 ) - addq $ 4 , BI - addq $ 2, %rax -.endm - -.macro SAVE2x4 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm5 , %xmm5 - vmulss %xmm0 , %xmm6 , %xmm6 - vmulss %xmm0 , %xmm7 , %xmm7 - vmulss %xmm0 , %xmm8 , %xmm8 - vmulss %xmm0 , %xmm9 , %xmm9 - vmulss %xmm0 , %xmm10, %xmm10 - vmulss %xmm0 , %xmm11, %xmm11 - - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - vaddss 1 * SIZE(CO1), %xmm5,%xmm5 - - vaddss (CO1, LDC), %xmm6,%xmm6 - vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 - - vaddss (CO2), %xmm8,%xmm8 - vaddss 1 * SIZE(CO2), %xmm9,%xmm9 - - vaddss (CO2, LDC), %xmm10,%xmm10 - vaddss 1 * SIZE(CO2, LDC), %xmm11,%xmm11 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , 1 * SIZE(CO1) - - vmovss %xmm6 , (CO1, LDC) - vmovss %xmm7 , 1 * SIZE(CO1, LDC) - - vmovss %xmm8 , (CO2) - vmovss %xmm9 , 1 * SIZE(CO2) - - vmovss %xmm10, (CO2, LDC) - vmovss %xmm11, 1 * SIZE(CO2, LDC) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL1x4_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) - vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) - addq $ 4 , BI - addq $ 1, %rax -.endm - -.macro SAVE1x4 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm6 , %xmm6 - vmulss %xmm0 , %xmm8 , %xmm8 - vmulss %xmm0 , %xmm10, %xmm10 - - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - vaddss (CO1, LDC), %xmm6,%xmm6 - vaddss (CO2), %xmm8,%xmm8 - vaddss (CO2, LDC), %xmm10,%xmm10 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm6 , (CO1, LDC) - vmovss %xmm8 , (CO2) - vmovss %xmm10, (CO2, LDC) - -.endm - - -/*******************************************************************************************/ - -/******************************************************************************************* -* 2 lines of N 
-*******************************************************************************************/ - -.macro KERNEL16x2_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) - VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) - VFMADD231PS_( %ymm7,%ymm3,%ymm1 ) - addq $ 2 , BI - addq $ 16, %rax -.endm - -.macro SAVE16x2 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm5 , %ymm5 - vmulps %ymm0 , %ymm6 , %ymm6 - vmulps %ymm0 , %ymm7 , %ymm7 - - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps 8 * SIZE(CO1), %ymm5,%ymm5 - - vaddps (CO1, LDC), %ymm6,%ymm6 - vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , 8 * SIZE(CO1) - - vmovups %ymm6 , (CO1, LDC) - vmovups %ymm7 , 8 * SIZE(CO1, LDC) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL8x2_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) - addq $ 2 , BI - addq $ 8 , %rax -.endm - -.macro SAVE8x2 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm6 , %ymm6 - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps (CO1, LDC), %ymm6,%ymm6 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm6 , (CO1, LDC) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL4x2_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) - VFMADD231PS_( %xmm6,%xmm3,%xmm0 ) - addq $ 2 , BI - addq $ 4 , %rax -.endm - -.macro SAVE4x2 - - vbroadcastss ALPHA, %xmm0 - - vmulps %xmm0 , %xmm4 , %xmm4 - vmulps %xmm0 , %xmm6 , %xmm6 - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %xmm4,%xmm4 - vaddps (CO1, LDC), %xmm6,%xmm6 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm6 , (CO1, LDC) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL2x2_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) - VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) - VFMADD231SS_( %xmm7,%xmm3,%xmm1 ) - addq $ 2 , BI - addq $ 2, %rax -.endm - -.macro SAVE2x2 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm5 , %xmm5 - vmulss %xmm0 , %xmm6 , %xmm6 - vmulss %xmm0 , %xmm7 , %xmm7 - - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - vaddss 1 * SIZE(CO1), %xmm5,%xmm5 - - vaddss (CO1, LDC), %xmm6,%xmm6 - vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , 1 * SIZE(CO1) - - vmovss %xmm6 , (CO1, LDC) - vmovss %xmm7 , 1 * SIZE(CO1, LDC) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL1x2_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) - 
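/*
 * Unlike the 6-column kernels above, which post-increment AO and BO,
 * these 4/2/1-column kernels keep AO and BO fixed inside the k loop
 * and walk two indices instead: %rax over A (-16*SIZE(AO,%rax,SIZE))
 * and BI over B (-4*SIZE(BO,BI,SIZE)), bumped by the tile's M and N
 * width each step.  Rough C model of this KERNEL1x2_SUB bookkeeping
 * (variable names are illustrative):
 *
 *   long ai = 0, bi = 0;                   // %rax and BI
 *   for (long k = 0; k < kk; k++) {
 *       float a0 = AO[ai - 16];            // -16*SIZE(AO,%rax,SIZE)
 *       acc4 += a0 * BO[bi - 4];           //  -4*SIZE(BO,BI,SIZE)
 *       acc6 += a0 * BO[bi - 3];
 *       ai += 1;                           // addq $1, %rax
 *       bi += 2;                           // addq $2, BI
 *   }
 */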
addq $ 2 , BI - addq $ 1, %rax -.endm - -.macro SAVE1x2 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm6 , %xmm6 - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - vaddss (CO1, LDC), %xmm6,%xmm6 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm6 , (CO1, LDC) - -.endm - - -/*******************************************************************************************/ - -/******************************************************************************************* -* 1 line of N -*******************************************************************************************/ - -.macro KERNEL16x1_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) - addq $ 1 , BI - addq $ 16, %rax -.endm - -.macro SAVE16x1 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm5 , %ymm5 - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps 8 * SIZE(CO1), %ymm5,%ymm5 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , 8 * SIZE(CO1) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL8x1_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) - addq $ 1 , BI - addq $ 8 , %rax -.endm - -.macro SAVE8x1 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - -#endif - - vmovups %ymm4 , (CO1) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL4x1_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) - addq $ 1 , BI - addq $ 4 , %rax -.endm - -.macro SAVE4x1 - - vbroadcastss ALPHA, %xmm0 - - vmulps %xmm0 , %xmm4 , %xmm4 - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %xmm4,%xmm4 - -#endif - - vmovups %xmm4 , (CO1) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL2x1_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) - addq $ 1 , BI - addq $ 2 , %rax -.endm - -.macro SAVE2x1 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm5 , %xmm5 - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - vaddss 1 * SIZE(CO1), %xmm5,%xmm5 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , 1 * SIZE(CO1) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL1x1_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) - addq $ 1 , BI - addq $ 1 , %rax -.endm - -.macro SAVE1x1 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - -#endif - - vmovss %xmm4 , (CO1) - -.endm - - -/*******************************************************************************************/ - -#if !defined(TRMMKERNEL) - -/************************************************************************************* -* GEMM Kernel -*************************************************************************************/ - - - PROLOGUE - 
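/*
 * The prologue below saves the callee-saved registers, picks up LDC
 * (and, for TRMM, the offset) from the stack per the ABI in use, then
 * carves 128 + L_BUFFER_SIZE bytes off the stack for the packed B
 * panel (BUFFER1) and aligns %rsp to 4 KB.  On Windows, STACK_TOUCH
 * (defined near the top of this file) writes a zero into each 4 KB
 * page of that fresh region, starting with the page nearest the
 * already-committed stack, so the OS guard page is extended one page
 * at a time.  Roughly, in C (names are illustrative only):
 *
 *   volatile char *sp = new_stack_top;             // after sub/and of %rsp
 *   for (int page = PAGES; page >= 1; page--)      // nearest committed first
 *       *(volatile int *)(sp + page * 4096) = 0;   // movl $0, 4096*page(%rsp)
 *   // PAGES depends on L_BUFFER_SIZE (up to 4 pages here)
 */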
PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - movups %xmm6, 64(%rsp) - movups %xmm7, 80(%rsp) - movups %xmm8, 96(%rsp) - movups %xmm9, 112(%rsp) - movups %xmm10, 128(%rsp) - movups %xmm11, 144(%rsp) - movups %xmm12, 160(%rsp) - movups %xmm13, 176(%rsp) - movups %xmm14, 192(%rsp) - movups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - vmovsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - movsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovss %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $12, %rdi - divq %rdi // N / 12 - movq %rax, Ndiv6 // N / 12 - movq %rdx, Nmod6 // N % 12 - - movq Ndiv6, J - cmpq $0, J - je .L4_00 - ALIGN_4 - - -/*******************************************************************************************/ - -.L6_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - salq $2, %rax // 4 values of B - leaq (B, %rax,4), BO2 - movq BO2, B // next offset of B - movq K, %rax - - ALIGN_4 - - -.L6_02c: - - vmovups (BO1), %xmm0 - vmovsd (BO2), %xmm1 - vmovups %xmm0, (BO) - vmovsd %xmm1, 4*SIZE(BO) - addq $ 4*SIZE,BO1 - addq $ 4*SIZE,BO2 - addq $ 6*SIZE,BO - decq %rax - jnz .L6_02c - - -.L6_10: - movq C, CO1 - leaq (C, LDC, 2), CO2 - leaq (CO2, LDC, 1), CO2 // co2 = c + 3 * ldc - leaq (C, LDC, 4), C - leaq (C, LDC, 2), C // c = c + 6 * ldc - - movq A, AO // aoffset = a - addq $ 16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L6_20 - - ALIGN_4 - -.L6_11: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L6_16 - - ALIGN_4 - -.L6_12: - - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - - je .L6_16 - - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - - je .L6_16 - - jmp .L6_12 - ALIGN_4 - -.L6_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_19 - - ALIGN_4 - -.L6_17: - - KERNEL16x6_SUB - - jnz .L6_17 - ALIGN_4 - - -.L6_19: - - SAVE16x6 - - addq $16 * SIZE, CO1 # coffset += 16 - addq $16 * SIZE, CO2 # coffset += 16 - decq I # i -- - jg .L6_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L6_20: - // Test rest of M - - testq $15, M - jz .L6_60 // to next 6 lines of N - - testq $8, M - jz .L6_21pre - ALIGN_4 - -/**************************************************************************/ - -.L6_20_1: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_20_6 - - ALIGN_4 - -.L6_20_2: - - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - 
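/*
 * This run of KERNEL8x6_SUBs (like .L6_12 above for the 16-wide tile)
 * is the k loop unrolled by eight: the trip count is first masked with
 * andq $-8 and burned down here, then the k & 7 leftover is finished
 * one call at a time in .L6_20_7.  Roughly, in C (helper name is
 * illustrative; each "step" is one KERNEL8x6_SUB):
 *
 *   for (long k = K & ~7L; k > 0; k -= 8)    // andq $-8, %rax
 *       for (int u = 0; u < 8; u++)
 *           step();
 *   for (long k = K & 7; k > 0; k--)         // andq $7, %rax
 *       step();                              // .L6_20_7 tail
 */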
KERNEL8x6_SUB - - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - - je .L6_20_6 - - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - - je .L6_20_6 - - jmp .L6_20_2 - ALIGN_4 - -.L6_20_6: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_20_9 - - ALIGN_4 - -.L6_20_7: - - KERNEL8x6_SUB - - jnz .L6_20_7 - ALIGN_4 - - -.L6_20_9: - - SAVE8x6 - - addq $8 * SIZE, CO1 # coffset += 8 - addq $8 * SIZE, CO2 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L6_21pre: - - testq $4, M - jz .L6_30 - ALIGN_4 - -.L6_21: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_26 - - ALIGN_4 - -.L6_22: - - prefetcht0 A_PR1(AO) - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - - prefetcht0 A_PR1(AO) - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - - je .L6_26 - - prefetcht0 A_PR1(AO) - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - - prefetcht0 A_PR1(AO) - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - - je .L6_26 - - jmp .L6_22 - ALIGN_4 - -.L6_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_29 - - ALIGN_4 - -.L6_27: - - KERNEL4x6_SUB - - jnz .L6_27 - ALIGN_4 - - -.L6_29: - - SAVE4x6 - - addq $4 * SIZE, CO1 # coffset += 4 - addq $4 * SIZE, CO2 # coffset += 4 - ALIGN_4 - - -.L6_30: - testq $2, M - jz .L6_40 - - ALIGN_4 - -.L6_31: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_36 - - ALIGN_4 - -.L6_32: - - prefetcht0 A_PR1(AO) - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - - je .L6_36 - - prefetcht0 A_PR1(AO) - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - - je .L6_36 - - jmp .L6_32 - ALIGN_4 - -.L6_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_39 - - ALIGN_4 - -.L6_37: - - KERNEL2x6_SUB - - jnz .L6_37 - ALIGN_4 - - -.L6_39: - - SAVE2x6 - - addq $2 * SIZE, CO1 # coffset += 2 - addq $2 * SIZE, CO2 # coffset += 2 - ALIGN_4 - -.L6_40: - testq $1, M - jz .L6_60 // to next 4 lines of N - - ALIGN_4 - -.L6_41: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_46 - - ALIGN_4 - -.L6_42: - - prefetcht0 A_PR1(AO) - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - - je .L6_46 - - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - - je .L6_46 - - jmp .L6_42 - ALIGN_4 - -.L6_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_49 - - ALIGN_4 - -.L6_47: - - KERNEL1x6_SUB - - jnz .L6_47 - ALIGN_4 - - -.L6_49: - - SAVE1x6 - - addq $1 * SIZE, CO1 # coffset += 1 - addq $1 * SIZE, CO2 # coffset += 1 - ALIGN_4 - - - - - -.L6_60: - - -/*******************************************************************************************/ - - -.L7_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - salq $2, %rax // 4 values of B - leaq (B, %rax,4), BO2 - movq K, %rax - - ALIGN_4 
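The second half of the 12-column block is packed by the .L7_02c loop that follows: it picks up the two columns the first pass skipped (the load at 2*SIZE into what is now BO1) plus the full third strip. A sketch under the same packing assumption as above:

/* Sketch of .L7_02c: interleave columns 6..11 of the 12-column block.
 * b points where the first pass left B, i.e. at the second 4-column strip. */
static void pack_b6_second(const float *b, float *buf, long k)
{
    const float *p0 = b;          /* BO1: strip with columns 4..7  */
    const float *p1 = b + 4 * k;  /* BO2: strip with columns 8..11 */

    for (long l = 0; l < k; l++) {
        buf[0] = p0[2];  buf[1] = p0[3];  /* vmovsd 2*SIZE(BO1): cols 6,7 */
        buf[2] = p1[0];  buf[3] = p1[1];  /* vmovups (BO2): cols 8..11    */
        buf[4] = p1[2];  buf[5] = p1[3];
        p0 += 4;
        p1 += 4;
        buf += 6;
    }
}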
- - -.L7_02c: - - vmovsd 2*SIZE(BO1), %xmm0 - vmovups (BO2), %xmm1 - vmovsd %xmm0, (BO) - vmovups %xmm1, 2*SIZE(BO) - addq $ 4*SIZE,BO1 - addq $ 4*SIZE,BO2 - addq $ 6*SIZE,BO - decq %rax - jnz .L7_02c - - movq BO2, B // next offset of B - -.L7_10: - movq C, CO1 - leaq (C, LDC, 2), CO2 - leaq (CO2, LDC, 1), CO2 // co2 = c + 3 * ldc - leaq (C, LDC, 4), C - leaq (C, LDC, 2), C // c = c + 6 * ldc - - movq A, AO // aoffset = a - addq $ 16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L7_20 - - ALIGN_4 - -.L7_11: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L7_16 - - ALIGN_4 - -.L7_12: - - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - - je .L7_16 - - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - - je .L7_16 - - jmp .L7_12 - ALIGN_4 - -.L7_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_19 - - ALIGN_4 - -.L7_17: - - KERNEL16x6_SUB - - jnz .L7_17 - ALIGN_4 - - -.L7_19: - - SAVE16x6 - - addq $16 * SIZE, CO1 # coffset += 16 - addq $16 * SIZE, CO2 # coffset += 16 - decq I # i -- - jg .L7_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L7_20: - // Test rest of M - - testq $15, M - jz .L7_60 // to next 6 lines of N - - testq $8, M - jz .L7_21pre - ALIGN_4 - -/**************************************************************************/ - -.L7_20_1: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_20_6 - - ALIGN_4 - -.L7_20_2: - - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - - je .L7_20_6 - - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - - je .L7_20_6 - - jmp .L7_20_2 - ALIGN_4 - -.L7_20_6: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_20_9 - - ALIGN_4 - -.L7_20_7: - - KERNEL8x6_SUB - - jnz .L7_20_7 - ALIGN_4 - - -.L7_20_9: - - SAVE8x6 - - addq $8 * SIZE, CO1 # coffset += 8 - addq $8 * SIZE, CO2 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L7_21pre: - - testq $4, M - jz .L7_30 - ALIGN_4 - -.L7_21: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_26 - - ALIGN_4 - -.L7_22: - - prefetcht0 A_PR1(AO) - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - - prefetcht0 A_PR1(AO) - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - - je .L7_26 - - prefetcht0 A_PR1(AO) - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - - prefetcht0 A_PR1(AO) - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - - je .L7_26 - - jmp .L7_22 - ALIGN_4 - -.L7_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_29 - - ALIGN_4 - -.L7_27: - - KERNEL4x6_SUB - - jnz .L7_27 - ALIGN_4 - - -.L7_29: - - SAVE4x6 - - addq $4 * SIZE, CO1 # coffset += 4 - addq $4 * SIZE, CO2 # coffset += 4 - ALIGN_4 
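The row dimension is peeled the same way in every pass, as the .L6_20/.L7_20 cascades show: the main loop handles M/16 full-height tiles, then single bits of M select at most one 8-, 4-, 2- and 1-row tile each. A small illustrative helper makes the decomposition explicit (the assembly inlines each case with its own loop):

/* How many tiles of each height one pass runs for a given M. */
static void row_tiles(long m, long *n16, long *n8, long *n4, long *n2, long *n1)
{
    *n16 = m >> 4;        /* sarq $4, I ; decq I ; jg ...  */
    *n8  = (m >> 3) & 1;  /* testq $8, M                   */
    *n4  = (m >> 2) & 1;  /* testq $4, M                   */
    *n2  = (m >> 1) & 1;  /* testq $2, M                   */
    *n1  =  m       & 1;  /* testq $1, M                   */
}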
- - -.L7_30: - testq $2, M - jz .L7_40 - - ALIGN_4 - -.L7_31: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_36 - - ALIGN_4 - -.L7_32: - - prefetcht0 A_PR1(AO) - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - - je .L7_36 - - prefetcht0 A_PR1(AO) - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - - je .L7_36 - - jmp .L7_32 - ALIGN_4 - -.L7_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_39 - - ALIGN_4 - -.L7_37: - - KERNEL2x6_SUB - - jnz .L7_37 - ALIGN_4 - - -.L7_39: - - SAVE2x6 - - addq $2 * SIZE, CO1 # coffset += 2 - addq $2 * SIZE, CO2 # coffset += 2 - ALIGN_4 - -.L7_40: - testq $1, M - jz .L7_60 // to next 4 lines of N - - ALIGN_4 - -.L7_41: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_46 - - ALIGN_4 - -.L7_42: - - prefetcht0 A_PR1(AO) - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - - je .L7_46 - - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - - je .L7_46 - - jmp .L7_42 - ALIGN_4 - -.L7_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_49 - - ALIGN_4 - -.L7_47: - - KERNEL1x6_SUB - - jnz .L7_47 - ALIGN_4 - - -.L7_49: - - SAVE1x6 - - addq $1 * SIZE, CO1 # coffset += 1 - addq $1 * SIZE, CO2 # coffset += 1 - ALIGN_4 - - - - - -.L7_60: - - decq J // j -- - jg .L6_01 // next 12 lines of N - - - - -/*******************************************************************************************/ -.L4_00: - - movq Nmod6, J - sarq $2, J // j = j / 4 - cmpq $ 0, J - je .L2_00 - ALIGN_4 - - -.L4_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $2, %rax // K / 4 - jz .L4_01b - ALIGN_4 - - -.L4_01a: - prefetcht0 512(BO1) - prefetchw 512(BO) - - vmovups (BO1), %xmm0 - vmovups 4*SIZE(BO1), %xmm1 - vmovups 8*SIZE(BO1), %xmm2 - vmovups 12*SIZE(BO1), %xmm3 - - vmovups %xmm0, (BO) - vmovups %xmm1, 4*SIZE(BO) - vmovups %xmm2, 8*SIZE(BO) - vmovups %xmm3,12*SIZE(BO) - - addq $ 16*SIZE,BO1 - addq $ 16*SIZE,BO - decq %rax - jnz .L4_01a - - -.L4_01b: - - movq K, %rax - andq $3, %rax // K % 4 - jz .L4_02d - ALIGN_4 - -.L4_02c: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $ 4*SIZE,BO1 - addq $ 4*SIZE,BO - decq %rax - jnz .L4_02c - -.L4_02d: - - movq BO1, B // next offset of B - -.L4_10: - movq C, CO1 - leaq (C, LDC, 2), CO2 - leaq (C, LDC, 4), C // c += 4 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $ 16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L4_20 - - ALIGN_4 - -.L4_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && 
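Column blocking in this (non-TRMM) half works the same way one level up: the prologue divides N by 12, each quotient step runs the two 6-wide passes above, and the remainder is peeled by the .L4, .L2 and .L1 passes. Note that despite their names, Ndiv6 and Nmod6 hold N/12 and N%12 here. A sketch of the split, with an illustrative helper name:

/* Column-block counts used by the driver loops of this configuration. */
static void column_blocks(long n, long *blk12, long *blk4, long *blk2, long *blk1)
{
    *blk12 = n / 12;        /* divq $12 -> Ndiv6, looped over .L6_01/.L7_01 */
    long r = n % 12;        /* Nmod6                                        */
    *blk4  = r >> 2;        /* .L4_00: sarq $2, J ; jg .L4_01               */
    *blk2  = (r >> 1) & 1;  /* .L2_00: andq $2, J                           */
    *blk1  =  r       & 1;  /* .L1_0 : andq $1, J                           */
}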
!defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L4_16 - movq %rax, BI // Index for BO - leaq (,BI,4) , BI // BI = BI * 4 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_12: - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - je .L4_16 - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - je .L4_16 - - jmp .L4_12 - ALIGN_4 - -.L4_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_19 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_17: - - KERNEL16x4_SUB - - jl .L4_17 - ALIGN_4 - - -.L4_19: - - SAVE16x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - addq $16 * SIZE, CO2 # coffset += 16 - decq I # i -- - jg .L4_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L4_20: - // Test rest of M - - testq $15, M - jz .L4_60 // to next 3 lines of N - - testq $8, M - jz .L4_21pre - ALIGN_4 - -/**************************************************************************/ - -.L4_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || 
(!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L4_20_6 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_20_2: - - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - - je .L4_20_6 - - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - - je .L4_20_6 - - jmp .L4_20_2 - ALIGN_4 - -.L4_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_20_9 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_20_7: - - KERNEL8x4_SUB - - jl .L4_20_7 - ALIGN_4 - - -.L4_20_9: - - SAVE8x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - addq $8 * SIZE, CO2 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L4_21pre: - - testq $4, M - jz .L4_30 - ALIGN_4 - -.L4_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L4_26 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_22: - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - je .L4_26 - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - je .L4_26 - - jmp .L4_22 - ALIGN_4 - -.L4_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_29 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = 
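The #if ladder just above, repeated before every tile in the TRMM configuration, picks how many k updates the tile actually needs: either the full K minus the running offset, or the offset plus the tile extent, depending on the triangular side and transposition. A sketch of that selection, where mr and nr stand for the 8 and 4 of the surrounding block:

/* TRMM per-tile trip count (the value the assembly stores in KKK). */
static long trmm_trip_count(long k, long kk, long mr, long nr,
                            int left, int transa)
{
    if ((left && !transa) || (!left && transa))
        return k - kk;              /* movq K, %rax ; subq KK, %rax   */
    return kk + (left ? mr          /* addq $8, %rax: values in A     */
                      : nr);        /* addq $4, %rax: values in BO    */
}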
BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_27: - - KERNEL4x4_SUB - - jl .L4_27 - ALIGN_4 - - -.L4_29: - - SAVE4x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - addq $4 * SIZE, CO2 # coffset += 4 - ALIGN_4 - - -.L4_30: - testq $2, M - jz .L4_40 - - ALIGN_4 - -.L4_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L4_36 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_32: - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - je .L4_36 - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - je .L4_36 - - jmp .L4_32 - ALIGN_4 - -.L4_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_39 - - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_37: - - KERNEL2x4_SUB - - jl .L4_37 - ALIGN_4 - - -.L4_39: - - SAVE2x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - addq $2 * SIZE, CO2 # coffset += 2 - ALIGN_4 - -.L4_40: - testq $1, M - jz .L4_60 // to next 4 lines of N - - ALIGN_4 - -.L4_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first 
buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L4_46 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_42: - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - je .L4_46 - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - je .L4_46 - - jmp .L4_42 - ALIGN_4 - -.L4_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_49 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_47: - - KERNEL1x4_SUB - - jl .L4_47 - ALIGN_4 - - -.L4_49: - - SAVE1x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - addq $1 * SIZE, CO2 # coffset += 1 - ALIGN_4 - - - - - -.L4_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $4, KK -#endif - - decq J // j -- - jg .L4_01 // next 4 lines of N - - - -/*******************************************************************************************/ -.L2_00: - - movq Nmod6, J - andq $3, J // j % 4 - je .L999 - - movq Nmod6, J - andq $2, J // j % 4 - je .L1_0 - -.L2_01: - - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $2, %rax // K / 4 - jz .L2_01b - ALIGN_4 - -.L2_01a: - - vmovsd (BO1), %xmm0 - vmovsd 2*SIZE(BO1), %xmm1 - vmovsd 4*SIZE(BO1), %xmm2 - vmovsd 6*SIZE(BO1), %xmm3 - - vmovsd %xmm0, (BO) - vmovsd %xmm1, 2*SIZE(BO) - vmovsd %xmm2, 4*SIZE(BO) - vmovsd %xmm3, 6*SIZE(BO) - - addq $8*SIZE,BO1 - addq $8*SIZE,BO - decq %rax - jnz .L2_01a - - -.L2_01b: - - movq K, %rax - andq $3, %rax // K % 4 - jz .L2_02d - ALIGN_4 - -.L2_02c: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L2_02c - -.L2_02d: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L2_20 - - ALIGN_4 - -.L2_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - 
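At this point, as before every other unrolled loop, the k trip count is converted into element offsets for the two streams: the leaq that follows scales the copy in BI by the panel width (doubling it here for 2 columns, (,BI,4) in the 4-column pass, unchanged in the 1-column pass), and the salq scales %rax by the tile height. The offsets then advance AO and BO to the end of the block, setting up the negative-index loop described next. A sketch with an illustrative helper name; one aside: the comment "BI = BI * 1" in the 4x2 block further down looks like a typo, since leaq (BI,BI,1) doubles BI there as well.

/* Convert a trip count into element offsets and park the stream pointers
 * at the end of the block. */
static void advance_streams(const float **ao, const float **bo,
                            long kiters, long mr, long nr)
{
    long a_off = kiters * mr;   /* salq $4 (x16), $3 (x8), $2 (x4), $1 (x2)  */
    long b_off = kiters * nr;   /* leaq (,BI,4), leaq (BI,BI,1), or identity */
    *ao += a_off;               /* leaq (AO, %rax, SIZE), AO                 */
    *bo += b_off;               /* leaq (BO, BI,  SIZE), BO                  */
}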
leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - je .L2_16 - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL16x2_SUB - - jl .L2_17 - ALIGN_4 - - -.L2_19: - - SAVE16x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $15, M - jz .L2_60 // to next 2 lines of N - - testq $8, M - jz .L2_21pre - ALIGN_4 - -/**************************************************************************/ - -.L2_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_20_6 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - 
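The negq pair right here (BI above, %rax just below) completes that setup: with AO and BO already past the block, the negated counts serve as indices, every load addresses backwards from the end, and the addq inside each _SUB both moves the index and sets the flags that the embedded je/jl tests. A minimal C rendering of the idiom (function name illustrative):

/* Negative-index loop: pointers sit at the end of the block, the index
 * runs from -count up to 0, and the increment doubles as the exit test. */
static float dot_negidx(const float *a, const float *b, long k)
{
    const float *ae = a + k;     /* leaq (AO, %rax, SIZE), AO              */
    const float *be = b + k;     /* leaq (BO, BI,  SIZE), BO               */
    long ia = -k, ib = -k;       /* negq %rax ; negq BI                    */
    float acc = 0.0f;

    while (ia != 0) {            /* je/jl on the flags left by the addq    */
        acc += ae[ia] * be[ib];  /* loads like -16 * SIZE(AO, %rax, SIZE)  */
        ia += 1;                 /* the asm adds the tile extent instead,  */
        ib += 1;                 /* since %rax and BI count raw elements   */
    }
    return acc;
}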
negq %rax - ALIGN_4 - -.L2_20_2: - - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - je .L2_20_6 - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - je .L2_20_6 - - jmp .L2_20_2 - ALIGN_4 - -.L2_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_20_9 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_7: - - KERNEL8x2_SUB - - jl .L2_20_7 - ALIGN_4 - - -.L2_20_9: - - SAVE8x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L2_21pre: - - testq $4, M - jz .L2_30 - ALIGN_4 - -.L2_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_26 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 1 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - je .L2_26 - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL4x2_SUB - - jl .L2_27 - ALIGN_4 - - -.L2_29: - - SAVE4x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, 
BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_36 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_32: - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_36 - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_36 - - jmp .L2_32 - ALIGN_4 - -.L2_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB - - jl .L2_37 - ALIGN_4 - - -.L2_39: - - SAVE2x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L2_46 - movq %rax, BI // 
Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - je .L2_46 - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB - - jl .L2_47 - ALIGN_4 - - -.L2_49: - - SAVE1x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - - - -.L2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK -#endif - - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovss (BO1), %xmm0 - vmovss %xmm0, (BO) - addq $1*SIZE,BO1 - addq $1*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L1_20 - - ALIGN_4 - -.L1_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - - je .L1_16 - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - 
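The K loops in this pass follow the shape used throughout the file: the trip count is rounded down to a multiple of 8 (andq $-8) and negated, the body holds sixteen _SUB copies with a je after each group of eight, and the k & 7 leftovers run one at a time in the short tail loop (.L1_17 here). Roughly, in C, where the step callback stands in for one KERNEL16x1_SUB expansion:

/* Shape of the unrolled K loops (e.g. .L1_12 / .L1_16 / .L1_17). */
static void k_loop(long k, void (*step)(void))
{
    long i = -(k & ~7L);              /* andq $-8, %rax ; negq %rax        */
    while (i != 0) {                  /* je .L1_16 after each group of 8   */
        for (int u = 0; u < 8; u++) {
            step();                   /* KERNEL16x1_SUB                    */
            i += 1;                   /* addq inside the macro             */
        }
    }

    long tail = k & 7;                /* andq $7, %rax                     */
    for (long t = -tail; t != 0; t++) /* negated again, jl .L1_17          */
        step();
}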
KERNEL16x1_SUB - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL16x1_SUB - - jl .L1_17 - ALIGN_4 - - -.L1_19: - - SAVE16x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L1_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $15, M - jz .L999 - - testq $8, M - jz .L1_21pre - ALIGN_4 - -/**************************************************************************/ - -.L1_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_20_6 - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_2: - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - je .L1_20_6 - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - je .L1_20_6 - - jmp .L1_20_2 - ALIGN_4 - -.L1_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_20_9 - - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_7: - - KERNEL8x1_SUB - - jl .L1_20_7 - ALIGN_4 - - -.L1_20_9: - - SAVE8x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L1_21pre: - - testq $4, M - jz .L1_30 - ALIGN_4 - -.L1_21: -#if 
!defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_26 - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_26 - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - -.L1_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_29 - - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - KERNEL4x1_SUB - - jl .L1_27 - ALIGN_4 - - -.L1_29: - - SAVE4x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_36 - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_32: - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_36 - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_36 - - jmp .L1_32 - ALIGN_4 - -.L1_36: -#ifndef TRMMKERNEL - movq K, %rax 
-#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_39 - - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB - - jl .L1_37 - ALIGN_4 - - -.L1_39: - - SAVE2x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L1_46 - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_46 - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB - - jl .L1_47 - ALIGN_4 - - -.L1_49: - - SAVE1x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - -.L999: - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - movups 64(%rsp), %xmm6 - movups 80(%rsp), %xmm7 - movups 96(%rsp), %xmm8 - movups 112(%rsp), %xmm9 - movups 128(%rsp), %xmm10 - movups 144(%rsp), %xmm11 - movups 160(%rsp), %xmm12 - movups 176(%rsp), %xmm13 - movups 192(%rsp), %xmm14 - movups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - - -#else - -/************************************************************************************* -* TRMM Kernel -*************************************************************************************/ - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq 
%rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - movups %xmm6, 64(%rsp) - movups %xmm7, 80(%rsp) - movups %xmm8, 96(%rsp) - movups %xmm9, 112(%rsp) - movups %xmm10, 128(%rsp) - movups %xmm11, 144(%rsp) - movups %xmm12, 160(%rsp) - movups %xmm13, 176(%rsp) - movups %xmm14, 192(%rsp) - movups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - vmovsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - movsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovss %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $4, %rdi - divq %rdi // N / 4 - movq %rax, Ndiv6 // N / 4 - movq %rdx, Nmod6 // N % 4 - - - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - - movq Ndiv6, J - cmpq $0, J - je .L2_0 - ALIGN_4 - -/*******************************************************************************************/ - -.L4_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $2, %rax // K / 4 - jz .L4_01b - ALIGN_4 - - -.L4_01a: - prefetcht0 512(BO1) - prefetchw 512(BO) - - vmovups (BO1), %xmm0 - vmovups 4*SIZE(BO1), %xmm1 - vmovups 8*SIZE(BO1), %xmm2 - vmovups 12*SIZE(BO1), %xmm3 - - vmovups %xmm0, (BO) - vmovups %xmm1, 4*SIZE(BO) - vmovups %xmm2, 8*SIZE(BO) - vmovups %xmm3,12*SIZE(BO) - - addq $ 16*SIZE,BO1 - addq $ 16*SIZE,BO - decq %rax - jnz .L4_01a - - -.L4_01b: - - movq K, %rax - andq $3, %rax // K % 4 - jz .L4_02d - ALIGN_4 - -.L4_02c: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $ 4*SIZE,BO1 - addq $ 4*SIZE,BO - decq %rax - jnz .L4_02c - -.L4_02d: - - movq BO1, B // next offset of B - -.L4_10: - movq C, CO1 - leaq (C, LDC, 2), CO2 - leaq (C, LDC, 4), C // c += 4 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $ 16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L4_20 - - ALIGN_4 - -.L4_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L4_16 - movq %rax, BI // Index for BO - leaq (,BI,4) , BI // BI = BI * 4 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of 
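The TRMM build of the kernel, entered through this second PROLOGUE, differs from the GEMM half mainly in its setup: N is blocked by 4 rather than 12 (so Ndiv6 and Nmod6 hold N/4 and N%4 here), and the caller's OFFSET seeds the running KK, negated when the triangular matrix is on the right. A sketch of that initialisation, with an illustrative helper name:

/* TRMM-side setup done in this prologue. */
static void trmm_setup(long n, long offset, int left,
                       long *nblk4, long *nrem, long *kk)
{
    *nblk4 = n / 4;          /* divq $4 -> Ndiv6 (really N/4 in this half)  */
    *nrem  = n % 4;          /* Nmod6 (really N%4), peeled by .L2_0 / .L1_0 */
    *kk    = offset;         /* vmovsd %xmm12, OFFSET and KK                */
    if (!left)
        *kk = -*kk;          /* negq KK when the triangle is in B           */
}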
values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_12: - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - je .L4_16 - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - je .L4_16 - - jmp .L4_12 - ALIGN_4 - -.L4_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_19 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_17: - - KERNEL16x4_SUB - - jl .L4_17 - ALIGN_4 - - -.L4_19: - - SAVE16x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - addq $16 * SIZE, CO2 # coffset += 16 - decq I # i -- - jg .L4_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L4_20: - // Test rest of M - - testq $15, M - jz .L4_60 // to next 3 lines of N - - testq $8, M - jz .L4_21pre - ALIGN_4 - -/**************************************************************************/ - -.L4_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L4_20_6 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq 
(BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_20_2: - - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - - je .L4_20_6 - - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - - je .L4_20_6 - - jmp .L4_20_2 - ALIGN_4 - -.L4_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_20_9 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_20_7: - - KERNEL8x4_SUB - - jl .L4_20_7 - ALIGN_4 - - -.L4_20_9: - - SAVE8x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - addq $8 * SIZE, CO2 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L4_21pre: - - testq $4, M - jz .L4_30 - ALIGN_4 - -.L4_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L4_26 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_22: - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - je .L4_26 - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - je .L4_26 - - jmp .L4_22 - ALIGN_4 - -.L4_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_29 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_27: - - KERNEL4x4_SUB - - jl .L4_27 - ALIGN_4 - - -.L4_29: - - SAVE4x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 
4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - addq $4 * SIZE, CO2 # coffset += 4 - ALIGN_4 - - -.L4_30: - testq $2, M - jz .L4_40 - - ALIGN_4 - -.L4_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L4_36 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_32: - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - je .L4_36 - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - je .L4_36 - - jmp .L4_32 - ALIGN_4 - -.L4_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_39 - - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_37: - - KERNEL2x4_SUB - - jl .L4_37 - ALIGN_4 - - -.L4_39: - - SAVE2x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - addq $2 * SIZE, CO2 # coffset += 2 - ALIGN_4 - -.L4_40: - testq $1, M - jz .L4_60 // to next 4 lines of N - - ALIGN_4 - -.L4_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - 
addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L4_46 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_42: - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - je .L4_46 - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - je .L4_46 - - jmp .L4_42 - ALIGN_4 - -.L4_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_49 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_47: - - KERNEL1x4_SUB - - jl .L4_47 - ALIGN_4 - - -.L4_49: - - SAVE1x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - addq $1 * SIZE, CO2 # coffset += 1 - ALIGN_4 - - - - - -.L4_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $4, KK -#endif - - decq J // j -- - jg .L4_01 // next 4 lines of N - - - -/*******************************************************************************************/ -.L2_0: - - movq Nmod6, J - andq $3, J // j % 4 - je .L999 - - movq Nmod6, J - andq $2, J // j % 4 - je .L1_0 - -.L2_01: - - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $2, %rax // K / 4 - jz .L2_01b - ALIGN_4 - -.L2_01a: - - vmovsd (BO1), %xmm0 - vmovsd 2*SIZE(BO1), %xmm1 - vmovsd 4*SIZE(BO1), %xmm2 - vmovsd 6*SIZE(BO1), %xmm3 - - vmovsd %xmm0, (BO) - vmovsd %xmm1, 2*SIZE(BO) - vmovsd %xmm2, 4*SIZE(BO) - vmovsd %xmm3, 6*SIZE(BO) - - addq $8*SIZE,BO1 - addq $8*SIZE,BO - decq %rax - jnz .L2_01a - - -.L2_01b: - - movq K, %rax - andq $3, %rax // K % 4 - jz .L2_02d - ALIGN_4 - -.L2_02c: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L2_02c - -.L2_02d: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L2_20 - - ALIGN_4 - -.L2_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $2, %rax 
// number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - je .L2_16 - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL16x2_SUB - - jl .L2_17 - ALIGN_4 - - -.L2_19: - - SAVE16x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $15, M - jz .L2_60 // to next 2 lines of N - - testq $8, M - jz .L2_21pre - ALIGN_4 - -/**************************************************************************/ - -.L2_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_20_6 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_2: - - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - je .L2_20_6 - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - je .L2_20_6 - - jmp .L2_20_2 - ALIGN_4 - -.L2_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, 
%rax # if (k & 1) - je .L2_20_9 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_7: - - KERNEL8x2_SUB - - jl .L2_20_7 - ALIGN_4 - - -.L2_20_9: - - SAVE8x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L2_21pre: - - testq $4, M - jz .L2_30 - ALIGN_4 - -.L2_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_26 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 1 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - je .L2_26 - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL4x2_SUB - - jl .L2_27 - ALIGN_4 - - -.L2_29: - - SAVE4x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq 
BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_36 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_32: - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_36 - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_36 - - jmp .L2_32 - ALIGN_4 - -.L2_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB - - jl .L2_37 - ALIGN_4 - - -.L2_39: - - SAVE2x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L2_46 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - je .L2_46 - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB 
- - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB - - jl .L2_47 - ALIGN_4 - - -.L2_49: - - SAVE1x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - - - -.L2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK -#endif - - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovss (BO1), %xmm0 - vmovss %xmm0, (BO) - addq $1*SIZE,BO1 - addq $1*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L1_20 - - ALIGN_4 - -.L1_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - - je .L1_16 - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL16x1_SUB - - jl .L1_17 - ALIGN_4 - - -.L1_19: - - SAVE16x1 - -#if 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L1_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $15, M - jz .L999 - - testq $8, M - jz .L1_21pre - ALIGN_4 - -/**************************************************************************/ - -.L1_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_20_6 - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_2: - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - je .L1_20_6 - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - je .L1_20_6 - - jmp .L1_20_2 - ALIGN_4 - -.L1_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_20_9 - - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_7: - - KERNEL8x1_SUB - - jl .L1_20_7 - ALIGN_4 - - -.L1_20_9: - - SAVE8x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L1_21pre: - - testq $4, M - jz .L1_30 - ALIGN_4 - -.L1_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO 
-#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_26 - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_26 - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - -.L1_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_29 - - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - KERNEL4x1_SUB - - jl .L1_27 - ALIGN_4 - - -.L1_29: - - SAVE4x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_36 - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_32: - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_36 - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_36 - - jmp .L1_32 - ALIGN_4 - -.L1_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_39 - - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB - - jl .L1_37 - ALIGN_4 - - -.L1_39: - - SAVE2x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && 
!defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L1_46 - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_46 - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB - - jl .L1_47 - ALIGN_4 - - -.L1_49: - - SAVE1x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - -.L999: - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - movups 64(%rsp), %xmm6 - movups 80(%rsp), %xmm7 - movups 96(%rsp), %xmm8 - movups 112(%rsp), %xmm9 - movups 128(%rsp), %xmm10 - movups 144(%rsp), %xmm11 - movups 160(%rsp), %xmm12 - movups 176(%rsp), %xmm13 - movups 192(%rsp), %xmm14 - movups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - -#endif - +/********************************************************************************* +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + +/********************************************************************* +* 2014/07/28 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* 2013/10/28 Saar +* Parameter: +* SGEMM_DEFAULT_UNROLL_N 4 +* SGEMM_DEFAULT_UNROLL_M 16 +* SGEMM_DEFAULT_P 768 +* SGEMM_DEFAULT_Q 384 +* A_PR1 512 +* B_PR1 512 +* +* +* 2014/07/28 Saar +* Performance at 9216x9216x9216: +* 1 thread: 102 GFLOPS (SANDYBRIDGE: 59) (MKL: 83) +* 2 threads: 195 GFLOPS (SANDYBRIDGE: 116) (MKL: 155) +* 3 threads: 281 GFLOPS (SANDYBRIDGE: 165) (MKL: 230) +* 4 threads: 366 GFLOPS (SANDYBRIDGE: 223) (MKL: 267) +* +*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define BO2 %rbp +#define SP %rbx + +#define BO1 %rdi +#define CO2 %rdx + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#if defined(OS_WINDOWS) +#define L_BUFFER_SIZE 8192 +#else +#define L_BUFFER_SIZE 12288 +#endif + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) +#define BUFFER1 128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + +#if defined(BULLDOZER) + +#define VFMADD231PS_( y0,y1,y2 ) vfmaddps y0,y1,y2,y0 + +#define VFMADD231SS_( x0,x1,x2 ) vfmaddss x0,x1,x2,x0 + +#else + +#define VFMADD231PS_( y0,y1,y2 ) vfmadd231ps y1,y2,y0 + +#define VFMADD231SS_( x0,x1,x2 ) vfmadd231ss x1,x2,x0 + +#endif + + +#define A_PR1 512 +#define B_PR1 512 + 
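(Illustrative note, not part of the patch.) The VFMADD231PS_ / VFMADD231SS_ wrappers defined above only hide the encoding difference between Bulldozer's four-operand FMA4 form (vfmaddps y0,y1,y2,y0, i.e. y0 = y1*y2 + y0) and the three-operand FMA3 form (vfmadd231ps y1,y2,y0 in AT&T order) used on later cores; both compute acc += a*b. A minimal C-intrinsics sketch of that accumulate step, assuming FMA3 hardware; the function and parameter names here are hypothetical and do not appear in the patch:

    /* Sketch only: the acc += a*b step that VFMADD231PS_( acc, b, a ) performs,
       with a loaded from 8 consecutive A values and b broadcast from one B value.
       Compile with e.g. -mavx -mfma. */
    #include <immintrin.h>

    static inline __m256 fma_accumulate(__m256 acc, const float *a_ptr, const float *b_ptr)
    {
        __m256 a = _mm256_loadu_ps(a_ptr);      /* vmovups: eight packed floats from A  */
        __m256 b = _mm256_broadcast_ss(b_ptr);  /* vbroadcastss: one B value to 8 lanes */
        return _mm256_fmadd_ps(a, b, acc);      /* acc = a * b + acc                    */
    }

On BULLDOZER the same step is emitted as a single FMA4 instruction; the arithmetic result is identical.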
+/******************************************************************************************* +* 6 lines of N +*******************************************************************************************/ + +.macro KERNEL16x6_SUB + vmovups -16 * SIZE(AO), %ymm0 + vmovups -8 * SIZE(AO), %ymm1 + vbroadcastss -4 * SIZE(BO), %ymm2 + vbroadcastss -3 * SIZE(BO), %ymm3 + prefetcht0 A_PR1(AO) + + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) + VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) + VFMADD231PS_( %ymm7,%ymm3,%ymm1 ) + + vbroadcastss -2 * SIZE(BO), %ymm2 + vbroadcastss -1 * SIZE(BO), %ymm3 + VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm9,%ymm2,%ymm1 ) + VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) + VFMADD231PS_( %ymm11,%ymm3,%ymm1 ) + + vbroadcastss 0 * SIZE(BO), %ymm2 + vbroadcastss 1 * SIZE(BO), %ymm3 + VFMADD231PS_( %ymm12,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm13,%ymm2,%ymm1 ) + VFMADD231PS_( %ymm14,%ymm3,%ymm0 ) + VFMADD231PS_( %ymm15,%ymm3,%ymm1 ) + + addq $ 6*SIZE, BO + addq $ 16*SIZE, AO + decq %rax +.endm + +.macro SAVE16x6 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm5 , %ymm5 + vmulps %ymm0 , %ymm6 , %ymm6 + vmulps %ymm0 , %ymm7 , %ymm7 + vmulps %ymm0 , %ymm8 , %ymm8 + vmulps %ymm0 , %ymm9 , %ymm9 + vmulps %ymm0 , %ymm10, %ymm10 + vmulps %ymm0 , %ymm11, %ymm11 + vmulps %ymm0 , %ymm12, %ymm12 + vmulps %ymm0 , %ymm13, %ymm13 + vmulps %ymm0 , %ymm14, %ymm14 + vmulps %ymm0 , %ymm15, %ymm15 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps 8 * SIZE(CO1), %ymm5,%ymm5 + + vaddps (CO1, LDC), %ymm6,%ymm6 + vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7 + + vaddps (CO1, LDC,2), %ymm8,%ymm8 + vaddps 8 * SIZE(CO1, LDC,2), %ymm9,%ymm9 + + vaddps (CO2), %ymm10,%ymm10 + vaddps 8 * SIZE(CO2), %ymm11,%ymm11 + + vaddps (CO2, LDC), %ymm12,%ymm12 + vaddps 8 * SIZE(CO2, LDC), %ymm13,%ymm13 + + vaddps (CO2, LDC,2), %ymm14,%ymm14 + vaddps 8 * SIZE(CO2, LDC,2), %ymm15,%ymm15 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , 8 * SIZE(CO1) + + vmovups %ymm6 , (CO1, LDC) + vmovups %ymm7 , 8 * SIZE(CO1, LDC) + + vmovups %ymm8 , (CO1, LDC,2) + vmovups %ymm9 , 8 * SIZE(CO1, LDC,2) + + vmovups %ymm10, (CO2) + vmovups %ymm11, 8 * SIZE(CO2) + + vmovups %ymm12, (CO2, LDC) + vmovups %ymm13, 8 * SIZE(CO2, LDC) + + vmovups %ymm14, (CO2, LDC,2) + vmovups %ymm15, 8 * SIZE(CO2, LDC,2) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL8x6_SUB + vmovups -16 * SIZE(AO), %ymm0 + vbroadcastss -4 * SIZE(BO), %ymm2 + vbroadcastss -3 * SIZE(BO), %ymm3 + + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) + + vbroadcastss -2 * SIZE(BO), %ymm2 + vbroadcastss -1 * SIZE(BO), %ymm3 + VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) + + vbroadcastss 0 * SIZE(BO), %ymm2 + vbroadcastss 1 * SIZE(BO), %ymm3 + VFMADD231PS_( %ymm12,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm14,%ymm3,%ymm0 ) + + addq $ 6*SIZE, BO + addq $ 8*SIZE, AO + decq %rax +.endm + +.macro SAVE8x6 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm6 , %ymm6 + vmulps %ymm0 , %ymm8 , %ymm8 + vmulps %ymm0 , %ymm10, %ymm10 + vmulps %ymm0 , %ymm12, %ymm12 + vmulps %ymm0 , %ymm14, %ymm14 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps (CO1, LDC), %ymm6,%ymm6 + vaddps (CO1, LDC,2), %ymm8,%ymm8 + vaddps (CO2), %ymm10,%ymm10 + vaddps (CO2, LDC), %ymm12,%ymm12 + vaddps (CO2, LDC,2), %ymm14,%ymm14 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm6 , (CO1, 
LDC) + vmovups %ymm8 , (CO1, LDC,2) + vmovups %ymm10, (CO2) + vmovups %ymm12, (CO2, LDC) + vmovups %ymm14, (CO2, LDC,2) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x6_SUB + vmovups -16 * SIZE(AO), %xmm0 + vbroadcastss -4 * SIZE(BO), %xmm2 + vbroadcastss -3 * SIZE(BO), %xmm3 + + VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231PS_( %xmm6,%xmm3,%xmm0 ) + + vbroadcastss -2 * SIZE(BO), %xmm2 + vbroadcastss -1 * SIZE(BO), %xmm3 + VFMADD231PS_( %xmm8,%xmm2,%xmm0 ) + VFMADD231PS_( %xmm10,%xmm3,%xmm0 ) + + vbroadcastss 0 * SIZE(BO), %xmm2 + vbroadcastss 1 * SIZE(BO), %xmm3 + VFMADD231PS_( %xmm12,%xmm2,%xmm0 ) + VFMADD231PS_( %xmm14,%xmm3,%xmm0 ) + + addq $ 6*SIZE, BO + addq $ 4*SIZE, AO + decq %rax +.endm + +.macro SAVE4x6 + + vbroadcastss ALPHA, %xmm0 + + vmulps %xmm0 , %xmm4 , %xmm4 + vmulps %xmm0 , %xmm6 , %xmm6 + vmulps %xmm0 , %xmm8 , %xmm8 + vmulps %xmm0 , %xmm10, %xmm10 + vmulps %xmm0 , %xmm12, %xmm12 + vmulps %xmm0 , %xmm14, %xmm14 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + vaddps (CO1, LDC), %xmm6,%xmm6 + vaddps (CO1, LDC,2), %xmm8,%xmm8 + vaddps (CO2), %xmm10,%xmm10 + vaddps (CO2, LDC), %xmm12,%xmm12 + vaddps (CO2, LDC,2), %xmm14,%xmm14 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm6 , (CO1, LDC) + vmovups %xmm8 , (CO1, LDC,2) + vmovups %xmm10, (CO2) + vmovups %xmm12, (CO2, LDC) + vmovups %xmm14, (CO2, LDC,2) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x6_SUB + vmovss -16 * SIZE(AO), %xmm0 + vmovss -15 * SIZE(AO), %xmm1 + vmovss -4 * SIZE(BO), %xmm2 + vmovss -3 * SIZE(BO), %xmm3 + + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) + VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) + VFMADD231SS_( %xmm7,%xmm3,%xmm1 ) + + vmovss -2 * SIZE(BO), %xmm2 + vmovss -1 * SIZE(BO), %xmm3 + VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm9,%xmm2,%xmm1 ) + VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) + VFMADD231SS_( %xmm11,%xmm3,%xmm1 ) + + vmovss 0 * SIZE(BO), %xmm2 + vmovss 1 * SIZE(BO), %xmm3 + VFMADD231SS_( %xmm12,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm13,%xmm2,%xmm1 ) + VFMADD231SS_( %xmm14,%xmm3,%xmm0 ) + VFMADD231SS_( %xmm15,%xmm3,%xmm1 ) + + addq $ 6*SIZE, BO + addq $ 2*SIZE, AO + decq %rax +.endm + +.macro SAVE2x6 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm5 , %xmm5 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm7 , %xmm7 + vmulss %xmm0 , %xmm8 , %xmm8 + vmulss %xmm0 , %xmm9 , %xmm9 + vmulss %xmm0 , %xmm10, %xmm10 + vmulss %xmm0 , %xmm11, %xmm11 + vmulss %xmm0 , %xmm12, %xmm12 + vmulss %xmm0 , %xmm13, %xmm13 + vmulss %xmm0 , %xmm14, %xmm14 + vmulss %xmm0 , %xmm15, %xmm15 + + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss 1 * SIZE(CO1), %xmm5,%xmm5 + + vaddss (CO1, LDC), %xmm6,%xmm6 + vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 + + vaddss (CO1, LDC,2), %xmm8,%xmm8 + vaddss 1 * SIZE(CO1, LDC,2), %xmm9,%xmm9 + + vaddss (CO2), %xmm10,%xmm10 + vaddss 1 * SIZE(CO2), %xmm11,%xmm11 + + vaddss (CO2, LDC), %xmm12,%xmm12 + vaddss 1 * SIZE(CO2, LDC), %xmm13,%xmm13 + + vaddss (CO2, LDC,2), %xmm14,%xmm14 + vaddss 1 * SIZE(CO2, LDC,2), %xmm15,%xmm15 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , 1 * SIZE(CO1) + + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm7 , 1 * SIZE(CO1, LDC) + + vmovss %xmm8 , (CO1, LDC,2) + vmovss %xmm9 , 1 * SIZE(CO1, LDC,2) + + vmovss %xmm10, (CO2) + vmovss %xmm11, 1 * SIZE(CO2) + + vmovss %xmm12, (CO2, LDC) + vmovss %xmm13, 1 * SIZE(CO2, LDC) + + vmovss 
%xmm14, (CO2, LDC,2) + vmovss %xmm15, 1 * SIZE(CO2, LDC,2) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x6_SUB + vmovss -16 * SIZE(AO), %xmm0 + vmovss -4 * SIZE(BO), %xmm2 + vmovss -3 * SIZE(BO), %xmm3 + + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) + + vmovss -2 * SIZE(BO), %xmm2 + vmovss -1 * SIZE(BO), %xmm3 + VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) + + vmovss 0 * SIZE(BO), %xmm2 + vmovss 1 * SIZE(BO), %xmm3 + VFMADD231SS_( %xmm12,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm14,%xmm3,%xmm0 ) + + addq $ 6*SIZE, BO + addq $ 1*SIZE, AO + decq %rax +.endm + +.macro SAVE1x6 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm8 , %xmm8 + vmulss %xmm0 , %xmm10, %xmm10 + vmulss %xmm0 , %xmm12, %xmm12 + vmulss %xmm0 , %xmm14, %xmm14 + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss (CO1, LDC), %xmm6,%xmm6 + vaddss (CO1, LDC,2), %xmm8,%xmm8 + vaddss (CO2), %xmm10,%xmm10 + vaddss (CO2, LDC), %xmm12,%xmm12 + vaddss (CO2, LDC,2), %xmm14,%xmm14 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm8 , (CO1, LDC,2) + vmovss %xmm10, (CO2) + vmovss %xmm12, (CO2, LDC) + vmovss %xmm14, (CO2, LDC,2) + +.endm + + +/*******************************************************************************************/ + + +/******************************************************************************************* +* 4 lines of N +*******************************************************************************************/ + +.macro KERNEL16x4_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) + VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) + VFMADD231PS_( %ymm7,%ymm3,%ymm1 ) + vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm9,%ymm2,%ymm1 ) + VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) + VFMADD231PS_( %ymm11,%ymm3,%ymm1 ) + addq $ 4 , BI + addq $ 16, %rax +.endm + +.macro SAVE16x4 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm5 , %ymm5 + vmulps %ymm0 , %ymm6 , %ymm6 + vmulps %ymm0 , %ymm7 , %ymm7 + vmulps %ymm0 , %ymm8 , %ymm8 + vmulps %ymm0 , %ymm9 , %ymm9 + vmulps %ymm0 , %ymm10, %ymm10 + vmulps %ymm0 , %ymm11, %ymm11 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps 8 * SIZE(CO1), %ymm5,%ymm5 + + vaddps (CO1, LDC), %ymm6,%ymm6 + vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7 + + vaddps (CO2), %ymm8,%ymm8 + vaddps 8 * SIZE(CO2), %ymm9,%ymm9 + + vaddps (CO2, LDC), %ymm10,%ymm10 + vaddps 8 * SIZE(CO2, LDC), %ymm11,%ymm11 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , 8 * SIZE(CO1) + + vmovups %ymm6 , (CO1, LDC) + vmovups %ymm7 , 8 * SIZE(CO1, LDC) + + vmovups %ymm8 , (CO2) + vmovups %ymm9 , 8 * SIZE(CO2) + + vmovups %ymm10, (CO2, LDC) + vmovups %ymm11, 8 * SIZE(CO2, LDC) + + prefetcht0 64(CO1) + prefetcht0 64(CO1, LDC) + prefetcht0 64(CO2) + prefetcht0 64(CO2, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL8x4_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + 
VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) + vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) + addq $ 4 , BI + addq $ 8 , %rax +.endm + +.macro SAVE8x4 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm6 , %ymm6 + vmulps %ymm0 , %ymm8 , %ymm8 + vmulps %ymm0 , %ymm10, %ymm10 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps (CO1, LDC), %ymm6,%ymm6 + vaddps (CO2), %ymm8,%ymm8 + vaddps (CO2, LDC), %ymm10,%ymm10 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm6 , (CO1, LDC) + vmovups %ymm8 , (CO2) + vmovups %ymm10, (CO2, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x4_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231PS_( %xmm6,%xmm3,%xmm0 ) + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231PS_( %xmm8,%xmm2,%xmm0 ) + VFMADD231PS_( %xmm10,%xmm3,%xmm0 ) + addq $ 4 , BI + addq $ 4 , %rax +.endm + +.macro SAVE4x4 + + vbroadcastss ALPHA, %xmm0 + + vmulps %xmm0 , %xmm4 , %xmm4 + vmulps %xmm0 , %xmm6 , %xmm6 + vmulps %xmm0 , %xmm8 , %xmm8 + vmulps %xmm0 , %xmm10, %xmm10 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + vaddps (CO1, LDC), %xmm6,%xmm6 + vaddps (CO2), %xmm8,%xmm8 + vaddps (CO2, LDC), %xmm10,%xmm10 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm6 , (CO1, LDC) + vmovups %xmm8 , (CO2) + vmovups %xmm10, (CO2, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x4_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) + VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) + VFMADD231SS_( %xmm7,%xmm3,%xmm1 ) + vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm9,%xmm2,%xmm1 ) + VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) + VFMADD231SS_( %xmm11,%xmm3,%xmm1 ) + addq $ 4 , BI + addq $ 2, %rax +.endm + +.macro SAVE2x4 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm5 , %xmm5 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm7 , %xmm7 + vmulss %xmm0 , %xmm8 , %xmm8 + vmulss %xmm0 , %xmm9 , %xmm9 + vmulss %xmm0 , %xmm10, %xmm10 + vmulss %xmm0 , %xmm11, %xmm11 + + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss 1 * SIZE(CO1), %xmm5,%xmm5 + + vaddss (CO1, LDC), %xmm6,%xmm6 + vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 + + vaddss (CO2), %xmm8,%xmm8 + vaddss 1 * SIZE(CO2), %xmm9,%xmm9 + + vaddss (CO2, LDC), %xmm10,%xmm10 + vaddss 1 * SIZE(CO2, LDC), %xmm11,%xmm11 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , 1 * SIZE(CO1) + + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm7 , 1 * SIZE(CO1, LDC) + + vmovss %xmm8 , (CO2) + vmovss %xmm9 , 1 * SIZE(CO2) + + vmovss %xmm10, (CO2, LDC) + vmovss %xmm11, 1 * SIZE(CO2, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x4_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + 
VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) + vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) + addq $ 4 , BI + addq $ 1, %rax +.endm + +.macro SAVE1x4 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm8 , %xmm8 + vmulss %xmm0 , %xmm10, %xmm10 + + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss (CO1, LDC), %xmm6,%xmm6 + vaddss (CO2), %xmm8,%xmm8 + vaddss (CO2, LDC), %xmm10,%xmm10 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm8 , (CO2) + vmovss %xmm10, (CO2, LDC) + +.endm + + +/*******************************************************************************************/ + +/******************************************************************************************* +* 2 lines of N +*******************************************************************************************/ + +.macro KERNEL16x2_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) + VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) + VFMADD231PS_( %ymm7,%ymm3,%ymm1 ) + addq $ 2 , BI + addq $ 16, %rax +.endm + +.macro SAVE16x2 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm5 , %ymm5 + vmulps %ymm0 , %ymm6 , %ymm6 + vmulps %ymm0 , %ymm7 , %ymm7 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps 8 * SIZE(CO1), %ymm5,%ymm5 + + vaddps (CO1, LDC), %ymm6,%ymm6 + vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , 8 * SIZE(CO1) + + vmovups %ymm6 , (CO1, LDC) + vmovups %ymm7 , 8 * SIZE(CO1, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL8x2_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) + addq $ 2 , BI + addq $ 8 , %rax +.endm + +.macro SAVE8x2 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm6 , %ymm6 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps (CO1, LDC), %ymm6,%ymm6 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm6 , (CO1, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x2_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231PS_( %xmm6,%xmm3,%xmm0 ) + addq $ 2 , BI + addq $ 4 , %rax +.endm + +.macro SAVE4x2 + + vbroadcastss ALPHA, %xmm0 + + vmulps %xmm0 , %xmm4 , %xmm4 + vmulps %xmm0 , %xmm6 , %xmm6 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + vaddps (CO1, LDC), %xmm6,%xmm6 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm6 , (CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x2_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) + VFMADD231SS_( 
%xmm6,%xmm3,%xmm0 ) + VFMADD231SS_( %xmm7,%xmm3,%xmm1 ) + addq $ 2 , BI + addq $ 2, %rax +.endm + +.macro SAVE2x2 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm5 , %xmm5 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm7 , %xmm7 + + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss 1 * SIZE(CO1), %xmm5,%xmm5 + + vaddss (CO1, LDC), %xmm6,%xmm6 + vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , 1 * SIZE(CO1) + + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm7 , 1 * SIZE(CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x2_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) + addq $ 2 , BI + addq $ 1, %rax +.endm + +.macro SAVE1x2 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm6 , %xmm6 + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss (CO1, LDC), %xmm6,%xmm6 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm6 , (CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +/******************************************************************************************* +* 1 line of N +*******************************************************************************************/ + +.macro KERNEL16x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) + addq $ 1 , BI + addq $ 16, %rax +.endm + +.macro SAVE16x1 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm5 , %ymm5 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps 8 * SIZE(CO1), %ymm5,%ymm5 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , 8 * SIZE(CO1) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL8x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + addq $ 1 , BI + addq $ 8 , %rax +.endm + +.macro SAVE8x1 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + +#endif + + vmovups %ymm4 , (CO1) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) + addq $ 1 , BI + addq $ 4 , %rax +.endm + +.macro SAVE4x1 + + vbroadcastss ALPHA, %xmm0 + + vmulps %xmm0 , %xmm4 , %xmm4 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + +#endif + + vmovups %xmm4 , (CO1) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x1_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) + addq $ 1 , BI + addq $ 2 , %rax +.endm + +.macro SAVE2x1 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm5 , %xmm5 + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss 1 * 
SIZE(CO1), %xmm5,%xmm5 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , 1 * SIZE(CO1) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x1_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + addq $ 1 , BI + addq $ 1 , %rax +.endm + +.macro SAVE1x1 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + +#endif + + vmovss %xmm4 , (CO1) + +.endm + + +/*******************************************************************************************/ + +#if !defined(TRMMKERNEL) + +/************************************************************************************* +* GEMM Kernel +*************************************************************************************/ + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + vmovsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovss %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $12, %rdi + divq %rdi // N / 12 + movq %rax, Ndiv6 // N / 12 + movq %rdx, Nmod6 // N % 12 + + movq Ndiv6, J + cmpq $0, J + je .L4_00 + ALIGN_4 + + +/*******************************************************************************************/ + +.L6_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + salq $2, %rax // 4 values of B + leaq (B, %rax,4), BO2 + movq BO2, B // next offset of B + movq K, %rax + + ALIGN_4 + + +.L6_02c: + + vmovups (BO1), %xmm0 + vmovsd (BO2), %xmm1 + vmovups %xmm0, (BO) + vmovsd %xmm1, 4*SIZE(BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO2 + addq $ 6*SIZE,BO + decq %rax + jnz .L6_02c + + +.L6_10: + movq C, CO1 + leaq (C, LDC, 2), CO2 + leaq (CO2, LDC, 1), CO2 // co2 = c + 3 * ldc + leaq (C, LDC, 4), C + leaq (C, LDC, 2), C // c = c + 6 * ldc + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L6_20 + + ALIGN_4 + +.L6_11: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L6_16 + + ALIGN_4 + +.L6_12: + + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + + je .L6_16 + + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + + je .L6_16 + + jmp .L6_12 + ALIGN_4 + +.L6_16: + movq K, %rax + 
+ andq $7, %rax # if (k & 1) + je .L6_19 + + ALIGN_4 + +.L6_17: + + KERNEL16x6_SUB + + jnz .L6_17 + ALIGN_4 + + +.L6_19: + + SAVE16x6 + + addq $16 * SIZE, CO1 # coffset += 16 + addq $16 * SIZE, CO2 # coffset += 16 + decq I # i -- + jg .L6_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L6_20: + // Test rest of M + + testq $15, M + jz .L6_60 // to next 6 lines of N + + testq $8, M + jz .L6_21pre + ALIGN_4 + +/**************************************************************************/ + +.L6_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_20_6 + + ALIGN_4 + +.L6_20_2: + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + je .L6_20_6 + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + je .L6_20_6 + + jmp .L6_20_2 + ALIGN_4 + +.L6_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_20_9 + + ALIGN_4 + +.L6_20_7: + + KERNEL8x6_SUB + + jnz .L6_20_7 + ALIGN_4 + + +.L6_20_9: + + SAVE8x6 + + addq $8 * SIZE, CO1 # coffset += 8 + addq $8 * SIZE, CO2 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L6_21pre: + + testq $4, M + jz .L6_30 + ALIGN_4 + +.L6_21: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_26 + + ALIGN_4 + +.L6_22: + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + je .L6_26 + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + je .L6_26 + + jmp .L6_22 + ALIGN_4 + +.L6_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_29 + + ALIGN_4 + +.L6_27: + + KERNEL4x6_SUB + + jnz .L6_27 + ALIGN_4 + + +.L6_29: + + SAVE4x6 + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + + +.L6_30: + testq $2, M + jz .L6_40 + + ALIGN_4 + +.L6_31: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_36 + + ALIGN_4 + +.L6_32: + + prefetcht0 A_PR1(AO) + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + je .L6_36 + + prefetcht0 A_PR1(AO) + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + je .L6_36 + + jmp .L6_32 + ALIGN_4 + +.L6_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_39 + + ALIGN_4 + +.L6_37: + + KERNEL2x6_SUB + + jnz .L6_37 + ALIGN_4 + + +.L6_39: + + SAVE2x6 + + addq $2 * SIZE, CO1 # coffset += 2 + addq $2 * SIZE, CO2 # coffset += 2 + ALIGN_4 + +.L6_40: + testq $1, M + jz .L6_60 // to next 4 lines of N + + ALIGN_4 + +.L6_41: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_46 + + ALIGN_4 + +.L6_42: + + 
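+	// 1x6 micro-kernel body: the K & ~7 part is covered by two groups of
+	// eight KERNEL1x6_SUB calls per pass; the K % 8 remainder is handled
+	// by the single-step loop at .L6_47 below.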
prefetcht0 A_PR1(AO) + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + je .L6_46 + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + je .L6_46 + + jmp .L6_42 + ALIGN_4 + +.L6_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_49 + + ALIGN_4 + +.L6_47: + + KERNEL1x6_SUB + + jnz .L6_47 + ALIGN_4 + + +.L6_49: + + SAVE1x6 + + addq $1 * SIZE, CO1 # coffset += 1 + addq $1 * SIZE, CO2 # coffset += 1 + ALIGN_4 + + + + + +.L6_60: + + +/*******************************************************************************************/ + + +.L7_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + salq $2, %rax // 4 values of B + leaq (B, %rax,4), BO2 + movq K, %rax + + ALIGN_4 + + +.L7_02c: + + vmovsd 2*SIZE(BO1), %xmm0 + vmovups (BO2), %xmm1 + vmovsd %xmm0, (BO) + vmovups %xmm1, 2*SIZE(BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO2 + addq $ 6*SIZE,BO + decq %rax + jnz .L7_02c + + movq BO2, B // next offset of B + +.L7_10: + movq C, CO1 + leaq (C, LDC, 2), CO2 + leaq (CO2, LDC, 1), CO2 // co2 = c + 3 * ldc + leaq (C, LDC, 4), C + leaq (C, LDC, 2), C // c = c + 6 * ldc + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L7_20 + + ALIGN_4 + +.L7_11: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L7_16 + + ALIGN_4 + +.L7_12: + + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + + je .L7_16 + + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + + je .L7_16 + + jmp .L7_12 + ALIGN_4 + +.L7_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_19 + + ALIGN_4 + +.L7_17: + + KERNEL16x6_SUB + + jnz .L7_17 + ALIGN_4 + + +.L7_19: + + SAVE16x6 + + addq $16 * SIZE, CO1 # coffset += 16 + addq $16 * SIZE, CO2 # coffset += 16 + decq I # i -- + jg .L7_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L7_20: + // Test rest of M + + testq $15, M + jz .L7_60 // to next 6 lines of N + + testq $8, M + jz .L7_21pre + ALIGN_4 + +/**************************************************************************/ + +.L7_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_20_6 + + ALIGN_4 + +.L7_20_2: + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + je .L7_20_6 + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + je .L7_20_6 + + jmp .L7_20_2 + ALIGN_4 + +.L7_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_20_9 + + ALIGN_4 + +.L7_20_7: + + KERNEL8x6_SUB + + jnz .L7_20_7 + ALIGN_4 + + +.L7_20_9: + + SAVE8x6 + + addq $8 * SIZE, CO1 # coffset += 8 + addq $8 * SIZE, CO2 # coffset += 8 + ALIGN_4 + + + 
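+	// The M & 8 tail of this second 6-column half of the N panel is done;
+	// the M & 4, M & 2 and M & 1 tails follow (.L7_21, .L7_31, .L7_41),
+	// mirroring the corresponding .L6_* blocks above.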
+/**************************************************************************/ + +.L7_21pre: + + testq $4, M + jz .L7_30 + ALIGN_4 + +.L7_21: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_26 + + ALIGN_4 + +.L7_22: + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + je .L7_26 + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + je .L7_26 + + jmp .L7_22 + ALIGN_4 + +.L7_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_29 + + ALIGN_4 + +.L7_27: + + KERNEL4x6_SUB + + jnz .L7_27 + ALIGN_4 + + +.L7_29: + + SAVE4x6 + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + + +.L7_30: + testq $2, M + jz .L7_40 + + ALIGN_4 + +.L7_31: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_36 + + ALIGN_4 + +.L7_32: + + prefetcht0 A_PR1(AO) + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + je .L7_36 + + prefetcht0 A_PR1(AO) + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + je .L7_36 + + jmp .L7_32 + ALIGN_4 + +.L7_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_39 + + ALIGN_4 + +.L7_37: + + KERNEL2x6_SUB + + jnz .L7_37 + ALIGN_4 + + +.L7_39: + + SAVE2x6 + + addq $2 * SIZE, CO1 # coffset += 2 + addq $2 * SIZE, CO2 # coffset += 2 + ALIGN_4 + +.L7_40: + testq $1, M + jz .L7_60 // to next 4 lines of N + + ALIGN_4 + +.L7_41: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_46 + + ALIGN_4 + +.L7_42: + + prefetcht0 A_PR1(AO) + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + je .L7_46 + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + je .L7_46 + + jmp .L7_42 + ALIGN_4 + +.L7_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_49 + + ALIGN_4 + +.L7_47: + + KERNEL1x6_SUB + + jnz .L7_47 + ALIGN_4 + + +.L7_49: + + SAVE1x6 + + addq $1 * SIZE, CO1 # coffset += 1 + addq $1 * SIZE, CO2 # coffset += 1 + ALIGN_4 + + + + + +.L7_60: + + decq J // j -- + jg .L6_01 // next 12 lines of N + + + + +/*******************************************************************************************/ +.L4_00: + + movq Nmod6, J + sarq $2, J // j = j / 4 + cmpq $ 0, J + je .L2_00 + ALIGN_4 + + +.L4_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L4_01b + ALIGN_4 + + +.L4_01a: + prefetcht0 512(BO1) + prefetchw 512(BO) + + vmovups (BO1), %xmm0 + vmovups 4*SIZE(BO1), %xmm1 + vmovups 8*SIZE(BO1), %xmm2 + vmovups 12*SIZE(BO1), %xmm3 + + vmovups %xmm0, (BO) + vmovups %xmm1, 4*SIZE(BO) + vmovups %xmm2, 8*SIZE(BO) + vmovups %xmm3,12*SIZE(BO) + + addq $ 16*SIZE,BO1 + addq $ 16*SIZE,BO + decq %rax + jnz .L4_01a + + +.L4_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L4_02d + ALIGN_4 + +.L4_02c: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO + decq %rax + jnz .L4_02c + +.L4_02d: 
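+	// The 4-wide B panel is now packed contiguously in BUFFER1:
+	// .L4_01a copies four K-steps (16 values) at a time and .L4_02c
+	// copies the K % 4 leftovers, four values per K step.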
+ + movq BO1, B // next offset of B + +.L4_10: + movq C, CO1 + leaq (C, LDC, 2), CO2 + leaq (C, LDC, 4), C // c += 4 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L4_20 + + ALIGN_4 + +.L4_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L4_16 + movq %rax, BI // Index for BO + leaq (,BI,4) , BI // BI = BI * 4 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_12: + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + je .L4_16 + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + je .L4_16 + + jmp .L4_12 + ALIGN_4 + +.L4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_19 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_17: + + KERNEL16x4_SUB + + jl .L4_17 + ALIGN_4 + + +.L4_19: + + SAVE16x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + addq $16 * SIZE, CO2 # coffset += 16 + decq I # i -- + jg .L4_11 + ALIGN_4 + 
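+	// End of the 16-row M loop for this 4-column panel; the M % 16
+	// leftovers (8, 4, 2 and 1 rows) are handled by the blocks below.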
+/************************************************************************** +* Rest of M +***************************************************************************/ +.L4_20: + // Test rest of M + + testq $15, M + jz .L4_60 // to next 3 lines of N + + testq $8, M + jz .L4_21pre + ALIGN_4 + +/**************************************************************************/ + +.L4_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_20_6 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_20_2: + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + je .L4_20_6 + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + je .L4_20_6 + + jmp .L4_20_2 + ALIGN_4 + +.L4_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_20_9 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_20_7: + + KERNEL8x4_SUB + + jl .L4_20_7 + ALIGN_4 + + +.L4_20_9: + + SAVE8x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + addq $8 * SIZE, CO2 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L4_21pre: + + testq $4, M + jz .L4_30 + ALIGN_4 + +.L4_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && 
defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_26 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_22: + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + je .L4_26 + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + je .L4_26 + + jmp .L4_22 + ALIGN_4 + +.L4_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_29 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_27: + + KERNEL4x4_SUB + + jl .L4_27 + ALIGN_4 + + +.L4_29: + + SAVE4x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + + +.L4_30: + testq $2, M + jz .L4_40 + + ALIGN_4 + +.L4_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_36 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_32: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + je .L4_36 + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + je .L4_36 + + jmp .L4_32 + ALIGN_4 + +.L4_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_39 + + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, 
SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_37: + + KERNEL2x4_SUB + + jl .L4_37 + ALIGN_4 + + +.L4_39: + + SAVE2x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + addq $2 * SIZE, CO2 # coffset += 2 + ALIGN_4 + +.L4_40: + testq $1, M + jz .L4_60 // to next 4 lines of N + + ALIGN_4 + +.L4_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L4_46 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_42: + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + je .L4_46 + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + je .L4_46 + + jmp .L4_42 + ALIGN_4 + +.L4_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_49 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_47: + + KERNEL1x4_SUB + + jl .L4_47 + ALIGN_4 + + +.L4_49: + + SAVE1x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + addq $1 * SIZE, CO2 # coffset += 1 + ALIGN_4 + + + + + +.L4_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $4, KK +#endif + + decq J // j -- + jg .L4_01 // next 4 lines of N + + + +/*******************************************************************************************/ +.L2_00: + + movq Nmod6, J + andq $3, J // j % 4 + je .L999 + + movq Nmod6, J + andq $2, J // j % 4 + je .L1_0 + +.L2_01: + + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L2_01b + ALIGN_4 + +.L2_01a: + + vmovsd (BO1), %xmm0 + vmovsd 2*SIZE(BO1), %xmm1 + vmovsd 4*SIZE(BO1), %xmm2 + vmovsd 6*SIZE(BO1), %xmm3 + + vmovsd %xmm0, (BO) + vmovsd %xmm1, 
2*SIZE(BO) + vmovsd %xmm2, 4*SIZE(BO) + vmovsd %xmm3, 6*SIZE(BO) + + addq $8*SIZE,BO1 + addq $8*SIZE,BO + decq %rax + jnz .L2_01a + + +.L2_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L2_02d + ALIGN_4 + +.L2_02c: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02c + +.L2_02d: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + je .L2_16 + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL16x2_SUB + + jl .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE16x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 2 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) 
|| \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: + + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + je .L2_20_6 + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB + + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + SAVE8x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 1 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + + 
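+	// 4x2 micro-kernel body: two groups of eight KERNEL4x2_SUB calls per
+	// pass over the K & ~7 part; the K % 8 tail is handled at .L2_27.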
KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_26 + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB + + jl .L2_27 + ALIGN_4 + + +.L2_29: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_36 + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + jl .L2_37 + ALIGN_4 + + +.L2_39: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && 
defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_46 + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + jl .L2_47 + ALIGN_4 + + +.L2_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovss (BO1), %xmm0 + vmovss %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * 
SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + je .L1_16 + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB + + jl .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE16x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + je .L1_20_6 + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + 
KERNEL8x1_SUB + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB + + jl .L1_20_7 + ALIGN_4 + + +.L1_20_9: + + SAVE8x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_26 + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB + + jl .L1_27 + ALIGN_4 + + +.L1_29: + + SAVE4x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, 
SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_36 + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + jl .L1_37 + ALIGN_4 + + +.L1_39: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_46 + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + jl .L1_47 + ALIGN_4 + + +.L1_49: + + SAVE1x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // 
Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + + +#else + +/************************************************************************************* +* TRMM Kernel +*************************************************************************************/ + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + vmovsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovss %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $4, %rdi + divq %rdi // N / 4 + movq %rax, Ndiv6 // N / 4 + movq %rdx, Nmod6 // N % 4 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + movq Ndiv6, J + cmpq $0, J + je .L2_0 + ALIGN_4 + +/*******************************************************************************************/ + +.L4_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L4_01b + ALIGN_4 + + +.L4_01a: + prefetcht0 512(BO1) + prefetchw 512(BO) + + vmovups (BO1), %xmm0 + vmovups 4*SIZE(BO1), %xmm1 + vmovups 8*SIZE(BO1), %xmm2 + vmovups 12*SIZE(BO1), %xmm3 + + vmovups %xmm0, (BO) + vmovups %xmm1, 4*SIZE(BO) + vmovups %xmm2, 8*SIZE(BO) + vmovups %xmm3,12*SIZE(BO) + + addq $ 16*SIZE,BO1 + addq $ 16*SIZE,BO + decq %rax + jnz .L4_01a + + +.L4_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L4_02d + ALIGN_4 + +.L4_02c: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO + decq %rax + jnz .L4_02c + +.L4_02d: + + movq BO1, B // next offset of B + +.L4_10: + movq C, CO1 + leaq (C, LDC, 2), CO2 + leaq (C, LDC, 4), C // c += 4 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L4_20 + + ALIGN_4 + +.L4_11: +#if 
!defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L4_16 + movq %rax, BI // Index for BO + leaq (,BI,4) , BI // BI = BI * 4 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_12: + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + je .L4_16 + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + je .L4_16 + + jmp .L4_12 + ALIGN_4 + +.L4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_19 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_17: + + KERNEL16x4_SUB + + jl .L4_17 + ALIGN_4 + + +.L4_19: + + SAVE16x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + addq $16 * SIZE, CO2 # coffset += 16 + decq I # i -- + jg .L4_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L4_20: + // Test rest of M + + testq $15, M + jz .L4_60 // to next 3 lines of N + + testq $8, M + jz .L4_21pre + ALIGN_4 + +/**************************************************************************/ + +.L4_20_1: +#if 
!defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_20_6 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_20_2: + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + je .L4_20_6 + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + je .L4_20_6 + + jmp .L4_20_2 + ALIGN_4 + +.L4_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_20_9 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_20_7: + + KERNEL8x4_SUB + + jl .L4_20_7 + ALIGN_4 + + +.L4_20_9: + + SAVE8x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + addq $8 * SIZE, CO2 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L4_21pre: + + testq $4, M + jz .L4_30 + ALIGN_4 + +.L4_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_26 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of 
values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_22: + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + je .L4_26 + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + je .L4_26 + + jmp .L4_22 + ALIGN_4 + +.L4_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_29 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_27: + + KERNEL4x4_SUB + + jl .L4_27 + ALIGN_4 + + +.L4_29: + + SAVE4x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + + +.L4_30: + testq $2, M + jz .L4_40 + + ALIGN_4 + +.L4_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_36 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_32: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + je .L4_36 + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + je .L4_36 + + jmp .L4_32 + ALIGN_4 + +.L4_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_39 + + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_37: + + KERNEL2x4_SUB + + jl .L4_37 + ALIGN_4 + + +.L4_39: + + SAVE2x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, 
SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + addq $2 * SIZE, CO2 # coffset += 2 + ALIGN_4 + +.L4_40: + testq $1, M + jz .L4_60 // to next 4 lines of N + + ALIGN_4 + +.L4_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L4_46 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_42: + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + je .L4_46 + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + je .L4_46 + + jmp .L4_42 + ALIGN_4 + +.L4_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_49 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_47: + + KERNEL1x4_SUB + + jl .L4_47 + ALIGN_4 + + +.L4_49: + + SAVE1x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + addq $1 * SIZE, CO2 # coffset += 1 + ALIGN_4 + + + + + +.L4_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $4, KK +#endif + + decq J // j -- + jg .L4_01 // next 4 lines of N + + + +/*******************************************************************************************/ +.L2_0: + + movq Nmod6, J + andq $3, J // j % 4 + je .L999 + + movq Nmod6, J + andq $2, J // j % 4 + je .L1_0 + +.L2_01: + + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L2_01b + ALIGN_4 + +.L2_01a: + + vmovsd (BO1), %xmm0 + vmovsd 2*SIZE(BO1), %xmm1 + vmovsd 4*SIZE(BO1), %xmm2 + vmovsd 6*SIZE(BO1), %xmm3 + + vmovsd %xmm0, (BO) + vmovsd %xmm1, 2*SIZE(BO) + vmovsd %xmm2, 4*SIZE(BO) + vmovsd %xmm3, 6*SIZE(BO) + + addq $8*SIZE,BO1 + addq $8*SIZE,BO + decq %rax + jnz .L2_01a + + +.L2_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L2_02d + ALIGN_4 + +.L2_02c: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02c + +.L2_02d: + + movq BO1, B // next offset of B + +.L2_10: + 
movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + je .L2_16 + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL16x2_SUB + + jl .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE16x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 2 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, 
%rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: + + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + je .L2_20_6 + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB + + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + SAVE8x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 1 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_26 + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + 
andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB + + jl .L2_27 + ALIGN_4 + + +.L2_29: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_36 + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + jl .L2_37 + ALIGN_4 + + +.L2_39: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq 
KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_46 + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + jl .L2_47 + ALIGN_4 + + +.L2_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovss (BO1), %xmm0 + vmovss %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of 
values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + je .L1_16 + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB + + jl .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE16x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + je .L1_20_6 + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB + + jl .L1_20_7 + ALIGN_4 + + 
+.L1_20_9: + + SAVE8x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_26 + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB + + jl .L1_27 + ALIGN_4 + + +.L1_29: + + SAVE4x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + 
movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_36 + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + jl .L1_37 + ALIGN_4 + + +.L1_39: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_46 + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + jl .L1_47 + ALIGN_4 + + +.L1_49: + + SAVE1x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + 
movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + +#endif + diff --git a/kernel/x86_64/strsm_kernel_8x4_haswell_R_common.h b/kernel/x86_64/strsm_kernel_8x4_haswell_R_common.h index 36b7aa1a3..970d63578 100644 --- a/kernel/x86_64/strsm_kernel_8x4_haswell_R_common.h +++ b/kernel/x86_64/strsm_kernel_8x4_haswell_R_common.h @@ -1,226 +1,226 @@ -/* r11 = m_counter, r12 = size_of_k_elements, r13 = kk, r14 = b_head, r15 = a_head */ -/* register i/o: %0 = a_ptr, %1 = b_ptr, %2 = c_ptr, %3 = c_tmp, %4 = ldc, %5 = k_counter */ -/* memory input: %6 = K, %7 = offset, %8 = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}, %9 = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}, %10 = M */ - -#define init_m8n4(c1,c2,c3,c4)\ - "vpxor %%ymm"#c1",%%ymm"#c1",%%ymm"#c1"; vpxor %%ymm"#c2",%%ymm"#c2",%%ymm"#c2";"\ - "vpxor %%ymm"#c3",%%ymm"#c3",%%ymm"#c3"; vpxor %%ymm"#c4",%%ymm"#c4",%%ymm"#c4";" -#define INIT_m8n4 init_m8n4(4,5,6,7) -#define INIT_m8n8 INIT_m8n4 init_m8n4(8,9,10,11) -#define INIT_m8n12 INIT_m8n8 init_m8n4(12,13,14,15) - -#define init_m4n4(c1,c2,c3,c4)\ - "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1"; vpxor %%xmm"#c2",%%xmm"#c2",%%xmm"#c2";"\ - "vpxor %%xmm"#c3",%%xmm"#c3",%%xmm"#c3"; vpxor %%xmm"#c4",%%xmm"#c4",%%xmm"#c4";" -#define INIT_m4n4 init_m4n4(4,5,6,7) -#define INIT_m4n8 INIT_m4n4 init_m4n4(8,9,10,11) -#define INIT_m4n12 INIT_m4n8 init_m4n4(12,13,14,15) - -#define init_m2n4(c1,c2)\ - "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1"; vpxor %%xmm"#c2",%%xmm"#c2",%%xmm"#c2";" -#define INIT_m2n4 init_m2n4(4,5) -#define INIT_m2n8 INIT_m2n4 init_m2n4(6,7) -#define INIT_m2n12 INIT_m2n8 init_m2n4(8,9) - -#define init_m1n4(c1) "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1";" -#define INIT_m1n4 init_m1n4(4) -#define INIT_m1n8 INIT_m1n4 init_m1n4(5) -#define INIT_m1n12 INIT_m1n8 init_m1n4(6) - -#define GEMM_KERNEL_k1m8n4 \ - "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2;"\ - "vbroadcastsd (%1),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm4; vfnmadd231ps %%ymm3,%%ymm2,%%ymm5;"\ - "vbroadcastsd 8(%1),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm6; vfnmadd231ps %%ymm3,%%ymm2,%%ymm7;" -#define GEMM_KERNEL_k1m8n8 GEMM_KERNEL_k1m8n4\ - "vbroadcastsd (%1,%%r12,4),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm8; vfnmadd231ps %%ymm3,%%ymm2,%%ymm9;"\ - "vbroadcastsd 8(%1,%%r12,4),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm10; vfnmadd231ps %%ymm3,%%ymm2,%%ymm11;" -#define GEMM_KERNEL_k1m8n12 GEMM_KERNEL_k1m8n8\ - "vbroadcastsd (%1,%%r12,8),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm12; vfnmadd231ps %%ymm3,%%ymm2,%%ymm13;"\ - "vbroadcastsd 8(%1,%%r12,8),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm14; vfnmadd231ps %%ymm3,%%ymm2,%%ymm15;" - -#define GEMM_KERNEL_k1m4n4 \ - "vmovsldup (%0),%%xmm1; vmovshdup (%0),%%xmm2;"\ - "vmovddup (%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm4; vfnmadd231ps %%xmm3,%%xmm2,%%xmm5;"\ - "vmovddup 8(%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm6; vfnmadd231ps %%xmm3,%%xmm2,%%xmm7;" -#define GEMM_KERNEL_k1m4n8 GEMM_KERNEL_k1m4n4\ - "vmovddup (%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm8; vfnmadd231ps %%xmm3,%%xmm2,%%xmm9;"\ - "vmovddup 8(%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm10; vfnmadd231ps %%xmm3,%%xmm2,%%xmm11;" -#define GEMM_KERNEL_k1m4n12 GEMM_KERNEL_k1m4n8\ - "vmovddup (%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm12; vfnmadd231ps %%xmm3,%%xmm2,%%xmm13;"\ - 
"vmovddup 8(%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm14; vfnmadd231ps %%xmm3,%%xmm2,%%xmm15;" - -#define GEMM_KERNEL_k1m2n4 \ - "vbroadcastss (%0),%%xmm1; vbroadcastss 4(%0),%%xmm2;"\ - "vmovups (%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm4; vfnmadd231ps %%xmm3,%%xmm2,%%xmm5;" -#define GEMM_KERNEL_k1m2n8 GEMM_KERNEL_k1m2n4\ - "vmovups (%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm6; vfnmadd231ps %%xmm3,%%xmm2,%%xmm7;" -#define GEMM_KERNEL_k1m2n12 GEMM_KERNEL_k1m2n8\ - "vmovups (%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm8; vfnmadd231ps %%xmm3,%%xmm2,%%xmm9;" - -#define GEMM_KERNEL_k1m1n4 "vbroadcastss (%0),%%xmm1; vfnmadd231ps (%1),%%xmm1,%%xmm4;" -#define GEMM_KERNEL_k1m1n8 GEMM_KERNEL_k1m1n4 "vfnmadd231ps (%1,%%r12,4),%%xmm1,%%xmm5;" -#define GEMM_KERNEL_k1m1n12 GEMM_KERNEL_k1m1n8 "vfnmadd231ps (%1,%%r12,8),%%xmm1,%%xmm6;" - -#define GEMM_SUM_REORDER_8x4(c1,c2,c3,c4,prefpos)\ - "vmovups (%3),%%ymm0; vmovups (%3,%4,1),%%ymm1; prefetcht1 "#prefpos"(%3); prefetcht1 "#prefpos"(%3,%4,1); leaq (%3,%4,2),%3;"\ - "vunpcklps %%ymm1,%%ymm0,%%ymm2; vunpckhps %%ymm1,%%ymm0,%%ymm3; vunpcklpd %%ymm3,%%ymm2,%%ymm0; vunpckhpd %%ymm3,%%ymm2,%%ymm1;"\ - "vaddps %%ymm0,%%ymm"#c1",%%ymm"#c1"; vaddps %%ymm1,%%ymm"#c2",%%ymm"#c2";"\ - "vmovups (%3),%%ymm0; vmovups (%3,%4,1),%%ymm1; prefetcht1 "#prefpos"(%3); prefetcht1 "#prefpos"(%3,%4,1); leaq (%3,%4,2),%3;"\ - "vunpcklps %%ymm1,%%ymm0,%%ymm2; vunpckhps %%ymm1,%%ymm0,%%ymm3; vunpcklpd %%ymm3,%%ymm2,%%ymm0; vunpckhpd %%ymm3,%%ymm2,%%ymm1;"\ - "vaddps %%ymm0,%%ymm"#c3",%%ymm"#c3"; vaddps %%ymm1,%%ymm"#c4",%%ymm"#c4";" - -#define GEMM_SUM_REORDER_4x4(c1,c2,c3,c4,co1,co2)\ - "vmovups (%3),%%xmm0; vmovups (%3,%4,1),%%xmm1; leaq (%3,%4,2),%3;"\ - "vunpcklps %%xmm1,%%xmm0,%%xmm2; vunpckhps %%xmm1,%%xmm0,%%xmm3;"\ - "vunpcklpd %%xmm"#c2",%%xmm"#c1",%%xmm0; vunpckhpd %%xmm"#c2",%%xmm"#c1",%%xmm1;"\ - "vaddps %%xmm0,%%xmm2,%%xmm"#c1"; vaddps %%xmm1,%%xmm3,%%xmm"#c2";"\ - "vmovups (%3),%%xmm0; vmovups (%3,%4,1),%%xmm1; leaq (%3,%4,2),%3;"\ - "vunpcklps %%xmm1,%%xmm0,%%xmm2; vunpckhps %%xmm1,%%xmm0,%%xmm3;"\ - "vunpcklpd %%xmm"#c4",%%xmm"#c3",%%xmm0; vunpckhpd %%xmm"#c4",%%xmm"#c3",%%xmm1;"\ - "vaddps %%xmm0,%%xmm2,%%xmm"#c3"; vaddps %%xmm1,%%xmm3,%%xmm"#c4";"\ - "vperm2f128 $2,%%ymm"#c1",%%ymm"#c2",%%ymm"#co1"; vperm2f128 $2,%%ymm"#c3",%%ymm"#c4",%%ymm"#co2";" - -#define GEMM_SUM_REORDER_2x4(c1,c2)\ - "vmovsd (%3),%%xmm0; vmovhpd (%3,%4,1),%%xmm0,%%xmm0; leaq (%3,%4,2),%3; vpermilps $216,%%xmm0,%%xmm0;"\ - "vmovsd (%3),%%xmm1; vmovhpd (%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3; vpermilps $216,%%xmm1,%%xmm1;"\ - "vunpcklpd %%xmm1,%%xmm0,%%xmm2; vaddps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ - "vunpckhpd %%xmm1,%%xmm0,%%xmm3; vaddps %%xmm3,%%xmm"#c2",%%xmm"#c2";"\ - -#define GEMM_SUM_REORDER_1x4(c1)\ - "vmovss (%3),%%xmm1; vinsertps $16,(%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3;"\ - "vinsertps $32,(%3),%%xmm1,%%xmm1; vinsertps $48,(%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3;"\ - "vaddps %%xmm"#c1",%%xmm1,%%xmm"#c1";" - -#define SOLVE_le_m4n2(b_off,c1,...)\ - "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\ - "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1";"\ - "vmovsldup %%ymm"#c1",%%ymm1;" - -#define SOLVE_le_m8n2(b_off,c1,c2,...)\ - "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\ - "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2";"\ - "vmovsldup %%ymm"#c1",%%ymm1; vmovsldup %%ymm"#c2",%%ymm2;" - -#define SOLVE_leri_m4n2(b_off,c1,...) 
SOLVE_le_m4n2(b_off,c1,__VA_ARGS__)\ - "vblendps $85,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" - -#define SOLVE_leri_m8n2(b_off,c1,c2,...) SOLVE_le_m8n2(b_off,c1,c2,__VA_ARGS__)\ - "vblendps $85,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" - -#define SOLVE_ri_m4n2(b_off,c1,...)\ - "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\ - "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1";"\ - "vmovshdup %%ymm"#c1",%%ymm1;" - -#define SOLVE_ri_m8n2(b_off,c1,c2,...)\ - "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\ - "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2";"\ - "vmovshdup %%ymm"#c1",%%ymm1; vmovshdup %%ymm"#c2",%%ymm2;" - -#define SOLVE_rile_m4n2(b_off,c1,...) SOLVE_ri_m4n2(b_off,c1,__VA_ARGS__)\ - "vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" - -#define SOLVE_rile_m8n2(b_off,c1,c2,...) SOLVE_ri_m8n2(b_off,c1,c2,__VA_ARGS__)\ - "vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" - -#define SOLVE_col1_rtol_m1n4(b_off,c1,...)\ - "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $14,%8,%%xmm0,%%xmm2;"\ - "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ - "vpermilps $0,%%xmm"#c1",%%xmm1;" - -#define SOLVE_col1_rtol_m2n4(b_off,c1,c2,...)\ - "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $14,%8,%%xmm0,%%xmm2;"\ - "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ - "vpermilps $0,%%xmm"#c1",%%xmm1; vpermilps $0,%%xmm"#c2",%%xmm2;" - -#define SOLVE_col1_ltor_m1n4(b_off,c1,...) SOLVE_col1_rtol_m1n4(b_off,c1,__VA_ARGS__)\ - "vblendps $1,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" - -#define SOLVE_col1_ltor_m2n4(b_off,c1,c2,...) SOLVE_col1_rtol_m2n4(b_off,c1,c2,__VA_ARGS__)\ - "vblendps $1,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" - -#define SOLVE_col2_mul_m1n4(b_off,c1,...)\ - "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $13,%8,%%xmm0,%%xmm2;"\ - "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ - "vpermilps $85,%%xmm"#c1",%%xmm1;" - -#define SOLVE_col2_mul_m2n4(b_off,c1,c2,...)\ - "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $13,%8,%%xmm0,%%xmm2;"\ - "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ - "vpermilps $85,%%xmm"#c1",%%xmm1; vpermilps $85,%%xmm"#c2",%%xmm2;" - -#define SOLVE_col2_rtol_m1n4(b_off,c1,...) SOLVE_col2_mul_m1n4(b_off,c1,__VA_ARGS__)\ - "vblendps $14,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" - -#define SOLVE_col2_rtol_m2n4(b_off,c1,c2,...) SOLVE_col2_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ - "vblendps $14,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" - -#define SOLVE_col2_ltor_m1n4(b_off,c1,...) SOLVE_col2_mul_m1n4(b_off,c1,__VA_ARGS__)\ - "vblendps $3,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" - -#define SOLVE_col2_ltor_m2n4(b_off,c1,c2,...) 
SOLVE_col2_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ - "vblendps $3,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" - -#define SOLVE_col3_mul_m1n4(b_off,c1,...)\ - "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $11,%8,%%xmm0,%%xmm2;"\ - "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ - "vpermilps $170,%%xmm"#c1",%%xmm1;" - -#define SOLVE_col3_mul_m2n4(b_off,c1,c2,...)\ - "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $11,%8,%%xmm0,%%xmm2;"\ - "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ - "vpermilps $170,%%xmm"#c1",%%xmm1; vpermilps $170,%%xmm"#c2",%%xmm2;" - -#define SOLVE_col3_rtol_m1n4(b_off,c1,...) SOLVE_col3_mul_m1n4(b_off,c1,__VA_ARGS__)\ - "vblendps $12,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" - -#define SOLVE_col3_rtol_m2n4(b_off,c1,c2,...) SOLVE_col3_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ - "vblendps $12,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" - -#define SOLVE_col3_ltor_m1n4(b_off,c1,...) SOLVE_col3_mul_m1n4(b_off,c1,__VA_ARGS__)\ - "vblendps $7,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" - -#define SOLVE_col3_ltor_m2n4(b_off,c1,c2,...) SOLVE_col3_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ - "vblendps $7,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" - -#define SOLVE_col4_ltor_m1n4(b_off,c1,...)\ - "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $7,%8,%%xmm0,%%xmm2;"\ - "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ - "vpermilps $255,%%xmm"#c1",%%xmm1;" - -#define SOLVE_col4_ltor_m2n4(b_off,c1,c2,...)\ - "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $7,%8,%%xmm0,%%xmm2;"\ - "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ - "vpermilps $255,%%xmm"#c1",%%xmm1; vpermilps $255,%%xmm"#c2",%%xmm2;" - -#define SOLVE_col4_rtol_m1n4(b_off,c1,...) SOLVE_col4_ltor_m1n4(b_off,c1,__VA_ARGS__)\ - "vblendps $8,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" - -#define SOLVE_col4_rtol_m2n4(b_off,c1,c2,...) SOLVE_col4_ltor_m2n4(b_off,c1,c2,__VA_ARGS__)\ - "vblendps $8,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" - -#define SUBTRACT_m4n2(b_off,c1,...) "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" - -#define SUBTRACT_m8n2(b_off,c1,c2,...) SUBTRACT_m4n2(b_off,c1,__VA_ARGS__) "vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" - -#define SUBTRACT_m1n4(b_off,c1,...) "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" - -#define SUBTRACT_m2n4(b_off,c1,c2,...) 
SUBTRACT_m1n4(b_off,c1,__VA_ARGS__) "vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" - -#define SAVE_SOLUTION_m8n2(c1,c2,a_off)\ - "vunpcklps %%ymm"#c2",%%ymm"#c1",%%ymm0; vunpckhps %%ymm"#c2",%%ymm"#c1",%%ymm1;"\ - "vunpcklpd %%ymm1,%%ymm0,%%ymm"#c1"; vunpckhpd %%ymm1,%%ymm0,%%ymm"#c2";"\ - "vmovups %%ymm"#c1","#a_off"(%0); vmovups %%ymm"#c2","#a_off"+32(%0);"\ - "vmovups %%ymm"#c1",(%3); vmovups %%ymm"#c2",(%3,%4,1); leaq (%3,%4,2),%3;" - -#define SAVE_SOLUTION_m4n2(c1,a_off)\ - "vpermilps $216,%%ymm"#c1",%%ymm"#c1"; vpermpd $216,%%ymm"#c1",%%ymm"#c1";"\ - "vmovups %%ymm"#c1","#a_off"(%0); vmovups %%xmm"#c1",(%3); vextractf128 $1,%%ymm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;" - -#define SAVE_SOLUTION_m2n4(c1,c2,a_off)\ - "vunpcklps %%xmm"#c2",%%xmm"#c1",%%xmm0; vmovups %%xmm0,"#a_off"(%0); vmovsd %%xmm0,(%3); vmovhpd %%xmm0,(%3,%4,1); leaq (%3,%4,2),%3;"\ - "vunpckhps %%xmm"#c2",%%xmm"#c1",%%xmm0; vmovups %%xmm0,"#a_off"+16(%0); vmovsd %%xmm0,(%3); vmovhpd %%xmm0,(%3,%4,1); leaq (%3,%4,2),%3;" - -#define SAVE_SOLUTION_m1n4(c1,a_off)\ - "vmovups %%xmm"#c1","#a_off"(%0); vmovss %%xmm"#c1",(%3); vextractps $1,%%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;"\ - "vextractps $2,%%xmm"#c1",(%3); vextractps $3,%%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;" +/* r11 = m_counter, r12 = size_of_k_elements, r13 = kk, r14 = b_head, r15 = a_head */ +/* register i/o: %0 = a_ptr, %1 = b_ptr, %2 = c_ptr, %3 = c_tmp, %4 = ldc, %5 = k_counter */ +/* memory input: %6 = K, %7 = offset, %8 = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}, %9 = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}, %10 = M */ + +#define init_m8n4(c1,c2,c3,c4)\ + "vpxor %%ymm"#c1",%%ymm"#c1",%%ymm"#c1"; vpxor %%ymm"#c2",%%ymm"#c2",%%ymm"#c2";"\ + "vpxor %%ymm"#c3",%%ymm"#c3",%%ymm"#c3"; vpxor %%ymm"#c4",%%ymm"#c4",%%ymm"#c4";" +#define INIT_m8n4 init_m8n4(4,5,6,7) +#define INIT_m8n8 INIT_m8n4 init_m8n4(8,9,10,11) +#define INIT_m8n12 INIT_m8n8 init_m8n4(12,13,14,15) + +#define init_m4n4(c1,c2,c3,c4)\ + "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1"; vpxor %%xmm"#c2",%%xmm"#c2",%%xmm"#c2";"\ + "vpxor %%xmm"#c3",%%xmm"#c3",%%xmm"#c3"; vpxor %%xmm"#c4",%%xmm"#c4",%%xmm"#c4";" +#define INIT_m4n4 init_m4n4(4,5,6,7) +#define INIT_m4n8 INIT_m4n4 init_m4n4(8,9,10,11) +#define INIT_m4n12 INIT_m4n8 init_m4n4(12,13,14,15) + +#define init_m2n4(c1,c2)\ + "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1"; vpxor %%xmm"#c2",%%xmm"#c2",%%xmm"#c2";" +#define INIT_m2n4 init_m2n4(4,5) +#define INIT_m2n8 INIT_m2n4 init_m2n4(6,7) +#define INIT_m2n12 INIT_m2n8 init_m2n4(8,9) + +#define init_m1n4(c1) "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1";" +#define INIT_m1n4 init_m1n4(4) +#define INIT_m1n8 INIT_m1n4 init_m1n4(5) +#define INIT_m1n12 INIT_m1n8 init_m1n4(6) + +#define GEMM_KERNEL_k1m8n4 \ + "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2;"\ + "vbroadcastsd (%1),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm4; vfnmadd231ps %%ymm3,%%ymm2,%%ymm5;"\ + "vbroadcastsd 8(%1),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm6; vfnmadd231ps %%ymm3,%%ymm2,%%ymm7;" +#define GEMM_KERNEL_k1m8n8 GEMM_KERNEL_k1m8n4\ + "vbroadcastsd (%1,%%r12,4),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm8; vfnmadd231ps %%ymm3,%%ymm2,%%ymm9;"\ + "vbroadcastsd 8(%1,%%r12,4),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm10; vfnmadd231ps %%ymm3,%%ymm2,%%ymm11;" +#define GEMM_KERNEL_k1m8n12 GEMM_KERNEL_k1m8n8\ + "vbroadcastsd (%1,%%r12,8),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm12; vfnmadd231ps %%ymm3,%%ymm2,%%ymm13;"\ + "vbroadcastsd 8(%1,%%r12,8),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm14; vfnmadd231ps %%ymm3,%%ymm2,%%ymm15;" + +#define GEMM_KERNEL_k1m4n4 \ + "vmovsldup (%0),%%xmm1; 
vmovshdup (%0),%%xmm2;"\ + "vmovddup (%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm4; vfnmadd231ps %%xmm3,%%xmm2,%%xmm5;"\ + "vmovddup 8(%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm6; vfnmadd231ps %%xmm3,%%xmm2,%%xmm7;" +#define GEMM_KERNEL_k1m4n8 GEMM_KERNEL_k1m4n4\ + "vmovddup (%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm8; vfnmadd231ps %%xmm3,%%xmm2,%%xmm9;"\ + "vmovddup 8(%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm10; vfnmadd231ps %%xmm3,%%xmm2,%%xmm11;" +#define GEMM_KERNEL_k1m4n12 GEMM_KERNEL_k1m4n8\ + "vmovddup (%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm12; vfnmadd231ps %%xmm3,%%xmm2,%%xmm13;"\ + "vmovddup 8(%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm14; vfnmadd231ps %%xmm3,%%xmm2,%%xmm15;" + +#define GEMM_KERNEL_k1m2n4 \ + "vbroadcastss (%0),%%xmm1; vbroadcastss 4(%0),%%xmm2;"\ + "vmovups (%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm4; vfnmadd231ps %%xmm3,%%xmm2,%%xmm5;" +#define GEMM_KERNEL_k1m2n8 GEMM_KERNEL_k1m2n4\ + "vmovups (%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm6; vfnmadd231ps %%xmm3,%%xmm2,%%xmm7;" +#define GEMM_KERNEL_k1m2n12 GEMM_KERNEL_k1m2n8\ + "vmovups (%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm8; vfnmadd231ps %%xmm3,%%xmm2,%%xmm9;" + +#define GEMM_KERNEL_k1m1n4 "vbroadcastss (%0),%%xmm1; vfnmadd231ps (%1),%%xmm1,%%xmm4;" +#define GEMM_KERNEL_k1m1n8 GEMM_KERNEL_k1m1n4 "vfnmadd231ps (%1,%%r12,4),%%xmm1,%%xmm5;" +#define GEMM_KERNEL_k1m1n12 GEMM_KERNEL_k1m1n8 "vfnmadd231ps (%1,%%r12,8),%%xmm1,%%xmm6;" + +#define GEMM_SUM_REORDER_8x4(c1,c2,c3,c4,prefpos)\ + "vmovups (%3),%%ymm0; vmovups (%3,%4,1),%%ymm1; prefetcht1 "#prefpos"(%3); prefetcht1 "#prefpos"(%3,%4,1); leaq (%3,%4,2),%3;"\ + "vunpcklps %%ymm1,%%ymm0,%%ymm2; vunpckhps %%ymm1,%%ymm0,%%ymm3; vunpcklpd %%ymm3,%%ymm2,%%ymm0; vunpckhpd %%ymm3,%%ymm2,%%ymm1;"\ + "vaddps %%ymm0,%%ymm"#c1",%%ymm"#c1"; vaddps %%ymm1,%%ymm"#c2",%%ymm"#c2";"\ + "vmovups (%3),%%ymm0; vmovups (%3,%4,1),%%ymm1; prefetcht1 "#prefpos"(%3); prefetcht1 "#prefpos"(%3,%4,1); leaq (%3,%4,2),%3;"\ + "vunpcklps %%ymm1,%%ymm0,%%ymm2; vunpckhps %%ymm1,%%ymm0,%%ymm3; vunpcklpd %%ymm3,%%ymm2,%%ymm0; vunpckhpd %%ymm3,%%ymm2,%%ymm1;"\ + "vaddps %%ymm0,%%ymm"#c3",%%ymm"#c3"; vaddps %%ymm1,%%ymm"#c4",%%ymm"#c4";" + +#define GEMM_SUM_REORDER_4x4(c1,c2,c3,c4,co1,co2)\ + "vmovups (%3),%%xmm0; vmovups (%3,%4,1),%%xmm1; leaq (%3,%4,2),%3;"\ + "vunpcklps %%xmm1,%%xmm0,%%xmm2; vunpckhps %%xmm1,%%xmm0,%%xmm3;"\ + "vunpcklpd %%xmm"#c2",%%xmm"#c1",%%xmm0; vunpckhpd %%xmm"#c2",%%xmm"#c1",%%xmm1;"\ + "vaddps %%xmm0,%%xmm2,%%xmm"#c1"; vaddps %%xmm1,%%xmm3,%%xmm"#c2";"\ + "vmovups (%3),%%xmm0; vmovups (%3,%4,1),%%xmm1; leaq (%3,%4,2),%3;"\ + "vunpcklps %%xmm1,%%xmm0,%%xmm2; vunpckhps %%xmm1,%%xmm0,%%xmm3;"\ + "vunpcklpd %%xmm"#c4",%%xmm"#c3",%%xmm0; vunpckhpd %%xmm"#c4",%%xmm"#c3",%%xmm1;"\ + "vaddps %%xmm0,%%xmm2,%%xmm"#c3"; vaddps %%xmm1,%%xmm3,%%xmm"#c4";"\ + "vperm2f128 $2,%%ymm"#c1",%%ymm"#c2",%%ymm"#co1"; vperm2f128 $2,%%ymm"#c3",%%ymm"#c4",%%ymm"#co2";" + +#define GEMM_SUM_REORDER_2x4(c1,c2)\ + "vmovsd (%3),%%xmm0; vmovhpd (%3,%4,1),%%xmm0,%%xmm0; leaq (%3,%4,2),%3; vpermilps $216,%%xmm0,%%xmm0;"\ + "vmovsd (%3),%%xmm1; vmovhpd (%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3; vpermilps $216,%%xmm1,%%xmm1;"\ + "vunpcklpd %%xmm1,%%xmm0,%%xmm2; vaddps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ + "vunpckhpd %%xmm1,%%xmm0,%%xmm3; vaddps %%xmm3,%%xmm"#c2",%%xmm"#c2";"\ + +#define GEMM_SUM_REORDER_1x4(c1)\ + "vmovss (%3),%%xmm1; vinsertps $16,(%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3;"\ + "vinsertps $32,(%3),%%xmm1,%%xmm1; 
vinsertps $48,(%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3;"\ + "vaddps %%xmm"#c1",%%xmm1,%%xmm"#c1";" + +#define SOLVE_le_m4n2(b_off,c1,...)\ + "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\ + "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1";"\ + "vmovsldup %%ymm"#c1",%%ymm1;" + +#define SOLVE_le_m8n2(b_off,c1,c2,...)\ + "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\ + "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2";"\ + "vmovsldup %%ymm"#c1",%%ymm1; vmovsldup %%ymm"#c2",%%ymm2;" + +#define SOLVE_leri_m4n2(b_off,c1,...) SOLVE_le_m4n2(b_off,c1,__VA_ARGS__)\ + "vblendps $85,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" + +#define SOLVE_leri_m8n2(b_off,c1,c2,...) SOLVE_le_m8n2(b_off,c1,c2,__VA_ARGS__)\ + "vblendps $85,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" + +#define SOLVE_ri_m4n2(b_off,c1,...)\ + "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\ + "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1";"\ + "vmovshdup %%ymm"#c1",%%ymm1;" + +#define SOLVE_ri_m8n2(b_off,c1,c2,...)\ + "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\ + "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2";"\ + "vmovshdup %%ymm"#c1",%%ymm1; vmovshdup %%ymm"#c2",%%ymm2;" + +#define SOLVE_rile_m4n2(b_off,c1,...) SOLVE_ri_m4n2(b_off,c1,__VA_ARGS__)\ + "vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" + +#define SOLVE_rile_m8n2(b_off,c1,c2,...) SOLVE_ri_m8n2(b_off,c1,c2,__VA_ARGS__)\ + "vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" + +#define SOLVE_col1_rtol_m1n4(b_off,c1,...)\ + "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $14,%8,%%xmm0,%%xmm2;"\ + "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ + "vpermilps $0,%%xmm"#c1",%%xmm1;" + +#define SOLVE_col1_rtol_m2n4(b_off,c1,c2,...)\ + "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $14,%8,%%xmm0,%%xmm2;"\ + "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ + "vpermilps $0,%%xmm"#c1",%%xmm1; vpermilps $0,%%xmm"#c2",%%xmm2;" + +#define SOLVE_col1_ltor_m1n4(b_off,c1,...) SOLVE_col1_rtol_m1n4(b_off,c1,__VA_ARGS__)\ + "vblendps $1,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" + +#define SOLVE_col1_ltor_m2n4(b_off,c1,c2,...) SOLVE_col1_rtol_m2n4(b_off,c1,c2,__VA_ARGS__)\ + "vblendps $1,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" + +#define SOLVE_col2_mul_m1n4(b_off,c1,...)\ + "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $13,%8,%%xmm0,%%xmm2;"\ + "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ + "vpermilps $85,%%xmm"#c1",%%xmm1;" + +#define SOLVE_col2_mul_m2n4(b_off,c1,c2,...)\ + "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $13,%8,%%xmm0,%%xmm2;"\ + "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ + "vpermilps $85,%%xmm"#c1",%%xmm1; vpermilps $85,%%xmm"#c2",%%xmm2;" + +#define SOLVE_col2_rtol_m1n4(b_off,c1,...) SOLVE_col2_mul_m1n4(b_off,c1,__VA_ARGS__)\ + "vblendps $14,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" + +#define SOLVE_col2_rtol_m2n4(b_off,c1,c2,...) SOLVE_col2_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ + "vblendps $14,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" + +#define SOLVE_col2_ltor_m1n4(b_off,c1,...) 
SOLVE_col2_mul_m1n4(b_off,c1,__VA_ARGS__)\ + "vblendps $3,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" + +#define SOLVE_col2_ltor_m2n4(b_off,c1,c2,...) SOLVE_col2_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ + "vblendps $3,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" + +#define SOLVE_col3_mul_m1n4(b_off,c1,...)\ + "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $11,%8,%%xmm0,%%xmm2;"\ + "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ + "vpermilps $170,%%xmm"#c1",%%xmm1;" + +#define SOLVE_col3_mul_m2n4(b_off,c1,c2,...)\ + "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $11,%8,%%xmm0,%%xmm2;"\ + "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ + "vpermilps $170,%%xmm"#c1",%%xmm1; vpermilps $170,%%xmm"#c2",%%xmm2;" + +#define SOLVE_col3_rtol_m1n4(b_off,c1,...) SOLVE_col3_mul_m1n4(b_off,c1,__VA_ARGS__)\ + "vblendps $12,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" + +#define SOLVE_col3_rtol_m2n4(b_off,c1,c2,...) SOLVE_col3_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ + "vblendps $12,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" + +#define SOLVE_col3_ltor_m1n4(b_off,c1,...) SOLVE_col3_mul_m1n4(b_off,c1,__VA_ARGS__)\ + "vblendps $7,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" + +#define SOLVE_col3_ltor_m2n4(b_off,c1,c2,...) SOLVE_col3_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ + "vblendps $7,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" + +#define SOLVE_col4_ltor_m1n4(b_off,c1,...)\ + "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $7,%8,%%xmm0,%%xmm2;"\ + "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ + "vpermilps $255,%%xmm"#c1",%%xmm1;" + +#define SOLVE_col4_ltor_m2n4(b_off,c1,c2,...)\ + "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $7,%8,%%xmm0,%%xmm2;"\ + "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ + "vpermilps $255,%%xmm"#c1",%%xmm1; vpermilps $255,%%xmm"#c2",%%xmm2;" + +#define SOLVE_col4_rtol_m1n4(b_off,c1,...) SOLVE_col4_ltor_m1n4(b_off,c1,__VA_ARGS__)\ + "vblendps $8,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" + +#define SOLVE_col4_rtol_m2n4(b_off,c1,c2,...) SOLVE_col4_ltor_m2n4(b_off,c1,c2,__VA_ARGS__)\ + "vblendps $8,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" + +#define SUBTRACT_m4n2(b_off,c1,...) "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" + +#define SUBTRACT_m8n2(b_off,c1,c2,...) SUBTRACT_m4n2(b_off,c1,__VA_ARGS__) "vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" + +#define SUBTRACT_m1n4(b_off,c1,...) "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" + +#define SUBTRACT_m2n4(b_off,c1,c2,...) 
SUBTRACT_m1n4(b_off,c1,__VA_ARGS__) "vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" + +#define SAVE_SOLUTION_m8n2(c1,c2,a_off)\ + "vunpcklps %%ymm"#c2",%%ymm"#c1",%%ymm0; vunpckhps %%ymm"#c2",%%ymm"#c1",%%ymm1;"\ + "vunpcklpd %%ymm1,%%ymm0,%%ymm"#c1"; vunpckhpd %%ymm1,%%ymm0,%%ymm"#c2";"\ + "vmovups %%ymm"#c1","#a_off"(%0); vmovups %%ymm"#c2","#a_off"+32(%0);"\ + "vmovups %%ymm"#c1",(%3); vmovups %%ymm"#c2",(%3,%4,1); leaq (%3,%4,2),%3;" + +#define SAVE_SOLUTION_m4n2(c1,a_off)\ + "vpermilps $216,%%ymm"#c1",%%ymm"#c1"; vpermpd $216,%%ymm"#c1",%%ymm"#c1";"\ + "vmovups %%ymm"#c1","#a_off"(%0); vmovups %%xmm"#c1",(%3); vextractf128 $1,%%ymm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;" + +#define SAVE_SOLUTION_m2n4(c1,c2,a_off)\ + "vunpcklps %%xmm"#c2",%%xmm"#c1",%%xmm0; vmovups %%xmm0,"#a_off"(%0); vmovsd %%xmm0,(%3); vmovhpd %%xmm0,(%3,%4,1); leaq (%3,%4,2),%3;"\ + "vunpckhps %%xmm"#c2",%%xmm"#c1",%%xmm0; vmovups %%xmm0,"#a_off"+16(%0); vmovsd %%xmm0,(%3); vmovhpd %%xmm0,(%3,%4,1); leaq (%3,%4,2),%3;" + +#define SAVE_SOLUTION_m1n4(c1,a_off)\ + "vmovups %%xmm"#c1","#a_off"(%0); vmovss %%xmm"#c1",(%3); vextractps $1,%%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;"\ + "vextractps $2,%%xmm"#c1",(%3); vextractps $3,%%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;" diff --git a/kernel/x86_64/zgemm_kernel_2x2_bulldozer.S b/kernel/x86_64/zgemm_kernel_2x2_bulldozer.S index 94e2f6117..6c8b4c872 100644 --- a/kernel/x86_64/zgemm_kernel_2x2_bulldozer.S +++ b/kernel/x86_64/zgemm_kernel_2x2_bulldozer.S @@ -1,1404 +1,1404 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. 
*/ -/*********************************************************************/ - - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define BI %rbp -#define SP %rbx - -#define BO1 %rdi -#define BO2 %r15 - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 - -#else - -#define STACKSIZE 320 - -#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) -#define OLD_A 48 + STACKSIZE(%rsp) -#define OLD_B 56 + STACKSIZE(%rsp) -#define OLD_C 64 + STACKSIZE(%rsp) -#define OLD_LDC 72 + STACKSIZE(%rsp) -#define OLD_OFFSET 80 + STACKSIZE(%rsp) - -#endif - -#define L_BUFFER_SIZE 8192 - -#define Ndiv6 24(%rsp) -#define Nmod6 32(%rsp) -#define N 40(%rsp) -#define ALPHA_R 48(%rsp) -#define ALPHA_I 56(%rsp) -#define OFFSET 64(%rsp) -#define KK 72(%rsp) -#define KKK 80(%rsp) -#define BUFFER1 128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $0, 4096 * 4(%rsp);\ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) -#define VFMADD_R vfmaddpd -#define VFMADD_I vfmaddpd -#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) -#define VFMADD_R vfnmaddpd -#define VFMADD_I vfmaddpd -#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) -#define VFMADD_R vfmaddpd -#define VFMADD_I vfnmaddpd -#else -#define VFMADD_R vfnmaddpd -#define VFMADD_I vfnmaddpd -#endif - - -#define A_PR1 384 -#define B_PR1 192 - -#define KERNEL2x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - -#define KERNEL2x2_2(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - -#define KERNEL2x2_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 2 * SIZE(AO, %rax, 
SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - -#define KERNEL2x2_4(xx) \ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - addq $16, BI ;\ - addq $16, %rax ;\ - - -#define KERNEL2x2_SUB(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - addq $4, BI ;\ - addq $4, %rax ;\ - -/************************************************************************************************/ - -#define KERNEL1x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL1x2_2(xx) \ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL1x2_3(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL1x2_4(xx) \ - vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - addq 
$16, BI ;\ - addq $8 , %rax ;\ - - -#define KERNEL1x2_SUB(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - addq $4, BI ;\ - addq $2, %rax ;\ - -/************************************************************************************************/ - -#define KERNEL2x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - -#define KERNEL2x1_2(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - -#define KERNEL2x1_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - -#define KERNEL2x1_4(xx) \ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - addq $8, BI ;\ - addq $16, %rax ;\ - - -#define KERNEL2x1_SUB(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - addq $2, BI ;\ - addq $4, %rax ;\ - - -/************************************************************************************************/ - -#define KERNEL1x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL1x1_2(xx) \ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL1x1_3(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL1x1_4(xx) \ - vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R 
%xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - addq $8, BI ;\ - addq $8, %rax ;\ - - -#define KERNEL1x1_SUB(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - addq $2, BI ;\ - addq $2, %rax ;\ - - -/************************************************************************************************/ - - - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - vmovups %xmm6, 64(%rsp) - vmovups %xmm7, 80(%rsp) - vmovups %xmm8, 96(%rsp) - vmovups %xmm9, 112(%rsp) - vmovups %xmm10, 128(%rsp) - vmovups %xmm11, 144(%rsp) - vmovups %xmm12, 160(%rsp) - vmovups %xmm13, 176(%rsp) - vmovups %xmm14, 192(%rsp) - vmovups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - vmovsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - vmovsd OLD_ALPHA_I, %xmm1 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - vmovsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA_R - vmovsd %xmm1, ALPHA_I - - salq $ZBASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $2, %rdi - divq %rdi // N / 2 - movq %rax, Ndiv6 // N / 2 - movq %rdx, Nmod6 // N % 2 - - - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - -.L2_0: - - movq Ndiv6, J - cmpq $0, J - je .L1_0 - ALIGN_4 - - - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_02b: - - vmovups (BO1), %xmm0 - vmovups 2 * SIZE(BO1), %xmm1 - vmovups %xmm0, (BO) - vmovups %xmm1, 2 * SIZE(BO) - addq $4*SIZE,BO1 - addq $4*SIZE,BO - decq %rax - jnz .L2_02b - -.L2_02c: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $8 * SIZE, AO - - movq M, I - sarq $1, I // i = (m >> 1) - je .L2_40 - - ALIGN_4 - -.L2_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K 
= K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_16 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL2x2_SUB(xxx) - jl .L2_17 - ALIGN_4 - - -.L2_19: - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm11, %xmm11, %xmm11 - vshufpd $0x01, %xmm13, %xmm13, %xmm13 - vshufpd $0x01, %xmm15, %xmm15, %xmm15 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - vaddsubpd %xmm13,%xmm12, %xmm12 - vaddsubpd %xmm15,%xmm14, %xmm14 - - vshufpd $0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $0x01, %xmm10, %xmm10, %xmm11 - vshufpd $0x01, %xmm12, %xmm12, %xmm13 - vshufpd $0x01, %xmm14, %xmm14, %xmm15 - -#else - vaddsubpd %xmm8, %xmm9 ,%xmm9 - vaddsubpd %xmm10, %xmm11,%xmm11 - vaddsubpd %xmm12, %xmm13,%xmm13 - vaddsubpd %xmm14, %xmm15,%xmm15 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm11, %xmm10 - vmovapd %xmm13, %xmm12 - vmovapd %xmm15, %xmm14 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm11, %xmm11, %xmm11 - vshufpd $0x01, %xmm13, %xmm13, %xmm13 - vshufpd $0x01, %xmm15, %xmm15, %xmm15 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm10, %xmm0, %xmm10 - vmulpd %xmm12, %xmm0, %xmm12 - vmulpd %xmm14, %xmm0, %xmm14 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm11, %xmm1, %xmm11 - vmulpd %xmm13, %xmm1, %xmm13 - vmulpd %xmm15, %xmm1, %xmm15 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - vaddsubpd %xmm13,%xmm12, %xmm12 - vaddsubpd %xmm15,%xmm14, %xmm14 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 - - vaddpd (CO1, LDC), %xmm10, %xmm10 - vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 2 * SIZE(CO1) - - vmovups %xmm10 , (CO1, LDC) - vmovups %xmm14 , 2 * SIZE(CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - 
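For reference, the .L2_19 epilogue above reduces the split accumulators with vshufpd/vaddsubpd: xmm8/xmm10/xmm12/xmm14 carry the partial products against the real part of B, xmm9/xmm11/xmm13/xmm15 those against the imaginary part, and the "swap high and low 64 bytes" comments refer to the two 64-bit halves of each xmm register. The scalar C sketch below is one reading of that sequence for the plain (NN-family) setting of VFMADD_R/VFMADD_I; the conjugating variants only change signs inside the K loop. The function name and argument layout are invented for illustration and are not part of this patch.

#include <complex.h>

/* Scalar model of one 2-element xmm pair.  Assumed layout, matching the
   VFMADD_R/VFMADD_I accumulation above:
     acc_br = { re(a)*re(b), im(a)*re(b) }   (xmm8-style register)
     acc_bi = { re(a)*im(b), im(a)*im(b) }   (xmm9-style register)          */
static double complex
combine_and_scale(const double acc_br[2], const double acc_bi[2],
                  double complex alpha, double complex c_old)
{
    /* vshufpd $0x01 swaps the two 64-bit halves of the imaginary-side
       accumulator; vaddsubpd then subtracts in the low lane and adds in
       the high lane, yielding the complex dot product t = sum(a*b).        */
    double t_re = acc_br[0] - acc_bi[1];
    double t_im = acc_br[1] + acc_bi[0];

    /* The second vshufpd/vmulpd/vaddsubpd pass applies the complex alpha:
       re = t_re*alpha_re - t_im*alpha_im,
       im = t_im*alpha_re + t_re*alpha_im.                                  */
    double r_re = t_re * creal(alpha) - t_im * cimag(alpha);
    double r_im = t_im * creal(alpha) + t_re * cimag(alpha);

    /* Non-TRMM path: vaddpd (CO1) accumulates into C before the store.     */
    return c_old + (r_re + r_im * I);
}

The same swap-and-addsub idiom thus appears twice per register: once to fold the two partial-product accumulators into one complex value, and once to apply alpha.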
-#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - decq I # i -- - jg .L2_11 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_46 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB(xxx) - jl .L2_47 - ALIGN_4 - - -.L2_49: - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm11, %xmm11, %xmm11 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - - vshufpd $0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $0x01, %xmm10, %xmm10, %xmm11 - -#else - vaddsubpd %xmm8, %xmm9, %xmm9 - vaddsubpd %xmm10,%xmm11, %xmm11 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm11, %xmm10 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm11, %xmm11, %xmm11 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm10, %xmm0, %xmm10 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm11, %xmm1, %xmm11 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - - - -#ifndef TRMMKERNEL - - vaddpd 
(CO1), %xmm8 , %xmm8 - vaddpd (CO1, LDC), %xmm10, %xmm10 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm10 , (CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - - - -.L2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK -#endif - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $8 * SIZE, AO - - movq M, I - sarq $1, I // i = (m >> 1) - je .L1_40 - - ALIGN_4 - -.L1_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_16 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq 
BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL2x1_SUB(xxx) - jl .L1_17 - ALIGN_4 - - -.L1_19: - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm13, %xmm13, %xmm13 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm13,%xmm12 , %xmm12 - - vshufpd $0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $0x01, %xmm12, %xmm12, %xmm13 - -#else - vaddsubpd %xmm8, %xmm9 , %xmm9 - vaddsubpd %xmm12,%xmm13, %xmm13 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm13, %xmm12 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm13, %xmm13, %xmm13 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm12, %xmm0, %xmm12 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm13, %xmm1, %xmm13 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm13, %xmm12, %xmm12 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 2 * SIZE(CO1) - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - decq I # i -- - jg .L1_11 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_46 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - jmp 
.L1_42 - ALIGN_4 - -.L1_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB(xxx) - jl .L1_47 - ALIGN_4 - - -.L1_49: - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8, %xmm8 - - vshufpd $0x01, %xmm8 , %xmm8, %xmm9 - -#else - vaddsubpd %xmm8, %xmm9, %xmm9 - - vmovapd %xmm9, %xmm8 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - - vaddsubpd %xmm9 ,%xmm8, %xmm8 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - -#endif - - vmovups %xmm8 , (CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - - -.L999: - vzeroupper - - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - vmovups 64(%rsp), %xmm6 - vmovups 80(%rsp), %xmm7 - vmovups 96(%rsp), %xmm8 - vmovups 112(%rsp), %xmm9 - vmovups 128(%rsp), %xmm10 - vmovups 144(%rsp), %xmm11 - vmovups 160(%rsp), %xmm12 - vmovups 176(%rsp), %xmm13 - vmovups 192(%rsp), %xmm14 - vmovups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 320 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 8192 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define OFFSET 64(%rsp) +#define KK 72(%rsp) +#define KKK 80(%rsp) +#define BUFFER1 128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define VFMADD_R vfmaddpd +#define VFMADD_I vfmaddpd +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define VFMADD_R vfnmaddpd +#define VFMADD_I vfmaddpd +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define VFMADD_R vfmaddpd +#define VFMADD_I vfnmaddpd +#else +#define VFMADD_R vfnmaddpd +#define VFMADD_I vfnmaddpd +#endif + + +#define A_PR1 384 +#define B_PR1 192 + +#define KERNEL2x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I 
%xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL2x2_2(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL2x2_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL2x2_4(xx) \ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + addq $16, BI ;\ + addq $16, %rax ;\ + + +#define KERNEL2x2_SUB(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + addq $4, BI ;\ + addq $4, %rax ;\ + +/************************************************************************************************/ + +#define KERNEL1x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_2(xx) \ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup -1 * SIZE(BO, BI, SIZE), 
%xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_3(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_4(xx) \ + vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $16, BI ;\ + addq $8 , %rax ;\ + + +#define KERNEL1x2_SUB(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $4, BI ;\ + addq $2, %rax ;\ + +/************************************************************************************************/ + +#define KERNEL2x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL2x1_2(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL2x1_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL2x1_4(xx) \ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + addq $8, BI ;\ + addq $16, %rax ;\ + + +#define KERNEL2x1_SUB(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + addq $2, BI ;\ + addq $4, %rax ;\ + + +/************************************************************************************************/ + +#define 
KERNEL1x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_2(xx) \ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_3(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_4(xx) \ + vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $8, BI ;\ + addq $8, %rax ;\ + + +#define KERNEL1x1_SUB(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $2, BI ;\ + addq $2, %rax ;\ + + +/************************************************************************************************/ + + + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + vmovsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + vmovsd OLD_ALPHA_I, %xmm1 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + vmovsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA_R + vmovsd %xmm1, ALPHA_I + + salq $ZBASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $2, %rdi + divq %rdi // N / 2 + movq %rax, Ndiv6 // N / 2 + movq %rdx, Nmod6 // N % 2 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + +.L2_0: + + movq Ndiv6, J + cmpq $0, J + je .L1_0 + ALIGN_4 + + + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovups (BO1), %xmm0 + vmovups 2 * SIZE(BO1), %xmm1 + vmovups %xmm0, (BO) + vmovups %xmm1, 2 * SIZE(BO) + addq $4*SIZE,BO1 + addq $4*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $8 * SIZE, AO + + movq M, I + sarq $1, I // i = (m >> 1) + je .L2_40 + + ALIGN_4 + 
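For orientation, the labels above and below (.L2_01/.L2_02b/.L2_10/.L2_11/.L2_40 and the .L1_* tail) lay out a conventional blocked driver: the outer loop walks N two columns at a time, packs those columns of B into the on-stack BUFFER1, runs the 2x2 micro-kernel over M, and handles the M%2 and N%2 edges with narrower kernels. The C outline below is only a reading aid under that interpretation; the function and variable names are invented here and are not OpenBLAS symbols.

/* Hypothetical outline of the driver structure; names are illustrative only. */
void zgemm_2x2_driver_outline(long m, long n, long k,
                              const double *A, const double *B,
                              double *C, long ldc)
{
    (void)A; (void)B; (void)k;           /* touched only by the real kernels */

    for (long j = 0; j < n / 2; j++) {   /* .L2_01: two columns per pass     */
        /* .L2_02b: pack the next two columns of B into BUFFER1              */
        for (long i = 0; i < m / 2; i++) {
            /* .L2_11: 2x2 micro-kernel, K unrolled by 8 (.L2_12),
               K % 8 remainder handled in .L2_17                             */
        }
        if (m & 1) {
            /* .L2_41: 1x2 edge kernel for the last row                      */
        }
        C += 2 * ldc;                    /* leaq (C, LDC, 2), C              */
    }
    if (n & 1) {                         /* Nmod6 = N % 2, .L1_* path        */
        for (long i = 0; i < m / 2; i++) {
            /* .L1_11: 2x1 kernel */
        }
        if (m & 1) {
            /* .L1_41: 1x1 kernel */
        }
    }
}

Despite what the names suggest, Ndiv6 and Nmod6 here simply hold N/2 and N%2, as computed by the divq by $2 in the prologue above.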
+.L2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL2x2_SUB(xxx) + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + vshufpd $0x01, %xmm15, %xmm15, %xmm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm13,%xmm12, %xmm12 + vaddsubpd %xmm15,%xmm14, %xmm14 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $0x01, %xmm10, %xmm10, %xmm11 + vshufpd $0x01, %xmm12, %xmm12, %xmm13 + vshufpd $0x01, %xmm14, %xmm14, %xmm15 + +#else + vaddsubpd %xmm8, %xmm9 ,%xmm9 + vaddsubpd %xmm10, %xmm11,%xmm11 + vaddsubpd %xmm12, %xmm13,%xmm13 + vaddsubpd %xmm14, %xmm15,%xmm15 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm11, %xmm10 + vmovapd %xmm13, %xmm12 + vmovapd %xmm15, %xmm14 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + vshufpd $0x01, %xmm15, %xmm15, %xmm15 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm10, %xmm0, %xmm10 + vmulpd %xmm12, %xmm0, %xmm12 + vmulpd %xmm14, %xmm0, %xmm14 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm11, %xmm1, %xmm11 + 
vmulpd %xmm13, %xmm1, %xmm13 + vmulpd %xmm15, %xmm1, %xmm15 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm13,%xmm12, %xmm12 + vaddsubpd %xmm15,%xmm14, %xmm14 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 + + vaddpd (CO1, LDC), %xmm10, %xmm10 + vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 2 * SIZE(CO1) + + vmovups %xmm10 , (CO1, LDC) + vmovups %xmm14 , 2 * SIZE(CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L2_11 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_46 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + 
vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $0x01, %xmm10, %xmm10, %xmm11 + +#else + vaddsubpd %xmm8, %xmm9, %xmm9 + vaddsubpd %xmm10,%xmm11, %xmm11 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm11, %xmm10 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm11, %xmm1, %xmm11 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd (CO1, LDC), %xmm10, %xmm10 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm10 , (CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $8 * SIZE, AO + + movq M, I + sarq $1, I // i = (m >> 1) + je .L1_40 + + ALIGN_4 + +.L1_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 
SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL2x1_SUB(xxx) + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm13,%xmm12 , %xmm12 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $0x01, %xmm12, %xmm12, %xmm13 + +#else + vaddsubpd %xmm8, %xmm9 , %xmm9 + vaddsubpd %xmm12,%xmm13, %xmm13 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm13, %xmm12 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm12, %xmm0, %xmm12 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm13, %xmm1, %xmm13 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm13, %xmm12, %xmm12 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 2 * SIZE(CO1) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L1_11 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef 
LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_46 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8, %xmm8 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + +#else + vaddsubpd %xmm8, %xmm9, %xmm9 + + vmovapd %xmm9, %xmm8 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + + vaddsubpd %xmm9 ,%xmm8, %xmm8 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + +#endif + + vmovups %xmm8 , (CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + + +.L999: + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm_kernel_2x2_piledriver.S b/kernel/x86_64/zgemm_kernel_2x2_piledriver.S index 848b6f237..bffe5439d 100644 --- a/kernel/x86_64/zgemm_kernel_2x2_piledriver.S +++ b/kernel/x86_64/zgemm_kernel_2x2_piledriver.S @@ -1,1429 +1,1429 @@ -/*************************************************************************** -Copyright (c) 2013, The OpenBLAS Project -All rights reserved. 
-Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -/********************************************************************* -* -* 2014/06/28 Saar -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* -* -* 2013/10/30 Saar -* -* Parameter: -* UNROLL_M 2 -* UNROLL_N 2 -* ZGEMM_P 384 -* ZGEMM_Q 168 -* A_PR1 512 -* B_PR1 256 -* -* Performance at m x n on AMD 8320 (ACML-Version: 5.3.1): -* -* 3456x3456 82.4 GFLOPS with 8 threads on 4 modules (ACML: 76.3 ) (BULLDOZER: 81.0 ) -* 3456x3456 79.9 GFLOPS with 4 threads on 4 modules (ACML: 69.9 ) (BULLDOZER: 74.6 ) -* 3456x3456 40.4 GFLOPS with 2 threads on 2 modules (ACML: 35.8 ) (BULLDOZER: 37.9 ) -* 3456x3456 20.3 GFLOPS with 1 threads on 1 modules (ACML: 18.1 ) (BULLDOZER: 19.2 ) -* -* Performance at m x n on AMD 6380 (ACML-Version: 5.3.1): -* -* 6912x6912 227.5 GFLOPS with 32 threads on 16 modules (ACML: 166.3 ) (BULLDOZER: 228.5 ) -* 6912x6912 211.6 GFLOPS with 16 threads on 16 modules (ACML: 169.5 ) (BULLDOZER: 204.3 ) -* 6912x6912 123.5 GFLOPS with 8 threads on 8 modules (ACML: 92.7 ) (BULLDOZER: 117.0 ) -* 3456x3456 64.1 GFLOPS with 4 threads on 4 modules (ACML: 49.1 ) (BULLDOZER: 61.7 ) -* 3456x3456 33.4 GFLOPS with 2 threads on 2 modules (ACML: 28.1 ) (BULLDOZER: 30.9 ) -* 3456x3456 17.0 GFLOPS with 1 threads on 1 modules (ACML: 15.2 ) (BULLDOZER: 15.7 ) -* -*********************************************************************/ - - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define BI %rbp -#define SP %rbx - -#define BO1 %rdi -#define BO2 %r15 - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 - -#else - -#define STACKSIZE 320 - -#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) -#define OLD_A 48 + STACKSIZE(%rsp) -#define OLD_B 56 + STACKSIZE(%rsp) -#define OLD_C 64 + STACKSIZE(%rsp) -#define OLD_LDC 72 + STACKSIZE(%rsp) -#define OLD_OFFSET 80 + STACKSIZE(%rsp) - -#endif - 
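The register #defines and the WINDOWS_ABI argument offsets in the block above pin the kernel's parameters to fixed registers and stack slots. For orientation only, a hedged sketch of the C-level entry point this assembly implements, assuming the usual OpenBLAS gemm/trmm kernel interface (the symbol name, BLASLONG width and FLOAT type come from common.h and the build configuration, not from this file):

    /* Sketch only: the assembly above is the body of a function shaped like this. */
    typedef long   BLASLONG;   /* assumption: LP64 build */
    typedef double FLOAT;      /* zgemm kernel: double-precision real/imaginary parts */

    int zgemm_kernel_2x2(      /* the real symbol is produced by the CNAME macro */
            BLASLONG m, BLASLONG n, BLASLONG k,
            FLOAT alpha_r, FLOAT alpha_i,
            FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc
    #ifdef TRMMKERNEL
            , BLASLONG offset  /* triangular-update offset, kept in OFFSET/KK above */
    #endif
            );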
-#define L_BUFFER_SIZE 256*8*4 - -#define Ndiv6 24(%rsp) -#define Nmod6 32(%rsp) -#define N 40(%rsp) -#define ALPHA_R 48(%rsp) -#define ALPHA_I 56(%rsp) -#define OFFSET 64(%rsp) -#define KK 72(%rsp) -#define KKK 80(%rsp) -#define BUFFER1 128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $0, 4096 * 4(%rsp);\ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) -#define VFMADD_R vfmaddpd -#define VFMADD_I vfmaddpd -#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) -#define VFMADD_R vfnmaddpd -#define VFMADD_I vfmaddpd -#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) -#define VFMADD_R vfmaddpd -#define VFMADD_I vfnmaddpd -#else -#define VFMADD_R vfnmaddpd -#define VFMADD_I vfnmaddpd -#endif - - -#define A_PR1 512 -#define B_PR1 256 - -#define KERNEL2x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - -#define KERNEL2x2_2(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - -#define KERNEL2x2_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - -#define KERNEL2x2_4(xx) \ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I 
%xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - addq $16, BI ;\ - addq $16, %rax ;\ - - -#define KERNEL2x2_SUB(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - addq $4, BI ;\ - addq $4, %rax ;\ - -/************************************************************************************************/ - -#define KERNEL1x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL1x2_2(xx) \ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL1x2_3(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL1x2_4(xx) \ - vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - addq $16, BI ;\ - addq $8 , %rax ;\ - - -#define KERNEL1x2_SUB(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - addq $4, BI ;\ - addq $2, %rax ;\ - -/************************************************************************************************/ - -#define KERNEL2x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R 
%xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - -#define KERNEL2x1_2(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - -#define KERNEL2x1_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - -#define KERNEL2x1_4(xx) \ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - addq $8, BI ;\ - addq $16, %rax ;\ - - -#define KERNEL2x1_SUB(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - addq $2, BI ;\ - addq $4, %rax ;\ - - -/************************************************************************************************/ - -#define KERNEL1x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL1x1_2(xx) \ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL1x1_3(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL1x1_4(xx) \ - vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - addq $8, BI ;\ - addq $8, %rax ;\ - - -#define KERNEL1x1_SUB(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - addq $2, BI ;\ - addq $2, %rax ;\ - - -/************************************************************************************************/ - - - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI 
- movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - vmovups %xmm6, 64(%rsp) - vmovups %xmm7, 80(%rsp) - vmovups %xmm8, 96(%rsp) - vmovups %xmm9, 112(%rsp) - vmovups %xmm10, 128(%rsp) - vmovups %xmm11, 144(%rsp) - vmovups %xmm12, 160(%rsp) - vmovups %xmm13, 176(%rsp) - vmovups %xmm14, 192(%rsp) - vmovups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - vmovsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - vmovsd OLD_ALPHA_I, %xmm1 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - vmovsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA_R - vmovsd %xmm1, ALPHA_I - - salq $ZBASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $2, %rdi - divq %rdi // N / 2 - movq %rax, Ndiv6 // N / 2 - movq %rdx, Nmod6 // N % 2 - - - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - -.L2_0: - - movq Ndiv6, J - cmpq $0, J - je .L1_0 - ALIGN_4 - - - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_02b: - - vmovups (BO1), %xmm0 - vmovups 2 * SIZE(BO1), %xmm1 - vmovups %xmm0, (BO) - vmovups %xmm1, 2 * SIZE(BO) - addq $4*SIZE,BO1 - addq $4*SIZE,BO - decq %rax - jnz .L2_02b - -.L2_02c: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $8 * SIZE, AO - - movq M, I - sarq $1, I // i = (m >> 1) - je .L2_40 - - ALIGN_4 - -.L2_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_16 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - 
KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL2x2_SUB(xxx) - jl .L2_17 - ALIGN_4 - - -.L2_19: - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm11, %xmm11, %xmm11 - vshufpd $0x01, %xmm13, %xmm13, %xmm13 - vshufpd $0x01, %xmm15, %xmm15, %xmm15 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - vaddsubpd %xmm13,%xmm12, %xmm12 - vaddsubpd %xmm15,%xmm14, %xmm14 - - vshufpd $0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $0x01, %xmm10, %xmm10, %xmm11 - vshufpd $0x01, %xmm12, %xmm12, %xmm13 - vshufpd $0x01, %xmm14, %xmm14, %xmm15 - -#else - vaddsubpd %xmm8, %xmm9 ,%xmm9 - vaddsubpd %xmm10, %xmm11,%xmm11 - vaddsubpd %xmm12, %xmm13,%xmm13 - vaddsubpd %xmm14, %xmm15,%xmm15 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm11, %xmm10 - vmovapd %xmm13, %xmm12 - vmovapd %xmm15, %xmm14 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm11, %xmm11, %xmm11 - vshufpd $0x01, %xmm13, %xmm13, %xmm13 - vshufpd $0x01, %xmm15, %xmm15, %xmm15 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm10, %xmm0, %xmm10 - vmulpd %xmm12, %xmm0, %xmm12 - vmulpd %xmm14, %xmm0, %xmm14 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm11, %xmm1, %xmm11 - vmulpd %xmm13, %xmm1, %xmm13 - vmulpd %xmm15, %xmm1, %xmm15 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - vaddsubpd %xmm13,%xmm12, %xmm12 - vaddsubpd %xmm15,%xmm14, %xmm14 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 - - vaddpd (CO1, LDC), %xmm10, %xmm10 - vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 2 * SIZE(CO1) - - vmovups %xmm10 , (CO1, LDC) - vmovups %xmm14 , 2 * SIZE(CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - decq I # i -- - jg .L2_11 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO - movq %rax, BI // Index for BO - leaq 
(,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_46 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB(xxx) - jl .L2_47 - ALIGN_4 - - -.L2_49: - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm11, %xmm11, %xmm11 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - - vshufpd $0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $0x01, %xmm10, %xmm10, %xmm11 - -#else - vaddsubpd %xmm8, %xmm9, %xmm9 - vaddsubpd %xmm10,%xmm11, %xmm11 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm11, %xmm10 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm11, %xmm11, %xmm11 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm10, %xmm0, %xmm10 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm11, %xmm1, %xmm11 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - vaddpd (CO1, LDC), %xmm10, %xmm10 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm10 , (CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - - - -.L2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK -#endif - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - 
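In the .L2_19 and .L2_49 epilogues above, each output element is carried as two accumulators: one built from the real part of B (xmm8/xmm10/xmm12/xmm14) and one from the imaginary part (xmm9/xmm11/xmm13/xmm15). vshufpd $0x01 exchanges the two 64-bit halves (the real and imaginary doubles) of an xmm register, and the following vaddsubpd folds the two accumulators into the complex product, with the sign pattern chosen by the conjugation #if block; a second shuffle, the two vmulpd by ALPHA_R/ALPHA_I and a final vaddsubpd then apply the complex alpha before the optional load/add of C. A minimal scalar C sketch of that arithmetic for the non-conjugated branch; the names and test values are invented for illustration:

    #include <stdio.h>

    typedef struct { double re, im; } zdouble;

    /* acc_r = (sum a_re*b_re, sum a_im*b_re), acc_i = (sum a_re*b_im, sum a_im*b_im):
     * the lane layout the kernel accumulates in xmm8 / xmm9. */
    static zdouble zgemm_epilogue_nn(const double acc_r[2], const double acc_i[2],
                                     double alpha_r, double alpha_i)
    {
        /* vshufpd $0x01 on acc_i followed by vaddsubpd: the complex product a*b */
        double prod_re = acc_r[0] - acc_i[1];   /* re*re - im*im */
        double prod_im = acc_r[1] + acc_i[0];   /* im*re + re*im */

        /* second vshufpd + the two vmulpd + vaddsubpd: multiply by alpha */
        zdouble out;
        out.re = prod_re * alpha_r - prod_im * alpha_i;
        out.im = prod_im * alpha_r + prod_re * alpha_i;
        return out;
    }

    int main(void)
    {
        /* a = 1+2i, b = 3+4i  =>  a*b = -5+10i; alpha = 1+0i leaves it unchanged */
        double acc_r[2] = { 1.0 * 3.0, 2.0 * 3.0 };
        double acc_i[2] = { 1.0 * 4.0, 2.0 * 4.0 };
        zdouble c = zgemm_epilogue_nn(acc_r, acc_i, 1.0, 0.0);
        printf("%g %g\n", c.re, c.im);          /* prints: -5 10 */
        return 0;
    }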
-.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $8 * SIZE, AO - - movq M, I - sarq $1, I // i = (m >> 1) - je .L1_40 - - ALIGN_4 - -.L1_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_16 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL2x1_SUB(xxx) - jl .L1_17 - ALIGN_4 - - -.L1_19: - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm13, %xmm13, %xmm13 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm13,%xmm12 , %xmm12 - - vshufpd $0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $0x01, %xmm12, %xmm12, %xmm13 - -#else - vaddsubpd %xmm8, %xmm9 , %xmm9 - vaddsubpd %xmm12,%xmm13, %xmm13 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm13, %xmm12 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - 
vshufpd $0x01, %xmm13, %xmm13, %xmm13 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm12, %xmm0, %xmm12 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm13, %xmm1, %xmm13 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm13, %xmm12, %xmm12 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 2 * SIZE(CO1) - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - decq I # i -- - jg .L1_11 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_46 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB(xxx) - jl .L1_47 - ALIGN_4 - - -.L1_49: - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8, %xmm8 - - vshufpd $0x01, 
%xmm8 , %xmm8, %xmm9 - -#else - vaddsubpd %xmm8, %xmm9, %xmm9 - - vmovapd %xmm9, %xmm8 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - - vaddsubpd %xmm9 ,%xmm8, %xmm8 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - -#endif - - vmovups %xmm8 , (CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - - -.L999: - vzeroupper - - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - vmovups 64(%rsp), %xmm6 - vmovups 80(%rsp), %xmm7 - vmovups 96(%rsp), %xmm8 - vmovups 112(%rsp), %xmm9 - vmovups 128(%rsp), %xmm10 - vmovups 144(%rsp), %xmm11 - vmovups 160(%rsp), %xmm12 - vmovups 176(%rsp), %xmm13 - vmovups 192(%rsp), %xmm14 - vmovups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/********************************************************************* +* +* 2014/06/28 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* +* 2013/10/30 Saar +* +* Parameter: +* UNROLL_M 2 +* UNROLL_N 2 +* ZGEMM_P 384 +* ZGEMM_Q 168 +* A_PR1 512 +* B_PR1 256 +* +* Performance at m x n on AMD 8320 (ACML-Version: 5.3.1): +* +* 3456x3456 82.4 GFLOPS with 8 threads on 4 modules (ACML: 76.3 ) (BULLDOZER: 81.0 ) +* 3456x3456 79.9 GFLOPS with 4 threads on 4 modules (ACML: 69.9 ) (BULLDOZER: 74.6 ) +* 3456x3456 40.4 GFLOPS with 2 threads on 2 modules (ACML: 35.8 ) (BULLDOZER: 37.9 ) +* 3456x3456 20.3 GFLOPS with 1 threads on 1 modules (ACML: 18.1 ) (BULLDOZER: 19.2 ) +* +* Performance at m x n on AMD 6380 (ACML-Version: 5.3.1): +* +* 6912x6912 227.5 GFLOPS with 32 threads on 16 modules (ACML: 166.3 ) (BULLDOZER: 228.5 ) +* 6912x6912 211.6 GFLOPS with 16 threads on 16 modules (ACML: 169.5 ) (BULLDOZER: 204.3 ) +* 6912x6912 123.5 GFLOPS with 8 threads on 8 modules (ACML: 92.7 ) (BULLDOZER: 117.0 ) +* 3456x3456 64.1 GFLOPS with 4 threads on 4 modules (ACML: 49.1 ) (BULLDOZER: 61.7 ) +* 3456x3456 33.4 GFLOPS with 2 threads on 2 modules (ACML: 28.1 ) (BULLDOZER: 30.9 ) +* 3456x3456 17.0 GFLOPS with 1 threads on 1 modules (ACML: 15.2 ) (BULLDOZER: 15.7 ) +* +*********************************************************************/ + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 320 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 256*8*4 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define OFFSET 64(%rsp) +#define KK 72(%rsp) +#define KKK 80(%rsp) +#define BUFFER1 128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define VFMADD_R vfmaddpd +#define VFMADD_I vfmaddpd +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define VFMADD_R vfnmaddpd +#define VFMADD_I vfmaddpd +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define VFMADD_R vfmaddpd +#define VFMADD_I vfnmaddpd +#else +#define VFMADD_R vfnmaddpd +#define VFMADD_I vfnmaddpd +#endif + + +#define A_PR1 512 +#define B_PR1 256 + +#define KERNEL2x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -8 * SIZE(BO, BI, SIZE), 
%xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL2x2_2(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL2x2_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL2x2_4(xx) \ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + addq $16, BI ;\ + addq $16, %rax ;\ + + +#define KERNEL2x2_SUB(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + addq $4, BI ;\ + addq $4, %rax ;\ + +/************************************************************************************************/ + +#define KERNEL1x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup 
-5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_2(xx) \ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_3(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_4(xx) \ + vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $16, BI ;\ + addq $8 , %rax ;\ + + +#define KERNEL1x2_SUB(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $4, BI ;\ + addq $2, %rax ;\ + +/************************************************************************************************/ + +#define KERNEL2x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL2x1_2(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL2x1_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL2x1_4(xx) \ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + addq $8, BI ;\ + addq $16, %rax ;\ + + +#define KERNEL2x1_SUB(xx) \ + vmovups -8 * 
SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + addq $2, BI ;\ + addq $4, %rax ;\ + + +/************************************************************************************************/ + +#define KERNEL1x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_2(xx) \ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_3(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_4(xx) \ + vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $8, BI ;\ + addq $8, %rax ;\ + + +#define KERNEL1x1_SUB(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $2, BI ;\ + addq $2, %rax ;\ + + +/************************************************************************************************/ + + + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + vmovsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + vmovsd OLD_ALPHA_I, %xmm1 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + vmovsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA_R + vmovsd %xmm1, ALPHA_I + + salq $ZBASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $2, %rdi + divq %rdi // N / 2 + movq %rax, Ndiv6 // N / 2 + movq %rdx, Nmod6 // N % 2 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + +.L2_0: + + movq Ndiv6, J + cmpq $0, J + je .L1_0 + ALIGN_4 + + + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovups (BO1), 
%xmm0 + vmovups 2 * SIZE(BO1), %xmm1 + vmovups %xmm0, (BO) + vmovups %xmm1, 2 * SIZE(BO) + addq $4*SIZE,BO1 + addq $4*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $8 * SIZE, AO + + movq M, I + sarq $1, I // i = (m >> 1) + je .L2_40 + + ALIGN_4 + +.L2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL2x2_SUB(xxx) + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + vshufpd $0x01, %xmm15, %xmm15, %xmm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm13,%xmm12, %xmm12 + vaddsubpd %xmm15,%xmm14, %xmm14 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $0x01, %xmm10, %xmm10, %xmm11 + vshufpd $0x01, %xmm12, %xmm12, %xmm13 + vshufpd $0x01, %xmm14, %xmm14, %xmm15 + +#else + vaddsubpd %xmm8, %xmm9 ,%xmm9 + vaddsubpd %xmm10, %xmm11,%xmm11 + vaddsubpd %xmm12, %xmm13,%xmm13 + vaddsubpd %xmm14, %xmm15,%xmm15 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm11, %xmm10 + vmovapd %xmm13, %xmm12 + 
vmovapd %xmm15, %xmm14 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + vshufpd $0x01, %xmm15, %xmm15, %xmm15 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm10, %xmm0, %xmm10 + vmulpd %xmm12, %xmm0, %xmm12 + vmulpd %xmm14, %xmm0, %xmm14 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm11, %xmm1, %xmm11 + vmulpd %xmm13, %xmm1, %xmm13 + vmulpd %xmm15, %xmm1, %xmm15 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm13,%xmm12, %xmm12 + vaddsubpd %xmm15,%xmm14, %xmm14 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 + + vaddpd (CO1, LDC), %xmm10, %xmm10 + vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 2 * SIZE(CO1) + + vmovups %xmm10 , (CO1, LDC) + vmovups %xmm14 , 2 * SIZE(CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L2_11 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_46 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax 
+#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $0x01, %xmm10, %xmm10, %xmm11 + +#else + vaddsubpd %xmm8, %xmm9, %xmm9 + vaddsubpd %xmm10,%xmm11, %xmm11 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm11, %xmm10 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm11, %xmm1, %xmm11 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd (CO1, LDC), %xmm10, %xmm10 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm10 , (CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $8 * SIZE, AO + + movq M, I + sarq $1, I // i = (m >> 1) + je .L1_40 + + ALIGN_4 + +.L1_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || 
(!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL2x1_SUB(xxx) + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm13,%xmm12 , %xmm12 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $0x01, %xmm12, %xmm12, %xmm13 + +#else + vaddsubpd %xmm8, %xmm9 , %xmm9 + vaddsubpd %xmm12,%xmm13, %xmm13 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm13, %xmm12 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm12, %xmm0, %xmm12 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm13, %xmm1, %xmm13 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm13, %xmm12, %xmm12 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 2 * SIZE(CO1) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L1_11 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO 
// first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_46 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8, %xmm8 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + +#else + vaddsubpd %xmm8, %xmm9, %xmm9 + + vmovapd %xmm9, %xmm8 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + + vaddsubpd %xmm9 ,%xmm8, %xmm8 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + +#endif + + vmovups %xmm8 , (CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + + +.L999: + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff 
--git a/kernel/x86_64/zgemm_kernel_4x2_haswell.S b/kernel/x86_64/zgemm_kernel_4x2_haswell.S index f91bfa89b..29729b101 100644 --- a/kernel/x86_64/zgemm_kernel_4x2_haswell.S +++ b/kernel/x86_64/zgemm_kernel_4x2_haswell.S @@ -1,3881 +1,3881 @@ -/********************************************************************************* -Copyright (c) 2013, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-**********************************************************************************/ - -/******************************************************************************** -* 2014/07/28 Saar -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* -* 2013/10/28 Saar -* Parameter: -* ZGEMM_DEFAULT_UNROLL_N 2 -* ZGEMM_DEFAULT_UNROLL_M 4 -* ZGEMM_DEFAULT_P 256 -* ZGEMM_DEFAULT_Q 128 -* A_PR1 512 -* B_PR1 512 -* -* 2014/07/28 Saar -* Performance at 4608x4608x4608: -* 1 thread: 53 GFLOPS (SANDYBRIDGE: 29) (MKL: 53) -* 2 threads: 101 GFLOPS (SANDYBRIDGE: 59) (MKL: 100) -* 3 threads: 146 GFLOPS (SANDYBRIDGE: 86) (MKL: 138) -* 4 threads: 184 GFLOPS (SANDYBRIDGE: 108) (MKL: 172) -* -********************************************************************************/ - - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define BI %rbp -#define SP %rbx - -#define BO1 %rdi -#define BO2 %r15 - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 - -#else - -#define STACKSIZE 320 - -#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) -#define OLD_A 48 + STACKSIZE(%rsp) -#define OLD_B 56 + STACKSIZE(%rsp) -#define OLD_C 64 + STACKSIZE(%rsp) -#define OLD_LDC 72 + STACKSIZE(%rsp) -#define OLD_OFFSET 80 + STACKSIZE(%rsp) - -#endif - -#define L_BUFFER_SIZE 8192 - -#define Ndiv6 24(%rsp) -#define Nmod6 32(%rsp) -#define N 40(%rsp) -#define ALPHA_R 48(%rsp) -#define ALPHA_I 56(%rsp) -#define OFFSET 64(%rsp) -#define KK 72(%rsp) -#define KKK 80(%rsp) -#define BUFFER1 128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $ 0, 4096 * 4(%rsp);\ - movl $ 0, 4096 * 3(%rsp);\ - movl $ 0, 4096 * 2(%rsp);\ - movl $ 0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $ 0, 4096 * 3(%rsp);\ - movl $ 0, 4096 * 2(%rsp);\ - movl $ 0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $ 0, 4096 * 2(%rsp);\ - movl $ 0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $ 0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - - -#if defined(BULLDOZER) - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) - -#define VFMADDPD_R( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0 - -#define VFMADDPD_I( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0 - -#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) - -#define VFMADDPD_R( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0 - -#define VFMADDPD_I( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0 - -#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) - -#define VFMADDPD_R( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0 - -#define VFMADDPD_I( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0 - -#else - -#define VFMADDPD_R( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0 - -#define VFMADDPD_I( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0 - -#endif - -#else - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) - -#define VFMADDPD_R( y0,y1,y2 ) vfmadd231pd y1,y2,y0 - -#define VFMADDPD_I( y0,y1,y2 ) vfmadd231pd y1,y2,y0 - -#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) - -#define VFMADDPD_R( y0,y1,y2 ) vfnmadd231pd y1,y2,y0 - -#define VFMADDPD_I( y0,y1,y2 ) vfmadd231pd y1,y2,y0 - -#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) - -#define VFMADDPD_R( y0,y1,y2 ) vfmadd231pd y1,y2,y0 - -#define VFMADDPD_I( y0,y1,y2 ) vfnmadd231pd y1,y2,y0 - -#else - -#define VFMADDPD_R( y0,y1,y2 ) vfnmadd231pd 
y1,y2,y0 - -#define VFMADDPD_I( y0,y1,y2 ) vfnmadd231pd y1,y2,y0 - -#endif - -#endif - -#define A_PR1 512 -#define B_PR1 512 - - - -/***************************************************************************************************/ - -.macro KERNEL4x3_SUB - vmovups (AO), %ymm0 - vmovups 4 * SIZE(AO), %ymm1 - prefetcht0 A_PR1(AO) - - vbroadcastsd (BO), %ymm2 - vbroadcastsd 1 * SIZE(BO), %ymm3 - VFMADDPD_R( %ymm8 ,%ymm2,%ymm0 ) - VFMADDPD_R( %ymm12,%ymm2,%ymm1 ) - VFMADDPD_I( %ymm9 ,%ymm3,%ymm0 ) - VFMADDPD_I( %ymm13,%ymm3,%ymm1 ) - - vbroadcastsd 2 * SIZE(BO), %ymm2 - vbroadcastsd 3 * SIZE(BO), %ymm3 - VFMADDPD_R( %ymm10,%ymm2,%ymm0 ) - VFMADDPD_R( %ymm14,%ymm2,%ymm1 ) - VFMADDPD_I( %ymm11,%ymm3,%ymm0 ) - VFMADDPD_I( %ymm15,%ymm3,%ymm1 ) - - vbroadcastsd 4 * SIZE(BO), %ymm2 - vbroadcastsd 5 * SIZE(BO), %ymm3 - VFMADDPD_R( %ymm4 ,%ymm2,%ymm0 ) - VFMADDPD_R( %ymm6 ,%ymm2,%ymm1 ) - VFMADDPD_I( %ymm5 ,%ymm3,%ymm0 ) - VFMADDPD_I( %ymm7 ,%ymm3,%ymm1 ) - - addq $ 6*SIZE, BO - addq $ 8*SIZE, AO - decq %rax -.endm - -.macro SAVE4x3 - - vbroadcastsd ALPHA_R, %ymm0 - vbroadcastsd ALPHA_I, %ymm1 - - // swap high and low 8 bytes - vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 - vshufpd $ 0x05, %ymm11, %ymm11, %ymm11 - vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 - vshufpd $ 0x05, %ymm15, %ymm15, %ymm15 - vshufpd $ 0x05, %ymm5 , %ymm5 , %ymm5 - vshufpd $ 0x05, %ymm7 , %ymm7 , %ymm7 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %ymm9, %ymm8 , %ymm8 - vaddsubpd %ymm11,%ymm10, %ymm10 - vaddsubpd %ymm13,%ymm12, %ymm12 - vaddsubpd %ymm15,%ymm14, %ymm14 - vaddsubpd %ymm5 ,%ymm4 , %ymm4 - vaddsubpd %ymm7 ,%ymm6 , %ymm6 - - vshufpd $ 0x05, %ymm8 , %ymm8 , %ymm9 - vshufpd $ 0x05, %ymm10, %ymm10, %ymm11 - vshufpd $ 0x05, %ymm12, %ymm12, %ymm13 - vshufpd $ 0x05, %ymm14, %ymm14, %ymm15 - vshufpd $ 0x05, %ymm4 , %ymm4 , %ymm5 - vshufpd $ 0x05, %ymm6 , %ymm6 , %ymm7 - -#else - vaddsubpd %ymm8, %ymm9 ,%ymm9 - vaddsubpd %ymm10, %ymm11,%ymm11 - vaddsubpd %ymm12, %ymm13,%ymm13 - vaddsubpd %ymm14, %ymm15,%ymm15 - vaddsubpd %ymm4 , %ymm5 ,%ymm5 - vaddsubpd %ymm6 , %ymm7 ,%ymm7 - - vmovapd %ymm9, %ymm8 - vmovapd %ymm11, %ymm10 - vmovapd %ymm13, %ymm12 - vmovapd %ymm15, %ymm14 - vmovapd %ymm5 , %ymm4 - vmovapd %ymm7 , %ymm6 - - // swap high and low 8 bytes - vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 - vshufpd $ 0x05, %ymm11, %ymm11, %ymm11 - vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 - vshufpd $ 0x05, %ymm15, %ymm15, %ymm15 - vshufpd $ 0x05, %ymm5 , %ymm5 , %ymm5 - vshufpd $ 0x05, %ymm7 , %ymm7 , %ymm7 - -#endif - - // multiply with ALPHA_R - vmulpd %ymm8 , %ymm0, %ymm8 - vmulpd %ymm10, %ymm0, %ymm10 - vmulpd %ymm12, %ymm0, %ymm12 - vmulpd %ymm14, %ymm0, %ymm14 - vmulpd %ymm4 , %ymm0, %ymm4 - vmulpd %ymm6 , %ymm0, %ymm6 - - // multiply with ALPHA_I - vmulpd %ymm9 , %ymm1, %ymm9 - vmulpd %ymm11, %ymm1, %ymm11 - vmulpd %ymm13, %ymm1, %ymm13 - vmulpd %ymm15, %ymm1, %ymm15 - vmulpd %ymm5 , %ymm1, %ymm5 - vmulpd %ymm7 , %ymm1, %ymm7 - - vaddsubpd %ymm9, %ymm8 , %ymm8 - vaddsubpd %ymm11,%ymm10, %ymm10 - vaddsubpd %ymm13,%ymm12, %ymm12 - vaddsubpd %ymm15,%ymm14, %ymm14 - vaddsubpd %ymm5 ,%ymm4 , %ymm4 - vaddsubpd %ymm7 ,%ymm6 , %ymm6 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %ymm8 , %ymm8 - vaddpd 4 * SIZE(CO1), %ymm12, %ymm12 - - vaddpd (CO1, LDC), %ymm10, %ymm10 - vaddpd 4 * SIZE(CO1, LDC), %ymm14, %ymm14 - - vaddpd (CO1, LDC,2), %ymm4 , %ymm4 - vaddpd 4 * SIZE(CO1, LDC,2), %ymm6 , %ymm6 -#endif - - vmovups %ymm8 , (CO1) - vmovups %ymm12 , 4 * SIZE(CO1) - 
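
In the ymm variants of these macros each 256-bit register carries two adjacent complex doubles, one per 128-bit lane. vshufpd with immediate 0x05 (binary 0101) swaps the real/imaginary pair independently inside each lane, and vaddsubpd subtracts in the even slots and adds in the odd slots, so the scalar combine sketched earlier runs on two output elements at once. A small model of the lane-wise behaviour (array names are illustrative):

/* One ymm register modelled as double v[4] = { re0, im0, re1, im1 }. */
static void swap_within_lanes(const double v[4], double out[4])
{
    /* vshufpd $0x05: swap the pair inside each 128-bit lane */
    out[0] = v[1];  out[1] = v[0];
    out[2] = v[3];  out[3] = v[2];
}

static void addsub_pd(const double a[4], const double b[4], double out[4])
{
    /* vaddsubpd: subtract in even positions, add in odd positions */
    out[0] = a[0] - b[0];  out[1] = a[1] + b[1];
    out[2] = a[2] - b[2];  out[3] = a[3] + b[3];
}
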
- vmovups %ymm10 , (CO1, LDC) - vmovups %ymm14 , 4 * SIZE(CO1, LDC) - - vmovups %ymm4 , (CO1, LDC, 2) - vmovups %ymm6 , 4 * SIZE(CO1, LDC, 2) - - prefetcht0 64(CO1) - prefetcht0 64(CO1, LDC) - -.endm - - - -/***************************************************************************************************/ - -.macro KERNEL2x3_SUB - vmovups (AO), %xmm0 - vmovups 2 * SIZE(AO), %xmm1 - vmovddup (BO), %xmm2 - vmovddup 1 * SIZE(BO), %xmm3 - - VFMADDPD_R( %xmm8 ,%xmm2,%xmm0 ) - VFMADDPD_R( %xmm12,%xmm2,%xmm1 ) - VFMADDPD_I( %xmm9 ,%xmm3,%xmm0 ) - VFMADDPD_I( %xmm13,%xmm3,%xmm1 ) - - vmovddup 2 * SIZE(BO), %xmm2 - vmovddup 3 * SIZE(BO), %xmm3 - VFMADDPD_R( %xmm10,%xmm2,%xmm0 ) - VFMADDPD_R( %xmm14,%xmm2,%xmm1 ) - VFMADDPD_I( %xmm11,%xmm3,%xmm0 ) - VFMADDPD_I( %xmm15,%xmm3,%xmm1 ) - - vmovddup 4 * SIZE(BO), %xmm2 - vmovddup 5 * SIZE(BO), %xmm3 - VFMADDPD_R( %xmm4 ,%xmm2,%xmm0 ) - VFMADDPD_R( %xmm6 ,%xmm2,%xmm1 ) - VFMADDPD_I( %xmm5 ,%xmm3,%xmm0 ) - VFMADDPD_I( %xmm7 ,%xmm3,%xmm1 ) - - addq $ 6*SIZE, BO - addq $ 4*SIZE, AO - decq %rax -.endm - -.macro SAVE2x3 - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 - vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 - vshufpd $ 0x01, %xmm15, %xmm15, %xmm15 - vshufpd $ 0x01, %xmm5 , %xmm5 , %xmm5 - vshufpd $ 0x01, %xmm7 , %xmm7 , %xmm7 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - vaddsubpd %xmm13,%xmm12, %xmm12 - vaddsubpd %xmm15,%xmm14, %xmm14 - vaddsubpd %xmm5, %xmm4 , %xmm4 - vaddsubpd %xmm7, %xmm6 , %xmm6 - - vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $ 0x01, %xmm10, %xmm10, %xmm11 - vshufpd $ 0x01, %xmm12, %xmm12, %xmm13 - vshufpd $ 0x01, %xmm14, %xmm14, %xmm15 - vshufpd $ 0x01, %xmm4 , %xmm4, %xmm5 - vshufpd $ 0x01, %xmm6 , %xmm6, %xmm7 - -#else - vaddsubpd %xmm8, %xmm9 ,%xmm9 - vaddsubpd %xmm10, %xmm11,%xmm11 - vaddsubpd %xmm12, %xmm13,%xmm13 - vaddsubpd %xmm14, %xmm15,%xmm15 - vaddsubpd %xmm4, %xmm5 ,%xmm5 - vaddsubpd %xmm6, %xmm7 ,%xmm7 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm11, %xmm10 - vmovapd %xmm13, %xmm12 - vmovapd %xmm15, %xmm14 - vmovapd %xmm5, %xmm4 - vmovapd %xmm7, %xmm6 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 - vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 - vshufpd $ 0x01, %xmm15, %xmm15, %xmm15 - vshufpd $ 0x01, %xmm5 , %xmm5, %xmm5 - vshufpd $ 0x01, %xmm7 , %xmm7, %xmm7 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm10, %xmm0, %xmm10 - vmulpd %xmm12, %xmm0, %xmm12 - vmulpd %xmm14, %xmm0, %xmm14 - vmulpd %xmm4 , %xmm0, %xmm4 - vmulpd %xmm6 , %xmm0, %xmm6 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm11, %xmm1, %xmm11 - vmulpd %xmm13, %xmm1, %xmm13 - vmulpd %xmm15, %xmm1, %xmm15 - vmulpd %xmm5 , %xmm1, %xmm5 - vmulpd %xmm7 , %xmm1, %xmm7 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - vaddsubpd %xmm13,%xmm12, %xmm12 - vaddsubpd %xmm15,%xmm14, %xmm14 - vaddsubpd %xmm5, %xmm4 , %xmm4 - vaddsubpd %xmm7, %xmm6 , %xmm6 - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 - - vaddpd (CO1, LDC), %xmm10, %xmm10 - vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14 - - vaddpd (CO1, LDC,2), %xmm4 , %xmm4 - vaddpd 2 * SIZE(CO1, LDC,2), %xmm6 , %xmm6 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 2 * 
SIZE(CO1) - - vmovups %xmm10 , (CO1, LDC) - vmovups %xmm14 , 2 * SIZE(CO1, LDC) - - vmovups %xmm4 , (CO1, LDC,2) - vmovups %xmm6 , 2 * SIZE(CO1, LDC,2) - -.endm - - -/************************************************************************************************/ - - -.macro KERNEL1x3_SUB - vmovups (AO), %xmm0 - vmovddup (BO), %xmm2 - vmovddup 1 * SIZE(BO), %xmm3 - - VFMADDPD_R( %xmm8,%xmm2,%xmm0 ) - VFMADDPD_I( %xmm9,%xmm3,%xmm0 ) - - vmovddup 2 * SIZE(BO), %xmm2 - vmovddup 3 * SIZE(BO), %xmm3 - VFMADDPD_R( %xmm10,%xmm2,%xmm0 ) - VFMADDPD_I( %xmm11,%xmm3,%xmm0 ) - - vmovddup 4 * SIZE(BO), %xmm2 - vmovddup 5 * SIZE(BO), %xmm3 - VFMADDPD_R( %xmm4 ,%xmm2,%xmm0 ) - VFMADDPD_I( %xmm5 ,%xmm3,%xmm0 ) - - addq $ 6*SIZE, BO - addq $ 2*SIZE, AO - decq %rax -.endm - -.macro SAVE1x3 - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 - vshufpd $ 0x01, %xmm5 , %xmm5, %xmm5 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - vaddsubpd %xmm5, %xmm4 , %xmm4 - - vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $ 0x01, %xmm10, %xmm10, %xmm11 - vshufpd $ 0x01, %xmm4 , %xmm4, %xmm5 - -#else - vaddsubpd %xmm8, %xmm9, %xmm9 - vaddsubpd %xmm10,%xmm11, %xmm11 - vaddsubpd %xmm4, %xmm5, %xmm5 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm11, %xmm10 - vmovapd %xmm5, %xmm4 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 - vshufpd $ 0x01, %xmm5 , %xmm5, %xmm5 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm10, %xmm0, %xmm10 - vmulpd %xmm4 , %xmm0, %xmm4 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm11, %xmm1, %xmm11 - vmulpd %xmm5 , %xmm1, %xmm5 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - vaddsubpd %xmm5, %xmm4 , %xmm4 - -#ifndef TRMMKERNEL - - vaddpd (CO1) , %xmm8 , %xmm8 - vaddpd (CO1, LDC) , %xmm10, %xmm10 - vaddpd (CO1, LDC,2) , %xmm4 , %xmm4 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm10 , (CO1, LDC) - vmovups %xmm4 , (CO1, LDC,2) - -.endm - - - - -/***************************************************************************************************/ - -.macro KERNEL4x2_SUB - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 - vmovups -4 * SIZE(AO, %rax, SIZE), %ymm1 - - vbroadcastsd -8 * SIZE(BO, BI, SIZE), %ymm4 - vbroadcastsd -7 * SIZE(BO, BI, SIZE), %ymm5 - VFMADDPD_R( %ymm8 ,%ymm4,%ymm0 ) - VFMADDPD_R( %ymm12,%ymm4,%ymm1 ) - vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm6 - VFMADDPD_I( %ymm9 ,%ymm5,%ymm0 ) - VFMADDPD_I( %ymm13,%ymm5,%ymm1 ) - vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm7 - VFMADDPD_R( %ymm10,%ymm6,%ymm0 ) - VFMADDPD_R( %ymm14,%ymm6,%ymm1 ) - VFMADDPD_I( %ymm11,%ymm7,%ymm0 ) - VFMADDPD_I( %ymm15,%ymm7,%ymm1 ) - - addq $ 4, BI - addq $ 8, %rax -.endm - -.macro SAVE4x2 - - vbroadcastsd ALPHA_R, %ymm0 - vbroadcastsd ALPHA_I, %ymm1 - - // swap high and low 8 bytes - vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 - vshufpd $ 0x05, %ymm11, %ymm11, %ymm11 - vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 - vshufpd $ 0x05, %ymm15, %ymm15, %ymm15 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %ymm9, %ymm8 , %ymm8 - vaddsubpd %ymm11,%ymm10, %ymm10 - vaddsubpd %ymm13,%ymm12, %ymm12 - vaddsubpd %ymm15,%ymm14, %ymm14 - - vshufpd $ 0x05, 
%ymm8 , %ymm8, %ymm9 - vshufpd $ 0x05, %ymm10, %ymm10, %ymm11 - vshufpd $ 0x05, %ymm12, %ymm12, %ymm13 - vshufpd $ 0x05, %ymm14, %ymm14, %ymm15 - -#else - vaddsubpd %ymm8, %ymm9 ,%ymm9 - vaddsubpd %ymm10, %ymm11,%ymm11 - vaddsubpd %ymm12, %ymm13,%ymm13 - vaddsubpd %ymm14, %ymm15,%ymm15 - - vmovapd %ymm9, %ymm8 - vmovapd %ymm11, %ymm10 - vmovapd %ymm13, %ymm12 - vmovapd %ymm15, %ymm14 - - // swap high and low 8 bytes - vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 - vshufpd $ 0x05, %ymm11, %ymm11, %ymm11 - vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 - vshufpd $ 0x05, %ymm15, %ymm15, %ymm15 - -#endif - - // multiply with ALPHA_R - vmulpd %ymm8 , %ymm0, %ymm8 - vmulpd %ymm10, %ymm0, %ymm10 - vmulpd %ymm12, %ymm0, %ymm12 - vmulpd %ymm14, %ymm0, %ymm14 - - // multiply with ALPHA_I - vmulpd %ymm9 , %ymm1, %ymm9 - vmulpd %ymm11, %ymm1, %ymm11 - vmulpd %ymm13, %ymm1, %ymm13 - vmulpd %ymm15, %ymm1, %ymm15 - - vaddsubpd %ymm9, %ymm8 , %ymm8 - vaddsubpd %ymm11,%ymm10, %ymm10 - vaddsubpd %ymm13,%ymm12, %ymm12 - vaddsubpd %ymm15,%ymm14, %ymm14 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %ymm8 , %ymm8 - vaddpd 4 * SIZE(CO1), %ymm12, %ymm12 - - vaddpd (CO1, LDC), %ymm10, %ymm10 - vaddpd 4 * SIZE(CO1, LDC), %ymm14, %ymm14 - -#endif - - vmovups %ymm8 , (CO1) - vmovups %ymm12 , 4 * SIZE(CO1) - - vmovups %ymm10 , (CO1, LDC) - vmovups %ymm14 , 4 * SIZE(CO1, LDC) - - prefetcht0 64(CO1) - prefetcht0 64(CO1, LDC) - -.endm - -/***************************************************************************************************/ - -.macro KERNEL2x2_SUB - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 - vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 - VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) - VFMADDPD_R( %xmm12,%xmm4,%xmm1 ) - vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) - VFMADDPD_I( %xmm13,%xmm5,%xmm1 ) - vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 - VFMADDPD_R( %xmm10,%xmm6,%xmm0 ) - VFMADDPD_R( %xmm14,%xmm6,%xmm1 ) - vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 - VFMADDPD_I( %xmm11,%xmm7,%xmm0 ) - VFMADDPD_I( %xmm15,%xmm7,%xmm1 ) - addq $ 4, BI - addq $ 4, %rax -.endm - -.macro SAVE2x2 - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 - vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 - vshufpd $ 0x01, %xmm15, %xmm15, %xmm15 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - vaddsubpd %xmm13,%xmm12, %xmm12 - vaddsubpd %xmm15,%xmm14, %xmm14 - - vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $ 0x01, %xmm10, %xmm10, %xmm11 - vshufpd $ 0x01, %xmm12, %xmm12, %xmm13 - vshufpd $ 0x01, %xmm14, %xmm14, %xmm15 - -#else - vaddsubpd %xmm8, %xmm9 ,%xmm9 - vaddsubpd %xmm10, %xmm11,%xmm11 - vaddsubpd %xmm12, %xmm13,%xmm13 - vaddsubpd %xmm14, %xmm15,%xmm15 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm11, %xmm10 - vmovapd %xmm13, %xmm12 - vmovapd %xmm15, %xmm14 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 - vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 - vshufpd $ 0x01, %xmm15, %xmm15, %xmm15 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm10, %xmm0, %xmm10 - vmulpd %xmm12, %xmm0, %xmm12 - vmulpd %xmm14, %xmm0, %xmm14 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm11, %xmm1, %xmm11 - vmulpd %xmm13, %xmm1, %xmm13 - vmulpd %xmm15, %xmm1, 
%xmm15 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - vaddsubpd %xmm13,%xmm12, %xmm12 - vaddsubpd %xmm15,%xmm14, %xmm14 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 - - vaddpd (CO1, LDC), %xmm10, %xmm10 - vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 2 * SIZE(CO1) - - vmovups %xmm10 , (CO1, LDC) - vmovups %xmm14 , 2 * SIZE(CO1, LDC) - -.endm - -/************************************************************************************************/ - -/************************************************************************************************/ - - -.macro KERNEL1x2_SUB - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 - vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 - vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) - VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) - vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 - vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 - VFMADDPD_R( %xmm10,%xmm6,%xmm0 ) - VFMADDPD_I( %xmm11,%xmm7,%xmm0 ) - addq $ 4, BI - addq $ 2, %rax -.endm - -.macro SAVE1x2 - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - - vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $ 0x01, %xmm10, %xmm10, %xmm11 - -#else - vaddsubpd %xmm8, %xmm9, %xmm9 - vaddsubpd %xmm10,%xmm11, %xmm11 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm11, %xmm10 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm10, %xmm0, %xmm10 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm11, %xmm1, %xmm11 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - vaddpd (CO1, LDC), %xmm10, %xmm10 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm10 , (CO1, LDC) - -.endm - - -/************************************************************************************************/ - -.macro KERNEL4x1_SUB - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 - vmovups -4 * SIZE(AO, %rax, SIZE), %ymm1 - vbroadcastsd -4 * SIZE(BO, BI, SIZE) , %ymm4 - vbroadcastsd -3 * SIZE(BO, BI, SIZE) , %ymm5 - VFMADDPD_R( %ymm8 ,%ymm4,%ymm0 ) - VFMADDPD_R( %ymm12,%ymm4,%ymm1 ) - VFMADDPD_I( %ymm9 ,%ymm5,%ymm0 ) - VFMADDPD_I( %ymm13,%ymm5,%ymm1 ) - - addq $ 2, BI - addq $ 8, %rax -.endm - -.macro SAVE4x1 - - vbroadcastsd ALPHA_R, %ymm0 - vbroadcastsd ALPHA_I, %ymm1 - - // swap high and low 8 bytes - vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 - vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %ymm9, %ymm8 , %ymm8 - vaddsubpd %ymm13,%ymm12 , %ymm12 - - vshufpd $ 0x05, %ymm8 , %ymm8, %ymm9 - vshufpd $ 0x05, %ymm12, %ymm12, %ymm13 - -#else - vaddsubpd %ymm8, %ymm9 , %ymm9 - vaddsubpd %ymm12,%ymm13, %ymm13 - - vmovapd %ymm9, %ymm8 - vmovapd %ymm13, %ymm12 - - // swap high and low 8 bytes - vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 - vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 - -#endif - - // multiply with ALPHA_R - vmulpd %ymm8 , %ymm0, %ymm8 - vmulpd %ymm12, %ymm0, %ymm12 - - // multiply with ALPHA_I - 
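
Note that none of the KERNEL*_SUB macros shuffle inside the hot loop: every k step broadcasts the real and the imaginary part of a B element separately (vmovddup / vbroadcastsd) and accumulates the raw products into two independent register sets, deferring all real/imaginary recombination to the SAVE* stage. A scalar sketch of one such update for the plain case (names are illustrative; the conjugated cases flip the sign of one or both accumulations, which is exactly what the fmadd/fnmadd selection in VFMADDPD_R / VFMADDPD_I encodes):

/* One k step of a complex micro-kernel for a single output element.
   a    = one complex element of the packed A panel (re, im)
   b    = one complex element of the packed B panel (re, im)
   accR = accumulator fed by the broadcast real part of b
   accI = accumulator fed by the broadcast imaginary part of b */
static void kernel_step(const double a[2], const double b[2],
                        double accR[2], double accI[2])
{
    accR[0] += a[0] * b[0];   /* a_re * b_re */
    accR[1] += a[1] * b[0];   /* a_im * b_re */
    accI[0] += a[0] * b[1];   /* a_re * b_im */
    accI[1] += a[1] * b[1];   /* a_im * b_im */
}
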
vmulpd %ymm9 , %ymm1, %ymm9 - vmulpd %ymm13, %ymm1, %ymm13 - - vaddsubpd %ymm9, %ymm8 , %ymm8 - vaddsubpd %ymm13, %ymm12, %ymm12 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %ymm8 , %ymm8 - vaddpd 4 * SIZE(CO1), %ymm12, %ymm12 - -#endif - - vmovups %ymm8 , (CO1) - vmovups %ymm12 ,4 * SIZE(CO1) - -.endm - - - -/************************************************************************************************/ - -.macro KERNEL2x1_SUB - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 - VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 - VFMADDPD_R( %xmm12,%xmm4,%xmm1 ) - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) - VFMADDPD_I( %xmm13,%xmm5,%xmm1 ) - addq $ 2, BI - addq $ 4, %rax -.endm - -.macro SAVE2x1 - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm13,%xmm12 , %xmm12 - - vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $ 0x01, %xmm12, %xmm12, %xmm13 - -#else - vaddsubpd %xmm8, %xmm9 , %xmm9 - vaddsubpd %xmm12,%xmm13, %xmm13 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm13, %xmm12 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm12, %xmm0, %xmm12 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm13, %xmm1, %xmm13 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm13, %xmm12, %xmm12 - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 2 * SIZE(CO1) - -.endm - - -/************************************************************************************************/ - -.macro KERNEL1x1_SUB - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 - VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) - addq $ 2, BI - addq $ 2, %rax -.endm - -.macro SAVE1x1 - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8, %xmm8 - - vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 - -#else - vaddsubpd %xmm8, %xmm9, %xmm9 - - vmovapd %xmm9, %xmm8 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - - vaddsubpd %xmm9 ,%xmm8, %xmm8 - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - -#endif - - vmovups %xmm8 , (CO1) - -.endm - - -/************************************************************************************************/ - - - -#if !defined(TRMMKERNEL) - - - PROLOGUE - PROFCODE - - subq $ STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - vmovups %xmm6, 64(%rsp) - vmovups %xmm7, 80(%rsp) - vmovups %xmm8, 96(%rsp) - vmovups %xmm9, 112(%rsp) - vmovups %xmm10, 
128(%rsp) - vmovups %xmm11, 144(%rsp) - vmovups %xmm12, 160(%rsp) - vmovups %xmm13, 176(%rsp) - vmovups %xmm14, 192(%rsp) - vmovups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - movsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - vmovsd OLD_ALPHA_I, %xmm1 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - movsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $ 128 + L_BUFFER_SIZE, %rsp - andq $ -4096, %rsp # align stack - - STACK_TOUCH - - cmpq $ 0, OLD_M - je .L999 - - cmpq $ 0, OLD_N - je .L999 - - cmpq $ 0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA_R - vmovsd %xmm1, ALPHA_I - - salq $ ZBASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $ 6, %rdi - divq %rdi // N / 6 - movq %rax, Ndiv6 // N / 6 - movq %rdx, Nmod6 // N % 6 - - - -/************************************************************************************************/ -.L6_00_0: - - movq Ndiv6, J - cmpq $ 0, J - je .L2_00_0 - ALIGN_4 - - - -.L6_00_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - salq $2, %rax // 2 * COMPSIZE - leaq (B, %rax,8), BO2 - movq BO2, B // next offset of B - movq K, %rax - ALIGN_4 - -.L6_00_02b: - - vmovups (BO1), %xmm0 - vmovups 2 * SIZE(BO1), %xmm1 - vmovups (BO2), %xmm2 - vmovups %xmm0, (BO) - vmovups %xmm1, 2 * SIZE(BO) - vmovups %xmm2, 4 * SIZE(BO) - addq $ 4*SIZE,BO1 - addq $ 4*SIZE,BO2 - addq $ 6*SIZE,BO - decq %rax - jnz .L6_00_02b - -.L6_00_02c: - - - -.L6_00_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - leaq (C, LDC, 1), C // c += 1 * ldc - - movq A, AO // aoffset = a - - movq M, I - sarq $ 2, I // i = (m >> 2) - je .L6_2_10 - - ALIGN_4 - -/******************************************************************************************************************/ - -.L6_4_11: - - leaq BUFFER1, BO // first buffer to BO - - vzeroall - - movq K, %rax - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L6_4_16 - ALIGN_4 - -.L6_4_12: - - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - - je .L6_4_16 - - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - - je .L6_4_16 - - jmp .L6_4_12 - ALIGN_4 - -.L6_4_16: - movq K, %rax - - andq $ 7, %rax # if (k & 1) - je .L6_4_19 - ALIGN_4 - -.L6_4_17: - - KERNEL4x3_SUB - - jnz .L6_4_17 - ALIGN_4 - - -.L6_4_19: - - SAVE4x3 - - addq $ 8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L6_4_11 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ - - -/******************************************************************************************************************/ -.L6_2_10: - testq $ 2, M - jz .L6_2_40 // to next 2 lines of N - -.L6_2_11: - - leaq BUFFER1, BO // first buffer to BO - - vzeroall - - movq K, %rax - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L6_2_16 - ALIGN_4 - -.L6_2_12: - - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - - je .L6_2_16 - - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - - je .L6_2_16 - - jmp .L6_2_12 - 
ALIGN_4 - -.L6_2_16: - movq K, %rax - - andq $ 7, %rax # if (k & 1) - je .L6_2_19 - ALIGN_4 - -.L6_2_17: - - KERNEL2x3_SUB - - jnz .L6_2_17 - ALIGN_4 - - -.L6_2_19: - - SAVE2x3 - - addq $ 4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L6_2_40: - testq $ 1, M - jz .L6_2_60 // to next 2 lines of N - - ALIGN_4 - -.L6_2_41: - - leaq BUFFER1, BO // first buffer to BO - - vzeroall - - movq K, %rax - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L6_2_46 - - ALIGN_4 - -.L6_2_42: - - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - - je .L6_2_46 - - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - - je .L6_2_46 - - jmp .L6_2_42 - ALIGN_4 - -.L6_2_46: - movq K, %rax - - andq $ 7, %rax # if (k & 1) - je .L6_2_49 - - ALIGN_4 - -.L6_2_47: - - KERNEL1x3_SUB - - jnz .L6_2_47 - ALIGN_4 - - -.L6_2_49: - - SAVE1x3 - - addq $ 2 * SIZE, CO1 # coffset += 2 - decq I # i -- - jg .L6_2_41 - ALIGN_4 - - - - -.L6_2_60: - - -/************************************************************************************************/ - -/************************************************************************************************/ - - -.L7_00_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - salq $2, %rax // 2 * COMPSIZE - leaq (B, %rax,8), BO2 - movq K, %rax - ALIGN_4 - -.L7_00_02b: - - vmovups 2 * SIZE(BO1), %xmm0 - vmovups (BO2), %xmm1 - vmovups 2 * SIZE(BO2), %xmm2 - vmovups %xmm0, (BO) - vmovups %xmm1, 2 * SIZE(BO) - vmovups %xmm2, 4 * SIZE(BO) - addq $ 4*SIZE,BO1 - addq $ 4*SIZE,BO2 - addq $ 6*SIZE,BO - decq %rax - jnz .L7_00_02b - -.L7_00_02c: - - movq BO2, B // next offset of B - - -.L7_00_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - leaq (C, LDC, 1), C // c += 1 * ldc - - movq A, AO // aoffset = a - - movq M, I - sarq $ 2, I // i = (m >> 2) - je .L7_2_10 - - ALIGN_4 - -/******************************************************************************************************************/ - -.L7_4_11: - - leaq BUFFER1, BO // first buffer to BO - - vzeroall - - movq K, %rax - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L7_4_16 - ALIGN_4 - -.L7_4_12: - - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - - je .L7_4_16 - - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - - je .L7_4_16 - - jmp .L7_4_12 - ALIGN_4 - -.L7_4_16: - movq K, %rax - - andq $ 7, %rax # if (k & 1) - je .L7_4_19 - - ALIGN_4 - -.L7_4_17: - - KERNEL4x3_SUB - - jnz .L7_4_17 - ALIGN_4 - - -.L7_4_19: - - SAVE4x3 - - addq $ 8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L7_4_11 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ - - -/******************************************************************************************************************/ -.L7_2_10: - testq $ 2, M - jz .L7_2_40 // to next 2 lines of N - -.L7_2_11: - - leaq BUFFER1, BO // first buffer to BO - - vzeroall - - movq K, %rax - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L7_2_16 - ALIGN_4 - -.L7_2_12: - - 
KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - - je .L7_2_16 - - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - - je .L7_2_16 - - jmp .L7_2_12 - ALIGN_4 - -.L7_2_16: - movq K, %rax - - andq $ 7, %rax # if (k & 1) - je .L7_2_19 - - ALIGN_4 - -.L7_2_17: - - KERNEL2x3_SUB - - jnz .L7_2_17 - ALIGN_4 - - -.L7_2_19: - - SAVE2x3 - - addq $ 4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L7_2_40: - testq $ 1, M - jz .L7_2_60 // to next 2 lines of N - - ALIGN_4 - -.L7_2_41: - - leaq BUFFER1, BO // first buffer to BO - - vzeroall - - movq K, %rax - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L7_2_46 - - ALIGN_4 - -.L7_2_42: - - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - - je .L7_2_46 - - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - - je .L7_2_46 - - jmp .L7_2_42 - ALIGN_4 - -.L7_2_46: - movq K, %rax - - andq $ 7, %rax # if (k & 1) - je .L7_2_49 - ALIGN_4 - -.L7_2_47: - - KERNEL1x3_SUB - - jnz .L7_2_47 - ALIGN_4 - - -.L7_2_49: - - SAVE1x3 - - addq $ 2 * SIZE, CO1 # coffset += 2 - decq I # i -- - jg .L7_2_41 - ALIGN_4 - - - - -.L7_2_60: - - decq J // j -- - jg .L6_00_01 // next 6 lines of N - -/************************************************************************************************/ - - - -/************************************************************************************************/ -.L2_00_0: - - movq Nmod6, J - sarq $1, J // j = j / 2 - cmpq $ 0, J - je .L1_2_0 - ALIGN_4 - - - -.L2_00_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_00_02b: - - vmovups (BO1), %xmm0 - vmovups 2 * SIZE(BO1), %xmm1 - vmovups %xmm0, (BO) - vmovups %xmm1, 2 * SIZE(BO) - addq $ 4*SIZE,BO1 - addq $ 4*SIZE,BO - decq %rax - jnz .L2_00_02b - -.L2_00_02c: - - movq BO1, B // next offset of B - - -.L2_00_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $ 8 * SIZE, AO - - movq M, I - sarq $ 2, I // i = (m >> 2) - je .L2_2_10 - - ALIGN_4 - -/******************************************************************************************************************/ - -.L2_4_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 4, %rax // number of values in AO -#else - addq $ 2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = 
K - ( K % 8 ) - je .L2_4_16 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_4_12: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - - je .L2_4_16 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - - je .L2_4_16 - - jmp .L2_4_12 - ALIGN_4 - -.L2_4_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L2_4_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_4_17: - - KERNEL4x2_SUB - - jl .L2_4_17 - ALIGN_4 - - -.L2_4_19: - - SAVE4x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 4, KK -#endif - - addq $ 8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L2_4_11 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ - - -/******************************************************************************************************************/ -.L2_2_10: - testq $ 2, M - jz .L2_2_40 // to next 2 lines of N - -.L2_2_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 2, %rax // number of values in AO -#else - addq $ 2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - 
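
The K loops in this kernel all use the same counting scheme: K is rounded down to a multiple of 8 (andq $-8), the A and B pointers are advanced past that many iterations up front, and the loop index is negated so the unrolled body can simply test the index against zero (je/jl) instead of keeping a separate counter; the K % 8 leftover runs through the *_SUB tail loop with the same negative-index trick. A C-level sketch of the control flow (per-tile strides omitted; step() stands in for one kernel macro invocation):

/* Stands in for one KERNEL*_SUB; the real code also advances the
   A/B indices by the tile strides. */
static void step(long *i) { *i += 1; }

static void k_loop(long K)
{
    long k8 = K & ~7L;                    /* andq $-8, %rax           */
    long i  = -k8;                        /* negq: index runs up to 0 */
    if (k8) {
        do {
            for (int u = 0; u < 8; u++)   /* eight unrolled steps     */
                step(&i);
        } while (i < 0);                  /* je / jl on the index     */
    }
    i = -(K & 7L);                        /* andq $7: remainder count */
    while (i < 0)
        step(&i);                         /* KERNEL*_SUB tail loop    */
}
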
- andq $ -8, %rax // K = K - ( K % 8 ) - je .L2_2_16 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_2_12: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_2_16 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_2_16 - - jmp .L2_2_12 - ALIGN_4 - -.L2_2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L2_2_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_2_17: - - KERNEL2x2_SUB - - jl .L2_2_17 - ALIGN_4 - - -.L2_2_19: - - SAVE2x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 2, KK -#endif - - addq $ 4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_2_40: - testq $ 1, M - jz .L2_2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_2_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 1, %rax // number of values in AO -#else - addq $ 2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L2_2_46 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_2_42: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - 
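
Under TRMMKERNEL the same loops are trimmed per tile rather than always running the full K: KKK holds the effective iteration count for the current tile, the packed A/B pointers are advanced past the KK iterations the tile must skip, and KK itself grows with the M loop (LEFT) or with each block of N columns (!LEFT). A rough model for one m_tile x n_tile block, showing only the (LEFT && !TRANSA) || (!LEFT && TRANSA) flavour; in the other combination the tile runs KK plus the tile width iterations and the panel pointers are advanced by the unused K - KKK iterations after the tile instead (function and parameter names are illustrative):

/* Rough TRMM bookkeeping for one m_tile x n_tile block
   (2 doubles per complex value in the packed panels). */
static long trmm_tile(long K, long *KK, int LEFT,
                      long m_tile, long n_tile,
                      const double **ao, const double **bo)
{
    long KKK = K - *KK;              /* effective K for this tile        */

    *ao += *KK * 2 * m_tile;         /* skip KK iterations of packed A   */
    *bo += *KK * 2 * n_tile;         /* ... and of packed B              */

    /* ... run the K loop for KKK iterations ... */

    if (LEFT)
        *KK += m_tile;               /* with !LEFT, KK grows by n_tile
                                        once per block of N instead      */
    return KKK;
}
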
KERNEL1x2_SUB - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - - je .L2_2_46 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - - je .L2_2_46 - - jmp .L2_2_42 - ALIGN_4 - -.L2_2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L2_2_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_2_47: - - KERNEL1x2_SUB - - jl .L2_2_47 - ALIGN_4 - - -.L2_2_49: - - SAVE1x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 1, KK -#endif - - addq $ 2 * SIZE, CO1 # coffset += 2 - decq I # i -- - jg .L2_2_41 - ALIGN_4 - - - - -.L2_2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $ 2, KK -#endif - - decq J // j -- - jg .L2_00_01 // next 2 lines of N - - - -.L1_2_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $ 1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_00_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_00_02b: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $ 2*SIZE,BO1 - addq $ 2*SIZE,BO - decq %rax - jnz .L1_00_02b - -.L1_00_02c: - - movq BO1, B // next offset of B - -.L1_00_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $ 8 * SIZE, AO - - movq M, I - sarq $ 2, I // i = (m >> 2) - je .L1_2_10 - - ALIGN_4 - -/*******************************************************************************************************/ - - -.L1_4_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 4, %rax // number of values in AO -#else - addq $ 1, %rax // number of values in BO -#endif - movq %rax, 
KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L1_4_16 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_4_12: - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_4_16 - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_4_16 - - jmp .L1_4_12 - ALIGN_4 - -.L1_4_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L1_4_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_4_17: - - KERNEL4x1_SUB - - jl .L1_4_17 - ALIGN_4 - - -.L1_4_19: - - SAVE4x1 - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 4, KK -#endif - - addq $ 8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L1_4_11 - ALIGN_4 - - - - -/*******************************************************************************************************/ -.L1_2_10: - testq $ 2, M - jz .L1_2_40 - - -.L1_2_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 2, %rax // number of values in AO -#else - addq $ 1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L1_2_16 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_2_12: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_2_16 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_2_16 - - jmp .L1_2_12 - ALIGN_4 - -.L1_2_16: -#ifndef TRMMKERNEL - 
movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L1_2_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_2_17: - - KERNEL2x1_SUB - - jl .L1_2_17 - ALIGN_4 - - -.L1_2_19: - - SAVE2x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 2, KK -#endif - - addq $ 4 * SIZE, CO1 # coffset += 4 - - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_2_40: - testq $ 1, M - jz .L999 - - ALIGN_4 - -.L1_2_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 1, %rax // number of values in AO -#else - addq $ 1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L1_2_46 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_2_42: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_2_46 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_2_46 - - jmp .L1_2_42 - ALIGN_4 - -.L1_2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L1_2_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_2_47: - - KERNEL1x1_SUB - - jl .L1_2_47 - ALIGN_4 - - -.L1_2_49: - - SAVE1x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, 
BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 1, KK -#endif - - addq $ 2 * SIZE, CO1 # coffset += 2 - decq I # i -- - jg .L1_2_41 - ALIGN_4 - - - - - - -.L999: - vzeroupper - - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - vmovups 64(%rsp), %xmm6 - vmovups 80(%rsp), %xmm7 - vmovups 96(%rsp), %xmm8 - vmovups 112(%rsp), %xmm9 - vmovups 128(%rsp), %xmm10 - vmovups 144(%rsp), %xmm11 - vmovups 160(%rsp), %xmm12 - vmovups 176(%rsp), %xmm13 - vmovups 192(%rsp), %xmm14 - vmovups 208(%rsp), %xmm15 -#endif - - addq $ STACKSIZE, %rsp - ret - - EPILOGUE - - -#else -/************************************************************************************************ - TRMM Kernel -************************************************************************************************/ - - PROLOGUE - PROFCODE - - subq $ STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - vmovups %xmm6, 64(%rsp) - vmovups %xmm7, 80(%rsp) - vmovups %xmm8, 96(%rsp) - vmovups %xmm9, 112(%rsp) - vmovups %xmm10, 128(%rsp) - vmovups %xmm11, 144(%rsp) - vmovups %xmm12, 160(%rsp) - vmovups %xmm13, 176(%rsp) - vmovups %xmm14, 192(%rsp) - vmovups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - movsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - vmovsd OLD_ALPHA_I, %xmm1 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - movsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $ 128 + L_BUFFER_SIZE, %rsp - andq $ -4096, %rsp # align stack - - STACK_TOUCH - - cmpq $ 0, OLD_M - je .L999 - - cmpq $ 0, OLD_N - je .L999 - - cmpq $ 0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA_R - vmovsd %xmm1, ALPHA_I - - salq $ ZBASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $ 2, %rdi - divq %rdi // N / 2 - movq %rax, Ndiv6 // N / 2 - movq %rdx, Nmod6 // N % 2 - - - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - -.L2_00_0: - - movq Ndiv6, J - cmpq $ 0, J - je .L1_2_0 - ALIGN_4 - - - -.L2_00_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_00_02b: - - vmovups (BO1), %xmm0 - vmovups 2 * SIZE(BO1), %xmm1 - vmovups %xmm0, (BO) - vmovups %xmm1, 2 * SIZE(BO) - addq $ 4*SIZE,BO1 - addq $ 4*SIZE,BO - decq %rax - jnz .L2_00_02b - -.L2_00_02c: - - movq BO1, B // next offset of B - - -.L2_00_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $ 8 * SIZE, AO - - movq M, I - sarq $ 2, I // i = (m >> 2) - je .L2_2_10 - - ALIGN_4 - -/******************************************************************************************************************/ - -.L2_4_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - 
addq $ 8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 4, %rax // number of values in AO -#else - addq $ 2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L2_4_16 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_4_12: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - - je .L2_4_16 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - - je .L2_4_16 - - jmp .L2_4_12 - ALIGN_4 - -.L2_4_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L2_4_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_4_17: - - KERNEL4x2_SUB - - jl .L2_4_17 - ALIGN_4 - - -.L2_4_19: - - SAVE4x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 4, KK -#endif - - addq $ 8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L2_4_11 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ - - -/******************************************************************************************************************/ -.L2_2_10: - testq $ 2, M - jz .L2_2_40 // to next 2 lines of N - -.L2_2_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO 
// first buffer to BO - addq $ 8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 2, %rax // number of values in AO -#else - addq $ 2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L2_2_16 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_2_12: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_2_16 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_2_16 - - jmp .L2_2_12 - ALIGN_4 - -.L2_2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L2_2_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_2_17: - - KERNEL2x2_SUB - - jl .L2_2_17 - ALIGN_4 - - -.L2_2_19: - - SAVE2x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 2, KK -#endif - - addq $ 4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_2_40: - testq $ 1, M - jz .L2_2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_2_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif 
(defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 1, %rax // number of values in AO -#else - addq $ 2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L2_2_46 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_2_42: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - - je .L2_2_46 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - - je .L2_2_46 - - jmp .L2_2_42 - ALIGN_4 - -.L2_2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L2_2_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_2_47: - - KERNEL1x2_SUB - - jl .L2_2_47 - ALIGN_4 - - -.L2_2_49: - - SAVE1x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 1, KK -#endif - - addq $ 2 * SIZE, CO1 # coffset += 2 - decq I # i -- - jg .L2_2_41 - ALIGN_4 - - - - -.L2_2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $ 2, KK -#endif - - decq J // j -- - jg .L2_00_01 // next 2 lines of N - - - -.L1_2_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $ 1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_00_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_00_02b: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $ 2*SIZE,BO1 - addq $ 2*SIZE,BO - decq %rax - jnz .L1_00_02b - -.L1_00_02c: - - movq BO1, B // next offset of B - -.L1_00_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $ 8 * SIZE, AO - - movq M, I - sarq $ 2, I // i = (m >> 2) - je .L1_2_10 - - ALIGN_4 - -/*******************************************************************************************************/ - - -.L1_4_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq 
BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 4, %rax // number of values in AO -#else - addq $ 1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L1_4_16 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_4_12: - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_4_16 - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_4_16 - - jmp .L1_4_12 - ALIGN_4 - -.L1_4_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L1_4_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_4_17: - - KERNEL4x1_SUB - - jl .L1_4_17 - ALIGN_4 - - -.L1_4_19: - - SAVE4x1 - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 4, KK -#endif - - addq $ 8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L1_4_11 - ALIGN_4 - - - - -/*******************************************************************************************************/ -.L1_2_10: - testq $ 2, M - jz .L1_2_40 - - -.L1_2_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 2, %rax // number of values in AO -#else - addq $ 1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L1_2_16 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax 
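
The pointer setup that ends just above (scale %rax and BI by the element counts, advance AO and BO to the end of the panel, then negq both) is the negative-index idiom used by the 2- and 1-column K-loops: the KERNEL*_SUB macros address their operands at a negative offset from the advanced pointers, and the addq that each macro performs on %rax drives the zero flag tested by the je exits inside the unrolled loop. A rough C analogue, with a, b, k, mr and nr as placeholder names rather than kernel symbols:

    /* Rough sketch of the counting-up-to-zero addressing used by the
     * unrolled K-loops; names are illustrative only. */
    static double k_loop_sketch(const double *a, const double *b,
                                long k, long mr, long nr)
    {
        const double *a_end = a + k * mr;  /* leaq (AO, %rax, SIZE), AO   */
        const double *b_end = b + k * nr;  /* leaq (BO, BI,   SIZE), BO   */
        long ia = -k * mr;                 /* negq %rax                   */
        long ib = -k * nr;                 /* negq BI                     */
        double acc = 0.0;

        while (ia != 0) {                  /* je fires once ia reaches 0  */
            acc += a_end[ia] * b_end[ib];  /* stands in for KERNELmxn_SUB */
            ia += mr;                      /* addq inside the macro (ZF)  */
            ib += nr;
        }
        return acc;
    }
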
- ALIGN_4 - -.L1_2_12: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_2_16 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_2_16 - - jmp .L1_2_12 - ALIGN_4 - -.L1_2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L1_2_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_2_17: - - KERNEL2x1_SUB - - jl .L1_2_17 - ALIGN_4 - - -.L1_2_19: - - SAVE2x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 2, KK -#endif - - addq $ 4 * SIZE, CO1 # coffset += 4 - - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_2_40: - testq $ 1, M - jz .L999 - - ALIGN_4 - -.L1_2_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 1, %rax // number of values in AO -#else - addq $ 1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L1_2_46 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_2_42: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_2_46 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_2_46 - - jmp .L1_2_42 - ALIGN_4 - -.L1_2_46: -#ifndef TRMMKERNEL 
- movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L1_2_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_2_47: - - KERNEL1x1_SUB - - jl .L1_2_47 - ALIGN_4 - - -.L1_2_49: - - SAVE1x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 1, KK -#endif - - addq $ 2 * SIZE, CO1 # coffset += 2 - decq I # i -- - jg .L1_2_41 - ALIGN_4 - - - - - - -.L999: - vzeroupper - - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - vmovups 64(%rsp), %xmm6 - vmovups 80(%rsp), %xmm7 - vmovups 96(%rsp), %xmm8 - vmovups 112(%rsp), %xmm9 - vmovups 128(%rsp), %xmm10 - vmovups 144(%rsp), %xmm11 - vmovups 160(%rsp), %xmm12 - vmovups 176(%rsp), %xmm13 - vmovups 192(%rsp), %xmm14 - vmovups 208(%rsp), %xmm15 -#endif - - addq $ STACKSIZE, %rsp - ret - - EPILOGUE - -#endif - - +/********************************************************************************* +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************************/ + +/******************************************************************************** +* 2014/07/28 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* 2013/10/28 Saar +* Parameter: +* ZGEMM_DEFAULT_UNROLL_N 2 +* ZGEMM_DEFAULT_UNROLL_M 4 +* ZGEMM_DEFAULT_P 256 +* ZGEMM_DEFAULT_Q 128 +* A_PR1 512 +* B_PR1 512 +* +* 2014/07/28 Saar +* Performance at 4608x4608x4608: +* 1 thread: 53 GFLOPS (SANDYBRIDGE: 29) (MKL: 53) +* 2 threads: 101 GFLOPS (SANDYBRIDGE: 59) (MKL: 100) +* 3 threads: 146 GFLOPS (SANDYBRIDGE: 86) (MKL: 138) +* 4 threads: 184 GFLOPS (SANDYBRIDGE: 108) (MKL: 172) +* +********************************************************************************/ + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 320 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 8192 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define OFFSET 64(%rsp) +#define KK 72(%rsp) +#define KKK 80(%rsp) +#define BUFFER1 128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $ 0, 4096 * 4(%rsp);\ + movl $ 0, 4096 * 3(%rsp);\ + movl $ 0, 4096 * 2(%rsp);\ + movl $ 0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $ 0, 4096 * 3(%rsp);\ + movl $ 0, 4096 * 2(%rsp);\ + movl $ 0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $ 0, 4096 * 2(%rsp);\ + movl $ 0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $ 0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + + +#if defined(BULLDOZER) + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + +#define VFMADDPD_R( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0 + +#define VFMADDPD_I( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0 + +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + +#define VFMADDPD_R( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0 + +#define VFMADDPD_I( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0 + +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + +#define VFMADDPD_R( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0 + +#define VFMADDPD_I( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0 + +#else + +#define VFMADDPD_R( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0 + +#define VFMADDPD_I( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0 + +#endif + +#else + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + +#define VFMADDPD_R( y0,y1,y2 ) vfmadd231pd y1,y2,y0 + +#define VFMADDPD_I( y0,y1,y2 ) vfmadd231pd y1,y2,y0 + +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + +#define VFMADDPD_R( y0,y1,y2 ) vfnmadd231pd y1,y2,y0 + +#define VFMADDPD_I( y0,y1,y2 ) vfmadd231pd y1,y2,y0 + +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + +#define VFMADDPD_R( y0,y1,y2 ) vfmadd231pd y1,y2,y0 + +#define VFMADDPD_I( y0,y1,y2 ) vfnmadd231pd y1,y2,y0 + +#else + +#define VFMADDPD_R( y0,y1,y2 ) vfnmadd231pd 
y1,y2,y0 + +#define VFMADDPD_I( y0,y1,y2 ) vfnmadd231pd y1,y2,y0 + +#endif + +#endif + +#define A_PR1 512 +#define B_PR1 512 + + + +/***************************************************************************************************/ + +.macro KERNEL4x3_SUB + vmovups (AO), %ymm0 + vmovups 4 * SIZE(AO), %ymm1 + prefetcht0 A_PR1(AO) + + vbroadcastsd (BO), %ymm2 + vbroadcastsd 1 * SIZE(BO), %ymm3 + VFMADDPD_R( %ymm8 ,%ymm2,%ymm0 ) + VFMADDPD_R( %ymm12,%ymm2,%ymm1 ) + VFMADDPD_I( %ymm9 ,%ymm3,%ymm0 ) + VFMADDPD_I( %ymm13,%ymm3,%ymm1 ) + + vbroadcastsd 2 * SIZE(BO), %ymm2 + vbroadcastsd 3 * SIZE(BO), %ymm3 + VFMADDPD_R( %ymm10,%ymm2,%ymm0 ) + VFMADDPD_R( %ymm14,%ymm2,%ymm1 ) + VFMADDPD_I( %ymm11,%ymm3,%ymm0 ) + VFMADDPD_I( %ymm15,%ymm3,%ymm1 ) + + vbroadcastsd 4 * SIZE(BO), %ymm2 + vbroadcastsd 5 * SIZE(BO), %ymm3 + VFMADDPD_R( %ymm4 ,%ymm2,%ymm0 ) + VFMADDPD_R( %ymm6 ,%ymm2,%ymm1 ) + VFMADDPD_I( %ymm5 ,%ymm3,%ymm0 ) + VFMADDPD_I( %ymm7 ,%ymm3,%ymm1 ) + + addq $ 6*SIZE, BO + addq $ 8*SIZE, AO + decq %rax +.endm + +.macro SAVE4x3 + + vbroadcastsd ALPHA_R, %ymm0 + vbroadcastsd ALPHA_I, %ymm1 + + // swap high and low 8 bytes + vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 + vshufpd $ 0x05, %ymm11, %ymm11, %ymm11 + vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 + vshufpd $ 0x05, %ymm15, %ymm15, %ymm15 + vshufpd $ 0x05, %ymm5 , %ymm5 , %ymm5 + vshufpd $ 0x05, %ymm7 , %ymm7 , %ymm7 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %ymm9, %ymm8 , %ymm8 + vaddsubpd %ymm11,%ymm10, %ymm10 + vaddsubpd %ymm13,%ymm12, %ymm12 + vaddsubpd %ymm15,%ymm14, %ymm14 + vaddsubpd %ymm5 ,%ymm4 , %ymm4 + vaddsubpd %ymm7 ,%ymm6 , %ymm6 + + vshufpd $ 0x05, %ymm8 , %ymm8 , %ymm9 + vshufpd $ 0x05, %ymm10, %ymm10, %ymm11 + vshufpd $ 0x05, %ymm12, %ymm12, %ymm13 + vshufpd $ 0x05, %ymm14, %ymm14, %ymm15 + vshufpd $ 0x05, %ymm4 , %ymm4 , %ymm5 + vshufpd $ 0x05, %ymm6 , %ymm6 , %ymm7 + +#else + vaddsubpd %ymm8, %ymm9 ,%ymm9 + vaddsubpd %ymm10, %ymm11,%ymm11 + vaddsubpd %ymm12, %ymm13,%ymm13 + vaddsubpd %ymm14, %ymm15,%ymm15 + vaddsubpd %ymm4 , %ymm5 ,%ymm5 + vaddsubpd %ymm6 , %ymm7 ,%ymm7 + + vmovapd %ymm9, %ymm8 + vmovapd %ymm11, %ymm10 + vmovapd %ymm13, %ymm12 + vmovapd %ymm15, %ymm14 + vmovapd %ymm5 , %ymm4 + vmovapd %ymm7 , %ymm6 + + // swap high and low 8 bytes + vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 + vshufpd $ 0x05, %ymm11, %ymm11, %ymm11 + vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 + vshufpd $ 0x05, %ymm15, %ymm15, %ymm15 + vshufpd $ 0x05, %ymm5 , %ymm5 , %ymm5 + vshufpd $ 0x05, %ymm7 , %ymm7 , %ymm7 + +#endif + + // multiply with ALPHA_R + vmulpd %ymm8 , %ymm0, %ymm8 + vmulpd %ymm10, %ymm0, %ymm10 + vmulpd %ymm12, %ymm0, %ymm12 + vmulpd %ymm14, %ymm0, %ymm14 + vmulpd %ymm4 , %ymm0, %ymm4 + vmulpd %ymm6 , %ymm0, %ymm6 + + // multiply with ALPHA_I + vmulpd %ymm9 , %ymm1, %ymm9 + vmulpd %ymm11, %ymm1, %ymm11 + vmulpd %ymm13, %ymm1, %ymm13 + vmulpd %ymm15, %ymm1, %ymm15 + vmulpd %ymm5 , %ymm1, %ymm5 + vmulpd %ymm7 , %ymm1, %ymm7 + + vaddsubpd %ymm9, %ymm8 , %ymm8 + vaddsubpd %ymm11,%ymm10, %ymm10 + vaddsubpd %ymm13,%ymm12, %ymm12 + vaddsubpd %ymm15,%ymm14, %ymm14 + vaddsubpd %ymm5 ,%ymm4 , %ymm4 + vaddsubpd %ymm7 ,%ymm6 , %ymm6 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %ymm8 , %ymm8 + vaddpd 4 * SIZE(CO1), %ymm12, %ymm12 + + vaddpd (CO1, LDC), %ymm10, %ymm10 + vaddpd 4 * SIZE(CO1, LDC), %ymm14, %ymm14 + + vaddpd (CO1, LDC,2), %ymm4 , %ymm4 + vaddpd 4 * SIZE(CO1, LDC,2), %ymm6 , %ymm6 +#endif + + vmovups %ymm8 , (CO1) + vmovups %ymm12 , 4 * SIZE(CO1) + 
+ vmovups %ymm10 , (CO1, LDC) + vmovups %ymm14 , 4 * SIZE(CO1, LDC) + + vmovups %ymm4 , (CO1, LDC, 2) + vmovups %ymm6 , 4 * SIZE(CO1, LDC, 2) + + prefetcht0 64(CO1) + prefetcht0 64(CO1, LDC) + +.endm + + + +/***************************************************************************************************/ + +.macro KERNEL2x3_SUB + vmovups (AO), %xmm0 + vmovups 2 * SIZE(AO), %xmm1 + vmovddup (BO), %xmm2 + vmovddup 1 * SIZE(BO), %xmm3 + + VFMADDPD_R( %xmm8 ,%xmm2,%xmm0 ) + VFMADDPD_R( %xmm12,%xmm2,%xmm1 ) + VFMADDPD_I( %xmm9 ,%xmm3,%xmm0 ) + VFMADDPD_I( %xmm13,%xmm3,%xmm1 ) + + vmovddup 2 * SIZE(BO), %xmm2 + vmovddup 3 * SIZE(BO), %xmm3 + VFMADDPD_R( %xmm10,%xmm2,%xmm0 ) + VFMADDPD_R( %xmm14,%xmm2,%xmm1 ) + VFMADDPD_I( %xmm11,%xmm3,%xmm0 ) + VFMADDPD_I( %xmm15,%xmm3,%xmm1 ) + + vmovddup 4 * SIZE(BO), %xmm2 + vmovddup 5 * SIZE(BO), %xmm3 + VFMADDPD_R( %xmm4 ,%xmm2,%xmm0 ) + VFMADDPD_R( %xmm6 ,%xmm2,%xmm1 ) + VFMADDPD_I( %xmm5 ,%xmm3,%xmm0 ) + VFMADDPD_I( %xmm7 ,%xmm3,%xmm1 ) + + addq $ 6*SIZE, BO + addq $ 4*SIZE, AO + decq %rax +.endm + +.macro SAVE2x3 + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 + vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 + vshufpd $ 0x01, %xmm15, %xmm15, %xmm15 + vshufpd $ 0x01, %xmm5 , %xmm5 , %xmm5 + vshufpd $ 0x01, %xmm7 , %xmm7 , %xmm7 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm13,%xmm12, %xmm12 + vaddsubpd %xmm15,%xmm14, %xmm14 + vaddsubpd %xmm5, %xmm4 , %xmm4 + vaddsubpd %xmm7, %xmm6 , %xmm6 + + vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $ 0x01, %xmm10, %xmm10, %xmm11 + vshufpd $ 0x01, %xmm12, %xmm12, %xmm13 + vshufpd $ 0x01, %xmm14, %xmm14, %xmm15 + vshufpd $ 0x01, %xmm4 , %xmm4, %xmm5 + vshufpd $ 0x01, %xmm6 , %xmm6, %xmm7 + +#else + vaddsubpd %xmm8, %xmm9 ,%xmm9 + vaddsubpd %xmm10, %xmm11,%xmm11 + vaddsubpd %xmm12, %xmm13,%xmm13 + vaddsubpd %xmm14, %xmm15,%xmm15 + vaddsubpd %xmm4, %xmm5 ,%xmm5 + vaddsubpd %xmm6, %xmm7 ,%xmm7 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm11, %xmm10 + vmovapd %xmm13, %xmm12 + vmovapd %xmm15, %xmm14 + vmovapd %xmm5, %xmm4 + vmovapd %xmm7, %xmm6 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 + vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 + vshufpd $ 0x01, %xmm15, %xmm15, %xmm15 + vshufpd $ 0x01, %xmm5 , %xmm5, %xmm5 + vshufpd $ 0x01, %xmm7 , %xmm7, %xmm7 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm10, %xmm0, %xmm10 + vmulpd %xmm12, %xmm0, %xmm12 + vmulpd %xmm14, %xmm0, %xmm14 + vmulpd %xmm4 , %xmm0, %xmm4 + vmulpd %xmm6 , %xmm0, %xmm6 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm11, %xmm1, %xmm11 + vmulpd %xmm13, %xmm1, %xmm13 + vmulpd %xmm15, %xmm1, %xmm15 + vmulpd %xmm5 , %xmm1, %xmm5 + vmulpd %xmm7 , %xmm1, %xmm7 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm13,%xmm12, %xmm12 + vaddsubpd %xmm15,%xmm14, %xmm14 + vaddsubpd %xmm5, %xmm4 , %xmm4 + vaddsubpd %xmm7, %xmm6 , %xmm6 + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 + + vaddpd (CO1, LDC), %xmm10, %xmm10 + vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14 + + vaddpd (CO1, LDC,2), %xmm4 , %xmm4 + vaddpd 2 * SIZE(CO1, LDC,2), %xmm6 , %xmm6 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 2 * 
SIZE(CO1) + + vmovups %xmm10 , (CO1, LDC) + vmovups %xmm14 , 2 * SIZE(CO1, LDC) + + vmovups %xmm4 , (CO1, LDC,2) + vmovups %xmm6 , 2 * SIZE(CO1, LDC,2) + +.endm + + +/************************************************************************************************/ + + +.macro KERNEL1x3_SUB + vmovups (AO), %xmm0 + vmovddup (BO), %xmm2 + vmovddup 1 * SIZE(BO), %xmm3 + + VFMADDPD_R( %xmm8,%xmm2,%xmm0 ) + VFMADDPD_I( %xmm9,%xmm3,%xmm0 ) + + vmovddup 2 * SIZE(BO), %xmm2 + vmovddup 3 * SIZE(BO), %xmm3 + VFMADDPD_R( %xmm10,%xmm2,%xmm0 ) + VFMADDPD_I( %xmm11,%xmm3,%xmm0 ) + + vmovddup 4 * SIZE(BO), %xmm2 + vmovddup 5 * SIZE(BO), %xmm3 + VFMADDPD_R( %xmm4 ,%xmm2,%xmm0 ) + VFMADDPD_I( %xmm5 ,%xmm3,%xmm0 ) + + addq $ 6*SIZE, BO + addq $ 2*SIZE, AO + decq %rax +.endm + +.macro SAVE1x3 + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 + vshufpd $ 0x01, %xmm5 , %xmm5, %xmm5 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm5, %xmm4 , %xmm4 + + vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $ 0x01, %xmm10, %xmm10, %xmm11 + vshufpd $ 0x01, %xmm4 , %xmm4, %xmm5 + +#else + vaddsubpd %xmm8, %xmm9, %xmm9 + vaddsubpd %xmm10,%xmm11, %xmm11 + vaddsubpd %xmm4, %xmm5, %xmm5 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm11, %xmm10 + vmovapd %xmm5, %xmm4 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 + vshufpd $ 0x01, %xmm5 , %xmm5, %xmm5 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm10, %xmm0, %xmm10 + vmulpd %xmm4 , %xmm0, %xmm4 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm11, %xmm1, %xmm11 + vmulpd %xmm5 , %xmm1, %xmm5 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm5, %xmm4 , %xmm4 + +#ifndef TRMMKERNEL + + vaddpd (CO1) , %xmm8 , %xmm8 + vaddpd (CO1, LDC) , %xmm10, %xmm10 + vaddpd (CO1, LDC,2) , %xmm4 , %xmm4 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm10 , (CO1, LDC) + vmovups %xmm4 , (CO1, LDC,2) + +.endm + + + + +/***************************************************************************************************/ + +.macro KERNEL4x2_SUB + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm1 + + vbroadcastsd -8 * SIZE(BO, BI, SIZE), %ymm4 + vbroadcastsd -7 * SIZE(BO, BI, SIZE), %ymm5 + VFMADDPD_R( %ymm8 ,%ymm4,%ymm0 ) + VFMADDPD_R( %ymm12,%ymm4,%ymm1 ) + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm6 + VFMADDPD_I( %ymm9 ,%ymm5,%ymm0 ) + VFMADDPD_I( %ymm13,%ymm5,%ymm1 ) + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm7 + VFMADDPD_R( %ymm10,%ymm6,%ymm0 ) + VFMADDPD_R( %ymm14,%ymm6,%ymm1 ) + VFMADDPD_I( %ymm11,%ymm7,%ymm0 ) + VFMADDPD_I( %ymm15,%ymm7,%ymm1 ) + + addq $ 4, BI + addq $ 8, %rax +.endm + +.macro SAVE4x2 + + vbroadcastsd ALPHA_R, %ymm0 + vbroadcastsd ALPHA_I, %ymm1 + + // swap high and low 8 bytes + vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 + vshufpd $ 0x05, %ymm11, %ymm11, %ymm11 + vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 + vshufpd $ 0x05, %ymm15, %ymm15, %ymm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %ymm9, %ymm8 , %ymm8 + vaddsubpd %ymm11,%ymm10, %ymm10 + vaddsubpd %ymm13,%ymm12, %ymm12 + vaddsubpd %ymm15,%ymm14, %ymm14 + + vshufpd $ 0x05, 
%ymm8 , %ymm8, %ymm9 + vshufpd $ 0x05, %ymm10, %ymm10, %ymm11 + vshufpd $ 0x05, %ymm12, %ymm12, %ymm13 + vshufpd $ 0x05, %ymm14, %ymm14, %ymm15 + +#else + vaddsubpd %ymm8, %ymm9 ,%ymm9 + vaddsubpd %ymm10, %ymm11,%ymm11 + vaddsubpd %ymm12, %ymm13,%ymm13 + vaddsubpd %ymm14, %ymm15,%ymm15 + + vmovapd %ymm9, %ymm8 + vmovapd %ymm11, %ymm10 + vmovapd %ymm13, %ymm12 + vmovapd %ymm15, %ymm14 + + // swap high and low 8 bytes + vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 + vshufpd $ 0x05, %ymm11, %ymm11, %ymm11 + vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 + vshufpd $ 0x05, %ymm15, %ymm15, %ymm15 + +#endif + + // multiply with ALPHA_R + vmulpd %ymm8 , %ymm0, %ymm8 + vmulpd %ymm10, %ymm0, %ymm10 + vmulpd %ymm12, %ymm0, %ymm12 + vmulpd %ymm14, %ymm0, %ymm14 + + // multiply with ALPHA_I + vmulpd %ymm9 , %ymm1, %ymm9 + vmulpd %ymm11, %ymm1, %ymm11 + vmulpd %ymm13, %ymm1, %ymm13 + vmulpd %ymm15, %ymm1, %ymm15 + + vaddsubpd %ymm9, %ymm8 , %ymm8 + vaddsubpd %ymm11,%ymm10, %ymm10 + vaddsubpd %ymm13,%ymm12, %ymm12 + vaddsubpd %ymm15,%ymm14, %ymm14 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %ymm8 , %ymm8 + vaddpd 4 * SIZE(CO1), %ymm12, %ymm12 + + vaddpd (CO1, LDC), %ymm10, %ymm10 + vaddpd 4 * SIZE(CO1, LDC), %ymm14, %ymm14 + +#endif + + vmovups %ymm8 , (CO1) + vmovups %ymm12 , 4 * SIZE(CO1) + + vmovups %ymm10 , (CO1, LDC) + vmovups %ymm14 , 4 * SIZE(CO1, LDC) + + prefetcht0 64(CO1) + prefetcht0 64(CO1, LDC) + +.endm + +/***************************************************************************************************/ + +.macro KERNEL2x2_SUB + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 + VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) + VFMADDPD_R( %xmm12,%xmm4,%xmm1 ) + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) + VFMADDPD_I( %xmm13,%xmm5,%xmm1 ) + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 + VFMADDPD_R( %xmm10,%xmm6,%xmm0 ) + VFMADDPD_R( %xmm14,%xmm6,%xmm1 ) + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 + VFMADDPD_I( %xmm11,%xmm7,%xmm0 ) + VFMADDPD_I( %xmm15,%xmm7,%xmm1 ) + addq $ 4, BI + addq $ 4, %rax +.endm + +.macro SAVE2x2 + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 + vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 + vshufpd $ 0x01, %xmm15, %xmm15, %xmm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm13,%xmm12, %xmm12 + vaddsubpd %xmm15,%xmm14, %xmm14 + + vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $ 0x01, %xmm10, %xmm10, %xmm11 + vshufpd $ 0x01, %xmm12, %xmm12, %xmm13 + vshufpd $ 0x01, %xmm14, %xmm14, %xmm15 + +#else + vaddsubpd %xmm8, %xmm9 ,%xmm9 + vaddsubpd %xmm10, %xmm11,%xmm11 + vaddsubpd %xmm12, %xmm13,%xmm13 + vaddsubpd %xmm14, %xmm15,%xmm15 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm11, %xmm10 + vmovapd %xmm13, %xmm12 + vmovapd %xmm15, %xmm14 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 + vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 + vshufpd $ 0x01, %xmm15, %xmm15, %xmm15 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm10, %xmm0, %xmm10 + vmulpd %xmm12, %xmm0, %xmm12 + vmulpd %xmm14, %xmm0, %xmm14 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm11, %xmm1, %xmm11 + vmulpd %xmm13, %xmm1, %xmm13 + vmulpd %xmm15, %xmm1, 
%xmm15 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm13,%xmm12, %xmm12 + vaddsubpd %xmm15,%xmm14, %xmm14 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 + + vaddpd (CO1, LDC), %xmm10, %xmm10 + vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 2 * SIZE(CO1) + + vmovups %xmm10 , (CO1, LDC) + vmovups %xmm14 , 2 * SIZE(CO1, LDC) + +.endm + +/************************************************************************************************/ + +/************************************************************************************************/ + + +.macro KERNEL1x2_SUB + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) + VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 + VFMADDPD_R( %xmm10,%xmm6,%xmm0 ) + VFMADDPD_I( %xmm11,%xmm7,%xmm0 ) + addq $ 4, BI + addq $ 2, %rax +.endm + +.macro SAVE1x2 + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + + vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $ 0x01, %xmm10, %xmm10, %xmm11 + +#else + vaddsubpd %xmm8, %xmm9, %xmm9 + vaddsubpd %xmm10,%xmm11, %xmm11 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm11, %xmm10 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm11, %xmm1, %xmm11 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd (CO1, LDC), %xmm10, %xmm10 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm10 , (CO1, LDC) + +.endm + + +/************************************************************************************************/ + +.macro KERNEL4x1_SUB + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm1 + vbroadcastsd -4 * SIZE(BO, BI, SIZE) , %ymm4 + vbroadcastsd -3 * SIZE(BO, BI, SIZE) , %ymm5 + VFMADDPD_R( %ymm8 ,%ymm4,%ymm0 ) + VFMADDPD_R( %ymm12,%ymm4,%ymm1 ) + VFMADDPD_I( %ymm9 ,%ymm5,%ymm0 ) + VFMADDPD_I( %ymm13,%ymm5,%ymm1 ) + + addq $ 2, BI + addq $ 8, %rax +.endm + +.macro SAVE4x1 + + vbroadcastsd ALPHA_R, %ymm0 + vbroadcastsd ALPHA_I, %ymm1 + + // swap high and low 8 bytes + vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 + vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %ymm9, %ymm8 , %ymm8 + vaddsubpd %ymm13,%ymm12 , %ymm12 + + vshufpd $ 0x05, %ymm8 , %ymm8, %ymm9 + vshufpd $ 0x05, %ymm12, %ymm12, %ymm13 + +#else + vaddsubpd %ymm8, %ymm9 , %ymm9 + vaddsubpd %ymm12,%ymm13, %ymm13 + + vmovapd %ymm9, %ymm8 + vmovapd %ymm13, %ymm12 + + // swap high and low 8 bytes + vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 + vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 + +#endif + + // multiply with ALPHA_R + vmulpd %ymm8 , %ymm0, %ymm8 + vmulpd %ymm12, %ymm0, %ymm12 + + // multiply with ALPHA_I + 
vmulpd %ymm9 , %ymm1, %ymm9 + vmulpd %ymm13, %ymm1, %ymm13 + + vaddsubpd %ymm9, %ymm8 , %ymm8 + vaddsubpd %ymm13, %ymm12, %ymm12 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %ymm8 , %ymm8 + vaddpd 4 * SIZE(CO1), %ymm12, %ymm12 + +#endif + + vmovups %ymm8 , (CO1) + vmovups %ymm12 ,4 * SIZE(CO1) + +.endm + + + +/************************************************************************************************/ + +.macro KERNEL2x1_SUB + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 + VFMADDPD_R( %xmm12,%xmm4,%xmm1 ) + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) + VFMADDPD_I( %xmm13,%xmm5,%xmm1 ) + addq $ 2, BI + addq $ 4, %rax +.endm + +.macro SAVE2x1 + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm13,%xmm12 , %xmm12 + + vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $ 0x01, %xmm12, %xmm12, %xmm13 + +#else + vaddsubpd %xmm8, %xmm9 , %xmm9 + vaddsubpd %xmm12,%xmm13, %xmm13 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm13, %xmm12 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm12, %xmm0, %xmm12 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm13, %xmm1, %xmm13 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm13, %xmm12, %xmm12 + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 2 * SIZE(CO1) + +.endm + + +/************************************************************************************************/ + +.macro KERNEL1x1_SUB + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) + addq $ 2, BI + addq $ 2, %rax +.endm + +.macro SAVE1x1 + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8, %xmm8 + + vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 + +#else + vaddsubpd %xmm8, %xmm9, %xmm9 + + vmovapd %xmm9, %xmm8 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + + vaddsubpd %xmm9 ,%xmm8, %xmm8 + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + +#endif + + vmovups %xmm8 , (CO1) + +.endm + + +/************************************************************************************************/ + + + +#if !defined(TRMMKERNEL) + + + PROLOGUE + PROFCODE + + subq $ STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 
128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + vmovsd OLD_ALPHA_I, %xmm1 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $ 128 + L_BUFFER_SIZE, %rsp + andq $ -4096, %rsp # align stack + + STACK_TOUCH + + cmpq $ 0, OLD_M + je .L999 + + cmpq $ 0, OLD_N + je .L999 + + cmpq $ 0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA_R + vmovsd %xmm1, ALPHA_I + + salq $ ZBASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $ 6, %rdi + divq %rdi // N / 6 + movq %rax, Ndiv6 // N / 6 + movq %rdx, Nmod6 // N % 6 + + + +/************************************************************************************************/ +.L6_00_0: + + movq Ndiv6, J + cmpq $ 0, J + je .L2_00_0 + ALIGN_4 + + + +.L6_00_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + salq $2, %rax // 2 * COMPSIZE + leaq (B, %rax,8), BO2 + movq BO2, B // next offset of B + movq K, %rax + ALIGN_4 + +.L6_00_02b: + + vmovups (BO1), %xmm0 + vmovups 2 * SIZE(BO1), %xmm1 + vmovups (BO2), %xmm2 + vmovups %xmm0, (BO) + vmovups %xmm1, 2 * SIZE(BO) + vmovups %xmm2, 4 * SIZE(BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO2 + addq $ 6*SIZE,BO + decq %rax + jnz .L6_00_02b + +.L6_00_02c: + + + +.L6_00_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + leaq (C, LDC, 1), C // c += 1 * ldc + + movq A, AO // aoffset = a + + movq M, I + sarq $ 2, I // i = (m >> 2) + je .L6_2_10 + + ALIGN_4 + +/******************************************************************************************************************/ + +.L6_4_11: + + leaq BUFFER1, BO // first buffer to BO + + vzeroall + + movq K, %rax + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L6_4_16 + ALIGN_4 + +.L6_4_12: + + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + + je .L6_4_16 + + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + + je .L6_4_16 + + jmp .L6_4_12 + ALIGN_4 + +.L6_4_16: + movq K, %rax + + andq $ 7, %rax # if (k & 1) + je .L6_4_19 + ALIGN_4 + +.L6_4_17: + + KERNEL4x3_SUB + + jnz .L6_4_17 + ALIGN_4 + + +.L6_4_19: + + SAVE4x3 + + addq $ 8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L6_4_11 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ + + +/******************************************************************************************************************/ +.L6_2_10: + testq $ 2, M + jz .L6_2_40 // to next 2 lines of N + +.L6_2_11: + + leaq BUFFER1, BO // first buffer to BO + + vzeroall + + movq K, %rax + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L6_2_16 + ALIGN_4 + +.L6_2_12: + + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + je .L6_2_16 + + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + je .L6_2_16 + + jmp .L6_2_12 + 
ALIGN_4 + +.L6_2_16: + movq K, %rax + + andq $ 7, %rax # if (k & 1) + je .L6_2_19 + ALIGN_4 + +.L6_2_17: + + KERNEL2x3_SUB + + jnz .L6_2_17 + ALIGN_4 + + +.L6_2_19: + + SAVE2x3 + + addq $ 4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L6_2_40: + testq $ 1, M + jz .L6_2_60 // to next 2 lines of N + + ALIGN_4 + +.L6_2_41: + + leaq BUFFER1, BO // first buffer to BO + + vzeroall + + movq K, %rax + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L6_2_46 + + ALIGN_4 + +.L6_2_42: + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + je .L6_2_46 + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + je .L6_2_46 + + jmp .L6_2_42 + ALIGN_4 + +.L6_2_46: + movq K, %rax + + andq $ 7, %rax # if (k & 1) + je .L6_2_49 + + ALIGN_4 + +.L6_2_47: + + KERNEL1x3_SUB + + jnz .L6_2_47 + ALIGN_4 + + +.L6_2_49: + + SAVE1x3 + + addq $ 2 * SIZE, CO1 # coffset += 2 + decq I # i -- + jg .L6_2_41 + ALIGN_4 + + + + +.L6_2_60: + + +/************************************************************************************************/ + +/************************************************************************************************/ + + +.L7_00_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + salq $2, %rax // 2 * COMPSIZE + leaq (B, %rax,8), BO2 + movq K, %rax + ALIGN_4 + +.L7_00_02b: + + vmovups 2 * SIZE(BO1), %xmm0 + vmovups (BO2), %xmm1 + vmovups 2 * SIZE(BO2), %xmm2 + vmovups %xmm0, (BO) + vmovups %xmm1, 2 * SIZE(BO) + vmovups %xmm2, 4 * SIZE(BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO2 + addq $ 6*SIZE,BO + decq %rax + jnz .L7_00_02b + +.L7_00_02c: + + movq BO2, B // next offset of B + + +.L7_00_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + leaq (C, LDC, 1), C // c += 1 * ldc + + movq A, AO // aoffset = a + + movq M, I + sarq $ 2, I // i = (m >> 2) + je .L7_2_10 + + ALIGN_4 + +/******************************************************************************************************************/ + +.L7_4_11: + + leaq BUFFER1, BO // first buffer to BO + + vzeroall + + movq K, %rax + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L7_4_16 + ALIGN_4 + +.L7_4_12: + + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + + je .L7_4_16 + + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + + je .L7_4_16 + + jmp .L7_4_12 + ALIGN_4 + +.L7_4_16: + movq K, %rax + + andq $ 7, %rax # if (k & 1) + je .L7_4_19 + + ALIGN_4 + +.L7_4_17: + + KERNEL4x3_SUB + + jnz .L7_4_17 + ALIGN_4 + + +.L7_4_19: + + SAVE4x3 + + addq $ 8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L7_4_11 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ + + +/******************************************************************************************************************/ +.L7_2_10: + testq $ 2, M + jz .L7_2_40 // to next 2 lines of N + +.L7_2_11: + + leaq BUFFER1, BO // first buffer to BO + + vzeroall + + movq K, %rax + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L7_2_16 + ALIGN_4 + +.L7_2_12: + + 
KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + je .L7_2_16 + + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + je .L7_2_16 + + jmp .L7_2_12 + ALIGN_4 + +.L7_2_16: + movq K, %rax + + andq $ 7, %rax # if (k & 1) + je .L7_2_19 + + ALIGN_4 + +.L7_2_17: + + KERNEL2x3_SUB + + jnz .L7_2_17 + ALIGN_4 + + +.L7_2_19: + + SAVE2x3 + + addq $ 4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L7_2_40: + testq $ 1, M + jz .L7_2_60 // to next 2 lines of N + + ALIGN_4 + +.L7_2_41: + + leaq BUFFER1, BO // first buffer to BO + + vzeroall + + movq K, %rax + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L7_2_46 + + ALIGN_4 + +.L7_2_42: + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + je .L7_2_46 + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + je .L7_2_46 + + jmp .L7_2_42 + ALIGN_4 + +.L7_2_46: + movq K, %rax + + andq $ 7, %rax # if (k & 1) + je .L7_2_49 + ALIGN_4 + +.L7_2_47: + + KERNEL1x3_SUB + + jnz .L7_2_47 + ALIGN_4 + + +.L7_2_49: + + SAVE1x3 + + addq $ 2 * SIZE, CO1 # coffset += 2 + decq I # i -- + jg .L7_2_41 + ALIGN_4 + + + + +.L7_2_60: + + decq J // j -- + jg .L6_00_01 // next 6 lines of N + +/************************************************************************************************/ + + + +/************************************************************************************************/ +.L2_00_0: + + movq Nmod6, J + sarq $1, J // j = j / 2 + cmpq $ 0, J + je .L1_2_0 + ALIGN_4 + + + +.L2_00_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_00_02b: + + vmovups (BO1), %xmm0 + vmovups 2 * SIZE(BO1), %xmm1 + vmovups %xmm0, (BO) + vmovups %xmm1, 2 * SIZE(BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO + decq %rax + jnz .L2_00_02b + +.L2_00_02c: + + movq BO1, B // next offset of B + + +.L2_00_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 8 * SIZE, AO + + movq M, I + sarq $ 2, I // i = (m >> 2) + je .L2_2_10 + + ALIGN_4 + +/******************************************************************************************************************/ + +.L2_4_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 4, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = 
K - ( K % 8 ) + je .L2_4_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + + je .L2_4_16 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + + je .L2_4_16 + + jmp .L2_4_12 + ALIGN_4 + +.L2_4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_4_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_17: + + KERNEL4x2_SUB + + jl .L2_4_17 + ALIGN_4 + + +.L2_4_19: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 4, KK +#endif + + addq $ 8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L2_4_11 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ + + +/******************************************************************************************************************/ +.L2_2_10: + testq $ 2, M + jz .L2_2_40 // to next 2 lines of N + +.L2_2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 2, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + 
+ andq $ -8, %rax // K = K - ( K % 8 ) + je .L2_2_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_2_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_2_16 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_2_16 + + jmp .L2_2_12 + ALIGN_4 + +.L2_2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_2_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_2_17: + + KERNEL2x2_SUB + + jl .L2_2_17 + ALIGN_4 + + +.L2_2_19: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 2, KK +#endif + + addq $ 4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_2_40: + testq $ 1, M + jz .L2_2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 1, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L2_2_46 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_2_42: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + 
KERNEL1x2_SUB + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_2_46 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_2_46 + + jmp .L2_2_42 + ALIGN_4 + +.L2_2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_2_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_2_47: + + KERNEL1x2_SUB + + jl .L2_2_47 + ALIGN_4 + + +.L2_2_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 1, KK +#endif + + addq $ 2 * SIZE, CO1 # coffset += 2 + decq I # i -- + jg .L2_2_41 + ALIGN_4 + + + + +.L2_2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $ 2, KK +#endif + + decq J // j -- + jg .L2_00_01 // next 2 lines of N + + + +.L1_2_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $ 1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_00_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_00_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $ 2*SIZE,BO1 + addq $ 2*SIZE,BO + decq %rax + jnz .L1_00_02b + +.L1_00_02c: + + movq BO1, B // next offset of B + +.L1_00_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 8 * SIZE, AO + + movq M, I + sarq $ 2, I // i = (m >> 2) + je .L1_2_10 + + ALIGN_4 + +/*******************************************************************************************************/ + + +.L1_4_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 4, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, 
KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_4_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_12: + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_4_16 + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_4_16 + + jmp .L1_4_12 + ALIGN_4 + +.L1_4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_4_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_17: + + KERNEL4x1_SUB + + jl .L1_4_17 + ALIGN_4 + + +.L1_4_19: + + SAVE4x1 + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 4, KK +#endif + + addq $ 8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L1_4_11 + ALIGN_4 + + + + +/*******************************************************************************************************/ +.L1_2_10: + testq $ 2, M + jz .L1_2_40 + + +.L1_2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 2, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_2_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_2_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_2_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_2_16 + + jmp .L1_2_12 + ALIGN_4 + +.L1_2_16: +#ifndef TRMMKERNEL + 
movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_2_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_2_17: + + KERNEL2x1_SUB + + jl .L1_2_17 + ALIGN_4 + + +.L1_2_19: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 2, KK +#endif + + addq $ 4 * SIZE, CO1 # coffset += 4 + + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_2_40: + testq $ 1, M + jz .L999 + + ALIGN_4 + +.L1_2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 1, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_2_46 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_2_42: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_2_46 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_2_46 + + jmp .L1_2_42 + ALIGN_4 + +.L1_2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_2_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_2_47: + + KERNEL1x1_SUB + + jl .L1_2_47 + ALIGN_4 + + +.L1_2_49: + + SAVE1x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, 
BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 1, KK +#endif + + addq $ 2 * SIZE, CO1 # coffset += 2 + decq I # i -- + jg .L1_2_41 + ALIGN_4 + + + + + + +.L999: + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $ STACKSIZE, %rsp + ret + + EPILOGUE + + +#else +/************************************************************************************************ + TRMM Kernel +************************************************************************************************/ + + PROLOGUE + PROFCODE + + subq $ STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + vmovsd OLD_ALPHA_I, %xmm1 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $ 128 + L_BUFFER_SIZE, %rsp + andq $ -4096, %rsp # align stack + + STACK_TOUCH + + cmpq $ 0, OLD_M + je .L999 + + cmpq $ 0, OLD_N + je .L999 + + cmpq $ 0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA_R + vmovsd %xmm1, ALPHA_I + + salq $ ZBASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $ 2, %rdi + divq %rdi // N / 2 + movq %rax, Ndiv6 // N / 2 + movq %rdx, Nmod6 // N % 2 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + +.L2_00_0: + + movq Ndiv6, J + cmpq $ 0, J + je .L1_2_0 + ALIGN_4 + + + +.L2_00_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_00_02b: + + vmovups (BO1), %xmm0 + vmovups 2 * SIZE(BO1), %xmm1 + vmovups %xmm0, (BO) + vmovups %xmm1, 2 * SIZE(BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO + decq %rax + jnz .L2_00_02b + +.L2_00_02c: + + movq BO1, B // next offset of B + + +.L2_00_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 8 * SIZE, AO + + movq M, I + sarq $ 2, I // i = (m >> 2) + je .L2_2_10 + + ALIGN_4 + +/******************************************************************************************************************/ + +.L2_4_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + 
addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 4, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L2_4_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + + je .L2_4_16 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + + je .L2_4_16 + + jmp .L2_4_12 + ALIGN_4 + +.L2_4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_4_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_17: + + KERNEL4x2_SUB + + jl .L2_4_17 + ALIGN_4 + + +.L2_4_19: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 4, KK +#endif + + addq $ 8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L2_4_11 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ + + +/******************************************************************************************************************/ +.L2_2_10: + testq $ 2, M + jz .L2_2_40 // to next 2 lines of N + +.L2_2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO 
// first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 2, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L2_2_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_2_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_2_16 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_2_16 + + jmp .L2_2_12 + ALIGN_4 + +.L2_2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_2_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_2_17: + + KERNEL2x2_SUB + + jl .L2_2_17 + ALIGN_4 + + +.L2_2_19: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 2, KK +#endif + + addq $ 4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_2_40: + testq $ 1, M + jz .L2_2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif 
(defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 1, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L2_2_46 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_2_42: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_2_46 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_2_46 + + jmp .L2_2_42 + ALIGN_4 + +.L2_2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_2_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_2_47: + + KERNEL1x2_SUB + + jl .L2_2_47 + ALIGN_4 + + +.L2_2_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 1, KK +#endif + + addq $ 2 * SIZE, CO1 # coffset += 2 + decq I # i -- + jg .L2_2_41 + ALIGN_4 + + + + +.L2_2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $ 2, KK +#endif + + decq J // j -- + jg .L2_00_01 // next 2 lines of N + + + +.L1_2_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $ 1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_00_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_00_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $ 2*SIZE,BO1 + addq $ 2*SIZE,BO + decq %rax + jnz .L1_00_02b + +.L1_00_02c: + + movq BO1, B // next offset of B + +.L1_00_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 8 * SIZE, AO + + movq M, I + sarq $ 2, I // i = (m >> 2) + je .L1_2_10 + + ALIGN_4 + +/*******************************************************************************************************/ + + +.L1_4_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq 
BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 4, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_4_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_12: + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_4_16 + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_4_16 + + jmp .L1_4_12 + ALIGN_4 + +.L1_4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_4_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_17: + + KERNEL4x1_SUB + + jl .L1_4_17 + ALIGN_4 + + +.L1_4_19: + + SAVE4x1 + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 4, KK +#endif + + addq $ 8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L1_4_11 + ALIGN_4 + + + + +/*******************************************************************************************************/ +.L1_2_10: + testq $ 2, M + jz .L1_2_40 + + +.L1_2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 2, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_2_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax 
+ ALIGN_4 + +.L1_2_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_2_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_2_16 + + jmp .L1_2_12 + ALIGN_4 + +.L1_2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_2_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_2_17: + + KERNEL2x1_SUB + + jl .L1_2_17 + ALIGN_4 + + +.L1_2_19: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 2, KK +#endif + + addq $ 4 * SIZE, CO1 # coffset += 4 + + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_2_40: + testq $ 1, M + jz .L999 + + ALIGN_4 + +.L1_2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 1, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_2_46 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_2_42: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_2_46 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_2_46 + + jmp .L1_2_42 + ALIGN_4 + +.L1_2_46: +#ifndef TRMMKERNEL 
+ movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_2_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_2_47: + + KERNEL1x1_SUB + + jl .L1_2_47 + ALIGN_4 + + +.L1_2_49: + + SAVE1x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 1, KK +#endif + + addq $ 2 * SIZE, CO1 # coffset += 2 + decq I # i -- + jg .L1_2_41 + ALIGN_4 + + + + + + +.L999: + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $ STACKSIZE, %rsp + ret + + EPILOGUE + +#endif + + diff --git a/relapack/src/CMakeLists.txt b/relapack/src/CMakeLists.txt index 78fb1431f..b92089418 100644 --- a/relapack/src/CMakeLists.txt +++ b/relapack/src/CMakeLists.txt @@ -1,86 +1,86 @@ -include_directories(${PROJECT_SOURCE_DIR}) -include_directories(${PROJECT_BINARY_DIR}) -include_directories(${PROJECT_SOURCE_DIR}/relapack) - -set(RELAFILES -clauum.c -ctrsyl_rec2.c -dsytrf.c -spbtrf.c -strsyl_rec2.c -zhetrf_rook_rec2.c -ztrsyl.c -cgbtrf.c -cpbtrf.c -ctrtri.c -dsytrf_rec2.c -spotrf.c -strtri.c -zlauum.c -ztrsyl_rec2.c -cgemmt.c -cpotrf.c -dgbtrf.c -dsytrf_rook.c -lapack_wrappers.c -ssygst.c -zgbtrf.c -zpbtrf.c -ztrtri.c -cgetrf.c -csytrf.c -dgemmt.c -dsytrf_rook_rec2.c -ssytrf.c -zgemmt.c -zpotrf.c -chegst.c -csytrf_rec2.c -dgetrf.c -dtgsyl.c -ssytrf_rec2.c -zgetrf.c -zsytrf.c -chetrf.c -csytrf_rook.c -dlauum.c -dtrsyl.c -sgbtrf.c -ssytrf_rook.c -zhegst.c -zsytrf_rec2.c -chetrf_rec2.c -csytrf_rook_rec2.c -dpbtrf.c -dtrsyl_rec2.c -sgemmt.c -ssytrf_rook_rec2.c -zhetrf.c -zsytrf_rook.c -chetrf_rook.c -ctgsyl.c -dpotrf.c -dtrtri.c -sgetrf.c -stgsyl.c -zhetrf_rec2.c -zsytrf_rook_rec2.c -chetrf_rook_rec2.c -ctrsyl.c -dsygst.c -f2c.c -slauum.c -strsyl.c -zhetrf_rook.c -ztgsyl.c -) - - - -# add relapack folder to the sources -set(RELA_SOURCES "") -foreach (RELA_FILE ${RELAFILES}) - list(APPEND RELA_SOURCES "${PROJECT_SOURCE_DIR}/relapack/src/${RELA_FILE}") -endforeach () -add_library(relapack_src OBJECT ${RELA_SOURCES}) -set_source_files_properties(${RELA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_CFLAGS}") +include_directories(${PROJECT_SOURCE_DIR}) +include_directories(${PROJECT_BINARY_DIR}) +include_directories(${PROJECT_SOURCE_DIR}/relapack) + +set(RELAFILES +clauum.c +ctrsyl_rec2.c +dsytrf.c +spbtrf.c +strsyl_rec2.c +zhetrf_rook_rec2.c +ztrsyl.c +cgbtrf.c +cpbtrf.c +ctrtri.c +dsytrf_rec2.c +spotrf.c +strtri.c +zlauum.c +ztrsyl_rec2.c +cgemmt.c +cpotrf.c +dgbtrf.c +dsytrf_rook.c +lapack_wrappers.c +ssygst.c +zgbtrf.c +zpbtrf.c +ztrtri.c +cgetrf.c +csytrf.c +dgemmt.c +dsytrf_rook_rec2.c +ssytrf.c +zgemmt.c 
+zpotrf.c +chegst.c +csytrf_rec2.c +dgetrf.c +dtgsyl.c +ssytrf_rec2.c +zgetrf.c +zsytrf.c +chetrf.c +csytrf_rook.c +dlauum.c +dtrsyl.c +sgbtrf.c +ssytrf_rook.c +zhegst.c +zsytrf_rec2.c +chetrf_rec2.c +csytrf_rook_rec2.c +dpbtrf.c +dtrsyl_rec2.c +sgemmt.c +ssytrf_rook_rec2.c +zhetrf.c +zsytrf_rook.c +chetrf_rook.c +ctgsyl.c +dpotrf.c +dtrtri.c +sgetrf.c +stgsyl.c +zhetrf_rec2.c +zsytrf_rook_rec2.c +chetrf_rook_rec2.c +ctrsyl.c +dsygst.c +f2c.c +slauum.c +strsyl.c +zhetrf_rook.c +ztgsyl.c +) + + + +# add relapack folder to the sources +set(RELA_SOURCES "") +foreach (RELA_FILE ${RELAFILES}) + list(APPEND RELA_SOURCES "${PROJECT_SOURCE_DIR}/relapack/src/${RELA_FILE}") +endforeach () +add_library(relapack_src OBJECT ${RELA_SOURCES}) +set_source_files_properties(${RELA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_CFLAGS}")
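
The CMakeLists.txt hunk above compiles the ReLAPACK sources into a CMake OBJECT library (add_library(relapack_src OBJECT ...)) and attaches ${LAPACK_CFLAGS} to each source file via set_source_files_properties. As a point of reference, a minimal standalone sketch of that same pattern is shown below; the project name, the demo target names and the single source file are hypothetical placeholders for illustration, not OpenBLAS's actual top-level build.

    # sketch.cmake -- minimal OBJECT-library pattern, hypothetical names
    cmake_minimum_required(VERSION 3.5)
    project(objectlib_demo C)

    # Collect sources into an OBJECT library: the files are compiled once,
    # but no archive or shared library is produced for this target itself.
    set(DEMO_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/dummy.c)   # hypothetical source file
    add_library(relapack_demo OBJECT ${DEMO_SOURCES})

    # Per-source compile flags, mirroring the
    # set_source_files_properties(... COMPILE_FLAGS "${LAPACK_CFLAGS}") call above.
    set_source_files_properties(${DEMO_SOURCES} PROPERTIES COMPILE_FLAGS "-O2")

    # A parent target folds the compiled objects in through the
    # $<TARGET_OBJECTS:...> generator expression, which is the usual way
    # an OBJECT library is consumed by the enclosing project.
    add_library(demo_blas SHARED $<TARGET_OBJECTS:relapack_demo>)

The advantage of the OBJECT-library form used here is that the ReLAPACK translation units are compiled exactly once with their own flags and then linked into whichever OpenBLAS library targets the top-level build defines, without creating an intermediate static archive.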