- /***************************************************************************
- Copyright (c) 2013-2019, The OpenBLAS Project
- All rights reserved.
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are
- met:
- 1. Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- 2. Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in
- the documentation and/or other materials provided with the
- distribution.
- 3. Neither the name of the OpenBLAS project nor the names of
- its contributors may be used to endorse or promote products
- derived from this software without specific prior written permission.
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
- LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
- USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *****************************************************************************/
-
- #define unit_size 4
- #define DISP64(ind,disp) (ind*unit_size*64+disp)
- #define DISP32(ind,disp) (ind*unit_size*32+disp)
- #define DISP16(ind,disp) (ind*unit_size*16+disp)
- #define DISP8(ind,disp) (ind*unit_size*8+disp)
- #define DISP4(ind,disp) (ind*unit_size*4+disp)
- #define DISP2(ind,disp) (ind*unit_size*2+disp)
- #define DISP1(ind,disp) (ind*unit_size+disp)
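-
- /* Worked example of the displacement helpers above (unit_size = 4 bytes per
-    single-precision element): DISP32(ind,disp) expands to ind*4*32 + disp, so
-    each ind step covers 128 bytes = 32 floats; DISP16 steps 64 bytes and DISP8
-    steps 32 bytes.  This matches the pointer updates in the kernels below,
-    e.g. "addi \AREG, \AREG, DISP32(\Index,128)". */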
-
- /**********************************************************************************************
- * Macros for N=8 and M=16
- **********************************************************************************************/
-
-
-
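- /* Register usage in the 8x16 kernels below: vs32..vs63 are the 32
-    accumulators (8 B values x 16 A values), cleared by Zero8X16; vs0..vs3
-    hold the A values of one unrolled k iteration and vs4..vs7 those of the
-    other; vs24..vs31 hold B for one iteration (b0..b3 in vs24, b4..b7 in
-    vs28, plus rotated copies) and vs8..vs15 the same layout for the other. */
-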
- .macro KERNEL8x16_L1_L4 Index,IsLast
- KERNEL8x16_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0
- .endm
-
- .macro KERNEL8x16_I1_L4 OffsetA,OffsetB, Index,IsLast
- KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
- .endm
-
- .macro KERNEL8x16_I1_L4_2 OffsetA,OffsetB, Index,IsLast
- KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
- .endm
-
- .macro KERNEL8x16_I1_L4_3 OffsetA,OffsetB, Index,IsLast
- KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1
- .endm
-
- .macro KERNEL8x16_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast
- KERNEL8x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0
- .endm
-
- .macro KERNEL8x16_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast
- KERNEL8x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1
- .endm
-
- .macro Zero8X16
- xxlxor vs32, vs32, vs32
- xxlxor vs33, vs33, vs33
- xxlxor vs34, vs34, vs34
- xxlxor vs35, vs35, vs35
- xxlxor vs36, vs36, vs36
- xxlxor vs37, vs37, vs37
- xxlxor vs38, vs38, vs38
- xxlxor vs39, vs39, vs39
- xxlxor vs40, vs40, vs40
- xxlxor vs41, vs41, vs41
- xxlxor vs42, vs42, vs42
- xxlxor vs43, vs43, vs43
- xxlxor vs44, vs44, vs44
- xxlxor vs45, vs45, vs45
- xxlxor vs46, vs46, vs46
- xxlxor vs47, vs47, vs47
- xxlxor vs48, vs48, vs48
- xxlxor vs49, vs49, vs49
- xxlxor vs50, vs50, vs50
- xxlxor vs51, vs51, vs51
- xxlxor vs52, vs52, vs52
- xxlxor vs53, vs53, vs53
- xxlxor vs54, vs54, vs54
- xxlxor vs55, vs55, vs55
- xxlxor vs56, vs56, vs56
- xxlxor vs57, vs57, vs57
- xxlxor vs58, vs58, vs58
- xxlxor vs59, vs59, vs59
- xxlxor vs60, vs60, vs60
- xxlxor vs61, vs61, vs61
- xxlxor vs62, vs62, vs62
- xxlxor vs63, vs63, vs63
- .endm
-
- .macro LOAD8x16 OffsetA,OffsetB
-
- lxv vs24, (\OffsetB+0)(BO)
- lxv vs28, (\OffsetB+16)(BO)
- xxperm vs26, vs24, permute_mask
- xxperm vs30, vs28, permute_mask
- lxv vs0, (\OffsetA+0)(AO)
- lxv vs1, (\OffsetA+16)(AO)
- xxpermdi vs25, vs24, vs24,2
- xxpermdi vs29, vs28, vs28,2
- lxv vs2, (\OffsetA+32)(AO)
- lxv vs3, (\OffsetA+48)(AO)
- xxpermdi vs27, vs26, vs26,2
- xxpermdi vs31, vs30, vs30,2
-
- .endm
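-
- /* LOAD8x16 keeps each B half in four forms: vs25/vs29 are the doubleword
-    swaps of vs24/vs28 (xxpermdi ...,2), vs26/vs30 apply permute_mask (set up
-    outside this section, presumably a word rotation), and vs27/vs31 are the
-    doubleword swaps of those.  Plain element-wise xvmaddasp against the four
-    forms reaches every a*b product; SAVE8x16 permutes the results back. */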
-
- .macro END8x16_NORMAL
- END8x16 0, AO, BO, 64,32
- .endm
-
- .macro END8x16_WITHOUT_ADD
- END8x16 0, AO,BO,0,0
- .endm
-
- .macro END8x16 First, AREG, BREG, OffsetA, OffsetB
-
- .if \OffsetB != 0
- addi \BREG, \BREG, \OffsetB
- .endif
- .if \OffsetA != 0
- addi \AREG, \AREG, \OffsetA
- .endif
-
- .if \First==1
- xvmulsp vs32, vs0,vs24
- xvmulsp vs33, vs1,vs24
- xvmulsp vs34, vs2,vs24
- xvmulsp vs35, vs3,vs24
-
- xvmulsp vs36, vs0,vs25
- xvmulsp vs37, vs1,vs25
- xvmulsp vs38, vs2,vs25
- xvmulsp vs39, vs3,vs25
-
- xvmulsp vs40, vs0,vs26
- xvmulsp vs41, vs1,vs26
- xvmulsp vs42, vs2,vs26
- xvmulsp vs43, vs3,vs26
-
- xvmulsp vs44, vs0,vs27
- xvmulsp vs45, vs1,vs27
- xvmulsp vs46, vs2,vs27
- xvmulsp vs47, vs3,vs27
-
- xvmulsp vs48, vs0,vs28
- xvmulsp vs49, vs1,vs28
- xvmulsp vs50, vs2,vs28
- xvmulsp vs51, vs3,vs28
-
- xvmulsp vs52, vs0,vs29
- xvmulsp vs53, vs1,vs29
- xvmulsp vs54, vs2,vs29
- xvmulsp vs55, vs3,vs29
-
- xvmulsp vs56, vs0,vs30
- xvmulsp vs57, vs1,vs30
- xvmulsp vs58, vs2,vs30
- xvmulsp vs59, vs3,vs30
-
- xvmulsp vs60, vs0,vs31
- xvmulsp vs61, vs1,vs31
- xvmulsp vs62, vs2,vs31
- xvmulsp vs63, vs3,vs31
-
- .else
- xvmaddasp vs32, vs0,vs24
- xvmaddasp vs33, vs1,vs24
- xvmaddasp vs34, vs2,vs24
- xvmaddasp vs35, vs3,vs24
-
- xvmaddasp vs36, vs0,vs25
- xvmaddasp vs37, vs1,vs25
- xvmaddasp vs38, vs2,vs25
- xvmaddasp vs39, vs3,vs25
- xvmaddasp vs40, vs0,vs26
- xvmaddasp vs41, vs1,vs26
- xvmaddasp vs42, vs2,vs26
- xvmaddasp vs43, vs3,vs26
-
- xvmaddasp vs44, vs0,vs27
- xvmaddasp vs45, vs1,vs27
- xvmaddasp vs46, vs2,vs27
- xvmaddasp vs47, vs3,vs27
-
- xvmaddasp vs48, vs0,vs28
- xvmaddasp vs49, vs1,vs28
- xvmaddasp vs50, vs2,vs28
- xvmaddasp vs51, vs3,vs28
-
- xvmaddasp vs52, vs0,vs29
- xvmaddasp vs53, vs1,vs29
- xvmaddasp vs54, vs2,vs29
- xvmaddasp vs55, vs3,vs29
-
- xvmaddasp vs56, vs0,vs30
- xvmaddasp vs57, vs1,vs30
- xvmaddasp vs58, vs2,vs30
- xvmaddasp vs59, vs3,vs30
-
- xvmaddasp vs60, vs0,vs31
- xvmaddasp vs61, vs1,vs31
- xvmaddasp vs62, vs2,vs31
- xvmaddasp vs63, vs3,vs31
-
- .endif
- .endm
-
- .macro KERNEL8x16_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
-
- KERNEL8x16_2 \AREG,\BREG, \OffsetA,\OffsetB, (\Index*2),0 ,0
- KERNEL8x16_2 \AREG,\BREG,\OffsetA,\OffsetB, (\Index*2+1),\IsLast ,\Complete
-
- .endm
-
- .macro KERNEL8x16 First
-
- LOAD8x16 0,0
- END8x16 \First, AO, BO, 64,32
- .endm
-
- .macro LOAD8x16_2
- LOAD8x16_2O AO,BO, 0,0
- .endm
-
- .macro LOAD8x16_2O AREG,BREG, OffsetA,OffsetB
- lxv vs8, (\OffsetB)(\BREG)
- lxv vs12, (16+\OffsetB)(\BREG)
- lxv vs24, (32+\OffsetB)(\BREG)
- lxv vs28, (32+16+\OffsetB)(\BREG)
- lxv vs4, (0+\OffsetA)(\AREG)
- lxv vs5, (16+\OffsetA)(\AREG)
- xxperm vs10, vs8, permute_mask
- xxperm vs14, vs12, permute_mask
- lxv vs6, (32+\OffsetA)(\AREG)
- lxv vs7, (48+\OffsetA)(\AREG)
- xxpermdi vs9, vs8, vs8,2
- xxpermdi vs13, vs12, vs12,2
- lxv vs0, (64+\OffsetA)(\AREG)
- lxv vs1, (64+16+\OffsetA)(\AREG)
- xxpermdi vs11, vs10, vs10,2
- xxpermdi vs15, vs14, vs14,2
- lxv vs2, (64+32+\OffsetA)(\AREG)
- lxv vs3, (64+48+\OffsetA)(\AREG)
-
- xxperm vs26, vs24, permute_mask
- xxperm vs30, vs28, permute_mask
- xxpermdi vs25, vs24, vs24,2
- xxpermdi vs29, vs28, vs28,2
- xxpermdi vs27, vs26, vs26,2
- xxpermdi vs31, vs30, vs30,2
- .endm
-
- .macro END8x16_2
- /* after LOAD8x16_2 the pending offsets are 128 (A) and 64 (B) */
- KERNEL8x16_2 AO,BO, 128,64,0 ,1,1
- .endm
-
-
-
- .macro KERNEL8x16_E2 OffsetA,OffsetB, Index,IsLast
- KERNEL8x16_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
- .endm
-
-
- .macro KERNEL8x16_L2 OffsetA,OffsetB, Index,IsLast
- KERNEL8x16_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
- .endm
-
-
- .macro KERNEL8x16_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
- xvmaddasp vs32, vs4,vs8
- xvmaddasp vs33, vs5,vs8
- xvmaddasp vs48, vs4,vs12
- xvmaddasp vs49, vs5,vs12
-
- xvmaddasp vs40, vs4,vs10
- xvmaddasp vs41, vs5,vs10
- xvmaddasp vs56, vs4,vs14
- xvmaddasp vs57, vs5,vs14
-
- xvmaddasp vs36, vs4,vs9
- xvmaddasp vs37, vs5,vs9
- xvmaddasp vs52, vs4,vs13
- xvmaddasp vs53, vs5,vs13
-
- xvmaddasp vs44, vs4,vs11
- xvmaddasp vs45, vs5,vs11
- xvmaddasp vs60, vs4,vs15
- xvmaddasp vs61, vs5,vs15
-
- .if \Complete==0
- lxv vs4, DISP32(\Index,0+\OffsetA)(\AREG)
- lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG)
- .endif
-
- xvmaddasp vs34, vs6,vs8
- xvmaddasp vs35, vs7,vs8
- xvmaddasp vs50, vs6,vs12
- xvmaddasp vs51, vs7,vs12
- .if \Complete==0
- lxv vs8, DISP16(\Index,\OffsetB)(\BREG)
- lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG)
- .endif
- xvmaddasp vs42, vs6,vs10
- xvmaddasp vs43, vs7,vs10
- xvmaddasp vs58, vs6,vs14
- xvmaddasp vs59, vs7,vs14
- .if \Complete==0
- xxperm vs10, vs8, permute_mask
- xxperm vs14, vs12, permute_mask
- .endif
- xvmaddasp vs38, vs6,vs9
- xvmaddasp vs39, vs7,vs9
- xvmaddasp vs54, vs6,vs13
- xvmaddasp vs55, vs7,vs13
- .if \Complete==0
- xxpermdi vs9, vs8, vs8,2
- xxpermdi vs13, vs12, vs12,2
- .endif
- xvmaddasp vs46, vs6,vs11
- xvmaddasp vs47, vs7,vs11
- xvmaddasp vs62, vs6,vs15
- xvmaddasp vs63, vs7,vs15
- .if \Complete==0
- xxpermdi vs11, vs10, vs10,2
- xxpermdi vs15, vs14, vs14,2
- .endif
-
- .if \Complete==0
- lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG)
- lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG)
- .endif
-
- xvmaddasp vs32, vs0,vs24
- xvmaddasp vs33, vs1,vs24
- xvmaddasp vs48, vs0,vs28
- xvmaddasp vs49, vs1,vs28
- xvmaddasp vs40, vs0,vs26
- xvmaddasp vs41, vs1,vs26
- xvmaddasp vs56, vs0,vs30
- xvmaddasp vs57, vs1,vs30
- xvmaddasp vs36, vs0,vs25
- xvmaddasp vs37, vs1,vs25
- xvmaddasp vs52, vs0,vs29
- xvmaddasp vs53, vs1,vs29
- xvmaddasp vs44, vs0,vs27
- xvmaddasp vs45, vs1,vs27
- xvmaddasp vs60, vs0,vs31
- xvmaddasp vs61, vs1,vs31
- .if \Complete==0
- lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG)
- lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG)
- .endif
-
- xvmaddasp vs34, vs2,vs24
- xvmaddasp vs35, vs3,vs24
- xvmaddasp vs50, vs2,vs28
- xvmaddasp vs51, vs3,vs28
- .if \Complete==0
- lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG)
- lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG)
- .endif
- xvmaddasp vs42, vs2,vs26
- xvmaddasp vs43, vs3,vs26
- xvmaddasp vs58, vs2,vs30
- xvmaddasp vs59, vs3,vs30
- .if \Complete==0
- xxperm vs26, vs24, permute_mask
- xxperm vs30, vs28, permute_mask
- .endif
- xvmaddasp vs38, vs2,vs25
- xvmaddasp vs39, vs3,vs25
- xvmaddasp vs54, vs2,vs29
- xvmaddasp vs55, vs3,vs29
- .if \Complete==0
- xxpermdi vs25, vs24, vs24,2
- xxpermdi vs29, vs28, vs28,2
- .endif
- xvmaddasp vs46, vs2,vs27
- xvmaddasp vs47, vs3,vs27
- xvmaddasp vs62, vs2,vs31
- xvmaddasp vs63, vs3,vs31
- .if \Complete==0
- xxpermdi vs27, vs26, vs26,2
- xxpermdi vs31, vs30, vs30,2
- .endif
- .if \Complete==0
- lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG)
- lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG)
- .endif
-
-
- .if \IsLast==1
- .if \Complete==1
- addi \BREG, \BREG, DISP16(\Index,\OffsetB)
- addi \AREG, \AREG, DISP32(\Index,\OffsetA)
-
- .else
- addi \BREG, \BREG, DISP16(\Index,64)
- addi \AREG, \AREG, DISP32(\Index,128)
-
- .endif
- .endif
-
-
- .endm
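-
- /* KERNEL8x16_2 covers two k iterations: the FMAs for data already in
-    registers are interleaved with the loads for the following pair, and the
-    loads are skipped when Complete=1 (the tail call).  The A/B pointers are
-    only advanced when IsLast=1, by the total bytes consumed up to \Index. */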
-
-
- .macro SAVE8x16
-
- slwi T10, LDC , 1
- add T1, CO, LDC
-
- add T2, CO, T10
- add T3, T1, T10
-
- add T4, T2, T10
- add T5, T3, T10
-
- add T6, T4, T10
- add T7, T5, T10
-
-
-
- /* permute to restore the butterfly rank-1 update to the normal promoted layout */
- /* permute 16: vs8 MEM(CO) vs9 MEM(CO+LDC) vs10 MEM(CO+2*LDC) vs11 MEM(CO+3*LDC) */
- /* permute 16: vs12 MEM(16+CO) vs13 MEM(16+CO+LDC) vs14 MEM(16+CO+2*LDC) vs15 MEM(16+CO+3*LDC) */
- /* permute 16: vs16 MEM(32+CO) vs17 MEM(32+CO+LDC) vs18 MEM(32+CO+2*LDC) vs19 MEM(32+CO+3*LDC) */
- /* permute 16: vs24 MEM(48+CO) vs25 MEM(48+CO+LDC) vs26 MEM(48+CO+2*LDC) vs27 MEM(48+CO+3*LDC) */
-
- xxmrglw vs8, vs32, vs44
- xxmrglw vs10, vs36, vs40
-
- xxmrghw vs1, vs32, vs44
- xxmrghw vs0, vs36, vs40
-
- xxmrglw vs12, vs33, vs45
- xxmrglw vs14, vs37, vs41
-
- xxmrghw vs2, vs37, vs41
- xxmrghw vs3, vs33, vs45
- #ifndef TRMMKERNEL
- lxv vs32, 0(CO)
- lxv vs33, 16(CO)
- #endif
- xxmrglw vs16, vs34, vs46
- xxmrglw vs18, vs38, vs42
-
- xxlor vs9, vs8, vs8
- xxlor vs11, vs10, vs10
-
- xxmrghw vs4, vs38, vs42
- xxmrghw vs5, vs34, vs46
-
- xxlor vs13, vs12, vs12
- xxlor vs15, vs14, vs14
-
- xxmrglw vs24, vs35, vs47
- xxmrglw vs26, vs39, vs43
-
- xxlor vs17, vs16, vs16
- xxlor vs19, vs18, vs18
-
- xxmrghw vs30, vs39, vs43
- xxmrghw vs31, vs35, vs47
- #ifndef TRMMKERNEL
- lxv vs34, 32(CO)
- lxv vs35, 48(CO)
- #endif
- xxperm vs8, vs0, save_permute_1
- xxperm vs10, vs1, save_permute_1
- #ifndef TRMMKERNEL
- lxv vs36, 0(T1)
- lxv vs37, 16(T1)
- #endif
- xxperm vs9, vs0, save_permute_2
- xxperm vs11, vs1, save_permute_2
-
- #ifndef TRMMKERNEL
- lxv vs38, 32(T1)
- lxv vs39, 48(T1)
- #endif
-
- xxlor vs25, vs24, vs24
- xxlor vs27, vs26, vs26
-
-
-
- #ifndef TRMMKERNEL
- lxv vs40, 0(T2)
- lxv vs41, 16(T2)
- #endif
-
- xxperm vs12, vs2, save_permute_1
- xxperm vs14, vs3, save_permute_1
- #ifndef TRMMKERNEL
- lxv vs42, 32(T2)
- lxv vs43, 48(T2)
- #endif
-
- xxperm vs13, vs2, save_permute_2
- xxperm vs15, vs3, save_permute_2
- #ifndef TRMMKERNEL
- lxv vs44, 0(T3)
- lxv vs45, 16(T3)
- #endif
- xxperm vs16, vs4, save_permute_1
- xxperm vs18, vs5, save_permute_1
- #ifndef TRMMKERNEL
- lxv vs46, 32(T3)
- lxv vs47, 48(T3)
- #endif
-
-
-
-
-
- xxperm vs17, vs4, save_permute_2
- xxperm vs19, vs5, save_permute_2
- #ifdef TRMMKERNEL
- xvmulsp vs32, vs8, alpha_r
- xvmulsp vs33, vs12, alpha_r
- #else
- xvmaddasp vs32, vs8, alpha_r
- xvmaddasp vs33, vs12, alpha_r
- #endif
- xxperm vs24, vs30, save_permute_1
- xxperm vs26, vs31, save_permute_1
-
-
- stxv vs32, 0(CO)
- stxv vs33, 16(CO)
- #ifdef TRMMKERNEL
- xvmulsp vs34, vs16, alpha_r
- xvmulsp vs35, vs24, alpha_r
- #else
- xvmaddasp vs34, vs16, alpha_r
- xvmaddasp vs35, vs24, alpha_r
- #endif
-
- xxperm vs25, vs30, save_permute_2
- xxperm vs27, vs31, save_permute_2
-
-
- stxv vs34, 32(CO)
- stxv vs35, 48(CO)
- #ifdef TRMMKERNEL
- xvmulsp vs36, vs9, alpha_r
- xvmulsp vs37, vs13, alpha_r
- #else
- xvmaddasp vs36, vs9, alpha_r
- xvmaddasp vs37, vs13, alpha_r
- #endif
- stxv vs36, 0(T1)
- stxv vs37, 16(T1)
- #ifdef TRMMKERNEL
- xvmulsp vs38, vs17, alpha_r
- xvmulsp vs39, vs25, alpha_r
- #else
- xvmaddasp vs38, vs17, alpha_r
- xvmaddasp vs39, vs25, alpha_r
- #endif
- stxv vs38, 32(T1)
- stxv vs39, 48(T1)
-
- #ifdef TRMMKERNEL
- xvmulsp vs40, vs10, alpha_r
- xvmulsp vs41, vs14, alpha_r
- #else
- xvmaddasp vs40, vs10, alpha_r
- xvmaddasp vs41, vs14, alpha_r
- #endif
-
- stxv vs40, 0(T2)
- stxv vs41, 16(T2)
- #ifdef TRMMKERNEL
- xvmulsp vs42, vs18, alpha_r
- xvmulsp vs43, vs26, alpha_r
- #else
- xvmaddasp vs42, vs18, alpha_r
- xvmaddasp vs43, vs26, alpha_r
- #endif
- stxv vs42, 32(T2)
- stxv vs43, 48(T2)
- #ifdef TRMMKERNEL
- xvmulsp vs44, vs11, alpha_r
- xvmulsp vs45, vs15, alpha_r
- #else
- xvmaddasp vs44, vs11, alpha_r
- xvmaddasp vs45, vs15, alpha_r
- #endif
- stxv vs44, 0(T3)
- stxv vs45, 16(T3)
- #ifdef TRMMKERNEL
- xvmulsp vs46, vs19, alpha_r
- xvmulsp vs47, vs27, alpha_r
- #else
- xvmaddasp vs46, vs19, alpha_r
- xvmaddasp vs47, vs27, alpha_r
- #endif
- stxv vs46, 32(T3)
- stxv vs47, 48(T3)
-
- /***** the same as above, for the second set of four C columns (T4..T7) *****/
- #ifndef TRMMKERNEL
- lxv vs32, 0(T4)
- lxv vs33, 16(T4)
- #endif
- xxmrglw vs8, vs48, vs60
- xxmrglw vs10, vs52, vs56
- #ifndef TRMMKERNEL
- lxv vs34, 32(T4)
- lxv vs35, 48(T4)
- #endif
- xxmrghw vs1, vs48, vs60
- xxmrghw vs0, vs52, vs56
- #ifndef TRMMKERNEL
- lxv vs36, 0(T5)
- lxv vs37, 16(T5)
- #endif
- xxmrglw vs12, vs49, vs61
- xxmrglw vs14, vs53, vs57
- #ifndef TRMMKERNEL
- lxv vs38,32(T5)
- lxv vs39, 48(T5)
- #endif
-
- xxmrghw vs2, vs53, vs57
- xxmrghw vs3, vs49, vs61
- #ifndef TRMMKERNEL
- lxv vs40, 0(T6)
- lxv vs41, 16(T6)
- #endif
- xxmrglw vs16, vs50, vs62
- xxmrglw vs18, vs54, vs58
- #ifndef TRMMKERNEL
- lxv vs42, 32(T6)
- lxv vs43, 48(T6)
- #endif
- xxlor vs9, vs8, vs8
- xxlor vs11, vs10, vs10
- xxmrghw vs4, vs54, vs58
- xxmrghw vs5, vs50, vs62
- #ifndef TRMMKERNEL
- lxv vs44, 0(T7)
- lxv vs45, 16(T7)
- #endif
- xxlor vs13, vs12, vs12
- xxlor vs15, vs14, vs14
-
- xxmrglw vs24, vs51, vs63
- xxmrglw vs26, vs55, vs59
- #ifndef TRMMKERNEL
- lxv vs46, 32(T7)
- lxv vs47, 48(T7)
- #endif
- xxlor vs17, vs16, vs16
- xxlor vs19, vs18, vs18
- xxmrghw vs30, vs55, vs59
- xxmrghw vs31, vs51, vs63
-
-
-
- xxperm vs8, vs0, save_permute_1
- xxperm vs10, vs1, save_permute_1
-
- xxperm vs9, vs0, save_permute_2
- xxperm vs11, vs1, save_permute_2
-
- xxlor vs25, vs24, vs24
- xxlor vs27, vs26, vs26
- xxperm vs12, vs2, save_permute_1
- xxperm vs14, vs3, save_permute_1
-
- xxperm vs13, vs2, save_permute_2
- xxperm vs15, vs3, save_permute_2
- #ifdef TRMMKERNEL
- xvmulsp vs32, vs8, alpha_r
- xvmulsp vs33, vs12, alpha_r
- #else
- xvmaddasp vs32, vs8, alpha_r
- xvmaddasp vs33, vs12, alpha_r
- #endif
- xxperm vs16, vs4, save_permute_1
- xxperm vs18, vs5, save_permute_1
- stxv vs32, 0(T4)
- stxv vs33, 16(T4)
- xxperm vs17, vs4, save_permute_2
- xxperm vs19, vs5, save_permute_2
- xxperm vs24, vs30, save_permute_1
- xxperm vs26, vs31, save_permute_1
- xxperm vs25, vs30, save_permute_2
- xxperm vs27, vs31, save_permute_2
-
- #ifdef TRMMKERNEL
- xvmulsp vs34, vs16, alpha_r
- xvmulsp vs35, vs24, alpha_r
- #else
- xvmaddasp vs34, vs16, alpha_r
- xvmaddasp vs35, vs24, alpha_r
- #endif
- stxv vs34, 32(T4)
- stxv vs35, 48(T4)
-
- #ifdef TRMMKERNEL
- xvmulsp vs36, vs9, alpha_r
- xvmulsp vs37, vs13, alpha_r
- #else
- xvmaddasp vs36, vs9, alpha_r
- xvmaddasp vs37, vs13, alpha_r
- #endif
- stxv vs36, 0(T5)
- stxv vs37, 16(T5)
-
- #ifdef TRMMKERNEL
- xvmulsp vs38, vs17, alpha_r
- xvmulsp vs39, vs25, alpha_r
- #else
- xvmaddasp vs38, vs17, alpha_r
- xvmaddasp vs39, vs25, alpha_r
- #endif
-
-
-
-
- stxv vs38, 32(T5)
- stxv vs39, 48(T5)
-
-
- #ifdef TRMMKERNEL
- xvmulsp vs40, vs10, alpha_r
- xvmulsp vs41, vs14, alpha_r
- #else
- xvmaddasp vs40, vs10, alpha_r
- xvmaddasp vs41, vs14, alpha_r
- #endif
- stxv vs40, 0(T6)
- stxv vs41, 16(T6)
- #ifdef TRMMKERNEL
- xvmulsp vs42, vs18, alpha_r
- xvmulsp vs43, vs26, alpha_r
- #else
- xvmaddasp vs42, vs18, alpha_r
- xvmaddasp vs43, vs26, alpha_r
- #endif
- stxv vs42, 32(T6)
- stxv vs43, 48(T6)
- #ifdef TRMMKERNEL
- xvmulsp vs44, vs11, alpha_r
- xvmulsp vs45, vs15, alpha_r
- #else
- xvmaddasp vs44, vs11, alpha_r
- xvmaddasp vs45, vs15, alpha_r
- #endif
-
- stxv vs44, 0(T7)
- stxv vs45, 16(T7)
- #ifdef TRMMKERNEL
- xvmulsp vs46, vs19, alpha_r
- xvmulsp vs47, vs27, alpha_r
- #else
- xvmaddasp vs46, vs19, alpha_r
- xvmaddasp vs47, vs27, alpha_r
- #endif
-
- stxv vs46, 32(T7)
- stxv vs47, 48(T7)
-
-
- addi CO,CO,64
-
-
- .endm
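-
- /* SAVE8x16 rearranges the rotated accumulators back into C's storage order
-    with xxmrglw/xxmrghw plus the save_permute_1/2 vectors (defined outside
-    this section), then either multiplies by alpha (TRMMKERNEL) or loads C,
-    multiply-adds alpha and stores it back (plain GEMM path). */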
-
-
-
- /**********************************************************************************************
- * Macros for N=8 and M=8
- **********************************************************************************************/
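-
- /* Same rotation scheme as the 8x16 tile, but with only two A vectors per k,
-    so only the first two accumulators of each group of four are used
-    (vs32/vs33, vs36/vs37, ..., vs60/vs61; see Zero8X8). */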
-
- .macro LOAD8x8_1
- LOAD8x8 1
- .endm
-
- .macro LOAD8x8_0
- LOAD8x8 0
- .endm
-
- .macro KERNEL8x8_L1_L4 Index,IsLast
- KERNEL8x8_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0
- .endm
-
- .macro KERNEL8x8_I1_L4 OffsetA,OffsetB, Index,IsLast
- KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
- .endm
-
- .macro KERNEL8x8_I1_L4_2 OffsetA,OffsetB, Index,IsLast
- KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
- .endm
-
- .macro KERNEL8x8_I1_L4_3 OffsetA,OffsetB, Index,IsLast
- KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1
- .endm
- .macro KERNEL8x8_I1_L2_3 OffsetA,OffsetB, Index,IsLast
- KERNEL8x8_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1
- .endm
-
- .macro KERNEL8x8_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast
- KERNEL8x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0
- .endm
-
- .macro KERNEL8x8_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast
- KERNEL8x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1
- .endm
-
- .macro END8x8_NORMAL
- END8x8 0, AO, BO, 32,32
- .endm
-
- .macro Zero8X8
- xxlxor vs32, vs32, vs32
- xxlxor vs33, vs33, vs33
-
- xxlxor vs36, vs36, vs36
- xxlxor vs37, vs37, vs37
-
- xxlxor vs40, vs40, vs40
- xxlxor vs41, vs41, vs41
-
- xxlxor vs44, vs44, vs44
- xxlxor vs45, vs45, vs45
-
- xxlxor vs48, vs48, vs48
- xxlxor vs49, vs49, vs49
-
- xxlxor vs52, vs52, vs52
- xxlxor vs53, vs53, vs53
-
- xxlxor vs56, vs56, vs56
- xxlxor vs57, vs57, vs57
-
- xxlxor vs60, vs60, vs60
- xxlxor vs61, vs61, vs61
-
- .endm
-
- .macro LOAD8x8 Zero
-
- lxv vs24, 0(BO)
- lxv vs28, 16(BO)
- lxv vs0, 0(AO)
- lxv vs1, 16(AO)
-
- xxperm vs26, vs24, permute_mask
- xxperm vs30, vs28, permute_mask
- xxpermdi vs25, vs24, vs24,2
- xxpermdi vs29, vs28, vs28,2
-
- xxpermdi vs27, vs26, vs26,2
- xxpermdi vs31, vs30, vs30,2
-
- .if \Zero==1
- xxlxor vs32, vs32, vs32
- xxlxor vs33, vs33, vs33
- xxlxor vs36, vs36, vs36
- xxlxor vs37, vs37, vs37
- xxlxor vs40, vs40, vs40
- xxlxor vs41, vs41, vs41
- xxlxor vs44, vs44, vs44
- xxlxor vs45, vs45, vs45
- xxlxor vs48, vs48, vs48
- xxlxor vs49, vs49, vs49
- xxlxor vs52, vs52, vs52
- xxlxor vs53, vs53, vs53
- xxlxor vs56, vs56, vs56
- xxlxor vs57, vs57, vs57
- xxlxor vs60, vs60, vs60
- xxlxor vs61, vs61, vs61
- .endif
- .endm
-
-
- .macro END8x8 First, AREG, BREG, OffsetA, OffsetB
-
- .if \OffsetB != 0
- addi \BREG, \BREG, \OffsetB
- .endif
- .if \OffsetA != 0
- addi \AREG, \AREG, \OffsetA
- .endif
-
- .if \First==1
- xvmulsp vs32, vs0,vs24
- xvmulsp vs33, vs1,vs24
-
- xvmulsp vs36, vs0,vs25
- xvmulsp vs37, vs1,vs25
-
- xvmulsp vs40, vs0,vs26
- xvmulsp vs41, vs1,vs26
-
- xvmulsp vs44, vs0,vs27
- xvmulsp vs45, vs1,vs27
-
- xvmulsp vs48, vs0,vs28
- xvmulsp vs49, vs1,vs28
-
- xvmulsp vs52, vs0,vs29
- xvmulsp vs53, vs1,vs29
-
- xvmulsp vs56, vs0,vs30
- xvmulsp vs57, vs1,vs30
-
- xvmulsp vs60, vs0,vs31
- xvmulsp vs61, vs1,vs31
-
- .else
- xvmaddasp vs32, vs0,vs24
- xvmaddasp vs33, vs1,vs24
-
- xvmaddasp vs36, vs0,vs25
- xvmaddasp vs37, vs1,vs25
-
- xvmaddasp vs40, vs0,vs26
- xvmaddasp vs41, vs1,vs26
-
- xvmaddasp vs44, vs0,vs27
- xvmaddasp vs45, vs1,vs27
-
- xvmaddasp vs48, vs0,vs28
- xvmaddasp vs49, vs1,vs28
-
- xvmaddasp vs52, vs0,vs29
- xvmaddasp vs53, vs1,vs29
-
- xvmaddasp vs56, vs0,vs30
- xvmaddasp vs57, vs1,vs30
-
- xvmaddasp vs60, vs0,vs31
- xvmaddasp vs61, vs1,vs31
-
- .endif
- .endm
-
- .macro KERNEL8x8_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
-
- lxv vs8, DISP32(\Index, 0+\OffsetB)(\BREG)
- lxv vs12, DISP32(\Index,16+\OffsetB)(\BREG)
-
- lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG)
- lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG)
-
- xxperm vs10, vs8, permute_mask
- xxperm vs14, vs12, permute_mask
-
- xvmaddasp vs32, vs0,vs24
- xvmaddasp vs33, vs1,vs24
-
- xxpermdi vs9, vs8, vs8,2
- xxpermdi vs13, vs12, vs12,2
-
-
- xvmaddasp vs36, vs0,vs25
- xvmaddasp vs37, vs1,vs25
-
- xxpermdi vs11, vs10, vs10,2
- xxpermdi vs15, vs14, vs14,2
-
- xvmaddasp vs40, vs0,vs26
- xvmaddasp vs41, vs1,vs26
-
- xvmaddasp vs44, vs0,vs27
- xvmaddasp vs45, vs1,vs27
-
- xvmaddasp vs48, vs0,vs28
- xvmaddasp vs49, vs1,vs28
-
- xvmaddasp vs52, vs0,vs29
- xvmaddasp vs53, vs1,vs29
- lxv vs24, DISP32(\Index,32+\OffsetB)(\BREG)
- lxv vs28, DISP32(\Index,32+16+\OffsetB)(\BREG)
- xvmaddasp vs56, vs0,vs30
- xvmaddasp vs57, vs1,vs30
-
- xvmaddasp vs60, vs0,vs31
- xvmaddasp vs61, vs1,vs31
-
- xxperm vs26, vs24, permute_mask
- xxperm vs30, vs28, permute_mask
-
- lxv vs0, DISP32(\Index,32+\OffsetA)(\AREG)
- lxv vs1, DISP32(\Index,32+16+\OffsetA)(\AREG)
-
-
- xxpermdi vs25, vs24, vs24,2
- xxpermdi vs29, vs28, vs28,2
-
- xvmaddasp vs32, vs4,vs8
- xvmaddasp vs33, vs5,vs8
-
- xvmaddasp vs36, vs4,vs9
- xvmaddasp vs37, vs5,vs9
-
- xxpermdi vs27, vs26, vs26,2
- xxpermdi vs31, vs30, vs30,2
-
- xvmaddasp vs40, vs4,vs10
- xvmaddasp vs41, vs5,vs10
-
- xvmaddasp vs44, vs4,vs11
- xvmaddasp vs45, vs5,vs11
-
- xvmaddasp vs48, vs4,vs12
- xvmaddasp vs49, vs5,vs12
-
- xvmaddasp vs52, vs4,vs13
- xvmaddasp vs53, vs5,vs13
- lxv vs8, DISP32(\Index,64+\OffsetB)(\BREG)
- lxv vs12, DISP32(\Index,64+16+\OffsetB)(\BREG)
- xvmaddasp vs56, vs4,vs14
- xvmaddasp vs57, vs5,vs14
-
- xvmaddasp vs60, vs4,vs15
- xvmaddasp vs61, vs5,vs15
-
- xxperm vs10, vs8, permute_mask
- xxperm vs14, vs12, permute_mask
-
-
- lxv vs4, DISP32(\Index,64+0+\OffsetA)(\AREG)
- lxv vs5, DISP32(\Index,64+16+\OffsetA)(\AREG)
-
-
- xxpermdi vs9, vs8, vs8,2
- xxpermdi vs13, vs12, vs12,2
-
- xvmaddasp vs32, vs0,vs24
- xvmaddasp vs33, vs1,vs24
-
- xvmaddasp vs36, vs0,vs25
- xvmaddasp vs37, vs1,vs25
-
- xxpermdi vs11, vs10, vs10,2
- xxpermdi vs15, vs14, vs14,2
-
- xvmaddasp vs40, vs0,vs26
- xvmaddasp vs41, vs1,vs26
-
- xvmaddasp vs44, vs0,vs27
- xvmaddasp vs45, vs1,vs27
-
- xvmaddasp vs48, vs0,vs28
- xvmaddasp vs49, vs1,vs28
-
- xvmaddasp vs52, vs0,vs29
- xvmaddasp vs53, vs1,vs29
- .if \Complete==0
- lxv vs24, DISP32(\Index,96+\OffsetB)(\BREG)
- lxv vs28, DISP32(\Index,96+16+\OffsetB)(\BREG)
- .endif
- xvmaddasp vs56, vs0,vs30
- xvmaddasp vs57, vs1,vs30
- .if \Complete==0
- xxperm vs26, vs24, permute_mask
- xxperm vs30, vs28, permute_mask
- .endif
- xvmaddasp vs60, vs0,vs31
- xvmaddasp vs61, vs1,vs31
-
-
- .if \Complete==0
- lxv vs0, DISP32(\Index,96+\OffsetA)(\AREG)
- lxv vs1, DISP32(\Index,96+16+\OffsetA)(\AREG)
- .endif
-
- .if \Complete==0
- xxpermdi vs25, vs24, vs24,2
- xxpermdi vs29, vs28, vs28,2
-
- .endif
- .if \IsLast==1
- .if \Complete==1
-
- addi \BREG, \BREG, DISP32(\Index,32*3+\OffsetB)
- addi \AREG, \AREG, DISP32(\Index,32*3+\OffsetA)
- .else
-
- addi \BREG, \BREG, DISP32(\Index,128)
- addi \AREG, \AREG, DISP32(\Index,128)
- .endif
- .endif
-
- xvmaddasp vs32, vs4,vs8
- xvmaddasp vs33, vs5,vs8
-
- xvmaddasp vs36, vs4,vs9
- xvmaddasp vs37, vs5,vs9
-
- .if \Complete==0
- xxpermdi vs27, vs26, vs26,2
- xxpermdi vs31, vs30, vs30,2
-
- .endif
-
- xvmaddasp vs40, vs4,vs10
- xvmaddasp vs41, vs5,vs10
-
- xvmaddasp vs44, vs4,vs11
- xvmaddasp vs45, vs5,vs11
-
- xvmaddasp vs48, vs4,vs12
- xvmaddasp vs49, vs5,vs12
-
- xvmaddasp vs52, vs4,vs13
- xvmaddasp vs53, vs5,vs13
-
- xvmaddasp vs56, vs4,vs14
- xvmaddasp vs57, vs5,vs14
-
- xvmaddasp vs60, vs4,vs15
- xvmaddasp vs61, vs5,vs15
-
- .endm
-
- .macro KERNEL8x8 First
-
- LOAD8x8 0
- END8x8 \First, AO, BO, 32,32
- .endm
-
- .macro KERNEL8x8_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete
-
- lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG)
- lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG)
-
- lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG)
- lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG)
-
- xxperm vs10, vs8, permute_mask
- xxperm vs14, vs12, permute_mask
- xxpermdi vs9, vs8, vs8,2
- xxpermdi vs13, vs12, vs12,2
- .if \First==1
- xvmulsp vs32, vs0,vs24
- xvmulsp vs33, vs1,vs24
-
- xvmulsp vs36, vs0,vs25
- xvmulsp vs37, vs1,vs25
-
- .else
- xvmaddasp vs32, vs0,vs24
- xvmaddasp vs33, vs1,vs24
-
- xvmaddasp vs36, vs0,vs25
- xvmaddasp vs37, vs1,vs25
-
- .endif
-
- xxpermdi vs11, vs10, vs10,2
- xxpermdi vs15, vs14, vs14,2
-
- .if \First==1
- xvmulsp vs40, vs0,vs26
- xvmulsp vs41, vs1,vs26
-
- xvmulsp vs44, vs0,vs27
- xvmulsp vs45, vs1,vs27
-
- xvmulsp vs48, vs0,vs28
- xvmulsp vs49, vs1,vs28
-
- xvmulsp vs52, vs0,vs29
- xvmulsp vs53, vs1,vs29
-
- xvmulsp vs56, vs0,vs30
- xvmulsp vs57, vs1,vs30
-
- xvmulsp vs60, vs0,vs31
- xvmulsp vs61, vs1,vs31
-
- .else
- xvmaddasp vs40, vs0,vs26
- xvmaddasp vs41, vs1,vs26
-
- xvmaddasp vs44, vs0,vs27
- xvmaddasp vs45, vs1,vs27
-
- xvmaddasp vs48, vs0,vs28
- xvmaddasp vs49, vs1,vs28
-
- xvmaddasp vs52, vs0,vs29
- xvmaddasp vs53, vs1,vs29
-
- xvmaddasp vs56, vs0,vs30
- xvmaddasp vs57, vs1,vs30
-
- xvmaddasp vs60, vs0,vs31
- xvmaddasp vs61, vs1,vs31
-
- .endif
- .if \Complete==0
- lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG)
- lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG)
-
- lxv vs0, DISP16(\Index,32+\OffsetA)(\AREG)
- lxv vs1, DISP16(\Index,32+16+\OffsetA)(\AREG)
-
- xxperm vs26, vs24, permute_mask
- xxperm vs30, vs28, permute_mask
- xxpermdi vs25, vs24, vs24,2
- xxpermdi vs29, vs28, vs28,2
- .endif
- .if \IsLast==1
- .if \Complete==1
- addi \BREG, \BREG, DISP16(\Index,32+\OffsetB)
- addi \AREG, \AREG, DISP16(\Index,32+\OffsetA)
-
- .else
- addi \BREG, \BREG, DISP16(\Index,64)
- addi \AREG, \AREG, DISP16(\Index,64)
- .endif
- .endif
-
- .if \First==1
- xvmulsp vs32, vs4,vs8
- xvmulsp vs33, vs5,vs8
-
- xvmulsp vs36, vs4,vs9
- xvmulsp vs37, vs5,vs9
-
- .else
- xvmaddasp vs32, vs4,vs8
- xvmaddasp vs33, vs5,vs8
-
- xvmaddasp vs36, vs4,vs9
- xvmaddasp vs37, vs5,vs9
-
- .endif
-
- .if \Complete==0
- xxpermdi vs27, vs26, vs26,2
- xxpermdi vs31, vs30, vs30,2
-
- .endif
- .if \First==1
- xvmulsp vs40, vs4,vs10
- xvmulsp vs41, vs5,vs10
-
- xvmulsp vs44, vs4,vs11
- xvmulsp vs45, vs5,vs11
-
- xvmulsp vs48, vs4,vs12
- xvmulsp vs49, vs5,vs12
-
- xvmulsp vs52, vs4,vs13
- xvmulsp vs53, vs5,vs13
-
- xvmulsp vs56, vs4,vs14
- xvmulsp vs57, vs5,vs14
-
- xvmulsp vs60, vs4,vs15
- xvmulsp vs61, vs5,vs15
-
- .else
- xvmaddasp vs40, vs4,vs10
- xvmaddasp vs41, vs5,vs10
-
- xvmaddasp vs44, vs4,vs11
- xvmaddasp vs45, vs5,vs11
-
- xvmaddasp vs48, vs4,vs12
- xvmaddasp vs49, vs5,vs12
-
- xvmaddasp vs52, vs4,vs13
- xvmaddasp vs53, vs5,vs13
-
- xvmaddasp vs56, vs4,vs14
- xvmaddasp vs57, vs5,vs14
-
- xvmaddasp vs60, vs4,vs15
- xvmaddasp vs61, vs5,vs15
-
- .endif
-
- .endm
-
-
- .macro SAVE8x8
-
- slwi T10, LDC , 1
- add T1, CO, LDC
-
- add T2, CO, T10
- add T3, T1, T10
-
- add T4, T2, T10
- add T5, T3, T10
-
- add T6, T4, T10
- add T7, T5, T10
-
- #ifndef TRMMKERNEL
- lxv vs34, 0(CO)
- lxv vs35, 16(CO)
- lxv vs38, 0(T1)
- lxv vs39, 16(T1)
- lxv vs42, 0(T2)
- lxv vs43, 16(T2)
- lxv vs46, 0(T3)
- lxv vs47, 16(T3)
-
- lxv vs50, 0(T4)
- lxv vs51, 16(T4)
- lxv vs54, 0(T5)
- lxv vs55, 16(T5)
- lxv vs58, 0(T6)
- lxv vs59, 16(T6)
- lxv vs62, 0(T7)
- lxv vs63, 16(T7)
- #endif
-
- xxmrglw vs8, vs32, vs44
- xxmrglw vs10, vs36, vs40
-
- xxmrghw vs1, vs32, vs44
- xxmrghw vs0, vs36, vs40
-
- xxmrglw vs12, vs33, vs45
- xxmrglw vs14, vs37, vs41
-
- xxmrghw vs2, vs37, vs41
- xxmrghw vs3, vs33, vs45
-
- xxlor vs9, vs8, vs8
- xxlor vs11, vs10, vs10
-
- xxlor vs13, vs12, vs12
- xxlor vs15, vs14, vs14
-
- xxperm vs8, vs0, save_permute_1
- xxperm vs10, vs1, save_permute_1
- xxperm vs9, vs0, save_permute_2
- xxperm vs11, vs1, save_permute_2
-
- xxperm vs12, vs2, save_permute_1
- xxperm vs14, vs3, save_permute_1
-
- xxperm vs13, vs2, save_permute_2
- xxperm vs15, vs3, save_permute_2
-
-
- /* apply alpha: plain multiply for TRMM, multiply-add into the loaded C otherwise */
-
- #ifdef TRMMKERNEL
- xvmulsp vs34, vs8, alpha_r
- xvmulsp vs35, vs12, alpha_r
- xvmulsp vs38, vs9, alpha_r
- xvmulsp vs39, vs13, alpha_r
- xvmulsp vs42, vs10, alpha_r
- xvmulsp vs43, vs14, alpha_r
- xvmulsp vs46, vs11, alpha_r
- xvmulsp vs47, vs15, alpha_r
- #else
- xvmaddasp vs34, vs8, alpha_r
- xvmaddasp vs35, vs12, alpha_r
- xvmaddasp vs38, vs9, alpha_r
- xvmaddasp vs39, vs13, alpha_r
- xvmaddasp vs42, vs10, alpha_r
- xvmaddasp vs43, vs14, alpha_r
- xvmaddasp vs46, vs11, alpha_r
- xvmaddasp vs47, vs15, alpha_r
- #endif
-
-
- xxmrglw vs8, vs48, vs60
- xxmrglw vs10, vs52, vs56
-
- xxmrghw vs1, vs48, vs60
- xxmrghw vs0, vs52, vs56
- stxv vs34, 0(CO)
- stxv vs35, 16(CO)
- xxmrglw vs12, vs49, vs61
- xxmrglw vs14, vs53, vs57
- stxv vs38, 0(T1)
- stxv vs39, 16(T1)
- xxmrghw vs2, vs53, vs57
- xxmrghw vs3, vs49, vs61
- stxv vs42, 0(T2)
- stxv vs43, 16(T2)
- xxlor vs9, vs8, vs8
- xxlor vs11, vs10, vs10
- stxv vs46, 0(T3)
- stxv vs47, 16(T3)
- xxlor vs13, vs12, vs12
- xxlor vs15, vs14, vs14
-
- xxperm vs8, vs0, save_permute_1
- xxperm vs10, vs1, save_permute_1
-
-
- xxperm vs9, vs0, save_permute_2
- xxperm vs11, vs1, save_permute_2
-
- xxperm vs12, vs2, save_permute_1
- xxperm vs14, vs3, save_permute_1
- xxperm vs13, vs2, save_permute_2
- xxperm vs15, vs3, save_permute_2
-
- #ifdef TRMMKERNEL
- xvmulsp vs50, vs8, alpha_r
- xvmulsp vs51, vs12, alpha_r
- xvmulsp vs54, vs9, alpha_r
- xvmulsp vs55, vs13, alpha_r
- xvmulsp vs58, vs10, alpha_r
- xvmulsp vs59, vs14, alpha_r
- xvmulsp vs62, vs11, alpha_r
- xvmulsp vs63, vs15, alpha_r
- #else
- xvmaddasp vs50, vs8, alpha_r
- xvmaddasp vs51, vs12, alpha_r
- xvmaddasp vs54, vs9, alpha_r
- xvmaddasp vs55, vs13, alpha_r
- xvmaddasp vs58, vs10, alpha_r
- xvmaddasp vs59, vs14, alpha_r
- xvmaddasp vs62, vs11, alpha_r
- xvmaddasp vs63, vs15, alpha_r
- #endif
-
- stxv vs50, 0(T4)
- stxv vs51, 16(T4)
- stxv vs54, 0(T5)
- stxv vs55, 16(T5)
- stxv vs58, 0(T6)
- stxv vs59, 16(T6)
- stxv vs62, 0(T7)
- stxv vs63, 16(T7)
-
- addi CO,CO,32
-
- .endm
-
-
- /**********************************************************************************************
- * Macros for N=8 and M=4
- **********************************************************************************************/
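-
- /* In the 8x4 tile the rotation is applied to A instead of B: vs0 holds the
-    four A values and vs1..vs3 its rotated copies, while B stays as two plain
-    vectors (vs24 = b0..b3, vs25 = b4..b7).  The accumulators vs32..vs35 and
-    vs48..vs51 are untangled in SAVE8x4 with merge-word/merge-double ops. */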
-
- .macro LOAD8x4_1
- LOAD8x4 1
- .endm
-
- .macro LOAD8x4_0
- LOAD8x4 0
- .endm
-
- .macro KERNEL8x4_L1_L4 Index,IsLast
- KERNEL8x4_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0
- .endm
-
- .macro KERNEL8x4_I1_L4 OffsetA,OffsetB, Index,IsLast
- KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
- .endm
-
- .macro KERNEL8x4_I1_L4_2 OffsetA,OffsetB, Index,IsLast
- KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
- .endm
-
- .macro KERNEL8x4_I1_L4_3 OffsetA,OffsetB, Index,IsLast
- KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1
- .endm
- .macro KERNEL8x4_I1_L2_3 OffsetA,OffsetB, Index,IsLast
- KERNEL8x4_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1
- .endm
-
- .macro KERNEL8x4_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast
- KERNEL8x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0
- .endm
-
- .macro KERNEL8x4_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast
- KERNEL8x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1
- .endm
-
- .macro Zero8X4
- xxlxor vs32, vs32, vs32
- xxlxor vs33, vs33, vs33
- xxlxor vs34, vs34, vs34
- xxlxor vs35, vs35, vs35
-
- xxlxor vs48, vs48, vs48
- xxlxor vs49, vs49, vs49
- xxlxor vs50, vs50, vs50
- xxlxor vs51, vs51, vs51
-
- .endm
-
- .macro LOAD8x4 Zero
-
- lxv vs0, 0(AO)
- lxv vs24, 0(BO)
- lxv vs25, 16(BO)
-
-
-
- xxperm vs2, vs0, permute_mask
- xxpermdi vs1, vs0, vs0,2
- xxpermdi vs3, vs2, vs2,2
-
- .if \Zero==1
- xxlxor vs32, vs32, vs32
- xxlxor vs33, vs33, vs33
- xxlxor vs34, vs34, vs34
- xxlxor vs35, vs35, vs35
-
- xxlxor vs48, vs48, vs48
- xxlxor vs49, vs49, vs49
- xxlxor vs50, vs50, vs50
- xxlxor vs51, vs51, vs51
- .endif
- .endm
-
- .macro END8x4_NORMAL
- END8x4 0, AO, BO, 16,32
- .endm
-
- .macro END8x4 First, AREG, BREG, OffsetA, OffsetB
-
- .if \OffsetB != 0
- addi \BREG, \BREG, \OffsetB
- .endif
- .if \OffsetA != 0
- addi \AREG, \AREG, \OffsetA
- .endif
-
- .if \First==1
- xvmulsp vs32, vs24, vs0
- xvmulsp vs33, vs24, vs1
- xvmulsp vs34, vs24, vs2
- xvmulsp vs35, vs24, vs3
-
- xvmulsp vs48, vs25, vs0
- xvmulsp vs49, vs25, vs1
- xvmulsp vs50, vs25, vs2
- xvmulsp vs51, vs25, vs3
- .else
- xvmaddasp vs32, vs24, vs0
- xvmaddasp vs33, vs24, vs1
- xvmaddasp vs34, vs24, vs2
- xvmaddasp vs35, vs24, vs3
-
- xvmaddasp vs48, vs25, vs0
- xvmaddasp vs49, vs25, vs1
- xvmaddasp vs50, vs25, vs2
- xvmaddasp vs51, vs25, vs3
-
- .endif
- .endm
-
- .macro KERNEL8x4_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
-
- lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG)
- lxv vs26, DISP32(\Index, 0+\OffsetB)(\BREG)
- lxv vs27, DISP32(\Index,16+\OffsetB)(\BREG)
-
- xxperm vs6, vs4, permute_mask
- xxpermdi vs5, vs4, vs4,2
- xxpermdi vs7, vs6, vs6,2
-
- xvmaddasp vs32, vs24, vs0
- xvmaddasp vs33, vs24, vs1
- xvmaddasp vs34, vs24, vs2
- xvmaddasp vs35, vs24, vs3
-
- xvmaddasp vs48, vs25, vs0
- xvmaddasp vs49, vs25, vs1
- xvmaddasp vs50, vs25, vs2
- xvmaddasp vs51, vs25, vs3
-
- lxv vs0, DISP16(\Index, 16+\OffsetA)(\AREG)
- lxv vs24, DISP32(\Index, 32+\OffsetB)(\BREG)
- lxv vs25, DISP32(\Index, 48+\OffsetB)(\BREG)
-
- xxperm vs2, vs0, permute_mask
- xxpermdi vs1, vs0, vs0,2
- xxpermdi vs3, vs2, vs2,2
-
- xvmaddasp vs32, vs26, vs4
- xvmaddasp vs33, vs26, vs5
- xvmaddasp vs34, vs26, vs6
- xvmaddasp vs35, vs26, vs7
-
- xvmaddasp vs48, vs27, vs4
- xvmaddasp vs49, vs27, vs5
- xvmaddasp vs50, vs27, vs6
- xvmaddasp vs51, vs27, vs7
-
-
- lxv vs4, DISP16(\Index, 32+\OffsetA)(\AREG)
- lxv vs26, DISP32(\Index, 64+\OffsetB)(\BREG)
- lxv vs27, DISP32(\Index, 80+\OffsetB)(\BREG)
-
- xxperm vs6, vs4, permute_mask
- xxpermdi vs5, vs4, vs4,2
- xxpermdi vs7, vs6, vs6,2
-
- xvmaddasp vs32, vs24, vs0
- xvmaddasp vs33, vs24, vs1
- xvmaddasp vs34, vs24, vs2
- xvmaddasp vs35, vs24, vs3
-
- xvmaddasp vs48, vs25, vs0
- xvmaddasp vs49, vs25, vs1
- xvmaddasp vs50, vs25, vs2
- xvmaddasp vs51, vs25, vs3
-
- .if \Complete==0
-
- lxv vs0, DISP16(\Index, 48+\OffsetA)(\AREG)
- lxv vs24, DISP32(\Index, 96+\OffsetB)(\BREG)
- lxv vs25, DISP32(\Index, 96+16+\OffsetB)(\BREG)
-
- xxperm vs2, vs0, permute_mask
- xxpermdi vs1, vs0, vs0,2
- xxpermdi vs3, vs2, vs2,2
- .endif
- xvmaddasp vs32, vs26, vs4
- xvmaddasp vs33, vs26, vs5
- xvmaddasp vs34, vs26, vs6
- xvmaddasp vs35, vs26, vs7
-
- xvmaddasp vs48, vs27, vs4
- xvmaddasp vs49, vs27, vs5
- xvmaddasp vs50, vs27, vs6
- xvmaddasp vs51, vs27, vs7
-
-
-
- .if \IsLast==1
- .if \Complete==1
- addi \AREG, \AREG, DISP16(\Index,16*3+\OffsetA)
- addi \BREG, \BREG, DISP32(\Index,32*3+\OffsetB)
-
- .else
- addi \AREG, \AREG, DISP16(\Index,64)
- addi \BREG, \BREG, DISP32(\Index,128)
-
- .endif
- .endif
-
-
- .endm
-
- .macro KERNEL8x4 First
- LOAD8x4 0
- END8x4 \First, AO, BO, 16,32
- .endm
-
- .macro KERNEL8x4_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete
-
- lxv vs4, DISP8(\Index, 0+\OffsetA)(\AREG)
- lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG)
- lxv vs27, DISP16(\Index,16+\OffsetB)(\BREG)
-
- xxperm vs6, vs4, permute_mask
- xxpermdi vs5, vs4, vs4,2
- xxpermdi vs7, vs6, vs6,2
- .if \First==1
- xvmulsp vs32, vs24, vs0
- xvmulsp vs33, vs24, vs1
- xvmulsp vs34, vs24, vs2
- xvmulsp vs35, vs24, vs3
-
- xvmulsp vs48, vs25, vs0
- xvmulsp vs49, vs25, vs1
- xvmulsp vs50, vs25, vs2
- xvmulsp vs51, vs25, vs3
- .else
- xvmaddasp vs32, vs24, vs0
- xvmaddasp vs33, vs24, vs1
- xvmaddasp vs34, vs24, vs2
- xvmaddasp vs35, vs24, vs3
-
- xvmaddasp vs48, vs25, vs0
- xvmaddasp vs49, vs25, vs1
- xvmaddasp vs50, vs25, vs2
- xvmaddasp vs51, vs25, vs3
- .endif
-
- .if \Complete==0
-
- lxv vs0, DISP8(\Index, 16+\OffsetA)(\AREG)
- lxv vs24, DISP16(\Index, 32+\OffsetB)(\BREG)
- lxv vs25, DISP16(\Index, 48+\OffsetB)(\BREG)
-
- xxperm vs2, vs0, permute_mask
- xxpermdi vs1, vs0, vs0,2
- xxpermdi vs3, vs2, vs2,2
- .endif
-
- .if \First==1
- xvmulsp vs32, vs26, vs4
- xvmulsp vs33, vs26, vs5
- xvmulsp vs34, vs26, vs6
- xvmulsp vs35, vs26, vs7
-
- xvmulsp vs48, vs27, vs4
- xvmulsp vs49, vs27, vs5
- xvmulsp vs50, vs27, vs6
- xvmulsp vs51, vs27, vs7
-
-
- .else
- xvmaddasp vs32, vs26, vs4
- xvmaddasp vs33, vs26, vs5
- xvmaddasp vs34, vs26, vs6
- xvmaddasp vs35, vs26, vs7
-
- xvmaddasp vs48, vs27, vs4
- xvmaddasp vs49, vs27, vs5
- xvmaddasp vs50, vs27, vs6
- xvmaddasp vs51, vs27, vs7
- .endif
-
-
- .if \IsLast==1
- .if \Complete==1
- addi \AREG, \AREG, DISP8(\Index,16+\OffsetA)
- addi \BREG, \BREG, DISP16(\Index,32+\OffsetB)
-
- .else
- addi \AREG, \AREG, DISP8(\Index,32)
- addi \BREG, \BREG, DISP16(\Index,64)
-
- .endif
- .endif
-
-
- .endm
-
-
- .macro SAVE8x4
- slwi T10, LDC , 1
- add T1, CO, LDC
- #if !defined(TRMMKERNEL)
- lxv vs36, 0(CO)
- lxv vs37, 0(T1)
- #endif
- add T2, CO, T10
- add T3, T1, T10
- #if !defined(TRMMKERNEL)
- lxv vs38, 0(T2)
- lxv vs39, 0(T3)
- #endif
- add T4, T2, T10
- add T5, T3, T10
- #if !defined(TRMMKERNEL)
- lxv vs40, 0(T4)
- lxv vs41, 0(T5)
- #endif
- add T6, T4, T10
- add T7, T5, T10
- #if !defined(TRMMKERNEL)
- lxv vs42, 0(T6)
- lxv vs43, 0(T7)
- #endif
- xxmrglw vs0, vs35,vs32
- xxmrglw vs1, vs34,vs33
- xxmrglw vs4, vs32,vs35
- xxmrglw vs5, vs33,vs34
-
-
- xxmrghw vs2, vs35,vs32
- xxmrghw vs3, vs34,vs33
- xxmrghw vs6, vs32,vs35
- xxmrghw vs7, vs33,vs34
-
- xxmrgld vs24, vs1, vs0
- xxmrghd vs25,vs5,vs4
-
- xxmrgld vs26, vs2, vs3
- xxmrghd vs27,vs6,vs7
-
-
- xxmrglw vs0, vs51,vs48
- xxmrglw vs1, vs50,vs49
- xxmrglw vs4, vs48,vs51
- xxmrglw vs5, vs49,vs50
-
- xxmrghw vs2, vs51,vs48
- xxmrghw vs3, vs50,vs49
- xxmrghw vs6, vs48,vs51
- xxmrghw vs7, vs49,vs50
-
- xxmrgld vs28, vs1, vs0
- xxmrghd vs29,vs5,vs4
-
- xxmrgld vs30, vs2, vs3
- xxmrghd vs31,vs6,vs7
- #if defined(TRMMKERNEL)
-
- xvmulsp vs36, vs24, alpha_r
- xvmulsp vs37, vs25, alpha_r
- xvmulsp vs38, vs26, alpha_r
- xvmulsp vs39, vs27, alpha_r
- xvmulsp vs40, vs28, alpha_r
- xvmulsp vs41, vs29, alpha_r
- xvmulsp vs42, vs30, alpha_r
- xvmulsp vs43, vs31, alpha_r
- #else
- xvmaddasp vs36, vs24, alpha_r
- xvmaddasp vs37, vs25, alpha_r
- xvmaddasp vs38, vs26, alpha_r
- xvmaddasp vs39, vs27, alpha_r
- xvmaddasp vs40, vs28, alpha_r
- xvmaddasp vs41, vs29, alpha_r
- xvmaddasp vs42, vs30, alpha_r
- xvmaddasp vs43, vs31, alpha_r
- #endif
-
- stxv vs36, 0(CO)
- stxv vs37, 0(T1)
- stxv vs38, 0(T2)
- stxv vs39, 0(T3)
- stxv vs40, 0(T4)
- stxv vs41, 0(T5)
- stxv vs42, 0(T6)
- stxv vs43, 0(T7)
-
-
- addi CO,CO,16
- .endm
-
-
- /**********************************************************************************************
- * Macros for N=8 and M=2
- **********************************************************************************************/
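-
- /* For M=2 (and M=1 below) the roles flip again: the one or two A values are
-    splatted across a vector with xxspltw/lxvwsx while the eight B values stay
-    in vectors, so the accumulators (vs0..vs3 for M=2, vs0/vs1 for M=1) each
-    hold results for four C columns.  The SAVE macros then finish with scalar
-    double-precision multiply-adds instead of the vector permute path used by
-    the wider tiles. */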
-
-
- .macro KERNEL8x2_2 OffsetA,OffsetB, Index,IsLast
- KERNEL8x2_I_2 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast
- .endm
-
-
-
- .macro Zero8x2
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- xxlxor vs2, vs2, vs2
- xxlxor vs3, vs3, vs3
-
- .endm
-
- .macro KERNEL8x2
- KERNEL8x2_1 AO,BO, 0, 0,0,0
- .endm
- .macro KERNEL8x2_1 AREG,BREG,First,OffsetA,OffsetB,Index
-
-
- lxsd v4, DISP2(\Index, 0+\OffsetA)(\AREG)
- lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG)
- lxv vs27, DISP8(\Index,16+\OffsetB)(\BREG)
- xxspltw vs8, vs36, 0
- xxspltw vs9, vs36, 1
-
- .if \First==1
- xvmulsp vs0, vs26, vs8
- xvmulsp vs1, vs27, vs8
- xvmulsp vs2, vs26, vs9
- xvmulsp vs3, vs27, vs9
-
- .else
- xvmaddasp vs0, vs26, vs8
- xvmaddasp vs1, vs27, vs8
- xvmaddasp vs2, vs26, vs9
- xvmaddasp vs3, vs27, vs9
-
- .endif
-
- addi \AREG, \AREG, DISP2(\Index,8)
- addi \BREG, \BREG, DISP8(\Index,32)
-
- .endm
-
- .macro KERNEL8x2_I_2 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast
-
- lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG)
- lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG)
- lxv vs27, DISP16(\Index,16+\OffsetB)(\BREG)
- lxv vs28, DISP16(\Index,32+\OffsetB)(\BREG)
- lxv vs29, DISP16(\Index,48+\OffsetB)(\BREG)
- xxspltw vs8, vs4, 2
- xxspltw vs9, vs4, 3
- xxspltw vs10, vs4, 0
- xxspltw vs11, vs4, 1
-
- .if \First==1
- xvmulsp vs0, vs26, vs8
- xvmulsp vs1, vs27, vs8
- xvmulsp vs2, vs26, vs9
- xvmulsp vs3, vs27, vs9
-
- xvmulsp vs0, vs28, vs10
- xvmulsp vs1, vs29, vs10
- xvmulsp vs2, vs28, vs11
- xvmulsp vs3, vs29, vs11
- .else
- xvmaddasp vs0, vs26, vs8
- xvmaddasp vs1, vs27, vs8
- xvmaddasp vs2, vs26, vs9
- xvmaddasp vs3, vs27, vs9
-
- xvmaddasp vs0, vs28, vs10
- xvmaddasp vs1, vs29, vs10
- xvmaddasp vs2, vs28, vs11
- xvmaddasp vs3, vs29, vs11
- .endif
-
-
- .if \IsLast==1
- addi \AREG, \AREG, DISP4(\Index,16)
- addi \BREG, \BREG, DISP16(\Index,64)
- .endif
-
- .endm
-
-
- .macro SAVE8x2
- slwi T10, LDC , 1
- add T1, CO, LDC
- add T2, CO, T10
- add T3, T1, T10
- add T4, T2, T10
- add T5, T3, T10
- add T6, T4, T10
- add T7, T5, T10
- /* convert alpha_r from single to double for the scalar multiplies below */
- xscvspdp vs4,alpha_r
- /* note: v0 corresponds to vs32 (the lxssp targets below overlay vs32..vs47) */
- #if !defined(TRMMKERNEL)
- lxssp v0,0(CO)
- lxssp v1,4(CO)
-
- lxssp v2,0(T1)
- lxssp v3,4(T1)
-
- lxssp v4,0(T2)
- lxssp v5,4(T2)
-
- lxssp v6,0(T3)
- lxssp v7,4(T3)
-
- lxssp v8,0(T4)
- lxssp v9,4(T4)
-
- lxssp v10,0(T5)
- lxssp v11,4(T5)
-
- lxssp v12,0(T6)
- lxssp v13,4(T6)
-
- lxssp v14,0(T7)
- lxssp v15,4(T7)
- #endif
- xscvspdp vs5, vs2
- xxspltw vs6, vs2, 1
- xxspltw vs7, vs2, 2
- xxspltw vs8, vs2, 3
- xscvspdp vs6,vs6
- xscvspdp vs7,vs7
- xscvspdp vs8,vs8
-
- xscvspdp vs24, vs0
- xxspltw vs25, vs0, 1
- xxspltw vs26, vs0, 2
- xxspltw vs27, vs0, 3
- xscvspdp vs25,vs25
- xscvspdp vs26,vs26
- xscvspdp vs27,vs27
-
- xscvspdp vs9, vs3
- xxspltw vs10, vs3, 1
- xxspltw vs11, vs3, 2
- xxspltw vs12, vs3, 3
- xscvspdp vs10,vs10
- xscvspdp vs11,vs11
- xscvspdp vs12,vs12
-
- xscvspdp vs28, vs1
- xxspltw vs29, vs1, 1
- xxspltw vs30, vs1, 2
- xxspltw vs31, vs1, 3
- xscvspdp vs29,vs29
- xscvspdp vs30,vs30
- xscvspdp vs31,vs31
-
-
-
-
- #if defined(TRMMKERNEL)
- xsmuldp vs32,vs8, vs4
- xsmuldp vs33,vs27, vs4
-
- xsmuldp vs34,vs7, vs4
- xsmuldp vs35,vs26, vs4
-
- xsmuldp vs36,vs6, vs4
- xsmuldp vs37,vs25, vs4
-
- xsmuldp vs38,vs5, vs4
- xsmuldp vs39,vs24, vs4
-
- xsmuldp vs40,vs12, vs4
- xsmuldp vs41,vs31, vs4
-
- xsmuldp vs42,vs11, vs4
- xsmuldp vs43,vs30, vs4
-
- xsmuldp vs44,vs10, vs4
- xsmuldp vs45,vs29, vs4
-
- xsmuldp vs46,vs9, vs4
- xsmuldp vs47,vs28, vs4
- #else
- xsmaddadp vs32,vs8, vs4
- xsmaddadp vs33,vs27, vs4
-
- xsmaddadp vs34,vs7, vs4
- xsmaddadp vs35,vs26, vs4
-
- xsmaddadp vs36,vs6, vs4
- xsmaddadp vs37,vs25, vs4
-
- xsmaddadp vs38,vs5, vs4
- xsmaddadp vs39,vs24, vs4
-
- xsmaddadp vs40,vs12, vs4
- xsmaddadp vs41,vs31, vs4
-
- xsmaddadp vs42,vs11, vs4
- xsmaddadp vs43,vs30, vs4
-
- xsmaddadp vs44,vs10, vs4
- xsmaddadp vs45,vs29, vs4
-
- xsmaddadp vs46,vs9, vs4
- xsmaddadp vs47,vs28, vs4
- #endif
-
- stxssp v0,0(CO)
- stxssp v1,4(CO)
-
- stxssp v2,0(T1)
- stxssp v3,4(T1)
-
- stxssp v4,0(T2)
- stxssp v5,4(T2)
-
- stxssp v6,0(T3)
- stxssp v7,4(T3)
-
- stxssp v8,0(T4)
- stxssp v9,4(T4)
-
- stxssp v10,0(T5)
- stxssp v11,4(T5)
-
- stxssp v12,0(T6)
- stxssp v13,4(T6)
-
- stxssp v14,0(T7)
- stxssp v15,4(T7)
-
-
- addi CO,CO,8
- .endm
-
-
- /**********************************************************************************************
- * Macros for N=8 and M=1
- **********************************************************************************************/
- .macro KERNEL8x1_4 OffsetA,OffsetB, Index,IsLast
- KERNEL8x1_I_4 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast
- .endm
-
- .macro Zero8x1
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- .endm
-
- .macro KERNEL8x1
- KERNEL8x1_1 AO,BO, 0
- .endm
-
- .macro KERNEL8x1_2
- KERNEL8x1_2_1 AO,BO, 0
- .endm
-
- .macro KERNEL8x1_1 AREG,BREG,First
- lxvwsx vs8, 0, \AREG
- lxv vs26, 0(\BREG)
- lxv vs27, 16(\BREG)
- .if \First==1
- xvmulsp vs0, vs26, vs8
- xvmulsp vs1, vs27, vs8
- .else
- xvmaddasp vs0, vs26, vs8
- xvmaddasp vs1, vs27, vs8
- .endif
- addi \AREG, \AREG, 4
- addi \BREG, \BREG, 32
- .endm
-
- .macro KERNEL8x1_2_1 AREG,BREG,First
- lxsd v4, 0(\AREG)
- lxv vs26, 0(\BREG)
- lxv vs27, 16(\BREG)
- lxv vs28, 32(\BREG)
- lxv vs29, 48(\BREG)
- xxspltw vs8, vs36, 1
- xxspltw vs9, vs36, 0
- .if \First==1
- xvmulsp vs0, vs26, vs8
- xvmulsp vs1, vs27, vs8
- xvmulsp vs0, vs28, vs9
- xvmulsp vs1, vs29, vs9
- .else
- xvmaddasp vs0, vs26, vs8
- xvmaddasp vs1, vs27, vs8
- xvmaddasp vs0, vs28, vs9
- xvmaddasp vs1, vs29, vs9
- .endif
- addi \AREG, \AREG, 8
- addi \BREG, \BREG, 64
- .endm
-
- .macro KERNEL8x1_I_4 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast
- lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG)
- xxspltw vs8, vs4, 3
- xxspltw vs9, vs4, 2
- xxspltw vs10, vs4, 1
- xxspltw vs11, vs4, 0
- lxv vs26, DISP32(\Index, 0+\OffsetB)(\BREG)
- lxv vs27, DISP32(\Index,16+\OffsetB)(\BREG)
- lxv vs28, DISP32(\Index,32+\OffsetB)(\BREG)
- lxv vs29, DISP32(\Index,48+\OffsetB)(\BREG)
- lxv vs30, DISP32(\Index,64+ 0+\OffsetB)(\BREG)
- lxv vs31, DISP32(\Index,64+16+\OffsetB)(\BREG)
- lxv vs32, DISP32(\Index,64+32+\OffsetB)(\BREG)
- lxv vs33, DISP32(\Index,64+48+\OffsetB)(\BREG)
- .if \First==1
- xvmulsp vs0, vs26, vs8
- xvmulsp vs1, vs27, vs8
- xvmulsp vs0, vs28, vs9
- xvmulsp vs1, vs29, vs9
- xvmulsp vs0, vs30, vs10
- xvmulsp vs1, vs31, vs10
- xvmulsp vs0, vs32, vs11
- xvmulsp vs1, vs33, vs11
- .else
- xvmaddasp vs0, vs26, vs8
- xvmaddasp vs1, vs27, vs8
- xvmaddasp vs0, vs28, vs9
- xvmaddasp vs1, vs29, vs9
- xvmaddasp vs0, vs30, vs10
- xvmaddasp vs1, vs31, vs10
- xvmaddasp vs0, vs32, vs11
- xvmaddasp vs1, vs33, vs11
- .endif
- .if \IsLast==1
- addi \AREG, \AREG, DISP4(\Index,16)
- addi \BREG, \BREG, DISP32(\Index,128)
- .endif
- .endm
-
- .macro SAVE8x1
- slwi T10, LDC , 1
- add T1, CO, LDC
- add T2, CO, T10
- add T3, T1, T10
- add T4, T2, T10
- add T5, T3, T10
- add T6, T4, T10
- add T7, T5, T10
- /* convert alpha_r from single to double for the scalar multiplies below */
- xscvspdp vs4,alpha_r
- /* note: v0 corresponds to vs32 (the lxssp targets below overlay vs32..vs46) */
- #if !defined(TRMMKERNEL)
- lxssp v0,0(CO)
- lxssp v2,0(T1)
- lxssp v4,0(T2)
- lxssp v6,0(T3)
- lxssp v8,0(T4)
- lxssp v10,0(T5)
- lxssp v12,0(T6)
- lxssp v14,0(T7)
- #endif
- xscvspdp vs24, vs0
- xxspltw vs25, vs0, 1
- xxspltw vs26, vs0, 2
- xxspltw vs27, vs0, 3
- xscvspdp vs25,vs25
- xscvspdp vs26,vs26
- xscvspdp vs27,vs27
- xscvspdp vs28, vs1
- xxspltw vs29, vs1, 1
- xxspltw vs30, vs1, 2
- xxspltw vs31, vs1, 3
- xscvspdp vs29,vs29
- xscvspdp vs30,vs30
- xscvspdp vs31,vs31
- #if defined(TRMMKERNEL)
- xsmuldp vs32,vs27, vs4
- xsmuldp vs34,vs26, vs4
- xsmuldp vs36,vs25, vs4
- xsmuldp vs38,vs24, vs4
- xsmuldp vs40,vs31, vs4
- xsmuldp vs42,vs30, vs4
- xsmuldp vs44,vs29, vs4
- xsmuldp vs46,vs28, vs4
- #else
- xsmaddadp vs32,vs27, vs4
- xsmaddadp vs34,vs26, vs4
- xsmaddadp vs36,vs25, vs4
- xsmaddadp vs38,vs24, vs4
- xsmaddadp vs40,vs31, vs4
- xsmaddadp vs42,vs30, vs4
- xsmaddadp vs44,vs29, vs4
- xsmaddadp vs46,vs28, vs4
- #endif
- stxssp v0,0(CO)
- stxssp v2,0(T1)
- stxssp v4,0(T2)
- stxssp v6,0(T3)
- stxssp v8,0(T4)
- stxssp v10,0(T5)
- stxssp v12,0(T6)
- stxssp v14,0(T7)
- addi CO,CO,4
- .endm
-
-
-
- /**********************************************************************************************
- * Macros for N=4 and M=16
- **********************************************************************************************/
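-
- /* Same layout as the 8x16 tile but with a single B vector per k: vs24 holds
-    b0..b3 and vs25..vs27 its rotated copies, accumulating into vs32..vs47. */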
-
- .macro LOAD4x16_1
- LOAD4x16 1
- .endm
-
- .macro LOAD4x16_0
- LOAD4x16 0
- .endm
-
- .macro KERNEL4x16_L1_L4 Index,IsLast
- KERNEL4x16_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0
- .endm
-
- .macro KERNEL4x16_I1_L4 OffsetA,OffsetB, Index,IsLast
- KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
- .endm
-
- .macro KERNEL4x16_I1_L4_2 OffsetA,OffsetB, Index,IsLast
- KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
- .endm
-
- .macro KERNEL4x16_I1_L4_3 OffsetA,OffsetB, Index,IsLast
- KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1
- .endm
- .macro KERNEL4x16_I1_L2_3 OffsetA,OffsetB, Index,IsLast
- KERNEL4x16_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1
- .endm
-
- .macro KERNEL4x16_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast
- KERNEL4x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0
- .endm
-
- .macro KERNEL4x16_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast
- KERNEL4x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1
- .endm
-
- .macro Zero4X16
- xxlxor vs32, vs32, vs32
- xxlxor vs33, vs33, vs33
- xxlxor vs34, vs34, vs34
- xxlxor vs35, vs35, vs35
- xxlxor vs36, vs36, vs36
- xxlxor vs37, vs37, vs37
- xxlxor vs38, vs38, vs38
- xxlxor vs39, vs39, vs39
- xxlxor vs40, vs40, vs40
- xxlxor vs41, vs41, vs41
- xxlxor vs42, vs42, vs42
- xxlxor vs43, vs43, vs43
- xxlxor vs44, vs44, vs44
- xxlxor vs45, vs45, vs45
- xxlxor vs46, vs46, vs46
- xxlxor vs47, vs47, vs47
- .endm
-
- .macro LOAD4x16 Zero
-
- lxv vs24, 0(BO)
- lxv vs0, 0(AO)
- lxv vs1, 16(AO)
- lxv vs2, 32(AO)
- lxv vs3, 48(AO)
- xxperm vs26, vs24, permute_mask
- xxpermdi vs25, vs24, vs24,2
- xxpermdi vs27, vs26, vs26,2
-
- .if \Zero==1
- xxlxor vs32, vs32, vs32
- xxlxor vs33, vs33, vs33
- xxlxor vs34, vs34, vs34
- xxlxor vs35, vs35, vs35
- xxlxor vs36, vs36, vs36
- xxlxor vs37, vs37, vs37
- xxlxor vs38, vs38, vs38
- xxlxor vs39, vs39, vs39
- xxlxor vs40, vs40, vs40
- xxlxor vs41, vs41, vs41
- xxlxor vs42, vs42, vs42
- xxlxor vs43, vs43, vs43
- xxlxor vs44, vs44, vs44
- xxlxor vs45, vs45, vs45
- xxlxor vs46, vs46, vs46
- xxlxor vs47, vs47, vs47
-
- .endif
- .endm
-
- .macro END4x16_NORMAL
- END4x16 0, AO, BO, 64,16
- .endm
-
- .macro END4x16 First, AREG, BREG, OffsetA, OffsetB
-
- .if \OffsetB != 0
- addi \BREG, \BREG, \OffsetB
- .endif
- .if \OffsetA != 0
- addi \AREG, \AREG, \OffsetA
- .endif
-
- .if \First==1
- xvmulsp vs32, vs0,vs24
- xvmulsp vs33, vs1,vs24
- xvmulsp vs34, vs2,vs24
- xvmulsp vs35, vs3,vs24
-
- xvmulsp vs36, vs0,vs25
- xvmulsp vs37, vs1,vs25
- xvmulsp vs38, vs2,vs25
- xvmulsp vs39, vs3,vs25
-
- xvmulsp vs40, vs0,vs26
- xvmulsp vs41, vs1,vs26
- xvmulsp vs42, vs2,vs26
- xvmulsp vs43, vs3,vs26
-
- xvmulsp vs44, vs0,vs27
- xvmulsp vs45, vs1,vs27
- xvmulsp vs46, vs2,vs27
- xvmulsp vs47, vs3,vs27
-
- .else
- xvmaddasp vs32, vs0,vs24
- xvmaddasp vs33, vs1,vs24
- xvmaddasp vs34, vs2,vs24
- xvmaddasp vs35, vs3,vs24
-
- xvmaddasp vs36, vs0,vs25
- xvmaddasp vs37, vs1,vs25
- xvmaddasp vs38, vs2,vs25
- xvmaddasp vs39, vs3,vs25
- xvmaddasp vs40, vs0,vs26
- xvmaddasp vs41, vs1,vs26
- xvmaddasp vs42, vs2,vs26
- xvmaddasp vs43, vs3,vs26
-
- xvmaddasp vs44, vs0,vs27
- xvmaddasp vs45, vs1,vs27
- xvmaddasp vs46, vs2,vs27
- xvmaddasp vs47, vs3,vs27
-
- .endif
- .endm
-
- .macro KERNEL4x16_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
-
- lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG)
-
- lxv vs4, DISP64(\Index, 0+\OffsetA)(\AREG)
- lxv vs5, DISP64(\Index,16+\OffsetA)(\AREG)
- lxv vs6, DISP64(\Index,32+\OffsetA)(\AREG)
- lxv vs7, DISP64(\Index,48+\OffsetA)(\AREG)
-
- xxperm vs10, vs8, permute_mask
- xxpermdi vs9, vs8, vs8,2
-
- xvmaddasp vs32, vs0,vs24
- xvmaddasp vs33, vs1,vs24
- xvmaddasp vs34, vs2,vs24
- xvmaddasp vs35, vs3,vs24
-
- xvmaddasp vs36, vs0,vs25
- xvmaddasp vs37, vs1,vs25
- xvmaddasp vs38, vs2,vs25
- xvmaddasp vs39, vs3,vs25
-
- xxpermdi vs11, vs10, vs10,2
-
- xvmaddasp vs40, vs0,vs26
- xvmaddasp vs41, vs1,vs26
- xvmaddasp vs42, vs2,vs26
- xvmaddasp vs43, vs3,vs26
-
- xvmaddasp vs44, vs0,vs27
- xvmaddasp vs45, vs1,vs27
- xvmaddasp vs46, vs2,vs27
- xvmaddasp vs47, vs3,vs27
-
-
-
- lxv vs24, DISP16(\Index,16+\OffsetB)(\BREG)
-
- lxv vs0, DISP64(\Index,64+\OffsetA)(\AREG)
- lxv vs1, DISP64(\Index,64+16+\OffsetA)(\AREG)
- lxv vs2, DISP64(\Index,64+32+\OffsetA)(\AREG)
- lxv vs3, DISP64(\Index,64+48+\OffsetA)(\AREG)
-
- xxperm vs26, vs24, permute_mask
- xxpermdi vs25, vs24, vs24,2
-
-
- xvmaddasp vs32, vs4,vs8
- xvmaddasp vs33, vs5,vs8
- xvmaddasp vs34, vs6,vs8
- xvmaddasp vs35, vs7,vs8
-
- xvmaddasp vs36, vs4,vs9
- xvmaddasp vs37, vs5,vs9
- xvmaddasp vs38, vs6,vs9
- xvmaddasp vs39, vs7,vs9
-
- xxpermdi vs27, vs26, vs26,2
-
- xvmaddasp vs40, vs4,vs10
- xvmaddasp vs41, vs5,vs10
- xvmaddasp vs42, vs6,vs10
- xvmaddasp vs43, vs7,vs10
-
- xvmaddasp vs44, vs4,vs11
- xvmaddasp vs45, vs5,vs11
- xvmaddasp vs46, vs6,vs11
- xvmaddasp vs47, vs7,vs11
-
-
- lxv vs8, DISP16(\Index,32+\OffsetB)(\BREG)
-
- lxv vs4, DISP64(\Index,128+0+\OffsetA)(\AREG)
- lxv vs5, DISP64(\Index,128+16+\OffsetA)(\AREG)
- lxv vs6, DISP64(\Index,128+32+\OffsetA)(\AREG)
- lxv vs7, DISP64(\Index,128+48+\OffsetA)(\AREG)
-
- xxperm vs10, vs8, permute_mask
- xxpermdi vs9, vs8, vs8,2
-
- xvmaddasp vs32, vs0,vs24
- xvmaddasp vs33, vs1,vs24
- xvmaddasp vs34, vs2,vs24
- xvmaddasp vs35, vs3,vs24
-
- xvmaddasp vs36, vs0,vs25
- xvmaddasp vs37, vs1,vs25
- xvmaddasp vs38, vs2,vs25
- xvmaddasp vs39, vs3,vs25
-
- xxpermdi vs11, vs10, vs10,2
-
- xvmaddasp vs40, vs0,vs26
- xvmaddasp vs41, vs1,vs26
- xvmaddasp vs42, vs2,vs26
- xvmaddasp vs43, vs3,vs26
-
- xvmaddasp vs44, vs0,vs27
- xvmaddasp vs45, vs1,vs27
- xvmaddasp vs46, vs2,vs27
- xvmaddasp vs47, vs3,vs27
-
-
-
- .if \Complete==0
- lxv vs24, DISP16(\Index,48+\OffsetB)(\BREG)
-
- lxv vs0, DISP64(\Index,192+\OffsetA)(\AREG)
- lxv vs1, DISP64(\Index,192+16+\OffsetA)(\AREG)
- lxv vs2, DISP64(\Index,192+32+\OffsetA)(\AREG)
- lxv vs3, DISP64(\Index,192+48+\OffsetA)(\AREG)
-
- xxperm vs26, vs24, permute_mask
- xxpermdi vs25, vs24, vs24,2
-
- .endif
- .if \IsLast==1
- .if \Complete==1
-
- addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB)
- addi \AREG, \AREG, DISP64(\Index,64*3+\OffsetA)
- .else
-
- addi \BREG, \BREG, DISP16(\Index,64)
- addi \AREG, \AREG, DISP64(\Index,256)
- .endif
- .endif
-
- xvmaddasp vs32, vs4,vs8
- xvmaddasp vs33, vs5,vs8
- xvmaddasp vs34, vs6,vs8
- xvmaddasp vs35, vs7,vs8
-
- xvmaddasp vs36, vs4,vs9
- xvmaddasp vs37, vs5,vs9
- xvmaddasp vs38, vs6,vs9
- xvmaddasp vs39, vs7,vs9
-
- .if \Complete==0
- xxpermdi vs27, vs26, vs26,2
-
- .endif
-
- xvmaddasp vs40, vs4,vs10
- xvmaddasp vs41, vs5,vs10
- xvmaddasp vs42, vs6,vs10
- xvmaddasp vs43, vs7,vs10
-
- xvmaddasp vs44, vs4,vs11
- xvmaddasp vs45, vs5,vs11
- xvmaddasp vs46, vs6,vs11
- xvmaddasp vs47, vs7,vs11
-
-
-
- .endm
-
- .macro KERNEL4x16 First
-
- LOAD4x16 0
- END4x16 \First, AO, BO, 64,16
- .endm
-
- .macro KERNEL4x16_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete
-
- lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG)
- lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG)
- lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG)
- lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG)
- lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG)
-
- xxperm vs10, vs8, permute_mask
- xxpermdi vs9, vs8, vs8,2
- .if \First==1
- xvmulsp vs32, vs0,vs24
- xvmulsp vs33, vs1,vs24
- xvmulsp vs34, vs2,vs24
- xvmulsp vs35, vs3,vs24
-
- xvmulsp vs36, vs0,vs25
- xvmulsp vs37, vs1,vs25
- xvmulsp vs38, vs2,vs25
- xvmulsp vs39, vs3,vs25
- .else
- xvmaddasp vs32, vs0,vs24
- xvmaddasp vs33, vs1,vs24
- xvmaddasp vs34, vs2,vs24
- xvmaddasp vs35, vs3,vs24
-
- xvmaddasp vs36, vs0,vs25
- xvmaddasp vs37, vs1,vs25
- xvmaddasp vs38, vs2,vs25
- xvmaddasp vs39, vs3,vs25
- .endif
-
- xxpermdi vs11, vs10, vs10,2
-
- .if \First==1
- xvmulsp vs40, vs0,vs26
- xvmulsp vs41, vs1,vs26
- xvmulsp vs42, vs2,vs26
- xvmulsp vs43, vs3,vs26
-
- xvmulsp vs44, vs0,vs27
- xvmulsp vs45, vs1,vs27
- xvmulsp vs46, vs2,vs27
- xvmulsp vs47, vs3,vs27
-
-
- .else
- xvmaddasp vs40, vs0,vs26
- xvmaddasp vs41, vs1,vs26
- xvmaddasp vs42, vs2,vs26
- xvmaddasp vs43, vs3,vs26
-
- xvmaddasp vs44, vs0,vs27
- xvmaddasp vs45, vs1,vs27
- xvmaddasp vs46, vs2,vs27
- xvmaddasp vs47, vs3,vs27
-
-
- .endif
- .if \Complete==0
- lxv vs24, DISP8(\Index,16+\OffsetB)(\BREG)
- lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG)
- lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG)
- lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG)
- lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG)
-
- xxperm vs26, vs24, permute_mask
- xxpermdi vs25, vs24, vs24,2
- .endif
- .if \IsLast==1
- .if \Complete==1
- addi \BREG, \BREG, DISP8(\Index,16+\OffsetB)
- addi \AREG, \AREG, DISP32(\Index,64+\OffsetA)
-
- .else
- addi \BREG, \BREG, DISP8(\Index,32)
- addi \AREG, \AREG, DISP32(\Index,128)
- .endif
- .endif
-
- .if \First==1
- xvmulsp vs32, vs4,vs8
- xvmulsp vs33, vs5,vs8
- xvmulsp vs34, vs6,vs8
- xvmulsp vs35, vs7,vs8
-
- xvmulsp vs36, vs4,vs9
- xvmulsp vs37, vs5,vs9
- xvmulsp vs38, vs6,vs9
- xvmulsp vs39, vs7,vs9
- .else
- xvmaddasp vs32, vs4,vs8
- xvmaddasp vs33, vs5,vs8
- xvmaddasp vs34, vs6,vs8
- xvmaddasp vs35, vs7,vs8
-
- xvmaddasp vs36, vs4,vs9
- xvmaddasp vs37, vs5,vs9
- xvmaddasp vs38, vs6,vs9
- xvmaddasp vs39, vs7,vs9
- .endif
-
- .if \Complete==0
- xxpermdi vs27, vs26, vs26,2
-
- .endif
- .if \First==1
- xvmulsp vs40, vs4,vs10
- xvmulsp vs41, vs5,vs10
- xvmulsp vs42, vs6,vs10
- xvmulsp vs43, vs7,vs10
-
- xvmulsp vs44, vs4,vs11
- xvmulsp vs45, vs5,vs11
- xvmulsp vs46, vs6,vs11
- xvmulsp vs47, vs7,vs11
-
-
-
- .else
- xvmaddasp vs40, vs4,vs10
- xvmaddasp vs41, vs5,vs10
- xvmaddasp vs42, vs6,vs10
- xvmaddasp vs43, vs7,vs10
-
- xvmaddasp vs44, vs4,vs11
- xvmaddasp vs45, vs5,vs11
- xvmaddasp vs46, vs6,vs11
- xvmaddasp vs47, vs7,vs11
-
-
-
- .endif
-
- .endm
-
-
- .macro SAVE4x16
-
- slwi T10, LDC , 1
- add T1, CO, LDC
-
- add T2, CO, T10
- add T3, T1, T10
-
-
-
- xxmrglw vs8, vs32, vs44
- xxmrglw vs10, vs36, vs40
-
- xxmrghw vs1, vs32, vs44
- xxmrghw vs0, vs36, vs40
-
- xxmrglw vs12, vs33, vs45
- xxmrglw vs14, vs37, vs41
-
- xxmrghw vs2, vs37, vs41
- xxmrghw vs3, vs33, vs45
-
- xxmrglw vs16, vs34, vs46
- xxmrglw vs18, vs38, vs42
-
- xxlor vs9, vs8, vs8
- xxlor vs11, vs10, vs10
-
- xxmrghw vs4, vs38, vs42
- xxmrghw vs5, vs34, vs46
-
- xxlor vs13, vs12, vs12
- xxlor vs15, vs14, vs14
-
- xxmrglw vs24, vs35, vs47
- xxmrglw vs26, vs39, vs43
-
- xxlor vs17, vs16, vs16
- xxlor vs19, vs18, vs18
-
- xxmrghw vs30, vs39, vs43
- xxmrghw vs31, vs35, vs47
-
- xxperm vs8, vs0, save_permute_1
- xxperm vs10, vs1, save_permute_1
- xxperm vs9, vs0, save_permute_2
- xxperm vs11, vs1, save_permute_2
-
- #ifndef TRMMKERNEL
- lxv vs32, 0(CO)
- lxv vs33, 16(CO)
- lxv vs34, 32(CO)
- lxv vs35, 48(CO)
- #endif
- xxlor vs25, vs24, vs24
- xxlor vs27, vs26, vs26
-
- #ifndef TRMMKERNEL
- lxv vs36, 0(T1)
- lxv vs37, 16(T1)
- lxv vs38, 32(T1)
- lxv vs39, 48(T1)
- #endif
- #ifndef TRMMKERNEL
- lxv vs40, 0(T2)
- lxv vs41, 16(T2)
- lxv vs42, 32(T2)
- lxv vs43, 48(T2)
- #endif
- #ifndef TRMMKERNEL
- lxv vs44, 0(T3)
- lxv vs45, 16(T3)
- lxv vs46, 32(T3)
- lxv vs47, 48(T3)
- #endif
-
- xxperm vs12, vs2, save_permute_1
- xxperm vs14, vs3, save_permute_1
-
- xxperm vs13, vs2, save_permute_2
- xxperm vs15, vs3, save_permute_2
-
- xxperm vs16, vs4, save_permute_1
- xxperm vs18, vs5, save_permute_1
-
- xxperm vs17, vs4, save_permute_2
- xxperm vs19, vs5, save_permute_2
-
- xxperm vs24, vs30, save_permute_1
- xxperm vs26, vs31, save_permute_1
-
- xxperm vs25, vs30, save_permute_2
- xxperm vs27, vs31, save_permute_2
-
-
- /* multiply add normal way */
-
- #ifdef TRMMKERNEL
- xvmulsp vs32, vs8, alpha_r
- xvmulsp vs33, vs12, alpha_r
- xvmulsp vs34, vs16, alpha_r
- xvmulsp vs35, vs24, alpha_r
- xvmulsp vs36, vs9, alpha_r
- xvmulsp vs37, vs13, alpha_r
- xvmulsp vs38, vs17, alpha_r
- xvmulsp vs39, vs25, alpha_r
- #else
- xvmaddasp vs32, vs8, alpha_r
- xvmaddasp vs33, vs12, alpha_r
- xvmaddasp vs34, vs16, alpha_r
- xvmaddasp vs35, vs24, alpha_r
- xvmaddasp vs36, vs9, alpha_r
- xvmaddasp vs37, vs13, alpha_r
- xvmaddasp vs38, vs17, alpha_r
- xvmaddasp vs39, vs25, alpha_r
- #endif
-
-
-
- #ifdef TRMMKERNEL
- xvmulsp vs40, vs10, alpha_r
- xvmulsp vs41, vs14, alpha_r
- xvmulsp vs42, vs18, alpha_r
- xvmulsp vs43, vs26, alpha_r
- xvmulsp vs44, vs11, alpha_r
- xvmulsp vs45, vs15, alpha_r
- xvmulsp vs46, vs19, alpha_r
- xvmulsp vs47, vs27, alpha_r
- #else
-
- xvmaddasp vs40, vs10, alpha_r
- xvmaddasp vs41, vs14, alpha_r
- xvmaddasp vs42, vs18, alpha_r
- xvmaddasp vs43, vs26, alpha_r
- xvmaddasp vs44, vs11, alpha_r
- xvmaddasp vs45, vs15, alpha_r
- xvmaddasp vs46, vs19, alpha_r
- xvmaddasp vs47, vs27, alpha_r
-
- #endif
-
- stxv vs32, 0(CO)
- stxv vs33, 16(CO)
- stxv vs34, 32(CO)
- stxv vs35, 48(CO)
-
- stxv vs36, 0(T1)
- stxv vs37, 16(T1)
- stxv vs38, 32(T1)
- stxv vs39, 48(T1)
-
- stxv vs40, 0(T2)
- stxv vs41, 16(T2)
- stxv vs42, 32(T2)
- stxv vs43, 48(T2)
- stxv vs44, 0(T3)
- stxv vs45, 16(T3)
- stxv vs46, 32(T3)
- stxv vs47, 48(T3)
-
- addi CO,CO,64
-
-
- .endm
-
-
-
- /**********************************************************************************************
- * Macros for N=4 and M=8
- **********************************************************************************************/
-
- .macro LOAD4x8_1
- LOAD4x8 1
- .endm
-
- .macro LOAD4x8_0
- LOAD4x8 0
- .endm
-
- .macro KERNEL4x8_L1_L4 Index,IsLast
- KERNEL4x8_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0
- .endm
-
- .macro KERNEL4x8_I1_L4 OffsetA,OffsetB, Index,IsLast
- KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
- .endm
-
- .macro KERNEL4x8_I1_L4_2 OffsetA,OffsetB, Index,IsLast
- KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
- .endm
-
- .macro KERNEL4x8_I1_L4_3 OffsetA,OffsetB, Index,IsLast
- KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1
- .endm
- .macro KERNEL4x8_I1_L2_3 OffsetA,OffsetB, Index,IsLast
- KERNEL4x8_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1
- .endm
-
- .macro KERNEL4x8_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast
- KERNEL4x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0
- .endm
-
- .macro KERNEL4x8_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast
- KERNEL4x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1
- .endm
-
- .macro END4x8_NORMAL
- END4x8 0, AO, BO, 32,16
- .endm
-
- .macro Zero4X8
- xxlxor vs32, vs32, vs32
- xxlxor vs33, vs33, vs33
-
- xxlxor vs36, vs36, vs36
- xxlxor vs37, vs37, vs37
-
- xxlxor vs40, vs40, vs40
- xxlxor vs41, vs41, vs41
-
- xxlxor vs44, vs44, vs44
- xxlxor vs45, vs45, vs45
-
- .endm
-
- .macro LOAD4x8 Zero
-
- lxv vs24, 0(BO)
- lxv vs0, 0(AO)
- lxv vs1, 16(AO)
-
- xxperm vs26, vs24, permute_mask
- xxpermdi vs25, vs24, vs24,2
-
- xxpermdi vs27, vs26, vs26,2
-
- .if \Zero==1
- xxlxor vs32, vs32, vs32
- xxlxor vs33, vs33, vs33
- xxlxor vs36, vs36, vs36
- xxlxor vs37, vs37, vs37
- xxlxor vs40, vs40, vs40
- xxlxor vs41, vs41, vs41
- xxlxor vs44, vs44, vs44
- xxlxor vs45, vs45, vs45
-
- .endif
- .endm
-
-
- .macro END4x8 First, AREG, BREG, OffsetA, OffsetB
-
- .if \OffsetB != 0
- addi \BREG, \BREG, \OffsetB
- .endif
- .if \OffsetA != 0
- addi \AREG, \AREG, \OffsetA
- .endif
-
- .if \First==1
- xvmulsp vs32, vs0,vs24
- xvmulsp vs33, vs1,vs24
-
- xvmulsp vs36, vs0,vs25
- xvmulsp vs37, vs1,vs25
-
- xvmulsp vs40, vs0,vs26
- xvmulsp vs41, vs1,vs26
-
- xvmulsp vs44, vs0,vs27
- xvmulsp vs45, vs1,vs27
-
-
- .else
- xvmaddasp vs32, vs0,vs24
- xvmaddasp vs33, vs1,vs24
-
- xvmaddasp vs36, vs0,vs25
- xvmaddasp vs37, vs1,vs25
-
- xvmaddasp vs40, vs0,vs26
- xvmaddasp vs41, vs1,vs26
-
- xvmaddasp vs44, vs0,vs27
- xvmaddasp vs45, vs1,vs27
-
-
- .endif
- .endm
-
- .macro KERNEL4x8_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
-
- lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG)
-
- lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG)
- lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG)
-
- xxperm vs10, vs8, permute_mask
- xxpermdi vs9, vs8, vs8,2
-
- xvmaddasp vs32, vs0,vs24
- xvmaddasp vs33, vs1,vs24
-
- xvmaddasp vs36, vs0,vs25
- xvmaddasp vs37, vs1,vs25
-
- xxpermdi vs11, vs10, vs10,2
-
- xvmaddasp vs40, vs0,vs26
- xvmaddasp vs41, vs1,vs26
-
- xvmaddasp vs44, vs0,vs27
- xvmaddasp vs45, vs1,vs27
-
-
-
- lxv vs24, DISP16(\Index,16+\OffsetB)(\BREG)
-
- lxv vs0, DISP32(\Index,32+\OffsetA)(\AREG)
- lxv vs1, DISP32(\Index,32+16+\OffsetA)(\AREG)
-
- xxperm vs26, vs24, permute_mask
- xxpermdi vs25, vs24, vs24,2
-
- xvmaddasp vs32, vs4,vs8
- xvmaddasp vs33, vs5,vs8
-
- xvmaddasp vs36, vs4,vs9
- xvmaddasp vs37, vs5,vs9
-
- xxpermdi vs27, vs26, vs26,2
-
- xvmaddasp vs40, vs4,vs10
- xvmaddasp vs41, vs5,vs10
-
- xvmaddasp vs44, vs4,vs11
- xvmaddasp vs45, vs5,vs11
-
-
-
- lxv vs8, DISP16(\Index,32+\OffsetB)(\BREG)
-
- lxv vs4, DISP32(\Index,64+0+\OffsetA)(\AREG)
- lxv vs5, DISP32(\Index,64+16+\OffsetA)(\AREG)
-
- xxperm vs10, vs8, permute_mask
- xxpermdi vs9, vs8, vs8,2
-
- xvmaddasp vs32, vs0,vs24
- xvmaddasp vs33, vs1,vs24
-
- xvmaddasp vs36, vs0,vs25
- xvmaddasp vs37, vs1,vs25
-
- xxpermdi vs11, vs10, vs10,2
-
- xvmaddasp vs40, vs0,vs26
- xvmaddasp vs41, vs1,vs26
-
- xvmaddasp vs44, vs0,vs27
- xvmaddasp vs45, vs1,vs27
-
-
-
- .if \Complete==0
- lxv vs24, DISP16(\Index,48+\OffsetB)(\BREG)
-
- lxv vs0, DISP32(\Index,96+\OffsetA)(\AREG)
- lxv vs1, DISP32(\Index,96+16+\OffsetA)(\AREG)
-
- xxperm vs26, vs24, permute_mask
- xxpermdi vs25, vs24, vs24,2
-
- .endif
- .if \IsLast==1
- .if \Complete==1
-
- addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB)
- addi \AREG, \AREG, DISP32(\Index,32*3+\OffsetA)
- .else
-
- addi \BREG, \BREG, DISP16(\Index,64)
- addi \AREG, \AREG, DISP32(\Index,128)
- .endif
- .endif
-
- xvmaddasp vs32, vs4,vs8
- xvmaddasp vs33, vs5,vs8
-
- xvmaddasp vs36, vs4,vs9
- xvmaddasp vs37, vs5,vs9
-
- .if \Complete==0
- xxpermdi vs27, vs26, vs26,2
-
- .endif
-
- xvmaddasp vs40, vs4,vs10
- xvmaddasp vs41, vs5,vs10
-
- xvmaddasp vs44, vs4,vs11
- xvmaddasp vs45, vs5,vs11
-
-
-
- .endm
-
- .macro KERNEL4x8 First
-
- LOAD4x8 0
- END4x8 \First, AO, BO, 32,16
- .endm
-
- .macro KERNEL4x8_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete
-
- lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG)
- lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG)
- lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG)
-
- xxperm vs10, vs8, permute_mask
- xxpermdi vs9, vs8, vs8,2
- .if \First==1
- xvmulsp vs32, vs0,vs24
- xvmulsp vs33, vs1,vs24
-
- xvmulsp vs36, vs0,vs25
- xvmulsp vs37, vs1,vs25
-
- .else
- xvmaddasp vs32, vs0,vs24
- xvmaddasp vs33, vs1,vs24
-
- xvmaddasp vs36, vs0,vs25
- xvmaddasp vs37, vs1,vs25
-
- .endif
-
- xxpermdi vs11, vs10, vs10,2
-
- .if \First==1
- xvmulsp vs40, vs0,vs26
- xvmulsp vs41, vs1,vs26
-
- xvmulsp vs44, vs0,vs27
- xvmulsp vs45, vs1,vs27
-
-
- .else
- xvmaddasp vs40, vs0,vs26
- xvmaddasp vs41, vs1,vs26
-
- xvmaddasp vs44, vs0,vs27
- xvmaddasp vs45, vs1,vs27
-
-
- .endif
- .if \Complete==0
- lxv vs24, DISP8(\Index,16+\OffsetB)(\BREG)
-
- lxv vs0, DISP16(\Index,32+\OffsetA)(\AREG)
- lxv vs1, DISP16(\Index,32+16+\OffsetA)(\AREG)
-
- xxperm vs26, vs24, permute_mask
- xxpermdi vs25, vs24, vs24,2
- .endif
- .if \IsLast==1
- .if \Complete==1
- addi \BREG, \BREG, DISP8(\Index,16+\OffsetB)
- addi \AREG, \AREG, DISP16(\Index,32+\OffsetA)
-
- .else
- addi \BREG, \BREG, DISP8(\Index,32)
- addi \AREG, \AREG, DISP16(\Index,64)
- .endif
- .endif
-
- .if \First==1
- xvmulsp vs32, vs4,vs8
- xvmulsp vs33, vs5,vs8
-
- xvmulsp vs36, vs4,vs9
- xvmulsp vs37, vs5,vs9
-
- .else
- xvmaddasp vs32, vs4,vs8
- xvmaddasp vs33, vs5,vs8
-
- xvmaddasp vs36, vs4,vs9
- xvmaddasp vs37, vs5,vs9
-
- .endif
-
- .if \Complete==0
- xxpermdi vs27, vs26, vs26,2
-
- .endif
- .if \First==1
- xvmulsp vs40, vs4,vs10
- xvmulsp vs41, vs5,vs10
-
- xvmulsp vs44, vs4,vs11
- xvmulsp vs45, vs5,vs11
-
- .else
- xvmaddasp vs40, vs4,vs10
- xvmaddasp vs41, vs5,vs10
-
- xvmaddasp vs44, vs4,vs11
- xvmaddasp vs45, vs5,vs11
-
- .endif
-
- .endm
-
-
- .macro SAVE4x8
-
- slwi T10, LDC , 1
- add T1, CO, LDC
-
- add T2, CO, T10
- add T3, T1, T10
-
-
-
- #ifndef TRMMKERNEL
- lxv vs34, 0(CO)
- lxv vs35, 16(CO)
- lxv vs38, 0(T1)
- lxv vs39, 16(T1)
- lxv vs42, 0(T2)
- lxv vs43, 16(T2)
- lxv vs46, 0(T3)
- lxv vs47, 16(T3)
-
-
- #endif
-
- xxmrglw vs8, vs32, vs44
- xxmrglw vs10, vs36, vs40
-
- xxmrghw vs1, vs32, vs44
- xxmrghw vs0, vs36, vs40
-
- xxmrglw vs12, vs33, vs45
- xxmrglw vs14, vs37, vs41
-
- xxmrghw vs2, vs37, vs41
- xxmrghw vs3, vs33, vs45
-
- xxlor vs9, vs8, vs8
- xxlor vs11, vs10, vs10
-
- xxlor vs13, vs12, vs12
- xxlor vs15, vs14, vs14
-
- xxperm vs8, vs0, save_permute_1
- xxperm vs10, vs1, save_permute_1
- xxperm vs9, vs0, save_permute_2
- xxperm vs11, vs1, save_permute_2
-
- xxperm vs12, vs2, save_permute_1
- xxperm vs14, vs3, save_permute_1
-
- xxperm vs13, vs2, save_permute_2
- xxperm vs15, vs3, save_permute_2
-
-
- /* multiply add normal way */
-
- #ifdef TRMMKERNEL
- xvmulsp vs34, vs8, alpha_r
- xvmulsp vs35, vs12, alpha_r
- xvmulsp vs38, vs9, alpha_r
- xvmulsp vs39, vs13, alpha_r
- xvmulsp vs42, vs10, alpha_r
- xvmulsp vs43, vs14, alpha_r
- xvmulsp vs46, vs11, alpha_r
- xvmulsp vs47, vs15, alpha_r
- #else
- xvmaddasp vs34, vs8, alpha_r
- xvmaddasp vs35, vs12, alpha_r
- xvmaddasp vs38, vs9, alpha_r
- xvmaddasp vs39, vs13, alpha_r
- xvmaddasp vs42, vs10, alpha_r
- xvmaddasp vs43, vs14, alpha_r
- xvmaddasp vs46, vs11, alpha_r
- xvmaddasp vs47, vs15, alpha_r
- #endif
-
-
- stxv vs34, 0(CO)
- stxv vs35, 16(CO)
- stxv vs38, 0(T1)
- stxv vs39, 16(T1)
- stxv vs42, 0(T2)
- stxv vs43, 16(T2)
- stxv vs46, 0(T3)
- stxv vs47, 16(T3)
-
-
- addi CO,CO,32
-
- .endm
-
-
- /**********************************************************************************************
- * Macros for N=4 and M=4
- **********************************************************************************************/
-
- .macro LOAD4x4_1
- LOAD4x4 1
- .endm
-
- .macro LOAD4x4_0
- LOAD4x4 0
- .endm
-
- .macro KERNEL4x4_L1_L4 Index,IsLast
- KERNEL4x4_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0
- .endm
-
- .macro KERNEL4x4_I1_L4 OffsetA,OffsetB, Index,IsLast
- KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
- .endm
-
- .macro KERNEL4x4_I1_L4_2 OffsetA,OffsetB, Index,IsLast
- KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
- .endm
-
- .macro KERNEL4x4_I1_L4_3 OffsetA,OffsetB, Index,IsLast
- KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1
- .endm
- .macro KERNEL4x4_I1_L2_3 OffsetA,OffsetB, Index,IsLast
- KERNEL4x4_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1
- .endm
-
- .macro KERNEL4x4_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast
- KERNEL4x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0
- .endm
-
- .macro KERNEL4x4_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast
- KERNEL4x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1
- .endm
-
- .macro Zero4X4
- xxlxor vs32, vs32, vs32
- xxlxor vs33, vs33, vs33
- xxlxor vs34, vs34, vs34
- xxlxor vs35, vs35, vs35
-
- .endm
-
- .macro LOAD4x4 Zero
-
- lxv vs0, 0(AO)
- lxv vs24, 0(BO)
-
-
-
- xxperm vs2, vs0, permute_mask
- xxpermdi vs1, vs0, vs0,2
- xxpermdi vs3, vs2, vs2,2
-
- .if \Zero==1
- xxlxor vs32, vs32, vs32
- xxlxor vs33, vs33, vs33
- xxlxor vs34, vs34, vs34
- xxlxor vs35, vs35, vs35
-
- .endif
- .endm
-
- .macro END4x4_NORMAL
- END4x4 0, AO, BO, 16,16
- .endm
-
- .macro END4x4 First, AREG, BREG, OffsetA, OffsetB
-
- .if \OffsetB != 0
- addi \BREG, \BREG, \OffsetB
- .endif
- .if \OffsetA != 0
- addi \AREG, \AREG, \OffsetA
- .endif
-
- .if \First==1
- xvmulsp vs32, vs24, vs0
- xvmulsp vs33, vs24, vs1
- xvmulsp vs34, vs24, vs2
- xvmulsp vs35, vs24, vs3
- .else
- xvmaddasp vs32, vs24, vs0
- xvmaddasp vs33, vs24, vs1
- xvmaddasp vs34, vs24, vs2
- xvmaddasp vs35, vs24, vs3
-
-
- .endif
- .endm
-
- .macro KERNEL4x4_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
-
- lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG)
- lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG)
-
- xxperm vs6, vs4, permute_mask
- xxpermdi vs5, vs4, vs4,2
- xxpermdi vs7, vs6, vs6,2
-
- xvmaddasp vs32, vs24, vs0
- xvmaddasp vs33, vs24, vs1
- xvmaddasp vs34, vs24, vs2
- xvmaddasp vs35, vs24, vs3
-
-
- lxv vs0, DISP16(\Index, 16+\OffsetA)(\AREG)
- lxv vs24, DISP16(\Index, 16+\OffsetB)(\BREG)
-
- xxperm vs2, vs0, permute_mask
- xxpermdi vs1, vs0, vs0,2
- xxpermdi vs3, vs2, vs2,2
-
- xvmaddasp vs32, vs26, vs4
- xvmaddasp vs33, vs26, vs5
- xvmaddasp vs34, vs26, vs6
- xvmaddasp vs35, vs26, vs7
-
-
-
- lxv vs4, DISP16(\Index, 32+\OffsetA)(\AREG)
- lxv vs26, DISP16(\Index, 32+\OffsetB)(\BREG)
-
- xxperm vs6, vs4, permute_mask
- xxpermdi vs5, vs4, vs4,2
- xxpermdi vs7, vs6, vs6,2
-
- xvmaddasp vs32, vs24, vs0
- xvmaddasp vs33, vs24, vs1
- xvmaddasp vs34, vs24, vs2
- xvmaddasp vs35, vs24, vs3
-
-
- .if \Complete==0
-
- lxv vs0, DISP16(\Index, 48+\OffsetA)(\AREG)
- lxv vs24, DISP16(\Index, 48+\OffsetB)(\BREG)
-
- xxperm vs2, vs0, permute_mask
- xxpermdi vs1, vs0, vs0,2
- xxpermdi vs3, vs2, vs2,2
- .endif
- xvmaddasp vs32, vs26, vs4
- xvmaddasp vs33, vs26, vs5
- xvmaddasp vs34, vs26, vs6
- xvmaddasp vs35, vs26, vs7
-
-
-
-
- .if \IsLast==1
- .if \Complete==1
- addi \AREG, \AREG, DISP16(\Index,16*3+\OffsetA)
- addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB)
-
- .else
- addi \AREG, \AREG, DISP16(\Index,64)
- addi \BREG, \BREG, DISP16(\Index,64)
-
- .endif
- .endif
-
-
- .endm
-
- .macro KERNEL4x4 First
- LOAD4x4 0
- END4x4 \First, AO, BO, 16,16
- .endm
-
- .macro KERNEL4x4_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete
-
- lxv vs4, DISP8(\Index, 0+\OffsetA)(\AREG)
- lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG)
-
- xxperm vs6, vs4, permute_mask
- xxpermdi vs5, vs4, vs4,2
- xxpermdi vs7, vs6, vs6,2
- .if \First==1
- xvmulsp vs32, vs24, vs0
- xvmulsp vs33, vs24, vs1
- xvmulsp vs34, vs24, vs2
- xvmulsp vs35, vs24, vs3
-
- .else
- xvmaddasp vs32, vs24, vs0
- xvmaddasp vs33, vs24, vs1
- xvmaddasp vs34, vs24, vs2
- xvmaddasp vs35, vs24, vs3
-
- .endif
-
- .if \Complete==0
-
- lxv vs0, DISP8(\Index, 16+\OffsetA)(\AREG)
- lxv vs24, DISP8(\Index, 16+\OffsetB)(\BREG)
-
- xxperm vs2, vs0, permute_mask
- xxpermdi vs1, vs0, vs0,2
- xxpermdi vs3, vs2, vs2,2
- .endif
-
- .if \First==1
- xvmulsp vs32, vs26, vs4
- xvmulsp vs33, vs26, vs5
- xvmulsp vs34, vs26, vs6
- xvmulsp vs35, vs26, vs7
-
-
- .else
- xvmaddasp vs32, vs26, vs4
- xvmaddasp vs33, vs26, vs5
- xvmaddasp vs34, vs26, vs6
- xvmaddasp vs35, vs26, vs7
-
- .endif
-
-
- .if \IsLast==1
- .if \Complete==1
- addi \AREG, \AREG, DISP8(\Index,16+\OffsetA)
- addi \BREG, \BREG, DISP8(\Index,16+\OffsetB)
-
- .else
- addi \AREG, \AREG, DISP8(\Index,32)
- addi \BREG, \BREG, DISP8(\Index,32)
-
- .endif
- .endif
-
-
- .endm
-
-
- .macro SAVE4x4
- slwi T10, LDC , 1
- add T1, CO, LDC
- #if !defined(TRMMKERNEL)
- lxv vs36, 0(CO)
- lxv vs37, 0(T1)
- #endif
- add T2, CO, T10
- add T3, T1, T10
- #if !defined(TRMMKERNEL)
- lxv vs38, 0(T2)
- lxv vs39, 0(T3)
- #endif
-
- xxmrglw vs0, vs35,vs32
- xxmrglw vs1, vs34,vs33
- xxmrglw vs4, vs32,vs35
- xxmrglw vs5, vs33,vs34
-
-
- xxmrghw vs2, vs35,vs32
- xxmrghw vs3, vs34,vs33
- xxmrghw vs6, vs32,vs35
- xxmrghw vs7, vs33,vs34
-
- xxmrgld vs24, vs1, vs0
- xxmrghd vs25,vs5,vs4
-
- xxmrgld vs26, vs2, vs3
- xxmrghd vs27,vs6,vs7
-
- #if defined(TRMMKERNEL)
- xvmulsp vs36, vs24, alpha_r
- xvmulsp vs37, vs25, alpha_r
- xvmulsp vs38, vs26, alpha_r
- xvmulsp vs39, vs27, alpha_r
- #else
- xvmaddasp vs36, vs24, alpha_r
- xvmaddasp vs37, vs25, alpha_r
- xvmaddasp vs38, vs26, alpha_r
- xvmaddasp vs39, vs27, alpha_r
- #endif
- stxv vs36, 0(CO)
- stxv vs37, 0(T1)
- stxv vs38, 0(T2)
- stxv vs39, 0(T3)
-
-
-
- addi CO,CO,16
- .endm
-
-
- /**********************************************************************************************
- * Macros for N=4 and M=2
- **********************************************************************************************/
-
-
- .macro KERNEL4x2_2 OffsetA,OffsetB, Index,IsLast
- KERNEL4x2_I_2 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast
- .endm
-
-
-
- .macro Zero4x2
- xxlxor vs0, vs0, vs0
- xxlxor vs2, vs2, vs2
-
- .endm
-
- .macro KERNEL4x2
- KERNEL4x2_1 AO,BO, 0, 0,0,0
- .endm
- .macro KERNEL4x2_1 AREG,BREG,First,OffsetA,OffsetB,Index
-
-
- lxsd v4, DISP2(\Index, 0+\OffsetA)(\AREG)
- lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG)
- xxspltw vs8, vs36, 0
- xxspltw vs9, vs36, 1
-
- .if \First==1
- xvmulsp vs0, vs26, vs8
- xvmulsp vs2, vs26, vs9
-
- .else
- xvmaddasp vs0, vs26, vs8
- xvmaddasp vs2, vs26, vs9
-
- .endif
-
- addi \AREG, \AREG, DISP2(\Index,8)
- addi \BREG, \BREG, DISP4(\Index,16)
-
- .endm
-
- .macro KERNEL4x2_I_2 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast
-
- lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG)
- lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG)
- lxv vs28, DISP8(\Index,16+\OffsetB)(\BREG)
- xxspltw vs8, vs4, 2
- xxspltw vs9, vs4, 3
- xxspltw vs10, vs4, 0
- xxspltw vs11, vs4, 1
-
- .if \First==1
- xvmulsp vs0, vs26, vs8
- xvmulsp vs2, vs26, vs9
-
-     xvmaddasp	vs0,	vs28,	vs10
-     xvmaddasp	vs2,	vs28,	vs11
- .else
- xvmaddasp vs0, vs26, vs8
- xvmaddasp vs2, vs26, vs9
-
- xvmaddasp vs0, vs28, vs10
- xvmaddasp vs2, vs28, vs11
- .endif
-
-
- .if \IsLast==1
- addi \AREG, \AREG, DISP4(\Index,16)
- addi \BREG, \BREG, DISP8(\Index,32)
- .endif
-
- .endm
-
-
- .macro SAVE4x2
- slwi T10, LDC , 1
- add T1, CO, LDC
- add T2, CO, T10
- add T3, T1, T10
- /*convert alpha_r for multiply*/
- xscvspdp vs4,alpha_r
- /* v0 corresponds to vs32, do not forget*/
- #if !defined(TRMMKERNEL)
- lxssp v0,0(CO)
- lxssp v1,4(CO)
-
- lxssp v2,0(T1)
- lxssp v3,4(T1)
-
- lxssp v4,0(T2)
- lxssp v5,4(T2)
-
- lxssp v6,0(T3)
- lxssp v7,4(T3)
-
-
- #endif
- xscvspdp vs5, vs2
- xxspltw vs6, vs2, 1
- xxspltw vs7, vs2, 2
- xxspltw vs8, vs2, 3
- xscvspdp vs6,vs6
- xscvspdp vs7,vs7
- xscvspdp vs8,vs8
-
- xscvspdp vs24, vs0
- xxspltw vs25, vs0, 1
- xxspltw vs26, vs0, 2
- xxspltw vs27, vs0, 3
- xscvspdp vs25,vs25
- xscvspdp vs26,vs26
- xscvspdp vs27,vs27
-
-
- #if defined(TRMMKERNEL)
- xsmuldp vs32,vs8, vs4
- xsmuldp vs33,vs27, vs4
-
- xsmuldp vs34,vs7, vs4
- xsmuldp vs35,vs26, vs4
-
- xsmuldp vs36,vs6, vs4
- xsmuldp vs37,vs25, vs4
-
- xsmuldp vs38,vs5, vs4
- xsmuldp vs39,vs24, vs4
-
-
- #else
- xsmaddadp vs32,vs8, vs4
- xsmaddadp vs33,vs27, vs4
-
- xsmaddadp vs34,vs7, vs4
- xsmaddadp vs35,vs26, vs4
-
- xsmaddadp vs36,vs6, vs4
- xsmaddadp vs37,vs25, vs4
-
- xsmaddadp vs38,vs5, vs4
- xsmaddadp vs39,vs24, vs4
-
-
- #endif
-
- stxssp v0,0(CO)
- stxssp v1,4(CO)
-
- stxssp v2,0(T1)
- stxssp v3,4(T1)
-
- stxssp v4,0(T2)
- stxssp v5,4(T2)
-
- stxssp v6,0(T3)
- stxssp v7,4(T3)
-
-
-
-
- addi CO,CO,8
- .endm
-
-
- /**********************************************************************************************
- * Macros for N=4 and M=1
- **********************************************************************************************/
- .macro KERNEL4x1_4 OffsetA,OffsetB, Index,IsLast
- KERNEL4x1_I_4 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast
- .endm
-
- .macro Zero4x1
- xxlxor vs0, vs0, vs0
- .endm
-
- .macro KERNEL4x1
- KERNEL4x1_1 AO,BO, 0
- .endm
-
- .macro KERNEL4x1_2
- KERNEL4x1_2_1 AO,BO, 0
- .endm
-
- .macro KERNEL4x1_1 AREG,BREG,First
- lxvwsx vs8, 0, \AREG
- lxv vs26, 0(\BREG)
- .if \First==1
- xvmulsp vs0, vs26, vs8
- .else
- xvmaddasp vs0, vs26, vs8
- .endif
- addi \AREG, \AREG, 4
- addi \BREG, \BREG, 16
- .endm
-
- .macro KERNEL4x1_2_1 AREG,BREG,First
- lxsd v4, 0(\AREG)
- lxv vs26, 0(\BREG)
- lxv vs28, 16(\BREG)
- xxspltw vs8, vs36, 1
- xxspltw vs9, vs36, 0
- .if \First==1
- xvmulsp vs0, vs26, vs8
-     xvmaddasp	vs0,	vs28,	vs9
- .else
- xvmaddasp vs0, vs26, vs8
- xvmaddasp vs0, vs28, vs9
- .endif
- addi \AREG, \AREG, 8
- addi \BREG, \BREG, 32
- .endm
-
- .macro KERNEL4x1_I_4 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast
- lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG)
- xxspltw vs8, vs4, 3
- xxspltw vs9, vs4, 2
- xxspltw vs10, vs4, 1
- xxspltw vs11, vs4, 0
- lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG)
- lxv vs28, DISP16(\Index,16+\OffsetB)(\BREG)
- lxv vs30, DISP16(\Index,32+\OffsetB)(\BREG)
- lxv vs32, DISP16(\Index,48+\OffsetB)(\BREG)
- .if \First==1
- xvmulsp vs0, vs26, vs8
-     xvmaddasp	vs0,	vs28,	vs9
-     xvmaddasp	vs0,	vs30,	vs10
-     xvmaddasp	vs0,	vs32,	vs11
- .else
- xvmaddasp vs0, vs26, vs8
- xvmaddasp vs0, vs28, vs9
- xvmaddasp vs0, vs30, vs10
- xvmaddasp vs0, vs32, vs11
- .endif
- .if \IsLast==1
- addi \AREG, \AREG, DISP4(\Index,16)
- addi \BREG, \BREG, DISP16(\Index,64)
- .endif
- .endm
-
- .macro SAVE4x1
- slwi T10, LDC , 1
- add T1, CO, LDC
- add T2, CO, T10
- add T3, T1, T10
- /*convert alpha_r for multiply*/
- xscvspdp vs4,alpha_r
- /* v0 corresponds to vs32, do not forget*/
- #if !defined(TRMMKERNEL)
- lxssp v0,0(CO)
- lxssp v2,0(T1)
- lxssp v4,0(T2)
- lxssp v6,0(T3)
- #endif
- xscvspdp vs24, vs0
- xxspltw vs25, vs0, 1
- xxspltw vs26, vs0, 2
- xxspltw vs27, vs0, 3
- xscvspdp vs25,vs25
- xscvspdp vs26,vs26
- xscvspdp vs27,vs27
-
- #if defined(TRMMKERNEL)
- xsmuldp vs32,vs27, vs4
- xsmuldp vs34,vs26, vs4
- xsmuldp vs36,vs25, vs4
- xsmuldp vs38,vs24, vs4
- #else
- xsmaddadp vs32,vs27, vs4
- xsmaddadp vs34,vs26, vs4
- xsmaddadp vs36,vs25, vs4
- xsmaddadp vs38,vs24, vs4
- #endif
- stxssp v0,0(CO)
- stxssp v2,0(T1)
- stxssp v4,0(T2)
- stxssp v6,0(T3)
- addi CO,CO,4
- .endm
-
- /****************************N=2 section*****************/
-
- .macro KERNEL2x16_2 OffsetA,OffsetB, Index,IsLast
- KERNEL2x16_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
- .endm
-
-
- .macro Zero2x16
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- xxlxor vs2, vs2, vs2
- xxlxor vs3, vs3, vs3
- xxlxor vs4, vs4, vs4
- xxlxor vs5, vs5, vs5
- xxlxor vs6, vs6, vs6
- xxlxor vs7, vs7, vs7
- .endm
-
- .macro KERNEL2x16
- KERNEL2x16_1 AO,BO, 0, 0,0,0
- .endm
- .macro KERNEL2x16_4 OffsetA,OffsetB, Index,IsLast
- KERNEL2x16_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
- .endm
-
- .macro KERNEL2x16_1 AREG,BREG,First,OffsetA,OffsetB,Index
-
-
- lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG)
- xxspltw vs8, vs36, 1
- xxspltw vs9, vs36, 0
- lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG)
- lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG)
- lxv vs28, DISP16(\Index, 32+\OffsetA)(\AREG)
- lxv vs29, DISP16(\Index,48+\OffsetA)(\AREG)
-
-
- .if \First==1
- xvmulsp vs0, vs26, vs8
- xvmulsp vs1, vs27, vs8
- xvmulsp vs2, vs28, vs8
- xvmulsp vs3, vs29, vs8
-
- xvmulsp vs4, vs26, vs9
- xvmulsp vs5, vs27, vs9
- xvmulsp vs6, vs28, vs9
- xvmulsp vs7, vs29, vs9
-
- .else
- xvmaddasp vs0, vs26, vs8
- xvmaddasp vs1, vs27, vs8
- xvmaddasp vs2, vs28, vs8
- xvmaddasp vs3, vs29, vs8
-
- xvmaddasp vs4, vs26, vs9
- xvmaddasp vs5, vs27, vs9
- xvmaddasp vs6, vs28, vs9
- xvmaddasp vs7, vs29, vs9
-
- .endif
-
- addi \BREG, \BREG, DISP2(\Index,8)
- addi \AREG, \AREG, DISP16(\Index,64)
-
- .endm
-
-
-
-
- .macro KERNEL2x16_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast
-
- lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG)
- lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG)
-
- lxv vs26, DISP64(\Index, 0+\OffsetA)(\AREG)
- lxv vs27, DISP64(\Index,16+\OffsetA)(\AREG)
- lxv vs28, DISP64(\Index,32+\OffsetA)(\AREG)
- lxv vs29, DISP64(\Index,48+\OffsetA)(\AREG)
-
- lxv vs16, DISP64(\Index,64+ 0+\OffsetA)(\AREG)
- lxv vs17, DISP64(\Index,64+ 16+\OffsetA)(\AREG)
- lxv vs18, DISP64(\Index,64+ 32+\OffsetA)(\AREG)
- lxv vs19, DISP64(\Index,64+ 48+\OffsetA)(\AREG)
-
- lxv vs30, DISP64(\Index,128+ 0+\OffsetA)(\AREG)
- lxv vs31, DISP64(\Index,128+ 16+\OffsetA)(\AREG)
- lxv vs32, DISP64(\Index,128+ 32+\OffsetA)(\AREG)
- lxv vs33, DISP64(\Index,128+ 48+\OffsetA)(\AREG)
-
- lxv vs34, DISP64(\Index,128+ 64+ 0+\OffsetA)(\AREG)
- lxv vs35, DISP64(\Index,128+ 64+ 16+\OffsetA)(\AREG)
- lxv vs36, DISP64(\Index,128+ 64+ 32+\OffsetA)(\AREG)
- lxv vs37, DISP64(\Index,128+ 64+ 48+\OffsetA)(\AREG)
-
- xxspltw vs8, vs38, 3
- xxspltw vs9, vs38, 2
- xxspltw vs10, vs38, 1
- xxspltw vs11, vs38, 0
-
- xxspltw vs12, vs39, 3
- xxspltw vs13, vs39, 2
- xxspltw vs14, vs39, 1
- xxspltw vs15, vs39, 0
-
-
- xvmaddasp vs0, vs26, vs8
- xvmaddasp vs1, vs27, vs8
- xvmaddasp vs2, vs28, vs8
- xvmaddasp vs3, vs29, vs8
-
- xvmaddasp vs4, vs26, vs9
- xvmaddasp vs5, vs27, vs9
- xvmaddasp vs6, vs28, vs9
- xvmaddasp vs7, vs29, vs9
-
- xvmaddasp vs0, vs16, vs10
- xvmaddasp vs1, vs17, vs10
- xvmaddasp vs2, vs18, vs10
- xvmaddasp vs3, vs19, vs10
-
- xvmaddasp vs4, vs16, vs11
- xvmaddasp vs5, vs17, vs11
- xvmaddasp vs6, vs18, vs11
- xvmaddasp vs7, vs19, vs11
-
- xvmaddasp vs0, vs30, vs12
- xvmaddasp vs1, vs31, vs12
- xvmaddasp vs2, vs32, vs12
- xvmaddasp vs3, vs33, vs12
-
- xvmaddasp vs4, vs30, vs13
- xvmaddasp vs5, vs31, vs13
- xvmaddasp vs6, vs32, vs13
- xvmaddasp vs7, vs33, vs13
-
- xvmaddasp vs0, vs34, vs14
- xvmaddasp vs1, vs35, vs14
- xvmaddasp vs2, vs36, vs14
- xvmaddasp vs3, vs37, vs14
-
- xvmaddasp vs4, vs34, vs15
- xvmaddasp vs5, vs35, vs15
- xvmaddasp vs6, vs36, vs15
- xvmaddasp vs7, vs37, vs15
-
-
- .if \IsLast==1
- addi \BREG, \BREG, DISP8(\Index,32)
- addi \AREG, \AREG, DISP64(\Index,256)
- .endif
-
- .endm
-
- .macro KERNEL2x16_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast
-
- lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG)
- xxspltw vs8, vs36, 3
- xxspltw vs9, vs36, 2
- xxspltw vs10, vs36, 1
- xxspltw vs11, vs36, 0
- lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG)
- lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG)
- lxv vs28, DISP32(\Index,32+\OffsetA)(\AREG)
- lxv vs29, DISP32(\Index,48+\OffsetA)(\AREG)
- lxv vs16, DISP32(\Index,64+ 0+\OffsetA)(\AREG)
- lxv vs17, DISP32(\Index,64+ 16+\OffsetA)(\AREG)
- lxv vs18, DISP32(\Index,64+ 32+\OffsetA)(\AREG)
- lxv vs19, DISP32(\Index,64+ 48+\OffsetA)(\AREG)
-
-
- xvmaddasp vs0, vs26, vs8
- xvmaddasp vs1, vs27, vs8
- xvmaddasp vs2, vs28, vs8
- xvmaddasp vs3, vs29, vs8
-
- xvmaddasp vs4, vs26, vs9
- xvmaddasp vs5, vs27, vs9
- xvmaddasp vs6, vs28, vs9
- xvmaddasp vs7, vs29, vs9
-
- xvmaddasp vs0, vs16, vs10
- xvmaddasp vs1, vs17, vs10
- xvmaddasp vs2, vs18, vs10
- xvmaddasp vs3, vs19, vs10
-
- xvmaddasp vs4, vs16, vs11
- xvmaddasp vs5, vs17, vs11
- xvmaddasp vs6, vs18, vs11
- xvmaddasp vs7, vs19, vs11
-
- .if \IsLast==1
- addi \BREG, \BREG, DISP4(\Index,16)
- addi \AREG, \AREG, DISP32(\Index,128)
- .endif
-
- .endm
-
-
- .macro SAVE2x16
-
- #ifndef TRMMKERNEL
- lxv vs16, 0(CO)
- lxv vs17, 16(CO)
- lxv vs18, 32(CO)
- lxv vs19, 48(CO)
- #endif
- add T1, CO, LDC
- #ifndef TRMMKERNEL
- lxv vs26, 0(T1)
- lxv vs27, 16(T1)
- lxv vs28, 32(T1)
- lxv vs29, 48(T1)
- #endif
-
- #if defined(TRMMKERNEL)
- xvmulsp vs16, vs0, alpha_r
- xvmulsp vs17, vs1, alpha_r
- xvmulsp vs18, vs2, alpha_r
- xvmulsp vs19, vs3, alpha_r
- xvmulsp vs26, vs4, alpha_r
- xvmulsp vs27, vs5, alpha_r
- xvmulsp vs28, vs6, alpha_r
- xvmulsp vs29, vs7, alpha_r
- #else
- xvmaddasp vs16, vs0, alpha_r
- xvmaddasp vs17, vs1, alpha_r
- xvmaddasp vs18, vs2, alpha_r
- xvmaddasp vs19, vs3, alpha_r
- xvmaddasp vs26, vs4, alpha_r
- xvmaddasp vs27, vs5, alpha_r
- xvmaddasp vs28, vs6, alpha_r
- xvmaddasp vs29, vs7, alpha_r
- #endif
- stxv vs16, 0(CO)
- stxv vs17, 16(CO)
- stxv vs18, 32(CO)
- stxv vs19, 48(CO)
-
- stxv vs26, 0(T1)
- stxv vs27, 16(T1)
- stxv vs28, 32(T1)
- stxv vs29, 48(T1)
-
- addi CO,CO,64
-
- .endm
-
- /* M=8 N=2 */
-
- .macro KERNEL2x8_2 OffsetA,OffsetB, Index,IsLast
- KERNEL2x8_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
- .endm
-
-
- .macro Zero2x8
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
-
- xxlxor vs4, vs4, vs4
- xxlxor vs5, vs5, vs5
-
- .endm
-
- .macro KERNEL2x8
- KERNEL2x8_1 AO,BO, 0, 0,0,0
- .endm
- .macro KERNEL2x8_4 OffsetA,OffsetB, Index,IsLast
- KERNEL2x8_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
- .endm
-
- .macro KERNEL2x8_1 AREG,BREG,First,OffsetA,OffsetB,Index
-
-
- lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG)
- xxspltw vs8, vs36, 1
- xxspltw vs9, vs36, 0
- lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG)
- lxv vs27, DISP8(\Index,16+\OffsetA)(\AREG)
-
-
- .if \First==1
- xvmulsp vs0, vs26, vs8
- xvmulsp vs1, vs27, vs8
-
- xvmulsp vs4, vs26, vs9
- xvmulsp vs5, vs27, vs9
-
- .else
- xvmaddasp vs0, vs26, vs8
- xvmaddasp vs1, vs27, vs8
-
- xvmaddasp vs4, vs26, vs9
- xvmaddasp vs5, vs27, vs9
-
- .endif
-
- addi \BREG, \BREG, DISP2(\Index,8)
- addi \AREG, \AREG, DISP8(\Index,32)
-
- .endm
-
-
-
-
- .macro KERNEL2x8_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast
-
- lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG)
- lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG)
-
- lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG)
- lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG)
-
- lxv vs16, DISP32(\Index,32+ 0+\OffsetA)(\AREG)
- lxv vs17, DISP32(\Index,32+ 16+\OffsetA)(\AREG)
-
- lxv vs30, DISP32(\Index,64+ 0+\OffsetA)(\AREG)
- lxv vs31, DISP32(\Index,64+ 16+\OffsetA)(\AREG)
-
- lxv vs34, DISP32(\Index, 96+ 0+\OffsetA)(\AREG)
- lxv vs35, DISP32(\Index, 96+ 16+\OffsetA)(\AREG)
-
- xxspltw vs8, vs38, 3
- xxspltw vs9, vs38, 2
- xxspltw vs10, vs38, 1
- xxspltw vs11, vs38, 0
-
- xxspltw vs12, vs39, 3
- xxspltw vs13, vs39, 2
- xxspltw vs14, vs39, 1
- xxspltw vs15, vs39, 0
-
-
-
- xvmaddasp vs0, vs26, vs8
- xvmaddasp vs1, vs27, vs8
- xvmaddasp vs4, vs26, vs9
- xvmaddasp vs5, vs27, vs9
-
-
- xvmaddasp vs0, vs16, vs10
- xvmaddasp vs1, vs17, vs10
- xvmaddasp vs4, vs16, vs11
- xvmaddasp vs5, vs17, vs11
-
-
- xvmaddasp vs0, vs30, vs12
- xvmaddasp vs1, vs31, vs12
- xvmaddasp vs4, vs30, vs13
- xvmaddasp vs5, vs31, vs13
-
- xvmaddasp vs0, vs34, vs14
- xvmaddasp vs1, vs35, vs14
- xvmaddasp vs4, vs34, vs15
- xvmaddasp vs5, vs35, vs15
-
-
-
- .if \IsLast==1
- addi \BREG, \BREG, DISP8(\Index,32)
- addi \AREG, \AREG, DISP32(\Index,128)
- .endif
-
- .endm
-
- .macro KERNEL2x8_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast
-
- lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG)
- xxspltw vs8, vs36, 3
- xxspltw vs9, vs36, 2
- xxspltw vs10, vs36, 1
- xxspltw vs11, vs36, 0
- lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG)
- lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG)
- lxv vs16, DISP16(\Index,32+\OffsetA)(\AREG)
- lxv vs17, DISP16(\Index,48+\OffsetA)(\AREG)
-
- xvmaddasp vs0, vs26, vs8
- xvmaddasp vs1, vs27, vs8
-
- xvmaddasp vs4, vs26, vs9
- xvmaddasp vs5, vs27, vs9
-
- xvmaddasp vs0, vs16, vs10
- xvmaddasp vs1, vs17, vs10
-
- xvmaddasp vs4, vs16, vs11
- xvmaddasp vs5, vs17, vs11
-
- .if \IsLast==1
- addi \BREG, \BREG, DISP4(\Index,16)
- addi \AREG, \AREG, DISP16(\Index,64)
- .endif
-
- .endm
-
-
- .macro SAVE2x8
-
- #ifndef TRMMKERNEL
- lxv vs16, 0(CO)
- lxv vs17, 16(CO)
- #endif
- add T1, CO, LDC
- #ifndef TRMMKERNEL
- lxv vs26, 0(T1)
- lxv vs27, 16(T1)
-
- #endif
-
- #if defined(TRMMKERNEL)
- xvmulsp vs16, vs0, alpha_r
- xvmulsp vs17, vs1, alpha_r
- xvmulsp vs26, vs4, alpha_r
- xvmulsp vs27, vs5, alpha_r
- #else
- xvmaddasp vs16, vs0, alpha_r
- xvmaddasp vs17, vs1, alpha_r
- xvmaddasp vs26, vs4, alpha_r
- xvmaddasp vs27, vs5, alpha_r
- #endif
-
- stxv vs16, 0(CO)
- stxv vs17, 16(CO)
-
-
- stxv vs26, 0(T1)
- stxv vs27, 16(T1)
-
- addi CO,CO,32
-
- .endm
-
-
- /* M=4 N=2 */
-
-
- .macro KERNEL2x4_2 OffsetA,OffsetB, Index,IsLast
- KERNEL2x4_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
- .endm
-
- /* we will aggregate on save: vs0+vs4 and vs1+vs5 */
- .macro Zero2x4
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
-
- xxlxor vs4, vs4, vs4
- xxlxor vs5, vs5, vs5
-
- .endm
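-
- /* Note (descriptive, not from the original source): the two accumulator
-    pairs cleared here are summed in SAVE2x4 (xvaddsp vs0,vs0,vs4 and
-    xvaddsp vs1,vs1,vs5) before alpha is applied, matching the aggregation
-    comment above. */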
-
- .macro KERNEL2x4
- KERNEL2x4_1 AO,BO, 0, 0,0,0
- .endm
- .macro KERNEL2x4_4 OffsetA,OffsetB, Index,IsLast
- KERNEL2x4_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
- .endm
-
- .macro KERNEL2x4_1 AREG,BREG,First,OffsetA,OffsetB,Index
-
-
- lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG)
- xxspltw vs8, vs36, 1
- xxspltw vs9, vs36, 0
- lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG)
-
-
- .if \First==1
- xvmulsp vs0, vs26, vs8
- xvmulsp vs1, vs26, vs9
-
- .else
- xvmaddasp vs0, vs26, vs8
- xvmaddasp vs1, vs26, vs9
- .endif
-
- addi \BREG, \BREG, DISP2(\Index,8)
- addi \AREG, \AREG, DISP4(\Index,16)
-
- .endm
-
-
-
-
- .macro KERNEL2x4_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast
-
- lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG)
- lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG)
-
- lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG)
- lxv vs16, DISP16(\Index,16+\OffsetA)(\AREG)
-
- lxv vs30, DISP16(\Index,32+ 0+\OffsetA)(\AREG)
- lxv vs34, DISP16(\Index,32+ 16+\OffsetA)(\AREG)
-
-
- xxspltw vs8, vs38, 3
- xxspltw vs9, vs38, 2
- xxspltw vs10, vs38, 1
- xxspltw vs11, vs38, 0
-
- xxspltw vs12, vs39, 3
- xxspltw vs13, vs39, 2
- xxspltw vs14, vs39, 1
- xxspltw vs15, vs39, 0
-
-
- xvmaddasp vs0, vs26, vs8
- xvmaddasp vs1, vs26, vs9
- xvmaddasp vs4, vs16, vs10
- xvmaddasp vs5, vs16, vs11
-
-
- xvmaddasp vs0, vs30, vs12
- xvmaddasp vs1, vs30, vs13
- xvmaddasp vs4, vs34, vs14
- xvmaddasp vs5, vs34, vs15
-
-
-
-
- .if \IsLast==1
- addi \BREG, \BREG, DISP8(\Index,32)
- addi \AREG, \AREG, DISP16(\Index,64)
- .endif
-
- .endm
-
- .macro KERNEL2x4_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast
-
- lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG)
- xxspltw vs8, vs36, 3
- xxspltw vs9, vs36, 2
- xxspltw vs10, vs36, 1
- xxspltw vs11, vs36, 0
- lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG)
- lxv vs16, DISP8(\Index, 16+\OffsetA)(\AREG)
-
- xvmaddasp vs0, vs26, vs8
- xvmaddasp vs1, vs26, vs9
- xvmaddasp vs4, vs16, vs10
- xvmaddasp vs5, vs16, vs11
-
- .if \IsLast==1
- addi \BREG, \BREG, DISP4(\Index,16)
- addi \AREG, \AREG, DISP8(\Index,32)
- .endif
-
- .endm
-
-
- .macro SAVE2x4
-
- #ifndef TRMMKERNEL
- lxv vs16, 0(CO)
- #endif
- add T1, CO, LDC
- #ifndef TRMMKERNEL
- lxv vs26, 0(T1)
-
- #endif
- /*aggregate vectors*/
- xvaddsp vs0,vs0,vs4
- xvaddsp vs1,vs1,vs5
- #if defined(TRMMKERNEL)
- xvmulsp vs16, vs0, alpha_r
- xvmulsp vs26, vs1, alpha_r
- #else
- xvmaddasp vs16, vs0, alpha_r
- xvmaddasp vs26, vs1, alpha_r
- #endif
-
- stxv vs16, 0(CO)
- stxv vs26, 0(T1)
-
- addi CO,CO,16
-
- .endm
-
-
- /* M=2 N=2: here we switch to an inner permute; before, permute_mask reversed the words as 3,2,1,0, now it will inner-reverse them as 1,0,3,2 */
- .macro SWITCH_PERMUTE_INNER
- xxpermdi permute_mask, permute_mask, permute_mask,2
- .endm
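-
- /* Illustrative sketch (not from the original source): if a register holds
-    the words {w0,w1,w2,w3}, xxperm with the original permute_mask yields
-    {w3,w2,w1,w0}; after SWITCH_PERMUTE_INNER swaps the mask's doublewords,
-    the same xxperm yields {w1,w0,w3,w2}, i.e. the swap now happens inside
-    each 64-bit half, as the comment above describes. */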
-
- .macro Zero2x2
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- SWITCH_PERMUTE_INNER
- .endm
-
- .macro KERNEL2x2
- KERNEL2x2_1 AO,BO, 0, 0,0,0
- .endm
- .macro KERNEL2x2_4 OffsetA,OffsetB, Index,IsLast
- KERNEL2x2_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
- .endm
-
- .macro KERNEL2x2_2 OffsetA,OffsetB, Index,IsLast
- KERNEL2x2_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
- .endm
-
- .macro KERNEL2x2_1 AREG,BREG,First,OffsetA,OffsetB,Index
-
-
- lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG)
- xxperm vs9, vs36, permute_mask
- lxsd v5, DISP2(\Index, 0+\OffsetA)(\AREG)
-
-
- .if \First==1
- xvmulsp vs0, vs37, vs36
- xvmulsp vs1, vs37, vs9
-
- .else
- xvmaddasp vs0, vs37, vs36
- xvmaddasp vs1, vs37, vs9
- .endif
-
- addi \BREG, \BREG, DISP2(\Index,8)
- addi \AREG, \AREG, DISP2(\Index,8)
-
- .endm
-
-
-
-
- .macro KERNEL2x2_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast
-
- lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG)
- lxv vs10, DISP8(\Index, 16+\OffsetB)(\BREG)
-
- lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG)
- lxv vs16, DISP8(\Index,16+\OffsetA)(\AREG)
-
-
- xxperm vs9, vs8, permute_mask
- xxperm vs11, vs10, permute_mask
-
-
-
- xvmaddasp vs0, vs26, vs8
- xvmaddasp vs1, vs26, vs9
- xvmaddasp vs0, vs16, vs10
- xvmaddasp vs1, vs16, vs11
-
-
-
- .if \IsLast==1
- addi \BREG, \BREG, DISP8(\Index,32)
- addi \AREG, \AREG, DISP8(\Index,32)
- .endif
-
- .endm
-
- .macro KERNEL2x2_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast
-
- lxv vs8, DISP4(\Index, 0+\OffsetB)(\BREG)
- lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG)
-
-
- xxperm vs9, vs8, permute_mask
-
-
- xvmaddasp vs0, vs26, vs8
- xvmaddasp vs1, vs26, vs9
-
- .if \IsLast==1
- addi \BREG, \BREG, DISP4(\Index,16)
- addi \AREG, \AREG, DISP4(\Index,16)
- .endif
- .endm
-
-
- .macro SAVE2x2
-
- #ifndef TRMMKERNEL
- lxsd v4 , 0(CO)
- #endif
- add T1, CO, LDC
- #ifndef TRMMKERNEL
- lxsd v5 , 0(T1)
-
- #endif
- /*aggregate vectors*/
- xxpermdi vs4,vs0,vs0,2
- xxpermdi vs5,vs1,vs1,2
- xvaddsp vs0,vs0,vs4
- xvaddsp vs1,vs1,vs5
- /* let's correct the order to 00,10 and 01,11 from {00,11} {01,10} */
- xxperm vs1,vs1, permute_mask
-
-
- xxmrghw vs2 ,vs1,vs0
- xxpermdi vs2,vs2,vs2,2
- xxmrghw vs3 ,vs0,vs1
- #if defined(TRMMKERNEL)
- xvmulsp vs36, vs2, alpha_r
- xvmulsp vs37, vs3, alpha_r
- #else
- xvmaddasp vs36, vs2, alpha_r
- xvmaddasp vs37, vs3, alpha_r
- #endif
- /**** store the two result doublewords */
-
-
- stxsd v4, 0(CO)
- stxsd v5, 0(T1)
-
- addi CO,CO,8
-
- .endm
-
- /*--------------------------- M=1 N=2 */
- .macro Zero2x1
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- xxlxor vs2,vs2,vs2
- xxlxor vs3,vs3,vs3
- .endm
-
- .macro KERNEL2x1
- KERNEL2x1_1 AO,BO, 0, 0,0,0
- .endm
- .macro KERNEL2x1_4 OffsetA,OffsetB, Index,IsLast
- KERNEL2x1_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
- .endm
-
- .macro KERNEL2x1_2 OffsetA,OffsetB, Index,IsLast
- KERNEL2x1_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
- .endm
- /*
-  we will calculate 1 k-iteration alone, then add it to the batched ones
- */
- .macro KERNEL2x1_1 AREG,BREG,First,OffsetA,OffsetB,Index
-
-
- lxssp v3, DISP2(\Index, 0+\OffsetB)(\BREG)
- lxssp v4, DISP2(\Index, 4+\OffsetB)(\BREG)
- lxssp v5, DISP1(\Index, 0+\OffsetA)(\AREG)
-
-
- .if \First==1
- xvmulsp vs2, vs37, vs35
- xvmulsp vs3, vs37, vs36
-
- .else
- xsmaddadp vs2, vs37, vs35
- xsmaddadp vs3, vs37, vs36
- .endif
-
- addi \BREG, \BREG, DISP2(\Index,8)
- addi \AREG, \AREG, DISP1(\Index,4)
-
- .endm
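-
- /* Note (descriptive, not from the original source): the scalar
-    double-precision sums kept in vs2/vs3 here and in KERNEL2x1_I_2 are
-    combined with the reduced vector sums from KERNEL2x1_I_4 in SAVE2x1. */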
-
-
-
-
- .macro KERNEL2x1_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast
-
- lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG)
- lxv vs10, DISP8(\Index, 16+\OffsetB)(\BREG)
-
- lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG)
-
- xxmrglw vs5, vs26,vs26
- xxmrghw vs6, vs26,vs26
-
- xvmaddasp vs0, vs8, vs5
- xvmaddasp vs1, vs10, vs6
-
-
- .if \IsLast==1
- addi \BREG, \BREG, DISP8(\Index,32)
- addi \AREG, \AREG, DISP4(\Index,16)
- .endif
-
- .endm
-
- .macro KERNEL2x1_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast
-
- lxssp v3, DISP4(\Index, 0+\OffsetB)(\BREG)
- lxssp v4, DISP4(\Index, 4+\OffsetB)(\BREG)
- lxssp v7, DISP4(\Index, 8+\OffsetB)(\BREG)
- lxssp v8, DISP4(\Index, 12+\OffsetB)(\BREG)
- lxssp v5, DISP2(\Index, 0+\OffsetA)(\AREG)
- lxssp v6, DISP2(\Index, 4+\OffsetA)(\AREG)
-
-
- xsmaddadp vs2, vs37, vs35
- xsmaddadp vs3, vs37, vs36
-
- xsmaddadp vs2, vs38, vs39
- xsmaddadp vs3, vs38, vs40
-
-
- addi \BREG, \BREG, DISP4(\Index,16)
- addi \AREG, \AREG, DISP2(\Index,8)
- .endm
-
-
- .macro SAVE2x1
-
- #ifndef TRMMKERNEL
- lxssp v4 , 0(CO)
- #endif
- add T1, CO, LDC
- #ifndef TRMMKERNEL
- lxssp v5 , 0(T1)
-
- #endif
-
- /*convert alpha_r for multiply*/
- xscvspdp vs16,alpha_r
-
- /* reduce the 2x1_4 vector accumulators */
- xxpermdi vs4,vs0,vs0,2
- xxpermdi vs5,vs1,vs1,2
- xvaddsp vs0,vs0,vs4
- xvaddsp vs1,vs1,vs5
- xvaddsp vs0,vs0,vs1
- /* combine the 2x1_2/2x1_1 scalar sums with the reduced 2x1_4 result */
- xscvspdp vs5, vs0
- xxspltw vs6, vs0, 1
- xscvspdp vs6,vs6
- xsadddp vs2,vs2,vs6
- xsadddp vs3,vs3,vs5
-
- /**** store last two words*/
- #if defined(TRMMKERNEL)
- xsmuldp vs36,vs2, vs16
- xsmuldp vs37,vs3, vs16
-
- #else
- xsmaddadp vs36,vs2, vs16
- xsmaddadp vs37,vs3, vs16
- #endif
-
- stxssp v4, 0(CO)
- stxssp v5, 0(T1)
-
- addi CO,CO,4
-
- .endm
-
-
-
- /****************************N=1 section*****************/
-
- .macro KERNEL1x16_2 OffsetA,OffsetB, Index,IsLast
- KERNEL1x16_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
- .endm
-
-
- .macro Zero1x16
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- xxlxor vs2, vs2, vs2
- xxlxor vs3, vs3, vs3
- .endm
-
- .macro KERNEL1x16
- KERNEL1x16_1 AO,BO, 0, 0,0,0
- .endm
- .macro KERNEL1x16_4 OffsetA,OffsetB, Index,IsLast
- KERNEL1x16_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
- .endm
-
- .macro KERNEL1x16_1 AREG,BREG,First,OffsetA,OffsetB,Index
-
-
- lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG)
- xscvdpspn vs36,vs36
- xxspltw vs8, vs36, 0
- lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG)
- lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG)
- lxv vs28, DISP16(\Index, 32+\OffsetA)(\AREG)
- lxv vs29, DISP16(\Index,48+\OffsetA)(\AREG)
-
-
- .if \First==1
- xvmulsp vs0, vs26, vs8
- xvmulsp vs1, vs27, vs8
- xvmulsp vs2, vs28, vs8
- xvmulsp vs3, vs29, vs8
-
-
- .else
- xvmaddasp vs0, vs26, vs8
- xvmaddasp vs1, vs27, vs8
- xvmaddasp vs2, vs28, vs8
- xvmaddasp vs3, vs29, vs8
-
- .endif
-
- addi \BREG, \BREG, DISP1(\Index,4)
- addi \AREG, \AREG, DISP16(\Index,64)
-
- .endm
-
-
-
-
- .macro KERNEL1x16_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast
-
- lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG)
-
- lxv vs26, DISP64(\Index, 0+\OffsetA)(\AREG)
- lxv vs27, DISP64(\Index,16+\OffsetA)(\AREG)
- lxv vs28, DISP64(\Index,32+\OffsetA)(\AREG)
- lxv vs29, DISP64(\Index,48+\OffsetA)(\AREG)
-
- lxv vs16, DISP64(\Index,64+ 0+\OffsetA)(\AREG)
- lxv vs17, DISP64(\Index,64+ 16+\OffsetA)(\AREG)
- lxv vs18, DISP64(\Index,64+ 32+\OffsetA)(\AREG)
- lxv vs19, DISP64(\Index,64+ 48+\OffsetA)(\AREG)
-
- xxspltw vs8, vs38, 3
- xxspltw vs9, vs38, 2
-
- lxv vs30, DISP64(\Index,128+ 0+\OffsetA)(\AREG)
- lxv vs31, DISP64(\Index,128+ 16+\OffsetA)(\AREG)
- lxv vs32, DISP64(\Index,128+ 32+\OffsetA)(\AREG)
- lxv vs33, DISP64(\Index,128+ 48+\OffsetA)(\AREG)
-
- lxv vs34, DISP64(\Index,128+ 64+ 0+\OffsetA)(\AREG)
- lxv vs35, DISP64(\Index,128+ 64+ 16+\OffsetA)(\AREG)
- lxv vs36, DISP64(\Index,128+ 64+ 32+\OffsetA)(\AREG)
- lxv vs37, DISP64(\Index,128+ 64+ 48+\OffsetA)(\AREG)
-
- xxspltw vs10, vs38, 1
- xxspltw vs11, vs38, 0
-
-
- xvmaddasp vs0, vs26, vs8
- xvmaddasp vs1, vs27, vs8
- xvmaddasp vs2, vs28, vs8
- xvmaddasp vs3, vs29, vs8
-
-
- xvmaddasp vs0, vs16, vs9
- xvmaddasp vs1, vs17, vs9
- xvmaddasp vs2, vs18, vs9
- xvmaddasp vs3, vs19, vs9
-
-
- xvmaddasp vs0, vs30, vs10
- xvmaddasp vs1, vs31, vs10
- xvmaddasp vs2, vs32, vs10
- xvmaddasp vs3, vs33, vs10
-
-
- xvmaddasp vs0, vs34, vs11
- xvmaddasp vs1, vs35, vs11
- xvmaddasp vs2, vs36, vs11
- xvmaddasp vs3, vs37, vs11
-
-
-
-
- .if \IsLast==1
- addi \BREG, \BREG, DISP4(\Index,16)
- addi \AREG, \AREG, DISP64(\Index,256)
- .endif
-
- .endm
-
- .macro KERNEL1x16_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast
-
- lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG)
- xxspltw vs8, vs36, 1
- xxspltw vs9, vs36, 0
- lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG)
- lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG)
- lxv vs28, DISP32(\Index,32+\OffsetA)(\AREG)
- lxv vs29, DISP32(\Index,48+\OffsetA)(\AREG)
- lxv vs16, DISP32(\Index,64+ 0+\OffsetA)(\AREG)
- lxv vs17, DISP32(\Index,64+ 16+\OffsetA)(\AREG)
- lxv vs18, DISP32(\Index,64+ 32+\OffsetA)(\AREG)
- lxv vs19, DISP32(\Index,64+ 48+\OffsetA)(\AREG)
-
-
- xvmaddasp vs0, vs26, vs8
- xvmaddasp vs1, vs27, vs8
- xvmaddasp vs2, vs28, vs8
- xvmaddasp vs3, vs29, vs8
-
-
- xvmaddasp vs0, vs16, vs9
- xvmaddasp vs1, vs17, vs9
- xvmaddasp vs2, vs18, vs9
- xvmaddasp vs3, vs19, vs9
-
-
- .if \IsLast==1
- addi \BREG, \BREG, DISP2(\Index,8)
- addi \AREG, \AREG, DISP32(\Index,128)
- .endif
-
- .endm
-
-
- .macro SAVE1x16
-
- #ifndef TRMMKERNEL
- lxv vs16, 0(CO)
- lxv vs17, 16(CO)
- lxv vs18, 32(CO)
- lxv vs19, 48(CO)
- #endif
-
-
- #if defined(TRMMKERNEL)
- xvmulsp vs16, vs0, alpha_r
- xvmulsp vs17, vs1, alpha_r
- xvmulsp vs18, vs2, alpha_r
- xvmulsp vs19, vs3, alpha_r
- #else
- xvmaddasp vs16, vs0, alpha_r
- xvmaddasp vs17, vs1, alpha_r
- xvmaddasp vs18, vs2, alpha_r
- xvmaddasp vs19, vs3, alpha_r
- #endif
- stxv vs16, 0(CO)
- stxv vs17, 16(CO)
- stxv vs18, 32(CO)
- stxv vs19, 48(CO)
-
- addi CO,CO,64
-
- .endm
-
- /* M=8 N=1 */
-
- .macro KERNEL1x8_2 OffsetA,OffsetB, Index,IsLast
- KERNEL1x8_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
- .endm
-
-
- .macro Zero1x8
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- xxlxor vs2, vs2, vs2
- xxlxor vs3, vs3, vs3
- .endm
-
- .macro KERNEL1x8
- KERNEL1x8_1 AO,BO, 0, 0,0,0
- .endm
- .macro KERNEL1x8_4 OffsetA,OffsetB, Index,IsLast
- KERNEL1x8_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
- .endm
-
- .macro KERNEL1x8_1 AREG,BREG,First,OffsetA,OffsetB,Index
-
-
- lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG)
- xscvdpspn vs36,vs36
- xxspltw vs8, vs36, 0
- lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG)
- lxv vs27, DISP8(\Index,16+\OffsetA)(\AREG)
-
-
- .if \First==1
- xvmulsp vs0, vs26, vs8
- xvmulsp vs1, vs27, vs8
-
-
- .else
- xvmaddasp vs0, vs26, vs8
- xvmaddasp vs1, vs27, vs8
-
- .endif
-
- addi \BREG, \BREG, DISP1(\Index,4)
- addi \AREG, \AREG, DISP8(\Index,32)
-
- .endm
-
-
-
-
- .macro KERNEL1x8_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast
-
- lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG)
-
- lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG)
- lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG)
-
- lxv vs16, DISP32(\Index,32+ 0+\OffsetA)(\AREG)
- lxv vs17, DISP32(\Index,32+ 16+\OffsetA)(\AREG)
-
- xxspltw vs8, vs38, 3
- xxspltw vs9, vs38, 2
-
- lxv vs30, DISP32(\Index,64+ 0+\OffsetA)(\AREG)
- lxv vs31, DISP32(\Index,64+ 16+\OffsetA)(\AREG)
-
- lxv vs34, DISP32(\Index,64+ 32+ 0+\OffsetA)(\AREG)
- lxv vs35, DISP32(\Index,64+ 32+ 16+\OffsetA)(\AREG)
-
- xxspltw vs10, vs38, 1
- xxspltw vs11, vs38, 0
-
-
- xvmaddasp vs0, vs26, vs8
- xvmaddasp vs1, vs27, vs8
-
-
- xvmaddasp vs2, vs16, vs9
- xvmaddasp vs3, vs17, vs9
-
-
- xvmaddasp vs0, vs30, vs10
- xvmaddasp vs1, vs31, vs10
-
-
- xvmaddasp vs2, vs34, vs11
- xvmaddasp vs3, vs35, vs11
-
-
-
-
- .if \IsLast==1
- addi \BREG, \BREG, DISP4(\Index,16)
- addi \AREG, \AREG, DISP32(\Index,128)
- .endif
-
- .endm
-
- .macro KERNEL1x8_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast
-
- lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG)
- xxspltw vs8, vs36, 1
- xxspltw vs9, vs36, 0
- lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG)
- lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG)
- lxv vs16, DISP16(\Index,32+ 0+\OffsetA)(\AREG)
- lxv vs17, DISP16(\Index,32+ 16+\OffsetA)(\AREG)
-
-
- xvmaddasp vs0, vs26, vs8
- xvmaddasp vs1, vs27, vs8
-
-
- xvmaddasp vs2, vs16, vs9
- xvmaddasp vs3, vs17, vs9
-
-
- .if \IsLast==1
- addi \BREG, \BREG, DISP2(\Index,8)
- addi \AREG, \AREG, DISP16(\Index,64)
- .endif
-
- .endm
-
-
- .macro SAVE1x8
-
- #ifndef TRMMKERNEL
- lxv vs16, 0(CO)
- lxv vs17, 16(CO)
- #endif
- /* aggregate: vs0+vs2 and vs1+vs3 */
- xvaddsp vs0,vs0,vs2
- xvaddsp vs1,vs1,vs3
- #if defined(TRMMKERNEL)
- xvmulsp vs16, vs0, alpha_r
- xvmulsp vs17, vs1, alpha_r
- #else
- xvmaddasp vs16, vs0, alpha_r
- xvmaddasp vs17, vs1, alpha_r
- #endif
- stxv vs16, 0(CO)
- stxv vs17, 16(CO)
-
- addi CO,CO,32
-
- .endm
- /* M=4 N=1 */
-
- .macro KERNEL1x4_2 OffsetA,OffsetB, Index,IsLast
- KERNEL1x4_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
- .endm
-
-
- .macro Zero1x4
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- xxlxor vs2, vs2, vs2
- xxlxor vs3, vs3, vs3
- .endm
-
- .macro KERNEL1x4
- KERNEL1x4_1 AO,BO, 0, 0,0,0
- .endm
- .macro KERNEL1x4_4 OffsetA,OffsetB, Index,IsLast
- KERNEL1x4_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
- .endm
-
- .macro KERNEL1x4_1 AREG,BREG,First,OffsetA,OffsetB,Index
-
-
- lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG)
- xscvdpspn vs36,vs36
- xxspltw vs8, vs36, 0
- lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG)
-
-
- .if \First==1
- xvmulsp vs0, vs26, vs8
- .else
- xvmaddasp vs0, vs26, vs8
-
- .endif
-
- addi \BREG, \BREG, DISP1(\Index,4)
- addi \AREG, \AREG, DISP4(\Index,16)
-
- .endm
-
-
-
-
- .macro KERNEL1x4_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast
-
- lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG)
-
- lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG)
- lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG)
-
-
- xxspltw vs8, vs38, 3
- xxspltw vs9, vs38, 2
-
- lxv vs30, DISP16(\Index,32+ 0+\OffsetA)(\AREG)
- lxv vs31, DISP16(\Index,32+ 16+\OffsetA)(\AREG)
-
-
- xxspltw vs10, vs38, 1
- xxspltw vs11, vs38, 0
-
-
- xvmaddasp vs0, vs26, vs8
-
- xvmaddasp vs1, vs27, vs9
-
- xvmaddasp vs2, vs30, vs10
-
-
- xvmaddasp vs3, vs31, vs11
-
-
-
-
- .if \IsLast==1
- addi \BREG, \BREG, DISP4(\Index,16)
- addi \AREG, \AREG, DISP16(\Index,64)
- .endif
-
- .endm
-
- .macro KERNEL1x4_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast
-
- lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG)
- xxspltw vs8, vs36, 1
- xxspltw vs9, vs36, 0
- lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG)
- lxv vs27, DISP8(\Index,16+\OffsetA)(\AREG)
-
-
- xvmaddasp vs0, vs26, vs8
- xvmaddasp vs1, vs27, vs9
-
-
- .if \IsLast==1
- addi \BREG, \BREG, DISP2(\Index,8)
- addi \AREG, \AREG, DISP8(\Index,32)
- .endif
-
- .endm
-
-
- .macro SAVE1x4
-
- #ifndef TRMMKERNEL
- lxv vs16, 0(CO)
- #endif
- /* aggregate */
- xvaddsp vs0,vs0,vs2
- xvaddsp vs1,vs1,vs3
- xvaddsp vs0,vs1,vs0
- #if defined(TRMMKERNEL)
- xvmulsp vs16, vs0, alpha_r
- #else
- xvmaddasp vs16, vs0, alpha_r
- #endif
- stxv vs16, 0(CO)
-
- addi CO,CO,16
-
- .endm
-
- /* M=2 N=1*/
- .macro Zero1x2
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- xxlxor vs2,vs2,vs2
- xxlxor vs3,vs3,vs3
- .endm
-
- .macro KERNEL1x2
- KERNEL1x2_1 AO,BO, 0, 0,0,0
- .endm
- .macro KERNEL1x2_4 OffsetA,OffsetB, Index,IsLast
- KERNEL1x2_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
- .endm
-
- .macro KERNEL1x2_2 OffsetA,OffsetB, Index,IsLast
- KERNEL1x2_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
- .endm
- /*
-  we will calculate 1 k-iteration alone, then add it to the batched ones
- */
- .macro KERNEL1x2_1 AREG,BREG,First,OffsetA,OffsetB,Index
-
-
- lxssp v3, DISP2(\Index, 0+\OffsetB)(\AREG)
- lxssp v4, DISP2(\Index, 4+\OffsetB)(\AREG)
- lxssp v5, DISP1(\Index, 0+\OffsetA)(\BREG)
-
-
- .if \First==1
- xvmuldp vs2, vs37, vs35
- xvmuldp vs3, vs37, vs36
-
- .else
- xsmaddadp vs2, vs37, vs35
- xsmaddadp vs3, vs37, vs36
- .endif
-
- addi \AREG, \AREG, DISP2(\Index,8)
- addi \BREG, \BREG, DISP1(\Index,4)
-
- .endm
-
-
-
-
- .macro KERNEL1x2_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast
-
- lxv vs8, DISP8(\Index, 0+\OffsetB)(\AREG)
- lxv vs10, DISP8(\Index, 16+\OffsetB)(\AREG)
-
- lxv vs26, DISP4(\Index, 0+\OffsetA)(\BREG)
-
- xxmrglw vs5, vs26,vs26
- xxmrghw vs6, vs26,vs26
-
- xvmaddasp vs0, vs8, vs5
- xvmaddasp vs1, vs10, vs6
-
-
- .if \IsLast==1
- addi \AREG, \AREG, DISP8(\Index,32)
- addi \BREG, \BREG, DISP4(\Index,16)
- .endif
-
- .endm
-
- .macro KERNEL1x2_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast
-
- lxssp v3, DISP4(\Index, 0+\OffsetB)(\AREG)
- lxssp v4, DISP4(\Index, 4+\OffsetB)(\AREG)
- lxssp v7, DISP4(\Index, 8+\OffsetB)(\AREG)
- lxssp v8, DISP4(\Index, 12+\OffsetB)(\AREG)
- lxssp v5, DISP2(\Index, 0+\OffsetA)(\BREG)
- lxssp v6, DISP2(\Index, 4+\OffsetA)(\BREG)
-
-
- xsmaddadp vs2, vs37, vs35
- xsmaddadp vs3, vs37, vs36
-
- xsmaddadp vs2, vs38, vs39
- xsmaddadp vs3, vs38, vs40
-
-
- addi \AREG, \AREG, DISP4(\Index,16)
- addi \BREG, \BREG, DISP2(\Index,8)
- .endm
-
-
- .macro SAVE1x2
-
- #ifndef TRMMKERNEL
- lxssp v4 , 0(CO)
- lxssp v5 , 4(CO)
-
- #endif
-
- /*convert alpha_r for multiply*/
- xscvspdp vs16,alpha_r
-
- /*aggregate vectors 1x2_4 */
- xxpermdi vs4,vs0,vs0,2
- xxpermdi vs5,vs1,vs1,2
- xvaddsp vs0,vs0,vs4
- xvaddsp vs1,vs1,vs5
- xvaddsp vs0,vs0,vs1
- /* combine the 1x2_2/1x2_1 scalar sums with the reduced 1x2_4 result */
- xscvspdp vs5, vs0
- xxspltw vs6, vs0, 1
- xscvspdp vs6,vs6
- xsadddp vs2,vs2,vs6
- xsadddp vs3,vs3,vs5
-
- /**** store last two words*/
- #if defined(TRMMKERNEL)
- xsmuldp vs36,vs2, vs16
- xsmuldp vs37,vs3, vs16
-
- #else
- xsmaddadp vs36,vs2, vs16
- xsmaddadp vs37,vs3, vs16
- #endif
-
- stxssp v4, 0(CO)
- stxssp v5, 4(CO)
-
- addi CO,CO,8
-
- .endm
- /*///////////////// N=1 M=1 //////////////////*/
- .macro Zero1x1
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- xxlxor vs2, vs2,vs2
- xxlxor vs3,vs3,vs3
- xxlxor vs4,vs4,vs4
- .endm
-
- .macro KERNEL1x1
- KERNEL1x1_1 AO,BO, 1, 0,0,0
- .endm
-
- .macro KERNEL1x1_16 OffsetA,OffsetB, Index,IsLast
- KERNEL1x1_I_16 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
- .endm
-
- .macro KERNEL1x1_8 OffsetA,OffsetB, Index,IsLast
- KERNEL1x1_I_8 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
- .endm
-
- .macro KERNEL1x1_4 OffsetA,OffsetB, Index,IsLast
- KERNEL1x1_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
- .endm
-
- .macro KERNEL1x1_2 OffsetA,OffsetB, Index,IsLast
- KERNEL1x1_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
- .endm
- /*
-  we will calculate 1 k-iteration alone (First==1 initializes vs4 instead of zeroing it)
- */
- .macro KERNEL1x1_1 AREG,BREG,First,OffsetA,OffsetB,Index
-
-
- lxssp v3, DISP1(\Index, 0+\OffsetB)(\AREG)
- lxssp v5, DISP1(\Index, 0+\OffsetA)(\BREG)
-
-
- .if \First==1
- xvmuldp vs4, vs37, vs35
-
- .else
- xsmaddadp vs4, vs37, vs35
- .endif
-
- addi \AREG, \AREG, DISP1(\Index,4)
- addi \BREG, \BREG, DISP1(\Index,4)
-
- .endm
-
-
- .macro KERNEL1x1_I_16 AREG,BREG, OffsetA,OffsetB, Index,IsLast
-
- lxv vs8, DISP16(\Index, 0+\OffsetB)(\AREG)
- lxv vs9, DISP16(\Index, 16+\OffsetB)(\AREG)
- lxv vs10, DISP16(\Index, 32+0+\OffsetB)(\AREG)
- lxv vs11, DISP16(\Index, 32+ 16+\OffsetB)(\AREG)
- lxv vs26, DISP16(\Index, 0+\OffsetA)(\BREG)
- lxv vs16, DISP16(\Index, 16+\OffsetA)(\BREG)
- lxv vs17, DISP16(\Index, 32+0+\OffsetA)(\BREG)
- lxv vs18, DISP16(\Index, 32+16+\OffsetA)(\BREG)
- xvmaddasp vs0, vs8, vs26
- xvmaddasp vs1, vs9, vs16
- xvmaddasp vs2, vs10, vs17
- xvmaddasp vs3, vs11, vs18
- .if \IsLast==1
- addi \AREG, \AREG, DISP16(\Index,64)
- addi \BREG, \BREG, DISP16(\Index,64)
- .endif
-
- .endm
-
- .macro KERNEL1x1_I_8 AREG,BREG, OffsetA,OffsetB, Index,IsLast
-
- lxv vs8, DISP8(\Index, 0+\OffsetB)(\AREG)
- lxv vs9, DISP8(\Index, 16+\OffsetB)(\AREG)
- lxv vs26, DISP8(\Index, 0+\OffsetA)(\BREG)
- lxv vs16, DISP8(\Index, 16+\OffsetA)(\BREG)
- xvmaddasp vs0, vs8, vs26
- xvmaddasp vs1, vs9, vs16
-
- .if \IsLast==1
- addi \AREG, \AREG, DISP8(\Index,32)
- addi \BREG, \BREG, DISP8(\Index,32)
- .endif
-
- .endm
-
-
- .macro KERNEL1x1_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast
-
- lxv vs8, DISP4(\Index, 0+\OffsetB)(\AREG)
- lxv vs26, DISP4(\Index, 0+\OffsetA)(\BREG)
-
- xvmaddasp vs0, vs8, vs26
-
-
- .if \IsLast==1
- addi \AREG, \AREG, DISP4(\Index,16)
- addi \BREG, \BREG, DISP4(\Index,16)
- .endif
-
- .endm
-
- .macro KERNEL1x1_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast
-
- lxsd v4, DISP2(\Index, 0+\OffsetB)(\AREG)
- lxsd v5, DISP2(\Index, 0+\OffsetA)(\BREG)
-
- xvmaddasp vs0, vs36, vs37
-
- addi \AREG, \AREG, DISP2(\Index,8)
- addi \BREG, \BREG, DISP2(\Index,8)
- .endm
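- 
- /*
-  C sketch of the unroll-by-2 step (illustrative): lxsd brings two packed
-  floats into the low half of each register, so two K steps are accumulated
-  into the low lanes of vs0.
- 
-     acc[0] += a[0] * b[0];
-     acc[1] += a[1] * b[1];
-     a += 2; b += 2;
- */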
-
-
- .macro SAVE1x1
-
- #ifndef TRMMKERNEL
- lxssp v4 , 0(CO)
-
- #endif
-
- /*convert alpha_r for multiply*/
- xscvspdp vs16,alpha_r
-
- /*aggregate vectors */
- xvaddsp vs0,vs0,vs1
- xvaddsp vs2,vs2,vs3
- xvaddsp vs0,vs0,vs2
-
- xxpermdi vs7,vs0,vs0,2
- xvaddsp vs0,vs0,vs7
- /*fold the reduced vector lanes into the scalar sum from the 1x1_1 steps*/
- xscvspdp vs5, vs0
- xxspltw vs6, vs0, 1
- xscvspdp vs6,vs6
- xsadddp vs7,vs5,vs6
- xsadddp vs4,vs4,vs7
-
- /**** store the last word*/
- #if defined(TRMMKERNEL)
- xsmuldp vs36,vs4, vs16
-
- #else
- xsmaddadp vs36,vs4, vs16
- #endif
-
- stxssp v4, 0(CO)
-
- addi CO,CO,4
-
- .endm
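- 
- /*
-  C sketch of the effect of SAVE1x1 (illustrative; sum stands for the fully
-  reduced accumulator, i.e. all vector lanes plus the scalar sum in vs4,
-  alpha for the converted alpha_r):
- 
-     #ifndef TRMMKERNEL
-        CO[0] = CO[0] + (float)(alpha * sum);
-     #else
-        CO[0] = (float)(alpha * sum);
-     #endif
-     CO += 1;
- */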
-
-
-
-
- /****************************TRMM POINTER REFRESH MACROS*************************/
-
- .macro SHIFT_REG REG1,REG2,SHIFT_VAL
- .if \SHIFT_VAL==16
- slwi \REG1, \REG2, 6
- .elseif \SHIFT_VAL==8
- slwi \REG1, \REG2, 5
- .elseif \SHIFT_VAL==4
- slwi \REG1, \REG2, 4
- .elseif \SHIFT_VAL==2
- slwi \REG1, \REG2, 3
- .elseif \SHIFT_VAL==1
- slwi \REG1, \REG2, 2
- .endif
- .endm
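- 
- /*
-  SHIFT_REG converts an element count into a byte offset:
-  REG1 = REG2 * SHIFT_VAL * unit_size, done as a left shift because SHIFT_VAL
-  is a power of two and unit_size is 4 (sizeof(float)).  C sketch:
- 
-     #define SHIFT_REG_C(off, nvals)  ((off) * (nvals) * 4)   /* nvals=16 -> off<<6 */
- */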
-
- /*
- //#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
- // ptrbb = bb;
- // #else
- // ptrba += off*16;
- // ptrbb = bb + off*2;
- // #endif
- */
- .macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B
- #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
- /* ptrbb = bb;*/
- mr \PTR_B,\B_VAL /* refresh BPOINT */
-
- #else
- /*
- // ptrba =ptrba+ off*C_A;
- // ptrbb = bb + off*C_B;
- */
- SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */
- SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */
- add \PTR_B, \B_VAL , T4 /* Add values to BO */
- add \PTR_A, \PTR_A, T2 /* Add values to AO */
- #endif
- .endm
-
-
- /*
- // #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
- // temp = bk-off;
- // #elif defined(LEFT)
- // temp = off+16; // number of values in A
- // #else
- // temp = off+2; // number of values in B
- // #endif
- */
- .macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B
- #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
- /* temp = bk-off;*/
- sub \TEMP_BK,\BK_VAL,\OFF_VAL
-
- #elif defined(LEFT)
- /* temp = off+INCR_A; // number of values in A */
- addi \TEMP_BK, \OFF_VAL, \INCR_A
- #else
- /* temp = off+INCR_B // number of values in B*/
- addi \TEMP_BK,\OFF_VAL, \INCR_B
- #endif
-
- .endm
- /*
- // #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
- // temp = bk - off;
- // #ifdef LEFT
- // temp -= 16; // number of values in A
- // #else
- // temp -= 2; // number of values in B
- // #endif
- // ptrba += temp*16;
- // ptrbb += temp*2;
- // #endif
-
- // #ifdef LEFT
- // off += 16; // number of values in A
- // #endif
- */
-
-
- .macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B
-
- #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
- /*temp = bk - off;*/
- sub \TEMP_BK,\BK_VAL,\OFF_VAL
- #ifdef LEFT
- /*temp -= C_A; // number of values in A*/
- addi \TEMP_BK,\TEMP_BK,-\C_A
- #else
- /*temp -= C_B; // number of values in B*/
- addi \TEMP_BK,\TEMP_BK,-\C_B
- #endif
- /*ptrba += temp*C_A;
- ptrbb += temp*C_B;*/
- SHIFT_REG T4,\TEMP_BK,\C_A
- SHIFT_REG T2,\TEMP_BK,\C_B
- add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/
- add \PTR_B, \PTR_B,T2
-
- #endif
-
- #ifdef LEFT
- /*off += C_A; // number of values in A*/
- addi \OFF_VAL,\OFF_VAL,\C_A
- #endif
- .endm
|