/***************************************************************************
Copyright (c) 2013-2020, The OpenBLAS Project
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.

3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define unit_size 8
#define DISP32(ind, disp) (ind*unit_size*32+disp)
#define DISP16(ind, disp) (ind*unit_size*16+disp)
#define DISP8(ind, disp) (ind*unit_size*8+disp)
#define DISP4(ind, disp) (ind*unit_size*4+disp)
#define DISP2(ind, disp) (ind*unit_size*2+disp)
#define DISP1(ind, disp) (ind*unit_size+disp)
#define DISPX(disp) (disp)

.macro AGGREGATE_REALS_IMAGES VSINR_OUT1, VSINR, VSINI_OUT2, VSINI
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
xvsubsp \VSINR_OUT1, \VSINR_OUT1, \VSINR
xvaddsp \VSINI_OUT2, \VSINI_OUT2, \VSINI
#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
xvaddsp \VSINR_OUT1, \VSINR_OUT1, \VSINR
xvsubsp \VSINI_OUT2, \VSINI_OUT2, \VSINI
#elif defined(NC) || defined(TC) || defined(NR) || defined(TR)
xvaddsp \VSINR_OUT1, \VSINR_OUT1, \VSINR
xvsubsp \VSINI_OUT2, \VSINI, \VSINI_OUT2
#else // CC || CR || RC || RR
/* we will assume {-alpha_r,-alpha_i} for this case */
/* i1*i2 - r1*r2, so we will negate alpha real instead to fix the sign */
xvsubsp \VSINR_OUT1, \VSINR, \VSINR_OUT1
/* we will negate alpha imaginary instead to fix the sign */
xvaddsp \VSINI_OUT2, \VSINI_OUT2, \VSINI
#endif
.endm

.macro AGGREGATE_REALS_IMAGES_A_PERMUTE VSINR_OUT1, VSINR, VSINI_OUT2, VSINI
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
xvsubsp \VSINR_OUT1, \VSINR_OUT1, \VSINR
xvaddsp \VSINI_OUT2, \VSINI_OUT2, \VSINI
#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
xvaddsp \VSINR_OUT1, \VSINR_OUT1, \VSINR
xvsubsp \VSINI_OUT2, \VSINI, \VSINI_OUT2
#elif defined(NC) || defined(TC) || defined(NR) || defined(TR)
xvaddsp \VSINR_OUT1, \VSINR_OUT1, \VSINR
xvsubsp \VSINI_OUT2, \VSINI_OUT2, \VSINI
#else // CC || CR || RC || RR
/* we will assume {-alpha_r,-alpha_i} for this case */
/* i1*i2 - r1*r2, so we will negate alpha real instead to fix the sign */
xvsubsp \VSINR_OUT1, \VSINR, \VSINR_OUT1
/* we will negate alpha imaginary instead to fix the sign */
xvaddsp \VSINI_OUT2,
\VSINI_OUT2, \VSINI #endif .endm /* {i0,i1} * {alpha_i,alpha_i} [- VSOUT1] ;[VSOUT2 +] {r0,r1}*{alpha_i,alpha_i} */ .macro MULT_APLHA_PART1 VSINRR, VSINII, VSOUT1, VSOUT2 xvmulsp \VSOUT1, \VSINII, alpha_i xvmulsp \VSOUT2, \VSINRR, alpha_i .endm /* {r0,r1} * {alpha_r,alpha_r} - VSOUT1 ;VSOUT2 + {i0,i1} * {alpha_r,alpha_r} */ .macro MULT_APLHA_PART2 VSINRR, VSINII, VSOUT1, VSOUT2 xvmsubasp \VSOUT1, \VSINRR, alpha_r xvmaddasp \VSOUT2, \VSINII, alpha_r .endm .macro PERMUTE1 OUT, R1, R2, R3, R4 xxsel vs62, \R1, \R2, vs57 xxsel \OUT, \R3, \R4, vs57 xxpermdi \OUT, \OUT, vs62, 1 .endm .macro PERMUTE2 OUT, R1, R2, R3, R4 xxsel vs62, \R2, \R1, vs57 xxsel \OUT, \R4, \R3, vs57 xxpermdi \OUT, vs62, \OUT, 1 xxperm \OUT, \OUT, permute_mask .endm .macro PERMUTE3 OUT, R1, R2, R3, R4 xxsel vs62, \R1, \R2, vs57 xxsel \OUT, \R3, \R4, vs57 xxpermdi \OUT, vs62, \OUT, 2 .endm .macro PERMUTE4 OUT, R1, R2, R3, R4 xxsel vs62, \R2, \R1, vs57 xxsel \OUT, \R4, \R3, vs57 xxpermdi \OUT, \OUT, vs62, 2 xxperm \OUT, \OUT, permute_mask .endm .macro GROUP1 xxperm vs0, vs32, permute_mask xxperm vs4, vs40, permute_mask xxperm vs1, vs33, permute_mask xxperm vs5, vs41, permute_mask xxperm vs8, vs36, permute_mask xxperm vs12, vs44, permute_mask xxperm vs9, vs37, permute_mask xxperm vs13, vs45, permute_mask .endm .macro AGG_GROUP1 AGGREGATE_REALS_IMAGES vs32, vs0, vs40, vs4 AGGREGATE_REALS_IMAGES vs33, vs1, vs41, vs5 AGGREGATE_REALS_IMAGES vs36, vs8, vs44, vs12 AGGREGATE_REALS_IMAGES vs37, vs9, vs45, vs13 .endm .macro GROUP2 xxperm vs0, vs34, permute_mask xxperm vs4, vs42, permute_mask xxperm vs1, vs35, permute_mask xxperm vs5, vs43, permute_mask xxperm vs8, vs38, permute_mask xxperm vs12, vs46, permute_mask xxperm vs9, vs39, permute_mask xxperm vs13, vs47, permute_mask .endm .macro AGG_GROUP2 AGGREGATE_REALS_IMAGES vs34, vs0, vs42, vs4 AGGREGATE_REALS_IMAGES vs35, vs1, vs43, vs5 AGGREGATE_REALS_IMAGES vs38, vs8, vs46, vs12 AGGREGATE_REALS_IMAGES vs39, vs9, vs47, vs13 .endm .macro MULTIPLY_GROUP1 MULT_APLHA_PART1 vs32, vs40, vs0, vs1 MULT_APLHA_PART1 vs33, vs41, vs2, vs3 MULT_APLHA_PART1 vs36, vs44, vs8, vs9 MULT_APLHA_PART1 vs37, vs45, vs10, vs11 MULT_APLHA_PART2 vs32, vs40, vs0, vs1 MULT_APLHA_PART2 vs33, vs41, vs2, vs3 MULT_APLHA_PART2 vs36, vs44, vs8, vs9 MULT_APLHA_PART2 vs37, vs45, vs10, vs11 .endm .macro MULTIPLY_GROUP2 MULT_APLHA_PART1 vs34, vs42, vs4, vs5 MULT_APLHA_PART1 vs35, vs43, vs6, vs7 MULT_APLHA_PART1 vs38, vs46, vs12, vs13 MULT_APLHA_PART1 vs39, vs47, vs14, vs15 MULT_APLHA_PART2 vs34, vs42, vs4, vs5 MULT_APLHA_PART2 vs35, vs43, vs6, vs7 MULT_APLHA_PART2 vs38, vs46, vs12, vs13 MULT_APLHA_PART2 vs39, vs47, vs14, vs15 .endm /* reconstruct r, i pairs*/ .macro RECONSTRUCT_PAIR1 xxperm vs0, vs1, save_permute_1 xxperm vs2, vs3, save_permute_1 xxperm vs8, vs9, save_permute_1 xxperm vs10, vs11, save_permute_1 .endm .macro RECONSTRUCT_PAIR2 xxperm vs4, vs5, save_permute_1 xxperm vs6, vs7, save_permute_1 xxperm vs12, vs13, save_permute_1 xxperm vs14, vs15, save_permute_1 .endm .macro SHUFFLE_ACC ACC, R0, R1, R2, R3, O1, O2, O3, O4 xxmfacc \ACC PERMUTE1 \O1, \R3, \R2, \R1, \R0 PERMUTE2 \O2, \R1, \R0, \R3, \R2 PERMUTE3 \O3, \R1, \R0, \R3, \R2 PERMUTE4 \O4, \R3, \R2, \R1, \R0 .endm /* macros for N=4 and M=8 **********************************************************************************************/ .macro ZERO4x8 xxsetaccz 0 xxsetaccz 1 xxsetaccz 2 xxsetaccz 3 xxsetaccz 4 xxsetaccz 5 xxsetaccz 6 xxsetaccz 7 .endm .macro LOAD4x8 LOAD4x8O 0, 0 .endm .macro LOAD4x8O OffsetA, OffsetB lxvp vs34, (\OffsetB+0)(BO) lxvp vs32, 
(\OffsetA+0)(AO) lxvp vs36, (\OffsetA+32)(AO) .endm .macro END4x8_NORMAL END4x8 AO, BO, 64, 32 .endm .macro END4x8_WITHOUT_ADD END4x8 AO, BO, 0, 0 .endm .macro END4x8 AREG, BREG, OffsetA, OffsetB .if \OffsetB != 0 addi \BREG, \BREG, \OffsetB .endif .if \OffsetA != 0 addi \AREG, \AREG, \OffsetA .endif #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) xvf32gerpp 3, 36, 34 xvf32gerpp 2, 37, 34 xvf32gerpp 1, 32, 34 xvf32gerpp 0, 33, 34 xvf32gerpp 7, 36, 35 xvf32gerpp 6, 37, 35 xvf32gerpp 5, 32, 35 xvf32gerpp 4, 33, 35 #else xvf32gerpp 3, 36, 35 xvf32gerpp 2, 37, 35 xvf32gerpp 1, 32, 35 xvf32gerpp 0, 33, 35 xvf32gerpp 7, 36, 34 xvf32gerpp 6, 37, 34 xvf32gerpp 5, 32, 34 xvf32gerpp 4, 33, 34 #endif .endm .macro LOAD4x8_2 LOAD4x8_2O 0, 0 .endm .macro LOAD4x8_2O OffsetA, OffsetB lxvp vs34, (\OffsetB)(BO) lxvp vs38, (32+\OffsetB)(BO) lxvp vs32, (0+\OffsetA)(AO) lxvp vs36, (32+\OffsetA)(AO) lxvp vs40, (64+\OffsetA)(AO) lxvp vs42, (64+32+\OffsetA)(AO) .endm .macro END4x8_2 /*for load2 offset will be 128 and 64*/ KERNEL4x8_2 AO, BO, 128, 64, 0, 1, 1 .endm .macro KERNEL4x8_E2 OffsetA, OffsetB, Index, IsLast KERNEL4x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 .endm .macro KERNEL4x8_L2 OffsetA, OffsetB, Index, IsLast KERNEL4x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 .endm .macro KERNEL4x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ xvf32gerpp 3, 36, 34 xvf32gerpp 2, 37, 34 xvf32gerpp 1, 32, 34 xvf32gerpp 0, 33, 34 xvf32gerpp 7, 36, 35 xvf32gerpp 6, 37, 35 xvf32gerpp 5, 32, 35 xvf32gerpp 4, 33, 35 #else xvf32gerpp 3, 36, 35 xvf32gerpp 2, 37, 35 xvf32gerpp 1, 32, 35 xvf32gerpp 0, 33, 35 xvf32gerpp 7, 36, 34 xvf32gerpp 6, 37, 34 xvf32gerpp 5, 32, 34 xvf32gerpp 4, 33, 34 #endif .if \Complete==0 lxvp vs34, DISP8(\Index, \OffsetB)(\BREG) lxvp vs32, DISP16(\Index, 0+\OffsetA)(\AREG) lxvp vs36, DISP16(\Index, 32+\OffsetA)(\AREG) .endif #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ xvf32gerpp 3, 42, 38 xvf32gerpp 2, 43, 38 xvf32gerpp 1, 40, 38 xvf32gerpp 0, 41, 38 xvf32gerpp 7, 42, 39 xvf32gerpp 6, 43, 39 xvf32gerpp 5, 40, 39 xvf32gerpp 4, 41, 39 #else xvf32gerpp 3, 42, 39 xvf32gerpp 2, 43, 39 xvf32gerpp 1, 40, 39 xvf32gerpp 0, 41, 39 xvf32gerpp 7, 42, 38 xvf32gerpp 6, 43, 38 xvf32gerpp 5, 40, 38 xvf32gerpp 4, 41, 38 #endif .if \Complete==0 lxvp vs40, DISP16(\Index, 64+\OffsetA)(\AREG) lxvp vs38, DISP8(\Index, 32+\OffsetB)(\BREG) lxvp vs42, DISP16(\Index, 64+32+\OffsetA)(\AREG) .endif .if \IsLast==1 .if \Complete==1 addi \BREG, \BREG, DISP8(\Index, \OffsetB) addi \AREG, \AREG, DISP16(\Index, \OffsetA) .else addi \BREG, \BREG, DISP8(\Index, 64) addi \AREG, \AREG, DISP16(\Index, 128) .endif .endif .endm .macro KERNEL4x8 LOAD4x8 END4x8 AO, BO, 64, 32 .endm .macro SAVE4x8 SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44 SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45 SHUFFLE_ACC 2, vs8, vs9, vs10, vs11, vs34, vs42, vs38, vs46 SHUFFLE_ACC 3, vs12, vs13, vs14, vs15, vs35, vs43, vs39, vs47 SHUFFLE_ACC 4, vs16, vs17, vs18, vs19, vs48, vs56, vs52, vs60 SHUFFLE_ACC 5, vs20, vs21, vs22, vs23, vs49, vs16, vs53, vs61 SHUFFLE_ACC 7, vs28, vs29, vs30, vs31, vs17, vs19, vs18, vs20 SHUFFLE_ACC 6, vs24, vs25, vs26, vs27, vs50, vs58, vs54, vs21 add T4, LDC, LDC add T1, CO, LDC #ifndef TRMMKERNEL lxvp vs24, 0(CO) #endif #ifndef TRMMKERNEL lxvp vs26, 32(CO) #endif #ifndef TRMMKERNEL lxvp vs28, 0(T1) #endif xxperm vs2, vs34, permute_mask xxperm vs6, vs42, permute_mask #ifndef TRMMKERNEL lxvp vs30, 32(T1) #endif xxperm vs3, vs35, permute_mask xxperm vs7, 
vs43, permute_mask add T2, CO, T4 add T3, T1, T4 GROUP1 AGG_GROUP1 AGGREGATE_REALS_IMAGES vs34, vs2, vs42, vs6 xxperm vs10, vs38, permute_mask xxperm vs14, vs46, permute_mask AGGREGATE_REALS_IMAGES vs35, vs3, vs43, vs7 xxperm vs11, vs39, permute_mask xxperm vs15, vs47, permute_mask xxperm vs0, vs48, permute_mask xxperm vs4, vs56, permute_mask xxperm vs1, vs49, permute_mask xxperm vs5, vs16, permute_mask AGGREGATE_REALS_IMAGES vs38, vs10, vs46, vs14 xxperm vs2, vs50, permute_mask xxperm vs6, vs58, permute_mask AGGREGATE_REALS_IMAGES vs39, vs11, vs47, vs15 xxperm vs3, vs17, permute_mask xxperm vs7, vs19, permute_mask AGGREGATE_REALS_IMAGES vs48, vs0, vs56, vs4 xxperm vs8, vs52, permute_mask xxperm vs12, vs60, permute_mask AGGREGATE_REALS_IMAGES vs49, vs1, vs16, vs5 xxperm vs9, vs53, permute_mask xxperm vs13, vs61, permute_mask AGGREGATE_REALS_IMAGES vs50, vs2, vs58, vs6 xxperm vs10, vs54, permute_mask xxperm vs14, vs21, permute_mask AGGREGATE_REALS_IMAGES vs17, vs3, vs19, vs7 xxperm vs11, vs18, permute_mask xxperm vs15, vs20, permute_mask AGGREGATE_REALS_IMAGES vs52, vs8, vs60, vs12 AGGREGATE_REALS_IMAGES vs53, vs9, vs61, vs13 /*VSINRR, VSINII, VSOUT1, VSOUT2*/ MULT_APLHA_PART1 vs32, vs40, vs0, vs1 AGGREGATE_REALS_IMAGES vs54, vs10, vs21, vs14 MULT_APLHA_PART1 vs33, vs41, vs2, vs3 AGGREGATE_REALS_IMAGES vs18, vs11, vs20, vs15 MULT_APLHA_PART1 vs34, vs42, vs4, vs5 MULT_APLHA_PART1 vs35, vs43, vs6, vs7 MULT_APLHA_PART2 vs32, vs40, vs0, vs1 MULT_APLHA_PART2 vs33, vs41, vs2, vs3 MULT_APLHA_PART2 vs34, vs42, vs4, vs5 MULT_APLHA_PART2 vs35, vs43, vs6, vs7 #ifndef TRMMKERNEL lxvp vs32, 0(T2) #endif MULT_APLHA_PART1 vs36, vs44, vs8, vs9 MULT_APLHA_PART1 vs37, vs45, vs10, vs11 #ifndef TRMMKERNEL lxvp vs40, 32(T2) #endif MULT_APLHA_PART1 vs38, vs46, vs12, vs13 MULT_APLHA_PART1 vs39, vs47, vs14, vs15 #ifndef TRMMKERNEL lxvp vs34, 0(T3) #endif MULT_APLHA_PART2 vs36, vs44, vs8, vs9 MULT_APLHA_PART2 vs37, vs45, vs10, vs11 #ifndef TRMMKERNEL lxvp vs42, 32(T3) #endif MULT_APLHA_PART2 vs38, vs46, vs12, vs13 MULT_APLHA_PART2 vs39, vs47, vs14, vs15 RECONSTRUCT_PAIR1 RECONSTRUCT_PAIR2 #ifndef TRMMKERNEL /* add */ #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) xxpermdi vs1, vs0, vs8, 1 xxpermdi vs3, vs2, vs10, 1 xxpermdi vs5, vs4, vs12, 1 xxpermdi vs7, vs6, vs14, 1 xxpermdi vs9, vs8, vs0, 1 xxpermdi vs11, vs10, vs2, 1 #else xxpermdi vs1, vs8, vs0, 2 xxpermdi vs3, vs10, vs2, 2 xxpermdi vs5, vs12, vs4, 2 xxpermdi vs7, vs14, vs6, 2 xxpermdi vs9, vs0, vs8, 2 xxpermdi vs11, vs2, vs10, 2 #endif xvaddsp vs24, vs24, vs3 xvaddsp vs25, vs25, vs1 #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) xxpermdi vs13, vs12, vs4, 1 xxpermdi vs15, vs14, vs6, 1 #else xxpermdi vs13, vs4, vs12, 2 xxpermdi vs15, vs6, vs14, 2 #endif xvaddsp vs26, vs26, vs7 xvaddsp vs27, vs27, vs5 xvaddsp vs28, vs28, vs11 xvaddsp vs29, vs29, vs9 xvaddsp vs30, vs30, vs15 xvaddsp vs31, vs31, vs13 #else #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ xxpermdi vs25, vs0, vs8, 1 xxpermdi vs24, vs2, vs10, 1 xxpermdi vs27, vs4, vs12, 1 xxpermdi vs26, vs6, vs14, 1 xxpermdi vs29, vs8, vs0, 1 xxpermdi vs28, vs10, vs2, 1 xxpermdi vs31, vs12, vs4, 1 xxpermdi vs30, vs14, vs6, 1 #else xxpermdi vs25, vs8, vs0, 2 xxpermdi vs24, vs10, vs2, 2 xxpermdi vs27, vs12, vs4, 2 xxpermdi vs26, vs14, vs6, 2 xxpermdi vs29, vs0, vs8, 2 xxpermdi vs28, vs2, vs10, 2 xxpermdi vs31, vs4, vs12, 2 xxpermdi vs30, vs6, vs14, 2 #endif #endif stxvp vs24, 0(CO) MULT_APLHA_PART1 vs48, vs56, vs0, vs1 MULT_APLHA_PART1 vs49, vs16, vs2, vs3 stxvp vs26, 32(CO) MULT_APLHA_PART1 vs50, vs58, vs4, vs5 MULT_APLHA_PART1 
vs17, vs19, vs6, vs7 stxvp vs28, 0(T1) MULT_APLHA_PART2 vs48, vs56, vs0, vs1 MULT_APLHA_PART2 vs49, vs16, vs2, vs3 stxvp vs30, 32(T1) MULT_APLHA_PART2 vs50, vs58, vs4, vs5 MULT_APLHA_PART2 vs17, vs19, vs6, vs7 MULT_APLHA_PART1 vs52, vs60, vs8, vs9 MULT_APLHA_PART1 vs53, vs61, vs10, vs11 MULT_APLHA_PART1 vs54, vs21, vs12, vs13 MULT_APLHA_PART1 vs18, vs20, vs14, vs15 MULT_APLHA_PART2 vs52, vs60, vs8, vs9 MULT_APLHA_PART2 vs53, vs61, vs10, vs11 MULT_APLHA_PART2 vs54, vs21, vs12, vs13 MULT_APLHA_PART2 vs18, vs20, vs14, vs15 RECONSTRUCT_PAIR1 RECONSTRUCT_PAIR2 #ifndef TRMMKERNEL /* add */ #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) xxpermdi vs1, vs0, vs8, 1 xxpermdi vs3, vs2, vs10, 1 xxpermdi vs5, vs4, vs12, 1 xxpermdi vs7, vs6, vs14, 1 xxpermdi vs9, vs8, vs0, 1 xxpermdi vs11, vs10, vs2, 1 #else xxpermdi vs1, vs8, vs0, 2 xxpermdi vs3, vs10, vs2, 2 xxpermdi vs5, vs12, vs4, 2 xxpermdi vs7, vs14, vs6, 2 xxpermdi vs9, vs0, vs8, 2 xxpermdi vs11, vs2, vs10, 2 #endif xvaddsp vs32, vs32, vs3 xvaddsp vs33, vs33, vs1 #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) xxpermdi vs13, vs12, vs4, 1 xxpermdi vs15, vs14, vs6, 1 #else xxpermdi vs13, vs4, vs12, 2 xxpermdi vs15, vs6, vs14, 2 #endif xvaddsp vs40, vs40, vs7 xvaddsp vs41, vs41, vs5 xvaddsp vs34, vs34, vs11 xvaddsp vs35, vs35, vs9 xvaddsp vs42, vs42, vs15 xvaddsp vs43, vs43, vs13 #else #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ xxpermdi vs33, vs0, vs8, 1 xxpermdi vs32, vs2, vs10, 1 xxpermdi vs41, vs4, vs12, 1 xxpermdi vs40, vs6, vs14, 1 xxpermdi vs35, vs8, vs0, 1 xxpermdi vs34, vs10, vs2, 1 xxpermdi vs43, vs12, vs4, 1 xxpermdi vs42, vs14, vs6, 1 #else xxpermdi vs33, vs8, vs0, 2 xxpermdi vs32, vs10, vs2, 2 xxpermdi vs41, vs12, vs4, 2 xxpermdi vs40, vs14, vs6, 2 xxpermdi vs35, vs0, vs8, 2 xxpermdi vs34, vs2, vs10, 2 xxpermdi vs43, vs4, vs12, 2 xxpermdi vs42, vs6, vs14, 2 #endif #endif stxvp vs32, 0(T2) stxvp vs40, 32(T2) stxvp vs34, 0(T3) stxvp vs42, 32(T3) addi CO, CO, 64 .endm /* macros for N=4 and M=4 **********************************************************************************************/ .macro ZERO4x4 xxsetaccz 0 xxsetaccz 1 xxsetaccz 2 xxsetaccz 3 .endm .macro LOAD4x4 LOAD4x4O 0, 0 .endm .macro LOAD4x4O OffsetA, OffsetB lxvp vs34, (\OffsetB+0)(BO) lxvp vs32, (\OffsetA+0)(AO) .endm .macro END4x4_NORMAL END4x4 AO, BO, 32, 32 .endm .macro END4x4_WITHOUT_ADD END4x4 AO, BO, 0, 0 .endm .macro END4x4 AREG, BREG, OffsetA, OffsetB .if \OffsetB != 0 addi \BREG, \BREG, \OffsetB .endif .if \OffsetA != 0 addi \AREG, \AREG, \OffsetA .endif #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) xvf32gerpp 3, 32, 35 xvf32gerpp 2, 33, 35 xvf32gerpp 1, 32, 34 xvf32gerpp 0, 33, 34 #else xvf32gerpp 3, 32, 34 xvf32gerpp 2, 33, 34 xvf32gerpp 1, 32, 35 xvf32gerpp 0, 33, 35 #endif .endm .macro LOAD4x4_2 LOAD4x4_2O 0, 0 .endm .macro LOAD4x4_2O OffsetA, OffsetB lxvp vs34, (\OffsetB)(BO) lxvp vs38, (32+\OffsetB)(BO) lxvp vs32, (0+\OffsetA)(AO) lxvp vs36, (32+\OffsetA)(AO) .endm .macro END4x4_2 /*for load2 offset will be 64 and 64*/ KERNEL4x4_2 AO, BO, 64, 64, 0, 1, 1 .endm .macro KERNEL4x4_E2 OffsetA, OffsetB, Index, IsLast KERNEL4x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 .endm .macro KERNEL4x4_L2 OffsetA, OffsetB, Index, IsLast KERNEL4x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 .endm .macro KERNEL4x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) xvf32gerpp 3, 32, 35 xvf32gerpp 2, 33, 35 xvf32gerpp 1, 32, 34 xvf32gerpp 0, 33, 34 #else xvf32gerpp 3, 32, 34 xvf32gerpp 2, 33, 34 xvf32gerpp 1, 32, 35 xvf32gerpp 
0, 33, 35 #endif .if \Complete==0 lxvp vs34, DISP8(\Index, \OffsetB)(\BREG) lxvp vs32, DISP8(\Index, 0+\OffsetA)(\AREG) .endif #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) xvf32gerpp 3, 36, 39 xvf32gerpp 2, 37, 39 xvf32gerpp 1, 36, 38 xvf32gerpp 0, 37, 38 #else xvf32gerpp 3, 36, 38 xvf32gerpp 2, 37, 38 xvf32gerpp 1, 36, 39 xvf32gerpp 0, 37, 39 #endif .if \Complete==0 lxvp vs38, DISP8(\Index, 32+\OffsetB)(\BREG) lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG) .endif .if \IsLast==1 .if \Complete==1 addi \BREG, \BREG, DISP8(\Index, \OffsetB) addi \AREG, \AREG, DISP8(\Index, \OffsetA) .else addi \BREG, \BREG, DISP8(\Index, 64) addi \AREG, \AREG, DISP8(\Index, 64) .endif .endif .endm .macro KERNEL4x4 LOAD4x4 END4x4 AO, BO, 32, 32 .endm .macro SAVE4x4 SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44 SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45 SHUFFLE_ACC 2, vs8, vs9, vs10, vs11, vs34, vs42, vs38, vs46 SHUFFLE_ACC 3, vs12, vs13, vs14, vs15, vs35, vs43, vs39, vs47 add T4, LDC, LDC add T1, CO, LDC #ifndef TRMMKERNEL lxvp vs24, 0(CO) #endif add T2, CO, T4 add T3, T1, T4 #ifndef TRMMKERNEL lxvp vs26, 0(T1) #endif #ifndef TRMMKERNEL lxvp vs28, 0(T2) #endif #ifndef TRMMKERNEL lxvp vs30, 0(T3) #endif GROUP1 AGG_GROUP1 GROUP2 AGG_GROUP2 /*VSINRR, VSINII, VSOUT1, VSOUT2*/ MULTIPLY_GROUP1 MULTIPLY_GROUP2 /* reconstruct r, i pairs*/ RECONSTRUCT_PAIR1 RECONSTRUCT_PAIR2 #ifndef TRMMKERNEL /* add */ #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) xxpermdi vs1, vs0, vs8, 1 xxpermdi vs3, vs2, vs10, 1 xxpermdi vs9, vs8, vs0, 1 xxpermdi vs11, vs10, vs2, 1 xxpermdi vs5, vs4, vs12, 1 xxpermdi vs7, vs6, vs14, 1 xxpermdi vs13, vs12, vs4, 1 xxpermdi vs15, vs14, vs6, 1 #else xxpermdi vs1, vs8, vs0, 2 xxpermdi vs3, vs10, vs2, 2 xxpermdi vs9, vs0, vs8, 2 xxpermdi vs11, vs2, vs10, 2 xxpermdi vs5, vs12, vs4, 2 xxpermdi vs7, vs14, vs6, 2 xxpermdi vs13, vs4, vs12, 2 xxpermdi vs15, vs6, vs14, 2 #endif xvaddsp vs24, vs24, vs3 xvaddsp vs25, vs25, vs1 xvaddsp vs26, vs26, vs11 xvaddsp vs27, vs27, vs9 xvaddsp vs28, vs28, vs7 xvaddsp vs29, vs29, vs5 xvaddsp vs30, vs30, vs15 xvaddsp vs31, vs31, vs13 #else #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) xxpermdi vs25, vs0, vs8, 1 xxpermdi vs24, vs2, vs10, 1 xxpermdi vs27, vs8, vs0, 1 xxpermdi vs26, vs10, vs2, 1 xxpermdi vs29, vs4, vs12, 1 xxpermdi vs28, vs6, vs14, 1 xxpermdi vs31, vs12, vs4, 1 xxpermdi vs30, vs14, vs6, 1 #else xxpermdi vs25, vs8, vs0, 2 xxpermdi vs24, vs10, vs2, 2 xxpermdi vs27, vs0, vs8, 2 xxpermdi vs26, vs2, vs10, 2 xxpermdi vs29, vs12, vs4, 2 xxpermdi vs28, vs14, vs6, 2 xxpermdi vs31, vs4, vs12, 2 xxpermdi vs30, vs6, vs14, 2 #endif #endif stxvp vs24, 0(CO) stxvp vs26, 0(T1) stxvp vs28, 0(T2) stxvp vs30, 0(T3) addi CO, CO, 32 .endm /* macros for N=4 and M=2 **********************************************************************************************/ .macro ZERO4x2 xxsetaccz 0 xxsetaccz 1 .endm .macro LOAD4x2 LOAD4x2O 0, 0 .endm .macro LOAD4x2O OffsetA, OffsetB lxv vs32, (\OffsetA+0)(AO) lxvp vs34, (\OffsetB+0)(BO) .endm .macro END4x2_NORMAL END4x2 AO, BO, 16, 32 .endm .macro END4x2_WITHOUT_ADD END4x2 AO, BO, 0, 0 .endm .macro END4x2 AREG, BREG, OffsetA, OffsetB .if \OffsetB != 0 addi \BREG, \BREG, \OffsetB .endif .if \OffsetA != 0 addi \AREG, \AREG, \OffsetA .endif #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) xvf32gerpp 1, 35, 32 xvf32gerpp 0, 34, 32 #else xvf32gerpp 1, 34, 32 xvf32gerpp 0, 35, 32 #endif .endm .macro LOAD4x2_2 LOAD4x2_2O 0, 0 .endm .macro LOAD4x2_2O OffsetA, OffsetB lxvp vs32, (\OffsetA)(AO) lxvp vs34, (0+\OffsetB)(BO) lxvp vs36, 
(32+\OffsetB)(BO) .endm .macro END4x2_2 /*for load2 offset will be 32 and 64*/ KERNEL4x2_2 AO, BO, 32, 64, 0, 1, 1 .endm .macro KERNEL4x2_E2 OffsetA, OffsetB, Index, IsLast KERNEL4x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 .endm .macro KERNEL4x2_L2 OffsetA, OffsetB, Index, IsLast KERNEL4x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 .endm .macro KERNEL4x2_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) xvf32gerpp 1, 35, 32 xvf32gerpp 0, 34, 32 #else xvf32gerpp 1, 34, 33 xvf32gerpp 0, 35, 33 #endif .if \Complete==0 lxvp vs34, DISP8(\Index, 0+\OffsetB)(\BREG) .endif #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) xvf32gerpp 1, 37, 33 xvf32gerpp 0, 36, 33 #else xvf32gerpp 1, 36, 32 xvf32gerpp 0, 37, 32 #endif .if \Complete==0 lxvp vs32, DISP4(\Index, \OffsetA)(\AREG) lxvp vs36, DISP8(\Index, 32+\OffsetB)(\BREG) .endif .if \IsLast==1 .if \Complete==1 addi \AREG, \AREG, DISP4(\Index, \OffsetA) addi \BREG, \BREG, DISP8(\Index, \OffsetB) .else addi \AREG, \AREG, DISP4(\Index, 32) addi \BREG, \BREG, DISP8(\Index, 64) .endif .endif .endm .macro KERNEL4x2 LOAD4x2 END4x2 AO, BO, 16, 32 .endm .macro SAVE4x2 SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44 SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45 add T4, LDC, LDC add T1, CO, LDC add T2, CO, T4 add T3, T1, T4 #ifndef TRMMKERNEL lxv vs24, 0(CO) #endif #ifndef TRMMKERNEL lxv vs25, 0(T1) #endif #ifndef TRMMKERNEL lxv vs26, 0(T2) #endif #ifndef TRMMKERNEL lxv vs27, 0(T3) #endif GROUP1 AGGREGATE_REALS_IMAGES_A_PERMUTE vs32, vs0, vs40, vs4 AGGREGATE_REALS_IMAGES_A_PERMUTE vs33, vs1, vs41, vs5 AGGREGATE_REALS_IMAGES_A_PERMUTE vs36, vs8, vs44, vs12 AGGREGATE_REALS_IMAGES_A_PERMUTE vs37, vs9, vs45, vs13 /*VSINRR, VSINII, VSOUT1, VSOUT2*/ MULTIPLY_GROUP1 /* reconstruct r, i pairs*/ RECONSTRUCT_PAIR1 #ifndef TRMMKERNEL /* add */ #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) xxpermdi vs1, vs0, vs8, 0 xxpermdi vs9, vs2, vs10, 0 xxpermdi vs3, vs8, vs0, 3 xxpermdi vs11, vs10, vs2, 3 #else xxpermdi vs1, vs8, vs0, 0 xxpermdi vs9, vs10, vs2, 0 xxpermdi vs3, vs0, vs8, 3 xxpermdi vs11, vs2, vs10, 3 #endif xvaddsp vs24, vs24, vs1 xvaddsp vs26, vs26, vs9 xvaddsp vs25, vs25, vs3 xvaddsp vs27, vs27, vs11 #else #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) xxpermdi vs24, vs0, vs8, 0 xxpermdi vs26, vs2, vs10, 0 xxpermdi vs25, vs8, vs0, 3 xxpermdi vs27, vs10, vs2, 3 #else xxpermdi vs24, vs8, vs0, 0 xxpermdi vs26, vs10, vs2, 0 xxpermdi vs25, vs0, vs8, 3 xxpermdi vs27, vs2, vs10, 3 #endif #endif stxv vs24, 0(CO) stxv vs25, 0(T1) stxv vs26, 0(T2) stxv vs27, 0(T3) addi CO, CO, 16 .endm /* macros for N=4 and M=2 **********************************************************************************************/ .macro ZERO4x1 xxsetaccz 0 xxsetaccz 1 .endm .macro LOAD4x1 LOAD4x1O 0, 0 .endm .macro LOAD4x1O OffsetA, OffsetB lxsd v0, (\OffsetA+0)(AO) lxvp vs34, (\OffsetB+0)(BO) .endm .macro END4x1_NORMAL END4x1 AO, BO,8, 32 .endm .macro END4x1_WITHOUT_ADD END4x1 AO, BO, 0, 0 .endm .macro END4x1 AREG, BREG, OffsetA, OffsetB .if \OffsetB != 0 addi \BREG, \BREG, \OffsetB .endif .if \OffsetA != 0 addi \AREG, \AREG, \OffsetA .endif #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) xvf32gerpp 0, 34, 32 xvf32gerpp 1, 35, 32 #else xvf32gerpp 0, 35, 32 xvf32gerpp 1, 34, 32 #endif .endm .macro LOAD4x1_2 LOAD4x1_2O 0, 0 .endm .macro LOAD4x1_2O OffsetA, OffsetB lxv vs32, (\OffsetA)(AO) vspltisb v6, 0 #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) xxpermdi vs33, vs32, vs38, 2 xxpermdi vs32, vs32, vs38, 0 #else xxpermdi vs33, 
vs32, vs38, 0 xxpermdi vs32, vs32, vs38, 2 #endif lxvp vs34, (0+\OffsetB)(BO) lxvp vs36, (32+\OffsetB)(BO) .endm .macro END4x1_2 /*for load2 offset will be 16 and 64*/ KERNEL4x1_2 AO, BO, 16, 64, 0, 1, 1 .endm .macro KERNEL4x1_E2 OffsetA, OffsetB, Index, IsLast KERNEL4x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 .endm .macro KERNEL4x1_L2 OffsetA, OffsetB, Index, IsLast KERNEL4x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 .endm .macro KERNEL4x1_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) xvf32gerpp 0, 34, 32 xvf32gerpp 1, 35, 32 #else xvf32gerpp 0, 35, 32 xvf32gerpp 1, 34, 32 #endif .if \Complete==0 lxvp vs34, DISP8(\Index, 0+\OffsetB)(\BREG) .endif #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) xvf32gerpp 0, 36, 33 xvf32gerpp 1, 37, 33 #else xvf32gerpp 0, 37, 33 xvf32gerpp 1, 36, 33 #endif .if \Complete==0 lxv vs32, DISP2(\Index, \OffsetA)(\AREG) lxvp vs36, DISP8(\Index, 32+\OffsetB)(\BREG) #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) xxpermdi vs33, vs32, vs38, 2 xxpermdi vs32, vs32, vs38, 0 #else xxpermdi vs33, vs32, vs38, 0 xxpermdi vs32, vs32, vs38, 2 #endif .endif .if \IsLast==1 .if \Complete==1 addi \AREG, \AREG, DISP2(\Index, \OffsetA) addi \BREG, \BREG, DISP8(\Index, \OffsetB) .else addi \AREG, \AREG, DISP2(\Index, 16) addi \BREG, \BREG, DISP8(\Index, 64) .endif .endif .endm .macro KERNEL4x1 LOAD4x1 END4x1 AO, BO, 8, 32 .endm .macro SAVE4x1 SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44 SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45 xxpermdi vs32, vs32, vs36, 1 xxpermdi vs40, vs40, vs44, 1 xxpermdi vs33, vs33, vs37, 1 xxpermdi vs41, vs41, vs45, 1 add T4, LDC, LDC add T1, CO, LDC add T2, CO, T4 add T3, T1, T4 #ifndef TRMMKERNEL lxsd v4, 0(CO) #endif #ifndef TRMMKERNEL lxsd v5, 0(T1) #endif #ifndef TRMMKERNEL lxsd v6, 0(T2) #endif #ifndef TRMMKERNEL lxsd v7, 0(T3) #endif xxperm vs0, vs32, permute_mask xxperm vs4, vs40, permute_mask xxperm vs1, vs33, permute_mask xxperm vs5, vs41, permute_mask AGGREGATE_REALS_IMAGES_A_PERMUTE vs32, vs0, vs40, vs4 AGGREGATE_REALS_IMAGES_A_PERMUTE vs33, vs1, vs41, vs5 /*VSINRR, VSINII, VSOUT1, VSOUT2*/ MULT_APLHA_PART1 vs32, vs40, vs0, vs1 MULT_APLHA_PART1 vs33, vs41, vs2, vs3 MULT_APLHA_PART2 vs32, vs40, vs0, vs1 MULT_APLHA_PART2 vs33, vs41, vs2, vs3 /* reconstruct r, i pairs*/ xxperm vs0, vs1, save_permute_1 xxperm vs2, vs3, save_permute_1 #ifndef TRMMKERNEL /* add */ xxspltd vs1, vs0, 0 xxspltd vs3, vs0, 1 xxspltd vs9, vs2, 0 xxspltd vs11, vs2, 1 /*--v4==vs36 v5==vs37 v6==vs38 v7==vs39---*/ xvaddsp vs36, vs36, vs1 xvaddsp vs37, vs37, vs3 xvaddsp vs38, vs38, vs9 xvaddsp vs39, vs39, vs11 #else /*--v4==vs36 v5==vs37 v6==vs38 v7==vs39---*/ xxspltd vs36, vs0, 0 xxspltd vs37, vs0, 1 xxspltd vs38, vs2, 0 xxspltd vs39, vs2, 1 #endif stxsd v4, 0(CO) stxsd v5, 0(T1) stxsd v6, 0(T2) stxsd v7, 0(T3) addi CO, CO, 8 .endm /* macros for N=2 and M=8 **********************************************************************************************/ .macro ZERO2x8 xxsetaccz 0 xxsetaccz 1 xxsetaccz 2 xxsetaccz 3 .endm .macro LOAD2x8 LOAD2x8O 0, 0 .endm .macro LOAD2x8O OffsetA, OffsetB lxv vs34, (\OffsetB+0)(BO) lxvp vs32, (\OffsetA+0)(AO) lxvp vs36, (\OffsetA+32)(AO) .endm .macro END2x8_NORMAL END2x8 AO, BO, 64, 16 .endm .macro END2x8_WITHOUT_ADD END2x8 AO, BO, 0, 0 .endm .macro END2x8 AREG, BREG, OffsetA, OffsetB .if \OffsetB != 0 addi \BREG, \BREG, \OffsetB .endif .if \OffsetA != 0 addi \AREG, \AREG, \OffsetA .endif xvf32gerpp 2, 37, 34 xvf32gerpp 3, 36, 34 xvf32gerpp 0, 33, 34 
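/* Added note (not part of the original kernel, for clarity): xvf32gerpp is an
   MMA rank-1 update, ACC[i][j] += XA[i]*XB[j] over 4x4 fp32 lanes. With the
   interleaved (real,imag) loads used here, each accumulator gathers the r*r,
   r*i, i*r and i*i cross products of a 2x2 complex sub-tile; they are split
   and recombined later by SHUFFLE_ACC and AGGREGATE_REALS_IMAGES in the SAVE
   macros. */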
xvf32gerpp 1, 32, 34 .endm .macro LOAD2x8_2 LOAD2x8_2O 0, 0 .endm .macro LOAD2x8_2O OffsetA, OffsetB lxvp vs34, (\OffsetB)(BO) lxvp vs32, (0+\OffsetA)(AO) lxvp vs36, (32+\OffsetA)(AO) lxvp vs38, (64+\OffsetA)(AO) lxvp vs40, (64+32+\OffsetA)(AO) .endm .macro END2x8_2 /*for load2 offset will be 128 and 32*/ KERNEL2x8_2 AO, BO, 128, 32, 0, 1, 1 .endm .macro KERNEL2x8_E2 OffsetA, OffsetB, Index, IsLast KERNEL2x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 .endm .macro KERNEL2x8_L2 OffsetA, OffsetB, Index, IsLast KERNEL2x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 .endm .macro KERNEL2x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) xvf32gerpp 2, 37, 34 xvf32gerpp 3, 36, 34 xvf32gerpp 0, 33, 34 xvf32gerpp 1, 32, 34 #else xvf32gerpp 2, 37, 35 xvf32gerpp 3, 36, 35 xvf32gerpp 0, 33, 35 xvf32gerpp 1, 32, 35 #endif .if \Complete==0 lxvp vs32, DISP16(\Index, 0+\OffsetA)(\AREG) lxvp vs36, DISP16(\Index, 32+\OffsetA)(\AREG) .endif #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) xvf32gerpp 2, 41, 35 xvf32gerpp 3, 40, 35 xvf32gerpp 0, 39, 35 xvf32gerpp 1, 38, 35 #else xvf32gerpp 2, 41, 34 xvf32gerpp 3, 40, 34 xvf32gerpp 0, 39, 34 xvf32gerpp 1, 38, 34 #endif .if \Complete==0 lxvp vs34, DISP4(\Index, \OffsetB)(\BREG) lxvp vs38, DISP16(\Index, 64+\OffsetA)(\AREG) lxvp vs40, DISP16(\Index, 64+32+\OffsetA)(\AREG) .endif .if \IsLast==1 .if \Complete==1 addi \BREG, \BREG, DISP4(\Index, \OffsetB) addi \AREG, \AREG, DISP16(\Index, \OffsetA) .else addi \BREG, \BREG, DISP4(\Index, 32) addi \AREG, \AREG, DISP16(\Index, 128) .endif .endif .endm .macro KERNEL2x8 LOAD2x8 END2x8 AO, BO, 64, 16 .endm .macro SAVE2x8 SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44 SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45 SHUFFLE_ACC 2, vs8, vs9, vs10, vs11, vs34, vs42, vs38, vs46 SHUFFLE_ACC 3, vs12, vs13, vs14, vs15, vs35, vs43, vs39, vs47 add T1, CO, LDC #ifndef TRMMKERNEL lxvp vs24, 0(CO) #endif #ifndef TRMMKERNEL lxvp vs26, 32(CO) #endif #ifndef TRMMKERNEL lxvp vs28, 0(T1) #endif #ifndef TRMMKERNEL lxvp vs30, 32(T1) #endif add T2, CO, T4 add T3, T1, T4 GROUP1 AGG_GROUP1 GROUP2 AGG_GROUP2 /*VSINRR, VSINII, VSOUT1, VSOUT2*/ MULTIPLY_GROUP1 MULTIPLY_GROUP2 /* reconstruct r, i pairs*/ RECONSTRUCT_PAIR1 RECONSTRUCT_PAIR2 #ifndef TRMMKERNEL /* add */ #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) xxpermdi vs1, vs0, vs8, 1 xxpermdi vs3, vs2, vs10, 1 xxpermdi vs5, vs4, vs12, 1 xxpermdi vs7, vs6, vs14, 1 xxpermdi vs9, vs8, vs0, 1 xxpermdi vs11, vs10, vs2, 1 #else xxpermdi vs1, vs8, vs0, 2 xxpermdi vs3, vs10, vs2, 2 xxpermdi vs5, vs12, vs4, 2 xxpermdi vs7, vs14, vs6, 2 xxpermdi vs9, vs0, vs8, 2 xxpermdi vs11, vs2, vs10, 2 #endif xvaddsp vs24, vs24, vs3 xvaddsp vs25, vs25, vs1 #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) xxpermdi vs13, vs12, vs4, 1 xxpermdi vs15, vs14, vs6, 1 #else xxpermdi vs13, vs4, vs12, 2 xxpermdi vs15, vs6, vs14, 2 #endif xvaddsp vs26, vs26, vs7 xvaddsp vs27, vs27, vs5 xvaddsp vs28, vs28, vs11 xvaddsp vs29, vs29, vs9 xvaddsp vs30, vs30, vs15 xvaddsp vs31, vs31, vs13 #else #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) xxpermdi vs25, vs0, vs8, 1 xxpermdi vs24, vs2, vs10, 1 xxpermdi vs27, vs4, vs12, 1 xxpermdi vs26, vs6, vs14, 1 xxpermdi vs29, vs8, vs0, 1 xxpermdi vs28, vs10, vs2, 1 xxpermdi vs31, vs12, vs4, 1 xxpermdi vs30, vs14, vs6, 1 #else xxpermdi vs25, vs8, vs0, 2 xxpermdi vs24, vs10, vs2, 2 xxpermdi vs27, vs12, vs4, 2 xxpermdi vs26, vs14, vs6, 2 xxpermdi vs29, vs0, vs8, 2 xxpermdi vs28, vs2, vs10, 2 xxpermdi vs31, vs4, vs12, 2 xxpermdi 
vs30, vs6, vs14, 2 #endif #endif stxvp vs24, 0(CO) stxvp vs26, 32(CO) stxvp vs28, 0(T1) stxvp vs30, 32(T1) addi CO, CO, 64 .endm /* macros for N=2 and M=4 **********************************************************************************************/ .macro ZERO2x4 xxsetaccz 0 xxsetaccz 1 .endm .macro LOAD2x4 LOAD2x4O 0, 0 .endm .macro LOAD2x4O OffsetA, OffsetB lxv vs34, (\OffsetB+0)(BO) lxvp vs32, (\OffsetA+0)(AO) .endm .macro END2x4_NORMAL END2x4 AO, BO, 32, 16 .endm .macro END2x4_WITHOUT_ADD END2x4 AO, BO, 0, 0 .endm .macro END2x4 AREG, BREG, OffsetA, OffsetB .if \OffsetB != 0 addi \BREG, \BREG, \OffsetB .endif .if \OffsetA != 0 addi \AREG, \AREG, \OffsetA .endif xvf32gerpp 0, 33, 34 xvf32gerpp 1, 32, 34 .endm .macro LOAD2x4_2 LOAD2x4_2O 0, 0 .endm .macro LOAD2x4_2O OffsetA, OffsetB lxvp vs34, (\OffsetB)(BO) lxvp vs32, (0+\OffsetA)(AO) lxvp vs36, (32+\OffsetA)(AO) .endm .macro END2x4_2 /*for load2 offset will be 64 and 32*/ KERNEL2x4_2 AO, BO, 64, 32, 0, 1, 1 .endm .macro KERNEL2x4_E2 OffsetA, OffsetB, Index, IsLast KERNEL2x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 .endm .macro KERNEL2x4_L2 OffsetA, OffsetB, Index, IsLast KERNEL2x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 .endm .macro KERNEL2x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) xvf32gerpp 0, 33, 34 xvf32gerpp 1, 32, 34 #else xvf32gerpp 0, 33, 35 xvf32gerpp 1, 32, 35 #endif .if \Complete==0 lxvp vs32, DISP8(\Index, 0+\OffsetA)(\AREG) .endif #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) xvf32gerpp 0, 37, 35 xvf32gerpp 1, 36, 35 #else xvf32gerpp 0, 37, 34 xvf32gerpp 1, 36, 34 #endif .if \Complete==0 lxvp vs34, DISP4(\Index, \OffsetB)(\BREG) lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG) .endif .if \IsLast==1 .if \Complete==1 addi \BREG, \BREG, DISP4(\Index, \OffsetB) addi \AREG, \AREG, DISP8(\Index, \OffsetA) .else addi \BREG, \BREG, DISP4(\Index, 32) addi \AREG, \AREG, DISP8(\Index, 64) .endif .endif .endm .macro KERNEL2x4 LOAD2x4 END2x4 AO, BO, 32, 16 .endm .macro SAVE2x4 SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44 SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45 add T1, CO, LDC #ifndef TRMMKERNEL lxvp vs24, 0(CO) #endif #ifndef TRMMKERNEL lxvp vs26, 0(T1) #endif GROUP1 AGG_GROUP1 /*VSINRR, VSINII, VSOUT1, VSOUT2*/ MULTIPLY_GROUP1 /* reconstruct r, i pairs*/ RECONSTRUCT_PAIR1 #ifndef TRMMKERNEL /* add */ #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) xxpermdi vs1, vs0, vs8, 1 xxpermdi vs3, vs2, vs10, 1 xxpermdi vs9, vs8, vs0, 1 xxpermdi vs11, vs10, vs2, 1 #else xxpermdi vs1, vs8, vs0, 2 xxpermdi vs3, vs10, vs2, 2 xxpermdi vs9, vs0, vs8, 2 xxpermdi vs11, vs2, vs10, 2 #endif xvaddsp vs24, vs24, vs3 xvaddsp vs25, vs25, vs1 xvaddsp vs26, vs26, vs11 xvaddsp vs27, vs27, vs9 #else #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) xxpermdi vs25, vs0, vs8, 1 xxpermdi vs24, vs2, vs10, 1 xxpermdi vs27, vs8, vs0, 1 xxpermdi vs26, vs10, vs2, 1 #else xxpermdi vs25, vs8, vs0, 2 xxpermdi vs24, vs10, vs2, 2 xxpermdi vs27, vs0, vs8, 2 xxpermdi vs26, vs2, vs10, 2 #endif #endif stxvp vs24, 0(CO) stxvp vs26, 0(T1) addi CO, CO, 32 .endm /* macros for N=2 and M=2 **********************************************************************************************/ .macro ZERO2x2 xxsetaccz 0 .endm .macro LOAD2x2 LOAD2x2O 0, 0 .endm .macro LOAD2x2O OffsetA, OffsetB lxv vs32, (\OffsetA+0)(AO) lxv vs34, (\OffsetB+0)(BO) .endm .macro END2x2_NORMAL END2x2 AO, BO, 16, 16 .endm .macro END2x2_WITHOUT_ADD END2x2 AO, BO, 0, 0 .endm .macro END2x2 AREG, BREG, OffsetA, OffsetB .if 
\OffsetB != 0 addi \BREG, \BREG, \OffsetB .endif .if \OffsetA != 0 addi \AREG, \AREG, \OffsetA .endif xvf32gerpp 0, 34, 32 .endm .macro LOAD2x2_2 LOAD2x2_2O 0, 0 .endm .macro LOAD2x2_2O OffsetA, OffsetB lxvp vs32, (\OffsetA)(AO) lxvp vs34, (0+\OffsetB)(BO) .endm .macro END2x2_2 /*for load2 offset will be 32 and 32*/ KERNEL2x2_2 AO, BO, 32, 32, 0, 1, 1 .endm .macro KERNEL2x2_E2 OffsetA, OffsetB, Index, IsLast KERNEL2x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 .endm .macro KERNEL2x2_L2 OffsetA, OffsetB, Index, IsLast KERNEL2x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 .endm .macro KERNEL2x2_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete xvf32gerpp 0, 34, 32 xvf32gerpp 0, 35, 33 .if \Complete==0 lxvp vs32, DISP4(\Index, \OffsetA)(\AREG) lxvp vs34, DISP4(\Index, \OffsetA)(\BREG) .endif .if \IsLast==1 .if \Complete==1 addi \AREG, \AREG, DISP4(\Index, \OffsetA) addi \BREG, \BREG, DISP4(\Index, \OffsetB) .else addi \AREG, \AREG, DISP4(\Index, 32) addi \BREG, \BREG, DISP4(\Index, 32) .endif .endif .endm .macro KERNEL2x2 LOAD2x2 END2x2 AO, BO, 16, 16 .endm .macro SAVE2x2 SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44 add T1, CO, LDC #ifndef TRMMKERNEL lxv vs24, 0(CO) #endif #ifndef TRMMKERNEL lxv vs26, 0(T1) #endif xxperm vs0, vs32, permute_mask xxperm vs4, vs40, permute_mask xxperm vs8, vs36, permute_mask xxperm vs12, vs44, permute_mask AGGREGATE_REALS_IMAGES_A_PERMUTE vs32, vs0, vs40, vs4 AGGREGATE_REALS_IMAGES_A_PERMUTE vs36, vs8, vs44, vs12 /*VSINRR, VSINII, VSOUT1, VSOUT2*/ MULT_APLHA_PART1 vs32, vs40, vs0, vs1 MULT_APLHA_PART1 vs36, vs44, vs8, vs9 MULT_APLHA_PART2 vs32, vs40, vs0, vs1 MULT_APLHA_PART2 vs36, vs44, vs8, vs9 /* reconstruct r, i pairs*/ xxperm vs0, vs1, save_permute_1 xxperm vs8, vs9, save_permute_1 #ifndef TRMMKERNEL /* add */ #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) xxpermdi vs1, vs0, vs8, 0 xxpermdi vs9, vs8, vs0, 3 #else xxpermdi vs1, vs8, vs0, 0 xxpermdi vs9, vs0, vs8, 3 #endif xvaddsp vs24, vs24, vs1 xvaddsp vs26, vs26, vs9 #else #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) xxpermdi vs24, vs0, vs8, 0 xxpermdi vs26, vs8, vs0, 3 #else xxpermdi vs24, vs8, vs0, 0 xxpermdi vs26, vs0, vs8, 3 #endif #endif stxv vs24, 0(CO) stxv vs26, 0(T1) addi CO, CO, 16 .endm /* macros for N=2 and M=1 **********************************************************************************************/ .macro ZERO2x1 xxlxor vs32, vs32, vs32 xxlxor vs40, vs40, vs40 .endm .macro LOAD2x1 LOAD2x1O 0, 0 .endm .macro LOAD2x1O OffsetA, OffsetB lxsd v4, (\OffsetA+0)(AO) lxv vs0, (\OffsetB+0)(BO) xxspltd vs24, vs36, 0 xxperm vs26, vs24, permute_mask .endm .macro END2x1_NORMAL END2x1 AO, BO,8, 16 .endm .macro END2x1_WITHOUT_ADD END2x1 AO, BO, 0, 0 .endm .macro END2x1 AREG, BREG, OffsetA, OffsetB .if \OffsetB != 0 addi \BREG, \BREG, \OffsetB .endif .if \OffsetA != 0 addi \AREG, \AREG, \OffsetA .endif xvmaddasp vs32, vs0, vs24 xvmaddasp vs40, vs0, vs26 .endm .macro LOAD2x1_2 LOAD2x1_2O 0, 0 .endm .macro LOAD2x1_2O OffsetA, OffsetB lxv vs27, (\OffsetA)(AO) lxvp vs4, (0+\OffsetB)(BO) xxspltd vs8, vs27, 1 xxspltd vs24, vs27, 0 xxperm vs10, vs8, permute_mask xxperm vs26, vs24, permute_mask .endm .macro END2x1_2 /*for load2 offset will be 16 and 32*/ KERNEL2x1_2 AO, BO, 16, 32, 0, 1, 1 .endm .macro KERNEL2x1_E2 OffsetA, OffsetB, Index, IsLast KERNEL2x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 .endm .macro KERNEL2x1_L2 OffsetA, OffsetB, Index, IsLast KERNEL2x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 .endm .macro KERNEL2x1_2 AREG, BREG, OffsetA, OffsetB, Index, 
IsLast, Complete xvmaddasp vs32, vs5, vs8 xvmaddasp vs40, vs5, vs10 .if \Complete==0 lxv vs27, DISP2(\Index, \OffsetA)(\AREG) xxspltd vs8, vs27, 1 .endif .if \Complete==0 xxperm vs10, vs8, permute_mask .endif xvmaddasp vs32, vs4, vs24 xvmaddasp vs40, vs4, vs26 .if \Complete==0 xxspltd vs24, vs27, 0 xxperm vs26, vs24, permute_mask .endif .if \Complete==0 lxvp vs4, DISP4(\Index, 0+\OffsetB)(\BREG) .endif .if \IsLast==1 .if \Complete==1 addi \AREG, \AREG, DISP2(\Index, \OffsetA) addi \BREG, \BREG, DISP4(\Index, \OffsetB) .else addi \AREG, \AREG, DISP2(\Index, 16) addi \BREG, \BREG, DISP4(\Index, 32) .endif .endif .endm .macro KERNEL2x1 LOAD2x1 END2x1 AO, BO, 8, 16 .endm .macro SAVE2x1 add T1, CO, LDC #ifndef TRMMKERNEL lxsd v4, 0(CO) #endif #ifndef TRMMKERNEL lxsd v5, 0(T1) #endif xxperm vs0, vs32, permute_mask xxperm vs4, vs40, permute_mask AGGREGATE_REALS_IMAGES_A_PERMUTE vs32, vs0, vs40, vs4 AGGREGATE_REALS_IMAGES_A_PERMUTE vs33, vs1, vs41, vs5 /*VSINRR, VSINII, VSOUT1, VSOUT2*/ MULT_APLHA_PART1 vs32, vs40, vs0, vs1 MULT_APLHA_PART2 vs32, vs40, vs0, vs1 /* reconstruct r, i pairs*/ xxperm vs0, vs1, save_permute_1 #ifndef TRMMKERNEL /* add */ xxspltd vs1, vs0, 0 xxspltd vs3, vs0, 1 /*--v4==vs36 v5==vs37---*/ xvaddsp vs36, vs36, vs1 xvaddsp vs37, vs37, vs3 #else /*--v4==vs36 v5==vs37---*/ xxspltd vs36, vs0, 0 xxspltd vs37, vs0, 1 #endif stxsd v4, 0(CO) stxsd v5, 0(T1) addi CO, CO, 8 .endm /* macros for N=1 and M=8 **********************************************************************************************/ .macro ZERO1x8 xxsetaccz 0 xxsetaccz 1 xxsetaccz 2 xxsetaccz 3 .endm .macro LOAD1x8 LOAD1x8O 0, 0 .endm .macro LOAD1x8O OffsetA, OffsetB lxsd v2, (\OffsetB+0)(BO) lxvp vs32, (\OffsetA+0)(AO) lxvp vs36, (\OffsetA+32)(AO) .endm .macro END1x8_NORMAL END1x8 AO, BO, 64,8 .endm .macro END1x8_WITHOUT_ADD END1x8 AO, BO, 0, 0 .endm .macro END1x8 AREG, BREG, OffsetA, OffsetB .if \OffsetB != 0 addi \BREG, \BREG, \OffsetB .endif .if \OffsetA != 0 addi \AREG, \AREG, \OffsetA .endif xvf32gerpp 0, 34, 33 xvf32gerpp 1, 34, 32 xvf32gerpp 2, 34, 37 xvf32gerpp 3, 34, 36 .endm .macro LOAD1x8_2 LOAD1x8_2O 0, 0 .endm .macro LOAD1x8_2O OffsetA, OffsetB lxv vs34, (\OffsetB)(BO) lxvp vs32, (0+\OffsetA)(AO) lxvp vs36, (32+\OffsetA)(AO) vspltisb v10, 0 #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) xxpermdi vs35, vs34, vs42, 2 xxpermdi vs34, vs34, vs42, 0 #else xxpermdi vs35, vs34, vs42, 0 xxpermdi vs34, vs34, vs42, 2 #endif lxvp vs38, (64+\OffsetA)(AO) lxvp vs40, (64+32+\OffsetA)(AO) .endm .macro END1x8_2 /*for load2 offset will be 128 and 16*/ KERNEL1x8_2 AO, BO, 128, 16, 0, 1, 1 .endm .macro KERNEL1x8_E2 OffsetA, OffsetB, Index, IsLast KERNEL1x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 .endm .macro KERNEL1x8_L2 OffsetA, OffsetB, Index, IsLast KERNEL1x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 .endm .macro KERNEL1x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete xvf32gerpp 0, 34, 33 xvf32gerpp 1, 34, 32 .if \Complete==0 lxvp vs32, DISP16(\Index, 0+\OffsetA)(\AREG) .endif xvf32gerpp 2, 34, 37 xvf32gerpp 3, 34, 36 .if \Complete==0 lxvp vs36, DISP16(\Index, 32+\OffsetA)(\AREG) .endif xvf32gerpp 0, 35, 39 xvf32gerpp 1, 35, 38 .if \Complete==0 lxvp vs38, DISP16(\Index, 64+\OffsetA)(\AREG) .endif xvf32gerpp 2, 35, 41 xvf32gerpp 3, 35, 40 .if \Complete==0 lxv vs34, DISP2(\Index, \OffsetB)(\BREG) #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) xxpermdi vs35, vs34, vs42, 2 xxpermdi vs34, vs34, vs42, 0 #else xxpermdi vs35, vs34, vs42, 0 xxpermdi vs34, vs34, vs42, 2 #endif lxvp vs40, DISP16(\Index, 
64+32+\OffsetA)(\AREG) .endif .if \IsLast==1 .if \Complete==1 addi \BREG, \BREG, DISP2(\Index, \OffsetB) addi \AREG, \AREG, DISP16(\Index, \OffsetA) .else addi \BREG, \BREG, DISP2(\Index, 16) addi \AREG, \AREG, DISP16(\Index, 128) .endif .endif .endm .macro KERNEL1x8 LOAD1x8 END1x8 AO, BO, 64,8 .endm .macro SAVE1x8 SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44 SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45 SHUFFLE_ACC 2, vs8, vs9, vs10, vs11, vs34, vs42, vs38, vs46 SHUFFLE_ACC 3, vs12, vs13, vs14, vs15, vs35, vs43, vs39, vs47 xxpermdi vs32, vs32, vs36, 0 xxpermdi vs33, vs33, vs37, 0 xxpermdi vs34, vs34, vs38, 0 xxpermdi vs35, vs35, vs39, 0 xxpermdi vs40, vs40, vs44, 0 xxperm vs40, vs40, permute_mask xxpermdi vs41, vs41, vs45, 0 xxperm vs41, vs41, permute_mask xxpermdi vs42, vs42, vs46, 0 xxperm vs42, vs42, permute_mask xxpermdi vs43, vs43, vs47, 0 xxperm vs43, vs43, permute_mask #ifndef TRMMKERNEL lxvp vs24, 0(CO) #endif xxperm vs0, vs32, permute_mask xxperm vs4, vs40, permute_mask #ifndef TRMMKERNEL lxvp vs26, 32(CO) #endif xxperm vs1, vs33, permute_mask xxperm vs5, vs41, permute_mask xxperm vs2, vs34, permute_mask xxperm vs6, vs42, permute_mask xxperm vs3, vs35, permute_mask xxperm vs7, vs43, permute_mask AGGREGATE_REALS_IMAGES vs32, vs0, vs40, vs4 AGGREGATE_REALS_IMAGES vs33, vs1, vs41, vs5 AGGREGATE_REALS_IMAGES vs34, vs2, vs42, vs6 AGGREGATE_REALS_IMAGES vs35, vs3, vs43, vs7 /*inner reverse save_permute and store vs28 */ xxpermdi vs28,save_permute_1,save_permute_1, 2 /*VSINRR, VSINII, VSOUT1, VSOUT2*/ MULT_APLHA_PART1 vs32, vs40, vs0, vs1 MULT_APLHA_PART1 vs33, vs41, vs2, vs3 MULT_APLHA_PART1 vs34, vs42, vs4, vs5 MULT_APLHA_PART1 vs35, vs43, vs6, vs7 MULT_APLHA_PART2 vs32, vs40, vs0, vs1 MULT_APLHA_PART2 vs33, vs41, vs2, vs3 MULT_APLHA_PART2 vs34, vs42, vs4, vs5 MULT_APLHA_PART2 vs35, vs43, vs6, vs7 /* reconstruct r, i pairs*/ #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) xxperm vs0, vs1, save_permute_1 xxperm vs2, vs3, save_permute_1 xxperm vs4, vs5, save_permute_1 xxperm vs6, vs7, save_permute_1 #else xxperm vs0, vs1, vs28 xxperm vs2, vs3, vs28 xxperm vs4, vs5, vs28 xxperm vs6, vs7, vs28 #endif #ifndef TRMMKERNEL /* add */ xvaddsp vs24, vs24, vs2 xvaddsp vs25, vs25, vs0 xvaddsp vs26, vs26, vs6 xvaddsp vs27, vs27, vs4 stxvp vs24, 0(CO) stxvp vs26, 32(CO) #else /* reconstruct r, i pairs*/ #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) stxv vs2, 0(CO) stxv vs0, 16(CO) stxv vs6, 32(CO) stxv vs4, 48(CO) #else stxv vs0, 0(CO) stxv vs2, 16(CO) stxv vs4, 32(CO) stxv vs6, 48(CO) #endif #endif addi CO, CO, 64 .endm /* macros for N=1 and M=4 **********************************************************************************************/ .macro ZERO1x4 xxsetaccz 0 xxsetaccz 1 .endm .macro LOAD1x4 LOAD1x4O 0, 0 .endm .macro LOAD1x4O OffsetA, OffsetB lxsd v2, (\OffsetB+0)(BO) lxvp vs32, (\OffsetA+0)(AO) .endm .macro END1x4_NORMAL END1x4 AO, BO, 32,8 .endm .macro END1x4_WITHOUT_ADD END1x4 AO, BO, 0, 0 .endm .macro END1x4 AREG, BREG, OffsetA, OffsetB .if \OffsetB != 0 addi \BREG, \BREG, \OffsetB .endif .if \OffsetA != 0 addi \AREG, \AREG, \OffsetA .endif xvf32gerpp 0, 34, 33 xvf32gerpp 1, 34, 32 .endm .macro LOAD1x4_2 LOAD1x4_2O 0, 0 .endm .macro LOAD1x4_2O OffsetA, OffsetB lxv vs34, (\OffsetB)(BO) lxvp vs32, (0+\OffsetA)(AO) vspltisb v6, 0 #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) xxpermdi vs35, vs34, vs38, 2 xxpermdi vs34, vs34, vs38, 0 #else xxpermdi vs35, vs34, vs38, 0 xxpermdi vs34, vs34, vs38, 2 #endif lxvp vs36, (32+\OffsetA)(AO) .endm .macro END1x4_2 /*for load2 offset 
will be 64 and 16*/ KERNEL1x4_2 AO, BO, 64, 16, 0, 1, 1 .endm .macro KERNEL1x4_E2 OffsetA, OffsetB, Index, IsLast KERNEL1x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 .endm .macro KERNEL1x4_L2 OffsetA, OffsetB, Index, IsLast KERNEL1x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 .endm .macro KERNEL1x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete xvf32gerpp 0, 34, 33 xvf32gerpp 1, 34, 32 .if \Complete==0 lxvp vs32, DISP8(\Index, 0+\OffsetA)(\AREG) .endif xvf32gerpp 0, 35, 37 xvf32gerpp 1, 35, 36 .if \Complete==0 lxv vs34, DISP2(\Index, \OffsetB)(\BREG) #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) xxpermdi vs35, vs34, vs38, 2 xxpermdi vs34, vs34, vs38, 0 #else xxpermdi vs35, vs34, vs38, 0 xxpermdi vs34, vs34, vs38, 2 #endif lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG) .endif .if \IsLast==1 .if \Complete==1 addi \BREG, \BREG, DISP2(\Index, \OffsetB) addi \AREG, \AREG, DISP8(\Index, \OffsetA) .else addi \BREG, \BREG, DISP2(\Index, 16) addi \AREG, \AREG, DISP8(\Index, 64) .endif .endif .endm .macro KERNEL1x4 LOAD1x4 END1x4 AO, BO, 32,8 .endm .macro SAVE1x4 SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44 SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45 xxpermdi vs32, vs32, vs36, 0 xxpermdi vs40, vs40, vs44, 0 xxpermdi vs33, vs33, vs37, 0 xxpermdi vs41, vs41, vs45, 0 xxperm vs40, vs40, permute_mask xxperm vs41, vs41, permute_mask #ifndef TRMMKERNEL lxvp vs24, 0(CO) #endif xxperm vs0, vs32, permute_mask xxperm vs4, vs40, permute_mask xxperm vs1, vs33, permute_mask xxperm vs5, vs41, permute_mask AGGREGATE_REALS_IMAGES vs32, vs0, vs40, vs4 AGGREGATE_REALS_IMAGES vs33, vs1, vs41, vs5 /*inner reverse save_permute and store vs28 */ xxpermdi vs28,save_permute_1,save_permute_1, 2 /*VSINRR, VSINII, VSOUT1, VSOUT2*/ MULT_APLHA_PART1 vs32, vs40, vs0, vs1 MULT_APLHA_PART1 vs33, vs41, vs2, vs3 MULT_APLHA_PART2 vs32, vs40, vs0, vs1 MULT_APLHA_PART2 vs33, vs41, vs2, vs3 /* reconstruct r, i pairs*/ #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) xxperm vs0, vs1, save_permute_1 xxperm vs2, vs3, save_permute_1 #else xxperm vs0, vs1, vs28 xxperm vs2, vs3, vs28 #endif #ifndef TRMMKERNEL /* add */ xvaddsp vs24, vs24, vs2 xvaddsp vs25, vs25, vs0 stxvp vs24, 0(CO) #else /* reconstruct r, i pairs*/ #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) stxv vs2, 0(CO) stxv vs0, 16(CO) #else stxv vs0, 0(CO) stxv vs2, 16(CO) #endif #endif addi CO, CO, 32 .endm /* macros for N=1 and M=2 **********************************************************************************************/ .macro ZERO1x2 xxlxor vs32, vs32, vs32 xxlxor vs40, vs40, vs40 .endm .macro LOAD1x2 LOAD1x2O 0, 0 .endm .macro LOAD1x2O OffsetA, OffsetB lxsd vs4, (\OffsetB+0)(BO) lxv vs0, (\OffsetA+0)(AO) xxspltd vs24, vs36, 0 xxperm vs26, vs24, permute_mask .endm .macro END1x2_NORMAL END1x2 AO, BO, 16,8 .endm .macro END1x2_WITHOUT_ADD END1x2 AO, BO, 0, 0 .endm .macro END1x2 AREG, BREG, OffsetA, OffsetB .if \OffsetB != 0 addi \BREG, \BREG, \OffsetB .endif .if \OffsetA != 0 addi \AREG, \AREG, \OffsetA .endif xvmaddasp vs32, vs0, vs24 xvmaddasp vs40, vs0, vs26 .endm .macro LOAD1x2_2 LOAD1x2_2O 0, 0 .endm .macro LOAD1x2_2O OffsetA, OffsetB lxv vs27, (\OffsetB)(BO) lxvp vs4, (0+\OffsetA)(AO) xxspltd vs8, vs27, 1 xxspltd vs24, vs27, 0 xxperm vs10, vs8, permute_mask xxperm vs26, vs24, permute_mask .endm .macro END1x2_2 /*for load2 offset will be 32 and 16*/ KERNEL1x2_2 AO, BO, 32, 16, 0, 1, 1 .endm .macro KERNEL1x2_E2 OffsetA, OffsetB, Index, IsLast KERNEL1x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 .endm .macro KERNEL1x2_L2 
OffsetA, OffsetB, Index, IsLast KERNEL1x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 .endm .macro KERNEL1x2_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete .if \Complete==0 lxv vs27, DISP2(\Index, \OffsetB)(\BREG) .endif xvmaddasp vs32, vs5, vs8 xvmaddasp vs40, vs5, vs10 .if \Complete==0 xxspltd vs8, vs27, 1 xxperm vs10, vs8, permute_mask .endif xvmaddasp vs32, vs4, vs24 xvmaddasp vs40, vs4, vs26 .if \Complete==0 lxvp vs4, DISP4(\Index, 0+\OffsetA)(\AREG) .endif .if \Complete==0 xxspltd vs24, vs27, 0 xxperm vs26, vs24, permute_mask .endif .if \IsLast==1 .if \Complete==1 addi \BREG, \BREG, DISP2(\Index, \OffsetB) addi \AREG, \AREG, DISP4(\Index, \OffsetA) .else addi \BREG, \BREG, DISP2(\Index, 16) addi \AREG, \AREG, DISP4(\Index, 32) .endif .endif .endm .macro KERNEL1x2 LOAD1x2 END1x2 AO, BO, 16,8 .endm .macro SAVE1x2 #ifndef TRMMKERNEL lxv vs24, 0(CO) #endif xxperm vs0, vs32, permute_mask xxperm vs4, vs40, permute_mask AGGREGATE_REALS_IMAGES vs32, vs0, vs40, vs4 /*inner reverse save_permute and store vs28 */ xxpermdi vs28,save_permute_1,save_permute_1, 2 /*VSINRR, VSINII, VSOUT1, VSOUT2*/ MULT_APLHA_PART1 vs32, vs40, vs0, vs1 MULT_APLHA_PART2 vs32, vs40, vs0, vs1 /* reconstruct r, i pairs*/ #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) xxperm vs0, vs1, save_permute_1 #else xxperm vs0, vs1, vs28 #endif #ifndef TRMMKERNEL /* add */ xvaddsp vs24, vs24, vs0 stxv vs24, 0(CO) #else /* reconstruct r, i pairs*/ stxv vs0, 0(CO) #endif addi CO, CO, 16 .endm /* macros for N=1 and M=1 **********************************************************************************************/ .macro ZERO1x1 xxlxor vs32, vs32, vs32 xxlxor vs40, vs40, vs40 .endm .macro LOAD1x1 LOAD1x1O 0, 0 .endm .macro LOAD1x1O OffsetA, OffsetB lxsd v4, (\OffsetB+0)(BO) lxsd v5, (\OffsetA+0)(AO) xxperm vs38, vs36, permute_mask .endm .macro END1x1_NORMAL END1x1 AO, BO,8,8 .endm .macro END1x1_WITHOUT_ADD END1x1 AO, BO, 0, 0 .endm .macro END1x1 AREG, BREG, OffsetA, OffsetB .if \OffsetB != 0 addi \BREG, \BREG, \OffsetB .endif .if \OffsetA != 0 addi \AREG, \AREG, \OffsetA .endif xvmaddasp vs32, vs37, vs36 xvmaddasp vs40, vs37, vs38 .endm .macro LOAD1x1_2 LOAD1x1_2O 0, 0 .endm .macro LOAD1x1_2O OffsetA, OffsetB lxv vs8, (\OffsetB)(BO) lxv vs4, (0+\OffsetA)(AO) xxperm vs10, vs8, permute_mask .endm .macro END1x1_2 /*for load2 offset will be 16 and 16*/ KERNEL1x1_2 AO, BO, 16, 16, 0, 1, 1 .endm .macro KERNEL1x1_E2 OffsetA, OffsetB, Index, IsLast KERNEL1x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 .endm .macro KERNEL1x1_L2 OffsetA, OffsetB, Index, IsLast KERNEL1x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 .endm .macro KERNEL1x1_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete xvmaddasp vs32, vs4, vs8 xvmaddasp vs40, vs4, vs10 .if \Complete==0 lxv vs8, DISP2(\Index, \OffsetB)(\BREG) lxv vs4, DISP2(\Index, \OffsetB)(\AREG) xxperm vs10, vs8, permute_mask .endif .if \IsLast==1 .if \Complete==1 addi \BREG, \BREG, DISP2(\Index, \OffsetB) addi \AREG, \AREG, DISP2(\Index, \OffsetA) .else addi \BREG, \BREG, DISP2(\Index, 16) addi \AREG, \AREG, DISP2(\Index, 16) .endif .endif .endm .macro KERNEL1x1 LOAD1x1 END1x1 AO, BO, 8,8 .endm .macro SAVE1x1 #ifndef TRMMKERNEL lxsd v4, 0(CO) #endif /*aggregate x2*/ xxpermdi vs33, vs32, vs32, 2 xxpermdi vs41, vs40, vs40, 2 xvaddsp vs32, vs32, vs33 xvaddsp vs40, vs40, vs41 xxperm vs0, vs32, permute_mask xxperm vs4, vs40, permute_mask AGGREGATE_REALS_IMAGES vs32, vs0, vs40, vs4 /*inner reverse save_permute and store vs28 */ xxpermdi vs28,save_permute_1,save_permute_1, 2 /*VSINRR, 
VSINII, VSOUT1, VSOUT2*/
MULT_APLHA_PART1 vs32, vs40, vs37, vs1
MULT_APLHA_PART2 vs32, vs40, vs37, vs1
/* reconstruct r, i pairs*/
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxperm vs37, vs1, save_permute_1
#else
xxperm vs37, vs1, vs28
#endif
#ifndef TRMMKERNEL
/* add */
xvaddsp vs36, vs36, vs37
stxsd v4, 0(CO)
#else
/* vs37 is v5 */
stxsd v5, 0(CO)
#endif
addi CO, CO, 8
.endm

/****************************TRMM POINTER REFRESH MACROS*************************/
.macro SHIFT_REG REG1, REG2, SHIFT_VAL
.if \SHIFT_VAL==16
slwi \REG1, \REG2, 7
.elseif \SHIFT_VAL==8
slwi \REG1, \REG2, 6
.elseif \SHIFT_VAL==4
slwi \REG1, \REG2, 5
.elseif \SHIFT_VAL==2
slwi \REG1, \REG2, 4
.elseif \SHIFT_VAL==1
slwi \REG1, \REG2, 3
.endif
.endm

/*
//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
// ptrbb = bb;
// #else
// ptrba += off*8;
// ptrbb = bb + off*4;
// #endif
*/
.macro REFRESH_POINTERS PTR_A, PTR_B, OFF_VAL, B_VAL, C_A, C_B
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
/* ptrbb = bb;*/
mr \PTR_B, \B_VAL /* refresh BPOINT */
#else
/*
// ptrba = ptrba + off*C_A;
// ptrbb = bb + off*C_B;
*/
SHIFT_REG T4, \OFF_VAL, \C_B /* Number of values in B shifted */
SHIFT_REG T2, \OFF_VAL, \C_A /* Number of values in A shifted */
add \PTR_B, \B_VAL, T4 /* Add values to BO */
add \PTR_A, \PTR_A, T2 /* Add values to AO */
#endif
.endm

/*
// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
// temp = bk-off;
// #elif defined(LEFT)
// temp = off+8; // number of values in A
// #else
// temp = off+4; // number of values in B
// #endif
*/
.macro REFRESH_TEMP_BK TEMP_BK, BK_VAL, OFF_VAL, INCR_A, INCR_B
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
/* temp = bk-off;*/
sub \TEMP_BK, \BK_VAL, \OFF_VAL
#elif defined(LEFT)
/* temp = off+INCR_A; // number of values in A */
addi \TEMP_BK, \OFF_VAL, \INCR_A
#else
/* temp = off+INCR_B; // number of values in B */
addi \TEMP_BK, \OFF_VAL, \INCR_B
#endif
.endm

/*
// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
// temp = bk - off;
// #ifdef LEFT
// temp -= 8; // number of values in A
// #else
// temp -= 4; // number of values in B
// #endif
// ptrba += temp*8;
// ptrbb += temp*4;
// #endif
// #ifdef LEFT
// off += 8; // number of values in A
// #endif
*/
.macro REFRESH_AFTER_SAVE TEMP_BK, BK_VAL, OFF_VAL, PTR_B, PTR_A, C_A, C_B
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
/*temp = bk - off;*/
sub \TEMP_BK, \BK_VAL, \OFF_VAL
#ifdef LEFT
/*temp -= 8; // number of values in A*/
addi \TEMP_BK, \TEMP_BK, -\C_A
#else
/*temp -= 4; // number of values in B*/
addi \TEMP_BK, \TEMP_BK, -\C_B
#endif
/*ptrba += temp*C_A; ptrbb += temp*C_B;*/
SHIFT_REG T4, \TEMP_BK, \C_A
SHIFT_REG T2, \TEMP_BK, \C_B
add \PTR_A, \PTR_A, T4 /*ptrba+temp*C_A*/
add \PTR_B, \PTR_B, T2
#endif
#ifdef LEFT
/*off += 8; // number of values in A*/
addi \OFF_VAL, \OFF_VAL, \C_A
#endif
.endm
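
/* Reference sketch (added for clarity; not used by the build): in scalar C the
   per-element update implemented by the MULT_APLHA_PART1/PART2 and
   AGGREGATE_REALS_IMAGES macros for the plain NN/NT/TN/TT case is roughly:

       rr = sum(a_r*b_r); ri = sum(a_r*b_i); ir = sum(a_i*b_r); ii = sum(a_i*b_i);
       t_r = rr - ii;                     // real part of sum(a*b)
       t_i = ri + ir;                     // imaginary part of sum(a*b)
       c_r += alpha_r*t_r - alpha_i*t_i;  // PART2 msub / PART1 alpha_i product
       c_i += alpha_r*t_i + alpha_i*t_r;  // PART2 madd / PART1 alpha_i product

   The conjugated variants (the C and R forms) flip the signs applied to the
   ii/ir sums, which is what the #if blocks inside AGGREGATE_REALS_IMAGES and
   AGGREGATE_REALS_IMAGES_A_PERMUTE select. */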