- /***************************************************************************
- Copyright (c) 2013-2016, The OpenBLAS Project
- All rights reserved.
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are
- met:
- 1. Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- 2. Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in
- the documentation and/or other materials provided with the
- distribution.
- 3. Neither the name of the OpenBLAS project nor the names of
- its contributors may be used to endorse or promote products
- derived from this software without specific prior written permission.
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
- LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
- USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *****************************************************************************/
-
- /**************************************************************************************
- * 2016/04/22 Werner Saar (wernsaar@googlemail.com)
- * BLASTEST : OK
- * CTEST : OK
- * TEST : OK
- * LAPACK-TEST : OK
- **************************************************************************************/
-
- #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
-
- #define XSFADD_R1 xsadddp
- #define XSFADD_R2 xssubdp
- #define XSFADD_I1 xsadddp
- #define XSFADD_I2 xsadddp
-
- #elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
-
- #define XSFADD_R1 xsadddp
- #define XSFADD_R2 xsadddp
- #define XSFADD_I1 xssubdp
- #define XSFADD_I2 xsadddp
-
- #elif defined(NC) || defined(TC) || defined(NR) || defined(TR)
-
- #define XSFADD_R1 xsadddp
- #define XSFADD_R2 xsadddp
- #define XSFADD_I1 xsadddp
- #define XSFADD_I2 xssubdp
-
- #else // CC || CR || RC || RR
-
- #define XSFADD_R1 xsadddp
- #define XSFADD_R2 xssubdp
- #define XSFADD_I1 xssubdp
- #define XSFADD_I2 xssubdp
-
- #endif
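- 
- /**************************************************************************************
- * Note on the XSFADD_{R,I}{1,2} selection above (informal summary, not part of the
- * build logic): the SAVE macros below accumulate the four partial products ar*br,
- * ai*br, ar*bi and ai*bi for every C element, and the add/sub choices made here
- * combine them, before the alpha scaling, roughly as:
- *   NN/NT/TN/TT:  real = ar*br - ai*bi   imag = ai*br + ar*bi
- *   CN/CT/RN/RT:  real = ar*br + ai*bi   imag = ar*bi - ai*br    (A value conjugated)
- *   NC/TC/NR/TR:  real = ar*br + ai*bi   imag = ai*br - ar*bi    (B value conjugated)
- *   CC/CR/RC/RR:  real = ar*br - ai*bi   imag = -(ai*br + ar*bi) (both conjugated)
- **************************************************************************************/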
-
- /**********************************************************************************************
- * Macros for N=2 and M=8
- **********************************************************************************************/
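- 
- /**********************************************************************************************
- * Layout of each macro group (informal sketch; the register numbers are the ones used by
- * the N=2, M=8 macros below, the smaller M sizes follow the same pattern with fewer regs):
- *   LOAD2x8_1         preload A (vs0-vs7) and B (vs16-vs19) for the first iteration
- *   KERNEL2x8_I1      load the alternate set (vs8-vs15, vs20-vs23) and start the
- *                     accumulators vs32-vs63 with xvmuldp
- *   KERNEL2x8_1/_2    steady-state pair: each step computes on the set loaded by the
- *                     other, so the loads overlap with the xvmaddadp chains
- *   KERNEL2x8_E2      drain step: accumulate the last prefetched set, no further loads
- *   KERNEL2x8_SUBI1, KERNEL2x8_SUB1   single-step variants used for the K remainder
- *   SAVE2x8           reduce the accumulators, apply alpha and write back to C
- **********************************************************************************************/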
-
- .macro LOAD2x8_1
-
- lxvd2x vs16, o0, BO // load real part from B
- lxvd2x vs17, o16, BO // load imag part from B
- lxvd2x vs18, o32, BO // load real part from B
- lxvd2x vs19, o48, BO // load imag part from B
-
- addi BO, BO, 64
-
- lxvd2x vs0, o0, AO // load real,imag from A
- lxvd2x vs1, o16, AO // load real,imag from A
- lxvd2x vs2, o32, AO // load real,imag from A
- lxvd2x vs3, o48, AO // load real,imag from A
-
- addi AO, AO, 64
-
- lxvd2x vs4, o0, AO // load real,imag from A
- lxvd2x vs5, o16, AO // load real,imag from A
- lxvd2x vs6, o32, AO // load real,imag from A
- lxvd2x vs7, o48, AO // load real,imag from A
-
- addi AO, AO, 64
-
-
- .endm
-
- .macro KERNEL2x8_I1
-
- lxvd2x vs8, o0, AO // load real,imag from A
- lxvd2x vs9, o16, AO // load real,imag from A
- lxvd2x vs10, o32, AO // load real,imag from A
- lxvd2x vs11, o48, AO // load real,imag from A
-
- addi AO, AO, 64
-
- lxvd2x vs12, o0, AO // load real,imag from A
- lxvd2x vs13, o16, AO // load real,imag from A
- lxvd2x vs14, o32, AO // load real,imag from A
- lxvd2x vs15, o48, AO // load real,imag from A
-
- addi AO, AO, 64
-
- lxvd2x vs20, o0, BO // load real part from B
- lxvd2x vs21, o16, BO // load imag part from B
- lxvd2x vs22, o32, BO // load real part from B
- lxvd2x vs23, o48, BO // load imag part from B
-
- addi BO, BO, 64
-
- xvmuldp vs32, vs0, vs16 // real*real, imag*real
- xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
- xvmuldp vs34, vs1, vs16 // real*real, imag*real
- xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
- xvmuldp vs36, vs2, vs16 // real*real, imag*real
- xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
- xvmuldp vs38, vs3, vs16 // real*real, imag*real
- xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
- xvmuldp vs40, vs4, vs16 // real*real, imag*real
- xvmuldp vs41, vs4, vs17 // real*imag, imag*imag
- xvmuldp vs42, vs5, vs16 // real*real, imag*real
- xvmuldp vs43, vs5, vs17 // real*imag, imag*imag
- xvmuldp vs44, vs6, vs16 // real*real, imag*real
- xvmuldp vs45, vs6, vs17 // real*imag, imag*imag
- xvmuldp vs46, vs7, vs16 // real*real, imag*real
- xvmuldp vs47, vs7, vs17 // real*imag, imag*imag
-
- xvmuldp vs48, vs0, vs18 // real*real, imag*real
- xvmuldp vs49, vs0, vs19 // real*imag, imag*imag
- xvmuldp vs50, vs1, vs18 // real*real, imag*real
- xvmuldp vs51, vs1, vs19 // real*imag, imag*imag
- xvmuldp vs52, vs2, vs18 // real*real, imag*real
- xvmuldp vs53, vs2, vs19 // real*imag, imag*imag
- xvmuldp vs54, vs3, vs18 // real*real, imag*real
- xvmuldp vs55, vs3, vs19 // real*imag, imag*imag
- xvmuldp vs56, vs4, vs18 // real*real, imag*real
- xvmuldp vs57, vs4, vs19 // real*imag, imag*imag
- xvmuldp vs58, vs5, vs18 // real*real, imag*real
- xvmuldp vs59, vs5, vs19 // real*imag, imag*imag
- xvmuldp vs60, vs6, vs18 // real*real, imag*real
- xvmuldp vs61, vs6, vs19 // real*imag, imag*imag
- xvmuldp vs62, vs7, vs18 // real*real, imag*real
- xvmuldp vs63, vs7, vs19 // real*imag, imag*imag
-
-
- .endm
-
- .macro KERNEL2x8_1
-
- lxvd2x vs8, o0, AO // load real,imag from A
- lxvd2x vs9, o16, AO // load real,imag from A
- lxvd2x vs10, o32, AO // load real,imag from A
- lxvd2x vs11, o48, AO // load real,imag from A
-
- addi AO, AO, 64
-
- lxvd2x vs12, o0, AO // load real,imag from A
- lxvd2x vs13, o16, AO // load real,imag from A
- lxvd2x vs14, o32, AO // load real,imag from A
- lxvd2x vs15, o48, AO // load real,imag from A
-
- addi AO, AO, 64
-
- lxvd2x vs20, o0, BO // load real part from B
- lxvd2x vs21, o16, BO // load imag part from B
- lxvd2x vs22, o32, BO // load real part from B
- lxvd2x vs23, o48, BO // load imag part from B
-
- addi BO, BO, 64
-
- xvmaddadp vs32, vs0, vs16 // real*real, imag*real
- xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
- xvmaddadp vs34, vs1, vs16 // real*real, imag*real
- xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
- xvmaddadp vs36, vs2, vs16 // real*real, imag*real
- xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
- xvmaddadp vs38, vs3, vs16 // real*real, imag*real
- xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
- xvmaddadp vs40, vs4, vs16 // real*real, imag*real
- xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag
- xvmaddadp vs42, vs5, vs16 // real*real, imag*real
- xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag
- xvmaddadp vs44, vs6, vs16 // real*real, imag*real
- xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag
- xvmaddadp vs46, vs7, vs16 // real*real, imag*real
- xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag
-
- xvmaddadp vs48, vs0, vs18 // real*real, imag*real
- xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag
- xvmaddadp vs50, vs1, vs18 // real*real, imag*real
- xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag
- xvmaddadp vs52, vs2, vs18 // real*real, imag*real
- xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag
- xvmaddadp vs54, vs3, vs18 // real*real, imag*real
- xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag
- xvmaddadp vs56, vs4, vs18 // real*real, imag*real
- xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag
- xvmaddadp vs58, vs5, vs18 // real*real, imag*real
- xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag
- xvmaddadp vs60, vs6, vs18 // real*real, imag*real
- xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag
- xvmaddadp vs62, vs7, vs18 // real*real, imag*real
- xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag
-
-
- .endm
-
- .macro KERNEL2x8_2
-
- lxvd2x vs0, o0, AO // load real,imag from A
- lxvd2x vs1, o16, AO // load real,imag from A
- lxvd2x vs2, o32, AO // load real,imag from A
- lxvd2x vs3, o48, AO // load real,imag from A
-
- addi AO, AO, 64
-
- lxvd2x vs4, o0, AO // load real,imag from A
- lxvd2x vs5, o16, AO // load real,imag from A
- lxvd2x vs6, o32, AO // load real,imag from A
- lxvd2x vs7, o48, AO // load real,imag from A
-
- addi AO, AO, 64
-
- lxvd2x vs16, o0, BO // load real part from B
- lxvd2x vs17, o16, BO // load imag part from B
- lxvd2x vs18, o32, BO // load real part from B
- lxvd2x vs19, o48, BO // load imag part from B
-
- addi BO, BO, 64
-
- xvmaddadp vs32, vs8, vs20 // real*real, imag*real
- xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
- xvmaddadp vs34, vs9, vs20 // real*real, imag*real
- xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
- xvmaddadp vs36, vs10, vs20 // real*real, imag*real
- xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
- xvmaddadp vs38, vs11, vs20 // real*real, imag*real
- xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
- xvmaddadp vs40, vs12, vs20 // real*real, imag*real
- xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag
- xvmaddadp vs42, vs13, vs20 // real*real, imag*real
- xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag
- xvmaddadp vs44, vs14, vs20 // real*real, imag*real
- xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag
- xvmaddadp vs46, vs15, vs20 // real*real, imag*real
- xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag
-
- xvmaddadp vs48, vs8, vs22 // real*real, imag*real
- xvmaddadp vs49, vs8, vs23 // real*imag, imag*imag
- xvmaddadp vs50, vs9, vs22 // real*real, imag*real
- xvmaddadp vs51, vs9, vs23 // real*imag, imag*imag
- xvmaddadp vs52, vs10, vs22 // real*real, imag*real
- xvmaddadp vs53, vs10, vs23 // real*imag, imag*imag
- xvmaddadp vs54, vs11, vs22 // real*real, imag*real
- xvmaddadp vs55, vs11, vs23 // real*imag, imag*imag
- xvmaddadp vs56, vs12, vs22 // real*real, imag*real
- xvmaddadp vs57, vs12, vs23 // real*imag, imag*imag
- xvmaddadp vs58, vs13, vs22 // real*real, imag*real
- xvmaddadp vs59, vs13, vs23 // real*imag, imag*imag
- xvmaddadp vs60, vs14, vs22 // real*real, imag*real
- xvmaddadp vs61, vs14, vs23 // real*imag, imag*imag
- xvmaddadp vs62, vs15, vs22 // real*real, imag*real
- xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag
-
-
- .endm
-
- .macro KERNEL2x8_E2
-
-
- xvmaddadp vs32, vs8, vs20 // real*real, imag*real
- xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
- xvmaddadp vs34, vs9, vs20 // real*real, imag*real
- xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
- xvmaddadp vs36, vs10, vs20 // real*real, imag*real
- xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
- xvmaddadp vs38, vs11, vs20 // real*real, imag*real
- xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
- xvmaddadp vs40, vs12, vs20 // real*real, imag*real
- xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag
- xvmaddadp vs42, vs13, vs20 // real*real, imag*real
- xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag
- xvmaddadp vs44, vs14, vs20 // real*real, imag*real
- xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag
- xvmaddadp vs46, vs15, vs20 // real*real, imag*real
- xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag
-
- xvmaddadp vs48, vs8, vs22 // real*real, imag*real
- xvmaddadp vs49, vs8, vs23 // real*imag, imag*imag
- xvmaddadp vs50, vs9, vs22 // real*real, imag*real
- xvmaddadp vs51, vs9, vs23 // real*imag, imag*imag
- xvmaddadp vs52, vs10, vs22 // real*real, imag*real
- xvmaddadp vs53, vs10, vs23 // real*imag, imag*imag
- xvmaddadp vs54, vs11, vs22 // real*real, imag*real
- xvmaddadp vs55, vs11, vs23 // real*imag, imag*imag
- xvmaddadp vs56, vs12, vs22 // real*real, imag*real
- xvmaddadp vs57, vs12, vs23 // real*imag, imag*imag
- xvmaddadp vs58, vs13, vs22 // real*real, imag*real
- xvmaddadp vs59, vs13, vs23 // real*imag, imag*imag
- xvmaddadp vs60, vs14, vs22 // real*real, imag*real
- xvmaddadp vs61, vs14, vs23 // real*imag, imag*imag
- xvmaddadp vs62, vs15, vs22 // real*real, imag*real
- xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag
-
-
- .endm
-
- .macro KERNEL2x8_SUBI1
-
- lxvd2x vs0, o0, AO // load real,imag from A
- lxvd2x vs1, o16, AO // load real,imag from A
- lxvd2x vs2, o32, AO // load real,imag from A
- lxvd2x vs3, o48, AO // load real,imag from A
-
- addi AO, AO, 64
-
- lxvd2x vs4, o0, AO // load real,imag from A
- lxvd2x vs5, o16, AO // load real,imag from A
- lxvd2x vs6, o32, AO // load real,imag from A
- lxvd2x vs7, o48, AO // load real,imag from A
-
- addi AO, AO, 64
-
- lxvd2x vs16, o0, BO // load real part from B
- lxvd2x vs17, o16, BO // load imag part from B
- lxvd2x vs18, o32, BO // load real part from B
- lxvd2x vs19, o48, BO // load imag part from B
-
- addi BO, BO, 64
-
- xvmuldp vs32, vs0, vs16 // real*real, imag*real
- xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
- xvmuldp vs34, vs1, vs16 // real*real, imag*real
- xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
- xvmuldp vs36, vs2, vs16 // real*real, imag*real
- xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
- xvmuldp vs38, vs3, vs16 // real*real, imag*real
- xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
- xvmuldp vs40, vs4, vs16 // real*real, imag*real
- xvmuldp vs41, vs4, vs17 // real*imag, imag*imag
- xvmuldp vs42, vs5, vs16 // real*real, imag*real
- xvmuldp vs43, vs5, vs17 // real*imag, imag*imag
- xvmuldp vs44, vs6, vs16 // real*real, imag*real
- xvmuldp vs45, vs6, vs17 // real*imag, imag*imag
- xvmuldp vs46, vs7, vs16 // real*real, imag*real
- xvmuldp vs47, vs7, vs17 // real*imag, imag*imag
-
- xvmuldp vs48, vs0, vs18 // real*real, imag*real
- xvmuldp vs49, vs0, vs19 // real*imag, imag*imag
- xvmuldp vs50, vs1, vs18 // real*real, imag*real
- xvmuldp vs51, vs1, vs19 // real*imag, imag*imag
- xvmuldp vs52, vs2, vs18 // real*real, imag*real
- xvmuldp vs53, vs2, vs19 // real*imag, imag*imag
- xvmuldp vs54, vs3, vs18 // real*real, imag*real
- xvmuldp vs55, vs3, vs19 // real*imag, imag*imag
- xvmuldp vs56, vs4, vs18 // real*real, imag*real
- xvmuldp vs57, vs4, vs19 // real*imag, imag*imag
- xvmuldp vs58, vs5, vs18 // real*real, imag*real
- xvmuldp vs59, vs5, vs19 // real*imag, imag*imag
- xvmuldp vs60, vs6, vs18 // real*real, imag*real
- xvmuldp vs61, vs6, vs19 // real*imag, imag*imag
- xvmuldp vs62, vs7, vs18 // real*real, imag*real
- xvmuldp vs63, vs7, vs19 // real*imag, imag*imag
-
-
- .endm
-
- .macro KERNEL2x8_SUB1
-
- lxvd2x vs0, o0, AO // load real,imag from A
- lxvd2x vs1, o16, AO // load real,imag from A
- lxvd2x vs2, o32, AO // load real,imag from A
- lxvd2x vs3, o48, AO // load real,imag from A
-
- addi AO, AO, 64
-
- lxvd2x vs4, o0, AO // load real,imag from A
- lxvd2x vs5, o16, AO // load real,imag from A
- lxvd2x vs6, o32, AO // load real,imag from A
- lxvd2x vs7, o48, AO // load real,imag from A
-
- addi AO, AO, 64
-
- lxvd2x vs16, o0, BO // load real part from B
- lxvd2x vs17, o16, BO // load imag part from B
- lxvd2x vs18, o32, BO // load real part from B
- lxvd2x vs19, o48, BO // load imag part from B
-
- addi BO, BO, 64
-
- xvmaddadp vs32, vs0, vs16 // real*real, imag*real
- xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
- xvmaddadp vs34, vs1, vs16 // real*real, imag*real
- xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
- xvmaddadp vs36, vs2, vs16 // real*real, imag*real
- xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
- xvmaddadp vs38, vs3, vs16 // real*real, imag*real
- xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
- xvmaddadp vs40, vs4, vs16 // real*real, imag*real
- xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag
- xvmaddadp vs42, vs5, vs16 // real*real, imag*real
- xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag
- xvmaddadp vs44, vs6, vs16 // real*real, imag*real
- xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag
- xvmaddadp vs46, vs7, vs16 // real*real, imag*real
- xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag
-
- xvmaddadp vs48, vs0, vs18 // real*real, imag*real
- xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag
- xvmaddadp vs50, vs1, vs18 // real*real, imag*real
- xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag
- xvmaddadp vs52, vs2, vs18 // real*real, imag*real
- xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag
- xvmaddadp vs54, vs3, vs18 // real*real, imag*real
- xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag
- xvmaddadp vs56, vs4, vs18 // real*real, imag*real
- xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag
- xvmaddadp vs58, vs5, vs18 // real*real, imag*real
- xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag
- xvmaddadp vs60, vs6, vs18 // real*real, imag*real
- xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag
- xvmaddadp vs62, vs7, vs18 // real*real, imag*real
- xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag
-
-
- .endm
-
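- /**********************************************************************************************
- * SAVE2x8 reduction (informal note): for each C element the accumulator pair, e.g.
- * vs32 = [ar*br, ai*br] and vs33 = [ar*bi, ai*bi], is collapsed with xxswapd plus the
- * XSFADD_* macros into a scalar real part (vs0) and imaginary part (vs1).  These are
- * then scaled by alpha,
- *   new_real = real*alpha_r - imag*alpha_i
- *   new_imag = real*alpha_i + imag*alpha_r
- * merged back into one vector with xxpermdi and, unless TRMMKERNEL is defined, added
- * to the existing C values before being stored.
- **********************************************************************************************/
- 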
- .macro SAVE2x8
-
-
- mr T1, CO
- addi T2, T1, 64
-
- #ifndef TRMMKERNEL
-
- lxvd2x vs16, o0, T1
- lxvd2x vs17, o16, T1
- lxvd2x vs18, o32, T1
- lxvd2x vs19, o48, T1
- lxvd2x vs20, o0, T2
- lxvd2x vs21, o16, T2
- lxvd2x vs22, o32, T2
- lxvd2x vs23, o48, T2
-
- #endif
-
-
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
-
- XSFADD_R1 vs0, vs0, vs32 // realA*realB
- XSFADD_R2 vs0, vs0, vs33 // imagA*imagB
-
- xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB
- xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
-
- XSFADD_I1 vs1, vs1, vs32 // imagA*realB
- XSFADD_I2 vs1, vs1, vs33 // realA*imagB
-
- xsmuldp vs4, vs0, alpha_r // real*alpha_r
- xsmuldp vs5, vs1, alpha_i // imag*alpha_i
- xsmuldp vs6, vs0, alpha_i // real*alpha_i
- xsmuldp vs7, vs1, alpha_r // imag*alpha_r
-
- xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
- xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
- xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
-
-
-
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
-
- XSFADD_R1 vs0, vs0, vs34 // realA*realB
- XSFADD_R2 vs0, vs0, vs35 // imagA*imagB
-
- xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB
- xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
-
- XSFADD_I1 vs1, vs1, vs34 // imagA*realB
- XSFADD_I2 vs1, vs1, vs35 // realA*imagB
-
- xsmuldp vs4, vs0, alpha_r // real*alpha_r
- xsmuldp vs5, vs1, alpha_i // imag*alpha_i
- xsmuldp vs6, vs0, alpha_i // real*alpha_i
- xsmuldp vs7, vs1, alpha_r // imag*alpha_r
-
- xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
- xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
- xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
-
-
-
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
-
- XSFADD_R1 vs0, vs0, vs36 // realA*realB
- XSFADD_R2 vs0, vs0, vs37 // imagA*imagB
-
- xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB
- xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
-
- XSFADD_I1 vs1, vs1, vs36 // imagA*realB
- XSFADD_I2 vs1, vs1, vs37 // realA*imagB
-
- xsmuldp vs4, vs0, alpha_r // real*alpha_r
- xsmuldp vs5, vs1, alpha_i // imag*alpha_i
- xsmuldp vs6, vs0, alpha_i // real*alpha_i
- xsmuldp vs7, vs1, alpha_r // imag*alpha_r
-
- xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
- xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
- xxpermdi vs10, vs2, vs3, 0 // merge real and imag part
-
-
-
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
-
- XSFADD_R1 vs0, vs0, vs38 // realA*realB
- XSFADD_R2 vs0, vs0, vs39 // imagA*imagB
-
- xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB
- xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
-
- XSFADD_I1 vs1, vs1, vs38 // imagA*realB
- XSFADD_I2 vs1, vs1, vs39 // realA*imagB
-
- xsmuldp vs4, vs0, alpha_r // real*alpha_r
- xsmuldp vs5, vs1, alpha_i // imag*alpha_i
- xsmuldp vs6, vs0, alpha_i // real*alpha_i
- xsmuldp vs7, vs1, alpha_r // imag*alpha_r
-
- xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
- xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
- xxpermdi vs11, vs2, vs3, 0 // merge real and imag part
-
-
-
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
-
- XSFADD_R1 vs0, vs0, vs40 // realA*realB
- XSFADD_R2 vs0, vs0, vs41 // imagA*imagB
-
- xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB
- xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
-
- XSFADD_I1 vs1, vs1, vs40 // imagA*realB
- XSFADD_I2 vs1, vs1, vs41 // realA*imagB
-
- xsmuldp vs4, vs0, alpha_r // real*alpha_r
- xsmuldp vs5, vs1, alpha_i // imag*alpha_i
- xsmuldp vs6, vs0, alpha_i // real*alpha_i
- xsmuldp vs7, vs1, alpha_r // imag*alpha_r
-
- xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
- xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
- xxpermdi vs12, vs2, vs3, 0 // merge real and imag part
-
-
-
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
-
- XSFADD_R1 vs0, vs0, vs42 // realA*realB
- XSFADD_R2 vs0, vs0, vs43 // imagA*imagB
-
- xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB
- xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
-
- XSFADD_I1 vs1, vs1, vs42 // imagA*realB
- XSFADD_I2 vs1, vs1, vs43 // realA*imagB
-
- xsmuldp vs4, vs0, alpha_r // real*alpha_r
- xsmuldp vs5, vs1, alpha_i // imag*alpha_i
- xsmuldp vs6, vs0, alpha_i // real*alpha_i
- xsmuldp vs7, vs1, alpha_r // imag*alpha_r
-
- xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
- xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
- xxpermdi vs13, vs2, vs3, 0 // merge real and imag part
-
-
-
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
-
- XSFADD_R1 vs0, vs0, vs44 // realA*realB
- XSFADD_R2 vs0, vs0, vs45 // imagA*imagB
-
- xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB
- xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
-
- XSFADD_I1 vs1, vs1, vs44 // imagA*realB
- XSFADD_I2 vs1, vs1, vs45 // realA*imagB
-
- xsmuldp vs4, vs0, alpha_r // real*alpha_r
- xsmuldp vs5, vs1, alpha_i // imag*alpha_i
- xsmuldp vs6, vs0, alpha_i // real*alpha_i
- xsmuldp vs7, vs1, alpha_r // imag*alpha_r
-
- xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
- xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
- xxpermdi vs14, vs2, vs3, 0 // merge real and imag part
-
-
-
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
-
- XSFADD_R1 vs0, vs0, vs46 // realA*realB
- XSFADD_R2 vs0, vs0, vs47 // imagA*imagB
-
- xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB
- xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
-
- XSFADD_I1 vs1, vs1, vs46 // imagA*realB
- XSFADD_I2 vs1, vs1, vs47 // realA*imagB
-
- xsmuldp vs4, vs0, alpha_r // real*alpha_r
- xsmuldp vs5, vs1, alpha_i // imag*alpha_i
- xsmuldp vs6, vs0, alpha_i // real*alpha_i
- xsmuldp vs7, vs1, alpha_r // imag*alpha_r
-
- xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
- xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
- xxpermdi vs15, vs2, vs3, 0 // merge real and imag part
-
-
- #ifndef TRMMKERNEL
-
- xvadddp vs8, vs8, vs16
- xvadddp vs9, vs9, vs17
- xvadddp vs10, vs10, vs18
- xvadddp vs11, vs11, vs19
- xvadddp vs12, vs12, vs20
- xvadddp vs13, vs13, vs21
- xvadddp vs14, vs14, vs22
- xvadddp vs15, vs15, vs23
-
- #endif
-
- stxvd2x vs8, o0, T1
- stxvd2x vs9, o16, T1
- stxvd2x vs10, o32, T1
- stxvd2x vs11, o48, T1
- stxvd2x vs12, o0, T2
- stxvd2x vs13, o16, T2
- stxvd2x vs14, o32, T2
- stxvd2x vs15, o48, T2
-
- add T1, T1, LDC
- add T2, T2, LDC
-
- #ifndef TRMMKERNEL
-
- lxvd2x vs16, o0, T1
- lxvd2x vs17, o16, T1
- lxvd2x vs18, o32, T1
- lxvd2x vs19, o48, T1
- lxvd2x vs20, o0, T2
- lxvd2x vs21, o16, T2
- lxvd2x vs22, o32, T2
- lxvd2x vs23, o48, T2
-
- #endif
-
-
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- xxswapd vs49, vs49 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
-
- XSFADD_R1 vs0, vs0, vs48 // realA*realB
- XSFADD_R2 vs0, vs0, vs49 // imagA*imagB
-
- xxswapd vs48, vs48 // realA*realB, imagA*realB -> imagA*realB, realA*realB
- xxswapd vs49, vs49 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
-
- XSFADD_I1 vs1, vs1, vs48 // imagA*realB
- XSFADD_I2 vs1, vs1, vs49 // realA*imagB
-
- xsmuldp vs4, vs0, alpha_r // real*alpha_r
- xsmuldp vs5, vs1, alpha_i // imag*alpha_i
- xsmuldp vs6, vs0, alpha_i // real*alpha_i
- xsmuldp vs7, vs1, alpha_r // imag*alpha_r
-
- xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
- xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
- xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
-
-
-
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- xxswapd vs51, vs51 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
-
- XSFADD_R1 vs0, vs0, vs50 // realA*realB
- XSFADD_R2 vs0, vs0, vs51 // imagA*imagB
-
- xxswapd vs50, vs50 // realA*realB, imagA*realB -> imagA*realB, realA*realB
- xxswapd vs51, vs51 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
-
- XSFADD_I1 vs1, vs1, vs50 // imagA*realB
- XSFADD_I2 vs1, vs1, vs51 // realA*imagB
-
- xsmuldp vs4, vs0, alpha_r // real*alpha_r
- xsmuldp vs5, vs1, alpha_i // imag*alpha_i
- xsmuldp vs6, vs0, alpha_i // real*alpha_i
- xsmuldp vs7, vs1, alpha_r // imag*alpha_r
-
- xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
- xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
- xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
-
-
-
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- xxswapd vs53, vs53 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
-
- XSFADD_R1 vs0, vs0, vs52 // realA*realB
- XSFADD_R2 vs0, vs0, vs53 // imagA*imagB
-
- xxswapd vs52, vs52 // realA*realB, imagA*realB -> imagA*realB, realA*realB
- xxswapd vs53, vs53 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
-
- XSFADD_I1 vs1, vs1, vs52 // imagA*realB
- XSFADD_I2 vs1, vs1, vs53 // realA*imagB
-
- xsmuldp vs4, vs0, alpha_r // real*alpha_r
- xsmuldp vs5, vs1, alpha_i // imag*alpha_i
- xsmuldp vs6, vs0, alpha_i // real*alpha_i
- xsmuldp vs7, vs1, alpha_r // imag*alpha_r
-
- xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
- xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
- xxpermdi vs10, vs2, vs3, 0 // merge real and imag part
-
-
-
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- xxswapd vs55, vs55 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
-
- XSFADD_R1 vs0, vs0, vs54 // realA*realB
- XSFADD_R2 vs0, vs0, vs55 // imagA*imagB
-
- xxswapd vs54, vs54 // realA*realB, imagA*realB -> imagA*realB, realA*realB
- xxswapd vs55, vs55 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
-
- XSFADD_I1 vs1, vs1, vs54 // imagA*realB
- XSFADD_I2 vs1, vs1, vs55 // realA*imagB
-
- xsmuldp vs4, vs0, alpha_r // real*alpha_r
- xsmuldp vs5, vs1, alpha_i // imag*alpha_i
- xsmuldp vs6, vs0, alpha_i // real*alpha_i
- xsmuldp vs7, vs1, alpha_r // imag*alpha_r
-
- xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
- xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
- xxpermdi vs11, vs2, vs3, 0 // merge real and imag part
-
-
-
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- xxswapd vs57, vs57 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
-
- XSFADD_R1 vs0, vs0, vs56 // realA*realB
- XSFADD_R2 vs0, vs0, vs57 // imagA*imagB
-
- xxswapd vs56, vs56 // realA*realB, imagA*realB -> imagA*realB, realA*realB
- xxswapd vs57, vs57 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
-
- XSFADD_I1 vs1, vs1, vs56 // imagA*realB
- XSFADD_I2 vs1, vs1, vs57 // realA*imagB
-
- xsmuldp vs4, vs0, alpha_r // real*alpha_r
- xsmuldp vs5, vs1, alpha_i // imag*alpha_i
- xsmuldp vs6, vs0, alpha_i // real*alpha_i
- xsmuldp vs7, vs1, alpha_r // imag*alpha_r
-
- xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
- xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
- xxpermdi vs12, vs2, vs3, 0 // merge real and imag part
-
-
-
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- xxswapd vs59, vs59 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
-
- XSFADD_R1 vs0, vs0, vs58 // realA*realB
- XSFADD_R2 vs0, vs0, vs59 // imagA*imagB
-
- xxswapd vs58, vs58 // realA*realB, imagA*realB -> imagA*realB, realA*realB
- xxswapd vs59, vs59 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
-
- XSFADD_I1 vs1, vs1, vs58 // imagA*realB
- XSFADD_I2 vs1, vs1, vs59 // realA*imagB
-
- xsmuldp vs4, vs0, alpha_r // real*alpha_r
- xsmuldp vs5, vs1, alpha_i // imag*alpha_i
- xsmuldp vs6, vs0, alpha_i // real*alpha_i
- xsmuldp vs7, vs1, alpha_r // imag*alpha_r
-
- xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
- xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
- xxpermdi vs13, vs2, vs3, 0 // merge real and imag part
-
-
-
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- xxswapd vs61, vs61 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
-
- XSFADD_R1 vs0, vs0, vs60 // realA*realB
- XSFADD_R2 vs0, vs0, vs61 // imagA*imagB
-
- xxswapd vs60, vs60 // realA*realB, imagA*realB -> imagA*realB, realA*realB
- xxswapd vs61, vs61 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
-
- XSFADD_I1 vs1, vs1, vs60 // imagA*realB
- XSFADD_I2 vs1, vs1, vs61 // realA*imagB
-
- xsmuldp vs4, vs0, alpha_r // real*alpha_r
- xsmuldp vs5, vs1, alpha_i // imag*alpha_i
- xsmuldp vs6, vs0, alpha_i // real*alpha_i
- xsmuldp vs7, vs1, alpha_r // imag*alpha_r
-
- xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
- xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
- xxpermdi vs14, vs2, vs3, 0 // merge real and imag part
-
-
-
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- xxswapd vs63, vs63 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
-
- XSFADD_R1 vs0, vs0, vs62 // realA*realB
- XSFADD_R2 vs0, vs0, vs63 // imagA*imagB
-
- xxswapd vs62, vs62 // realA*realB, imagA*realB -> imagA*realB, realA*realB
- xxswapd vs63, vs63 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
-
- XSFADD_I1 vs1, vs1, vs62 // imagA*realB
- XSFADD_I2 vs1, vs1, vs63 // realA*imagB
-
- xsmuldp vs4, vs0, alpha_r // real*alpha_r
- xsmuldp vs5, vs1, alpha_i // imag*alpha_i
- xsmuldp vs6, vs0, alpha_i // real*alpha_i
- xsmuldp vs7, vs1, alpha_r // imag*alpha_r
-
- xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
- xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
- xxpermdi vs15, vs2, vs3, 0 // merge real and imag part
-
-
- #ifndef TRMMKERNEL
-
- xvadddp vs8, vs8, vs16
- xvadddp vs9, vs9, vs17
- xvadddp vs10, vs10, vs18
- xvadddp vs11, vs11, vs19
- xvadddp vs12, vs12, vs20
- xvadddp vs13, vs13, vs21
- xvadddp vs14, vs14, vs22
- xvadddp vs15, vs15, vs23
-
- #endif
-
- stxvd2x vs8, o0, T1
- stxvd2x vs9, o16, T1
- stxvd2x vs10, o32, T1
- stxvd2x vs11, o48, T1
- stxvd2x vs12, o0, T2
- stxvd2x vs13, o16, T2
- stxvd2x vs14, o32, T2
- stxvd2x vs15, o48, T2
-
- add T1, T1, LDC
- add T2, T2, LDC
- addi CO, CO, 128
-
- .endm
-
-
- /**********************************************************************************************
- * Macros for N=2 and M=4
- **********************************************************************************************/
-
- .macro LOAD2x4_1
-
- lxvd2x vs16, o0, BO // load real part from B
- lxvd2x vs17, o16, BO // load imag part from B
- lxvd2x vs18, o32, BO // load real part from B
- lxvd2x vs19, o48, BO // load imag part from B
-
- addi BO, BO, 64
-
- lxvd2x vs0, o0, AO // load real,imag from A
- lxvd2x vs1, o16, AO // load real,imag from A
- lxvd2x vs2, o32, AO // load real,imag from A
- lxvd2x vs3, o48, AO // load real,imag from A
-
- addi AO, AO, 64
-
-
- .endm
-
- .macro KERNEL2x4_I1
-
- lxvd2x vs8, o0, AO // load real,imag from A
- lxvd2x vs9, o16, AO // load real,imag from A
- lxvd2x vs10, o32, AO // load real,imag from A
- lxvd2x vs11, o48, AO // load real,imag from A
-
- addi AO, AO, 64
-
- lxvd2x vs20, o0, BO // load real part from B
- lxvd2x vs21, o16, BO // load imag part from B
- lxvd2x vs22, o32, BO // load real part from B
- lxvd2x vs23, o48, BO // load imag part from B
-
- addi BO, BO, 64
-
- xvmuldp vs32, vs0, vs16 // real*real, imag*real
- xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
- xvmuldp vs34, vs1, vs16 // real*real, imag*real
- xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
- xvmuldp vs36, vs2, vs16 // real*real, imag*real
- xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
- xvmuldp vs38, vs3, vs16 // real*real, imag*real
- xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
-
- xvmuldp vs40, vs0, vs18 // real*real, imag*real
- xvmuldp vs41, vs0, vs19 // real*imag, imag*imag
- xvmuldp vs42, vs1, vs18 // real*real, imag*real
- xvmuldp vs43, vs1, vs19 // real*imag, imag*imag
- xvmuldp vs44, vs2, vs18 // real*real, imag*real
- xvmuldp vs45, vs2, vs19 // real*imag, imag*imag
- xvmuldp vs46, vs3, vs18 // real*real, imag*real
- xvmuldp vs47, vs3, vs19 // real*imag, imag*imag
-
-
- .endm
-
- .macro KERNEL2x4_1
-
- lxvd2x vs8, o0, AO // load real,imag from A
- lxvd2x vs9, o16, AO // load real,imag from A
- lxvd2x vs10, o32, AO // load real,imag from A
- lxvd2x vs11, o48, AO // load real,imag from A
-
- addi AO, AO, 64
-
- lxvd2x vs20, o0, BO // load real part from B
- lxvd2x vs21, o16, BO // load imag part from B
- lxvd2x vs22, o32, BO // load real part from B
- lxvd2x vs23, o48, BO // load imag part from B
-
- addi BO, BO, 64
-
- xvmaddadp vs32, vs0, vs16 // real*real, imag*real
- xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
- xvmaddadp vs34, vs1, vs16 // real*real, imag*real
- xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
- xvmaddadp vs36, vs2, vs16 // real*real, imag*real
- xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
- xvmaddadp vs38, vs3, vs16 // real*real, imag*real
- xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
-
- xvmaddadp vs40, vs0, vs18 // real*real, imag*real
- xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag
- xvmaddadp vs42, vs1, vs18 // real*real, imag*real
- xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag
- xvmaddadp vs44, vs2, vs18 // real*real, imag*real
- xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag
- xvmaddadp vs46, vs3, vs18 // real*real, imag*real
- xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag
-
-
- .endm
-
- .macro KERNEL2x4_2
-
- lxvd2x vs0, o0, AO // load real,imag from A
- lxvd2x vs1, o16, AO // load real,imag from A
- lxvd2x vs2, o32, AO // load real,imag from A
- lxvd2x vs3, o48, AO // load real,imag from A
-
- addi AO, AO, 64
-
- lxvd2x vs16, o0, BO // load real part from B
- lxvd2x vs17, o16, BO // load imag part from B
- lxvd2x vs18, o32, BO // load real part from B
- lxvd2x vs19, o48, BO // load imag part from B
-
- addi BO, BO, 64
-
- xvmaddadp vs32, vs8, vs20 // real*real, imag*real
- xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
- xvmaddadp vs34, vs9, vs20 // real*real, imag*real
- xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
- xvmaddadp vs36, vs10, vs20 // real*real, imag*real
- xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
- xvmaddadp vs38, vs11, vs20 // real*real, imag*real
- xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
-
- xvmaddadp vs40, vs8, vs22 // real*real, imag*real
- xvmaddadp vs41, vs8, vs23 // real*imag, imag*imag
- xvmaddadp vs42, vs9, vs22 // real*real, imag*real
- xvmaddadp vs43, vs9, vs23 // real*imag, imag*imag
- xvmaddadp vs44, vs10, vs22 // real*real, imag*real
- xvmaddadp vs45, vs10, vs23 // real*imag, imag*imag
- xvmaddadp vs46, vs11, vs22 // real*real, imag*real
- xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag
-
-
- .endm
-
- .macro KERNEL2x4_E2
-
-
- xvmaddadp vs32, vs8, vs20 // real*real, imag*real
- xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
- xvmaddadp vs34, vs9, vs20 // real*real, imag*real
- xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
- xvmaddadp vs36, vs10, vs20 // real*real, imag*real
- xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
- xvmaddadp vs38, vs11, vs20 // real*real, imag*real
- xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
-
- xvmaddadp vs40, vs8, vs22 // real*real, imag*real
- xvmaddadp vs41, vs8, vs23 // real*imag, imag*imag
- xvmaddadp vs42, vs9, vs22 // real*real, imag*real
- xvmaddadp vs43, vs9, vs23 // real*imag, imag*imag
- xvmaddadp vs44, vs10, vs22 // real*real, imag*real
- xvmaddadp vs45, vs10, vs23 // real*imag, imag*imag
- xvmaddadp vs46, vs11, vs22 // real*real, imag*real
- xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag
-
-
- .endm
-
- .macro KERNEL2x4_SUBI1
-
- lxvd2x vs0, o0, AO // load real,imag from A
- lxvd2x vs1, o16, AO // load real,imag from A
- lxvd2x vs2, o32, AO // load real,imag from A
- lxvd2x vs3, o48, AO // load real,imag from A
-
- addi AO, AO, 64
-
- lxvd2x vs16, o0, BO // load real part from B
- lxvd2x vs17, o16, BO // load imag part from B
- lxvd2x vs18, o32, BO // load real part from B
- lxvd2x vs19, o48, BO // load imag part from B
-
- addi BO, BO, 64
-
- xvmuldp vs32, vs0, vs16 // real*real, imag*real
- xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
- xvmuldp vs34, vs1, vs16 // real*real, imag*real
- xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
- xvmuldp vs36, vs2, vs16 // real*real, imag*real
- xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
- xvmuldp vs38, vs3, vs16 // real*real, imag*real
- xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
-
- xvmuldp vs40, vs0, vs18 // real*real, imag*real
- xvmuldp vs41, vs0, vs19 // real*imag, imag*imag
- xvmuldp vs42, vs1, vs18 // real*real, imag*real
- xvmuldp vs43, vs1, vs19 // real*imag, imag*imag
- xvmuldp vs44, vs2, vs18 // real*real, imag*real
- xvmuldp vs45, vs2, vs19 // real*imag, imag*imag
- xvmuldp vs46, vs3, vs18 // real*real, imag*real
- xvmuldp vs47, vs3, vs19 // real*imag, imag*imag
-
-
- .endm
-
- .macro KERNEL2x4_SUB1
-
- lxvd2x vs0, o0, AO // load real,imag from A
- lxvd2x vs1, o16, AO // load real,imag from A
- lxvd2x vs2, o32, AO // load real,imag from A
- lxvd2x vs3, o48, AO // load real,imag from A
-
- addi AO, AO, 64
-
- lxvd2x vs16, o0, BO // load real part from B
- lxvd2x vs17, o16, BO // load imag part from B
- lxvd2x vs18, o32, BO // load real part from B
- lxvd2x vs19, o48, BO // load imag part from B
-
- addi BO, BO, 64
-
- xvmaddadp vs32, vs0, vs16 // real*real, imag*real
- xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
- xvmaddadp vs34, vs1, vs16 // real*real, imag*real
- xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
- xvmaddadp vs36, vs2, vs16 // real*real, imag*real
- xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
- xvmaddadp vs38, vs3, vs16 // real*real, imag*real
- xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
-
- xvmaddadp vs40, vs0, vs18 // real*real, imag*real
- xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag
- xvmaddadp vs42, vs1, vs18 // real*real, imag*real
- xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag
- xvmaddadp vs44, vs2, vs18 // real*real, imag*real
- xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag
- xvmaddadp vs46, vs3, vs18 // real*real, imag*real
- xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag
-
-
- .endm
-
- .macro SAVE2x4
-
-
- mr T1, CO
-
- #ifndef TRMMKERNEL
-
- lxvd2x vs16, o0, T1
- lxvd2x vs17, o16, T1
- lxvd2x vs18, o32, T1
- lxvd2x vs19, o48, T1
-
- #endif
-
-
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
-
- XSFADD_R1 vs0, vs0, vs32 // realA*realB
- XSFADD_R2 vs0, vs0, vs33 // imagA*imagB
-
- xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB
- xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
-
- XSFADD_I1 vs1, vs1, vs32 // imagA*realB
- XSFADD_I2 vs1, vs1, vs33 // realA*imagB
-
- xsmuldp vs4, vs0, alpha_r // real*alpha_r
- xsmuldp vs5, vs1, alpha_i // imag*alpha_i
- xsmuldp vs6, vs0, alpha_i // real*alpha_i
- xsmuldp vs7, vs1, alpha_r // imag*alpha_r
-
- xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
- xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
- xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
-
-
-
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
-
- XSFADD_R1 vs0, vs0, vs34 // realA*realB
- XSFADD_R2 vs0, vs0, vs35 // imagA*imagB
-
- xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB
- xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
-
- XSFADD_I1 vs1, vs1, vs34 // imagA*realB
- XSFADD_I2 vs1, vs1, vs35 // realA*imagB
-
- xsmuldp vs4, vs0, alpha_r // real*alpha_r
- xsmuldp vs5, vs1, alpha_i // imag*alpha_i
- xsmuldp vs6, vs0, alpha_i // real*alpha_i
- xsmuldp vs7, vs1, alpha_r // imag*alpha_r
-
- xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
- xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
- xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
-
-
-
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
-
- XSFADD_R1 vs0, vs0, vs36 // realA*realB
- XSFADD_R2 vs0, vs0, vs37 // imagA*imagB
-
- xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB
- xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
-
- XSFADD_I1 vs1, vs1, vs36 // imagA*realB
- XSFADD_I2 vs1, vs1, vs37 // realA*imagB
-
- xsmuldp vs4, vs0, alpha_r // real*alpha_r
- xsmuldp vs5, vs1, alpha_i // imag*alpha_i
- xsmuldp vs6, vs0, alpha_i // real*alpha_i
- xsmuldp vs7, vs1, alpha_r // imag*alpha_r
-
- xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
- xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
- xxpermdi vs10, vs2, vs3, 0 // merge real and imag part
-
-
-
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
-
- XSFADD_R1 vs0, vs0, vs38 // realA*realB
- XSFADD_R2 vs0, vs0, vs39 // imagA*imagB
-
- xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB
- xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
-
- XSFADD_I1 vs1, vs1, vs38 // imagA*realB
- XSFADD_I2 vs1, vs1, vs39 // realA*imagB
-
- xsmuldp vs4, vs0, alpha_r // real*alpha_r
- xsmuldp vs5, vs1, alpha_i // imag*alpha_i
- xsmuldp vs6, vs0, alpha_i // real*alpha_i
- xsmuldp vs7, vs1, alpha_r // imag*alpha_r
-
- xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
- xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
- xxpermdi vs11, vs2, vs3, 0 // merge real and imag part
-
-
- #ifndef TRMMKERNEL
-
- xvadddp vs8, vs8, vs16
- xvadddp vs9, vs9, vs17
- xvadddp vs10, vs10, vs18
- xvadddp vs11, vs11, vs19
-
- #endif
-
- stxvd2x vs8, o0, T1
- stxvd2x vs9, o16, T1
- stxvd2x vs10, o32, T1
- stxvd2x vs11, o48, T1
-
- add T1, T1, LDC
-
- #ifndef TRMMKERNEL
-
- lxvd2x vs16, o0, T1
- lxvd2x vs17, o16, T1
- lxvd2x vs18, o32, T1
- lxvd2x vs19, o48, T1
-
- #endif
-
-
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
-
- XSFADD_R1 vs0, vs0, vs40 // realA*realB
- XSFADD_R2 vs0, vs0, vs41 // imagA*imagB
-
- xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB
- xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
-
- XSFADD_I1 vs1, vs1, vs40 // imagA*realB
- XSFADD_I2 vs1, vs1, vs41 // realA*imagB
-
- xsmuldp vs4, vs0, alpha_r // real*alpha_r
- xsmuldp vs5, vs1, alpha_i // imag*alpha_i
- xsmuldp vs6, vs0, alpha_i // real*alpha_i
- xsmuldp vs7, vs1, alpha_r // imag*alpha_r
-
- xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
- xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
- xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
-
-
-
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
-
- XSFADD_R1 vs0, vs0, vs42 // realA*realB
- XSFADD_R2 vs0, vs0, vs43 // imagA*imagB
-
- xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB
- xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
-
- XSFADD_I1 vs1, vs1, vs42 // imagA*realB
- XSFADD_I2 vs1, vs1, vs43 // realA*imagB
-
- xsmuldp vs4, vs0, alpha_r // real*alpha_r
- xsmuldp vs5, vs1, alpha_i // imag*alpha_i
- xsmuldp vs6, vs0, alpha_i // real*alpha_i
- xsmuldp vs7, vs1, alpha_r // imag*alpha_r
-
- xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
- xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
- xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
-
-
-
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
-
- XSFADD_R1 vs0, vs0, vs44 // realA*realB
- XSFADD_R2 vs0, vs0, vs45 // imagA*imagB
-
- xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB
- xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
-
- XSFADD_I1 vs1, vs1, vs44 // imagA*realB
- XSFADD_I2 vs1, vs1, vs45 // realA*imagB
-
- xsmuldp vs4, vs0, alpha_r // real*alpha_r
- xsmuldp vs5, vs1, alpha_i // imag*alpha_i
- xsmuldp vs6, vs0, alpha_i // real*alpha_i
- xsmuldp vs7, vs1, alpha_r // imag*alpha_r
-
- xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
- xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
- xxpermdi vs10, vs2, vs3, 0 // merge real and imag part
-
-
-
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
-
- XSFADD_R1 vs0, vs0, vs46 // realA*realB
- XSFADD_R2 vs0, vs0, vs47 // imagA*imagB
-
- xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB
- xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
-
- XSFADD_I1 vs1, vs1, vs46 // imagA*realB
- XSFADD_I2 vs1, vs1, vs47 // realA*imagB
-
- xsmuldp vs4, vs0, alpha_r // real*alpha_r
- xsmuldp vs5, vs1, alpha_i // imag*alpha_i
- xsmuldp vs6, vs0, alpha_i // real*alpha_i
- xsmuldp vs7, vs1, alpha_r // imag*alpha_r
-
- xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
- xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
- xxpermdi vs11, vs2, vs3, 0 // merge real and imag part
-
-
- #ifndef TRMMKERNEL
-
- xvadddp vs8, vs8, vs16
- xvadddp vs9, vs9, vs17
- xvadddp vs10, vs10, vs18
- xvadddp vs11, vs11, vs19
-
- #endif
-
- stxvd2x vs8, o0, T1
- stxvd2x vs9, o16, T1
- stxvd2x vs10, o32, T1
- stxvd2x vs11, o48, T1
-
- add T1, T1, LDC
- addi CO, CO, 64
-
- .endm
-
-
- /**********************************************************************************************
- * Macros for N=2 and M=2
- **********************************************************************************************/
-
- .macro LOAD2x2_1
-
- lxvd2x vs16, o0, BO // load real part from B
- lxvd2x vs17, o16, BO // load imag part from B
- lxvd2x vs18, o32, BO // load real part from B
- lxvd2x vs19, o48, BO // load imag part from B
-
- addi BO, BO, 64
-
- lxvd2x vs0, o0, AO // load real,imag from A
- lxvd2x vs1, o16, AO // load real,imag from A
-
- addi AO, AO, 32
-
-
- .endm
-
- .macro KERNEL2x2_I1
-
- lxvd2x vs8, o0, AO // load real,imag from A
- lxvd2x vs9, o16, AO // load real,imag from A
-
- addi AO, AO, 32
-
- lxvd2x vs20, o0, BO // load real part from B
- lxvd2x vs21, o16, BO // load imag part from B
- lxvd2x vs22, o32, BO // load real part from B
- lxvd2x vs23, o48, BO // load imag part from B
-
- addi BO, BO, 64
-
- xvmuldp vs32, vs0, vs16 // real*real, imag*real
- xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
- xvmuldp vs34, vs1, vs16 // real*real, imag*real
- xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
-
- xvmuldp vs36, vs0, vs18 // real*real, imag*real
- xvmuldp vs37, vs0, vs19 // real*imag, imag*imag
- xvmuldp vs38, vs1, vs18 // real*real, imag*real
- xvmuldp vs39, vs1, vs19 // real*imag, imag*imag
-
-
- .endm
-
- .macro KERNEL2x2_1
-
- lxvd2x vs8, o0, AO // load real,imag from A
- lxvd2x vs9, o16, AO // load real,imag from A
-
- addi AO, AO, 32
-
- lxvd2x vs20, o0, BO // load real part from B
- lxvd2x vs21, o16, BO // load imag part from B
- lxvd2x vs22, o32, BO // load real part from B
- lxvd2x vs23, o48, BO // load imag part from B
-
- addi BO, BO, 64
-
- xvmaddadp vs32, vs0, vs16 // real*real, imag*real
- xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
- xvmaddadp vs34, vs1, vs16 // real*real, imag*real
- xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
-
- xvmaddadp vs36, vs0, vs18 // real*real, imag*real
- xvmaddadp vs37, vs0, vs19 // real*imag, imag*imag
- xvmaddadp vs38, vs1, vs18 // real*real, imag*real
- xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag
-
-
- .endm
-
- .macro KERNEL2x2_2
-
- lxvd2x vs0, o0, AO // load real,imag from A
- lxvd2x vs1, o16, AO // load real,imag from A
-
- addi AO, AO, 32
-
- lxvd2x vs16, o0, BO // load real part from B
- lxvd2x vs17, o16, BO // load imag part from B
- lxvd2x vs18, o32, BO // load real part from B
- lxvd2x vs19, o48, BO // load imag part from B
-
- addi BO, BO, 64
-
- xvmaddadp vs32, vs8, vs20 // real*real, imag*real
- xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
- xvmaddadp vs34, vs9, vs20 // real*real, imag*real
- xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
-
- xvmaddadp vs36, vs8, vs22 // real*real, imag*real
- xvmaddadp vs37, vs8, vs23 // real*imag, imag*imag
- xvmaddadp vs38, vs9, vs22 // real*real, imag*real
- xvmaddadp vs39, vs9, vs23 // real*imag, imag*imag
-
-
- .endm
-
- .macro KERNEL2x2_E2
-
-
- xvmaddadp vs32, vs8, vs20 // real*real, imag*real
- xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
- xvmaddadp vs34, vs9, vs20 // real*real, imag*real
- xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
-
- xvmaddadp vs36, vs8, vs22 // real*real, imag*real
- xvmaddadp vs37, vs8, vs23 // real*imag, imag*imag
- xvmaddadp vs38, vs9, vs22 // real*real, imag*real
- xvmaddadp vs39, vs9, vs23 // real*imag, imag*imag
-
-
- .endm
-
- .macro KERNEL2x2_SUBI1
-
- lxvd2x vs0, o0, AO // load real,imag from A
- lxvd2x vs1, o16, AO // load real,imag from A
-
- addi AO, AO, 32
-
- lxvd2x vs16, o0, BO // load real part from B
- lxvd2x vs17, o16, BO // load imag part from B
- lxvd2x vs18, o32, BO // load real part from B
- lxvd2x vs19, o48, BO // load imag part from B
-
- addi BO, BO, 64
-
- xvmuldp vs32, vs0, vs16 // real*real, imag*real
- xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
- xvmuldp vs34, vs1, vs16 // real*real, imag*real
- xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
-
- xvmuldp vs36, vs0, vs18 // real*real, imag*real
- xvmuldp vs37, vs0, vs19 // real*imag, imag*imag
- xvmuldp vs38, vs1, vs18 // real*real, imag*real
- xvmuldp vs39, vs1, vs19 // real*imag, imag*imag
-
-
- .endm
-
- .macro KERNEL2x2_SUB1
-
- lxvd2x vs0, o0, AO // load real,imag from A
- lxvd2x vs1, o16, AO // load real,imag from A
-
- addi AO, AO, 32
-
- lxvd2x vs16, o0, BO // load real part from B
- lxvd2x vs17, o16, BO // load imag part from B
- lxvd2x vs18, o32, BO // load real part from B
- lxvd2x vs19, o48, BO // load imag part from B
-
- addi BO, BO, 64
-
- xvmaddadp vs32, vs0, vs16 // real*real, imag*real
- xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
- xvmaddadp vs34, vs1, vs16 // real*real, imag*real
- xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
-
- xvmaddadp vs36, vs0, vs18 // real*real, imag*real
- xvmaddadp vs37, vs0, vs19 // real*imag, imag*imag
- xvmaddadp vs38, vs1, vs18 // real*real, imag*real
- xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag
-
-
- .endm
-
- .macro SAVE2x2
-
-
- mr T1, CO
-
- #ifndef TRMMKERNEL
-
- lxvd2x vs16, o0, T1
- lxvd2x vs17, o16, T1
-
- #endif
-
-
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
-
- XSFADD_R1 vs0, vs0, vs32 // realA*realB
- XSFADD_R2 vs0, vs0, vs33 // imagA*imagB
-
- xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB
- xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
-
- XSFADD_I1 vs1, vs1, vs32 // imagA*realB
- XSFADD_I2 vs1, vs1, vs33 // realA*imagB
-
- xsmuldp vs4, vs0, alpha_r // real*alpha_r
- xsmuldp vs5, vs1, alpha_i // imag*alpha_i
- xsmuldp vs6, vs0, alpha_i // real*alpha_i
- xsmuldp vs7, vs1, alpha_r // imag*alpha_r
-
- xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
- xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
- xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
-
-
-
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
-
- XSFADD_R1 vs0, vs0, vs34 // realA*realB
- XSFADD_R2 vs0, vs0, vs35 // imagA*imagB
-
- xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB
- xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
-
- XSFADD_I1 vs1, vs1, vs34 // imagA*realB
- XSFADD_I2 vs1, vs1, vs35 // realA*imagB
-
- xsmuldp vs4, vs0, alpha_r // real*alpha_r
- xsmuldp vs5, vs1, alpha_i // imag*alpha_i
- xsmuldp vs6, vs0, alpha_i // real*alpha_i
- xsmuldp vs7, vs1, alpha_r // imag*alpha_r
-
- xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
- xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
- xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
-
-
- #ifndef TRMMKERNEL
-
- xvadddp vs8, vs8, vs16
- xvadddp vs9, vs9, vs17
-
- #endif
-
- stxvd2x vs8, o0, T1
- stxvd2x vs9, o16, T1
-
- add T1, T1, LDC
-
- #ifndef TRMMKERNEL
-
- lxvd2x vs16, o0, T1
- lxvd2x vs17, o16, T1
-
- #endif
-
-
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
-
- XSFADD_R1 vs0, vs0, vs36 // realA*realB
- XSFADD_R2 vs0, vs0, vs37 // imagA*imagB
-
- xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB
- xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
-
- XSFADD_I1 vs1, vs1, vs36 // imagA*realB
- XSFADD_I2 vs1, vs1, vs37 // realA*imagB
-
- xsmuldp vs4, vs0, alpha_r // real*alpha_r
- xsmuldp vs5, vs1, alpha_i // imag*alpha_i
- xsmuldp vs6, vs0, alpha_i // real*alpha_i
- xsmuldp vs7, vs1, alpha_r // imag*alpha_r
-
- xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
- xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
- xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
-
-
-
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
-
- XSFADD_R1 vs0, vs0, vs38 // realA*realB
- XSFADD_R2 vs0, vs0, vs39 // imagA*imagB
-
- xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB
- xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
-
- XSFADD_I1 vs1, vs1, vs38 // imagA*realB
- XSFADD_I2 vs1, vs1, vs39 // realA*imagB
-
- xsmuldp vs4, vs0, alpha_r // real*alpha_r
- xsmuldp vs5, vs1, alpha_i // imag*alpha_i
- xsmuldp vs6, vs0, alpha_i // real*alpha_i
- xsmuldp vs7, vs1, alpha_r // imag*alpha_r
-
- xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
- xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
- xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
-
-
- #ifndef TRMMKERNEL
-
- xvadddp vs8, vs8, vs16
- xvadddp vs9, vs9, vs17
-
- #endif
-
- stxvd2x vs8, o0, T1
- stxvd2x vs9, o16, T1
-
- add T1, T1, LDC
- addi CO, CO, 32
-
- .endm
-
-
- /**********************************************************************************************
- * Macros for N=2 and M=1
- **********************************************************************************************/
-
- .macro LOAD2x1_1
-
- lxvd2x vs16, o0, BO // load real part from B
- lxvd2x vs17, o16, BO // load imag part from B
- lxvd2x vs18, o32, BO // load real part from B
- lxvd2x vs19, o48, BO // load imag part from B
-
- addi BO, BO, 64
-
- lxvd2x vs0, o0, AO // load real,imag from A
-
- addi AO, AO, 16
-
-
- .endm
-
- .macro KERNEL2x1_I1
-
- lxvd2x vs8, o0, AO // load real,imag from A
-
- addi AO, AO, 16
-
- lxvd2x vs20, o0, BO // load real part from B
- lxvd2x vs21, o16, BO // load imag part from B
- lxvd2x vs22, o32, BO // load real part from B
- lxvd2x vs23, o48, BO // load imag part from B
-
- addi BO, BO, 64
-
- xvmuldp vs32, vs0, vs16 // real*real, imag*real
- xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
-
- xvmuldp vs34, vs0, vs18 // real*real, imag*real
- xvmuldp vs35, vs0, vs19 // real*imag, imag*imag
-
-
- .endm
-
- .macro KERNEL2x1_1
-
- lxvd2x vs8, o0, AO // load real,imag from A
-
- addi AO, AO, 16
-
- lxvd2x vs20, o0, BO // load real part from B
- lxvd2x vs21, o16, BO // load imag part from B
- lxvd2x vs22, o32, BO // load real part from B
- lxvd2x vs23, o48, BO // load imag part from B
-
- addi BO, BO, 64
-
- xvmaddadp vs32, vs0, vs16 // real*real, imag*real
- xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
-
- xvmaddadp vs34, vs0, vs18 // real*real, imag*real
- xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag
-
-
- .endm
-
- .macro KERNEL2x1_2
-
- lxvd2x vs0, o0, AO // load real,imag from A
-
- addi AO, AO, 16
-
- lxvd2x vs16, o0, BO // load real part from B
- lxvd2x vs17, o16, BO // load imag part from B
- lxvd2x vs18, o32, BO // load real part from B
- lxvd2x vs19, o48, BO // load imag part from B
-
- addi BO, BO, 64
-
- xvmaddadp vs32, vs8, vs20 // real*real, imag*real
- xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
-
- xvmaddadp vs34, vs8, vs22 // real*real, imag*real
- xvmaddadp vs35, vs8, vs23 // real*imag, imag*imag
-
-
- .endm
-
- .macro KERNEL2x1_E2
-
-
- xvmaddadp vs32, vs8, vs20 // real*real, imag*real
- xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
-
- xvmaddadp vs34, vs8, vs22 // real*real, imag*real
- xvmaddadp vs35, vs8, vs23 // real*imag, imag*imag
-
-
- .endm
-
- .macro KERNEL2x1_SUBI1
-
- lxvd2x vs0, o0, AO // load real,imag from A
-
- addi AO, AO, 16
-
- lxvd2x vs16, o0, BO // load real part from B
- lxvd2x vs17, o16, BO // load imag part from B
- lxvd2x vs18, o32, BO // load real part from B
- lxvd2x vs19, o48, BO // load imag part from B
-
- addi BO, BO, 64
-
- xvmuldp vs32, vs0, vs16 // real*real, imag*real
- xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
-
- xvmuldp vs34, vs0, vs18 // real*real, imag*real
- xvmuldp vs35, vs0, vs19 // real*imag, imag*imag
-
-
- .endm
-
- .macro KERNEL2x1_SUB1
-
- lxvd2x vs0, o0, AO // load real,imag from A
-
- addi AO, AO, 16
-
- lxvd2x vs16, o0, BO // load real part from B
- lxvd2x vs17, o16, BO // load imag part from B
- lxvd2x vs18, o32, BO // load real part from B
- lxvd2x vs19, o48, BO // load imag part from B
-
- addi BO, BO, 64
-
- xvmaddadp vs32, vs0, vs16 // real*real, imag*real
- xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
-
- xvmaddadp vs34, vs0, vs18 // real*real, imag*real
- xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag
-
-
- .endm
-
- .macro SAVE2x1
-
-
- mr T1, CO
-
- #ifndef TRMMKERNEL
-
- lxvd2x vs16, o0, T1
-
- #endif
-
-
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
-
- XSFADD_R1 vs0, vs0, vs32 // realA*realB
- XSFADD_R2 vs0, vs0, vs33 // imagA*imagB
-
- xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB
- xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
-
- XSFADD_I1 vs1, vs1, vs32 // imagA*realB
- XSFADD_I2 vs1, vs1, vs33 // realA*imagB
-
- xsmuldp vs4, vs0, alpha_r // real*alpha_r
- xsmuldp vs5, vs1, alpha_i // imag*alpha_i
- xsmuldp vs6, vs0, alpha_i // real*alpha_i
- xsmuldp vs7, vs1, alpha_r // imag*alpha_r
-
- xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
- xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
- xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
-
-
- #ifndef TRMMKERNEL
-
- xvadddp vs8, vs8, vs16
-
- #endif
-
- stxvd2x vs8, o0, T1
-
- add T1, T1, LDC
-
- #ifndef TRMMKERNEL
-
- lxvd2x vs16, o0, T1
-
- #endif
-
-
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
-
- XSFADD_R1 vs0, vs0, vs34 // realA*realB
- XSFADD_R2 vs0, vs0, vs35 // imagA*imagB
-
- xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB
- xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
-
- XSFADD_I1 vs1, vs1, vs34 // imagA*realB
- XSFADD_I2 vs1, vs1, vs35 // realA*imagB
-
- xsmuldp vs4, vs0, alpha_r // real*alpha_r
- xsmuldp vs5, vs1, alpha_i // imag*alpha_i
- xsmuldp vs6, vs0, alpha_i // real*alpha_i
- xsmuldp vs7, vs1, alpha_r // imag*alpha_r
-
- xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
- xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
- xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
-
-
- #ifndef TRMMKERNEL
-
- xvadddp vs8, vs8, vs16
-
- #endif
-
- stxvd2x vs8, o0, T1
-
- add T1, T1, LDC
- addi CO, CO, 16
-
- .endm
-
-
- /**********************************************************************************************
- * Macros for N=1 and M=8
- **********************************************************************************************/
-
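- // N=1 variants: each iteration consumes a single B element whose real and
- // imag parts are splatted across vs16 and vs17 (32 bytes) against eight
- // complex elements of A (128 bytes), accumulating into vs32..vs47.
-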
- .macro LOAD1x8_1
-
- lxvd2x vs16, o0, BO // load real part from B
- lxvd2x vs17, o16, BO // load imag part from B
-
- addi BO, BO, 32
-
- lxvd2x vs0, o0, AO // load real,imag from A
- lxvd2x vs1, o16, AO // load real,imag from A
- lxvd2x vs2, o32, AO // load real,imag from A
- lxvd2x vs3, o48, AO // load real,imag from A
-
- addi AO, AO, 64
-
- lxvd2x vs4, o0, AO // load real,imag from A
- lxvd2x vs5, o16, AO // load real,imag from A
- lxvd2x vs6, o32, AO // load real,imag from A
- lxvd2x vs7, o48, AO // load real,imag from A
-
- addi AO, AO, 64
-
-
- .endm
-
- .macro KERNEL1x8_I1
-
- lxvd2x vs8, o0, AO // load real,imag from A
- lxvd2x vs9, o16, AO // load real,imag from A
- lxvd2x vs10, o32, AO // load real,imag from A
- lxvd2x vs11, o48, AO // load real,imag from A
-
- addi AO, AO, 64
-
- lxvd2x vs12, o0, AO // load real,imag from A
- lxvd2x vs13, o16, AO // load real,imag from A
- lxvd2x vs14, o32, AO // load real,imag from A
- lxvd2x vs15, o48, AO // load real,imag from A
-
- addi AO, AO, 64
-
- lxvd2x vs20, o0, BO // load real part from B
- lxvd2x vs21, o16, BO // load imag part from B
-
- addi BO, BO, 32
-
- xvmuldp vs32, vs0, vs16 // real*real, imag*real
- xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
- xvmuldp vs34, vs1, vs16 // real*real, imag*real
- xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
- xvmuldp vs36, vs2, vs16 // real*real, imag*real
- xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
- xvmuldp vs38, vs3, vs16 // real*real, imag*real
- xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
- xvmuldp vs40, vs4, vs16 // real*real, imag*real
- xvmuldp vs41, vs4, vs17 // real*imag, imag*imag
- xvmuldp vs42, vs5, vs16 // real*real, imag*real
- xvmuldp vs43, vs5, vs17 // real*imag, imag*imag
- xvmuldp vs44, vs6, vs16 // real*real, imag*real
- xvmuldp vs45, vs6, vs17 // real*imag, imag*imag
- xvmuldp vs46, vs7, vs16 // real*real, imag*real
- xvmuldp vs47, vs7, vs17 // real*imag, imag*imag
-
-
- .endm
-
- .macro KERNEL1x8_1
-
- lxvd2x vs8, o0, AO // load real,imag from A
- lxvd2x vs9, o16, AO // load real,imag from A
- lxvd2x vs10, o32, AO // load real,imag from A
- lxvd2x vs11, o48, AO // load real,imag from A
-
- addi AO, AO, 64
-
- lxvd2x vs12, o0, AO // load real,imag from A
- lxvd2x vs13, o16, AO // load real,imag from A
- lxvd2x vs14, o32, AO // load real,imag from A
- lxvd2x vs15, o48, AO // load real,imag from A
-
- addi AO, AO, 64
-
- lxvd2x vs20, o0, BO // load real part from B
- lxvd2x vs21, o16, BO // load imag part from B
-
- addi BO, BO, 32
-
- xvmaddadp vs32, vs0, vs16 // real*real, imag*real
- xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
- xvmaddadp vs34, vs1, vs16 // real*real, imag*real
- xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
- xvmaddadp vs36, vs2, vs16 // real*real, imag*real
- xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
- xvmaddadp vs38, vs3, vs16 // real*real, imag*real
- xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
- xvmaddadp vs40, vs4, vs16 // real*real, imag*real
- xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag
- xvmaddadp vs42, vs5, vs16 // real*real, imag*real
- xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag
- xvmaddadp vs44, vs6, vs16 // real*real, imag*real
- xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag
- xvmaddadp vs46, vs7, vs16 // real*real, imag*real
- xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag
-
-
- .endm
-
- .macro KERNEL1x8_2
-
- lxvd2x vs0, o0, AO // load real,imag from A
- lxvd2x vs1, o16, AO // load real,imag from A
- lxvd2x vs2, o32, AO // load real,imag from A
- lxvd2x vs3, o48, AO // load real,imag from A
-
- addi AO, AO, 64
-
- lxvd2x vs4, o0, AO // load real,imag from A
- lxvd2x vs5, o16, AO // load real,imag from A
- lxvd2x vs6, o32, AO // load real,imag from A
- lxvd2x vs7, o48, AO // load real,imag from A
-
- addi AO, AO, 64
-
- lxvd2x vs16, o0, BO // load real part from B
- lxvd2x vs17, o16, BO // load imag part from B
-
- addi BO, BO, 32
-
- xvmaddadp vs32, vs8, vs20 // real*real, imag*real
- xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
- xvmaddadp vs34, vs9, vs20 // real*real, imag*real
- xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
- xvmaddadp vs36, vs10, vs20 // real*real, imag*real
- xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
- xvmaddadp vs38, vs11, vs20 // real*real, imag*real
- xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
- xvmaddadp vs40, vs12, vs20 // real*real, imag*real
- xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag
- xvmaddadp vs42, vs13, vs20 // real*real, imag*real
- xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag
- xvmaddadp vs44, vs14, vs20 // real*real, imag*real
- xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag
- xvmaddadp vs46, vs15, vs20 // real*real, imag*real
- xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag
-
-
- .endm
-
- .macro KERNEL1x8_E2
-
-
- xvmaddadp vs32, vs8, vs20 // real*real, imag*real
- xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
- xvmaddadp vs34, vs9, vs20 // real*real, imag*real
- xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
- xvmaddadp vs36, vs10, vs20 // real*real, imag*real
- xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
- xvmaddadp vs38, vs11, vs20 // real*real, imag*real
- xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
- xvmaddadp vs40, vs12, vs20 // real*real, imag*real
- xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag
- xvmaddadp vs42, vs13, vs20 // real*real, imag*real
- xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag
- xvmaddadp vs44, vs14, vs20 // real*real, imag*real
- xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag
- xvmaddadp vs46, vs15, vs20 // real*real, imag*real
- xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag
-
-
- .endm
-
- .macro KERNEL1x8_SUBI1
-
- lxvd2x vs0, o0, AO // load real,imag from A
- lxvd2x vs1, o16, AO // load real,imag from A
- lxvd2x vs2, o32, AO // load real,imag from A
- lxvd2x vs3, o48, AO // load real,imag from A
-
- addi AO, AO, 64
-
- lxvd2x vs4, o0, AO // load real,imag from A
- lxvd2x vs5, o16, AO // load real,imag from A
- lxvd2x vs6, o32, AO // load real,imag from A
- lxvd2x vs7, o48, AO // load real,imag from A
-
- addi AO, AO, 64
-
- lxvd2x vs16, o0, BO // load real part from B
- lxvd2x vs17, o16, BO // load imag part from B
-
- addi BO, BO, 32
-
- xvmuldp vs32, vs0, vs16 // real*real, imag*real
- xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
- xvmuldp vs34, vs1, vs16 // real*real, imag*real
- xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
- xvmuldp vs36, vs2, vs16 // real*real, imag*real
- xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
- xvmuldp vs38, vs3, vs16 // real*real, imag*real
- xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
- xvmuldp vs40, vs4, vs16 // real*real, imag*real
- xvmuldp vs41, vs4, vs17 // real*imag, imag*imag
- xvmuldp vs42, vs5, vs16 // real*real, imag*real
- xvmuldp vs43, vs5, vs17 // real*imag, imag*imag
- xvmuldp vs44, vs6, vs16 // real*real, imag*real
- xvmuldp vs45, vs6, vs17 // real*imag, imag*imag
- xvmuldp vs46, vs7, vs16 // real*real, imag*real
- xvmuldp vs47, vs7, vs17 // real*imag, imag*imag
-
-
- .endm
-
- .macro KERNEL1x8_SUB1
-
- lxvd2x vs0, o0, AO // load real,imag from A
- lxvd2x vs1, o16, AO // load real,imag from A
- lxvd2x vs2, o32, AO // load real,imag from A
- lxvd2x vs3, o48, AO // load real,imag from A
-
- addi AO, AO, 64
-
- lxvd2x vs4, o0, AO // load real,imag from A
- lxvd2x vs5, o16, AO // load real,imag from A
- lxvd2x vs6, o32, AO // load real,imag from A
- lxvd2x vs7, o48, AO // load real,imag from A
-
- addi AO, AO, 64
-
- lxvd2x vs16, o0, BO // load real part from B
- lxvd2x vs17, o16, BO // load imag part from B
-
- addi BO, BO, 32
-
- xvmaddadp vs32, vs0, vs16 // real*real, imag*real
- xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
- xvmaddadp vs34, vs1, vs16 // real*real, imag*real
- xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
- xvmaddadp vs36, vs2, vs16 // real*real, imag*real
- xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
- xvmaddadp vs38, vs3, vs16 // real*real, imag*real
- xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
- xvmaddadp vs40, vs4, vs16 // real*real, imag*real
- xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag
- xvmaddadp vs42, vs5, vs16 // real*real, imag*real
- xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag
- xvmaddadp vs44, vs6, vs16 // real*real, imag*real
- xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag
- xvmaddadp vs46, vs7, vs16 // real*real, imag*real
- xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag
-
-
- .endm
-
- .macro SAVE1x8
-
-
- mr T1, CO
- addi T2, T1, 64
-
- #ifndef TRMMKERNEL
-
- lxvd2x vs16, o0, T1
- lxvd2x vs17, o16, T1
- lxvd2x vs18, o32, T1
- lxvd2x vs19, o48, T1
- lxvd2x vs20, o0, T2
- lxvd2x vs21, o16, T2
- lxvd2x vs22, o32, T2
- lxvd2x vs23, o48, T2
-
- #endif
-
-
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
-
- XSFADD_R1 vs0, vs0, vs32 // realA*realB
- XSFADD_R2 vs0, vs0, vs33 // imagA*imagB
-
- xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB
- xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
-
- XSFADD_I1 vs1, vs1, vs32 // imagA*realB
- XSFADD_I2 vs1, vs1, vs33 // realA*imagB
-
- xsmuldp vs4, vs0, alpha_r // real*alpha_r
- xsmuldp vs5, vs1, alpha_i // imag*alpha_i
- xsmuldp vs6, vs0, alpha_i // real*alpha_i
- xsmuldp vs7, vs1, alpha_r // imag*alpha_r
-
- xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
- xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
- xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
-
-
-
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
-
- XSFADD_R1 vs0, vs0, vs34 // realA*realB
- XSFADD_R2 vs0, vs0, vs35 // imagA*imagB
-
- xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB
- xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
-
- XSFADD_I1 vs1, vs1, vs34 // imagA*realB
- XSFADD_I2 vs1, vs1, vs35 // realA*imagB
-
- xsmuldp vs4, vs0, alpha_r // real*alpha_r
- xsmuldp vs5, vs1, alpha_i // imag*alpha_i
- xsmuldp vs6, vs0, alpha_i // real*alpha_i
- xsmuldp vs7, vs1, alpha_r // imag*alpha_r
-
- xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
- xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
- xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
-
-
-
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
-
- XSFADD_R1 vs0, vs0, vs36 // realA*realB
- XSFADD_R2 vs0, vs0, vs37 // imagA*imagB
-
- xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB
- xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
-
- XSFADD_I1 vs1, vs1, vs36 // imagA*realB
- XSFADD_I2 vs1, vs1, vs37 // realA*imagB
-
- xsmuldp vs4, vs0, alpha_r // real*alpha_r
- xsmuldp vs5, vs1, alpha_i // imag*alpha_i
- xsmuldp vs6, vs0, alpha_i // real*alpha_i
- xsmuldp vs7, vs1, alpha_r // imag*alpha_r
-
- xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
- xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
- xxpermdi vs10, vs2, vs3, 0 // merge real and imag part
-
-
-
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
-
- XSFADD_R1 vs0, vs0, vs38 // realA*realB
- XSFADD_R2 vs0, vs0, vs39 // imagA*imagB
-
- xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB
- xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
-
- XSFADD_I1 vs1, vs1, vs38 // imagA*realB
- XSFADD_I2 vs1, vs1, vs39 // realA*imagB
-
- xsmuldp vs4, vs0, alpha_r // real*alpha_r
- xsmuldp vs5, vs1, alpha_i // imag*alpha_i
- xsmuldp vs6, vs0, alpha_i // real*alpha_i
- xsmuldp vs7, vs1, alpha_r // imag*alpha_r
-
- xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
- xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
- xxpermdi vs11, vs2, vs3, 0 // merge real and imag part
-
-
-
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
-
- XSFADD_R1 vs0, vs0, vs40 // realA*realB
- XSFADD_R2 vs0, vs0, vs41 // imagA*imagB
-
- xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB
- xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
-
- XSFADD_I1 vs1, vs1, vs40 // imagA*realB
- XSFADD_I2 vs1, vs1, vs41 // realA*imagB
-
- xsmuldp vs4, vs0, alpha_r // real*alpha_r
- xsmuldp vs5, vs1, alpha_i // imag*alpha_i
- xsmuldp vs6, vs0, alpha_i // real*alpha_i
- xsmuldp vs7, vs1, alpha_r // imag*alpha_r
-
- xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
- xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
- xxpermdi vs12, vs2, vs3, 0 // merge real and imag part
-
-
-
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
-
- XSFADD_R1 vs0, vs0, vs42 // realA*realB
- XSFADD_R2 vs0, vs0, vs43 // imagA*imagB
-
- xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB
- xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
-
- XSFADD_I1 vs1, vs1, vs42 // imagA*realB
- XSFADD_I2 vs1, vs1, vs43 // realA*imagB
-
- xsmuldp vs4, vs0, alpha_r // real*alpha_r
- xsmuldp vs5, vs1, alpha_i // imag*alpha_i
- xsmuldp vs6, vs0, alpha_i // real*alpha_i
- xsmuldp vs7, vs1, alpha_r // imag*alpha_r
-
- xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
- xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
- xxpermdi vs13, vs2, vs3, 0 // merge real and imag part
-
-
-
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
-
- XSFADD_R1 vs0, vs0, vs44 // realA*realB
- XSFADD_R2 vs0, vs0, vs45 // imagA*imagB
-
- xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB
- xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
-
- XSFADD_I1 vs1, vs1, vs44 // imagA*realB
- XSFADD_I2 vs1, vs1, vs45 // realA*imagB
-
- xsmuldp vs4, vs0, alpha_r // real*alpha_r
- xsmuldp vs5, vs1, alpha_i // imag*alpha_i
- xsmuldp vs6, vs0, alpha_i // real*alpha_i
- xsmuldp vs7, vs1, alpha_r // imag*alpha_r
-
- xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
- xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
- xxpermdi vs14, vs2, vs3, 0 // merge real and imag part
-
-
-
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
-
- XSFADD_R1 vs0, vs0, vs46 // realA*realB
- XSFADD_R2 vs0, vs0, vs47 // imagA*imagB
-
- xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB
- xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
-
- XSFADD_I1 vs1, vs1, vs46 // imagA*realB
- XSFADD_I2 vs1, vs1, vs47 // realA*imagB
-
- xsmuldp vs4, vs0, alpha_r // real*alpha_r
- xsmuldp vs5, vs1, alpha_i // imag*alpha_i
- xsmuldp vs6, vs0, alpha_i // real*alpha_i
- xsmuldp vs7, vs1, alpha_r // imag*alpha_r
-
- xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
- xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
- xxpermdi vs15, vs2, vs3, 0 // merge real and imag part
-
-
- #ifndef TRMMKERNEL
-
- xvadddp vs8, vs8, vs16
- xvadddp vs9, vs9, vs17
- xvadddp vs10, vs10, vs18
- xvadddp vs11, vs11, vs19
- xvadddp vs12, vs12, vs20
- xvadddp vs13, vs13, vs21
- xvadddp vs14, vs14, vs22
- xvadddp vs15, vs15, vs23
-
- #endif
-
- stxvd2x vs8, o0, T1
- stxvd2x vs9, o16, T1
- stxvd2x vs10, o32, T1
- stxvd2x vs11, o48, T1
- stxvd2x vs12, o0, T2
- stxvd2x vs13, o16, T2
- stxvd2x vs14, o32, T2
- stxvd2x vs15, o48, T2
-
- add T1, T1, LDC
- add T2, T2, LDC
- addi CO, CO, 128
-
- .endm
-
-
- /**********************************************************************************************
- * Macros for N=1 and M=4
- **********************************************************************************************/
-
- .macro LOAD1x4_1
-
- lxvd2x vs16, o0, BO // load real part from B
- lxvd2x vs17, o16, BO // load imag part from B
-
- addi BO, BO, 32
-
- lxvd2x vs0, o0, AO // load real,imag from A
- lxvd2x vs1, o16, AO // load real,imag from A
- lxvd2x vs2, o32, AO // load real,imag from A
- lxvd2x vs3, o48, AO // load real,imag from A
-
- addi AO, AO, 64
-
-
- .endm
-
- .macro KERNEL1x4_I1
-
- lxvd2x vs8, o0, AO // load real,imag from A
- lxvd2x vs9, o16, AO // load real,imag from A
- lxvd2x vs10, o32, AO // load real,imag from A
- lxvd2x vs11, o48, AO // load real,imag from A
-
- addi AO, AO, 64
-
- lxvd2x vs20, o0, BO // load real part from B
- lxvd2x vs21, o16, BO // load imag part from B
-
- addi BO, BO, 32
-
- xvmuldp vs32, vs0, vs16 // real*real, imag*real
- xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
- xvmuldp vs34, vs1, vs16 // real*real, imag*real
- xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
- xvmuldp vs36, vs2, vs16 // real*real, imag*real
- xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
- xvmuldp vs38, vs3, vs16 // real*real, imag*real
- xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
-
-
- .endm
-
- .macro KERNEL1x4_1
-
- lxvd2x vs8, o0, AO // load real,imag from A
- lxvd2x vs9, o16, AO // load real,imag from A
- lxvd2x vs10, o32, AO // load real,imag from A
- lxvd2x vs11, o48, AO // load real,imag from A
-
- addi AO, AO, 64
-
- lxvd2x vs20, o0, BO // load real part from B
- lxvd2x vs21, o16, BO // load imag part from B
-
- addi BO, BO, 32
-
- xvmaddadp vs32, vs0, vs16 // real*real, imag*real
- xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
- xvmaddadp vs34, vs1, vs16 // real*real, imag*real
- xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
- xvmaddadp vs36, vs2, vs16 // real*real, imag*real
- xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
- xvmaddadp vs38, vs3, vs16 // real*real, imag*real
- xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
-
-
- .endm
-
- .macro KERNEL1x4_2
-
- lxvd2x vs0, o0, AO // load real,imag from A
- lxvd2x vs1, o16, AO // load real,imag from A
- lxvd2x vs2, o32, AO // load real,imag from A
- lxvd2x vs3, o48, AO // load real,imag from A
-
- addi AO, AO, 64
-
- lxvd2x vs16, o0, BO // load real part from B
- lxvd2x vs17, o16, BO // load imag part from B
-
- addi BO, BO, 32
-
- xvmaddadp vs32, vs8, vs20 // real*real, imag*real
- xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
- xvmaddadp vs34, vs9, vs20 // real*real, imag*real
- xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
- xvmaddadp vs36, vs10, vs20 // real*real, imag*real
- xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
- xvmaddadp vs38, vs11, vs20 // real*real, imag*real
- xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
-
-
- .endm
-
- .macro KERNEL1x4_E2
-
-
- xvmaddadp vs32, vs8, vs20 // real*real, imag*real
- xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
- xvmaddadp vs34, vs9, vs20 // real*real, imag*real
- xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
- xvmaddadp vs36, vs10, vs20 // real*real, imag*real
- xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
- xvmaddadp vs38, vs11, vs20 // real*real, imag*real
- xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
-
-
- .endm
-
- .macro KERNEL1x4_SUBI1
-
- lxvd2x vs0, o0, AO // load real,imag from A
- lxvd2x vs1, o16, AO // load real,imag from A
- lxvd2x vs2, o32, AO // load real,imag from A
- lxvd2x vs3, o48, AO // load real,imag from A
-
- addi AO, AO, 64
-
- lxvd2x vs16, o0, BO // load real part from B
- lxvd2x vs17, o16, BO // load imag part from B
-
- addi BO, BO, 32
-
- xvmuldp vs32, vs0, vs16 // real*real, imag*real
- xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
- xvmuldp vs34, vs1, vs16 // real*real, imag*real
- xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
- xvmuldp vs36, vs2, vs16 // real*real, imag*real
- xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
- xvmuldp vs38, vs3, vs16 // real*real, imag*real
- xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
-
-
- .endm
-
- .macro KERNEL1x4_SUB1
-
- lxvd2x vs0, o0, AO // load real,imag from A
- lxvd2x vs1, o16, AO // load real,imag from A
- lxvd2x vs2, o32, AO // load real,imag from A
- lxvd2x vs3, o48, AO // load real,imag from A
-
- addi AO, AO, 64
-
- lxvd2x vs16, o0, BO // load real part from B
- lxvd2x vs17, o16, BO // load imag part from B
-
- addi BO, BO, 32
-
- xvmaddadp vs32, vs0, vs16 // real*real, imag*real
- xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
- xvmaddadp vs34, vs1, vs16 // real*real, imag*real
- xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
- xvmaddadp vs36, vs2, vs16 // real*real, imag*real
- xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
- xvmaddadp vs38, vs3, vs16 // real*real, imag*real
- xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
-
-
- .endm
-
- .macro SAVE1x4
-
-
- mr T1, CO
-
- #ifndef TRMMKERNEL
-
- lxvd2x vs16, o0, T1
- lxvd2x vs17, o16, T1
- lxvd2x vs18, o32, T1
- lxvd2x vs19, o48, T1
-
- #endif
-
-
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
-
- XSFADD_R1 vs0, vs0, vs32 // realA*realB
- XSFADD_R2 vs0, vs0, vs33 // imagA*imagB
-
- xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB
- xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
-
- XSFADD_I1 vs1, vs1, vs32 // imagA*realB
- XSFADD_I2 vs1, vs1, vs33 // realA*imagB
-
- xsmuldp vs4, vs0, alpha_r // real*alpha_r
- xsmuldp vs5, vs1, alpha_i // imag*alpha_i
- xsmuldp vs6, vs0, alpha_i // real*alpha_i
- xsmuldp vs7, vs1, alpha_r // imag*alpha_r
-
- xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
- xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
- xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
-
-
-
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
-
- XSFADD_R1 vs0, vs0, vs34 // realA*realB
- XSFADD_R2 vs0, vs0, vs35 // imagA*imagB
-
- xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB
- xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
-
- XSFADD_I1 vs1, vs1, vs34 // imagA*realB
- XSFADD_I2 vs1, vs1, vs35 // realA*imagB
-
- xsmuldp vs4, vs0, alpha_r // real*alpha_r
- xsmuldp vs5, vs1, alpha_i // imag*alpha_i
- xsmuldp vs6, vs0, alpha_i // real*alpha_i
- xsmuldp vs7, vs1, alpha_r // imag*alpha_r
-
- xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
- xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
- xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
-
-
-
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
-
- XSFADD_R1 vs0, vs0, vs36 // realA*realB
- XSFADD_R2 vs0, vs0, vs37 // imagA*imagB
-
- xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB
- xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
-
- XSFADD_I1 vs1, vs1, vs36 // imagA*realB
- XSFADD_I2 vs1, vs1, vs37 // realA*imagB
-
- xsmuldp vs4, vs0, alpha_r // real*alpha_r
- xsmuldp vs5, vs1, alpha_i // imag*alpha_i
- xsmuldp vs6, vs0, alpha_i // real*alpha_i
- xsmuldp vs7, vs1, alpha_r // imag*alpha_r
-
- xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
- xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
- xxpermdi vs10, vs2, vs3, 0 // merge real and imag part
-
-
-
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
-
- XSFADD_R1 vs0, vs0, vs38 // realA*realB
- XSFADD_R2 vs0, vs0, vs39 // imagA*imagB
-
- xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB
- xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
-
- XSFADD_I1 vs1, vs1, vs38 // imagA*realB
- XSFADD_I2 vs1, vs1, vs39 // realA*imagB
-
- xsmuldp vs4, vs0, alpha_r // real*alpha_r
- xsmuldp vs5, vs1, alpha_i // imag*alpha_i
- xsmuldp vs6, vs0, alpha_i // real*alpha_i
- xsmuldp vs7, vs1, alpha_r // imag*alpha_r
-
- xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
- xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
- xxpermdi vs11, vs2, vs3, 0 // merge real and imag part
-
-
- #ifndef TRMMKERNEL
-
- xvadddp vs8, vs8, vs16
- xvadddp vs9, vs9, vs17
- xvadddp vs10, vs10, vs18
- xvadddp vs11, vs11, vs19
-
- #endif
-
- stxvd2x vs8, o0, T1
- stxvd2x vs9, o16, T1
- stxvd2x vs10, o32, T1
- stxvd2x vs11, o48, T1
-
- add T1, T1, LDC
- addi CO, CO, 64
-
- .endm
-
-
- /**********************************************************************************************
- * Macros for N=1 and M=2
- **********************************************************************************************/
-
- .macro LOAD1x2_1
-
- lxvd2x vs16, o0, BO // load real part from B
- lxvd2x vs17, o16, BO // load imag part from B
-
- addi BO, BO, 32
-
- lxvd2x vs0, o0, AO // load real,imag from A
- lxvd2x vs1, o16, AO // load real,imag from A
-
- addi AO, AO, 32
-
-
- .endm
-
- .macro KERNEL1x2_I1
-
- lxvd2x vs8, o0, AO // load real,imag from A
- lxvd2x vs9, o16, AO // load real,imag from A
-
- addi AO, AO, 32
-
- lxvd2x vs20, o0, BO // load real part from B
- lxvd2x vs21, o16, BO // load imag part from B
-
- addi BO, BO, 32
-
- xvmuldp vs32, vs0, vs16 // real*real, imag*real
- xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
- xvmuldp vs34, vs1, vs16 // real*real, imag*real
- xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
-
-
- .endm
-
- .macro KERNEL1x2_1
-
- lxvd2x vs8, o0, AO // load real,imag from A
- lxvd2x vs9, o16, AO // load real,imag from A
-
- addi AO, AO, 32
-
- lxvd2x vs20, o0, BO // load real part from B
- lxvd2x vs21, o16, BO // load imag part from B
-
- addi BO, BO, 32
-
- xvmaddadp vs32, vs0, vs16 // real*real, imag*real
- xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
- xvmaddadp vs34, vs1, vs16 // real*real, imag*real
- xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
-
-
- .endm
-
- .macro KERNEL1x2_2
-
- lxvd2x vs0, o0, AO // load real,imag from A
- lxvd2x vs1, o16, AO // load real,imag from A
-
- addi AO, AO, 32
-
- lxvd2x vs16, o0, BO // load real part from B
- lxvd2x vs17, o16, BO // load imag part from B
-
- addi BO, BO, 32
-
- xvmaddadp vs32, vs8, vs20 // real*real, imag*real
- xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
- xvmaddadp vs34, vs9, vs20 // real*real, imag*real
- xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
-
-
- .endm
-
- .macro KERNEL1x2_E2
-
-
- xvmaddadp vs32, vs8, vs20 // real*real, imag*real
- xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
- xvmaddadp vs34, vs9, vs20 // real*real, imag*real
- xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
-
-
- .endm
-
- .macro KERNEL1x2_SUBI1
-
- lxvd2x vs0, o0, AO // load real,imag from A
- lxvd2x vs1, o16, AO // load real,imag from A
-
- addi AO, AO, 32
-
- lxvd2x vs16, o0, BO // load real part from B
- lxvd2x vs17, o16, BO // load imag part from B
-
- addi BO, BO, 32
-
- xvmuldp vs32, vs0, vs16 // real*real, imag*real
- xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
- xvmuldp vs34, vs1, vs16 // real*real, imag*real
- xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
-
-
- .endm
-
- .macro KERNEL1x2_SUB1
-
- lxvd2x vs0, o0, AO // load real,imag from A
- lxvd2x vs1, o16, AO // load real,imag from A
-
- addi AO, AO, 32
-
- lxvd2x vs16, o0, BO // load real part from B
- lxvd2x vs17, o16, BO // load imag part from B
-
- addi BO, BO, 32
-
- xvmaddadp vs32, vs0, vs16 // real*real, imag*real
- xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
- xvmaddadp vs34, vs1, vs16 // real*real, imag*real
- xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
-
-
- .endm
-
- .macro SAVE1x2
-
-
- mr T1, CO
-
- #ifndef TRMMKERNEL
-
- lxvd2x vs16, o0, T1
- lxvd2x vs17, o16, T1
-
- #endif
-
-
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
-
- XSFADD_R1 vs0, vs0, vs32 // realA*realB
- XSFADD_R2 vs0, vs0, vs33 // imagA*imagB
-
- xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB
- xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
-
- XSFADD_I1 vs1, vs1, vs32 // imagA*realB
- XSFADD_I2 vs1, vs1, vs33 // realA*imagB
-
- xsmuldp vs4, vs0, alpha_r // real*alpha_r
- xsmuldp vs5, vs1, alpha_i // imag*alpha_i
- xsmuldp vs6, vs0, alpha_i // real*alpha_i
- xsmuldp vs7, vs1, alpha_r // imag*alpha_r
-
- xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
- xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
- xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
-
-
-
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
-
- XSFADD_R1 vs0, vs0, vs34 // realA*realB
- XSFADD_R2 vs0, vs0, vs35 // imagA*imagB
-
- xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB
- xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
-
- XSFADD_I1 vs1, vs1, vs34 // imagA*realB
- XSFADD_I2 vs1, vs1, vs35 // realA*imagB
-
- xsmuldp vs4, vs0, alpha_r // real*alpha_r
- xsmuldp vs5, vs1, alpha_i // imag*alpha_i
- xsmuldp vs6, vs0, alpha_i // real*alpha_i
- xsmuldp vs7, vs1, alpha_r // imag*alpha_r
-
- xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
- xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
- xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
-
-
- #ifndef TRMMKERNEL
-
- xvadddp vs8, vs8, vs16
- xvadddp vs9, vs9, vs17
-
- #endif
-
- stxvd2x vs8, o0, T1
- stxvd2x vs9, o16, T1
-
- add T1, T1, LDC
- addi CO, CO, 32
-
- .endm
-
-
- /**********************************************************************************************
- * Macros for N=1 and M=1
- **********************************************************************************************/
-
- .macro LOAD1x1_1
-
- lxvd2x vs16, o0, BO // load real part from B
- lxvd2x vs17, o16, BO // load imag part from B
-
- addi BO, BO, 32
-
- lxvd2x vs0, o0, AO // load real,imag from A
-
- addi AO, AO, 16
-
-
- .endm
-
- .macro KERNEL1x1_I1
-
- lxvd2x vs8, o0, AO // load real,imag from A
-
- addi AO, AO, 16
-
- lxvd2x vs20, o0, BO // load real part from B
- lxvd2x vs21, o16, BO // load imag part from B
-
- addi BO, BO, 32
-
- xvmuldp vs32, vs0, vs16 // real*real, imag*real
- xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
-
-
- .endm
-
- .macro KERNEL1x1_1
-
- lxvd2x vs8, o0, AO // load real,imag from A
-
- addi AO, AO, 16
-
- lxvd2x vs20, o0, BO // load real part from B
- lxvd2x vs21, o16, BO // load imag part from B
-
- addi BO, BO, 32
-
- xvmaddadp vs32, vs0, vs16 // real*real, imag*real
- xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
-
-
- .endm
-
- .macro KERNEL1x1_2
-
- lxvd2x vs0, o0, AO // load real,imag from A
-
- addi AO, AO, 16
-
- lxvd2x vs16, o0, BO // load real part from B
- lxvd2x vs17, o16, BO // load imag part from B
-
- addi BO, BO, 32
-
- xvmaddadp vs32, vs8, vs20 // real*real, imag*real
- xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
-
-
- .endm
-
- .macro KERNEL1x1_E2
-
-
- xvmaddadp vs32, vs8, vs20 // real*real, imag*real
- xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
-
-
- .endm
-
- .macro KERNEL1x1_SUBI1
-
- lxvd2x vs0, o0, AO // load real,imag from A
-
- addi AO, AO, 16
-
- lxvd2x vs16, o0, BO // load real part from B
- lxvd2x vs17, o16, BO // load imag part from B
-
- addi BO, BO, 32
-
- xvmuldp vs32, vs0, vs16 // real*real, imag*real
- xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
-
-
- .endm
-
- .macro KERNEL1x1_SUB1
-
- lxvd2x vs0, o0, AO // load real,imag from A
-
- addi AO, AO, 16
-
- lxvd2x vs16, o0, BO // load real part from B
- lxvd2x vs17, o16, BO // load imag part from B
-
- addi BO, BO, 32
-
- xvmaddadp vs32, vs0, vs16 // real*real, imag*real
- xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
-
-
- .endm
-
- .macro SAVE1x1
-
-
- mr T1, CO
-
- #ifndef TRMMKERNEL
-
- lxvd2x vs16, o0, T1
-
- #endif
-
-
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
-
- XSFADD_R1 vs0, vs0, vs32 // realA*realB
- XSFADD_R2 vs0, vs0, vs33 // imagA*imagB
-
- xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB
- xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
-
- XSFADD_I1 vs1, vs1, vs32 // imagA*realB
- XSFADD_I2 vs1, vs1, vs33 // realA*imagB
-
- xsmuldp vs4, vs0, alpha_r // real*alpha_r
- xsmuldp vs5, vs1, alpha_i // imag*alpha_i
- xsmuldp vs6, vs0, alpha_i // real*alpha_i
- xsmuldp vs7, vs1, alpha_r // imag*alpha_r
-
- xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
- xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
- xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
-
-
- #ifndef TRMMKERNEL
-
- xvadddp vs8, vs8, vs16
-
- #endif
-
- stxvd2x vs8, o0, T1
-
- add T1, T1, LDC
- addi CO, CO, 16
-
- .endm
-
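- /* End-to-end sketch of what the simplest tile (LOAD1x1_1 / KERNEL1x1_SUB1 /
-  * SAVE1x1) computes for one element of C, written as illustrative C for the
-  * non-conjugated case without TRMMKERNEL (variable names are placeholders):
-  *
-  *     double rr = 0, ir = 0, ri = 0, ii = 0;
-  *     for (long k = 0; k < K; k++) {
-  *         rr += a[2*k]     * b_r[k];    // vs32, first slot
-  *         ir += a[2*k + 1] * b_r[k];    // vs32, second slot
-  *         ri += a[2*k]     * b_i[k];    // vs33, first slot
-  *         ii += a[2*k + 1] * b_i[k];    // vs33, second slot
-  *     }
-  *     c[0] += (rr - ii) * alpha_r - (ri + ir) * alpha_i;
-  *     c[1] += (rr - ii) * alpha_i + (ri + ir) * alpha_r;
-  */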
-
-
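- /* B copy/splat helpers: assuming BO points at packed complex doubles and BBO
-  * at the kernel's working copy of B, each (b_r, b_i) pair is expanded into two
-  * full vectors [b_r, b_r] and [b_i, b_i], which is what lets the kernels above
-  * use plain xvmuldp/xvmaddadp against [a_r, a_i] vectors.  Per input element,
-  * roughly (illustrative C, names are placeholders):
-  *
-  *     bbo[0] = b[0];  bbo[1] = b[0];   // real part duplicated
-  *     bbo[2] = b[1];  bbo[3] = b[1];   // imag part duplicated
-  *     b += 2;  bbo += 4;
-  */
-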
- .macro ZCOPYB_1x1
-
- lxvdsx vs4, o0, BO // b0_r
- lxvdsx vs5, o8, BO // b0_i
- addi BO, BO, 16
- stxvd2x vs4, o0, BBO
- stxvd2x vs5, o16, BBO
- addi BBO, BBO, 32
-
- .endm
-
-
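- // ZCOPYB_8x1 expands eight B elements per call (128 bytes read from BO, 256
- // bytes written to BBO); the loop driver presumably falls back to ZCOPYB_1x1
- // above for any remaining elements.
-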
- .macro ZCOPYB_8x1
-
- lxvd2x vs32, o0, BO
- lxvd2x vs33, o16, BO
- lxvd2x vs34, o32, BO
- lxvd2x vs35, o48, BO
- addi BO, BO, 64
-
- lxvd2x vs36, o0, BO
- lxvd2x vs37, o16, BO
- lxvd2x vs38, o32, BO
- lxvd2x vs39, o48, BO
- addi BO, BO, 64
-
- xxspltd vs40, vs32, 0
- xxspltd vs41, vs32, 1
- xxspltd vs42, vs33, 0
- xxspltd vs43, vs33, 1
- xxspltd vs44, vs34, 0
- xxspltd vs45, vs34, 1
- xxspltd vs46, vs35, 0
- xxspltd vs47, vs35, 1
-
- xxspltd vs48, vs36, 0
- xxspltd vs49, vs36, 1
- xxspltd vs50, vs37, 0
- xxspltd vs51, vs37, 1
- xxspltd vs52, vs38, 0
- xxspltd vs53, vs38, 1
- xxspltd vs54, vs39, 0
- xxspltd vs55, vs39, 1
-
- stxvd2x vs40, o0, BBO
- stxvd2x vs41, o16, BBO
- stxvd2x vs42, o32, BBO
- stxvd2x vs43, o48, BBO
- addi BBO, BBO, 64
-
- stxvd2x vs44, o0, BBO
- stxvd2x vs45, o16, BBO
- stxvd2x vs46, o32, BBO
- stxvd2x vs47, o48, BBO
- addi BBO, BBO, 64
-
- stxvd2x vs48, o0, BBO
- stxvd2x vs49, o16, BBO
- stxvd2x vs50, o32, BBO
- stxvd2x vs51, o48, BBO
- addi BBO, BBO, 64
-
- stxvd2x vs52, o0, BBO
- stxvd2x vs53, o16, BBO
- stxvd2x vs54, o32, BBO
- stxvd2x vs55, o48, BBO
- addi BBO, BBO, 64
-
- .endm
-
|