/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2016/03/05 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
-
- /*********************************************************************
- * Macros for N=4, M=16 *
- *********************************************************************/
-
/* LOAD4x16_1: preload the first 16x4 micro-tile operands for the
   software-pipelined loop. Loads 16 doubles of A into vs0-vs7
   (lxvd2x = 2 doubles per vector register) and splats 4 doubles of
   B into vs24-vs27 (lxvdsx duplicates one double across both lanes).
   Advances AO by 128 bytes and BO by 32 bytes. o8/o16/o24/o32/o48
   are registers preloaded with the byte offsets 8/16/24/32/48. */
#if defined(_AIX)
define(`LOAD4x16_1', `
#else
.macro LOAD4x16_1
#endif

    lxvd2x      vs0, 0, AO
    lxvd2x      vs1, o16, AO
    lxvd2x      vs2, o32, AO
    lxvd2x      vs3, o48, AO

    lxvdsx      vs24, 0, BO
    lxvdsx      vs25, o8, BO

    addi        AO, AO, 64

    lxvd2x      vs4, 0, AO
    lxvd2x      vs5, o16, AO
    lxvd2x      vs6, o32, AO
    lxvd2x      vs7, o48, AO

    lxvdsx      vs26, o16, BO
    lxvdsx      vs27, o24, BO

    addi        AO, AO, 64
    addi        BO, BO, 32

#if defined(_AIX)
')
#else
.endm
#endif

/* KERNEL4x16_I1: pipelined first iteration. Initializes the 32
   accumulators vs32-vs63 with xvmuldp (no prior accumulate) from the
   operands loaded by LOAD4x16_1, while interleaving the loads of the
   NEXT iterations operands into vs8-vs15 (A) and vs28-vs31 (B).
   Loads are spread between FMA groups to hide latency. */
#if defined(_AIX)
define(`KERNEL4x16_I1', `
#else
.macro KERNEL4x16_I1
#endif

    /* row 0 of B (vs24) times all 16 A values */
    xvmuldp     vs32, vs0, vs24
    xvmuldp     vs33, vs1, vs24
    xvmuldp     vs34, vs2, vs24
    xvmuldp     vs35, vs3, vs24

    lxvd2x      vs8, 0, AO
    lxvd2x      vs9, o16, AO

    xvmuldp     vs36, vs4, vs24
    xvmuldp     vs37, vs5, vs24
    xvmuldp     vs38, vs6, vs24
    xvmuldp     vs39, vs7, vs24

    lxvdsx      vs28, 0, BO
    lxvdsx      vs29, o8, BO

    /* row 1 of B (vs25) */
    xvmuldp     vs40, vs0, vs25
    xvmuldp     vs41, vs1, vs25
    xvmuldp     vs42, vs2, vs25
    xvmuldp     vs43, vs3, vs25

    lxvd2x      vs10, o32, AO
    lxvd2x      vs11, o48, AO

    xvmuldp     vs44, vs4, vs25
    xvmuldp     vs45, vs5, vs25
    xvmuldp     vs46, vs6, vs25
    xvmuldp     vs47, vs7, vs25

    addi        AO, AO, 64

    /* row 2 of B (vs26) */
    xvmuldp     vs48, vs0, vs26
    xvmuldp     vs49, vs1, vs26
    xvmuldp     vs50, vs2, vs26
    xvmuldp     vs51, vs3, vs26

    lxvd2x      vs12, 0, AO
    lxvd2x      vs13, o16, AO

    xvmuldp     vs52, vs4, vs26
    xvmuldp     vs53, vs5, vs26
    xvmuldp     vs54, vs6, vs26
    xvmuldp     vs55, vs7, vs26

    lxvd2x      vs14, o32, AO
    lxvd2x      vs15, o48, AO

    /* row 3 of B (vs27) */
    xvmuldp     vs56, vs0, vs27
    xvmuldp     vs57, vs1, vs27
    xvmuldp     vs58, vs2, vs27
    xvmuldp     vs59, vs3, vs27

    lxvdsx      vs30, o16, BO
    lxvdsx      vs31, o24, BO

    xvmuldp     vs60, vs4, vs27
    xvmuldp     vs61, vs5, vs27
    xvmuldp     vs62, vs6, vs27
    xvmuldp     vs63, vs7, vs27

    addi        AO, AO, 64
    addi        BO, BO, 32

#if defined(_AIX)
')
#else
.endm
#endif

/* KERNEL4x16_1: pipelined even iteration. Accumulates (xvmaddadp)
   into vs32-vs63 from the current operands vs0-vs7 / vs24-vs27 while
   loading the odd-iteration operands into vs8-vs15 / vs28-vs31. */
#if defined(_AIX)
define(`KERNEL4x16_1', `
#else
.macro KERNEL4x16_1
#endif

    xvmaddadp   vs32, vs0, vs24
    xvmaddadp   vs33, vs1, vs24
    xvmaddadp   vs34, vs2, vs24
    xvmaddadp   vs35, vs3, vs24

    lxvd2x      vs8, 0, AO
    lxvd2x      vs9, o16, AO

    xvmaddadp   vs36, vs4, vs24
    xvmaddadp   vs37, vs5, vs24
    xvmaddadp   vs38, vs6, vs24
    xvmaddadp   vs39, vs7, vs24

    lxvdsx      vs28, 0, BO
    lxvdsx      vs29, o8, BO

    xvmaddadp   vs40, vs0, vs25
    xvmaddadp   vs41, vs1, vs25
    xvmaddadp   vs42, vs2, vs25
    xvmaddadp   vs43, vs3, vs25

    lxvd2x      vs10, o32, AO
    lxvd2x      vs11, o48, AO

    xvmaddadp   vs44, vs4, vs25
    xvmaddadp   vs45, vs5, vs25
    xvmaddadp   vs46, vs6, vs25
    xvmaddadp   vs47, vs7, vs25

    addi        AO, AO, 64

    xvmaddadp   vs48, vs0, vs26
    xvmaddadp   vs49, vs1, vs26
    xvmaddadp   vs50, vs2, vs26
    xvmaddadp   vs51, vs3, vs26

    lxvd2x      vs12, 0, AO
    lxvd2x      vs13, o16, AO

    xvmaddadp   vs52, vs4, vs26
    xvmaddadp   vs53, vs5, vs26
    xvmaddadp   vs54, vs6, vs26
    xvmaddadp   vs55, vs7, vs26

    lxvd2x      vs14, o32, AO
    lxvd2x      vs15, o48, AO

    xvmaddadp   vs56, vs0, vs27
    xvmaddadp   vs57, vs1, vs27
    xvmaddadp   vs58, vs2, vs27
    xvmaddadp   vs59, vs3, vs27


    lxvdsx      vs30, o16, BO
    lxvdsx      vs31, o24, BO

    xvmaddadp   vs60, vs4, vs27
    xvmaddadp   vs61, vs5, vs27
    xvmaddadp   vs62, vs6, vs27
    xvmaddadp   vs63, vs7, vs27

    addi        AO, AO, 64
    addi        BO, BO, 32

#if defined(_AIX)
')
#else
.endm
#endif

/* KERNEL4x16_2: pipelined odd iteration. Mirror of KERNEL4x16_1:
   accumulates from vs8-vs15 / vs28-vs31 while reloading vs0-vs7 /
   vs24-vs27 for the next even iteration. */
#if defined(_AIX)
define(`KERNEL4x16_2', `
#else
.macro KERNEL4x16_2
#endif

    xvmaddadp   vs32, vs8, vs28
    xvmaddadp   vs33, vs9, vs28
    xvmaddadp   vs34, vs10, vs28
    xvmaddadp   vs35, vs11, vs28

    lxvd2x      vs0, 0, AO
    lxvd2x      vs1, o16, AO

    xvmaddadp   vs36, vs12, vs28
    xvmaddadp   vs37, vs13, vs28
    xvmaddadp   vs38, vs14, vs28
    xvmaddadp   vs39, vs15, vs28

    lxvdsx      vs24, 0, BO
    lxvdsx      vs25, o8, BO

    xvmaddadp   vs40, vs8, vs29
    xvmaddadp   vs41, vs9, vs29
    xvmaddadp   vs42, vs10, vs29
    xvmaddadp   vs43, vs11, vs29

    lxvd2x      vs2, o32, AO
    lxvd2x      vs3, o48, AO

    xvmaddadp   vs44, vs12, vs29
    xvmaddadp   vs45, vs13, vs29
    xvmaddadp   vs46, vs14, vs29
    xvmaddadp   vs47, vs15, vs29

    addi        AO, AO, 64

    xvmaddadp   vs48, vs8, vs30
    xvmaddadp   vs49, vs9, vs30
    xvmaddadp   vs50, vs10, vs30
    xvmaddadp   vs51, vs11, vs30

    lxvd2x      vs4, 0, AO
    lxvd2x      vs5, o16, AO

    xvmaddadp   vs52, vs12, vs30
    xvmaddadp   vs53, vs13, vs30
    xvmaddadp   vs54, vs14, vs30
    xvmaddadp   vs55, vs15, vs30

    lxvd2x      vs6, o32, AO
    lxvd2x      vs7, o48, AO

    xvmaddadp   vs56, vs8, vs31
    xvmaddadp   vs57, vs9, vs31
    xvmaddadp   vs58, vs10, vs31
    xvmaddadp   vs59, vs11, vs31

    lxvdsx      vs26, o16, BO
    lxvdsx      vs27, o24, BO

    xvmaddadp   vs60, vs12, vs31
    xvmaddadp   vs61, vs13, vs31
    xvmaddadp   vs62, vs14, vs31
    xvmaddadp   vs63, vs15, vs31

    addi        AO, AO, 64
    addi        BO, BO, 32

#if defined(_AIX)
')
#else
.endm
#endif

/* KERNEL4x16_E2: pipeline drain. Consumes the last prefetched odd
   operands (vs8-vs15 / vs28-vs31) with no further loads and no
   pointer updates. */
#if defined(_AIX)
define(`KERNEL4x16_E2', `
#else
.macro KERNEL4x16_E2
#endif


    xvmaddadp   vs32, vs8, vs28
    xvmaddadp   vs33, vs9, vs28
    xvmaddadp   vs34, vs10, vs28
    xvmaddadp   vs35, vs11, vs28
    xvmaddadp   vs36, vs12, vs28
    xvmaddadp   vs37, vs13, vs28
    xvmaddadp   vs38, vs14, vs28
    xvmaddadp   vs39, vs15, vs28

    xvmaddadp   vs40, vs8, vs29
    xvmaddadp   vs41, vs9, vs29
    xvmaddadp   vs42, vs10, vs29
    xvmaddadp   vs43, vs11, vs29
    xvmaddadp   vs44, vs12, vs29
    xvmaddadp   vs45, vs13, vs29
    xvmaddadp   vs46, vs14, vs29
    xvmaddadp   vs47, vs15, vs29

    xvmaddadp   vs48, vs8, vs30
    xvmaddadp   vs49, vs9, vs30
    xvmaddadp   vs50, vs10, vs30
    xvmaddadp   vs51, vs11, vs30
    xvmaddadp   vs52, vs12, vs30
    xvmaddadp   vs53, vs13, vs30
    xvmaddadp   vs54, vs14, vs30
    xvmaddadp   vs55, vs15, vs30

    xvmaddadp   vs56, vs8, vs31
    xvmaddadp   vs57, vs9, vs31
    xvmaddadp   vs58, vs10, vs31
    xvmaddadp   vs59, vs11, vs31
    xvmaddadp   vs60, vs12, vs31
    xvmaddadp   vs61, vs13, vs31
    xvmaddadp   vs62, vs14, vs31
    xvmaddadp   vs63, vs15, vs31

#if defined(_AIX)
')
#else
.endm
#endif

/* KERNEL4x16_SUBI1: standalone first iteration for the remainder
   loop (no software pipelining). Loads one 16x4 operand set,
   advances AO/BO, and initializes the accumulators with xvmuldp. */
#if defined(_AIX)
define(`KERNEL4x16_SUBI1', `
#else
.macro KERNEL4x16_SUBI1
#endif

    lxvd2x      vs0, 0, AO
    lxvd2x      vs1, o16, AO
    lxvd2x      vs2, o32, AO
    lxvd2x      vs3, o48, AO

    lxvdsx      vs24, 0, BO
    lxvdsx      vs25, o8, BO
    lxvdsx      vs26, o16, BO
    lxvdsx      vs27, o24, BO

    addi        AO, AO, 64
    addi        BO, BO, 32

    lxvd2x      vs4, 0, AO
    lxvd2x      vs5, o16, AO
    lxvd2x      vs6, o32, AO
    lxvd2x      vs7, o48, AO

    addi        AO, AO, 64


    xvmuldp     vs32, vs0, vs24
    xvmuldp     vs33, vs1, vs24
    xvmuldp     vs34, vs2, vs24
    xvmuldp     vs35, vs3, vs24
    xvmuldp     vs36, vs4, vs24
    xvmuldp     vs37, vs5, vs24
    xvmuldp     vs38, vs6, vs24
    xvmuldp     vs39, vs7, vs24

    xvmuldp     vs40, vs0, vs25
    xvmuldp     vs41, vs1, vs25
    xvmuldp     vs42, vs2, vs25
    xvmuldp     vs43, vs3, vs25
    xvmuldp     vs44, vs4, vs25
    xvmuldp     vs45, vs5, vs25
    xvmuldp     vs46, vs6, vs25
    xvmuldp     vs47, vs7, vs25

    xvmuldp     vs48, vs0, vs26
    xvmuldp     vs49, vs1, vs26
    xvmuldp     vs50, vs2, vs26
    xvmuldp     vs51, vs3, vs26
    xvmuldp     vs52, vs4, vs26
    xvmuldp     vs53, vs5, vs26
    xvmuldp     vs54, vs6, vs26
    xvmuldp     vs55, vs7, vs26

    xvmuldp     vs56, vs0, vs27
    xvmuldp     vs57, vs1, vs27
    xvmuldp     vs58, vs2, vs27
    xvmuldp     vs59, vs3, vs27
    xvmuldp     vs60, vs4, vs27
    xvmuldp     vs61, vs5, vs27
    xvmuldp     vs62, vs6, vs27
    xvmuldp     vs63, vs7, vs27

#if defined(_AIX)
')
#else
.endm
#endif

/* KERNEL4x16_SUB1: standalone accumulate iteration for the remainder
   loop. Same loads and pointer updates as SUBI1 but accumulates with
   xvmaddadp instead of overwriting. */
#if defined(_AIX)
define(`KERNEL4x16_SUB1', `
#else
.macro KERNEL4x16_SUB1
#endif

    lxvd2x      vs0, 0, AO
    lxvd2x      vs1, o16, AO
    lxvd2x      vs2, o32, AO
    lxvd2x      vs3, o48, AO

    lxvdsx      vs24, 0, BO
    lxvdsx      vs25, o8, BO
    lxvdsx      vs26, o16, BO
    lxvdsx      vs27, o24, BO

    addi        AO, AO, 64
    addi        BO, BO, 32

    lxvd2x      vs4, 0, AO
    lxvd2x      vs5, o16, AO
    lxvd2x      vs6, o32, AO
    lxvd2x      vs7, o48, AO

    addi        AO, AO, 64


    xvmaddadp   vs32, vs0, vs24
    xvmaddadp   vs33, vs1, vs24
    xvmaddadp   vs34, vs2, vs24
    xvmaddadp   vs35, vs3, vs24
    xvmaddadp   vs36, vs4, vs24
    xvmaddadp   vs37, vs5, vs24
    xvmaddadp   vs38, vs6, vs24
    xvmaddadp   vs39, vs7, vs24

    xvmaddadp   vs40, vs0, vs25
    xvmaddadp   vs41, vs1, vs25
    xvmaddadp   vs42, vs2, vs25
    xvmaddadp   vs43, vs3, vs25
    xvmaddadp   vs44, vs4, vs25
    xvmaddadp   vs45, vs5, vs25
    xvmaddadp   vs46, vs6, vs25
    xvmaddadp   vs47, vs7, vs25

    xvmaddadp   vs48, vs0, vs26
    xvmaddadp   vs49, vs1, vs26
    xvmaddadp   vs50, vs2, vs26
    xvmaddadp   vs51, vs3, vs26
    xvmaddadp   vs52, vs4, vs26
    xvmaddadp   vs53, vs5, vs26
    xvmaddadp   vs54, vs6, vs26
    xvmaddadp   vs55, vs7, vs26

    xvmaddadp   vs56, vs0, vs27
    xvmaddadp   vs57, vs1, vs27
    xvmaddadp   vs58, vs2, vs27
    xvmaddadp   vs59, vs3, vs27
    xvmaddadp   vs60, vs4, vs27
    xvmaddadp   vs61, vs5, vs27
    xvmaddadp   vs62, vs6, vs27
    xvmaddadp   vs63, vs7, vs27

#if defined(_AIX)
')
#else
.endm
#endif

/* SAVE4x16: write the 16x4 result tile to C. For each of the 4 rows
   (stepping by LDC) the 16 columns are handled as two 64 byte halves
   through T1 and T2. Non TRMM path: C = C + alpha * acc (load C,
   xvmaddadp by alpha_r, store). TRMM path: C = alpha * acc (no load,
   xvmuldp). Finally advances CO by 128 bytes (16 doubles). */
#if defined(_AIX)
define(`SAVE4x16', `
#else
.macro SAVE4x16
#endif

    mr          T1, CO
    addi        T2, T1, 64

/* row 0: accumulators vs32-vs39 */
#ifndef TRMMKERNEL
    lxvd2x      vs0, 0, T1
    lxvd2x      vs1, o16, T1
    lxvd2x      vs2, o32, T1
    lxvd2x      vs3, o48, T1

    lxvd2x      vs4, 0, T2
    lxvd2x      vs5, o16, T2
    lxvd2x      vs6, o32, T2
    lxvd2x      vs7, o48, T2
#endif

#ifndef TRMMKERNEL
    xvmaddadp   vs0, vs32, alpha_r
    xvmaddadp   vs1, vs33, alpha_r
    xvmaddadp   vs2, vs34, alpha_r
    xvmaddadp   vs3, vs35, alpha_r
    xvmaddadp   vs4, vs36, alpha_r
    xvmaddadp   vs5, vs37, alpha_r
    xvmaddadp   vs6, vs38, alpha_r
    xvmaddadp   vs7, vs39, alpha_r
#else
    xvmuldp     vs0, vs32, alpha_r
    xvmuldp     vs1, vs33, alpha_r
    xvmuldp     vs2, vs34, alpha_r
    xvmuldp     vs3, vs35, alpha_r
    xvmuldp     vs4, vs36, alpha_r
    xvmuldp     vs5, vs37, alpha_r
    xvmuldp     vs6, vs38, alpha_r
    xvmuldp     vs7, vs39, alpha_r
#endif

    stxvd2x     vs0, 0, T1
    stxvd2x     vs1, o16, T1
    stxvd2x     vs2, o32, T1
    stxvd2x     vs3, o48, T1


    stxvd2x     vs4, 0, T2
    stxvd2x     vs5, o16, T2
    stxvd2x     vs6, o32, T2
    stxvd2x     vs7, o48, T2

    add         T1, T1, LDC
    add         T2, T2, LDC

/* row 1: accumulators vs40-vs47 */
#ifndef TRMMKERNEL
    lxvd2x      vs8, 0, T1
    lxvd2x      vs9, o16, T1
    lxvd2x      vs10, o32, T1
    lxvd2x      vs11, o48, T1

    lxvd2x      vs12, 0, T2
    lxvd2x      vs13, o16, T2
    lxvd2x      vs14, o32, T2
    lxvd2x      vs15, o48, T2
#endif

#ifndef TRMMKERNEL
    xvmaddadp   vs8, vs40, alpha_r
    xvmaddadp   vs9, vs41, alpha_r
    xvmaddadp   vs10, vs42, alpha_r
    xvmaddadp   vs11, vs43, alpha_r
    xvmaddadp   vs12, vs44, alpha_r
    xvmaddadp   vs13, vs45, alpha_r
    xvmaddadp   vs14, vs46, alpha_r
    xvmaddadp   vs15, vs47, alpha_r
#else
    xvmuldp     vs8, vs40, alpha_r
    xvmuldp     vs9, vs41, alpha_r
    xvmuldp     vs10, vs42, alpha_r
    xvmuldp     vs11, vs43, alpha_r
    xvmuldp     vs12, vs44, alpha_r
    xvmuldp     vs13, vs45, alpha_r
    xvmuldp     vs14, vs46, alpha_r
    xvmuldp     vs15, vs47, alpha_r
#endif

    stxvd2x     vs8, 0, T1
    stxvd2x     vs9, o16, T1
    stxvd2x     vs10, o32, T1
    stxvd2x     vs11, o48, T1


    stxvd2x     vs12, 0, T2
    stxvd2x     vs13, o16, T2
    stxvd2x     vs14, o32, T2
    stxvd2x     vs15, o48, T2

    add         T1, T1, LDC
    add         T2, T2, LDC

/* row 2: accumulators vs48-vs55 */
#ifndef TRMMKERNEL
    lxvd2x      vs0, 0, T1
    lxvd2x      vs1, o16, T1
    lxvd2x      vs2, o32, T1
    lxvd2x      vs3, o48, T1

    lxvd2x      vs4, 0, T2
    lxvd2x      vs5, o16, T2
    lxvd2x      vs6, o32, T2
    lxvd2x      vs7, o48, T2
#endif

#ifndef TRMMKERNEL
    xvmaddadp   vs0, vs48, alpha_r
    xvmaddadp   vs1, vs49, alpha_r
    xvmaddadp   vs2, vs50, alpha_r
    xvmaddadp   vs3, vs51, alpha_r
    xvmaddadp   vs4, vs52, alpha_r
    xvmaddadp   vs5, vs53, alpha_r
    xvmaddadp   vs6, vs54, alpha_r
    xvmaddadp   vs7, vs55, alpha_r
#else
    xvmuldp     vs0, vs48, alpha_r
    xvmuldp     vs1, vs49, alpha_r
    xvmuldp     vs2, vs50, alpha_r
    xvmuldp     vs3, vs51, alpha_r
    xvmuldp     vs4, vs52, alpha_r
    xvmuldp     vs5, vs53, alpha_r
    xvmuldp     vs6, vs54, alpha_r
    xvmuldp     vs7, vs55, alpha_r
#endif

    stxvd2x     vs0, 0, T1
    stxvd2x     vs1, o16, T1
    stxvd2x     vs2, o32, T1
    stxvd2x     vs3, o48, T1


    stxvd2x     vs4, 0, T2
    stxvd2x     vs5, o16, T2
    stxvd2x     vs6, o32, T2
    stxvd2x     vs7, o48, T2

    add         T1, T1, LDC
    add         T2, T2, LDC

/* row 3: accumulators vs56-vs63 */
#ifndef TRMMKERNEL
    lxvd2x      vs8, 0, T1
    lxvd2x      vs9, o16, T1
    lxvd2x      vs10, o32, T1
    lxvd2x      vs11, o48, T1

    lxvd2x      vs12, 0, T2
    lxvd2x      vs13, o16, T2
    lxvd2x      vs14, o32, T2
    lxvd2x      vs15, o48, T2
#endif

#ifndef TRMMKERNEL
    xvmaddadp   vs8, vs56, alpha_r
    xvmaddadp   vs9, vs57, alpha_r
    xvmaddadp   vs10, vs58, alpha_r
    xvmaddadp   vs11, vs59, alpha_r
    xvmaddadp   vs12, vs60, alpha_r
    xvmaddadp   vs13, vs61, alpha_r
    xvmaddadp   vs14, vs62, alpha_r
    xvmaddadp   vs15, vs63, alpha_r
#else
    xvmuldp     vs8, vs56, alpha_r
    xvmuldp     vs9, vs57, alpha_r
    xvmuldp     vs10, vs58, alpha_r
    xvmuldp     vs11, vs59, alpha_r
    xvmuldp     vs12, vs60, alpha_r
    xvmuldp     vs13, vs61, alpha_r
    xvmuldp     vs14, vs62, alpha_r
    xvmuldp     vs15, vs63, alpha_r
#endif

    stxvd2x     vs8, 0, T1
    stxvd2x     vs9, o16, T1
    stxvd2x     vs10, o32, T1
    stxvd2x     vs11, o48, T1


    stxvd2x     vs12, 0, T2
    stxvd2x     vs13, o16, T2
    stxvd2x     vs14, o32, T2
    stxvd2x     vs15, o48, T2

    addi        CO, CO, 128

#if defined(_AIX)
')
#else
.endm
#endif
-
- /*********************************************************************
- * Macros for N=4, M=8 *
- *********************************************************************/
-
/* LOAD4x8_1: preload the first 8x4 operands for the pipelined loop.
   8 doubles of A into vs0-vs3 and 4 splatted doubles of B into
   vs24-vs27. Advances AO by 64 and BO by 32 bytes. */
#if defined(_AIX)
define(`LOAD4x8_1', `
#else
.macro LOAD4x8_1
#endif

    lxvd2x      vs0, 0, AO
    lxvd2x      vs1, o16, AO

    lxvdsx      vs24, 0, BO
    lxvdsx      vs25, o8, BO

    lxvd2x      vs2, o32, AO
    lxvd2x      vs3, o48, AO

    lxvdsx      vs26, o16, BO
    lxvdsx      vs27, o24, BO

    addi        AO, AO, 64
    addi        BO, BO, 32

#if defined(_AIX)
')
#else
.endm
#endif

/* KERNEL4x8_I1: pipelined first iteration. Initializes accumulators
   vs32-35 / vs40-43 / vs48-51 / vs56-59 (one group per B row) with
   xvmuldp while loading the next operands into vs8-vs11 / vs28-vs31. */
#if defined(_AIX)
define(`KERNEL4x8_I1', `
#else
.macro KERNEL4x8_I1
#endif

    xvmuldp     vs32, vs0, vs24
    xvmuldp     vs33, vs1, vs24
    xvmuldp     vs34, vs2, vs24
    xvmuldp     vs35, vs3, vs24

    lxvd2x      vs8, 0, AO
    lxvd2x      vs9, o16, AO

    xvmuldp     vs40, vs0, vs25
    xvmuldp     vs41, vs1, vs25

    lxvdsx      vs28, 0, BO
    lxvdsx      vs29, o8, BO

    xvmuldp     vs42, vs2, vs25
    xvmuldp     vs43, vs3, vs25

    xvmuldp     vs48, vs0, vs26
    xvmuldp     vs49, vs1, vs26

    lxvd2x      vs10, o32, AO
    lxvd2x      vs11, o48, AO

    xvmuldp     vs50, vs2, vs26
    xvmuldp     vs51, vs3, vs26

    lxvdsx      vs30, o16, BO
    lxvdsx      vs31, o24, BO

    xvmuldp     vs56, vs0, vs27
    xvmuldp     vs57, vs1, vs27
    xvmuldp     vs58, vs2, vs27
    xvmuldp     vs59, vs3, vs27

    addi        AO, AO, 64
    addi        BO, BO, 32

#if defined(_AIX)
')
#else
.endm
#endif

/* KERNEL4x8_1: pipelined even iteration. Accumulates from vs0-vs3 /
   vs24-vs27 while loading odd-iteration operands into vs8-vs11 /
   vs28-vs31. */
#if defined(_AIX)
define(`KERNEL4x8_1', `
#else
.macro KERNEL4x8_1
#endif

    xvmaddadp   vs32, vs0, vs24
    xvmaddadp   vs33, vs1, vs24
    xvmaddadp   vs34, vs2, vs24
    xvmaddadp   vs35, vs3, vs24

    lxvd2x      vs8, 0, AO
    lxvd2x      vs9, o16, AO

    xvmaddadp   vs40, vs0, vs25
    xvmaddadp   vs41, vs1, vs25
    xvmaddadp   vs42, vs2, vs25
    xvmaddadp   vs43, vs3, vs25

    lxvdsx      vs28, 0, BO
    lxvdsx      vs29, o8, BO

    xvmaddadp   vs48, vs0, vs26
    xvmaddadp   vs49, vs1, vs26

    lxvd2x      vs10, o32, AO
    lxvd2x      vs11, o48, AO

    xvmaddadp   vs50, vs2, vs26
    xvmaddadp   vs51, vs3, vs26

    lxvdsx      vs30, o16, BO
    lxvdsx      vs31, o24, BO

    xvmaddadp   vs56, vs0, vs27
    xvmaddadp   vs57, vs1, vs27
    xvmaddadp   vs58, vs2, vs27
    xvmaddadp   vs59, vs3, vs27

    addi        AO, AO, 64
    addi        BO, BO, 32

#if defined(_AIX)
')
#else
.endm
#endif

/* KERNEL4x8_2: pipelined odd iteration. Accumulates from vs8-vs11 /
   vs28-vs31 while reloading vs0-vs3 / vs24-vs27. */
#if defined(_AIX)
define(`KERNEL4x8_2', `
#else
.macro KERNEL4x8_2
#endif

    xvmaddadp   vs32, vs8, vs28
    xvmaddadp   vs33, vs9, vs28
    xvmaddadp   vs34, vs10, vs28
    xvmaddadp   vs35, vs11, vs28

    lxvd2x      vs0, 0, AO
    lxvd2x      vs1, o16, AO

    xvmaddadp   vs40, vs8, vs29
    xvmaddadp   vs41, vs9, vs29
    xvmaddadp   vs42, vs10, vs29
    xvmaddadp   vs43, vs11, vs29

    lxvdsx      vs24, 0, BO
    lxvdsx      vs25, o8, BO

    xvmaddadp   vs48, vs8, vs30
    xvmaddadp   vs49, vs9, vs30

    lxvd2x      vs2, o32, AO
    lxvd2x      vs3, o48, AO

    xvmaddadp   vs50, vs10, vs30
    xvmaddadp   vs51, vs11, vs30

    lxvdsx      vs26, o16, BO
    lxvdsx      vs27, o24, BO

    xvmaddadp   vs56, vs8, vs31
    xvmaddadp   vs57, vs9, vs31
    xvmaddadp   vs58, vs10, vs31
    xvmaddadp   vs59, vs11, vs31

    addi        AO, AO, 64
    addi        BO, BO, 32

#if defined(_AIX)
')
#else
.endm
#endif

/* KERNEL4x8_E2: pipeline drain. Consumes the last prefetched
   operands with no loads or pointer updates. */
#if defined(_AIX)
define(`KERNEL4x8_E2', `
#else
.macro KERNEL4x8_E2
#endif


    xvmaddadp   vs32, vs8, vs28
    xvmaddadp   vs33, vs9, vs28
    xvmaddadp   vs34, vs10, vs28
    xvmaddadp   vs35, vs11, vs28

    xvmaddadp   vs40, vs8, vs29
    xvmaddadp   vs41, vs9, vs29
    xvmaddadp   vs42, vs10, vs29
    xvmaddadp   vs43, vs11, vs29

    xvmaddadp   vs48, vs8, vs30
    xvmaddadp   vs49, vs9, vs30
    xvmaddadp   vs50, vs10, vs30
    xvmaddadp   vs51, vs11, vs30

    xvmaddadp   vs56, vs8, vs31
    xvmaddadp   vs57, vs9, vs31
    xvmaddadp   vs58, vs10, vs31
    xvmaddadp   vs59, vs11, vs31

#if defined(_AIX)
')
#else
.endm
#endif

/* KERNEL4x8_SUBI1: standalone first iteration for the remainder
   loop. Loads one 8x4 operand set, advances AO/BO, initializes the
   accumulators with xvmuldp. */
#if defined(_AIX)
define(`KERNEL4x8_SUBI1', `
#else
.macro KERNEL4x8_SUBI1
#endif

    lxvd2x      vs0, 0, AO
    lxvd2x      vs1, o16, AO
    lxvd2x      vs2, o32, AO
    lxvd2x      vs3, o48, AO

    lxvdsx      vs24, 0, BO
    lxvdsx      vs25, o8, BO
    lxvdsx      vs26, o16, BO
    lxvdsx      vs27, o24, BO

    addi        AO, AO, 64
    addi        BO, BO, 32


    xvmuldp     vs32, vs0, vs24
    xvmuldp     vs33, vs1, vs24
    xvmuldp     vs34, vs2, vs24
    xvmuldp     vs35, vs3, vs24

    xvmuldp     vs40, vs0, vs25
    xvmuldp     vs41, vs1, vs25
    xvmuldp     vs42, vs2, vs25
    xvmuldp     vs43, vs3, vs25

    xvmuldp     vs48, vs0, vs26
    xvmuldp     vs49, vs1, vs26
    xvmuldp     vs50, vs2, vs26
    xvmuldp     vs51, vs3, vs26

    xvmuldp     vs56, vs0, vs27
    xvmuldp     vs57, vs1, vs27
    xvmuldp     vs58, vs2, vs27
    xvmuldp     vs59, vs3, vs27

#if defined(_AIX)
')
#else
.endm
#endif

/* KERNEL4x8_SUB1: standalone accumulate iteration for the remainder
   loop (xvmaddadp instead of xvmuldp). */
#if defined(_AIX)
define(`KERNEL4x8_SUB1', `
#else
.macro KERNEL4x8_SUB1
#endif

    lxvd2x      vs0, 0, AO
    lxvd2x      vs1, o16, AO
    lxvd2x      vs2, o32, AO
    lxvd2x      vs3, o48, AO

    lxvdsx      vs24, 0, BO
    lxvdsx      vs25, o8, BO
    lxvdsx      vs26, o16, BO
    lxvdsx      vs27, o24, BO

    addi        AO, AO, 64
    addi        BO, BO, 32


    xvmaddadp   vs32, vs0, vs24
    xvmaddadp   vs33, vs1, vs24
    xvmaddadp   vs34, vs2, vs24
    xvmaddadp   vs35, vs3, vs24

    xvmaddadp   vs40, vs0, vs25
    xvmaddadp   vs41, vs1, vs25
    xvmaddadp   vs42, vs2, vs25
    xvmaddadp   vs43, vs3, vs25

    xvmaddadp   vs48, vs0, vs26
    xvmaddadp   vs49, vs1, vs26
    xvmaddadp   vs50, vs2, vs26
    xvmaddadp   vs51, vs3, vs26

    xvmaddadp   vs56, vs0, vs27
    xvmaddadp   vs57, vs1, vs27
    xvmaddadp   vs58, vs2, vs27
    xvmaddadp   vs59, vs3, vs27

#if defined(_AIX)
')
#else
.endm
#endif

/* SAVE4x8: write the 8x4 result tile to C, one row (64 bytes) per
   LDC step. Non TRMM: C = C + alpha * acc. TRMM: C = alpha * acc.
   Advances CO by 64 bytes (8 doubles). */
#if defined(_AIX)
define(`SAVE4x8', `
#else
.macro SAVE4x8
#endif

    mr          T1, CO

/* row 0: vs32-vs35 */
#ifndef TRMMKERNEL
    lxvd2x      vs0, 0, T1
    lxvd2x      vs1, o16, T1
    lxvd2x      vs2, o32, T1
    lxvd2x      vs3, o48, T1
#endif

#ifndef TRMMKERNEL
    xvmaddadp   vs0, vs32, alpha_r
    xvmaddadp   vs1, vs33, alpha_r
    xvmaddadp   vs2, vs34, alpha_r
    xvmaddadp   vs3, vs35, alpha_r
#else
    xvmuldp     vs0, vs32, alpha_r
    xvmuldp     vs1, vs33, alpha_r
    xvmuldp     vs2, vs34, alpha_r
    xvmuldp     vs3, vs35, alpha_r
#endif

    stxvd2x     vs0, 0, T1
    stxvd2x     vs1, o16, T1
    stxvd2x     vs2, o32, T1
    stxvd2x     vs3, o48, T1

    add         T1, T1, LDC

/* row 1: vs40-vs43 */
#ifndef TRMMKERNEL
    lxvd2x      vs8, 0, T1
    lxvd2x      vs9, o16, T1
    lxvd2x      vs10, o32, T1
    lxvd2x      vs11, o48, T1
#endif

#ifndef TRMMKERNEL
    xvmaddadp   vs8, vs40, alpha_r
    xvmaddadp   vs9, vs41, alpha_r
    xvmaddadp   vs10, vs42, alpha_r
    xvmaddadp   vs11, vs43, alpha_r
#else
    xvmuldp     vs8, vs40, alpha_r
    xvmuldp     vs9, vs41, alpha_r
    xvmuldp     vs10, vs42, alpha_r
    xvmuldp     vs11, vs43, alpha_r
#endif

    stxvd2x     vs8, 0, T1
    stxvd2x     vs9, o16, T1
    stxvd2x     vs10, o32, T1
    stxvd2x     vs11, o48, T1

    add         T1, T1, LDC

/* row 2: vs48-vs51 */
#ifndef TRMMKERNEL
    lxvd2x      vs0, 0, T1
    lxvd2x      vs1, o16, T1
    lxvd2x      vs2, o32, T1
    lxvd2x      vs3, o48, T1
#endif

#ifndef TRMMKERNEL
    xvmaddadp   vs0, vs48, alpha_r
    xvmaddadp   vs1, vs49, alpha_r
    xvmaddadp   vs2, vs50, alpha_r
    xvmaddadp   vs3, vs51, alpha_r
#else
    xvmuldp     vs0, vs48, alpha_r
    xvmuldp     vs1, vs49, alpha_r
    xvmuldp     vs2, vs50, alpha_r
    xvmuldp     vs3, vs51, alpha_r
#endif

    stxvd2x     vs0, 0, T1
    stxvd2x     vs1, o16, T1
    stxvd2x     vs2, o32, T1
    stxvd2x     vs3, o48, T1

    add         T1, T1, LDC

/* row 3: vs56-vs59 */
#ifndef TRMMKERNEL
    lxvd2x      vs8, 0, T1
    lxvd2x      vs9, o16, T1
    lxvd2x      vs10, o32, T1
    lxvd2x      vs11, o48, T1
#endif

#ifndef TRMMKERNEL
    xvmaddadp   vs8, vs56, alpha_r
    xvmaddadp   vs9, vs57, alpha_r
    xvmaddadp   vs10, vs58, alpha_r
    xvmaddadp   vs11, vs59, alpha_r
#else
    xvmuldp     vs8, vs56, alpha_r
    xvmuldp     vs9, vs57, alpha_r
    xvmuldp     vs10, vs58, alpha_r
    xvmuldp     vs11, vs59, alpha_r
#endif

    stxvd2x     vs8, 0, T1
    stxvd2x     vs9, o16, T1
    stxvd2x     vs10, o32, T1
    stxvd2x     vs11, o48, T1

    addi        CO, CO, 64

#if defined(_AIX)
')
#else
.endm
#endif
-
- /*********************************************************************
- * Macros for N=4, M=4 *
- *********************************************************************/
-
/* LOAD4x4_1: preload the first 4x4 operands. 4 doubles of A into
   vs0-vs1 and 4 splatted doubles of B into vs24-vs27. Advances AO
   and BO by 32 bytes each. */
#if defined(_AIX)
define(`LOAD4x4_1', `
#else
.macro LOAD4x4_1
#endif

    lxvd2x      vs0, 0, AO
    lxvd2x      vs1, o16, AO

    lxvdsx      vs24, 0, BO
    lxvdsx      vs25, o8, BO
    lxvdsx      vs26, o16, BO
    lxvdsx      vs27, o24, BO

    addi        AO, AO, 32
    addi        BO, BO, 32

#if defined(_AIX)
')
#else
.endm
#endif

/* KERNEL4x4_I1: pipelined first iteration. Loads next operands into
   vs8-vs9 / vs28-vs31, then initializes accumulators vs32-33 /
   vs40-41 / vs48-49 / vs56-57 with xvmuldp. */
#if defined(_AIX)
define(`KERNEL4x4_I1', `
#else
.macro KERNEL4x4_I1
#endif

    lxvd2x      vs8, 0, AO
    lxvd2x      vs9, o16, AO

    lxvdsx      vs28, 0, BO
    lxvdsx      vs29, o8, BO
    lxvdsx      vs30, o16, BO
    lxvdsx      vs31, o24, BO

    addi        AO, AO, 32
    addi        BO, BO, 32


    xvmuldp     vs32, vs0, vs24
    xvmuldp     vs33, vs1, vs24

    xvmuldp     vs40, vs0, vs25
    xvmuldp     vs41, vs1, vs25

    xvmuldp     vs48, vs0, vs26
    xvmuldp     vs49, vs1, vs26

    xvmuldp     vs56, vs0, vs27
    xvmuldp     vs57, vs1, vs27

#if defined(_AIX)
')
#else
.endm
#endif

/* KERNEL4x4_1: pipelined even iteration. Loads odd-iteration
   operands, accumulates from vs0-vs1 / vs24-vs27. */
#if defined(_AIX)
define(`KERNEL4x4_1', `
#else
.macro KERNEL4x4_1
#endif

    lxvd2x      vs8, 0, AO
    lxvd2x      vs9, o16, AO

    lxvdsx      vs28, 0, BO
    lxvdsx      vs29, o8, BO
    lxvdsx      vs30, o16, BO
    lxvdsx      vs31, o24, BO

    addi        AO, AO, 32
    addi        BO, BO, 32


    xvmaddadp   vs32, vs0, vs24
    xvmaddadp   vs33, vs1, vs24

    xvmaddadp   vs40, vs0, vs25
    xvmaddadp   vs41, vs1, vs25

    xvmaddadp   vs48, vs0, vs26
    xvmaddadp   vs49, vs1, vs26

    xvmaddadp   vs56, vs0, vs27
    xvmaddadp   vs57, vs1, vs27

#if defined(_AIX)
')
#else
.endm
#endif

/* KERNEL4x4_2: pipelined odd iteration. Loads even-iteration
   operands, accumulates from vs8-vs9 / vs28-vs31. */
#if defined(_AIX)
define(`KERNEL4x4_2', `
#else
.macro KERNEL4x4_2
#endif

    lxvd2x      vs0, 0, AO
    lxvd2x      vs1, o16, AO

    lxvdsx      vs24, 0, BO
    lxvdsx      vs25, o8, BO
    lxvdsx      vs26, o16, BO
    lxvdsx      vs27, o24, BO

    addi        AO, AO, 32
    addi        BO, BO, 32


    xvmaddadp   vs32, vs8, vs28
    xvmaddadp   vs33, vs9, vs28

    xvmaddadp   vs40, vs8, vs29
    xvmaddadp   vs41, vs9, vs29

    xvmaddadp   vs48, vs8, vs30
    xvmaddadp   vs49, vs9, vs30

    xvmaddadp   vs56, vs8, vs31
    xvmaddadp   vs57, vs9, vs31

#if defined(_AIX)
')
#else
.endm
#endif

/* KERNEL4x4_E2: pipeline drain. Consumes the last prefetched
   operands with no loads or pointer updates. */
#if defined(_AIX)
define(`KERNEL4x4_E2', `
#else
.macro KERNEL4x4_E2
#endif


    xvmaddadp   vs32, vs8, vs28
    xvmaddadp   vs33, vs9, vs28

    xvmaddadp   vs40, vs8, vs29
    xvmaddadp   vs41, vs9, vs29

    xvmaddadp   vs48, vs8, vs30
    xvmaddadp   vs49, vs9, vs30

    xvmaddadp   vs56, vs8, vs31
    xvmaddadp   vs57, vs9, vs31

#if defined(_AIX)
')
#else
.endm
#endif

/* KERNEL4x4_SUBI1: standalone first iteration for the remainder
   loop. Initializes the accumulators with xvmuldp. */
#if defined(_AIX)
define(`KERNEL4x4_SUBI1', `
#else
.macro KERNEL4x4_SUBI1
#endif

    lxvd2x      vs0, 0, AO
    lxvd2x      vs1, o16, AO

    lxvdsx      vs24, 0, BO
    lxvdsx      vs25, o8, BO
    lxvdsx      vs26, o16, BO
    lxvdsx      vs27, o24, BO

    addi        AO, AO, 32
    addi        BO, BO, 32


    xvmuldp     vs32, vs0, vs24
    xvmuldp     vs33, vs1, vs24

    xvmuldp     vs40, vs0, vs25
    xvmuldp     vs41, vs1, vs25

    xvmuldp     vs48, vs0, vs26
    xvmuldp     vs49, vs1, vs26

    xvmuldp     vs56, vs0, vs27
    xvmuldp     vs57, vs1, vs27

#if defined(_AIX)
')
#else
.endm
#endif

/* KERNEL4x4_SUB1: standalone accumulate iteration for the remainder
   loop (xvmaddadp instead of xvmuldp). */
#if defined(_AIX)
define(`KERNEL4x4_SUB1', `
#else
.macro KERNEL4x4_SUB1
#endif

    lxvd2x      vs0, 0, AO
    lxvd2x      vs1, o16, AO

    lxvdsx      vs24, 0, BO
    lxvdsx      vs25, o8, BO
    lxvdsx      vs26, o16, BO
    lxvdsx      vs27, o24, BO

    addi        AO, AO, 32
    addi        BO, BO, 32


    xvmaddadp   vs32, vs0, vs24
    xvmaddadp   vs33, vs1, vs24

    xvmaddadp   vs40, vs0, vs25
    xvmaddadp   vs41, vs1, vs25

    xvmaddadp   vs48, vs0, vs26
    xvmaddadp   vs49, vs1, vs26

    xvmaddadp   vs56, vs0, vs27
    xvmaddadp   vs57, vs1, vs27

#if defined(_AIX)
')
#else
.endm
#endif

/* SAVE4x4: write the 4x4 result tile to C, one row (32 bytes) per
   LDC step. Non TRMM: C = C + alpha * acc. TRMM: C = alpha * acc.
   Advances CO by 32 bytes (4 doubles). */
#if defined(_AIX)
define(`SAVE4x4', `
#else
.macro SAVE4x4
#endif

    mr          T1, CO

/* row 0: vs32-vs33 */
#ifndef TRMMKERNEL
    lxvd2x      vs0, 0, T1
    lxvd2x      vs1, o16, T1
#endif

#ifndef TRMMKERNEL
    xvmaddadp   vs0, vs32, alpha_r
    xvmaddadp   vs1, vs33, alpha_r
#else
    xvmuldp     vs0, vs32, alpha_r
    xvmuldp     vs1, vs33, alpha_r
#endif

    stxvd2x     vs0, 0, T1
    stxvd2x     vs1, o16, T1

    add         T1, T1, LDC

/* row 1: vs40-vs41 */
#ifndef TRMMKERNEL
    lxvd2x      vs8, 0, T1
    lxvd2x      vs9, o16, T1
#endif

#ifndef TRMMKERNEL
    xvmaddadp   vs8, vs40, alpha_r
    xvmaddadp   vs9, vs41, alpha_r
#else
    xvmuldp     vs8, vs40, alpha_r
    xvmuldp     vs9, vs41, alpha_r
#endif

    stxvd2x     vs8, 0, T1
    stxvd2x     vs9, o16, T1

    add         T1, T1, LDC

/* row 2: vs48-vs49 */
#ifndef TRMMKERNEL
    lxvd2x      vs0, 0, T1
    lxvd2x      vs1, o16, T1
#endif

#ifndef TRMMKERNEL
    xvmaddadp   vs0, vs48, alpha_r
    xvmaddadp   vs1, vs49, alpha_r
#else
    xvmuldp     vs0, vs48, alpha_r
    xvmuldp     vs1, vs49, alpha_r
#endif

    stxvd2x     vs0, 0, T1
    stxvd2x     vs1, o16, T1

    add         T1, T1, LDC

/* row 3: vs56-vs57 */
#ifndef TRMMKERNEL
    lxvd2x      vs8, 0, T1
    lxvd2x      vs9, o16, T1
#endif

#ifndef TRMMKERNEL
    xvmaddadp   vs8, vs56, alpha_r
    xvmaddadp   vs9, vs57, alpha_r
#else
    xvmuldp     vs8, vs56, alpha_r
    xvmuldp     vs9, vs57, alpha_r
#endif

    stxvd2x     vs8, 0, T1
    stxvd2x     vs9, o16, T1

    addi        CO, CO, 32

#if defined(_AIX)
')
#else
.endm
#endif
-
- /*********************************************************************
- * Macros for N=4, M=2 *
- *********************************************************************/
-
/* LOAD4x2_1: preload the first 2x4 operands. 2 doubles of A into
   vs0 and 4 splatted doubles of B into vs24-vs27. Advances AO by 16
   and BO by 32 bytes. */
#if defined(_AIX)
define(`LOAD4x2_1', `
#else
.macro LOAD4x2_1
#endif

    lxvd2x      vs0, 0, AO

    lxvdsx      vs24, 0, BO
    lxvdsx      vs25, o8, BO
    lxvdsx      vs26, o16, BO
    lxvdsx      vs27, o24, BO

    addi        AO, AO, 16
    addi        BO, BO, 32

#if defined(_AIX)
')
#else
.endm
#endif

/* KERNEL4x2_I1: pipelined first iteration. Loads next operands into
   vs8 / vs28-vs31, initializes accumulators vs32/vs40/vs48/vs56. */
#if defined(_AIX)
define(`KERNEL4x2_I1', `
#else
.macro KERNEL4x2_I1
#endif

    lxvd2x      vs8, 0, AO

    lxvdsx      vs28, 0, BO
    lxvdsx      vs29, o8, BO
    lxvdsx      vs30, o16, BO
    lxvdsx      vs31, o24, BO

    addi        AO, AO, 16
    addi        BO, BO, 32


    xvmuldp     vs32, vs0, vs24

    xvmuldp     vs40, vs0, vs25

    xvmuldp     vs48, vs0, vs26

    xvmuldp     vs56, vs0, vs27

#if defined(_AIX)
')
#else
.endm
#endif

/* KERNEL4x2_1: pipelined even iteration. Loads odd-iteration
   operands, accumulates from vs0 / vs24-vs27. */
#if defined(_AIX)
define(`KERNEL4x2_1', `
#else
.macro KERNEL4x2_1
#endif

    lxvd2x      vs8, 0, AO

    lxvdsx      vs28, 0, BO
    lxvdsx      vs29, o8, BO
    lxvdsx      vs30, o16, BO
    lxvdsx      vs31, o24, BO

    addi        AO, AO, 16
    addi        BO, BO, 32


    xvmaddadp   vs32, vs0, vs24

    xvmaddadp   vs40, vs0, vs25

    xvmaddadp   vs48, vs0, vs26

    xvmaddadp   vs56, vs0, vs27

#if defined(_AIX)
')
#else
.endm
#endif

/* KERNEL4x2_2: pipelined odd iteration. Loads even-iteration
   operands, accumulates from vs8 / vs28-vs31. */
#if defined(_AIX)
define(`KERNEL4x2_2', `
#else
.macro KERNEL4x2_2
#endif

    lxvd2x      vs0, 0, AO

    lxvdsx      vs24, 0, BO
    lxvdsx      vs25, o8, BO
    lxvdsx      vs26, o16, BO
    lxvdsx      vs27, o24, BO

    addi        AO, AO, 16
    addi        BO, BO, 32


    xvmaddadp   vs32, vs8, vs28

    xvmaddadp   vs40, vs8, vs29

    xvmaddadp   vs48, vs8, vs30

    xvmaddadp   vs56, vs8, vs31

#if defined(_AIX)
')
#else
.endm
#endif

/* KERNEL4x2_E2: pipeline drain. Consumes the last prefetched
   operands with no loads or pointer updates. */
#if defined(_AIX)
define(`KERNEL4x2_E2', `
#else
.macro KERNEL4x2_E2
#endif


    xvmaddadp   vs32, vs8, vs28

    xvmaddadp   vs40, vs8, vs29

    xvmaddadp   vs48, vs8, vs30

    xvmaddadp   vs56, vs8, vs31

#if defined(_AIX)
')
#else
.endm
#endif

/* KERNEL4x2_SUBI1: standalone first iteration for the remainder
   loop. Initializes the accumulators with xvmuldp. */
#if defined(_AIX)
define(`KERNEL4x2_SUBI1', `
#else
.macro KERNEL4x2_SUBI1
#endif

    lxvd2x      vs0, 0, AO

    lxvdsx      vs24, 0, BO
    lxvdsx      vs25, o8, BO
    lxvdsx      vs26, o16, BO
    lxvdsx      vs27, o24, BO

    addi        AO, AO, 16
    addi        BO, BO, 32


    xvmuldp     vs32, vs0, vs24

    xvmuldp     vs40, vs0, vs25

    xvmuldp     vs48, vs0, vs26

    xvmuldp     vs56, vs0, vs27

#if defined(_AIX)
')
#else
.endm
#endif

/* KERNEL4x2_SUB1: standalone accumulate iteration for the remainder
   loop (xvmaddadp instead of xvmuldp). */
#if defined(_AIX)
define(`KERNEL4x2_SUB1', `
#else
.macro KERNEL4x2_SUB1
#endif

    lxvd2x      vs0, 0, AO

    lxvdsx      vs24, 0, BO
    lxvdsx      vs25, o8, BO
    lxvdsx      vs26, o16, BO
    lxvdsx      vs27, o24, BO

    addi        AO, AO, 16
    addi        BO, BO, 32


    xvmaddadp   vs32, vs0, vs24

    xvmaddadp   vs40, vs0, vs25

    xvmaddadp   vs48, vs0, vs26

    xvmaddadp   vs56, vs0, vs27

#if defined(_AIX)
')
#else
.endm
#endif

/* SAVE4x2: write the 2x4 result tile to C, one row (16 bytes) per
   LDC step. Non TRMM: C = C + alpha * acc. TRMM: C = alpha * acc.
   Advances CO by 16 bytes (2 doubles). */
#if defined(_AIX)
define(`SAVE4x2', `
#else
.macro SAVE4x2
#endif

    mr          T1, CO

/* row 0: vs32 */
#ifndef TRMMKERNEL
    lxvd2x      vs0, 0, T1
#endif

#ifndef TRMMKERNEL
    xvmaddadp   vs0, vs32, alpha_r
#else
    xvmuldp     vs0, vs32, alpha_r
#endif

    stxvd2x     vs0, 0, T1

    add         T1, T1, LDC

/* row 1: vs40 */
#ifndef TRMMKERNEL
    lxvd2x      vs8, 0, T1
#endif

#ifndef TRMMKERNEL
    xvmaddadp   vs8, vs40, alpha_r
#else
    xvmuldp     vs8, vs40, alpha_r
#endif

    stxvd2x     vs8, 0, T1

    add         T1, T1, LDC

/* row 2: vs48 */
#ifndef TRMMKERNEL
    lxvd2x      vs0, 0, T1
#endif

#ifndef TRMMKERNEL
    xvmaddadp   vs0, vs48, alpha_r
#else
    xvmuldp     vs0, vs48, alpha_r
#endif

    stxvd2x     vs0, 0, T1

    add         T1, T1, LDC

/* row 3: vs56 */
#ifndef TRMMKERNEL
    lxvd2x      vs8, 0, T1
#endif

#ifndef TRMMKERNEL
    xvmaddadp   vs8, vs56, alpha_r
#else
    xvmuldp     vs8, vs56, alpha_r
#endif

    stxvd2x     vs8, 0, T1

    addi        CO, CO, 16

#if defined(_AIX)
')
#else
.endm
#endif
-
- /*********************************************************************
- * Macros for N=4, M=1 *
- *********************************************************************/
-
/* LOAD4x1_1: preload the first 1x4 operands using scalar loads
   (lxsdx = one double). One double of A into vs0 and 4 doubles of B
   into vs24-vs27. Advances AO by 8 and BO by 32 bytes. */
#if defined(_AIX)
define(`LOAD4x1_1', `
#else
.macro LOAD4x1_1
#endif

    lxsdx       vs0, 0, AO

    lxsdx       vs24, 0, BO
    lxsdx       vs25, o8, BO
    lxsdx       vs26, o16, BO
    lxsdx       vs27, o24, BO

    addi        AO, AO, 8
    addi        BO, BO, 32

#if defined(_AIX)
')
#else
.endm
#endif

/* KERNEL4x1_I1: pipelined first iteration. Loads next operands into
   vs8 / vs28-vs31, initializes accumulators vs32/vs40/vs48/vs56 with
   scalar xsmuldp. */
#if defined(_AIX)
define(`KERNEL4x1_I1', `
#else
.macro KERNEL4x1_I1
#endif

    lxsdx       vs8, 0, AO

    lxsdx       vs28, 0, BO
    lxsdx       vs29, o8, BO
    lxsdx       vs30, o16, BO
    lxsdx       vs31, o24, BO

    addi        AO, AO, 8
    addi        BO, BO, 32


    xsmuldp     vs32, vs0, vs24

    xsmuldp     vs40, vs0, vs25

    xsmuldp     vs48, vs0, vs26

    xsmuldp     vs56, vs0, vs27

#if defined(_AIX)
')
#else
.endm
#endif

/* KERNEL4x1_1: pipelined even iteration. Loads odd-iteration
   operands, accumulates (xsmaddadp) from vs0 / vs24-vs27. */
#if defined(_AIX)
define(`KERNEL4x1_1', `
#else
.macro KERNEL4x1_1
#endif

    lxsdx       vs8, 0, AO

    lxsdx       vs28, 0, BO
    lxsdx       vs29, o8, BO
    lxsdx       vs30, o16, BO
    lxsdx       vs31, o24, BO

    addi        AO, AO, 8
    addi        BO, BO, 32


    xsmaddadp   vs32, vs0, vs24

    xsmaddadp   vs40, vs0, vs25

    xsmaddadp   vs48, vs0, vs26

    xsmaddadp   vs56, vs0, vs27

#if defined(_AIX)
')
#else
.endm
#endif

/* KERNEL4x1_2: pipelined odd iteration. Loads even-iteration
   operands, accumulates from vs8 / vs28-vs31. */
#if defined(_AIX)
define(`KERNEL4x1_2', `
#else
.macro KERNEL4x1_2
#endif

    lxsdx       vs0, 0, AO

    lxsdx       vs24, 0, BO
    lxsdx       vs25, o8, BO
    lxsdx       vs26, o16, BO
    lxsdx       vs27, o24, BO

    addi        AO, AO, 8
    addi        BO, BO, 32


    xsmaddadp   vs32, vs8, vs28

    xsmaddadp   vs40, vs8, vs29

    xsmaddadp   vs48, vs8, vs30

    xsmaddadp   vs56, vs8, vs31

#if defined(_AIX)
')
#else
.endm
#endif

/* KERNEL4x1_E2: pipeline drain. Consumes the last prefetched
   operands with no loads or pointer updates. */
#if defined(_AIX)
define(`KERNEL4x1_E2', `
#else
.macro KERNEL4x1_E2
#endif


    xsmaddadp   vs32, vs8, vs28

    xsmaddadp   vs40, vs8, vs29

    xsmaddadp   vs48, vs8, vs30

    xsmaddadp   vs56, vs8, vs31

#if defined(_AIX)
')
#else
.endm
#endif

/* KERNEL4x1_SUBI1: standalone first iteration for the remainder
   loop. Initializes the accumulators with xsmuldp. */
#if defined(_AIX)
define(`KERNEL4x1_SUBI1', `
#else
.macro KERNEL4x1_SUBI1
#endif

    lxsdx       vs0, 0, AO

    lxsdx       vs24, 0, BO
    lxsdx       vs25, o8, BO
    lxsdx       vs26, o16, BO
    lxsdx       vs27, o24, BO

    addi        AO, AO, 8
    addi        BO, BO, 32


    xsmuldp     vs32, vs0, vs24

    xsmuldp     vs40, vs0, vs25

    xsmuldp     vs48, vs0, vs26

    xsmuldp     vs56, vs0, vs27

#if defined(_AIX)
')
#else
.endm
#endif

/* KERNEL4x1_SUB1: standalone accumulate iteration for the remainder
   loop (xsmaddadp instead of xsmuldp). */
#if defined(_AIX)
define(`KERNEL4x1_SUB1', `
#else
.macro KERNEL4x1_SUB1
#endif

    lxsdx       vs0, 0, AO

    lxsdx       vs24, 0, BO
    lxsdx       vs25, o8, BO
    lxsdx       vs26, o16, BO
    lxsdx       vs27, o24, BO

    addi        AO, AO, 8
    addi        BO, BO, 32


    xsmaddadp   vs32, vs0, vs24

    xsmaddadp   vs40, vs0, vs25

    xsmaddadp   vs48, vs0, vs26

    xsmaddadp   vs56, vs0, vs27

#if defined(_AIX)
')
#else
.endm
#endif
-
- #if defined(_AIX)
- define(`SAVE4x1', `
- #else
- .macro SAVE4x1
- #endif
-
- mr T1, CO
-
- #ifndef TRMMKERNEL
- lxsdx vs0, 0, T1
- #endif
-
- #ifndef TRMMKERNEL
- xsmaddadp vs0, vs32, alpha_r
- #else
- xsmuldp vs0, vs32, alpha_r
- #endif
-
- stxsdx vs0, 0, T1
-
- add T1, T1, LDC
-
- #ifndef TRMMKERNEL
- lxsdx vs8, 0, T1
- #endif
-
- #ifndef TRMMKERNEL
- xsmaddadp vs8, vs40, alpha_r
- #else
- xsmuldp vs8, vs40, alpha_r
- #endif
-
- stxsdx vs8, 0, T1
-
- add T1, T1, LDC
-
- #ifndef TRMMKERNEL
- lxsdx vs0, 0, T1
- #endif
-
- #ifndef TRMMKERNEL
- xsmaddadp vs0, vs48, alpha_r
- #else
- xsmuldp vs0, vs48, alpha_r
- #endif
-
- stxsdx vs0, 0, T1
-
- add T1, T1, LDC
-
- #ifndef TRMMKERNEL
- lxsdx vs8, 0, T1
- #endif
-
- #ifndef TRMMKERNEL
- xsmaddadp vs8, vs56, alpha_r
- #else
- xsmuldp vs8, vs56, alpha_r
- #endif
-
- stxsdx vs8, 0, T1
-
- addi CO, CO, 8
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /*********************************************************************
- * Macros for N=2, M=16 *
- *********************************************************************/
-
- /* LOAD2x16_1: preload 16 A doubles (vs0-vs7) and 2 splatted B values (vs24-vs25) for the pipelined loop; AO += 128, BO += 16 */
- #if defined(_AIX)
- define(`LOAD2x16_1', `
- #else
- .macro LOAD2x16_1
- #endif
-
- lxvd2x vs0, 0, AO
- lxvd2x vs1, o16, AO
- lxvd2x vs2, o32, AO
- lxvd2x vs3, o48, AO
-
- lxvdsx vs24, 0, BO
- lxvdsx vs25, o8, BO
-
- addi AO, AO, 64
- addi BO, BO, 16
-
- lxvd2x vs4, 0, AO
- lxvd2x vs5, o16, AO
- lxvd2x vs6, o32, AO
- lxvd2x vs7, o48, AO
-
- addi AO, AO, 64
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* KERNEL2x16_I1: pipeline start - fetch next operands into shadow regs (vs8-vs15, vs28-vs29) and init accumulators vs32-vs47 with mul */
- #if defined(_AIX)
- define(`KERNEL2x16_I1', `
- #else
- .macro KERNEL2x16_I1
- #endif
-
- lxvd2x vs8, 0, AO
- lxvd2x vs9, o16, AO
- lxvd2x vs10, o32, AO
- lxvd2x vs11, o48, AO
-
- lxvdsx vs28, 0, BO
- lxvdsx vs29, o8, BO
-
- addi AO, AO, 64
- addi BO, BO, 16
-
- lxvd2x vs12, 0, AO
- lxvd2x vs13, o16, AO
- lxvd2x vs14, o32, AO
- lxvd2x vs15, o48, AO
-
- addi AO, AO, 64
-
-
- xvmuldp vs32, vs0, vs24
- xvmuldp vs33, vs1, vs24
- xvmuldp vs34, vs2, vs24
- xvmuldp vs35, vs3, vs24
- xvmuldp vs36, vs4, vs24
- xvmuldp vs37, vs5, vs24
- xvmuldp vs38, vs6, vs24
- xvmuldp vs39, vs7, vs24
-
- xvmuldp vs40, vs0, vs25
- xvmuldp vs41, vs1, vs25
- xvmuldp vs42, vs2, vs25
- xvmuldp vs43, vs3, vs25
- xvmuldp vs44, vs4, vs25
- xvmuldp vs45, vs5, vs25
- xvmuldp vs46, vs6, vs25
- xvmuldp vs47, vs7, vs25
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* KERNEL2x16_1: even step - fetch next operands into shadow regs, accumulate the current set (vs0-vs7, vs24-vs25) */
- #if defined(_AIX)
- define(`KERNEL2x16_1', `
- #else
- .macro KERNEL2x16_1
- #endif
-
- lxvd2x vs8, 0, AO
- lxvd2x vs9, o16, AO
- lxvd2x vs10, o32, AO
- lxvd2x vs11, o48, AO
-
- lxvdsx vs28, 0, BO
- lxvdsx vs29, o8, BO
-
- addi AO, AO, 64
- addi BO, BO, 16
-
- lxvd2x vs12, 0, AO
- lxvd2x vs13, o16, AO
- lxvd2x vs14, o32, AO
- lxvd2x vs15, o48, AO
-
- addi AO, AO, 64
-
-
- xvmaddadp vs32, vs0, vs24
- xvmaddadp vs33, vs1, vs24
- xvmaddadp vs34, vs2, vs24
- xvmaddadp vs35, vs3, vs24
- xvmaddadp vs36, vs4, vs24
- xvmaddadp vs37, vs5, vs24
- xvmaddadp vs38, vs6, vs24
- xvmaddadp vs39, vs7, vs24
-
- xvmaddadp vs40, vs0, vs25
- xvmaddadp vs41, vs1, vs25
- xvmaddadp vs42, vs2, vs25
- xvmaddadp vs43, vs3, vs25
- xvmaddadp vs44, vs4, vs25
- xvmaddadp vs45, vs5, vs25
- xvmaddadp vs46, vs6, vs25
- xvmaddadp vs47, vs7, vs25
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* KERNEL2x16_2: odd step - fetch next operands into primary regs, accumulate the shadow set (vs8-vs15, vs28-vs29) */
- #if defined(_AIX)
- define(`KERNEL2x16_2', `
- #else
- .macro KERNEL2x16_2
- #endif
-
- lxvd2x vs0, 0, AO
- lxvd2x vs1, o16, AO
- lxvd2x vs2, o32, AO
- lxvd2x vs3, o48, AO
-
- lxvdsx vs24, 0, BO
- lxvdsx vs25, o8, BO
-
- addi AO, AO, 64
- addi BO, BO, 16
-
- lxvd2x vs4, 0, AO
- lxvd2x vs5, o16, AO
- lxvd2x vs6, o32, AO
- lxvd2x vs7, o48, AO
-
- addi AO, AO, 64
-
-
- xvmaddadp vs32, vs8, vs28
- xvmaddadp vs33, vs9, vs28
- xvmaddadp vs34, vs10, vs28
- xvmaddadp vs35, vs11, vs28
- xvmaddadp vs36, vs12, vs28
- xvmaddadp vs37, vs13, vs28
- xvmaddadp vs38, vs14, vs28
- xvmaddadp vs39, vs15, vs28
-
- xvmaddadp vs40, vs8, vs29
- xvmaddadp vs41, vs9, vs29
- xvmaddadp vs42, vs10, vs29
- xvmaddadp vs43, vs11, vs29
- xvmaddadp vs44, vs12, vs29
- xvmaddadp vs45, vs13, vs29
- xvmaddadp vs46, vs14, vs29
- xvmaddadp vs47, vs15, vs29
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* KERNEL2x16_E2: pipeline drain - accumulate the last shadow set, no loads */
- #if defined(_AIX)
- define(`KERNEL2x16_E2', `
- #else
- .macro KERNEL2x16_E2
- #endif
-
-
- xvmaddadp vs32, vs8, vs28
- xvmaddadp vs33, vs9, vs28
- xvmaddadp vs34, vs10, vs28
- xvmaddadp vs35, vs11, vs28
- xvmaddadp vs36, vs12, vs28
- xvmaddadp vs37, vs13, vs28
- xvmaddadp vs38, vs14, vs28
- xvmaddadp vs39, vs15, vs28
-
- xvmaddadp vs40, vs8, vs29
- xvmaddadp vs41, vs9, vs29
- xvmaddadp vs42, vs10, vs29
- xvmaddadp vs43, vs11, vs29
- xvmaddadp vs44, vs12, vs29
- xvmaddadp vs45, vs13, vs29
- xvmaddadp vs46, vs14, vs29
- xvmaddadp vs47, vs15, vs29
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* KERNEL2x16_SUBI1: unpipelined first step for the k remainder - load and init accumulators with mul */
- #if defined(_AIX)
- define(`KERNEL2x16_SUBI1', `
- #else
- .macro KERNEL2x16_SUBI1
- #endif
-
- lxvd2x vs0, 0, AO
- lxvd2x vs1, o16, AO
- lxvd2x vs2, o32, AO
- lxvd2x vs3, o48, AO
-
- lxvdsx vs24, 0, BO
- lxvdsx vs25, o8, BO
-
- addi AO, AO, 64
- addi BO, BO, 16
-
- lxvd2x vs4, 0, AO
- lxvd2x vs5, o16, AO
- lxvd2x vs6, o32, AO
- lxvd2x vs7, o48, AO
-
- addi AO, AO, 64
-
-
- xvmuldp vs32, vs0, vs24
- xvmuldp vs33, vs1, vs24
- xvmuldp vs34, vs2, vs24
- xvmuldp vs35, vs3, vs24
- xvmuldp vs36, vs4, vs24
- xvmuldp vs37, vs5, vs24
- xvmuldp vs38, vs6, vs24
- xvmuldp vs39, vs7, vs24
-
- xvmuldp vs40, vs0, vs25
- xvmuldp vs41, vs1, vs25
- xvmuldp vs42, vs2, vs25
- xvmuldp vs43, vs3, vs25
- xvmuldp vs44, vs4, vs25
- xvmuldp vs45, vs5, vs25
- xvmuldp vs46, vs6, vs25
- xvmuldp vs47, vs7, vs25
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* KERNEL2x16_SUB1: unpipelined step for the k remainder - load and accumulate */
- #if defined(_AIX)
- define(`KERNEL2x16_SUB1', `
- #else
- .macro KERNEL2x16_SUB1
- #endif
-
- lxvd2x vs0, 0, AO
- lxvd2x vs1, o16, AO
- lxvd2x vs2, o32, AO
- lxvd2x vs3, o48, AO
-
- lxvdsx vs24, 0, BO
- lxvdsx vs25, o8, BO
-
- addi AO, AO, 64
- addi BO, BO, 16
-
- lxvd2x vs4, 0, AO
- lxvd2x vs5, o16, AO
- lxvd2x vs6, o32, AO
- lxvd2x vs7, o48, AO
-
- addi AO, AO, 64
-
-
- xvmaddadp vs32, vs0, vs24
- xvmaddadp vs33, vs1, vs24
- xvmaddadp vs34, vs2, vs24
- xvmaddadp vs35, vs3, vs24
- xvmaddadp vs36, vs4, vs24
- xvmaddadp vs37, vs5, vs24
- xvmaddadp vs38, vs6, vs24
- xvmaddadp vs39, vs7, vs24
-
- xvmaddadp vs40, vs0, vs25
- xvmaddadp vs41, vs1, vs25
- xvmaddadp vs42, vs2, vs25
- xvmaddadp vs43, vs3, vs25
- xvmaddadp vs44, vs4, vs25
- xvmaddadp vs45, vs5, vs25
- xvmaddadp vs46, vs6, vs25
- xvmaddadp vs47, vs7, vs25
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* SAVE2x16: write back the 2x16 tile - 16 doubles per row via T1 (first 64 bytes) and T2 (second 64 bytes), 2 rows via LDC; C = alpha*acc (+ old C unless TRMMKERNEL); CO += 128 */
- #if defined(_AIX)
- define(`SAVE2x16', `
- #else
- .macro SAVE2x16
- #endif
-
- mr T1, CO
- addi T2, T1, 64
-
- #ifndef TRMMKERNEL
- lxvd2x vs0, 0, T1
- lxvd2x vs1, o16, T1
- lxvd2x vs2, o32, T1
- lxvd2x vs3, o48, T1
-
- lxvd2x vs4, 0, T2
- lxvd2x vs5, o16, T2
- lxvd2x vs6, o32, T2
- lxvd2x vs7, o48, T2
- #endif
-
- #ifndef TRMMKERNEL
- xvmaddadp vs0, vs32, alpha_r
- xvmaddadp vs1, vs33, alpha_r
- xvmaddadp vs2, vs34, alpha_r
- xvmaddadp vs3, vs35, alpha_r
- xvmaddadp vs4, vs36, alpha_r
- xvmaddadp vs5, vs37, alpha_r
- xvmaddadp vs6, vs38, alpha_r
- xvmaddadp vs7, vs39, alpha_r
- #else
- xvmuldp vs0, vs32, alpha_r
- xvmuldp vs1, vs33, alpha_r
- xvmuldp vs2, vs34, alpha_r
- xvmuldp vs3, vs35, alpha_r
- xvmuldp vs4, vs36, alpha_r
- xvmuldp vs5, vs37, alpha_r
- xvmuldp vs6, vs38, alpha_r
- xvmuldp vs7, vs39, alpha_r
- #endif
-
- stxvd2x vs0, 0, T1
- stxvd2x vs1, o16, T1
- stxvd2x vs2, o32, T1
- stxvd2x vs3, o48, T1
-
- stxvd2x vs4, 0, T2
- stxvd2x vs5, o16, T2
- stxvd2x vs6, o32, T2
- stxvd2x vs7, o48, T2
-
- add T1, T1, LDC
- add T2, T2, LDC
-
- #ifndef TRMMKERNEL
- lxvd2x vs8, 0, T1
- lxvd2x vs9, o16, T1
- lxvd2x vs10, o32, T1
- lxvd2x vs11, o48, T1
-
- lxvd2x vs12, 0, T2
- lxvd2x vs13, o16, T2
- lxvd2x vs14, o32, T2
- lxvd2x vs15, o48, T2
- #endif
-
- #ifndef TRMMKERNEL
- xvmaddadp vs8, vs40, alpha_r
- xvmaddadp vs9, vs41, alpha_r
- xvmaddadp vs10, vs42, alpha_r
- xvmaddadp vs11, vs43, alpha_r
- xvmaddadp vs12, vs44, alpha_r
- xvmaddadp vs13, vs45, alpha_r
- xvmaddadp vs14, vs46, alpha_r
- xvmaddadp vs15, vs47, alpha_r
- #else
- xvmuldp vs8, vs40, alpha_r
- xvmuldp vs9, vs41, alpha_r
- xvmuldp vs10, vs42, alpha_r
- xvmuldp vs11, vs43, alpha_r
- xvmuldp vs12, vs44, alpha_r
- xvmuldp vs13, vs45, alpha_r
- xvmuldp vs14, vs46, alpha_r
- xvmuldp vs15, vs47, alpha_r
- #endif
-
- stxvd2x vs8, 0, T1
- stxvd2x vs9, o16, T1
- stxvd2x vs10, o32, T1
- stxvd2x vs11, o48, T1
-
- stxvd2x vs12, 0, T2
- stxvd2x vs13, o16, T2
- stxvd2x vs14, o32, T2
- stxvd2x vs15, o48, T2
-
- addi CO, CO, 128
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /*********************************************************************
- * Macros for N=2, M=8                                        *
- *********************************************************************/
-
- /* LOAD2x8_1: preload 8 A doubles (vs0-vs3) and 2 splatted B values (vs24-vs25) for the pipelined loop; AO += 64, BO += 16 */
- #if defined(_AIX)
- define(`LOAD2x8_1', `
- #else
- .macro LOAD2x8_1
- #endif
-
- lxvd2x vs0, 0, AO
- lxvd2x vs1, o16, AO
- lxvd2x vs2, o32, AO
- lxvd2x vs3, o48, AO
-
- lxvdsx vs24, 0, BO
- lxvdsx vs25, o8, BO
-
- addi AO, AO, 64
- addi BO, BO, 16
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* KERNEL2x8_I1: pipeline start - fetch next operands into shadow regs (vs8-vs11, vs28-vs29) and init accumulators vs32-vs35/vs40-vs43 with mul */
- #if defined(_AIX)
- define(`KERNEL2x8_I1', `
- #else
- .macro KERNEL2x8_I1
- #endif
-
- lxvd2x vs8, 0, AO
- lxvd2x vs9, o16, AO
- lxvd2x vs10, o32, AO
- lxvd2x vs11, o48, AO
-
- lxvdsx vs28, 0, BO
- lxvdsx vs29, o8, BO
-
- addi AO, AO, 64
- addi BO, BO, 16
-
-
- xvmuldp vs32, vs0, vs24
- xvmuldp vs33, vs1, vs24
- xvmuldp vs34, vs2, vs24
- xvmuldp vs35, vs3, vs24
-
- xvmuldp vs40, vs0, vs25
- xvmuldp vs41, vs1, vs25
- xvmuldp vs42, vs2, vs25
- xvmuldp vs43, vs3, vs25
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* KERNEL2x8_1: even step - fetch next operands into shadow regs, accumulate the current set (vs0-vs3, vs24-vs25) */
- #if defined(_AIX)
- define(`KERNEL2x8_1', `
- #else
- .macro KERNEL2x8_1
- #endif
-
- lxvd2x vs8, 0, AO
- lxvd2x vs9, o16, AO
- lxvd2x vs10, o32, AO
- lxvd2x vs11, o48, AO
-
- lxvdsx vs28, 0, BO
- lxvdsx vs29, o8, BO
-
- addi AO, AO, 64
- addi BO, BO, 16
-
-
- xvmaddadp vs32, vs0, vs24
- xvmaddadp vs33, vs1, vs24
- xvmaddadp vs34, vs2, vs24
- xvmaddadp vs35, vs3, vs24
-
- xvmaddadp vs40, vs0, vs25
- xvmaddadp vs41, vs1, vs25
- xvmaddadp vs42, vs2, vs25
- xvmaddadp vs43, vs3, vs25
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* KERNEL2x8_2: odd step - fetch next operands into primary regs, accumulate the shadow set (vs8-vs11, vs28-vs29) */
- #if defined(_AIX)
- define(`KERNEL2x8_2', `
- #else
- .macro KERNEL2x8_2
- #endif
-
- lxvd2x vs0, 0, AO
- lxvd2x vs1, o16, AO
- lxvd2x vs2, o32, AO
- lxvd2x vs3, o48, AO
-
- lxvdsx vs24, 0, BO
- lxvdsx vs25, o8, BO
-
- addi AO, AO, 64
- addi BO, BO, 16
-
-
- xvmaddadp vs32, vs8, vs28
- xvmaddadp vs33, vs9, vs28
- xvmaddadp vs34, vs10, vs28
- xvmaddadp vs35, vs11, vs28
-
- xvmaddadp vs40, vs8, vs29
- xvmaddadp vs41, vs9, vs29
- xvmaddadp vs42, vs10, vs29
- xvmaddadp vs43, vs11, vs29
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* KERNEL2x8_E2: pipeline drain - accumulate the last shadow set, no loads */
- #if defined(_AIX)
- define(`KERNEL2x8_E2', `
- #else
- .macro KERNEL2x8_E2
- #endif
-
-
- xvmaddadp vs32, vs8, vs28
- xvmaddadp vs33, vs9, vs28
- xvmaddadp vs34, vs10, vs28
- xvmaddadp vs35, vs11, vs28
-
- xvmaddadp vs40, vs8, vs29
- xvmaddadp vs41, vs9, vs29
- xvmaddadp vs42, vs10, vs29
- xvmaddadp vs43, vs11, vs29
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* KERNEL2x8_SUBI1: unpipelined first step for the k remainder - load and init accumulators with mul */
- #if defined(_AIX)
- define(`KERNEL2x8_SUBI1', `
- #else
- .macro KERNEL2x8_SUBI1
- #endif
-
- lxvd2x vs0, 0, AO
- lxvd2x vs1, o16, AO
- lxvd2x vs2, o32, AO
- lxvd2x vs3, o48, AO
-
- lxvdsx vs24, 0, BO
- lxvdsx vs25, o8, BO
-
- addi AO, AO, 64
- addi BO, BO, 16
-
-
- xvmuldp vs32, vs0, vs24
- xvmuldp vs33, vs1, vs24
- xvmuldp vs34, vs2, vs24
- xvmuldp vs35, vs3, vs24
-
- xvmuldp vs40, vs0, vs25
- xvmuldp vs41, vs1, vs25
- xvmuldp vs42, vs2, vs25
- xvmuldp vs43, vs3, vs25
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* KERNEL2x8_SUB1: unpipelined step for the k remainder - load and accumulate */
- #if defined(_AIX)
- define(`KERNEL2x8_SUB1', `
- #else
- .macro KERNEL2x8_SUB1
- #endif
-
- lxvd2x vs0, 0, AO
- lxvd2x vs1, o16, AO
- lxvd2x vs2, o32, AO
- lxvd2x vs3, o48, AO
-
- lxvdsx vs24, 0, BO
- lxvdsx vs25, o8, BO
-
- addi AO, AO, 64
- addi BO, BO, 16
-
-
- xvmaddadp vs32, vs0, vs24
- xvmaddadp vs33, vs1, vs24
- xvmaddadp vs34, vs2, vs24
- xvmaddadp vs35, vs3, vs24
-
- xvmaddadp vs40, vs0, vs25
- xvmaddadp vs41, vs1, vs25
- xvmaddadp vs42, vs2, vs25
- xvmaddadp vs43, vs3, vs25
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* SAVE2x8: write back the 2x8 tile - 8 doubles per row, 2 rows via LDC; C = alpha*acc (+ old C unless TRMMKERNEL); CO += 64 */
- #if defined(_AIX)
- define(`SAVE2x8', `
- #else
- .macro SAVE2x8
- #endif
-
- mr T1, CO
-
- #ifndef TRMMKERNEL
- lxvd2x vs0, 0, T1
- lxvd2x vs1, o16, T1
- lxvd2x vs2, o32, T1
- lxvd2x vs3, o48, T1
- #endif
-
- #ifndef TRMMKERNEL
- xvmaddadp vs0, vs32, alpha_r
- xvmaddadp vs1, vs33, alpha_r
- xvmaddadp vs2, vs34, alpha_r
- xvmaddadp vs3, vs35, alpha_r
- #else
- xvmuldp vs0, vs32, alpha_r
- xvmuldp vs1, vs33, alpha_r
- xvmuldp vs2, vs34, alpha_r
- xvmuldp vs3, vs35, alpha_r
- #endif
-
- stxvd2x vs0, 0, T1
- stxvd2x vs1, o16, T1
- stxvd2x vs2, o32, T1
- stxvd2x vs3, o48, T1
-
- add T1, T1, LDC
-
- #ifndef TRMMKERNEL
- lxvd2x vs8, 0, T1
- lxvd2x vs9, o16, T1
- lxvd2x vs10, o32, T1
- lxvd2x vs11, o48, T1
- #endif
-
- #ifndef TRMMKERNEL
- xvmaddadp vs8, vs40, alpha_r
- xvmaddadp vs9, vs41, alpha_r
- xvmaddadp vs10, vs42, alpha_r
- xvmaddadp vs11, vs43, alpha_r
- #else
- xvmuldp vs8, vs40, alpha_r
- xvmuldp vs9, vs41, alpha_r
- xvmuldp vs10, vs42, alpha_r
- xvmuldp vs11, vs43, alpha_r
- #endif
-
- stxvd2x vs8, 0, T1
- stxvd2x vs9, o16, T1
- stxvd2x vs10, o32, T1
- stxvd2x vs11, o48, T1
-
- addi CO, CO, 64
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /*********************************************************************
- * Macros for N=2, M=4 *
- *********************************************************************/
-
- /* LOAD2x4_1: preload 4 A doubles (vs0-vs1) and 2 splatted B values (vs24-vs25) for the pipelined loop; AO += 32, BO += 16 */
- #if defined(_AIX)
- define(`LOAD2x4_1', `
- #else
- .macro LOAD2x4_1
- #endif
-
- lxvd2x vs0, 0, AO
- lxvd2x vs1, o16, AO
-
- lxvdsx vs24, 0, BO
- lxvdsx vs25, o8, BO
-
- addi AO, AO, 32
- addi BO, BO, 16
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* KERNEL2x4_I1: pipeline start - fetch next operands into shadow regs (vs8-vs9, vs28-vs29) and init accumulators vs32-vs33/vs40-vs41 with mul */
- #if defined(_AIX)
- define(`KERNEL2x4_I1', `
- #else
- .macro KERNEL2x4_I1
- #endif
-
- lxvd2x vs8, 0, AO
- lxvd2x vs9, o16, AO
-
- lxvdsx vs28, 0, BO
- lxvdsx vs29, o8, BO
-
- addi AO, AO, 32
- addi BO, BO, 16
-
-
- xvmuldp vs32, vs0, vs24
- xvmuldp vs33, vs1, vs24
-
- xvmuldp vs40, vs0, vs25
- xvmuldp vs41, vs1, vs25
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* KERNEL2x4_1: even step - fetch next operands into shadow regs, accumulate the current set (vs0-vs1, vs24-vs25) */
- #if defined(_AIX)
- define(`KERNEL2x4_1', `
- #else
- .macro KERNEL2x4_1
- #endif
-
- lxvd2x vs8, 0, AO
- lxvd2x vs9, o16, AO
-
- lxvdsx vs28, 0, BO
- lxvdsx vs29, o8, BO
-
- addi AO, AO, 32
- addi BO, BO, 16
-
-
- xvmaddadp vs32, vs0, vs24
- xvmaddadp vs33, vs1, vs24
-
- xvmaddadp vs40, vs0, vs25
- xvmaddadp vs41, vs1, vs25
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* KERNEL2x4_2: odd step - fetch next operands into primary regs, accumulate the shadow set (vs8-vs9, vs28-vs29) */
- #if defined(_AIX)
- define(`KERNEL2x4_2', `
- #else
- .macro KERNEL2x4_2
- #endif
-
- lxvd2x vs0, 0, AO
- lxvd2x vs1, o16, AO
-
- lxvdsx vs24, 0, BO
- lxvdsx vs25, o8, BO
-
- addi AO, AO, 32
- addi BO, BO, 16
-
-
- xvmaddadp vs32, vs8, vs28
- xvmaddadp vs33, vs9, vs28
-
- xvmaddadp vs40, vs8, vs29
- xvmaddadp vs41, vs9, vs29
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* KERNEL2x4_E2: pipeline drain - accumulate the last shadow set, no loads */
- #if defined(_AIX)
- define(`KERNEL2x4_E2', `
- #else
- .macro KERNEL2x4_E2
- #endif
-
-
- xvmaddadp vs32, vs8, vs28
- xvmaddadp vs33, vs9, vs28
-
- xvmaddadp vs40, vs8, vs29
- xvmaddadp vs41, vs9, vs29
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* KERNEL2x4_SUBI1: unpipelined first step for the k remainder - load and init accumulators with mul */
- #if defined(_AIX)
- define(`KERNEL2x4_SUBI1', `
- #else
- .macro KERNEL2x4_SUBI1
- #endif
-
- lxvd2x vs0, 0, AO
- lxvd2x vs1, o16, AO
-
- lxvdsx vs24, 0, BO
- lxvdsx vs25, o8, BO
-
- addi AO, AO, 32
- addi BO, BO, 16
-
-
- xvmuldp vs32, vs0, vs24
- xvmuldp vs33, vs1, vs24
-
- xvmuldp vs40, vs0, vs25
- xvmuldp vs41, vs1, vs25
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* KERNEL2x4_SUB1: unpipelined step for the k remainder - load and accumulate */
- #if defined(_AIX)
- define(`KERNEL2x4_SUB1', `
- #else
- .macro KERNEL2x4_SUB1
- #endif
-
- lxvd2x vs0, 0, AO
- lxvd2x vs1, o16, AO
-
- lxvdsx vs24, 0, BO
- lxvdsx vs25, o8, BO
-
- addi AO, AO, 32
- addi BO, BO, 16
-
-
- xvmaddadp vs32, vs0, vs24
- xvmaddadp vs33, vs1, vs24
-
- xvmaddadp vs40, vs0, vs25
- xvmaddadp vs41, vs1, vs25
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* SAVE2x4: write back the 2x4 tile - 4 doubles per row, 2 rows via LDC; C = alpha*acc (+ old C unless TRMMKERNEL); CO += 32 */
- #if defined(_AIX)
- define(`SAVE2x4', `
- #else
- .macro SAVE2x4
- #endif
-
- mr T1, CO
-
- #ifndef TRMMKERNEL
- lxvd2x vs0, 0, T1
- lxvd2x vs1, o16, T1
- #endif
-
- #ifndef TRMMKERNEL
- xvmaddadp vs0, vs32, alpha_r
- xvmaddadp vs1, vs33, alpha_r
- #else
- xvmuldp vs0, vs32, alpha_r
- xvmuldp vs1, vs33, alpha_r
- #endif
-
- stxvd2x vs0, 0, T1
- stxvd2x vs1, o16, T1
-
- add T1, T1, LDC
-
- #ifndef TRMMKERNEL
- lxvd2x vs8, 0, T1
- lxvd2x vs9, o16, T1
- #endif
-
- #ifndef TRMMKERNEL
- xvmaddadp vs8, vs40, alpha_r
- xvmaddadp vs9, vs41, alpha_r
- #else
- xvmuldp vs8, vs40, alpha_r
- xvmuldp vs9, vs41, alpha_r
- #endif
-
- stxvd2x vs8, 0, T1
- stxvd2x vs9, o16, T1
-
- addi CO, CO, 32
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /*********************************************************************
- * Macros for N=2, M=2 *
- *********************************************************************/
-
- /* LOAD2x2_1: preload 2 A doubles (vs0) and 2 splatted B values (vs24-vs25) for the pipelined loop; AO += 16, BO += 16 */
- #if defined(_AIX)
- define(`LOAD2x2_1', `
- #else
- .macro LOAD2x2_1
- #endif
-
- lxvd2x vs0, 0, AO
-
- lxvdsx vs24, 0, BO
- lxvdsx vs25, o8, BO
-
- addi AO, AO, 16
- addi BO, BO, 16
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* KERNEL2x2_I1: pipeline start - fetch next operands into shadow regs (vs8, vs28-vs29) and init accumulators vs32/vs40 with mul */
- #if defined(_AIX)
- define(`KERNEL2x2_I1', `
- #else
- .macro KERNEL2x2_I1
- #endif
-
- lxvd2x vs8, 0, AO
-
- lxvdsx vs28, 0, BO
- lxvdsx vs29, o8, BO
-
- addi AO, AO, 16
- addi BO, BO, 16
-
-
- xvmuldp vs32, vs0, vs24
-
- xvmuldp vs40, vs0, vs25
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* KERNEL2x2_1: even step - fetch next operands into shadow regs, accumulate the current set (vs0, vs24-vs25) */
- #if defined(_AIX)
- define(`KERNEL2x2_1', `
- #else
- .macro KERNEL2x2_1
- #endif
-
- lxvd2x vs8, 0, AO
-
- lxvdsx vs28, 0, BO
- lxvdsx vs29, o8, BO
-
- addi AO, AO, 16
- addi BO, BO, 16
-
-
- xvmaddadp vs32, vs0, vs24
-
- xvmaddadp vs40, vs0, vs25
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* KERNEL2x2_2: odd step - fetch next operands into primary regs, accumulate the shadow set (vs8, vs28-vs29) */
- #if defined(_AIX)
- define(`KERNEL2x2_2', `
- #else
- .macro KERNEL2x2_2
- #endif
-
- lxvd2x vs0, 0, AO
-
- lxvdsx vs24, 0, BO
- lxvdsx vs25, o8, BO
-
- addi AO, AO, 16
- addi BO, BO, 16
-
-
- xvmaddadp vs32, vs8, vs28
-
- xvmaddadp vs40, vs8, vs29
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* KERNEL2x2_E2: pipeline drain - accumulate the last shadow set, no loads */
- #if defined(_AIX)
- define(`KERNEL2x2_E2', `
- #else
- .macro KERNEL2x2_E2
- #endif
-
-
- xvmaddadp vs32, vs8, vs28
-
- xvmaddadp vs40, vs8, vs29
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* KERNEL2x2_SUBI1: unpipelined first step for the k remainder - load and init accumulators with mul */
- #if defined(_AIX)
- define(`KERNEL2x2_SUBI1', `
- #else
- .macro KERNEL2x2_SUBI1
- #endif
-
- lxvd2x vs0, 0, AO
-
- lxvdsx vs24, 0, BO
- lxvdsx vs25, o8, BO
-
- addi AO, AO, 16
- addi BO, BO, 16
-
-
- xvmuldp vs32, vs0, vs24
-
- xvmuldp vs40, vs0, vs25
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* KERNEL2x2_SUB1: unpipelined step for the k remainder - load and accumulate */
- #if defined(_AIX)
- define(`KERNEL2x2_SUB1', `
- #else
- .macro KERNEL2x2_SUB1
- #endif
-
- lxvd2x vs0, 0, AO
-
- lxvdsx vs24, 0, BO
- lxvdsx vs25, o8, BO
-
- addi AO, AO, 16
- addi BO, BO, 16
-
-
- xvmaddadp vs32, vs0, vs24
-
- xvmaddadp vs40, vs0, vs25
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* SAVE2x2: write back the 2x2 tile - 2 doubles per row, 2 rows via LDC; C = alpha*acc (+ old C unless TRMMKERNEL); CO += 16 */
- #if defined(_AIX)
- define(`SAVE2x2', `
- #else
- .macro SAVE2x2
- #endif
-
- mr T1, CO
-
- #ifndef TRMMKERNEL
- lxvd2x vs0, 0, T1
- #endif
-
- #ifndef TRMMKERNEL
- xvmaddadp vs0, vs32, alpha_r
- #else
- xvmuldp vs0, vs32, alpha_r
- #endif
-
- stxvd2x vs0, 0, T1
-
- add T1, T1, LDC
-
- #ifndef TRMMKERNEL
- lxvd2x vs8, 0, T1
- #endif
-
- #ifndef TRMMKERNEL
- xvmaddadp vs8, vs40, alpha_r
- #else
- xvmuldp vs8, vs40, alpha_r
- #endif
-
- stxvd2x vs8, 0, T1
-
- addi CO, CO, 16
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /*********************************************************************
- * Macros for N=2, M=1 *
- *********************************************************************/
-
- /* LOAD2x1_1: preload 1 A double (vs0) and 2 B doubles (vs24-vs25, scalar loads) for the pipelined loop; AO += 8, BO += 16 */
- #if defined(_AIX)
- define(`LOAD2x1_1', `
- #else
- .macro LOAD2x1_1
- #endif
-
- lxsdx vs0, 0, AO
-
- lxsdx vs24, 0, BO
- lxsdx vs25, o8, BO
-
- addi AO, AO, 8
- addi BO, BO, 16
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* KERNEL2x1_I1: pipeline start - fetch next operands into shadow regs (vs8, vs28-vs29) and init accumulators vs32/vs40 with mul */
- #if defined(_AIX)
- define(`KERNEL2x1_I1', `
- #else
- .macro KERNEL2x1_I1
- #endif
-
- lxsdx vs8, 0, AO
-
- lxsdx vs28, 0, BO
- lxsdx vs29, o8, BO
-
- addi AO, AO, 8
- addi BO, BO, 16
-
-
- xsmuldp vs32, vs0, vs24
-
- xsmuldp vs40, vs0, vs25
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* KERNEL2x1_1: even step - fetch next operands into shadow regs, accumulate the current set (vs0, vs24-vs25) */
- #if defined(_AIX)
- define(`KERNEL2x1_1', `
- #else
- .macro KERNEL2x1_1
- #endif
-
- lxsdx vs8, 0, AO
-
- lxsdx vs28, 0, BO
- lxsdx vs29, o8, BO
-
- addi AO, AO, 8
- addi BO, BO, 16
-
-
- xsmaddadp vs32, vs0, vs24
-
- xsmaddadp vs40, vs0, vs25
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* KERNEL2x1_2: odd step - fetch next operands into primary regs, accumulate the shadow set (vs8, vs28-vs29) */
- #if defined(_AIX)
- define(`KERNEL2x1_2', `
- #else
- .macro KERNEL2x1_2
- #endif
-
- lxsdx vs0, 0, AO
-
- lxsdx vs24, 0, BO
- lxsdx vs25, o8, BO
-
- addi AO, AO, 8
- addi BO, BO, 16
-
-
- xsmaddadp vs32, vs8, vs28
-
- xsmaddadp vs40, vs8, vs29
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* KERNEL2x1_E2: pipeline drain - accumulate the last shadow set, no loads */
- #if defined(_AIX)
- define(`KERNEL2x1_E2', `
- #else
- .macro KERNEL2x1_E2
- #endif
-
-
- xsmaddadp vs32, vs8, vs28
-
- xsmaddadp vs40, vs8, vs29
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* KERNEL2x1_SUBI1: unpipelined first step for the k remainder - load and init accumulators with mul */
- #if defined(_AIX)
- define(`KERNEL2x1_SUBI1', `
- #else
- .macro KERNEL2x1_SUBI1
- #endif
-
- lxsdx vs0, 0, AO
-
- lxsdx vs24, 0, BO
- lxsdx vs25, o8, BO
-
- addi AO, AO, 8
- addi BO, BO, 16
-
-
- xsmuldp vs32, vs0, vs24
-
- xsmuldp vs40, vs0, vs25
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* KERNEL2x1_SUB1: unpipelined step for the k remainder - load and accumulate */
- #if defined(_AIX)
- define(`KERNEL2x1_SUB1', `
- #else
- .macro KERNEL2x1_SUB1
- #endif
-
- lxsdx vs0, 0, AO
-
- lxsdx vs24, 0, BO
- lxsdx vs25, o8, BO
-
- addi AO, AO, 8
- addi BO, BO, 16
-
-
- xsmaddadp vs32, vs0, vs24
-
- xsmaddadp vs40, vs0, vs25
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* SAVE2x1: write back the 2x1 tile - one double per row, 2 rows via LDC; C = alpha*acc (+ old C unless TRMMKERNEL); CO += 8 */
- #if defined(_AIX)
- define(`SAVE2x1', `
- #else
- .macro SAVE2x1
- #endif
-
- mr T1, CO
-
- #ifndef TRMMKERNEL
- lxsdx vs0, 0, T1
- #endif
-
- #ifndef TRMMKERNEL
- xsmaddadp vs0, vs32, alpha_r
- #else
- xsmuldp vs0, vs32, alpha_r
- #endif
-
- stxsdx vs0, 0, T1
-
- add T1, T1, LDC
-
- #ifndef TRMMKERNEL
- lxsdx vs8, 0, T1
- #endif
-
- #ifndef TRMMKERNEL
- xsmaddadp vs8, vs40, alpha_r
- #else
- xsmuldp vs8, vs40, alpha_r
- #endif
-
- stxsdx vs8, 0, T1
-
- addi CO, CO, 8
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /*********************************************************************
- * Macros for N=1, M=16 *
- *********************************************************************/
-
- /* LOAD1x16_1: preload 16 A doubles (vs0-vs7) and 1 splatted B value (vs24) for the pipelined loop; AO += 128, BO += 8 */
- #if defined(_AIX)
- define(`LOAD1x16_1', `
- #else
- .macro LOAD1x16_1
- #endif
-
- lxvd2x vs0, 0, AO
- lxvd2x vs1, o16, AO
- lxvd2x vs2, o32, AO
- lxvd2x vs3, o48, AO
-
- lxvdsx vs24, 0, BO
-
- addi AO, AO, 64
- addi BO, BO, 8
-
- lxvd2x vs4, 0, AO
- lxvd2x vs5, o16, AO
- lxvd2x vs6, o32, AO
- lxvd2x vs7, o48, AO
-
- addi AO, AO, 64
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* KERNEL1x16_I1: pipeline start - fetch next operands into shadow regs (vs8-vs15, vs28) and init accumulators vs32-vs39 with mul */
- #if defined(_AIX)
- define(`KERNEL1x16_I1', `
- #else
- .macro KERNEL1x16_I1
- #endif
-
- lxvd2x vs8, 0, AO
- lxvd2x vs9, o16, AO
- lxvd2x vs10, o32, AO
- lxvd2x vs11, o48, AO
-
- lxvdsx vs28, 0, BO
-
- addi AO, AO, 64
- addi BO, BO, 8
-
- lxvd2x vs12, 0, AO
- lxvd2x vs13, o16, AO
- lxvd2x vs14, o32, AO
- lxvd2x vs15, o48, AO
-
- addi AO, AO, 64
-
-
- xvmuldp vs32, vs0, vs24
- xvmuldp vs33, vs1, vs24
- xvmuldp vs34, vs2, vs24
- xvmuldp vs35, vs3, vs24
- xvmuldp vs36, vs4, vs24
- xvmuldp vs37, vs5, vs24
- xvmuldp vs38, vs6, vs24
- xvmuldp vs39, vs7, vs24
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* KERNEL1x16_1: even step - fetch next operands into shadow regs, accumulate the current set (vs0-vs7, vs24) */
- #if defined(_AIX)
- define(`KERNEL1x16_1', `
- #else
- .macro KERNEL1x16_1
- #endif
-
- lxvd2x vs8, 0, AO
- lxvd2x vs9, o16, AO
- lxvd2x vs10, o32, AO
- lxvd2x vs11, o48, AO
-
- lxvdsx vs28, 0, BO
-
- addi AO, AO, 64
- addi BO, BO, 8
-
- lxvd2x vs12, 0, AO
- lxvd2x vs13, o16, AO
- lxvd2x vs14, o32, AO
- lxvd2x vs15, o48, AO
-
- addi AO, AO, 64
-
-
- xvmaddadp vs32, vs0, vs24
- xvmaddadp vs33, vs1, vs24
- xvmaddadp vs34, vs2, vs24
- xvmaddadp vs35, vs3, vs24
- xvmaddadp vs36, vs4, vs24
- xvmaddadp vs37, vs5, vs24
- xvmaddadp vs38, vs6, vs24
- xvmaddadp vs39, vs7, vs24
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* KERNEL1x16_2: odd step - fetch next operands into primary regs, accumulate the shadow set (vs8-vs15, vs28) */
- #if defined(_AIX)
- define(`KERNEL1x16_2', `
- #else
- .macro KERNEL1x16_2
- #endif
-
- lxvd2x vs0, 0, AO
- lxvd2x vs1, o16, AO
- lxvd2x vs2, o32, AO
- lxvd2x vs3, o48, AO
-
- lxvdsx vs24, 0, BO
-
- addi AO, AO, 64
- addi BO, BO, 8
-
- lxvd2x vs4, 0, AO
- lxvd2x vs5, o16, AO
- lxvd2x vs6, o32, AO
- lxvd2x vs7, o48, AO
-
- addi AO, AO, 64
-
-
- xvmaddadp vs32, vs8, vs28
- xvmaddadp vs33, vs9, vs28
- xvmaddadp vs34, vs10, vs28
- xvmaddadp vs35, vs11, vs28
- xvmaddadp vs36, vs12, vs28
- xvmaddadp vs37, vs13, vs28
- xvmaddadp vs38, vs14, vs28
- xvmaddadp vs39, vs15, vs28
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* KERNEL1x16_E2: pipeline drain - accumulate the last shadow set, no loads */
- #if defined(_AIX)
- define(`KERNEL1x16_E2', `
- #else
- .macro KERNEL1x16_E2
- #endif
-
-
- xvmaddadp vs32, vs8, vs28
- xvmaddadp vs33, vs9, vs28
- xvmaddadp vs34, vs10, vs28
- xvmaddadp vs35, vs11, vs28
- xvmaddadp vs36, vs12, vs28
- xvmaddadp vs37, vs13, vs28
- xvmaddadp vs38, vs14, vs28
- xvmaddadp vs39, vs15, vs28
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* KERNEL1x16_SUBI1: unpipelined first step for the k remainder - load and init accumulators with mul */
- #if defined(_AIX)
- define(`KERNEL1x16_SUBI1', `
- #else
- .macro KERNEL1x16_SUBI1
- #endif
-
- lxvd2x vs0, 0, AO
- lxvd2x vs1, o16, AO
- lxvd2x vs2, o32, AO
- lxvd2x vs3, o48, AO
-
- lxvdsx vs24, 0, BO
-
- addi AO, AO, 64
- addi BO, BO, 8
-
- lxvd2x vs4, 0, AO
- lxvd2x vs5, o16, AO
- lxvd2x vs6, o32, AO
- lxvd2x vs7, o48, AO
-
- addi AO, AO, 64
-
-
- xvmuldp vs32, vs0, vs24
- xvmuldp vs33, vs1, vs24
- xvmuldp vs34, vs2, vs24
- xvmuldp vs35, vs3, vs24
- xvmuldp vs36, vs4, vs24
- xvmuldp vs37, vs5, vs24
- xvmuldp vs38, vs6, vs24
- xvmuldp vs39, vs7, vs24
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* KERNEL1x16_SUB1: unpipelined step for the k remainder - load and accumulate */
- #if defined(_AIX)
- define(`KERNEL1x16_SUB1', `
- #else
- .macro KERNEL1x16_SUB1
- #endif
-
- lxvd2x vs0, 0, AO
- lxvd2x vs1, o16, AO
- lxvd2x vs2, o32, AO
- lxvd2x vs3, o48, AO
-
- lxvdsx vs24, 0, BO
-
- addi AO, AO, 64
- addi BO, BO, 8
-
- lxvd2x vs4, 0, AO
- lxvd2x vs5, o16, AO
- lxvd2x vs6, o32, AO
- lxvd2x vs7, o48, AO
-
- addi AO, AO, 64
-
-
- xvmaddadp vs32, vs0, vs24
- xvmaddadp vs33, vs1, vs24
- xvmaddadp vs34, vs2, vs24
- xvmaddadp vs35, vs3, vs24
- xvmaddadp vs36, vs4, vs24
- xvmaddadp vs37, vs5, vs24
- xvmaddadp vs38, vs6, vs24
- xvmaddadp vs39, vs7, vs24
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* SAVE1x16: write back the 1x16 tile - 16 doubles in one row via T1/T2; C = alpha*acc (+ old C unless TRMMKERNEL); CO += 128 */
- #if defined(_AIX)
- define(`SAVE1x16', `
- #else
- .macro SAVE1x16
- #endif
-
- mr T1, CO
- addi T2, T1, 64
-
- #ifndef TRMMKERNEL
- lxvd2x vs0, 0, T1
- lxvd2x vs1, o16, T1
- lxvd2x vs2, o32, T1
- lxvd2x vs3, o48, T1
-
- lxvd2x vs4, 0, T2
- lxvd2x vs5, o16, T2
- lxvd2x vs6, o32, T2
- lxvd2x vs7, o48, T2
- #endif
-
- #ifndef TRMMKERNEL
- xvmaddadp vs0, vs32, alpha_r
- xvmaddadp vs1, vs33, alpha_r
- xvmaddadp vs2, vs34, alpha_r
- xvmaddadp vs3, vs35, alpha_r
- xvmaddadp vs4, vs36, alpha_r
- xvmaddadp vs5, vs37, alpha_r
- xvmaddadp vs6, vs38, alpha_r
- xvmaddadp vs7, vs39, alpha_r
- #else
- xvmuldp vs0, vs32, alpha_r
- xvmuldp vs1, vs33, alpha_r
- xvmuldp vs2, vs34, alpha_r
- xvmuldp vs3, vs35, alpha_r
- xvmuldp vs4, vs36, alpha_r
- xvmuldp vs5, vs37, alpha_r
- xvmuldp vs6, vs38, alpha_r
- xvmuldp vs7, vs39, alpha_r
- #endif
-
- stxvd2x vs0, 0, T1
- stxvd2x vs1, o16, T1
- stxvd2x vs2, o32, T1
- stxvd2x vs3, o48, T1
-
- stxvd2x vs4, 0, T2
- stxvd2x vs5, o16, T2
- stxvd2x vs6, o32, T2
- stxvd2x vs7, o48, T2
-
- addi CO, CO, 128
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /*********************************************************************
- * Macros for N=1, M=8                                        *
- *********************************************************************/
-
- /* LOAD1x8_1: preload 8 A doubles (vs0-vs3) and 1 splatted B value (vs24) for the pipelined loop; AO += 64, BO += 8 */
- #if defined(_AIX)
- define(`LOAD1x8_1', `
- #else
- .macro LOAD1x8_1
- #endif
-
- lxvd2x vs0, 0, AO
- lxvd2x vs1, o16, AO
- lxvd2x vs2, o32, AO
- lxvd2x vs3, o48, AO
-
- lxvdsx vs24, 0, BO
-
- addi AO, AO, 64
- addi BO, BO, 8
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* KERNEL1x8_I1: pipeline start - fetch next operands into shadow regs (vs8-vs11, vs28) and init accumulators vs32-vs35 with mul */
- #if defined(_AIX)
- define(`KERNEL1x8_I1', `
- #else
- .macro KERNEL1x8_I1
- #endif
-
- lxvd2x vs8, 0, AO
- lxvd2x vs9, o16, AO
- lxvd2x vs10, o32, AO
- lxvd2x vs11, o48, AO
-
- lxvdsx vs28, 0, BO
-
- addi AO, AO, 64
- addi BO, BO, 8
-
-
- xvmuldp vs32, vs0, vs24
- xvmuldp vs33, vs1, vs24
- xvmuldp vs34, vs2, vs24
- xvmuldp vs35, vs3, vs24
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* KERNEL1x8_1: even step - fetch next operands into shadow regs, accumulate the current set (vs0-vs3, vs24) */
- #if defined(_AIX)
- define(`KERNEL1x8_1', `
- #else
- .macro KERNEL1x8_1
- #endif
-
- lxvd2x vs8, 0, AO
- lxvd2x vs9, o16, AO
- lxvd2x vs10, o32, AO
- lxvd2x vs11, o48, AO
-
- lxvdsx vs28, 0, BO
-
- addi AO, AO, 64
- addi BO, BO, 8
-
-
- xvmaddadp vs32, vs0, vs24
- xvmaddadp vs33, vs1, vs24
- xvmaddadp vs34, vs2, vs24
- xvmaddadp vs35, vs3, vs24
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* KERNEL1x8_2: odd step - fetch next operands into primary regs, accumulate the shadow set (vs8-vs11, vs28) */
- #if defined(_AIX)
- define(`KERNEL1x8_2', `
- #else
- .macro KERNEL1x8_2
- #endif
-
- lxvd2x vs0, 0, AO
- lxvd2x vs1, o16, AO
- lxvd2x vs2, o32, AO
- lxvd2x vs3, o48, AO
-
- lxvdsx vs24, 0, BO
-
- addi AO, AO, 64
- addi BO, BO, 8
-
-
- xvmaddadp vs32, vs8, vs28
- xvmaddadp vs33, vs9, vs28
- xvmaddadp vs34, vs10, vs28
- xvmaddadp vs35, vs11, vs28
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* KERNEL1x8_E2: pipeline drain - accumulate the last shadow set, no loads */
- #if defined(_AIX)
- define(`KERNEL1x8_E2', `
- #else
- .macro KERNEL1x8_E2
- #endif
-
-
- xvmaddadp vs32, vs8, vs28
- xvmaddadp vs33, vs9, vs28
- xvmaddadp vs34, vs10, vs28
- xvmaddadp vs35, vs11, vs28
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* KERNEL1x8_SUBI1: unpipelined first step for the k remainder - load and init accumulators with mul */
- #if defined(_AIX)
- define(`KERNEL1x8_SUBI1', `
- #else
- .macro KERNEL1x8_SUBI1
- #endif
-
- lxvd2x vs0, 0, AO
- lxvd2x vs1, o16, AO
- lxvd2x vs2, o32, AO
- lxvd2x vs3, o48, AO
-
- lxvdsx vs24, 0, BO
-
- addi AO, AO, 64
- addi BO, BO, 8
-
-
- xvmuldp vs32, vs0, vs24
- xvmuldp vs33, vs1, vs24
- xvmuldp vs34, vs2, vs24
- xvmuldp vs35, vs3, vs24
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* KERNEL1x8_SUB1: unpipelined step for the k remainder - load and accumulate */
- #if defined(_AIX)
- define(`KERNEL1x8_SUB1', `
- #else
- .macro KERNEL1x8_SUB1
- #endif
-
- lxvd2x vs0, 0, AO
- lxvd2x vs1, o16, AO
- lxvd2x vs2, o32, AO
- lxvd2x vs3, o48, AO
-
- lxvdsx vs24, 0, BO
-
- addi AO, AO, 64
- addi BO, BO, 8
-
-
- xvmaddadp vs32, vs0, vs24
- xvmaddadp vs33, vs1, vs24
- xvmaddadp vs34, vs2, vs24
- xvmaddadp vs35, vs3, vs24
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* SAVE1x8: write back the 1x8 tile - 8 doubles in one row; C = alpha*acc (+ old C unless TRMMKERNEL); CO += 64 */
- #if defined(_AIX)
- define(`SAVE1x8', `
- #else
- .macro SAVE1x8
- #endif
-
- mr T1, CO
-
- #ifndef TRMMKERNEL
- lxvd2x vs0, 0, T1
- lxvd2x vs1, o16, T1
- lxvd2x vs2, o32, T1
- lxvd2x vs3, o48, T1
- #endif
-
- #ifndef TRMMKERNEL
- xvmaddadp vs0, vs32, alpha_r
- xvmaddadp vs1, vs33, alpha_r
- xvmaddadp vs2, vs34, alpha_r
- xvmaddadp vs3, vs35, alpha_r
- #else
- xvmuldp vs0, vs32, alpha_r
- xvmuldp vs1, vs33, alpha_r
- xvmuldp vs2, vs34, alpha_r
- xvmuldp vs3, vs35, alpha_r
- #endif
-
- stxvd2x vs0, 0, T1
- stxvd2x vs1, o16, T1
- stxvd2x vs2, o32, T1
- stxvd2x vs3, o48, T1
-
- addi CO, CO, 64
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /*********************************************************************
- * Macros for N=1, M=4 *
- *********************************************************************/
-
- /* LOAD1x4_1: preload for the software-pipelined 1x4 kernel: loads
-    4 doubles of A into vs0/vs1 and splats one B double into vs24,
-    then advances AO by 32 bytes and BO by 8. The matching KERNEL1x4_*
-    macros consume these registers. */
- #if defined(_AIX)
- define(`LOAD1x4_1', `
- #else
- .macro LOAD1x4_1
- #endif
-
- lxvd2x vs0, 0, AO
- lxvd2x vs1, o16, AO
-
- lxvdsx vs24, 0, BO
-
- addi AO, AO, 32
- addi BO, BO, 8
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* KERNEL1x4_I1: pipelined kernel start: loads the NEXT A pair into the
-    alternate set vs8/vs9 and the next B splat into vs28 while
-    INITIALIZING the accumulators vs32/vs33 from the previously loaded
-    vs0/vs1 * vs24 with xvmuldp (no zeroing pass needed). */
- #if defined(_AIX)
- define(`KERNEL1x4_I1', `
- #else
- .macro KERNEL1x4_I1
- #endif
-
- lxvd2x vs8, 0, AO
- lxvd2x vs9, o16, AO
-
- lxvdsx vs28, 0, BO
-
- addi AO, AO, 32
- addi BO, BO, 8
-
-
- xvmuldp vs32, vs0, vs24
- xvmuldp vs33, vs1, vs24
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* KERNEL1x4_1: even pipeline step: loads the next A pair into vs8/vs9
-    and next B splat into vs28 while accumulating the previously loaded
-    vs0/vs1 * vs24 into vs32/vs33. Pairs with KERNEL1x4_2, which works
-    on the opposite register set. */
- #if defined(_AIX)
- define(`KERNEL1x4_1', `
- #else
- .macro KERNEL1x4_1
- #endif
-
- lxvd2x vs8, 0, AO
- lxvd2x vs9, o16, AO
-
- lxvdsx vs28, 0, BO
-
- addi AO, AO, 32
- addi BO, BO, 8
-
-
- xvmaddadp vs32, vs0, vs24
- xvmaddadp vs33, vs1, vs24
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* KERNEL1x4_2: odd pipeline step: loads the next A pair into vs0/vs1
-    and next B splat into vs24 while accumulating vs8/vs9 * vs28 loaded
-    by the preceding even step. */
- #if defined(_AIX)
- define(`KERNEL1x4_2', `
- #else
- .macro KERNEL1x4_2
- #endif
-
- lxvd2x vs0, 0, AO
- lxvd2x vs1, o16, AO
-
- lxvdsx vs24, 0, BO
-
- addi AO, AO, 32
- addi BO, BO, 8
-
-
- xvmaddadp vs32, vs8, vs28
- xvmaddadp vs33, vs9, vs28
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* KERNEL1x4_E2: pipeline drain: final accumulation of vs8/vs9 * vs28
-    loaded by the preceding step. Performs no loads and does not advance
-    AO/BO. */
- #if defined(_AIX)
- define(`KERNEL1x4_E2', `
- #else
- .macro KERNEL1x4_E2
- #endif
-
-
- xvmaddadp vs32, vs8, vs28
- xvmaddadp vs33, vs9, vs28
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* KERNEL1x4_SUBI1: one non-pipelined 1x4 step that INITIALIZES the
-    accumulators vs32/vs33 with xvmuldp, so no prior zeroing is
-    required. */
- #if defined(_AIX)
- define(`KERNEL1x4_SUBI1', `
- #else
- .macro KERNEL1x4_SUBI1
- #endif
-
- lxvd2x vs0, 0, AO
- lxvd2x vs1, o16, AO
-
- lxvdsx vs24, 0, BO
-
- addi AO, AO, 32
- addi BO, BO, 8
-
-
- xvmuldp vs32, vs0, vs24
- xvmuldp vs33, vs1, vs24
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* KERNEL1x4_SUB1: one non-pipelined 1x4 step that ACCUMULATES into the
-    existing vs32/vs33 accumulators with xvmaddadp. */
- #if defined(_AIX)
- define(`KERNEL1x4_SUB1', `
- #else
- .macro KERNEL1x4_SUB1
- #endif
-
- lxvd2x vs0, 0, AO
- lxvd2x vs1, o16, AO
-
- lxvdsx vs24, 0, BO
-
- addi AO, AO, 32
- addi BO, BO, 8
-
-
- xvmaddadp vs32, vs0, vs24
- xvmaddadp vs33, vs1, vs24
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* SAVE1x4: write back a 1x4 result tile (4 doubles = 32 bytes).
-    Non-TRMM: C += alpha_r * acc (loads C first, xvmaddadp).
-    TRMM:     C  = alpha_r * acc (xvmuldp, no C load).
-    Stores through T1 and advances CO by 32 bytes. */
- #if defined(_AIX)
- define(`SAVE1x4', `
- #else
- .macro SAVE1x4
- #endif
-
- mr T1, CO
-
- #ifndef TRMMKERNEL
- lxvd2x vs0, 0, T1
- lxvd2x vs1, o16, T1
- #endif
-
- #ifndef TRMMKERNEL
- xvmaddadp vs0, vs32, alpha_r
- xvmaddadp vs1, vs33, alpha_r
- #else
- xvmuldp vs0, vs32, alpha_r
- xvmuldp vs1, vs33, alpha_r
- #endif
-
- stxvd2x vs0, 0, T1
- stxvd2x vs1, o16, T1
-
- addi CO, CO, 32
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /*********************************************************************
- * Macros for N=1, M=2 *
- *********************************************************************/
-
- /* LOAD1x2_1: preload for the software-pipelined 1x2 kernel: loads
-    2 doubles of A into vs0 and splats one B double into vs24, then
-    advances AO by 16 bytes and BO by 8. */
- #if defined(_AIX)
- define(`LOAD1x2_1', `
- #else
- .macro LOAD1x2_1
- #endif
-
- lxvd2x vs0, 0, AO
-
- lxvdsx vs24, 0, BO
-
- addi AO, AO, 16
- addi BO, BO, 8
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* KERNEL1x2_I1: pipelined kernel start: loads the NEXT A/B values into
-    the alternate set vs8/vs28 while INITIALIZING the accumulator vs32
-    from the previously loaded vs0 * vs24 with xvmuldp (no zeroing pass
-    needed). */
- #if defined(_AIX)
- define(`KERNEL1x2_I1', `
- #else
- .macro KERNEL1x2_I1
- #endif
-
- lxvd2x vs8, 0, AO
-
- lxvdsx vs28, 0, BO
-
- addi AO, AO, 16
- addi BO, BO, 8
-
-
- xvmuldp vs32, vs0, vs24
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* KERNEL1x2_1: even pipeline step: loads the next A/B into vs8/vs28
-    while accumulating the previously loaded vs0 * vs24 into vs32. */
- #if defined(_AIX)
- define(`KERNEL1x2_1', `
- #else
- .macro KERNEL1x2_1
- #endif
-
- lxvd2x vs8, 0, AO
-
- lxvdsx vs28, 0, BO
-
- addi AO, AO, 16
- addi BO, BO, 8
-
-
- xvmaddadp vs32, vs0, vs24
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* KERNEL1x2_2: odd pipeline step: loads the next A/B into vs0/vs24
-    while accumulating vs8 * vs28 loaded by the preceding even step. */
- #if defined(_AIX)
- define(`KERNEL1x2_2', `
- #else
- .macro KERNEL1x2_2
- #endif
-
- lxvd2x vs0, 0, AO
-
- lxvdsx vs24, 0, BO
-
- addi AO, AO, 16
- addi BO, BO, 8
-
-
- xvmaddadp vs32, vs8, vs28
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* KERNEL1x2_E2: pipeline drain: final accumulation of vs8 * vs28
-    loaded by the preceding step; no loads, AO/BO unchanged. */
- #if defined(_AIX)
- define(`KERNEL1x2_E2', `
- #else
- .macro KERNEL1x2_E2
- #endif
-
-
- xvmaddadp vs32, vs8, vs28
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* KERNEL1x2_SUBI1: one non-pipelined 1x2 step that INITIALIZES the
-    accumulator vs32 with xvmuldp, so no prior zeroing is required. */
- #if defined(_AIX)
- define(`KERNEL1x2_SUBI1', `
- #else
- .macro KERNEL1x2_SUBI1
- #endif
-
- lxvd2x vs0, 0, AO
-
- lxvdsx vs24, 0, BO
-
- addi AO, AO, 16
- addi BO, BO, 8
-
-
- xvmuldp vs32, vs0, vs24
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* KERNEL1x2_SUB1: one non-pipelined 1x2 step that ACCUMULATES into the
-    existing vs32 accumulator with xvmaddadp. */
- #if defined(_AIX)
- define(`KERNEL1x2_SUB1', `
- #else
- .macro KERNEL1x2_SUB1
- #endif
-
- lxvd2x vs0, 0, AO
-
- lxvdsx vs24, 0, BO
-
- addi AO, AO, 16
- addi BO, BO, 8
-
-
- xvmaddadp vs32, vs0, vs24
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* SAVE1x2: write back a 1x2 result tile (2 doubles = 16 bytes).
-    Non-TRMM: C += alpha_r * acc (loads C first, xvmaddadp).
-    TRMM:     C  = alpha_r * acc (xvmuldp, no C load).
-    Stores through T1 and advances CO by 16 bytes. */
- #if defined(_AIX)
- define(`SAVE1x2', `
- #else
- .macro SAVE1x2
- #endif
-
- mr T1, CO
-
- #ifndef TRMMKERNEL
- lxvd2x vs0, 0, T1
- #endif
-
- #ifndef TRMMKERNEL
- xvmaddadp vs0, vs32, alpha_r
- #else
- xvmuldp vs0, vs32, alpha_r
- #endif
-
- stxvd2x vs0, 0, T1
-
- addi CO, CO, 16
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /*********************************************************************
- * Macros for N=1, M=1 *
- *********************************************************************/
-
- /* LOAD1x1_1: preload for the software-pipelined 1x1 kernel: scalar
-    loads (lxsdx) of one double of A into vs0 and one double of B into
-    vs24, then advances AO and BO by 8 bytes each. */
- #if defined(_AIX)
- define(`LOAD1x1_1', `
- #else
- .macro LOAD1x1_1
- #endif
-
- lxsdx vs0, 0, AO
-
- lxsdx vs24, 0, BO
-
- addi AO, AO, 8
- addi BO, BO, 8
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* KERNEL1x1_I1: pipelined kernel start (scalar): loads the NEXT A/B
-    doubles into the alternate set vs8/vs28 while INITIALIZING the
-    accumulator vs32 from the previously loaded vs0 * vs24 with the
-    scalar multiply xsmuldp (no zeroing pass needed). */
- #if defined(_AIX)
- define(`KERNEL1x1_I1', `
- #else
- .macro KERNEL1x1_I1
- #endif
-
- lxsdx vs8, 0, AO
-
- lxsdx vs28, 0, BO
-
- addi AO, AO, 8
- addi BO, BO, 8
-
-
- xsmuldp vs32, vs0, vs24
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* KERNEL1x1_1: even pipeline step (scalar): loads the next A/B doubles
-    into vs8/vs28 while accumulating the previously loaded vs0 * vs24
-    into vs32 with xsmaddadp. */
- #if defined(_AIX)
- define(`KERNEL1x1_1', `
- #else
- .macro KERNEL1x1_1
- #endif
-
- lxsdx vs8, 0, AO
-
- lxsdx vs28, 0, BO
-
- addi AO, AO, 8
- addi BO, BO, 8
-
-
- xsmaddadp vs32, vs0, vs24
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* KERNEL1x1_2: odd pipeline step (scalar): loads the next A/B doubles
-    into vs0/vs24 while accumulating vs8 * vs28 loaded by the preceding
-    even step. */
- #if defined(_AIX)
- define(`KERNEL1x1_2', `
- #else
- .macro KERNEL1x1_2
- #endif
-
- lxsdx vs0, 0, AO
-
- lxsdx vs24, 0, BO
-
- addi AO, AO, 8
- addi BO, BO, 8
-
-
- xsmaddadp vs32, vs8, vs28
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* KERNEL1x1_E2: pipeline drain (scalar): final accumulation of
-    vs8 * vs28 loaded by the preceding step; no loads, AO/BO unchanged. */
- #if defined(_AIX)
- define(`KERNEL1x1_E2', `
- #else
- .macro KERNEL1x1_E2
- #endif
-
-
- xsmaddadp vs32, vs8, vs28
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* KERNEL1x1_SUBI1: one non-pipelined 1x1 step (scalar) that
-    INITIALIZES the accumulator vs32 with xsmuldp, so no prior zeroing
-    is required. */
- #if defined(_AIX)
- define(`KERNEL1x1_SUBI1', `
- #else
- .macro KERNEL1x1_SUBI1
- #endif
-
- lxsdx vs0, 0, AO
-
- lxsdx vs24, 0, BO
-
- addi AO, AO, 8
- addi BO, BO, 8
-
-
- xsmuldp vs32, vs0, vs24
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* KERNEL1x1_SUB1: one non-pipelined 1x1 step (scalar) that ACCUMULATES
-    into the existing vs32 accumulator with xsmaddadp. */
- #if defined(_AIX)
- define(`KERNEL1x1_SUB1', `
- #else
- .macro KERNEL1x1_SUB1
- #endif
-
- lxsdx vs0, 0, AO
-
- lxsdx vs24, 0, BO
-
- addi AO, AO, 8
- addi BO, BO, 8
-
-
- xsmaddadp vs32, vs0, vs24
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
- /* SAVE1x1: write back a single result element (1 double = 8 bytes)
-    with scalar ops. Non-TRMM: C += alpha_r * acc (loads C, xsmaddadp).
-    TRMM: C = alpha_r * acc (xsmuldp, no C load). Stores through T1 and
-    advances CO by 8 bytes. */
- #if defined(_AIX)
- define(`SAVE1x1', `
- #else
- .macro SAVE1x1
- #endif
-
- mr T1, CO
-
- #ifndef TRMMKERNEL
- lxsdx vs0, 0, T1
- #endif
-
- #ifndef TRMMKERNEL
- xsmaddadp vs0, vs32, alpha_r
- #else
- xsmuldp vs0, vs32, alpha_r
- #endif
-
- stxsdx vs0, 0, T1
-
- addi CO, CO, 8
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
|