|
- /*********************************************************************/
- /* Copyright 2009, 2010 The University of Texas at Austin. */
- /* All rights reserved. */
- /* */
- /* Redistribution and use in source and binary forms, with or */
- /* without modification, are permitted provided that the following */
- /* conditions are met: */
- /* */
- /* 1. Redistributions of source code must retain the above */
- /* copyright notice, this list of conditions and the following */
- /* disclaimer. */
- /* */
- /* 2. Redistributions in binary form must reproduce the above */
- /* copyright notice, this list of conditions and the following */
- /* disclaimer in the documentation and/or other materials */
- /* provided with the distribution. */
- /* */
- /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
- /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
- /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
- /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
- /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
- /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
- /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
- /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
- /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
- /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
- /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
- /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
- /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
- /* POSSIBILITY OF SUCH DAMAGE. */
- /* */
- /* The views and conclusions contained in the software and */
- /* documentation are those of the authors and should not be */
- /* interpreted as representing official policies, either expressed */
- /* or implied, of The University of Texas at Austin. */
- /*********************************************************************/
-
- #define ASSEMBLER
- #include "common.h"
-
- #define M r3 /* row count: inner loops run M/8 times, then handle M&4, M&2, M&1 */
- #define N r4 /* column count: outer loops run N/4 times, then handle N&2, N&1 */
- #define A r6 /* matrix cursor; advanced by LDA once per column consumed */
- #define LDA r7 /* column stride of A; shifted left by BASE_SHIFT on entry */
- #define X r8 /* x cursor; stepped by INCX by the update-form loads */
- #define INCX r9 /* x stride; shifted left by BASE_SHIFT on entry */
- #define Y r10 /* y base pointer */
- #define INCY r5 /* y stride; loaded from 8(SP), shifted left by BASE_SHIFT */
- /* Loop counters. */
- #define I r11 /* not referenced by the code visible in this file */
- #define J r12 /* outer (column group) counter */
- /* r24-r31 are saved in the prologue before being used as working pointers. */
- #define INCY2 r24 /* INCY + INCY */
- #define A1 r25 /* column cursors for up to four columns */
- #define A2 r26
- #define A3 r27
- #define A4 r28
-
- #define YL r29 /* y load cursor */
- #define YS r30 /* y store cursor */
- #define INC2 r31 /* constant 2 * SIZE */
- /* yl*: y values loaded; ys*: accumulators that are stored back to y. */
- #define yl1 f0
- #define yl2 f2
- #define yl3 f3
- #define yl4 f4
- #define ys1 f5
- #define ys2 f6
- #define ys3 f7
- #define ys4 f8
- #define yl5 f27 /* fifth pair, used only by the unaligned-y paths */
- #define ys5 f28
- /* alpha1/alpha2: x values after multiplication by alpha. */
- #define alpha1 f9
- #define alpha2 f10
- /* a1..a16: values loaded from A (a few paths also use them as y scratch). */
- #define a1 f11
- #define a2 f12
- #define a3 f13
- #define a4 f14
- #define a5 f15
- #define a6 f16
- #define a7 f17
- #define a8 f18
-
- #define a9 f19
- #define a10 f20
- #define a11 f21
- #define a12 f22
- #define a13 f23
- #define a14 f24
- #define a15 f25
- #define a16 f26
-
- #define alpha f1 /* scalar multiplier; duplicated into both halves by fsmfp on entry */
-
- PROLOGUE
- PROFCODE
- /* Entry: save registers, scale strides, then choose a code path from the layout of y. */
- li r0, -16 /* SP step applied by each stfpdux below */
- lwz INCY, 8(SP) /* INCY is read from the stack before SP is moved */
- /* Push f14..f31, 16 bytes each. */
- stfpdux f14, SP, r0
- stfpdux f15, SP, r0
- stfpdux f16, SP, r0
- stfpdux f17, SP, r0
- stfpdux f18, SP, r0
- stfpdux f19, SP, r0
- stfpdux f20, SP, r0
- stfpdux f21, SP, r0
- stfpdux f22, SP, r0
- stfpdux f23, SP, r0
- stfpdux f24, SP, r0
- stfpdux f25, SP, r0
- stfpdux f26, SP, r0
- stfpdux f27, SP, r0
- stfpdux f28, SP, r0
- stfpdux f29, SP, r0
- stfpdux f30, SP, r0
- stfpdux f31, SP, r0
- /* Push r31 down to r16, 4 bytes each; .L999 pops them in reverse order. */
- stwu r31, -4(SP)
- stwu r30, -4(SP)
- stwu r29, -4(SP)
- stwu r28, -4(SP)
-
- stwu r27, -4(SP)
- stwu r26, -4(SP)
- stwu r25, -4(SP)
- stwu r24, -4(SP)
-
- stwu r23, -4(SP)
- stwu r22, -4(SP)
- stwu r21, -4(SP)
- stwu r20, -4(SP)
-
- stwu r19, -4(SP)
- stwu r18, -4(SP)
- stwu r17, -4(SP)
- stwu r16, -4(SP)
- /* NOTE(review): BASE_SHIFT is presumably log2(SIZE), turning element strides into byte strides -- confirm in common.h */
- slwi LDA, LDA, BASE_SHIFT
- slwi INCX, INCX, BASE_SHIFT
- slwi INCY, INCY, BASE_SHIFT
-
- fsmfp alpha, alpha /* copy the primary half of alpha into its secondary half */
- /* Nothing to compute when M <= 0 or N <= 0. */
- cmpwi cr0, M, 0
- ble- .L999
- cmpwi cr0, N, 0
- ble- .L999
-
- add INCY2, INCY, INCY /* INCY2 = 2 * INCY */
- li INC2, 2 * SIZE
- sub X, X, INCX /* pre-bias X: the update-form loads add INCX before each access */
-
- andi. r0, A, 2 * SIZE - 1 /* result unused: the branch below is commented out and cmpwi rewrites cr0 */
- # bne .L100
-
- # All cases for aligned A, even LDA
-
- cmpwi cr0, INCY, SIZE
- bne .L70 /* y stride is not one element: general-stride path */
-
- andi. r0, Y, 2 * SIZE - 1
- bne .L40 /* unit stride, but Y is not on a 2*SIZE boundary */
-
- # A : aligned LDA : even Y : Unit Aligned
- /* y is accessed as 2-element pairs. NOTE(review): fxcpmadd/fxcsmadd scale a pair by the primary/secondary half of their first source, per the double-FPU ISA -- confirm */
- sub A, A, INC2 /* pre-bias A and Y for the update-form accesses */
- sub Y, Y, INCY2
-
- srawi. J, N, 2 /* J = N / 4 groups of four columns */
- ble .L20
- .align 4
- /* One group: columns A1..A4, weighted by four alpha-scaled x values, accumulate into y. */
- .L11:
- LFDUX alpha1, X, INCX /* alpha1 <- two x values (primary, then secondary) */
- mr A1, A
- add A2, A, LDA
- add A3, A2, LDA
- LFSDUX alpha1, X, INCX
- LFDUX alpha2, X, INCX /* alpha2 <- the next two x values */
- add A4, A3, LDA
- add A, A4, LDA /* A now addresses the next group of columns */
- mr YL, Y /* YL reads y, YS writes y; both restart at the top for each group */
- LFSDUX alpha2, X, INCX
- fpmul alpha1, alpha, alpha1 /* multiply both halves by alpha */
- mr YS, Y
- srawi. r0, M, 3
- mtspr CTR, r0 /* CTR = M / 8 */
- fpmul alpha2, alpha, alpha2
- ble .L15 /* fewer than 8 rows: remainder code only */
- /* Preload four y pairs and four pairs from each column (8 rows). */
- LFPDUX yl1, YL, INCY2
- LFPDUX yl2, YL, INCY2
- LFPDUX yl3, YL, INCY2
- LFPDUX yl4, YL, INCY2
-
- LFPDUX a1, A1, INC2
- LFPDUX a5, A1, INC2
- LFPDUX a9, A1, INC2
- LFPDUX a13, A1, INC2
-
- LFPDUX a2, A2, INC2
- LFPDUX a6, A2, INC2
- LFPDUX a10, A2, INC2
- LFPDUX a14, A2, INC2
-
- LFPDUX a3, A3, INC2
- LFPDUX a7, A3, INC2
- LFPDUX a11, A3, INC2
- LFPDUX a15, A3, INC2
- /* Column 1 products start while column 4 is still being loaded. */
- LFPDUX a4, A4, INC2
- fxcpmadd ys1, alpha1, a1, yl1
- LFPDUX a8, A4, INC2
- fxcpmadd ys2, alpha1, a5, yl2
- LFPDUX a12, A4, INC2
- fxcpmadd ys3, alpha1, a9, yl3
- LFPDUX a16, A4, INC2
- fxcpmadd ys4, alpha1, a13, yl4
- bdz .L13 /* single iteration: go straight to the drain code */
- .align 4
- /* Steady state: finish the current 8 rows while loading the next 8. */
- .L12:
- LFPDUX yl1, YL, INCY2
- /* column 2 */
- fxcsmadd ys1, alpha1, a2, ys1
- LFPDUX a1, A1, INC2
- fxcsmadd ys2, alpha1, a6, ys2
- LFPDUX a5, A1, INC2
- fxcsmadd ys3, alpha1, a10, ys3
- LFPDUX a9, A1, INC2
- fxcsmadd ys4, alpha1, a14, ys4
- LFPDUX a13, A1, INC2
-
- LFPDUX yl2, YL, INCY2
- /* column 3 */
- fxcpmadd ys1, alpha2, a3, ys1
- LFPDUX a2, A2, INC2
- fxcpmadd ys2, alpha2, a7, ys2
- LFPDUX a6, A2, INC2
- fxcpmadd ys3, alpha2, a11, ys3
- LFPDUX a10, A2, INC2
- fxcpmadd ys4, alpha2, a15, ys4
- LFPDUX a14, A2, INC2
-
- LFPDUX yl3, YL, INCY2
- /* column 4 */
- fxcsmadd ys1, alpha2, a4, ys1
- LFPDUX a3, A3, INC2
- fxcsmadd ys2, alpha2, a8, ys2
- LFPDUX a7, A3, INC2
- fxcsmadd ys3, alpha2, a12, ys3
- LFPDUX a11, A3, INC2
- fxcsmadd ys4, alpha2, a16, ys4
- LFPDUX a15, A3, INC2
-
- LFPDUX yl4, YL, INCY2
- /* write back the 8 finished rows */
- STFPDUX ys1, YS, INCY2
- STFPDUX ys2, YS, INCY2
- STFPDUX ys3, YS, INCY2
- STFPDUX ys4, YS, INCY2
- /* column 1 of the next 8 rows */
- LFPDUX a4, A4, INC2
- fxcpmadd ys1, alpha1, a1, yl1
- LFPDUX a8, A4, INC2
- fxcpmadd ys2, alpha1, a5, yl2
- LFPDUX a12, A4, INC2
- fxcpmadd ys3, alpha1, a9, yl3
- LFPDUX a16, A4, INC2
- fxcpmadd ys4, alpha1, a13, yl4
- bdnz .L12
- .align 4
- /* Drain: columns 2..4 for the last preloaded 8 rows, no further loads. */
- .L13:
- fxcsmadd ys1, alpha1, a2, ys1
- fxcsmadd ys2, alpha1, a6, ys2
- fxcsmadd ys3, alpha1, a10, ys3
- fxcsmadd ys4, alpha1, a14, ys4
-
- fxcpmadd ys1, alpha2, a3, ys1
- fxcpmadd ys2, alpha2, a7, ys2
- fxcpmadd ys3, alpha2, a11, ys3
- fxcpmadd ys4, alpha2, a15, ys4
-
- fxcsmadd ys1, alpha2, a4, ys1
- fxcsmadd ys2, alpha2, a8, ys2
- fxcsmadd ys3, alpha2, a12, ys3
- fxcsmadd ys4, alpha2, a16, ys4
-
- STFPDUX ys1, YS, INCY2
- STFPDUX ys2, YS, INCY2
- STFPDUX ys3, YS, INCY2
- STFPDUX ys4, YS, INCY2
- .align 4
- /* Row remainder: M & 7, handled as 4 + 2 + 1. */
- .L15:
- andi. r0, M, 7
- ble .L19
-
- andi. r0, M, 4
- ble .L17
- /* four rows */
- LFPDUX yl1, YL, INCY2
- LFPDUX a1, A1, INC2
- LFPDUX yl2, YL, INCY2
- LFPDUX a5, A1, INC2
-
- LFPDUX a2, A2, INC2
- LFPDUX a6, A2, INC2
- LFPDUX a3, A3, INC2
- LFPDUX a7, A3, INC2
-
- LFPDUX a4, A4, INC2
- LFPDUX a8, A4, INC2
-
- fxcpmadd ys1, alpha1, a1, yl1
- fxcpmadd ys2, alpha1, a5, yl2
- fxcsmadd ys1, alpha1, a2, ys1
- fxcsmadd ys2, alpha1, a6, ys2
-
- fxcpmadd ys1, alpha2, a3, ys1
- fxcpmadd ys2, alpha2, a7, ys2
- fxcsmadd ys1, alpha2, a4, ys1
- fxcsmadd ys2, alpha2, a8, ys2
-
- STFPDUX ys1, YS, INCY2
- STFPDUX ys2, YS, INCY2
- .align 4
- /* two rows */
- .L17:
- andi. r0, M, 2
- ble .L18
-
- LFPDUX yl1, YL, INCY2
-
- LFPDUX a1, A1, INC2
- LFPDUX a2, A2, INC2
- LFPDUX a3, A3, INC2
- LFPDUX a4, A4, INC2
-
- fxcpmadd ys1, alpha1, a1, yl1
- fxcsmadd ys1, alpha1, a2, ys1
- fxcpmadd ys1, alpha2, a3, ys1
- fxcsmadd ys1, alpha2, a4, ys1
-
- STFPDUX ys1, YS, INCY2
- .align 4
- /* one row: single-element loads and a single-element store */
- .L18:
- andi. r0, M, 1
- ble .L19
-
- LFDUX yl1, YL, INCY2
-
- LFDUX a1, A1, INC2
- LFDUX a2, A2, INC2
- LFDUX a3, A3, INC2
- LFDUX a4, A4, INC2
-
- fxcpmadd ys1, alpha1, a1, yl1
- fxcsmadd ys1, alpha1, a2, ys1
- fxcpmadd ys1, alpha2, a3, ys1
- fxcsmadd ys1, alpha2, a4, ys1
-
- STFDUX ys1, YS, INCY2
- .align 4
- /* next group of four columns */
- .L19:
- addi J, J, -1
- cmpi cr0, 0, J, 0
- bgt .L11
- .align 4
-
- .L20: /* aligned unit-stride y: two leftover columns (N & 2) */
- andi. J, N, 2
- ble .L30
-
- LFDUX alpha1, X, INCX /* alpha1 <- two x values (primary, then secondary) */
-
- mr A1, A
- add A2, A, LDA
- add A, A2, LDA /* A now addresses the column after this pair */
- LFSDUX alpha1, X, INCX
-
- mr YL, Y /* YL reads y, YS writes y */
- mr YS, Y
- fpmul alpha1, alpha, alpha1 /* multiply both halves by alpha */
-
- srawi. r0, M, 3
- mtspr CTR, r0 /* CTR = M / 8 */
- ble .L25
- /* Preload 8 rows of y and of both columns. */
- LFPDUX yl1, YL, INCY2
- LFPDUX a1, A1, INC2
- LFPDUX yl2, YL, INCY2
- LFPDUX a5, A1, INC2
-
- LFPDUX yl3, YL, INCY2
- LFPDUX a9, A1, INC2
- LFPDUX yl4, YL, INCY2
- LFPDUX a13, A1, INC2
-
- LFPDUX a2, A2, INC2
- LFPDUX a6, A2, INC2
- LFPDUX a10, A2, INC2
- LFPDUX a14, A2, INC2
- bdz .L23 /* single iteration: drain only */
- .align 4
- /* Steady state: compute 8 rows while loading the next 8. */
- .L22:
- fxcpmadd ys1, alpha1, a1, yl1
- LFPDUX a1, A1, INC2
- LFPDUX yl1, YL, INCY2
- fxcpmadd ys2, alpha1, a5, yl2
- LFPDUX a5, A1, INC2
- LFPDUX yl2, YL, INCY2
- fxcpmadd ys3, alpha1, a9, yl3
- LFPDUX a9, A1, INC2
- LFPDUX yl3, YL, INCY2
- fxcpmadd ys4, alpha1, a13, yl4
- LFPDUX a13, A1, INC2
- LFPDUX yl4, YL, INCY2
- /* second column */
- fxcsmadd ys1, alpha1, a2, ys1
- LFPDUX a2, A2, INC2
- fxcsmadd ys2, alpha1, a6, ys2
- LFPDUX a6, A2, INC2
- fxcsmadd ys3, alpha1, a10, ys3
- LFPDUX a10, A2, INC2
- fxcsmadd ys4, alpha1, a14, ys4
- LFPDUX a14, A2, INC2
-
- STFPDUX ys1, YS, INCY2
- STFPDUX ys2, YS, INCY2
- STFPDUX ys3, YS, INCY2
- STFPDUX ys4, YS, INCY2
- bdnz .L22
- .align 4
- /* Drain the last preloaded 8 rows. */
- .L23:
- fxcpmadd ys1, alpha1, a1, yl1
- fxcpmadd ys2, alpha1, a5, yl2
- fxcpmadd ys3, alpha1, a9, yl3
- fxcpmadd ys4, alpha1, a13, yl4
-
- fxcsmadd ys1, alpha1, a2, ys1
- fxcsmadd ys2, alpha1, a6, ys2
- fxcsmadd ys3, alpha1, a10, ys3
- fxcsmadd ys4, alpha1, a14, ys4
-
- STFPDUX ys1, YS, INCY2
- STFPDUX ys2, YS, INCY2
- STFPDUX ys3, YS, INCY2
- STFPDUX ys4, YS, INCY2
- .align 4
- /* Row remainder: M & 7, handled as 4 + 2 + 1. */
- .L25:
- andi. r0, M, 7
- ble .L30
-
- andi. r0, M, 4
- ble .L27
- /* four rows */
- LFPDUX yl1, YL, INCY2
- LFPDUX a1, A1, INC2
- LFPDUX a2, A2, INC2
-
- LFPDUX yl2, YL, INCY2
- LFPDUX a5, A1, INC2
- LFPDUX a6, A2, INC2
-
- fxcpmadd ys1, alpha1, a1, yl1
- fxcsmadd ys1, alpha1, a2, ys1
- fxcpmadd ys2, alpha1, a5, yl2
- fxcsmadd ys2, alpha1, a6, ys2
-
- STFPDUX ys1, YS, INCY2
- STFPDUX ys2, YS, INCY2
- .align 4
- /* two rows */
- .L27:
- andi. r0, M, 2
- ble .L28
-
- LFPDUX yl1, YL, INCY2
- LFPDUX a1, A1, INC2
- LFPDUX a2, A2, INC2
-
- fxcpmadd ys1, alpha1, a1, yl1
- fxcsmadd ys1, alpha1, a2, ys1
-
- STFPDUX ys1, YS, INCY2
- .align 4
- /* one row: single-element loads and store */
- .L28:
- andi. r0, M, 1
- ble .L30
-
- LFDUX yl1, YL, INCY2
- LFDUX a1, A1, INC2
- LFDUX a2, A2, INC2
-
- fxcpmadd ys1, alpha1, a1, yl1
- fxcsmadd ys1, alpha1, a2, ys1
-
- STFDUX ys1, YS, INCY2
- .align 4
-
- .L30: /* aligned unit-stride y: last single column (N & 1) */
- andi. J, N, 1
- ble .L999
-
- LFDUX alpha1, X, INCX /* only the primary half of alpha1 is loaded and used here */
-
- mr A1, A
- mr YL, Y /* YL reads y, YS writes y */
- mr YS, Y
- fmul alpha1, alpha, alpha1 /* scalar multiply: alpha * x */
-
- srawi. r0, M, 3
- mtspr CTR, r0 /* CTR = M / 8 */
- ble .L35
- /* Preload 8 rows of y and of the column. */
- LFPDUX yl1, YL, INCY2
- LFPDUX a1, A1, INC2
- LFPDUX yl2, YL, INCY2
- LFPDUX a5, A1, INC2
-
- LFPDUX yl3, YL, INCY2
- LFPDUX a9, A1, INC2
- LFPDUX yl4, YL, INCY2
- LFPDUX a13, A1, INC2
- bdz .L33 /* single iteration: drain only */
- .align 4
- /* Steady state: compute 8 rows while loading the next 8. */
- .L32:
- fxcpmadd ys1, alpha1, a1, yl1
- LFPDUX yl1, YL, INCY2
- LFPDUX a1, A1, INC2
- fxcpmadd ys2, alpha1, a5, yl2
- LFPDUX yl2, YL, INCY2
- LFPDUX a5, A1, INC2
- fxcpmadd ys3, alpha1, a9, yl3
- LFPDUX yl3, YL, INCY2
- LFPDUX a9, A1, INC2
- fxcpmadd ys4, alpha1, a13, yl4
- LFPDUX yl4, YL, INCY2
- LFPDUX a13, A1, INC2
-
- STFPDUX ys1, YS, INCY2
- STFPDUX ys2, YS, INCY2
- STFPDUX ys3, YS, INCY2
- STFPDUX ys4, YS, INCY2
- bdnz .L32
- .align 4
- /* Drain the last preloaded 8 rows. */
- .L33:
- fxcpmadd ys1, alpha1, a1, yl1
- fxcpmadd ys2, alpha1, a5, yl2
- fxcpmadd ys3, alpha1, a9, yl3
- fxcpmadd ys4, alpha1, a13, yl4
-
- STFPDUX ys1, YS, INCY2
- STFPDUX ys2, YS, INCY2
- STFPDUX ys3, YS, INCY2
- STFPDUX ys4, YS, INCY2
- .align 4
- /* Row remainder: M & 7, handled as 4 + 2 + 1. */
- .L35:
- andi. r0, M, 7
- ble .L999
-
- andi. r0, M, 4
- ble .L37
- /* four rows */
- LFPDUX yl1, YL, INCY2
- LFPDUX a1, A1, INC2
-
- LFPDUX yl2, YL, INCY2
- LFPDUX a5, A1, INC2
-
- fxcpmadd ys1, alpha1, a1, yl1
- fxcpmadd ys2, alpha1, a5, yl2
-
- STFPDUX ys1, YS, INCY2
- STFPDUX ys2, YS, INCY2
- .align 4
- /* two rows */
- .L37:
- andi. r0, M, 2
- ble .L38
-
- LFPDUX yl1, YL, INCY2
- LFPDUX a1, A1, INC2
-
- fxcpmadd ys1, alpha1, a1, yl1
-
- STFPDUX ys1, YS, INCY2
- .align 4
- /* one row: single-element loads and store */
- .L38:
- andi. r0, M, 1
- ble .L999
-
- LFDUX yl1, YL, INCY2
- LFDUX a1, A1, INC2
-
- fxcpmadd ys1, alpha1, a1, yl1
-
- STFDUX ys1, YS, INCY2
- b .L999 /* this was the last column: restore registers and return */
- .align 4
-
- .L40:
- # A : aligned LDA : even Y : Unaligned
- /* y starts one element past a 2*SIZE boundary. NOTE(review): LFXDUX/STFXDUX presumably move a pair with its halves swapped, and fsmr/fmr copy only the secondary/primary half -- confirm against the double-FPU ISA */
- sub A, A, INC2 /* pre-bias A for the update-form loads */
- sub Y, Y, INCY /* Y now sits on the boundary one element below y[0] */
-
- srawi. J, N, 2 /* J = N / 4 groups of four columns */
- ble .L50
- .align 4
- /* One group of four columns. */
- .L41:
- LFDUX alpha1, X, INCX /* alpha1, alpha2 <- four x values */
- LFSDUX alpha1, X, INCX
- LFDUX alpha2, X, INCX
- LFSDUX alpha2, X, INCX
-
- fpmul alpha1, alpha, alpha1 /* multiply by alpha */
- fpmul alpha2, alpha, alpha2
-
- mr A1, A
- add A2, A, LDA
- add A3, A2, LDA
- add A4, A3, LDA
- add A, A4, LDA /* A now addresses the next group */
-
- mr YL, Y
- sub YS, Y, INCY2
- /* Prime the pipeline: the double just below y[0] goes to ys1 secondary, y[0] to yl1 primary. */
- LFSDX ys1, YS, INCY2
- LFDX yl1, YL, INCY
-
- srawi. r0, M, 3
- mtspr CTR, r0 /* CTR = M / 8 */
- ble .L45
-
- LFPDUX a1, A1, INC2
- LFPDUX a5, A1, INC2
- LFPDUX a9, A1, INC2
- LFPDUX a13, A1, INC2
- /* crossed loads of the next four aligned pairs of y */
- LFXDUX yl2, YL, INCY2
- LFXDUX yl3, YL, INCY2
- LFXDUX yl4, YL, INCY2
- LFXDUX yl5, YL, INCY2
-
- LFPDUX a2, A2, INC2
- LFPDUX a6, A2, INC2
- LFPDUX a10, A2, INC2
- LFPDUX a14, A2, INC2
-
- LFPDUX a3, A3, INC2
- LFPDUX a7, A3, INC2
- LFPDUX a11, A3, INC2
- LFPDUX a15, A3, INC2
- /* fsmr chain: shift secondary halves down so yl1..yl4 each hold two consecutive y values */
- LFPDUX a4, A4, INC2
- fsmr yl1, yl2
- LFPDUX a8, A4, INC2
- fsmr yl2, yl3
- LFPDUX a12, A4, INC2
- fsmr yl3, yl4
- LFPDUX a16, A4, INC2
- fsmr yl4, yl5
- bdz .L43 /* single iteration: drain only */
- .align 4
- /* Steady state: 8 rows per iteration, results land in ys2..ys5. */
- .L42:
- fxcpmadd ys2, alpha1, a1, yl1
- LFPDUX a1, A1, INC2
- fxcpmadd ys3, alpha1, a5, yl2
- LFPDUX a5, A1, INC2
- fxcpmadd ys4, alpha1, a9, yl3
- LFPDUX a9, A1, INC2
- fxcpmadd ys5, alpha1, a13, yl4
- LFPDUX a13, A1, INC2
-
- fxcsmadd ys2, alpha1, a2, ys2
- LFPDUX a2, A2, INC2
- fxcsmadd ys3, alpha1, a6, ys3
- LFPDUX a6, A2, INC2
- fxcsmadd ys4, alpha1, a10, ys4
- LFPDUX a10, A2, INC2
- fxcsmadd ys5, alpha1, a14, ys5
- LFPDUX a14, A2, INC2
-
- fxcpmadd ys2, alpha2, a3, ys2
- LFPDUX a3, A3, INC2
- fxcpmadd ys3, alpha2, a7, ys3
- LFPDUX a7, A3, INC2
- fxcpmadd ys4, alpha2, a11, ys4
- LFPDUX a11, A3, INC2
- fxcpmadd ys5, alpha2, a15, ys5
- LFPDUX a15, A3, INC2
-
- fxcsmadd ys2, alpha2, a4, ys2
- LFPDUX a4, A4, INC2
- fxcsmadd ys3, alpha2, a8, ys3
- LFPDUX a8, A4, INC2
- fxcsmadd ys4, alpha2, a12, ys4
- LFPDUX a12, A4, INC2
- fxcsmadd ys5, alpha2, a16, ys5
- LFPDUX a16, A4, INC2
- /* fmr chain: move each result's primary half down one register, next to the pending secondary half */
- fmr yl1, yl5
- LFXDUX yl2, YL, INCY2
- fmr ys1, ys2
- LFXDUX yl3, YL, INCY2
- fmr ys2, ys3
- LFXDUX yl4, YL, INCY2
- fmr ys3, ys4
- LFXDUX yl5, YL, INCY2
- fmr ys4, ys5
- /* crossed stores; ys1 secondary then keeps the value that is still pending */
- STFXDUX ys1, YS, INCY2
- fsmr ys1, ys5
- STFXDUX ys2, YS, INCY2
- fsmr yl1, yl2
- STFXDUX ys3, YS, INCY2
- fsmr yl2, yl3
- STFXDUX ys4, YS, INCY2
- fsmr yl3, yl4
-
- fsmr yl4, yl5
- bdnz .L42
- .align 4
- /* Drain the last preloaded 8 rows. */
- .L43:
- fxcpmadd ys2, alpha1, a1, yl1
- fxcpmadd ys3, alpha1, a5, yl2
- fxcpmadd ys4, alpha1, a9, yl3
- fxcpmadd ys5, alpha1, a13, yl4
-
- fxcsmadd ys2, alpha1, a2, ys2
- fxcsmadd ys3, alpha1, a6, ys3
- fxcsmadd ys4, alpha1, a10, ys4
- fxcsmadd ys5, alpha1, a14, ys5
-
- fxcpmadd ys2, alpha2, a3, ys2
- fxcpmadd ys3, alpha2, a7, ys3
- fxcpmadd ys4, alpha2, a11, ys4
- fxcpmadd ys5, alpha2, a15, ys5
-
- fxcsmadd ys2, alpha2, a4, ys2
- fxcsmadd ys3, alpha2, a8, ys3
- fxcsmadd ys4, alpha2, a12, ys4
- fxcsmadd ys5, alpha2, a16, ys5
-
- fmr ys1, ys2
- fmr ys2, ys3
- fmr ys3, ys4
- fmr ys4, ys5
- fmr yl1, yl5 /* yl1 primary = next unprocessed y element */
-
- STFXDUX ys1, YS, INCY2
- fsmr ys1, ys5 /* ys1 secondary = result still waiting to be stored */
- STFXDUX ys2, YS, INCY2
- STFXDUX ys3, YS, INCY2
- STFXDUX ys4, YS, INCY2
- .align 4
- /* Row remainder: M & 7, handled as 4 + 2 + 1. */
- .L45:
- andi. r0, M, 7
- ble .L48
-
- andi. r0, M, 4
- ble .L46
- /* four rows */
- LFXDUX yl2, YL, INCY2
- LFXDUX yl3, YL, INCY2
-
- LFPDUX a1, A1, INC2
- LFPDUX a5, A1, INC2
-
- LFPDUX a2, A2, INC2
- LFPDUX a6, A2, INC2
- LFPDUX a3, A3, INC2
- LFPDUX a7, A3, INC2
-
- LFPDUX a4, A4, INC2
- fsmr yl1, yl2
- LFPDUX a8, A4, INC2
- fsmr yl2, yl3
-
- fxcpmadd ys2, alpha1, a1, yl1
- fxcpmadd ys3, alpha1, a5, yl2
- fxcsmadd ys2, alpha1, a2, ys2
- fxcsmadd ys3, alpha1, a6, ys3
-
- fxcpmadd ys2, alpha2, a3, ys2
- fxcpmadd ys3, alpha2, a7, ys3
- fxcsmadd ys2, alpha2, a4, ys2
- fxcsmadd ys3, alpha2, a8, ys3
-
- fmr yl1, yl3
- fmr ys1, ys2
- fmr ys2, ys3
-
- STFXDUX ys1, YS, INCY2
- fsmr ys1, ys3
- STFXDUX ys2, YS, INCY2
- .align 4
- /* two rows */
- .L46:
- andi. r0, M, 2
- ble .L47
-
- LFXDUX yl2, YL, INCY2
-
- LFPDUX a1, A1, INC2
- LFPDUX a2, A2, INC2
- LFPDUX a3, A3, INC2
- LFPDUX a4, A4, INC2
-
- fsmr yl1, yl2
- fxcpmadd ys2, alpha1, a1, yl1
- fxcsmadd ys2, alpha1, a2, ys2
- fxcpmadd ys2, alpha2, a3, ys2
- fxcsmadd ys2, alpha2, a4, ys2
- fmr yl1, yl2
-
- fmr ys1, ys2
- STFXDUX ys1, YS, INCY2
- fsmr ys1, ys2
- .align 4
- /* one row: yl1 primary already holds the last y element */
- .L47:
- andi. r0, M, 1
- ble .L48
-
- LFDUX a1, A1, INC2
- LFDUX a2, A2, INC2
- LFDUX a3, A3, INC2
- LFDUX a4, A4, INC2
-
- fxcpmadd ys2, alpha1, a1, yl1
- fxcsmadd ys2, alpha1, a2, ys2
- fxcpmadd ys2, alpha2, a3, ys2
- fxcsmadd ys2, alpha2, a4, ys2
-
- STFSDX ys1, YS, INCY2 /* store the pending value from ys1 secondary */
- add YS, YS, INCY
- STFDX ys2, YS, INCY2 /* then the new last element, one element further on */
- b .L49
- .align 4
- /* No odd row: only the pending value in ys1 secondary remains to be stored. */
- .L48:
- STFSDUX ys1, YS, INCY2
- .align 4
- /* next group of four columns */
- .L49:
- addi J, J, -1
- cmpi cr0, 0, J, 0
- bgt .L41
- .align 4
-
- .L50: /* unaligned unit-stride y: two leftover columns (N & 2) */
- andi. J, N, 2
- ble .L60
-
- LFDUX alpha1, X, INCX /* alpha1 <- two x values (primary, then secondary) */
-
- mr A1, A
- add A2, A, LDA
- add A, A2, LDA /* A now addresses the column after this pair */
- LFSDUX alpha1, X, INCX
-
- mr YL, Y
- sub YS, Y, INCY2
- fpmul alpha1, alpha, alpha1 /* multiply both halves by alpha */
- /* Prime: the double at Y goes to ys1 secondary, the one at Y + INCY to yl1 primary. */
- LFSDX ys1, YS, INCY2
- LFDX yl1, YL, INCY
-
- srawi. r0, M, 3
- mtspr CTR, r0 /* CTR = M / 8 */
- ble .L55
-
- LFPDUX a1, A1, INC2
- LFPDUX a5, A1, INC2
- LFPDUX a9, A1, INC2
- LFPDUX a13, A1, INC2
- /* NOTE(review): LFXDUX presumably loads a pair with halves swapped -- confirm */
- LFXDUX yl2, YL, INCY2
- LFXDUX yl3, YL, INCY2
- LFXDUX yl4, YL, INCY2
- LFXDUX yl5, YL, INCY2
- /* fsmr chain shifts secondary halves down one register */
- LFPDUX a2, A2, INC2
- fsmr yl1, yl2
- LFPDUX a6, A2, INC2
- fsmr yl2, yl3
- LFPDUX a10, A2, INC2
- fsmr yl3, yl4
- LFPDUX a14, A2, INC2
- fsmr yl4, yl5
- bdz .L53 /* single iteration: drain only */
- .align 4
- /* Steady state: 8 rows per iteration, results land in ys2..ys5. */
- .L52:
- fxcpmadd ys2, alpha1, a1, yl1
- LFPDUX a1, A1, INC2
- fxcpmadd ys3, alpha1, a5, yl2
- LFPDUX a5, A1, INC2
- fxcpmadd ys4, alpha1, a9, yl3
- LFPDUX a9, A1, INC2
- fxcpmadd ys5, alpha1, a13, yl4
- LFPDUX a13, A1, INC2
-
- fxcsmadd ys2, alpha1, a2, ys2
- LFPDUX a2, A2, INC2
- fxcsmadd ys3, alpha1, a6, ys3
- LFPDUX a6, A2, INC2
- fxcsmadd ys4, alpha1, a10, ys4
- LFPDUX a10, A2, INC2
- fxcsmadd ys5, alpha1, a14, ys5
- LFPDUX a14, A2, INC2
- /* fmr chain moves each result's primary half down one register */
- fmr yl1, yl5
- LFXDUX yl2, YL, INCY2
- fmr ys1, ys2
- LFXDUX yl3, YL, INCY2
- fmr ys2, ys3
- LFXDUX yl4, YL, INCY2
- fmr ys3, ys4
- LFXDUX yl5, YL, INCY2
- fmr ys4, ys5
- /* crossed stores; ys1 secondary then keeps the value that is still pending */
- STFXDUX ys1, YS, INCY2
- fsmr ys1, ys5
- STFXDUX ys2, YS, INCY2
- fsmr yl1, yl2
- STFXDUX ys3, YS, INCY2
- fsmr yl2, yl3
- STFXDUX ys4, YS, INCY2
- fsmr yl3, yl4
-
- fsmr yl4, yl5
- bdnz .L52
- .align 4
- /* Drain the last preloaded 8 rows. */
- .L53:
- fxcpmadd ys2, alpha1, a1, yl1
- fxcpmadd ys3, alpha1, a5, yl2
- fxcpmadd ys4, alpha1, a9, yl3
- fxcpmadd ys5, alpha1, a13, yl4
-
- fxcsmadd ys2, alpha1, a2, ys2
- fxcsmadd ys3, alpha1, a6, ys3
- fxcsmadd ys4, alpha1, a10, ys4
- fxcsmadd ys5, alpha1, a14, ys5
-
- fmr yl1, yl5
- fmr ys1, ys2
- fmr ys2, ys3
- fmr ys3, ys4
- fmr ys4, ys5
-
- STFXDUX ys1, YS, INCY2
- fsmr ys1, ys5
- STFXDUX ys2, YS, INCY2
- STFXDUX ys3, YS, INCY2
- STFXDUX ys4, YS, INCY2
- .align 4
- /* Row remainder: M & 7, handled as 4 + 2 + 1. */
- .L55:
- andi. r0, M, 7
- ble .L59
-
- andi. r0, M, 4
- ble .L57
- /* four rows */
- LFXDUX yl2, YL, INCY2
- LFXDUX yl3, YL, INCY2
-
- LFPDUX a1, A1, INC2
- LFPDUX a2, A2, INC2
-
- LFPDUX a5, A1, INC2
- LFPDUX a6, A2, INC2
-
- fsmr yl1, yl2
- fsmr yl2, yl3
-
- fxcpmadd ys2, alpha1, a1, yl1
- fxcsmadd ys2, alpha1, a2, ys2
- fxcpmadd ys3, alpha1, a5, yl2
- fxcsmadd ys3, alpha1, a6, ys3
-
- fmr yl1, yl3
- fmr ys1, ys2
- fmr ys2, ys3
-
- STFXDUX ys1, YS, INCY2
- STFXDUX ys2, YS, INCY2
- fsmr ys1, ys3
- .align 4
- /* two rows */
- .L57:
- andi. r0, M, 2
- ble .L58
-
- LFXDUX yl2, YL, INCY2
- LFPDUX a1, A1, INC2
- LFPDUX a2, A2, INC2
-
- fsmr yl1, yl2
- fxcpmadd ys2, alpha1, a1, yl1
- fxcsmadd ys2, alpha1, a2, ys2
- fmr yl1, yl2
-
- fmr ys1, ys2
- STFXDUX ys1, YS, INCY2
- fsmr ys1, ys2
- .align 4
- /* one row: scalar fmadd writes ys1 primary; ys1 secondary still holds the pending value */
- .L58:
- andi. r0, M, 1
- ble .L59
-
- LFDUX a1, A1, INC2
- LFDUX a2, A2, INC2
-
- fxmr alpha2, alpha1 /* NOTE(review): presumably swaps halves so the second x value is reachable by scalar fmadd -- confirm */
- fmadd ys1, alpha1, a1, yl1
- fmadd ys1, alpha2, a2, ys1
-
- STFXDUX ys1, YS, INCY2 /* one crossed store writes the pending value and the new last element */
- b .L60
- .align 4
- /* No odd row: only the pending value in ys1 secondary remains to be stored. */
- .L59:
- STFSDUX ys1, YS, INCY2
- .align 4
-
- .L60: /* unaligned unit-stride y: last single column (N & 1) */
- andi. J, N, 1
- ble .L999
-
- LFDUX alpha1, X, INCX /* only the primary half of alpha1 is loaded and used here */
- mr A1, A
-
- mr YL, Y
- sub YS, Y, INCY2
-
- fmul alpha1, alpha, alpha1 /* scalar multiply: alpha * x */
- /* Prime: the double at Y goes to ys1 secondary, the one at Y + INCY to yl1 primary. */
- LFSDX ys1, YS, INCY2
- LFDX yl1, YL, INCY
-
- srawi. r0, M, 3
- mtspr CTR, r0 /* CTR = M / 8 */
- ble .L65
- /* NOTE(review): LFXDUX presumably loads a pair with halves swapped -- confirm */
- LFXDUX yl2, YL, INCY2
- LFXDUX yl3, YL, INCY2
- LFXDUX yl4, YL, INCY2
- LFXDUX yl5, YL, INCY2
-
- LFPDUX a1, A1, INC2
- LFPDUX a5, A1, INC2
- LFPDUX a9, A1, INC2
- LFPDUX a13, A1, INC2
- /* fsmr chain shifts secondary halves down one register */
- fsmr yl1, yl2
- fsmr yl2, yl3
- fsmr yl3, yl4
- fsmr yl4, yl5
- bdz .L63 /* single iteration: drain only */
- .align 4
- /* Steady state: 8 rows per iteration, results land in ys2..ys5. */
- .L62:
- fxcpmadd ys2, alpha1, a1, yl1
- LFPDUX a1, A1, INC2
- fxcpmadd ys3, alpha1, a5, yl2
- LFXDUX yl2, YL, INCY2
- fxcpmadd ys4, alpha1, a9, yl3
- LFXDUX yl3, YL, INCY2
- fxcpmadd ys5, alpha1, a13, yl4
- LFXDUX yl4, YL, INCY2
- /* fmr chain moves each result's primary half down one register */
- fmr yl1, yl5
- LFXDUX yl5, YL, INCY2
- fmr ys1, ys2
- LFPDUX a5, A1, INC2
- fmr ys2, ys3
- LFPDUX a9, A1, INC2
- fmr ys3, ys4
- LFPDUX a13, A1, INC2
- fmr ys4, ys5
- /* crossed stores; ys1 secondary then keeps the value that is still pending */
- STFXDUX ys1, YS, INCY2
- fsmr ys1, ys5
- STFXDUX ys2, YS, INCY2
- fsmr yl1, yl2
- STFXDUX ys3, YS, INCY2
- fsmr yl2, yl3
- STFXDUX ys4, YS, INCY2
- fsmr yl3, yl4
-
- fsmr yl4, yl5
- bdnz .L62
- .align 4
- /* Drain the last preloaded 8 rows. */
- .L63:
- fxcpmadd ys2, alpha1, a1, yl1
- fxcpmadd ys3, alpha1, a5, yl2
- fxcpmadd ys4, alpha1, a9, yl3
- fxcpmadd ys5, alpha1, a13, yl4
-
- fmr yl1, yl5
- fmr ys1, ys2
- fmr ys2, ys3
- fmr ys3, ys4
- fmr ys4, ys5
-
- STFXDUX ys1, YS, INCY2
- fsmr ys1, ys5
- STFXDUX ys2, YS, INCY2
- STFXDUX ys3, YS, INCY2
- STFXDUX ys4, YS, INCY2
- .align 4
- /* Row remainder: M & 7, handled as 4 + 2 + 1. */
- .L65:
- andi. r0, M, 7
- ble .L69
-
- andi. r0, M, 4
- ble .L67
- /* four rows */
- LFXDUX yl2, YL, INCY2
- LFXDUX yl3, YL, INCY2
-
- LFPDUX a1, A1, INC2
- LFPDUX a5, A1, INC2
-
- fsmr yl1, yl2
- fsmr yl2, yl3
-
- fxcpmadd ys2, alpha1, a1, yl1
- fxcpmadd ys3, alpha1, a5, yl2
-
- fmr yl1, yl3
- fmr ys1, ys2
- fmr ys2, ys3
-
- STFXDUX ys1, YS, INCY2
- fsmr ys1, ys3
- STFXDUX ys2, YS, INCY2
- .align 4
- /* two rows */
- .L67:
- andi. r0, M, 2
- ble .L68
-
- LFPDUX a1, A1, INC2
- LFXDUX yl2, YL, INCY2
-
- fsmr yl1, yl2
- fxcpmadd ys2, alpha1, a1, yl1
- fmr yl1, yl2
- fmr ys1, ys2
- STFXDUX ys1, YS, INCY2
- fsmr ys1, ys2
- .align 4
- /* one row: scalar fmadd writes ys1 primary; ys1 secondary still holds the pending value */
- .L68:
- andi. r0, M, 1
- ble .L69
-
- LFDUX a1, A1, INC2
- fmadd ys1, alpha1, a1, yl1
- STFXDUX ys1, YS, INCY2 /* one crossed store writes the pending value and the new last element */
- b .L999
- .align 4
- /* No odd row: only the pending value in ys1 secondary remains to be stored. */
- .L69:
- STFSDUX ys1, YS, INCY2
- b .L999
- .align 4
-
- .L70: /* general y stride: y pairs are gathered/scattered one element at a time */
- sub A, A, INC2 /* pre-bias A and Y for the update-form accesses */
- sub Y, Y, INCY
- srawi. J, N, 2 /* J = N / 4 groups of four columns */
- ble .L80
- .align 4
- /* One group of four columns. */
- .L71:
- LFDUX alpha1, X, INCX /* alpha1 <- two x values (primary, then secondary) */
- mr A1, A
- add A2, A, LDA
- add A3, A2, LDA
- LFSDUX alpha1, X, INCX
- LFDUX alpha2, X, INCX /* alpha2 <- the next two x values */
- add A4, A3, LDA
- add A, A4, LDA /* A now addresses the next group */
- mr YL, Y /* YL reads y, YS writes y */
- LFSDUX alpha2, X, INCX
- fpmul alpha1, alpha, alpha1 /* multiply both halves by alpha */
- mr YS, Y
- srawi. r0, M, 3
- mtspr CTR, r0 /* CTR = M / 8 */
- fpmul alpha2, alpha, alpha2
- ble .L75
- /* Preload: each y pair is built by LFDUX (primary half) then LFSDUX (secondary half). */
- LFDUX yl1, YL, INCY
- LFPDUX a1, A1, INC2
- LFPDUX a5, A1, INC2
- LFPDUX a9, A1, INC2
- LFPDUX a13, A1, INC2
- LFSDUX yl1, YL, INCY
-
- LFDUX yl2, YL, INCY
- LFPDUX a2, A2, INC2
- LFPDUX a6, A2, INC2
- LFPDUX a10, A2, INC2
- LFPDUX a14, A2, INC2
- LFSDUX yl2, YL, INCY
-
- LFDUX yl3, YL, INCY
- LFPDUX a3, A3, INC2
- LFPDUX a7, A3, INC2
- LFPDUX a11, A3, INC2
- LFPDUX a15, A3, INC2
- LFSDUX yl3, YL, INCY
-
- LFDUX yl4, YL, INCY
- LFPDUX a4, A4, INC2
- LFPDUX a8, A4, INC2
- LFPDUX a12, A4, INC2
- LFPDUX a16, A4, INC2
- LFSDUX yl4, YL, INCY
- bdz .L73 /* single iteration: drain only */
- .align 4
- /* Steady state: compute 8 rows while gathering the next 8. */
- .L72:
- fxcpmadd ys1, alpha1, a1, yl1
- LFPDUX a1, A1, INC2
- LFDUX yl1, YL, INCY
- fxcpmadd ys2, alpha1, a5, yl2
- LFPDUX a5, A1, INC2
- fxcpmadd ys3, alpha1, a9, yl3
- LFPDUX a9, A1, INC2
- fxcpmadd ys4, alpha1, a13, yl4
- LFPDUX a13, A1, INC2
- LFSDUX yl1, YL, INCY
- /* column 2 */
- fxcsmadd ys1, alpha1, a2, ys1
- LFPDUX a2, A2, INC2
- LFDUX yl2, YL, INCY
- fxcsmadd ys2, alpha1, a6, ys2
- LFPDUX a6, A2, INC2
- fxcsmadd ys3, alpha1, a10, ys3
- LFPDUX a10, A2, INC2
- fxcsmadd ys4, alpha1, a14, ys4
- LFPDUX a14, A2, INC2
- LFSDUX yl2, YL, INCY
- /* column 3 */
- fxcpmadd ys1, alpha2, a3, ys1
- LFPDUX a3, A3, INC2
- LFDUX yl3, YL, INCY
- fxcpmadd ys2, alpha2, a7, ys2
- LFPDUX a7, A3, INC2
- fxcpmadd ys3, alpha2, a11, ys3
- LFPDUX a11, A3, INC2
- fxcpmadd ys4, alpha2, a15, ys4
- LFPDUX a15, A3, INC2
- LFSDUX yl3, YL, INCY
- /* column 4 */
- fxcsmadd ys1, alpha2, a4, ys1
- LFPDUX a4, A4, INC2
- LFDUX yl4, YL, INCY
- fxcsmadd ys2, alpha2, a8, ys2
- LFPDUX a8, A4, INC2
- fxcsmadd ys3, alpha2, a12, ys3
- LFPDUX a12, A4, INC2
- fxcsmadd ys4, alpha2, a16, ys4
- LFPDUX a16, A4, INC2
- LFSDUX yl4, YL, INCY
- /* scatter: primary half, then secondary half, of each result */
- STFDUX ys1, YS, INCY
- STFSDUX ys1, YS, INCY
- STFDUX ys2, YS, INCY
- STFSDUX ys2, YS, INCY
- STFDUX ys3, YS, INCY
- STFSDUX ys3, YS, INCY
- STFDUX ys4, YS, INCY
- STFSDUX ys4, YS, INCY
- bdnz .L72
- .align 4
- /* Drain the last preloaded 8 rows. */
- .L73:
- fxcpmadd ys1, alpha1, a1, yl1
- fxcpmadd ys2, alpha1, a5, yl2
- fxcpmadd ys3, alpha1, a9, yl3
- fxcpmadd ys4, alpha1, a13, yl4
-
- fxcsmadd ys1, alpha1, a2, ys1
- fxcsmadd ys2, alpha1, a6, ys2
- fxcsmadd ys3, alpha1, a10, ys3
- fxcsmadd ys4, alpha1, a14, ys4
-
- fxcpmadd ys1, alpha2, a3, ys1
- fxcpmadd ys2, alpha2, a7, ys2
- fxcpmadd ys3, alpha2, a11, ys3
- fxcpmadd ys4, alpha2, a15, ys4
-
- fxcsmadd ys1, alpha2, a4, ys1
- fxcsmadd ys2, alpha2, a8, ys2
- fxcsmadd ys3, alpha2, a12, ys3
- fxcsmadd ys4, alpha2, a16, ys4
-
- STFDUX ys1, YS, INCY
- STFSDUX ys1, YS, INCY
- STFDUX ys2, YS, INCY
- STFSDUX ys2, YS, INCY
- STFDUX ys3, YS, INCY
- STFSDUX ys3, YS, INCY
- STFDUX ys4, YS, INCY
- STFSDUX ys4, YS, INCY
- .align 4
- /* Row remainder: M & 7, handled as 4 + 2 + 1. */
- .L75:
- andi. r0, M, 7
- ble .L79
-
- andi. r0, M, 4
- ble .L77
- /* four rows */
- LFDUX yl1, YL, INCY
- LFPDUX a1, A1, INC2
- LFPDUX a5, A1, INC2
- LFSDUX yl1, YL, INCY
- LFPDUX a2, A2, INC2
- LFPDUX a6, A2, INC2
-
- LFDUX yl2, YL, INCY
- LFPDUX a3, A3, INC2
- LFPDUX a7, A3, INC2
- LFSDUX yl2, YL, INCY
- LFPDUX a4, A4, INC2
- LFPDUX a8, A4, INC2
-
- fxcpmadd ys1, alpha1, a1, yl1
- fxcpmadd ys2, alpha1, a5, yl2
- fxcsmadd ys1, alpha1, a2, ys1
- fxcsmadd ys2, alpha1, a6, ys2
-
- fxcpmadd ys1, alpha2, a3, ys1
- fxcpmadd ys2, alpha2, a7, ys2
- fxcsmadd ys1, alpha2, a4, ys1
- fxcsmadd ys2, alpha2, a8, ys2
-
- STFDUX ys1, YS, INCY
- STFSDUX ys1, YS, INCY
- STFDUX ys2, YS, INCY
- STFSDUX ys2, YS, INCY
- .align 4
- /* two rows */
- .L77:
- andi. r0, M, 2
- ble .L78
-
- LFDUX yl1, YL, INCY
- LFPDUX a1, A1, INC2
- LFPDUX a2, A2, INC2
- LFSDUX yl1, YL, INCY
- LFPDUX a3, A3, INC2
- LFPDUX a4, A4, INC2
-
- fxcpmadd ys1, alpha1, a1, yl1
- fxcsmadd ys1, alpha1, a2, ys1
- fxcpmadd ys1, alpha2, a3, ys1
- fxcsmadd ys1, alpha2, a4, ys1
-
- STFDUX ys1, YS, INCY
- STFSDUX ys1, YS, INCY
- .align 4
- /* one row: only the primary halves are loaded and stored */
- .L78:
- andi. r0, M, 1
- ble .L79
-
- LFDUX yl1, YL, INCY
-
- LFDUX a1, A1, INC2
- LFDUX a2, A2, INC2
- LFDUX a3, A3, INC2
- LFDUX a4, A4, INC2
-
- fxcpmadd ys1, alpha1, a1, yl1
- fxcsmadd ys1, alpha1, a2, ys1
- fxcpmadd ys1, alpha2, a3, ys1
- fxcsmadd ys1, alpha2, a4, ys1
-
- STFDUX ys1, YS, INCY
- .align 4
- /* next group of four columns */
- .L79:
- addi J, J, -1
- cmpi cr0, 0, J, 0
- bgt .L71
- .align 4
-
- .L80: /* general y stride: two leftover columns (N & 2) */
- andi. J, N, 2
- ble .L90
-
- LFDUX alpha1, X, INCX /* alpha1 <- two x values (primary, then secondary) */
-
- mr A1, A
- add A2, A, LDA
- add A, A2, LDA /* A now addresses the column after this pair */
- LFSDUX alpha1, X, INCX
-
- mr YL, Y /* YL reads y, YS writes y */
- mr YS, Y
- fpmul alpha1, alpha, alpha1 /* multiply both halves by alpha */
-
- srawi. r0, M, 3
- mtspr CTR, r0 /* CTR = M / 8 */
- ble .L85
- /* Preload: even-position y elements into yl*, odd-position ones into a9..a12 (merged by fsmfp later). */
- LFDUX yl1, YL, INCY
- LFDUX a9, YL, INCY
- LFDUX yl2, YL, INCY
- LFDUX a10, YL, INCY
- /* In this section the first column uses a1, a5, a3, a7 and the second uses a2, a6, a4, a8. */
- LFPDUX a1, A1, INC2
- LFPDUX a5, A1, INC2
- LFPDUX a3, A1, INC2
- LFPDUX a7, A1, INC2
-
- LFDUX yl3, YL, INCY
- LFDUX a11, YL, INCY
- LFDUX yl4, YL, INCY
- LFDUX a12, YL, INCY
-
- LFPDUX a2, A2, INC2
- LFPDUX a6, A2, INC2
- LFPDUX a4, A2, INC2
- LFPDUX a8, A2, INC2
-
- bdz .L83 /* single iteration: drain only */
- .align 4
- /* Steady state: compute 8 rows while gathering the next 8. */
- .L82:
- fsmfp yl1, a9 /* yl secondary <- primary of the separately loaded element */
- fsmfp yl2, a10
- fsmfp yl3, a11
- fsmfp yl4, a12
-
- fxcpmadd ys1, alpha1, a1, yl1
- LFDUX yl1, YL, INCY
- LFDUX a9, YL, INCY
- LFPDUX a1, A1, INC2
- fxcpmadd ys2, alpha1, a5, yl2
- LFDUX yl2, YL, INCY
- LFDUX a10, YL, INCY
- LFPDUX a5, A1, INC2
- fxcpmadd ys3, alpha1, a3, yl3
- LFDUX yl3, YL, INCY
- LFDUX a11, YL, INCY
- LFPDUX a3, A1, INC2
- fxcpmadd ys4, alpha1, a7, yl4
- LFDUX yl4, YL, INCY
- LFDUX a12, YL, INCY
- LFPDUX a7, A1, INC2
- /* second column */
- fxcsmadd ys1, alpha1, a2, ys1
- LFPDUX a2, A2, INC2
- fxcsmadd ys2, alpha1, a6, ys2
- LFPDUX a6, A2, INC2
- fxcsmadd ys3, alpha1, a4, ys3
- LFPDUX a4, A2, INC2
- fxcsmadd ys4, alpha1, a8, ys4
- LFPDUX a8, A2, INC2
- /* scatter: primary half, then secondary half, of each result */
- STFDUX ys1, YS, INCY
- STFSDUX ys1, YS, INCY
- STFDUX ys2, YS, INCY
- STFSDUX ys2, YS, INCY
-
- STFDUX ys3, YS, INCY
- STFSDUX ys3, YS, INCY
- STFDUX ys4, YS, INCY
- STFSDUX ys4, YS, INCY
- bdnz .L82
- .align 4
- /* Drain the last preloaded 8 rows. */
- .L83:
- fsmfp yl1, a9
- fsmfp yl2, a10
- fsmfp yl3, a11
- fsmfp yl4, a12
-
- fxcpmadd ys1, alpha1, a1, yl1
- fxcpmadd ys2, alpha1, a5, yl2
- fxcpmadd ys3, alpha1, a3, yl3
- fxcpmadd ys4, alpha1, a7, yl4
-
- fxcsmadd ys1, alpha1, a2, ys1
- fxcsmadd ys2, alpha1, a6, ys2
- fxcsmadd ys3, alpha1, a4, ys3
- fxcsmadd ys4, alpha1, a8, ys4
-
- STFDUX ys1, YS, INCY
- STFSDUX ys1, YS, INCY
- STFDUX ys2, YS, INCY
- STFSDUX ys2, YS, INCY
- STFDUX ys3, YS, INCY
- STFSDUX ys3, YS, INCY
- STFDUX ys4, YS, INCY
- STFSDUX ys4, YS, INCY
- .align 4
- /* Row remainder: M & 7, handled as 4 + 2 + 1. */
- .L85:
- andi. r0, M, 7
- ble .L90
-
- andi. r0, M, 4
- ble .L87
- /* four rows */
- LFDUX yl1, YL, INCY
- LFPDUX a1, A1, INC2
- LFPDUX a2, A2, INC2
- LFSDUX yl1, YL, INCY
- LFDUX yl2, YL, INCY
- LFPDUX a5, A1, INC2
- LFPDUX a6, A2, INC2
- LFSDUX yl2, YL, INCY
-
- fxcpmadd ys1, alpha1, a1, yl1
- fxcpmadd ys2, alpha1, a5, yl2
- fxcsmadd ys1, alpha1, a2, ys1
- fxcsmadd ys2, alpha1, a6, ys2
-
- STFDUX ys1, YS, INCY
- STFSDUX ys1, YS, INCY
- STFDUX ys2, YS, INCY
- STFSDUX ys2, YS, INCY
- .align 4
- /* two rows */
- .L87:
- andi. r0, M, 2
- ble .L88
-
- LFDUX yl1, YL, INCY
- LFPDUX a1, A1, INC2
- LFPDUX a2, A2, INC2
- LFSDUX yl1, YL, INCY
-
- fxcpmadd ys1, alpha1, a1, yl1
- fxcsmadd ys1, alpha1, a2, ys1
-
- STFDUX ys1, YS, INCY
- STFSDUX ys1, YS, INCY
- .align 4
- /* one row: only the primary halves are loaded and stored */
- .L88:
- andi. r0, M, 1
- ble .L90
-
- LFDUX yl1, YL, INCY
- LFDUX a1, A1, INC2
- LFDUX a2, A2, INC2
-
- fxcpmadd ys1, alpha1, a1, yl1
- fxcsmadd ys1, alpha1, a2, ys1
-
- STFDUX ys1, YS, INCY
- .align 4
-
- .L90: /* general y stride: last single column (N & 1) */
- andi. J, N, 1
- ble .L999
-
- LFDUX alpha1, X, INCX /* only the primary half of alpha1 is loaded and used here */
-
- mr A1, A
- mr YL, Y /* YL reads y, YS writes y */
- mr YS, Y
- fmul alpha1, alpha, alpha1 /* scalar multiply: alpha * x */
-
- srawi. r0, M, 3
- mtspr CTR, r0 /* CTR = M / 8 */
- ble .L95
- /* Preload: even-position y into yl* primary, odd-position y into the secondary of a2/a4/a6/a8. */
- LFDUX yl1, YL, INCY
- LFSDUX a2, YL, INCY
- LFDUX yl2, YL, INCY
- LFSDUX a4, YL, INCY
- LFDUX yl3, YL, INCY
- LFSDUX a6, YL, INCY
- LFDUX yl4, YL, INCY
- LFSDUX a8, YL, INCY
-
- LFPDUX a1, A1, INC2
- LFPDUX a5, A1, INC2
- LFPDUX a9, A1, INC2
- LFPDUX a13, A1, INC2
- bdz .L93 /* single iteration: drain only */
- .align 4
- /* Steady state: compute 8 rows while gathering the next 8. */
- .L92:
- fmr a2, yl1 /* complete the pair: primary half comes from yl* */
- fmr a4, yl2
- fmr a6, yl3
- fmr a8, yl4
-
- fxcpmadd ys1, alpha1, a1, a2
- LFDUX yl1, YL, INCY
- LFSDUX a2, YL, INCY
- fxcpmadd ys2, alpha1, a5, a4
- LFDUX yl2, YL, INCY
- LFSDUX a4, YL, INCY
- fxcpmadd ys3, alpha1, a9, a6
- LFDUX yl3, YL, INCY
- LFSDUX a6, YL, INCY
- fxcpmadd ys4, alpha1, a13, a8
- LFDUX yl4, YL, INCY
- LFSDUX a8, YL, INCY
-
- LFPDUX a1, A1, INC2
- LFPDUX a5, A1, INC2
- LFPDUX a9, A1, INC2
- LFPDUX a13, A1, INC2
- /* scatter: primary half, then secondary half, of each result */
- STFDUX ys1, YS, INCY
- STFSDUX ys1, YS, INCY
- STFDUX ys2, YS, INCY
- STFSDUX ys2, YS, INCY
- STFDUX ys3, YS, INCY
- STFSDUX ys3, YS, INCY
- STFDUX ys4, YS, INCY
- STFSDUX ys4, YS, INCY
- bdnz .L92
- .align 4
- /* Drain the last preloaded 8 rows. */
- .L93:
- fmr a2, yl1
- fmr a4, yl2
- fmr a6, yl3
- fmr a8, yl4
-
- fxcpmadd ys1, alpha1, a1, a2
- fxcpmadd ys2, alpha1, a5, a4
- fxcpmadd ys3, alpha1, a9, a6
- fxcpmadd ys4, alpha1, a13, a8
-
- STFDUX ys1, YS, INCY
- STFSDUX ys1, YS, INCY
- STFDUX ys2, YS, INCY
- STFSDUX ys2, YS, INCY
- STFDUX ys3, YS, INCY
- STFSDUX ys3, YS, INCY
- STFDUX ys4, YS, INCY
- STFSDUX ys4, YS, INCY
- .align 4
- /* Row remainder: M & 7, handled as 4 + 2 + 1. */
- .L95:
- andi. r0, M, 7
- ble .L999
-
- andi. r0, M, 4
- ble .L97
- /* four rows: operand roles are swapped here, each half of an A pair scales alpha1; only primary results are stored */
- LFPDUX a1, A1, INC2
- LFDUX yl1, YL, INCY
- LFDUX yl2, YL, INCY
- LFPDUX a2, A1, INC2
- LFDUX yl3, YL, INCY
- LFDUX yl4, YL, INCY
-
- fxcpmadd ys1, a1, alpha1, yl1
- fxcsmadd ys2, a1, alpha1, yl2
- fxcpmadd ys3, a2, alpha1, yl3
- fxcsmadd ys4, a2, alpha1, yl4
-
- STFDUX ys1, YS, INCY
- STFDUX ys2, YS, INCY
- STFDUX ys3, YS, INCY
- STFDUX ys4, YS, INCY
- .align 4
- /* two rows, same scheme */
- .L97:
- andi. r0, M, 2
- ble .L98
-
- LFPDUX a1, A1, INC2
- LFDUX yl1, YL, INCY
- LFDUX yl2, YL, INCY
-
- fxcpmadd ys1, a1, alpha1, yl1
- fxcsmadd ys2, a1, alpha1, yl2
-
- STFDUX ys1, YS, INCY
- STFDUX ys2, YS, INCY
- .align 4
- /* one row */
- .L98:
- andi. r0, M, 1
- ble .L999
-
- LFDUX yl1, YL, INCY
- LFDUX a1, A1, INC2
-
- fxcpmadd ys1, alpha1, a1, yl1
-
- STFDUX ys1, YS, INCY
- b .L999
- .align 4
-
-
- .L999: /* common exit: restore r16..r31 and f31..f14 in reverse push order, then return */
- addi SP, SP, -4 /* bias SP so the first lwzu (which adds 4) reads the r16 slot */
-
- lwzu r16, 4(SP)
- lwzu r17, 4(SP)
- lwzu r18, 4(SP)
- lwzu r19, 4(SP)
-
- lwzu r20, 4(SP)
- lwzu r21, 4(SP)
- lwzu r22, 4(SP)
- lwzu r23, 4(SP)
-
- lwzu r24, 4(SP)
- lwzu r25, 4(SP)
- lwzu r26, 4(SP)
- lwzu r27, 4(SP)
-
- lwzu r28, 4(SP)
- lwzu r29, 4(SP)
- lwzu r30, 4(SP)
- lwzu r31, 4(SP)
- /* SP is at the r31 slot; -12 then +16 on the first lfpdux lands on the f31 slot 4 bytes above it. */
- subi SP, SP, 12
- li r0, 16
-
- lfpdux f31, SP, r0
- lfpdux f30, SP, r0
- lfpdux f29, SP, r0
- lfpdux f28, SP, r0
- lfpdux f27, SP, r0
- lfpdux f26, SP, r0
- lfpdux f25, SP, r0
- lfpdux f24, SP, r0
- lfpdux f23, SP, r0
- lfpdux f22, SP, r0
- lfpdux f21, SP, r0
- lfpdux f20, SP, r0
- lfpdux f19, SP, r0
- lfpdux f18, SP, r0
- lfpdux f17, SP, r0
- lfpdux f16, SP, r0
- lfpdux f15, SP, r0
- lfpdux f14, SP, r0
- addi SP, SP, 16 /* step past the f14 slot: SP is back at its entry value */
- blr
-
- EPILOGUE
|