- #define ASSEMBLER
-
- #include "common.h"
- #define N $r4
- #define ALPHAR $f0
- #define ALPHAI $f1
- #define X $r5
- #define INCX $r6
- #define BETAR $f2
- #define BETAI $f3
- #define Y $r7
- #define INCY $r8
-
- #define I $r12
- #define TEMP $r13
- #define t1 $r14
- #define t2 $r16
- #define t3 $r15
- #define t4 $r17
- #define XX $r18
- #define YY $r19
- #define a1 $f12
- #define a2 $f13
- #define a3 $f14
- #define a4 $f15
- #define s1 $f16
- #define s2 $f17
- #define s3 $f18
- #define s4 $f19
- #define VX0 $xr8
- #define VX1 $xr20
- #define VX2 $xr21
- #define VX3 $xr22
- #define VXAR $xr23
- #define VXAI $xr19
- #define VXBR $xr14
- #define VXBI $xr13
- #define VXZ $xr12
- #define x1 $xr18
- #define x2 $xr17
- #define x3 $xr16
- #define x4 $xr15
-
- PROLOGUE
-
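- // Computes y := alpha*x + beta*y over N complex elements, where alpha = ALPHAR + i*ALPHAI
- // and beta = BETAR + i*BETAI, using 256-bit LASX vectors. Unit-stride inputs use full
- // vector loads/stores; other strides insert/extract lanes element by element, and the
- // scalar loop at .L998 handles zero strides and the vector-width tail.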
- bge $r0, N, .L999
- movgr2fr.d a1, $r0
- FFINT a1, a1 // a1 = 0.0, reference value for the alpha/beta comparisons below
- slli.d INCX, INCX, ZBASE_SHIFT
- slli.d INCY, INCY, ZBASE_SHIFT
- MTG t1, ALPHAR
- MTG t2, ALPHAI
- MTG t3, BETAR
- MTG t4, BETAI
- #ifdef DOUBLE
- xvreplgr2vr.d VXAR, t1
- xvreplgr2vr.d VXAI, t2
- xvreplgr2vr.d VXBR, t3
- xvreplgr2vr.d VXBI, t4
- #else
- xvreplgr2vr.w VXAR, t1
- xvreplgr2vr.w VXAI, t2
- xvreplgr2vr.w VXBR, t3
- xvreplgr2vr.w VXBI, t4
- #endif
- xvxor.v VXZ, VXZ, VXZ // VXZ = 0
- // Fall back to the element-by-element loop when incx == 0 || incy == 0
- // (their bitwise AND is zero in that case)
- and TEMP, INCX, INCY
- or I, N, N // I = N
- beqz TEMP, .L998
-
- li.d TEMP, 1
- slli.d TEMP, TEMP, ZBASE_SHIFT
- #ifdef DOUBLE
- srai.d I, N, 2
- #else
- srai.d I, N, 3
- #endif
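- // Each vector iteration handles 4 (double) or 8 (single) complex elements;
- // TEMP is the byte stride of one complex element, used to detect unit strides.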
- bne INCX, TEMP, .L20
- bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
- b .L11 // INCX==1 and INCY==1
- .L20:
- bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
- b .L21 // INCX!=1 and INCY==1
-
- .L11:
- bge $r0, I, .L997
- CMPEQ $fcc0, BETAR, a1
- CMPEQ $fcc1, BETAI, a1
- CMPEQ $fcc2, ALPHAR, a1
- CMPEQ $fcc3, ALPHAI, a1
- bceqz $fcc0, .L13
- bceqz $fcc1, .L13
- b .L14
- .align 3
-
- .L13:
- bceqz $fcc2, .L114
- bceqz $fcc3, .L114 //!(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0)
- b .L113 //!(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
-
- .L14:
- bceqz $fcc2, .L112
- bceqz $fcc3, .L112 //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0)
- b .L111 //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
- .align 3
-
- .L111: //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
- xvst VXZ, Y, 0 * SIZE
- #ifdef DOUBLE
- xvst VXZ, Y, 4 * SIZE
- addi.d Y, Y, 8 * SIZE
- #else
- xvst VXZ, Y, 8 * SIZE
- addi.d Y, Y, 16 * SIZE
- #endif
- addi.d I, I, -1
- blt $r0, I, .L111
- b .L997
- .align 3
-
- .L112: //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0)
- #ifdef DOUBLE
- xvld VX0, X, 0 * SIZE
- xvld VX1, X, 4 * SIZE
- xvpickev.d x1, VX1, VX0
- xvpickod.d x2, VX1, VX0
- #else
- xvld VX0, X, 0 * SIZE
- xvld VX1, X, 8 * SIZE
- xvpickev.w x1, VX1, VX0
- xvpickod.w x2, VX1, VX0
- #endif
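- // x1 = Re(x), x2 = Im(x); alpha*x:
- // x3 = alpha_r*Re(x) - alpha_i*Im(x), x4 = alpha_r*Im(x) + alpha_i*Re(x)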
- XVFMUL x3, VXAI, x2
- XVFMUL x4, VXAI, x1
- XVMSUB x3, VXAR, x1, x3
- XVFMADD x4, VXAR, x2, x4
- #ifdef DOUBLE
- xvilvl.d VX2, x4, x3
- xvilvh.d VX3, x4, x3
- xvst VX2, Y, 0 * SIZE
- xvst VX3, Y, 4 * SIZE
- addi.d X, X, 8 * SIZE
- addi.d Y, Y, 8 * SIZE
- #else
- xvilvl.w VX2, x4, x3
- xvilvh.w VX3, x4, x3
- xvst VX2, Y, 0 * SIZE
- xvst VX3, Y, 8 * SIZE
- addi.d X, X, 16 * SIZE
- addi.d Y, Y, 16 * SIZE
- #endif
- addi.d I, I, -1
- blt $r0, I, .L112
- b .L997
- .align 3
-
- .L113: //!(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
- #ifdef DOUBLE
- xvld VX0, Y, 0 * SIZE
- xvld VX1, Y, 4 * SIZE
- xvpickev.d x1, VX1, VX0
- xvpickod.d x2, VX1, VX0
- #else
- xvld VX0, Y, 0 * SIZE
- xvld VX1, Y, 8 * SIZE
- xvpickev.w x1, VX1, VX0
- xvpickod.w x2, VX1, VX0
- #endif
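- // x1 = Re(y), x2 = Im(y); beta*y:
- // x3 = beta_r*Re(y) - beta_i*Im(y), x4 = beta_r*Im(y) + beta_i*Re(y)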
- XVFMUL x3, VXBI, x2
- XVFMUL x4, VXBI, x1
- XVMSUB x3, VXBR, x1, x3
- XVFMADD x4, VXBR, x2, x4
- #ifdef DOUBLE
- xvilvl.d VX2, x4, x3
- xvilvh.d VX3, x4, x3
- xvst VX2, Y, 0 * SIZE
- xvst VX3, Y, 4 * SIZE
- addi.d Y, Y, 8 * SIZE
- #else
- xvilvl.w VX2, x4, x3
- xvilvh.w VX3, x4, x3
- xvst VX2, Y, 0 * SIZE
- xvst VX3, Y, 8 * SIZE
- addi.d Y, Y, 16 * SIZE
- #endif
- addi.d I, I, -1
- blt $r0, I, .L113
- b .L997
- .align 3
-
- .L114: //!(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0)
- #ifdef DOUBLE
- xvld VX0, X, 0 * SIZE
- xvld VX1, X, 4 * SIZE
- xvld VX2, Y, 0 * SIZE
- xvld VX3, Y, 4 * SIZE
- xvpickev.d x1, VX1, VX0
- xvpickod.d x2, VX1, VX0
- xvpickev.d x3, VX3, VX2
- xvpickod.d x4, VX3, VX2
- #else
- xvld VX0, X, 0 * SIZE
- xvld VX1, X, 8 * SIZE
- xvld VX2, Y, 0 * SIZE
- xvld VX3, Y, 8 * SIZE
- xvpickev.w x1, VX1, VX0
- xvpickod.w x2, VX1, VX0
- xvpickev.w x3, VX3, VX2
- xvpickod.w x4, VX3, VX2
- #endif
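- // x1/x2 = Re/Im of x, x3/x4 = Re/Im of y; VX0/VX1 = Re/Im of alpha*x,
- // VX2/VX3 = Re/Im of beta*y, so x3/x4 below hold Re/Im of alpha*x + beta*y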
- XVFMUL VX0, VXAI, x2
- XVFMUL VX1, VXAI, x1
- XVFMUL VX2, VXBI, x4
- XVFMUL VX3, VXBI, x3
- XVMSUB VX0, VXAR, x1, VX0
- XVFMADD VX1, VXAR, x2, VX1
- XVMSUB VX2, VXBR, x3, VX2
- XVFMADD VX3, VXBR, x4, VX3
- XVFADD x3, VX0, VX2
- XVFADD x4, VX1, VX3
- #ifdef DOUBLE
- xvilvl.d VX2, x4, x3
- xvilvh.d VX3, x4, x3
- xvst VX2, Y, 0 * SIZE
- xvst VX3, Y, 4 * SIZE
- addi.d X, X, 8 * SIZE
- addi.d Y, Y, 8 * SIZE
- #else
- xvilvl.w VX2, x4, x3
- xvilvh.w VX3, x4, x3
- xvst VX2, Y, 0 * SIZE
- xvst VX3, Y, 8 * SIZE
- addi.d X, X, 16 * SIZE
- addi.d Y, Y, 16 * SIZE
- #endif
- addi.d I, I, -1
- blt $r0, I, .L114
- b .L997
- .align 3
-
- .L12: // INCX==1 and INCY!=1
- bge $r0, I, .L997
- move YY, Y
- .align 3
-
- .L121:
- #ifdef DOUBLE
- xvld VX0, X, 0 * SIZE
- ld.d t1, Y, 0 * SIZE
- ld.d t2, Y, 1 * SIZE
- add.d Y, Y, INCY
- ld.d t3, Y, 0 * SIZE
- ld.d t4, Y, 1 * SIZE
- add.d Y, Y, INCY
- xvinsgr2vr.d x3, t1, 0
- xvinsgr2vr.d x4, t2, 0
- xvinsgr2vr.d x3, t3, 2
- xvinsgr2vr.d x4, t4, 2
-
- xvld VX1, X, 4 * SIZE
- ld.d t1, Y, 0 * SIZE
- ld.d t2, Y, 1 * SIZE
- add.d Y, Y, INCY
- ld.d t3, Y, 0 * SIZE
- ld.d t4, Y, 1 * SIZE
- xvinsgr2vr.d x3, t1, 1
- xvinsgr2vr.d x4, t2, 1
- xvinsgr2vr.d x3, t3, 3
- xvinsgr2vr.d x4, t4, 3
- add.d Y, Y, INCY
-
- xvpickev.d x1, VX1, VX0
- xvpickod.d x2, VX1, VX0
- xvfmul.d VX0, VXAI, x2
- xvfmul.d VX1, VXAI, x1
- xvfmul.d VX2, VXBI, x4
- xvfmul.d VX3, VXBI, x3
- xvfmsub.d VX0, VXAR, x1, VX0
- xvfmadd.d VX1, VXAR, x2, VX1
- xvfmsub.d VX2, VXBR, x3, VX2
- xvfmadd.d VX3, VXBR, x4, VX3
- xvfadd.d x3, VX0, VX2
- xvfadd.d x4, VX1, VX3
- addi.d I, I, -1
- xvstelm.d x3, YY, 0 * SIZE, 0
- xvstelm.d x4, YY, 1 * SIZE, 0
- add.d YY, YY, INCY
- xvstelm.d x3, YY, 0 * SIZE, 2
- xvstelm.d x4, YY, 1 * SIZE, 2
- add.d YY, YY, INCY
- xvstelm.d x3, YY, 0 * SIZE, 1
- xvstelm.d x4, YY, 1 * SIZE, 1
- add.d YY, YY, INCY
- xvstelm.d x3, YY, 0 * SIZE, 3
- xvstelm.d x4, YY, 1 * SIZE, 3
- add.d YY, YY, INCY
- addi.d X, X, 8 * SIZE
- blt $r0, I, .L121
- b .L997
- .align 3
- #else
- xvld VX0, X, 0 * SIZE
- ld.d t1, Y, 0 * SIZE
- ld.d t2, Y, 1 * SIZE
- add.d Y, Y, INCY
- ld.d t3, Y, 0 * SIZE
- ld.d t4, Y, 1 * SIZE
- add.d Y, Y, INCY
- xvinsgr2vr.w x3, t1, 0
- xvinsgr2vr.w x4, t2, 0
- xvinsgr2vr.w x3, t3, 1
- xvinsgr2vr.w x4, t4, 1
- xvld VX1, X, 8 * SIZE
- ld.d t1, Y, 0 * SIZE
- ld.d t2, Y, 1 * SIZE
- add.d Y, Y, INCY
- ld.d t3, Y, 0 * SIZE
- ld.d t4, Y, 1 * SIZE
- xvinsgr2vr.w x3, t1, 4
- xvinsgr2vr.w x4, t2, 4
- xvinsgr2vr.w x3, t3, 5
- xvinsgr2vr.w x4, t4, 5
- add.d Y, Y, INCY
- ld.d t1, Y, 0 * SIZE
- ld.d t2, Y, 1 * SIZE
- add.d Y, Y, INCY
- ld.d t3, Y, 0 * SIZE
- ld.d t4, Y, 1 * SIZE
- add.d Y, Y, INCY
- xvinsgr2vr.w x3, t1, 2
- xvinsgr2vr.w x4, t2, 2
- xvinsgr2vr.w x3, t3, 3
- xvinsgr2vr.w x4, t4, 3
- ld.d t1, Y, 0 * SIZE
- ld.d t2, Y, 1 * SIZE
- add.d Y, Y, INCY
- ld.d t3, Y, 0 * SIZE
- ld.d t4, Y, 1 * SIZE
- xvinsgr2vr.w x3, t1, 6
- xvinsgr2vr.w x4, t2, 6
- xvinsgr2vr.w x3, t3, 7
- xvinsgr2vr.w x4, t4, 7
- add.d Y, Y, INCY
-
- xvpickev.w x1, VX1, VX0
- xvpickod.w x2, VX1, VX0
- XVFMUL VX0, VXAI, x2
- XVFMUL VX1, VXAI, x1
- XVFMUL VX2, VXBI, x4
- XVFMUL VX3, VXBI, x3
- XVMSUB VX0, VXAR, x1, VX0
- XVFMADD VX1, VXAR, x2, VX1
- XVMSUB VX2, VXBR, x3, VX2
- XVFMADD VX3, VXBR, x4, VX3
- XVFADD x3, VX0, VX2
- XVFADD x4, VX1, VX3
- addi.d I, I, -1
- xvstelm.w x3, YY, 0 * SIZE, 0
- xvstelm.w x4, YY, 1 * SIZE, 0
- add.d YY, YY, INCY
- xvstelm.w x3, YY, 0 * SIZE, 1
- xvstelm.w x4, YY, 1 * SIZE, 1
- add.d YY, YY, INCY
- xvstelm.w x3, YY, 0 * SIZE, 4
- xvstelm.w x4, YY, 1 * SIZE, 4
- add.d YY, YY, INCY
- xvstelm.w x3, YY, 0 * SIZE, 5
- xvstelm.w x4, YY, 1 * SIZE, 5
- add.d YY, YY, INCY
- xvstelm.w x3, YY, 0 * SIZE, 2
- xvstelm.w x4, YY, 1 * SIZE, 2
- add.d YY, YY, INCY
- xvstelm.w x3, YY, 0 * SIZE, 3
- xvstelm.w x4, YY, 1 * SIZE, 3
- add.d YY, YY, INCY
- xvstelm.w x3, YY, 0 * SIZE, 6
- xvstelm.w x4, YY, 1 * SIZE, 6
- add.d YY, YY, INCY
- xvstelm.w x3, YY, 0 * SIZE, 7
- xvstelm.w x4, YY, 1 * SIZE, 7
- add.d YY, YY, INCY
- addi.d X, X, 16 * SIZE
- blt $r0, I, .L121
- b .L997
- .align 3
- #endif
-
- .L21: // INCX!=1 and INCY==1
- bge $r0, I, .L997
- .align 3
-
- .L211:
- #ifdef DOUBLE
- xvld VX2, Y, 0 * SIZE
- ld.d t1, X, 0 * SIZE
- ld.d t2, X, 1 * SIZE
- add.d X, X, INCX
- ld.d t3, X, 0 * SIZE
- ld.d t4, X, 1 * SIZE
- add.d X, X, INCX
- xvinsgr2vr.d x1, t1, 0
- xvinsgr2vr.d x2, t2, 0
- xvinsgr2vr.d x1, t3, 2
- xvinsgr2vr.d x2, t4, 2
- xvld VX3, Y, 4 * SIZE
- ld.d t1, X, 0 * SIZE
- ld.d t2, X, 1 * SIZE
- add.d X, X, INCX
- ld.d t3, X, 0 * SIZE
- ld.d t4, X, 1 * SIZE
- xvinsgr2vr.d x1, t1, 1
- xvinsgr2vr.d x2, t2, 1
- xvinsgr2vr.d x1, t3, 3
- xvinsgr2vr.d x2, t4, 3
- add.d X, X, INCX
-
- xvpickev.d x3, VX3, VX2
- xvpickod.d x4, VX3, VX2
- xvfmul.d VX0, VXAI, x2
- xvfmul.d VX1, VXAI, x1
- xvfmul.d VX2, VXBI, x4
- xvfmul.d VX3, VXBI, x3
- xvfmsub.d VX0, VXAR, x1, VX0
- xvfmadd.d VX1, VXAR, x2, VX1
- xvfmsub.d VX2, VXBR, x3, VX2
- xvfmadd.d VX3, VXBR, x4, VX3
- xvfadd.d x3, VX0, VX2
- xvfadd.d x4, VX1, VX3
- xvilvl.d VX2, x4, x3
- xvilvh.d VX3, x4, x3
- addi.d I, I, -1
- xvst VX2, Y, 0 * SIZE
- xvst VX3, Y, 4 * SIZE
- addi.d Y, Y, 8 * SIZE
- blt $r0, I, .L211
- b .L997
- .align 3
- #else
- xvld VX2, Y, 0 * SIZE
- ld.d t1, X, 0 * SIZE
- ld.d t2, X, 1 * SIZE
- add.d X, X, INCX
- ld.d t3, X, 0 * SIZE
- ld.d t4, X, 1 * SIZE
- add.d X, X, INCX
- xvinsgr2vr.w x1, t1, 0
- xvinsgr2vr.w x2, t2, 0
- xvinsgr2vr.w x1, t3, 1
- xvinsgr2vr.w x2, t4, 1
- xvld VX3, Y, 8 * SIZE
- ld.d t1, X, 0 * SIZE
- ld.d t2, X, 1 * SIZE
- add.d X, X, INCX
- ld.d t3, X, 0 * SIZE
- ld.d t4, X, 1 * SIZE
- add.d X, X, INCX
- xvinsgr2vr.w x1, t1, 4
- xvinsgr2vr.w x2, t2, 4
- xvinsgr2vr.w x1, t3, 5
- xvinsgr2vr.w x2, t4, 5
- ld.d t1, X, 0 * SIZE
- ld.d t2, X, 1 * SIZE
- add.d X, X, INCX
- ld.d t3, X, 0 * SIZE
- ld.d t4, X, 1 * SIZE
- add.d X, X, INCX
- xvinsgr2vr.w x1, t1, 2
- xvinsgr2vr.w x2, t2, 2
- xvinsgr2vr.w x1, t3, 3
- xvinsgr2vr.w x2, t4, 3
- ld.d t1, X, 0 * SIZE
- ld.d t2, X, 1 * SIZE
- add.d X, X, INCX
- ld.d t3, X, 0 * SIZE
- ld.d t4, X, 1 * SIZE
- xvinsgr2vr.w x1, t1, 6
- xvinsgr2vr.w x2, t2, 6
- xvinsgr2vr.w x1, t3, 7
- xvinsgr2vr.w x2, t4, 7
- add.d X, X, INCX
-
- xvpickev.w x3, VX3, VX2
- xvpickod.w x4, VX3, VX2
- XVFMUL VX0, VXAI, x2
- XVFMUL VX1, VXAI, x1
- XVFMUL VX2, VXBI, x4
- XVFMUL VX3, VXBI, x3
- XVMSUB VX0, VXAR, x1, VX0
- XVFMADD VX1, VXAR, x2, VX1
- XVMSUB VX2, VXBR, x3, VX2
- XVFMADD VX3, VXBR, x4, VX3
- XVFADD x3, VX0, VX2
- XVFADD x4, VX1, VX3
- xvilvl.w VX2, x4, x3
- xvilvh.w VX3, x4, x3
- addi.d I, I, -1
- xvst VX2, Y, 0 * SIZE
- xvst VX3, Y, 8 * SIZE
- addi.d Y, Y, 16 * SIZE
- blt $r0, I, .L211
- b .L997
- .align 3
- #endif
-
- .L22: // INCX!=1 and INCY!=1
- bge $r0, I, .L997
- move YY, Y
- CMPEQ $fcc0, BETAR, a1
- CMPEQ $fcc1, BETAI, a1
- CMPEQ $fcc2, ALPHAR, a1
- CMPEQ $fcc3, ALPHAI, a1
- bceqz $fcc0, .L23
- bceqz $fcc1, .L23
- b .L24
- .align 3
-
- .L23:
- bceqz $fcc2, .L224
- bceqz $fcc3, .L224 //!(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0)
- b .L223 //!(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
- .align 3
-
- .L24:
- bceqz $fcc2, .L222
- bceqz $fcc3, .L222 //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0)
- b .L221 //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
- .align 3
-
- .L221: //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
- #ifdef DOUBLE
- // zero both the real and imaginary part of each y element
- xvstelm.d VXZ, Y, 0 * SIZE, 0
- xvstelm.d VXZ, Y, 1 * SIZE, 0
- add.d Y, Y, INCY
- xvstelm.d VXZ, Y, 0 * SIZE, 0
- xvstelm.d VXZ, Y, 1 * SIZE, 0
- add.d Y, Y, INCY
- xvstelm.d VXZ, Y, 0 * SIZE, 0
- xvstelm.d VXZ, Y, 1 * SIZE, 0
- add.d Y, Y, INCY
- xvstelm.d VXZ, Y, 0 * SIZE, 0
- xvstelm.d VXZ, Y, 1 * SIZE, 0
- add.d Y, Y, INCY
- addi.d I, I, -1
- blt $r0, I, .L221
- b .L997
- .align 3
- #else
- // zero both the real and imaginary part of each y element
- xvstelm.w VXZ, Y, 0 * SIZE, 0
- xvstelm.w VXZ, Y, 1 * SIZE, 0
- add.d Y, Y, INCY
- xvstelm.w VXZ, Y, 0 * SIZE, 0
- xvstelm.w VXZ, Y, 1 * SIZE, 0
- add.d Y, Y, INCY
- xvstelm.w VXZ, Y, 0 * SIZE, 0
- xvstelm.w VXZ, Y, 1 * SIZE, 0
- add.d Y, Y, INCY
- xvstelm.w VXZ, Y, 0 * SIZE, 0
- xvstelm.w VXZ, Y, 1 * SIZE, 0
- add.d Y, Y, INCY
- xvstelm.w VXZ, Y, 0 * SIZE, 0
- xvstelm.w VXZ, Y, 1 * SIZE, 0
- add.d Y, Y, INCY
- xvstelm.w VXZ, Y, 0 * SIZE, 0
- xvstelm.w VXZ, Y, 1 * SIZE, 0
- add.d Y, Y, INCY
- xvstelm.w VXZ, Y, 0 * SIZE, 0
- xvstelm.w VXZ, Y, 1 * SIZE, 0
- add.d Y, Y, INCY
- xvstelm.w VXZ, Y, 0 * SIZE, 0
- xvstelm.w VXZ, Y, 1 * SIZE, 0
- add.d Y, Y, INCY
- addi.d I, I, -1
- blt $r0, I, .L221
- b .L997
- .align 3
- #endif
-
- .L222: //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0)
- #ifdef DOUBLE
- ld.d t1, X, 0 * SIZE
- ld.d t2, X, 1 * SIZE
- add.d X, X, INCX
- ld.d t3, X, 0 * SIZE
- ld.d t4, X, 1 * SIZE
- add.d X, X, INCX
- xvinsgr2vr.d x1, t1, 0
- xvinsgr2vr.d x2, t2, 0
- xvinsgr2vr.d x1, t3, 1
- xvinsgr2vr.d x2, t4, 1
-
- ld.d t1, X, 0 * SIZE
- ld.d t2, X, 1 * SIZE
- add.d X, X, INCX
- ld.d t3, X, 0 * SIZE
- ld.d t4, X, 1 * SIZE
- xvinsgr2vr.d x1, t1, 2
- xvinsgr2vr.d x2, t2, 2
- xvinsgr2vr.d x1, t3, 3
- xvinsgr2vr.d x2, t4, 3
- add.d X, X, INCX
- xvfmul.d x3, VXAI, x2
- xvfmul.d x4, VXAI, x1
- xvfmsub.d x3, VXAR, x1, x3
- xvfmadd.d x4, VXAR, x2, x4
- addi.d I, I, -1
- xvstelm.d x3, YY, 0 * SIZE, 0
- xvstelm.d x4, YY, 1 * SIZE, 0
- add.d YY, YY, INCY
- xvstelm.d x3, YY, 0 * SIZE, 1
- xvstelm.d x4, YY, 1 * SIZE, 1
- add.d YY, YY, INCY
- xvstelm.d x3, YY, 0 * SIZE, 2
- xvstelm.d x4, YY, 1 * SIZE, 2
- add.d YY, YY, INCY
- xvstelm.d x3, YY, 0 * SIZE, 3
- xvstelm.d x4, YY, 1 * SIZE, 3
- add.d YY, YY, INCY
- blt $r0, I, .L222
- move Y, YY
- b .L997
- .align 3
- #else
- ld.d t1, X, 0 * SIZE
- ld.d t2, X, 1 * SIZE
- add.d X, X, INCX
- ld.d t3, X, 0 * SIZE
- ld.d t4, X, 1 * SIZE
- add.d X, X, INCX
- xvinsgr2vr.w x1, t1, 0
- xvinsgr2vr.w x2, t2, 0
- xvinsgr2vr.w x1, t3, 1
- xvinsgr2vr.w x2, t4, 1
- ld.d t1, X, 0 * SIZE
- ld.d t2, X, 1 * SIZE
- add.d X, X, INCX
- ld.d t3, X, 0 * SIZE
- ld.d t4, X, 1 * SIZE
- add.d X, X, INCX
- xvinsgr2vr.w x1, t1, 2
- xvinsgr2vr.w x2, t2, 2
- xvinsgr2vr.w x1, t3, 3
- xvinsgr2vr.w x2, t4, 3
-
- ld.d t1, X, 0 * SIZE
- ld.d t2, X, 1 * SIZE
- add.d X, X, INCX
- ld.d t3, X, 0 * SIZE
- ld.d t4, X, 1 * SIZE
- add.d X, X, INCX
- xvinsgr2vr.w x1, t1, 4
- xvinsgr2vr.w x2, t2, 4
- xvinsgr2vr.w x1, t3, 5
- xvinsgr2vr.w x2, t4, 5
- ld.d t1, X, 0 * SIZE
- ld.d t2, X, 1 * SIZE
- add.d X, X, INCX
- ld.d t3, X, 0 * SIZE
- ld.d t4, X, 1 * SIZE
- xvinsgr2vr.w x1, t1, 6
- xvinsgr2vr.w x2, t2, 6
- xvinsgr2vr.w x1, t3, 7
- xvinsgr2vr.w x2, t4, 7
- add.d X, X, INCX
- XVFMUL x3, VXAI, x2
- XVFMUL x4, VXAI, x1
- XVMSUB x3, VXAR, x1, x3
- XVFMADD x4, VXAR, x2, x4
- addi.d I, I, -1
- xvstelm.w x3, YY, 0 * SIZE, 0
- xvstelm.w x4, YY, 1 * SIZE, 0
- add.d YY, YY, INCY
- xvstelm.w x3, YY, 0 * SIZE, 1
- xvstelm.w x4, YY, 1 * SIZE, 1
- add.d YY, YY, INCY
- xvstelm.w x3, YY, 0 * SIZE, 2
- xvstelm.w x4, YY, 1 * SIZE, 2
- add.d YY, YY, INCY
- xvstelm.w x3, YY, 0 * SIZE, 3
- xvstelm.w x4, YY, 1 * SIZE, 3
- add.d YY, YY, INCY
- xvstelm.w x3, YY, 0 * SIZE, 4
- xvstelm.w x4, YY, 1 * SIZE, 4
- add.d YY, YY, INCY
- xvstelm.w x3, YY, 0 * SIZE, 5
- xvstelm.w x4, YY, 1 * SIZE, 5
- add.d YY, YY, INCY
- xvstelm.w x3, YY, 0 * SIZE, 6
- xvstelm.w x4, YY, 1 * SIZE, 6
- add.d YY, YY, INCY
- xvstelm.w x3, YY, 0 * SIZE, 7
- xvstelm.w x4, YY, 1 * SIZE, 7
- add.d YY, YY, INCY
- blt $r0, I, .L222
- move Y, YY
- b .L997
- .align 3
- #endif
-
- .L223: //!(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
- #ifdef DOUBLE
- ld.d t1, Y, 0 * SIZE
- ld.d t2, Y, 1 * SIZE
- add.d Y, Y, INCY
- ld.d t3, Y, 0 * SIZE
- ld.d t4, Y, 1 * SIZE
- add.d Y, Y, INCY
- xvinsgr2vr.d x1, t1, 0
- xvinsgr2vr.d x2, t2, 0
- xvinsgr2vr.d x1, t3, 1
- xvinsgr2vr.d x2, t4, 1
-
- ld.d t1, Y, 0 * SIZE
- ld.d t2, Y, 1 * SIZE
- add.d Y, Y, INCY
- ld.d t3, Y, 0 * SIZE
- ld.d t4, Y, 1 * SIZE
- xvinsgr2vr.d x1, t1, 2
- xvinsgr2vr.d x2, t2, 2
- xvinsgr2vr.d x1, t3, 3
- xvinsgr2vr.d x2, t4, 3
- add.d Y, Y, INCY
- xvfmul.d x3, VXBI, x2
- xvfmul.d x4, VXBI, x1
- xvfmsub.d x3, VXBR, x1, x3
- xvfmadd.d x4, VXBR, x2, x4
-
- addi.d I, I, -1
- xvstelm.d x3, YY, 0 * SIZE, 0
- xvstelm.d x4, YY, 1 * SIZE, 0
- add.d YY, YY, INCY
- xvstelm.d x3, YY, 0 * SIZE, 1
- xvstelm.d x4, YY, 1 * SIZE, 1
- add.d YY, YY, INCY
- xvstelm.d x3, YY, 0 * SIZE, 2
- xvstelm.d x4, YY, 1 * SIZE, 2
- add.d YY, YY, INCY
- xvstelm.d x3, YY, 0 * SIZE, 3
- xvstelm.d x4, YY, 1 * SIZE, 3
- add.d YY, YY, INCY
- blt $r0, I, .L223
- b .L997
- .align 3
- #else
- ld.d t1, Y, 0 * SIZE
- ld.d t2, Y, 1 * SIZE
- add.d Y, Y, INCY
- ld.d t3, Y, 0 * SIZE
- ld.d t4, Y, 1 * SIZE
- add.d Y, Y, INCY
- xvinsgr2vr.w x1, t1, 0
- xvinsgr2vr.w x2, t2, 0
- xvinsgr2vr.w x1, t3, 1
- xvinsgr2vr.w x2, t4, 1
- ld.d t1, Y, 0 * SIZE
- ld.d t2, Y, 1 * SIZE
- add.d Y, Y, INCY
- ld.d t3, Y, 0 * SIZE
- ld.d t4, Y, 1 * SIZE
- add.d Y, Y, INCY
- xvinsgr2vr.w x1, t1, 2
- xvinsgr2vr.w x2, t2, 2
- xvinsgr2vr.w x1, t3, 3
- xvinsgr2vr.w x2, t4, 3
-
- ld.d t1, Y, 0 * SIZE
- ld.d t2, Y, 1 * SIZE
- add.d Y, Y, INCY
- ld.d t3, Y, 0 * SIZE
- ld.d t4, Y, 1 * SIZE
- add.d Y, Y, INCY
- xvinsgr2vr.w x1, t1, 4
- xvinsgr2vr.w x2, t2, 4
- xvinsgr2vr.w x1, t3, 5
- xvinsgr2vr.w x2, t4, 5
- ld.d t1, Y, 0 * SIZE
- ld.d t2, Y, 1 * SIZE
- add.d Y, Y, INCY
- ld.d t3, Y, 0 * SIZE
- ld.d t4, Y, 1 * SIZE
- xvinsgr2vr.w x1, t1, 6
- xvinsgr2vr.w x2, t2, 6
- xvinsgr2vr.w x1, t3, 7
- xvinsgr2vr.w x2, t4, 7
- add.d Y, Y, INCY
-
- XVFMUL x3, VXBI, x2
- XVFMUL x4, VXBI, x1
- XVMSUB x3, VXBR, x1, x3
- XVFMADD x4, VXBR, x2, x4
- addi.d I, I, -1
- xvstelm.w x3, YY, 0 * SIZE, 0
- xvstelm.w x4, YY, 1 * SIZE, 0
- add.d YY, YY, INCY
- xvstelm.w x3, YY, 0 * SIZE, 1
- xvstelm.w x4, YY, 1 * SIZE, 1
- add.d YY, YY, INCY
- xvstelm.w x3, YY, 0 * SIZE, 2
- xvstelm.w x4, YY, 1 * SIZE, 2
- add.d YY, YY, INCY
- xvstelm.w x3, YY, 0 * SIZE, 3
- xvstelm.w x4, YY, 1 * SIZE, 3
- add.d YY, YY, INCY
- xvstelm.w x3, YY, 0 * SIZE, 4
- xvstelm.w x4, YY, 1 * SIZE, 4
- add.d YY, YY, INCY
- xvstelm.w x3, YY, 0 * SIZE, 5
- xvstelm.w x4, YY, 1 * SIZE, 5
- add.d YY, YY, INCY
- xvstelm.w x3, YY, 0 * SIZE, 6
- xvstelm.w x4, YY, 1 * SIZE, 6
- add.d YY, YY, INCY
- xvstelm.w x3, YY, 0 * SIZE, 7
- xvstelm.w x4, YY, 1 * SIZE, 7
- add.d YY, YY, INCY
- blt $r0, I, .L223
- b .L997
- .align 3
- #endif
-
- .L224: //!(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0)
- #ifdef DOUBLE
- ld.d t1, X, 0 * SIZE
- ld.d t2, X, 1 * SIZE
- add.d X, X, INCX
- ld.d t3, X, 0 * SIZE
- ld.d t4, X, 1 * SIZE
- add.d X, X, INCX
- xvinsgr2vr.d x1, t1, 0
- xvinsgr2vr.d x2, t2, 0
- xvinsgr2vr.d x1, t3, 1
- xvinsgr2vr.d x2, t4, 1
- ld.d t1, X, 0 * SIZE
- ld.d t2, X, 1 * SIZE
- add.d X, X, INCX
- ld.d t3, X, 0 * SIZE
- ld.d t4, X, 1 * SIZE
- add.d X, X, INCX
- xvinsgr2vr.d x1, t1, 2
- xvinsgr2vr.d x2, t2, 2
- xvinsgr2vr.d x1, t3, 3
- xvinsgr2vr.d x2, t4, 3
-
- ld.d t1, Y, 0 * SIZE
- ld.d t2, Y, 1 * SIZE
- add.d Y, Y, INCY
- ld.d t3, Y, 0 * SIZE
- ld.d t4, Y, 1 * SIZE
- add.d Y, Y, INCY
- xvinsgr2vr.d x3, t1, 0
- xvinsgr2vr.d x4, t2, 0
- xvinsgr2vr.d x3, t3, 1
- xvinsgr2vr.d x4, t4, 1
- ld.d t1, Y, 0 * SIZE
- ld.d t2, Y, 1 * SIZE
- add.d Y, Y, INCY
- ld.d t3, Y, 0 * SIZE
- ld.d t4, Y, 1 * SIZE
- xvinsgr2vr.d x3, t1, 2
- xvinsgr2vr.d x4, t2, 2
- xvinsgr2vr.d x3, t3, 3
- xvinsgr2vr.d x4, t4, 3
- add.d Y, Y, INCY
- xvfmul.d VX0, VXAI, x2
- xvfmul.d VX1, VXAI, x1
- xvfmul.d VX2, VXBI, x4
- xvfmul.d VX3, VXBI, x3
- xvfmsub.d VX0, VXAR, x1, VX0
- xvfmadd.d VX1, VXAR, x2, VX1
- xvfmsub.d VX2, VXBR, x3, VX2
- xvfmadd.d VX3, VXBR, x4, VX3
- xvfadd.d x3, VX0, VX2
- xvfadd.d x4, VX1, VX3
- addi.d I, I, -1
-
- xvstelm.d x3, YY, 0 * SIZE, 0
- xvstelm.d x4, YY, 1 * SIZE, 0
- add.d YY, YY, INCY
- xvstelm.d x3, YY, 0 * SIZE, 1
- xvstelm.d x4, YY, 1 * SIZE, 1
- add.d YY, YY, INCY
- xvstelm.d x3, YY, 0 * SIZE, 2
- xvstelm.d x4, YY, 1 * SIZE, 2
- add.d YY, YY, INCY
- xvstelm.d x3, YY, 0 * SIZE, 3
- xvstelm.d x4, YY, 1 * SIZE, 3
- add.d YY, YY, INCY
- blt $r0, I, .L224
- b .L997
- .align 3
- #else
- ld.d t1, X, 0 * SIZE
- ld.d t2, X, 1 * SIZE
- add.d X, X, INCX
- ld.d t3, X, 0 * SIZE
- ld.d t4, X, 1 * SIZE
- add.d X, X, INCX
- xvinsgr2vr.w x1, t1, 0
- xvinsgr2vr.w x2, t2, 0
- xvinsgr2vr.w x1, t3, 1
- xvinsgr2vr.w x2, t4, 1
- ld.d t1, X, 0 * SIZE
- ld.d t2, X, 1 * SIZE
- add.d X, X, INCX
- ld.d t3, X, 0 * SIZE
- ld.d t4, X, 1 * SIZE
- add.d X, X, INCX
- xvinsgr2vr.w x1, t1, 2
- xvinsgr2vr.w x2, t2, 2
- xvinsgr2vr.w x1, t3, 3
- xvinsgr2vr.w x2, t4, 3
- ld.d t1, X, 0 * SIZE
- ld.d t2, X, 1 * SIZE
- add.d X, X, INCX
- ld.d t3, X, 0 * SIZE
- ld.d t4, X, 1 * SIZE
- add.d X, X, INCX
- xvinsgr2vr.w x1, t1, 4
- xvinsgr2vr.w x2, t2, 4
- xvinsgr2vr.w x1, t3, 5
- xvinsgr2vr.w x2, t4, 5
- ld.d t1, X, 0 * SIZE
- ld.d t2, X, 1 * SIZE
- add.d X, X, INCX
- ld.d t3, X, 0 * SIZE
- ld.d t4, X, 1 * SIZE
- add.d X, X, INCX
- xvinsgr2vr.w x1, t1, 6
- xvinsgr2vr.w x2, t2, 6
- xvinsgr2vr.w x1, t3, 7
- xvinsgr2vr.w x2, t4, 7
-
- ld.d t1, Y, 0 * SIZE
- ld.d t2, Y, 1 * SIZE
- add.d Y, Y, INCY
- ld.d t3, Y, 0 * SIZE
- ld.d t4, Y, 1 * SIZE
- add.d Y, Y, INCY
- xvinsgr2vr.w x3, t1, 0
- xvinsgr2vr.w x4, t2, 0
- xvinsgr2vr.w x3, t3, 1
- xvinsgr2vr.w x4, t4, 1
- ld.d t1, Y, 0 * SIZE
- ld.d t2, Y, 1 * SIZE
- add.d Y, Y, INCY
- ld.d t3, Y, 0 * SIZE
- ld.d t4, Y, 1 * SIZE
- add.d Y, Y, INCY
- xvinsgr2vr.w x3, t1, 2
- xvinsgr2vr.w x4, t2, 2
- xvinsgr2vr.w x3, t3, 3
- xvinsgr2vr.w x4, t4, 3
- ld.d t1, Y, 0 * SIZE
- ld.d t2, Y, 1 * SIZE
- add.d Y, Y, INCY
- ld.d t3, Y, 0 * SIZE
- ld.d t4, Y, 1 * SIZE
- add.d Y, Y, INCY
- xvinsgr2vr.w x3, t1, 4
- xvinsgr2vr.w x4, t2, 4
- xvinsgr2vr.w x3, t3, 5
- xvinsgr2vr.w x4, t4, 5
- ld.d t1, Y, 0 * SIZE
- ld.d t2, Y, 1 * SIZE
- add.d Y, Y, INCY
- ld.d t3, Y, 0 * SIZE
- ld.d t4, Y, 1 * SIZE
- xvinsgr2vr.w x3, t1, 6
- xvinsgr2vr.w x4, t2, 6
- xvinsgr2vr.w x3, t3, 7
- xvinsgr2vr.w x4, t4, 7
- add.d Y, Y, INCY
-
- XVFMUL VX0, VXAI, x2
- XVFMUL VX1, VXAI, x1
- XVFMUL VX2, VXBI, x4
- XVFMUL VX3, VXBI, x3
- XVMSUB VX0, VXAR, x1, VX0
- XVFMADD VX1, VXAR, x2, VX1
- XVMSUB VX2, VXBR, x3, VX2
- XVFMADD VX3, VXBR, x4, VX3
- XVFADD x3, VX0, VX2
- XVFADD x4, VX1, VX3
- addi.d I, I, -1
-
- xvstelm.w x3, YY, 0 * SIZE, 0
- xvstelm.w x4, YY, 1 * SIZE, 0
- add.d YY, YY, INCY
- xvstelm.w x3, YY, 0 * SIZE, 1
- xvstelm.w x4, YY, 1 * SIZE, 1
- add.d YY, YY, INCY
- xvstelm.w x3, YY, 0 * SIZE, 2
- xvstelm.w x4, YY, 1 * SIZE, 2
- add.d YY, YY, INCY
- xvstelm.w x3, YY, 0 * SIZE, 3
- xvstelm.w x4, YY, 1 * SIZE, 3
- add.d YY, YY, INCY
- xvstelm.w x3, YY, 0 * SIZE, 4
- xvstelm.w x4, YY, 1 * SIZE, 4
- add.d YY, YY, INCY
- xvstelm.w x3, YY, 0 * SIZE, 5
- xvstelm.w x4, YY, 1 * SIZE, 5
- add.d YY, YY, INCY
- xvstelm.w x3, YY, 0 * SIZE, 6
- xvstelm.w x4, YY, 1 * SIZE, 6
- add.d YY, YY, INCY
- xvstelm.w x3, YY, 0 * SIZE, 7
- xvstelm.w x4, YY, 1 * SIZE, 7
- add.d YY, YY, INCY
- blt $r0, I, .L224
- b .L997
- .align 3
- #endif
-
- .L997: // handle the remaining N % 4 (double) / N % 8 (single) elements
- #ifdef DOUBLE
- andi I, N, 3
- #else
- andi I, N, 7
- #endif
- bge $r0, I, .L999
- .align 3
-
- .L998:
- LD a1, X, 0 * SIZE
- LD a2, X, 1 * SIZE
- LD a3, Y, 0 * SIZE
- LD a4, Y, 1 * SIZE
- addi.d I, I, -1
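- // s1 + i*s2 = alpha*x, s3 + i*s4 = beta*y; result y = (s3 + s1) + i*(s4 + s2)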
- MUL s1, ALPHAI, a2
- MUL s2, ALPHAI, a1
- MUL s3, BETAI, a4
- MUL s4, BETAI, a3
- MSUB s1, ALPHAR, a1, s1
- MADD s2, a2, ALPHAR, s2
- MSUB s3, BETAR, a3, s3
- MADD s4, a4, BETAR, s4
- ADD s3, s3, s1
- ADD s4, s4, s2
- ST s3, Y, 0 * SIZE
- ST s4, Y, 1 * SIZE
- add.d X, X, INCX
- add.d Y, Y, INCY
- blt $r0, I, .L998
- .align 3
-
- .L999:
- move $r4, $r12
- jirl $r0, $r1, 0x0
- .align 3
-
- EPILOGUE