- #define ASSEMBLER
-
- #include "common.h"
- #define N $r4
- #define ALPHAR $f0
- #define ALPHAI $f1
- #define X $r5
- #define INCX $r6
- #define BETAR $f2
- #define BETAI $f3
- #define Y $r7
- #define INCY $r8
-
- #define I $r12
- #define TEMP $r13
- #define t1 $r14
- #define t2 $r16
- #define t3 $r15
- #define t4 $r17
- #define XX $r18
- #define YY $r19
- #define a1 $f12
- #define a2 $f13
- #define a3 $f14
- #define a4 $f15
- #define s1 $f16
- #define s2 $f17
- #define s3 $f18
- #define s4 $f19
- #define VX0 $vr8
- #define VX1 $vr20
- #define VX2 $vr21
- #define VX3 $vr22
- #define VXAR $vr23
- #define VXAI $vr19
- #define VXBR $vr14
- #define VXBI $vr13
- #define VXZ $vr12
- #define x1 $vr18
- #define x2 $vr17
- #define x3 $vr16
- #define x4 $vr15
-
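- // Complex AXPBY kernel (LSX): computes y := alpha * x + beta * y for vectors of
- // single- or double-precision complex numbers stored as interleaved (real, imag)
- // pairs. The vector loops below handle four complex elements per iteration; the
- // remainder, and the incx == 0 || incy == 0 case, fall through to the scalar
- // loop at .L998.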
- PROLOGUE
-
- bge $r0, N, .L999
- movgr2fr.d a1, $r0
- #ifdef DOUBLE
- ffint.d.l a1, a1
- #else
- ffint.s.l a1, a1
- #endif
- slli.d INCX, INCX, ZBASE_SHIFT
- slli.d INCY, INCY, ZBASE_SHIFT
- #ifdef DOUBLE
- movfr2gr.d t1, ALPHAR
- vreplgr2vr.d VXAR, t1
- movfr2gr.d t2, ALPHAI
- vreplgr2vr.d VXAI, t2
- movfr2gr.d t3, BETAR
- vreplgr2vr.d VXBR, t3
- movfr2gr.d t4, BETAI
- vreplgr2vr.d VXBI, t4
- #else
- movfr2gr.s t1, ALPHAR
- vreplgr2vr.w VXAR, t1
- movfr2gr.s t2, ALPHAI
- vreplgr2vr.w VXAI, t2
- movfr2gr.s t3, BETAR
- vreplgr2vr.w VXBR, t3
- movfr2gr.s t4, BETAI
- vreplgr2vr.w VXBI, t4
- #endif
- vxor.v VXZ, VXZ, VXZ
- // If incx == 0 || incy == 0, process the elements one by one (scalar loop at .L998)
- and TEMP, INCX, INCY
- or I, N, N
- beqz TEMP, .L998
-
- li.d TEMP, 1
- slli.d TEMP, TEMP, ZBASE_SHIFT
- srai.d I, N, 2
- bne INCX, TEMP, .L20
- bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
- b .L11 // INCX==1 and INCY==1
- .L20:
- bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
- b .L21 // INCX!=1 and INCY==1
-
- .L11: // INCX==1 and INCY==1
- bge $r0, I, .L997
- #ifdef DOUBLE
- fcmp.ceq.d $fcc0, BETAR, a1
- fcmp.ceq.d $fcc1, BETAI, a1
- fcmp.ceq.d $fcc2, ALPHAR, a1
- fcmp.ceq.d $fcc3, ALPHAI, a1
- #else
- fcmp.ceq.s $fcc0, BETAR, a1
- fcmp.ceq.s $fcc1, BETAI, a1
- fcmp.ceq.s $fcc2, ALPHAR, a1
- fcmp.ceq.s $fcc3, ALPHAI, a1
- #endif
- bceqz $fcc0, .L13
- bceqz $fcc1, .L13
- b .L14
- .align 3
-
- .L13:
- bceqz $fcc2, .L114
- bceqz $fcc3, .L114 //!(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0)
- b .L113 //!(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
-
- .L14:
- bceqz $fcc2, .L112
- bceqz $fcc3, .L112 //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0)
- b .L111 //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
- .align 3
-
- .L111: //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
- #ifdef DOUBLE
- vst VXZ, Y, 0 * SIZE
- vst VXZ, Y, 2 * SIZE
- vst VXZ, Y, 4 * SIZE
- vst VXZ, Y, 6 * SIZE
- addi.d Y, Y, 8 * SIZE
- addi.d I, I, -1
- blt $r0, I, .L111
- b .L997
- .align 3
- #else
- vst VXZ, Y, 0 * SIZE
- vst VXZ, Y, 4 * SIZE
- addi.d Y, Y, 8 * SIZE
- addi.d I, I, -1
- blt $r0, I, .L111
- b .L997
- .align 3
- #endif
-
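- // Vectorized complex multiply used by the loops below: vpickev/vpickod split the
- // interleaved pairs into real parts (x1) and imaginary parts (x2), then
- //   x3 = alpha_r * x1 - alpha_i * x2   (real part of alpha * x)
- //   x4 = alpha_r * x2 + alpha_i * x1   (imaginary part of alpha * x)
- // and vilvl/vilvh re-interleave x3/x4 for the store. The beta * y paths apply
- // the same pattern with VXBR/VXBI.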
- .L112: //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0)
- #ifdef DOUBLE
- vld VX0, X, 0 * SIZE
- vld VX1, X, 2 * SIZE
- vpickev.d x1, VX1, VX0
- vpickod.d x2, VX1, VX0
- vfmul.d x3, VXAI, x2
- vfmul.d x4, VXAI, x1
- vfmsub.d x3, VXAR, x1, x3
- vfmadd.d x4, VXAR, x2, x4
- vilvl.d VX2, x4, x3
- vilvh.d VX3, x4, x3
- vst VX2, Y, 0 * SIZE
- vst VX3, Y, 2 * SIZE
-
- vld VX0, X, 4 * SIZE
- vld VX1, X, 6 * SIZE
- vpickev.d x1, VX1, VX0
- vpickod.d x2, VX1, VX0
- vfmul.d x3, VXAI, x2
- vfmul.d x4, VXAI, x1
- vfmsub.d x3, VXAR, x1, x3
- vfmadd.d x4, VXAR, x2, x4
- vilvl.d VX2, x4, x3
- vilvh.d VX3, x4, x3
- vst VX2, Y, 4 * SIZE
- vst VX3, Y, 6 * SIZE
- addi.d X, X, 8 * SIZE
- addi.d Y, Y, 8 * SIZE
- addi.d I, I, -1
- blt $r0, I, .L112
- b .L997
- .align 3
- #else
- vld VX0, X, 0 * SIZE
- vld VX1, X, 4 * SIZE
- vpickev.w x1, VX1, VX0
- vpickod.w x2, VX1, VX0
- vfmul.s x3, VXAI, x2
- vfmul.s x4, VXAI, x1
- vfmsub.s x3, VXAR, x1, x3
- vfmadd.s x4, VXAR, x2, x4
- vilvl.w VX2, x4, x3
- vilvh.w VX3, x4, x3
- vst VX2, Y, 0 * SIZE
- vst VX3, Y, 4 * SIZE
- addi.d X, X, 8 * SIZE
- addi.d Y, Y, 8 * SIZE
- addi.d I, I, -1
- blt $r0, I, .L112
- b .L997
- .align 3
- #endif
-
- .L113: //!(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
- #ifdef DOUBLE
- vld VX0, Y, 0 * SIZE
- vld VX1, Y, 2 * SIZE
- vpickev.d x1, VX1, VX0
- vpickod.d x2, VX1, VX0
- vfmul.d x3, VXBI, x2
- vfmul.d x4, VXBI, x1
- vfmsub.d x3, VXBR, x1, x3
- vfmadd.d x4, VXBR, x2, x4
- vilvl.d VX2, x4, x3
- vilvh.d VX3, x4, x3
- vst VX2, Y, 0 * SIZE
- vst VX3, Y, 2 * SIZE
- vld VX0, Y, 4 * SIZE
- vld VX1, Y, 6 * SIZE
- vpickev.d x1, VX1, VX0
- vpickod.d x2, VX1, VX0
- vfmul.d x3, VXBI, x2
- vfmul.d x4, VXBI, x1
- vfmsub.d x3, VXBR, x1, x3
- vfmadd.d x4, VXBR, x2, x4
- vilvl.d VX2, x4, x3
- vilvh.d VX3, x4, x3
- vst VX2, Y, 4 * SIZE
- vst VX3, Y, 6 * SIZE
- addi.d Y, Y, 8 * SIZE
- addi.d I, I, -1
- blt $r0, I, .L113
- b .L997
- .align 3
- #else
- vld VX0, Y, 0 * SIZE
- vld VX1, Y, 4 * SIZE
- vpickev.w x1, VX1, VX0
- vpickod.w x2, VX1, VX0
- vfmul.s x3, VXBI, x2
- vfmul.s x4, VXBI, x1
- vfmsub.s x3, VXBR, x1, x3
- vfmadd.s x4, VXBR, x2, x4
- vilvl.w VX2, x4, x3
- vilvh.w VX3, x4, x3
- vst VX2, Y, 0 * SIZE
- vst VX3, Y, 4 * SIZE
- addi.d Y, Y, 8 * SIZE
- addi.d I, I, -1
- blt $r0, I, .L113
- b .L997
- .align 3
- #endif
-
- .L114:
- #ifdef DOUBLE
- vld VX0, X, 0 * SIZE
- vld VX1, X, 2 * SIZE
- vld VX2, Y, 0 * SIZE
- vld VX3, Y, 2 * SIZE
- vpickev.d x1, VX1, VX0
- vpickod.d x2, VX1, VX0
- vpickev.d x3, VX3, VX2
- vpickod.d x4, VX3, VX2
- vfmul.d VX0, VXAI, x2
- vfmul.d VX1, VXAI, x1
- vfmul.d VX2, VXBI, x4
- vfmul.d VX3, VXBI, x3
- vfmsub.d VX0, VXAR, x1, VX0
- vfmadd.d VX1, VXAR, x2, VX1
- vfmsub.d VX2, VXBR, x3, VX2
- vfmadd.d VX3, VXBR, x4, VX3
- vfadd.d x3, VX0, VX2
- vfadd.d x4, VX1, VX3
- vilvl.d VX2, x4, x3
- vilvh.d VX3, x4, x3
- vst VX2, Y, 0 * SIZE
- vst VX3, Y, 2 * SIZE
-
- vld VX0, X, 4 * SIZE
- vld VX1, X, 6 * SIZE
- vld VX2, Y, 4 * SIZE
- vld VX3, Y, 6 * SIZE
- vpickev.d x1, VX1, VX0
- vpickod.d x2, VX1, VX0
- vpickev.d x3, VX3, VX2
- vpickod.d x4, VX3, VX2
- vfmul.d VX0, VXAI, x2
- vfmul.d VX1, VXAI, x1
- vfmul.d VX2, VXBI, x4
- vfmul.d VX3, VXBI, x3
- vfmsub.d VX0, VXAR, x1, VX0
- vfmadd.d VX1, VXAR, x2, VX1
- vfmsub.d VX2, VXBR, x3, VX2
- vfmadd.d VX3, VXBR, x4, VX3
- vfadd.d x3, VX0, VX2
- vfadd.d x4, VX1, VX3
- vilvl.d VX2, x4, x3
- vilvh.d VX3, x4, x3
- vst VX2, Y, 4 * SIZE
- vst VX3, Y, 6 * SIZE
- addi.d X, X, 8 * SIZE
- addi.d Y, Y, 8 * SIZE
- addi.d I, I, -1
- blt $r0, I, .L114
- b .L997
- .align 3
- #else
- vld VX0, X, 0 * SIZE
- vld VX1, X, 4 * SIZE
- vld VX2, Y, 0 * SIZE
- vld VX3, Y, 4 * SIZE
- vpickev.w x1, VX1, VX0
- vpickod.w x2, VX1, VX0
- vpickev.w x3, VX3, VX2
- vpickod.w x4, VX3, VX2
- vfmul.s VX0, VXAI, x2
- vfmul.s VX1, VXAI, x1
- vfmul.s VX2, VXBI, x4
- vfmul.s VX3, VXBI, x3
- vfmsub.s VX0, VXAR, x1, VX0
- vfmadd.s VX1, VXAR, x2, VX1
- vfmsub.s VX2, VXBR, x3, VX2
- vfmadd.s VX3, VXBR, x4, VX3
- vfadd.s x3, VX0, VX2
- vfadd.s x4, VX1, VX3
- vilvl.w VX2, x4, x3
- vilvh.w VX3, x4, x3
- vst VX2, Y, 0 * SIZE
- vst VX3, Y, 4 * SIZE
- addi.d X, X, 8 * SIZE
- addi.d Y, Y, 8 * SIZE
- addi.d I, I, -1
- blt $r0, I, .L114
- b .L997
- .align 3
- #endif
-
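- // Strided variants: when a stride is not 1, complex elements are gathered one
- // lane at a time with ld.* / vinsgr2vr.* and scattered back with vstelm.*; YY
- // tracks the store pointer into Y so the loads can advance Y independently.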
- .L12: // INCX==1 and INCY!=1
- bge $r0, I, .L997
- move YY, Y
- .align 3
-
- .L121:
- #ifdef DOUBLE
- vld VX0, X, 0 * SIZE
- vld VX1, X, 2 * SIZE
- ld.d t1, Y, 0 * SIZE
- ld.d t2, Y, 1 * SIZE
- add.d Y, Y, INCY
- ld.d t3, Y, 0 * SIZE
- ld.d t4, Y, 1 * SIZE
- vinsgr2vr.d x3, t1, 0
- vinsgr2vr.d x4, t2, 0
- vinsgr2vr.d x3, t3, 1
- vinsgr2vr.d x4, t4, 1
- add.d Y, Y, INCY
- vpickev.d x1, VX1, VX0
- vpickod.d x2, VX1, VX0
- vfmul.d VX0, VXAI, x2
- vfmul.d VX1, VXAI, x1
- vfmul.d VX2, VXBI, x4
- vfmul.d VX3, VXBI, x3
- vfmsub.d VX0, VXAR, x1, VX0
- vfmadd.d VX1, VXAR, x2, VX1
- vfmsub.d VX2, VXBR, x3, VX2
- vfmadd.d VX3, VXBR, x4, VX3
- vfadd.d x3, VX0, VX2
- vfadd.d x4, VX1, VX3
- vstelm.d x3, YY, 0 * SIZE, 0
- vstelm.d x4, YY, 1 * SIZE, 0
- add.d YY, YY, INCY
- vstelm.d x3, YY, 0 * SIZE, 1
- vstelm.d x4, YY, 1 * SIZE, 1
- add.d YY, YY, INCY
-
- vld VX0, X, 4 * SIZE
- vld VX1, X, 6 * SIZE
- ld.d t1, Y, 0 * SIZE
- ld.d t2, Y, 1 * SIZE
- add.d Y, Y, INCY
- ld.d t3, Y, 0 * SIZE
- ld.d t4, Y, 1 * SIZE
- vinsgr2vr.d x3, t1, 0
- vinsgr2vr.d x4, t2, 0
- vinsgr2vr.d x3, t3, 1
- vinsgr2vr.d x4, t4, 1
- add.d Y, Y, INCY
- vpickev.d x1, VX1, VX0
- vpickod.d x2, VX1, VX0
- vfmul.d VX0, VXAI, x2
- vfmul.d VX1, VXAI, x1
- vfmul.d VX2, VXBI, x4
- vfmul.d VX3, VXBI, x3
- vfmsub.d VX0, VXAR, x1, VX0
- vfmadd.d VX1, VXAR, x2, VX1
- vfmsub.d VX2, VXBR, x3, VX2
- vfmadd.d VX3, VXBR, x4, VX3
- vfadd.d x3, VX0, VX2
- vfadd.d x4, VX1, VX3
- addi.d I, I, -1
- vstelm.d x3, YY, 0 * SIZE, 0
- vstelm.d x4, YY, 1 * SIZE, 0
- add.d YY, YY, INCY
- vstelm.d x3, YY, 0 * SIZE, 1
- vstelm.d x4, YY, 1 * SIZE, 1
- add.d YY, YY, INCY
- addi.d X, X, 8 * SIZE
- blt $r0, I, .L121
- b .L997
- .align 3
- #else
- vld VX0, X, 0 * SIZE
- ld.w t1, Y, 0 * SIZE
- ld.w t2, Y, 1 * SIZE
- add.d Y, Y, INCY
- ld.w t3, Y, 0 * SIZE
- ld.w t4, Y, 1 * SIZE
- add.d Y, Y, INCY
- vinsgr2vr.w x3, t1, 0
- vinsgr2vr.w x4, t2, 0
- vinsgr2vr.w x3, t3, 1
- vinsgr2vr.w x4, t4, 1
-
- vld VX1, X, 4 * SIZE
- ld.w t1, Y, 0 * SIZE
- ld.w t2, Y, 1 * SIZE
- add.d Y, Y, INCY
- ld.w t3, Y, 0 * SIZE
- ld.w t4, Y, 1 * SIZE
- vinsgr2vr.w x3, t1, 2
- vinsgr2vr.w x4, t2, 2
- vinsgr2vr.w x3, t3, 3
- vinsgr2vr.w x4, t4, 3
- add.d Y, Y, INCY
-
- vpickev.w x1, VX1, VX0
- vpickod.w x2, VX1, VX0
- vfmul.s VX0, VXAI, x2
- vfmul.s VX1, VXAI, x1
- vfmul.s VX2, VXBI, x4
- vfmul.s VX3, VXBI, x3
- vfmsub.s VX0, VXAR, x1, VX0
- vfmadd.s VX1, VXAR, x2, VX1
- vfmsub.s VX2, VXBR, x3, VX2
- vfmadd.s VX3, VXBR, x4, VX3
- vfadd.s x3, VX0, VX2
- vfadd.s x4, VX1, VX3
- addi.d I, I, -1
- vstelm.w x3, YY, 0 * SIZE, 0
- vstelm.w x4, YY, 1 * SIZE, 0
- add.d YY, YY, INCY
- vstelm.w x3, YY, 0 * SIZE, 1
- vstelm.w x4, YY, 1 * SIZE, 1
- add.d YY, YY, INCY
- vstelm.w x3, YY, 0 * SIZE, 2
- vstelm.w x4, YY, 1 * SIZE, 2
- add.d YY, YY, INCY
- vstelm.w x3, YY, 0 * SIZE, 3
- vstelm.w x4, YY, 1 * SIZE, 3
- add.d YY, YY, INCY
- addi.d X, X, 8 * SIZE
- blt $r0, I, .L121
- b .L997
- .align 3
- #endif
-
- .L21: // INCX!=1 and INCY==1
- bge $r0, I, .L997
- .align 3
-
- .L211:
- #ifdef DOUBLE
- vld VX2, Y, 0 * SIZE
- vld VX3, Y, 2 * SIZE
- ld.d t1, X, 0 * SIZE
- ld.d t2, X, 1 * SIZE
- add.d X, X, INCX
- ld.d t3, X, 0 * SIZE
- ld.d t4, X, 1 * SIZE
- vinsgr2vr.d x1, t1, 0
- vinsgr2vr.d x2, t2, 0
- vinsgr2vr.d x1, t3, 1
- vinsgr2vr.d x2, t4, 1
- add.d X, X, INCX
- vpickev.d x3, VX3, VX2
- vpickod.d x4, VX3, VX2
- vfmul.d VX0, VXAI, x2
- vfmul.d VX1, VXAI, x1
- vfmul.d VX2, VXBI, x4
- vfmul.d VX3, VXBI, x3
- vfmsub.d VX0, VXAR, x1, VX0
- vfmadd.d VX1, VXAR, x2, VX1
- vfmsub.d VX2, VXBR, x3, VX2
- vfmadd.d VX3, VXBR, x4, VX3
- vfadd.d x3, VX0, VX2
- vfadd.d x4, VX1, VX3
- vilvl.d VX2, x4, x3
- vilvh.d VX3, x4, x3
- vst VX2, Y, 0 * SIZE
- vst VX3, Y, 2 * SIZE
-
- vld VX2, Y, 4 * SIZE
- vld VX3, Y, 6 * SIZE
- ld.d t1, X, 0 * SIZE
- ld.d t2, X, 1 * SIZE
- add.d X, X, INCX
- ld.d t3, X, 0 * SIZE
- ld.d t4, X, 1 * SIZE
- vinsgr2vr.d x1, t1, 0
- vinsgr2vr.d x2, t2, 0
- vinsgr2vr.d x1, t3, 1
- vinsgr2vr.d x2, t4, 1
- add.d X, X, INCX
- vpickev.d x3, VX3, VX2
- vpickod.d x4, VX3, VX2
- vfmul.d VX0, VXAI, x2
- vfmul.d VX1, VXAI, x1
- vfmul.d VX2, VXBI, x4
- vfmul.d VX3, VXBI, x3
- vfmsub.d VX0, VXAR, x1, VX0
- vfmadd.d VX1, VXAR, x2, VX1
- vfmsub.d VX2, VXBR, x3, VX2
- vfmadd.d VX3, VXBR, x4, VX3
- vfadd.d x3, VX0, VX2
- vfadd.d x4, VX1, VX3
- vilvl.d VX2, x4, x3
- vilvh.d VX3, x4, x3
- addi.d I, I, -1
- vst VX2, Y, 4 * SIZE
- vst VX3, Y, 6 * SIZE
- addi.d Y, Y, 8 * SIZE
- blt $r0, I, .L211
- b .L997
- .align 3
- #else
- vld VX2, Y, 0 * SIZE
- ld.w t1, X, 0 * SIZE
- ld.w t2, X, 1 * SIZE
- add.d X, X, INCX
- ld.w t3, X, 0 * SIZE
- ld.w t4, X, 1 * SIZE
- add.d X, X, INCX
- vinsgr2vr.w x1, t1, 0
- vinsgr2vr.w x2, t2, 0
- vinsgr2vr.w x1, t3, 1
- vinsgr2vr.w x2, t4, 1
- vld VX3, Y, 4 * SIZE
- ld.w t1, X, 0 * SIZE
- ld.w t2, X, 1 * SIZE
- add.d X, X, INCX
- ld.w t3, X, 0 * SIZE
- ld.w t4, X, 1 * SIZE
- vinsgr2vr.w x1, t1, 2
- vinsgr2vr.w x2, t2, 2
- vinsgr2vr.w x1, t3, 3
- vinsgr2vr.w x2, t4, 3
- add.d X, X, INCX
-
- vpickev.w x3, VX3, VX2
- vpickod.w x4, VX3, VX2
- vfmul.s VX0, VXAI, x2
- vfmul.s VX1, VXAI, x1
- vfmul.s VX2, VXBI, x4
- vfmul.s VX3, VXBI, x3
- vfmsub.s VX0, VXAR, x1, VX0
- vfmadd.s VX1, VXAR, x2, VX1
- vfmsub.s VX2, VXBR, x3, VX2
- vfmadd.s VX3, VXBR, x4, VX3
- vfadd.s x3, VX0, VX2
- vfadd.s x4, VX1, VX3
- vilvl.w VX2, x4, x3
- vilvh.w VX3, x4, x3
- addi.d I, I, -1
- vst VX2, Y, 0 * SIZE
- vst VX3, Y, 4 * SIZE
- addi.d Y, Y, 8 * SIZE
- blt $r0, I, .L211
- b .L997
- .align 3
- #endif
-
- .L22: // INCX!=1 and INCY!=1
- bge $r0, I, .L997
- move YY, Y
- #ifdef DOUBLE
- fcmp.ceq.d $fcc0, BETAR, a1
- fcmp.ceq.d $fcc1, BETAI, a1
- fcmp.ceq.d $fcc2, ALPHAR, a1
- fcmp.ceq.d $fcc3, ALPHAI, a1
- #else
- fcmp.ceq.s $fcc0, BETAR, a1
- fcmp.ceq.s $fcc1, BETAI, a1
- fcmp.ceq.s $fcc2, ALPHAR, a1
- fcmp.ceq.s $fcc3, ALPHAI, a1
- #endif
- bceqz $fcc0, .L23
- bceqz $fcc1, .L23
- b .L24
- .align 3
-
- .L23:
- bceqz $fcc2, .L224
- bceqz $fcc3, .L224 //!(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0)
- b .L223 //!(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
- .align 3
-
- .L24:
- bceqz $fcc2, .L222
- bceqz $fcc3, .L222 //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0)
- b .L221 //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
- .align 3
-
- .L221: //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
- #ifdef DOUBLE
- vstelm.d VXZ, Y, 0 * SIZE, 0 // zero both the real and imaginary parts
- vstelm.d VXZ, Y, 1 * SIZE, 0
- add.d Y, Y, INCY
- vstelm.d VXZ, Y, 0 * SIZE, 0
- vstelm.d VXZ, Y, 1 * SIZE, 0
- add.d Y, Y, INCY
- vstelm.d VXZ, Y, 0 * SIZE, 0
- vstelm.d VXZ, Y, 1 * SIZE, 0
- add.d Y, Y, INCY
- vstelm.d VXZ, Y, 0 * SIZE, 0
- vstelm.d VXZ, Y, 1 * SIZE, 0
- add.d Y, Y, INCY
- addi.d I, I, -1
- blt $r0, I, .L221
- b .L997
- .align 3
- #else
- vstelm.w VXZ, Y, 0 * SIZE, 0 // zero both the real and imaginary parts
- vstelm.w VXZ, Y, 1 * SIZE, 0
- add.d Y, Y, INCY
- vstelm.w VXZ, Y, 0 * SIZE, 0
- vstelm.w VXZ, Y, 1 * SIZE, 0
- add.d Y, Y, INCY
- vstelm.w VXZ, Y, 0 * SIZE, 0
- vstelm.w VXZ, Y, 1 * SIZE, 0
- add.d Y, Y, INCY
- vstelm.w VXZ, Y, 0 * SIZE, 0
- vstelm.w VXZ, Y, 1 * SIZE, 0
- add.d Y, Y, INCY
- addi.d I, I, -1
- blt $r0, I, .L221
- b .L997
- .align 3
- #endif
-
- .L222: //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0)
- #ifdef DOUBLE
- ld.d t1, X, 0 * SIZE
- ld.d t2, X, 1 * SIZE
- add.d X, X, INCX
- ld.d t3, X, 0 * SIZE
- ld.d t4, X, 1 * SIZE
- add.d X, X, INCX
- vinsgr2vr.d x1, t1, 0
- vinsgr2vr.d x2, t2, 0
- vinsgr2vr.d x1, t3, 1
- vinsgr2vr.d x2, t4, 1
- vfmul.d x3, VXAI, x2
- vfmul.d x4, VXAI, x1
- vfmsub.d x3, VXAR, x1, x3
- vfmadd.d x4, VXAR, x2, x4
- vstelm.d x3, YY, 0 * SIZE, 0
- vstelm.d x4, YY, 1 * SIZE, 0
- add.d YY, YY, INCY
- vstelm.d x3, YY, 0 * SIZE, 1
- vstelm.d x4, YY, 1 * SIZE, 1
- add.d YY, YY, INCY
-
- ld.d t1, X, 0 * SIZE
- ld.d t2, X, 1 * SIZE
- add.d X, X, INCX
- ld.d t3, X, 0 * SIZE
- ld.d t4, X, 1 * SIZE
- vinsgr2vr.d x1, t1, 0
- vinsgr2vr.d x2, t2, 0
- vinsgr2vr.d x1, t3, 1
- vinsgr2vr.d x2, t4, 1
- add.d X, X, INCX
- vfmul.d x3, VXAI, x2
- vfmul.d x4, VXAI, x1
- vfmsub.d x3, VXAR, x1, x3
- vfmadd.d x4, VXAR, x2, x4
- addi.d I, I, -1
- vstelm.d x3, YY, 0 * SIZE, 0
- vstelm.d x4, YY, 1 * SIZE, 0
- add.d YY, YY, INCY
- vstelm.d x3, YY, 0 * SIZE, 1
- vstelm.d x4, YY, 1 * SIZE, 1
- add.d YY, YY, INCY
- blt $r0, I, .L222
- b .L997
- .align 3
- #else
- ld.w t1, X, 0 * SIZE
- ld.w t2, X, 1 * SIZE
- add.d X, X, INCX
- ld.w t3, X, 0 * SIZE
- ld.w t4, X, 1 * SIZE
- add.d X, X, INCX
- vinsgr2vr.w x1, t1, 0
- vinsgr2vr.w x2, t2, 0
- vinsgr2vr.w x1, t3, 1
- vinsgr2vr.w x2, t4, 1
-
- ld.w t1, X, 0 * SIZE
- ld.w t2, X, 1 * SIZE
- add.d X, X, INCX
- ld.w t3, X, 0 * SIZE
- ld.w t4, X, 1 * SIZE
- vinsgr2vr.w x1, t1, 2
- vinsgr2vr.w x2, t2, 2
- vinsgr2vr.w x1, t3, 3
- vinsgr2vr.w x2, t4, 3
- add.d X, X, INCX
- vfmul.s x3, VXAI, x2
- vfmul.s x4, VXAI, x1
- vfmsub.s x3, VXAR, x1, x3
- vfmadd.s x4, VXAR, x2, x4
- addi.d I, I, -1
- vstelm.w x3, YY, 0 * SIZE, 0
- vstelm.w x4, YY, 1 * SIZE, 0
- add.d YY, YY, INCY
- vstelm.w x3, YY, 0 * SIZE, 1
- vstelm.w x4, YY, 1 * SIZE, 1
- add.d YY, YY, INCY
- vstelm.w x3, YY, 0 * SIZE, 2
- vstelm.w x4, YY, 1 * SIZE, 2
- add.d YY, YY, INCY
- vstelm.w x3, YY, 0 * SIZE, 3
- vstelm.w x4, YY, 1 * SIZE, 3
- add.d YY, YY, INCY
- blt $r0, I, .L222
- b .L997
- .align 3
- #endif
-
- .L223:
- #ifdef DOUBLE
- ld.d t1, Y, 0 * SIZE
- ld.d t2, Y, 1 * SIZE
- add.d Y, Y, INCY
- ld.d t3, Y, 0 * SIZE
- ld.d t4, Y, 1 * SIZE
- vinsgr2vr.d x1, t1, 0
- vinsgr2vr.d x2, t2, 0
- vinsgr2vr.d x1, t3, 1
- vinsgr2vr.d x2, t4, 1
- add.d Y, Y, INCY
- vfmul.d x3, VXBI, x2
- vfmul.d x4, VXBI, x1
- vfmsub.d x3, VXBR, x1, x3
- vfmadd.d x4, VXBR, x2, x4
- vstelm.d x3, YY, 0 * SIZE, 0
- vstelm.d x4, YY, 1 * SIZE, 0
- add.d YY, YY, INCY
- vstelm.d x3, YY, 0 * SIZE, 1
- vstelm.d x4, YY, 1 * SIZE, 1
- add.d YY, YY, INCY
- ld.d t1, Y, 0 * SIZE
- ld.d t2, Y, 1 * SIZE
- add.d Y, Y, INCY
- ld.d t3, Y, 0 * SIZE
- ld.d t4, Y, 1 * SIZE
- vinsgr2vr.d x1, t1, 0
- vinsgr2vr.d x2, t2, 0
- vinsgr2vr.d x1, t3, 1
- vinsgr2vr.d x2, t4, 1
- add.d Y, Y, INCY
- vfmul.d x3, VXBI, x2
- vfmul.d x4, VXBI, x1
- vfmsub.d x3, VXBR, x1, x3
- vfmadd.d x4, VXBR, x2, x4
- addi.d I, I, -1
- vstelm.d x3, YY, 0 * SIZE, 0
- vstelm.d x4, YY, 1 * SIZE, 0
- add.d YY, YY, INCY
- vstelm.d x3, YY, 0 * SIZE, 1
- vstelm.d x4, YY, 1 * SIZE, 1
- add.d YY, YY, INCY
- blt $r0, I, .L223
- b .L997
- .align 3
- #else
- ld.w t1, Y, 0 * SIZE
- ld.w t2, Y, 1 * SIZE
- add.d Y, Y, INCY
- ld.w t3, Y, 0 * SIZE
- ld.w t4, Y, 1 * SIZE
- add.d Y, Y, INCY
- vinsgr2vr.w x1, t1, 0
- vinsgr2vr.w x2, t2, 0
- vinsgr2vr.w x1, t3, 1
- vinsgr2vr.w x2, t4, 1
-
- ld.w t1, Y, 0 * SIZE
- ld.w t2, Y, 1 * SIZE
- add.d Y, Y, INCY
- ld.w t3, Y, 0 * SIZE
- ld.w t4, Y, 1 * SIZE
- vinsgr2vr.w x1, t1, 2
- vinsgr2vr.w x2, t2, 2
- vinsgr2vr.w x1, t3, 3
- vinsgr2vr.w x2, t4, 3
- add.d Y, Y, INCY
- vfmul.s x3, VXBI, x2
- vfmul.s x4, VXBI, x1
- vfmsub.s x3, VXBR, x1, x3
- vfmadd.s x4, VXBR, x2, x4
-
- addi.d I, I, -1
- vstelm.w x3, YY, 0 * SIZE, 0
- vstelm.w x4, YY, 1 * SIZE, 0
- add.d YY, YY, INCY
- vstelm.w x3, YY, 0 * SIZE, 1
- vstelm.w x4, YY, 1 * SIZE, 1
- add.d YY, YY, INCY
- vstelm.w x3, YY, 0 * SIZE, 2
- vstelm.w x4, YY, 1 * SIZE, 2
- add.d YY, YY, INCY
- vstelm.w x3, YY, 0 * SIZE, 3
- vstelm.w x4, YY, 1 * SIZE, 3
- add.d YY, YY, INCY
- blt $r0, I, .L223
- b .L997
- .align 3
- #endif
-
- .L224:
- #ifdef DOUBLE
- ld.d t1, X, 0 * SIZE
- ld.d t2, X, 1 * SIZE
- add.d X, X, INCX
- ld.d t3, X, 0 * SIZE
- ld.d t4, X, 1 * SIZE
- add.d X, X, INCX
- vinsgr2vr.d x1, t1, 0
- vinsgr2vr.d x2, t2, 0
- vinsgr2vr.d x1, t3, 1
- vinsgr2vr.d x2, t4, 1
- ld.d t1, Y, 0 * SIZE
- ld.d t2, Y, 1 * SIZE
- add.d Y, Y, INCY
- ld.d t3, Y, 0 * SIZE
- ld.d t4, Y, 1 * SIZE
- vinsgr2vr.d x3, t1, 0
- vinsgr2vr.d x4, t2, 0
- vinsgr2vr.d x3, t3, 1
- vinsgr2vr.d x4, t4, 1
- add.d Y, Y, INCY
- vfmul.d VX0, VXAI, x2
- vfmul.d VX1, VXAI, x1
- vfmul.d VX2, VXBI, x4
- vfmul.d VX3, VXBI, x3
- vfmsub.d VX0, VXAR, x1, VX0
- vfmadd.d VX1, VXAR, x2, VX1
- vfmsub.d VX2, VXBR, x3, VX2
- vfmadd.d VX3, VXBR, x4, VX3
- vfadd.d x3, VX0, VX2
- vfadd.d x4, VX1, VX3
- vstelm.d x3, YY, 0 * SIZE, 0
- vstelm.d x4, YY, 1 * SIZE, 0
- add.d YY, YY, INCY
- vstelm.d x3, YY, 0 * SIZE, 1
- vstelm.d x4, YY, 1 * SIZE, 1
- add.d YY, YY, INCY
-
- ld.d t1, X, 0 * SIZE
- ld.d t2, X, 1 * SIZE
- add.d X, X, INCX
- ld.d t3, X, 0 * SIZE
- ld.d t4, X, 1 * SIZE
- add.d X, X, INCX
- vinsgr2vr.d x1, t1, 0
- vinsgr2vr.d x2, t2, 0
- vinsgr2vr.d x1, t3, 1
- vinsgr2vr.d x2, t4, 1
- ld.d t1, Y, 0 * SIZE
- ld.d t2, Y, 1 * SIZE
- add.d Y, Y, INCY
- ld.d t3, Y, 0 * SIZE
- ld.d t4, Y, 1 * SIZE
- vinsgr2vr.d x3, t1, 0
- vinsgr2vr.d x4, t2, 0
- vinsgr2vr.d x3, t3, 1
- vinsgr2vr.d x4, t4, 1
- add.d Y, Y, INCY
- vfmul.d VX0, VXAI, x2
- vfmul.d VX1, VXAI, x1
- vfmul.d VX2, VXBI, x4
- vfmul.d VX3, VXBI, x3
- vfmsub.d VX0, VXAR, x1, VX0
- vfmadd.d VX1, VXAR, x2, VX1
- vfmsub.d VX2, VXBR, x3, VX2
- vfmadd.d VX3, VXBR, x4, VX3
- vfadd.d x3, VX0, VX2
- vfadd.d x4, VX1, VX3
- vstelm.d x3, YY, 0 * SIZE, 0
- vstelm.d x4, YY, 1 * SIZE, 0
- add.d YY, YY, INCY
- vstelm.d x3, YY, 0 * SIZE, 1
- vstelm.d x4, YY, 1 * SIZE, 1
- add.d YY, YY, INCY
- addi.d I, I, -1
- blt $r0, I, .L224
- b .L997
- .align 3
- #else
- ld.w t1, X, 0 * SIZE
- ld.w t2, X, 1 * SIZE
- add.d X, X, INCX
- ld.w t3, X, 0 * SIZE
- ld.w t4, X, 1 * SIZE
- add.d X, X, INCX
- vinsgr2vr.w x1, t1, 0
- vinsgr2vr.w x2, t2, 0
- vinsgr2vr.w x1, t3, 1
- vinsgr2vr.w x2, t4, 1
- ld.w t1, X, 0 * SIZE
- ld.w t2, X, 1 * SIZE
- add.d X, X, INCX
- ld.w t3, X, 0 * SIZE
- ld.w t4, X, 1 * SIZE
- add.d X, X, INCX
- vinsgr2vr.w x1, t1, 2
- vinsgr2vr.w x2, t2, 2
- vinsgr2vr.w x1, t3, 3
- vinsgr2vr.w x2, t4, 3
-
- ld.w t1, Y, 0 * SIZE
- ld.w t2, Y, 1 * SIZE
- add.d Y, Y, INCY
- ld.w t3, Y, 0 * SIZE
- ld.w t4, Y, 1 * SIZE
- add.d Y, Y, INCY
- vinsgr2vr.w x3, t1, 0
- vinsgr2vr.w x4, t2, 0
- vinsgr2vr.w x3, t3, 1
- vinsgr2vr.w x4, t4, 1
- ld.w t1, Y, 0 * SIZE
- ld.w t2, Y, 1 * SIZE
- add.d Y, Y, INCY
- ld.w t3, Y, 0 * SIZE
- ld.w t4, Y, 1 * SIZE
- vinsgr2vr.w x3, t1, 2
- vinsgr2vr.w x4, t2, 2
- vinsgr2vr.w x3, t3, 3
- vinsgr2vr.w x4, t4, 3
- add.d Y, Y, INCY
- vfmul.s VX0, VXAI, x2
- vfmul.s VX1, VXAI, x1
- vfmul.s VX2, VXBI, x4
- vfmul.s VX3, VXBI, x3
- vfmsub.s VX0, VXAR, x1, VX0
- vfmadd.s VX1, VXAR, x2, VX1
- vfmsub.s VX2, VXBR, x3, VX2
- vfmadd.s VX3, VXBR, x4, VX3
- vfadd.s x3, VX0, VX2
- vfadd.s x4, VX1, VX3
- addi.d I, I, -1
-
- vstelm.w x3, YY, 0 * SIZE, 0
- vstelm.w x4, YY, 1 * SIZE, 0
- add.d YY, YY, INCY
- vstelm.w x3, YY, 0 * SIZE, 1
- vstelm.w x4, YY, 1 * SIZE, 1
- add.d YY, YY, INCY
- vstelm.w x3, YY, 0 * SIZE, 2
- vstelm.w x4, YY, 1 * SIZE, 2
- add.d YY, YY, INCY
- vstelm.w x3, YY, 0 * SIZE, 3
- vstelm.w x4, YY, 1 * SIZE, 3
- add.d YY, YY, INCY
- blt $r0, I, .L224
- b .L997
- .align 3
- #endif
-
- .L997:
- andi I, N, 3
- bge $r0, I, .L999
- .align 3
-
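- // Scalar loop: handles the N % 4 leftover elements from the vector loops, and
- // the whole vector when control branched here early because INCX & INCY == 0.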
- .L998:
- #ifdef DOUBLE
- fld.d a1, X, 0 * SIZE
- fld.d a2, X, 1 * SIZE
- fld.d a3, Y, 0 * SIZE
- fld.d a4, Y, 1 * SIZE
- addi.d I, I, -1
- fmul.d s1, ALPHAI, a2
- fmul.d s2, ALPHAI, a1
- fmul.d s3, BETAI, a4
- fmul.d s4, BETAI, a3
- fmsub.d s1, ALPHAR, a1, s1
- fmadd.d s2, a2, ALPHAR, s2
- fmsub.d s3, BETAR, a3, s3
- fmadd.d s4, a4, BETAR, s4
- fadd.d s3, s3, s1
- fadd.d s4, s4, s2
- fst.d s3, Y, 0 * SIZE
- fst.d s4, Y, 1 * SIZE
- add.d X, X, INCX
- add.d Y, Y, INCY
- blt $r0, I, .L998
- .align 3
- #else
- fld.s a1, X, 0 * SIZE
- fld.s a2, X, 1 * SIZE
- fld.s a3, Y, 0 * SIZE
- fld.s a4, Y, 1 * SIZE
- addi.d I, I, -1
- fmul.s s1, ALPHAI, a2
- fmul.s s2, ALPHAI, a1
- fmul.s s3, BETAI, a4
- fmul.s s4, BETAI, a3
- fmsub.s s1, ALPHAR, a1, s1
- fmadd.s s2, a2, ALPHAR, s2
- fmsub.s s3, BETAR, a3, s3
- fmadd.s s4, a4, BETAR, s4
- fadd.s s3, s3, s1
- fadd.s s4, s4, s2
- fst.s s3, Y, 0 * SIZE
- fst.s s4, Y, 1 * SIZE
- add.d X, X, INCX
- add.d Y, Y, INCY
- blt $r0, I, .L998
- .align 3
- #endif
- .L999:
- move $r4, $r12
- jirl $r0, $r1, 0x0
- .align 3
-
- EPILOGUE