- #define ASSEMBLER
-
- #include "common.h"
- #define N $r4
- #define ALPHAR $f0
- #define ALPHAI $f1
- #define X $r5
- #define INCX $r6
- #define BETAR $f2
- #define BETAI $f3
- #define Y $r7
- #define INCY $r8
-
- #define I $r12
- #define TEMP $r13
- #define t1 $r14
- #define t2 $r16
- #define t3 $r15
- #define t4 $r17
- #define XX $r18
- #define YY $r19
- #define a1 $f12
- #define a2 $f13
- #define a3 $f14
- #define a4 $f15
- #define s1 $f16
- #define s2 $f17
- #define s3 $f18
- #define s4 $f19
- #define VX0 $vr8
- #define VX1 $vr20
- #define VX2 $vr21
- #define VX3 $vr22
- #define VXAR $vr23
- #define VXAI $vr19
- #define VXBR $vr14
- #define VXBI $vr13
- #define VXZ $vr12
- #define x1 $vr18
- #define x2 $vr17
- #define x3 $vr16
- #define x4 $vr15
-
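- // Complex AXPBY: y[i] = alpha * x[i] + beta * y[i], with alpha = (ALPHAR, ALPHAI) and
- // beta = (BETAR, BETAI); x and y hold interleaved (re, im) pairs with strides INCX / INCY.
- // Per element: y_r = alpha_r*x_r - alpha_i*x_i + beta_r*y_r - beta_i*y_i
- //              y_i = alpha_r*x_i + alpha_i*x_r + beta_r*y_i + beta_i*y_r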
- PROLOGUE
-
- bge $r0, N, .L999
- movgr2fr.d a1, $r0
- #ifdef DOUBLE
- ffint.d.l a1, a1
- #else
- ffint.s.l a1, a1
- #endif
- slli.d INCX, INCX, ZBASE_SHIFT
- slli.d INCY, INCY, ZBASE_SHIFT
- #ifdef DOUBLE
- movfr2gr.d t1, ALPHAR
- vreplgr2vr.d VXAR, t1
- movfr2gr.d t2, ALPHAI
- vreplgr2vr.d VXAI, t2
- movfr2gr.d t3, BETAR
- vreplgr2vr.d VXBR, t3
- movfr2gr.d t4, BETAI
- vreplgr2vr.d VXBI, t4
- #else
- movfr2gr.s t1, ALPHAR
- vreplgr2vr.w VXAR, t1
- movfr2gr.s t2, ALPHAI
- vreplgr2vr.w VXAI, t2
- movfr2gr.s t3, BETAR
- vreplgr2vr.w VXBR, t3
- movfr2gr.s t4, BETAI
- vreplgr2vr.w VXBI, t4
- #endif
- vxor.v VXZ, VXZ, VXZ
- // If incx == 0 || incy == 0, do one by one
- and TEMP, INCX, INCY
- or I, N, N
- beqz TEMP, .L998
-
- li.d TEMP, 1
- slli.d TEMP, TEMP, ZBASE_SHIFT
- srai.d I, N, 2
- bne INCX, TEMP, .L20
- bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
- b .L11 // INCX==1 and INCY==1
- .L20:
- bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
- b .L21 // INCX!=1 and INCY==1
-
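- // INCX == 1 and INCY == 1: pick one of four contiguous vector loops depending on whether
- // alpha and/or beta are zero, so the zero cases skip the corresponding loads and multiplies.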
- .L11:
- bge $r0, I, .L997
- #ifdef DOUBLE
- fcmp.ceq.d $fcc0, BETAR, a1
- fcmp.ceq.d $fcc1, BETAI, a1
- fcmp.ceq.d $fcc2, ALPHAR, a1
- fcmp.ceq.d $fcc3, ALPHAI, a1
- #else
- fcmp.ceq.s $fcc0, BETAR, a1
- fcmp.ceq.s $fcc1, BETAI, a1
- fcmp.ceq.s $fcc2, ALPHAR, a1
- fcmp.ceq.s $fcc3, ALPHAI, a1
- #endif
- bceqz $fcc0, .L13
- bceqz $fcc1, .L13
- b .L14
- .align 3
-
- .L13:
- bceqz $fcc2, .L114
- bceqz $fcc3, .L114 //!(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0)
- b .L113 //!(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
-
- .L14:
- bceqz $fcc2, .L112
- bceqz $fcc3, .L112 //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0)
- b .L111 //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
- .align 3
-
- .L111: //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
- #ifdef DOUBLE
- vst VXZ, Y, 0 * SIZE
- vst VXZ, Y, 2 * SIZE
- vst VXZ, Y, 4 * SIZE
- vst VXZ, Y, 6 * SIZE
- addi.d Y, Y, 8 * SIZE
- addi.d I, I, -1
- blt $r0, I, .L111
- b .L997
- .align 3
- #else
- vst VXZ, Y, 0 * SIZE
- vst VXZ, Y, 4 * SIZE
- addi.d Y, Y, 8 * SIZE
- addi.d I, I, -1
- blt $r0, I, .L111
- b .L997
- .align 3
- #endif
-
- .L112: //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0)
- #ifdef DOUBLE
- vld VX0, X, 0 * SIZE
- vld VX1, X, 2 * SIZE
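- 	// Deinterleave the (re, im) pairs: x1 = real parts, x2 = imaginary parts. x3/x4 then
- 	// hold the real/imaginary parts of alpha * x, and vilvl/vilvh re-interleave them
- 	// into (re, im) order before the stores.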
- vpickev.d x1, VX1, VX0
- vpickod.d x2, VX1, VX0
- vfmul.d x3, VXAI, x2
- vfmul.d x4, VXAI, x1
- vfmsub.d x3, VXAR, x1, x3
- vfmadd.d x4, VXAR, x2, x4
- 	vilvl.d VX2, x4, x3
- vilvh.d VX3, x4, x3
- vst VX2, Y, 0 * SIZE
- vst VX3, Y, 2 * SIZE
-
- vld VX0, X, 4 * SIZE
- vld VX1, X, 6 * SIZE
- vpickev.d x1, VX1, VX0
- vpickod.d x2, VX1, VX0
- vfmul.d x3, VXAI, x2
- vfmul.d x4, VXAI, x1
- vfmsub.d x3, VXAR, x1, x3
- vfmadd.d x4, VXAR, x2, x4
- 	vilvl.d VX2, x4, x3
- vilvh.d VX3, x4, x3
- vst VX2, Y, 4 * SIZE
- vst VX3, Y, 6 * SIZE
- addi.d X, X, 8 * SIZE
- addi.d Y, Y, 8 * SIZE
- addi.d I, I, -1
- blt $r0, I, .L112
- b .L997
- .align 3
- #else
- vld VX0, X, 0 * SIZE
- vld VX1, X, 4 * SIZE
- vpickev.w x1, VX1, VX0
- vpickod.w x2, VX1, VX0
- vfmul.s x3, VXAI, x2
- vfmul.s x4, VXAI, x1
- vfmsub.s x3, VXAR, x1, x3
- vfmadd.s x4, VXAR, x2, x4
- 	vilvl.w VX2, x4, x3
- vilvh.w VX3, x4, x3
- vst VX2, Y, 0 * SIZE
- vst VX3, Y, 4 * SIZE
- addi.d X, X, 8 * SIZE
- addi.d Y, Y, 8 * SIZE
- addi.d I, I, -1
- blt $r0, I, .L112
- b .L997
- .align 3
- #endif
-
- .L113: //!(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
- #ifdef DOUBLE
- vld VX0, Y, 0 * SIZE
- vld VX1, Y, 2 * SIZE
- vpickev.d x1, VX1, VX0
- vpickod.d x2, VX1, VX0
- vfmul.d x3, VXBI, x2
- vfmul.d x4, VXBI, x1
- vfmsub.d x3, VXBR, x1, x3
- vfmadd.d x4, VXBR, x2, x4
- 	vilvl.d VX2, x4, x3
- vilvh.d VX3, x4, x3
- vst VX2, Y, 0 * SIZE
- vst VX3, Y, 2 * SIZE
- vld VX0, Y, 4 * SIZE
- vld VX1, Y, 6 * SIZE
- vpickev.d x1, VX1, VX0
- vpickod.d x2, VX1, VX0
- vfmul.d x3, VXBI, x2
- vfmul.d x4, VXBI, x1
- vfmsub.d x3, VXBR, x1, x3
- vfmadd.d x4, VXBR, x2, x4
- 	vilvl.d VX2, x4, x3
- vilvh.d VX3, x4, x3
- vst VX2, Y, 4 * SIZE
- vst VX3, Y, 6 * SIZE
- addi.d Y, Y, 8 * SIZE
- addi.d I, I, -1
- blt $r0, I, .L113
- b .L997
- .align 3
- #else
- vld VX0, Y, 0 * SIZE
- vld VX1, Y, 4 * SIZE
- vpickev.w x1, VX1, VX0
- vpickod.w x2, VX1, VX0
- vfmul.s x3, VXBI, x2
- vfmul.s x4, VXBI, x1
- vfmsub.s x3, VXBR, x1, x3
- vfmadd.s x4, VXBR, x2, x4
- 	vilvl.w VX2, x4, x3
- vilvh.w VX3, x4, x3
- vst VX2, Y, 0 * SIZE
- vst VX3, Y, 4 * SIZE
- addi.d Y, Y, 8 * SIZE
- addi.d I, I, -1
- blt $r0, I, .L113
- b .L997
- .align 3
- #endif
-
- .L114: //!(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0)
- #ifdef DOUBLE
- vld VX0, X, 0 * SIZE
- vld VX1, X, 2 * SIZE
- vld VX2, Y, 0 * SIZE
- vld VX3, Y, 2 * SIZE
- vpickev.d x1, VX1, VX0
- vpickod.d x2, VX1, VX0
- vpickev.d x3, VX3, VX2
- vpickod.d x4, VX3, VX2
- vfmul.d VX0, VXAI, x2
- vfmul.d VX1, VXAI, x1
- vfmul.d VX2, VXBI, x4
- vfmul.d VX3, VXBI, x3
- vfmsub.d VX0, VXAR, x1, VX0
- vfmadd.d VX1, VXAR, x2, VX1
- vfmsub.d VX2, VXBR, x3, VX2
- vfmadd.d VX3, VXBR, x4, VX3
- vfadd.d x3, VX0, VX2
- vfadd.d x4, VX1, VX3
- 	vilvl.d VX2, x4, x3
- vilvh.d VX3, x4, x3
- vst VX2, Y, 0 * SIZE
- vst VX3, Y, 2 * SIZE
-
- vld VX0, X, 4 * SIZE
- vld VX1, X, 6 * SIZE
- vld VX2, Y, 4 * SIZE
- vld VX3, Y, 6 * SIZE
- vpickev.d x1, VX1, VX0
- vpickod.d x2, VX1, VX0
- vpickev.d x3, VX3, VX2
- vpickod.d x4, VX3, VX2
- vfmul.d VX0, VXAI, x2
- vfmul.d VX1, VXAI, x1
- vfmul.d VX2, VXBI, x4
- vfmul.d VX3, VXBI, x3
- vfmsub.d VX0, VXAR, x1, VX0
- vfmadd.d VX1, VXAR, x2, VX1
- vfmsub.d VX2, VXBR, x3, VX2
- vfmadd.d VX3, VXBR, x4, VX3
- vfadd.d x3, VX0, VX2
- vfadd.d x4, VX1, VX3
- 	vilvl.d VX2, x4, x3
- vilvh.d VX3, x4, x3
- vst VX2, Y, 4 * SIZE
- vst VX3, Y, 6 * SIZE
- addi.d X, X, 8 * SIZE
- addi.d Y, Y, 8 * SIZE
- addi.d I, I, -1
- blt $r0, I, .L114
- b .L997
- .align 3
- #else
- vld VX0, X, 0 * SIZE
- vld VX1, X, 4 * SIZE
- vld VX2, Y, 0 * SIZE
- vld VX3, Y, 4 * SIZE
- vpickev.w x1, VX1, VX0
- vpickod.w x2, VX1, VX0
- vpickev.w x3, VX3, VX2
- vpickod.w x4, VX3, VX2
- vfmul.s VX0, VXAI, x2
- vfmul.s VX1, VXAI, x1
- vfmul.s VX2, VXBI, x4
- vfmul.s VX3, VXBI, x3
- vfmsub.s VX0, VXAR, x1, VX0
- vfmadd.s VX1, VXAR, x2, VX1
- vfmsub.s VX2, VXBR, x3, VX2
- vfmadd.s VX3, VXBR, x4, VX3
- vfadd.s x3, VX0, VX2
- vfadd.s x4, VX1, VX3
- 	vilvl.w VX2, x4, x3
- vilvh.w VX3, x4, x3
- vst VX2, Y, 0 * SIZE
- vst VX3, Y, 4 * SIZE
- addi.d X, X, 8 * SIZE
- addi.d Y, Y, 8 * SIZE
- addi.d I, I, -1
- blt $r0, I, .L114
- b .L997
- .align 3
- #endif
-
- .L12: // INCX==1 and INCY!=1
- bge $r0, I, .L997
- move YY, Y
- .align 3
-
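- 	// X is contiguous (vld) while Y is gathered element by element along INCY; the results
- 	// are scattered back through the YY pointer with vstelm.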
- .L121:
- #ifdef DOUBLE
- vld VX0, X, 0 * SIZE
- vld VX1, X, 2 * SIZE
- ld.d t1, Y, 0 * SIZE
- ld.d t2, Y, 1 * SIZE
- add.d Y, Y, INCY
- ld.d t3, Y, 0 * SIZE
- ld.d t4, Y, 1 * SIZE
- vinsgr2vr.d x3, t1, 0
- vinsgr2vr.d x4, t2, 0
- vinsgr2vr.d x3, t3, 1
- vinsgr2vr.d x4, t4, 1
- add.d Y, Y, INCY
- vpickev.d x1, VX1, VX0
- vpickod.d x2, VX1, VX0
- vfmul.d VX0, VXAI, x2
- vfmul.d VX1, VXAI, x1
- vfmul.d VX2, VXBI, x4
- vfmul.d VX3, VXBI, x3
- vfmsub.d VX0, VXAR, x1, VX0
- vfmadd.d VX1, VXAR, x2, VX1
- vfmsub.d VX2, VXBR, x3, VX2
- vfmadd.d VX3, VXBR, x4, VX3
- vfadd.d x3, VX0, VX2
- vfadd.d x4, VX1, VX3
- vstelm.d x3, YY, 0 * SIZE, 0
- vstelm.d x4, YY, 1 * SIZE, 0
- add.d YY, YY, INCY
- vstelm.d x3, YY, 0 * SIZE, 1
- vstelm.d x4, YY, 1 * SIZE, 1
- add.d YY, YY, INCY
-
- vld VX0, X, 4 * SIZE
- vld VX1, X, 6 * SIZE
- ld.d t1, Y, 0 * SIZE
- ld.d t2, Y, 1 * SIZE
- add.d Y, Y, INCY
- ld.d t3, Y, 0 * SIZE
- ld.d t4, Y, 1 * SIZE
- vinsgr2vr.d x3, t1, 0
- vinsgr2vr.d x4, t2, 0
- vinsgr2vr.d x3, t3, 1
- vinsgr2vr.d x4, t4, 1
- add.d Y, Y, INCY
- vpickev.d x1, VX1, VX0
- vpickod.d x2, VX1, VX0
- vfmul.d VX0, VXAI, x2
- vfmul.d VX1, VXAI, x1
- vfmul.d VX2, VXBI, x4
- vfmul.d VX3, VXBI, x3
- vfmsub.d VX0, VXAR, x1, VX0
- vfmadd.d VX1, VXAR, x2, VX1
- vfmsub.d VX2, VXBR, x3, VX2
- vfmadd.d VX3, VXBR, x4, VX3
- vfadd.d x3, VX0, VX2
- vfadd.d x4, VX1, VX3
- addi.d I, I, -1
- vstelm.d x3, YY, 0 * SIZE, 0
- vstelm.d x4, YY, 1 * SIZE, 0
- add.d YY, YY, INCY
- vstelm.d x3, YY, 0 * SIZE, 1
- vstelm.d x4, YY, 1 * SIZE, 1
- add.d YY, YY, INCY
- addi.d X, X, 8 * SIZE
- blt $r0, I, .L121
- b .L997
- .align 3
- #else
- vld VX0, X, 0 * SIZE
- ld.w t1, Y, 0 * SIZE
- ld.w t2, Y, 1 * SIZE
- add.d Y, Y, INCY
- ld.w t3, Y, 0 * SIZE
- ld.w t4, Y, 1 * SIZE
- add.d Y, Y, INCY
- vinsgr2vr.w x3, t1, 0
- vinsgr2vr.w x4, t2, 0
- vinsgr2vr.w x3, t3, 1
- vinsgr2vr.w x4, t4, 1
-
- vld VX1, X, 4 * SIZE
- ld.w t1, Y, 0 * SIZE
- ld.w t2, Y, 1 * SIZE
- add.d Y, Y, INCY
- ld.w t3, Y, 0 * SIZE
- ld.w t4, Y, 1 * SIZE
- vinsgr2vr.w x3, t1, 2
- vinsgr2vr.w x4, t2, 2
- vinsgr2vr.w x3, t3, 3
- vinsgr2vr.w x4, t4, 3
- add.d Y, Y, INCY
-
- vpickev.w x1, VX1, VX0
- vpickod.w x2, VX1, VX0
- vfmul.s VX0, VXAI, x2
- vfmul.s VX1, VXAI, x1
- vfmul.s VX2, VXBI, x4
- vfmul.s VX3, VXBI, x3
- vfmsub.s VX0, VXAR, x1, VX0
- vfmadd.s VX1, VXAR, x2, VX1
- vfmsub.s VX2, VXBR, x3, VX2
- vfmadd.s VX3, VXBR, x4, VX3
- vfadd.s x3, VX0, VX2
- vfadd.s x4, VX1, VX3
- addi.d I, I, -1
- vstelm.w x3, YY, 0 * SIZE, 0
- vstelm.w x4, YY, 1 * SIZE, 0
- add.d YY, YY, INCY
- vstelm.w x3, YY, 0 * SIZE, 1
- vstelm.w x4, YY, 1 * SIZE, 1
- add.d YY, YY, INCY
- vstelm.w x3, YY, 0 * SIZE, 2
- vstelm.w x4, YY, 1 * SIZE, 2
- add.d YY, YY, INCY
- vstelm.w x3, YY, 0 * SIZE, 3
- vstelm.w x4, YY, 1 * SIZE, 3
- add.d YY, YY, INCY
- addi.d X, X, 8 * SIZE
- blt $r0, I, .L121
- b .L997
- .align 3
- #endif
-
- .L21: // INCX!=1 and INCY==1
- bge $r0, I, .L997
- .align 3
-
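- 	// Y is contiguous (vld/vst) while X is gathered element by element along INCX into x1/x2.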
- .L211:
- #ifdef DOUBLE
- vld VX2, Y, 0 * SIZE
- vld VX3, Y, 2 * SIZE
- ld.d t1, X, 0 * SIZE
- ld.d t2, X, 1 * SIZE
- add.d X, X, INCX
- ld.d t3, X, 0 * SIZE
- ld.d t4, X, 1 * SIZE
- vinsgr2vr.d x1, t1, 0
- vinsgr2vr.d x2, t2, 0
- vinsgr2vr.d x1, t3, 1
- vinsgr2vr.d x2, t4, 1
- add.d X, X, INCX
- vpickev.d x3, VX3, VX2
- vpickod.d x4, VX3, VX2
- vfmul.d VX0, VXAI, x2
- vfmul.d VX1, VXAI, x1
- vfmul.d VX2, VXBI, x4
- vfmul.d VX3, VXBI, x3
- vfmsub.d VX0, VXAR, x1, VX0
- vfmadd.d VX1, VXAR, x2, VX1
- vfmsub.d VX2, VXBR, x3, VX2
- vfmadd.d VX3, VXBR, x4, VX3
- vfadd.d x3, VX0, VX2
- vfadd.d x4, VX1, VX3
- 	vilvl.d VX2, x4, x3
- vilvh.d VX3, x4, x3
- vst VX2, Y, 0 * SIZE
- vst VX3, Y, 2 * SIZE
-
- vld VX2, Y, 4 * SIZE
- vld VX3, Y, 6 * SIZE
- ld.d t1, X, 0 * SIZE
- ld.d t2, X, 1 * SIZE
- add.d X, X, INCX
- ld.d t3, X, 0 * SIZE
- ld.d t4, X, 1 * SIZE
- vinsgr2vr.d x1, t1, 0
- vinsgr2vr.d x2, t2, 0
- vinsgr2vr.d x1, t3, 1
- vinsgr2vr.d x2, t4, 1
- add.d X, X, INCX
- vpickev.d x3, VX3, VX2
- vpickod.d x4, VX3, VX2
- vfmul.d VX0, VXAI, x2
- vfmul.d VX1, VXAI, x1
- vfmul.d VX2, VXBI, x4
- vfmul.d VX3, VXBI, x3
- vfmsub.d VX0, VXAR, x1, VX0
- vfmadd.d VX1, VXAR, x2, VX1
- vfmsub.d VX2, VXBR, x3, VX2
- vfmadd.d VX3, VXBR, x4, VX3
- vfadd.d x3, VX0, VX2
- vfadd.d x4, VX1, VX3
- 	vilvl.d VX2, x4, x3
- vilvh.d VX3, x4, x3
- addi.d I, I, -1
- 	vst VX2, Y, 4 * SIZE
- 	vst VX3, Y, 6 * SIZE
- addi.d Y, Y, 8 * SIZE
- blt $r0, I, .L211
- b .L997
- .align 3
- #else
- vld VX2, Y, 0 * SIZE
- ld.w t1, X, 0 * SIZE
- ld.w t2, X, 1 * SIZE
- add.d X, X, INCX
- ld.w t3, X, 0 * SIZE
- ld.w t4, X, 1 * SIZE
- add.d X, X, INCX
- vinsgr2vr.w x1, t1, 0
- vinsgr2vr.w x2, t2, 0
- vinsgr2vr.w x1, t3, 1
- vinsgr2vr.w x2, t4, 1
- vld VX3, Y, 4 * SIZE
- ld.w t1, X, 0 * SIZE
- ld.w t2, X, 1 * SIZE
- add.d X, X, INCX
- ld.w t3, X, 0 * SIZE
- ld.w t4, X, 1 * SIZE
- vinsgr2vr.w x1, t1, 2
- vinsgr2vr.w x2, t2, 2
- vinsgr2vr.w x1, t3, 3
- vinsgr2vr.w x2, t4, 3
- add.d X, X, INCX
-
- vpickev.w x3, VX3, VX2
- vpickod.w x4, VX3, VX2
- vfmul.s VX0, VXAI, x2
- vfmul.s VX1, VXAI, x1
- vfmul.s VX2, VXBI, x4
- vfmul.s VX3, VXBI, x3
- vfmsub.s VX0, VXAR, x1, VX0
- vfmadd.s VX1, VXAR, x2, VX1
- vfmsub.s VX2, VXBR, x3, VX2
- vfmadd.s VX3, VXBR, x4, VX3
- vfadd.s x3, VX0, VX2
- vfadd.s x4, VX1, VX3
- 	vilvl.w VX2, x4, x3
- vilvh.w VX3, x4, x3
- addi.d I, I, -1
- vst VX2, Y, 0 * SIZE
- vst VX3, Y, 4 * SIZE
- addi.d Y, Y, 8 * SIZE
- blt $r0, I, .L211
- b .L997
- .align 3
- #endif
-
- .L22: // INCX!=1 and INCY!=1
- bge $r0, I, .L997
- move YY, Y
- #ifdef DOUBLE
- fcmp.ceq.d $fcc0, BETAR, a1
- fcmp.ceq.d $fcc1, BETAI, a1
- fcmp.ceq.d $fcc2, ALPHAR, a1
- fcmp.ceq.d $fcc3, ALPHAI, a1
- #else
- fcmp.ceq.s $fcc0, BETAR, a1
- fcmp.ceq.s $fcc1, BETAI, a1
- fcmp.ceq.s $fcc2, ALPHAR, a1
- fcmp.ceq.s $fcc3, ALPHAI, a1
- #endif
- bceqz $fcc0, .L23
- bceqz $fcc1, .L23
- b .L24
- .align 3
-
- .L23:
- bceqz $fcc2, .L224
- bceqz $fcc3, .L224 //!(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0)
- b .L223 //!(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
- .align 3
-
- .L24:
- bceqz $fcc2, .L222
- bceqz $fcc3, .L222 //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0)
- b .L221 //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
- .align 3
-
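- 	// Both strides are non-unit: each complex y element is written as two element stores,
- 	// the real half at offset 0 and the imaginary half at offset 1 * SIZE, before advancing by INCY.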
- .L221: //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
- #ifdef DOUBLE
- 	vstelm.d VXZ, Y, 0 * SIZE, 0
- 	vstelm.d VXZ, Y, 1 * SIZE, 0
- 	add.d Y, Y, INCY
- 	vstelm.d VXZ, Y, 0 * SIZE, 0
- 	vstelm.d VXZ, Y, 1 * SIZE, 0
- 	add.d Y, Y, INCY
- 	vstelm.d VXZ, Y, 0 * SIZE, 0
- 	vstelm.d VXZ, Y, 1 * SIZE, 0
- 	add.d Y, Y, INCY
- 	vstelm.d VXZ, Y, 0 * SIZE, 0
- 	vstelm.d VXZ, Y, 1 * SIZE, 0
- 	add.d Y, Y, INCY
- addi.d I, I, -1
- blt $r0, I, .L221
- b .L997
- .align 3
- #else
- 	vstelm.w VXZ, Y, 0 * SIZE, 0
- 	vstelm.w VXZ, Y, 1 * SIZE, 0
- 	add.d Y, Y, INCY
- 	vstelm.w VXZ, Y, 0 * SIZE, 0
- 	vstelm.w VXZ, Y, 1 * SIZE, 0
- 	add.d Y, Y, INCY
- 	vstelm.w VXZ, Y, 0 * SIZE, 0
- 	vstelm.w VXZ, Y, 1 * SIZE, 0
- 	add.d Y, Y, INCY
- 	vstelm.w VXZ, Y, 0 * SIZE, 0
- 	vstelm.w VXZ, Y, 1 * SIZE, 0
- 	add.d Y, Y, INCY
- addi.d I, I, -1
- blt $r0, I, .L221
- b .L997
- .align 3
- #endif
-
- .L222: //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0)
- #ifdef DOUBLE
- ld.d t1, X, 0 * SIZE
- ld.d t2, X, 1 * SIZE
- add.d X, X, INCX
- ld.d t3, X, 0 * SIZE
- ld.d t4, X, 1 * SIZE
- add.d X, X, INCX
- vinsgr2vr.d x1, t1, 0
- vinsgr2vr.d x2, t2, 0
- vinsgr2vr.d x1, t3, 1
- vinsgr2vr.d x2, t4, 1
- vfmul.d x3, VXAI, x2
- vfmul.d x4, VXAI, x1
- vfmsub.d x3, VXAR, x1, x3
- vfmadd.d x4, VXAR, x2, x4
- vstelm.d x3, YY, 0 * SIZE, 0
- vstelm.d x4, YY, 1 * SIZE, 0
- add.d YY, YY, INCY
- vstelm.d x3, YY, 0 * SIZE, 1
- vstelm.d x4, YY, 1 * SIZE, 1
- add.d YY, YY, INCY
-
- ld.d t1, X, 0 * SIZE
- ld.d t2, X, 1 * SIZE
- add.d X, X, INCX
- ld.d t3, X, 0 * SIZE
- ld.d t4, X, 1 * SIZE
- vinsgr2vr.d x1, t1, 0
- vinsgr2vr.d x2, t2, 0
- vinsgr2vr.d x1, t3, 1
- vinsgr2vr.d x2, t4, 1
- add.d X, X, INCX
- vfmul.d x3, VXAI, x2
- vfmul.d x4, VXAI, x1
- vfmsub.d x3, VXAR, x1, x3
- vfmadd.d x4, VXAR, x2, x4
- addi.d I, I, -1
- vstelm.d x3, YY, 0 * SIZE, 0
- vstelm.d x4, YY, 1 * SIZE, 0
- add.d YY, YY, INCY
- vstelm.d x3, YY, 0 * SIZE, 1
- vstelm.d x4, YY, 1 * SIZE, 1
- add.d YY, YY, INCY
- blt $r0, I, .L222
- b .L997
- .align 3
- #else
- ld.w t1, X, 0 * SIZE
- ld.w t2, X, 1 * SIZE
- add.d X, X, INCX
- ld.w t3, X, 0 * SIZE
- ld.w t4, X, 1 * SIZE
- add.d X, X, INCX
- vinsgr2vr.w x1, t1, 0
- vinsgr2vr.w x2, t2, 0
- vinsgr2vr.w x1, t3, 1
- vinsgr2vr.w x2, t4, 1
-
- ld.w t1, X, 0 * SIZE
- ld.w t2, X, 1 * SIZE
- add.d X, X, INCX
- ld.w t3, X, 0 * SIZE
- ld.w t4, X, 1 * SIZE
- vinsgr2vr.w x1, t1, 2
- vinsgr2vr.w x2, t2, 2
- vinsgr2vr.w x1, t3, 3
- vinsgr2vr.w x2, t4, 3
- add.d X, X, INCX
- vfmul.s x3, VXAI, x2
- vfmul.s x4, VXAI, x1
- vfmsub.s x3, VXAR, x1, x3
- vfmadd.s x4, VXAR, x2, x4
- addi.d I, I, -1
- vstelm.w x3, YY, 0 * SIZE, 0
- vstelm.w x4, YY, 1 * SIZE, 0
- add.d YY, YY, INCY
- vstelm.w x3, YY, 0 * SIZE, 1
- vstelm.w x4, YY, 1 * SIZE, 1
- add.d YY, YY, INCY
- vstelm.w x3, YY, 0 * SIZE, 2
- vstelm.w x4, YY, 1 * SIZE, 2
- add.d YY, YY, INCY
- vstelm.w x3, YY, 0 * SIZE, 3
- vstelm.w x4, YY, 1 * SIZE, 3
- add.d YY, YY, INCY
- blt $r0, I, .L222
- b .L997
- .align 3
- #endif
-
- .L223: //!(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
- #ifdef DOUBLE
- ld.d t1, Y, 0 * SIZE
- ld.d t2, Y, 1 * SIZE
- add.d Y, Y, INCY
- ld.d t3, Y, 0 * SIZE
- ld.d t4, Y, 1 * SIZE
- vinsgr2vr.d x1, t1, 0
- vinsgr2vr.d x2, t2, 0
- vinsgr2vr.d x1, t3, 1
- vinsgr2vr.d x2, t4, 1
- add.d Y, Y, INCY
- vfmul.d x3, VXBI, x2
- vfmul.d x4, VXBI, x1
- vfmsub.d x3, VXBR, x1, x3
- vfmadd.d x4, VXBR, x2, x4
- vstelm.d x3, YY, 0 * SIZE, 0
- vstelm.d x4, YY, 1 * SIZE, 0
- add.d YY, YY, INCY
- vstelm.d x3, YY, 0 * SIZE, 1
- vstelm.d x4, YY, 1 * SIZE, 1
- add.d YY, YY, INCY
- ld.d t1, Y, 0 * SIZE
- ld.d t2, Y, 1 * SIZE
- add.d Y, Y, INCY
- ld.d t3, Y, 0 * SIZE
- ld.d t4, Y, 1 * SIZE
- vinsgr2vr.d x1, t1, 0
- vinsgr2vr.d x2, t2, 0
- vinsgr2vr.d x1, t3, 1
- vinsgr2vr.d x2, t4, 1
- add.d Y, Y, INCY
- vfmul.d x3, VXBI, x2
- vfmul.d x4, VXBI, x1
- vfmsub.d x3, VXBR, x1, x3
- vfmadd.d x4, VXBR, x2, x4
- addi.d I, I, -1
- vstelm.d x3, YY, 0 * SIZE, 0
- vstelm.d x4, YY, 1 * SIZE, 0
- add.d YY, YY, INCY
- vstelm.d x3, YY, 0 * SIZE, 1
- vstelm.d x4, YY, 1 * SIZE, 1
- add.d YY, YY, INCY
- blt $r0, I, .L223
- b .L997
- .align 3
- #else
- ld.w t1, Y, 0 * SIZE
- ld.w t2, Y, 1 * SIZE
- add.d Y, Y, INCY
- ld.w t3, Y, 0 * SIZE
- ld.w t4, Y, 1 * SIZE
- add.d Y, Y, INCY
- vinsgr2vr.w x1, t1, 0
- vinsgr2vr.w x2, t2, 0
- vinsgr2vr.w x1, t3, 1
- vinsgr2vr.w x2, t4, 1
-
- ld.w t1, Y, 0 * SIZE
- ld.w t2, Y, 1 * SIZE
- add.d Y, Y, INCY
- ld.w t3, Y, 0 * SIZE
- ld.w t4, Y, 1 * SIZE
- vinsgr2vr.w x1, t1, 2
- vinsgr2vr.w x2, t2, 2
- vinsgr2vr.w x1, t3, 3
- vinsgr2vr.w x2, t4, 3
- add.d Y, Y, INCY
- vfmul.s x3, VXBI, x2
- vfmul.s x4, VXBI, x1
- vfmsub.s x3, VXBR, x1, x3
- vfmadd.s x4, VXBR, x2, x4
-
- addi.d I, I, -1
- vstelm.w x3, YY, 0 * SIZE, 0
- vstelm.w x4, YY, 1 * SIZE, 0
- add.d YY, YY, INCY
- vstelm.w x3, YY, 0 * SIZE, 1
- vstelm.w x4, YY, 1 * SIZE, 1
- add.d YY, YY, INCY
- vstelm.w x3, YY, 0 * SIZE, 2
- vstelm.w x4, YY, 1 * SIZE, 2
- add.d YY, YY, INCY
- vstelm.w x3, YY, 0 * SIZE, 3
- vstelm.w x4, YY, 1 * SIZE, 3
- add.d YY, YY, INCY
- blt $r0, I, .L223
- b .L997
- .align 3
- #endif
-
- .L224: //!(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0)
- #ifdef DOUBLE
- ld.d t1, X, 0 * SIZE
- ld.d t2, X, 1 * SIZE
- add.d X, X, INCX
- ld.d t3, X, 0 * SIZE
- ld.d t4, X, 1 * SIZE
- add.d X, X, INCX
- vinsgr2vr.d x1, t1, 0
- vinsgr2vr.d x2, t2, 0
- vinsgr2vr.d x1, t3, 1
- vinsgr2vr.d x2, t4, 1
- ld.d t1, Y, 0 * SIZE
- ld.d t2, Y, 1 * SIZE
- add.d Y, Y, INCY
- ld.d t3, Y, 0 * SIZE
- ld.d t4, Y, 1 * SIZE
- vinsgr2vr.d x3, t1, 0
- vinsgr2vr.d x4, t2, 0
- vinsgr2vr.d x3, t3, 1
- vinsgr2vr.d x4, t4, 1
- add.d Y, Y, INCY
- vfmul.d VX0, VXAI, x2
- vfmul.d VX1, VXAI, x1
- vfmul.d VX2, VXBI, x4
- vfmul.d VX3, VXBI, x3
- vfmsub.d VX0, VXAR, x1, VX0
- vfmadd.d VX1, VXAR, x2, VX1
- vfmsub.d VX2, VXBR, x3, VX2
- vfmadd.d VX3, VXBR, x4, VX3
- vfadd.d x3, VX0, VX2
- vfadd.d x4, VX1, VX3
- vstelm.d x3, YY, 0 * SIZE, 0
- vstelm.d x4, YY, 1 * SIZE, 0
- add.d YY, YY, INCY
- vstelm.d x3, YY, 0 * SIZE, 1
- vstelm.d x4, YY, 1 * SIZE, 1
- add.d YY, YY, INCY
-
- ld.d t1, X, 0 * SIZE
- ld.d t2, X, 1 * SIZE
- add.d X, X, INCX
- ld.d t3, X, 0 * SIZE
- ld.d t4, X, 1 * SIZE
- add.d X, X, INCX
- vinsgr2vr.d x1, t1, 0
- vinsgr2vr.d x2, t2, 0
- vinsgr2vr.d x1, t3, 1
- vinsgr2vr.d x2, t4, 1
- ld.d t1, Y, 0 * SIZE
- ld.d t2, Y, 1 * SIZE
- add.d Y, Y, INCY
- ld.d t3, Y, 0 * SIZE
- ld.d t4, Y, 1 * SIZE
- vinsgr2vr.d x3, t1, 0
- vinsgr2vr.d x4, t2, 0
- vinsgr2vr.d x3, t3, 1
- vinsgr2vr.d x4, t4, 1
- add.d Y, Y, INCY
- vfmul.d VX0, VXAI, x2
- vfmul.d VX1, VXAI, x1
- vfmul.d VX2, VXBI, x4
- vfmul.d VX3, VXBI, x3
- vfmsub.d VX0, VXAR, x1, VX0
- vfmadd.d VX1, VXAR, x2, VX1
- vfmsub.d VX2, VXBR, x3, VX2
- vfmadd.d VX3, VXBR, x4, VX3
- vfadd.d x3, VX0, VX2
- vfadd.d x4, VX1, VX3
- vstelm.d x3, YY, 0 * SIZE, 0
- vstelm.d x4, YY, 1 * SIZE, 0
- add.d YY, YY, INCY
- vstelm.d x3, YY, 0 * SIZE, 1
- vstelm.d x4, YY, 1 * SIZE, 1
- add.d YY, YY, INCY
- addi.d I, I, -1
- blt $r0, I, .L224
- b .L997
- .align 3
- #else
- ld.w t1, X, 0 * SIZE
- ld.w t2, X, 1 * SIZE
- add.d X, X, INCX
- ld.w t3, X, 0 * SIZE
- ld.w t4, X, 1 * SIZE
- add.d X, X, INCX
- vinsgr2vr.w x1, t1, 0
- vinsgr2vr.w x2, t2, 0
- vinsgr2vr.w x1, t3, 1
- vinsgr2vr.w x2, t4, 1
- ld.w t1, X, 0 * SIZE
- ld.w t2, X, 1 * SIZE
- add.d X, X, INCX
- ld.w t3, X, 0 * SIZE
- ld.w t4, X, 1 * SIZE
- add.d X, X, INCX
- vinsgr2vr.w x1, t1, 2
- vinsgr2vr.w x2, t2, 2
- vinsgr2vr.w x1, t3, 3
- vinsgr2vr.w x2, t4, 3
-
- ld.w t1, Y, 0 * SIZE
- ld.w t2, Y, 1 * SIZE
- add.d Y, Y, INCY
- ld.w t3, Y, 0 * SIZE
- ld.w t4, Y, 1 * SIZE
- add.d Y, Y, INCY
- vinsgr2vr.w x3, t1, 0
- vinsgr2vr.w x4, t2, 0
- vinsgr2vr.w x3, t3, 1
- vinsgr2vr.w x4, t4, 1
- ld.w t1, Y, 0 * SIZE
- ld.w t2, Y, 1 * SIZE
- add.d Y, Y, INCY
- ld.w t3, Y, 0 * SIZE
- ld.w t4, Y, 1 * SIZE
- vinsgr2vr.w x3, t1, 2
- vinsgr2vr.w x4, t2, 2
- vinsgr2vr.w x3, t3, 3
- vinsgr2vr.w x4, t4, 3
- add.d Y, Y, INCY
- vfmul.s VX0, VXAI, x2
- vfmul.s VX1, VXAI, x1
- vfmul.s VX2, VXBI, x4
- vfmul.s VX3, VXBI, x3
- vfmsub.s VX0, VXAR, x1, VX0
- vfmadd.s VX1, VXAR, x2, VX1
- vfmsub.s VX2, VXBR, x3, VX2
- vfmadd.s VX3, VXBR, x4, VX3
- vfadd.s x3, VX0, VX2
- vfadd.s x4, VX1, VX3
- addi.d I, I, -1
-
- vstelm.w x3, YY, 0 * SIZE, 0
- vstelm.w x4, YY, 1 * SIZE, 0
- add.d YY, YY, INCY
- vstelm.w x3, YY, 0 * SIZE, 1
- vstelm.w x4, YY, 1 * SIZE, 1
- add.d YY, YY, INCY
- vstelm.w x3, YY, 0 * SIZE, 2
- vstelm.w x4, YY, 1 * SIZE, 2
- add.d YY, YY, INCY
- vstelm.w x3, YY, 0 * SIZE, 3
- vstelm.w x4, YY, 1 * SIZE, 3
- add.d YY, YY, INCY
- blt $r0, I, .L224
- b .L997
- .align 3
- #endif
-
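- 	// The vector loops are done: .L997 leaves the remaining N & 3 elements to the scalar loop.
- 	// .L998 is also entered directly with I = N when incx == 0 || incy == 0 (checked above).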
- .L997:
- andi I, N, 3
- bge $r0, I, .L999
- .align 3
-
- .L998:
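- 	// Scalar loop: s1/s2 hold the real/imaginary parts of alpha * x[i], s3/s4 those of
- 	// beta * y[i]; their sums are stored back to y[i].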
- #ifdef DOUBLE
- fld.d a1, X, 0 * SIZE
- fld.d a2, X, 1 * SIZE
- fld.d a3, Y, 0 * SIZE
- fld.d a4, Y, 1 * SIZE
- addi.d I, I, -1
- fmul.d s1, ALPHAI, a2
- fmul.d s2, ALPHAI, a1
- fmul.d s3, BETAI, a4
- fmul.d s4, BETAI, a3
- fmsub.d s1, ALPHAR, a1, s1
- fmadd.d s2, a2, ALPHAR, s2
- fmsub.d s3, BETAR, a3, s3
- fmadd.d s4, a4, BETAR, s4
- fadd.d s3, s3, s1
- fadd.d s4, s4, s2
- fst.d s3, Y, 0 * SIZE
- fst.d s4, Y, 1 * SIZE
- add.d X, X, INCX
- add.d Y, Y, INCY
- blt $r0, I, .L998
- .align 3
- #else
- fld.s a1, X, 0 * SIZE
- fld.s a2, X, 1 * SIZE
- fld.s a3, Y, 0 * SIZE
- fld.s a4, Y, 1 * SIZE
- addi.d I, I, -1
- fmul.s s1, ALPHAI, a2
- fmul.s s2, ALPHAI, a1
- fmul.s s3, BETAI, a4
- fmul.s s4, BETAI, a3
- fmsub.s s1, ALPHAR, a1, s1
- fmadd.s s2, a2, ALPHAR, s2
- fmsub.s s3, BETAR, a3, s3
- fmadd.s s4, a4, BETAR, s4
- fadd.s s3, s3, s1
- fadd.s s4, s4, s2
- fst.s s3, Y, 0 * SIZE
- fst.s s4, Y, 1 * SIZE
- add.d X, X, INCX
- add.d Y, Y, INCY
- blt $r0, I, .L998
- .align 3
- #endif
- .L999:
- move $r4, $r12
- jirl $r0, $r1, 0x0
- .align 3
-
- EPILOGUE