- #define ASSEMBLER
-
- #include "common.h"
- #define N $r4
- #define ALPHAR $f0
- #define ALPHAI $f1
- #define X $r5
- #define INCX $r6
- #define BETAR $f2
- #define BETAI $f3
- #define Y $r7
- #define INCY $r8
-
- #define I $r12
- #define TEMP $r13
- #define t1 $r14
- #define t2 $r16
- #define t3 $r15
- #define t4 $r17
- #define XX $r18
- #define YY $r19
- #define a1 $f12
- #define a2 $f13
- #define a3 $f14
- #define a4 $f15
- #define s1 $f16
- #define s2 $f17
- #define s3 $f18
- #define s4 $f19
- #define VX0 $xr8
- #define VX1 $xr20
- #define VX2 $xr21
- #define VX3 $xr22
- #define VXAR $xr23
- #define VXAI $xr19
- #define VXBR $xr14
- #define VXBI $xr13
- #define VXZ $xr12
- #define x1 $xr18
- #define x2 $xr17
- #define x3 $xr16
- #define x4 $xr15
-
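- // Complex AXPBY: for each of the N complex elements,
- //   y_r := alpha_r*x_r - alpha_i*x_i + beta_r*y_r - beta_i*y_i
- //   y_i := alpha_r*x_i + alpha_i*x_r + beta_r*y_i + beta_i*y_r
- // with special-cased paths for alpha == 0 and/or beta == 0 and for unit/non-unit strides.
-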
- PROLOGUE
-
- bge $r0, N, .L999
- movgr2fr.d a1, $r0
- FFINT a1, a1
- slli.d INCX, INCX, ZBASE_SHIFT
- slli.d INCY, INCY, ZBASE_SHIFT
- MTG t1, ALPHAR
- MTG t2, ALPHAI
- MTG t3, BETAR
- MTG t4, BETAI
- #ifdef DOUBLE
- xvreplgr2vr.d VXAR, t1
- xvreplgr2vr.d VXAI, t2
- xvreplgr2vr.d VXBR, t3
- xvreplgr2vr.d VXBI, t4
- #else
- xvreplgr2vr.w VXAR, t1
- xvreplgr2vr.w VXAI, t2
- xvreplgr2vr.w VXBR, t3
- xvreplgr2vr.w VXBI, t4
- #endif
- xvxor.v VXZ, VXZ, VXZ
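- // VXZ = all-zero vector, used to clear y when alpha == 0 && beta == 0 (.L111/.L221)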
- // If incx == 0 || incy == 0 (detected via (incx & incy) == 0), process one element at a time
- and TEMP, INCX, INCY
- or I, N, N
- beqz TEMP, .L998
-
- li.d TEMP, 1
- slli.d TEMP, TEMP, ZBASE_SHIFT
- #ifdef DOUBLE
- srai.d I, N, 2
- #else
- srai.d I, N, 3
- #endif
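- // TEMP = byte size of one complex element (for the unit-stride tests below);
- // I = number of vectorized iterations: 4 complex doubles or 8 complex floats per pass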
- bne INCX, TEMP, .L20
- bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
- b .L11 // INCX==1 and INCY==1
- .L20:
- bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
- b .L21 // INCX!=1 and INCY==1
-
- .L11:
- bge $r0, I, .L997
- CMPEQ $fcc0, BETAR, a1
- CMPEQ $fcc1, BETAI, a1
- CMPEQ $fcc2, ALPHAR, a1
- CMPEQ $fcc3, ALPHAI, a1
- bceqz $fcc0, .L13
- bceqz $fcc1, .L13
- b .L14
- .align 3
-
- .L13:
- bceqz $fcc2, .L114
- bceqz $fcc3, .L114 //!(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0)
- b .L113 //!(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
-
- .L14:
- bceqz $fcc2, .L112
- bceqz $fcc3, .L112 //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0)
- b .L111 //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
- .align 3
-
- .L111: //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
- xvst VXZ, Y, 0 * SIZE
- #ifdef DOUBLE
- xvst VXZ, Y, 4 * SIZE
- addi.d Y, Y, 8 * SIZE
- #else
- xvst VXZ, Y, 8 * SIZE
- addi.d Y, Y, 16 * SIZE
- #endif
- addi.d I, I, -1
- blt $r0, I, .L111
- b .L997
- .align 3
-
- .L112: //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0)
- #ifdef DOUBLE
- xvld VX0, X, 0 * SIZE
- xvld VX1, X, 4 * SIZE
- xvpickev.d x1, VX1, VX0
- xvpickod.d x2, VX1, VX0
- #else
- xvld VX0, X, 0 * SIZE
- xvld VX1, X, 8 * SIZE
- xvpickev.w x1, VX1, VX0
- xvpickod.w x2, VX1, VX0
- #endif
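- // x1 = real parts of x, x2 = imaginary parts (split by xvpickev/xvpickod);
- // x3 = alpha_r*x_r - alpha_i*x_i, x4 = alpha_r*x_i + alpha_i*x_r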
- XVFMUL x3, VXAI, x2
- XVFMUL x4, VXAI, x1
- XVMSUB x3, VXAR, x1, x3
- XVFMADD x4, VXAR, x2, x4
- #ifdef DOUBLE
- xvilvl.d VX2, x4, x3
- xvilvh.d VX3, x4, x3
- xvst VX2, Y, 0 * SIZE
- xvst VX3, Y, 4 * SIZE
- addi.d X, X, 8 * SIZE
- addi.d Y, Y, 8 * SIZE
- #else
- xvilvl.w VX2, x4, x3
- xvilvh.w VX3, x4, x3
- xvst VX2, Y, 0 * SIZE
- xvst VX3, Y, 8 * SIZE
- addi.d X, X, 16 * SIZE
- addi.d Y, Y, 16 * SIZE
- #endif
- addi.d I, I, -1
- blt $r0, I, .L112
- b .L997
- .align 3
-
- .L113: //!(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
- #ifdef DOUBLE
- xvld VX0, Y, 0 * SIZE
- xvld VX1, Y, 4 * SIZE
- xvpickev.d x1, VX1, VX0
- xvpickod.d x2, VX1, VX0
- #else
- xvld VX0, Y, 0 * SIZE
- xvld VX1, Y, 8 * SIZE
- xvpickev.w x1, VX1, VX0
- xvpickod.w x2, VX1, VX0
- #endif
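- // x1 = real parts of y, x2 = imaginary parts;
- // x3 = beta_r*y_r - beta_i*y_i, x4 = beta_r*y_i + beta_i*y_r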
- XVFMUL x3, VXBI, x2
- XVFMUL x4, VXBI, x1
- XVMSUB x3, VXBR, x1, x3
- XVFMADD x4, VXBR, x2, x4
- #ifdef DOUBLE
- xvilvl.d VX2, x4, x3
- xvilvh.d VX3, x4, x3
- xvst VX2, Y, 0 * SIZE
- xvst VX3, Y, 4 * SIZE
- addi.d Y, Y, 8 * SIZE
- #else
- xvilvl.w VX2, x4, x3
- xvilvh.w VX3, x4, x3
- xvst VX2, Y, 0 * SIZE
- xvst VX3, Y, 8 * SIZE
- addi.d Y, Y, 16 * SIZE
- #endif
- addi.d I, I, -1
- blt $r0, I, .L113
- b .L997
- .align 3
-
- .L114: //!(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0)
- #ifdef DOUBLE
- xvld VX0, X, 0 * SIZE
- xvld VX1, X, 4 * SIZE
- xvld VX2, Y, 0 * SIZE
- xvld VX3, Y, 4 * SIZE
- xvpickev.d x1, VX1, VX0
- xvpickod.d x2, VX1, VX0
- xvpickev.d x3, VX3, VX2
- xvpickod.d x4, VX3, VX2
- #else
- xvld VX0, X, 0 * SIZE
- xvld VX1, X, 8 * SIZE
- xvld VX2, Y, 0 * SIZE
- xvld VX3, Y, 8 * SIZE
- xvpickev.w x1, VX1, VX0
- xvpickod.w x2, VX1, VX0
- xvpickev.w x3, VX3, VX2
- xvpickod.w x4, VX3, VX2
- #endif
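- // x1/x2 = real/imag of x, x3/x4 = real/imag of y;
- // VX0/VX1 = real/imag of alpha*x, VX2/VX3 = real/imag of beta*y, x3/x4 = alpha*x + beta*y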
- XVFMUL VX0, VXAI, x2
- XVFMUL VX1, VXAI, x1
- XVFMUL VX2, VXBI, x4
- XVFMUL VX3, VXBI, x3
- XVMSUB VX0, VXAR, x1, VX0
- XVFMADD VX1, VXAR, x2, VX1
- XVMSUB VX2, VXBR, x3, VX2
- XVFMADD VX3, VXBR, x4, VX3
- XVFADD x3, VX0, VX2
- XVFADD x4, VX1, VX3
- #ifdef DOUBLE
- xvilvl.d VX2, x4, x3
- xvilvh.d VX3, x4, x3
- xvst VX2, Y, 0 * SIZE
- xvst VX3, Y, 4 * SIZE
- addi.d X, X, 8 * SIZE
- addi.d Y, Y, 8 * SIZE
- #else
- xvilvl.w VX2, x4, x3
- xvilvh.w VX3, x4, x3
- xvst VX2, Y, 0 * SIZE
- xvst VX3, Y, 8 * SIZE
- addi.d X, X, 16 * SIZE
- addi.d Y, Y, 16 * SIZE
- #endif
- addi.d I, I, -1
- blt $r0, I, .L114
- b .L997
- .align 3
-
- .L12: // INCX==1 and INCY!=1
- bge $r0, I, .L997
- move YY, Y
- .align 3
-
- .L121:
- #ifdef DOUBLE
- xvld VX0, X, 0 * SIZE
- ld.d t1, Y, 0 * SIZE
- ld.d t2, Y, 1 * SIZE
- add.d Y, Y, INCY
- ld.d t3, Y, 0 * SIZE
- ld.d t4, Y, 1 * SIZE
- add.d Y, Y, INCY
- xvinsgr2vr.d x3, t1, 0
- xvinsgr2vr.d x4, t2, 0
- xvinsgr2vr.d x3, t3, 2
- xvinsgr2vr.d x4, t4, 2
-
- xvld VX1, X, 4 * SIZE
- ld.d t1, Y, 0 * SIZE
- ld.d t2, Y, 1 * SIZE
- add.d Y, Y, INCY
- ld.d t3, Y, 0 * SIZE
- ld.d t4, Y, 1 * SIZE
- xvinsgr2vr.d x3, t1, 1
- xvinsgr2vr.d x4, t2, 1
- xvinsgr2vr.d x3, t3, 3
- xvinsgr2vr.d x4, t4, 3
- add.d Y, Y, INCY
-
- xvpickev.d x1, VX1, VX0
- xvpickod.d x2, VX1, VX0
- xvfmul.d VX0, VXAI, x2
- xvfmul.d VX1, VXAI, x1
- xvfmul.d VX2, VXBI, x4
- xvfmul.d VX3, VXBI, x3
- xvfmsub.d VX0, VXAR, x1, VX0
- xvfmadd.d VX1, VXAR, x2, VX1
- xvfmsub.d VX2, VXBR, x3, VX2
- xvfmadd.d VX3, VXBR, x4, VX3
- xvfadd.d x3, VX0, VX2
- xvfadd.d x4, VX1, VX3
- addi.d I, I, -1
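- // scatter results to the strided y; lanes are read in 0,2,1,3 order to restore sequential element order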
- xvstelm.d x3, YY, 0 * SIZE, 0
- xvstelm.d x4, YY, 1 * SIZE, 0
- add.d YY, YY, INCY
- xvstelm.d x3, YY, 0 * SIZE, 2
- xvstelm.d x4, YY, 1 * SIZE, 2
- add.d YY, YY, INCY
- xvstelm.d x3, YY, 0 * SIZE, 1
- xvstelm.d x4, YY, 1 * SIZE, 1
- add.d YY, YY, INCY
- xvstelm.d x3, YY, 0 * SIZE, 3
- xvstelm.d x4, YY, 1 * SIZE, 3
- add.d YY, YY, INCY
- addi.d X, X, 8 * SIZE
- blt $r0, I, .L121
- b .L997
- .align 3
- #else
- xvld VX0, X, 0 * SIZE
- ld.d t1, Y, 0 * SIZE
- ld.d t2, Y, 1 * SIZE
- add.d Y, Y, INCY
- ld.d t3, Y, 0 * SIZE
- ld.d t4, Y, 1 * SIZE
- add.d Y, Y, INCY
- xvinsgr2vr.w x3, t1, 0
- xvinsgr2vr.w x4, t2, 0
- xvinsgr2vr.w x3, t3, 1
- xvinsgr2vr.w x4, t4, 1
- xvld VX1, X, 8 * SIZE
- ld.d t1, Y, 0 * SIZE
- ld.d t2, Y, 1 * SIZE
- add.d Y, Y, INCY
- ld.d t3, Y, 0 * SIZE
- ld.d t4, Y, 1 * SIZE
- xvinsgr2vr.w x3, t1, 4
- xvinsgr2vr.w x4, t2, 4
- xvinsgr2vr.w x3, t3, 5
- xvinsgr2vr.w x4, t4, 5
- add.d Y, Y, INCY
- ld.d t1, Y, 0 * SIZE
- ld.d t2, Y, 1 * SIZE
- add.d Y, Y, INCY
- ld.d t3, Y, 0 * SIZE
- ld.d t4, Y, 1 * SIZE
- add.d Y, Y, INCY
- xvinsgr2vr.w x3, t1, 2
- xvinsgr2vr.w x4, t2, 2
- xvinsgr2vr.w x3, t3, 3
- xvinsgr2vr.w x4, t4, 3
- ld.d t1, Y, 0 * SIZE
- ld.d t2, Y, 1 * SIZE
- add.d Y, Y, INCY
- ld.d t3, Y, 0 * SIZE
- ld.d t4, Y, 1 * SIZE
- xvinsgr2vr.w x3, t1, 6
- xvinsgr2vr.w x4, t2, 6
- xvinsgr2vr.w x3, t3, 7
- xvinsgr2vr.w x4, t4, 7
- add.d Y, Y, INCY
-
- xvpickev.w x1, VX1, VX0
- xvpickod.w x2, VX1, VX0
- XVFMUL VX0, VXAI, x2
- XVFMUL VX1, VXAI, x1
- XVFMUL VX2, VXBI, x4
- XVFMUL VX3, VXBI, x3
- XVMSUB VX0, VXAR, x1, VX0
- XVFMADD VX1, VXAR, x2, VX1
- XVMSUB VX2, VXBR, x3, VX2
- XVFMADD VX3, VXBR, x4, VX3
- XVFADD x3, VX0, VX2
- XVFADD x4, VX1, VX3
- addi.d I, I, -1
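- // scatter results to the strided y; lanes are read in 0,1,4,5,2,3,6,7 order to restore sequential element order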
- xvstelm.w x3, YY, 0 * SIZE, 0
- xvstelm.w x4, YY, 1 * SIZE, 0
- add.d YY, YY, INCY
- xvstelm.w x3, YY, 0 * SIZE, 1
- xvstelm.w x4, YY, 1 * SIZE, 1
- add.d YY, YY, INCY
- xvstelm.w x3, YY, 0 * SIZE, 4
- xvstelm.w x4, YY, 1 * SIZE, 4
- add.d YY, YY, INCY
- xvstelm.w x3, YY, 0 * SIZE, 5
- xvstelm.w x4, YY, 1 * SIZE, 5
- add.d YY, YY, INCY
- xvstelm.w x3, YY, 0 * SIZE, 2
- xvstelm.w x4, YY, 1 * SIZE, 2
- add.d YY, YY, INCY
- xvstelm.w x3, YY, 0 * SIZE, 3
- xvstelm.w x4, YY, 1 * SIZE, 3
- add.d YY, YY, INCY
- xvstelm.w x3, YY, 0 * SIZE, 6
- xvstelm.w x4, YY, 1 * SIZE, 6
- add.d YY, YY, INCY
- xvstelm.w x3, YY, 0 * SIZE, 7
- xvstelm.w x4, YY, 1 * SIZE, 7
- add.d YY, YY, INCY
- addi.d X, X, 16 * SIZE
- blt $r0, I, .L121
- b .L997
- .align 3
- #endif
-
- .L21: // INCX!=1 and INCY==1
- bge $r0, I, .L997
- .align 3
-
- .L211:
- #ifdef DOUBLE
- xvld VX2, Y, 0 * SIZE
- ld.d t1, X, 0 * SIZE
- ld.d t2, X, 1 * SIZE
- add.d X, X, INCX
- ld.d t3, X, 0 * SIZE
- ld.d t4, X, 1 * SIZE
- add.d X, X, INCX
- xvinsgr2vr.d x1, t1, 0
- xvinsgr2vr.d x2, t2, 0
- xvinsgr2vr.d x1, t3, 2
- xvinsgr2vr.d x2, t4, 2
- xvld VX3, Y, 4 * SIZE
- ld.d t1, X, 0 * SIZE
- ld.d t2, X, 1 * SIZE
- add.d X, X, INCX
- ld.d t3, X, 0 * SIZE
- ld.d t4, X, 1 * SIZE
- xvinsgr2vr.d x1, t1, 1
- xvinsgr2vr.d x2, t2, 1
- xvinsgr2vr.d x1, t3, 3
- xvinsgr2vr.d x2, t4, 3
- add.d X, X, INCX
-
- xvpickev.d x3, VX3, VX2
- xvpickod.d x4, VX3, VX2
- xvfmul.d VX0, VXAI, x2
- xvfmul.d VX1, VXAI, x1
- xvfmul.d VX2, VXBI, x4
- xvfmul.d VX3, VXBI, x3
- xvfmsub.d VX0, VXAR, x1, VX0
- xvfmadd.d VX1, VXAR, x2, VX1
- xvfmsub.d VX2, VXBR, x3, VX2
- xvfmadd.d VX3, VXBR, x4, VX3
- xvfadd.d x3, VX0, VX2
- xvfadd.d x4, VX1, VX3
- xvilvl.d VX2, x4, x3
- xvilvh.d VX3, x4, x3
- addi.d I, I, -1
- xvst VX2, Y, 0 * SIZE
- xvst VX3, Y, 4 * SIZE
- addi.d Y, Y, 8 * SIZE
- blt $r0, I, .L211
- b .L997
- .align 3
- #else
- xvld VX2, Y, 0 * SIZE
- ld.d t1, X, 0 * SIZE
- ld.d t2, X, 1 * SIZE
- add.d X, X, INCX
- ld.d t3, X, 0 * SIZE
- ld.d t4, X, 1 * SIZE
- add.d X, X, INCX
- xvinsgr2vr.w x1, t1, 0
- xvinsgr2vr.w x2, t2, 0
- xvinsgr2vr.w x1, t3, 1
- xvinsgr2vr.w x2, t4, 1
- xvld VX3, Y, 8 * SIZE
- ld.d t1, X, 0 * SIZE
- ld.d t2, X, 1 * SIZE
- add.d X, X, INCX
- ld.d t3, X, 0 * SIZE
- ld.d t4, X, 1 * SIZE
- add.d X, X, INCX
- xvinsgr2vr.w x1, t1, 4
- xvinsgr2vr.w x2, t2, 4
- xvinsgr2vr.w x1, t3, 5
- xvinsgr2vr.w x2, t4, 5
- ld.d t1, X, 0 * SIZE
- ld.d t2, X, 1 * SIZE
- add.d X, X, INCX
- ld.d t3, X, 0 * SIZE
- ld.d t4, X, 1 * SIZE
- add.d X, X, INCX
- xvinsgr2vr.w x1, t1, 2
- xvinsgr2vr.w x2, t2, 2
- xvinsgr2vr.w x1, t3, 3
- xvinsgr2vr.w x2, t4, 3
- ld.d t1, X, 0 * SIZE
- ld.d t2, X, 1 * SIZE
- add.d X, X, INCX
- ld.d t3, X, 0 * SIZE
- ld.d t4, X, 1 * SIZE
- xvinsgr2vr.w x1, t1, 6
- xvinsgr2vr.w x2, t2, 6
- xvinsgr2vr.w x1, t3, 7
- xvinsgr2vr.w x2, t4, 7
- add.d X, X, INCX
-
- xvpickev.w x3, VX3, VX2
- xvpickod.w x4, VX3, VX2
- XVFMUL VX0, VXAI, x2
- XVFMUL VX1, VXAI, x1
- XVFMUL VX2, VXBI, x4
- XVFMUL VX3, VXBI, x3
- XVMSUB VX0, VXAR, x1, VX0
- XVFMADD VX1, VXAR, x2, VX1
- XVMSUB VX2, VXBR, x3, VX2
- XVFMADD VX3, VXBR, x4, VX3
- XVFADD x3, VX0, VX2
- XVFADD x4, VX1, VX3
- xvilvl.w VX2, x4, x3
- xvilvh.w VX3, x4, x3
- addi.d I, I, -1
- xvst VX2, Y, 0 * SIZE
- xvst VX3, Y, 8 * SIZE
- addi.d Y, Y, 16 * SIZE
- blt $r0, I, .L211
- b .L997
- .align 3
- #endif
-
- .L22: // INCX!=1 and INCY!=1
- bge $r0, I, .L997
- move YY, Y
- CMPEQ $fcc0, BETAR, a1
- CMPEQ $fcc1, BETAI, a1
- CMPEQ $fcc2, ALPHAR, a1
- CMPEQ $fcc3, ALPHAI, a1
- bceqz $fcc0, .L23
- bceqz $fcc1, .L23
- b .L24
- .align 3
-
- .L23:
- bceqz $fcc2, .L224
- bceqz $fcc3, .L224 //!(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0)
- b .L223 //!(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
- .align 3
-
- .L24:
- bceqz $fcc2, .L222
- bceqz $fcc3, .L222 //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0)
- b .L221 //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
- .align 3
-
- .L221: //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
- #ifdef DOUBLE
- // zero both the real and the imaginary part of each of the 4 complex elements
- xvstelm.d VXZ, Y, 0 * SIZE, 0
- xvstelm.d VXZ, Y, 1 * SIZE, 0
- add.d Y, Y, INCY
- xvstelm.d VXZ, Y, 0 * SIZE, 0
- xvstelm.d VXZ, Y, 1 * SIZE, 0
- add.d Y, Y, INCY
- xvstelm.d VXZ, Y, 0 * SIZE, 0
- xvstelm.d VXZ, Y, 1 * SIZE, 0
- add.d Y, Y, INCY
- xvstelm.d VXZ, Y, 0 * SIZE, 0
- xvstelm.d VXZ, Y, 1 * SIZE, 0
- add.d Y, Y, INCY
- addi.d I, I, -1
- blt $r0, I, .L221
- b .L997
- .align 3
- #else
- // zero both the real and the imaginary part of each of the 8 complex elements
- xvstelm.w VXZ, Y, 0 * SIZE, 0
- xvstelm.w VXZ, Y, 1 * SIZE, 0
- add.d Y, Y, INCY
- xvstelm.w VXZ, Y, 0 * SIZE, 0
- xvstelm.w VXZ, Y, 1 * SIZE, 0
- add.d Y, Y, INCY
- xvstelm.w VXZ, Y, 0 * SIZE, 0
- xvstelm.w VXZ, Y, 1 * SIZE, 0
- add.d Y, Y, INCY
- xvstelm.w VXZ, Y, 0 * SIZE, 0
- xvstelm.w VXZ, Y, 1 * SIZE, 0
- add.d Y, Y, INCY
- xvstelm.w VXZ, Y, 0 * SIZE, 0
- xvstelm.w VXZ, Y, 1 * SIZE, 0
- add.d Y, Y, INCY
- xvstelm.w VXZ, Y, 0 * SIZE, 0
- xvstelm.w VXZ, Y, 1 * SIZE, 0
- add.d Y, Y, INCY
- xvstelm.w VXZ, Y, 0 * SIZE, 0
- xvstelm.w VXZ, Y, 1 * SIZE, 0
- add.d Y, Y, INCY
- xvstelm.w VXZ, Y, 0 * SIZE, 0
- xvstelm.w VXZ, Y, 1 * SIZE, 0
- add.d Y, Y, INCY
- addi.d I, I, -1
- blt $r0, I, .L221
- b .L997
- .align 3
- #endif
-
- .L222: //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0)
- #ifdef DOUBLE
- ld.d t1, X, 0 * SIZE
- ld.d t2, X, 1 * SIZE
- add.d X, X, INCX
- ld.d t3, X, 0 * SIZE
- ld.d t4, X, 1 * SIZE
- add.d X, X, INCX
- xvinsgr2vr.d x1, t1, 0
- xvinsgr2vr.d x2, t2, 0
- xvinsgr2vr.d x1, t3, 1
- xvinsgr2vr.d x2, t4, 1
-
- ld.d t1, X, 0 * SIZE
- ld.d t2, X, 1 * SIZE
- add.d X, X, INCX
- ld.d t3, X, 0 * SIZE
- ld.d t4, X, 1 * SIZE
- xvinsgr2vr.d x1, t1, 2
- xvinsgr2vr.d x2, t2, 2
- xvinsgr2vr.d x1, t3, 3
- xvinsgr2vr.d x2, t4, 3
- add.d X, X, INCX
- xvfmul.d x3, VXAI, x2
- xvfmul.d x4, VXAI, x1
- xvfmsub.d x3, VXAR, x1, x3
- xvfmadd.d x4, VXAR, x2, x4
- addi.d I, I, -1
- xvstelm.d x3, YY, 0 * SIZE, 0
- xvstelm.d x4, YY, 1 * SIZE, 0
- add.d YY, YY, INCY
- xvstelm.d x3, YY, 0 * SIZE, 1
- xvstelm.d x4, YY, 1 * SIZE, 1
- add.d YY, YY, INCY
- xvstelm.d x3, YY, 0 * SIZE, 2
- xvstelm.d x4, YY, 1 * SIZE, 2
- add.d YY, YY, INCY
- xvstelm.d x3, YY, 0 * SIZE, 3
- xvstelm.d x4, YY, 1 * SIZE, 3
- add.d YY, YY, INCY
- blt $r0, I, .L222
- move Y, YY
- b .L997
- .align 3
- #else
- ld.d t1, X, 0 * SIZE
- ld.d t2, X, 1 * SIZE
- add.d X, X, INCX
- ld.d t3, X, 0 * SIZE
- ld.d t4, X, 1 * SIZE
- add.d X, X, INCX
- xvinsgr2vr.w x1, t1, 0
- xvinsgr2vr.w x2, t2, 0
- xvinsgr2vr.w x1, t3, 1
- xvinsgr2vr.w x2, t4, 1
- ld.d t1, X, 0 * SIZE
- ld.d t2, X, 1 * SIZE
- add.d X, X, INCX
- ld.d t3, X, 0 * SIZE
- ld.d t4, X, 1 * SIZE
- add.d X, X, INCX
- xvinsgr2vr.w x1, t1, 2
- xvinsgr2vr.w x2, t2, 2
- xvinsgr2vr.w x1, t3, 3
- xvinsgr2vr.w x2, t4, 3
-
- ld.d t1, X, 0 * SIZE
- ld.d t2, X, 1 * SIZE
- add.d X, X, INCX
- ld.d t3, X, 0 * SIZE
- ld.d t4, X, 1 * SIZE
- add.d X, X, INCX
- xvinsgr2vr.w x1, t1, 4
- xvinsgr2vr.w x2, t2, 4
- xvinsgr2vr.w x1, t3, 5
- xvinsgr2vr.w x2, t4, 5
- ld.d t1, X, 0 * SIZE
- ld.d t2, X, 1 * SIZE
- add.d X, X, INCX
- ld.d t3, X, 0 * SIZE
- ld.d t4, X, 1 * SIZE
- xvinsgr2vr.w x1, t1, 6
- xvinsgr2vr.w x2, t2, 6
- xvinsgr2vr.w x1, t3, 7
- xvinsgr2vr.w x2, t4, 7
- add.d X, X, INCX
- XVFMUL x3, VXAI, x2
- XVFMUL x4, VXAI, x1
- XVMSUB x3, VXAR, x1, x3
- XVFMADD x4, VXAR, x2, x4
- addi.d I, I, -1
- xvstelm.w x3, YY, 0 * SIZE, 0
- xvstelm.w x4, YY, 1 * SIZE, 0
- add.d YY, YY, INCY
- xvstelm.w x3, YY, 0 * SIZE, 1
- xvstelm.w x4, YY, 1 * SIZE, 1
- add.d YY, YY, INCY
- xvstelm.w x3, YY, 0 * SIZE, 2
- xvstelm.w x4, YY, 1 * SIZE, 2
- add.d YY, YY, INCY
- xvstelm.w x3, YY, 0 * SIZE, 3
- xvstelm.w x4, YY, 1 * SIZE, 3
- add.d YY, YY, INCY
- xvstelm.w x3, YY, 0 * SIZE, 4
- xvstelm.w x4, YY, 1 * SIZE, 4
- add.d YY, YY, INCY
- xvstelm.w x3, YY, 0 * SIZE, 5
- xvstelm.w x4, YY, 1 * SIZE, 5
- add.d YY, YY, INCY
- xvstelm.w x3, YY, 0 * SIZE, 6
- xvstelm.w x4, YY, 1 * SIZE, 6
- add.d YY, YY, INCY
- xvstelm.w x3, YY, 0 * SIZE, 7
- xvstelm.w x4, YY, 1 * SIZE, 7
- add.d YY, YY, INCY
- blt $r0, I, .L222
- move Y, YY
- b .L997
- .align 3
- #endif
-
- .L223: //!(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
- #ifdef DOUBLE
- ld.d t1, Y, 0 * SIZE
- ld.d t2, Y, 1 * SIZE
- add.d Y, Y, INCY
- ld.d t3, Y, 0 * SIZE
- ld.d t4, Y, 1 * SIZE
- add.d Y, Y, INCY
- xvinsgr2vr.d x1, t1, 0
- xvinsgr2vr.d x2, t2, 0
- xvinsgr2vr.d x1, t3, 1
- xvinsgr2vr.d x2, t4, 1
-
- ld.d t1, Y, 0 * SIZE
- ld.d t2, Y, 1 * SIZE
- add.d Y, Y, INCY
- ld.d t3, Y, 0 * SIZE
- ld.d t4, Y, 1 * SIZE
- xvinsgr2vr.d x1, t1, 2
- xvinsgr2vr.d x2, t2, 2
- xvinsgr2vr.d x1, t3, 3
- xvinsgr2vr.d x2, t4, 3
- add.d Y, Y, INCY
- xvfmul.d x3, VXBI, x2
- xvfmul.d x4, VXBI, x1
- xvfmsub.d x3, VXBR, x1, x3
- xvfmadd.d x4, VXBR, x2, x4
-
- addi.d I, I, -1
- xvstelm.d x3, YY, 0 * SIZE, 0
- xvstelm.d x4, YY, 1 * SIZE, 0
- add.d YY, YY, INCY
- xvstelm.d x3, YY, 0 * SIZE, 1
- xvstelm.d x4, YY, 1 * SIZE, 1
- add.d YY, YY, INCY
- xvstelm.d x3, YY, 0 * SIZE, 2
- xvstelm.d x4, YY, 1 * SIZE, 2
- add.d YY, YY, INCY
- xvstelm.d x3, YY, 0 * SIZE, 3
- xvstelm.d x4, YY, 1 * SIZE, 3
- add.d YY, YY, INCY
- blt $r0, I, .L223
- b .L997
- .align 3
- #else
- ld.d t1, Y, 0 * SIZE
- ld.d t2, Y, 1 * SIZE
- add.d Y, Y, INCY
- ld.d t3, Y, 0 * SIZE
- ld.d t4, Y, 1 * SIZE
- add.d Y, Y, INCY
- xvinsgr2vr.w x1, t1, 0
- xvinsgr2vr.w x2, t2, 0
- xvinsgr2vr.w x1, t3, 1
- xvinsgr2vr.w x2, t4, 1
- ld.d t1, Y, 0 * SIZE
- ld.d t2, Y, 1 * SIZE
- add.d Y, Y, INCY
- ld.d t3, Y, 0 * SIZE
- ld.d t4, Y, 1 * SIZE
- add.d Y, Y, INCY
- xvinsgr2vr.w x1, t1, 2
- xvinsgr2vr.w x2, t2, 2
- xvinsgr2vr.w x1, t3, 3
- xvinsgr2vr.w x2, t4, 3
-
- ld.d t1, Y, 0 * SIZE
- ld.d t2, Y, 1 * SIZE
- add.d Y, Y, INCY
- ld.d t3, Y, 0 * SIZE
- ld.d t4, Y, 1 * SIZE
- add.d Y, Y, INCY
- xvinsgr2vr.w x1, t1, 4
- xvinsgr2vr.w x2, t2, 4
- xvinsgr2vr.w x1, t3, 5
- xvinsgr2vr.w x2, t4, 5
- ld.d t1, Y, 0 * SIZE
- ld.d t2, Y, 1 * SIZE
- add.d Y, Y, INCY
- ld.d t3, Y, 0 * SIZE
- ld.d t4, Y, 1 * SIZE
- xvinsgr2vr.w x1, t1, 6
- xvinsgr2vr.w x2, t2, 6
- xvinsgr2vr.w x1, t3, 7
- xvinsgr2vr.w x2, t4, 7
- add.d Y, Y, INCY
-
- XVFMUL x3, VXBI, x2
- XVFMUL x4, VXBI, x1
- XVMSUB x3, VXBR, x1, x3
- XVFMADD x4, VXBR, x2, x4
- addi.d I, I, -1
- xvstelm.w x3, YY, 0 * SIZE, 0
- xvstelm.w x4, YY, 1 * SIZE, 0
- add.d YY, YY, INCY
- xvstelm.w x3, YY, 0 * SIZE, 1
- xvstelm.w x4, YY, 1 * SIZE, 1
- add.d YY, YY, INCY
- xvstelm.w x3, YY, 0 * SIZE, 2
- xvstelm.w x4, YY, 1 * SIZE, 2
- add.d YY, YY, INCY
- xvstelm.w x3, YY, 0 * SIZE, 3
- xvstelm.w x4, YY, 1 * SIZE, 3
- add.d YY, YY, INCY
- xvstelm.w x3, YY, 0 * SIZE, 4
- xvstelm.w x4, YY, 1 * SIZE, 4
- add.d YY, YY, INCY
- xvstelm.w x3, YY, 0 * SIZE, 5
- xvstelm.w x4, YY, 1 * SIZE, 5
- add.d YY, YY, INCY
- xvstelm.w x3, YY, 0 * SIZE, 6
- xvstelm.w x4, YY, 1 * SIZE, 6
- add.d YY, YY, INCY
- xvstelm.w x3, YY, 0 * SIZE, 7
- xvstelm.w x4, YY, 1 * SIZE, 7
- add.d YY, YY, INCY
- blt $r0, I, .L223
- b .L997
- .align 3
- #endif
-
- .L224: //!(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0)
- #ifdef DOUBLE
- ld.d t1, X, 0 * SIZE
- ld.d t2, X, 1 * SIZE
- add.d X, X, INCX
- ld.d t3, X, 0 * SIZE
- ld.d t4, X, 1 * SIZE
- add.d X, X, INCX
- xvinsgr2vr.d x1, t1, 0
- xvinsgr2vr.d x2, t2, 0
- xvinsgr2vr.d x1, t3, 1
- xvinsgr2vr.d x2, t4, 1
- ld.d t1, X, 0 * SIZE
- ld.d t2, X, 1 * SIZE
- add.d X, X, INCX
- ld.d t3, X, 0 * SIZE
- ld.d t4, X, 1 * SIZE
- add.d X, X, INCX
- xvinsgr2vr.d x1, t1, 2
- xvinsgr2vr.d x2, t2, 2
- xvinsgr2vr.d x1, t3, 3
- xvinsgr2vr.d x2, t4, 3
-
- ld.d t1, Y, 0 * SIZE
- ld.d t2, Y, 1 * SIZE
- add.d Y, Y, INCY
- ld.d t3, Y, 0 * SIZE
- ld.d t4, Y, 1 * SIZE
- add.d Y, Y, INCY
- xvinsgr2vr.d x3, t1, 0
- xvinsgr2vr.d x4, t2, 0
- xvinsgr2vr.d x3, t3, 1
- xvinsgr2vr.d x4, t4, 1
- ld.d t1, Y, 0 * SIZE
- ld.d t2, Y, 1 * SIZE
- add.d Y, Y, INCY
- ld.d t3, Y, 0 * SIZE
- ld.d t4, Y, 1 * SIZE
- xvinsgr2vr.d x3, t1, 2
- xvinsgr2vr.d x4, t2, 2
- xvinsgr2vr.d x3, t3, 3
- xvinsgr2vr.d x4, t4, 3
- add.d Y, Y, INCY
- xvfmul.d VX0, VXAI, x2
- xvfmul.d VX1, VXAI, x1
- xvfmul.d VX2, VXBI, x4
- xvfmul.d VX3, VXBI, x3
- xvfmsub.d VX0, VXAR, x1, VX0
- xvfmadd.d VX1, VXAR, x2, VX1
- xvfmsub.d VX2, VXBR, x3, VX2
- xvfmadd.d VX3, VXBR, x4, VX3
- xvfadd.d x3, VX0, VX2
- xvfadd.d x4, VX1, VX3
- addi.d I, I, -1
-
- xvstelm.d x3, YY, 0 * SIZE, 0
- xvstelm.d x4, YY, 1 * SIZE, 0
- add.d YY, YY, INCY
- xvstelm.d x3, YY, 0 * SIZE, 1
- xvstelm.d x4, YY, 1 * SIZE, 1
- add.d YY, YY, INCY
- xvstelm.d x3, YY, 0 * SIZE, 2
- xvstelm.d x4, YY, 1 * SIZE, 2
- add.d YY, YY, INCY
- xvstelm.d x3, YY, 0 * SIZE, 3
- xvstelm.d x4, YY, 1 * SIZE, 3
- add.d YY, YY, INCY
- blt $r0, I, .L224
- b .L997
- .align 3
- #else
- ld.d t1, X, 0 * SIZE
- ld.d t2, X, 1 * SIZE
- add.d X, X, INCX
- ld.d t3, X, 0 * SIZE
- ld.d t4, X, 1 * SIZE
- add.d X, X, INCX
- xvinsgr2vr.w x1, t1, 0
- xvinsgr2vr.w x2, t2, 0
- xvinsgr2vr.w x1, t3, 1
- xvinsgr2vr.w x2, t4, 1
- ld.d t1, X, 0 * SIZE
- ld.d t2, X, 1 * SIZE
- add.d X, X, INCX
- ld.d t3, X, 0 * SIZE
- ld.d t4, X, 1 * SIZE
- add.d X, X, INCX
- xvinsgr2vr.w x1, t1, 2
- xvinsgr2vr.w x2, t2, 2
- xvinsgr2vr.w x1, t3, 3
- xvinsgr2vr.w x2, t4, 3
- ld.d t1, X, 0 * SIZE
- ld.d t2, X, 1 * SIZE
- add.d X, X, INCX
- ld.d t3, X, 0 * SIZE
- ld.d t4, X, 1 * SIZE
- add.d X, X, INCX
- xvinsgr2vr.w x1, t1, 4
- xvinsgr2vr.w x2, t2, 4
- xvinsgr2vr.w x1, t3, 5
- xvinsgr2vr.w x2, t4, 5
- ld.d t1, X, 0 * SIZE
- ld.d t2, X, 1 * SIZE
- add.d X, X, INCX
- ld.d t3, X, 0 * SIZE
- ld.d t4, X, 1 * SIZE
- add.d X, X, INCX
- xvinsgr2vr.w x1, t1, 6
- xvinsgr2vr.w x2, t2, 6
- xvinsgr2vr.w x1, t3, 7
- xvinsgr2vr.w x2, t4, 7
-
- ld.d t1, Y, 0 * SIZE
- ld.d t2, Y, 1 * SIZE
- add.d Y, Y, INCY
- ld.d t3, Y, 0 * SIZE
- ld.d t4, Y, 1 * SIZE
- add.d Y, Y, INCY
- xvinsgr2vr.w x3, t1, 0
- xvinsgr2vr.w x4, t2, 0
- xvinsgr2vr.w x3, t3, 1
- xvinsgr2vr.w x4, t4, 1
- ld.d t1, Y, 0 * SIZE
- ld.d t2, Y, 1 * SIZE
- add.d Y, Y, INCY
- ld.d t3, Y, 0 * SIZE
- ld.d t4, Y, 1 * SIZE
- add.d Y, Y, INCY
- xvinsgr2vr.w x3, t1, 2
- xvinsgr2vr.w x4, t2, 2
- xvinsgr2vr.w x3, t3, 3
- xvinsgr2vr.w x4, t4, 3
- ld.d t1, Y, 0 * SIZE
- ld.d t2, Y, 1 * SIZE
- add.d Y, Y, INCY
- ld.d t3, Y, 0 * SIZE
- ld.d t4, Y, 1 * SIZE
- add.d Y, Y, INCY
- xvinsgr2vr.w x3, t1, 4
- xvinsgr2vr.w x4, t2, 4
- xvinsgr2vr.w x3, t3, 5
- xvinsgr2vr.w x4, t4, 5
- ld.d t1, Y, 0 * SIZE
- ld.d t2, Y, 1 * SIZE
- add.d Y, Y, INCY
- ld.d t3, Y, 0 * SIZE
- ld.d t4, Y, 1 * SIZE
- xvinsgr2vr.w x3, t1, 6
- xvinsgr2vr.w x4, t2, 6
- xvinsgr2vr.w x3, t3, 7
- xvinsgr2vr.w x4, t4, 7
- add.d Y, Y, INCY
-
- XVFMUL VX0, VXAI, x2
- XVFMUL VX1, VXAI, x1
- XVFMUL VX2, VXBI, x4
- XVFMUL VX3, VXBI, x3
- XVMSUB VX0, VXAR, x1, VX0
- XVFMADD VX1, VXAR, x2, VX1
- XVMSUB VX2, VXBR, x3, VX2
- XVFMADD VX3, VXBR, x4, VX3
- XVFADD x3, VX0, VX2
- XVFADD x4, VX1, VX3
- addi.d I, I, -1
-
- xvstelm.w x3, YY, 0 * SIZE, 0
- xvstelm.w x4, YY, 1 * SIZE, 0
- add.d YY, YY, INCY
- xvstelm.w x3, YY, 0 * SIZE, 1
- xvstelm.w x4, YY, 1 * SIZE, 1
- add.d YY, YY, INCY
- xvstelm.w x3, YY, 0 * SIZE, 2
- xvstelm.w x4, YY, 1 * SIZE, 2
- add.d YY, YY, INCY
- xvstelm.w x3, YY, 0 * SIZE, 3
- xvstelm.w x4, YY, 1 * SIZE, 3
- add.d YY, YY, INCY
- xvstelm.w x3, YY, 0 * SIZE, 4
- xvstelm.w x4, YY, 1 * SIZE, 4
- add.d YY, YY, INCY
- xvstelm.w x3, YY, 0 * SIZE, 5
- xvstelm.w x4, YY, 1 * SIZE, 5
- add.d YY, YY, INCY
- xvstelm.w x3, YY, 0 * SIZE, 6
- xvstelm.w x4, YY, 1 * SIZE, 6
- add.d YY, YY, INCY
- xvstelm.w x3, YY, 0 * SIZE, 7
- xvstelm.w x4, YY, 1 * SIZE, 7
- add.d YY, YY, INCY
- blt $r0, I, .L224
- b .L997
- .align 3
- #endif
-
- .L997: // set up scalar tail: I = N % 4 (double) / N % 8 (single)
- #ifdef DOUBLE
- andi I, N, 3
- #else
- andi I, N, 7
- #endif
- bge $r0, I, .L999
- .align 3
-
- .L998: // scalar loop: one complex element per iteration (tail and incx/incy == 0 fall-back)
- LD a1, X, 0 * SIZE
- LD a2, X, 1 * SIZE
- LD a3, Y, 0 * SIZE
- LD a4, Y, 1 * SIZE
- addi.d I, I, -1
- MUL s1, ALPHAI, a2
- MUL s2, ALPHAI, a1
- MUL s3, BETAI, a4
- MUL s4, BETAI, a3
- MSUB s1, ALPHAR, a1, s1
- MADD s2, a2, ALPHAR, s2
- MSUB s3, BETAR, a3, s3
- MADD s4, a4, BETAR, s4
- ADD s3, s3, s1
- ADD s4, s4, s2
- ST s3, Y, 0 * SIZE
- ST s4, Y, 1 * SIZE
- add.d X, X, INCX
- add.d Y, Y, INCY
- blt $r0, I, .L998
- .align 3
-
- .L999:
- move $r4, $r12
- jirl $r0, $r1, 0x0
- .align 3
-
- EPILOGUE