- /*
-
- AUTOGENERATED KERNEL
- Settings:
- LMUL=1
- M=8
- M_tail_scalar_from=1
- N=4
- __riscv_='__riscv_'
- complex=True
- conjugate=False
- cpu='zvl256b'
- force_acc_double=False
- index_type='BLASLONG'
- op='trmm'
- param_precision='double'
- reg_width_bits=256
- tail_policy=''
- trace=False
-
- Derived:
- ELEN_ACC=64
- ELEN_PARAM=64
- LMUL_ACC=1
- VFMACC='__riscv_vfmacc_vf_f64m1'
- VFMUL='__riscv_vfmul_vf_f64m1'
- VLEV='__riscv_vle64_v_f64m1'
- VLSEV='__riscv_vlse64_v_f64m1'
- VMACC_TO_ACC='__riscv_vfmacc_vf_f64m1'
- VMUL_TO_ACC='__riscv_vfmul_vf_f64m1'
- VSETVL='__riscv_vsetvl_e64m1'
- VSEV='__riscv_vse64_v_f64m1'
- VSSEV='__riscv_vsse64_v_f64m1'
- acc_vector_t='vfloat64m1_t'
- output='ztrmm_kernel_8x4_zvl256b.c'
- param_scalar_t='double'
- param_vector_t='vfloat64m1_t'
-
- */
-
- #include "common.h"
-
-
-
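- /*
-  Sign handling for the four complex conjugation cases. Each block below picks
-  the scalar signs S0..S3 used by the scalar (M&1) tails and a pair of fused
-  multiply-add variants for the vector code: vfmul first seeds the cross terms
-  (Ai*Bi and Ar*Bi), then VFMACC_RR / VFMACC_RI fold in Ar*Br and Ai*Br with
-  the signs required for (conj)A * (conj)B.
- */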
- #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
- #define S0 1
- #define S1 -1
- #define S2 1
- #define S3 1
- #define VFMACC_RR __riscv_vfmsac
- #define VFMACC_RI __riscv_vfmacc
- #endif
- #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
- #define S0 1
- #define S1 1
- #define S2 1
- #define S3 -1
- #define VFMACC_RR __riscv_vfmacc
- #define VFMACC_RI __riscv_vfmsac
- #endif
- #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
- #define S0 1
- #define S1 1
- #define S2 -1
- #define S3 1
- #define VFMACC_RR __riscv_vfmacc
- #define VFMACC_RI __riscv_vfnmsac
- #endif
- #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
- #define S0 1
- #define S1 -1
- #define S2 -1
- #define S3 -1
- #define VFMACC_RR __riscv_vfmsac
- #define VFMACC_RI __riscv_vfnmacc
- #endif
-
-
- #if defined(LEFT) != defined(TRANSA)
- #define BACKWARDS
- #endif
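- 
- /*
-  TRMM: only part of the K range contributes to each block of C. With
-  BACKWARDS defined (LEFT and TRANSA disagree), every block skips the first
-  `off` iterations of k by advancing ai/bi and reducing pass_K; otherwise
-  pass_K is limited to off plus the block dimension (rows for LEFT, columns
-  otherwise), as set up at the top of each block below.
- */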
-
- int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc, BLASLONG offset)
-
- {
- BLASLONG gvl = 0;
- BLASLONG m_top = 0;
- BLASLONG n_top = 0;
-
-
- // -- MAIN PASS
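- // 8x4 blocking of C: each iteration computes an 8-row by 4-column block,
- // with the 8 rows split across two vectors of gvl=4 double-precision lanes
- // (separate vectors for real and imaginary parts).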
-
- for (BLASLONG j=0; j<N/4; j+=1) {
- m_top = 0;
- BLASLONG gvl = __riscv_vsetvl_e64m1(4);
-
-
- for (BLASLONG i=0; i<M/8; i+=1) {
- BLASLONG ai=m_top*K*2;
- BLASLONG bi=n_top*K*2;
- BLASLONG pass_K = K;
- #ifdef LEFT
- BLASLONG off = offset + m_top;
- #else
- BLASLONG off = -offset + n_top;
- #endif
- #ifdef BACKWARDS
- ai += off*8*2;
- bi += off*4*2;
- pass_K -= off;
- #else
- #ifdef LEFT
- pass_K = off + 8;
- #else
- pass_K = off + 4;
- #endif
- #endif
- double B0r = B[bi+0*2+0];
- double B0i = B[bi+0*2+1];
- double B1r = B[bi+1*2+0];
- double B1i = B[bi+1*2+1];
- double B2r = B[bi+2*2+0];
- double B2i = B[bi+2*2+1];
- double B3r = B[bi+3*2+0];
- double B3i = B[bi+3*2+1];
- bi += 4*2;
-
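- // A is stored as interleaved complex values; the strided loads
- // (stride = 2*sizeof(FLOAT)) split it into separate real and imaginary vectors.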
- vfloat64m1_t A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
- vfloat64m1_t A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
- vfloat64m1_t A1r = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2], sizeof(FLOAT)*2, gvl );
- vfloat64m1_t A1i = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2+1], sizeof(FLOAT)*2, gvl );
- ai += 8*2;
-
- // 4 vector regs to hold A array contents, 16 regs to hold values accumulated over k
- // leaving 12 vector registers for temporaries
- // performing 4 operations between reuses of temporaries
- vfloat64m1_t tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
- vfloat64m1_t tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
- vfloat64m1_t tmp1r = __riscv_vfmul_vf_f64m1( A1i, B0i, gvl);
- vfloat64m1_t tmp1i = __riscv_vfmul_vf_f64m1( A1r, B0i, gvl);
- vfloat64m1_t tmp2r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
- vfloat64m1_t tmp2i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
- vfloat64m1_t tmp3r = __riscv_vfmul_vf_f64m1( A1i, B1i, gvl);
- vfloat64m1_t tmp3i = __riscv_vfmul_vf_f64m1( A1r, B1i, gvl);
- tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
- tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
- tmp1r = VFMACC_RR( tmp1r, B0r, A1r, gvl);
- tmp1i = VFMACC_RI( tmp1i, B0r, A1i, gvl);
- tmp2r = VFMACC_RR( tmp2r, B1r, A0r, gvl);
- tmp2i = VFMACC_RI( tmp2i, B1r, A0i, gvl);
- tmp3r = VFMACC_RR( tmp3r, B1r, A1r, gvl);
- tmp3i = VFMACC_RI( tmp3i, B1r, A1i, gvl);
- vfloat64m1_t ACC0r = tmp0r;
- vfloat64m1_t ACC0i = tmp0i;
- vfloat64m1_t ACC1r = tmp1r;
- vfloat64m1_t ACC1i = tmp1i;
- vfloat64m1_t ACC2r = tmp2r;
- vfloat64m1_t ACC2i = tmp2i;
- vfloat64m1_t ACC3r = tmp3r;
- vfloat64m1_t ACC3i = tmp3i;
- tmp0r = __riscv_vfmul_vf_f64m1( A0i, B2i, gvl);
- tmp0i = __riscv_vfmul_vf_f64m1( A0r, B2i, gvl);
- tmp1r = __riscv_vfmul_vf_f64m1( A1i, B2i, gvl);
- tmp1i = __riscv_vfmul_vf_f64m1( A1r, B2i, gvl);
- tmp2r = __riscv_vfmul_vf_f64m1( A0i, B3i, gvl);
- tmp2i = __riscv_vfmul_vf_f64m1( A0r, B3i, gvl);
- tmp3r = __riscv_vfmul_vf_f64m1( A1i, B3i, gvl);
- tmp3i = __riscv_vfmul_vf_f64m1( A1r, B3i, gvl);
- tmp0r = VFMACC_RR( tmp0r, B2r, A0r, gvl);
- tmp0i = VFMACC_RI( tmp0i, B2r, A0i, gvl);
- tmp1r = VFMACC_RR( tmp1r, B2r, A1r, gvl);
- tmp1i = VFMACC_RI( tmp1i, B2r, A1i, gvl);
- tmp2r = VFMACC_RR( tmp2r, B3r, A0r, gvl);
- tmp2i = VFMACC_RI( tmp2i, B3r, A0i, gvl);
- tmp3r = VFMACC_RR( tmp3r, B3r, A1r, gvl);
- tmp3i = VFMACC_RI( tmp3i, B3r, A1i, gvl);
- vfloat64m1_t ACC4r = tmp0r;
- vfloat64m1_t ACC4i = tmp0i;
- vfloat64m1_t ACC5r = tmp1r;
- vfloat64m1_t ACC5i = tmp1i;
- vfloat64m1_t ACC6r = tmp2r;
- vfloat64m1_t ACC6i = tmp2i;
- vfloat64m1_t ACC7r = tmp3r;
- vfloat64m1_t ACC7i = tmp3i;
-
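- // Remaining pass_K-1 iterations: each one forms the partial products in the
- // tmp registers and adds them into the ACC accumulators.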
- for(BLASLONG k=1; k<pass_K; k++) {
- B0r = B[bi+0*2+0];
- B0i = B[bi+0*2+1];
- B1r = B[bi+1*2+0];
- B1i = B[bi+1*2+1];
- B2r = B[bi+2*2+0];
- B2i = B[bi+2*2+1];
- B3r = B[bi+3*2+0];
- B3i = B[bi+3*2+1];
- bi += 4*2;
-
- A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
- A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
- A1r = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2], sizeof(FLOAT)*2, gvl );
- A1i = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2+1], sizeof(FLOAT)*2, gvl );
- ai += 8*2;
-
- tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
- tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
- tmp1r = __riscv_vfmul_vf_f64m1( A1i, B0i, gvl);
- tmp1i = __riscv_vfmul_vf_f64m1( A1r, B0i, gvl);
- tmp2r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
- tmp2i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
- tmp3r = __riscv_vfmul_vf_f64m1( A1i, B1i, gvl);
- tmp3i = __riscv_vfmul_vf_f64m1( A1r, B1i, gvl);
- tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
- tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
- tmp1r = VFMACC_RR( tmp1r, B0r, A1r, gvl);
- tmp1i = VFMACC_RI( tmp1i, B0r, A1i, gvl);
- tmp2r = VFMACC_RR( tmp2r, B1r, A0r, gvl);
- tmp2i = VFMACC_RI( tmp2i, B1r, A0i, gvl);
- tmp3r = VFMACC_RR( tmp3r, B1r, A1r, gvl);
- tmp3i = VFMACC_RI( tmp3i, B1r, A1i, gvl);
- ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
- ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
- ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
- ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
- ACC2r = __riscv_vfadd( ACC2r, tmp2r, gvl);
- ACC2i = __riscv_vfadd( ACC2i, tmp2i, gvl);
- ACC3r = __riscv_vfadd( ACC3r, tmp3r, gvl);
- ACC3i = __riscv_vfadd( ACC3i, tmp3i, gvl);
- tmp0r = __riscv_vfmul_vf_f64m1( A0i, B2i, gvl);
- tmp0i = __riscv_vfmul_vf_f64m1( A0r, B2i, gvl);
- tmp1r = __riscv_vfmul_vf_f64m1( A1i, B2i, gvl);
- tmp1i = __riscv_vfmul_vf_f64m1( A1r, B2i, gvl);
- tmp2r = __riscv_vfmul_vf_f64m1( A0i, B3i, gvl);
- tmp2i = __riscv_vfmul_vf_f64m1( A0r, B3i, gvl);
- tmp3r = __riscv_vfmul_vf_f64m1( A1i, B3i, gvl);
- tmp3i = __riscv_vfmul_vf_f64m1( A1r, B3i, gvl);
- tmp0r = VFMACC_RR( tmp0r, B2r, A0r, gvl);
- tmp0i = VFMACC_RI( tmp0i, B2r, A0i, gvl);
- tmp1r = VFMACC_RR( tmp1r, B2r, A1r, gvl);
- tmp1i = VFMACC_RI( tmp1i, B2r, A1i, gvl);
- tmp2r = VFMACC_RR( tmp2r, B3r, A0r, gvl);
- tmp2i = VFMACC_RI( tmp2i, B3r, A0i, gvl);
- tmp3r = VFMACC_RR( tmp3r, B3r, A1r, gvl);
- tmp3i = VFMACC_RI( tmp3i, B3r, A1i, gvl);
- ACC4r = __riscv_vfadd( ACC4r, tmp0r, gvl);
- ACC4i = __riscv_vfadd( ACC4i, tmp0i, gvl);
- ACC5r = __riscv_vfadd( ACC5r, tmp1r, gvl);
- ACC5i = __riscv_vfadd( ACC5i, tmp1i, gvl);
- ACC6r = __riscv_vfadd( ACC6r, tmp2r, gvl);
- ACC6i = __riscv_vfadd( ACC6i, tmp2i, gvl);
- ACC7r = __riscv_vfadd( ACC7r, tmp3r, gvl);
- ACC7i = __riscv_vfadd( ACC7i, tmp3i, gvl);
- }
-
-
- BLASLONG ci=n_top*ldc+m_top;
-
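- // Scale the accumulators by the complex alpha and store:
- // Cr = alphar*ACCr - alphai*ACCi, Ci = alphar*ACCi + alphai*ACCr.
- // TRMM overwrites C, so no previous C values are loaded.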
- vfloat64m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
- vfloat64m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
- vfloat64m1_t C1r = __riscv_vfmul( ACC1r, alphar, gvl );
- vfloat64m1_t C1i = __riscv_vfmul( ACC1i, alphar, gvl );
- vfloat64m1_t C2r = __riscv_vfmul( ACC2r, alphar, gvl );
- vfloat64m1_t C2i = __riscv_vfmul( ACC2i, alphar, gvl );
- vfloat64m1_t C3r = __riscv_vfmul( ACC3r, alphar, gvl );
- vfloat64m1_t C3i = __riscv_vfmul( ACC3i, alphar, gvl );
- vfloat64m1_t C4r = __riscv_vfmul( ACC4r, alphar, gvl );
- vfloat64m1_t C4i = __riscv_vfmul( ACC4i, alphar, gvl );
- vfloat64m1_t C5r = __riscv_vfmul( ACC5r, alphar, gvl );
- vfloat64m1_t C5i = __riscv_vfmul( ACC5i, alphar, gvl );
- vfloat64m1_t C6r = __riscv_vfmul( ACC6r, alphar, gvl );
- vfloat64m1_t C6i = __riscv_vfmul( ACC6i, alphar, gvl );
- vfloat64m1_t C7r = __riscv_vfmul( ACC7r, alphar, gvl );
- vfloat64m1_t C7i = __riscv_vfmul( ACC7i, alphar, gvl );
- C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
- C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
- C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
- C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
- C2r = __riscv_vfnmsac( C2r, alphai, ACC2i, gvl );
- C2i = __riscv_vfmacc ( C2i, alphai, ACC2r, gvl );
- C3r = __riscv_vfnmsac( C3r, alphai, ACC3i, gvl );
- C3i = __riscv_vfmacc ( C3i, alphai, ACC3r, gvl );
- C4r = __riscv_vfnmsac( C4r, alphai, ACC4i, gvl );
- C4i = __riscv_vfmacc ( C4i, alphai, ACC4r, gvl );
- C5r = __riscv_vfnmsac( C5r, alphai, ACC5i, gvl );
- C5i = __riscv_vfmacc ( C5i, alphai, ACC5r, gvl );
- C6r = __riscv_vfnmsac( C6r, alphai, ACC6i, gvl );
- C6i = __riscv_vfmacc ( C6i, alphai, ACC6r, gvl );
- C7r = __riscv_vfnmsac( C7r, alphai, ACC7i, gvl );
- C7i = __riscv_vfmacc ( C7i, alphai, ACC7r, gvl );
- __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
- __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
- ci += gvl;
- __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
- __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
- ci += ldc-gvl*1;
- __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C2r, gvl);
- __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C2i, gvl);
- ci += gvl;
- __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C3r, gvl);
- __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C3i, gvl);
- ci += ldc-gvl*1;
- __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C4r, gvl);
- __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C4i, gvl);
- ci += gvl;
- __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C5r, gvl);
- __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C5i, gvl);
- ci += ldc-gvl*1;
- __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C6r, gvl);
- __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C6i, gvl);
- ci += gvl;
- __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C7r, gvl);
- __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C7i, gvl);
-
- m_top += 8;
- }
-
-
-
- // -- tails for main pass
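- // Remaining rows (M % 8) of the current 4-column panel: narrower vector
- // blocks for M&4 and M&2, and a scalar loop for the final odd row.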
-
- if( M & 4 ) {
- gvl = __riscv_vsetvl_e64m1(4);
-
- BLASLONG ai=m_top*K*2;
- BLASLONG bi=n_top*K*2;
- BLASLONG pass_K = K;
- #ifdef LEFT
- BLASLONG off = offset + m_top;
- #else
- BLASLONG off = -offset + n_top;
- #endif
- #ifdef BACKWARDS
- ai += off*4*2;
- bi += off*4*2;
- pass_K -= off;
- #else
- #ifdef LEFT
- pass_K = off + 4;
- #else
- pass_K = off + 4;
- #endif
- #endif
- double B0r = B[bi+0*2+0];
- double B0i = B[bi+0*2+1];
- double B1r = B[bi+1*2+0];
- double B1i = B[bi+1*2+1];
- double B2r = B[bi+2*2+0];
- double B2i = B[bi+2*2+1];
- double B3r = B[bi+3*2+0];
- double B3i = B[bi+3*2+1];
- bi += 4*2;
-
- vfloat64m1_t A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
- vfloat64m1_t A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
- ai += 4*2;
-
- // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k
- // leaving 22 vector registers for temporaries
- vfloat64m1_t tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
- vfloat64m1_t tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
- vfloat64m1_t tmp1r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
- vfloat64m1_t tmp1i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
- vfloat64m1_t tmp2r = __riscv_vfmul_vf_f64m1( A0i, B2i, gvl);
- vfloat64m1_t tmp2i = __riscv_vfmul_vf_f64m1( A0r, B2i, gvl);
- vfloat64m1_t tmp3r = __riscv_vfmul_vf_f64m1( A0i, B3i, gvl);
- vfloat64m1_t tmp3i = __riscv_vfmul_vf_f64m1( A0r, B3i, gvl);
- tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
- tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
- tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
- tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
- tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
- tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
- tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
- tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
- vfloat64m1_t ACC0r = tmp0r;
- vfloat64m1_t ACC0i = tmp0i;
- vfloat64m1_t ACC1r = tmp1r;
- vfloat64m1_t ACC1i = tmp1i;
- vfloat64m1_t ACC2r = tmp2r;
- vfloat64m1_t ACC2i = tmp2i;
- vfloat64m1_t ACC3r = tmp3r;
- vfloat64m1_t ACC3i = tmp3i;
-
- for(BLASLONG k=1; k<pass_K; k++) {
- B0r = B[bi+0*2+0];
- B0i = B[bi+0*2+1];
- B1r = B[bi+1*2+0];
- B1i = B[bi+1*2+1];
- B2r = B[bi+2*2+0];
- B2i = B[bi+2*2+1];
- B3r = B[bi+3*2+0];
- B3i = B[bi+3*2+1];
- bi += 4*2;
-
- A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
- A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
- ai += 4*2;
-
- tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
- tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
- tmp1r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
- tmp1i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
- tmp2r = __riscv_vfmul_vf_f64m1( A0i, B2i, gvl);
- tmp2i = __riscv_vfmul_vf_f64m1( A0r, B2i, gvl);
- tmp3r = __riscv_vfmul_vf_f64m1( A0i, B3i, gvl);
- tmp3i = __riscv_vfmul_vf_f64m1( A0r, B3i, gvl);
- tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
- tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
- tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
- tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
- tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
- tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
- tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
- tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
- ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
- ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
- ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
- ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
- ACC2r = __riscv_vfadd( ACC2r, tmp2r, gvl);
- ACC2i = __riscv_vfadd( ACC2i, tmp2i, gvl);
- ACC3r = __riscv_vfadd( ACC3r, tmp3r, gvl);
- ACC3i = __riscv_vfadd( ACC3i, tmp3i, gvl);
- }
-
-
- BLASLONG ci=n_top*ldc+m_top;
-
- vfloat64m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
- vfloat64m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
- vfloat64m1_t C1r = __riscv_vfmul( ACC1r, alphar, gvl );
- vfloat64m1_t C1i = __riscv_vfmul( ACC1i, alphar, gvl );
- vfloat64m1_t C2r = __riscv_vfmul( ACC2r, alphar, gvl );
- vfloat64m1_t C2i = __riscv_vfmul( ACC2i, alphar, gvl );
- vfloat64m1_t C3r = __riscv_vfmul( ACC3r, alphar, gvl );
- vfloat64m1_t C3i = __riscv_vfmul( ACC3i, alphar, gvl );
- C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
- C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
- C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
- C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
- C2r = __riscv_vfnmsac( C2r, alphai, ACC2i, gvl );
- C2i = __riscv_vfmacc ( C2i, alphai, ACC2r, gvl );
- C3r = __riscv_vfnmsac( C3r, alphai, ACC3i, gvl );
- C3i = __riscv_vfmacc ( C3i, alphai, ACC3r, gvl );
- __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
- __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
- ci += ldc-gvl*0;
- __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
- __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
- ci += ldc-gvl*0;
- __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C2r, gvl);
- __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C2i, gvl);
- ci += ldc-gvl*0;
- __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C3r, gvl);
- __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C3i, gvl);
-
- m_top += 4;
- }
-
-
- if( M & 2 ) {
- gvl = __riscv_vsetvl_e64m1(2);
-
- BLASLONG ai=m_top*K*2;
- BLASLONG bi=n_top*K*2;
- BLASLONG pass_K = K;
- #ifdef LEFT
- BLASLONG off = offset + m_top;
- #else
- BLASLONG off = -offset + n_top;
- #endif
- #ifdef BACKWARDS
- ai += off*2*2;
- bi += off*4*2;
- pass_K -= off;
- #else
- #ifdef LEFT
- pass_K = off + 2;
- #else
- pass_K = off + 4;
- #endif
- #endif
- double B0r = B[bi+0*2+0];
- double B0i = B[bi+0*2+1];
- double B1r = B[bi+1*2+0];
- double B1i = B[bi+1*2+1];
- double B2r = B[bi+2*2+0];
- double B2i = B[bi+2*2+1];
- double B3r = B[bi+3*2+0];
- double B3i = B[bi+3*2+1];
- bi += 4*2;
-
- vfloat64m1_t A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
- vfloat64m1_t A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
- ai += 2*2;
-
- // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k
- // leaving 22 vector registers for temporaries
- vfloat64m1_t tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
- vfloat64m1_t tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
- vfloat64m1_t tmp1r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
- vfloat64m1_t tmp1i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
- vfloat64m1_t tmp2r = __riscv_vfmul_vf_f64m1( A0i, B2i, gvl);
- vfloat64m1_t tmp2i = __riscv_vfmul_vf_f64m1( A0r, B2i, gvl);
- vfloat64m1_t tmp3r = __riscv_vfmul_vf_f64m1( A0i, B3i, gvl);
- vfloat64m1_t tmp3i = __riscv_vfmul_vf_f64m1( A0r, B3i, gvl);
- tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
- tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
- tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
- tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
- tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
- tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
- tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
- tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
- vfloat64m1_t ACC0r = tmp0r;
- vfloat64m1_t ACC0i = tmp0i;
- vfloat64m1_t ACC1r = tmp1r;
- vfloat64m1_t ACC1i = tmp1i;
- vfloat64m1_t ACC2r = tmp2r;
- vfloat64m1_t ACC2i = tmp2i;
- vfloat64m1_t ACC3r = tmp3r;
- vfloat64m1_t ACC3i = tmp3i;
-
- for(BLASLONG k=1; k<pass_K; k++) {
- B0r = B[bi+0*2+0];
- B0i = B[bi+0*2+1];
- B1r = B[bi+1*2+0];
- B1i = B[bi+1*2+1];
- B2r = B[bi+2*2+0];
- B2i = B[bi+2*2+1];
- B3r = B[bi+3*2+0];
- B3i = B[bi+3*2+1];
- bi += 4*2;
-
- A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
- A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
- ai += 2*2;
-
- tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
- tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
- tmp1r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
- tmp1i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
- tmp2r = __riscv_vfmul_vf_f64m1( A0i, B2i, gvl);
- tmp2i = __riscv_vfmul_vf_f64m1( A0r, B2i, gvl);
- tmp3r = __riscv_vfmul_vf_f64m1( A0i, B3i, gvl);
- tmp3i = __riscv_vfmul_vf_f64m1( A0r, B3i, gvl);
- tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
- tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
- tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
- tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
- tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
- tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
- tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
- tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
- ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
- ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
- ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
- ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
- ACC2r = __riscv_vfadd( ACC2r, tmp2r, gvl);
- ACC2i = __riscv_vfadd( ACC2i, tmp2i, gvl);
- ACC3r = __riscv_vfadd( ACC3r, tmp3r, gvl);
- ACC3i = __riscv_vfadd( ACC3i, tmp3i, gvl);
- }
-
-
- BLASLONG ci=n_top*ldc+m_top;
-
- vfloat64m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
- vfloat64m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
- vfloat64m1_t C1r = __riscv_vfmul( ACC1r, alphar, gvl );
- vfloat64m1_t C1i = __riscv_vfmul( ACC1i, alphar, gvl );
- vfloat64m1_t C2r = __riscv_vfmul( ACC2r, alphar, gvl );
- vfloat64m1_t C2i = __riscv_vfmul( ACC2i, alphar, gvl );
- vfloat64m1_t C3r = __riscv_vfmul( ACC3r, alphar, gvl );
- vfloat64m1_t C3i = __riscv_vfmul( ACC3i, alphar, gvl );
- C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
- C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
- C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
- C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
- C2r = __riscv_vfnmsac( C2r, alphai, ACC2i, gvl );
- C2i = __riscv_vfmacc ( C2i, alphai, ACC2r, gvl );
- C3r = __riscv_vfnmsac( C3r, alphai, ACC3i, gvl );
- C3i = __riscv_vfmacc ( C3i, alphai, ACC3r, gvl );
- __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
- __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
- ci += ldc-gvl*0;
- __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
- __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
- ci += ldc-gvl*0;
- __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C2r, gvl);
- __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C2i, gvl);
- ci += ldc-gvl*0;
- __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C3r, gvl);
- __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C3i, gvl);
-
- m_top += 2;
- }
-
-
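- // Last odd row: plain scalar complex arithmetic using the S0..S3 signs
- // selected above.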
- if( M & 1 ) {
- double result0 = 0;
- double result1 = 0;
- double result2 = 0;
- double result3 = 0;
- double result4 = 0;
- double result5 = 0;
- double result6 = 0;
- double result7 = 0;
- BLASLONG ai=m_top*K*2;
- BLASLONG bi=n_top*K*2;
- BLASLONG pass_K = K;
- #ifdef LEFT
- BLASLONG off = offset + m_top;
- #else
- BLASLONG off = -offset + n_top;
- #endif
- #ifdef BACKWARDS
- ai += off*1*2;
- bi += off*4*2;
- pass_K -= off;
- #else
- #ifdef LEFT
- pass_K = off + 1;
- #else
- pass_K = off + 4;
- #endif
- #endif
-
- for(BLASLONG k=0; k<pass_K; k++) {
- result0+=S0*A[ai+0+0]*B[bi+0+0] + S1*A[ai+0+1]*B[bi+0+1];
- result1+=S2*A[ai+0+1]*B[bi+0+0] + S3*A[ai+0+0]*B[bi+0+1];
- result2+=S0*A[ai+0+0]*B[bi+2+0] + S1*A[ai+0+1]*B[bi+2+1];
- result3+=S2*A[ai+0+1]*B[bi+2+0] + S3*A[ai+0+0]*B[bi+2+1];
- result4+=S0*A[ai+0+0]*B[bi+4+0] + S1*A[ai+0+1]*B[bi+4+1];
- result5+=S2*A[ai+0+1]*B[bi+4+0] + S3*A[ai+0+0]*B[bi+4+1];
- result6+=S0*A[ai+0+0]*B[bi+6+0] + S1*A[ai+0+1]*B[bi+6+1];
- result7+=S2*A[ai+0+1]*B[bi+6+0] + S3*A[ai+0+0]*B[bi+6+1];
- ai+=1*2;
- bi+=4*2;
- }
-
- BLASLONG ci=n_top*ldc+m_top;
- double Cr, Ci;
- Cr = result0*alphar;
- Ci = result1*alphar;
- Cr -= result1*alphai;
- Ci += result0*alphai;
- C[(ci+0*ldc+0)*2+0] = Cr;
- C[(ci+0*ldc+0)*2+1] = Ci;
- Cr = result2*alphar;
- Ci = result3*alphar;
- Cr -= result3*alphai;
- Ci += result2*alphai;
- C[(ci+1*ldc+0)*2+0] = Cr;
- C[(ci+1*ldc+0)*2+1] = Ci;
- Cr = result4*alphar;
- Ci = result5*alphar;
- Cr -= result5*alphai;
- Ci += result4*alphai;
- C[(ci+2*ldc+0)*2+0] = Cr;
- C[(ci+2*ldc+0)*2+1] = Ci;
- Cr = result6*alphar;
- Ci = result7*alphar;
- Cr -= result7*alphai;
- Ci += result6*alphai;
- C[(ci+3*ldc+0)*2+0] = Cr;
- C[(ci+3*ldc+0)*2+1] = Ci;
- m_top+=1;
- }
-
- n_top += 4;
- }
-
-
-
- // -- tails for N=2
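- // Same structure as the main pass, but with a 2-column panel of B.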
-
- if( N & 2 ) {
- gvl = __riscv_vsetvl_e64m1(4);
- m_top = 0;
-
- for (BLASLONG i=0; i<M/8; i+=1) {
- BLASLONG ai=m_top*K*2;
- BLASLONG bi=n_top*K*2;
- BLASLONG pass_K = K;
- #ifdef LEFT
- BLASLONG off = offset + m_top;
- #else
- BLASLONG off = -offset + n_top;
- #endif
- #ifdef BACKWARDS
- ai += off*8*2;
- bi += off*2*2;
- pass_K -= off;
- #else
- #ifdef LEFT
- pass_K = off + 8;
- #else
- pass_K = off + 2;
- #endif
- #endif
- double B0r = B[bi+0*2+0];
- double B0i = B[bi+0*2+1];
- double B1r = B[bi+1*2+0];
- double B1i = B[bi+1*2+1];
- bi += 2*2;
-
- vfloat64m1_t A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
- vfloat64m1_t A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
- vfloat64m1_t A1r = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2], sizeof(FLOAT)*2, gvl );
- vfloat64m1_t A1i = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2+1], sizeof(FLOAT)*2, gvl );
- ai += 8*2;
-
- // 4 vector regs to hold A array contents, 8 regs to hold values accumulated over k
- // leaving 20 vector registers for temporaries
- vfloat64m1_t tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
- vfloat64m1_t tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
- vfloat64m1_t tmp1r = __riscv_vfmul_vf_f64m1( A1i, B0i, gvl);
- vfloat64m1_t tmp1i = __riscv_vfmul_vf_f64m1( A1r, B0i, gvl);
- vfloat64m1_t tmp2r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
- vfloat64m1_t tmp2i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
- vfloat64m1_t tmp3r = __riscv_vfmul_vf_f64m1( A1i, B1i, gvl);
- vfloat64m1_t tmp3i = __riscv_vfmul_vf_f64m1( A1r, B1i, gvl);
- tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
- tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
- tmp1r = VFMACC_RR( tmp1r, B0r, A1r, gvl);
- tmp1i = VFMACC_RI( tmp1i, B0r, A1i, gvl);
- tmp2r = VFMACC_RR( tmp2r, B1r, A0r, gvl);
- tmp2i = VFMACC_RI( tmp2i, B1r, A0i, gvl);
- tmp3r = VFMACC_RR( tmp3r, B1r, A1r, gvl);
- tmp3i = VFMACC_RI( tmp3i, B1r, A1i, gvl);
- vfloat64m1_t ACC0r = tmp0r;
- vfloat64m1_t ACC0i = tmp0i;
- vfloat64m1_t ACC1r = tmp1r;
- vfloat64m1_t ACC1i = tmp1i;
- vfloat64m1_t ACC2r = tmp2r;
- vfloat64m1_t ACC2i = tmp2i;
- vfloat64m1_t ACC3r = tmp3r;
- vfloat64m1_t ACC3i = tmp3i;
-
- for(BLASLONG k=1; k<pass_K; k++) {
- B0r = B[bi+0*2+0];
- B0i = B[bi+0*2+1];
- B1r = B[bi+1*2+0];
- B1i = B[bi+1*2+1];
- bi += 2*2;
-
- A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
- A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
- A1r = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2], sizeof(FLOAT)*2, gvl );
- A1i = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2+1], sizeof(FLOAT)*2, gvl );
- ai += 8*2;
-
- tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
- tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
- tmp1r = __riscv_vfmul_vf_f64m1( A1i, B0i, gvl);
- tmp1i = __riscv_vfmul_vf_f64m1( A1r, B0i, gvl);
- tmp2r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
- tmp2i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
- tmp3r = __riscv_vfmul_vf_f64m1( A1i, B1i, gvl);
- tmp3i = __riscv_vfmul_vf_f64m1( A1r, B1i, gvl);
- tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
- tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
- tmp1r = VFMACC_RR( tmp1r, B0r, A1r, gvl);
- tmp1i = VFMACC_RI( tmp1i, B0r, A1i, gvl);
- tmp2r = VFMACC_RR( tmp2r, B1r, A0r, gvl);
- tmp2i = VFMACC_RI( tmp2i, B1r, A0i, gvl);
- tmp3r = VFMACC_RR( tmp3r, B1r, A1r, gvl);
- tmp3i = VFMACC_RI( tmp3i, B1r, A1i, gvl);
- ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
- ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
- ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
- ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
- ACC2r = __riscv_vfadd( ACC2r, tmp2r, gvl);
- ACC2i = __riscv_vfadd( ACC2i, tmp2i, gvl);
- ACC3r = __riscv_vfadd( ACC3r, tmp3r, gvl);
- ACC3i = __riscv_vfadd( ACC3i, tmp3i, gvl);
- }
-
-
- BLASLONG ci=n_top*ldc+m_top;
-
- vfloat64m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
- vfloat64m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
- vfloat64m1_t C1r = __riscv_vfmul( ACC1r, alphar, gvl );
- vfloat64m1_t C1i = __riscv_vfmul( ACC1i, alphar, gvl );
- vfloat64m1_t C2r = __riscv_vfmul( ACC2r, alphar, gvl );
- vfloat64m1_t C2i = __riscv_vfmul( ACC2i, alphar, gvl );
- vfloat64m1_t C3r = __riscv_vfmul( ACC3r, alphar, gvl );
- vfloat64m1_t C3i = __riscv_vfmul( ACC3i, alphar, gvl );
- C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
- C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
- C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
- C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
- C2r = __riscv_vfnmsac( C2r, alphai, ACC2i, gvl );
- C2i = __riscv_vfmacc ( C2i, alphai, ACC2r, gvl );
- C3r = __riscv_vfnmsac( C3r, alphai, ACC3i, gvl );
- C3i = __riscv_vfmacc ( C3i, alphai, ACC3r, gvl );
- __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
- __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
- ci += gvl;
- __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
- __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
- ci += ldc-gvl*1;
- __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C2r, gvl);
- __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C2i, gvl);
- ci += gvl;
- __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C3r, gvl);
- __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C3i, gvl);
-
- m_top += 8;
- }
-
-
- if( M & 4 ) {
- gvl = __riscv_vsetvl_e64m1(4);
-
- BLASLONG ai=m_top*K*2;
- BLASLONG bi=n_top*K*2;
- BLASLONG pass_K = K;
- #ifdef LEFT
- BLASLONG off = offset + m_top;
- #else
- BLASLONG off = -offset + n_top;
- #endif
- #ifdef BACKWARDS
- ai += off*4*2;
- bi += off*2*2;
- pass_K -= off;
- #else
- #ifdef LEFT
- pass_K = off + 4;
- #else
- pass_K = off + 2;
- #endif
- #endif
- double B0r = B[bi+0*2+0];
- double B0i = B[bi+0*2+1];
- double B1r = B[bi+1*2+0];
- double B1i = B[bi+1*2+1];
- bi += 2*2;
-
- vfloat64m1_t A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
- vfloat64m1_t A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
- ai += 4*2;
-
- // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k
- // leaving 26 vector registers for temporaries
- vfloat64m1_t tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
- vfloat64m1_t tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
- vfloat64m1_t tmp1r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
- vfloat64m1_t tmp1i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
- tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
- tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
- tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
- tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
- vfloat64m1_t ACC0r = tmp0r;
- vfloat64m1_t ACC0i = tmp0i;
- vfloat64m1_t ACC1r = tmp1r;
- vfloat64m1_t ACC1i = tmp1i;
-
- for(BLASLONG k=1; k<pass_K; k++) {
- B0r = B[bi+0*2+0];
- B0i = B[bi+0*2+1];
- B1r = B[bi+1*2+0];
- B1i = B[bi+1*2+1];
- bi += 2*2;
-
- A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
- A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
- ai += 4*2;
-
- tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
- tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
- tmp1r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
- tmp1i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
- tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
- tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
- tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
- tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
- ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
- ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
- ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
- ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
- }
-
-
- BLASLONG ci=n_top*ldc+m_top;
-
- vfloat64m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
- vfloat64m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
- vfloat64m1_t C1r = __riscv_vfmul( ACC1r, alphar, gvl );
- vfloat64m1_t C1i = __riscv_vfmul( ACC1i, alphar, gvl );
- C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
- C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
- C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
- C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
- __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
- __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
- ci += ldc-gvl*0;
- __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
- __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
-
- m_top += 4;
- }
-
-
- if( M & 2 ) {
- gvl = __riscv_vsetvl_e64m1(2);
-
- BLASLONG ai=m_top*K*2;
- BLASLONG bi=n_top*K*2;
- BLASLONG pass_K = K;
- #ifdef LEFT
- BLASLONG off = offset + m_top;
- #else
- BLASLONG off = -offset + n_top;
- #endif
- #ifdef BACKWARDS
- ai += off*2*2;
- bi += off*2*2;
- pass_K -= off;
- #else
- #ifdef LEFT
- pass_K = off + 2;
- #else
- pass_K = off + 2;
- #endif
- #endif
- double B0r = B[bi+0*2+0];
- double B0i = B[bi+0*2+1];
- double B1r = B[bi+1*2+0];
- double B1i = B[bi+1*2+1];
- bi += 2*2;
-
- vfloat64m1_t A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
- vfloat64m1_t A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
- ai += 2*2;
-
- // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k
- // leaving 26 vector registers for temporaries
- vfloat64m1_t tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
- vfloat64m1_t tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
- vfloat64m1_t tmp1r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
- vfloat64m1_t tmp1i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
- tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
- tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
- tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
- tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
- vfloat64m1_t ACC0r = tmp0r;
- vfloat64m1_t ACC0i = tmp0i;
- vfloat64m1_t ACC1r = tmp1r;
- vfloat64m1_t ACC1i = tmp1i;
-
- for(BLASLONG k=1; k<pass_K; k++) {
- B0r = B[bi+0*2+0];
- B0i = B[bi+0*2+1];
- B1r = B[bi+1*2+0];
- B1i = B[bi+1*2+1];
- bi += 2*2;
-
- A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
- A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
- ai += 2*2;
-
- tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
- tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
- tmp1r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
- tmp1i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
- tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
- tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
- tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
- tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
- ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
- ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
- ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
- ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
- }
-
-
- BLASLONG ci=n_top*ldc+m_top;
-
- vfloat64m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
- vfloat64m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
- vfloat64m1_t C1r = __riscv_vfmul( ACC1r, alphar, gvl );
- vfloat64m1_t C1i = __riscv_vfmul( ACC1i, alphar, gvl );
- C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
- C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
- C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
- C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
- __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
- __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
- ci += ldc-gvl*0;
- __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
- __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
-
- m_top += 2;
- }
-
-
- if( M & 1 ) {
- double result0 = 0;
- double result1 = 0;
- double result2 = 0;
- double result3 = 0;
- BLASLONG ai=m_top*K*2;
- BLASLONG bi=n_top*K*2;
- BLASLONG pass_K = K;
- #ifdef LEFT
- BLASLONG off = offset + m_top;
- #else
- BLASLONG off = -offset + n_top;
- #endif
- #ifdef BACKWARDS
- ai += off*1*2;
- bi += off*2*2;
- pass_K -= off;
- #else
- #ifdef LEFT
- pass_K = off + 1;
- #else
- pass_K = off + 2;
- #endif
- #endif
-
- for(BLASLONG k=0; k<pass_K; k++) {
- result0+=S0*A[ai+0+0]*B[bi+0+0] + S1*A[ai+0+1]*B[bi+0+1];
- result1+=S2*A[ai+0+1]*B[bi+0+0] + S3*A[ai+0+0]*B[bi+0+1];
- result2+=S0*A[ai+0+0]*B[bi+2+0] + S1*A[ai+0+1]*B[bi+2+1];
- result3+=S2*A[ai+0+1]*B[bi+2+0] + S3*A[ai+0+0]*B[bi+2+1];
- ai+=1*2;
- bi+=2*2;
- }
-
- BLASLONG ci=n_top*ldc+m_top;
- double Cr, Ci;
- Cr = result0*alphar;
- Ci = result1*alphar;
- Cr -= result1*alphai;
- Ci += result0*alphai;
- C[(ci+0*ldc+0)*2+0] = Cr;
- C[(ci+0*ldc+0)*2+1] = Ci;
- Cr = result2*alphar;
- Ci = result3*alphar;
- Cr -= result3*alphai;
- Ci += result2*alphai;
- C[(ci+1*ldc+0)*2+0] = Cr;
- C[(ci+1*ldc+0)*2+1] = Ci;
- m_top+=1;
- }
-
- n_top += 2;
- }
-
-
-
- // -- tails for N=1
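- // Final single-column panel of B.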
-
- if( N & 1 ) {
- gvl = __riscv_vsetvl_e64m1(4);
- m_top = 0;
-
- for (BLASLONG i=0; i<M/8; i+=1) {
- BLASLONG ai=m_top*K*2;
- BLASLONG bi=n_top*K*2;
- BLASLONG pass_K = K;
- #ifdef LEFT
- BLASLONG off = offset + m_top;
- #else
- BLASLONG off = -offset + n_top;
- #endif
- #ifdef BACKWARDS
- ai += off*8*2;
- bi += off*1*2;
- pass_K -= off;
- #else
- #ifdef LEFT
- pass_K = off + 8;
- #else
- pass_K = off + 1;
- #endif
- #endif
- double B0r = B[bi+0*2+0];
- double B0i = B[bi+0*2+1];
- bi += 1*2;
-
- vfloat64m1_t A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
- vfloat64m1_t A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
- vfloat64m1_t A1r = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2], sizeof(FLOAT)*2, gvl );
- vfloat64m1_t A1i = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2+1], sizeof(FLOAT)*2, gvl );
- ai += 8*2;
-
- // 4 vector regs to hold A array contents, 4 regs to hold values accumulated over k
- // leaving 24 vector registers for temporaries
- vfloat64m1_t tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
- vfloat64m1_t tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
- vfloat64m1_t tmp1r = __riscv_vfmul_vf_f64m1( A1i, B0i, gvl);
- vfloat64m1_t tmp1i = __riscv_vfmul_vf_f64m1( A1r, B0i, gvl);
- tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
- tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
- tmp1r = VFMACC_RR( tmp1r, B0r, A1r, gvl);
- tmp1i = VFMACC_RI( tmp1i, B0r, A1i, gvl);
- vfloat64m1_t ACC0r = tmp0r;
- vfloat64m1_t ACC0i = tmp0i;
- vfloat64m1_t ACC1r = tmp1r;
- vfloat64m1_t ACC1i = tmp1i;
-
- for(BLASLONG k=1; k<pass_K; k++) {
- B0r = B[bi+0*2+0];
- B0i = B[bi+0*2+1];
- bi += 1*2;
-
- A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
- A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
- A1r = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2], sizeof(FLOAT)*2, gvl );
- A1i = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2+1], sizeof(FLOAT)*2, gvl );
- ai += 8*2;
-
- tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
- tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
- tmp1r = __riscv_vfmul_vf_f64m1( A1i, B0i, gvl);
- tmp1i = __riscv_vfmul_vf_f64m1( A1r, B0i, gvl);
- tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
- tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
- tmp1r = VFMACC_RR( tmp1r, B0r, A1r, gvl);
- tmp1i = VFMACC_RI( tmp1i, B0r, A1i, gvl);
- ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
- ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
- ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
- ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
- }
-
-
- BLASLONG ci=n_top*ldc+m_top;
-
- vfloat64m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
- vfloat64m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
- vfloat64m1_t C1r = __riscv_vfmul( ACC1r, alphar, gvl );
- vfloat64m1_t C1i = __riscv_vfmul( ACC1i, alphar, gvl );
- C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
- C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
- C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
- C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
- __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
- __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
- ci += gvl;
- __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
- __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
-
- m_top += 8;
- }
-
-
- if( M & 4 ) {
- gvl = __riscv_vsetvl_e64m1(4);
-
- BLASLONG ai=m_top*K*2;
- BLASLONG bi=n_top*K*2;
- BLASLONG pass_K = K;
- #ifdef LEFT
- BLASLONG off = offset + m_top;
- #else
- BLASLONG off = -offset + n_top;
- #endif
- #ifdef BACKWARDS
- ai += off*4*2;
- bi += off*1*2;
- pass_K -= off;
- #else
- #ifdef LEFT
- pass_K = off + 4;
- #else
- pass_K = off + 1;
- #endif
- #endif
- double B0r = B[bi+0*2+0];
- double B0i = B[bi+0*2+1];
- bi += 1*2;
-
- vfloat64m1_t A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
- vfloat64m1_t A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
- ai += 4*2;
-
- // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k
- // leaving 28 vector registers for temporaries
- vfloat64m1_t tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
- vfloat64m1_t tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
- tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
- tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
- vfloat64m1_t ACC0r = tmp0r;
- vfloat64m1_t ACC0i = tmp0i;
-
- for(BLASLONG k=1; k<pass_K; k++) {
- B0r = B[bi+0*2+0];
- B0i = B[bi+0*2+1];
- bi += 1*2;
-
- A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
- A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
- ai += 4*2;
-
- tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
- tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
- tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
- tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
- ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
- ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
- }
-
-
- BLASLONG ci=n_top*ldc+m_top;
-
- vfloat64m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
- vfloat64m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
- C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
- C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
- __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
- __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
-
- m_top += 4;
- }
-
-
- if( M & 2 ) {
- gvl = __riscv_vsetvl_e64m1(2);
-
- BLASLONG ai=m_top*K*2;
- BLASLONG bi=n_top*K*2;
- BLASLONG pass_K = K;
- #ifdef LEFT
- BLASLONG off = offset + m_top;
- #else
- BLASLONG off = -offset + n_top;
- #endif
- #ifdef BACKWARDS
- ai += off*2*2;
- bi += off*1*2;
- pass_K -= off;
- #else
- #ifdef LEFT
- pass_K = off + 2;
- #else
- pass_K = off + 1;
- #endif
- #endif
- double B0r = B[bi+0*2+0];
- double B0i = B[bi+0*2+1];
- bi += 1*2;
-
- vfloat64m1_t A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
- vfloat64m1_t A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
- ai += 2*2;
-
- // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k
- // leaving 28 vector registers for temporaries
- vfloat64m1_t tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
- vfloat64m1_t tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
- tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
- tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
- vfloat64m1_t ACC0r = tmp0r;
- vfloat64m1_t ACC0i = tmp0i;
-
- for(BLASLONG k=1; k<pass_K; k++) {
- B0r = B[bi+0*2+0];
- B0i = B[bi+0*2+1];
- bi += 1*2;
-
- A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
- A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
- ai += 2*2;
-
- tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
- tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
- tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
- tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
- ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
- ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
- }
-
-
- BLASLONG ci=n_top*ldc+m_top;
-
- vfloat64m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
- vfloat64m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
- C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
- C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
- __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
- __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
-
- m_top += 2;
- }
-
-
- if( M & 1 ) {
- double result0 = 0;
- double result1 = 0;
- BLASLONG ai=m_top*K*2;
- BLASLONG bi=n_top*K*2;
- BLASLONG pass_K = K;
- #ifdef LEFT
- BLASLONG off = offset + m_top;
- #else
- BLASLONG off = -offset + n_top;
- #endif
- #ifdef BACKWARDS
- ai += off*1*2;
- bi += off*1*2;
- pass_K -= off;
- #else
- #ifdef LEFT
- pass_K = off + 1;
- #else
- pass_K = off + 1;
- #endif
- #endif
-
- for(BLASLONG k=0; k<pass_K; k++) {
- result0+=S0*A[ai+0+0]*B[bi+0+0] + S1*A[ai+0+1]*B[bi+0+1];
- result1+=S2*A[ai+0+1]*B[bi+0+0] + S3*A[ai+0+0]*B[bi+0+1];
- ai+=1*2;
- bi+=1*2;
- }
-
- BLASLONG ci=n_top*ldc+m_top;
- double Cr, Ci;
- Cr = result0*alphar;
- Ci = result1*alphar;
- Cr -= result1*alphai;
- Ci += result0*alphai;
- C[(ci+0*ldc+0)*2+0] = Cr;
- C[(ci+0*ldc+0)*2+1] = Ci;
- m_top+=1;
- }
-
- n_top += 1;
- }
-
- return 0;
- }
-