- /*
-
- AUTOGENERATED KERNEL
- Settings:
- LMUL=1
- M=8
- M_tail_scalar_from=1
- N=8
- __riscv_='__riscv_'
- complex=True
- conjugate=False
- cpu='zvl256b'
- force_acc_double=False
- index_type='BLASLONG'
- op='trmm'
- param_precision='float'
- reg_width_bits=256
- tail_policy=''
- trace=False
-
- Derived:
- ELEN_ACC=32
- ELEN_PARAM=32
- LMUL_ACC=1
- VFMACC='__riscv_vfmacc_vf_f32m1'
- VFMUL='__riscv_vfmul_vf_f32m1'
- VLEV='__riscv_vle32_v_f32m1'
- VLSEV='__riscv_vlse32_v_f32m1'
- VMACC_TO_ACC='__riscv_vfmacc_vf_f32m1'
- VMUL_TO_ACC='__riscv_vfmul_vf_f32m1'
- VSETVL='__riscv_vsetvl_e32m1'
- VSEV='__riscv_vse32_v_f32m1'
- VSSEV='__riscv_vsse32_v_f32m1'
- acc_vector_t='vfloat32m1_t'
- output='ctrmm_kernel_8x8_zvl256b.c'
- param_scalar_t='float'
- param_vector_t='vfloat32m1_t'
-
- */
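- // Settings above are the generator's: 8x8 unroll, single-precision complex,
- // 256-bit vectors (zvl256b) at LMUL=1, so one vfloat32m1_t holds 8 lanes.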
-
- #include "common.h"
-
-
-
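- /*
- Sign conventions for the four conjugation cases. The scalar tail computes
- result_r += S0*Ar*Br + S1*Ai*Bi
- result_i += S2*Ai*Br + S3*Ar*Bi
- while the vector path picks the fused multiply-add flavours (vfmacc, vfmsac,
- vfnmsac, vfnmacc) that realise the same signs: VFMACC_RR completes a real
- part, VFMACC_RI an imaginary part.
- */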
- #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
- #define S0 1
- #define S1 -1
- #define S2 1
- #define S3 1
- #define VFMACC_RR __riscv_vfmsac
- #define VFMACC_RI __riscv_vfmacc
- #endif
- #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
- #define S0 1
- #define S1 1
- #define S2 1
- #define S3 -1
- #define VFMACC_RR __riscv_vfmacc
- #define VFMACC_RI __riscv_vfmsac
- #endif
- #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
- #define S0 1
- #define S1 1
- #define S2 -1
- #define S3 1
- #define VFMACC_RR __riscv_vfmacc
- #define VFMACC_RI __riscv_vfnmsac
- #endif
- #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
- #define S0 1
- #define S1 -1
- #define S2 -1
- #define S3 -1
- #define VFMACC_RR __riscv_vfmsac
- #define VFMACC_RI __riscv_vfnmacc
- #endif
-
-
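- // trmm K-range handling: when LEFT xor TRANSA (BACKWARDS), each pass skips the
- // first `off` columns of the A and B panels and shortens pass_K accordingly;
- // otherwise the pass covers only the first `off + unroll` columns.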
- #if defined(LEFT) != defined(TRANSA)
- #define BACKWARDS
- #endif
-
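- // One trmm kernel pass: C = alpha * A_panel * B_panel over an M x N block,
- // with A and B packed as interleaved (re,im) pairs, C column-major with
- // leading dimension ldc (in complex elements), and `offset` locating the
- // diagonal of the triangular operand.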
- int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc, BLASLONG offset)
-
- {
- BLASLONG gvl = 0;
- BLASLONG m_top = 0;
- BLASLONG n_top = 0;
-
-
- // -- MAIN PASS: full 8x8 tiles of C; vectors run along M (8 rows), with one ACCnr/ACCni pair per column of B
-
- for (BLASLONG j=0; j<N/8; j+=1) {
- m_top = 0;
- gvl = __riscv_vsetvl_e32m1(8);
-
-
- for (BLASLONG i=0; i<M/8; i+=1) {
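- // locate this tile's packed panels and clip the K range to the triangular part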
- BLASLONG ai=m_top*K*2;
- BLASLONG bi=n_top*K*2;
- BLASLONG pass_K = K;
- #ifdef LEFT
- BLASLONG off = offset + m_top;
- #else
- BLASLONG off = -offset + n_top;
- #endif
- #ifdef BACKWARDS
- ai += off*8*2;
- bi += off*8*2;
- pass_K -= off;
- #else
- pass_K = off + 8; // LEFT and !LEFT agree here: both M and N unroll are 8
- #endif
- float B0r = B[bi+0*2+0];
- float B0i = B[bi+0*2+1];
- float B1r = B[bi+1*2+0];
- float B1i = B[bi+1*2+1];
- float B2r = B[bi+2*2+0];
- float B2i = B[bi+2*2+1];
- float B3r = B[bi+3*2+0];
- float B3i = B[bi+3*2+1];
- float B4r = B[bi+4*2+0];
- float B4i = B[bi+4*2+1];
- float B5r = B[bi+5*2+0];
- float B5i = B[bi+5*2+1];
- float B6r = B[bi+6*2+0];
- float B6i = B[bi+6*2+1];
- float B7r = B[bi+7*2+0];
- float B7i = B[bi+7*2+1];
- bi += 8*2;
-
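- // strided loads (stride = one complex element) deinterleave A into separate
- // real and imaginary vectors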
- vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
- vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
- ai += 8*2;
-
- // 2 vector regs to hold A array contents, 16 regs to hold values accumulated over k
- // leaving 14 vector registers for temporaries
- // performing 4 operations between reuses of temporaries
- vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
- vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
- vfloat32m1_t tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
- vfloat32m1_t tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
- vfloat32m1_t tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
- vfloat32m1_t tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
- vfloat32m1_t tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
- vfloat32m1_t tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
- tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
- tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
- tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
- tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
- tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
- tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
- tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
- tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
- vfloat32m1_t ACC0r = tmp0r;
- vfloat32m1_t ACC0i = tmp0i;
- vfloat32m1_t ACC1r = tmp1r;
- vfloat32m1_t ACC1i = tmp1i;
- vfloat32m1_t ACC2r = tmp2r;
- vfloat32m1_t ACC2i = tmp2i;
- vfloat32m1_t ACC3r = tmp3r;
- vfloat32m1_t ACC3i = tmp3i;
- tmp0r = __riscv_vfmul_vf_f32m1( A0i, B4i, gvl);
- tmp0i = __riscv_vfmul_vf_f32m1( A0r, B4i, gvl);
- tmp1r = __riscv_vfmul_vf_f32m1( A0i, B5i, gvl);
- tmp1i = __riscv_vfmul_vf_f32m1( A0r, B5i, gvl);
- tmp2r = __riscv_vfmul_vf_f32m1( A0i, B6i, gvl);
- tmp2i = __riscv_vfmul_vf_f32m1( A0r, B6i, gvl);
- tmp3r = __riscv_vfmul_vf_f32m1( A0i, B7i, gvl);
- tmp3i = __riscv_vfmul_vf_f32m1( A0r, B7i, gvl);
- tmp0r = VFMACC_RR( tmp0r, B4r, A0r, gvl);
- tmp0i = VFMACC_RI( tmp0i, B4r, A0i, gvl);
- tmp1r = VFMACC_RR( tmp1r, B5r, A0r, gvl);
- tmp1i = VFMACC_RI( tmp1i, B5r, A0i, gvl);
- tmp2r = VFMACC_RR( tmp2r, B6r, A0r, gvl);
- tmp2i = VFMACC_RI( tmp2i, B6r, A0i, gvl);
- tmp3r = VFMACC_RR( tmp3r, B7r, A0r, gvl);
- tmp3i = VFMACC_RI( tmp3i, B7r, A0i, gvl);
- vfloat32m1_t ACC4r = tmp0r;
- vfloat32m1_t ACC4i = tmp0i;
- vfloat32m1_t ACC5r = tmp1r;
- vfloat32m1_t ACC5i = tmp1i;
- vfloat32m1_t ACC6r = tmp2r;
- vfloat32m1_t ACC6i = tmp2i;
- vfloat32m1_t ACC7r = tmp3r;
- vfloat32m1_t ACC7i = tmp3i;
-
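- // k == 0 seeded the accumulators above; fold in the remaining pass_K-1 steps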
- for(BLASLONG k=1; k<pass_K; k++) {
- B0r = B[bi+0*2+0];
- B0i = B[bi+0*2+1];
- B1r = B[bi+1*2+0];
- B1i = B[bi+1*2+1];
- B2r = B[bi+2*2+0];
- B2i = B[bi+2*2+1];
- B3r = B[bi+3*2+0];
- B3i = B[bi+3*2+1];
- B4r = B[bi+4*2+0];
- B4i = B[bi+4*2+1];
- B5r = B[bi+5*2+0];
- B5i = B[bi+5*2+1];
- B6r = B[bi+6*2+0];
- B6i = B[bi+6*2+1];
- B7r = B[bi+7*2+0];
- B7i = B[bi+7*2+1];
- bi += 8*2;
-
- A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
- A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
- ai += 8*2;
-
- tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
- tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
- tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
- tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
- tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
- tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
- tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
- tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
- tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
- tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
- tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
- tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
- tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
- tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
- tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
- tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
- ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
- ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
- ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
- ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
- ACC2r = __riscv_vfadd( ACC2r, tmp2r, gvl);
- ACC2i = __riscv_vfadd( ACC2i, tmp2i, gvl);
- ACC3r = __riscv_vfadd( ACC3r, tmp3r, gvl);
- ACC3i = __riscv_vfadd( ACC3i, tmp3i, gvl);
- tmp0r = __riscv_vfmul_vf_f32m1( A0i, B4i, gvl);
- tmp0i = __riscv_vfmul_vf_f32m1( A0r, B4i, gvl);
- tmp1r = __riscv_vfmul_vf_f32m1( A0i, B5i, gvl);
- tmp1i = __riscv_vfmul_vf_f32m1( A0r, B5i, gvl);
- tmp2r = __riscv_vfmul_vf_f32m1( A0i, B6i, gvl);
- tmp2i = __riscv_vfmul_vf_f32m1( A0r, B6i, gvl);
- tmp3r = __riscv_vfmul_vf_f32m1( A0i, B7i, gvl);
- tmp3i = __riscv_vfmul_vf_f32m1( A0r, B7i, gvl);
- tmp0r = VFMACC_RR( tmp0r, B4r, A0r, gvl);
- tmp0i = VFMACC_RI( tmp0i, B4r, A0i, gvl);
- tmp1r = VFMACC_RR( tmp1r, B5r, A0r, gvl);
- tmp1i = VFMACC_RI( tmp1i, B5r, A0i, gvl);
- tmp2r = VFMACC_RR( tmp2r, B6r, A0r, gvl);
- tmp2i = VFMACC_RI( tmp2i, B6r, A0i, gvl);
- tmp3r = VFMACC_RR( tmp3r, B7r, A0r, gvl);
- tmp3i = VFMACC_RI( tmp3i, B7r, A0i, gvl);
- ACC4r = __riscv_vfadd( ACC4r, tmp0r, gvl);
- ACC4i = __riscv_vfadd( ACC4i, tmp0i, gvl);
- ACC5r = __riscv_vfadd( ACC5r, tmp1r, gvl);
- ACC5i = __riscv_vfadd( ACC5i, tmp1i, gvl);
- ACC6r = __riscv_vfadd( ACC6r, tmp2r, gvl);
- ACC6i = __riscv_vfadd( ACC6i, tmp2i, gvl);
- ACC7r = __riscv_vfadd( ACC7r, tmp3r, gvl);
- ACC7i = __riscv_vfadd( ACC7i, tmp3i, gvl);
- }
-
-
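- // apply complex alpha (Cr = alphar*ACCr - alphai*ACCi, Ci = alphar*ACCi + alphai*ACCr)
- // and store each column back through strided stores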
- BLASLONG ci=n_top*ldc+m_top;
-
- vfloat32m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
- vfloat32m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
- vfloat32m1_t C1r = __riscv_vfmul( ACC1r, alphar, gvl );
- vfloat32m1_t C1i = __riscv_vfmul( ACC1i, alphar, gvl );
- vfloat32m1_t C2r = __riscv_vfmul( ACC2r, alphar, gvl );
- vfloat32m1_t C2i = __riscv_vfmul( ACC2i, alphar, gvl );
- vfloat32m1_t C3r = __riscv_vfmul( ACC3r, alphar, gvl );
- vfloat32m1_t C3i = __riscv_vfmul( ACC3i, alphar, gvl );
- vfloat32m1_t C4r = __riscv_vfmul( ACC4r, alphar, gvl );
- vfloat32m1_t C4i = __riscv_vfmul( ACC4i, alphar, gvl );
- vfloat32m1_t C5r = __riscv_vfmul( ACC5r, alphar, gvl );
- vfloat32m1_t C5i = __riscv_vfmul( ACC5i, alphar, gvl );
- vfloat32m1_t C6r = __riscv_vfmul( ACC6r, alphar, gvl );
- vfloat32m1_t C6i = __riscv_vfmul( ACC6i, alphar, gvl );
- vfloat32m1_t C7r = __riscv_vfmul( ACC7r, alphar, gvl );
- vfloat32m1_t C7i = __riscv_vfmul( ACC7i, alphar, gvl );
- C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
- C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
- C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
- C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
- C2r = __riscv_vfnmsac( C2r, alphai, ACC2i, gvl );
- C2i = __riscv_vfmacc ( C2i, alphai, ACC2r, gvl );
- C3r = __riscv_vfnmsac( C3r, alphai, ACC3i, gvl );
- C3i = __riscv_vfmacc ( C3i, alphai, ACC3r, gvl );
- C4r = __riscv_vfnmsac( C4r, alphai, ACC4i, gvl );
- C4i = __riscv_vfmacc ( C4i, alphai, ACC4r, gvl );
- C5r = __riscv_vfnmsac( C5r, alphai, ACC5i, gvl );
- C5i = __riscv_vfmacc ( C5i, alphai, ACC5r, gvl );
- C6r = __riscv_vfnmsac( C6r, alphai, ACC6i, gvl );
- C6i = __riscv_vfmacc ( C6i, alphai, ACC6r, gvl );
- C7r = __riscv_vfnmsac( C7r, alphai, ACC7i, gvl );
- C7i = __riscv_vfmacc ( C7i, alphai, ACC7r, gvl );
- __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
- __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
- ci += ldc;
- __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
- __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
- ci += ldc;
- __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C2r, gvl);
- __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C2i, gvl);
- ci += ldc;
- __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C3r, gvl);
- __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C3i, gvl);
- ci += ldc;
- __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C4r, gvl);
- __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C4i, gvl);
- ci += ldc;
- __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C5r, gvl);
- __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C5i, gvl);
- ci += ldc;
- __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C6r, gvl);
- __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C6i, gvl);
- ci += ldc;
- __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C7r, gvl);
- __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C7i, gvl);
-
- m_top += 8;
- }
-
-
-
- // -- M tails for the N=8 main pass (4-, 2-, then 1-row strips)
-
- if( M & 4 ) {
- gvl = __riscv_vsetvl_e32m1(4);
-
- BLASLONG ai=m_top*K*2;
- BLASLONG bi=n_top*K*2;
- BLASLONG pass_K = K;
- #ifdef LEFT
- BLASLONG off = offset + m_top;
- #else
- BLASLONG off = -offset + n_top;
- #endif
- #ifdef BACKWARDS
- ai += off*4*2;
- bi += off*8*2;
- pass_K -= off;
- #else
- #ifdef LEFT
- pass_K = off + 4;
- #else
- pass_K = off + 8;
- #endif
- #endif
- float B0r = B[bi+0*2+0];
- float B0i = B[bi+0*2+1];
- float B1r = B[bi+1*2+0];
- float B1i = B[bi+1*2+1];
- float B2r = B[bi+2*2+0];
- float B2i = B[bi+2*2+1];
- float B3r = B[bi+3*2+0];
- float B3i = B[bi+3*2+1];
- float B4r = B[bi+4*2+0];
- float B4i = B[bi+4*2+1];
- float B5r = B[bi+5*2+0];
- float B5i = B[bi+5*2+1];
- float B6r = B[bi+6*2+0];
- float B6i = B[bi+6*2+1];
- float B7r = B[bi+7*2+0];
- float B7i = B[bi+7*2+1];
- bi += 8*2;
-
- vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
- vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
- ai += 4*2;
-
- // 2 vector regs to hold A array contents, 16 regs to hold values accumulated over k
- // leaving 14 vector registers for temporaries
- // performing 4 operations between reuses of temporaries
- vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
- vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
- vfloat32m1_t tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
- vfloat32m1_t tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
- vfloat32m1_t tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
- vfloat32m1_t tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
- vfloat32m1_t tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
- vfloat32m1_t tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
- tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
- tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
- tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
- tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
- tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
- tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
- tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
- tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
- vfloat32m1_t ACC0r = tmp0r;
- vfloat32m1_t ACC0i = tmp0i;
- vfloat32m1_t ACC1r = tmp1r;
- vfloat32m1_t ACC1i = tmp1i;
- vfloat32m1_t ACC2r = tmp2r;
- vfloat32m1_t ACC2i = tmp2i;
- vfloat32m1_t ACC3r = tmp3r;
- vfloat32m1_t ACC3i = tmp3i;
- tmp0r = __riscv_vfmul_vf_f32m1( A0i, B4i, gvl);
- tmp0i = __riscv_vfmul_vf_f32m1( A0r, B4i, gvl);
- tmp1r = __riscv_vfmul_vf_f32m1( A0i, B5i, gvl);
- tmp1i = __riscv_vfmul_vf_f32m1( A0r, B5i, gvl);
- tmp2r = __riscv_vfmul_vf_f32m1( A0i, B6i, gvl);
- tmp2i = __riscv_vfmul_vf_f32m1( A0r, B6i, gvl);
- tmp3r = __riscv_vfmul_vf_f32m1( A0i, B7i, gvl);
- tmp3i = __riscv_vfmul_vf_f32m1( A0r, B7i, gvl);
- tmp0r = VFMACC_RR( tmp0r, B4r, A0r, gvl);
- tmp0i = VFMACC_RI( tmp0i, B4r, A0i, gvl);
- tmp1r = VFMACC_RR( tmp1r, B5r, A0r, gvl);
- tmp1i = VFMACC_RI( tmp1i, B5r, A0i, gvl);
- tmp2r = VFMACC_RR( tmp2r, B6r, A0r, gvl);
- tmp2i = VFMACC_RI( tmp2i, B6r, A0i, gvl);
- tmp3r = VFMACC_RR( tmp3r, B7r, A0r, gvl);
- tmp3i = VFMACC_RI( tmp3i, B7r, A0i, gvl);
- vfloat32m1_t ACC4r = tmp0r;
- vfloat32m1_t ACC4i = tmp0i;
- vfloat32m1_t ACC5r = tmp1r;
- vfloat32m1_t ACC5i = tmp1i;
- vfloat32m1_t ACC6r = tmp2r;
- vfloat32m1_t ACC6i = tmp2i;
- vfloat32m1_t ACC7r = tmp3r;
- vfloat32m1_t ACC7i = tmp3i;
-
- for(BLASLONG k=1; k<pass_K; k++) {
- B0r = B[bi+0*2+0];
- B0i = B[bi+0*2+1];
- B1r = B[bi+1*2+0];
- B1i = B[bi+1*2+1];
- B2r = B[bi+2*2+0];
- B2i = B[bi+2*2+1];
- B3r = B[bi+3*2+0];
- B3i = B[bi+3*2+1];
- B4r = B[bi+4*2+0];
- B4i = B[bi+4*2+1];
- B5r = B[bi+5*2+0];
- B5i = B[bi+5*2+1];
- B6r = B[bi+6*2+0];
- B6i = B[bi+6*2+1];
- B7r = B[bi+7*2+0];
- B7i = B[bi+7*2+1];
- bi += 8*2;
-
- A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
- A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
- ai += 4*2;
-
- tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
- tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
- tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
- tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
- tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
- tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
- tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
- tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
- tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
- tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
- tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
- tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
- tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
- tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
- tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
- tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
- ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
- ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
- ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
- ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
- ACC2r = __riscv_vfadd( ACC2r, tmp2r, gvl);
- ACC2i = __riscv_vfadd( ACC2i, tmp2i, gvl);
- ACC3r = __riscv_vfadd( ACC3r, tmp3r, gvl);
- ACC3i = __riscv_vfadd( ACC3i, tmp3i, gvl);
- tmp0r = __riscv_vfmul_vf_f32m1( A0i, B4i, gvl);
- tmp0i = __riscv_vfmul_vf_f32m1( A0r, B4i, gvl);
- tmp1r = __riscv_vfmul_vf_f32m1( A0i, B5i, gvl);
- tmp1i = __riscv_vfmul_vf_f32m1( A0r, B5i, gvl);
- tmp2r = __riscv_vfmul_vf_f32m1( A0i, B6i, gvl);
- tmp2i = __riscv_vfmul_vf_f32m1( A0r, B6i, gvl);
- tmp3r = __riscv_vfmul_vf_f32m1( A0i, B7i, gvl);
- tmp3i = __riscv_vfmul_vf_f32m1( A0r, B7i, gvl);
- tmp0r = VFMACC_RR( tmp0r, B4r, A0r, gvl);
- tmp0i = VFMACC_RI( tmp0i, B4r, A0i, gvl);
- tmp1r = VFMACC_RR( tmp1r, B5r, A0r, gvl);
- tmp1i = VFMACC_RI( tmp1i, B5r, A0i, gvl);
- tmp2r = VFMACC_RR( tmp2r, B6r, A0r, gvl);
- tmp2i = VFMACC_RI( tmp2i, B6r, A0i, gvl);
- tmp3r = VFMACC_RR( tmp3r, B7r, A0r, gvl);
- tmp3i = VFMACC_RI( tmp3i, B7r, A0i, gvl);
- ACC4r = __riscv_vfadd( ACC4r, tmp0r, gvl);
- ACC4i = __riscv_vfadd( ACC4i, tmp0i, gvl);
- ACC5r = __riscv_vfadd( ACC5r, tmp1r, gvl);
- ACC5i = __riscv_vfadd( ACC5i, tmp1i, gvl);
- ACC6r = __riscv_vfadd( ACC6r, tmp2r, gvl);
- ACC6i = __riscv_vfadd( ACC6i, tmp2i, gvl);
- ACC7r = __riscv_vfadd( ACC7r, tmp3r, gvl);
- ACC7i = __riscv_vfadd( ACC7i, tmp3i, gvl);
- }
-
-
- BLASLONG ci=n_top*ldc+m_top;
-
- vfloat32m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
- vfloat32m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
- vfloat32m1_t C1r = __riscv_vfmul( ACC1r, alphar, gvl );
- vfloat32m1_t C1i = __riscv_vfmul( ACC1i, alphar, gvl );
- vfloat32m1_t C2r = __riscv_vfmul( ACC2r, alphar, gvl );
- vfloat32m1_t C2i = __riscv_vfmul( ACC2i, alphar, gvl );
- vfloat32m1_t C3r = __riscv_vfmul( ACC3r, alphar, gvl );
- vfloat32m1_t C3i = __riscv_vfmul( ACC3i, alphar, gvl );
- vfloat32m1_t C4r = __riscv_vfmul( ACC4r, alphar, gvl );
- vfloat32m1_t C4i = __riscv_vfmul( ACC4i, alphar, gvl );
- vfloat32m1_t C5r = __riscv_vfmul( ACC5r, alphar, gvl );
- vfloat32m1_t C5i = __riscv_vfmul( ACC5i, alphar, gvl );
- vfloat32m1_t C6r = __riscv_vfmul( ACC6r, alphar, gvl );
- vfloat32m1_t C6i = __riscv_vfmul( ACC6i, alphar, gvl );
- vfloat32m1_t C7r = __riscv_vfmul( ACC7r, alphar, gvl );
- vfloat32m1_t C7i = __riscv_vfmul( ACC7i, alphar, gvl );
- C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
- C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
- C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
- C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
- C2r = __riscv_vfnmsac( C2r, alphai, ACC2i, gvl );
- C2i = __riscv_vfmacc ( C2i, alphai, ACC2r, gvl );
- C3r = __riscv_vfnmsac( C3r, alphai, ACC3i, gvl );
- C3i = __riscv_vfmacc ( C3i, alphai, ACC3r, gvl );
- C4r = __riscv_vfnmsac( C4r, alphai, ACC4i, gvl );
- C4i = __riscv_vfmacc ( C4i, alphai, ACC4r, gvl );
- C5r = __riscv_vfnmsac( C5r, alphai, ACC5i, gvl );
- C5i = __riscv_vfmacc ( C5i, alphai, ACC5r, gvl );
- C6r = __riscv_vfnmsac( C6r, alphai, ACC6i, gvl );
- C6i = __riscv_vfmacc ( C6i, alphai, ACC6r, gvl );
- C7r = __riscv_vfnmsac( C7r, alphai, ACC7i, gvl );
- C7i = __riscv_vfmacc ( C7i, alphai, ACC7r, gvl );
- __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
- __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
- ci += ldc;
- __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
- __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
- ci += ldc;
- __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C2r, gvl);
- __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C2i, gvl);
- ci += ldc;
- __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C3r, gvl);
- __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C3i, gvl);
- ci += ldc;
- __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C4r, gvl);
- __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C4i, gvl);
- ci += ldc;
- __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C5r, gvl);
- __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C5i, gvl);
- ci += ldc;
- __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C6r, gvl);
- __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C6i, gvl);
- ci += ldc;
- __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C7r, gvl);
- __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C7i, gvl);
-
- m_top += 4;
- }
-
-
- if( M & 2 ) {
- gvl = __riscv_vsetvl_e32m1(2);
-
- BLASLONG ai=m_top*K*2;
- BLASLONG bi=n_top*K*2;
- BLASLONG pass_K = K;
- #ifdef LEFT
- BLASLONG off = offset + m_top;
- #else
- BLASLONG off = -offset + n_top;
- #endif
- #ifdef BACKWARDS
- ai += off*2*2;
- bi += off*8*2;
- pass_K -= off;
- #else
- #ifdef LEFT
- pass_K = off + 2;
- #else
- pass_K = off + 8;
- #endif
- #endif
- float B0r = B[bi+0*2+0];
- float B0i = B[bi+0*2+1];
- float B1r = B[bi+1*2+0];
- float B1i = B[bi+1*2+1];
- float B2r = B[bi+2*2+0];
- float B2i = B[bi+2*2+1];
- float B3r = B[bi+3*2+0];
- float B3i = B[bi+3*2+1];
- float B4r = B[bi+4*2+0];
- float B4i = B[bi+4*2+1];
- float B5r = B[bi+5*2+0];
- float B5i = B[bi+5*2+1];
- float B6r = B[bi+6*2+0];
- float B6i = B[bi+6*2+1];
- float B7r = B[bi+7*2+0];
- float B7i = B[bi+7*2+1];
- bi += 8*2;
-
- vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
- vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
- ai += 2*2;
-
- // 2 vector regs to hold A array contents, 16 regs to hold values accumulated over k
- // leaving 14 vector registers for temporaries
- // performing 4 operations between reuses of temporaries
- vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
- vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
- vfloat32m1_t tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
- vfloat32m1_t tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
- vfloat32m1_t tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
- vfloat32m1_t tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
- vfloat32m1_t tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
- vfloat32m1_t tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
- tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
- tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
- tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
- tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
- tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
- tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
- tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
- tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
- vfloat32m1_t ACC0r = tmp0r;
- vfloat32m1_t ACC0i = tmp0i;
- vfloat32m1_t ACC1r = tmp1r;
- vfloat32m1_t ACC1i = tmp1i;
- vfloat32m1_t ACC2r = tmp2r;
- vfloat32m1_t ACC2i = tmp2i;
- vfloat32m1_t ACC3r = tmp3r;
- vfloat32m1_t ACC3i = tmp3i;
- tmp0r = __riscv_vfmul_vf_f32m1( A0i, B4i, gvl);
- tmp0i = __riscv_vfmul_vf_f32m1( A0r, B4i, gvl);
- tmp1r = __riscv_vfmul_vf_f32m1( A0i, B5i, gvl);
- tmp1i = __riscv_vfmul_vf_f32m1( A0r, B5i, gvl);
- tmp2r = __riscv_vfmul_vf_f32m1( A0i, B6i, gvl);
- tmp2i = __riscv_vfmul_vf_f32m1( A0r, B6i, gvl);
- tmp3r = __riscv_vfmul_vf_f32m1( A0i, B7i, gvl);
- tmp3i = __riscv_vfmul_vf_f32m1( A0r, B7i, gvl);
- tmp0r = VFMACC_RR( tmp0r, B4r, A0r, gvl);
- tmp0i = VFMACC_RI( tmp0i, B4r, A0i, gvl);
- tmp1r = VFMACC_RR( tmp1r, B5r, A0r, gvl);
- tmp1i = VFMACC_RI( tmp1i, B5r, A0i, gvl);
- tmp2r = VFMACC_RR( tmp2r, B6r, A0r, gvl);
- tmp2i = VFMACC_RI( tmp2i, B6r, A0i, gvl);
- tmp3r = VFMACC_RR( tmp3r, B7r, A0r, gvl);
- tmp3i = VFMACC_RI( tmp3i, B7r, A0i, gvl);
- vfloat32m1_t ACC4r = tmp0r;
- vfloat32m1_t ACC4i = tmp0i;
- vfloat32m1_t ACC5r = tmp1r;
- vfloat32m1_t ACC5i = tmp1i;
- vfloat32m1_t ACC6r = tmp2r;
- vfloat32m1_t ACC6i = tmp2i;
- vfloat32m1_t ACC7r = tmp3r;
- vfloat32m1_t ACC7i = tmp3i;
-
- for(BLASLONG k=1; k<pass_K; k++) {
- B0r = B[bi+0*2+0];
- B0i = B[bi+0*2+1];
- B1r = B[bi+1*2+0];
- B1i = B[bi+1*2+1];
- B2r = B[bi+2*2+0];
- B2i = B[bi+2*2+1];
- B3r = B[bi+3*2+0];
- B3i = B[bi+3*2+1];
- B4r = B[bi+4*2+0];
- B4i = B[bi+4*2+1];
- B5r = B[bi+5*2+0];
- B5i = B[bi+5*2+1];
- B6r = B[bi+6*2+0];
- B6i = B[bi+6*2+1];
- B7r = B[bi+7*2+0];
- B7i = B[bi+7*2+1];
- bi += 8*2;
-
- A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
- A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
- ai += 2*2;
-
- tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
- tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
- tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
- tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
- tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
- tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
- tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
- tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
- tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
- tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
- tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
- tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
- tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
- tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
- tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
- tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
- ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
- ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
- ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
- ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
- ACC2r = __riscv_vfadd( ACC2r, tmp2r, gvl);
- ACC2i = __riscv_vfadd( ACC2i, tmp2i, gvl);
- ACC3r = __riscv_vfadd( ACC3r, tmp3r, gvl);
- ACC3i = __riscv_vfadd( ACC3i, tmp3i, gvl);
- tmp0r = __riscv_vfmul_vf_f32m1( A0i, B4i, gvl);
- tmp0i = __riscv_vfmul_vf_f32m1( A0r, B4i, gvl);
- tmp1r = __riscv_vfmul_vf_f32m1( A0i, B5i, gvl);
- tmp1i = __riscv_vfmul_vf_f32m1( A0r, B5i, gvl);
- tmp2r = __riscv_vfmul_vf_f32m1( A0i, B6i, gvl);
- tmp2i = __riscv_vfmul_vf_f32m1( A0r, B6i, gvl);
- tmp3r = __riscv_vfmul_vf_f32m1( A0i, B7i, gvl);
- tmp3i = __riscv_vfmul_vf_f32m1( A0r, B7i, gvl);
- tmp0r = VFMACC_RR( tmp0r, B4r, A0r, gvl);
- tmp0i = VFMACC_RI( tmp0i, B4r, A0i, gvl);
- tmp1r = VFMACC_RR( tmp1r, B5r, A0r, gvl);
- tmp1i = VFMACC_RI( tmp1i, B5r, A0i, gvl);
- tmp2r = VFMACC_RR( tmp2r, B6r, A0r, gvl);
- tmp2i = VFMACC_RI( tmp2i, B6r, A0i, gvl);
- tmp3r = VFMACC_RR( tmp3r, B7r, A0r, gvl);
- tmp3i = VFMACC_RI( tmp3i, B7r, A0i, gvl);
- ACC4r = __riscv_vfadd( ACC4r, tmp0r, gvl);
- ACC4i = __riscv_vfadd( ACC4i, tmp0i, gvl);
- ACC5r = __riscv_vfadd( ACC5r, tmp1r, gvl);
- ACC5i = __riscv_vfadd( ACC5i, tmp1i, gvl);
- ACC6r = __riscv_vfadd( ACC6r, tmp2r, gvl);
- ACC6i = __riscv_vfadd( ACC6i, tmp2i, gvl);
- ACC7r = __riscv_vfadd( ACC7r, tmp3r, gvl);
- ACC7i = __riscv_vfadd( ACC7i, tmp3i, gvl);
- }
-
-
- BLASLONG ci=n_top*ldc+m_top;
-
- vfloat32m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
- vfloat32m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
- vfloat32m1_t C1r = __riscv_vfmul( ACC1r, alphar, gvl );
- vfloat32m1_t C1i = __riscv_vfmul( ACC1i, alphar, gvl );
- vfloat32m1_t C2r = __riscv_vfmul( ACC2r, alphar, gvl );
- vfloat32m1_t C2i = __riscv_vfmul( ACC2i, alphar, gvl );
- vfloat32m1_t C3r = __riscv_vfmul( ACC3r, alphar, gvl );
- vfloat32m1_t C3i = __riscv_vfmul( ACC3i, alphar, gvl );
- vfloat32m1_t C4r = __riscv_vfmul( ACC4r, alphar, gvl );
- vfloat32m1_t C4i = __riscv_vfmul( ACC4i, alphar, gvl );
- vfloat32m1_t C5r = __riscv_vfmul( ACC5r, alphar, gvl );
- vfloat32m1_t C5i = __riscv_vfmul( ACC5i, alphar, gvl );
- vfloat32m1_t C6r = __riscv_vfmul( ACC6r, alphar, gvl );
- vfloat32m1_t C6i = __riscv_vfmul( ACC6i, alphar, gvl );
- vfloat32m1_t C7r = __riscv_vfmul( ACC7r, alphar, gvl );
- vfloat32m1_t C7i = __riscv_vfmul( ACC7i, alphar, gvl );
- C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
- C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
- C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
- C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
- C2r = __riscv_vfnmsac( C2r, alphai, ACC2i, gvl );
- C2i = __riscv_vfmacc ( C2i, alphai, ACC2r, gvl );
- C3r = __riscv_vfnmsac( C3r, alphai, ACC3i, gvl );
- C3i = __riscv_vfmacc ( C3i, alphai, ACC3r, gvl );
- C4r = __riscv_vfnmsac( C4r, alphai, ACC4i, gvl );
- C4i = __riscv_vfmacc ( C4i, alphai, ACC4r, gvl );
- C5r = __riscv_vfnmsac( C5r, alphai, ACC5i, gvl );
- C5i = __riscv_vfmacc ( C5i, alphai, ACC5r, gvl );
- C6r = __riscv_vfnmsac( C6r, alphai, ACC6i, gvl );
- C6i = __riscv_vfmacc ( C6i, alphai, ACC6r, gvl );
- C7r = __riscv_vfnmsac( C7r, alphai, ACC7i, gvl );
- C7i = __riscv_vfmacc ( C7i, alphai, ACC7r, gvl );
- __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
- __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
- ci += ldc;
- __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
- __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
- ci += ldc;
- __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C2r, gvl);
- __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C2i, gvl);
- ci += ldc;
- __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C3r, gvl);
- __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C3i, gvl);
- ci += ldc;
- __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C4r, gvl);
- __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C4i, gvl);
- ci += ldc;
- __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C5r, gvl);
- __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C5i, gvl);
- ci += ldc;
- __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C6r, gvl);
- __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C6i, gvl);
- ci += ldc;
- __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C7r, gvl);
- __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C7i, gvl);
-
- m_top += 2;
- }
-
-
- if( M & 1 ) {
- float result0 = 0;
- float result1 = 0;
- float result2 = 0;
- float result3 = 0;
- float result4 = 0;
- float result5 = 0;
- float result6 = 0;
- float result7 = 0;
- float result8 = 0;
- float result9 = 0;
- float result10 = 0;
- float result11 = 0;
- float result12 = 0;
- float result13 = 0;
- float result14 = 0;
- float result15 = 0;
- BLASLONG ai=m_top*K*2;
- BLASLONG bi=n_top*K*2;
- BLASLONG pass_K = K;
- #ifdef LEFT
- BLASLONG off = offset + m_top;
- #else
- BLASLONG off = -offset + n_top;
- #endif
- #ifdef BACKWARDS
- ai += off*1*2;
- bi += off*8*2;
- pass_K -= off;
- #else
- #ifdef LEFT
- pass_K = off + 1;
- #else
- pass_K = off + 8;
- #endif
- #endif
-
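- // single-row remainder: scalar complex dot products using the S0..S3 signs
- // defined at the top of the file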
- for(BLASLONG k=0; k<pass_K; k++) {
- result0+=S0*A[ai+0+0]*B[bi+0+0] + S1*A[ai+0+1]*B[bi+0+1];
- result1+=S2*A[ai+0+1]*B[bi+0+0] + S3*A[ai+0+0]*B[bi+0+1];
- result2+=S0*A[ai+0+0]*B[bi+2+0] + S1*A[ai+0+1]*B[bi+2+1];
- result3+=S2*A[ai+0+1]*B[bi+2+0] + S3*A[ai+0+0]*B[bi+2+1];
- result4+=S0*A[ai+0+0]*B[bi+4+0] + S1*A[ai+0+1]*B[bi+4+1];
- result5+=S2*A[ai+0+1]*B[bi+4+0] + S3*A[ai+0+0]*B[bi+4+1];
- result6+=S0*A[ai+0+0]*B[bi+6+0] + S1*A[ai+0+1]*B[bi+6+1];
- result7+=S2*A[ai+0+1]*B[bi+6+0] + S3*A[ai+0+0]*B[bi+6+1];
- result8+=S0*A[ai+0+0]*B[bi+8+0] + S1*A[ai+0+1]*B[bi+8+1];
- result9+=S2*A[ai+0+1]*B[bi+8+0] + S3*A[ai+0+0]*B[bi+8+1];
- result10+=S0*A[ai+0+0]*B[bi+10+0] + S1*A[ai+0+1]*B[bi+10+1];
- result11+=S2*A[ai+0+1]*B[bi+10+0] + S3*A[ai+0+0]*B[bi+10+1];
- result12+=S0*A[ai+0+0]*B[bi+12+0] + S1*A[ai+0+1]*B[bi+12+1];
- result13+=S2*A[ai+0+1]*B[bi+12+0] + S3*A[ai+0+0]*B[bi+12+1];
- result14+=S0*A[ai+0+0]*B[bi+14+0] + S1*A[ai+0+1]*B[bi+14+1];
- result15+=S2*A[ai+0+1]*B[bi+14+0] + S3*A[ai+0+0]*B[bi+14+1];
- ai+=1*2;
- bi+=8*2;
- }
-
- BLASLONG ci=n_top*ldc+m_top;
- float Cr, Ci;
- Cr = result0*alphar;
- Ci = result1*alphar;
- Cr -= result1*alphai;
- Ci += result0*alphai;
- C[(ci+0*ldc+0)*2+0] = Cr;
- C[(ci+0*ldc+0)*2+1] = Ci;
- Cr = result2*alphar;
- Ci = result3*alphar;
- Cr -= result3*alphai;
- Ci += result2*alphai;
- C[(ci+1*ldc+0)*2+0] = Cr;
- C[(ci+1*ldc+0)*2+1] = Ci;
- Cr = result4*alphar;
- Ci = result5*alphar;
- Cr -= result5*alphai;
- Ci += result4*alphai;
- C[(ci+2*ldc+0)*2+0] = Cr;
- C[(ci+2*ldc+0)*2+1] = Ci;
- Cr = result6*alphar;
- Ci = result7*alphar;
- Cr -= result7*alphai;
- Ci += result6*alphai;
- C[(ci+3*ldc+0)*2+0] = Cr;
- C[(ci+3*ldc+0)*2+1] = Ci;
- Cr = result8*alphar;
- Ci = result9*alphar;
- Cr -= result9*alphai;
- Ci += result8*alphai;
- C[(ci+4*ldc+0)*2+0] = Cr;
- C[(ci+4*ldc+0)*2+1] = Ci;
- Cr = result10*alphar;
- Ci = result11*alphar;
- Cr -= result11*alphai;
- Ci += result10*alphai;
- C[(ci+5*ldc+0)*2+0] = Cr;
- C[(ci+5*ldc+0)*2+1] = Ci;
- Cr = result12*alphar;
- Ci = result13*alphar;
- Cr -= result13*alphai;
- Ci += result12*alphai;
- C[(ci+6*ldc+0)*2+0] = Cr;
- C[(ci+6*ldc+0)*2+1] = Ci;
- Cr = result14*alphar;
- Ci = result15*alphar;
- Cr -= result15*alphai;
- Ci += result14*alphai;
- C[(ci+7*ldc+0)*2+0] = Cr;
- C[(ci+7*ldc+0)*2+1] = Ci;
- m_top+=1;
- }
-
- n_top += 8;
- }
-
-
-
- // -- N & 4 tail: same pattern with four remaining columns of B
-
- if( N & 4 ) {
- gvl = __riscv_vsetvl_e32m1(8);
- m_top = 0;
-
- for (BLASLONG i=0; i<M/8; i+=1) {
- BLASLONG ai=m_top*K*2;
- BLASLONG bi=n_top*K*2;
- BLASLONG pass_K = K;
- #ifdef LEFT
- BLASLONG off = offset + m_top;
- #else
- BLASLONG off = -offset + n_top;
- #endif
- #ifdef BACKWARDS
- ai += off*8*2;
- bi += off*4*2;
- pass_K -= off;
- #else
- #ifdef LEFT
- pass_K = off + 8;
- #else
- pass_K = off + 4;
- #endif
- #endif
- float B0r = B[bi+0*2+0];
- float B0i = B[bi+0*2+1];
- float B1r = B[bi+1*2+0];
- float B1i = B[bi+1*2+1];
- float B2r = B[bi+2*2+0];
- float B2i = B[bi+2*2+1];
- float B3r = B[bi+3*2+0];
- float B3i = B[bi+3*2+1];
- bi += 4*2;
-
- vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
- vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
- ai += 8*2;
-
- // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k
- // leaving 22 vector registers for temporaries
- vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
- vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
- vfloat32m1_t tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
- vfloat32m1_t tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
- vfloat32m1_t tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
- vfloat32m1_t tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
- vfloat32m1_t tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
- vfloat32m1_t tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
- tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
- tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
- tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
- tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
- tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
- tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
- tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
- tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
- vfloat32m1_t ACC0r = tmp0r;
- vfloat32m1_t ACC0i = tmp0i;
- vfloat32m1_t ACC1r = tmp1r;
- vfloat32m1_t ACC1i = tmp1i;
- vfloat32m1_t ACC2r = tmp2r;
- vfloat32m1_t ACC2i = tmp2i;
- vfloat32m1_t ACC3r = tmp3r;
- vfloat32m1_t ACC3i = tmp3i;
-
- for(BLASLONG k=1; k<pass_K; k++) {
- B0r = B[bi+0*2+0];
- B0i = B[bi+0*2+1];
- B1r = B[bi+1*2+0];
- B1i = B[bi+1*2+1];
- B2r = B[bi+2*2+0];
- B2i = B[bi+2*2+1];
- B3r = B[bi+3*2+0];
- B3i = B[bi+3*2+1];
- bi += 4*2;
-
- A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
- A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
- ai += 8*2;
-
- tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
- tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
- tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
- tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
- tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
- tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
- tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
- tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
- tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
- tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
- tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
- tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
- tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
- tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
- tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
- tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
- ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
- ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
- ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
- ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
- ACC2r = __riscv_vfadd( ACC2r, tmp2r, gvl);
- ACC2i = __riscv_vfadd( ACC2i, tmp2i, gvl);
- ACC3r = __riscv_vfadd( ACC3r, tmp3r, gvl);
- ACC3i = __riscv_vfadd( ACC3i, tmp3i, gvl);
- }
-
-
- BLASLONG ci=n_top*ldc+m_top;
-
- vfloat32m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
- vfloat32m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
- vfloat32m1_t C1r = __riscv_vfmul( ACC1r, alphar, gvl );
- vfloat32m1_t C1i = __riscv_vfmul( ACC1i, alphar, gvl );
- vfloat32m1_t C2r = __riscv_vfmul( ACC2r, alphar, gvl );
- vfloat32m1_t C2i = __riscv_vfmul( ACC2i, alphar, gvl );
- vfloat32m1_t C3r = __riscv_vfmul( ACC3r, alphar, gvl );
- vfloat32m1_t C3i = __riscv_vfmul( ACC3i, alphar, gvl );
- C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
- C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
- C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
- C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
- C2r = __riscv_vfnmsac( C2r, alphai, ACC2i, gvl );
- C2i = __riscv_vfmacc ( C2i, alphai, ACC2r, gvl );
- C3r = __riscv_vfnmsac( C3r, alphai, ACC3i, gvl );
- C3i = __riscv_vfmacc ( C3i, alphai, ACC3r, gvl );
- __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
- __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
- ci += ldc;
- __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
- __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
- ci += ldc;
- __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C2r, gvl);
- __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C2i, gvl);
- ci += ldc;
- __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C3r, gvl);
- __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C3i, gvl);
-
- m_top += 8;
- }
-
-
- if( M & 4 ) {
- gvl = __riscv_vsetvl_e32m1(4);
-
- BLASLONG ai=m_top*K*2;
- BLASLONG bi=n_top*K*2;
- BLASLONG pass_K = K;
- #ifdef LEFT
- BLASLONG off = offset + m_top;
- #else
- BLASLONG off = -offset + n_top;
- #endif
- #ifdef BACKWARDS
- ai += off*4*2;
- bi += off*4*2;
- pass_K -= off;
- #else
- pass_K = off + 4; // LEFT and !LEFT agree here: both M and N unroll are 4
- #endif
- float B0r = B[bi+0*2+0];
- float B0i = B[bi+0*2+1];
- float B1r = B[bi+1*2+0];
- float B1i = B[bi+1*2+1];
- float B2r = B[bi+2*2+0];
- float B2i = B[bi+2*2+1];
- float B3r = B[bi+3*2+0];
- float B3i = B[bi+3*2+1];
- bi += 4*2;
-
- vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
- vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
- ai += 4*2;
-
- // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k
- // leaving 22 vector registers for temporaries
- vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
- vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
- vfloat32m1_t tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
- vfloat32m1_t tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
- vfloat32m1_t tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
- vfloat32m1_t tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
- vfloat32m1_t tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
- vfloat32m1_t tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
- tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
- tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
- tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
- tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
- tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
- tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
- tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
- tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
- vfloat32m1_t ACC0r = tmp0r;
- vfloat32m1_t ACC0i = tmp0i;
- vfloat32m1_t ACC1r = tmp1r;
- vfloat32m1_t ACC1i = tmp1i;
- vfloat32m1_t ACC2r = tmp2r;
- vfloat32m1_t ACC2i = tmp2i;
- vfloat32m1_t ACC3r = tmp3r;
- vfloat32m1_t ACC3i = tmp3i;
-
- for(BLASLONG k=1; k<pass_K; k++) {
- B0r = B[bi+0*2+0];
- B0i = B[bi+0*2+1];
- B1r = B[bi+1*2+0];
- B1i = B[bi+1*2+1];
- B2r = B[bi+2*2+0];
- B2i = B[bi+2*2+1];
- B3r = B[bi+3*2+0];
- B3i = B[bi+3*2+1];
- bi += 4*2;
-
- A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
- A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
- ai += 4*2;
-
- tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
- tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
- tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
- tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
- tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
- tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
- tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
- tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
- tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
- tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
- tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
- tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
- tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
- tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
- tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
- tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
- ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
- ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
- ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
- ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
- ACC2r = __riscv_vfadd( ACC2r, tmp2r, gvl);
- ACC2i = __riscv_vfadd( ACC2i, tmp2i, gvl);
- ACC3r = __riscv_vfadd( ACC3r, tmp3r, gvl);
- ACC3i = __riscv_vfadd( ACC3i, tmp3i, gvl);
- }
-
-
- BLASLONG ci=n_top*ldc+m_top;
-
- vfloat32m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
- vfloat32m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
- vfloat32m1_t C1r = __riscv_vfmul( ACC1r, alphar, gvl );
- vfloat32m1_t C1i = __riscv_vfmul( ACC1i, alphar, gvl );
- vfloat32m1_t C2r = __riscv_vfmul( ACC2r, alphar, gvl );
- vfloat32m1_t C2i = __riscv_vfmul( ACC2i, alphar, gvl );
- vfloat32m1_t C3r = __riscv_vfmul( ACC3r, alphar, gvl );
- vfloat32m1_t C3i = __riscv_vfmul( ACC3i, alphar, gvl );
- C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
- C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
- C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
- C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
- C2r = __riscv_vfnmsac( C2r, alphai, ACC2i, gvl );
- C2i = __riscv_vfmacc ( C2i, alphai, ACC2r, gvl );
- C3r = __riscv_vfnmsac( C3r, alphai, ACC3i, gvl );
- C3i = __riscv_vfmacc ( C3i, alphai, ACC3r, gvl );
- __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
- __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
- ci += ldc;
- __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
- __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
- ci += ldc;
- __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C2r, gvl);
- __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C2i, gvl);
- ci += ldc;
- __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C3r, gvl);
- __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C3i, gvl);
-
- m_top += 4;
- }
-
-
- if( M & 2 ) {
- gvl = __riscv_vsetvl_e32m1(2);
-
- BLASLONG ai=m_top*K*2;
- BLASLONG bi=n_top*K*2;
- BLASLONG pass_K = K;
- #ifdef LEFT
- BLASLONG off = offset + m_top;
- #else
- BLASLONG off = -offset + n_top;
- #endif
- #ifdef BACKWARDS
- ai += off*2*2;
- bi += off*4*2;
- pass_K -= off;
- #else
- #ifdef LEFT
- pass_K = off + 2;
- #else
- pass_K = off + 4;
- #endif
- #endif
- float B0r = B[bi+0*2+0];
- float B0i = B[bi+0*2+1];
- float B1r = B[bi+1*2+0];
- float B1i = B[bi+1*2+1];
- float B2r = B[bi+2*2+0];
- float B2i = B[bi+2*2+1];
- float B3r = B[bi+3*2+0];
- float B3i = B[bi+3*2+1];
- bi += 4*2;
-
- vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
- vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
- ai += 2*2;
-
- // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k
- // leaving 22 vector registers for temporaries
- vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
- vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
- vfloat32m1_t tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
- vfloat32m1_t tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
- vfloat32m1_t tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
- vfloat32m1_t tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
- vfloat32m1_t tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
- vfloat32m1_t tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
- tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
- tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
- tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
- tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
- tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
- tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
- tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
- tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
- vfloat32m1_t ACC0r = tmp0r;
- vfloat32m1_t ACC0i = tmp0i;
- vfloat32m1_t ACC1r = tmp1r;
- vfloat32m1_t ACC1i = tmp1i;
- vfloat32m1_t ACC2r = tmp2r;
- vfloat32m1_t ACC2i = tmp2i;
- vfloat32m1_t ACC3r = tmp3r;
- vfloat32m1_t ACC3i = tmp3i;
-
- for(BLASLONG k=1; k<pass_K; k++) {
- B0r = B[bi+0*2+0];
- B0i = B[bi+0*2+1];
- B1r = B[bi+1*2+0];
- B1i = B[bi+1*2+1];
- B2r = B[bi+2*2+0];
- B2i = B[bi+2*2+1];
- B3r = B[bi+3*2+0];
- B3i = B[bi+3*2+1];
- bi += 4*2;
-
- A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
- A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
- ai += 2*2;
-
- tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
- tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
- tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
- tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
- tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
- tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
- tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
- tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
- tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
- tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
- tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
- tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
- tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
- tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
- tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
- tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
- ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
- ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
- ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
- ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
- ACC2r = __riscv_vfadd( ACC2r, tmp2r, gvl);
- ACC2i = __riscv_vfadd( ACC2i, tmp2i, gvl);
- ACC3r = __riscv_vfadd( ACC3r, tmp3r, gvl);
- ACC3i = __riscv_vfadd( ACC3i, tmp3i, gvl);
- }
-
-
- BLASLONG ci=n_top*ldc+m_top;
-
- vfloat32m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
- vfloat32m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
- vfloat32m1_t C1r = __riscv_vfmul( ACC1r, alphar, gvl );
- vfloat32m1_t C1i = __riscv_vfmul( ACC1i, alphar, gvl );
- vfloat32m1_t C2r = __riscv_vfmul( ACC2r, alphar, gvl );
- vfloat32m1_t C2i = __riscv_vfmul( ACC2i, alphar, gvl );
- vfloat32m1_t C3r = __riscv_vfmul( ACC3r, alphar, gvl );
- vfloat32m1_t C3i = __riscv_vfmul( ACC3i, alphar, gvl );
- C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
- C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
- C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
- C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
- C2r = __riscv_vfnmsac( C2r, alphai, ACC2i, gvl );
- C2i = __riscv_vfmacc ( C2i, alphai, ACC2r, gvl );
- C3r = __riscv_vfnmsac( C3r, alphai, ACC3i, gvl );
- C3i = __riscv_vfmacc ( C3i, alphai, ACC3r, gvl );
- __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
- __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
- ci += ldc;
- __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
- __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
- ci += ldc;
- __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C2r, gvl);
- __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C2i, gvl);
- ci += ldc;
- __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C3r, gvl);
- __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C3i, gvl);
-
- m_top += 2;
- }
-
-
- if( M & 1 ) {
- float result0 = 0;
- float result1 = 0;
- float result2 = 0;
- float result3 = 0;
- float result4 = 0;
- float result5 = 0;
- float result6 = 0;
- float result7 = 0;
- BLASLONG ai=m_top*K*2;
- BLASLONG bi=n_top*K*2;
- BLASLONG pass_K = K;
- #ifdef LEFT
- BLASLONG off = offset + m_top;
- #else
- BLASLONG off = -offset + n_top;
- #endif
- #ifdef BACKWARDS
- ai += off*1*2;
- bi += off*4*2;
- pass_K -= off;
- #else
- #ifdef LEFT
- pass_K = off + 1;
- #else
- pass_K = off + 4;
- #endif
- #endif
-
- for(BLASLONG k=0; k<pass_K; k++) {
- result0+=S0*A[ai+0+0]*B[bi+0+0] + S1*A[ai+0+1]*B[bi+0+1];
- result1+=S2*A[ai+0+1]*B[bi+0+0] + S3*A[ai+0+0]*B[bi+0+1];
- result2+=S0*A[ai+0+0]*B[bi+2+0] + S1*A[ai+0+1]*B[bi+2+1];
- result3+=S2*A[ai+0+1]*B[bi+2+0] + S3*A[ai+0+0]*B[bi+2+1];
- result4+=S0*A[ai+0+0]*B[bi+4+0] + S1*A[ai+0+1]*B[bi+4+1];
- result5+=S2*A[ai+0+1]*B[bi+4+0] + S3*A[ai+0+0]*B[bi+4+1];
- result6+=S0*A[ai+0+0]*B[bi+6+0] + S1*A[ai+0+1]*B[bi+6+1];
- result7+=S2*A[ai+0+1]*B[bi+6+0] + S3*A[ai+0+0]*B[bi+6+1];
- ai+=1*2;
- bi+=4*2;
- }
-
- BLASLONG ci=n_top*ldc+m_top;
- float Cr, Ci;
- Cr = result0*alphar;
- Ci = result1*alphar;
- Cr -= result1*alphai;
- Ci += result0*alphai;
- C[(ci+0*ldc+0)*2+0] = Cr;
- C[(ci+0*ldc+0)*2+1] = Ci;
- Cr = result2*alphar;
- Ci = result3*alphar;
- Cr -= result3*alphai;
- Ci += result2*alphai;
- C[(ci+1*ldc+0)*2+0] = Cr;
- C[(ci+1*ldc+0)*2+1] = Ci;
- Cr = result4*alphar;
- Ci = result5*alphar;
- Cr -= result5*alphai;
- Ci += result4*alphai;
- C[(ci+2*ldc+0)*2+0] = Cr;
- C[(ci+2*ldc+0)*2+1] = Ci;
- Cr = result6*alphar;
- Ci = result7*alphar;
- Cr -= result7*alphai;
- Ci += result6*alphai;
- C[(ci+3*ldc+0)*2+0] = Cr;
- C[(ci+3*ldc+0)*2+1] = Ci;
- m_top+=1;
- }
-
- n_top += 4;
- }
-
-
-
- // -- N & 2 tail: two remaining columns of B
-
- if( N & 2 ) {
- gvl = __riscv_vsetvl_e32m1(8);
- m_top = 0;
-
- for (BLASLONG i=0; i<M/8; i+=1) {
- BLASLONG ai=m_top*K*2;
- BLASLONG bi=n_top*K*2;
- BLASLONG pass_K = K;
- #ifdef LEFT
- BLASLONG off = offset + m_top;
- #else
- BLASLONG off = -offset + n_top;
- #endif
- #ifdef BACKWARDS
- ai += off*8*2;
- bi += off*2*2;
- pass_K -= off;
- #else
- #ifdef LEFT
- pass_K = off + 8;
- #else
- pass_K = off + 2;
- #endif
- #endif
- float B0r = B[bi+0*2+0];
- float B0i = B[bi+0*2+1];
- float B1r = B[bi+1*2+0];
- float B1i = B[bi+1*2+1];
- bi += 2*2;
-
- vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
- vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
- ai += 8*2;
-
- // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k
- // leaving 26 vector registers for temporaries
- vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
- vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
- vfloat32m1_t tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
- vfloat32m1_t tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
- tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
- tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
- tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
- tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
- vfloat32m1_t ACC0r = tmp0r;
- vfloat32m1_t ACC0i = tmp0i;
- vfloat32m1_t ACC1r = tmp1r;
- vfloat32m1_t ACC1i = tmp1i;
-
- for(BLASLONG k=1; k<pass_K; k++) {
- B0r = B[bi+0*2+0];
- B0i = B[bi+0*2+1];
- B1r = B[bi+1*2+0];
- B1i = B[bi+1*2+1];
- bi += 2*2;
-
- A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
- A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
- ai += 8*2;
-
- tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
- tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
- tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
- tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
- tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
- tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
- tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
- tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
- ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
- ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
- ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
- ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
- }
-
-
- BLASLONG ci=n_top*ldc+m_top;
-
- vfloat32m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
- vfloat32m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
- vfloat32m1_t C1r = __riscv_vfmul( ACC1r, alphar, gvl );
- vfloat32m1_t C1i = __riscv_vfmul( ACC1i, alphar, gvl );
- C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
- C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
- C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
- C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
- __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
- __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
- ci += ldc;
- __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
- __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
-
- m_top += 8;
- }
-
-
- if( M & 4 ) {
- gvl = __riscv_vsetvl_e32m1(4);
-
- BLASLONG ai=m_top*K*2;
- BLASLONG bi=n_top*K*2;
- BLASLONG pass_K = K;
- #ifdef LEFT
- BLASLONG off = offset + m_top;
- #else
- BLASLONG off = -offset + n_top;
- #endif
- #ifdef BACKWARDS
- ai += off*4*2;
- bi += off*2*2;
- pass_K -= off;
- #else
- #ifdef LEFT
- pass_K = off + 4;
- #else
- pass_K = off + 2;
- #endif
- #endif
- float B0r = B[bi+0*2+0];
- float B0i = B[bi+0*2+1];
- float B1r = B[bi+1*2+0];
- float B1i = B[bi+1*2+1];
- bi += 2*2;
-
- vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
- vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
- ai += 4*2;
-
- // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k
- // leaving 26 vector registers for temporaries
- vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
- vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
- vfloat32m1_t tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
- vfloat32m1_t tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
- tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
- tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
- tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
- tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
- vfloat32m1_t ACC0r = tmp0r;
- vfloat32m1_t ACC0i = tmp0i;
- vfloat32m1_t ACC1r = tmp1r;
- vfloat32m1_t ACC1i = tmp1i;
-
- for(BLASLONG k=1; k<pass_K; k++) {
- B0r = B[bi+0*2+0];
- B0i = B[bi+0*2+1];
- B1r = B[bi+1*2+0];
- B1i = B[bi+1*2+1];
- bi += 2*2;
-
- A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
- A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
- ai += 4*2;
-
- tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
- tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
- tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
- tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
- tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
- tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
- tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
- tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
- ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
- ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
- ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
- ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
- }
-
-
- BLASLONG ci=n_top*ldc+m_top;
-
- vfloat32m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
- vfloat32m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
- vfloat32m1_t C1r = __riscv_vfmul( ACC1r, alphar, gvl );
- vfloat32m1_t C1i = __riscv_vfmul( ACC1i, alphar, gvl );
- C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
- C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
- C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
- C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
- __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
- __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
- ci += ldc;
- __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
- __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
-
- m_top += 4;
- }
-
-
- if( M & 2 ) {
- gvl = __riscv_vsetvl_e32m1(2);
-
- BLASLONG ai=m_top*K*2;
- BLASLONG bi=n_top*K*2;
- BLASLONG pass_K = K;
- #ifdef LEFT
- BLASLONG off = offset + m_top;
- #else
- BLASLONG off = -offset + n_top;
- #endif
- #ifdef BACKWARDS
- ai += off*2*2;
- bi += off*2*2;
- pass_K -= off;
- #else
- pass_K = off + 2; // LEFT and !LEFT agree here: both M and N unroll are 2
- #endif
- float B0r = B[bi+0*2+0];
- float B0i = B[bi+0*2+1];
- float B1r = B[bi+1*2+0];
- float B1i = B[bi+1*2+1];
- bi += 2*2;
-
- vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
- vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
- ai += 2*2;
-
- // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k
- // leaving 26 vector registers for temporaries
- vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
- vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
- vfloat32m1_t tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
- vfloat32m1_t tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
- tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
- tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
- tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
- tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
- vfloat32m1_t ACC0r = tmp0r;
- vfloat32m1_t ACC0i = tmp0i;
- vfloat32m1_t ACC1r = tmp1r;
- vfloat32m1_t ACC1i = tmp1i;
-
- for(BLASLONG k=1; k<pass_K; k++) {
- B0r = B[bi+0*2+0];
- B0i = B[bi+0*2+1];
- B1r = B[bi+1*2+0];
- B1i = B[bi+1*2+1];
- bi += 2*2;
-
- A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
- A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
- ai += 2*2;
-
- tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
- tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
- tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
- tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
- tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
- tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
- tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
- tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
- ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
- ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
- ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
- ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
- }
-
-
- BLASLONG ci=n_top*ldc+m_top;
-
- vfloat32m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
- vfloat32m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
- vfloat32m1_t C1r = __riscv_vfmul( ACC1r, alphar, gvl );
- vfloat32m1_t C1i = __riscv_vfmul( ACC1i, alphar, gvl );
- C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
- C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
- C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
- C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
- __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
- __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
- ci += ldc;
- __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
- __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
-
- m_top += 2;
- }
-
-
- if( M & 1 ) {
- float result0 = 0;
- float result1 = 0;
- float result2 = 0;
- float result3 = 0;
- BLASLONG ai=m_top*K*2;
- BLASLONG bi=n_top*K*2;
- BLASLONG pass_K = K;
- #ifdef LEFT
- BLASLONG off = offset + m_top;
- #else
- BLASLONG off = -offset + n_top;
- #endif
- #ifdef BACKWARDS
- ai += off*1*2;
- bi += off*2*2;
- pass_K -= off;
- #else
- #ifdef LEFT
- pass_K = off + 1;
- #else
- pass_K = off + 2;
- #endif
- #endif
-
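- // scalar complex multiply-accumulate for the last row; S0..S3 carry the
- // conjugation signs for the current transpose/conjugate variant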
- for(BLASLONG k=0; k<pass_K; k++) {
- result0+=S0*A[ai+0+0]*B[bi+0+0] + S1*A[ai+0+1]*B[bi+0+1];
- result1+=S2*A[ai+0+1]*B[bi+0+0] + S3*A[ai+0+0]*B[bi+0+1];
- result2+=S0*A[ai+0+0]*B[bi+2+0] + S1*A[ai+0+1]*B[bi+2+1];
- result3+=S2*A[ai+0+1]*B[bi+2+0] + S3*A[ai+0+0]*B[bi+2+1];
- ai+=1*2;
- bi+=2*2;
- }
-
- BLASLONG ci=n_top*ldc+m_top;
- float Cr, Ci;
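- // per column: Cr + i*Ci = (alphar + i*alphai) * (result_r + i*result_i)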
- Cr = result0*alphar;
- Ci = result1*alphar;
- Cr -= result1*alphai;
- Ci += result0*alphai;
- C[(ci+0*ldc+0)*2+0] = Cr;
- C[(ci+0*ldc+0)*2+1] = Ci;
- Cr = result2*alphar;
- Ci = result3*alphar;
- Cr -= result3*alphai;
- Ci += result2*alphai;
- C[(ci+1*ldc+0)*2+0] = Cr;
- C[(ci+1*ldc+0)*2+1] = Ci;
- m_top+=1;
- }
-
- n_top += 2;
- }
-
-
-
- // -- tails for N=1
-
- if( N & 1 ) {
- gvl = __riscv_vsetvl_e32m1(8);
- m_top = 0;
-
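- // same structure as the wider passes, but with a single B column and
- // one accumulator pair per M block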
- for (BLASLONG i=0; i<M/8; i+=1) {
- BLASLONG ai=m_top*K*2;
- BLASLONG bi=n_top*K*2;
- BLASLONG pass_K = K;
- #ifdef LEFT
- BLASLONG off = offset + m_top;
- #else
- BLASLONG off = -offset + n_top;
- #endif
- #ifdef BACKWARDS
- ai += off*8*2;
- bi += off*1*2;
- pass_K -= off;
- #else
- #ifdef LEFT
- pass_K = off + 8;
- #else
- pass_K = off + 1;
- #endif
- #endif
- float B0r = B[bi+0*2+0];
- float B0i = B[bi+0*2+1];
- bi += 1*2;
-
- vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
- vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
- ai += 8*2;
-
- // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k
- // leaving 28 vector registers for temporaries
- vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
- vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
- tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
- tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
- vfloat32m1_t ACC0r = tmp0r;
- vfloat32m1_t ACC0i = tmp0i;
-
- for(BLASLONG k=1; k<pass_K; k++) {
- B0r = B[bi+0*2+0];
- B0i = B[bi+0*2+1];
- bi += 1*2;
-
- A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
- A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
- ai += 8*2;
-
- tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
- tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
- tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
- tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
- ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
- ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
- }
-
-
- BLASLONG ci=n_top*ldc+m_top;
-
- vfloat32m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
- vfloat32m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
- C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
- C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
- __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
- __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
-
- m_top += 8;
- }
-
-
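- // progressively narrower M tails (4, 2, then a scalar 1) repeat the
- // same pattern at smaller gvl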
- if( M & 4 ) {
- gvl = __riscv_vsetvl_e32m1(4);
-
- BLASLONG ai=m_top*K*2;
- BLASLONG bi=n_top*K*2;
- BLASLONG pass_K = K;
- #ifdef LEFT
- BLASLONG off = offset + m_top;
- #else
- BLASLONG off = -offset + n_top;
- #endif
- #ifdef BACKWARDS
- ai += off*4*2;
- bi += off*1*2;
- pass_K -= off;
- #else
- #ifdef LEFT
- pass_K = off + 4;
- #else
- pass_K = off + 1;
- #endif
- #endif
- float B0r = B[bi+0*2+0];
- float B0i = B[bi+0*2+1];
- bi += 1*2;
-
- vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
- vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
- ai += 4*2;
-
- // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k
- // leaving 28 vector registers for temporaries
- vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
- vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
- tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
- tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
- vfloat32m1_t ACC0r = tmp0r;
- vfloat32m1_t ACC0i = tmp0i;
-
- for(BLASLONG k=1; k<pass_K; k++) {
- B0r = B[bi+0*2+0];
- B0i = B[bi+0*2+1];
- bi += 1*2;
-
- A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
- A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
- ai += 4*2;
-
- tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
- tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
- tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
- tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
- ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
- ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
- }
-
-
- BLASLONG ci=n_top*ldc+m_top;
-
- vfloat32m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
- vfloat32m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
- C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
- C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
- __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
- __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
-
- m_top += 4;
- }
-
-
- if( M & 2 ) {
- gvl = __riscv_vsetvl_e32m1(2);
-
- BLASLONG ai=m_top*K*2;
- BLASLONG bi=n_top*K*2;
- BLASLONG pass_K = K;
- #ifdef LEFT
- BLASLONG off = offset + m_top;
- #else
- BLASLONG off = -offset + n_top;
- #endif
- #ifdef BACKWARDS
- ai += off*2*2;
- bi += off*1*2;
- pass_K -= off;
- #else
- #ifdef LEFT
- pass_K = off + 2;
- #else
- pass_K = off + 1;
- #endif
- #endif
- float B0r = B[bi+0*2+0];
- float B0i = B[bi+0*2+1];
- bi += 1*2;
-
- vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
- vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
- ai += 2*2;
-
- // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k
- // leaving 28 vector registers for temporaries
- vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
- vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
- tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
- tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
- vfloat32m1_t ACC0r = tmp0r;
- vfloat32m1_t ACC0i = tmp0i;
-
- for(BLASLONG k=1; k<pass_K; k++) {
- B0r = B[bi+0*2+0];
- B0i = B[bi+0*2+1];
- bi += 1*2;
-
- A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
- A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
- ai += 2*2;
-
- tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
- tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
- tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
- tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
- ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
- ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
- }
-
-
- BLASLONG ci=n_top*ldc+m_top;
-
- vfloat32m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
- vfloat32m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
- C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
- C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
- __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
- __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
-
- m_top += 2;
- }
-
-
- if( M & 1 ) {
- float result0 = 0;
- float result1 = 0;
- BLASLONG ai=m_top*K*2;
- BLASLONG bi=n_top*K*2;
- BLASLONG pass_K = K;
- #ifdef LEFT
- BLASLONG off = offset + m_top;
- #else
- BLASLONG off = -offset + n_top;
- #endif
- #ifdef BACKWARDS
- ai += off*1*2;
- bi += off*1*2;
- pass_K -= off;
- #else
- pass_K = off + 1;
- #endif
-
- for(BLASLONG k=0; k<pass_K; k++) {
- result0+=S0*A[ai+0+0]*B[bi+0+0] + S1*A[ai+0+1]*B[bi+0+1];
- result1+=S2*A[ai+0+1]*B[bi+0+0] + S3*A[ai+0+0]*B[bi+0+1];
- ai+=1*2;
- bi+=1*2;
- }
-
- BLASLONG ci=n_top*ldc+m_top;
- float Cr, Ci;
- Cr = result0*alphar;
- Ci = result1*alphar;
- Cr -= result1*alphai;
- Ci += result0*alphai;
- C[(ci+0*ldc+0)*2+0] = Cr;
- C[(ci+0*ldc+0)*2+1] = Ci;
- m_top+=1;
- }
-
- n_top += 1;
- }
-
- return 0;
- }
|