|
- /*
-
- AUTOGENERATED KERNEL
- Settings:
- LMUL=1
- M=8
- M_tail_scalar_from=2
- N=8
- __riscv_='__riscv_'
- complex=False
- conjugate=False
- cpu='zvl256b'
- force_acc_double=False
- index_type='BLASLONG'
- op='trmm'
- param_precision='double'
- reg_width_bits=256
- tail_policy=''
- trace=False
-
- Derived:
- ELEN_ACC=64
- ELEN_PARAM=64
- LMUL_ACC=1
- VFMACC='__riscv_vfmacc_vf_f64m1'
- VFMUL='__riscv_vfmul_vf_f64m1'
- VLEV='__riscv_vle64_v_f64m1'
- VLSEV='__riscv_vlse64_v_f64m1'
- VMACC_TO_ACC='__riscv_vfmacc_vf_f64m1'
- VMUL_TO_ACC='__riscv_vfmul_vf_f64m1'
- VSETVL='__riscv_vsetvl_e64m1'
- VSEV='__riscv_vse64_v_f64m1'
- VSSEV='__riscv_vsse64_v_f64m1'
- acc_vector_t='vfloat64m1_t'
- output='dtrmm_kernel_8x8_zvl256b.c'
- param_scalar_t='double'
- param_vector_t='vfloat64m1_t'
-
- */
-
- #include "common.h"
-
-
-
- #if defined(LEFT) != defined(TRANSA) // true when exactly one of LEFT / TRANSA is defined
- #define BACKWARDS // when set, each tile skips its first `off` k steps instead of stopping early
- #endif
-
- int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc, BLASLONG offset)
- /*
-  * TRMM micro-kernel body (see the generator settings at the top of the file:
-  * op='trmm', double precision, 8x8 register tile).
-  *
-  * For every tile of C the kernel forms sum_k A(:,k)*B(k,:) over a k range
-  * chosen from `offset`, multiplies the sum by alpha and stores it. C is only
-  * written, never read, so any previous contents are overwritten.
-  *
-  * Parameters
-  *   M, N   - rows / columns of C to produce. N is consumed in panels of
-  *            8, 4, 2, 1 columns; inside each panel M is consumed in tiles
-  *            of 8, 4, 2, 1 rows.
-  *   K      - depth used to locate panels: the A panel for rows starting at
-  *            m_top begins at A[m_top*K], the B panel for columns starting at
-  *            n_top begins at B[n_top*K].
-  *   alpha  - scalar applied to every accumulated value before the store.
-  *   A      - read sequentially: one group of <tile rows> values per k step.
-  *   B      - read sequentially: one group of <tile columns> values per k step.
-  *            NOTE(review): this implies both inputs are already packed in
-  *            panel order by the caller - confirm against the packing code.
-  *   C      - output; element (row m, column n) is C[n*ldc + m].
-  *   ldc    - distance in elements between consecutive columns of C.
-  *   offset - shifts the k range of each tile. off = offset + m_top when LEFT
-  *            is defined, otherwise off = -offset + n_top. With BACKWARDS the
-  *            first `off` k steps are skipped (K - off steps remain);
-  *            without it the tile uses off + <tile rows> steps (LEFT) or
-  *            off + <tile columns> steps (not LEFT), starting at k = 0.
-  *
-  * Returns 0 in all cases.
-  *
-  * Assumptions visible in the code
-  *   - FLOAT is declared outside this file; the double locals and f64
-  *     intrinsics require it to be double.
-  *   - Vector tiles request a vector length of 4 and then step A by 4 or 8
-  *     elements per k, so they rely on the granted gvl being 4.
-  *   - Vector tiles always execute the k = 0 step before testing pass_K, so
-  *     they rely on pass_K >= 1. NOTE(review): confirm callers guarantee it.
-  */
- {
- BLASLONG gvl = 0; // vector length in elements; used by the N&4, N&2 and N&1 tails
- BLASLONG m_top = 0; // first row of the tile being computed
- BLASLONG n_top = 0; // first column of the panel being computed
-
-
- // -- MAIN PASS
-
- for (BLASLONG j=0; j<N/8; j+=1) { // one pass per full panel of 8 columns; n_top (not j) tracks the position
- m_top = 0;
- BLASLONG gvl = __riscv_vsetvl_e64m1(4); // NOTE: this declaration shadows the function-scope gvl inside the loop
-
- // full 8-row tiles; m_top (not i) tracks the position
- for (BLASLONG i=0; i<M/8; i+=1) {
- BLASLONG ai=m_top*K; // start of this tile's A panel
- BLASLONG bi=n_top*K; // start of this panel's B data
- BLASLONG pass_K = K;
- #ifdef LEFT
- BLASLONG off = offset + m_top;
- #else
- BLASLONG off = -offset + n_top;
- #endif
- #ifdef BACKWARDS
- ai += off*8; // skip the first `off` k steps: 8 A values per step
- bi += off*8; // and 8 B values per step
- pass_K -= off;
- #else
- #ifdef LEFT
- pass_K = off + 8;
- #else
- pass_K = off + 8;
- #endif
- #endif
- double B0 = B[bi+0];
- double B1 = B[bi+1];
- double B2 = B[bi+2];
- double B3 = B[bi+3];
- double B4 = B[bi+4];
- double B5 = B[bi+5];
- double B6 = B[bi+6];
- double B7 = B[bi+7];
- bi += 8;
- // rows 0-3 of the tile go in A0, rows 4-7 in A1
- vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
- vfloat64m1_t A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl );
- ai += 8;
- // k = 0 is peeled: accumulators start as a product; result[2*c+h] is column c, row half h
- vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl);
- vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A1, B0, gvl);
- vfloat64m1_t result2 = __riscv_vfmul_vf_f64m1( A0, B1, gvl);
- vfloat64m1_t result3 = __riscv_vfmul_vf_f64m1( A1, B1, gvl);
- vfloat64m1_t result4 = __riscv_vfmul_vf_f64m1( A0, B2, gvl);
- vfloat64m1_t result5 = __riscv_vfmul_vf_f64m1( A1, B2, gvl);
- vfloat64m1_t result6 = __riscv_vfmul_vf_f64m1( A0, B3, gvl);
- vfloat64m1_t result7 = __riscv_vfmul_vf_f64m1( A1, B3, gvl);
- vfloat64m1_t result8 = __riscv_vfmul_vf_f64m1( A0, B4, gvl);
- vfloat64m1_t result9 = __riscv_vfmul_vf_f64m1( A1, B4, gvl);
- vfloat64m1_t result10 = __riscv_vfmul_vf_f64m1( A0, B5, gvl);
- vfloat64m1_t result11 = __riscv_vfmul_vf_f64m1( A1, B5, gvl);
- vfloat64m1_t result12 = __riscv_vfmul_vf_f64m1( A0, B6, gvl);
- vfloat64m1_t result13 = __riscv_vfmul_vf_f64m1( A1, B6, gvl);
- vfloat64m1_t result14 = __riscv_vfmul_vf_f64m1( A0, B7, gvl);
- vfloat64m1_t result15 = __riscv_vfmul_vf_f64m1( A1, B7, gvl);
- // remaining k steps: fused multiply-add into the accumulators
- for(BLASLONG k=1; k<pass_K; k++) {
- B0 = B[bi+0];
- B1 = B[bi+1];
- B2 = B[bi+2];
- B3 = B[bi+3];
- B4 = B[bi+4];
- B5 = B[bi+5];
- B6 = B[bi+6];
- B7 = B[bi+7];
- bi += 8;
-
- A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
- A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl );
- ai += 8;
-
- result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl);
- result1 = __riscv_vfmacc_vf_f64m1( result1, B0, A1, gvl);
- result2 = __riscv_vfmacc_vf_f64m1( result2, B1, A0, gvl);
- result3 = __riscv_vfmacc_vf_f64m1( result3, B1, A1, gvl);
- result4 = __riscv_vfmacc_vf_f64m1( result4, B2, A0, gvl);
- result5 = __riscv_vfmacc_vf_f64m1( result5, B2, A1, gvl);
- result6 = __riscv_vfmacc_vf_f64m1( result6, B3, A0, gvl);
- result7 = __riscv_vfmacc_vf_f64m1( result7, B3, A1, gvl);
- result8 = __riscv_vfmacc_vf_f64m1( result8, B4, A0, gvl);
- result9 = __riscv_vfmacc_vf_f64m1( result9, B4, A1, gvl);
- result10 = __riscv_vfmacc_vf_f64m1( result10, B5, A0, gvl);
- result11 = __riscv_vfmacc_vf_f64m1( result11, B5, A1, gvl);
- result12 = __riscv_vfmacc_vf_f64m1( result12, B6, A0, gvl);
- result13 = __riscv_vfmacc_vf_f64m1( result13, B6, A1, gvl);
- result14 = __riscv_vfmacc_vf_f64m1( result14, B7, A0, gvl);
- result15 = __riscv_vfmacc_vf_f64m1( result15, B7, A1, gvl);
- }
-
-
- BLASLONG ci=n_top*ldc+m_top;
- // scale by alpha, then store column by column (two vectors per column); C is overwritten
- vfloat64m1_t c0 = __riscv_vfmul_vf_f64m1( result0, alpha, gvl );
- vfloat64m1_t c1 = __riscv_vfmul_vf_f64m1( result1, alpha, gvl );
- vfloat64m1_t c2 = __riscv_vfmul_vf_f64m1( result2, alpha, gvl );
- vfloat64m1_t c3 = __riscv_vfmul_vf_f64m1( result3, alpha, gvl );
- vfloat64m1_t c4 = __riscv_vfmul_vf_f64m1( result4, alpha, gvl );
- vfloat64m1_t c5 = __riscv_vfmul_vf_f64m1( result5, alpha, gvl );
- vfloat64m1_t c6 = __riscv_vfmul_vf_f64m1( result6, alpha, gvl );
- vfloat64m1_t c7 = __riscv_vfmul_vf_f64m1( result7, alpha, gvl );
- vfloat64m1_t c8 = __riscv_vfmul_vf_f64m1( result8, alpha, gvl );
- vfloat64m1_t c9 = __riscv_vfmul_vf_f64m1( result9, alpha, gvl );
- vfloat64m1_t c10 = __riscv_vfmul_vf_f64m1( result10, alpha, gvl );
- vfloat64m1_t c11 = __riscv_vfmul_vf_f64m1( result11, alpha, gvl );
- vfloat64m1_t c12 = __riscv_vfmul_vf_f64m1( result12, alpha, gvl );
- vfloat64m1_t c13 = __riscv_vfmul_vf_f64m1( result13, alpha, gvl );
- vfloat64m1_t c14 = __riscv_vfmul_vf_f64m1( result14, alpha, gvl );
- vfloat64m1_t c15 = __riscv_vfmul_vf_f64m1( result15, alpha, gvl );
- __riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += gvl; // move to the second row half of the same column
- __riscv_vse64_v_f64m1( &C[ci], c1, gvl); ci += ldc-gvl*1; // move to the top of the next column
- __riscv_vse64_v_f64m1( &C[ci], c2, gvl); ci += gvl;
- __riscv_vse64_v_f64m1( &C[ci], c3, gvl); ci += ldc-gvl*1;
- __riscv_vse64_v_f64m1( &C[ci], c4, gvl); ci += gvl;
- __riscv_vse64_v_f64m1( &C[ci], c5, gvl); ci += ldc-gvl*1;
- __riscv_vse64_v_f64m1( &C[ci], c6, gvl); ci += gvl;
- __riscv_vse64_v_f64m1( &C[ci], c7, gvl); ci += ldc-gvl*1;
- __riscv_vse64_v_f64m1( &C[ci], c8, gvl); ci += gvl;
- __riscv_vse64_v_f64m1( &C[ci], c9, gvl); ci += ldc-gvl*1;
- __riscv_vse64_v_f64m1( &C[ci], c10, gvl); ci += gvl;
- __riscv_vse64_v_f64m1( &C[ci], c11, gvl); ci += ldc-gvl*1;
- __riscv_vse64_v_f64m1( &C[ci], c12, gvl); ci += gvl;
- __riscv_vse64_v_f64m1( &C[ci], c13, gvl); ci += ldc-gvl*1;
- __riscv_vse64_v_f64m1( &C[ci], c14, gvl); ci += gvl;
- __riscv_vse64_v_f64m1( &C[ci], c15, gvl);
- m_top += 8;
- }
-
-
-
- // -- tails for main pass
- // leftover rows (M mod 8) of this 8-column panel: 4 with one vector, then 2 and 1 in scalar code
- if( M & 4 ) {
- gvl = __riscv_vsetvl_e64m1(4); // assigns the loop-local gvl declared at the top of this N loop
-
- BLASLONG ai=m_top*K;
- BLASLONG bi=n_top*K;
- BLASLONG pass_K = K;
- #ifdef LEFT
- BLASLONG off = offset + m_top;
- #else
- BLASLONG off = -offset + n_top;
- #endif
- #ifdef BACKWARDS
- ai += off*4;
- bi += off*8;
- pass_K -= off;
- #else
- #ifdef LEFT
- pass_K = off + 4;
- #else
- pass_K = off + 8;
- #endif
- #endif
- double B0 = B[bi+0];
- double B1 = B[bi+1];
- double B2 = B[bi+2];
- double B3 = B[bi+3];
- double B4 = B[bi+4];
- double B5 = B[bi+5];
- double B6 = B[bi+6];
- double B7 = B[bi+7];
- bi += 8;
- // a single vector holds all 4 rows; result[c] is column c
- vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
- ai += 4;
-
- vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl);
- vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A0, B1, gvl);
- vfloat64m1_t result2 = __riscv_vfmul_vf_f64m1( A0, B2, gvl);
- vfloat64m1_t result3 = __riscv_vfmul_vf_f64m1( A0, B3, gvl);
- vfloat64m1_t result4 = __riscv_vfmul_vf_f64m1( A0, B4, gvl);
- vfloat64m1_t result5 = __riscv_vfmul_vf_f64m1( A0, B5, gvl);
- vfloat64m1_t result6 = __riscv_vfmul_vf_f64m1( A0, B6, gvl);
- vfloat64m1_t result7 = __riscv_vfmul_vf_f64m1( A0, B7, gvl);
-
- for(BLASLONG k=1; k<pass_K; k++) {
- B0 = B[bi+0];
- B1 = B[bi+1];
- B2 = B[bi+2];
- B3 = B[bi+3];
- B4 = B[bi+4];
- B5 = B[bi+5];
- B6 = B[bi+6];
- B7 = B[bi+7];
- bi += 8;
-
- A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
- ai += 4;
-
- result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl);
- result1 = __riscv_vfmacc_vf_f64m1( result1, B1, A0, gvl);
- result2 = __riscv_vfmacc_vf_f64m1( result2, B2, A0, gvl);
- result3 = __riscv_vfmacc_vf_f64m1( result3, B3, A0, gvl);
- result4 = __riscv_vfmacc_vf_f64m1( result4, B4, A0, gvl);
- result5 = __riscv_vfmacc_vf_f64m1( result5, B5, A0, gvl);
- result6 = __riscv_vfmacc_vf_f64m1( result6, B6, A0, gvl);
- result7 = __riscv_vfmacc_vf_f64m1( result7, B7, A0, gvl);
- }
-
-
- BLASLONG ci=n_top*ldc+m_top;
- // one vector per column, so the step between stores (ldc-gvl*0) is simply ldc
- vfloat64m1_t c0 = __riscv_vfmul_vf_f64m1( result0, alpha, gvl );
- vfloat64m1_t c1 = __riscv_vfmul_vf_f64m1( result1, alpha, gvl );
- vfloat64m1_t c2 = __riscv_vfmul_vf_f64m1( result2, alpha, gvl );
- vfloat64m1_t c3 = __riscv_vfmul_vf_f64m1( result3, alpha, gvl );
- vfloat64m1_t c4 = __riscv_vfmul_vf_f64m1( result4, alpha, gvl );
- vfloat64m1_t c5 = __riscv_vfmul_vf_f64m1( result5, alpha, gvl );
- vfloat64m1_t c6 = __riscv_vfmul_vf_f64m1( result6, alpha, gvl );
- vfloat64m1_t c7 = __riscv_vfmul_vf_f64m1( result7, alpha, gvl );
- __riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += ldc-gvl*0;
- __riscv_vse64_v_f64m1( &C[ci], c1, gvl); ci += ldc-gvl*0;
- __riscv_vse64_v_f64m1( &C[ci], c2, gvl); ci += ldc-gvl*0;
- __riscv_vse64_v_f64m1( &C[ci], c3, gvl); ci += ldc-gvl*0;
- __riscv_vse64_v_f64m1( &C[ci], c4, gvl); ci += ldc-gvl*0;
- __riscv_vse64_v_f64m1( &C[ci], c5, gvl); ci += ldc-gvl*0;
- __riscv_vse64_v_f64m1( &C[ci], c6, gvl); ci += ldc-gvl*0;
- __riscv_vse64_v_f64m1( &C[ci], c7, gvl);
- m_top += 4;
- }
-
-
- if( M & 2 ) {
- double result0 = 0;
- double result1 = 0;
- double result2 = 0;
- double result3 = 0;
- double result4 = 0;
- double result5 = 0;
- double result6 = 0;
- double result7 = 0;
- double result8 = 0;
- double result9 = 0;
- double result10 = 0;
- double result11 = 0;
- double result12 = 0;
- double result13 = 0;
- double result14 = 0;
- double result15 = 0;
- BLASLONG ai=m_top*K;
- BLASLONG bi=n_top*K;
- BLASLONG pass_K = K;
- #ifdef LEFT
- BLASLONG off = offset + m_top;
- #else
- BLASLONG off = -offset + n_top;
- #endif
- #ifdef BACKWARDS
- ai += off*2;
- bi += off*8;
- pass_K -= off;
- #else
- #ifdef LEFT
- pass_K = off + 2;
- #else
- pass_K = off + 8;
- #endif
- #endif
- // scalar 2x8 tile: no peeling, the loop starts at k = 0; result[2*c+r] is column c, row r
- for(BLASLONG k=0; k<pass_K; k++) {
- result0+=A[ai+0]*B[bi+0];
- result1+=A[ai+1]*B[bi+0];
- result2+=A[ai+0]*B[bi+1];
- result3+=A[ai+1]*B[bi+1];
- result4+=A[ai+0]*B[bi+2];
- result5+=A[ai+1]*B[bi+2];
- result6+=A[ai+0]*B[bi+3];
- result7+=A[ai+1]*B[bi+3];
- result8+=A[ai+0]*B[bi+4];
- result9+=A[ai+1]*B[bi+4];
- result10+=A[ai+0]*B[bi+5];
- result11+=A[ai+1]*B[bi+5];
- result12+=A[ai+0]*B[bi+6];
- result13+=A[ai+1]*B[bi+6];
- result14+=A[ai+0]*B[bi+7];
- result15+=A[ai+1]*B[bi+7];
- ai+=2;
- bi+=8;
- }
-
- BLASLONG ci=n_top*ldc+m_top;
- C[ci+0*ldc+0] = alpha * result0;
- C[ci+0*ldc+1] = alpha * result1;
- C[ci+1*ldc+0] = alpha * result2;
- C[ci+1*ldc+1] = alpha * result3;
- C[ci+2*ldc+0] = alpha * result4;
- C[ci+2*ldc+1] = alpha * result5;
- C[ci+3*ldc+0] = alpha * result6;
- C[ci+3*ldc+1] = alpha * result7;
- C[ci+4*ldc+0] = alpha * result8;
- C[ci+4*ldc+1] = alpha * result9;
- C[ci+5*ldc+0] = alpha * result10;
- C[ci+5*ldc+1] = alpha * result11;
- C[ci+6*ldc+0] = alpha * result12;
- C[ci+6*ldc+1] = alpha * result13;
- C[ci+7*ldc+0] = alpha * result14;
- C[ci+7*ldc+1] = alpha * result15;
- m_top+=2;
- }
-
-
- if( M & 1 ) {
- double result0 = 0;
- double result1 = 0;
- double result2 = 0;
- double result3 = 0;
- double result4 = 0;
- double result5 = 0;
- double result6 = 0;
- double result7 = 0;
- BLASLONG ai=m_top*K;
- BLASLONG bi=n_top*K;
- BLASLONG pass_K = K;
- #ifdef LEFT
- BLASLONG off = offset + m_top;
- #else
- BLASLONG off = -offset + n_top;
- #endif
- #ifdef BACKWARDS
- ai += off*1;
- bi += off*8;
- pass_K -= off;
- #else
- #ifdef LEFT
- pass_K = off + 1;
- #else
- pass_K = off + 8;
- #endif
- #endif
- // scalar 1x8 tile; result[c] is column c
- for(BLASLONG k=0; k<pass_K; k++) {
- result0+=A[ai+0]*B[bi+0];
- result1+=A[ai+0]*B[bi+1];
- result2+=A[ai+0]*B[bi+2];
- result3+=A[ai+0]*B[bi+3];
- result4+=A[ai+0]*B[bi+4];
- result5+=A[ai+0]*B[bi+5];
- result6+=A[ai+0]*B[bi+6];
- result7+=A[ai+0]*B[bi+7];
- ai+=1;
- bi+=8;
- }
-
- BLASLONG ci=n_top*ldc+m_top;
- C[ci+0*ldc+0] = alpha * result0;
- C[ci+1*ldc+0] = alpha * result1;
- C[ci+2*ldc+0] = alpha * result2;
- C[ci+3*ldc+0] = alpha * result3;
- C[ci+4*ldc+0] = alpha * result4;
- C[ci+5*ldc+0] = alpha * result5;
- C[ci+6*ldc+0] = alpha * result6;
- C[ci+7*ldc+0] = alpha * result7;
- m_top+=1;
- }
-
- n_top += 8;
- }
-
-
-
- // -- tails for N=4
- // one panel of 4 columns: same tile structure as the main pass, with 4 B values per k step
- if( N & 4 ) {
- gvl = __riscv_vsetvl_e64m1(4); // outside the main loop, so this is the function-scope gvl
- m_top = 0;
-
- for (BLASLONG i=0; i<M/8; i+=1) {
- BLASLONG ai=m_top*K;
- BLASLONG bi=n_top*K;
- BLASLONG pass_K = K;
- #ifdef LEFT
- BLASLONG off = offset + m_top;
- #else
- BLASLONG off = -offset + n_top;
- #endif
- #ifdef BACKWARDS
- ai += off*8;
- bi += off*4;
- pass_K -= off;
- #else
- #ifdef LEFT
- pass_K = off + 8;
- #else
- pass_K = off + 4;
- #endif
- #endif
- double B0 = B[bi+0];
- double B1 = B[bi+1];
- double B2 = B[bi+2];
- double B3 = B[bi+3];
- bi += 4;
-
- vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
- vfloat64m1_t A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl );
- ai += 8;
- // 8x4 tile: result[2*c+h] is column c, row half h (k = 0 peeled as above)
- vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl);
- vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A1, B0, gvl);
- vfloat64m1_t result2 = __riscv_vfmul_vf_f64m1( A0, B1, gvl);
- vfloat64m1_t result3 = __riscv_vfmul_vf_f64m1( A1, B1, gvl);
- vfloat64m1_t result4 = __riscv_vfmul_vf_f64m1( A0, B2, gvl);
- vfloat64m1_t result5 = __riscv_vfmul_vf_f64m1( A1, B2, gvl);
- vfloat64m1_t result6 = __riscv_vfmul_vf_f64m1( A0, B3, gvl);
- vfloat64m1_t result7 = __riscv_vfmul_vf_f64m1( A1, B3, gvl);
-
- for(BLASLONG k=1; k<pass_K; k++) {
- B0 = B[bi+0];
- B1 = B[bi+1];
- B2 = B[bi+2];
- B3 = B[bi+3];
- bi += 4;
-
- A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
- A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl );
- ai += 8;
-
- result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl);
- result1 = __riscv_vfmacc_vf_f64m1( result1, B0, A1, gvl);
- result2 = __riscv_vfmacc_vf_f64m1( result2, B1, A0, gvl);
- result3 = __riscv_vfmacc_vf_f64m1( result3, B1, A1, gvl);
- result4 = __riscv_vfmacc_vf_f64m1( result4, B2, A0, gvl);
- result5 = __riscv_vfmacc_vf_f64m1( result5, B2, A1, gvl);
- result6 = __riscv_vfmacc_vf_f64m1( result6, B3, A0, gvl);
- result7 = __riscv_vfmacc_vf_f64m1( result7, B3, A1, gvl);
- }
-
-
- BLASLONG ci=n_top*ldc+m_top;
-
- vfloat64m1_t c0 = __riscv_vfmul_vf_f64m1( result0, alpha, gvl );
- vfloat64m1_t c1 = __riscv_vfmul_vf_f64m1( result1, alpha, gvl );
- vfloat64m1_t c2 = __riscv_vfmul_vf_f64m1( result2, alpha, gvl );
- vfloat64m1_t c3 = __riscv_vfmul_vf_f64m1( result3, alpha, gvl );
- vfloat64m1_t c4 = __riscv_vfmul_vf_f64m1( result4, alpha, gvl );
- vfloat64m1_t c5 = __riscv_vfmul_vf_f64m1( result5, alpha, gvl );
- vfloat64m1_t c6 = __riscv_vfmul_vf_f64m1( result6, alpha, gvl );
- vfloat64m1_t c7 = __riscv_vfmul_vf_f64m1( result7, alpha, gvl );
- __riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += gvl;
- __riscv_vse64_v_f64m1( &C[ci], c1, gvl); ci += ldc-gvl*1;
- __riscv_vse64_v_f64m1( &C[ci], c2, gvl); ci += gvl;
- __riscv_vse64_v_f64m1( &C[ci], c3, gvl); ci += ldc-gvl*1;
- __riscv_vse64_v_f64m1( &C[ci], c4, gvl); ci += gvl;
- __riscv_vse64_v_f64m1( &C[ci], c5, gvl); ci += ldc-gvl*1;
- __riscv_vse64_v_f64m1( &C[ci], c6, gvl); ci += gvl;
- __riscv_vse64_v_f64m1( &C[ci], c7, gvl);
- m_top += 8;
- }
-
-
- if( M & 4 ) {
- gvl = __riscv_vsetvl_e64m1(4);
-
- BLASLONG ai=m_top*K;
- BLASLONG bi=n_top*K;
- BLASLONG pass_K = K;
- #ifdef LEFT
- BLASLONG off = offset + m_top;
- #else
- BLASLONG off = -offset + n_top;
- #endif
- #ifdef BACKWARDS
- ai += off*4;
- bi += off*4;
- pass_K -= off;
- #else
- #ifdef LEFT
- pass_K = off + 4;
- #else
- pass_K = off + 4;
- #endif
- #endif
- double B0 = B[bi+0];
- double B1 = B[bi+1];
- double B2 = B[bi+2];
- double B3 = B[bi+3];
- bi += 4;
-
- vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
- ai += 4;
- // 4x4 tile: result[c] is column c
- vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl);
- vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A0, B1, gvl);
- vfloat64m1_t result2 = __riscv_vfmul_vf_f64m1( A0, B2, gvl);
- vfloat64m1_t result3 = __riscv_vfmul_vf_f64m1( A0, B3, gvl);
-
- for(BLASLONG k=1; k<pass_K; k++) {
- B0 = B[bi+0];
- B1 = B[bi+1];
- B2 = B[bi+2];
- B3 = B[bi+3];
- bi += 4;
-
- A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
- ai += 4;
-
- result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl);
- result1 = __riscv_vfmacc_vf_f64m1( result1, B1, A0, gvl);
- result2 = __riscv_vfmacc_vf_f64m1( result2, B2, A0, gvl);
- result3 = __riscv_vfmacc_vf_f64m1( result3, B3, A0, gvl);
- }
-
-
- BLASLONG ci=n_top*ldc+m_top;
-
- vfloat64m1_t c0 = __riscv_vfmul_vf_f64m1( result0, alpha, gvl );
- vfloat64m1_t c1 = __riscv_vfmul_vf_f64m1( result1, alpha, gvl );
- vfloat64m1_t c2 = __riscv_vfmul_vf_f64m1( result2, alpha, gvl );
- vfloat64m1_t c3 = __riscv_vfmul_vf_f64m1( result3, alpha, gvl );
- __riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += ldc-gvl*0;
- __riscv_vse64_v_f64m1( &C[ci], c1, gvl); ci += ldc-gvl*0;
- __riscv_vse64_v_f64m1( &C[ci], c2, gvl); ci += ldc-gvl*0;
- __riscv_vse64_v_f64m1( &C[ci], c3, gvl);
- m_top += 4;
- }
-
-
- if( M & 2 ) {
- double result0 = 0;
- double result1 = 0;
- double result2 = 0;
- double result3 = 0;
- double result4 = 0;
- double result5 = 0;
- double result6 = 0;
- double result7 = 0;
- BLASLONG ai=m_top*K;
- BLASLONG bi=n_top*K;
- BLASLONG pass_K = K;
- #ifdef LEFT
- BLASLONG off = offset + m_top;
- #else
- BLASLONG off = -offset + n_top;
- #endif
- #ifdef BACKWARDS
- ai += off*2;
- bi += off*4;
- pass_K -= off;
- #else
- #ifdef LEFT
- pass_K = off + 2;
- #else
- pass_K = off + 4;
- #endif
- #endif
- // scalar 2x4 tile; result[2*c+r] is column c, row r
- for(BLASLONG k=0; k<pass_K; k++) {
- result0+=A[ai+0]*B[bi+0];
- result1+=A[ai+1]*B[bi+0];
- result2+=A[ai+0]*B[bi+1];
- result3+=A[ai+1]*B[bi+1];
- result4+=A[ai+0]*B[bi+2];
- result5+=A[ai+1]*B[bi+2];
- result6+=A[ai+0]*B[bi+3];
- result7+=A[ai+1]*B[bi+3];
- ai+=2;
- bi+=4;
- }
-
- BLASLONG ci=n_top*ldc+m_top;
- C[ci+0*ldc+0] = alpha * result0;
- C[ci+0*ldc+1] = alpha * result1;
- C[ci+1*ldc+0] = alpha * result2;
- C[ci+1*ldc+1] = alpha * result3;
- C[ci+2*ldc+0] = alpha * result4;
- C[ci+2*ldc+1] = alpha * result5;
- C[ci+3*ldc+0] = alpha * result6;
- C[ci+3*ldc+1] = alpha * result7;
- m_top+=2;
- }
-
-
- if( M & 1 ) {
- double result0 = 0;
- double result1 = 0;
- double result2 = 0;
- double result3 = 0;
- BLASLONG ai=m_top*K;
- BLASLONG bi=n_top*K;
- BLASLONG pass_K = K;
- #ifdef LEFT
- BLASLONG off = offset + m_top;
- #else
- BLASLONG off = -offset + n_top;
- #endif
- #ifdef BACKWARDS
- ai += off*1;
- bi += off*4;
- pass_K -= off;
- #else
- #ifdef LEFT
- pass_K = off + 1;
- #else
- pass_K = off + 4;
- #endif
- #endif
- // scalar 1x4 tile; result[c] is column c
- for(BLASLONG k=0; k<pass_K; k++) {
- result0+=A[ai+0]*B[bi+0];
- result1+=A[ai+0]*B[bi+1];
- result2+=A[ai+0]*B[bi+2];
- result3+=A[ai+0]*B[bi+3];
- ai+=1;
- bi+=4;
- }
-
- BLASLONG ci=n_top*ldc+m_top;
- C[ci+0*ldc+0] = alpha * result0;
- C[ci+1*ldc+0] = alpha * result1;
- C[ci+2*ldc+0] = alpha * result2;
- C[ci+3*ldc+0] = alpha * result3;
- m_top+=1;
- }
-
- n_top += 4;
- }
-
-
-
- // -- tails for N=2
- // one panel of 2 columns: 2 B values per k step
- if( N & 2 ) {
- gvl = __riscv_vsetvl_e64m1(4);
- m_top = 0;
-
- for (BLASLONG i=0; i<M/8; i+=1) {
- BLASLONG ai=m_top*K;
- BLASLONG bi=n_top*K;
- BLASLONG pass_K = K;
- #ifdef LEFT
- BLASLONG off = offset + m_top;
- #else
- BLASLONG off = -offset + n_top;
- #endif
- #ifdef BACKWARDS
- ai += off*8;
- bi += off*2;
- pass_K -= off;
- #else
- #ifdef LEFT
- pass_K = off + 8;
- #else
- pass_K = off + 2;
- #endif
- #endif
- double B0 = B[bi+0];
- double B1 = B[bi+1];
- bi += 2;
-
- vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
- vfloat64m1_t A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl );
- ai += 8;
- // 8x2 tile: result[2*c+h] is column c, row half h
- vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl);
- vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A1, B0, gvl);
- vfloat64m1_t result2 = __riscv_vfmul_vf_f64m1( A0, B1, gvl);
- vfloat64m1_t result3 = __riscv_vfmul_vf_f64m1( A1, B1, gvl);
-
- for(BLASLONG k=1; k<pass_K; k++) {
- B0 = B[bi+0];
- B1 = B[bi+1];
- bi += 2;
-
- A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
- A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl );
- ai += 8;
-
- result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl);
- result1 = __riscv_vfmacc_vf_f64m1( result1, B0, A1, gvl);
- result2 = __riscv_vfmacc_vf_f64m1( result2, B1, A0, gvl);
- result3 = __riscv_vfmacc_vf_f64m1( result3, B1, A1, gvl);
- }
-
-
- BLASLONG ci=n_top*ldc+m_top;
-
- vfloat64m1_t c0 = __riscv_vfmul_vf_f64m1( result0, alpha, gvl );
- vfloat64m1_t c1 = __riscv_vfmul_vf_f64m1( result1, alpha, gvl );
- vfloat64m1_t c2 = __riscv_vfmul_vf_f64m1( result2, alpha, gvl );
- vfloat64m1_t c3 = __riscv_vfmul_vf_f64m1( result3, alpha, gvl );
- __riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += gvl;
- __riscv_vse64_v_f64m1( &C[ci], c1, gvl); ci += ldc-gvl*1;
- __riscv_vse64_v_f64m1( &C[ci], c2, gvl); ci += gvl;
- __riscv_vse64_v_f64m1( &C[ci], c3, gvl);
- m_top += 8;
- }
-
-
- if( M & 4 ) {
- gvl = __riscv_vsetvl_e64m1(4);
-
- BLASLONG ai=m_top*K;
- BLASLONG bi=n_top*K;
- BLASLONG pass_K = K;
- #ifdef LEFT
- BLASLONG off = offset + m_top;
- #else
- BLASLONG off = -offset + n_top;
- #endif
- #ifdef BACKWARDS
- ai += off*4;
- bi += off*2;
- pass_K -= off;
- #else
- #ifdef LEFT
- pass_K = off + 4;
- #else
- pass_K = off + 2;
- #endif
- #endif
- double B0 = B[bi+0];
- double B1 = B[bi+1];
- bi += 2;
-
- vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
- ai += 4;
- // 4x2 tile: result[c] is column c
- vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl);
- vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A0, B1, gvl);
-
- for(BLASLONG k=1; k<pass_K; k++) {
- B0 = B[bi+0];
- B1 = B[bi+1];
- bi += 2;
-
- A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
- ai += 4;
-
- result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl);
- result1 = __riscv_vfmacc_vf_f64m1( result1, B1, A0, gvl);
- }
-
-
- BLASLONG ci=n_top*ldc+m_top;
-
- vfloat64m1_t c0 = __riscv_vfmul_vf_f64m1( result0, alpha, gvl );
- vfloat64m1_t c1 = __riscv_vfmul_vf_f64m1( result1, alpha, gvl );
- __riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += ldc-gvl*0;
- __riscv_vse64_v_f64m1( &C[ci], c1, gvl);
- m_top += 4;
- }
-
-
- if( M & 2 ) {
- double result0 = 0;
- double result1 = 0;
- double result2 = 0;
- double result3 = 0;
- BLASLONG ai=m_top*K;
- BLASLONG bi=n_top*K;
- BLASLONG pass_K = K;
- #ifdef LEFT
- BLASLONG off = offset + m_top;
- #else
- BLASLONG off = -offset + n_top;
- #endif
- #ifdef BACKWARDS
- ai += off*2;
- bi += off*2;
- pass_K -= off;
- #else
- #ifdef LEFT
- pass_K = off + 2;
- #else
- pass_K = off + 2;
- #endif
- #endif
- // scalar 2x2 tile; result[2*c+r] is column c, row r
- for(BLASLONG k=0; k<pass_K; k++) {
- result0+=A[ai+0]*B[bi+0];
- result1+=A[ai+1]*B[bi+0];
- result2+=A[ai+0]*B[bi+1];
- result3+=A[ai+1]*B[bi+1];
- ai+=2;
- bi+=2;
- }
-
- BLASLONG ci=n_top*ldc+m_top;
- C[ci+0*ldc+0] = alpha * result0;
- C[ci+0*ldc+1] = alpha * result1;
- C[ci+1*ldc+0] = alpha * result2;
- C[ci+1*ldc+1] = alpha * result3;
- m_top+=2;
- }
-
-
- if( M & 1 ) {
- double result0 = 0;
- double result1 = 0;
- BLASLONG ai=m_top*K;
- BLASLONG bi=n_top*K;
- BLASLONG pass_K = K;
- #ifdef LEFT
- BLASLONG off = offset + m_top;
- #else
- BLASLONG off = -offset + n_top;
- #endif
- #ifdef BACKWARDS
- ai += off*1;
- bi += off*2;
- pass_K -= off;
- #else
- #ifdef LEFT
- pass_K = off + 1;
- #else
- pass_K = off + 2;
- #endif
- #endif
- // scalar 1x2 tile; result[c] is column c
- for(BLASLONG k=0; k<pass_K; k++) {
- result0+=A[ai+0]*B[bi+0];
- result1+=A[ai+0]*B[bi+1];
- ai+=1;
- bi+=2;
- }
-
- BLASLONG ci=n_top*ldc+m_top;
- C[ci+0*ldc+0] = alpha * result0;
- C[ci+1*ldc+0] = alpha * result1;
- m_top+=1;
- }
-
- n_top += 2;
- }
-
-
-
- // -- tails for N=1
- // the last single column: 1 B value per k step
- if( N & 1 ) {
- gvl = __riscv_vsetvl_e64m1(4);
- m_top = 0;
-
- for (BLASLONG i=0; i<M/8; i+=1) {
- BLASLONG ai=m_top*K;
- BLASLONG bi=n_top*K;
- BLASLONG pass_K = K;
- #ifdef LEFT
- BLASLONG off = offset + m_top;
- #else
- BLASLONG off = -offset + n_top;
- #endif
- #ifdef BACKWARDS
- ai += off*8;
- bi += off*1;
- pass_K -= off;
- #else
- #ifdef LEFT
- pass_K = off + 8;
- #else
- pass_K = off + 1;
- #endif
- #endif
- double B0 = B[bi+0];
- bi += 1;
-
- vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
- vfloat64m1_t A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl );
- ai += 8;
- // 8x1 tile: result0 is rows 0-3, result1 is rows 4-7
- vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl);
- vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A1, B0, gvl);
-
- for(BLASLONG k=1; k<pass_K; k++) {
- B0 = B[bi+0];
- bi += 1;
-
- A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
- A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl );
- ai += 8;
-
- result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl);
- result1 = __riscv_vfmacc_vf_f64m1( result1, B0, A1, gvl);
- }
-
-
- BLASLONG ci=n_top*ldc+m_top;
-
- vfloat64m1_t c0 = __riscv_vfmul_vf_f64m1( result0, alpha, gvl );
- vfloat64m1_t c1 = __riscv_vfmul_vf_f64m1( result1, alpha, gvl );
- __riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += gvl;
- __riscv_vse64_v_f64m1( &C[ci], c1, gvl);
- m_top += 8;
- }
-
-
- if( M & 4 ) {
- gvl = __riscv_vsetvl_e64m1(4);
-
- BLASLONG ai=m_top*K;
- BLASLONG bi=n_top*K;
- BLASLONG pass_K = K;
- #ifdef LEFT
- BLASLONG off = offset + m_top;
- #else
- BLASLONG off = -offset + n_top;
- #endif
- #ifdef BACKWARDS
- ai += off*4;
- bi += off*1;
- pass_K -= off;
- #else
- #ifdef LEFT
- pass_K = off + 4;
- #else
- pass_K = off + 1;
- #endif
- #endif
- double B0 = B[bi+0];
- bi += 1;
-
- vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
- ai += 4;
- // 4x1 tile held in a single accumulator
- vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl);
-
- for(BLASLONG k=1; k<pass_K; k++) {
- B0 = B[bi+0];
- bi += 1;
-
- A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
- ai += 4;
-
- result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl);
- }
-
-
- BLASLONG ci=n_top*ldc+m_top;
-
- vfloat64m1_t c0 = __riscv_vfmul_vf_f64m1( result0, alpha, gvl );
- __riscv_vse64_v_f64m1( &C[ci], c0, gvl);
- m_top += 4;
- }
-
-
- if( M & 2 ) {
- double result0 = 0;
- double result1 = 0;
- BLASLONG ai=m_top*K;
- BLASLONG bi=n_top*K;
- BLASLONG pass_K = K;
- #ifdef LEFT
- BLASLONG off = offset + m_top;
- #else
- BLASLONG off = -offset + n_top;
- #endif
- #ifdef BACKWARDS
- ai += off*2;
- bi += off*1;
- pass_K -= off;
- #else
- #ifdef LEFT
- pass_K = off + 2;
- #else
- pass_K = off + 1;
- #endif
- #endif
- // scalar 2x1 tile; result[r] is row r
- for(BLASLONG k=0; k<pass_K; k++) {
- result0+=A[ai+0]*B[bi+0];
- result1+=A[ai+1]*B[bi+0];
- ai+=2;
- bi+=1;
- }
-
- BLASLONG ci=n_top*ldc+m_top;
- C[ci+0*ldc+0] = alpha * result0;
- C[ci+0*ldc+1] = alpha * result1;
- m_top+=2;
- }
-
-
- if( M & 1 ) {
- double result0 = 0;
- BLASLONG ai=m_top*K;
- BLASLONG bi=n_top*K;
- BLASLONG pass_K = K;
- #ifdef LEFT
- BLASLONG off = offset + m_top;
- #else
- BLASLONG off = -offset + n_top;
- #endif
- #ifdef BACKWARDS
- ai += off*1;
- bi += off*1;
- pass_K -= off;
- #else
- #ifdef LEFT
- pass_K = off + 1;
- #else
- pass_K = off + 1;
- #endif
- #endif
- // scalar 1x1 tile: a plain dot product over the selected k range
- for(BLASLONG k=0; k<pass_K; k++) {
- result0+=A[ai+0]*B[bi+0];
- ai+=1;
- bi+=1;
- }
-
- BLASLONG ci=n_top*ldc+m_top;
- C[ci+0*ldc+0] = alpha * result0;
- m_top+=1;
- }
-
- n_top += 1;
- }
-
- return 0; // the only exit; no error condition is reported
- }
|