| @@ -29,29 +29,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include <math.h> | #include <math.h> | ||||
| #if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
| #define RVV_EFLOAT RVV_E32 | |||||
| #define RVV_M RVV_M8 | |||||
| #define FLOAT_V_T float32xm8_t | |||||
| #define VLEV_FLOAT vlev_float32xm8 | |||||
| #define VLSEV_FLOAT vlsev_float32xm8 | |||||
| #define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 | |||||
| #define MASK_T e32xm8_t | |||||
| #define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 | |||||
| #define VFMVVF_FLOAT vfmvvf_float32xm8 | |||||
| #define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 | |||||
| #define VFMAXVV_FLOAT vfmaxvv_float32xm8 | |||||
| #define VSETVL(n) vsetvl_e32m8(n) | |||||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||||
| #define FLOAT_V_T vfloat32m8_t | |||||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||||
| #define VLEV_FLOAT vle_v_f32m8 | |||||
| #define VLSEV_FLOAT vlse_v_f32m8 | |||||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 | |||||
| #define MASK_T vbool4_t | |||||
| #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 | |||||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m | |||||
| #define VFMAXVV_FLOAT vfmax_vv_f32m8 | |||||
| #else | #else | ||||
| #define RVV_EFLOAT RVV_E64 | |||||
| #define RVV_M RVV_M8 | |||||
| #define FLOAT_V_T float64xm8_t | |||||
| #define VLEV_FLOAT vlev_float64xm8 | |||||
| #define VLSEV_FLOAT vlsev_float64xm8 | |||||
| #define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 | |||||
| #define MASK_T e64xm8_t | |||||
| #define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 | |||||
| #define VFMVVF_FLOAT vfmvvf_float64xm8 | |||||
| #define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 | |||||
| #define VFMAXVV_FLOAT vfmaxvv_float64xm8 | |||||
| #define VSETVL(n) vsetvl_e64m8(n) | |||||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||||
| #define FLOAT_V_T vfloat64m8_t | |||||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||||
| #define VLEV_FLOAT vle_v_f64m8 | |||||
| #define VLSEV_FLOAT vlse_v_f64m8 | |||||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 | |||||
| #define MASK_T vbool8_t | |||||
| #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 | |||||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m | |||||
| #define VFMAXVV_FLOAT vfmax_vv_f64m8 | |||||
| #endif | #endif | ||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | ||||
| @@ -62,19 +66,25 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| if (n <= 0 || inc_x <= 0) return(maxf); | if (n <= 0 || inc_x <= 0) return(maxf); | ||||
| unsigned int gvl = 0; | unsigned int gvl = 0; | ||||
| FLOAT_V_T v0, v1, v_max; | FLOAT_V_T v0, v1, v_max; | ||||
| FLOAT_V_T_M1 v_res, v_zero; | |||||
| gvl = VSETVL_MAX; | |||||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||||
| v_zero = VFMVVF_FLOAT_M1(0, gvl); | |||||
| MASK_T mask0, mask1; | MASK_T mask0, mask1; | ||||
| FLOAT zero = 0.0; | FLOAT zero = 0.0; | ||||
| if(inc_x == 1){ | if(inc_x == 1){ | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| if(gvl <= n/2){ | if(gvl <= n/2){ | ||||
| v_max = VFMVVF_FLOAT(0, gvl); | v_max = VFMVVF_FLOAT(0, gvl); | ||||
| for(i=0,j=0; i<n/(gvl*2); i++){ | for(i=0,j=0; i<n/(gvl*2); i++){ | ||||
| v0 = VLEV_FLOAT(&x[j], gvl); | v0 = VLEV_FLOAT(&x[j], gvl); | ||||
| v1 = VLEV_FLOAT(&x[j+gvl], gvl); | |||||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | ||||
| //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl); | //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl); | ||||
| #if defined(DOUBLE) | #if defined(DOUBLE) | ||||
| asm volatile( | asm volatile( | ||||
| "vsetvli zero, zero, e8, m1\n\t" | |||||
| "vor.vv v0, %1, %1\n\t" | "vor.vv v0, %1, %1\n\t" | ||||
| "vsetvli x0, %3, e64,m8 \n\t" | "vsetvli x0, %3, e64,m8 \n\t" | ||||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | "vfrsub.vf %0, %0, %2, v0.t \n\t" | ||||
| @@ -83,6 +93,7 @@ asm volatile( | |||||
| :"v0"); | :"v0"); | ||||
| #else | #else | ||||
| asm volatile( | asm volatile( | ||||
| "vsetvli zero, zero, e8, m1\n\t" | |||||
| "vor.vv v0, %1, %1\n\t" | "vor.vv v0, %1, %1\n\t" | ||||
| "vsetvli x0, %3, e32,m8 \n\t" | "vsetvli x0, %3, e32,m8 \n\t" | ||||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | "vfrsub.vf %0, %0, %2, v0.t \n\t" | ||||
| @@ -98,6 +109,7 @@ asm volatile( | |||||
| //v1 = VFRSUBVF_MASK_FLOAT(v1, 0, mask1, gvl); | //v1 = VFRSUBVF_MASK_FLOAT(v1, 0, mask1, gvl); | ||||
| #if defined(DOUBLE) | #if defined(DOUBLE) | ||||
| asm volatile( | asm volatile( | ||||
| "vsetvli zero, zero, e8, m1\n\t" | |||||
| "vor.vv v0, %1, %1\n\t" | "vor.vv v0, %1, %1\n\t" | ||||
| "vsetvli x0, %3, e64,m8 \n\t" | "vsetvli x0, %3, e64,m8 \n\t" | ||||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | "vfrsub.vf %0, %0, %2, v0.t \n\t" | ||||
| @@ -106,6 +118,7 @@ asm volatile( | |||||
| :"v0"); | :"v0"); | ||||
| #else | #else | ||||
| asm volatile( | asm volatile( | ||||
| "vsetvli zero, zero, e8, m1\n\t" | |||||
| "vor.vv v0, %1, %1\n\t" | "vor.vv v0, %1, %1\n\t" | ||||
| "vsetvli x0, %3, e32,m8 \n\t" | "vsetvli x0, %3, e32,m8 \n\t" | ||||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | "vfrsub.vf %0, %0, %2, v0.t \n\t" | ||||
| @@ -117,17 +130,17 @@ asm volatile( | |||||
| v_max = VFMAXVV_FLOAT(v_max, v1, gvl); | v_max = VFMAXVV_FLOAT(v_max, v1, gvl); | ||||
| j += gvl*2; | j += gvl*2; | ||||
| } | } | ||||
| v0 = VFMVVF_FLOAT(0, gvl); | |||||
| v0 = VFREDMAXVS_FLOAT(v_max, v0, gvl); | |||||
| maxf = v0[0]; | |||||
| v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_zero, gvl); | |||||
| maxf = v_res[0]; | |||||
| } | } | ||||
| for(;j<n;){ | for(;j<n;){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| v0 = VLEV_FLOAT(&x[j], gvl); | v0 = VLEV_FLOAT(&x[j], gvl); | ||||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | ||||
| //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl); | //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl); | ||||
| #if defined(DOUBLE) | #if defined(DOUBLE) | ||||
| asm volatile( | asm volatile( | ||||
| "vsetvli zero, zero, e8, m1\n\t" | |||||
| "vor.vv v0, %1, %1\n\t" | "vor.vv v0, %1, %1\n\t" | ||||
| "vsetvli x0, %3, e64,m8 \n\t" | "vsetvli x0, %3, e64,m8 \n\t" | ||||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | "vfrsub.vf %0, %0, %2, v0.t \n\t" | ||||
| @@ -136,6 +149,7 @@ asm volatile( | |||||
| :"v0"); | :"v0"); | ||||
| #else | #else | ||||
| asm volatile( | asm volatile( | ||||
| "vsetvli zero, zero, e8, m1\n\t" | |||||
| "vor.vv v0, %1, %1\n\t" | "vor.vv v0, %1, %1\n\t" | ||||
| "vsetvli x0, %3, e32,m8 \n\t" | "vsetvli x0, %3, e32,m8 \n\t" | ||||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | "vfrsub.vf %0, %0, %2, v0.t \n\t" | ||||
| @@ -144,14 +158,13 @@ asm volatile( | |||||
| :"v0"); | :"v0"); | ||||
| #endif | #endif | ||||
| v1 = VFMVVF_FLOAT(0, gvl); | |||||
| v0 = VFREDMAXVS_FLOAT(v0, v1, gvl); | |||||
| if(v0[0] > maxf) | |||||
| maxf = v0[0]; | |||||
| v_res = VFREDMAXVS_FLOAT(v_res, v0, v_zero, gvl); | |||||
| if(v_res[0] > maxf) | |||||
| maxf = v_res[0]; | |||||
| j += gvl; | j += gvl; | ||||
| } | } | ||||
| }else{ | }else{ | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | BLASLONG stride_x = inc_x * sizeof(FLOAT); | ||||
| if(gvl <= n/2){ | if(gvl <= n/2){ | ||||
| BLASLONG inc_xv = inc_x * gvl; | BLASLONG inc_xv = inc_x * gvl; | ||||
| @@ -162,6 +175,7 @@ asm volatile( | |||||
| //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl); | //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl); | ||||
| #if defined(DOUBLE) | #if defined(DOUBLE) | ||||
| asm volatile( | asm volatile( | ||||
| "vsetvli zero, zero, e8, m1\n\t" | |||||
| "vor.vv v0, %1, %1\n\t" | "vor.vv v0, %1, %1\n\t" | ||||
| "vsetvli x0, %3, e64,m8 \n\t" | "vsetvli x0, %3, e64,m8 \n\t" | ||||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | "vfrsub.vf %0, %0, %2, v0.t \n\t" | ||||
| @@ -170,6 +184,7 @@ asm volatile( | |||||
| :"v0"); | :"v0"); | ||||
| #else | #else | ||||
| asm volatile( | asm volatile( | ||||
| "vsetvli zero, zero, e8, m1\n\t" | |||||
| "vor.vv v0, %1, %1\n\t" | "vor.vv v0, %1, %1\n\t" | ||||
| "vsetvli x0, %3, e32,m8 \n\t" | "vsetvli x0, %3, e32,m8 \n\t" | ||||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | "vfrsub.vf %0, %0, %2, v0.t \n\t" | ||||
| @@ -185,6 +200,7 @@ asm volatile( | |||||
| //v1 = VFRSUBVF_MASK_FLOAT(v1, 0, mask1, gvl); | //v1 = VFRSUBVF_MASK_FLOAT(v1, 0, mask1, gvl); | ||||
| #if defined(DOUBLE) | #if defined(DOUBLE) | ||||
| asm volatile( | asm volatile( | ||||
| "vsetvli zero, zero, e8, m1\n\t" | |||||
| "vor.vv v0, %1, %1\n\t" | "vor.vv v0, %1, %1\n\t" | ||||
| "vsetvli x0, %3, e64,m8 \n\t" | "vsetvli x0, %3, e64,m8 \n\t" | ||||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | "vfrsub.vf %0, %0, %2, v0.t \n\t" | ||||
| @@ -193,6 +209,7 @@ asm volatile( | |||||
| :"v0"); | :"v0"); | ||||
| #else | #else | ||||
| asm volatile( | asm volatile( | ||||
| "vsetvli zero, zero, e8, m1\n\t" | |||||
| "vor.vv v0, %1, %1\n\t" | "vor.vv v0, %1, %1\n\t" | ||||
| "vsetvli x0, %3, e32,m8 \n\t" | "vsetvli x0, %3, e32,m8 \n\t" | ||||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | "vfrsub.vf %0, %0, %2, v0.t \n\t" | ||||
| @@ -205,17 +222,17 @@ asm volatile( | |||||
| j += gvl*2; | j += gvl*2; | ||||
| ix += inc_xv*2; | ix += inc_xv*2; | ||||
| } | } | ||||
| v0 = VFMVVF_FLOAT(0, gvl); | |||||
| v0 = VFREDMAXVS_FLOAT(v_max, v0, gvl); | |||||
| maxf = v0[0]; | |||||
| v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_zero, gvl); | |||||
| maxf = v_res[0]; | |||||
| } | } | ||||
| for(;j<n;){ | for(;j<n;){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | ||||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | ||||
| //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl); | //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl); | ||||
| #if defined(DOUBLE) | #if defined(DOUBLE) | ||||
| asm volatile( | asm volatile( | ||||
| "vsetvli zero, zero, e8, m1\n\t" | |||||
| "vor.vv v0, %1, %1\n\t" | "vor.vv v0, %1, %1\n\t" | ||||
| "vsetvli x0, %3, e64,m8 \n\t" | "vsetvli x0, %3, e64,m8 \n\t" | ||||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | "vfrsub.vf %0, %0, %2, v0.t \n\t" | ||||
| @@ -224,6 +241,7 @@ asm volatile( | |||||
| :"v0"); | :"v0"); | ||||
| #else | #else | ||||
| asm volatile( | asm volatile( | ||||
| "vsetvli zero, zero, e8, m1\n\t" | |||||
| "vor.vv v0, %1, %1\n\t" | "vor.vv v0, %1, %1\n\t" | ||||
| "vsetvli x0, %3, e32,m8 \n\t" | "vsetvli x0, %3, e32,m8 \n\t" | ||||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | "vfrsub.vf %0, %0, %2, v0.t \n\t" | ||||
| @@ -232,10 +250,9 @@ asm volatile( | |||||
| :"v0"); | :"v0"); | ||||
| #endif | #endif | ||||
| v1 = VFMVVF_FLOAT(0, gvl); | |||||
| v0 = VFREDMAXVS_FLOAT(v0, v1, gvl); | |||||
| if(v0[0] > maxf) | |||||
| maxf = v0[0]; | |||||
| v_res = VFREDMAXVS_FLOAT(v_res, v0, v_zero, gvl); | |||||
| if(v_res[0] > maxf) | |||||
| maxf = v_res[0]; | |||||
| j += gvl; | j += gvl; | ||||
| } | } | ||||
| } | } | ||||
| @@ -30,29 +30,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include <float.h> | #include <float.h> | ||||
| #if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
| #define RVV_EFLOAT RVV_E32 | |||||
| #define RVV_M RVV_M8 | |||||
| #define FLOAT_V_T float32xm8_t | |||||
| #define VLEV_FLOAT vlev_float32xm8 | |||||
| #define VLSEV_FLOAT vlsev_float32xm8 | |||||
| #define VFREDMINVS_FLOAT vfredminvs_float32xm8 | |||||
| #define MASK_T e32xm8_t | |||||
| #define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 | |||||
| #define VFMVVF_FLOAT vfmvvf_float32xm8 | |||||
| #define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 | |||||
| #define VFMINVV_FLOAT vfminvv_float32xm8 | |||||
| #define VSETVL(n) vsetvl_e32m8(n) | |||||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||||
| #define FLOAT_V_T vfloat32m8_t | |||||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||||
| #define VLEV_FLOAT vle_v_f32m8 | |||||
| #define VLSEV_FLOAT vlse_v_f32m8 | |||||
| #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 | |||||
| #define MASK_T vbool4_t | |||||
| #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 | |||||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m | |||||
| #define VFMINVV_FLOAT vfmin_vv_f32m8 | |||||
| #else | #else | ||||
| #define RVV_EFLOAT RVV_E64 | |||||
| #define RVV_M RVV_M8 | |||||
| #define FLOAT_V_T float64xm8_t | |||||
| #define VLEV_FLOAT vlev_float64xm8 | |||||
| #define VLSEV_FLOAT vlsev_float64xm8 | |||||
| #define VFREDMINVS_FLOAT vfredminvs_float64xm8 | |||||
| #define MASK_T e64xm8_t | |||||
| #define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 | |||||
| #define VFMVVF_FLOAT vfmvvf_float64xm8 | |||||
| #define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 | |||||
| #define VFMINVV_FLOAT vfminvv_float64xm8 | |||||
| #define VSETVL(n) vsetvl_e64m8(n) | |||||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||||
| #define FLOAT_V_T vfloat64m8_t | |||||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||||
| #define VLEV_FLOAT vle_v_f64m8 | |||||
| #define VLSEV_FLOAT vlse_v_f64m8 | |||||
| #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 | |||||
| #define MASK_T vbool8_t | |||||
| #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 | |||||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m | |||||
| #define VFMINVV_FLOAT vfmin_vv_f64m8 | |||||
| #endif | #endif | ||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | ||||
| @@ -62,11 +66,15 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| FLOAT minf=FLT_MAX; | FLOAT minf=FLT_MAX; | ||||
| unsigned int gvl = 0; | unsigned int gvl = 0; | ||||
| FLOAT_V_T v0, v1, v_min; | FLOAT_V_T v0, v1, v_min; | ||||
| FLOAT_V_T_M1 v_res, v_max; | |||||
| gvl = VSETVL_MAX; | |||||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||||
| v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); | |||||
| MASK_T mask0, mask1; | MASK_T mask0, mask1; | ||||
| FLOAT zero = 0.0; | |||||
| FLOAT zero = 0.0; | |||||
| if(inc_x == 1){ | if(inc_x == 1){ | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| if(gvl <= n/2){ | if(gvl <= n/2){ | ||||
| v_min = VFMVVF_FLOAT(FLT_MAX, gvl); | v_min = VFMVVF_FLOAT(FLT_MAX, gvl); | ||||
| for(i=0,j=0; i<n/(gvl*2); i++){ | for(i=0,j=0; i<n/(gvl*2); i++){ | ||||
| @@ -75,6 +83,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl); | //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl); | ||||
| #if defined(DOUBLE) | #if defined(DOUBLE) | ||||
| asm volatile( | asm volatile( | ||||
| "vsetvli zero, zero, e8, m1\n\t" | |||||
| "vor.vv v0, %1, %1\n\t" | "vor.vv v0, %1, %1\n\t" | ||||
| "vsetvli x0, %3, e64,m8 \n\t" | "vsetvli x0, %3, e64,m8 \n\t" | ||||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | "vfrsub.vf %0, %0, %2, v0.t \n\t" | ||||
| @@ -83,6 +92,7 @@ asm volatile( | |||||
| :"v0"); | :"v0"); | ||||
| #else | #else | ||||
| asm volatile( | asm volatile( | ||||
| "vsetvli zero, zero, e8, m1\n\t" | |||||
| "vor.vv v0, %1, %1\n\t" | "vor.vv v0, %1, %1\n\t" | ||||
| "vsetvli x0, %3, e32,m8 \n\t" | "vsetvli x0, %3, e32,m8 \n\t" | ||||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | "vfrsub.vf %0, %0, %2, v0.t \n\t" | ||||
| @@ -97,6 +107,7 @@ asm volatile( | |||||
| //v1 = VFRSUBVF_MASK_FLOAT(v1, 0, mask1, gvl); | //v1 = VFRSUBVF_MASK_FLOAT(v1, 0, mask1, gvl); | ||||
| #if defined(DOUBLE) | #if defined(DOUBLE) | ||||
| asm volatile( | asm volatile( | ||||
| "vsetvli zero, zero, e8, m1\n\t" | |||||
| "vor.vv v0, %1, %1\n\t" | "vor.vv v0, %1, %1\n\t" | ||||
| "vsetvli x0, %3, e64,m8 \n\t" | "vsetvli x0, %3, e64,m8 \n\t" | ||||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | "vfrsub.vf %0, %0, %2, v0.t \n\t" | ||||
| @@ -105,6 +116,7 @@ asm volatile( | |||||
| :"v0"); | :"v0"); | ||||
| #else | #else | ||||
| asm volatile( | asm volatile( | ||||
| "vsetvli zero, zero, e8, m1\n\t" | |||||
| "vor.vv v0, %1, %1\n\t" | "vor.vv v0, %1, %1\n\t" | ||||
| "vsetvli x0, %3, e32,m8 \n\t" | "vsetvli x0, %3, e32,m8 \n\t" | ||||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | "vfrsub.vf %0, %0, %2, v0.t \n\t" | ||||
| @@ -116,17 +128,17 @@ asm volatile( | |||||
| v_min = VFMINVV_FLOAT(v_min, v1, gvl); | v_min = VFMINVV_FLOAT(v_min, v1, gvl); | ||||
| j += gvl*2; | j += gvl*2; | ||||
| } | } | ||||
| v1 = VFMVVF_FLOAT(FLT_MAX, gvl); | |||||
| v0 = VFREDMINVS_FLOAT(v_min, v1, gvl); | |||||
| minf = v0[0]; | |||||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||||
| minf = v_res[0]; | |||||
| } | } | ||||
| for(;j<n;){ | for(;j<n;){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| v0 = VLEV_FLOAT(&x[j], gvl); | v0 = VLEV_FLOAT(&x[j], gvl); | ||||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | ||||
| //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl); | //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl); | ||||
| #if defined(DOUBLE) | #if defined(DOUBLE) | ||||
| asm volatile( | asm volatile( | ||||
| "vsetvli zero, zero, e8, m1\n\t" | |||||
| "vor.vv v0, %1, %1\n\t" | "vor.vv v0, %1, %1\n\t" | ||||
| "vsetvli x0, %3, e64,m8 \n\t" | "vsetvli x0, %3, e64,m8 \n\t" | ||||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | "vfrsub.vf %0, %0, %2, v0.t \n\t" | ||||
| @@ -135,6 +147,7 @@ asm volatile( | |||||
| :"v0"); | :"v0"); | ||||
| #else | #else | ||||
| asm volatile( | asm volatile( | ||||
| "vsetvli zero, zero, e8, m1\n\t" | |||||
| "vor.vv v0, %1, %1\n\t" | "vor.vv v0, %1, %1\n\t" | ||||
| "vsetvli x0, %3, e32,m8 \n\t" | "vsetvli x0, %3, e32,m8 \n\t" | ||||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | "vfrsub.vf %0, %0, %2, v0.t \n\t" | ||||
| @@ -142,14 +155,13 @@ asm volatile( | |||||
| :"v"(mask0), "f"(zero), "r"(gvl) | :"v"(mask0), "f"(zero), "r"(gvl) | ||||
| :"v0"); | :"v0"); | ||||
| #endif | #endif | ||||
| v1 = VFMVVF_FLOAT(FLT_MAX, gvl); | |||||
| v0 = VFREDMINVS_FLOAT(v0, v1, gvl); | |||||
| if(v0[0] < minf) | |||||
| minf = v0[0]; | |||||
| v_res = VFREDMINVS_FLOAT(v_res, v0, v_max, gvl); | |||||
| if(v_res[0] < minf) | |||||
| minf = v_res[0]; | |||||
| j += gvl; | j += gvl; | ||||
| } | } | ||||
| }else{ | }else{ | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | BLASLONG stride_x = inc_x * sizeof(FLOAT); | ||||
| if(gvl <= n/2){ | if(gvl <= n/2){ | ||||
| BLASLONG idx = 0, inc_xv = inc_x * gvl; | BLASLONG idx = 0, inc_xv = inc_x * gvl; | ||||
| @@ -160,6 +172,7 @@ asm volatile( | |||||
| //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl); | //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl); | ||||
| #if defined(DOUBLE) | #if defined(DOUBLE) | ||||
| asm volatile( | asm volatile( | ||||
| "vsetvli zero, zero, e8, m1\n\t" | |||||
| "vor.vv v0, %1, %1\n\t" | "vor.vv v0, %1, %1\n\t" | ||||
| "vsetvli x0, %3, e64,m8 \n\t" | "vsetvli x0, %3, e64,m8 \n\t" | ||||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | "vfrsub.vf %0, %0, %2, v0.t \n\t" | ||||
| @@ -168,6 +181,7 @@ asm volatile( | |||||
| :"v0"); | :"v0"); | ||||
| #else | #else | ||||
| asm volatile( | asm volatile( | ||||
| "vsetvli zero, zero, e8, m1\n\t" | |||||
| "vor.vv v0, %1, %1\n\t" | "vor.vv v0, %1, %1\n\t" | ||||
| "vsetvli x0, %3, e32,m8 \n\t" | "vsetvli x0, %3, e32,m8 \n\t" | ||||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | "vfrsub.vf %0, %0, %2, v0.t \n\t" | ||||
| @@ -182,6 +196,7 @@ asm volatile( | |||||
| //v1 = VFRSUBVF_MASK_FLOAT(v1, 0, mask1, gvl); | //v1 = VFRSUBVF_MASK_FLOAT(v1, 0, mask1, gvl); | ||||
| #if defined(DOUBLE) | #if defined(DOUBLE) | ||||
| asm volatile( | asm volatile( | ||||
| "vsetvli zero, zero, e8, m1\n\t" | |||||
| "vor.vv v0, %1, %1\n\t" | "vor.vv v0, %1, %1\n\t" | ||||
| "vsetvli x0, %3, e64,m8 \n\t" | "vsetvli x0, %3, e64,m8 \n\t" | ||||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | "vfrsub.vf %0, %0, %2, v0.t \n\t" | ||||
| @@ -190,6 +205,7 @@ asm volatile( | |||||
| :"v0"); | :"v0"); | ||||
| #else | #else | ||||
| asm volatile( | asm volatile( | ||||
| "vsetvli zero, zero, e8, m1\n\t" | |||||
| "vor.vv v0, %1, %1\n\t" | "vor.vv v0, %1, %1\n\t" | ||||
| "vsetvli x0, %3, e32,m8 \n\t" | "vsetvli x0, %3, e32,m8 \n\t" | ||||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | "vfrsub.vf %0, %0, %2, v0.t \n\t" | ||||
| @@ -202,17 +218,17 @@ asm volatile( | |||||
| j += gvl*2; | j += gvl*2; | ||||
| idx += inc_xv*2; | idx += inc_xv*2; | ||||
| } | } | ||||
| v1 = VFMVVF_FLOAT(FLT_MAX, gvl); | |||||
| v0 = VFREDMINVS_FLOAT(v_min, v1, gvl); | |||||
| minf = v0[0]; | |||||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||||
| minf = v_res[0]; | |||||
| } | } | ||||
| for(;j<n;){ | for(;j<n;){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | ||||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | ||||
| //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl); | //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl); | ||||
| #if defined(DOUBLE) | #if defined(DOUBLE) | ||||
| asm volatile( | asm volatile( | ||||
| "vsetvli zero, zero, e8, m1\n\t" | |||||
| "vor.vv v0, %1, %1\n\t" | "vor.vv v0, %1, %1\n\t" | ||||
| "vsetvli x0, %3, e64,m8 \n\t" | "vsetvli x0, %3, e64,m8 \n\t" | ||||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | "vfrsub.vf %0, %0, %2, v0.t \n\t" | ||||
| @@ -221,6 +237,7 @@ asm volatile( | |||||
| :"v0"); | :"v0"); | ||||
| #else | #else | ||||
| asm volatile( | asm volatile( | ||||
| "vsetvli zero, zero, e8, m1\n\t" | |||||
| "vor.vv v0, %1, %1\n\t" | "vor.vv v0, %1, %1\n\t" | ||||
| "vsetvli x0, %3, e32,m8 \n\t" | "vsetvli x0, %3, e32,m8 \n\t" | ||||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | "vfrsub.vf %0, %0, %2, v0.t \n\t" | ||||
| @@ -228,10 +245,9 @@ asm volatile( | |||||
| :"v"(mask0), "f"(zero), "r"(gvl) | :"v"(mask0), "f"(zero), "r"(gvl) | ||||
| :"v0"); | :"v0"); | ||||
| #endif | #endif | ||||
| v1 = VFMVVF_FLOAT(FLT_MAX, gvl); | |||||
| v0 = VFREDMINVS_FLOAT(v0, v1, gvl); | |||||
| if(v0[0] < minf) | |||||
| minf = v0[0]; | |||||
| v_res = VFREDMINVS_FLOAT(v_res, v0, v_max, gvl); | |||||
| if(v_res[0] < minf) | |||||
| minf = v_res[0]; | |||||
| j += gvl; | j += gvl; | ||||
| } | } | ||||
| } | } | ||||
| @@ -29,29 +29,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include <math.h> | #include <math.h> | ||||
| #if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
| #define RVV_EFLOAT RVV_E32 | |||||
| #define RVV_M RVV_M8 | |||||
| #define FLOAT_V_T float32xm8_t | |||||
| #define VLEV_FLOAT vlev_float32xm8 | |||||
| #define VLSEV_FLOAT vlsev_float32xm8 | |||||
| #define VFREDSUMVS_FLOAT vfredsumvs_float32xm8 | |||||
| #define MASK_T e32xm8_t | |||||
| #define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 | |||||
| #define VFMVVF_FLOAT vfmvvf_float32xm8 | |||||
| #define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 | |||||
| #define VFADDVV_FLOAT vfaddvv_float32xm8 | |||||
| #define VSETVL(n) vsetvl_e32m8(n) | |||||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||||
| #define FLOAT_V_T vfloat32m8_t | |||||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||||
| #define VLEV_FLOAT vle_v_f32m8 | |||||
| #define VLSEV_FLOAT vlse_v_f32m8 | |||||
| #define VFREDSUMVS_FLOAT vfredsum_vs_f32m8_f32m1 | |||||
| #define MASK_T vbool4_t | |||||
| #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 | |||||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m | |||||
| #define VFADDVV_FLOAT vfadd_vv_f32m8 | |||||
| #else | #else | ||||
| #define RVV_EFLOAT RVV_E64 | |||||
| #define RVV_M RVV_M8 | |||||
| #define FLOAT_V_T float64xm8_t | |||||
| #define VLEV_FLOAT vlev_float64xm8 | |||||
| #define VLSEV_FLOAT vlsev_float64xm8 | |||||
| #define VFREDSUMVS_FLOAT vfredsumvs_float64xm8 | |||||
| #define MASK_T e64xm8_t | |||||
| #define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 | |||||
| #define VFMVVF_FLOAT vfmvvf_float64xm8 | |||||
| #define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 | |||||
| #define VFADDVV_FLOAT vfaddvv_float64xm8 | |||||
| #define VSETVL(n) vsetvl_e64m8(n) | |||||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||||
| #define FLOAT_V_T vfloat64m8_t | |||||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||||
| #define VLEV_FLOAT vle_v_f64m8 | |||||
| #define VLSEV_FLOAT vlse_v_f64m8 | |||||
| #define VFREDSUMVS_FLOAT vfredsum_vs_f64m8_f64m1 | |||||
| #define MASK_T vbool8_t | |||||
| #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 | |||||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m | |||||
| #define VFADDVV_FLOAT vfadd_vv_f64m8 | |||||
| #endif | #endif | ||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | ||||
| { | { | ||||
| @@ -61,39 +65,43 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| if (n <= 0 || inc_x <= 0) return(asumf); | if (n <= 0 || inc_x <= 0) return(asumf); | ||||
| unsigned int gvl = 0; | unsigned int gvl = 0; | ||||
| FLOAT_V_T v0, v1, v_zero,v_sum; | FLOAT_V_T v0, v1, v_zero,v_sum; | ||||
| FLOAT_V_T_M1 v_res, v_z0; | |||||
| gvl = VSETVL_MAX; | |||||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||||
| v_z0 = VFMVVF_FLOAT_M1(0, gvl); | |||||
| MASK_T mask0, mask1; | MASK_T mask0, mask1; | ||||
| if(inc_x == 1){ | if(inc_x == 1){ | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| v_zero = VFMVVF_FLOAT(0, gvl); | v_zero = VFMVVF_FLOAT(0, gvl); | ||||
| if(gvl <= n/2){ | if(gvl <= n/2){ | ||||
| v_sum = VFMVVF_FLOAT(0, gvl); | v_sum = VFMVVF_FLOAT(0, gvl); | ||||
| for(i=0,j=0; i<n/(gvl*2); i++){ | for(i=0,j=0; i<n/(gvl*2); i++){ | ||||
| v0 = VLEV_FLOAT(&x[j], gvl); | v0 = VLEV_FLOAT(&x[j], gvl); | ||||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | ||||
| v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask0, gvl); | |||||
| v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl); | |||||
| v_sum = VFADDVV_FLOAT(v_sum, v0, gvl); | v_sum = VFADDVV_FLOAT(v_sum, v0, gvl); | ||||
| v1 = VLEV_FLOAT(&x[j+gvl], gvl); | v1 = VLEV_FLOAT(&x[j+gvl], gvl); | ||||
| mask1 = VMFLTVF_FLOAT(v1, 0, gvl); | mask1 = VMFLTVF_FLOAT(v1, 0, gvl); | ||||
| v1 = VFRSUBVF_MASK_FLOAT(v1, v1, 0, mask1, gvl); | |||||
| v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl); | |||||
| v_sum = VFADDVV_FLOAT(v_sum, v1, gvl); | v_sum = VFADDVV_FLOAT(v_sum, v1, gvl); | ||||
| j += gvl * 2; | j += gvl * 2; | ||||
| } | } | ||||
| v0 = VFREDSUMVS_FLOAT(v_sum, v_zero, gvl); | |||||
| asumf += v0[0]; | |||||
| v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, gvl); | |||||
| asumf += v_res[0]; | |||||
| } | } | ||||
| for(;j<n;){ | for(;j<n;){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| v0 = VLEV_FLOAT(&x[j], gvl); | v0 = VLEV_FLOAT(&x[j], gvl); | ||||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | ||||
| v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask0, gvl); | |||||
| v0 = VFREDSUMVS_FLOAT(v0, v_zero, gvl); | |||||
| asumf += v0[0]; | |||||
| v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl); | |||||
| v_res = VFREDSUMVS_FLOAT(v_res, v0, v_z0, gvl); | |||||
| asumf += v_res[0]; | |||||
| j += gvl; | j += gvl; | ||||
| } | } | ||||
| }else{ | }else{ | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| unsigned int stride_x = inc_x * sizeof(FLOAT); | unsigned int stride_x = inc_x * sizeof(FLOAT); | ||||
| v_zero = VFMVVF_FLOAT(0, gvl); | v_zero = VFMVVF_FLOAT(0, gvl); | ||||
| if(gvl <= n/2){ | if(gvl <= n/2){ | ||||
| @@ -102,26 +110,26 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| for(i=0,j=0; i<n/(gvl*2); i++){ | for(i=0,j=0; i<n/(gvl*2); i++){ | ||||
| v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | ||||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | ||||
| v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask0, gvl); | |||||
| v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl); | |||||
| v_sum = VFADDVV_FLOAT(v_sum, v0, gvl); | v_sum = VFADDVV_FLOAT(v_sum, v0, gvl); | ||||
| v1 = VLSEV_FLOAT(&x[ix+inc_xv], stride_x, gvl); | v1 = VLSEV_FLOAT(&x[ix+inc_xv], stride_x, gvl); | ||||
| mask1 = VMFLTVF_FLOAT(v1, 0, gvl); | mask1 = VMFLTVF_FLOAT(v1, 0, gvl); | ||||
| v1 = VFRSUBVF_MASK_FLOAT(v1, v1, 0, mask1, gvl); | |||||
| v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl); | |||||
| v_sum = VFADDVV_FLOAT(v_sum, v1, gvl); | v_sum = VFADDVV_FLOAT(v_sum, v1, gvl); | ||||
| j += gvl * 2; | j += gvl * 2; | ||||
| inc_xv += inc_xv * 2; | ix += inc_xv * 2; | ||||
| } | } | ||||
| v0 = VFREDSUMVS_FLOAT(v_sum, v_zero, gvl); | |||||
| asumf += v0[0]; | |||||
| v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, gvl); | |||||
| asumf += v_res[0]; | |||||
| } | } | ||||
| for(;j<n;){ | for(;j<n;){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | ||||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | ||||
| v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask0, gvl); | |||||
| v0 = VFREDSUMVS_FLOAT(v0, v_zero, gvl); | |||||
| asumf += v0[0]; | |||||
| v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl); | |||||
| v_res = VFREDSUMVS_FLOAT(v_res, v0, v_z0, gvl); | |||||
| asumf += v_res[0]; | |||||
| j += gvl; | j += gvl; | ||||
| } | } | ||||
| } | } | ||||
| @@ -28,27 +28,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
| #define RVV_EFLOAT RVV_E32 | |||||
| #define RVV_M RVV_M4 | |||||
| #define FLOAT_V_T float32xm4_t | |||||
| #define VLEV_FLOAT vlev_float32xm4 | |||||
| #define VLSEV_FLOAT vlsev_float32xm4 | |||||
| #define VSEV_FLOAT vsev_float32xm4 | |||||
| #define VSSEV_FLOAT vssev_float32xm4 | |||||
| #define VFMACCVF_FLOAT vfmaccvf_float32xm4 | |||||
| #define VFMVVF_FLOAT vfmvvf_float32xm4 | |||||
| #define VFMULVF_FLOAT vfmulvf_float32xm4 | |||||
| #define VSETVL(n) vsetvl_e32m4(n) | |||||
| #define FLOAT_V_T vfloat32m4_t | |||||
| #define VLEV_FLOAT vle_v_f32m4 | |||||
| #define VLSEV_FLOAT vlse_v_f32m4 | |||||
| #define VSEV_FLOAT vse_v_f32m4 | |||||
| #define VSSEV_FLOAT vsse_v_f32m4 | |||||
| #define VFMACCVF_FLOAT vfmacc_vf_f32m4 | |||||
| #define VFMVVF_FLOAT vfmv_v_f_f32m4 | |||||
| #define VFMULVF_FLOAT vfmul_vf_f32m4 | |||||
| #else | #else | ||||
| #define RVV_EFLOAT RVV_E64 | |||||
| #define RVV_M RVV_M4 | |||||
| #define FLOAT_V_T float64xm4_t | |||||
| #define VLEV_FLOAT vlev_float64xm4 | |||||
| #define VLSEV_FLOAT vlsev_float64xm4 | |||||
| #define VSEV_FLOAT vsev_float64xm4 | |||||
| #define VSSEV_FLOAT vssev_float64xm4 | |||||
| #define VFMACCVF_FLOAT vfmaccvf_float64xm4 | |||||
| #define VFMVVF_FLOAT vfmvvf_float64xm4 | |||||
| #define VFMULVF_FLOAT vfmulvf_float64xm4 | |||||
| #define VSETVL(n) vsetvl_e64m4(n) | |||||
| #define FLOAT_V_T vfloat64m4_t | |||||
| #define VLEV_FLOAT vle_v_f64m4 | |||||
| #define VLSEV_FLOAT vlse_v_f64m4 | |||||
| #define VSEV_FLOAT vse_v_f64m4 | |||||
| #define VSSEV_FLOAT vsse_v_f64m4 | |||||
| #define VFMACCVF_FLOAT vfmacc_vf_f64m4 | |||||
| #define VFMVVF_FLOAT vfmv_v_f_f64m4 | |||||
| #define VFMULVF_FLOAT vfmul_vf_f64m4 | |||||
| #endif | #endif | ||||
| int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y) | int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y) | ||||
| @@ -65,7 +63,7 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * | |||||
| if(beta == 0.0){ | if(beta == 0.0){ | ||||
| if(alpha == 0.0){//alpha == 0 && beta == 0 | if(alpha == 0.0){//alpha == 0 && beta == 0 | ||||
| if(inc_y == 1){ | if(inc_y == 1){ | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| if(gvl <= n/2){ | if(gvl <= n/2){ | ||||
| vy0 = VFMVVF_FLOAT(0.0, gvl); | vy0 = VFMVVF_FLOAT(0.0, gvl); | ||||
| for(i=0,j=0;i<n/(gvl*2);i++){ | for(i=0,j=0;i<n/(gvl*2);i++){ | ||||
| @@ -75,13 +73,13 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * | |||||
| } | } | ||||
| } | } | ||||
| for(;j<n;){ | for(;j<n;){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| vy0 = VFMVVF_FLOAT(0.0, gvl); | vy0 = VFMVVF_FLOAT(0.0, gvl); | ||||
| VSEV_FLOAT(&y[j], vy0, gvl); | VSEV_FLOAT(&y[j], vy0, gvl); | ||||
| j += gvl; | j += gvl; | ||||
| } | } | ||||
| }else{ | }else{ | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| stride_y = inc_y * sizeof(FLOAT); | stride_y = inc_y * sizeof(FLOAT); | ||||
| if(gvl <= n/2){ | if(gvl <= n/2){ | ||||
| vy0 = VFMVVF_FLOAT(0.0, gvl); | vy0 = VFMVVF_FLOAT(0.0, gvl); | ||||
| @@ -94,7 +92,7 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * | |||||
| } | } | ||||
| } | } | ||||
| for(;j<n;){ | for(;j<n;){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| vy0 = VFMVVF_FLOAT(0.0, gvl); | vy0 = VFMVVF_FLOAT(0.0, gvl); | ||||
| VSSEV_FLOAT(&y[j*inc_y], stride_y, vy0, gvl); | VSSEV_FLOAT(&y[j*inc_y], stride_y, vy0, gvl); | ||||
| j += gvl; | j += gvl; | ||||
| @@ -103,7 +101,7 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * | |||||
| }else{//alpha != 0 && beta == 0, y = ax | }else{//alpha != 0 && beta == 0, y = ax | ||||
| if(inc_x == 1 && inc_y == 1){ | if(inc_x == 1 && inc_y == 1){ | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| if(gvl <= n/2){ | if(gvl <= n/2){ | ||||
| for(i=0,j=0;i<n/(2*gvl);i++){ | for(i=0,j=0;i<n/(2*gvl);i++){ | ||||
| vx0 = VLEV_FLOAT(&x[j], gvl); | vx0 = VLEV_FLOAT(&x[j], gvl); | ||||
| @@ -117,14 +115,14 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * | |||||
| } | } | ||||
| } | } | ||||
| for(;j<n;){ | for(;j<n;){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| vx0 = VLEV_FLOAT(&x[j], gvl); | vx0 = VLEV_FLOAT(&x[j], gvl); | ||||
| vy0 = VFMULVF_FLOAT(vx0, alpha, gvl); | vy0 = VFMULVF_FLOAT(vx0, alpha, gvl); | ||||
| VSEV_FLOAT(&y[j], vy0, gvl); | VSEV_FLOAT(&y[j], vy0, gvl); | ||||
| j += gvl; | j += gvl; | ||||
| } | } | ||||
| }else if(inc_y == 1){ | }else if(inc_y == 1){ | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| stride_x = inc_x * sizeof(FLOAT); | stride_x = inc_x * sizeof(FLOAT); | ||||
| if(gvl <= n/2){ | if(gvl <= n/2){ | ||||
| BLASLONG inc_xv = inc_x * gvl; | BLASLONG inc_xv = inc_x * gvl; | ||||
| @@ -141,14 +139,14 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * | |||||
| } | } | ||||
| } | } | ||||
| for(;j<n;){ | for(;j<n;){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| vx0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | vx0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | ||||
| vy0 = VFMULVF_FLOAT(vx0, alpha, gvl); | vy0 = VFMULVF_FLOAT(vx0, alpha, gvl); | ||||
| VSEV_FLOAT(&y[j], vy0, gvl); | VSEV_FLOAT(&y[j], vy0, gvl); | ||||
| j += gvl; | j += gvl; | ||||
| } | } | ||||
| }else if(inc_x == 1){ | }else if(inc_x == 1){ | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| stride_y = inc_y * sizeof(FLOAT); | stride_y = inc_y * sizeof(FLOAT); | ||||
| if(gvl <= n/2){ | if(gvl <= n/2){ | ||||
| BLASLONG inc_yv = inc_y * gvl; | BLASLONG inc_yv = inc_y * gvl; | ||||
| @@ -165,14 +163,14 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * | |||||
| } | } | ||||
| } | } | ||||
| for(;j<n;){ | for(;j<n;){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| vx0 = VLEV_FLOAT(&x[j], gvl); | vx0 = VLEV_FLOAT(&x[j], gvl); | ||||
| vy0 = VFMULVF_FLOAT(vx0, alpha, gvl); | vy0 = VFMULVF_FLOAT(vx0, alpha, gvl); | ||||
| VSSEV_FLOAT(&y[j*inc_y], stride_y, vy0, gvl); | VSSEV_FLOAT(&y[j*inc_y], stride_y, vy0, gvl); | ||||
| j += gvl; | j += gvl; | ||||
| } | } | ||||
| }else{//inc_x !=1 && inc_y != 1 | }else{//inc_x !=1 && inc_y != 1 | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| stride_x = inc_x * sizeof(FLOAT); | stride_x = inc_x * sizeof(FLOAT); | ||||
| stride_y = inc_y * sizeof(FLOAT); | stride_y = inc_y * sizeof(FLOAT); | ||||
| if(gvl <= n/2){ | if(gvl <= n/2){ | ||||
| @@ -192,7 +190,7 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * | |||||
| } | } | ||||
| } | } | ||||
| for(;j<n;){ | for(;j<n;){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| vx0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | vx0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | ||||
| vy0 = VFMULVF_FLOAT(vx0, alpha, gvl); | vy0 = VFMULVF_FLOAT(vx0, alpha, gvl); | ||||
| VSSEV_FLOAT(&y[j*inc_y], stride_y, vy0, gvl); | VSSEV_FLOAT(&y[j*inc_y], stride_y, vy0, gvl); | ||||
| @@ -203,7 +201,7 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * | |||||
| }else{//beta != 0 | }else{//beta != 0 | ||||
| if(alpha == 0.0){//alpha == 0 && beta != 0; y = by | if(alpha == 0.0){//alpha == 0 && beta != 0; y = by | ||||
| if(inc_y == 1){ | if(inc_y == 1){ | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| if(gvl <= n/2){ | if(gvl <= n/2){ | ||||
| for(i=0,j=0;i<n/(2*gvl);i++){ | for(i=0,j=0;i<n/(2*gvl);i++){ | ||||
| vy0 = VLEV_FLOAT(&y[j], gvl); | vy0 = VLEV_FLOAT(&y[j], gvl); | ||||
| @@ -217,14 +215,14 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * | |||||
| } | } | ||||
| } | } | ||||
| for(;j<n;){ | for(;j<n;){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| vy0 = VLEV_FLOAT(&y[j], gvl); | vy0 = VLEV_FLOAT(&y[j], gvl); | ||||
| vy0 = VFMULVF_FLOAT(vy0, beta, gvl); | vy0 = VFMULVF_FLOAT(vy0, beta, gvl); | ||||
| VSEV_FLOAT(&y[j], vy0, gvl); | VSEV_FLOAT(&y[j], vy0, gvl); | ||||
| j += gvl; | j += gvl; | ||||
| } | } | ||||
| }else{ | }else{ | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| stride_y = inc_y * sizeof(FLOAT); | stride_y = inc_y * sizeof(FLOAT); | ||||
| if(gvl <= n/2){ | if(gvl <= n/2){ | ||||
| BLASLONG inc_yv = inc_y * gvl; | BLASLONG inc_yv = inc_y * gvl; | ||||
| @@ -241,7 +239,7 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * | |||||
| } | } | ||||
| } | } | ||||
| for(;j<n;){ | for(;j<n;){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| vy0 = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl); | vy0 = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl); | ||||
| vy0 = VFMULVF_FLOAT(vy0, beta, gvl); | vy0 = VFMULVF_FLOAT(vy0, beta, gvl); | ||||
| VSSEV_FLOAT(&y[j*inc_y], stride_y, vy0, gvl); | VSSEV_FLOAT(&y[j*inc_y], stride_y, vy0, gvl); | ||||
| @@ -251,7 +249,7 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * | |||||
| }else{//alpha != 0 && beta != 0; y = ax + by | }else{//alpha != 0 && beta != 0; y = ax + by | ||||
| if(inc_x == 1 && inc_y == 1){ | if(inc_x == 1 && inc_y == 1){ | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| if(gvl <= n/2){ | if(gvl <= n/2){ | ||||
| for(i=0,j=0;i<n/(2*gvl);i++){ | for(i=0,j=0;i<n/(2*gvl);i++){ | ||||
| vx0 = VLEV_FLOAT(&x[j], gvl); | vx0 = VLEV_FLOAT(&x[j], gvl); | ||||
| @@ -269,7 +267,7 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * | |||||
| } | } | ||||
| } | } | ||||
| for(;j<n;){ | for(;j<n;){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| vx0 = VLEV_FLOAT(&x[j], gvl); | vx0 = VLEV_FLOAT(&x[j], gvl); | ||||
| vx0 = VFMULVF_FLOAT(vx0, alpha, gvl); | vx0 = VFMULVF_FLOAT(vx0, alpha, gvl); | ||||
| vy0 = VLEV_FLOAT(&y[j], gvl); | vy0 = VLEV_FLOAT(&y[j], gvl); | ||||
| @@ -278,7 +276,7 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * | |||||
| j += gvl; | j += gvl; | ||||
| } | } | ||||
| }else if(inc_y == 1){ | }else if(inc_y == 1){ | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| stride_x = inc_x * sizeof(FLOAT); | stride_x = inc_x * sizeof(FLOAT); | ||||
| if(gvl <= n/2){ | if(gvl <= n/2){ | ||||
| BLASLONG inc_xv = inc_x * gvl; | BLASLONG inc_xv = inc_x * gvl; | ||||
| @@ -299,7 +297,7 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * | |||||
| } | } | ||||
| } | } | ||||
| for(;j<n;){ | for(;j<n;){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| vx0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | vx0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | ||||
| vx0 = VFMULVF_FLOAT(vx0, alpha, gvl); | vx0 = VFMULVF_FLOAT(vx0, alpha, gvl); | ||||
| vy0 = VLEV_FLOAT(&y[j], gvl); | vy0 = VLEV_FLOAT(&y[j], gvl); | ||||
| @@ -308,7 +306,7 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * | |||||
| j += gvl; | j += gvl; | ||||
| } | } | ||||
| }else if(inc_x == 1){ | }else if(inc_x == 1){ | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| stride_y = inc_y * sizeof(FLOAT); | stride_y = inc_y * sizeof(FLOAT); | ||||
| if(gvl <= n/2){ | if(gvl <= n/2){ | ||||
| BLASLONG inc_yv = inc_y * gvl; | BLASLONG inc_yv = inc_y * gvl; | ||||
| @@ -329,7 +327,7 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * | |||||
| } | } | ||||
| } | } | ||||
| for(;j<n;){ | for(;j<n;){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| vx0 = VLEV_FLOAT(&x[j], gvl); | vx0 = VLEV_FLOAT(&x[j], gvl); | ||||
| vx0 = VFMULVF_FLOAT(vx0, alpha, gvl); | vx0 = VFMULVF_FLOAT(vx0, alpha, gvl); | ||||
| vy0 = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl); | vy0 = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl); | ||||
| @@ -338,7 +336,7 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * | |||||
| j += gvl; | j += gvl; | ||||
| } | } | ||||
| }else{//inc_x != 1 && inc_y != 1 | }else{//inc_x != 1 && inc_y != 1 | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| stride_x = inc_x * sizeof(FLOAT); | stride_x = inc_x * sizeof(FLOAT); | ||||
| stride_y = inc_y * sizeof(FLOAT); | stride_y = inc_y * sizeof(FLOAT); | ||||
| if(gvl <= n/2){ | if(gvl <= n/2){ | ||||
| @@ -362,7 +360,7 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * | |||||
| } | } | ||||
| } | } | ||||
| for(;j<n;){ | for(;j<n;){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| vx0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | vx0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | ||||
| vx0 = VFMULVF_FLOAT(vx0, alpha, gvl); | vx0 = VFMULVF_FLOAT(vx0, alpha, gvl); | ||||
| vy0 = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl); | vy0 = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl); | ||||
| @@ -28,23 +28,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
| #define RVV_EFLOAT RVV_E32 | |||||
| #define RVV_M RVV_M4 | |||||
| #define FLOAT_V_T float32xm4_t | |||||
| #define VLEV_FLOAT vlev_float32xm4 | |||||
| #define VLSEV_FLOAT vlsev_float32xm4 | |||||
| #define VSEV_FLOAT vsev_float32xm4 | |||||
| #define VSSEV_FLOAT vssev_float32xm4 | |||||
| #define VFMACCVF_FLOAT vfmaccvf_float32xm4 | |||||
| #define VSETVL(n) vsetvl_e32m4(n) | |||||
| #define FLOAT_V_T vfloat32m4_t | |||||
| #define VLEV_FLOAT vle_v_f32m4 | |||||
| #define VLSEV_FLOAT vlse_v_f32m4 | |||||
| #define VSEV_FLOAT vse_v_f32m4 | |||||
| #define VSSEV_FLOAT vsse_v_f32m4 | |||||
| #define VFMACCVF_FLOAT vfmacc_vf_f32m4 | |||||
| #else | #else | ||||
| #define RVV_EFLOAT RVV_E64 | |||||
| #define RVV_M RVV_M4 | |||||
| #define FLOAT_V_T float64xm4_t | |||||
| #define VLEV_FLOAT vlev_float64xm4 | |||||
| #define VLSEV_FLOAT vlsev_float64xm4 | |||||
| #define VSEV_FLOAT vsev_float64xm4 | |||||
| #define VSSEV_FLOAT vssev_float64xm4 | |||||
| #define VFMACCVF_FLOAT vfmaccvf_float64xm4 | |||||
| #define VSETVL(n) vsetvl_e64m4(n) | |||||
| #define FLOAT_V_T vfloat64m4_t | |||||
| #define VLEV_FLOAT vle_v_f64m4 | |||||
| #define VLSEV_FLOAT vlse_v_f64m4 | |||||
| #define VSEV_FLOAT vse_v_f64m4 | |||||
| #define VSSEV_FLOAT vsse_v_f64m4 | |||||
| #define VFMACCVF_FLOAT vfmacc_vf_f64m4 | |||||
| #endif | #endif | ||||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | ||||
| @@ -60,7 +58,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||||
| if (inc_x == 1 && inc_y == 1) { | if (inc_x == 1 && inc_y == 1) { | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| if (gvl <= n/2) { | if (gvl <= n/2) { | ||||
| for (i = 0, j=0; i < n/(2*gvl); i++, j+=2*gvl) { | for (i = 0, j=0; i < n/(2*gvl); i++, j+=2*gvl) { | ||||
| @@ -77,7 +75,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||||
| } | } | ||||
| //tail | //tail | ||||
| for (; j < n; ) { | for (; j < n; ) { | ||||
| gvl = vsetvli(n - j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n - j); | |||||
| vx0 = VLEV_FLOAT(&x[j], gvl); | vx0 = VLEV_FLOAT(&x[j], gvl); | ||||
| vy0 = VLEV_FLOAT(&y[j], gvl); | vy0 = VLEV_FLOAT(&y[j], gvl); | ||||
| vy0 = VFMACCVF_FLOAT(vy0, da, vx0, gvl); | vy0 = VFMACCVF_FLOAT(vy0, da, vx0, gvl); | ||||
| @@ -87,7 +85,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||||
| } | } | ||||
| }else if (inc_y == 1) { | }else if (inc_y == 1) { | ||||
| stride_x = inc_x * sizeof(FLOAT); | stride_x = inc_x * sizeof(FLOAT); | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| if(gvl <= n/2){ | if(gvl <= n/2){ | ||||
| BLASLONG inc_xv = inc_x * gvl; | BLASLONG inc_xv = inc_x * gvl; | ||||
| for(i=0,j=0; i<n/(2*gvl); i++){ | for(i=0,j=0; i<n/(2*gvl); i++){ | ||||
| @@ -106,7 +104,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||||
| } | } | ||||
| } | } | ||||
| for (; j<n; ) { | for (; j<n; ) { | ||||
| gvl = vsetvli(n - j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n - j); | |||||
| vx0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | vx0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | ||||
| vy0 = VLEV_FLOAT(&y[j], gvl); | vy0 = VLEV_FLOAT(&y[j], gvl); | ||||
| vy0 = VFMACCVF_FLOAT(vy0, da, vx0, gvl); | vy0 = VFMACCVF_FLOAT(vy0, da, vx0, gvl); | ||||
| @@ -115,7 +113,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||||
| } | } | ||||
| }else if(inc_x == 1){ | }else if(inc_x == 1){ | ||||
| stride_y = inc_y * sizeof(FLOAT); | stride_y = inc_y * sizeof(FLOAT); | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| if(gvl <= n/2){ | if(gvl <= n/2){ | ||||
| BLASLONG inc_yv = inc_y * gvl; | BLASLONG inc_yv = inc_y * gvl; | ||||
| for(i=0,j=0; i<n/(2*gvl); i++){ | for(i=0,j=0; i<n/(2*gvl); i++){ | ||||
| @@ -134,7 +132,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||||
| } | } | ||||
| } | } | ||||
| for (; j<n; ) { | for (; j<n; ) { | ||||
| gvl = vsetvli(n - j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n - j); | |||||
| vx0 = VLEV_FLOAT(&x[j], gvl); | vx0 = VLEV_FLOAT(&x[j], gvl); | ||||
| vy0 = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl); | vy0 = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl); | ||||
| vy0 = VFMACCVF_FLOAT(vy0, da, vx0, gvl); | vy0 = VFMACCVF_FLOAT(vy0, da, vx0, gvl); | ||||
| @@ -144,7 +142,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||||
| }else{ | }else{ | ||||
| stride_x = inc_x * sizeof(FLOAT); | stride_x = inc_x * sizeof(FLOAT); | ||||
| stride_y = inc_y * sizeof(FLOAT); | stride_y = inc_y * sizeof(FLOAT); | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| if(gvl <= n/2){ | if(gvl <= n/2){ | ||||
| BLASLONG inc_xv = inc_x * gvl; | BLASLONG inc_xv = inc_x * gvl; | ||||
| BLASLONG inc_yv = inc_y * gvl; | BLASLONG inc_yv = inc_y * gvl; | ||||
| @@ -165,7 +163,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||||
| } | } | ||||
| } | } | ||||
| for (; j<n; ) { | for (; j<n; ) { | ||||
| gvl = vsetvli(n - j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n - j); | |||||
| vx0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | vx0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | ||||
| vy0 = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl); | vy0 = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl); | ||||
| vy0 = VFMACCVF_FLOAT(vy0, da, vx0, gvl); | vy0 = VFMACCVF_FLOAT(vy0, da, vx0, gvl); | ||||
| @@ -26,21 +26,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | *****************************************************************************/ | ||||
| #include "common.h" | #include "common.h" | ||||
| #if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
| #define RVV_EFLOAT RVV_E32 | |||||
| #define RVV_M RVV_M8 | |||||
| #define FLOAT_V_T float32xm8_t | |||||
| #define VLEV_FLOAT vlev_float32xm8 | |||||
| #define VLSEV_FLOAT vlsev_float32xm8 | |||||
| #define VSEV_FLOAT vsev_float32xm8 | |||||
| #define VSSEV_FLOAT vssev_float32xm8 | |||||
| #define VSETVL(n) vsetvl_e32m8(n) | |||||
| #define FLOAT_V_T vfloat32m8_t | |||||
| #define VLEV_FLOAT vle_v_f32m8 | |||||
| #define VLSEV_FLOAT vlse_v_f32m8 | |||||
| #define VSEV_FLOAT vse_v_f32m8 | |||||
| #define VSSEV_FLOAT vsse_v_f32m8 | |||||
| #else | #else | ||||
| #define RVV_EFLOAT RVV_E64 | |||||
| #define RVV_M RVV_M8 | |||||
| #define FLOAT_V_T float64xm8_t | |||||
| #define VLEV_FLOAT vlev_float64xm8 | |||||
| #define VLSEV_FLOAT vlsev_float64xm8 | |||||
| #define VSEV_FLOAT vsev_float64xm8 | |||||
| #define VSSEV_FLOAT vssev_float64xm8 | |||||
| #define VSETVL(n) vsetvl_e32m8(n) | |||||
| #define FLOAT_V_T vfloat64m8_t | |||||
| #define VLEV_FLOAT vle_v_f64m8 | |||||
| #define VLSEV_FLOAT vlse_v_f64m8 | |||||
| #define VSEV_FLOAT vse_v_f64m8 | |||||
| #define VSSEV_FLOAT vsse_v_f64m8 | |||||
| #endif | #endif | ||||
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | ||||
| @@ -56,7 +54,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
| if(inc_x == 1 && inc_y == 1){ | if(inc_x == 1 && inc_y == 1){ | ||||
| memcpy(&y[0], &x[0], n*sizeof(FLOAT)); | memcpy(&y[0], &x[0], n*sizeof(FLOAT)); | ||||
| }else if (inc_y == 1){ | }else if (inc_y == 1){ | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| stride_x = inc_x * sizeof(FLOAT); | stride_x = inc_x * sizeof(FLOAT); | ||||
| if(gvl <= n/4){ | if(gvl <= n/4){ | ||||
| BLASLONG inc_xv = inc_x * gvl; | BLASLONG inc_xv = inc_x * gvl; | ||||
| @@ -77,13 +75,13 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
| } | } | ||||
| } | } | ||||
| for(;j<n;){ | for(;j<n;){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | ||||
| VSEV_FLOAT(&y[j], v0, gvl); | VSEV_FLOAT(&y[j], v0, gvl); | ||||
| j += gvl; | j += gvl; | ||||
| } | } | ||||
| }else if(inc_x == 1){ | }else if(inc_x == 1){ | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| stride_y = inc_y * sizeof(FLOAT); | stride_y = inc_y * sizeof(FLOAT); | ||||
| if(gvl <= n/4){ | if(gvl <= n/4){ | ||||
| BLASLONG inc_yv = inc_y * gvl; | BLASLONG inc_yv = inc_y * gvl; | ||||
| @@ -104,14 +102,14 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
| } | } | ||||
| } | } | ||||
| for(;j<n;){ | for(;j<n;){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| v0 = VLEV_FLOAT(&x[j], gvl); | v0 = VLEV_FLOAT(&x[j], gvl); | ||||
| VSSEV_FLOAT(&y[j*inc_y], stride_y, v0, gvl); | VSSEV_FLOAT(&y[j*inc_y], stride_y, v0, gvl); | ||||
| j += gvl; | j += gvl; | ||||
| } | } | ||||
| }else{ | }else{ | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| stride_x = inc_x * sizeof(FLOAT); | stride_x = inc_x * sizeof(FLOAT); | ||||
| stride_y = inc_y * sizeof(FLOAT); | stride_y = inc_y * sizeof(FLOAT); | ||||
| if(gvl <= n/4){ | if(gvl <= n/4){ | ||||
| @@ -136,7 +134,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
| } | } | ||||
| } | } | ||||
| for(;j<n;){ | for(;j<n;){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | ||||
| VSSEV_FLOAT(&y[j*inc_y], stride_y, v0, gvl); | VSSEV_FLOAT(&y[j*inc_y], stride_y, v0, gvl); | ||||
| j += gvl; | j += gvl; | ||||
| @@ -27,25 +27,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
| #define RVV_EFLOAT RVV_E32 | |||||
| #define RVV_M RVV_M4 | |||||
| #define FLOAT_V_T float32xm4_t | |||||
| #define VLEV_FLOAT vlev_float32xm4 | |||||
| #define VLSEV_FLOAT vlsev_float32xm4 | |||||
| #define VFREDSUM_FLOAT vfredsumvs_float32xm4 | |||||
| #define VFMACCVV_FLOAT vfmaccvv_float32xm4 | |||||
| #define VFMVVF_FLOAT vfmvvf_float32xm4 | |||||
| #define VFDOTVV_FLOAT vfdotvv_float32xm4 | |||||
| #define VSETVL(n) vsetvl_e32m4(n) | |||||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||||
| #define FLOAT_V_T vfloat32m4_t | |||||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||||
| #define VLEV_FLOAT vle_v_f32m4 | |||||
| #define VLSEV_FLOAT vlse_v_f32m4 | |||||
| #define VFREDSUM_FLOAT vfredsum_vs_f32m4_f32m1 | |||||
| #define VFMACCVV_FLOAT vfmacc_vv_f32m4 | |||||
| #define VFMVVF_FLOAT vfmv_v_f_f32m4 | |||||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||||
| #define VFDOTVV_FLOAT vfdot_vv_f32m4 | |||||
| #else | #else | ||||
| #define RVV_EFLOAT RVV_E64 | |||||
| #define RVV_M RVV_M4 | |||||
| #define FLOAT_V_T float64xm4_t | |||||
| #define VLEV_FLOAT vlev_float64xm4 | |||||
| #define VLSEV_FLOAT vlsev_float64xm4 | |||||
| #define VFREDSUM_FLOAT vfredsumvs_float64xm4 | |||||
| #define VFMACCVV_FLOAT vfmaccvv_float64xm4 | |||||
| #define VFMVVF_FLOAT vfmvvf_float64xm4 | |||||
| #define VFDOTVV_FLOAT vfdotvv_float64xm4 | |||||
| #define VSETVL(n) vsetvl_e64m4(n) | |||||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||||
| #define FLOAT_V_T vfloat64m4_t | |||||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||||
| #define VLEV_FLOAT vle_v_f64m4 | |||||
| #define VLSEV_FLOAT vlse_v_f64m4 | |||||
| #define VFREDSUM_FLOAT vfredsum_vs_f64m4_f64m1 | |||||
| #define VFMACCVV_FLOAT vfmacc_vv_f64m4 | |||||
| #define VFMVVF_FLOAT vfmv_v_f_f64m4 | |||||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||||
| #define VFDOTVV_FLOAT vfdot_vv_f64m4 | |||||
| #endif | #endif | ||||
| #if defined(DSDOT) | #if defined(DSDOT) | ||||
| @@ -61,8 +65,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
| FLOAT_V_T vr, vx, vy; | FLOAT_V_T vr, vx, vy; | ||||
| unsigned int gvl = 0; | unsigned int gvl = 0; | ||||
| FLOAT_V_T_M1 v_res, v_z0; | |||||
| gvl = VSETVL_MAX; | |||||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||||
| v_z0 = VFMVVF_FLOAT_M1(0, gvl); | |||||
| if(inc_x == 1 && inc_y == 1){ | if(inc_x == 1 && inc_y == 1){ | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| vr = VFMVVF_FLOAT(0, gvl); | vr = VFMVVF_FLOAT(0, gvl); | ||||
| for(i=0,j=0; i<n/gvl; i++){ | for(i=0,j=0; i<n/gvl; i++){ | ||||
| vx = VLEV_FLOAT(&x[j], gvl); | vx = VLEV_FLOAT(&x[j], gvl); | ||||
| @@ -71,23 +80,22 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
| j += gvl; | j += gvl; | ||||
| } | } | ||||
| if(j > 0){ | if(j > 0){ | ||||
| vx = VFMVVF_FLOAT(0, gvl); | |||||
| vx = VFREDSUM_FLOAT(vr, vx, gvl); | |||||
| dot += vx[0]; | |||||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
| dot += v_res[0]; | |||||
| } | } | ||||
| //tail | //tail | ||||
| if(j < n){ | if(j < n){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| vx = VLEV_FLOAT(&x[j], gvl); | vx = VLEV_FLOAT(&x[j], gvl); | ||||
| vy = VLEV_FLOAT(&y[j], gvl); | vy = VLEV_FLOAT(&y[j], gvl); | ||||
| FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); | FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); | ||||
| //vr = VFDOTVV_FLOAT(vx, vy, gvl); | //vr = VFDOTVV_FLOAT(vx, vy, gvl); | ||||
| vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); | vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); | ||||
| vx = VFREDSUM_FLOAT(vr, vz, gvl); | |||||
| dot += vx[0]; | |||||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
| dot += v_res[0]; | |||||
| } | } | ||||
| }else if(inc_y == 1){ | }else if(inc_y == 1){ | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| vr = VFMVVF_FLOAT(0, gvl); | vr = VFMVVF_FLOAT(0, gvl); | ||||
| unsigned int stride_x = inc_x * sizeof(FLOAT); | unsigned int stride_x = inc_x * sizeof(FLOAT); | ||||
| for(i=0,j=0; i<n/gvl; i++){ | for(i=0,j=0; i<n/gvl; i++){ | ||||
| @@ -97,23 +105,22 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
| j += gvl; | j += gvl; | ||||
| } | } | ||||
| if(j > 0){ | if(j > 0){ | ||||
| vx = VFMVVF_FLOAT(0, gvl); | |||||
| vx = VFREDSUM_FLOAT(vr, vx, gvl); | |||||
| dot += vx[0]; | |||||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
| dot += v_res[0]; | |||||
| } | } | ||||
| //tail | //tail | ||||
| if(j < n){ | if(j < n){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| vx = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | vx = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | ||||
| vy = VLEV_FLOAT(&y[j], gvl); | vy = VLEV_FLOAT(&y[j], gvl); | ||||
| FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); | FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); | ||||
| //vr = VFDOTVV_FLOAT(vx, vy, gvl); | //vr = VFDOTVV_FLOAT(vx, vy, gvl); | ||||
| vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); | vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); | ||||
| vx = VFREDSUM_FLOAT(vr, vz, gvl); | |||||
| dot += vx[0]; | |||||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
| dot += v_res[0]; | |||||
| } | } | ||||
| }else if(inc_x == 1){ | }else if(inc_x == 1){ | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| vr = VFMVVF_FLOAT(0, gvl); | vr = VFMVVF_FLOAT(0, gvl); | ||||
| unsigned int stride_y = inc_y * sizeof(FLOAT); | unsigned int stride_y = inc_y * sizeof(FLOAT); | ||||
| for(i=0,j=0; i<n/gvl; i++){ | for(i=0,j=0; i<n/gvl; i++){ | ||||
| @@ -123,23 +130,22 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
| j += gvl; | j += gvl; | ||||
| } | } | ||||
| if(j > 0){ | if(j > 0){ | ||||
| vx = VFMVVF_FLOAT(0, gvl); | |||||
| vx = VFREDSUM_FLOAT(vr, vx, gvl); | |||||
| dot += vx[0]; | |||||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
| dot += v_res[0]; | |||||
| } | } | ||||
| //tail | //tail | ||||
| if(j < n){ | if(j < n){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| vx = VLEV_FLOAT(&x[j], gvl); | vx = VLEV_FLOAT(&x[j], gvl); | ||||
| vy = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl); | vy = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl); | ||||
| FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); | FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); | ||||
| //vr = VFDOTVV_FLOAT(vx, vy, gvl); | //vr = VFDOTVV_FLOAT(vx, vy, gvl); | ||||
| vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); | vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); | ||||
| vx = VFREDSUM_FLOAT(vr, vz, gvl); | |||||
| dot += vx[0]; | |||||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
| dot += v_res[0]; | |||||
| } | } | ||||
| }else{ | }else{ | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| vr = VFMVVF_FLOAT(0, gvl); | vr = VFMVVF_FLOAT(0, gvl); | ||||
| unsigned int stride_x = inc_x * sizeof(FLOAT); | unsigned int stride_x = inc_x * sizeof(FLOAT); | ||||
| unsigned int stride_y = inc_y * sizeof(FLOAT); | unsigned int stride_y = inc_y * sizeof(FLOAT); | ||||
| @@ -150,20 +156,19 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
| j += gvl; | j += gvl; | ||||
| } | } | ||||
| if(j > 0){ | if(j > 0){ | ||||
| vx = VFMVVF_FLOAT(0, gvl); | |||||
| vx = VFREDSUM_FLOAT(vr, vx, gvl); | |||||
| dot += vx[0]; | |||||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
| dot += v_res[0]; | |||||
| } | } | ||||
| //tail | //tail | ||||
| if(j < n){ | if(j < n){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| vx = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | vx = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | ||||
| vy = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl); | vy = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl); | ||||
| FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); | FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); | ||||
| //vr = VFDOTVV_FLOAT(vx, vy, gvl); | //vr = VFDOTVV_FLOAT(vx, vy, gvl); | ||||
| vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); | vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); | ||||
| vx = VFREDSUM_FLOAT(vr, vz, gvl); | |||||
| dot += vx[0]; | |||||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
| dot += v_res[0]; | |||||
| } | } | ||||
| } | } | ||||
| return(dot); | return(dot); | ||||
| @@ -27,23 +27,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
| #define RVV_EFLOAT RVV_E32 | |||||
| #define RVV_M RVV_M4 | |||||
| #define FLOAT_V_T float32xm4_t | |||||
| #define VLEV_FLOAT vlev_float32xm4 | |||||
| #define VLSEV_FLOAT vlsev_float32xm4 | |||||
| #define VSEV_FLOAT vsev_float32xm4 | |||||
| #define VSSEV_FLOAT vssev_float32xm4 | |||||
| #define VFMACCVF_FLOAT vfmaccvf_float32xm4 | |||||
| #define VSETVL(n) vsetvl_e32m4(n) | |||||
| #define FLOAT_V_T vfloat32m4_t | |||||
| #define VLEV_FLOAT vle_v_f32m4 | |||||
| #define VLSEV_FLOAT vlse_v_f32m4 | |||||
| #define VSEV_FLOAT vse_v_f32m4 | |||||
| #define VSSEV_FLOAT vsse_v_f32m4 | |||||
| #define VFMACCVF_FLOAT vfmacc_vf_f32m4 | |||||
| #else | #else | ||||
| #define RVV_EFLOAT RVV_E64 | |||||
| #define RVV_M RVV_M4 | |||||
| #define FLOAT_V_T float64xm4_t | |||||
| #define VLEV_FLOAT vlev_float64xm4 | |||||
| #define VLSEV_FLOAT vlsev_float64xm4 | |||||
| #define VSEV_FLOAT vsev_float64xm4 | |||||
| #define VSSEV_FLOAT vssev_float64xm4 | |||||
| #define VFMACCVF_FLOAT vfmaccvf_float64xm4 | |||||
| #define VSETVL(n) vsetvl_e64m4(n) | |||||
| #define FLOAT_V_T vfloat64m4_t | |||||
| #define VLEV_FLOAT vle_v_f64m4 | |||||
| #define VLSEV_FLOAT vlse_v_f64m4 | |||||
| #define VSEV_FLOAT vse_v_f64m4 | |||||
| #define VSSEV_FLOAT vsse_v_f64m4 | |||||
| #define VFMACCVF_FLOAT vfmacc_vf_f64m4 | |||||
| #endif | #endif | ||||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | ||||
| @@ -57,7 +55,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO | |||||
| FLOAT_V_T va0, va1, vy0, vy1; | FLOAT_V_T va0, va1, vy0, vy1; | ||||
| unsigned int gvl = 0; | unsigned int gvl = 0; | ||||
| if(inc_y == 1){ | if(inc_y == 1){ | ||||
| gvl = vsetvli(m, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(m); | |||||
| if(gvl <= m/2){ | if(gvl <= m/2){ | ||||
| for(k=0,j=0; k<m/(2*gvl); k++){ | for(k=0,j=0; k<m/(2*gvl); k++){ | ||||
| a_ptr = a; | a_ptr = a; | ||||
| @@ -81,7 +79,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO | |||||
| } | } | ||||
| //tail | //tail | ||||
| for(;j < m;){ | for(;j < m;){ | ||||
| gvl = vsetvli(m-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(m-j); | |||||
| a_ptr = a; | a_ptr = a; | ||||
| ix = 0; | ix = 0; | ||||
| vy0 = VLEV_FLOAT(&y[j], gvl); | vy0 = VLEV_FLOAT(&y[j], gvl); | ||||
| @@ -98,7 +96,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO | |||||
| } | } | ||||
| }else{ | }else{ | ||||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | BLASLONG stride_y = inc_y * sizeof(FLOAT); | ||||
| gvl = vsetvli(m, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(m); | |||||
| if(gvl <= m/2){ | if(gvl <= m/2){ | ||||
| BLASLONG inc_yv = inc_y * gvl; | BLASLONG inc_yv = inc_y * gvl; | ||||
| for(k=0,j=0; k<m/(2*gvl); k++){ | for(k=0,j=0; k<m/(2*gvl); k++){ | ||||
| @@ -124,7 +122,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO | |||||
| } | } | ||||
| //tail | //tail | ||||
| for(;j < m;){ | for(;j < m;){ | ||||
| gvl = vsetvli(m-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(m-j); | |||||
| a_ptr = a; | a_ptr = a; | ||||
| ix = 0; | ix = 0; | ||||
| vy0 = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl); | vy0 = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl); | ||||
| @@ -27,41 +27,50 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
| #define RVV_EFLOAT RVV_E32 | |||||
| #define RVV_M RVV_M4 | |||||
| #define FLOAT_V_T float32xm4_t | |||||
| #define VLEV_FLOAT vlev_float32xm4 | |||||
| #define VLSEV_FLOAT vlsev_float32xm4 | |||||
| #define VFREDSUM_FLOAT vfredsumvs_float32xm4 | |||||
| #define VFMACCVV_FLOAT vfmaccvv_float32xm4 | |||||
| #define VFMVVF_FLOAT vfmvvf_float32xm4 | |||||
| #define VFDOTVV_FLOAT vfdotvv_float32xm4 | |||||
| #define VFMULVV_FLOAT vfmulvv_float32xm4 | |||||
| #define VSETVL(n) vsetvl_e32m4(n) | |||||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||||
| #define FLOAT_V_T vfloat32m4_t | |||||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||||
| #define VLEV_FLOAT vle_v_f32m4 | |||||
| #define VLSEV_FLOAT vlse_v_f32m4 | |||||
| #define VFREDSUM_FLOAT vfredsum_vs_f32m4_f32m1 | |||||
| #define VFMACCVV_FLOAT vfmacc_vv_f32m4 | |||||
| #define VFMVVF_FLOAT vfmv_v_f_f32m4 | |||||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||||
| #define VFDOTVV_FLOAT vfdot_vv_f32m4 | |||||
| #define VFMULVV_FLOAT vfmul_vv_f32m4 | |||||
| #else | #else | ||||
| #define RVV_EFLOAT RVV_E64 | |||||
| #define RVV_M RVV_M4 | |||||
| #define FLOAT_V_T float64xm4_t | |||||
| #define VLEV_FLOAT vlev_float64xm4 | |||||
| #define VLSEV_FLOAT vlsev_float64xm4 | |||||
| #define VFREDSUM_FLOAT vfredsumvs_float64xm4 | |||||
| #define VFMACCVV_FLOAT vfmaccvv_float64xm4 | |||||
| #define VFMVVF_FLOAT vfmvvf_float64xm4 | |||||
| #define VFDOTVV_FLOAT vfdotvv_float64xm4 | |||||
| #define VFMULVV_FLOAT vfmulvv_float64xm4 | |||||
| #define VSETVL(n) vsetvl_e64m4(n) | |||||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||||
| #define FLOAT_V_T vfloat64m4_t | |||||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||||
| #define VLEV_FLOAT vle_v_f64m4 | |||||
| #define VLSEV_FLOAT vlse_v_f64m4 | |||||
| #define VFREDSUM_FLOAT vfredsum_vs_f64m4_f64m1 | |||||
| #define VFMACCVV_FLOAT vfmacc_vv_f64m4 | |||||
| #define VFMVVF_FLOAT vfmv_v_f_f64m4 | |||||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||||
| #define VFDOTVV_FLOAT vfdot_vv_f64m4 | |||||
| #define VFMULVV_FLOAT vfmul_vv_f64m4 | |||||
| #endif | #endif | ||||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | ||||
| { | { | ||||
| BLASLONG i = 0, j = 0, k = 0; | |||||
| BLASLONG ix = 0, iy = 0; | |||||
| FLOAT *a_ptr = a; | |||||
| BLASLONG i = 0, j = 0, k = 0; | |||||
| BLASLONG ix = 0, iy = 0; | |||||
| FLOAT *a_ptr = a; | |||||
| FLOAT temp; | FLOAT temp; | ||||
| FLOAT_V_T va, vr, vx; | FLOAT_V_T va, vr, vx; | ||||
| unsigned int gvl = 0; | unsigned int gvl = 0; | ||||
| FLOAT_V_T_M1 v_res, v_z0; | |||||
| gvl = VSETVL_MAX; | |||||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||||
| v_z0 = VFMVVF_FLOAT_M1(0, gvl); | |||||
| if(inc_x == 1){ | if(inc_x == 1){ | ||||
| for(i = 0; i < n; i++){ | for(i = 0; i < n; i++){ | ||||
| gvl = vsetvli(m, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(m); | |||||
| j = 0; | j = 0; | ||||
| vr = VFMVVF_FLOAT(0, gvl); | vr = VFMVVF_FLOAT(0, gvl); | ||||
| for(k = 0; k < m/gvl; k++){ | for(k = 0; k < m/gvl; k++){ | ||||
| @@ -70,29 +79,26 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO | |||||
| vr = VFMACCVV_FLOAT(vr, va, vx, gvl); | vr = VFMACCVV_FLOAT(vr, va, vx, gvl); | ||||
| j += gvl; | j += gvl; | ||||
| } | } | ||||
| va = VFMVVF_FLOAT(0, gvl); | |||||
| va = VFREDSUM_FLOAT(vr, va, gvl); | |||||
| temp = va[0]; | |||||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
| temp = v_res[0]; | |||||
| if(j < m){ | if(j < m){ | ||||
| gvl = vsetvli(m-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(m-j); | |||||
| va = VLEV_FLOAT(&a_ptr[j], gvl); | va = VLEV_FLOAT(&a_ptr[j], gvl); | ||||
| vx = VLEV_FLOAT(&x[j], gvl); | vx = VLEV_FLOAT(&x[j], gvl); | ||||
| vr = VFMULVV_FLOAT(va, vx, gvl); | vr = VFMULVV_FLOAT(va, vx, gvl); | ||||
| va = VFMVVF_FLOAT(0, gvl); | |||||
| va = VFREDSUM_FLOAT(vr, va, gvl); | |||||
| temp += va[0]; | |||||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
| temp += v_res[0]; | |||||
| } | } | ||||
| y[iy] += alpha * temp; | y[iy] += alpha * temp; | ||||
| iy += inc_y; | iy += inc_y; | ||||
| a_ptr += lda; | a_ptr += lda; | ||||
| } | } | ||||
| }else{ | }else{ | ||||
| gvl = vsetvli(m, RVV_EFLOAT, RVV_M); | |||||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | BLASLONG stride_x = inc_x * sizeof(FLOAT); | ||||
| BLASLONG inc_xv = inc_x * gvl; | BLASLONG inc_xv = inc_x * gvl; | ||||
| for(i = 0; i < n; i++){ | for(i = 0; i < n; i++){ | ||||
| gvl = vsetvli(m, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(m); | |||||
| j = 0; | j = 0; | ||||
| ix = 0; | ix = 0; | ||||
| vr = VFMVVF_FLOAT(0, gvl); | vr = VFMVVF_FLOAT(0, gvl); | ||||
| @@ -103,18 +109,16 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO | |||||
| j += gvl; | j += gvl; | ||||
| ix += inc_xv; | ix += inc_xv; | ||||
| } | } | ||||
| va = VFMVVF_FLOAT(0, gvl); | |||||
| va = VFREDSUM_FLOAT(vr, va, gvl); | |||||
| temp = va[0]; | |||||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
| temp = v_res[0]; | |||||
| if(j < m){ | if(j < m){ | ||||
| gvl = vsetvli(m-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(m-j); | |||||
| va = VLEV_FLOAT(&a_ptr[j], gvl); | va = VLEV_FLOAT(&a_ptr[j], gvl); | ||||
| vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); | vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); | ||||
| vr = VFMULVV_FLOAT(va, vx, gvl); | vr = VFMULVV_FLOAT(va, vx, gvl); | ||||
| va = VFMVVF_FLOAT(0, gvl); | |||||
| va = VFREDSUM_FLOAT(vr, va, gvl); | |||||
| temp += va[0]; | |||||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
| temp += v_res[0]; | |||||
| } | } | ||||
| y[iy] += alpha * temp; | y[iy] += alpha * temp; | ||||
| iy += inc_y; | iy += inc_y; | ||||
| @@ -31,49 +31,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #if defined(DOUBLE) | #if defined(DOUBLE) | ||||
| #define ABS fabs | #define ABS fabs | ||||
| #define RVV_EFLOAT RVV_E64 | |||||
| #define RVV_M RVV_M8 | |||||
| #define FLOAT_V_T float64xm8_t | |||||
| #define VLEV_FLOAT vlev_float64xm8 | |||||
| #define VLSEV_FLOAT vlsev_float64xm8 | |||||
| #define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 | |||||
| #define MASK_T e64xm8_t | |||||
| #define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 | |||||
| #define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 | |||||
| #define VFMVVF_FLOAT vfmvvf_float64xm8 | |||||
| #define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 | |||||
| #define VFMAXVV_FLOAT vfmaxvv_float64xm8 | |||||
| #define VMFGEVF_FLOAT vmfgevf_e64xm8_float64xm8 | |||||
| #define VMFIRSTM vmfirstm_e64xm8 | |||||
| #define UINT_V_T uint64xm8_t | |||||
| #define VIDV_MASK_UINT vidv_mask_uint64xm8 | |||||
| #define VIDV_UINT vidv_uint64xm8 | |||||
| #define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 | |||||
| #define VADDVX_UINT vaddvx_uint64xm8 | |||||
| #define VMVVX_UINT vmvvx_uint64xm8 | |||||
| #define VSETVL(n) vsetvl_e64m8(n) | |||||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||||
| #define FLOAT_V_T vfloat64m8_t | |||||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||||
| #define VLEV_FLOAT vle_v_f64m8 | |||||
| #define VLSEV_FLOAT vlse_v_f64m8 | |||||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 | |||||
| #define MASK_T vbool8_t | |||||
| #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 | |||||
| #define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 | |||||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m | |||||
| #define VFMAXVV_FLOAT vfmax_vv_f64m8 | |||||
| #define VMFGEVF_FLOAT vmfge_vf_f64m8_b8 | |||||
| #define VMFIRSTM vmfirst_m_b8 | |||||
| #define UINT_V_T vuint64m8_t | |||||
| #define VIDV_MASK_UINT vid_v_u64m8_m | |||||
| #define VIDV_UINT vid_v_u64m8 | |||||
| #define VADDVX_MASK_UINT vadd_vx_u64m8_m | |||||
| #define VADDVX_UINT vadd_vx_u64m8 | |||||
| #define VMVVX_UINT vmv_v_x_u64m8 | |||||
| #else | #else | ||||
| #define ABS fabsf | #define ABS fabsf | ||||
| #define RVV_EFLOAT RVV_E32 | |||||
| #define RVV_M RVV_M8 | |||||
| #define FLOAT_V_T float32xm8_t | |||||
| #define VLEV_FLOAT vlev_float32xm8 | |||||
| #define VLSEV_FLOAT vlsev_float32xm8 | |||||
| #define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 | |||||
| #define MASK_T e32xm8_t | |||||
| #define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 | |||||
| #define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 | |||||
| #define VFMVVF_FLOAT vfmvvf_float32xm8 | |||||
| #define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 | |||||
| #define VFMAXVV_FLOAT vfmaxvv_float32xm8 | |||||
| #define VMFGEVF_FLOAT vmfgevf_e32xm8_float32xm8 | |||||
| #define VMFIRSTM vmfirstm_e32xm8 | |||||
| #define UINT_V_T uint32xm8_t | |||||
| #define VIDV_MASK_UINT vidv_mask_uint32xm8 | |||||
| #define VIDV_UINT vidv_uint32xm8 | |||||
| #define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 | |||||
| #define VADDVX_UINT vaddvx_uint32xm8 | |||||
| #define VMVVX_UINT vmvvx_uint32xm8 | |||||
| #define VSETVL(n) vsetvl_e32m8(n) | |||||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||||
| #define FLOAT_V_T vfloat32m8_t | |||||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||||
| #define VLEV_FLOAT vle_v_f32m8 | |||||
| #define VLSEV_FLOAT vlse_v_f32m8 | |||||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 | |||||
| #define MASK_T vbool4_t | |||||
| #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 | |||||
| #define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 | |||||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m | |||||
| #define VFMAXVV_FLOAT vfmax_vv_f32m8 | |||||
| #define VMFGEVF_FLOAT vmfge_vf_f32m8_b4 | |||||
| #define VMFIRSTM vmfirst_m_b4 | |||||
| #define UINT_V_T vuint32m8_t | |||||
| #define VIDV_MASK_UINT vid_v_u32m8_m | |||||
| #define VIDV_UINT vid_v_u32m8 | |||||
| #define VADDVX_MASK_UINT vadd_vx_u32m8_m | |||||
| #define VADDVX_UINT vadd_vx_u32m8 | |||||
| #define VMVVX_UINT vmv_v_x_u32m8 | |||||
| #endif | #endif | ||||
| @@ -88,42 +92,45 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| UINT_V_T v_max_index; | UINT_V_T v_max_index; | ||||
| MASK_T mask; | MASK_T mask; | ||||
| unsigned int gvl = 0; | unsigned int gvl = 0; | ||||
| FLOAT_V_T_M1 v_res, v_z0; | |||||
| gvl = VSETVL_MAX; | |||||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||||
| v_z0 = VFMVVF_FLOAT_M1(0, gvl); | |||||
| if(inc_x == 1){ | if(inc_x == 1){ | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| v_max_index = VMVVX_UINT(0, gvl); | v_max_index = VMVVX_UINT(0, gvl); | ||||
| v_max = VFMVVF_FLOAT(-1, gvl); | v_max = VFMVVF_FLOAT(-1, gvl); | ||||
| for(i=0,j=0; i < n/gvl; i++){ | for(i=0,j=0; i < n/gvl; i++){ | ||||
| vx = VLEV_FLOAT(&x[j], gvl); | vx = VLEV_FLOAT(&x[j], gvl); | ||||
| //fabs(vector) | //fabs(vector) | ||||
| mask = VMFLTVF_FLOAT(vx, 0, gvl); | mask = VMFLTVF_FLOAT(vx, 0, gvl); | ||||
| vx = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); | |||||
| vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); | |||||
| //index where element greater than v_max | //index where element greater than v_max | ||||
| mask = VMFLTVV_FLOAT(v_max, vx, gvl); | mask = VMFLTVV_FLOAT(v_max, vx, gvl); | ||||
| v_max_index = VIDV_MASK_UINT(v_max_index, mask, gvl); | |||||
| v_max_index = VADDVX_MASK_UINT(v_max_index, v_max_index, j, mask, gvl); | |||||
| v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); | |||||
| v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j,gvl); | |||||
| //update v_max and start_index j | //update v_max and start_index j | ||||
| v_max = VFMAXVV_FLOAT(v_max, vx, gvl); | v_max = VFMAXVV_FLOAT(v_max, vx, gvl); | ||||
| j += gvl; | j += gvl; | ||||
| } | } | ||||
| vx = VFMVVF_FLOAT(0, gvl); | |||||
| vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); | |||||
| maxf = vx[0]; | |||||
| v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); | |||||
| maxf = v_res[0]; | |||||
| mask = VMFGEVF_FLOAT(v_max, maxf, gvl); | mask = VMFGEVF_FLOAT(v_max, maxf, gvl); | ||||
| max_index = VMFIRSTM(mask,gvl); | max_index = VMFIRSTM(mask,gvl); | ||||
| max_index = v_max_index[max_index]; | max_index = v_max_index[max_index]; | ||||
| if(j < n){ | if(j < n){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| vx = VLEV_FLOAT(&x[j], gvl); | vx = VLEV_FLOAT(&x[j], gvl); | ||||
| //fabs(vector) | //fabs(vector) | ||||
| mask = VMFLTVF_FLOAT(vx, 0, gvl); | mask = VMFLTVF_FLOAT(vx, 0, gvl); | ||||
| v_max = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); | |||||
| v_max = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); | |||||
| vx = VFMVVF_FLOAT(0, gvl); | |||||
| vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); | |||||
| FLOAT cur_maxf = vx[0]; | |||||
| v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); | |||||
| FLOAT cur_maxf = v_res[0]; | |||||
| if(cur_maxf > maxf){ | if(cur_maxf > maxf){ | ||||
| //tail index | //tail index | ||||
| v_max_index = VIDV_UINT(gvl); | v_max_index = VIDV_UINT(gvl); | ||||
| @@ -135,7 +142,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| } | } | ||||
| } | } | ||||
| }else{ | }else{ | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| unsigned int stride_x = inc_x * sizeof(FLOAT); | unsigned int stride_x = inc_x * sizeof(FLOAT); | ||||
| unsigned int idx = 0, inc_v = gvl * inc_x; | unsigned int idx = 0, inc_v = gvl * inc_x; | ||||
| @@ -145,35 +152,33 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); | vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); | ||||
| //fabs(vector) | //fabs(vector) | ||||
| mask = VMFLTVF_FLOAT(vx, 0, gvl); | mask = VMFLTVF_FLOAT(vx, 0, gvl); | ||||
| vx = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); | |||||
| vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); | |||||
| //index where element greater than v_max | //index where element greater than v_max | ||||
| mask = VMFLTVV_FLOAT(v_max, vx, gvl); | mask = VMFLTVV_FLOAT(v_max, vx, gvl); | ||||
| v_max_index = VIDV_MASK_UINT(v_max_index, mask, gvl); | |||||
| v_max_index = VADDVX_MASK_UINT(v_max_index, v_max_index, j, mask, gvl); | |||||
| v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); | |||||
| v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, gvl); | |||||
| //update v_max and start_index j | //update v_max and start_index j | ||||
| v_max = VFMAXVV_FLOAT(v_max, vx, gvl); | v_max = VFMAXVV_FLOAT(v_max, vx, gvl); | ||||
| j += gvl; | j += gvl; | ||||
| idx += inc_v; | idx += inc_v; | ||||
| } | } | ||||
| vx = VFMVVF_FLOAT(0, gvl); | |||||
| vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); | |||||
| maxf = vx[0]; | |||||
| v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); | |||||
| maxf = v_res[0]; | |||||
| mask = VMFGEVF_FLOAT(v_max, maxf, gvl); | mask = VMFGEVF_FLOAT(v_max, maxf, gvl); | ||||
| max_index = VMFIRSTM(mask,gvl); | max_index = VMFIRSTM(mask,gvl); | ||||
| max_index = v_max_index[max_index]; | max_index = v_max_index[max_index]; | ||||
| if(j < n){ | if(j < n){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); | vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); | ||||
| //fabs(vector) | //fabs(vector) | ||||
| mask = VMFLTVF_FLOAT(vx, 0, gvl); | mask = VMFLTVF_FLOAT(vx, 0, gvl); | ||||
| v_max = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); | |||||
| v_max = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); | |||||
| vx = VFMVVF_FLOAT(0, gvl); | |||||
| vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); | |||||
| FLOAT cur_maxf = vx[0]; | |||||
| v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); | |||||
| FLOAT cur_maxf = v_res[0]; | |||||
| if(cur_maxf > maxf){ | if(cur_maxf > maxf){ | ||||
| //tail index | //tail index | ||||
| v_max_index = VIDV_UINT(gvl); | v_max_index = VIDV_UINT(gvl); | ||||
| @@ -32,49 +32,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #if defined(DOUBLE) | #if defined(DOUBLE) | ||||
| #define ABS fabs | #define ABS fabs | ||||
| #define RVV_EFLOAT RVV_E64 | |||||
| #define RVV_M RVV_M8 | |||||
| #define FLOAT_V_T float64xm8_t | |||||
| #define VLEV_FLOAT vlev_float64xm8 | |||||
| #define VLSEV_FLOAT vlsev_float64xm8 | |||||
| #define VFREDMINVS_FLOAT vfredminvs_float64xm8 | |||||
| #define MASK_T e64xm8_t | |||||
| #define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 | |||||
| #define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 | |||||
| #define VFMVVF_FLOAT vfmvvf_float64xm8 | |||||
| #define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 | |||||
| #define VFMINVV_FLOAT vfminvv_float64xm8 | |||||
| #define VMFLEVF_FLOAT vmflevf_e64xm8_float64xm8 | |||||
| #define VMFIRSTM vmfirstm_e64xm8 | |||||
| #define UINT_V_T uint64xm8_t | |||||
| #define VIDV_MASK_UINT vidv_mask_uint64xm8 | |||||
| #define VIDV_UINT vidv_uint64xm8 | |||||
| #define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 | |||||
| #define VADDVX_UINT vaddvx_uint64xm8 | |||||
| #define VMVVX_UINT vmvvx_uint64xm8 | |||||
| #define VSETVL(n) vsetvl_e64m8(n) | |||||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||||
| #define FLOAT_V_T vfloat64m8_t | |||||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||||
| #define VLEV_FLOAT vle_v_f64m8 | |||||
| #define VLSEV_FLOAT vlse_v_f64m8 | |||||
| #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 | |||||
| #define MASK_T vbool8_t | |||||
| #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 | |||||
| #define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 | |||||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m | |||||
| #define VFMINVV_FLOAT vfmin_vv_f64m8 | |||||
| #define VMFLEVF_FLOAT vmfle_vf_f64m8_b8 | |||||
| #define VMFIRSTM vmfirst_m_b8 | |||||
| #define UINT_V_T vuint64m8_t | |||||
| #define VIDV_MASK_UINT vid_v_u64m8_m | |||||
| #define VIDV_UINT vid_v_u64m8 | |||||
| #define VADDVX_MASK_UINT vadd_vx_u64m8_m | |||||
| #define VADDVX_UINT vadd_vx_u64m8 | |||||
| #define VMVVX_UINT vmv_v_x_u64m8 | |||||
| #else | #else | ||||
| #define ABS fabsf | #define ABS fabsf | ||||
| #define RVV_EFLOAT RVV_E32 | |||||
| #define RVV_M RVV_M8 | |||||
| #define FLOAT_V_T float32xm8_t | |||||
| #define VLEV_FLOAT vlev_float32xm8 | |||||
| #define VLSEV_FLOAT vlsev_float32xm8 | |||||
| #define VFREDMINVS_FLOAT vfredminvs_float32xm8 | |||||
| #define MASK_T e32xm8_t | |||||
| #define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 | |||||
| #define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 | |||||
| #define VFMVVF_FLOAT vfmvvf_float32xm8 | |||||
| #define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 | |||||
| #define VFMINVV_FLOAT vfminvv_float32xm8 | |||||
| #define VMFLEVF_FLOAT vmflevf_e32xm8_float32xm8 | |||||
| #define VMFIRSTM vmfirstm_e32xm8 | |||||
| #define UINT_V_T uint32xm8_t | |||||
| #define VIDV_MASK_UINT vidv_mask_uint32xm8 | |||||
| #define VIDV_UINT vidv_uint32xm8 | |||||
| #define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 | |||||
| #define VADDVX_UINT vaddvx_uint32xm8 | |||||
| #define VMVVX_UINT vmvvx_uint32xm8 | |||||
| #define VSETVL(n) vsetvl_e32m8(n) | |||||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||||
| #define FLOAT_V_T vfloat32m8_t | |||||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||||
| #define VLEV_FLOAT vle_v_f32m8 | |||||
| #define VLSEV_FLOAT vlse_v_f32m8 | |||||
| #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 | |||||
| #define MASK_T vbool4_t | |||||
| #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 | |||||
| #define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 | |||||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m | |||||
| #define VFMINVV_FLOAT vfmin_vv_f32m8 | |||||
| #define VMFLEVF_FLOAT vmfle_vf_f32m8_b4 | |||||
| #define VMFIRSTM vmfirst_m_b4 | |||||
| #define UINT_V_T vuint32m8_t | |||||
| #define VIDV_MASK_UINT vid_v_u32m8_m | |||||
| #define VIDV_UINT vid_v_u32m8 | |||||
| #define VADDVX_MASK_UINT vadd_vx_u32m8_m | |||||
| #define VADDVX_UINT vadd_vx_u32m8 | |||||
| #define VMVVX_UINT vmv_v_x_u32m8 | |||||
| #endif | #endif | ||||
| @@ -89,42 +93,45 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| UINT_V_T v_min_index; | UINT_V_T v_min_index; | ||||
| MASK_T mask; | MASK_T mask; | ||||
| unsigned int gvl = 0; | unsigned int gvl = 0; | ||||
| FLOAT_V_T_M1 v_res, v_max; | |||||
| gvl = VSETVL_MAX; | |||||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||||
| v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); | |||||
| if(inc_x == 1){ | if(inc_x == 1){ | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| v_min = VFMVVF_FLOAT(FLT_MAX, gvl); | v_min = VFMVVF_FLOAT(FLT_MAX, gvl); | ||||
| v_min_index = VMVVX_UINT(0, gvl); | v_min_index = VMVVX_UINT(0, gvl); | ||||
| for(i=0,j=0; i < n/gvl; i++){ | for(i=0,j=0; i < n/gvl; i++){ | ||||
| vx = VLEV_FLOAT(&x[j], gvl); | vx = VLEV_FLOAT(&x[j], gvl); | ||||
| //fabs(vector) | //fabs(vector) | ||||
| mask = VMFLTVF_FLOAT(vx, 0, gvl); | mask = VMFLTVF_FLOAT(vx, 0, gvl); | ||||
| vx = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); | |||||
| vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); | |||||
| //index where element less than v_min | //index where element less than v_min | ||||
| mask = VMFLTVV_FLOAT(vx, v_min, gvl); | mask = VMFLTVV_FLOAT(vx, v_min, gvl); | ||||
| v_min_index = VIDV_MASK_UINT(v_min_index, mask, gvl); | |||||
| v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask, gvl); | |||||
| v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); | |||||
| v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, gvl); | |||||
| //update v_min and start_index j | //update v_min and start_index j | ||||
| v_min = VFMINVV_FLOAT(v_min, vx, gvl); | v_min = VFMINVV_FLOAT(v_min, vx, gvl); | ||||
| j += gvl; | j += gvl; | ||||
| } | } | ||||
| vx = VFMVVF_FLOAT(FLT_MAX, gvl); | |||||
| vx = VFREDMINVS_FLOAT(v_min, vx, gvl); | |||||
| minf = vx[0]; | |||||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||||
| minf = v_res[0]; | |||||
| mask = VMFLEVF_FLOAT(v_min, minf, gvl); | mask = VMFLEVF_FLOAT(v_min, minf, gvl); | ||||
| min_index = VMFIRSTM(mask,gvl); | min_index = VMFIRSTM(mask,gvl); | ||||
| min_index = v_min_index[min_index]; | min_index = v_min_index[min_index]; | ||||
| if(j < n){ | if(j < n){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| vx = VLEV_FLOAT(&x[j], gvl); | vx = VLEV_FLOAT(&x[j], gvl); | ||||
| //fabs(vector) | //fabs(vector) | ||||
| mask = VMFLTVF_FLOAT(vx, 0, gvl); | mask = VMFLTVF_FLOAT(vx, 0, gvl); | ||||
| v_min = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); | |||||
| v_min = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); | |||||
| vx = VFMVVF_FLOAT(FLT_MAX, gvl); | |||||
| vx = VFREDMINVS_FLOAT(v_min, vx, gvl); | |||||
| FLOAT cur_minf = vx[0]; | |||||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||||
| FLOAT cur_minf = v_res[0]; | |||||
| if(cur_minf < minf){ | if(cur_minf < minf){ | ||||
| //tail index | //tail index | ||||
| v_min_index = VIDV_UINT(gvl); | v_min_index = VIDV_UINT(gvl); | ||||
| @@ -136,7 +143,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| } | } | ||||
| } | } | ||||
| }else{ | }else{ | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| unsigned int stride_x = inc_x * sizeof(FLOAT); | unsigned int stride_x = inc_x * sizeof(FLOAT); | ||||
| unsigned int idx = 0, inc_v = gvl * inc_x; | unsigned int idx = 0, inc_v = gvl * inc_x; | ||||
| @@ -146,35 +153,33 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); | vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); | ||||
| //fabs(vector) | //fabs(vector) | ||||
| mask = VMFLTVF_FLOAT(vx, 0, gvl); | mask = VMFLTVF_FLOAT(vx, 0, gvl); | ||||
| vx = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); | |||||
| vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); | |||||
| //index where element less than v_min | //index where element less than v_min | ||||
| mask = VMFLTVV_FLOAT(vx, v_min, gvl); | mask = VMFLTVV_FLOAT(vx, v_min, gvl); | ||||
| v_min_index = VIDV_MASK_UINT(v_min_index, mask, gvl); | |||||
| v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask, gvl); | |||||
| v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); | |||||
| v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, gvl); | |||||
| //update v_min and start_index j | //update v_min and start_index j | ||||
| v_min = VFMINVV_FLOAT(v_min, vx, gvl); | v_min = VFMINVV_FLOAT(v_min, vx, gvl); | ||||
| j += gvl; | j += gvl; | ||||
| idx += inc_v; | idx += inc_v; | ||||
| } | } | ||||
| vx = VFMVVF_FLOAT(FLT_MAX, gvl); | |||||
| vx = VFREDMINVS_FLOAT(v_min, vx, gvl); | |||||
| minf = vx[0]; | |||||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||||
| minf = v_res[0]; | |||||
| mask = VMFLEVF_FLOAT(v_min, minf, gvl); | mask = VMFLEVF_FLOAT(v_min, minf, gvl); | ||||
| min_index = VMFIRSTM(mask,gvl); | min_index = VMFIRSTM(mask,gvl); | ||||
| min_index = v_min_index[min_index]; | min_index = v_min_index[min_index]; | ||||
| if(j < n){ | if(j < n){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); | vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); | ||||
| //fabs(vector) | //fabs(vector) | ||||
| mask = VMFLTVF_FLOAT(vx, 0, gvl); | mask = VMFLTVF_FLOAT(vx, 0, gvl); | ||||
| v_min = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); | |||||
| v_min = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); | |||||
| vx = VFMVVF_FLOAT(FLT_MAX, gvl); | |||||
| vx = VFREDMINVS_FLOAT(v_min, vx, gvl); | |||||
| FLOAT cur_minf = vx[0]; | |||||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||||
| FLOAT cur_minf = v_res[0]; | |||||
| if(cur_minf < minf){ | if(cur_minf < minf){ | ||||
| //tail index | //tail index | ||||
| v_min_index = VIDV_UINT(gvl); | v_min_index = VIDV_UINT(gvl); | ||||
| @@ -32,45 +32,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #if defined(DOUBLE) | #if defined(DOUBLE) | ||||
| #define ABS fabs | #define ABS fabs | ||||
| #define RVV_EFLOAT RVV_E64 | |||||
| #define RVV_M RVV_M8 | |||||
| #define FLOAT_V_T float64xm8_t | |||||
| #define VLEV_FLOAT vlev_float64xm8 | |||||
| #define VLSEV_FLOAT vlsev_float64xm8 | |||||
| #define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 | |||||
| #define MASK_T e64xm8_t | |||||
| #define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 | |||||
| #define VFMVVF_FLOAT vfmvvf_float64xm8 | |||||
| #define VFMAXVV_FLOAT vfmaxvv_float64xm8 | |||||
| #define VMFGEVF_FLOAT vmfgevf_e64xm8_float64xm8 | |||||
| #define VMFIRSTM vmfirstm_e64xm8 | |||||
| #define UINT_V_T uint64xm8_t | |||||
| #define VIDV_MASK_UINT vidv_mask_uint64xm8 | |||||
| #define VIDV_UINT vidv_uint64xm8 | |||||
| #define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 | |||||
| #define VADDVX_UINT vaddvx_uint64xm8 | |||||
| #define VMVVX_UINT vmvvx_uint64xm8 | |||||
| #define VSETVL(n) vsetvl_e64m8(n) | |||||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||||
| #define FLOAT_V_T vfloat64m8_t | |||||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||||
| #define VLEV_FLOAT vle_v_f64m8 | |||||
| #define VLSEV_FLOAT vlse_v_f64m8 | |||||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 | |||||
| #define MASK_T vbool8_t | |||||
| #define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 | |||||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||||
| #define VFMAXVV_FLOAT vfmax_vv_f64m8 | |||||
| #define VMFGEVF_FLOAT vmfge_vf_f64m8_b8 | |||||
| #define VMFIRSTM vmfirst_m_b8 | |||||
| #define UINT_V_T vuint64m8_t | |||||
| #define VIDV_MASK_UINT vid_v_u64m8_m | |||||
| #define VIDV_UINT vid_v_u64m8 | |||||
| #define VADDVX_MASK_UINT vadd_vx_u64m8_m | |||||
| #define VADDVX_UINT vadd_vx_u64m8 | |||||
| #define VMVVX_UINT vmv_v_x_u64m8 | |||||
| #else | #else | ||||
| #define ABS fabsf | #define ABS fabsf | ||||
| #define RVV_EFLOAT RVV_E32 | |||||
| #define RVV_M RVV_M8 | |||||
| #define FLOAT_V_T float32xm8_t | |||||
| #define VLEV_FLOAT vlev_float32xm8 | |||||
| #define VLSEV_FLOAT vlsev_float32xm8 | |||||
| #define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 | |||||
| #define MASK_T e32xm8_t | |||||
| #define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 | |||||
| #define VFMVVF_FLOAT vfmvvf_float32xm8 | |||||
| #define VFMAXVV_FLOAT vfmaxvv_float32xm8 | |||||
| #define VMFGEVF_FLOAT vmfgevf_e32xm8_float32xm8 | |||||
| #define VMFIRSTM vmfirstm_e32xm8 | |||||
| #define UINT_V_T uint32xm8_t | |||||
| #define VIDV_MASK_UINT vidv_mask_uint32xm8 | |||||
| #define VIDV_UINT vidv_uint32xm8 | |||||
| #define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 | |||||
| #define VADDVX_UINT vaddvx_uint32xm8 | |||||
| #define VMVVX_UINT vmvvx_uint32xm8 | |||||
| #define VSETVL(n) vsetvl_e32m8(n) | |||||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||||
| #define FLOAT_V_T vfloat32m8_t | |||||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||||
| #define VLEV_FLOAT vle_v_f32m8 | |||||
| #define VLSEV_FLOAT vlse_v_f32m8 | |||||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 | |||||
| #define MASK_T vbool4_t | |||||
| #define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 | |||||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||||
| #define VFMAXVV_FLOAT vfmax_vv_f32m8 | |||||
| #define VMFGEVF_FLOAT vmfge_vf_f32m8_b4 | |||||
| #define VMFIRSTM vmfirst_m_b4 | |||||
| #define UINT_V_T vuint32m8_t | |||||
| #define VIDV_MASK_UINT vid_v_u32m8_m | |||||
| #define VIDV_UINT vid_v_u32m8 | |||||
| #define VADDVX_MASK_UINT vadd_vx_u32m8_m | |||||
| #define VADDVX_UINT vadd_vx_u32m8 | |||||
| #define VMVVX_UINT vmv_v_x_u32m8 | |||||
| #endif | #endif | ||||
| @@ -85,8 +89,13 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| UINT_V_T v_max_index; | UINT_V_T v_max_index; | ||||
| MASK_T mask; | MASK_T mask; | ||||
| unsigned int gvl = 0; | unsigned int gvl = 0; | ||||
| FLOAT_V_T_M1 v_res, v_min; | |||||
| gvl = VSETVL_MAX; | |||||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||||
| v_min = VFMVVF_FLOAT_M1(-FLT_MAX, gvl); | |||||
| if(inc_x == 1){ | if(inc_x == 1){ | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| v_max_index = VMVVX_UINT(0, gvl); | v_max_index = VMVVX_UINT(0, gvl); | ||||
| v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); | v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); | ||||
| for(i=0,j=0; i < n/gvl; i++){ | for(i=0,j=0; i < n/gvl; i++){ | ||||
| @@ -94,27 +103,25 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| //index where element greater than v_max | //index where element greater than v_max | ||||
| mask = VMFLTVV_FLOAT(v_max, vx, gvl); | mask = VMFLTVV_FLOAT(v_max, vx, gvl); | ||||
| v_max_index = VIDV_MASK_UINT(v_max_index, mask, gvl); | |||||
| v_max_index = VADDVX_MASK_UINT(v_max_index, v_max_index, j, mask, gvl); | |||||
| v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); | |||||
| v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j,gvl); | |||||
| //update v_max and start_index j | //update v_max and start_index j | ||||
| v_max = VFMAXVV_FLOAT(v_max, vx, gvl); | v_max = VFMAXVV_FLOAT(v_max, vx, gvl); | ||||
| j += gvl; | j += gvl; | ||||
| } | } | ||||
| vx = VFMVVF_FLOAT(-FLT_MAX, gvl); | |||||
| vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); | |||||
| maxf = vx[0]; | |||||
| v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); | |||||
| maxf = v_res[0]; | |||||
| mask = VMFGEVF_FLOAT(v_max, maxf, gvl); | mask = VMFGEVF_FLOAT(v_max, maxf, gvl); | ||||
| max_index = VMFIRSTM(mask,gvl); | max_index = VMFIRSTM(mask,gvl); | ||||
| max_index = v_max_index[max_index]; | max_index = v_max_index[max_index]; | ||||
| if(j < n){ | if(j < n){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| v_max = VLEV_FLOAT(&x[j], gvl); | v_max = VLEV_FLOAT(&x[j], gvl); | ||||
| vx = VFMVVF_FLOAT(-FLT_MAX, gvl); | |||||
| vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); | |||||
| FLOAT cur_maxf = vx[0]; | |||||
| v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); | |||||
| FLOAT cur_maxf = v_res[0]; | |||||
| if(cur_maxf > maxf){ | if(cur_maxf > maxf){ | ||||
| //tail index | //tail index | ||||
| v_max_index = VIDV_UINT(gvl); | v_max_index = VIDV_UINT(gvl); | ||||
| @@ -126,7 +133,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| } | } | ||||
| } | } | ||||
| }else{ | }else{ | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| unsigned int stride_x = inc_x * sizeof(FLOAT); | unsigned int stride_x = inc_x * sizeof(FLOAT); | ||||
| unsigned int idx = 0, inc_v = gvl * inc_x; | unsigned int idx = 0, inc_v = gvl * inc_x; | ||||
| @@ -137,28 +144,26 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| //index where element greater than v_max | //index where element greater than v_max | ||||
| mask = VMFLTVV_FLOAT(v_max, vx, gvl); | mask = VMFLTVV_FLOAT(v_max, vx, gvl); | ||||
| v_max_index = VIDV_MASK_UINT(v_max_index, mask, gvl); | |||||
| v_max_index = VADDVX_MASK_UINT(v_max_index, v_max_index, j, mask, gvl); | |||||
| v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); | |||||
| v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j,gvl); | |||||
| //update v_max and start_index j | //update v_max and start_index j | ||||
| v_max = VFMAXVV_FLOAT(v_max, vx, gvl); | v_max = VFMAXVV_FLOAT(v_max, vx, gvl); | ||||
| j += gvl; | j += gvl; | ||||
| idx += inc_v; | idx += inc_v; | ||||
| } | } | ||||
| vx = VFMVVF_FLOAT(-FLT_MAX, gvl); | |||||
| vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); | |||||
| maxf = vx[0]; | |||||
| v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); | |||||
| maxf = v_res[0]; | |||||
| mask = VMFGEVF_FLOAT(v_max, maxf, gvl); | mask = VMFGEVF_FLOAT(v_max, maxf, gvl); | ||||
| max_index = VMFIRSTM(mask,gvl); | max_index = VMFIRSTM(mask,gvl); | ||||
| max_index = v_max_index[max_index]; | max_index = v_max_index[max_index]; | ||||
| if(j < n){ | if(j < n){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| v_max = VLSEV_FLOAT(&x[idx], stride_x, gvl); | v_max = VLSEV_FLOAT(&x[idx], stride_x, gvl); | ||||
| vx = VFMVVF_FLOAT(-FLT_MAX, gvl); | |||||
| vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); | |||||
| FLOAT cur_maxf = vx[0]; | |||||
| v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); | |||||
| FLOAT cur_maxf = v_res[0]; | |||||
| if(cur_maxf > maxf){ | if(cur_maxf > maxf){ | ||||
| //tail index | //tail index | ||||
| v_max_index = VIDV_UINT(gvl); | v_max_index = VIDV_UINT(gvl); | ||||
| @@ -32,45 +32,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #if defined(DOUBLE) | #if defined(DOUBLE) | ||||
| #define ABS fabs | #define ABS fabs | ||||
| #define RVV_EFLOAT RVV_E64 | |||||
| #define RVV_M RVV_M8 | |||||
| #define FLOAT_V_T float64xm8_t | |||||
| #define VLEV_FLOAT vlev_float64xm8 | |||||
| #define VLSEV_FLOAT vlsev_float64xm8 | |||||
| #define VFREDMINVS_FLOAT vfredminvs_float64xm8 | |||||
| #define MASK_T e64xm8_t | |||||
| #define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 | |||||
| #define VFMVVF_FLOAT vfmvvf_float64xm8 | |||||
| #define VFMINVV_FLOAT vfminvv_float64xm8 | |||||
| #define VMFLEVF_FLOAT vmflevf_e64xm8_float64xm8 | |||||
| #define VMFIRSTM vmfirstm_e64xm8 | |||||
| #define UINT_V_T uint64xm8_t | |||||
| #define VIDV_MASK_UINT vidv_mask_uint64xm8 | |||||
| #define VIDV_UINT vidv_uint64xm8 | |||||
| #define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 | |||||
| #define VADDVX_UINT vaddvx_uint64xm8 | |||||
| #define VMVVX_UINT vmvvx_uint64xm8 | |||||
| #define VSETVL(n) vsetvl_e64m8(n) | |||||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||||
| #define FLOAT_V_T vfloat64m8_t | |||||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||||
| #define VLEV_FLOAT vle_v_f64m8 | |||||
| #define VLSEV_FLOAT vlse_v_f64m8 | |||||
| #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 | |||||
| #define MASK_T vbool8_t | |||||
| #define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 | |||||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||||
| #define VFMINVV_FLOAT vfmin_vv_f64m8 | |||||
| #define VMFLEVF_FLOAT vmfle_vf_f64m8_b8 | |||||
| #define VMFIRSTM vmfirst_m_b8 | |||||
| #define UINT_V_T vuint64m8_t | |||||
| #define VIDV_MASK_UINT vid_v_u64m8_m | |||||
| #define VIDV_UINT vid_v_u64m8 | |||||
| #define VADDVX_MASK_UINT vadd_vx_u64m8_m | |||||
| #define VADDVX_UINT vadd_vx_u64m8 | |||||
| #define VMVVX_UINT vmv_v_x_u64m8 | |||||
| #else | #else | ||||
| #define ABS fabsf | #define ABS fabsf | ||||
| #define RVV_EFLOAT RVV_E32 | |||||
| #define RVV_M RVV_M8 | |||||
| #define FLOAT_V_T float32xm8_t | |||||
| #define VLEV_FLOAT vlev_float32xm8 | |||||
| #define VLSEV_FLOAT vlsev_float32xm8 | |||||
| #define VFREDMINVS_FLOAT vfredminvs_float32xm8 | |||||
| #define MASK_T e32xm8_t | |||||
| #define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 | |||||
| #define VFMVVF_FLOAT vfmvvf_float32xm8 | |||||
| #define VFMINVV_FLOAT vfminvv_float32xm8 | |||||
| #define VMFLEVF_FLOAT vmflevf_e32xm8_float32xm8 | |||||
| #define VMFIRSTM vmfirstm_e32xm8 | |||||
| #define UINT_V_T uint32xm8_t | |||||
| #define VIDV_MASK_UINT vidv_mask_uint32xm8 | |||||
| #define VIDV_UINT vidv_uint32xm8 | |||||
| #define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 | |||||
| #define VADDVX_UINT vaddvx_uint32xm8 | |||||
| #define VMVVX_UINT vmvvx_uint32xm8 | |||||
| #define VSETVL(n) vsetvl_e32m8(n) | |||||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||||
| #define FLOAT_V_T vfloat32m8_t | |||||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||||
| #define VLEV_FLOAT vle_v_f32m8 | |||||
| #define VLSEV_FLOAT vlse_v_f32m8 | |||||
| #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 | |||||
| #define MASK_T vbool4_t | |||||
| #define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 | |||||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||||
| #define VFMINVV_FLOAT vfmin_vv_f32m8 | |||||
| #define VMFLEVF_FLOAT vmfle_vf_f32m8_b4 | |||||
| #define VMFIRSTM vmfirst_m_b4 | |||||
| #define UINT_V_T vuint32m8_t | |||||
| #define VIDV_MASK_UINT vid_v_u32m8_m | |||||
| #define VIDV_UINT vid_v_u32m8 | |||||
| #define VADDVX_MASK_UINT vadd_vx_u32m8_m | |||||
| #define VADDVX_UINT vadd_vx_u32m8 | |||||
| #define VMVVX_UINT vmv_v_x_u32m8 | |||||
| #endif | #endif | ||||
| @@ -85,15 +89,20 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| UINT_V_T v_min_index; | UINT_V_T v_min_index; | ||||
| MASK_T mask; | MASK_T mask; | ||||
| unsigned int gvl = 0; | unsigned int gvl = 0; | ||||
| FLOAT_V_T_M1 v_res, v_max; | |||||
| gvl = VSETVL_MAX; | |||||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||||
| v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); | |||||
| if(inc_x == 1){ | if(inc_x == 1){ | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| v_min = VFMVVF_FLOAT(FLT_MAX, gvl); | v_min = VFMVVF_FLOAT(FLT_MAX, gvl); | ||||
| v_min_index = VMVVX_UINT(0, gvl); | v_min_index = VMVVX_UINT(0, gvl); | ||||
| for(i=0,j=0; i < n/gvl; i++){ | for(i=0,j=0; i < n/gvl; i++){ | ||||
| vx = VLEV_FLOAT(&x[j], gvl); | vx = VLEV_FLOAT(&x[j], gvl); | ||||
| //index where element less than v_min | //index where element less than v_min | ||||
| mask = VMFLTVV_FLOAT(vx, v_min, gvl); | mask = VMFLTVV_FLOAT(vx, v_min, gvl); | ||||
| v_min_index = VIDV_MASK_UINT(v_min_index, mask, gvl); | |||||
| v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); | |||||
| /* | /* | ||||
| #if defined(DOUBLE) | #if defined(DOUBLE) | ||||
| asm volatile( | asm volatile( | ||||
| @@ -113,26 +122,24 @@ asm volatile( | |||||
| :"v0"); | :"v0"); | ||||
| #endif | #endif | ||||
| */ | */ | ||||
| v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask, gvl); | |||||
| v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j,gvl); | |||||
| //update v_min and start_index j | //update v_min and start_index j | ||||
| v_min = VFMINVV_FLOAT(v_min, vx, gvl); | v_min = VFMINVV_FLOAT(v_min, vx, gvl); | ||||
| j += gvl; | j += gvl; | ||||
| } | } | ||||
| vx = VFMVVF_FLOAT(FLT_MAX, gvl); | |||||
| vx = VFREDMINVS_FLOAT(v_min, vx, gvl); | |||||
| minf = vx[0]; | |||||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||||
| minf = v_res[0]; | |||||
| mask = VMFLEVF_FLOAT(v_min, minf, gvl); | mask = VMFLEVF_FLOAT(v_min, minf, gvl); | ||||
| min_index = VMFIRSTM(mask,gvl); | min_index = VMFIRSTM(mask,gvl); | ||||
| min_index = v_min_index[min_index]; | min_index = v_min_index[min_index]; | ||||
| if(j < n){ | if(j < n){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| v_min = VLEV_FLOAT(&x[j], gvl); | v_min = VLEV_FLOAT(&x[j], gvl); | ||||
| vx = VFMVVF_FLOAT(FLT_MAX, gvl); | |||||
| vx = VFREDMINVS_FLOAT(v_min, vx, gvl); | |||||
| FLOAT cur_minf = vx[0]; | |||||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||||
| FLOAT cur_minf = v_res[0]; | |||||
| if(cur_minf < minf){ | if(cur_minf < minf){ | ||||
| //tail index | //tail index | ||||
| v_min_index = VIDV_UINT(gvl); | v_min_index = VIDV_UINT(gvl); | ||||
| @@ -143,7 +150,7 @@ asm volatile( | |||||
| } | } | ||||
| } | } | ||||
| }else{ | }else{ | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| unsigned int stride_x = inc_x * sizeof(FLOAT); | unsigned int stride_x = inc_x * sizeof(FLOAT); | ||||
| unsigned int idx = 0, inc_v = gvl * inc_x; | unsigned int idx = 0, inc_v = gvl * inc_x; | ||||
| @@ -154,7 +161,7 @@ asm volatile( | |||||
| //index where element less than v_min | //index where element less than v_min | ||||
| mask = VMFLTVV_FLOAT(vx, v_min, gvl); | mask = VMFLTVV_FLOAT(vx, v_min, gvl); | ||||
| v_min_index = VIDV_MASK_UINT(v_min_index, mask, gvl); | |||||
| v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); | |||||
| /* | /* | ||||
| #if defined(DOUBLE) | #if defined(DOUBLE) | ||||
| asm volatile( | asm volatile( | ||||
| @@ -175,27 +182,25 @@ asm volatile( | |||||
| #endif | #endif | ||||
| */ | */ | ||||
| v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask, gvl); | |||||
| v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j,gvl); | |||||
| //update v_min and start_index j | //update v_min and start_index j | ||||
| v_min = VFMINVV_FLOAT(v_min, vx, gvl); | v_min = VFMINVV_FLOAT(v_min, vx, gvl); | ||||
| j += gvl; | j += gvl; | ||||
| idx += inc_v; | idx += inc_v; | ||||
| } | } | ||||
| vx = VFMVVF_FLOAT(FLT_MAX, gvl); | |||||
| vx = VFREDMINVS_FLOAT(v_min, vx, gvl); | |||||
| minf = vx[0]; | |||||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||||
| minf = v_res[0]; | |||||
| mask = VMFLEVF_FLOAT(v_min, minf, gvl); | mask = VMFLEVF_FLOAT(v_min, minf, gvl); | ||||
| min_index = VMFIRSTM(mask,gvl); | min_index = VMFIRSTM(mask,gvl); | ||||
| min_index = v_min_index[min_index]; | min_index = v_min_index[min_index]; | ||||
| if(j < n){ | if(j < n){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| v_min = VLSEV_FLOAT(&x[idx], stride_x, gvl); | v_min = VLSEV_FLOAT(&x[idx], stride_x, gvl); | ||||
| vx = VFMVVF_FLOAT(FLT_MAX, gvl); | |||||
| vx = VFREDMINVS_FLOAT(v_min, vx, gvl); | |||||
| FLOAT cur_minf = vx[0]; | |||||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||||
| FLOAT cur_minf = v_res[0]; | |||||
| if(cur_minf < minf){ | if(cur_minf < minf){ | ||||
| //tail index | //tail index | ||||
| v_min_index = VIDV_UINT(gvl); | v_min_index = VIDV_UINT(gvl); | ||||
| @@ -30,47 +30,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #if defined(DOUBLE) | #if defined(DOUBLE) | ||||
| #define RVV_EFLOAT RVV_E64 | |||||
| #define FLOAT_V_T float64xm8_t | |||||
| #define VLSEV_FLOAT vlsev_float64xm8 | |||||
| #define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 | |||||
| #define MASK_T e64xm8_t | |||||
| #define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 | |||||
| #define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 | |||||
| #define VFMVVF_FLOAT vfmvvf_float64xm8 | |||||
| #define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 | |||||
| #define VFMAXVV_FLOAT vfmaxvv_float64xm8 | |||||
| #define VMFGEVF_FLOAT vmfgevf_e64xm8_float64xm8 | |||||
| #define VMFIRSTM vmfirstm_e64xm8 | |||||
| #define UINT_V_T uint64xm8_t | |||||
| #define VIDV_MASK_UINT vidv_mask_uint64xm8 | |||||
| #define VIDV_UINT vidv_uint64xm8 | |||||
| #define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 | |||||
| #define VADDVX_UINT vaddvx_uint64xm8 | |||||
| #define VFADDVV_FLOAT vfaddvv_float64xm8 | |||||
| #define VMVVX_UINT vmvvx_uint64xm8 | |||||
| #define VSETVL(n) vsetvl_e64m8(n) | |||||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||||
| #define FLOAT_V_T vfloat64m8_t | |||||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||||
| #define VLSEV_FLOAT vlse_v_f64m8 | |||||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 | |||||
| #define MASK_T vbool8_t | |||||
| #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 | |||||
| #define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 | |||||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m | |||||
| #define VFMAXVV_FLOAT vfmax_vv_f64m8 | |||||
| #define VMFGEVF_FLOAT vmfge_vf_f64m8_b8 | |||||
| #define VMFIRSTM vmfirst_m_b8 | |||||
| #define UINT_V_T vuint64m8_t | |||||
| #define VIDV_MASK_UINT vid_v_u64m8_m | |||||
| #define VIDV_UINT vid_v_u64m8 | |||||
| #define VADDVX_MASK_UINT vadd_vx_u64m8_m | |||||
| #define VADDVX_UINT vadd_vx_u64m8 | |||||
| #define VFADDVV_FLOAT vfadd_vv_f64m8 | |||||
| #define VMVVX_UINT vmv_v_x_u64m8 | |||||
| #else | #else | ||||
| #define ABS fabsf | #define ABS fabsf | ||||
| #define RVV_EFLOAT RVV_E32 | |||||
| #define FLOAT_V_T float32xm8_t | |||||
| #define VLSEV_FLOAT vlsev_float32xm8 | |||||
| #define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 | |||||
| #define MASK_T e32xm8_t | |||||
| #define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 | |||||
| #define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 | |||||
| #define VFMVVF_FLOAT vfmvvf_float32xm8 | |||||
| #define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 | |||||
| #define VFMAXVV_FLOAT vfmaxvv_float32xm8 | |||||
| #define VMFGEVF_FLOAT vmfgevf_e32xm8_float32xm8 | |||||
| #define VMFIRSTM vmfirstm_e32xm8 | |||||
| #define UINT_V_T uint32xm8_t | |||||
| #define VIDV_MASK_UINT vidv_mask_uint32xm8 | |||||
| #define VIDV_UINT vidv_uint32xm8 | |||||
| #define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 | |||||
| #define VADDVX_UINT vaddvx_uint32xm8 | |||||
| #define VFADDVV_FLOAT vfaddvv_float32xm8 | |||||
| #define VMVVX_UINT vmvvx_uint32xm8 | |||||
| #define VSETVL(n) vsetvl_e32m8(n) | |||||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||||
| #define FLOAT_V_T vfloat32m8_t | |||||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||||
| #define VLSEV_FLOAT vlse_v_f32m8 | |||||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 | |||||
| #define MASK_T vbool4_t | |||||
| #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 | |||||
| #define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 | |||||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m | |||||
| #define VFMAXVV_FLOAT vfmax_vv_f32m8 | |||||
| #define VMFGEVF_FLOAT vmfge_vf_f32m8_b4 | |||||
| #define VMFIRSTM vmfirst_m_b4 | |||||
| #define UINT_V_T vuint32m8_t | |||||
| #define VIDV_MASK_UINT vid_v_u32m8_m | |||||
| #define VIDV_UINT vid_v_u32m8 | |||||
| #define VADDVX_MASK_UINT vadd_vx_u32m8_m | |||||
| #define VADDVX_UINT vadd_vx_u32m8 | |||||
| #define VFADDVV_FLOAT vfadd_vv_f32m8 | |||||
| #define VMVVX_UINT vmv_v_x_u32m8 | |||||
| #endif | #endif | ||||
| #define RVV_M RVV_M8 | #define RVV_M RVV_M8 | ||||
| @@ -86,7 +92,12 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| UINT_V_T v_max_index; | UINT_V_T v_max_index; | ||||
| MASK_T mask0, mask1; | MASK_T mask0, mask1; | ||||
| unsigned int gvl = 0; | unsigned int gvl = 0; | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| FLOAT_V_T_M1 v_res, v_z0; | |||||
| gvl = VSETVL_MAX; | |||||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||||
| v_z0 = VFMVVF_FLOAT_M1(0, gvl); | |||||
| gvl = VSETVL(n); | |||||
| v_max_index = VMVVX_UINT(0, gvl); | v_max_index = VMVVX_UINT(0, gvl); | ||||
| v_max = VFMVVF_FLOAT(-1, gvl); | v_max = VFMVVF_FLOAT(-1, gvl); | ||||
| BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); | BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); | ||||
| @@ -96,7 +107,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | ||||
| //fabs(vector) | //fabs(vector) | ||||
| mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); | mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); | ||||
| vx0 = VFRSUBVF_MASK_FLOAT(vx0, vx0, 0, mask0, gvl); | |||||
| vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); | |||||
| /* | /* | ||||
| #if defined(DOUBLE) | #if defined(DOUBLE) | ||||
| asm volatile( | asm volatile( | ||||
| @@ -119,7 +130,7 @@ asm volatile( | |||||
| vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | ||||
| //fabs(vector) | //fabs(vector) | ||||
| mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); | mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); | ||||
| vx1 = VFRSUBVF_MASK_FLOAT(vx1, vx1, 0, mask1, gvl); | |||||
| vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl); | |||||
| /* | /* | ||||
| #if defined(DOUBLE) | #if defined(DOUBLE) | ||||
| asm volatile( | asm volatile( | ||||
| @@ -143,7 +154,7 @@ asm volatile( | |||||
| //index where element greater than v_max | //index where element greater than v_max | ||||
| mask0 = VMFLTVV_FLOAT(v_max, vx0, gvl); | mask0 = VMFLTVV_FLOAT(v_max, vx0, gvl); | ||||
| v_max_index = VIDV_MASK_UINT(v_max_index, mask0, gvl); | |||||
| v_max_index = VIDV_MASK_UINT(mask0, v_max_index, gvl); | |||||
| /* | /* | ||||
| #if defined(DOUBLE) | #if defined(DOUBLE) | ||||
| asm volatile( | asm volatile( | ||||
| @@ -163,7 +174,7 @@ asm volatile( | |||||
| :"v0"); | :"v0"); | ||||
| #endif | #endif | ||||
| */ | */ | ||||
| v_max_index = VADDVX_MASK_UINT(v_max_index, v_max_index, j, mask0, gvl); | |||||
| v_max_index = VADDVX_MASK_UINT(mask0, v_max_index, v_max_index, j, gvl); | |||||
| //update v_max and start_index j | //update v_max and start_index j | ||||
| v_max = VFMAXVV_FLOAT(v_max, vx0, gvl); | v_max = VFMAXVV_FLOAT(v_max, vx0, gvl); | ||||
| @@ -171,19 +182,19 @@ asm volatile( | |||||
| ix += inc_xv; | ix += inc_xv; | ||||
| } | } | ||||
| vx0 = VFMVVF_FLOAT(0, gvl); | vx0 = VFMVVF_FLOAT(0, gvl); | ||||
| vx0 = VFREDMAXVS_FLOAT(v_max, vx0, gvl); | |||||
| maxf = vx0[0]; | |||||
| v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); | |||||
| maxf = v_res[0]; | |||||
| mask0 = VMFGEVF_FLOAT(v_max, maxf, gvl); | mask0 = VMFGEVF_FLOAT(v_max, maxf, gvl); | ||||
| max_index = VMFIRSTM(mask0,gvl); | max_index = VMFIRSTM(mask0,gvl); | ||||
| max_index = v_max_index[max_index]; | max_index = v_max_index[max_index]; | ||||
| if(j < n){ | if(j < n){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| v_max_index = VMVVX_UINT(0, gvl); | v_max_index = VMVVX_UINT(0, gvl); | ||||
| vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | ||||
| //fabs(vector) | //fabs(vector) | ||||
| mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); | mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); | ||||
| vx0 = VFRSUBVF_MASK_FLOAT(vx0, vx0, 0, mask0, gvl); | |||||
| vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); | |||||
| /* | /* | ||||
| #if defined(DOUBLE) | #if defined(DOUBLE) | ||||
| asm volatile( | asm volatile( | ||||
| @@ -206,7 +217,7 @@ asm volatile( | |||||
| vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | ||||
| //fabs(vector) | //fabs(vector) | ||||
| mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); | mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); | ||||
| vx1 = VFRSUBVF_MASK_FLOAT(vx1, vx1, 0, mask1, gvl); | |||||
| vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl); | |||||
| /* | /* | ||||
| #if defined(DOUBLE) | #if defined(DOUBLE) | ||||
| asm volatile( | asm volatile( | ||||
| @@ -227,9 +238,8 @@ asm volatile( | |||||
| #endif | #endif | ||||
| */ | */ | ||||
| v_max = VFADDVV_FLOAT(vx0, vx1, gvl); | v_max = VFADDVV_FLOAT(vx0, vx1, gvl); | ||||
| vx0 = VFMVVF_FLOAT(0, gvl); | |||||
| vx0 = VFREDMAXVS_FLOAT(v_max, vx0, gvl); | |||||
| FLOAT cur_maxf = vx0[0]; | |||||
| v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); | |||||
| FLOAT cur_maxf = v_res[0]; | |||||
| if(cur_maxf > maxf){ | if(cur_maxf > maxf){ | ||||
| //tail index | //tail index | ||||
| v_max_index = VIDV_UINT(gvl); | v_max_index = VIDV_UINT(gvl); | ||||
| @@ -31,50 +31,55 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #if defined(DOUBLE) | #if defined(DOUBLE) | ||||
| #define RVV_EFLOAT RVV_E64 | |||||
| #define FLOAT_V_T float64xm8_t | |||||
| #define VLSEV_FLOAT vlsev_float64xm8 | |||||
| #define VFREDMINVS_FLOAT vfredminvs_float64xm8 | |||||
| #define MASK_T e64xm8_t | |||||
| #define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 | |||||
| #define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 | |||||
| #define VFMVVF_FLOAT vfmvvf_float64xm8 | |||||
| #define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 | |||||
| #define VFMINVV_FLOAT vfminvv_float64xm8 | |||||
| #define VMFLEVF_FLOAT vmflevf_e64xm8_float64xm8 | |||||
| #define VMFIRSTM vmfirstm_e64xm8 | |||||
| #define UINT_V_T uint64xm8_t | |||||
| #define VIDV_MASK_UINT vidv_mask_uint64xm8 | |||||
| #define VIDV_UINT vidv_uint64xm8 | |||||
| #define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 | |||||
| #define VADDVX_UINT vaddvx_uint64xm8 | |||||
| #define VFADDVV_FLOAT vfaddvv_float64xm8 | |||||
| #define VMVVX_UINT vmvvx_uint64xm8 | |||||
| #define VSETVL(n) vsetvl_e64m8(n) | |||||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||||
| #define FLOAT_V_T vfloat64m8_t | |||||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||||
| #define VLSEV_FLOAT vlse_v_f64m8 | |||||
| #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 | |||||
| #define MASK_T vbool8_t | |||||
| #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 | |||||
| #define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 | |||||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m | |||||
| #define VFMINVV_FLOAT vfmin_vv_f64m8 | |||||
| #define VMFLEVF_FLOAT vmfle_vf_f64m8_b8 | |||||
| #define VMFIRSTM vmfirst_m_b8 | |||||
| #define UINT_V_T vuint64m8_t | |||||
| #define VIDV_MASK_UINT vid_v_u64m8_m | |||||
| #define VIDV_UINT vid_v_u64m8 | |||||
| #define VADDVX_MASK_UINT vadd_vx_u64m8_m | |||||
| #define VADDVX_UINT vadd_vx_u64m8 | |||||
| #define VFADDVV_FLOAT vfadd_vv_f64m8 | |||||
| #define VMVVX_UINT vmv_v_x_u64m8 | |||||
| #else | #else | ||||
| #define ABS fabsf | #define ABS fabsf | ||||
| #define RVV_EFLOAT RVV_E32 | |||||
| #define FLOAT_V_T float32xm8_t | |||||
| #define VLSEV_FLOAT vlsev_float32xm8 | |||||
| #define VFREDMINVS_FLOAT vfredminvs_float32xm8 | |||||
| #define MASK_T e32xm8_t | |||||
| #define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 | |||||
| #define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 | |||||
| #define VFMVVF_FLOAT vfmvvf_float32xm8 | |||||
| #define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 | |||||
| #define VFMINVV_FLOAT vfminvv_float32xm8 | |||||
| #define VMFLEVF_FLOAT vmflevf_e32xm8_float32xm8 | |||||
| #define VMFIRSTM vmfirstm_e32xm8 | |||||
| #define UINT_V_T uint32xm8_t | |||||
| #define VIDV_MASK_UINT vidv_mask_uint32xm8 | |||||
| #define VIDV_UINT vidv_uint32xm8 | |||||
| #define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 | |||||
| #define VADDVX_UINT vaddvx_uint32xm8 | |||||
| #define VFADDVV_FLOAT vfaddvv_float32xm8 | |||||
| #define VMVVX_UINT vmvvx_uint32xm8 | |||||
| #define VSETVL(n) vsetvl_e32m8(n) | |||||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||||
| #define FLOAT_V_T vfloat32m8_t | |||||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||||
| #define VLSEV_FLOAT vlse_v_f32m8 | |||||
| #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 | |||||
| #define MASK_T vbool4_t | |||||
| #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 | |||||
| #define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 | |||||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m | |||||
| #define VFMINVV_FLOAT vfmin_vv_f32m8 | |||||
| #define VMFLEVF_FLOAT vmfle_vf_f32m8_b4 | |||||
| #define VMFIRSTM vmfirst_m_b4 | |||||
| #define UINT_V_T vuint32m8_t | |||||
| #define VIDV_MASK_UINT vid_v_u32m8_m | |||||
| #define VIDV_UINT vid_v_u32m8 | |||||
| #define VADDVX_MASK_UINT vadd_vx_u32m8_m | |||||
| #define VADDVX_UINT vadd_vx_u32m8 | |||||
| #define VFADDVV_FLOAT vfadd_vv_f32m8 | |||||
| #define VMVVX_UINT vmv_v_x_u32m8 | |||||
| #endif | #endif | ||||
| #define RVV_M RVV_M8 | |||||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | ||||
| { | { | ||||
| @@ -87,7 +92,12 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| UINT_V_T v_min_index; | UINT_V_T v_min_index; | ||||
| MASK_T mask0, mask1; | MASK_T mask0, mask1; | ||||
| unsigned int gvl = 0; | unsigned int gvl = 0; | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| FLOAT_V_T_M1 v_res, v_max; | |||||
| gvl = VSETVL_MAX; | |||||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||||
| v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); | |||||
| gvl = VSETVL(n); | |||||
| v_min_index = VMVVX_UINT(0, gvl); | v_min_index = VMVVX_UINT(0, gvl); | ||||
| v_min = VFMVVF_FLOAT(FLT_MAX, gvl); | v_min = VFMVVF_FLOAT(FLT_MAX, gvl); | ||||
| BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); | BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); | ||||
| @@ -97,7 +107,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | ||||
| //fabs(vector) | //fabs(vector) | ||||
| mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); | mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); | ||||
| vx0 = VFRSUBVF_MASK_FLOAT(vx0, vx0, 0, mask0, gvl); | |||||
| vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); | |||||
| /* | /* | ||||
| #if defined(DOUBLE) | #if defined(DOUBLE) | ||||
| asm volatile( | asm volatile( | ||||
| @@ -120,7 +130,7 @@ asm volatile( | |||||
| vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | ||||
| //fabs(vector) | //fabs(vector) | ||||
| mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); | mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); | ||||
| vx1 = VFRSUBVF_MASK_FLOAT(vx1, vx1, 0, mask1, gvl); | |||||
| vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl); | |||||
| /* | /* | ||||
| #if defined(DOUBLE) | #if defined(DOUBLE) | ||||
| asm volatile( | asm volatile( | ||||
| @@ -144,7 +154,7 @@ asm volatile( | |||||
| //index where element less than v_min | //index where element less than v_min | ||||
| mask0 = VMFLTVV_FLOAT(vx0, v_min, gvl); | mask0 = VMFLTVV_FLOAT(vx0, v_min, gvl); | ||||
| v_min_index = VIDV_MASK_UINT(v_min_index, mask0, gvl); | |||||
| v_min_index = VIDV_MASK_UINT(mask0, v_min_index, gvl); | |||||
| /* | /* | ||||
| #if defined(DOUBLE) | #if defined(DOUBLE) | ||||
| asm volatile( | asm volatile( | ||||
| @@ -164,27 +174,26 @@ asm volatile( | |||||
| :"v0"); | :"v0"); | ||||
| #endif | #endif | ||||
| */ | */ | ||||
| v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask0, gvl); | |||||
| v_min_index = VADDVX_MASK_UINT(mask0, v_min_index, v_min_index, j, gvl); | |||||
| //update v_min and start_index j | //update v_min and start_index j | ||||
| v_min = VFMINVV_FLOAT(v_min, vx0, gvl); | v_min = VFMINVV_FLOAT(v_min, vx0, gvl); | ||||
| j += gvl; | j += gvl; | ||||
| ix += inc_xv; | ix += inc_xv; | ||||
| } | } | ||||
| vx0 = VFMVVF_FLOAT(FLT_MAX, gvl); | |||||
| vx0 = VFREDMINVS_FLOAT(v_min, vx0, gvl); | |||||
| minf = vx0[0]; | |||||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||||
| minf = v_res[0]; | |||||
| mask0 = VMFLEVF_FLOAT(v_min, minf, gvl); | mask0 = VMFLEVF_FLOAT(v_min, minf, gvl); | ||||
| min_index = VMFIRSTM(mask0,gvl); | min_index = VMFIRSTM(mask0,gvl); | ||||
| min_index = v_min_index[min_index]; | min_index = v_min_index[min_index]; | ||||
| if(j < n){ | if(j < n){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| v_min_index = VMVVX_UINT(0, gvl); | v_min_index = VMVVX_UINT(0, gvl); | ||||
| vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | ||||
| //fabs(vector) | //fabs(vector) | ||||
| mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); | mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); | ||||
| vx0 = VFRSUBVF_MASK_FLOAT(vx0, vx0, 0, mask0, gvl); | |||||
| vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); | |||||
| /* | /* | ||||
| #if defined(DOUBLE) | #if defined(DOUBLE) | ||||
| asm volatile( | asm volatile( | ||||
| @@ -207,7 +216,7 @@ asm volatile( | |||||
| vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | ||||
| //fabs(vector) | //fabs(vector) | ||||
| mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); | mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); | ||||
| vx1 = VFRSUBVF_MASK_FLOAT(vx1, vx1, 0, mask1, gvl); | |||||
| vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl); | |||||
| /* | /* | ||||
| #if defined(DOUBLE) | #if defined(DOUBLE) | ||||
| asm volatile( | asm volatile( | ||||
| @@ -228,9 +237,8 @@ asm volatile( | |||||
| #endif | #endif | ||||
| */ | */ | ||||
| v_min = VFADDVV_FLOAT(vx0, vx1, gvl); | v_min = VFADDVV_FLOAT(vx0, vx1, gvl); | ||||
| vx0 = VFMVVF_FLOAT(FLT_MAX, gvl); | |||||
| vx0 = VFREDMINVS_FLOAT(v_min, vx0, gvl); | |||||
| FLOAT cur_minf = vx0[0]; | |||||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||||
| FLOAT cur_minf = v_res[0]; | |||||
| if(cur_minf < minf){ | if(cur_minf < minf){ | ||||
| //tail index | //tail index | ||||
| v_min_index = VIDV_UINT(gvl); | v_min_index = VIDV_UINT(gvl); | ||||
| @@ -29,23 +29,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include <math.h> | #include <math.h> | ||||
| #include <float.h> | #include <float.h> | ||||
| #if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
| #define RVV_EFLOAT RVV_E32 | |||||
| #define RVV_M RVV_M8 | |||||
| #define FLOAT_V_T float32xm8_t | |||||
| #define VLEV_FLOAT vlev_float32xm8 | |||||
| #define VLSEV_FLOAT vlsev_float32xm8 | |||||
| #define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 | |||||
| #define VFMVVF_FLOAT vfmvvf_float32xm8 | |||||
| #define VFMAXVV_FLOAT vfmaxvv_float32xm8 | |||||
| #define VSETVL(n) vsetvl_e32m8(n) | |||||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||||
| #define FLOAT_V_T vfloat32m8_t | |||||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||||
| #define VLEV_FLOAT vle_v_f32m8 | |||||
| #define VLSEV_FLOAT vlse_v_f32m8 | |||||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 | |||||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||||
| #define VFMAXVV_FLOAT vfmax_vv_f32m8 | |||||
| #else | #else | ||||
| #define RVV_EFLOAT RVV_E64 | |||||
| #define RVV_M RVV_M8 | |||||
| #define FLOAT_V_T float64xm8_t | |||||
| #define VLEV_FLOAT vlev_float64xm8 | |||||
| #define VLSEV_FLOAT vlsev_float64xm8 | |||||
| #define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 | |||||
| #define VFMVVF_FLOAT vfmvvf_float64xm8 | |||||
| #define VFMAXVV_FLOAT vfmaxvv_float64xm8 | |||||
| #define VSETVL(n) vsetvl_e64m8(n) | |||||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||||
| #define FLOAT_V_T vfloat64m8_t | |||||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||||
| #define VLEV_FLOAT vle_v_f64m8 | |||||
| #define VLSEV_FLOAT vlse_v_f64m8 | |||||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 | |||||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||||
| #define VFMAXVV_FLOAT vfmax_vv_f64m8 | |||||
| #endif | #endif | ||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | ||||
| @@ -55,9 +59,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| FLOAT maxf=-FLT_MAX; | FLOAT maxf=-FLT_MAX; | ||||
| unsigned int gvl = 0; | unsigned int gvl = 0; | ||||
| FLOAT_V_T v0, v1, v_max; | FLOAT_V_T v0, v1, v_max; | ||||
| FLOAT_V_T_M1 v_res, v_min; | |||||
| gvl = VSETVL_MAX; | |||||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||||
| v_min = VFMVVF_FLOAT_M1(-FLT_MAX, gvl); | |||||
| if(inc_x == 1){ | if(inc_x == 1){ | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| if(gvl <= n/2){ | if(gvl <= n/2){ | ||||
| v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); | v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); | ||||
| for(i=0,j=0; i<n/(gvl*2); i++){ | for(i=0,j=0; i<n/(gvl*2); i++){ | ||||
| @@ -68,21 +76,19 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| v_max = VFMAXVV_FLOAT(v_max, v1, gvl); | v_max = VFMAXVV_FLOAT(v_max, v1, gvl); | ||||
| j += gvl * 2; | j += gvl * 2; | ||||
| } | } | ||||
| v1 = VFMVVF_FLOAT(-FLT_MAX, gvl); | |||||
| v0 = VFREDMAXVS_FLOAT(v_max, v1, gvl); | |||||
| maxf = v0[0]; | |||||
| v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); | |||||
| maxf = v_res[0]; | |||||
| } | } | ||||
| for(;j<n;){ | for(;j<n;){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| v0 = VLEV_FLOAT(&x[j], gvl); | v0 = VLEV_FLOAT(&x[j], gvl); | ||||
| v1 = VFMVVF_FLOAT(-FLT_MAX, gvl); | |||||
| v0 = VFREDMAXVS_FLOAT(v0, v1, gvl); | |||||
| if(v0[0] > maxf) | |||||
| maxf = v0[0]; | |||||
| v_res = VFREDMAXVS_FLOAT(v_res, v0, v_min, gvl); | |||||
| if(v_res[0] > maxf) | |||||
| maxf = v_res[0]; | |||||
| j += gvl; | j += gvl; | ||||
| } | } | ||||
| }else{ | }else{ | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | BLASLONG stride_x = inc_x * sizeof(FLOAT); | ||||
| if(gvl <= n/2){ | if(gvl <= n/2){ | ||||
| v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); | v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); | ||||
| @@ -96,17 +102,15 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| j += gvl * 2; | j += gvl * 2; | ||||
| idx += inc_xv * 2; | idx += inc_xv * 2; | ||||
| } | } | ||||
| v1 = VFMVVF_FLOAT(-FLT_MAX, gvl); | |||||
| v0 = VFREDMAXVS_FLOAT(v_max, v1, gvl); | |||||
| maxf = v0[0]; | |||||
| v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); | |||||
| maxf = v_res[0]; | |||||
| } | } | ||||
| for(;j<n;){ | for(;j<n;){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | ||||
| v1 = VFMVVF_FLOAT(-FLT_MAX, gvl); | |||||
| v0 = VFREDMAXVS_FLOAT(v0, v1, gvl); | |||||
| if(v0[0] > maxf) | |||||
| maxf = v0[0]; | |||||
| v_res = VFREDMAXVS_FLOAT(v_res, v0, v_min, gvl); | |||||
| if(v_res[0] > maxf) | |||||
| maxf = v_res[0]; | |||||
| j += gvl; | j += gvl; | ||||
| } | } | ||||
| } | } | ||||
| @@ -29,23 +29,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include <math.h> | #include <math.h> | ||||
| #include <float.h> | #include <float.h> | ||||
| #if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
| #define RVV_EFLOAT RVV_E32 | |||||
| #define RVV_M RVV_M8 | |||||
| #define FLOAT_V_T float32xm8_t | |||||
| #define VLEV_FLOAT vlev_float32xm8 | |||||
| #define VLSEV_FLOAT vlsev_float32xm8 | |||||
| #define VFREDMINVS_FLOAT vfredminvs_float32xm8 | |||||
| #define VFMVVF_FLOAT vfmvvf_float32xm8 | |||||
| #define VFMINVV_FLOAT vfminvv_float32xm8 | |||||
| #define VSETVL(n) vsetvl_e32m8(n) | |||||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||||
| #define FLOAT_V_T vfloat32m8_t | |||||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||||
| #define VLEV_FLOAT vle_v_f32m8 | |||||
| #define VLSEV_FLOAT vlse_v_f32m8 | |||||
| #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 | |||||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||||
| #define VFMINVV_FLOAT vfmin_vv_f32m8 | |||||
| #else | #else | ||||
| #define RVV_EFLOAT RVV_E64 | |||||
| #define RVV_M RVV_M8 | |||||
| #define FLOAT_V_T float64xm8_t | |||||
| #define VLEV_FLOAT vlev_float64xm8 | |||||
| #define VLSEV_FLOAT vlsev_float64xm8 | |||||
| #define VFREDMINVS_FLOAT vfredminvs_float64xm8 | |||||
| #define VFMVVF_FLOAT vfmvvf_float64xm8 | |||||
| #define VFMINVV_FLOAT vfminvv_float64xm8 | |||||
| #define VSETVL(n) vsetvl_e64m8(n) | |||||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||||
| #define FLOAT_V_T vfloat64m8_t | |||||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||||
| #define VLEV_FLOAT vle_v_f64m8 | |||||
| #define VLSEV_FLOAT vlse_v_f64m8 | |||||
| #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 | |||||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||||
| #define VFMINVV_FLOAT vfmin_vv_f64m8 | |||||
| #endif | #endif | ||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | ||||
| @@ -55,9 +59,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| FLOAT minf=FLT_MAX; | FLOAT minf=FLT_MAX; | ||||
| unsigned int gvl = 0; | unsigned int gvl = 0; | ||||
| FLOAT_V_T v0, v1, v_min; | FLOAT_V_T v0, v1, v_min; | ||||
| FLOAT_V_T_M1 v_res, v_max; | |||||
| gvl = VSETVL_MAX; | |||||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||||
| v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); | |||||
| if(inc_x == 1){ | if(inc_x == 1){ | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| if(gvl <= n/2){ | if(gvl <= n/2){ | ||||
| v_min = VFMVVF_FLOAT(FLT_MAX, gvl); | v_min = VFMVVF_FLOAT(FLT_MAX, gvl); | ||||
| for(i=0,j=0; i<n/(gvl*2); i++){ | for(i=0,j=0; i<n/(gvl*2); i++){ | ||||
| @@ -68,21 +76,19 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| v_min = VFMINVV_FLOAT(v_min, v1, gvl); | v_min = VFMINVV_FLOAT(v_min, v1, gvl); | ||||
| j += gvl * 2; | j += gvl * 2; | ||||
| } | } | ||||
| v1 = VFMVVF_FLOAT(FLT_MAX, gvl); | |||||
| v0 = VFREDMINVS_FLOAT(v_min, v1, gvl); | |||||
| minf = v0[0]; | |||||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||||
| minf = v_res[0]; | |||||
| } | } | ||||
| for(;j<n;){ | for(;j<n;){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| v0 = VLEV_FLOAT(&x[j], gvl); | v0 = VLEV_FLOAT(&x[j], gvl); | ||||
| v1 = VFMVVF_FLOAT(FLT_MAX, gvl); | |||||
| v0 = VFREDMINVS_FLOAT(v0, v1, gvl); | |||||
| if(v0[0] < minf) | |||||
| minf = v0[0]; | |||||
| v_res = VFREDMINVS_FLOAT(v_res, v0, v_max, gvl); | |||||
| if(v_res[0] < minf) | |||||
| minf = v_res[0]; | |||||
| j += gvl; | j += gvl; | ||||
| } | } | ||||
| }else{ | }else{ | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | BLASLONG stride_x = inc_x * sizeof(FLOAT); | ||||
| if(gvl <= n/2){ | if(gvl <= n/2){ | ||||
| v_min = VFMVVF_FLOAT(FLT_MAX, gvl); | v_min = VFMVVF_FLOAT(FLT_MAX, gvl); | ||||
| @@ -96,17 +102,15 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| j += gvl * 2; | j += gvl * 2; | ||||
| idx += inc_xv * 2; | idx += inc_xv * 2; | ||||
| } | } | ||||
| v1 = VFMVVF_FLOAT(FLT_MAX, gvl); | |||||
| v0 = VFREDMINVS_FLOAT(v_min, v1, gvl); | |||||
| minf = v0[0]; | |||||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||||
| minf = v_res[0]; | |||||
| } | } | ||||
| for(;j<n;){ | for(;j<n;){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | ||||
| v1 = VFMVVF_FLOAT(FLT_MAX, gvl); | |||||
| v0 = VFREDMINVS_FLOAT(v0, v1, gvl); | |||||
| if(v0[0] < minf) | |||||
| minf = v0[0]; | |||||
| v_res = VFREDMINVS_FLOAT(v_res, v0, v_max, gvl); | |||||
| if(v_res[0] < minf) | |||||
| minf = v_res[0]; | |||||
| j += gvl; | j += gvl; | ||||
| } | } | ||||
| } | } | ||||
| @@ -27,41 +27,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
| #define RVV_EFLOAT RVV_E32 | |||||
| #define RVV_M RVV_M4 | |||||
| #define FLOAT_V_T float32xm4_t | |||||
| #define VLEV_FLOAT vlev_float32xm4 | |||||
| #define VLSEV_FLOAT vlsev_float32xm4 | |||||
| #define VFREDSUM_FLOAT vfredsumvs_float32xm4 | |||||
| #define VFMACCVV_FLOAT vfmaccvv_float32xm4 | |||||
| #define VFMVVF_FLOAT vfmvvf_float32xm4 | |||||
| #define VFDOTVV_FLOAT vfdotvv_float32xm4 | |||||
| #define VSETVL(n) vsetvl_e32m4(n) | |||||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||||
| #define FLOAT_V_T vfloat32m4_t | |||||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||||
| #define VLEV_FLOAT vle_v_f32m4 | |||||
| #define VLSEV_FLOAT vlse_v_f32m4 | |||||
| #define VFREDSUM_FLOAT vfredsum_vs_f32m4_f32m1 | |||||
| #define VFMACCVV_FLOAT vfmacc_vv_f32m4 | |||||
| #define VFMVVF_FLOAT vfmv_v_f_f32m4 | |||||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||||
| #define VFDOTVV_FLOAT vfdot_vv_f32m4 | |||||
| #define ABS fabsf | #define ABS fabsf | ||||
| #define MASK_T e32xm4_t | |||||
| #define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm4 | |||||
| #define VMFGTVF_FLOAT vmfgtvf_e32xm4_float32xm4 | |||||
| #define VMFIRSTM vmfirstm_e32xm4 | |||||
| #define VFDIVVF_FLOAT vfdivvf_float32xm4 | |||||
| #define VMFLTVF_FLOAT vmfltvf_e32xm4_float32xm4 | |||||
| #define VFREDMAXVS_FLOAT vfredmaxvs_float32xm4 | |||||
| #define MASK_T vbool8_t | |||||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m4_m | |||||
| #define VMFGTVF_FLOAT vmfgt_vf_f32m4_b8 | |||||
| #define VMFIRSTM vmfirst_m_b8 | |||||
| #define VFDIVVF_FLOAT vfdiv_vf_f32m4 | |||||
| #define VMFLTVF_FLOAT vmflt_vf_f32m4_b8 | |||||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f32m4_f32m1 | |||||
| #else | #else | ||||
| #define RVV_EFLOAT RVV_E64 | |||||
| #define RVV_M RVV_M4 | |||||
| #define FLOAT_V_T float64xm4_t | |||||
| #define VLEV_FLOAT vlev_float64xm4 | |||||
| #define VLSEV_FLOAT vlsev_float64xm4 | |||||
| #define VFREDSUM_FLOAT vfredsumvs_float64xm4 | |||||
| #define VFMACCVV_FLOAT vfmaccvv_float64xm4 | |||||
| #define VFMVVF_FLOAT vfmvvf_float64xm4 | |||||
| #define VFDOTVV_FLOAT vfdotvv_float64xm4 | |||||
| #define VSETVL(n) vsetvl_e64m4(n) | |||||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||||
| #define FLOAT_V_T vfloat64m4_t | |||||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||||
| #define VLEV_FLOAT vle_v_f64m4 | |||||
| #define VLSEV_FLOAT vlse_v_f64m4 | |||||
| #define VFREDSUM_FLOAT vfredsum_vs_f64m4_f64m1 | |||||
| #define VFMACCVV_FLOAT vfmacc_vv_f64m4 | |||||
| #define VFMVVF_FLOAT vfmv_v_f_f64m4 | |||||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||||
| #define VFDOTVV_FLOAT vfdot_vv_f64m4 | |||||
| #define ABS fabs | #define ABS fabs | ||||
| #define MASK_T e64xm4_t | |||||
| #define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm4 | |||||
| #define VMFGTVF_FLOAT vmfgtvf_e64xm4_float64xm4 | |||||
| #define VMFIRSTM vmfirstm_e64xm4 | |||||
| #define VFDIVVF_FLOAT vfdivvf_float64xm4 | |||||
| #define VMFLTVF_FLOAT vmfltvf_e64xm4_float64xm4 | |||||
| #define VFREDMAXVS_FLOAT vfredmaxvs_float64xm4 | |||||
| #define MASK_T vbool16_t | |||||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m4_m | |||||
| #define VMFGTVF_FLOAT vmfgt_vf_f64m4_b16 | |||||
| #define VMFIRSTM vmfirst_m_b16 | |||||
| #define VFDIVVF_FLOAT vfdiv_vf_f64m4 | |||||
| #define VMFLTVF_FLOAT vmflt_vf_f64m4_b16 | |||||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f64m4_f64m1 | |||||
| #endif | #endif | ||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | ||||
| @@ -73,18 +77,23 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| FLOAT_V_T vr, v0, v_zero; | FLOAT_V_T vr, v0, v_zero; | ||||
| unsigned int gvl = 0; | unsigned int gvl = 0; | ||||
| FLOAT_V_T_M1 v_res, v_z0; | |||||
| gvl = VSETVL_MAX; | |||||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||||
| v_z0 = VFMVVF_FLOAT_M1(0, gvl); | |||||
| FLOAT scale = 0.0, ssq = 0.0; | FLOAT scale = 0.0, ssq = 0.0; | ||||
| MASK_T mask; | MASK_T mask; | ||||
| BLASLONG index = 0; | BLASLONG index = 0; | ||||
| if(inc_x == 1){ | if(inc_x == 1){ | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| vr = VFMVVF_FLOAT(0, gvl); | vr = VFMVVF_FLOAT(0, gvl); | ||||
| v_zero = VFMVVF_FLOAT(0, gvl); | v_zero = VFMVVF_FLOAT(0, gvl); | ||||
| for(i=0,j=0; i<n/gvl; i++){ | for(i=0,j=0; i<n/gvl; i++){ | ||||
| v0 = VLEV_FLOAT(&x[j], gvl); | v0 = VLEV_FLOAT(&x[j], gvl); | ||||
| //fabs(vector) | //fabs(vector) | ||||
| mask = VMFLTVF_FLOAT(v0, 0, gvl); | mask = VMFLTVF_FLOAT(v0, 0, gvl); | ||||
| v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask, gvl); | |||||
| v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl); | |||||
| //if scale change | //if scale change | ||||
| mask = VMFGTVF_FLOAT(v0, scale, gvl); | mask = VMFGTVF_FLOAT(v0, scale, gvl); | ||||
| index = VMFIRSTM(mask, gvl); | index = VMFIRSTM(mask, gvl); | ||||
| @@ -95,15 +104,15 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| } | } | ||||
| }else{//found greater element | }else{//found greater element | ||||
| //ssq in vector vr: vr[0] | //ssq in vector vr: vr[0] | ||||
| vr = VFREDSUM_FLOAT(vr, v_zero, gvl); | |||||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
| //total ssq before current vector | //total ssq before current vector | ||||
| ssq += vr[0]; | |||||
| ssq += v_res[0]; | |||||
| //find max | //find max | ||||
| vr = VFREDMAXVS_FLOAT(v0, v_zero, gvl); | |||||
| v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl); | |||||
| //update ssq before max_index | //update ssq before max_index | ||||
| ssq = ssq * (scale/vr[0])*(scale/vr[0]); | |||||
| ssq = ssq * (scale/v_res[0])*(scale/v_res[0]); | |||||
| //update scale | //update scale | ||||
| scale = vr[0]; | |||||
| scale = v_res[0]; | |||||
| //ssq in vector vr | //ssq in vector vr | ||||
| v0 = VFDIVVF_FLOAT(v0, scale, gvl); | v0 = VFDIVVF_FLOAT(v0, scale, gvl); | ||||
| vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl); | vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl); | ||||
| @@ -111,17 +120,17 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| j += gvl; | j += gvl; | ||||
| } | } | ||||
| //ssq in vector vr: vr[0] | //ssq in vector vr: vr[0] | ||||
| vr = VFREDSUM_FLOAT(vr, v_zero, gvl); | |||||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
| //total ssq now | //total ssq now | ||||
| ssq += vr[0]; | |||||
| ssq += v_res[0]; | |||||
| //tail | //tail | ||||
| if(j < n){ | if(j < n){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| v0 = VLEV_FLOAT(&x[j], gvl); | v0 = VLEV_FLOAT(&x[j], gvl); | ||||
| //fabs(vector) | //fabs(vector) | ||||
| mask = VMFLTVF_FLOAT(v0, 0, gvl); | mask = VMFLTVF_FLOAT(v0, 0, gvl); | ||||
| v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask, gvl); | |||||
| v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl); | |||||
| //if scale change | //if scale change | ||||
| mask = VMFGTVF_FLOAT(v0, scale, gvl); | mask = VMFGTVF_FLOAT(v0, scale, gvl); | ||||
| index = VMFIRSTM(mask, gvl); | index = VMFIRSTM(mask, gvl); | ||||
| @@ -130,21 +139,21 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| v0 = VFDIVVF_FLOAT(v0, scale, gvl); | v0 = VFDIVVF_FLOAT(v0, scale, gvl); | ||||
| }else{//found greater element | }else{//found greater element | ||||
| //find max | //find max | ||||
| vr = VFREDMAXVS_FLOAT(v0, v_zero, gvl); | |||||
| v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl); | |||||
| //update ssq before max_index | //update ssq before max_index | ||||
| ssq = ssq * (scale/vr[0])*(scale/vr[0]); | |||||
| ssq = ssq * (scale/v_res[0])*(scale/v_res[0]); | |||||
| //update scale | //update scale | ||||
| scale = vr[0]; | |||||
| scale = v_res[0]; | |||||
| v0 = VFDIVVF_FLOAT(v0, scale, gvl); | v0 = VFDIVVF_FLOAT(v0, scale, gvl); | ||||
| } | } | ||||
| vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl); | vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl); | ||||
| //ssq in vector vr: vr[0] | //ssq in vector vr: vr[0] | ||||
| vr = VFREDSUM_FLOAT(vr, v_zero, gvl); | |||||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
| //total ssq now | //total ssq now | ||||
| ssq += vr[0]; | |||||
| ssq += v_res[0]; | |||||
| } | } | ||||
| }else{ | }else{ | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| vr = VFMVVF_FLOAT(0, gvl); | vr = VFMVVF_FLOAT(0, gvl); | ||||
| v_zero = VFMVVF_FLOAT(0, gvl); | v_zero = VFMVVF_FLOAT(0, gvl); | ||||
| unsigned int stride_x = inc_x * sizeof(FLOAT); | unsigned int stride_x = inc_x * sizeof(FLOAT); | ||||
| @@ -153,7 +162,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl); | v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl); | ||||
| //fabs(vector) | //fabs(vector) | ||||
| mask = VMFLTVF_FLOAT(v0, 0, gvl); | mask = VMFLTVF_FLOAT(v0, 0, gvl); | ||||
| v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask, gvl); | |||||
| v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl); | |||||
| //if scale change | //if scale change | ||||
| mask = VMFGTVF_FLOAT(v0, scale, gvl); | mask = VMFGTVF_FLOAT(v0, scale, gvl); | ||||
| index = VMFIRSTM(mask, gvl); | index = VMFIRSTM(mask, gvl); | ||||
| @@ -164,15 +173,15 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| } | } | ||||
| }else{//found greater element | }else{//found greater element | ||||
| //ssq in vector vr: vr[0] | //ssq in vector vr: vr[0] | ||||
| vr = VFREDSUM_FLOAT(vr, v_zero, gvl); | |||||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
| //total ssq before current vector | //total ssq before current vector | ||||
| ssq += vr[0]; | |||||
| ssq += v_res[0]; | |||||
| //find max | //find max | ||||
| vr = VFREDMAXVS_FLOAT(v0, v_zero, gvl); | |||||
| v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl); | |||||
| //update ssq before max_index | //update ssq before max_index | ||||
| ssq = ssq * (scale/vr[0])*(scale/vr[0]); | |||||
| ssq = ssq * (scale/v_res[0])*(scale/v_res[0]); | |||||
| //update scale | //update scale | ||||
| scale = vr[0]; | |||||
| scale = v_res[0]; | |||||
| //ssq in vector vr | //ssq in vector vr | ||||
| v0 = VFDIVVF_FLOAT(v0, scale, gvl); | v0 = VFDIVVF_FLOAT(v0, scale, gvl); | ||||
| vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl); | vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl); | ||||
| @@ -181,17 +190,17 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| idx += inc_v; | idx += inc_v; | ||||
| } | } | ||||
| //ssq in vector vr: vr[0] | //ssq in vector vr: vr[0] | ||||
| vr = VFREDSUM_FLOAT(vr, v_zero, gvl); | |||||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
| //total ssq now | //total ssq now | ||||
| ssq += vr[0]; | |||||
| ssq += v_res[0]; | |||||
| //tail | //tail | ||||
| if(j < n){ | if(j < n){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl); | v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl); | ||||
| //fabs(vector) | //fabs(vector) | ||||
| mask = VMFLTVF_FLOAT(v0, 0, gvl); | mask = VMFLTVF_FLOAT(v0, 0, gvl); | ||||
| v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask, gvl); | |||||
| v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl); | |||||
| //if scale change | //if scale change | ||||
| mask = VMFGTVF_FLOAT(v0, scale, gvl); | mask = VMFGTVF_FLOAT(v0, scale, gvl); | ||||
| index = VMFIRSTM(mask, gvl); | index = VMFIRSTM(mask, gvl); | ||||
| @@ -200,18 +209,18 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| v0 = VFDIVVF_FLOAT(v0, scale, gvl); | v0 = VFDIVVF_FLOAT(v0, scale, gvl); | ||||
| }else{//found greater element | }else{//found greater element | ||||
| //find max | //find max | ||||
| vr = VFREDMAXVS_FLOAT(v0, v_zero, gvl); | |||||
| v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl); | |||||
| //update ssq before max_index | //update ssq before max_index | ||||
| ssq = ssq * (scale/vr[0])*(scale/vr[0]); | |||||
| ssq = ssq * (scale/v_res[0])*(scale/v_res[0]); | |||||
| //update scale | //update scale | ||||
| scale = vr[0]; | scale = vr[0]; | ||||
| v0 = VFDIVVF_FLOAT(v0, scale, gvl); | v0 = VFDIVVF_FLOAT(v0, scale, gvl); | ||||
| } | } | ||||
| vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl); | vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl); | ||||
| //ssq in vector vr: vr[0] | //ssq in vector vr: vr[0] | ||||
| vr = VFREDSUM_FLOAT(vr, v_zero, gvl); | |||||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
| //total ssq now | //total ssq now | ||||
| ssq += vr[0]; | |||||
| ssq += v_res[0]; | |||||
| } | } | ||||
| } | } | ||||
| return(scale * sqrt(ssq)); | return(scale * sqrt(ssq)); | ||||
| @@ -27,26 +27,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
| #define RVV_EFLOAT RVV_E32 | |||||
| #define RVV_M RVV_M8 | |||||
| #define FLOAT_V_T float32xm8_t | |||||
| #define VLEV_FLOAT vlev_float32xm8 | |||||
| #define VLSEV_FLOAT vlsev_float32xm8 | |||||
| #define VFREDSUM_FLOAT vfredsumvs_float32xm8 | |||||
| #define VFMACCVV_FLOAT vfmaccvv_float32xm8 | |||||
| #define VFMVVF_FLOAT vfmvvf_float32xm8 | |||||
| #define VFDOTVV_FLOAT vfdotvv_float32xm8 | |||||
| #define VSETVL(n) vsetvl_e32m8(n) | |||||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||||
| #define FLOAT_V_T vfloat32m8_t | |||||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||||
| #define VLEV_FLOAT vle_v_f32m8 | |||||
| #define VLSEV_FLOAT vlse_v_f32m8 | |||||
| #define VFREDSUM_FLOAT vfredsum_vs_f32m8_f32m1 | |||||
| #define VFMACCVV_FLOAT vfmacc_vv_f32m8 | |||||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||||
| #define VFDOTVV_FLOAT vfdot_vv_f32m8 | |||||
| #define ABS fabsf | #define ABS fabsf | ||||
| #else | #else | ||||
| #define RVV_EFLOAT RVV_E64 | |||||
| #define RVV_M RVV_M8 | |||||
| #define FLOAT_V_T float64xm8_t | |||||
| #define VLEV_FLOAT vlev_float64xm8 | |||||
| #define VLSEV_FLOAT vlsev_float64xm8 | |||||
| #define VFREDSUM_FLOAT vfredsumvs_float64xm8 | |||||
| #define VFMACCVV_FLOAT vfmaccvv_float64xm8 | |||||
| #define VFMVVF_FLOAT vfmvvf_float64xm8 | |||||
| #define VFDOTVV_FLOAT vfdotvv_float64xm8 | |||||
| #define VSETVL(n) vsetvl_e64m8(n) | |||||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||||
| #define FLOAT_V_T vfloat64m8_t | |||||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||||
| #define VLEV_FLOAT vle_v_f64m8 | |||||
| #define VLSEV_FLOAT vlse_v_f64m8 | |||||
| #define VFREDSUM_FLOAT vfredsum_vs_f64m8_f64m1 | |||||
| #define VFMACCVV_FLOAT vfmacc_vv_f64m8 | |||||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||||
| #define VFDOTVV_FLOAT vfdot_vv_f64m8 | |||||
| #define ABS fabs | #define ABS fabs | ||||
| #endif | #endif | ||||
| @@ -60,8 +64,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| FLOAT_V_T vr, v0, v1; | FLOAT_V_T vr, v0, v1; | ||||
| unsigned int gvl = 0; | unsigned int gvl = 0; | ||||
| FLOAT_V_T_M1 v_res, v_z0; | |||||
| gvl = VSETVL_MAX; | |||||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||||
| v_z0 = VFMVVF_FLOAT_M1(0, gvl); | |||||
| if(inc_x == 1){ | if(inc_x == 1){ | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| if(gvl < n/2){ | if(gvl < n/2){ | ||||
| vr = VFMVVF_FLOAT(0, gvl); | vr = VFMVVF_FLOAT(0, gvl); | ||||
| for(i=0,j=0; i<n/(2*gvl); i++){ | for(i=0,j=0; i<n/(2*gvl); i++){ | ||||
| @@ -73,25 +82,24 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| vr = VFMACCVV_FLOAT(vr, v1, v1, gvl); | vr = VFMACCVV_FLOAT(vr, v1, v1, gvl); | ||||
| j += gvl; | j += gvl; | ||||
| } | } | ||||
| v0 = VFMVVF_FLOAT(0, gvl); | |||||
| v0 = VFREDSUM_FLOAT(vr, v0, gvl); | |||||
| len += v0[0]; | |||||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
| len += v_res[0]; | |||||
| } | } | ||||
| //tail | //tail | ||||
| for(;j < n;){ | for(;j < n;){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| v0 = VLEV_FLOAT(&x[j], gvl); | v0 = VLEV_FLOAT(&x[j], gvl); | ||||
| //v1 = 0 | //v1 = 0 | ||||
| v1 = VFMVVF_FLOAT(0, gvl); | |||||
| //v1 = VFMVVF_FLOAT(0, gvl); | |||||
| //vr = VFDOTVV_FLOAT(v0, v0, gvl); | //vr = VFDOTVV_FLOAT(v0, v0, gvl); | ||||
| vr = VFMACCVV_FLOAT(v1, v0, v0, gvl); | vr = VFMACCVV_FLOAT(v1, v0, v0, gvl); | ||||
| v0 = VFREDSUM_FLOAT(vr, v1, gvl); | |||||
| len += v0[0]; | |||||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
| len += v_res[0]; | |||||
| j += gvl; | j += gvl; | ||||
| } | } | ||||
| }else{ | }else{ | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| unsigned int stride_x = inc_x * sizeof(FLOAT); | unsigned int stride_x = inc_x * sizeof(FLOAT); | ||||
| if(gvl < n/2){ | if(gvl < n/2){ | ||||
| vr = VFMVVF_FLOAT(0, gvl); | vr = VFMVVF_FLOAT(0, gvl); | ||||
| @@ -104,20 +112,19 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| vr = VFMACCVV_FLOAT(vr, v1, v1, gvl); | vr = VFMACCVV_FLOAT(vr, v1, v1, gvl); | ||||
| j += gvl; | j += gvl; | ||||
| } | } | ||||
| v0 = VFMVVF_FLOAT(0, gvl); | |||||
| v0 = VFREDSUM_FLOAT(vr, v0, gvl); | |||||
| len += v0[0]; | |||||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
| len += v_res[0]; | |||||
| } | } | ||||
| //tail | //tail | ||||
| for(;j < n;){ | for(;j < n;){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | ||||
| //v1 = 0 | //v1 = 0 | ||||
| v1 = VFMVVF_FLOAT(0, gvl); | |||||
| //v1 = VFMVVF_FLOAT(0, gvl); | |||||
| //vr = VFDOTVV_FLOAT(v0, v0, gvl); | //vr = VFDOTVV_FLOAT(v0, v0, gvl); | ||||
| vr = VFMACCVV_FLOAT(v1, v0, v0, gvl); | vr = VFMACCVV_FLOAT(v1, v0, v0, gvl); | ||||
| v0 = VFREDSUM_FLOAT(vr, v1, gvl); | |||||
| len += v0[0]; | |||||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
| len += v_res[0]; | |||||
| j += gvl; | j += gvl; | ||||
| } | } | ||||
| @@ -28,27 +28,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
| #define RVV_EFLOAT RVV_E32 | |||||
| #define RVV_M RVV_M4 | |||||
| #define FLOAT_V_T float32xm4_t | |||||
| #define VLEV_FLOAT vlev_float32xm4 | |||||
| #define VLSEV_FLOAT vlsev_float32xm4 | |||||
| #define VSEV_FLOAT vsev_float32xm4 | |||||
| #define VSSEV_FLOAT vssev_float32xm4 | |||||
| #define VFMACCVF_FLOAT vfmaccvf_float32xm4 | |||||
| #define VFMULVF_FLOAT vfmulvf_float32xm4 | |||||
| #define VFMSACVF_FLOAT vfmsacvf_float32xm4 | |||||
| #define VSETVL(n) vsetvl_e32m4(n) | |||||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||||
| #define FLOAT_V_T vfloat32m4_t | |||||
| #define VLEV_FLOAT vle_v_f32m4 | |||||
| #define VLSEV_FLOAT vlse_v_f32m4 | |||||
| #define VSEV_FLOAT vse_v_f32m4 | |||||
| #define VSSEV_FLOAT vsse_v_f32m4 | |||||
| #define VFMACCVF_FLOAT vfmacc_vf_f32m4 | |||||
| #define VFMULVF_FLOAT vfmul_vf_f32m4 | |||||
| #define VFMSACVF_FLOAT vfmsac_vf_f32m4 | |||||
| #else | #else | ||||
| #define RVV_EFLOAT RVV_E64 | |||||
| #define RVV_M RVV_M4 | |||||
| #define FLOAT_V_T float64xm4_t | |||||
| #define VLEV_FLOAT vlev_float64xm4 | |||||
| #define VLSEV_FLOAT vlsev_float64xm4 | |||||
| #define VSEV_FLOAT vsev_float64xm4 | |||||
| #define VSSEV_FLOAT vssev_float64xm4 | |||||
| #define VFMACCVF_FLOAT vfmaccvf_float64xm4 | |||||
| #define VFMULVF_FLOAT vfmulvf_float64xm4 | |||||
| #define VFMSACVF_FLOAT vfmsacvf_float64xm4 | |||||
| #define VSETVL(n) vsetvl_e64m4(n) | |||||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||||
| #define FLOAT_V_T vfloat64m4_t | |||||
| #define VLEV_FLOAT vle_v_f64m4 | |||||
| #define VLSEV_FLOAT vlse_v_f64m4 | |||||
| #define VSEV_FLOAT vse_v_f64m4 | |||||
| #define VSSEV_FLOAT vsse_v_f64m4 | |||||
| #define VFMACCVF_FLOAT vfmacc_vf_f64m4 | |||||
| #define VFMULVF_FLOAT vfmul_vf_f64m4 | |||||
| #define VFMSACVF_FLOAT vfmsac_vf_f64m4 | |||||
| #endif | #endif | ||||
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) | int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) | ||||
| @@ -61,7 +61,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT | |||||
| FLOAT_V_T v0, v1, vx, vy; | FLOAT_V_T v0, v1, vx, vy; | ||||
| if(inc_x == 1 && inc_y == 1){ | if(inc_x == 1 && inc_y == 1){ | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| for(i=0,j=0; i<n/gvl; i++){ | for(i=0,j=0; i<n/gvl; i++){ | ||||
| vx = VLEV_FLOAT(&x[j], gvl); | vx = VLEV_FLOAT(&x[j], gvl); | ||||
| vy = VLEV_FLOAT(&y[j], gvl); | vy = VLEV_FLOAT(&y[j], gvl); | ||||
| @@ -77,7 +77,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT | |||||
| j += gvl; | j += gvl; | ||||
| } | } | ||||
| if(j<n){ | if(j<n){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| vx = VLEV_FLOAT(&x[j], gvl); | vx = VLEV_FLOAT(&x[j], gvl); | ||||
| vy = VLEV_FLOAT(&y[j], gvl); | vy = VLEV_FLOAT(&y[j], gvl); | ||||
| @@ -90,7 +90,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT | |||||
| VSEV_FLOAT(&y[j], v1, gvl); | VSEV_FLOAT(&y[j], v1, gvl); | ||||
| } | } | ||||
| }else if(inc_y == 1){ | }else if(inc_y == 1){ | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | BLASLONG stride_x = inc_x * sizeof(FLOAT); | ||||
| BLASLONG inc_xv = inc_x * gvl; | BLASLONG inc_xv = inc_x * gvl; | ||||
| for(i=0,j=0; i<n/gvl; i++){ | for(i=0,j=0; i<n/gvl; i++){ | ||||
| @@ -109,7 +109,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT | |||||
| ix += inc_xv; | ix += inc_xv; | ||||
| } | } | ||||
| if(j<n){ | if(j<n){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| vx = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | vx = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | ||||
| vy = VLEV_FLOAT(&y[j], gvl); | vy = VLEV_FLOAT(&y[j], gvl); | ||||
| @@ -122,7 +122,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT | |||||
| VSEV_FLOAT(&y[j], v1, gvl); | VSEV_FLOAT(&y[j], v1, gvl); | ||||
| } | } | ||||
| }else if(inc_x == 1){ | }else if(inc_x == 1){ | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | BLASLONG stride_y = inc_y * sizeof(FLOAT); | ||||
| BLASLONG inc_yv = inc_y * gvl; | BLASLONG inc_yv = inc_y * gvl; | ||||
| for(i=0,j=0; i<n/gvl; i++){ | for(i=0,j=0; i<n/gvl; i++){ | ||||
| @@ -141,7 +141,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT | |||||
| iy += inc_yv; | iy += inc_yv; | ||||
| } | } | ||||
| if(j<n){ | if(j<n){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| vx = VLEV_FLOAT(&x[j], gvl); | vx = VLEV_FLOAT(&x[j], gvl); | ||||
| vy = VLSEV_FLOAT(&y[j*inc_y],stride_y, gvl); | vy = VLSEV_FLOAT(&y[j*inc_y],stride_y, gvl); | ||||
| @@ -154,7 +154,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT | |||||
| VSSEV_FLOAT(&y[j*inc_y], stride_y, v1, gvl); | VSSEV_FLOAT(&y[j*inc_y], stride_y, v1, gvl); | ||||
| } | } | ||||
| }else{ | }else{ | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | BLASLONG stride_x = inc_x * sizeof(FLOAT); | ||||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | BLASLONG stride_y = inc_y * sizeof(FLOAT); | ||||
| BLASLONG inc_xv = inc_x * gvl; | BLASLONG inc_xv = inc_x * gvl; | ||||
| @@ -176,7 +176,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT | |||||
| iy += inc_yv; | iy += inc_yv; | ||||
| } | } | ||||
| if(j<n){ | if(j<n){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| vx = VLSEV_FLOAT(&x[j*inc_x],stride_x, gvl); | vx = VLSEV_FLOAT(&x[j*inc_x],stride_x, gvl); | ||||
| vy = VLSEV_FLOAT(&y[j*inc_y],stride_y, gvl); | vy = VLSEV_FLOAT(&y[j*inc_y],stride_y, gvl); | ||||
| @@ -27,25 +27,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
| #define RVV_EFLOAT RVV_E32 | |||||
| #define RVV_M RVV_M8 | |||||
| #define FLOAT_V_T float32xm8_t | |||||
| #define VLEV_FLOAT vlev_float32xm8 | |||||
| #define VLSEV_FLOAT vlsev_float32xm8 | |||||
| #define VSEV_FLOAT vsev_float32xm8 | |||||
| #define VSSEV_FLOAT vssev_float32xm8 | |||||
| #define VFMULVF_FLOAT vfmulvf_float32xm8 | |||||
| #define VFMVVF_FLOAT vfmvvf_float32xm8 | |||||
| #define VSETVL(n) vsetvl_e32m8(n) | |||||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||||
| #define FLOAT_V_T vfloat32m8_t | |||||
| #define VLEV_FLOAT vle_v_f32m8 | |||||
| #define VLSEV_FLOAT vlse_v_f32m8 | |||||
| #define VSEV_FLOAT vse_v_f32m8 | |||||
| #define VSSEV_FLOAT vsse_v_f32m8 | |||||
| #define VFMULVF_FLOAT vfmul_vf_f32m8 | |||||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||||
| #else | #else | ||||
| #define RVV_EFLOAT RVV_E64 | |||||
| #define RVV_M RVV_M8 | |||||
| #define FLOAT_V_T float64xm8_t | |||||
| #define VLEV_FLOAT vlev_float64xm8 | |||||
| #define VLSEV_FLOAT vlsev_float64xm8 | |||||
| #define VSEV_FLOAT vsev_float64xm8 | |||||
| #define VSSEV_FLOAT vssev_float64xm8 | |||||
| #define VFMULVF_FLOAT vfmulvf_float64xm8 | |||||
| #define VFMVVF_FLOAT vfmvvf_float64xm8 | |||||
| #define VSETVL(n) vsetvl_e64m8(n) | |||||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||||
| #define FLOAT_V_T vfloat64m8_t | |||||
| #define VLEV_FLOAT vle_v_f64m8 | |||||
| #define VLSEV_FLOAT vlse_v_f64m8 | |||||
| #define VSEV_FLOAT vse_v_f64m8 | |||||
| #define VSSEV_FLOAT vsse_v_f64m8 | |||||
| #define VFMULVF_FLOAT vfmul_vf_f64m8 | |||||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||||
| #endif | #endif | ||||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | ||||
| @@ -61,7 +61,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||||
| if(da == 0.0){ | if(da == 0.0){ | ||||
| memset(&x[0], 0, n * sizeof(FLOAT)); | memset(&x[0], 0, n * sizeof(FLOAT)); | ||||
| }else{ | }else{ | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| if(gvl <= n / 2){ | if(gvl <= n / 2){ | ||||
| for(i = 0, j = 0; i < n/(2*gvl); i++, j+=2*gvl){ | for(i = 0, j = 0; i < n/(2*gvl); i++, j+=2*gvl){ | ||||
| v0 = VLEV_FLOAT(&x[j], gvl); | v0 = VLEV_FLOAT(&x[j], gvl); | ||||
| @@ -75,7 +75,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||||
| } | } | ||||
| //tail | //tail | ||||
| for(; j <n; ){ | for(; j <n; ){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| v0 = VLEV_FLOAT(&x[j], gvl); | v0 = VLEV_FLOAT(&x[j], gvl); | ||||
| v0 = VFMULVF_FLOAT(v0, da, gvl); | v0 = VFMULVF_FLOAT(v0, da, gvl); | ||||
| VSEV_FLOAT(&x[j], v0, gvl); | VSEV_FLOAT(&x[j], v0, gvl); | ||||
| @@ -84,7 +84,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||||
| } | } | ||||
| }else{ | }else{ | ||||
| if(da == 0.0){ | if(da == 0.0){ | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| if(gvl <= n / 2){ | if(gvl <= n / 2){ | ||||
| v0 = VFMVVF_FLOAT(0, gvl); | v0 = VFMVVF_FLOAT(0, gvl); | ||||
| for(i = 0, j = 0; i < n/(2*gvl); i++, j+=2*gvl){ | for(i = 0, j = 0; i < n/(2*gvl); i++, j+=2*gvl){ | ||||
| @@ -94,13 +94,13 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||||
| } | } | ||||
| //tail | //tail | ||||
| for(; j <n; ){ | for(; j <n; ){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| v0 = VFMVVF_FLOAT(0, gvl); | v0 = VFMVVF_FLOAT(0, gvl); | ||||
| VSEV_FLOAT(&x[j], v0, gvl); | VSEV_FLOAT(&x[j], v0, gvl); | ||||
| j += gvl; | j += gvl; | ||||
| } | } | ||||
| }else{ | }else{ | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | BLASLONG stride_x = inc_x * sizeof(FLOAT); | ||||
| BLASLONG ix = 0; | BLASLONG ix = 0; | ||||
| if(gvl < n / 2){ | if(gvl < n / 2){ | ||||
| @@ -118,7 +118,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||||
| } | } | ||||
| //tail | //tail | ||||
| for(; j <n; ){ | for(; j <n; ){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | ||||
| v0 = VFMULVF_FLOAT(v0, da, gvl); | v0 = VFMULVF_FLOAT(v0, da, gvl); | ||||
| VSSEV_FLOAT(&x[ix], stride_x, v0, gvl); | VSSEV_FLOAT(&x[ix], stride_x, v0, gvl); | ||||
| @@ -28,21 +28,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #include <stdio.h> | #include <stdio.h> | ||||
| #if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
| #define RVV_EFLOAT RVV_E32 | |||||
| #define RVV_M RVV_M8 | |||||
| #define FLOAT_V_T float32xm8_t | |||||
| #define VLEV_FLOAT vlev_float32xm8 | |||||
| #define VLSEV_FLOAT vlsev_float32xm8 | |||||
| #define VSEV_FLOAT vsev_float32xm8 | |||||
| #define VSSEV_FLOAT vssev_float32xm8 | |||||
| #define VSETVL(n) vsetvl_e32m8(n) | |||||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||||
| #define FLOAT_V_T vfloat32m8_t | |||||
| #define VLEV_FLOAT vle_v_f32m8 | |||||
| #define VLSEV_FLOAT vlse_v_f32m8 | |||||
| #define VSEV_FLOAT vse_v_f32m8 | |||||
| #define VSSEV_FLOAT vsse_v_f32m8 | |||||
| #else | #else | ||||
| #define RVV_EFLOAT RVV_E64 | |||||
| #define RVV_M RVV_M8 | |||||
| #define FLOAT_V_T float64xm8_t | |||||
| #define VLEV_FLOAT vlev_float64xm8 | |||||
| #define VLSEV_FLOAT vlsev_float64xm8 | |||||
| #define VSEV_FLOAT vsev_float64xm8 | |||||
| #define VSSEV_FLOAT vssev_float64xm8 | |||||
| #define VSETVL(n) vsetvl_e64m8(n) | |||||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||||
| #define FLOAT_V_T vfloat64m8_t | |||||
| #define VLEV_FLOAT vle_v_f64m8 | |||||
| #define VLSEV_FLOAT vlse_v_f64m8 | |||||
| #define VSEV_FLOAT vse_v_f64m8 | |||||
| #define VSSEV_FLOAT vsse_v_f64m8 | |||||
| #endif | #endif | ||||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | ||||
| @@ -55,7 +55,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, | |||||
| if (n < 0) return(0); | if (n < 0) return(0); | ||||
| if(inc_x == 1 && inc_y == 1){ | if(inc_x == 1 && inc_y == 1){ | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| if(gvl <= n/2){ | if(gvl <= n/2){ | ||||
| for(i=0,j=0; i<n/(2*gvl); i++){ | for(i=0,j=0; i<n/(2*gvl); i++){ | ||||
| vx0 = VLEV_FLOAT(&x[j], gvl); | vx0 = VLEV_FLOAT(&x[j], gvl); | ||||
| @@ -71,7 +71,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, | |||||
| } | } | ||||
| } | } | ||||
| for(;j<n;){ | for(;j<n;){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| vx0 = VLEV_FLOAT(&x[j], gvl); | vx0 = VLEV_FLOAT(&x[j], gvl); | ||||
| vy0 = VLEV_FLOAT(&y[j], gvl); | vy0 = VLEV_FLOAT(&y[j], gvl); | ||||
| VSEV_FLOAT(&x[j], vy0, gvl); | VSEV_FLOAT(&x[j], vy0, gvl); | ||||
| @@ -79,7 +79,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, | |||||
| j+=gvl; | j+=gvl; | ||||
| } | } | ||||
| }else if (inc_y == 1){ | }else if (inc_y == 1){ | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| stride_x = inc_x * sizeof(FLOAT); | stride_x = inc_x * sizeof(FLOAT); | ||||
| if(gvl <= n/2){ | if(gvl <= n/2){ | ||||
| BLASLONG inc_xv = inc_x * gvl; | BLASLONG inc_xv = inc_x * gvl; | ||||
| @@ -98,7 +98,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, | |||||
| } | } | ||||
| } | } | ||||
| for(;j<n;){ | for(;j<n;){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | ||||
| vy0 = VLEV_FLOAT(&y[j], gvl); | vy0 = VLEV_FLOAT(&y[j], gvl); | ||||
| VSSEV_FLOAT(&x[ix], stride_x, vy0, gvl); | VSSEV_FLOAT(&x[ix], stride_x, vy0, gvl); | ||||
| @@ -107,7 +107,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, | |||||
| ix += inc_x * gvl; | ix += inc_x * gvl; | ||||
| } | } | ||||
| }else if(inc_x == 1){ | }else if(inc_x == 1){ | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| stride_y = inc_y * sizeof(FLOAT); | stride_y = inc_y * sizeof(FLOAT); | ||||
| if(gvl <= n/2){ | if(gvl <= n/2){ | ||||
| BLASLONG inc_yv = inc_y * gvl; | BLASLONG inc_yv = inc_y * gvl; | ||||
| @@ -126,7 +126,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, | |||||
| } | } | ||||
| } | } | ||||
| for(;j<n;){ | for(;j<n;){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| vx0 = VLEV_FLOAT(&x[j], gvl); | vx0 = VLEV_FLOAT(&x[j], gvl); | ||||
| vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); | vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); | ||||
| VSEV_FLOAT(&x[j], vy0, gvl); | VSEV_FLOAT(&x[j], vy0, gvl); | ||||
| @@ -135,7 +135,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, | |||||
| iy += inc_y * gvl; | iy += inc_y * gvl; | ||||
| } | } | ||||
| }else{ | }else{ | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| stride_x = inc_x * sizeof(FLOAT); | stride_x = inc_x * sizeof(FLOAT); | ||||
| stride_y = inc_y * sizeof(FLOAT); | stride_y = inc_y * sizeof(FLOAT); | ||||
| if(gvl <= n/2){ | if(gvl <= n/2){ | ||||
| @@ -157,7 +157,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, | |||||
| } | } | ||||
| } | } | ||||
| for(;j<n;){ | for(;j<n;){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | ||||
| vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); | vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); | ||||
| VSSEV_FLOAT(&x[ix], stride_x, vy0, gvl); | VSSEV_FLOAT(&x[ix], stride_x, vy0, gvl); | ||||
| @@ -27,31 +27,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
| #define RVV_EFLOAT RVV_E32 | |||||
| #define RVV_M RVV_M4 | |||||
| #define FLOAT_V_T float32xm4_t | |||||
| #define VLEV_FLOAT vlev_float32xm4 | |||||
| #define VLSEV_FLOAT vlsev_float32xm4 | |||||
| #define VSEV_FLOAT vsev_float32xm4 | |||||
| #define VSSEV_FLOAT vssev_float32xm4 | |||||
| #define VFREDSUM_FLOAT vfredsumvs_float32xm4 | |||||
| #define VFMACCVV_FLOAT vfmaccvv_float32xm4 | |||||
| #define VFMACCVF_FLOAT vfmaccvf_float32xm4 | |||||
| #define VFMVVF_FLOAT vfmvvf_float32xm4 | |||||
| #define VFMULVV_FLOAT vfmulvv_float32xm4 | |||||
| #define VSETVL(n) vsetvl_e32m4(n) | |||||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||||
| #define FLOAT_V_T vfloat32m4_t | |||||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||||
| #define VLEV_FLOAT vle_v_f32m4 | |||||
| #define VLSEV_FLOAT vlse_v_f32m4 | |||||
| #define VSEV_FLOAT vse_v_f32m4 | |||||
| #define VSSEV_FLOAT vsse_v_f32m4 | |||||
| #define VFREDSUM_FLOAT vfredsum_vs_f32m4_f32m1 | |||||
| #define VFMACCVV_FLOAT vfmacc_vv_f32m4 | |||||
| #define VFMACCVF_FLOAT vfmacc_vf_f32m4 | |||||
| #define VFMVVF_FLOAT vfmv_v_f_f32m4 | |||||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||||
| #define VFMULVV_FLOAT vfmul_vv_f32m4 | |||||
| #else | #else | ||||
| #define RVV_EFLOAT RVV_E64 | |||||
| #define RVV_M RVV_M4 | |||||
| #define FLOAT_V_T float64xm4_t | |||||
| #define VLEV_FLOAT vlev_float64xm4 | |||||
| #define VLSEV_FLOAT vlsev_float64xm4 | |||||
| #define VSEV_FLOAT vsev_float64xm4 | |||||
| #define VSSEV_FLOAT vssev_float64xm4 | |||||
| #define VFREDSUM_FLOAT vfredsumvs_float64xm4 | |||||
| #define VFMACCVV_FLOAT vfmaccvv_float64xm4 | |||||
| #define VFMACCVF_FLOAT vfmaccvf_float64xm4 | |||||
| #define VFMVVF_FLOAT vfmvvf_float64xm4 | |||||
| #define VFMULVV_FLOAT vfmulvv_float64xm4 | |||||
| #define VSETVL(n) vsetvl_e64m4(n) | |||||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||||
| #define FLOAT_V_T vfloat64m4_t | |||||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||||
| #define VLEV_FLOAT vle_v_f64m4 | |||||
| #define VLSEV_FLOAT vlse_v_f64m4 | |||||
| #define VSEV_FLOAT vse_v_f64m4 | |||||
| #define VSSEV_FLOAT vsse_v_f64m4 | |||||
| #define VFREDSUM_FLOAT vfredsum_vs_f64m4_f64m1 | |||||
| #define VFMACCVV_FLOAT vfmacc_vv_f64m4 | |||||
| #define VFMACCVF_FLOAT vfmacc_vf_f64m4 | |||||
| #define VFMVVF_FLOAT vfmv_v_f_f64m4 | |||||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||||
| #define VFMULVV_FLOAT vfmul_vv_f64m4 | |||||
| #endif | #endif | ||||
| int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | ||||
| @@ -63,6 +67,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||||
| FLOAT temp2; | FLOAT temp2; | ||||
| FLOAT *a_ptr = a; | FLOAT *a_ptr = a; | ||||
| unsigned int gvl = 0; | unsigned int gvl = 0; | ||||
| FLOAT_V_T_M1 v_res, v_z0; | |||||
| gvl = VSETVL_MAX; | |||||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||||
| v_z0 = VFMVVF_FLOAT_M1(0, gvl); | |||||
| FLOAT_V_T va, vx, vy, vr; | FLOAT_V_T va, vx, vy, vr; | ||||
| BLASLONG stride_x, stride_y, inc_xv, inc_yv, len; | BLASLONG stride_x, stride_y, inc_xv, inc_yv, len; | ||||
| @@ -76,7 +84,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||||
| i = j + 1; | i = j + 1; | ||||
| len = m - i; | len = m - i; | ||||
| if(len > 0){ | if(len > 0){ | ||||
| gvl = vsetvli(len, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(len); | |||||
| vr = VFMVVF_FLOAT(0, gvl); | vr = VFMVVF_FLOAT(0, gvl); | ||||
| for(k = 0; k < len / gvl; k++){ | for(k = 0; k < len / gvl; k++){ | ||||
| va = VLEV_FLOAT(&a_ptr[i], gvl); | va = VLEV_FLOAT(&a_ptr[i], gvl); | ||||
| @@ -89,11 +97,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||||
| i += gvl; | i += gvl; | ||||
| } | } | ||||
| va = VFMVVF_FLOAT(0, gvl); | |||||
| va = VFREDSUM_FLOAT(vr, va, gvl); | |||||
| temp2 = va[0]; | |||||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
| temp2 = v_res[0]; | |||||
| if(i < m){ | if(i < m){ | ||||
| gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(m-i); | |||||
| vy = VLEV_FLOAT(&y[i], gvl); | vy = VLEV_FLOAT(&y[i], gvl); | ||||
| va = VLEV_FLOAT(&a_ptr[i], gvl); | va = VLEV_FLOAT(&a_ptr[i], gvl); | ||||
| vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); | vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); | ||||
| @@ -101,9 +108,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||||
| vx = VLEV_FLOAT(&x[i], gvl); | vx = VLEV_FLOAT(&x[i], gvl); | ||||
| vr = VFMULVV_FLOAT(vx, va, gvl); | vr = VFMULVV_FLOAT(vx, va, gvl); | ||||
| va = VFMVVF_FLOAT(0, gvl); | |||||
| va = VFREDSUM_FLOAT(vr, va, gvl); | |||||
| temp2 += va[0]; | |||||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
| temp2 += v_res[0]; | |||||
| } | } | ||||
| } | } | ||||
| y[j] += alpha * temp2; | y[j] += alpha * temp2; | ||||
| @@ -121,7 +127,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||||
| i = j + 1; | i = j + 1; | ||||
| len = m - i; | len = m - i; | ||||
| if(len > 0){ | if(len > 0){ | ||||
| gvl = vsetvli(len, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(len); | |||||
| inc_yv = inc_y * gvl; | inc_yv = inc_y * gvl; | ||||
| vr = VFMVVF_FLOAT(0, gvl); | vr = VFMVVF_FLOAT(0, gvl); | ||||
| for(k = 0; k < len / gvl; k++){ | for(k = 0; k < len / gvl; k++){ | ||||
| @@ -136,11 +142,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||||
| i += gvl; | i += gvl; | ||||
| iy += inc_yv; | iy += inc_yv; | ||||
| } | } | ||||
| va = VFMVVF_FLOAT(0, gvl); | |||||
| va = VFREDSUM_FLOAT(vr, va, gvl); | |||||
| temp2 = va[0]; | |||||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
| temp2 = v_res[0]; | |||||
| if(i < m){ | if(i < m){ | ||||
| gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(m-i); | |||||
| vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); | vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); | ||||
| va = VLEV_FLOAT(&a_ptr[i], gvl); | va = VLEV_FLOAT(&a_ptr[i], gvl); | ||||
| vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); | vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); | ||||
| @@ -148,9 +153,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||||
| vx = VLEV_FLOAT(&x[i], gvl); | vx = VLEV_FLOAT(&x[i], gvl); | ||||
| vr = VFMULVV_FLOAT(vx, va, gvl); | vr = VFMULVV_FLOAT(vx, va, gvl); | ||||
| va = VFMVVF_FLOAT(0, gvl); | |||||
| va = VFREDSUM_FLOAT(vr, va, gvl); | |||||
| temp2 += va[0]; | |||||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
| temp2 += v_res[0]; | |||||
| } | } | ||||
| } | } | ||||
| y[jy] += alpha * temp2; | y[jy] += alpha * temp2; | ||||
| @@ -169,7 +173,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||||
| i = j + 1; | i = j + 1; | ||||
| len = m - i; | len = m - i; | ||||
| if(len > 0){ | if(len > 0){ | ||||
| gvl = vsetvli(len, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(len); | |||||
| vr = VFMVVF_FLOAT(0, gvl); | vr = VFMVVF_FLOAT(0, gvl); | ||||
| inc_xv = inc_x * gvl; | inc_xv = inc_x * gvl; | ||||
| for(k = 0; k < len / gvl; k++){ | for(k = 0; k < len / gvl; k++){ | ||||
| @@ -184,11 +188,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||||
| i += gvl; | i += gvl; | ||||
| ix += inc_xv; | ix += inc_xv; | ||||
| } | } | ||||
| va = VFMVVF_FLOAT(0, gvl); | |||||
| va = VFREDSUM_FLOAT(vr, va, gvl); | |||||
| temp2 = va[0]; | |||||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
| temp2 = v_res[0]; | |||||
| if(i < m){ | if(i < m){ | ||||
| gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(m-i); | |||||
| vy = VLEV_FLOAT(&y[i], gvl); | vy = VLEV_FLOAT(&y[i], gvl); | ||||
| va = VLEV_FLOAT(&a_ptr[i], gvl); | va = VLEV_FLOAT(&a_ptr[i], gvl); | ||||
| vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); | vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); | ||||
| @@ -196,9 +199,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||||
| vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); | vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); | ||||
| vr = VFMULVV_FLOAT(vx, va, gvl); | vr = VFMULVV_FLOAT(vx, va, gvl); | ||||
| va = VFMVVF_FLOAT(0, gvl); | |||||
| va = VFREDSUM_FLOAT(vr, va, gvl); | |||||
| temp2 += va[0]; | |||||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
| temp2 += v_res[0]; | |||||
| } | } | ||||
| } | } | ||||
| y[j] += alpha * temp2; | y[j] += alpha * temp2; | ||||
| @@ -220,7 +222,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||||
| i = j + 1; | i = j + 1; | ||||
| len = m - i; | len = m - i; | ||||
| if(len > 0){ | if(len > 0){ | ||||
| gvl = vsetvli(len, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(len); | |||||
| inc_xv = inc_x * gvl; | inc_xv = inc_x * gvl; | ||||
| inc_yv = inc_y * gvl; | inc_yv = inc_y * gvl; | ||||
| vr = VFMVVF_FLOAT(0, gvl); | vr = VFMVVF_FLOAT(0, gvl); | ||||
| @@ -237,11 +239,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||||
| ix += inc_xv; | ix += inc_xv; | ||||
| iy += inc_yv; | iy += inc_yv; | ||||
| } | } | ||||
| va = VFMVVF_FLOAT(0, gvl); | |||||
| va = VFREDSUM_FLOAT(vr, va, gvl); | |||||
| temp2 = va[0]; | |||||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
| temp2 = v_res[0]; | |||||
| if(i < m){ | if(i < m){ | ||||
| gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(m-i); | |||||
| vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); | vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); | ||||
| va = VLEV_FLOAT(&a_ptr[i], gvl); | va = VLEV_FLOAT(&a_ptr[i], gvl); | ||||
| vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); | vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); | ||||
| @@ -249,9 +250,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||||
| vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); | vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); | ||||
| vr = VFMULVV_FLOAT(vx, va, gvl); | vr = VFMULVV_FLOAT(vx, va, gvl); | ||||
| va = VFMVVF_FLOAT(0, gvl); | |||||
| va = VFREDSUM_FLOAT(vr, va, gvl); | |||||
| temp2 += va[0]; | |||||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
| temp2 += v_res[0]; | |||||
| } | } | ||||
| } | } | ||||
| y[jy] += alpha * temp2; | y[jy] += alpha * temp2; | ||||
| @@ -27,33 +27,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
| #define RVV_EFLOAT RVV_E32 | |||||
| #define RVV_M RVV_M4 | |||||
| #define FLOAT_V_T float32xm4_t | |||||
| #define VLEV_FLOAT vlev_float32xm4 | |||||
| #define VLSEV_FLOAT vlsev_float32xm4 | |||||
| #define VSEV_FLOAT vsev_float32xm4 | |||||
| #define VSSEV_FLOAT vssev_float32xm4 | |||||
| #define VFREDSUM_FLOAT vfredsumvs_float32xm4 | |||||
| #define VFMACCVV_FLOAT vfmaccvv_float32xm4 | |||||
| #define VFMACCVF_FLOAT vfmaccvf_float32xm4 | |||||
| #define VFMVVF_FLOAT vfmvvf_float32xm4 | |||||
| #define VFDOTVV_FLOAT vfdotvv_float32xm4 | |||||
| #define VFMULVV_FLOAT vfmulvv_float32xm4 | |||||
| #define VSETVL(n) vsetvl_e32m4(n) | |||||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||||
| #define FLOAT_V_T vfloat32m4_t | |||||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||||
| #define VLEV_FLOAT vle_v_f32m4 | |||||
| #define VLSEV_FLOAT vlse_v_f32m4 | |||||
| #define VSEV_FLOAT vse_v_f32m4 | |||||
| #define VSSEV_FLOAT vsse_v_f32m4 | |||||
| #define VFREDSUM_FLOAT vfredsum_vs_f32m4_f32m1 | |||||
| #define VFMACCVV_FLOAT vfmacc_vv_f32m4 | |||||
| #define VFMACCVF_FLOAT vfmacc_vf_f32m4 | |||||
| #define VFMVVF_FLOAT vfmv_v_f_f32m4 | |||||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||||
| #define VFDOTVV_FLOAT vfdot_vv_f32m4 | |||||
| #define VFMULVV_FLOAT vfmul_vv_f32m4 | |||||
| #else | #else | ||||
| #define RVV_EFLOAT RVV_E64 | |||||
| #define RVV_M RVV_M4 | |||||
| #define FLOAT_V_T float64xm4_t | |||||
| #define VLEV_FLOAT vlev_float64xm4 | |||||
| #define VLSEV_FLOAT vlsev_float64xm4 | |||||
| #define VSEV_FLOAT vsev_float64xm4 | |||||
| #define VSSEV_FLOAT vssev_float64xm4 | |||||
| #define VFREDSUM_FLOAT vfredsumvs_float64xm4 | |||||
| #define VFMACCVV_FLOAT vfmaccvv_float64xm4 | |||||
| #define VFMACCVF_FLOAT vfmaccvf_float64xm4 | |||||
| #define VFMVVF_FLOAT vfmvvf_float64xm4 | |||||
| #define VFDOTVV_FLOAT vfdotvv_float64xm4 | |||||
| #define VFMULVV_FLOAT vfmulvv_float64xm4 | |||||
| #define VSETVL(n) vsetvl_e64m4(n) | |||||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||||
| #define FLOAT_V_T vfloat64m4_t | |||||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||||
| #define VLEV_FLOAT vle_v_f64m4 | |||||
| #define VLSEV_FLOAT vlse_v_f64m4 | |||||
| #define VSEV_FLOAT vse_v_f64m4 | |||||
| #define VSSEV_FLOAT vsse_v_f64m4 | |||||
| #define VFREDSUM_FLOAT vfredsum_vs_f64m4_f64m1 | |||||
| #define VFMACCVV_FLOAT vfmacc_vv_f64m4 | |||||
| #define VFMACCVF_FLOAT vfmacc_vf_f64m4 | |||||
| #define VFMVVF_FLOAT vfmv_v_f_f64m4 | |||||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||||
| #define VFDOTVV_FLOAT vfdot_vv_f64m4 | |||||
| #define VFMULVV_FLOAT vfmul_vv_f64m4 | |||||
| #endif | #endif | ||||
| int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | ||||
| @@ -65,6 +69,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||||
| FLOAT temp2; | FLOAT temp2; | ||||
| FLOAT *a_ptr = a; | FLOAT *a_ptr = a; | ||||
| unsigned int gvl = 0; | unsigned int gvl = 0; | ||||
| FLOAT_V_T_M1 v_res, v_z0; | |||||
| gvl = VSETVL_MAX; | |||||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||||
| v_z0 = VFMVVF_FLOAT_M1(0, gvl); | |||||
| FLOAT_V_T va, vx, vy, vr; | FLOAT_V_T va, vx, vy, vr; | ||||
| BLASLONG stride_x, stride_y, inc_xv, inc_yv; | BLASLONG stride_x, stride_y, inc_xv, inc_yv; | ||||
| @@ -78,7 +86,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||||
| temp2 = 0.0; | temp2 = 0.0; | ||||
| if(j > 0){ | if(j > 0){ | ||||
| i = 0; | i = 0; | ||||
| gvl = vsetvli(j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(j); | |||||
| vr = VFMVVF_FLOAT(0, gvl); | vr = VFMVVF_FLOAT(0, gvl); | ||||
| for(k = 0; k < j / gvl; k++){ | for(k = 0; k < j / gvl; k++){ | ||||
| vy = VLEV_FLOAT(&y[i], gvl); | vy = VLEV_FLOAT(&y[i], gvl); | ||||
| @@ -91,11 +99,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||||
| i += gvl; | i += gvl; | ||||
| } | } | ||||
| va = VFMVVF_FLOAT(0, gvl); | |||||
| va = VFREDSUM_FLOAT(vr, va, gvl); | |||||
| temp2 = va[0]; | |||||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
| temp2 = v_res[0]; | |||||
| if(i < j){ | if(i < j){ | ||||
| gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(j-i); | |||||
| vy = VLEV_FLOAT(&y[i], gvl); | vy = VLEV_FLOAT(&y[i], gvl); | ||||
| va = VLEV_FLOAT(&a_ptr[i], gvl); | va = VLEV_FLOAT(&a_ptr[i], gvl); | ||||
| vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); | vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); | ||||
| @@ -103,9 +110,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||||
| vx = VLEV_FLOAT(&x[i], gvl); | vx = VLEV_FLOAT(&x[i], gvl); | ||||
| vr = VFMULVV_FLOAT(vx, va, gvl); | vr = VFMULVV_FLOAT(vx, va, gvl); | ||||
| va = VFMVVF_FLOAT(0, gvl); | |||||
| va = VFREDSUM_FLOAT(vr, va, gvl); | |||||
| temp2 += va[0]; | |||||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
| temp2 += v_res[0]; | |||||
| } | } | ||||
| } | } | ||||
| y[j] += temp1 * a_ptr[j] + alpha * temp2; | y[j] += temp1 * a_ptr[j] + alpha * temp2; | ||||
| @@ -122,7 +128,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||||
| if(j > 0){ | if(j > 0){ | ||||
| iy = 0; | iy = 0; | ||||
| i = 0; | i = 0; | ||||
| gvl = vsetvli(j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(j); | |||||
| inc_yv = inc_y * gvl; | inc_yv = inc_y * gvl; | ||||
| vr = VFMVVF_FLOAT(0, gvl); | vr = VFMVVF_FLOAT(0, gvl); | ||||
| for(k = 0; k < j / gvl; k++){ | for(k = 0; k < j / gvl; k++){ | ||||
| @@ -137,11 +143,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||||
| i += gvl; | i += gvl; | ||||
| iy += inc_yv; | iy += inc_yv; | ||||
| } | } | ||||
| va = VFMVVF_FLOAT(0, gvl); | |||||
| va = VFREDSUM_FLOAT(vr, va, gvl); | |||||
| temp2 = va[0]; | |||||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
| temp2 = v_res[0]; | |||||
| if(i < j){ | if(i < j){ | ||||
| gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(j-i); | |||||
| vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); | vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); | ||||
| va = VLEV_FLOAT(&a_ptr[i], gvl); | va = VLEV_FLOAT(&a_ptr[i], gvl); | ||||
| vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); | vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); | ||||
| @@ -149,9 +154,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||||
| vx = VLEV_FLOAT(&x[i], gvl); | vx = VLEV_FLOAT(&x[i], gvl); | ||||
| vr = VFMULVV_FLOAT(vx, va, gvl); | vr = VFMULVV_FLOAT(vx, va, gvl); | ||||
| va = VFMVVF_FLOAT(0, gvl); | |||||
| va = VFREDSUM_FLOAT(vr, va, gvl); | |||||
| temp2 += va[0]; | |||||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
| temp2 += v_res[0]; | |||||
| } | } | ||||
| } | } | ||||
| y[jy] += temp1 * a_ptr[j] + alpha * temp2; | y[jy] += temp1 * a_ptr[j] + alpha * temp2; | ||||
| @@ -169,7 +173,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||||
| if(j > 0){ | if(j > 0){ | ||||
| ix = 0; | ix = 0; | ||||
| i = 0; | i = 0; | ||||
| gvl = vsetvli(j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(j); | |||||
| inc_xv = inc_x * gvl; | inc_xv = inc_x * gvl; | ||||
| vr = VFMVVF_FLOAT(0, gvl); | vr = VFMVVF_FLOAT(0, gvl); | ||||
| for(k = 0; k < j / gvl; k++){ | for(k = 0; k < j / gvl; k++){ | ||||
| @@ -184,11 +188,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||||
| i += gvl; | i += gvl; | ||||
| ix += inc_xv; | ix += inc_xv; | ||||
| } | } | ||||
| va = VFMVVF_FLOAT(0, gvl); | |||||
| va = VFREDSUM_FLOAT(vr, va, gvl); | |||||
| temp2 = va[0]; | |||||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
| temp2 = v_res[0]; | |||||
| if(i < j){ | if(i < j){ | ||||
| gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(j-i); | |||||
| vy = VLEV_FLOAT(&y[i], gvl); | vy = VLEV_FLOAT(&y[i], gvl); | ||||
| va = VLEV_FLOAT(&a_ptr[i], gvl); | va = VLEV_FLOAT(&a_ptr[i], gvl); | ||||
| vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); | vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); | ||||
| @@ -196,9 +199,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||||
| vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); | vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); | ||||
| vr = VFMULVV_FLOAT(vx, va, gvl); | vr = VFMULVV_FLOAT(vx, va, gvl); | ||||
| va = VFMVVF_FLOAT(0, gvl); | |||||
| va = VFREDSUM_FLOAT(vr, va, gvl); | |||||
| temp2 += va[0]; | |||||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
| temp2 += v_res[0]; | |||||
| } | } | ||||
| } | } | ||||
| y[j] += temp1 * a_ptr[j] + alpha * temp2; | y[j] += temp1 * a_ptr[j] + alpha * temp2; | ||||
| @@ -219,7 +221,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||||
| ix = 0; | ix = 0; | ||||
| iy = 0; | iy = 0; | ||||
| i = 0; | i = 0; | ||||
| gvl = vsetvli(j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(j); | |||||
| inc_xv = inc_x * gvl; | inc_xv = inc_x * gvl; | ||||
| inc_yv = inc_y * gvl; | inc_yv = inc_y * gvl; | ||||
| vr = VFMVVF_FLOAT(0, gvl); | vr = VFMVVF_FLOAT(0, gvl); | ||||
| @@ -236,11 +238,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||||
| ix += inc_xv; | ix += inc_xv; | ||||
| iy += inc_yv; | iy += inc_yv; | ||||
| } | } | ||||
| va = VFMVVF_FLOAT(0, gvl); | |||||
| va = VFREDSUM_FLOAT(vr, va, gvl); | |||||
| temp2 = va[0]; | |||||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
| temp2 = v_res[0]; | |||||
| if(i < j){ | if(i < j){ | ||||
| gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(j-i); | |||||
| vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); | vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); | ||||
| va = VLEV_FLOAT(&a_ptr[i], gvl); | va = VLEV_FLOAT(&a_ptr[i], gvl); | ||||
| vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); | vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); | ||||
| @@ -248,9 +249,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||||
| vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); | vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); | ||||
| vr = VFMULVV_FLOAT(vx, va, gvl); | vr = VFMULVV_FLOAT(vx, va, gvl); | ||||
| va = VFMVVF_FLOAT(0, gvl); | |||||
| va = VFREDSUM_FLOAT(vr, va, gvl); | |||||
| temp2 += va[0]; | |||||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
| temp2 += v_res[0]; | |||||
| } | } | ||||
| } | } | ||||
| y[jy] += temp1 * a_ptr[j] + alpha * temp2; | y[jy] += temp1 * a_ptr[j] + alpha * temp2; | ||||
| @@ -29,29 +29,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include <math.h> | #include <math.h> | ||||
| #if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
| #define RVV_EFLOAT RVV_E32 | |||||
| #define RVV_M RVV_M8 | |||||
| #define FLOAT_V_T float32xm8_t | |||||
| #define VLSEV_FLOAT vlsev_float32xm8 | |||||
| #define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 | |||||
| #define MASK_T e32xm8_t | |||||
| #define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 | |||||
| #define VFMVVF_FLOAT vfmvvf_float32xm8 | |||||
| #define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 | |||||
| #define VFMAXVV_FLOAT vfmaxvv_float32xm8 | |||||
| #define VFADDVV_FLOAT vfaddvv_float32xm8 | |||||
| #define VSETVL(n) vsetvl_e32m8(n) | |||||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||||
| #define FLOAT_V_T vfloat32m8_t | |||||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||||
| #define VLSEV_FLOAT vlse_v_f32m8 | |||||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 | |||||
| #define MASK_T vbool4_t | |||||
| #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 | |||||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m | |||||
| #define VFMAXVV_FLOAT vfmax_vv_f32m8 | |||||
| #define VFADDVV_FLOAT vfadd_vv_f32m8 | |||||
| #else | #else | ||||
| #define RVV_EFLOAT RVV_E64 | |||||
| #define RVV_M RVV_M8 | |||||
| #define FLOAT_V_T float64xm8_t | |||||
| #define VLSEV_FLOAT vlsev_float64xm8 | |||||
| #define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 | |||||
| #define MASK_T e64xm8_t | |||||
| #define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 | |||||
| #define VFMVVF_FLOAT vfmvvf_float64xm8 | |||||
| #define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 | |||||
| #define VFMAXVV_FLOAT vfmaxvv_float64xm8 | |||||
| #define VFADDVV_FLOAT vfaddvv_float64xm8 | |||||
| #define VSETVL(n) vsetvl_e64m8(n) | |||||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||||
| #define FLOAT_V_T vfloat64m8_t | |||||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||||
| #define VLSEV_FLOAT vlse_v_f64m8 | |||||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 | |||||
| #define MASK_T vbool8_t | |||||
| #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 | |||||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m | |||||
| #define VFMAXVV_FLOAT vfmax_vv_f64m8 | |||||
| #define VFADDVV_FLOAT vfadd_vv_f64m8 | |||||
| #endif | #endif | ||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | ||||
| @@ -62,19 +66,23 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| if (n <= 0 || inc_x <= 0) return(maxf); | if (n <= 0 || inc_x <= 0) return(maxf); | ||||
| unsigned int gvl = 0; | unsigned int gvl = 0; | ||||
| FLOAT_V_T v0, v1, v_max; | FLOAT_V_T v0, v1, v_max; | ||||
| FLOAT_V_T_M1 v_res, v_z0; | |||||
| gvl = VSETVL_MAX; | |||||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||||
| v_z0 = VFMVVF_FLOAT_M1(0, gvl); | |||||
| MASK_T mask0, mask1; | MASK_T mask0, mask1; | ||||
| BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; | BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| v_max = VFMVVF_FLOAT(0, gvl); | v_max = VFMVVF_FLOAT(0, gvl); | ||||
| BLASLONG inc_xv = inc_x * gvl * 2; | BLASLONG inc_xv = inc_x * gvl * 2; | ||||
| for(; i<n/gvl; i++){ | for(; i<n/gvl; i++){ | ||||
| v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | ||||
| v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | ||||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | ||||
| v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask0, gvl); | |||||
| v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl); | |||||
| mask1 = VMFLTVF_FLOAT(v1, 0, gvl); | mask1 = VMFLTVF_FLOAT(v1, 0, gvl); | ||||
| v1 = VFRSUBVF_MASK_FLOAT(v1, v1, 0, mask1, gvl); | |||||
| v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl); | |||||
| v0 = VFADDVV_FLOAT(v0, v1, gvl); | v0 = VFADDVV_FLOAT(v0, v1, gvl); | ||||
| v_max = VFMAXVV_FLOAT(v_max, v0, gvl); | v_max = VFMAXVV_FLOAT(v_max, v0, gvl); | ||||
| @@ -82,23 +90,21 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| j += gvl; | j += gvl; | ||||
| ix += inc_xv; | ix += inc_xv; | ||||
| } | } | ||||
| v0 = VFMVVF_FLOAT(0, gvl); | |||||
| v_max = VFREDMAXVS_FLOAT(v_max, v0, gvl); | |||||
| maxf = v_max[0]; | |||||
| v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); | |||||
| maxf = v_res[0]; | |||||
| if(j<n){ | if(j<n){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | ||||
| v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | ||||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | ||||
| v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask0, gvl); | |||||
| v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl); | |||||
| mask1 = VMFLTVF_FLOAT(v1, 0, gvl); | mask1 = VMFLTVF_FLOAT(v1, 0, gvl); | ||||
| v1 = VFRSUBVF_MASK_FLOAT(v1, v1, 0, mask1, gvl); | |||||
| v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl); | |||||
| v1 = VFADDVV_FLOAT(v0, v1, gvl); | v1 = VFADDVV_FLOAT(v0, v1, gvl); | ||||
| v0 = VFMVVF_FLOAT(0, gvl); | |||||
| v_max = VFREDMAXVS_FLOAT(v1, v0, gvl); | |||||
| if(v_max[0] > maxf) | |||||
| maxf = v_max[0]; | |||||
| v_res = VFREDMAXVS_FLOAT(v_res, v1, v_z0, gvl); | |||||
| if(v_res[0] > maxf) | |||||
| maxf = v_res[0]; | |||||
| } | } | ||||
| return(maxf); | return(maxf); | ||||
| } | } | ||||
| @@ -30,29 +30,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include <float.h> | #include <float.h> | ||||
| #if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
| #define RVV_EFLOAT RVV_E32 | |||||
| #define RVV_M RVV_M8 | |||||
| #define FLOAT_V_T float32xm8_t | |||||
| #define VLSEV_FLOAT vlsev_float32xm8 | |||||
| #define VFREDMINVS_FLOAT vfredminvs_float32xm8 | |||||
| #define MASK_T e32xm8_t | |||||
| #define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 | |||||
| #define VFMVVF_FLOAT vfmvvf_float32xm8 | |||||
| #define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 | |||||
| #define VFMINVV_FLOAT vfminvv_float32xm8 | |||||
| #define VFADDVV_FLOAT vfaddvv_float32xm8 | |||||
| #define VSETVL(n) vsetvl_e32m8(n) | |||||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||||
| #define FLOAT_V_T vfloat32m8_t | |||||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||||
| #define VLSEV_FLOAT vlse_v_f32m8 | |||||
| #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 | |||||
| #define MASK_T vbool4_t | |||||
| #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 | |||||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m | |||||
| #define VFMINVV_FLOAT vfmin_vv_f32m8 | |||||
| #define VFADDVV_FLOAT vfadd_vv_f32m8 | |||||
| #else | #else | ||||
| #define RVV_EFLOAT RVV_E64 | |||||
| #define RVV_M RVV_M8 | |||||
| #define FLOAT_V_T float64xm8_t | |||||
| #define VLSEV_FLOAT vlsev_float64xm8 | |||||
| #define VFREDMINVS_FLOAT vfredminvs_float64xm8 | |||||
| #define MASK_T e64xm8_t | |||||
| #define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 | |||||
| #define VFMVVF_FLOAT vfmvvf_float64xm8 | |||||
| #define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 | |||||
| #define VFMINVV_FLOAT vfminvv_float64xm8 | |||||
| #define VFADDVV_FLOAT vfaddvv_float64xm8 | |||||
| #define VSETVL(n) vsetvl_e64m8(n) | |||||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||||
| #define FLOAT_V_T vfloat64m8_t | |||||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||||
| #define VLSEV_FLOAT vlse_v_f64m8 | |||||
| #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 | |||||
| #define MASK_T vbool8_t | |||||
| #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 | |||||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m | |||||
| #define VFMINVV_FLOAT vfmin_vv_f64m8 | |||||
| #define VFADDVV_FLOAT vfadd_vv_f64m8 | |||||
| #endif | #endif | ||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | ||||
| @@ -63,18 +67,23 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| FLOAT minf=FLT_MAX; | FLOAT minf=FLT_MAX; | ||||
| unsigned int gvl = 0; | unsigned int gvl = 0; | ||||
| FLOAT_V_T v0, v1, v_min; | FLOAT_V_T v0, v1, v_min; | ||||
| FLOAT_V_T_M1 v_res, v_max; | |||||
| gvl = VSETVL_MAX; | |||||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||||
| v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); | |||||
| MASK_T mask0, mask1; | MASK_T mask0, mask1; | ||||
| BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; | BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| v_min = VFMVVF_FLOAT(FLT_MAX, gvl); | v_min = VFMVVF_FLOAT(FLT_MAX, gvl); | ||||
| BLASLONG inc_xv = inc_x * gvl * 2; | BLASLONG inc_xv = inc_x * gvl * 2; | ||||
| for(; i<n/gvl; i++){ | for(; i<n/gvl; i++){ | ||||
| v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | ||||
| v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | ||||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | ||||
| v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask0, gvl); | |||||
| v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl); | |||||
| mask1 = VMFLTVF_FLOAT(v1, 0, gvl); | mask1 = VMFLTVF_FLOAT(v1, 0, gvl); | ||||
| v1 = VFRSUBVF_MASK_FLOAT(v1, v1, 0, mask1, gvl); | |||||
| v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl); | |||||
| v0 = VFADDVV_FLOAT(v0, v1, gvl); | v0 = VFADDVV_FLOAT(v0, v1, gvl); | ||||
| v_min = VFMINVV_FLOAT(v_min, v0, gvl); | v_min = VFMINVV_FLOAT(v_min, v0, gvl); | ||||
| @@ -82,23 +91,21 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| j += gvl; | j += gvl; | ||||
| ix += inc_xv; | ix += inc_xv; | ||||
| } | } | ||||
| v0 = VFMVVF_FLOAT(FLT_MAX, gvl); | |||||
| v_min = VFREDMINVS_FLOAT(v_min, v0, gvl); | |||||
| minf = v_min[0]; | |||||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||||
| minf = v_res[0]; | |||||
| if(j<n){ | if(j<n){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | ||||
| v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | ||||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | ||||
| v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask0, gvl); | |||||
| v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl); | |||||
| mask1 = VMFLTVF_FLOAT(v1, 0, gvl); | mask1 = VMFLTVF_FLOAT(v1, 0, gvl); | ||||
| v1 = VFRSUBVF_MASK_FLOAT(v1, v1, 0, mask1, gvl); | |||||
| v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl); | |||||
| v1 = VFADDVV_FLOAT(v0, v1, gvl); | v1 = VFADDVV_FLOAT(v0, v1, gvl); | ||||
| v0 = VFMVVF_FLOAT(FLT_MAX, gvl); | |||||
| v_min = VFREDMINVS_FLOAT(v1, v0, gvl); | |||||
| if(v_min[0] < minf) | |||||
| minf = v_min[0]; | |||||
| v_res = VFREDMINVS_FLOAT(v_res, v1, v_max, gvl); | |||||
| if(v_res[0] < minf) | |||||
| minf = v_res[0]; | |||||
| } | } | ||||
| return(minf); | return(minf); | ||||
| } | } | ||||
| @@ -29,29 +29,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include <math.h> | #include <math.h> | ||||
| #if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
| #define RVV_EFLOAT RVV_E32 | |||||
| #define RVV_M RVV_M8 | |||||
| #define FLOAT_V_T float32xm8_t | |||||
| #define VLEV_FLOAT vlev_float32xm8 | |||||
| #define VLSEV_FLOAT vlsev_float32xm8 | |||||
| #define VFREDSUMVS_FLOAT vfredsumvs_float32xm8 | |||||
| #define MASK_T e32xm8_t | |||||
| #define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 | |||||
| #define VFMVVF_FLOAT vfmvvf_float32xm8 | |||||
| #define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 | |||||
| #define VFADDVV_FLOAT vfaddvv_float32xm8 | |||||
| #define VSETVL(n) vsetvl_e32m8(n) | |||||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||||
| #define FLOAT_V_T vfloat32m8_t | |||||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||||
| #define VLEV_FLOAT vle_v_f32m8 | |||||
| #define VLSEV_FLOAT vlse_v_f32m8 | |||||
| #define VFREDSUMVS_FLOAT vfredsum_vs_f32m8_f32m1 | |||||
| #define MASK_T vbool4_t | |||||
| #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 | |||||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m | |||||
| #define VFADDVV_FLOAT vfadd_vv_f32m8 | |||||
| #else | #else | ||||
| #define RVV_EFLOAT RVV_E64 | |||||
| #define RVV_M RVV_M8 | |||||
| #define FLOAT_V_T float64xm8_t | |||||
| #define VLEV_FLOAT vlev_float64xm8 | |||||
| #define VLSEV_FLOAT vlsev_float64xm8 | |||||
| #define VFREDSUMVS_FLOAT vfredsumvs_float64xm8 | |||||
| #define MASK_T e64xm8_t | |||||
| #define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 | |||||
| #define VFMVVF_FLOAT vfmvvf_float64xm8 | |||||
| #define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 | |||||
| #define VFADDVV_FLOAT vfaddvv_float64xm8 | |||||
| #define VSETVL(n) vsetvl_e64m8(n) | |||||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||||
| #define FLOAT_V_T vfloat64m8_t | |||||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||||
| #define VLEV_FLOAT vle_v_f64m8 | |||||
| #define VLSEV_FLOAT vlse_v_f64m8 | |||||
| #define VFREDSUMVS_FLOAT vfredsum_vs_f64m8_f64m1 | |||||
| #define MASK_T vbool8_t | |||||
| #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 | |||||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m | |||||
| #define VFADDVV_FLOAT vfadd_vv_f64m8 | |||||
| #endif | #endif | ||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | ||||
| { | { | ||||
| @@ -61,40 +65,44 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| if (n <= 0 || inc_x <= 0) return(asumf); | if (n <= 0 || inc_x <= 0) return(asumf); | ||||
| unsigned int gvl = 0; | unsigned int gvl = 0; | ||||
| FLOAT_V_T v0, v1, v_zero,v_sum; | FLOAT_V_T v0, v1, v_zero,v_sum; | ||||
| FLOAT_V_T_M1 v_res, v_z0; | |||||
| gvl = VSETVL_MAX; | |||||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||||
| v_z0 = VFMVVF_FLOAT_M1(0, gvl); | |||||
| MASK_T mask0, mask1; | MASK_T mask0, mask1; | ||||
| if(inc_x == 1){ | if(inc_x == 1){ | ||||
| BLASLONG n2 = n * 2; | BLASLONG n2 = n * 2; | ||||
| gvl = vsetvli(n2, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n2); | |||||
| v_zero = VFMVVF_FLOAT(0, gvl); | v_zero = VFMVVF_FLOAT(0, gvl); | ||||
| if(gvl <= n2/2){ | if(gvl <= n2/2){ | ||||
| v_sum = VFMVVF_FLOAT(0, gvl); | v_sum = VFMVVF_FLOAT(0, gvl); | ||||
| for(i=0,j=0; i<n2/(gvl*2); i++){ | for(i=0,j=0; i<n2/(gvl*2); i++){ | ||||
| v0 = VLEV_FLOAT(&x[j], gvl); | v0 = VLEV_FLOAT(&x[j], gvl); | ||||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | ||||
| v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask0, gvl); | |||||
| v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl); | |||||
| v_sum = VFADDVV_FLOAT(v_sum, v0, gvl); | v_sum = VFADDVV_FLOAT(v_sum, v0, gvl); | ||||
| v1 = VLEV_FLOAT(&x[j+gvl], gvl); | v1 = VLEV_FLOAT(&x[j+gvl], gvl); | ||||
| mask1 = VMFLTVF_FLOAT(v1, 0, gvl); | mask1 = VMFLTVF_FLOAT(v1, 0, gvl); | ||||
| v1 = VFRSUBVF_MASK_FLOAT(v1, v1, 0, mask1, gvl); | |||||
| v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl); | |||||
| v_sum = VFADDVV_FLOAT(v_sum, v1, gvl); | v_sum = VFADDVV_FLOAT(v_sum, v1, gvl); | ||||
| j += gvl * 2; | j += gvl * 2; | ||||
| } | } | ||||
| v0 = VFREDSUMVS_FLOAT(v_sum, v_zero, gvl); | |||||
| asumf += v0[0]; | |||||
| v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, gvl); | |||||
| asumf += v_res[0]; | |||||
| } | } | ||||
| for(;j<n2;){ | for(;j<n2;){ | ||||
| gvl = vsetvli(n2-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n2-j); | |||||
| v0 = VLEV_FLOAT(&x[j], gvl); | v0 = VLEV_FLOAT(&x[j], gvl); | ||||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | ||||
| v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask0, gvl); | |||||
| v0 = VFREDSUMVS_FLOAT(v0, v_zero, gvl); | |||||
| asumf += v0[0]; | |||||
| v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl); | |||||
| v_res = VFREDSUMVS_FLOAT(v_res, v0, v_z0, gvl); | |||||
| asumf += v_res[0]; | |||||
| j += gvl; | j += gvl; | ||||
| } | } | ||||
| }else{ | }else{ | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| unsigned int stride_x = inc_x * sizeof(FLOAT) * 2; | unsigned int stride_x = inc_x * sizeof(FLOAT) * 2; | ||||
| v_zero = VFMVVF_FLOAT(0, gvl); | v_zero = VFMVVF_FLOAT(0, gvl); | ||||
| @@ -103,31 +111,31 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| for(i=0,j=0; i<n/gvl; i++){ | for(i=0,j=0; i<n/gvl; i++){ | ||||
| v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | ||||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | ||||
| v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask0, gvl); | |||||
| v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl); | |||||
| v_sum = VFADDVV_FLOAT(v_sum, v0, gvl); | v_sum = VFADDVV_FLOAT(v_sum, v0, gvl); | ||||
| v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | ||||
| mask1 = VMFLTVF_FLOAT(v1, 0, gvl); | mask1 = VMFLTVF_FLOAT(v1, 0, gvl); | ||||
| v1 = VFRSUBVF_MASK_FLOAT(v1, v1, 0, mask1, gvl); | |||||
| v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl); | |||||
| v_sum = VFADDVV_FLOAT(v_sum, v1, gvl); | v_sum = VFADDVV_FLOAT(v_sum, v1, gvl); | ||||
| j += gvl; | j += gvl; | ||||
| ix += inc_xv; | ix += inc_xv; | ||||
| } | } | ||||
| v0 = VFREDSUMVS_FLOAT(v_sum, v_zero, gvl); | |||||
| asumf += v0[0]; | |||||
| v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, gvl); | |||||
| asumf += v_res[0]; | |||||
| if(j<n){ | if(j<n){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | ||||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | ||||
| v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask0, gvl); | |||||
| v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl); | |||||
| v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | ||||
| mask1 = VMFLTVF_FLOAT(v1, 0, gvl); | mask1 = VMFLTVF_FLOAT(v1, 0, gvl); | ||||
| v1 = VFRSUBVF_MASK_FLOAT(v1, v1, 0, mask1, gvl); | |||||
| v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl); | |||||
| v_sum = VFADDVV_FLOAT(v0, v1, gvl); | v_sum = VFADDVV_FLOAT(v0, v1, gvl); | ||||
| v_sum = VFREDSUMVS_FLOAT(v_sum, v_zero, gvl); | |||||
| asumf += v_sum[0]; | |||||
| v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, gvl); | |||||
| asumf += v_res[0]; | |||||
| } | } | ||||
| } | } | ||||
| return(asumf); | return(asumf); | ||||
| @@ -28,27 +28,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
| #define RVV_EFLOAT RVV_E32 | |||||
| #define RVV_M RVV_M4 | |||||
| #define FLOAT_V_T float32xm4_t | |||||
| #define VLSEV_FLOAT vlsev_float32xm4 | |||||
| #define VSSEV_FLOAT vssev_float32xm4 | |||||
| #define VFMACCVF_FLOAT vfmaccvf_float32xm4 | |||||
| #define VFMVVF_FLOAT vfmvvf_float32xm4 | |||||
| #define VFMULVF_FLOAT vfmulvf_float32xm4 | |||||
| #define VFMSACVF_FLOAT vfmsacvf_float32xm4 | |||||
| #define VFNMSACVF_FLOAT vfnmsacvf_float32xm4 | |||||
| #define VSETVL(n) vsetvl_e32m4(n) | |||||
| #define FLOAT_V_T vfloat32m4_t | |||||
| #define VLSEV_FLOAT vlse_v_f32m4 | |||||
| #define VSSEV_FLOAT vsse_v_f32m4 | |||||
| #define VFMACCVF_FLOAT vfmacc_vf_f32m4 | |||||
| #define VFMVVF_FLOAT vfmv_v_f_f32m4 | |||||
| #define VFMULVF_FLOAT vfmul_vf_f32m4 | |||||
| #define VFMSACVF_FLOAT vfmsac_vf_f32m4 | |||||
| #define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 | |||||
| #else | #else | ||||
| #define RVV_EFLOAT RVV_E64 | |||||
| #define RVV_M RVV_M4 | |||||
| #define FLOAT_V_T float64xm4_t | |||||
| #define VLSEV_FLOAT vlsev_float64xm4 | |||||
| #define VSSEV_FLOAT vssev_float64xm4 | |||||
| #define VFMACCVF_FLOAT vfmaccvf_float64xm4 | |||||
| #define VFMVVF_FLOAT vfmvvf_float64xm4 | |||||
| #define VFMULVF_FLOAT vfmulvf_float64xm4 | |||||
| #define VFMSACVF_FLOAT vfmsacvf_float64xm4 | |||||
| #define VFNMSACVF_FLOAT vfnmsacvf_float64xm4 | |||||
| #define VSETVL(n) vsetvl_e64m4(n) | |||||
| #define FLOAT_V_T vfloat64m4_t | |||||
| #define VLSEV_FLOAT vlse_v_f64m4 | |||||
| #define VSSEV_FLOAT vsse_v_f64m4 | |||||
| #define VFMACCVF_FLOAT vfmacc_vf_f64m4 | |||||
| #define VFMVVF_FLOAT vfmv_v_f_f64m4 | |||||
| #define VFMULVF_FLOAT vfmul_vf_f64m4 | |||||
| #define VFMSACVF_FLOAT vfmsac_vf_f64m4 | |||||
| #define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 | |||||
| #endif | #endif | ||||
| int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FLOAT beta_r, FLOAT beta_i, FLOAT *y, BLASLONG inc_y) | int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FLOAT beta_r, FLOAT beta_i, FLOAT *y, BLASLONG inc_y) | ||||
| @@ -69,7 +67,7 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL | |||||
| if(inc_y == 1){ | if(inc_y == 1){ | ||||
| memset(&y[0], 0, 2 * n * sizeof(FLOAT)); | memset(&y[0], 0, 2 * n * sizeof(FLOAT)); | ||||
| }else{ | }else{ | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| if(gvl <= n/2){ | if(gvl <= n/2){ | ||||
| vy0 = VFMVVF_FLOAT(0.0, gvl); | vy0 = VFMVVF_FLOAT(0.0, gvl); | ||||
| BLASLONG inc_yv = inc_y * gvl * 2; | BLASLONG inc_yv = inc_y * gvl * 2; | ||||
| @@ -83,7 +81,7 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL | |||||
| } | } | ||||
| } | } | ||||
| for(;j<n;){ | for(;j<n;){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| vy0 = VFMVVF_FLOAT(0.0, gvl); | vy0 = VFMVVF_FLOAT(0.0, gvl); | ||||
| VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl); | VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl); | ||||
| VSSEV_FLOAT(&y[iy+1], stride_y, vy0, gvl); | VSSEV_FLOAT(&y[iy+1], stride_y, vy0, gvl); | ||||
| @@ -92,7 +90,7 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL | |||||
| } | } | ||||
| } | } | ||||
| }else{ | }else{ | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| BLASLONG inc_xv = inc_x * gvl * 2; | BLASLONG inc_xv = inc_x * gvl * 2; | ||||
| BLASLONG inc_yv = inc_y * gvl * 2; | BLASLONG inc_yv = inc_y * gvl * 2; | ||||
| for(i=0,j=0; i<n/gvl; i++){ | for(i=0,j=0; i<n/gvl; i++){ | ||||
| @@ -110,7 +108,7 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL | |||||
| iy += inc_yv; | iy += inc_yv; | ||||
| } | } | ||||
| if(j<n){ | if(j<n){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | ||||
| vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | ||||
| vy0 = VFMULVF_FLOAT(vx1, alpha_i, gvl); | vy0 = VFMULVF_FLOAT(vx1, alpha_i, gvl); | ||||
| @@ -124,7 +122,7 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL | |||||
| }else{ | }else{ | ||||
| FLOAT_V_T v0, v1; | FLOAT_V_T v0, v1; | ||||
| if(alpha_r == 0.0 && alpha_i == 0.0){ | if(alpha_r == 0.0 && alpha_i == 0.0){ | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| BLASLONG inc_yv = inc_y * gvl * 2; | BLASLONG inc_yv = inc_y * gvl * 2; | ||||
| for(i=0,j=0;i<n/gvl;i++){ | for(i=0,j=0;i<n/gvl;i++){ | ||||
| vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); | vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); | ||||
| @@ -139,7 +137,7 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL | |||||
| iy += inc_yv; | iy += inc_yv; | ||||
| } | } | ||||
| if(j<n){ | if(j<n){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); | vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); | ||||
| vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl); | vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl); | ||||
| v0 = VFMULVF_FLOAT(vy1, beta_i, gvl); | v0 = VFMULVF_FLOAT(vy1, beta_i, gvl); | ||||
| @@ -150,7 +148,7 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL | |||||
| VSSEV_FLOAT(&y[iy+1], stride_y, v1, gvl); | VSSEV_FLOAT(&y[iy+1], stride_y, v1, gvl); | ||||
| } | } | ||||
| }else{ | }else{ | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| BLASLONG inc_xv = inc_x * gvl * 2; | BLASLONG inc_xv = inc_x * gvl * 2; | ||||
| BLASLONG inc_yv = inc_y * gvl * 2; | BLASLONG inc_yv = inc_y * gvl * 2; | ||||
| for(i=0,j=0; i<n/gvl; i++){ | for(i=0,j=0; i<n/gvl; i++){ | ||||
| @@ -174,7 +172,7 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL | |||||
| iy += inc_yv; | iy += inc_yv; | ||||
| } | } | ||||
| if(j<n){ | if(j<n){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | ||||
| vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | ||||
| vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); | vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); | ||||
| @@ -28,21 +28,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
| #define RVV_EFLOAT RVV_E32 | |||||
| #define RVV_M RVV_M4 | |||||
| #define FLOAT_V_T float32xm4_t | |||||
| #define VLSEV_FLOAT vlsev_float32xm4 | |||||
| #define VSSEV_FLOAT vssev_float32xm4 | |||||
| #define VFMACCVF_FLOAT vfmaccvf_float32xm4 | |||||
| #define VFNMSACVF_FLOAT vfnmsacvf_float32xm4 | |||||
| #define VSETVL(n) vsetvl_e32m4(n) | |||||
| #define FLOAT_V_T vfloat32m4_t | |||||
| #define VLSEV_FLOAT vlse_v_f32m4 | |||||
| #define VSSEV_FLOAT vsse_v_f32m4 | |||||
| #define VFMACCVF_FLOAT vfmacc_vf_f32m4 | |||||
| #define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 | |||||
| #else | #else | ||||
| #define RVV_EFLOAT RVV_E64 | |||||
| #define RVV_M RVV_M4 | |||||
| #define FLOAT_V_T float64xm4_t | |||||
| #define VLSEV_FLOAT vlsev_float64xm4 | |||||
| #define VSSEV_FLOAT vssev_float64xm4 | |||||
| #define VFMACCVF_FLOAT vfmaccvf_float64xm4 | |||||
| #define VFNMSACVF_FLOAT vfnmsacvf_float64xm4 | |||||
| #define VSETVL(n) vsetvl_e64m4(n) | |||||
| #define FLOAT_V_T vfloat64m4_t | |||||
| #define VLSEV_FLOAT vlse_v_f64m4 | |||||
| #define VSSEV_FLOAT vsse_v_f64m4 | |||||
| #define VFMACCVF_FLOAT vfmacc_vf_f64m4 | |||||
| #define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 | |||||
| #endif | #endif | ||||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | ||||
| @@ -56,7 +54,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||||
| BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); | BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); | ||||
| FLOAT_V_T vx0, vx1, vy0, vy1; | FLOAT_V_T vx0, vx1, vy0, vy1; | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| BLASLONG inc_xv = inc_x * 2 * gvl; | BLASLONG inc_xv = inc_x * 2 * gvl; | ||||
| BLASLONG inc_yv = inc_y * 2 * gvl; | BLASLONG inc_yv = inc_y * 2 * gvl; | ||||
| for(i=0,j=0; i < n/gvl; i++){ | for(i=0,j=0; i < n/gvl; i++){ | ||||
| @@ -82,7 +80,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||||
| iy += inc_yv; | iy += inc_yv; | ||||
| } | } | ||||
| if(j < n){ | if(j < n){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | ||||
| vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | ||||
| vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); | vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); | ||||
| @@ -27,17 +27,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
| #define RVV_EFLOAT RVV_E32 | |||||
| #define RVV_M RVV_M4 | |||||
| #define FLOAT_V_T float32xm4_t | |||||
| #define VLSEV_FLOAT vlsev_float32xm4 | |||||
| #define VSSEV_FLOAT vssev_float32xm4 | |||||
| #define VSETVL(n) vsetvl_e32m4(n) | |||||
| #define FLOAT_V_T vfloat32m4_t | |||||
| #define VLSEV_FLOAT vlse_v_f32m4 | |||||
| #define VSSEV_FLOAT vsse_v_f32m4 | |||||
| #else | #else | ||||
| #define RVV_EFLOAT RVV_E64 | |||||
| #define RVV_M RVV_M4 | |||||
| #define FLOAT_V_T float64xm4_t | |||||
| #define VLSEV_FLOAT vlsev_float64xm4 | |||||
| #define VSSEV_FLOAT vssev_float64xm4 | |||||
| #define VSETVL(n) vsetvl_e64m4(n) | |||||
| #define FLOAT_V_T vfloat64m4_t | |||||
| #define VLSEV_FLOAT vlse_v_f64m4 | |||||
| #define VSSEV_FLOAT vsse_v_f64m4 | |||||
| #endif | #endif | ||||
| @@ -52,7 +50,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
| memcpy(&y[0], &x[0], n * 2 * sizeof(FLOAT)); | memcpy(&y[0], &x[0], n * 2 * sizeof(FLOAT)); | ||||
| }else{ | }else{ | ||||
| FLOAT_V_T vx0, vx1, vx2, vx3; | FLOAT_V_T vx0, vx1, vx2, vx3; | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); | BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); | ||||
| BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); | BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); | ||||
| if(gvl <= n/2){ | if(gvl <= n/2){ | ||||
| @@ -75,7 +73,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
| } | } | ||||
| } | } | ||||
| for(;j<n;){ | for(;j<n;){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | ||||
| vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | ||||
| VSSEV_FLOAT(&y[iy], stride_y, vx0, gvl); | VSSEV_FLOAT(&y[iy], stride_y, vx0, gvl); | ||||
| @@ -27,31 +27,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
| #define RVV_EFLOAT RVV_E32 | |||||
| #define RVV_M RVV_M4 | |||||
| #define FLOAT_V_T float32xm4_t | |||||
| #define VLEV_FLOAT vlev_float32xm4 | |||||
| #define VLSEV_FLOAT vlsev_float32xm4 | |||||
| #define VFREDSUM_FLOAT vfredsumvs_float32xm4 | |||||
| #define VFMACCVV_FLOAT vfmaccvv_float32xm4 | |||||
| #define VFMVVF_FLOAT vfmvvf_float32xm4 | |||||
| #define VFDOTVV_FLOAT vfdotvv_float32xm4 | |||||
| #define VFMULVV_FLOAT vfmulvv_float32xm4 | |||||
| #define VFMSACVV_FLOAT vfmsacvv_float32xm4 | |||||
| #define VFNMSACVV_FLOAT vfnmsacvv_float32xm4 | |||||
| #define VSETVL(n) vsetvl_e32m4(n) | |||||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||||
| #define FLOAT_V_T vfloat32m4_t | |||||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||||
| #define VLEV_FLOAT vle_v_f32m4 | |||||
| #define VLSEV_FLOAT vlse_v_f32m4 | |||||
| #define VFREDSUM_FLOAT vfredsum_vs_f32m4_f32m1 | |||||
| #define VFMACCVV_FLOAT vfmacc_vv_f32m4 | |||||
| #define VFMVVF_FLOAT vfmv_v_f_f32m4 | |||||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||||
| #define VFDOTVV_FLOAT vfdot_vv_f32m4 | |||||
| #define VFMULVV_FLOAT vfmul_vv_f32m4 | |||||
| #define VFMSACVV_FLOAT vfmsac_vv_f32m4 | |||||
| #define VFNMSACVV_FLOAT vfnmsac_vv_f32m4 | |||||
| #else | #else | ||||
| #define RVV_EFLOAT RVV_E64 | |||||
| #define RVV_M RVV_M4 | |||||
| #define FLOAT_V_T float64xm4_t | |||||
| #define VLEV_FLOAT vlev_float64xm4 | |||||
| #define VLSEV_FLOAT vlsev_float64xm4 | |||||
| #define VFREDSUM_FLOAT vfredsumvs_float64xm4 | |||||
| #define VFMACCVV_FLOAT vfmaccvv_float64xm4 | |||||
| #define VFMVVF_FLOAT vfmvvf_float64xm4 | |||||
| #define VFDOTVV_FLOAT vfdotvv_float64xm4 | |||||
| #define VFMULVV_FLOAT vfmulvv_float64xm4 | |||||
| #define VFMSACVV_FLOAT vfmsacvv_float64xm4 | |||||
| #define VFNMSACVV_FLOAT vfnmsacvv_float64xm4 | |||||
| #define VSETVL(n) vsetvl_e64m4(n) | |||||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||||
| #define FLOAT_V_T vfloat64m4_t | |||||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||||
| #define VLEV_FLOAT vle_v_f64m4 | |||||
| #define VLSEV_FLOAT vlse_v_f64m4 | |||||
| #define VFREDSUM_FLOAT vfredsum_vs_f64m4_f64m1 | |||||
| #define VFMACCVV_FLOAT vfmacc_vv_f64m4 | |||||
| #define VFMVVF_FLOAT vfmv_v_f_f64m4 | |||||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||||
| #define VFDOTVV_FLOAT vfdot_vv_f64m4 | |||||
| #define VFMULVV_FLOAT vfmul_vv_f64m4 | |||||
| #define VFMSACVV_FLOAT vfmsac_vv_f64m4 | |||||
| #define VFNMSACVV_FLOAT vfnmsac_vv_f64m4 | |||||
| #endif | #endif | ||||
| OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | ||||
| @@ -70,9 +74,13 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA | |||||
| if ( n < 1 ) return(result); | if ( n < 1 ) return(result); | ||||
| unsigned int gvl = 0; | unsigned int gvl = 0; | ||||
| FLOAT_V_T_M1 v_res, v_z0; | |||||
| gvl = VSETVL_MAX; | |||||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||||
| v_z0 = VFMVVF_FLOAT_M1(0, gvl); | |||||
| FLOAT_V_T vr0, vr1, vx0, vx1, vy0, vy1; | FLOAT_V_T vr0, vr1, vx0, vx1, vy0, vy1; | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| vr0 = VFMVVF_FLOAT(0, gvl); | vr0 = VFMVVF_FLOAT(0, gvl); | ||||
| vr1 = VFMVVF_FLOAT(0, gvl); | vr1 = VFMVVF_FLOAT(0, gvl); | ||||
| BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); | BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); | ||||
| @@ -99,14 +107,13 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA | |||||
| ix += inc_xv; | ix += inc_xv; | ||||
| iy += inc_yv; | iy += inc_yv; | ||||
| } | } | ||||
| vx0 = VFMVVF_FLOAT(0, gvl); | |||||
| vr0 = VFREDSUM_FLOAT(vr0, vx0, gvl); | |||||
| dot[0] += vr0[0]; | |||||
| vr1 = VFREDSUM_FLOAT(vr1, vx0, gvl); | |||||
| dot[1] += vr1[0]; | |||||
| v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl); | |||||
| dot[0] += v_res[0]; | |||||
| v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl); | |||||
| dot[1] += v_res[0]; | |||||
| //tail | //tail | ||||
| if(j < n){ | if(j < n){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | ||||
| vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | ||||
| vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); | vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); | ||||
| @@ -123,11 +130,10 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA | |||||
| vr1 = VFMULVV_FLOAT(vx1, vy0, gvl); | vr1 = VFMULVV_FLOAT(vx1, vy0, gvl); | ||||
| vr1 = VFMSACVV_FLOAT(vr1, vx0, vy1, gvl); | vr1 = VFMSACVV_FLOAT(vr1, vx0, vy1, gvl); | ||||
| #endif | #endif | ||||
| vx0 = VFMVVF_FLOAT(0, gvl); | |||||
| vr0 = VFREDSUM_FLOAT(vr0, vx0, gvl); | |||||
| dot[0] += vr0[0]; | |||||
| vr1 = VFREDSUM_FLOAT(vr1, vx0, gvl); | |||||
| dot[1] += vr1[0]; | |||||
| v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl); | |||||
| dot[0] += v_res[0]; | |||||
| v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl); | |||||
| dot[1] += v_res[0]; | |||||
| } | } | ||||
| CREAL(result) = dot[0]; | CREAL(result) = dot[0]; | ||||
| CIMAG(result) = dot[1]; | CIMAG(result) = dot[1]; | ||||
| @@ -27,25 +27,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
| #define RVV_EFLOAT RVV_E32 | |||||
| #define RVV_M RVV_M4 | |||||
| #define FLOAT_V_T float32xm4_t | |||||
| #define VLEV_FLOAT vlev_float32xm4 | |||||
| #define VLSEV_FLOAT vlsev_float32xm4 | |||||
| #define VSEV_FLOAT vsev_float32xm4 | |||||
| #define VSSEV_FLOAT vssev_float32xm4 | |||||
| #define VFMACCVF_FLOAT vfmaccvf_float32xm4 | |||||
| #define VFNMSACVF_FLOAT vfnmsacvf_float32xm4 | |||||
| #define VSETVL(n) vsetvl_e32m4(n) | |||||
| #define FLOAT_V_T vfloat32m4_t | |||||
| #define VLEV_FLOAT vle_v_f32m4 | |||||
| #define VLSEV_FLOAT vlse_v_f32m4 | |||||
| #define VSEV_FLOAT vse_v_f32m4 | |||||
| #define VSSEV_FLOAT vsse_v_f32m4 | |||||
| #define VFMACCVF_FLOAT vfmacc_vf_f32m4 | |||||
| #define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 | |||||
| #else | #else | ||||
| #define RVV_EFLOAT RVV_E64 | |||||
| #define RVV_M RVV_M4 | |||||
| #define FLOAT_V_T float64xm4_t | |||||
| #define VLEV_FLOAT vlev_float64xm4 | |||||
| #define VLSEV_FLOAT vlsev_float64xm4 | |||||
| #define VSEV_FLOAT vsev_float64xm4 | |||||
| #define VSSEV_FLOAT vssev_float64xm4 | |||||
| #define VFMACCVF_FLOAT vfmaccvf_float64xm4 | |||||
| #define VFNMSACVF_FLOAT vfnmsacvf_float64xm4 | |||||
| #define VSETVL(n) vsetvl_e64m4(n) | |||||
| #define FLOAT_V_T vfloat64m4_t | |||||
| #define VLEV_FLOAT vle_v_f64m4 | |||||
| #define VLSEV_FLOAT vlse_v_f64m4 | |||||
| #define VSEV_FLOAT vse_v_f64m4 | |||||
| #define VSSEV_FLOAT vsse_v_f64m4 | |||||
| #define VFMACCVF_FLOAT vfmacc_vf_f64m4 | |||||
| #define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 | |||||
| #endif | #endif | ||||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | ||||
| @@ -58,7 +56,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, | |||||
| unsigned int gvl = 0; | unsigned int gvl = 0; | ||||
| BLASLONG stride_a = sizeof(FLOAT) * 2; | BLASLONG stride_a = sizeof(FLOAT) * 2; | ||||
| BLASLONG stride_y = inc_y * sizeof(FLOAT) * 2; | BLASLONG stride_y = inc_y * sizeof(FLOAT) * 2; | ||||
| gvl = vsetvli(m, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(m); | |||||
| BLASLONG inc_yv = inc_y * gvl * 2; | BLASLONG inc_yv = inc_y * gvl * 2; | ||||
| BLASLONG inc_x2 = inc_x * 2; | BLASLONG inc_x2 = inc_x * 2; | ||||
| BLASLONG lda2 = lda * 2; | BLASLONG lda2 = lda * 2; | ||||
| @@ -117,7 +115,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, | |||||
| } | } | ||||
| //tail | //tail | ||||
| if(j/2 < m){ | if(j/2 < m){ | ||||
| gvl = vsetvli(m-j/2, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(m-j/2); | |||||
| a_ptr = a; | a_ptr = a; | ||||
| ix = 0; | ix = 0; | ||||
| vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); | vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); | ||||
| @@ -27,25 +27,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
| #define RVV_EFLOAT RVV_E32 | |||||
| #define RVV_M RVV_M4 | |||||
| #define FLOAT_V_T float32xm4_t | |||||
| #define VLSEV_FLOAT vlsev_float32xm4 | |||||
| #define VFREDSUM_FLOAT vfredsumvs_float32xm4 | |||||
| #define VFMACCVV_FLOAT vfmaccvv_float32xm4 | |||||
| #define VFNMSACVV_FLOAT vfnmsacvv_float32xm4 | |||||
| #define VFMVVF_FLOAT vfmvvf_float32xm4 | |||||
| #define VFMULVV_FLOAT vfmulvv_float32xm4 | |||||
| #define VSETVL(n) vsetvl_e32m4(n) | |||||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||||
| #define FLOAT_V_T vfloat32m4_t | |||||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||||
| #define VLSEV_FLOAT vlse_v_f32m4 | |||||
| #define VFREDSUM_FLOAT vfredsum_vs_f32m4_f32m1 | |||||
| #define VFMACCVV_FLOAT vfmacc_vv_f32m4 | |||||
| #define VFNMSACVV_FLOAT vfnmsac_vv_f32m4 | |||||
| #define VFMVVF_FLOAT vfmv_v_f_f32m4 | |||||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||||
| #define VFMULVV_FLOAT vfmul_vv_f32m4 | |||||
| #else | #else | ||||
| #define RVV_EFLOAT RVV_E64 | |||||
| #define RVV_M RVV_M4 | |||||
| #define FLOAT_V_T float64xm4_t | |||||
| #define VLSEV_FLOAT vlsev_float64xm4 | |||||
| #define VFREDSUM_FLOAT vfredsumvs_float64xm4 | |||||
| #define VFMACCVV_FLOAT vfmaccvv_float64xm4 | |||||
| #define VFNMSACVV_FLOAT vfnmsacvv_float64xm4 | |||||
| #define VFMVVF_FLOAT vfmvvf_float64xm4 | |||||
| #define VFMULVV_FLOAT vfmulvv_float64xm4 | |||||
| #define VSETVL(n) vsetvl_e64m4(n) | |||||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||||
| #define FLOAT_V_T vfloat64m4_t | |||||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||||
| #define VLSEV_FLOAT vlse_v_f64m4 | |||||
| #define VFREDSUM_FLOAT vfredsum_vs_f64m4_f64m1 | |||||
| #define VFMACCVV_FLOAT vfmacc_vv_f64m4 | |||||
| #define VFNMSACVV_FLOAT vfnmsac_vv_f64m4 | |||||
| #define VFMVVF_FLOAT vfmv_v_f_f64m4 | |||||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||||
| #define VFMULVV_FLOAT vfmul_vv_f64m4 | |||||
| #endif | #endif | ||||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | ||||
| @@ -57,15 +61,20 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, | |||||
| FLOAT_V_T va0, va1, vx0, vx1, vr, vi; | FLOAT_V_T va0, va1, vx0, vx1, vr, vi; | ||||
| unsigned int gvl = 0; | unsigned int gvl = 0; | ||||
| FLOAT_V_T_M1 v_res, v_z0; | |||||
| gvl = VSETVL_MAX; | |||||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||||
| v_z0 = VFMVVF_FLOAT_M1(0, gvl); | |||||
| BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; | BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; | ||||
| BLASLONG stride_a = sizeof(FLOAT) * 2; | BLASLONG stride_a = sizeof(FLOAT) * 2; | ||||
| gvl = vsetvli(m, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(m); | |||||
| BLASLONG inc_xv = inc_x * gvl * 2; | BLASLONG inc_xv = inc_x * gvl * 2; | ||||
| BLASLONG inc_av = gvl * 2; | BLASLONG inc_av = gvl * 2; | ||||
| BLASLONG inc_y2 = inc_y * 2; | BLASLONG inc_y2 = inc_y * 2; | ||||
| BLASLONG lda2 = lda * 2; | BLASLONG lda2 = lda * 2; | ||||
| for(i = 0; i < n; i++){ | for(i = 0; i < n; i++){ | ||||
| gvl = vsetvli(m, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(m); | |||||
| j = 0; | j = 0; | ||||
| ix = 0; | ix = 0; | ||||
| vr = VFMVVF_FLOAT(0, gvl); | vr = VFMVVF_FLOAT(0, gvl); | ||||
| @@ -90,13 +99,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, | |||||
| j += inc_av; | j += inc_av; | ||||
| ix += inc_xv; | ix += inc_xv; | ||||
| } | } | ||||
| va0 = VFMVVF_FLOAT(0, gvl); | |||||
| vx0 = VFREDSUM_FLOAT(vr, va0, gvl); | |||||
| temp_r = vx0[0]; | |||||
| vx1 = VFREDSUM_FLOAT(vi, va0, gvl); | |||||
| temp_i = vx1[0]; | |||||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
| temp_r = v_res[0]; | |||||
| v_res = VFREDSUM_FLOAT(v_res, vi, v_z0, gvl); | |||||
| temp_i = v_res[0]; | |||||
| if(j/2 < m){ | if(j/2 < m){ | ||||
| gvl = vsetvli(m-j/2, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(m-j/2); | |||||
| va0 = VLSEV_FLOAT(&a_ptr[j], stride_a, gvl); | va0 = VLSEV_FLOAT(&a_ptr[j], stride_a, gvl); | ||||
| va1 = VLSEV_FLOAT(&a_ptr[j+1], stride_a, gvl); | va1 = VLSEV_FLOAT(&a_ptr[j+1], stride_a, gvl); | ||||
| vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | ||||
| @@ -113,11 +121,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, | |||||
| vi = VFNMSACVV_FLOAT(vi, va1, vx0, gvl); | vi = VFNMSACVV_FLOAT(vi, va1, vx0, gvl); | ||||
| #endif | #endif | ||||
| va0 = VFMVVF_FLOAT(0, gvl); | |||||
| vx0 = VFREDSUM_FLOAT(vr, va0, gvl); | |||||
| temp_r += vx0[0]; | |||||
| vx1 = VFREDSUM_FLOAT(vi, va0, gvl); | |||||
| temp_i += vx1[0]; | |||||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
| temp_r += v_res[0]; | |||||
| v_res = VFREDSUM_FLOAT(v_res, vi, v_z0, gvl); | |||||
| temp_i += v_res[0]; | |||||
| } | } | ||||
| #if !defined(XCONJ) | #if !defined(XCONJ) | ||||
| y[iy] += alpha_r * temp_r - alpha_i * temp_i; | y[iy] += alpha_r * temp_r - alpha_i * temp_i; | ||||
| @@ -27,31 +27,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
| #define RVV_EFLOAT RVV_E32 | |||||
| #define RVV_M RVV_M4 | |||||
| #define FLOAT_V_T float32xm4_t | |||||
| #define VLSEV_FLOAT vlsev_float32xm4 | |||||
| #define VSSEV_FLOAT vssev_float32xm4 | |||||
| #define VFREDSUM_FLOAT vfredsumvs_float32xm4 | |||||
| #define VFMACCVV_FLOAT vfmaccvv_float32xm4 | |||||
| #define VFMACCVF_FLOAT vfmaccvf_float32xm4 | |||||
| #define VFMVVF_FLOAT vfmvvf_float32xm4 | |||||
| #define VFMULVV_FLOAT vfmulvv_float32xm4 | |||||
| #define VFNMSACVF_FLOAT vfnmsacvf_float32xm4 | |||||
| #define VFNMSACVV_FLOAT vfnmsacvv_float32xm4 | |||||
| #define VSETVL(n) vsetvl_e32m4(n) | |||||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||||
| #define FLOAT_V_T vfloat32m4_t | |||||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||||
| #define VLSEV_FLOAT vlse_v_f32m4 | |||||
| #define VSSEV_FLOAT vsse_v_f32m4 | |||||
| #define VFREDSUM_FLOAT vfredsum_vs_f32m4_f32m1 | |||||
| #define VFMACCVV_FLOAT vfmacc_vv_f32m4 | |||||
| #define VFMACCVF_FLOAT vfmacc_vf_f32m4 | |||||
| #define VFMVVF_FLOAT vfmv_v_f_f32m4 | |||||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||||
| #define VFMULVV_FLOAT vfmul_vv_f32m4 | |||||
| #define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 | |||||
| #define VFNMSACVV_FLOAT vfnmsac_vv_f32m4 | |||||
| #else | #else | ||||
| #define RVV_EFLOAT RVV_E64 | |||||
| #define RVV_M RVV_M4 | |||||
| #define FLOAT_V_T float64xm4_t | |||||
| #define VLSEV_FLOAT vlsev_float64xm4 | |||||
| #define VSSEV_FLOAT vssev_float64xm4 | |||||
| #define VFREDSUM_FLOAT vfredsumvs_float64xm4 | |||||
| #define VFMACCVV_FLOAT vfmaccvv_float64xm4 | |||||
| #define VFMACCVF_FLOAT vfmaccvf_float64xm4 | |||||
| #define VFMVVF_FLOAT vfmvvf_float64xm4 | |||||
| #define VFMULVV_FLOAT vfmulvv_float64xm4 | |||||
| #define VFNMSACVF_FLOAT vfnmsacvf_float64xm4 | |||||
| #define VFNMSACVV_FLOAT vfnmsacvv_float64xm4 | |||||
| #define VSETVL(n) vsetvl_e64m4(n) | |||||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||||
| #define FLOAT_V_T vfloat64m4_t | |||||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||||
| #define VLSEV_FLOAT vlse_v_f64m4 | |||||
| #define VSSEV_FLOAT vsse_v_f64m4 | |||||
| #define VFREDSUM_FLOAT vfredsum_vs_f64m4_f64m1 | |||||
| #define VFMACCVV_FLOAT vfmacc_vv_f64m4 | |||||
| #define VFMACCVF_FLOAT vfmacc_vf_f64m4 | |||||
| #define VFMVVF_FLOAT vfmv_v_f_f64m4 | |||||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||||
| #define VFMULVV_FLOAT vfmul_vv_f64m4 | |||||
| #define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 | |||||
| #define VFNMSACVV_FLOAT vfnmsac_vv_f64m4 | |||||
| #endif | #endif | ||||
| int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){ | int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){ | ||||
| @@ -62,7 +66,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B | |||||
| FLOAT temp_r2, temp_i2; | FLOAT temp_r2, temp_i2; | ||||
| FLOAT *a_ptr = a; | FLOAT *a_ptr = a; | ||||
| unsigned int gvl = 0; | unsigned int gvl = 0; | ||||
| FLOAT_V_T_M1 v_res, v_z0; | |||||
| gvl = VSETVL_MAX; | |||||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||||
| v_z0 = VFMVVF_FLOAT_M1(0, gvl); | |||||
| FLOAT_V_T va0, va1, vx0, vx1, vy0, vy1, vr0, vr1; | FLOAT_V_T va0, va1, vx0, vx1, vy0, vy1, vr0, vr1; | ||||
| BLASLONG stride_x, stride_y, stride_a, inc_xv, inc_yv, inc_av, len, lda2; | BLASLONG stride_x, stride_y, stride_a, inc_xv, inc_yv, inc_av, len, lda2; | ||||
| @@ -90,7 +97,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B | |||||
| i = j + 1; | i = j + 1; | ||||
| len = m - i; | len = m - i; | ||||
| if(len > 0){ | if(len > 0){ | ||||
| gvl = vsetvli(len, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(len); | |||||
| inc_xv = incx * gvl * 2; | inc_xv = incx * gvl * 2; | ||||
| inc_yv = incy * gvl * 2; | inc_yv = incy * gvl * 2; | ||||
| inc_av = gvl * 2; | inc_av = gvl * 2; | ||||
| @@ -134,13 +141,12 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B | |||||
| iy += inc_yv; | iy += inc_yv; | ||||
| ia += inc_av; | ia += inc_av; | ||||
| } | } | ||||
| va0 = VFMVVF_FLOAT(0, gvl); | |||||
| vx0 = VFREDSUM_FLOAT(vr0, va0, gvl); | |||||
| temp_r2 = vx0[0]; | |||||
| vx1 = VFREDSUM_FLOAT(vr1, va0, gvl); | |||||
| temp_i2 = vx1[0]; | |||||
| v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl); | |||||
| temp_r2 = v_res[0]; | |||||
| v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl); | |||||
| temp_i2 = v_res[0]; | |||||
| if(i < m){ | if(i < m){ | ||||
| gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(m-i); | |||||
| va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl); | va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl); | ||||
| va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl); | va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl); | ||||
| vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); | vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); | ||||
| @@ -173,11 +179,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B | |||||
| vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); | vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); | ||||
| #endif | #endif | ||||
| va0 = VFMVVF_FLOAT(0, gvl); | |||||
| vx0 = VFREDSUM_FLOAT(vr0, va0, gvl); | |||||
| temp_r2 += vx0[0]; | |||||
| vx1 = VFREDSUM_FLOAT(vr1, va0, gvl); | |||||
| temp_i2 += vx1[0]; | |||||
| v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl); | |||||
| temp_r2 += v_res[0]; | |||||
| v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl); | |||||
| temp_i2 += v_res[0]; | |||||
| } | } | ||||
| } | } | ||||
| y[jy] += alpha_r * temp_r2 - alpha_i * temp_i2; | y[jy] += alpha_r * temp_r2 - alpha_i * temp_i2; | ||||
| @@ -27,31 +27,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
| #define RVV_EFLOAT RVV_E32 | |||||
| #define RVV_M RVV_M4 | |||||
| #define FLOAT_V_T float32xm4_t | |||||
| #define VLSEV_FLOAT vlsev_float32xm4 | |||||
| #define VSSEV_FLOAT vssev_float32xm4 | |||||
| #define VFREDSUM_FLOAT vfredsumvs_float32xm4 | |||||
| #define VFMACCVV_FLOAT vfmaccvv_float32xm4 | |||||
| #define VFMACCVF_FLOAT vfmaccvf_float32xm4 | |||||
| #define VFMVVF_FLOAT vfmvvf_float32xm4 | |||||
| #define VFMULVV_FLOAT vfmulvv_float32xm4 | |||||
| #define VFNMSACVF_FLOAT vfnmsacvf_float32xm4 | |||||
| #define VFNMSACVV_FLOAT vfnmsacvv_float32xm4 | |||||
| #define VSETVL(n) vsetvl_e32m4(n) | |||||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||||
| #define FLOAT_V_T vfloat32m4_t | |||||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||||
| #define VLSEV_FLOAT vlse_v_f32m4 | |||||
| #define VSSEV_FLOAT vsse_v_f32m4 | |||||
| #define VFREDSUM_FLOAT vfredsum_vs_f32m4_f32m1 | |||||
| #define VFMACCVV_FLOAT vfmacc_vv_f32m4 | |||||
| #define VFMACCVF_FLOAT vfmacc_vf_f32m4 | |||||
| #define VFMVVF_FLOAT vfmv_v_f_f32m4 | |||||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||||
| #define VFMULVV_FLOAT vfmul_vv_f32m4 | |||||
| #define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 | |||||
| #define VFNMSACVV_FLOAT vfnmsac_vv_f32m4 | |||||
| #else | #else | ||||
| #define RVV_EFLOAT RVV_E64 | |||||
| #define RVV_M RVV_M4 | |||||
| #define FLOAT_V_T float64xm4_t | |||||
| #define VLSEV_FLOAT vlsev_float64xm4 | |||||
| #define VSSEV_FLOAT vssev_float64xm4 | |||||
| #define VFREDSUM_FLOAT vfredsumvs_float64xm4 | |||||
| #define VFMACCVV_FLOAT vfmaccvv_float64xm4 | |||||
| #define VFMACCVF_FLOAT vfmaccvf_float64xm4 | |||||
| #define VFMVVF_FLOAT vfmvvf_float64xm4 | |||||
| #define VFMULVV_FLOAT vfmulvv_float64xm4 | |||||
| #define VFNMSACVF_FLOAT vfnmsacvf_float64xm4 | |||||
| #define VFNMSACVV_FLOAT vfnmsacvv_float64xm4 | |||||
| #define VSETVL(n) vsetvl_e64m4(n) | |||||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||||
| #define FLOAT_V_T vfloat64m4_t | |||||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||||
| #define VLSEV_FLOAT vlse_v_f64m4 | |||||
| #define VSSEV_FLOAT vsse_v_f64m4 | |||||
| #define VFREDSUM_FLOAT vfredsum_vs_f64m4_f64m1 | |||||
| #define VFMACCVV_FLOAT vfmacc_vv_f64m4 | |||||
| #define VFMACCVF_FLOAT vfmacc_vf_f64m4 | |||||
| #define VFMVVF_FLOAT vfmv_v_f_f64m4 | |||||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||||
| #define VFMULVV_FLOAT vfmul_vv_f64m4 | |||||
| #define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 | |||||
| #define VFNMSACVV_FLOAT vfnmsac_vv_f64m4 | |||||
| #endif | #endif | ||||
| int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){ | int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){ | ||||
| @@ -62,7 +66,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B | |||||
| FLOAT temp_r2, temp_i2; | FLOAT temp_r2, temp_i2; | ||||
| FLOAT *a_ptr = a; | FLOAT *a_ptr = a; | ||||
| unsigned int gvl = 0; | unsigned int gvl = 0; | ||||
| FLOAT_V_T_M1 v_res, v_z0; | |||||
| gvl = VSETVL_MAX; | |||||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||||
| v_z0 = VFMVVF_FLOAT_M1(0, gvl); | |||||
| FLOAT_V_T va0, va1, vx0, vx1, vy0, vy1, vr0, vr1; | FLOAT_V_T va0, va1, vx0, vx1, vy0, vy1, vr0, vr1; | ||||
| BLASLONG stride_x, stride_y, stride_a, inc_xv, inc_yv, inc_av, lda2; | BLASLONG stride_x, stride_y, stride_a, inc_xv, inc_yv, inc_av, lda2; | ||||
| @@ -89,7 +96,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B | |||||
| ia = 0; | ia = 0; | ||||
| i = 0; | i = 0; | ||||
| if(j > 0){ | if(j > 0){ | ||||
| gvl = vsetvli(j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(j); | |||||
| inc_xv = incx * gvl * 2; | inc_xv = incx * gvl * 2; | ||||
| inc_yv = incy * gvl * 2; | inc_yv = incy * gvl * 2; | ||||
| inc_av = gvl * 2; | inc_av = gvl * 2; | ||||
| @@ -133,13 +140,12 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B | |||||
| iy += inc_yv; | iy += inc_yv; | ||||
| ia += inc_av; | ia += inc_av; | ||||
| } | } | ||||
| va0 = VFMVVF_FLOAT(0, gvl); | |||||
| vx0 = VFREDSUM_FLOAT(vr0, va0, gvl); | |||||
| temp_r2 = vx0[0]; | |||||
| vx1 = VFREDSUM_FLOAT(vr1, va0, gvl); | |||||
| temp_i2 = vx1[0]; | |||||
| v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl); | |||||
| temp_r2 = v_res[0]; | |||||
| v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl); | |||||
| temp_i2 = v_res[0]; | |||||
| if(i < j){ | if(i < j){ | ||||
| gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(j-i); | |||||
| va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl); | va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl); | ||||
| va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl); | va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl); | ||||
| vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); | vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); | ||||
| @@ -172,11 +178,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B | |||||
| vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); | vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); | ||||
| #endif | #endif | ||||
| va0 = VFMVVF_FLOAT(0, gvl); | |||||
| vx0 = VFREDSUM_FLOAT(vr0, va0, gvl); | |||||
| temp_r2 += vx0[0]; | |||||
| vx1 = VFREDSUM_FLOAT(vr1, va0, gvl); | |||||
| temp_i2 += vx1[0]; | |||||
| v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl); | |||||
| temp_r2 += v_res[0]; | |||||
| v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl); | |||||
| temp_i2 += v_res[0]; | |||||
| } | } | ||||
| } | } | ||||
| y[jy] += temp_r1 * a_ptr[ja]; | y[jy] += temp_r1 * a_ptr[ja]; | ||||
| @@ -27,41 +27,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
| #define RVV_EFLOAT RVV_E32 | |||||
| #define RVV_M RVV_M4 | |||||
| #define FLOAT_V_T float32xm4_t | |||||
| #define VLEV_FLOAT vlev_float32xm4 | |||||
| #define VLSEV_FLOAT vlsev_float32xm4 | |||||
| #define VFREDSUM_FLOAT vfredsumvs_float32xm4 | |||||
| #define VFMACCVV_FLOAT vfmaccvv_float32xm4 | |||||
| #define VFMVVF_FLOAT vfmvvf_float32xm4 | |||||
| #define VFDOTVV_FLOAT vfdotvv_float32xm4 | |||||
| #define VSETVL(n) vsetvl_e32m4(n) | |||||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||||
| #define FLOAT_V_T vfloat32m4_t | |||||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||||
| #define VLEV_FLOAT vle_v_f32m4 | |||||
| #define VLSEV_FLOAT vlse_v_f32m4 | |||||
| #define VFREDSUM_FLOAT vfredsum_vs_f32m4_f32m1 | |||||
| #define VFMACCVV_FLOAT vfmacc_vv_f32m4 | |||||
| #define VFMVVF_FLOAT vfmv_v_f_f32m4 | |||||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||||
| #define VFDOTVV_FLOAT vfdot_vv_f32m4 | |||||
| #define ABS fabsf | #define ABS fabsf | ||||
| #define MASK_T e32xm4_t | |||||
| #define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm4 | |||||
| #define VMFGTVF_FLOAT vmfgtvf_e32xm4_float32xm4 | |||||
| #define VMFIRSTM vmfirstm_e32xm4 | |||||
| #define VFDIVVF_FLOAT vfdivvf_float32xm4 | |||||
| #define VMFLTVF_FLOAT vmfltvf_e32xm4_float32xm4 | |||||
| #define VFREDMAXVS_FLOAT vfredmaxvs_float32xm4 | |||||
| #define MASK_T vbool8_t | |||||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m4_m | |||||
| #define VMFGTVF_FLOAT vmfgt_vf_f32m4_b8 | |||||
| #define VMFIRSTM vmfirst_m_b8 | |||||
| #define VFDIVVF_FLOAT vfdiv_vf_f32m4 | |||||
| #define VMFLTVF_FLOAT vmflt_vf_f32m4_b8 | |||||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f32m4_f32m1 | |||||
| #else | #else | ||||
| #define RVV_EFLOAT RVV_E64 | |||||
| #define RVV_M RVV_M4 | |||||
| #define FLOAT_V_T float64xm4_t | |||||
| #define VLEV_FLOAT vlev_float64xm4 | |||||
| #define VLSEV_FLOAT vlsev_float64xm4 | |||||
| #define VFREDSUM_FLOAT vfredsumvs_float64xm4 | |||||
| #define VFMACCVV_FLOAT vfmaccvv_float64xm4 | |||||
| #define VFMVVF_FLOAT vfmvvf_float64xm4 | |||||
| #define VFDOTVV_FLOAT vfdotvv_float64xm4 | |||||
| #define VSETVL(n) vsetvl_e64m4(n) | |||||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||||
| #define FLOAT_V_T vfloat64m4_t | |||||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||||
| #define VLEV_FLOAT vle_v_f64m4 | |||||
| #define VLSEV_FLOAT vlse_v_f64m4 | |||||
| #define VFREDSUM_FLOAT vfredsum_vs_f64m4_f64m1 | |||||
| #define VFMACCVV_FLOAT vfmacc_vv_f64m4 | |||||
| #define VFMVVF_FLOAT vfmv_v_f_f64m4 | |||||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||||
| #define VFDOTVV_FLOAT vfdot_vv_f64m4 | |||||
| #define ABS fabs | #define ABS fabs | ||||
| #define MASK_T e64xm4_t | |||||
| #define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm4 | |||||
| #define VMFGTVF_FLOAT vmfgtvf_e64xm4_float64xm4 | |||||
| #define VMFIRSTM vmfirstm_e64xm4 | |||||
| #define VFDIVVF_FLOAT vfdivvf_float64xm4 | |||||
| #define VMFLTVF_FLOAT vmfltvf_e64xm4_float64xm4 | |||||
| #define VFREDMAXVS_FLOAT vfredmaxvs_float64xm4 | |||||
| #define MASK_T vbool16_t | |||||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m4_m | |||||
| #define VMFGTVF_FLOAT vmfgt_vf_f64m4_b16 | |||||
| #define VMFIRSTM vmfirst_m_b16 | |||||
| #define VFDIVVF_FLOAT vfdiv_vf_f64m4 | |||||
| #define VMFLTVF_FLOAT vmflt_vf_f64m4_b16 | |||||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f64m4_f64m1 | |||||
| #endif | #endif | ||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | ||||
| @@ -73,19 +77,24 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| FLOAT_V_T vr, v0, v_zero; | FLOAT_V_T vr, v0, v_zero; | ||||
| unsigned int gvl = 0; | unsigned int gvl = 0; | ||||
| FLOAT_V_T_M1 v_res, v_z0; | |||||
| gvl = VSETVL_MAX; | |||||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||||
| v_z0 = VFMVVF_FLOAT_M1(0, gvl); | |||||
| FLOAT scale = 0.0, ssq = 0.0; | FLOAT scale = 0.0, ssq = 0.0; | ||||
| MASK_T mask; | MASK_T mask; | ||||
| BLASLONG index = 0; | BLASLONG index = 0; | ||||
| if(inc_x == 1){ | if(inc_x == 1){ | ||||
| BLASLONG n2 = n * 2; | BLASLONG n2 = n * 2; | ||||
| gvl = vsetvli(n2, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n2); | |||||
| vr = VFMVVF_FLOAT(0, gvl); | vr = VFMVVF_FLOAT(0, gvl); | ||||
| v_zero = VFMVVF_FLOAT(0, gvl); | v_zero = VFMVVF_FLOAT(0, gvl); | ||||
| for(i=0,j=0; i<n2/gvl; i++){ | for(i=0,j=0; i<n2/gvl; i++){ | ||||
| v0 = VLEV_FLOAT(&x[j], gvl); | v0 = VLEV_FLOAT(&x[j], gvl); | ||||
| //fabs(vector) | //fabs(vector) | ||||
| mask = VMFLTVF_FLOAT(v0, 0, gvl); | mask = VMFLTVF_FLOAT(v0, 0, gvl); | ||||
| v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask, gvl); | |||||
| v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl); | |||||
| //if scale change | //if scale change | ||||
| mask = VMFGTVF_FLOAT(v0, scale, gvl); | mask = VMFGTVF_FLOAT(v0, scale, gvl); | ||||
| index = VMFIRSTM(mask, gvl); | index = VMFIRSTM(mask, gvl); | ||||
| @@ -96,15 +105,15 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| } | } | ||||
| }else{//found greater element | }else{//found greater element | ||||
| //ssq in vector vr: vr[0] | //ssq in vector vr: vr[0] | ||||
| vr = VFREDSUM_FLOAT(vr, v_zero, gvl); | |||||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
| //total ssq before current vector | //total ssq before current vector | ||||
| ssq += vr[0]; | |||||
| ssq += v_res[0]; | |||||
| //find max | //find max | ||||
| vr = VFREDMAXVS_FLOAT(v0, v_zero, gvl); | |||||
| v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl); | |||||
| //update ssq before max_index | //update ssq before max_index | ||||
| ssq = ssq * (scale/vr[0])*(scale/vr[0]); | |||||
| ssq = ssq * (scale/v_res[0])*(scale/v_res[0]); | |||||
| //update scale | //update scale | ||||
| scale = vr[0]; | |||||
| scale = v_res[0]; | |||||
| //ssq in vector vr | //ssq in vector vr | ||||
| v0 = VFDIVVF_FLOAT(v0, scale, gvl); | v0 = VFDIVVF_FLOAT(v0, scale, gvl); | ||||
| vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl); | vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl); | ||||
| @@ -112,17 +121,17 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| j += gvl; | j += gvl; | ||||
| } | } | ||||
| //ssq in vector vr: vr[0] | //ssq in vector vr: vr[0] | ||||
| vr = VFREDSUM_FLOAT(vr, v_zero, gvl); | |||||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
| //total ssq now | //total ssq now | ||||
| ssq += vr[0]; | |||||
| ssq += v_res[0]; | |||||
| //tail | //tail | ||||
| if(j < n2){ | if(j < n2){ | ||||
| gvl = vsetvli(n2-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n2-j); | |||||
| v0 = VLEV_FLOAT(&x[j], gvl); | v0 = VLEV_FLOAT(&x[j], gvl); | ||||
| //fabs(vector) | //fabs(vector) | ||||
| mask = VMFLTVF_FLOAT(v0, 0, gvl); | mask = VMFLTVF_FLOAT(v0, 0, gvl); | ||||
| v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask, gvl); | |||||
| v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl); | |||||
| //if scale change | //if scale change | ||||
| mask = VMFGTVF_FLOAT(v0, scale, gvl); | mask = VMFGTVF_FLOAT(v0, scale, gvl); | ||||
| index = VMFIRSTM(mask, gvl); | index = VMFIRSTM(mask, gvl); | ||||
| @@ -131,21 +140,21 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| v0 = VFDIVVF_FLOAT(v0, scale, gvl); | v0 = VFDIVVF_FLOAT(v0, scale, gvl); | ||||
| }else{//found greater element | }else{//found greater element | ||||
| //find max | //find max | ||||
| vr = VFREDMAXVS_FLOAT(v0, v_zero, gvl); | |||||
| v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl); | |||||
| //update ssq before max_index | //update ssq before max_index | ||||
| ssq = ssq * (scale/vr[0])*(scale/vr[0]); | |||||
| ssq = ssq * (scale/v_res[0])*(scale/v_res[0]); | |||||
| //update scale | //update scale | ||||
| scale = vr[0]; | |||||
| scale = v_res[0]; | |||||
| v0 = VFDIVVF_FLOAT(v0, scale, gvl); | v0 = VFDIVVF_FLOAT(v0, scale, gvl); | ||||
| } | } | ||||
| vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl); | vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl); | ||||
| //ssq in vector vr: vr[0] | //ssq in vector vr: vr[0] | ||||
| vr = VFREDSUM_FLOAT(vr, v_zero, gvl); | |||||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
| //total ssq now | //total ssq now | ||||
| ssq += vr[0]; | |||||
| ssq += v_res[0]; | |||||
| } | } | ||||
| }else{ | }else{ | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| vr = VFMVVF_FLOAT(0, gvl); | vr = VFMVVF_FLOAT(0, gvl); | ||||
| v_zero = VFMVVF_FLOAT(0, gvl); | v_zero = VFMVVF_FLOAT(0, gvl); | ||||
| unsigned int stride_x = inc_x * sizeof(FLOAT) * 2; | unsigned int stride_x = inc_x * sizeof(FLOAT) * 2; | ||||
| @@ -154,7 +163,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl); | v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl); | ||||
| //fabs(vector) | //fabs(vector) | ||||
| mask = VMFLTVF_FLOAT(v0, 0, gvl); | mask = VMFLTVF_FLOAT(v0, 0, gvl); | ||||
| v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask, gvl); | |||||
| v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl); | |||||
| //if scale change | //if scale change | ||||
| mask = VMFGTVF_FLOAT(v0, scale, gvl); | mask = VMFGTVF_FLOAT(v0, scale, gvl); | ||||
| index = VMFIRSTM(mask, gvl); | index = VMFIRSTM(mask, gvl); | ||||
| @@ -165,15 +174,15 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| } | } | ||||
| }else{//found greater element | }else{//found greater element | ||||
| //ssq in vector vr: vr[0] | //ssq in vector vr: vr[0] | ||||
| vr = VFREDSUM_FLOAT(vr, v_zero, gvl); | |||||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
| //total ssq before current vector | //total ssq before current vector | ||||
| ssq += vr[0]; | |||||
| ssq += v_res[0]; | |||||
| //find max | //find max | ||||
| vr = VFREDMAXVS_FLOAT(v0, v_zero, gvl); | |||||
| v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl); | |||||
| //update ssq before max_index | //update ssq before max_index | ||||
| ssq = ssq * (scale/vr[0])*(scale/vr[0]); | |||||
| ssq = ssq * (scale/v_res[0])*(scale/v_res[0]); | |||||
| //update scale | //update scale | ||||
| scale = vr[0]; | |||||
| scale = v_res[0]; | |||||
| //ssq in vector vr | //ssq in vector vr | ||||
| v0 = VFDIVVF_FLOAT(v0, scale, gvl); | v0 = VFDIVVF_FLOAT(v0, scale, gvl); | ||||
| vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl); | vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl); | ||||
| @@ -182,7 +191,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| v0 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl); | v0 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl); | ||||
| //fabs(vector) | //fabs(vector) | ||||
| mask = VMFLTVF_FLOAT(v0, 0, gvl); | mask = VMFLTVF_FLOAT(v0, 0, gvl); | ||||
| v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask, gvl); | |||||
| v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl); | |||||
| //if scale change | //if scale change | ||||
| mask = VMFGTVF_FLOAT(v0, scale, gvl); | mask = VMFGTVF_FLOAT(v0, scale, gvl); | ||||
| index = VMFIRSTM(mask, gvl); | index = VMFIRSTM(mask, gvl); | ||||
| @@ -193,15 +202,15 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| } | } | ||||
| }else{//found greater element | }else{//found greater element | ||||
| //ssq in vector vr: vr[0] | //ssq in vector vr: vr[0] | ||||
| vr = VFREDSUM_FLOAT(vr, v_zero, gvl); | |||||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
| //total ssq before current vector | //total ssq before current vector | ||||
| ssq += vr[0]; | |||||
| ssq += v_res[0]; | |||||
| //find max | //find max | ||||
| vr = VFREDMAXVS_FLOAT(v0, v_zero, gvl); | |||||
| v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl); | |||||
| //update ssq before max_index | //update ssq before max_index | ||||
| ssq = ssq * (scale/vr[0])*(scale/vr[0]); | |||||
| ssq = ssq * (scale/v_res[0])*(scale/v_res[0]); | |||||
| //update scale | //update scale | ||||
| scale = vr[0]; | |||||
| scale = v_res[0]; | |||||
| //ssq in vector vr | //ssq in vector vr | ||||
| v0 = VFDIVVF_FLOAT(v0, scale, gvl); | v0 = VFDIVVF_FLOAT(v0, scale, gvl); | ||||
| vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl); | vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl); | ||||
| @@ -210,17 +219,17 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| idx += inc_v; | idx += inc_v; | ||||
| } | } | ||||
| //ssq in vector vr: vr[0] | //ssq in vector vr: vr[0] | ||||
| vr = VFREDSUM_FLOAT(vr, v_zero, gvl); | |||||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
| //total ssq now | //total ssq now | ||||
| ssq += vr[0]; | |||||
| ssq += v_res[0]; | |||||
| //tail | //tail | ||||
| if(j < n){ | if(j < n){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl); | v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl); | ||||
| //fabs(vector) | //fabs(vector) | ||||
| mask = VMFLTVF_FLOAT(v0, 0, gvl); | mask = VMFLTVF_FLOAT(v0, 0, gvl); | ||||
| v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask, gvl); | |||||
| v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl); | |||||
| //if scale change | //if scale change | ||||
| mask = VMFGTVF_FLOAT(v0, scale, gvl); | mask = VMFGTVF_FLOAT(v0, scale, gvl); | ||||
| index = VMFIRSTM(mask, gvl); | index = VMFIRSTM(mask, gvl); | ||||
| @@ -231,11 +240,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| } | } | ||||
| }else{//found greater element | }else{//found greater element | ||||
| //find max | //find max | ||||
| vr = VFREDMAXVS_FLOAT(v0, v_zero, gvl); | |||||
| v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl); | |||||
| //update ssq before max_index | //update ssq before max_index | ||||
| ssq = ssq * (scale/vr[0])*(scale/vr[0]); | |||||
| ssq = ssq * (scale/v_res[0])*(scale/v_res[0]); | |||||
| //update scale | //update scale | ||||
| scale = vr[0]; | |||||
| scale = v_res[0]; | |||||
| v0 = VFDIVVF_FLOAT(v0, scale, gvl); | v0 = VFDIVVF_FLOAT(v0, scale, gvl); | ||||
| vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl); | vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl); | ||||
| } | } | ||||
| @@ -243,7 +252,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| v0 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl); | v0 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl); | ||||
| //fabs(vector) | //fabs(vector) | ||||
| mask = VMFLTVF_FLOAT(v0, 0, gvl); | mask = VMFLTVF_FLOAT(v0, 0, gvl); | ||||
| v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask, gvl); | |||||
| v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl); | |||||
| //if scale change | //if scale change | ||||
| mask = VMFGTVF_FLOAT(v0, scale, gvl); | mask = VMFGTVF_FLOAT(v0, scale, gvl); | ||||
| index = VMFIRSTM(mask, gvl); | index = VMFIRSTM(mask, gvl); | ||||
| @@ -254,22 +263,22 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| } | } | ||||
| }else{//found greater element | }else{//found greater element | ||||
| //ssq in vector vr: vr[0] | //ssq in vector vr: vr[0] | ||||
| vr = VFREDSUM_FLOAT(vr, v_zero, gvl); | |||||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
| //total ssq before current vector | //total ssq before current vector | ||||
| ssq += vr[0]; | |||||
| ssq += v_res[0]; | |||||
| //find max | //find max | ||||
| vr = VFREDMAXVS_FLOAT(v0, v_zero, gvl); | |||||
| v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl); | |||||
| //update ssq before max_index | //update ssq before max_index | ||||
| ssq = ssq * (scale/vr[0])*(scale/vr[0]); | |||||
| ssq = ssq * (scale/v_res[0])*(scale/v_res[0]); | |||||
| //update scale | //update scale | ||||
| scale = vr[0]; | |||||
| scale = v_res[0]; | |||||
| v0 = VFDIVVF_FLOAT(v0, scale, gvl); | v0 = VFDIVVF_FLOAT(v0, scale, gvl); | ||||
| vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl); | vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl); | ||||
| } | } | ||||
| //ssq in vector vr: vr[0] | //ssq in vector vr: vr[0] | ||||
| vr = VFREDSUM_FLOAT(vr, v_zero, gvl); | |||||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
| //total ssq now | //total ssq now | ||||
| ssq += vr[0]; | |||||
| ssq += v_res[0]; | |||||
| } | } | ||||
| } | } | ||||
| return(scale * sqrt(ssq)); | return(scale * sqrt(ssq)); | ||||
| @@ -27,27 +27,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
| #define RVV_EFLOAT RVV_E32 | |||||
| #define RVV_M RVV_M4 | |||||
| #define FLOAT_V_T float32xm4_t | |||||
| #define VLEV_FLOAT vlev_float32xm4 | |||||
| #define VLSEV_FLOAT vlsev_float32xm4 | |||||
| #define VSEV_FLOAT vsev_float32xm4 | |||||
| #define VSSEV_FLOAT vssev_float32xm4 | |||||
| #define VFMACCVF_FLOAT vfmaccvf_float32xm4 | |||||
| #define VFMULVF_FLOAT vfmulvf_float32xm4 | |||||
| #define VFNMSACVF_FLOAT vfnmsacvf_float32xm4 | |||||
| #define VSETVL(n) vsetvl_e32m4(n) | |||||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||||
| #define FLOAT_V_T vfloat32m4_t | |||||
| #define VLEV_FLOAT vle_v_f32m4 | |||||
| #define VLSEV_FLOAT vlse_v_f32m4 | |||||
| #define VSEV_FLOAT vse_v_f32m4 | |||||
| #define VSSEV_FLOAT vsse_v_f32m4 | |||||
| #define VFMACCVF_FLOAT vfmacc_vf_f32m4 | |||||
| #define VFMULVF_FLOAT vfmul_vf_f32m4 | |||||
| #define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 | |||||
| #else | #else | ||||
| #define RVV_EFLOAT RVV_E64 | |||||
| #define RVV_M RVV_M4 | |||||
| #define FLOAT_V_T float64xm4_t | |||||
| #define VLEV_FLOAT vlev_float64xm4 | |||||
| #define VLSEV_FLOAT vlsev_float64xm4 | |||||
| #define VSEV_FLOAT vsev_float64xm4 | |||||
| #define VSSEV_FLOAT vssev_float64xm4 | |||||
| #define VFMACCVF_FLOAT vfmaccvf_float64xm4 | |||||
| #define VFMULVF_FLOAT vfmulvf_float64xm4 | |||||
| #define VFNMSACVF_FLOAT vfnmsacvf_float64xm4 | |||||
| #define VSETVL(n) vsetvl_e64m4(n) | |||||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||||
| #define FLOAT_V_T vfloat64m4_t | |||||
| #define VLEV_FLOAT vle_v_f64m4 | |||||
| #define VLSEV_FLOAT vlse_v_f64m4 | |||||
| #define VSEV_FLOAT vse_v_f64m4 | |||||
| #define VSSEV_FLOAT vsse_v_f64m4 | |||||
| #define VFMACCVF_FLOAT vfmacc_vf_f64m4 | |||||
| #define VFMULVF_FLOAT vfmul_vf_f64m4 | |||||
| #define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 | |||||
| #endif | #endif | ||||
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) | int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) | ||||
| @@ -59,7 +59,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT | |||||
| unsigned int gvl = 0; | unsigned int gvl = 0; | ||||
| FLOAT_V_T vt0, vt1, vx0, vx1, vy0, vy1; | FLOAT_V_T vt0, vt1, vx0, vx1, vy0, vy1; | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); | BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); | ||||
| BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); | BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); | ||||
| BLASLONG inc_xv = inc_x * 2 * gvl; | BLASLONG inc_xv = inc_x * 2 * gvl; | ||||
| @@ -90,7 +90,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT | |||||
| ix += 2*gvl; | ix += 2*gvl; | ||||
| } | } | ||||
| if(j < n){ | if(j < n){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| vx0 = VLEV_FLOAT(&x[ix], gvl); | vx0 = VLEV_FLOAT(&x[ix], gvl); | ||||
| vx1 = VLEV_FLOAT(&x[ix+gvl], gvl); | vx1 = VLEV_FLOAT(&x[ix+gvl], gvl); | ||||
| vy0 = VLEV_FLOAT(&y[ix], gvl); | vy0 = VLEV_FLOAT(&y[ix], gvl); | ||||
| @@ -137,7 +137,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT | |||||
| iy += inc_yv; | iy += inc_yv; | ||||
| } | } | ||||
| if(j < n){ | if(j < n){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | ||||
| vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | ||||
| vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); | vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); | ||||
| @@ -27,23 +27,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
| #define RVV_EFLOAT RVV_E32 | |||||
| #define RVV_M RVV_M4 | |||||
| #define FLOAT_V_T float32xm4_t | |||||
| #define VLSEV_FLOAT vlsev_float32xm4 | |||||
| #define VSSEV_FLOAT vssev_float32xm4 | |||||
| #define VFMACCVF_FLOAT vfmaccvf_float32xm4 | |||||
| #define VFMULVF_FLOAT vfmulvf_float32xm4 | |||||
| #define VFNMSACVF_FLOAT vfnmsacvf_float32xm4 | |||||
| #define VSETVL(n) vsetvl_e32m4(n) | |||||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||||
| #define FLOAT_V_T vfloat32m4_t | |||||
| #define VLSEV_FLOAT vlse_v_f32m4 | |||||
| #define VSSEV_FLOAT vsse_v_f32m4 | |||||
| #define VFMACCVF_FLOAT vfmacc_vf_f32m4 | |||||
| #define VFMULVF_FLOAT vfmul_vf_f32m4 | |||||
| #define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 | |||||
| #else | #else | ||||
| #define RVV_EFLOAT RVV_E64 | |||||
| #define RVV_M RVV_M4 | |||||
| #define FLOAT_V_T float64xm4_t | |||||
| #define VLSEV_FLOAT vlsev_float64xm4 | |||||
| #define VSSEV_FLOAT vssev_float64xm4 | |||||
| #define VFMACCVF_FLOAT vfmaccvf_float64xm4 | |||||
| #define VFMULVF_FLOAT vfmulvf_float64xm4 | |||||
| #define VFNMSACVF_FLOAT vfnmsacvf_float64xm4 | |||||
| #define VSETVL(n) vsetvl_e64m4(n) | |||||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||||
| #define FLOAT_V_T vfloat64m4_t | |||||
| #define VLSEV_FLOAT vlse_v_f64m4 | |||||
| #define VSSEV_FLOAT vsse_v_f64m4 | |||||
| #define VFMACCVF_FLOAT vfmacc_vf_f64m4 | |||||
| #define VFMULVF_FLOAT vfmul_vf_f64m4 | |||||
| #define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 | |||||
| #endif | #endif | ||||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | ||||
| @@ -60,7 +60,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F | |||||
| if(da_r == 0.0 && da_i == 0.0){ | if(da_r == 0.0 && da_i == 0.0){ | ||||
| memset(&x[0], 0, n * 2 * sizeof(FLOAT)); | memset(&x[0], 0, n * 2 * sizeof(FLOAT)); | ||||
| }else if(da_r == 0.0){ | }else if(da_r == 0.0){ | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); | BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); | ||||
| BLASLONG inc_xv = inc_x * 2 * gvl; | BLASLONG inc_xv = inc_x * 2 * gvl; | ||||
| for(i=0,j=0; i < n/gvl; i++){ | for(i=0,j=0; i < n/gvl; i++){ | ||||
| @@ -77,7 +77,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F | |||||
| ix += inc_xv; | ix += inc_xv; | ||||
| } | } | ||||
| if(j < n){ | if(j < n){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | ||||
| v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | ||||
| @@ -88,7 +88,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F | |||||
| VSSEV_FLOAT(&x[ix+1], stride_x, v1, gvl); | VSSEV_FLOAT(&x[ix+1], stride_x, v1, gvl); | ||||
| } | } | ||||
| }else if(da_i == 0.0){ | }else if(da_i == 0.0){ | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); | BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); | ||||
| BLASLONG inc_xv = inc_x * 2 * gvl; | BLASLONG inc_xv = inc_x * 2 * gvl; | ||||
| for(i=0,j=0; i < n/gvl; i++){ | for(i=0,j=0; i < n/gvl; i++){ | ||||
| @@ -105,7 +105,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F | |||||
| ix += inc_xv; | ix += inc_xv; | ||||
| } | } | ||||
| if(j < n){ | if(j < n){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | ||||
| v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | ||||
| @@ -116,7 +116,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F | |||||
| VSSEV_FLOAT(&x[ix+1], stride_x, v1, gvl); | VSSEV_FLOAT(&x[ix+1], stride_x, v1, gvl); | ||||
| } | } | ||||
| }else{ | }else{ | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); | BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); | ||||
| BLASLONG inc_xv = inc_x * 2 * gvl; | BLASLONG inc_xv = inc_x * 2 * gvl; | ||||
| for(i=0,j=0; i < n/gvl; i++){ | for(i=0,j=0; i < n/gvl; i++){ | ||||
| @@ -135,7 +135,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F | |||||
| ix += inc_xv; | ix += inc_xv; | ||||
| } | } | ||||
| if(j < n){ | if(j < n){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | ||||
| v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | ||||
| @@ -28,21 +28,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #include <stdio.h> | #include <stdio.h> | ||||
| #if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
| #define RVV_EFLOAT RVV_E32 | |||||
| #define RVV_M RVV_M8 | |||||
| #define FLOAT_V_T float32xm8_t | |||||
| #define VLEV_FLOAT vlev_float32xm8 | |||||
| #define VLSEV_FLOAT vlsev_float32xm8 | |||||
| #define VSEV_FLOAT vsev_float32xm8 | |||||
| #define VSSEV_FLOAT vssev_float32xm8 | |||||
| #define VSETVL(n) vsetvl_e32m8(n) | |||||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||||
| #define FLOAT_V_T vfloat32m8_t | |||||
| #define VLEV_FLOAT vle_v_f32m8 | |||||
| #define VLSEV_FLOAT vlse_v_f32m8 | |||||
| #define VSEV_FLOAT vse_v_f32m8 | |||||
| #define VSSEV_FLOAT vsse_v_f32m8 | |||||
| #else | #else | ||||
| #define RVV_EFLOAT RVV_E64 | |||||
| #define RVV_M RVV_M8 | |||||
| #define FLOAT_V_T float64xm8_t | |||||
| #define VLEV_FLOAT vlev_float64xm8 | |||||
| #define VLSEV_FLOAT vlsev_float64xm8 | |||||
| #define VSEV_FLOAT vsev_float64xm8 | |||||
| #define VSSEV_FLOAT vssev_float64xm8 | |||||
| #define VSETVL(n) vsetvl_e64m8(n) | |||||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||||
| #define FLOAT_V_T vfloat64m8_t | |||||
| #define VLEV_FLOAT vle_v_f64m8 | |||||
| #define VLSEV_FLOAT vlse_v_f64m8 | |||||
| #define VSEV_FLOAT vse_v_f64m8 | |||||
| #define VSSEV_FLOAT vsse_v_f64m8 | |||||
| #endif | #endif | ||||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | ||||
| @@ -55,7 +55,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm | |||||
| if (n < 0) return(0); | if (n < 0) return(0); | ||||
| if(inc_x == 1 && inc_y == 1){ | if(inc_x == 1 && inc_y == 1){ | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| BLASLONG n2 = n * 2; | BLASLONG n2 = n * 2; | ||||
| if(gvl <= n2/2){ | if(gvl <= n2/2){ | ||||
| for(i=0,j=0; i<n2/(2*gvl); i++){ | for(i=0,j=0; i<n2/(2*gvl); i++){ | ||||
| @@ -72,7 +72,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm | |||||
| } | } | ||||
| } | } | ||||
| for(;j<n2;){ | for(;j<n2;){ | ||||
| gvl = vsetvli(n2-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n2-j); | |||||
| vx0 = VLEV_FLOAT(&x[j], gvl); | vx0 = VLEV_FLOAT(&x[j], gvl); | ||||
| vy0 = VLEV_FLOAT(&y[j], gvl); | vy0 = VLEV_FLOAT(&y[j], gvl); | ||||
| VSEV_FLOAT(&x[j], vy0, gvl); | VSEV_FLOAT(&x[j], vy0, gvl); | ||||
| @@ -80,7 +80,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm | |||||
| j += gvl; | j += gvl; | ||||
| } | } | ||||
| }else{ | }else{ | ||||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n); | |||||
| stride_x = inc_x * 2 * sizeof(FLOAT); | stride_x = inc_x * 2 * sizeof(FLOAT); | ||||
| stride_y = inc_y * 2 * sizeof(FLOAT); | stride_y = inc_y * 2 * sizeof(FLOAT); | ||||
| BLASLONG inc_xv = inc_x * gvl * 2; | BLASLONG inc_xv = inc_x * gvl * 2; | ||||
| @@ -100,7 +100,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm | |||||
| iy += inc_yv; | iy += inc_yv; | ||||
| } | } | ||||
| if(j < n){ | if(j < n){ | ||||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||||
| gvl = VSETVL(n-j); | |||||
| vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | ||||
| vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | ||||
| vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); | vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); | ||||