* Fixed bugs in dgemm, [a]min/max, asum kernels * Added zero checks for BLAS kernels * Added dsdot implementation for RVV 0.7.1 * Fixed bugs in _vector files for C910V and RISCV64_ZVL256B targets * Added additional definitions for RISCV64_ZVL256B target (tag: v0.3.27)
| @@ -59,6 +59,10 @@ ifeq ($(TARGET), x280) | |||
| TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d | |||
| endif | |||
| ifeq ($(TARGET), RISCV64_ZVL256B) | |||
| TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d | |||
| endif | |||
| ifeq ($(TARGET), RISCV64_GENERIC) | |||
| TARGET_FLAGS = -march=rv64imafdc -mabi=lp64d | |||
| endif | |||
| @@ -6,6 +6,10 @@ ifeq ($(CORE), x280) | |||
| CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_zvl512b -mabi=lp64d -ffast-math | |||
| FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static | |||
| endif | |||
| ifeq ($(CORE), RISCV64_ZVL256B) | |||
| CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_zvl256b -mabi=lp64d | |||
| FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static | |||
| endif | |||
| ifeq ($(CORE), RISCV64_GENERIC) | |||
| CCOMMON_OPT += -march=rv64imafdc -mabi=lp64d | |||
| FCOMMON_OPT += -march=rv64imafdc -mabi=lp64d -static | |||
| @@ -121,6 +121,7 @@ Z14 | |||
| RISCV64_GENERIC (e.g. PolarFire Soc/SiFive U54) | |||
| C910V | |||
| x280 | |||
| RISCV64_ZVL256B | |||
| 11.LOONGARCH64: | |||
| LOONGSONGENERIC | |||
| @@ -1692,6 +1692,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #else | |||
| #endif | |||
| #ifdef FORCE_RISCV64_ZVL256B | |||
| #define FORCE | |||
| #define ARCHITECTURE "RISCV64" | |||
| #define SUBARCHITECTURE "RISCV64_ZVL256B" | |||
| #define SUBDIRNAME "riscv64" | |||
| #define ARCHCONFIG "-DRISCV64_ZVL256B " \ | |||
| "-DL1_DATA_SIZE=64536 -DL1_DATA_LINESIZE=32 " \ | |||
| "-DL2_SIZE=262144 -DL2_LINESIZE=32 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " | |||
| #define LIBNAME "riscv64_zvl256b" | |||
| #define CORENAME "RISCV64_ZVL256B" | |||
| #else | |||
| #endif | |||
| #if defined(FORCE_E2K) || defined(__e2k__) | |||
| #define FORCE | |||
| @@ -59,6 +59,7 @@ SDOTKERNEL = dot_vector.c | |||
| DDOTKERNEL = dot_vector.c | |||
| CDOTKERNEL = zdot_vector.c | |||
| ZDOTKERNEL = zdot_vector.c | |||
| DSDOTKERNEL = dsdot_vector.c | |||
| SNRM2KERNEL = nrm2_vector.c | |||
| DNRM2KERNEL = nrm2_vector.c | |||
| @@ -31,15 +31,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| # define LMUL m2 | |||
| # if defined(DOUBLE) | |||
| # define ELEN 64 | |||
| # define ABS fabs | |||
| # else | |||
| # define ELEN 32 | |||
| # define ABS fabsf | |||
| # endif | |||
| #else | |||
| # define LMUL m8 | |||
| # if defined(DOUBLE) | |||
| # define ELEN 64 | |||
| # define ABS fabs | |||
| # else | |||
| # define ELEN 32 | |||
| # define ABS fabsf | |||
| # endif | |||
| #endif | |||
| @@ -69,7 +73,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| FLOAT minf=0.0; | |||
| if (n <= 0 || inc_x <= 0) return(minf); | |||
| minf = *x; | |||
| minf = ABS(*x); | |||
| x += inc_x; | |||
| --n; | |||
| if (n == 0) return(minf); | |||
| @@ -67,7 +67,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| BLASLONG i=0, j=0; | |||
| BLASLONG ix=0; | |||
| FLOAT asumf=0.0; | |||
| if (n <= 0 || inc_x <= 0) return(asumf); | |||
| unsigned int gvl = 0; | |||
| @@ -103,17 +102,15 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| unsigned int stride_x = inc_x * sizeof(FLOAT); | |||
| if(gvl <= n/2){ | |||
| v_sum = VFMVVF_FLOAT(0, gvl); | |||
| BLASLONG inc_xv = inc_x * gvl; | |||
| for(i=0,j=0; i<n/(gvl*2); i++){ | |||
| v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||
| v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | |||
| v0 = VFABS_FLOAT(v0, gvl); | |||
| v_sum = VFADDVV_FLOAT(v_sum, v0, gvl); | |||
| v1 = VLSEV_FLOAT(&x[ix+inc_xv], stride_x, gvl); | |||
| v1 = VLSEV_FLOAT(&x[(j+gvl)*inc_x], stride_x, gvl); | |||
| v1 = VFABS_FLOAT(v1, gvl); | |||
| v_sum = VFADDVV_FLOAT(v_sum, v1, gvl); | |||
| j += gvl * 2; | |||
| inc_xv += inc_xv * 2; | |||
| } | |||
| v_res = VFREDSUMVS_FLOAT(v_sum, v_res, gvl); | |||
| } | |||
| @@ -60,7 +60,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y) | |||
| { | |||
| if (n < 0) return(0); | |||
| if (n <= 0) return(0); | |||
| BLASLONG i=0, j=0; | |||
| unsigned int gvl = 0; | |||
| @@ -196,7 +196,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL | |||
| asm volatile( | |||
| "vsetvli zero, zero, e64,m1 \n\t" | |||
| "fmv.w.x ft11, zero \n\t" | |||
| "fmv.d.x ft11, zero \n\t" | |||
| "mv t0, %[BK] \n\t" | |||
| "vfmv.v.f v16, ft11 \n\t" | |||
| @@ -0,0 +1,152 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2023, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| { | |||
| BLASLONG i=0, j=0; | |||
| double dot = 0.0 ; | |||
| if ( n < 1 ) return(dot); | |||
| vfloat64m4_t vr; | |||
| vfloat32m2_t vx, vy; | |||
| unsigned int gvl = 0; | |||
| vfloat64m1_t v_res, v_z0; | |||
| gvl = vsetvlmax_e64m1(); | |||
| v_res = vfmv_v_f_f64m1(0, gvl); | |||
| v_z0 = vfmv_v_f_f64m1(0, gvl); | |||
| if(inc_x == 1 && inc_y == 1){ | |||
| gvl = vsetvl_e64m4(n); | |||
| vr = vfmv_v_f_f64m4(0, gvl); | |||
| for(i=0,j=0; i<n/gvl; i++){ | |||
| vx = vle32_v_f32m2(&x[j], gvl); | |||
| vy = vle32_v_f32m2(&y[j], gvl); | |||
| vr = vfwmacc_vv_f64m4(vr, vx, vy, gvl); | |||
| j += gvl; | |||
| } | |||
| if(j > 0){ | |||
| v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); | |||
| dot += (double)vfmv_f_s_f64m1_f64(v_res); | |||
| } | |||
| //tail | |||
| if(j < n){ | |||
| gvl = vsetvl_e64m4(n-j); | |||
| vx = vle32_v_f32m2(&x[j], gvl); | |||
| vy = vle32_v_f32m2(&y[j], gvl); | |||
| vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl); | |||
| //vr = vfdot_vv_f32m2(vx, vy, gvl); | |||
| vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl); | |||
| v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); | |||
| dot += (double)vfmv_f_s_f64m1_f64(v_res); | |||
| } | |||
| }else if(inc_y == 1){ | |||
| gvl = vsetvl_e64m4(n); | |||
| vr = vfmv_v_f_f64m4(0, gvl); | |||
| int stride_x = inc_x * sizeof(FLOAT); | |||
| for(i=0,j=0; i<n/gvl; i++){ | |||
| vx = vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl); | |||
| vy = vle32_v_f32m2(&y[j], gvl); | |||
| vr = vfwmacc_vv_f64m4(vr, vx, vy, gvl); | |||
| j += gvl; | |||
| } | |||
| if(j > 0){ | |||
| v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); | |||
| dot += (double)vfmv_f_s_f64m1_f64(v_res); | |||
| } | |||
| //tail | |||
| if(j < n){ | |||
| gvl = vsetvl_e64m4(n-j); | |||
| vx = vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl); | |||
| vy = vle32_v_f32m2(&y[j], gvl); | |||
| vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl); | |||
| //vr = vfdot_vv_f32m2(vx, vy, gvl); | |||
| vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl); | |||
| v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); | |||
| dot += (double)vfmv_f_s_f64m1_f64(v_res); | |||
| } | |||
| }else if(inc_x == 1){ | |||
| gvl = vsetvl_e64m4(n); | |||
| vr = vfmv_v_f_f64m4(0, gvl); | |||
| int stride_y = inc_y * sizeof(FLOAT); | |||
| for(i=0,j=0; i<n/gvl; i++){ | |||
| vx = vle32_v_f32m2(&x[j], gvl); | |||
| vy = vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl); | |||
| vr = vfwmacc_vv_f64m4(vr, vx, vy, gvl); | |||
| j += gvl; | |||
| } | |||
| if(j > 0){ | |||
| v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); | |||
| dot += (double)vfmv_f_s_f64m1_f64(v_res); | |||
| } | |||
| //tail | |||
| if(j < n){ | |||
| gvl = vsetvl_e64m4(n-j); | |||
| vx = vle32_v_f32m2(&x[j], gvl); | |||
| vy = vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl); | |||
| vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl); | |||
| //vr = vfdot_vv_f32m2(vx, vy, gvl); | |||
| vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl); | |||
| v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); | |||
| dot += (double)vfmv_f_s_f64m1_f64(v_res); | |||
| } | |||
| }else{ | |||
| gvl = vsetvl_e64m4(n); | |||
| vr = vfmv_v_f_f64m4(0, gvl); | |||
| int stride_x = inc_x * sizeof(FLOAT); | |||
| int stride_y = inc_y * sizeof(FLOAT); | |||
| for(i=0,j=0; i<n/gvl; i++){ | |||
| vx = vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl); | |||
| vy = vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl); | |||
| vr = vfwmacc_vv_f64m4(vr, vx, vy, gvl); | |||
| j += gvl; | |||
| } | |||
| if(j > 0){ | |||
| v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); | |||
| dot += (double)vfmv_f_s_f64m1_f64(v_res); | |||
| } | |||
| //tail | |||
| if(j < n){ | |||
| gvl = vsetvl_e64m4(n-j); | |||
| vx = vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl); | |||
| vy = vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl); | |||
| vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl); | |||
| //vr = vfdot_vv_f32m2(vx, vy, gvl); | |||
| vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl); | |||
| v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); | |||
| dot += (double)vfmv_f_s_f64m1_f64(v_res); | |||
| } | |||
| } | |||
| return(dot); | |||
| } | |||
| @@ -139,7 +139,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); | |||
| FLOAT cur_minf = EXTRACT_FLOAT(v_res); | |||
| if(cur_minf > minf){ | |||
| if(cur_minf < minf){ | |||
| //tail index | |||
| v_min_index = VIDV_UINT(gvl); | |||
| v_min_index = VADDVX_UINT(v_min_index, j, gvl); | |||
| @@ -185,7 +185,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); | |||
| FLOAT cur_minf = EXTRACT_FLOAT(v_res); | |||
| if(cur_minf > minf){ | |||
| if(cur_minf < minf){ | |||
| //tail index | |||
| v_min_index = VIDV_UINT(gvl); | |||
| v_min_index = VADDVX_UINT(v_min_index, j, gvl); | |||
| @@ -156,7 +156,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); | |||
| FLOAT cur_minf = EXTRACT_FLOAT(v_res); | |||
| if(cur_minf > minf){ | |||
| if(cur_minf < minf){ | |||
| //tail index | |||
| v_min_index = VIDV_UINT(gvl); | |||
| v_min_index = VADDVX_UINT(v_min_index, j, gvl); | |||
| @@ -104,7 +104,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| BLASLONG i=0; | |||
| if(n <= 0) return(0.0); | |||
| if (n <= 0 || inc_x <= 0) return(0.0); | |||
| if(n == 1) return (ABS(x[0])); | |||
| unsigned int gvl = 0; | |||
| @@ -61,7 +61,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| BLASLONG i=0, j=0; | |||
| double len = 0.0 ; | |||
| if ( n < 0 ) return(0.0); | |||
| if ( n <= 0 ) return(0.0); | |||
| if(n == 1) return (ABS(x[0])); | |||
| FLOAT_V_T vr, v0, v1; | |||
| @@ -67,7 +67,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, | |||
| BLASLONG stride_x, stride_y; | |||
| FLOAT_V_T vx0, vx1, vy0, vy1; | |||
| if (n < 0) return(0); | |||
| if (n <= 0) return(0); | |||
| unsigned int gvl = VSETVL((inc_x != 0 && inc_y != 0) ? n : 1); | |||
| if( inc_x == 0 && inc_y == 0 ) { n = n & 1; } | |||
| @@ -60,17 +60,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) | |||
| #ifdef RISCV_0p10_INTRINSICS | |||
| #define VFREDMAXVS_FLOAT(va,vb,gvl) JOIN(RISCV_RVV(vfredmax_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) (v_res, va, vb, gvl) | |||
| #define VFRSUBVF_MASK_FLOAT(va,vb,c,gvl) JOIN(RISCV_RVV(vfrsub),_vf_f, ELEN, LMUL, _m) (va, vb, vb, c, gvl) | |||
| #else | |||
| #define VFREDMAXVS_FLOAT JOIN(RISCV_RVV(vfredmax_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) | |||
| #define VFRSUBVF_MASK_FLOAT JOIN(RISCV_RVV(vfrsub),_vf_f, ELEN, LMUL, _m) | |||
| #endif | |||
| #define MASK_T JOIN(vbool, MLEN, _t, _, _) | |||
| #define VMFLTVF_FLOAT JOIN(RISCV_RVV(vmflt_vf_f), ELEN, LMUL, _b, MLEN) | |||
| #define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) | |||
| #define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) | |||
| #define VFMAXVV_FLOAT JOIN(RISCV_RVV(vfmax), _vv_f, ELEN, LMUL, _) | |||
| #define VFADDVV_FLOAT JOIN(RISCV_RVV(vfadd), _vv_f, ELEN, LMUL, _) | |||
| #define VFABSV_FLOAT JOIN(RISCV_RVV(vfabs), _v_f, ELEN, LMUL, _) | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| @@ -91,10 +89,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| for(; i<n/gvl; i++){ | |||
| v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||
| v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | |||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | |||
| v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, 0, gvl); | |||
| mask1 = VMFLTVF_FLOAT(v1, 0, gvl); | |||
| v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, 0, gvl); | |||
| v0 = VFABSV_FLOAT(v0, gvl); | |||
| v1 = VFABSV_FLOAT(v1, gvl); | |||
| v0 = VFADDVV_FLOAT(v0, v1, gvl); | |||
| v_max = VFMAXVV_FLOAT(v_max, v0, gvl); | |||
| @@ -108,10 +105,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| gvl = VSETVL(n-j); | |||
| v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||
| v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | |||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | |||
| v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, 0, gvl); | |||
| mask1 = VMFLTVF_FLOAT(v1, 0, gvl); | |||
| v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, 0, gvl); | |||
| v0 = VFABSV_FLOAT(v0, gvl); | |||
| v1 = VFABSV_FLOAT(v1, gvl); | |||
| v1 = VFADDVV_FLOAT(v0, v1, gvl); | |||
| v_res = VFREDMAXVS_FLOAT(v1, v_res, gvl); | |||
| } | |||
| @@ -62,17 +62,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) | |||
| #ifdef RISCV_0p10_INTRINSICS | |||
| #define VFREDMINVS_FLOAT(va,vb,gvl) JOIN(RISCV_RVV(vfredmin_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) (v_res, va, vb, gvl) | |||
| #define VFRSUBVF_MASK_FLOAT(va,vb,c,gvl) JOIN(RISCV_RVV(vfrsub),_vf_f, ELEN, LMUL, _m) (va, vb, vb, c, gvl) | |||
| #else | |||
| #define VFREDMINVS_FLOAT JOIN(RISCV_RVV(vfredmin_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) | |||
| #define VFRSUBVF_MASK_FLOAT JOIN(RISCV_RVV(vfrsub),_vf_f, ELEN, LMUL, _m) | |||
| #endif | |||
| #define MASK_T JOIN(vbool, MLEN, _t, _, _) | |||
| #define VMFLTVF_FLOAT JOIN(RISCV_RVV(vmflt_vf_f), ELEN, LMUL, _b, MLEN) | |||
| #define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) | |||
| #define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) | |||
| #define VFMINVV_FLOAT JOIN(RISCV_RVV(vfmin), _vv_f, ELEN, LMUL, _) | |||
| #define VFADDVV_FLOAT JOIN(RISCV_RVV(vfadd), _vv_f, ELEN, LMUL, _) | |||
| #define VFABSV_FLOAT JOIN(RISCV_RVV(vfabs), _v_f, ELEN, LMUL, _) | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| @@ -93,10 +91,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| for(; i<n/gvl; i++){ | |||
| v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||
| v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | |||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | |||
| v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, 0, gvl); | |||
| mask1 = VMFLTVF_FLOAT(v1, 0, gvl); | |||
| v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, 0, gvl); | |||
| v0 = VFABSV_FLOAT(v0, gvl); | |||
| v1 = VFABSV_FLOAT(v1, gvl); | |||
| v0 = VFADDVV_FLOAT(v0, v1, gvl); | |||
| v_min = VFMINVV_FLOAT(v_min, v0, gvl); | |||
| @@ -110,10 +107,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| gvl = VSETVL(n-j); | |||
| v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||
| v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | |||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | |||
| v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, 0, gvl); | |||
| mask1 = VMFLTVF_FLOAT(v1, 0, gvl); | |||
| v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, 0, gvl); | |||
| v0 = VFABSV_FLOAT(v0, gvl); | |||
| v1 = VFABSV_FLOAT(v1, gvl); | |||
| v1 = VFADDVV_FLOAT(v0, v1, gvl); | |||
| v_res = VFREDMINVS_FLOAT(v1, v_res, gvl); | |||
| } | |||
| @@ -96,7 +96,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| BLASLONG i=0; | |||
| if(n < 0) return(0.0); | |||
| if (n <= 0 || inc_x <= 0) return(0.0); | |||
| FLOAT_V_T v_ssq, v_scale, v0, v1, v_zero; | |||
| unsigned int gvl = 0; | |||
| @@ -69,7 +69,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm | |||
| unsigned int gvl = VSETVL((inc_x != 0 && inc_y != 0) ? n : 1); | |||
| if( inc_x == 0 && inc_y == 0 ) { n = n & 1; } | |||
| if (n < 0) return(0); | |||
| if (n <= 0) return(0); | |||
| if(inc_x == 1 && inc_y == 1){ | |||
| BLASLONG n2 = n * 2; | |||
| if(gvl <= n2/2){ | |||