* Fixed bugs in dgemm, [a]min/max, asum kernels
* Added zero checks for BLAS kernels
* Added dsdot implementation for RVV 0.7.1
* Fixed bugs in _vector files for C910V and RISCV64_ZVL256B targets
* Added additional definitions for the RISCV64_ZVL256B target
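
For context, dsdot is the BLAS routine that multiplies two single-precision vectors but accumulates the dot product in double precision. A minimal scalar sketch of those semantics (illustrative only, assuming positive increments; the actual RVV kernel is added below):

#include <stddef.h>

/* Scalar reference for dsdot semantics: float inputs, double accumulation. */
double dsdot_ref(size_t n, const float *x, size_t inc_x,
                 const float *y, size_t inc_y)
{
    double dot = 0.0;
    for (size_t i = 0; i < n; i++)
        dot += (double)x[i * inc_x] * (double)y[i * inc_y];
    return dot;
}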
@@ -59,6 +59,10 @@ ifeq ($(TARGET), x280)
 TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d
 endif
 
+ifeq ($(TARGET), RISCV64_ZVL256B)
+TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d
+endif
+
 ifeq ($(TARGET), RISCV64_GENERIC)
 TARGET_FLAGS = -march=rv64imafdc -mabi=lp64d
 endif
@@ -6,6 +6,10 @@ ifeq ($(CORE), x280)
 CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_zvl512b -mabi=lp64d -ffast-math
 FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static
 endif
+ifeq ($(CORE), RISCV64_ZVL256B)
+CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_zvl256b -mabi=lp64d
+FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static
+endif
 ifeq ($(CORE), RISCV64_GENERIC)
 CCOMMON_OPT += -march=rv64imafdc -mabi=lp64d
 FCOMMON_OPT += -march=rv64imafdc -mabi=lp64d -static
@@ -121,6 +121,7 @@ Z14
 RISCV64_GENERIC (e.g. PolarFire Soc/SiFive U54)
 C910V
 x280
+RISCV64_ZVL256B
 
 11.LOONGARCH64:
 LOONGSONGENERIC
@@ -1692,6 +1692,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #else
 #endif
 
+#ifdef FORCE_RISCV64_ZVL256B
+#define FORCE
+#define ARCHITECTURE    "RISCV64"
+#define SUBARCHITECTURE "RISCV64_ZVL256B"
+#define SUBDIRNAME      "riscv64"
+#define ARCHCONFIG   "-DRISCV64_ZVL256B " \
+       "-DL1_DATA_SIZE=64536 -DL1_DATA_LINESIZE=32 " \
+       "-DL2_SIZE=262144 -DL2_LINESIZE=32 " \
+       "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 "
+#define LIBNAME   "riscv64_zvl256b"
+#define CORENAME  "RISCV64_ZVL256B"
+#else
+#endif
+
 #if defined(FORCE_E2K) || defined(__e2k__)
 #define FORCE
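
With the getarch entry and Makefile flags above in place, the new target should be selectable the usual OpenBLAS way when cross-compiling; the toolchain names here are an example, not something this change mandates:

make HOSTCC=gcc TARGET=RISCV64_ZVL256B CC=riscv64-unknown-linux-gnu-gcc FC=riscv64-unknown-linux-gnu-gfortran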
@@ -59,6 +59,7 @@ SDOTKERNEL = dot_vector.c
 DDOTKERNEL  = dot_vector.c
 CDOTKERNEL  = zdot_vector.c
 ZDOTKERNEL  = zdot_vector.c
+DSDOTKERNEL = dsdot_vector.c
 
 SNRM2KERNEL = nrm2_vector.c
 DNRM2KERNEL = nrm2_vector.c
@@ -31,15 +31,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 # define LMUL m2
 # if defined(DOUBLE)
 #  define ELEN 64
+#  define ABS fabs
 # else
 #  define ELEN 32
+#  define ABS fabsf
 # endif
 #else
 # define LMUL m8
 # if defined(DOUBLE)
 #  define ELEN 64
+#  define ABS fabs
 # else
 #  define ELEN 32
+#  define ABS fabsf
 # endif
 #endif
@@ -69,7 +73,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
     FLOAT minf=0.0;
 
     if (n <= 0 || inc_x <= 0) return(minf);
-    minf = *x;
+    minf = ABS(*x);
     x += inc_x;
     --n;
     if (n == 0) return(minf);
@@ -67,7 +67,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
     BLASLONG i=0, j=0;
-    BLASLONG ix=0;
     FLOAT asumf=0.0;
     if (n <= 0 || inc_x <= 0) return(asumf);
     unsigned int gvl = 0;
@@ -103,17 +102,15 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
     unsigned int stride_x = inc_x * sizeof(FLOAT);
     if(gvl <= n/2){
         v_sum = VFMVVF_FLOAT(0, gvl);
-        BLASLONG inc_xv = inc_x * gvl;
         for(i=0,j=0; i<n/(gvl*2); i++){
-            v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+            v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
             v0 = VFABS_FLOAT(v0, gvl);
             v_sum = VFADDVV_FLOAT(v_sum, v0, gvl);
-            v1 = VLSEV_FLOAT(&x[ix+inc_xv], stride_x, gvl);
+            v1 = VLSEV_FLOAT(&x[(j+gvl)*inc_x], stride_x, gvl);
             v1 = VFABS_FLOAT(v1, gvl);
             v_sum = VFADDVV_FLOAT(v_sum, v1, gvl);
             j += gvl * 2;
-            inc_xv += inc_xv * 2;
         }
         v_res = VFREDSUMVS_FLOAT(v_sum, v_res, gvl);
     }
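
The asum change above fixes two problems at once: the removed ix offset was never advanced inside the loop (every iteration reloaded from element 0), and inc_xv += inc_xv * 2 grew geometrically (tripling each pass) instead of stepping by a constant. Deriving the address from the loop counter avoids both. A hypothetical scalar analogue of the corrected traversal:

#include <math.h>

/* Element k of a strided vector lives at x[k * inc_x]; computing the
 * address from the loop counter j cannot drift, unlike a separately
 * maintained offset. Names here are illustrative, not from the kernel. */
static double asum_strided_ref(long n, const double *x, long inc_x)
{
    double sum = 0.0;
    for (long j = 0; j < n; j++)
        sum += fabs(x[j * inc_x]);
    return sum;
}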
@@ -60,7 +60,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y)
 {
-    if (n < 0) return(0);
+    if (n <= 0) return(0);
 
     BLASLONG i=0, j=0;
     unsigned int gvl = 0;
@@ -196,7 +196,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
     asm volatile(
         "vsetvli zero, zero, e64,m1 \n\t"
-        "fmv.w.x ft11, zero \n\t"
+        "fmv.d.x ft11, zero \n\t"
         "mv t0, %[BK] \n\t"
         "vfmv.v.f v16, ft11 \n\t"
@@ -0,0 +1,152 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+{
+    BLASLONG i=0, j=0;
+    double dot = 0.0 ;
+
+    if ( n < 1 ) return(dot);
+
+    vfloat64m4_t vr;
+    vfloat32m2_t vx, vy;
+    unsigned int gvl = 0;
+    vfloat64m1_t v_res, v_z0;
+
+    gvl = vsetvlmax_e64m1();
+    v_res = vfmv_v_f_f64m1(0, gvl);
+    v_z0 = vfmv_v_f_f64m1(0, gvl);
+
+    if(inc_x == 1 && inc_y == 1){
+        gvl = vsetvl_e64m4(n);
+        vr = vfmv_v_f_f64m4(0, gvl);
+        for(i=0,j=0; i<n/gvl; i++){
+            vx = vle32_v_f32m2(&x[j], gvl);
+            vy = vle32_v_f32m2(&y[j], gvl);
+            vr = vfwmacc_vv_f64m4(vr, vx, vy, gvl);
+            j += gvl;
+        }
+        if(j > 0){
+            v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl);
+            dot += (double)vfmv_f_s_f64m1_f64(v_res);
+        }
+        //tail
+        if(j < n){
+            gvl = vsetvl_e64m4(n-j);
+            vx = vle32_v_f32m2(&x[j], gvl);
+            vy = vle32_v_f32m2(&y[j], gvl);
+            vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl);
+            //vr = vfdot_vv_f32m2(vx, vy, gvl);
+            vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl);
+            v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl);
+            dot += (double)vfmv_f_s_f64m1_f64(v_res);
+        }
+    }else if(inc_y == 1){
+        gvl = vsetvl_e64m4(n);
+        vr = vfmv_v_f_f64m4(0, gvl);
+        int stride_x = inc_x * sizeof(FLOAT);
+        for(i=0,j=0; i<n/gvl; i++){
+            vx = vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl);
+            vy = vle32_v_f32m2(&y[j], gvl);
+            vr = vfwmacc_vv_f64m4(vr, vx, vy, gvl);
+            j += gvl;
+        }
+        if(j > 0){
+            v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl);
+            dot += (double)vfmv_f_s_f64m1_f64(v_res);
+        }
+        //tail
+        if(j < n){
+            gvl = vsetvl_e64m4(n-j);
+            vx = vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl);
+            vy = vle32_v_f32m2(&y[j], gvl);
+            vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl);
+            //vr = vfdot_vv_f32m2(vx, vy, gvl);
+            vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl);
+            v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl);
+            dot += (double)vfmv_f_s_f64m1_f64(v_res);
+        }
+    }else if(inc_x == 1){
+        gvl = vsetvl_e64m4(n);
+        vr = vfmv_v_f_f64m4(0, gvl);
+        int stride_y = inc_y * sizeof(FLOAT);
+        for(i=0,j=0; i<n/gvl; i++){
+            vx = vle32_v_f32m2(&x[j], gvl);
+            vy = vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl);
+            vr = vfwmacc_vv_f64m4(vr, vx, vy, gvl);
+            j += gvl;
+        }
+        if(j > 0){
+            v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl);
+            dot += (double)vfmv_f_s_f64m1_f64(v_res);
+        }
+        //tail
+        if(j < n){
+            gvl = vsetvl_e64m4(n-j);
+            vx = vle32_v_f32m2(&x[j], gvl);
+            vy = vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl);
+            vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl);
+            //vr = vfdot_vv_f32m2(vx, vy, gvl);
+            vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl);
+            v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl);
+            dot += (double)vfmv_f_s_f64m1_f64(v_res);
+        }
+    }else{
+        gvl = vsetvl_e64m4(n);
+        vr = vfmv_v_f_f64m4(0, gvl);
+        int stride_x = inc_x * sizeof(FLOAT);
+        int stride_y = inc_y * sizeof(FLOAT);
+        for(i=0,j=0; i<n/gvl; i++){
+            vx = vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl);
+            vy = vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl);
+            vr = vfwmacc_vv_f64m4(vr, vx, vy, gvl);
+            j += gvl;
+        }
+        if(j > 0){
+            v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl);
+            dot += (double)vfmv_f_s_f64m1_f64(v_res);
+        }
+        //tail
+        if(j < n){
+            gvl = vsetvl_e64m4(n-j);
+            vx = vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl);
+            vy = vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl);
+            vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl);
+            //vr = vfdot_vv_f32m2(vx, vy, gvl);
+            vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl);
+            v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl);
+            dot += (double)vfmv_f_s_f64m1_f64(v_res);
+        }
+    }
+
+    return(dot);
+}
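
Assuming OpenBLAS is built with its usual CBLAS interface, the new kernel is reached through cblas_dsdot (or dsdot_ from Fortran). For example:

#include <stdio.h>
#include <cblas.h>

int main(void)
{
    float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    float y[4] = {4.0f, 3.0f, 2.0f, 1.0f};
    /* float inputs, double-precision accumulation: 4 + 6 + 6 + 4 = 20 */
    double d = cblas_dsdot(4, x, 1, y, 1);
    printf("dsdot = %f\n", d);
    return 0;
}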
@@ -139,7 +139,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
         v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl);
         FLOAT cur_minf = EXTRACT_FLOAT(v_res);
-        if(cur_minf > minf){
+        if(cur_minf < minf){
             //tail index
             v_min_index = VIDV_UINT(gvl);
             v_min_index = VADDVX_UINT(v_min_index, j, gvl);
@@ -185,7 +185,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
         v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl);
         FLOAT cur_minf = EXTRACT_FLOAT(v_res);
-        if(cur_minf > minf){
+        if(cur_minf < minf){
             //tail index
             v_min_index = VIDV_UINT(gvl);
             v_min_index = VADDVX_UINT(v_min_index, j, gvl);
@@ -156,7 +156,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
         v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl);
         FLOAT cur_minf = EXTRACT_FLOAT(v_res);
-        if(cur_minf > minf){
+        if(cur_minf < minf){
             //tail index
             v_min_index = VIDV_UINT(gvl);
             v_min_index = VADDVX_UINT(v_min_index, j, gvl);
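
The three hunks above flip an inverted comparison in the i?amin tail handling: the running minimum (and its tracked index) must be replaced when the block minimum is smaller, not larger. A hypothetical scalar analogue of the corrected step:

#include <stddef.h>

/* Illustrative only: update the running minimum and its index when the
 * candidate from the current block is smaller than the best so far. */
static void update_min(float cur_minf, size_t cur_index,
                       float *minf, size_t *min_index)
{
    if (cur_minf < *minf) {   /* previously tested '>' by mistake */
        *minf = cur_minf;
        *min_index = cur_index;
    }
}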
@@ -104,7 +104,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
     BLASLONG i=0;
-    if(n <= 0) return(0.0);
+    if (n <= 0 || inc_x <= 0) return(0.0);
 
     if(n == 1) return (ABS(x[0]));
     unsigned int gvl = 0;
@@ -61,7 +61,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
     BLASLONG i=0, j=0;
     double len = 0.0 ;
-    if ( n < 0 ) return(0.0);
+    if ( n <= 0 ) return(0.0);
 
     if(n == 1) return (ABS(x[0]));
     FLOAT_V_T vr, v0, v1;
@@ -67,7 +67,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
     BLASLONG stride_x, stride_y;
     FLOAT_V_T vx0, vx1, vy0, vy1;
-    if (n < 0) return(0);
+    if (n <= 0) return(0);
 
     unsigned int gvl = VSETVL((inc_x != 0 && inc_y != 0) ? n : 1);
     if( inc_x == 0 && inc_y == 0 ) { n = n & 1; }
@@ -60,17 +60,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL)
 #ifdef RISCV_0p10_INTRINSICS
 #define VFREDMAXVS_FLOAT(va,vb,gvl) JOIN(RISCV_RVV(vfredmax_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) (v_res, va, vb, gvl)
-#define VFRSUBVF_MASK_FLOAT(va,vb,c,gvl) JOIN(RISCV_RVV(vfrsub),_vf_f, ELEN, LMUL, _m) (va, vb, vb, c, gvl)
 #else
 #define VFREDMAXVS_FLOAT JOIN(RISCV_RVV(vfredmax_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))
-#define VFRSUBVF_MASK_FLOAT JOIN(RISCV_RVV(vfrsub),_vf_f, ELEN, LMUL, _m)
 #endif
 #define MASK_T JOIN(vbool, MLEN, _t, _, _)
-#define VMFLTVF_FLOAT JOIN(RISCV_RVV(vmflt_vf_f), ELEN, LMUL, _b, MLEN)
 #define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _)
 #define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _)
 #define VFMAXVV_FLOAT JOIN(RISCV_RVV(vfmax), _vv_f, ELEN, LMUL, _)
 #define VFADDVV_FLOAT JOIN(RISCV_RVV(vfadd), _vv_f, ELEN, LMUL, _)
+#define VFABSV_FLOAT JOIN(RISCV_RVV(vfabs), _v_f, ELEN, LMUL, _)
 
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
@@ -91,10 +89,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
         for(; i<n/gvl; i++){
             v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
             v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
-            mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-            v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, 0, gvl);
-            mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
-            v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, 0, gvl);
+            v0 = VFABSV_FLOAT(v0, gvl);
+            v1 = VFABSV_FLOAT(v1, gvl);
 
             v0 = VFADDVV_FLOAT(v0, v1, gvl);
             v_max = VFMAXVV_FLOAT(v_max, v0, gvl);
@@ -108,10 +105,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
             gvl = VSETVL(n-j);
             v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
             v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
-            mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-            v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, 0, gvl);
-            mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
-            v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, 0, gvl);
+            v0 = VFABSV_FLOAT(v0, gvl);
+            v1 = VFABSV_FLOAT(v1, gvl);
             v1 = VFADDVV_FLOAT(v0, v1, gvl);
             v_res = VFREDMAXVS_FLOAT(v1, v_res, gvl);
         }
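
In the zamax/zamin hunks here and below, the masked negate sequence (vmflt to build a mask, then a masked vfrsub to flip negative lanes) is replaced by a single vfabs per vector; both forms compute the |re| + |im| magnitude used for complex max/min, but the vfabs form is shorter and needs no mask registers. A hypothetical scalar equivalent of one zamax update step:

#include <math.h>

/* Illustrative only: BLAS complex amax/amin rank elements by
 * |re| + |im| (CABS1), not by sqrt(re^2 + im^2). */
static float zamax_step(float re, float im, float cur_max)
{
    float mag = fabsf(re) + fabsf(im);
    return mag > cur_max ? mag : cur_max;
}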
@@ -62,17 +62,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL)
 #ifdef RISCV_0p10_INTRINSICS
 #define VFREDMINVS_FLOAT(va,vb,gvl) JOIN(RISCV_RVV(vfredmin_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) (v_res, va, vb, gvl)
-#define VFRSUBVF_MASK_FLOAT(va,vb,c,gvl) JOIN(RISCV_RVV(vfrsub),_vf_f, ELEN, LMUL, _m) (va, vb, vb, c, gvl)
 #else
 #define VFREDMINVS_FLOAT JOIN(RISCV_RVV(vfredmin_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))
-#define VFRSUBVF_MASK_FLOAT JOIN(RISCV_RVV(vfrsub),_vf_f, ELEN, LMUL, _m)
 #endif
 #define MASK_T JOIN(vbool, MLEN, _t, _, _)
-#define VMFLTVF_FLOAT JOIN(RISCV_RVV(vmflt_vf_f), ELEN, LMUL, _b, MLEN)
 #define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _)
 #define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _)
 #define VFMINVV_FLOAT JOIN(RISCV_RVV(vfmin), _vv_f, ELEN, LMUL, _)
 #define VFADDVV_FLOAT JOIN(RISCV_RVV(vfadd), _vv_f, ELEN, LMUL, _)
+#define VFABSV_FLOAT JOIN(RISCV_RVV(vfabs), _v_f, ELEN, LMUL, _)
 
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
@@ -93,10 +91,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
         for(; i<n/gvl; i++){
             v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
             v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
-            mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-            v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, 0, gvl);
-            mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
-            v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, 0, gvl);
+            v0 = VFABSV_FLOAT(v0, gvl);
+            v1 = VFABSV_FLOAT(v1, gvl);
 
             v0 = VFADDVV_FLOAT(v0, v1, gvl);
             v_min = VFMINVV_FLOAT(v_min, v0, gvl);
@@ -110,10 +107,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
             gvl = VSETVL(n-j);
             v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
             v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
-            mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-            v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, 0, gvl);
-            mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
-            v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, 0, gvl);
+            v0 = VFABSV_FLOAT(v0, gvl);
+            v1 = VFABSV_FLOAT(v1, gvl);
             v1 = VFADDVV_FLOAT(v0, v1, gvl);
             v_res = VFREDMINVS_FLOAT(v1, v_res, gvl);
         }
@@ -96,7 +96,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
     BLASLONG i=0;
-    if(n < 0) return(0.0);
+    if (n <= 0 || inc_x <= 0) return(0.0);
 
     FLOAT_V_T v_ssq, v_scale, v0, v1, v_zero;
     unsigned int gvl = 0;
@@ -69,7 +69,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm
     unsigned int gvl = VSETVL((inc_x != 0 && inc_y != 0) ? n : 1);
     if( inc_x == 0 && inc_y == 0 ) { n = n & 1; }
-    if (n < 0) return(0);
+    if (n <= 0) return(0);
 
     if(inc_x == 1 && inc_y == 1){
         BLASLONG n2 = n * 2;
         if(gvl <= n2/2){