* fix multiple numerical stability and corner case issues
* add a script to generate arbitrary gemm kernel shapes
* add a generic zvl256b target to demonstrate large gemm kernel unrolls
@@ -91,12 +91,15 @@ static inline int blas_quickdivide(blasint x, blasint y){
 #define BUFFER_SIZE     ( 32 << 20)
 #define SEEK_ADDRESS
-#if defined(C910V)
-#include <riscv_vector.h>
-#endif
-#if defined(x280)
-#include <riscv_vector.h>
+#if defined(C910V) || defined(RISCV64_ZVL256B) || defined(__riscv_v)
+# include <riscv_vector.h>
+# if !defined(DOUBLE)
+#  define EXTRACT_FLOAT(v) __riscv_vfmv_f_s_f32m1_f32(v)
+# else
+#  define EXTRACT_FLOAT(v) __riscv_vfmv_f_s_f64m1_f64(v)
+# endif
+#else
+# define EXTRACT_FLOAT(v) (v[0])
 #endif
 #endif
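The EXTRACT_FLOAT macro added here replaces the *((FLOAT*)&v_res) pointer cast that the old kernels below used to pull a reduction result out of a vector register; __riscv_vfmv_f_s_* is the intrinsic provided for exactly that move, and the (v[0]) fallback keeps non-RVV builds working. A minimal standalone sketch of the new idiom, assuming a toolchain with the v1.0 (__riscv_-prefixed) RVV intrinsics; the sum_abs helper is illustrative, not part of the patch, and handles only n up to one vector for brevity:

    #include <riscv_vector.h>

    /* Illustrative only: reduce |x[i]| and extract the scalar the way the
       patched kernels now do, instead of casting the vector's address. */
    static float sum_abs(const float *x, size_t n) {
        size_t vl = __riscv_vsetvl_e32m8(n);                /* vl = min(n, VLMAX) */
        vfloat32m8_t v = __riscv_vle32_v_f32m8(x, vl);      /* load               */
        v = __riscv_vfabs_v_f32m8(v, vl);                   /* |x|                */
        vfloat32m1_t acc = __riscv_vfmv_v_f_f32m1(0.0f, 1); /* scalar accumulator */
        acc = __riscv_vfredusum_vs_f32m8_f32m1(v, acc, vl); /* reduce             */
        return __riscv_vfmv_f_s_f32m1_f32(acc);             /* EXTRACT_FLOAT(acc) */
    }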
@@ -70,14 +70,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 /* or implied, of The University of Texas at Austin.                 */
 /*********************************************************************/
-#define CPU_GENERIC 0
-#define CPU_C910V   1
-#define CPU_x280    2
+#define CPU_GENERIC        0
+#define CPU_C910V          1
+#define CPU_RISCV64_ZVL256B 2
 static char *cpuname[] = {
   "RISCV64_GENERIC",
-  "C910V"
-  "x280"
+  "C910V",
+  "CPU_RISCV64_ZVL256B"
 };
 int detect(void){
@@ -0,0 +1,199 @@
+SAMAXKERNEL  = amax_vector.c
+DAMAXKERNEL  = amax_vector.c
+CAMAXKERNEL  = zamax_vector.c
+ZAMAXKERNEL  = zamax_vector.c
+SAMINKERNEL  = amin_vector.c
+DAMINKERNEL  = amin_vector.c
+CAMINKERNEL  = zamin_vector.c
+ZAMINKERNEL  = zamin_vector.c
+SMAXKERNEL   = max_vector.c
+DMAXKERNEL   = max_vector.c
+SMINKERNEL   = min_vector.c
+DMINKERNEL   = min_vector.c
+ISAMAXKERNEL = iamax_vector.c
+IDAMAXKERNEL = iamax_vector.c
+ICAMAXKERNEL = izamax_vector.c
+IZAMAXKERNEL = izamax_vector.c
+ISAMINKERNEL = iamin_vector.c
+IDAMINKERNEL = iamin_vector.c
+ICAMINKERNEL = izamin_vector.c
+IZAMINKERNEL = izamin_vector.c
+ISMAXKERNEL  = imax_vector.c
+IDMAXKERNEL  = imax_vector.c
+ISMINKERNEL  = imin_vector.c
+IDMINKERNEL  = imin_vector.c
+SASUMKERNEL  = asum_vector.c
+DASUMKERNEL  = asum_vector.c
+CASUMKERNEL  = zasum_vector.c
+ZASUMKERNEL  = zasum_vector.c
+SSUMKERNEL   = sum_vector.c
+DSUMKERNEL   = sum_vector.c
+CSUMKERNEL   = zsum_vector.c
+ZSUMKERNEL   = zsum_vector.c
+SAXPYKERNEL  = axpy_vector.c
+DAXPYKERNEL  = axpy_vector.c
+CAXPYKERNEL  = zaxpy_vector.c
+ZAXPYKERNEL  = zaxpy_vector.c
+SCOPYKERNEL  = copy_vector.c
+DCOPYKERNEL  = copy_vector.c
+CCOPYKERNEL  = zcopy_vector.c
+ZCOPYKERNEL  = zcopy_vector.c
+SDOTKERNEL   = dot_vector.c
+DDOTKERNEL   = dot_vector.c
+CDOTKERNEL   = zdot_vector.c
+ZDOTKERNEL   = zdot_vector.c
+DSDOTKERNEL  = ../generic/dot.c
+SNRM2KERNEL  = nrm2_vector.c
+DNRM2KERNEL  = nrm2_vector.c
+CNRM2KERNEL  = znrm2_vector.c
+ZNRM2KERNEL  = znrm2_vector.c
+SROTKERNEL   = rot_vector.c
+DROTKERNEL   = rot_vector.c
+CROTKERNEL   = zrot_vector.c
+ZROTKERNEL   = zrot_vector.c
+SSCALKERNEL  = scal_vector.c
+DSCALKERNEL  = scal_vector.c
+CSCALKERNEL  = zscal_vector.c
+ZSCALKERNEL  = zscal_vector.c
+SSWAPKERNEL  = swap_vector.c
+DSWAPKERNEL  = swap_vector.c
+CSWAPKERNEL  = zswap_vector.c
+ZSWAPKERNEL  = zswap_vector.c
+SGEMVNKERNEL = gemv_n_vector.c
+DGEMVNKERNEL = gemv_n_vector.c
+CGEMVNKERNEL = zgemv_n_vector.c
+ZGEMVNKERNEL = zgemv_n_vector.c
+SGEMVTKERNEL = gemv_t_vector.c
+DGEMVTKERNEL = gemv_t_vector.c
+CGEMVTKERNEL = zgemv_t_vector.c
+ZGEMVTKERNEL = zgemv_t_vector.c
+STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl256b.c
+DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl256b.c
+CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl256b.c
+ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl256b.c
+SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl256b.c
+SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
+SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
+SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
+SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
+ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
+SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
+SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
+SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
+SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
+endif
+DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl256b.c
+DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
+DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
+DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
+DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
+ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
+DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
+DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
+DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
+DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
+endif
+CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl256b.c
+CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
+CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
+CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
+CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
+ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
+CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
+CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
+CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
+CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
+endif
+ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl256b.c
+ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
+ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
+ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
+ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
+ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
+ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
+ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
+ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
+ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
+endif
+STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+SSYMV_U_KERNEL = symv_U_vector.c
+SSYMV_L_KERNEL = symv_L_vector.c
+DSYMV_U_KERNEL = symv_U_vector.c
+DSYMV_L_KERNEL = symv_L_vector.c
+CSYMV_U_KERNEL = ../generic/zsymv_k.c
+CSYMV_L_KERNEL = ../generic/zsymv_k.c
+ZSYMV_U_KERNEL = ../generic/zsymv_k.c
+ZSYMV_L_KERNEL = ../generic/zsymv_k.c
+CHEMV_L_KERNEL = zhemv_LM_vector.c
+CHEMV_M_KERNEL = zhemv_LM_vector.c
+CHEMV_U_KERNEL = zhemv_UV_vector.c
+CHEMV_V_KERNEL = zhemv_UV_vector.c
+ZHEMV_L_KERNEL = zhemv_LM_vector.c
+ZHEMV_M_KERNEL = zhemv_LM_vector.c
+ZHEMV_U_KERNEL = zhemv_UV_vector.c
+ZHEMV_V_KERNEL = zhemv_UV_vector.c
+LSAME_KERNEL = ../generic/lsame.c
+SCABS_KERNEL = ../generic/cabs.c
+DCABS_KERNEL = ../generic/cabs.c
+QCABS_KERNEL = ../generic/cabs.c
+ifndef SGEMM_BETA
+SGEMM_BETA = ../generic/gemm_beta.c
+endif
+ifndef DGEMM_BETA
+DGEMM_BETA = ../generic/gemm_beta.c
+endif
+ifndef CGEMM_BETA
+CGEMM_BETA = ../generic/zgemm_beta.c
+endif
+ifndef ZGEMM_BETA
+ZGEMM_BETA = ../generic/zgemm_beta.c
+endif
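Note how the GEMM/TRMM kernel names above are assembled from the unroll factors that the target's param.h entry chooses (not shown in this excerpt): with DGEMM_UNROLL_M=8 and DGEMM_UNROLL_N=8 the DGEMMKERNEL line resolves to dgemm_kernel_8x8_zvl256b.c, which is exactly the autogenerated file added at the end of this patch. The ifneq blocks only pull in separate M-sided copy kernels when the M and N unroll factors differ, since the N-sided copies can be reused otherwise.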
@@ -28,36 +28,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 #include <math.h>
-#if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m8(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
-#define FLOAT_V_T vfloat32m8_t
-#define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT vle32_v_f32m8
-#define VLSEV_FLOAT vlse32_v_f32m8
-#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
-#define MASK_T vbool4_t
-#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
-#define VFMVVF_FLOAT vfmv_v_f_f32m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m
-#define VFMAXVV_FLOAT vfmax_vv_f32m8
+#ifdef RISCV64_ZVL256B
+# define LMUL m2
+# if defined(DOUBLE)
+# define ELEN 64
+# else
+# define ELEN 32
+# endif
 #else
-#define VSETVL(n) vsetvl_e64m8(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
-#define FLOAT_V_T vfloat64m8_t
-#define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT vle64_v_f64m8
-#define VLSEV_FLOAT vlse64_v_f64m8
-#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
-#define MASK_T vbool8_t
-#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
-#define VFMVVF_FLOAT vfmv_v_f_f64m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m
-#define VFMAXVV_FLOAT vfmax_vv_f64m8
+# define LMUL m8
+# if defined(DOUBLE)
+# define ELEN 64
+# else
+# define ELEN 32
+# endif
 #endif
+#define _
+#define JOIN2_X(x, y) x ## y
+#define JOIN2(x, y) JOIN2_X(x, y)
+#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
+#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _)
+#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _)
+#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _)
+#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL)
+#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL)
+#define VFREDMAXVS_FLOAT JOIN(__riscv_vfredmax_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1))
+#define VFABS_FLOAT JOIN(__riscv_vfabs, _v_f, ELEN, LMUL, _)
+#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _)
+#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _)
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
     BLASLONG i=0, j=0;
@@ -65,103 +66,28 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
     FLOAT maxf=0.0;
     if (n <= 0 || inc_x <= 0) return(maxf);
     unsigned int gvl = 0;
-    FLOAT_V_T v0, v1, v_max;
-    FLOAT_V_T_M1 v_res, v_zero;
-    gvl = VSETVL_MAX;
-    v_res = VFMVVF_FLOAT_M1(0, gvl);
-    v_zero = VFMVVF_FLOAT_M1(0, gvl);
-    MASK_T mask0, mask1;
-    FLOAT zero = 0.0;
+    FLOAT_V_T v0, v1;
+    FLOAT_V_T_M1 v_res;
+    v_res = VFMVVF_FLOAT_M1(0, 1);
     if(inc_x == 1){
         gvl = VSETVL(n);
         if(gvl <= n/2){
-            v_max = VFMVVF_FLOAT(0, gvl);
            for(i=0,j=0; i<n/(gvl*2); i++){
                v0 = VLEV_FLOAT(&x[j], gvl);
                v1 = VLEV_FLOAT(&x[j+gvl], gvl);
-                mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-                //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl);
-#if defined(DOUBLE)
-                asm volatile(
-                    "vsetvli zero, zero, e8, m1\n\t"
-                    "vor.vv v0, %1, %1\n\t"
-                    "vsetvli x0, %3, e64,m8 \n\t"
-                    "vfrsub.vf %0, %0, %2, v0.t \n\t"
-                    :"+vd"(v0)
-                    :"vd"(mask0), "f"(zero), "r"(gvl)
-                    :"v0");
-#else
-                asm volatile(
-                    "vsetvli zero, zero, e8, m1\n\t"
-                    "vor.vv v0, %1, %1\n\t"
-                    "vsetvli x0, %3, e32,m8 \n\t"
-                    "vfrsub.vf %0, %0, %2, v0.t \n\t"
-                    :"+vd"(v0)
-                    :"vd"(mask0), "f"(zero), "r"(gvl)
-                    :"v0");
-#endif
-                v_max = VFMAXVV_FLOAT(v_max, v0, gvl);
-                v1 = VLEV_FLOAT(&x[j+gvl], gvl);
-                mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
-                //v1 = VFRSUBVF_MASK_FLOAT(v1, 0, mask1, gvl);
-#if defined(DOUBLE)
-                asm volatile(
-                    "vsetvli zero, zero, e8, m1\n\t"
-                    "vor.vv v0, %1, %1\n\t"
-                    "vsetvli x0, %3, e64,m8 \n\t"
-                    "vfrsub.vf %0, %0, %2, v0.t \n\t"
-                    :"+vd"(v1)
-                    :"vd"(mask1), "f"(zero), "r"(gvl)
-                    :"v0");
-#else
-                asm volatile(
-                    "vsetvli zero, zero, e8, m1\n\t"
-                    "vor.vv v0, %1, %1\n\t"
-                    "vsetvli x0, %3, e32,m8 \n\t"
-                    "vfrsub.vf %0, %0, %2, v0.t \n\t"
-                    :"+vd"(v1)
-                    :"vd"(mask1), "f"(zero), "r"(gvl)
-                    :"v0");
-#endif
-                v_max = VFMAXVV_FLOAT(v_max, v1, gvl);
+                v0 = VFABS_FLOAT(v0, gvl);
+                v1 = VFABS_FLOAT(v1, gvl);
+                v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl);
+                v_res = VFREDMAXVS_FLOAT(v1, v_res, gvl);
                j += gvl*2;
            }
-            v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_zero, gvl);
-            maxf = *((FLOAT*)&v_res);
-            //maxf = v_res[0];
        }
        for(;j<n;){
            gvl = VSETVL(n-j);
            v0 = VLEV_FLOAT(&x[j], gvl);
-            mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-            //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl);
-#if defined(DOUBLE)
-            asm volatile(
-                "vsetvli zero, zero, e8, m1\n\t"
-                "vor.vv v0, %1, %1\n\t"
-                "vsetvli x0, %3, e64,m8 \n\t"
-                "vfrsub.vf %0, %0, %2, v0.t \n\t"
-                :"+vd"(v0)
-                :"vd"(mask0), "f"(zero), "r"(gvl)
-                :"v0");
-#else
-            asm volatile(
-                "vsetvli zero, zero, e8, m1\n\t"
-                "vor.vv v0, %1, %1\n\t"
-                "vsetvli x0, %3, e32,m8 \n\t"
-                "vfrsub.vf %0, %0, %2, v0.t \n\t"
-                :"+vd"(v0)
-                :"vd"(mask0), "f"(zero), "r"(gvl)
-                :"v0");
-#endif
-            v_res = VFREDMAXVS_FLOAT(v_res, v0, v_zero, gvl);
-            if(*((FLOAT*)&v_res) > maxf)
-                maxf = *((FLOAT*)&v_res);
+            v0 = VFABS_FLOAT(v0, gvl);
+            v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl);
            j += gvl;
        }
    }else{
@@ -169,94 +95,27 @@ asm volatile(
        BLASLONG stride_x = inc_x * sizeof(FLOAT);
        if(gvl <= n/2){
            BLASLONG inc_xv = inc_x * gvl;
-            v_max = VFMVVF_FLOAT(0, gvl);
            for(i=0,j=0; i<n/(gvl*2); i++){
                v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
-                mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-                //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl);
-#if defined(DOUBLE)
-                asm volatile(
-                    "vsetvli zero, zero, e8, m1\n\t"
-                    "vor.vv v0, %1, %1\n\t"
-                    "vsetvli x0, %3, e64,m8 \n\t"
-                    "vfrsub.vf %0, %0, %2, v0.t \n\t"
-                    :"+vd"(v0)
-                    :"vd"(mask0), "f"(zero), "r"(gvl)
-                    :"v0");
-#else
-                asm volatile(
-                    "vsetvli zero, zero, e8, m1\n\t"
-                    "vor.vv v0, %1, %1\n\t"
-                    "vsetvli x0, %3, e32,m8 \n\t"
-                    "vfrsub.vf %0, %0, %2, v0.t \n\t"
-                    :"+vd"(v0)
-                    :"vd"(mask0), "f"(zero), "r"(gvl)
-                    :"v0");
-#endif
-                v_max = VFMAXVV_FLOAT(v_max, v0, gvl);
                v1 = VLSEV_FLOAT(&x[ix+inc_xv], stride_x, gvl);
-                mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
-                //v1 = VFRSUBVF_MASK_FLOAT(v1, 0, mask1, gvl);
-#if defined(DOUBLE)
-                asm volatile(
-                    "vsetvli zero, zero, e8, m1\n\t"
-                    "vor.vv v0, %1, %1\n\t"
-                    "vsetvli x0, %3, e64,m8 \n\t"
-                    "vfrsub.vf %0, %0, %2, v0.t \n\t"
-                    :"+vd"(v1)
-                    :"vd"(mask1), "f"(zero), "r"(gvl)
-                    :"v0");
-#else
-                asm volatile(
-                    "vsetvli zero, zero, e8, m1\n\t"
-                    "vor.vv v0, %1, %1\n\t"
-                    "vsetvli x0, %3, e32,m8 \n\t"
-                    "vfrsub.vf %0, %0, %2, v0.t \n\t"
-                    :"+vd"(v1)
-                    :"vd"(mask1), "f"(zero), "r"(gvl)
-                    :"v0");
-#endif
-                v_max = VFMAXVV_FLOAT(v_max, v1, gvl);
+                v0 = VFABS_FLOAT(v0, gvl);
+                v1 = VFABS_FLOAT(v1, gvl);
+                v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl);
+                v_res = VFREDMAXVS_FLOAT(v1, v_res, gvl);
                j += gvl*2;
                ix += inc_xv*2;
            }
-            v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_zero, gvl);
-            maxf = *((FLOAT*)&v_res);
        }
        for(;j<n;){
            gvl = VSETVL(n-j);
            v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
-            mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-            //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl);
-#if defined(DOUBLE)
-            asm volatile(
-                "vsetvli zero, zero, e8, m1\n\t"
-                "vor.vv v0, %1, %1\n\t"
-                "vsetvli x0, %3, e64,m8 \n\t"
-                "vfrsub.vf %0, %0, %2, v0.t \n\t"
-                :"+vd"(v0)
-                :"vd"(mask0), "f"(zero), "r"(gvl)
-                :"v0");
-#else
-            asm volatile(
-                "vsetvli zero, zero, e8, m1\n\t"
-                "vor.vv v0, %1, %1\n\t"
-                "vsetvli x0, %3, e32,m8 \n\t"
-                "vfrsub.vf %0, %0, %2, v0.t \n\t"
-                :"+vd"(v0)
-                :"vd"(mask0), "f"(zero), "r"(gvl)
-                :"v0");
-#endif
-            v_res = VFREDMAXVS_FLOAT(v_res, v0, v_zero, gvl);
-            if(*((FLOAT*)&v_res) > maxf)
-                maxf = *((FLOAT*)&v_res);
+            v0 = VFABS_FLOAT(v0, gvl);
+            v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl);
            j += gvl;
        }
    }
+    maxf = EXTRACT_FLOAT(v_res);
    return(maxf);
 }
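All of the rewritten vector kernels share the JOIN token-pasting scheme introduced above, so every intrinsic name is assembled once from the chosen ELEN and LMUL instead of being spelled out per precision. The odd-looking "#define _" makes the identifier _ expand to an empty token, letting five-slot joins leave a position blank. A worked expansion for single precision on the non-ZVL256B branch (ELEN=32, LMUL=m8):

    /* VLEV_FLOAT = JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL)
                  -> __riscv_vle32_v_f32m8
       FLOAT_V_T  = JOIN(vfloat, ELEN, LMUL, _t, _)
                  -> vfloat32m8_t   (the trailing _ pastes to nothing) */
    FLOAT_V_T v;              /* vfloat32m8_t v;                    */
    v = VLEV_FLOAT(x, gvl);   /* v = __riscv_vle32_v_f32m8(x, gvl); */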
@@ -26,232 +26,100 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 #include "common.h"
-#include <math.h>
-#include <float.h>
-#if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m8(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
-#define FLOAT_V_T vfloat32m8_t
-#define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT vle32_v_f32m8
-#define VLSEV_FLOAT vlse32_v_f32m8
-#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
-#define MASK_T vbool4_t
-#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
-#define VFMVVF_FLOAT vfmv_v_f_f32m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m
-#define VFMINVV_FLOAT vfmin_vv_f32m8
+#ifdef RISCV64_ZVL256B
+# define LMUL m2
+# if defined(DOUBLE)
+# define ELEN 64
+# else
+# define ELEN 32
+# endif
 #else
-#define VSETVL(n) vsetvl_e64m8(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
-#define FLOAT_V_T vfloat64m8_t
-#define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT vle64_v_f64m8
-#define VLSEV_FLOAT vlse64_v_f64m8
-#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
-#define MASK_T vbool8_t
-#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
-#define VFMVVF_FLOAT vfmv_v_f_f64m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m
-#define VFMINVV_FLOAT vfmin_vv_f64m8
+# define LMUL m8
+# if defined(DOUBLE)
+# define ELEN 64
+# else
+# define ELEN 32
+# endif
 #endif
+#define _
+#define JOIN2_X(x, y) x ## y
+#define JOIN2(x, y) JOIN2_X(x, y)
+#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
+#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _)
+#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _)
+#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _)
+#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL)
+#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL)
+#define VFREDMINVS_FLOAT JOIN(__riscv_vfredmin_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1))
+#define VFABS_FLOAT JOIN(__riscv_vfabs, _v_f, ELEN, LMUL, _)
+#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _)
+#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _)
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
-    BLASLONG i=0, j=0;
-    if (n <= 0 || inc_x <= 0) return(0.0);
-    FLOAT minf=FLT_MAX;
+    BLASLONG i=0, j=0;
+    BLASLONG ix=0;
+    FLOAT minf=0.0;
+    if (n <= 0 || inc_x <= 0) return(minf);
+    minf = *x;
+    x += inc_x;
+    --n;
+    if (n == 0) return(minf);
    unsigned int gvl = 0;
-    FLOAT_V_T v0, v1, v_min;
-    FLOAT_V_T_M1 v_res, v_max;
-    gvl = VSETVL_MAX;
-    v_res = VFMVVF_FLOAT_M1(0, gvl);
-    v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl);
-    MASK_T mask0, mask1;
-    FLOAT zero = 0.0;
+    FLOAT_V_T v0, v1;
+    FLOAT_V_T_M1 v_res;
+    v_res = VFMVVF_FLOAT_M1(minf, 1);
    if(inc_x == 1){
        gvl = VSETVL(n);
        if(gvl <= n/2){
-            v_min = VFMVVF_FLOAT(FLT_MAX, gvl);
            for(i=0,j=0; i<n/(gvl*2); i++){
                v0 = VLEV_FLOAT(&x[j], gvl);
-                mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-                //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl);
-#if defined(DOUBLE)
-                asm volatile(
-                    "vsetvli zero, zero, e8, m1\n\t"
-                    "vor.vv v0, %1, %1\n\t"
-                    "vsetvli x0, %3, e64,m8 \n\t"
-                    "vfrsub.vf %0, %0, %2, v0.t \n\t"
-                    :"+vd"(v0)
-                    :"vd"(mask0), "f"(zero), "r"(gvl)
-                    :"v0");
-#else
-                asm volatile(
-                    "vsetvli zero, zero, e8, m1\n\t"
-                    "vor.vv v0, %1, %1\n\t"
-                    "vsetvli x0, %3, e32,m8 \n\t"
-                    "vfrsub.vf %0, %0, %2, v0.t \n\t"
-                    :"+vd"(v0)
-                    :"vd"(mask0), "f"(zero), "r"(gvl)
-                    :"v0");
-#endif
-                v_min = VFMINVV_FLOAT(v_min, v0, gvl);
                v1 = VLEV_FLOAT(&x[j+gvl], gvl);
-                mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
-                //v1 = VFRSUBVF_MASK_FLOAT(v1, 0, mask1, gvl);
-#if defined(DOUBLE)
-                asm volatile(
-                    "vsetvli zero, zero, e8, m1\n\t"
-                    "vor.vv v0, %1, %1\n\t"
-                    "vsetvli x0, %3, e64,m8 \n\t"
-                    "vfrsub.vf %0, %0, %2, v0.t \n\t"
-                    :"+vd"(v1)
-                    :"vd"(mask1), "f"(zero), "r"(gvl)
-                    :"v0");
-#else
-                asm volatile(
-                    "vsetvli zero, zero, e8, m1\n\t"
-                    "vor.vv v0, %1, %1\n\t"
-                    "vsetvli x0, %3, e32,m8 \n\t"
-                    "vfrsub.vf %0, %0, %2, v0.t \n\t"
-                    :"+vd"(v1)
-                    :"vd"(mask1), "f"(zero), "r"(gvl)
-                    :"v0");
-#endif
-                v_min = VFMINVV_FLOAT(v_min, v1, gvl);
+                v0 = VFABS_FLOAT(v0, gvl);
+                v1 = VFABS_FLOAT(v1, gvl);
+                v_res = VFREDMINVS_FLOAT(v0, v_res, gvl);
+                v_res = VFREDMINVS_FLOAT(v1, v_res, gvl);
                j += gvl*2;
            }
-            v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
-            minf = *((FLOAT*)&v_res);
        }
        for(;j<n;){
            gvl = VSETVL(n-j);
            v0 = VLEV_FLOAT(&x[j], gvl);
-            mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-            //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl);
-#if defined(DOUBLE)
-            asm volatile(
-                "vsetvli zero, zero, e8, m1\n\t"
-                "vor.vv v0, %1, %1\n\t"
-                "vsetvli x0, %3, e64,m8 \n\t"
-                "vfrsub.vf %0, %0, %2, v0.t \n\t"
-                :"+vd"(v0)
-                :"vd"(mask0), "f"(zero), "r"(gvl)
-                :"v0");
-#else
-            asm volatile(
-                "vsetvli zero, zero, e8, m1\n\t"
-                "vor.vv v0, %1, %1\n\t"
-                "vsetvli x0, %3, e32,m8 \n\t"
-                "vfrsub.vf %0, %0, %2, v0.t \n\t"
-                :"+vd"(v0)
-                :"vd"(mask0), "f"(zero), "r"(gvl)
-                :"v0");
-#endif
-            v_res = VFREDMINVS_FLOAT(v_res, v0, v_max, gvl);
-            if(*((FLOAT*)&v_res) < minf)
-                minf = *((FLOAT*)&v_res);
+            v0 = VFABS_FLOAT(v0, gvl);
+            v_res = VFREDMINVS_FLOAT(v0, v_res, gvl);
            j += gvl;
        }
    }else{
        gvl = VSETVL(n);
        BLASLONG stride_x = inc_x * sizeof(FLOAT);
        if(gvl <= n/2){
-            BLASLONG idx = 0, inc_xv = inc_x * gvl;
-            v_min = VFMVVF_FLOAT(FLT_MAX, gvl);
+            BLASLONG inc_xv = inc_x * gvl;
            for(i=0,j=0; i<n/(gvl*2); i++){
-                v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl);
-                mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-                //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl);
-#if defined(DOUBLE)
-                asm volatile(
-                    "vsetvli zero, zero, e8, m1\n\t"
-                    "vor.vv v0, %1, %1\n\t"
-                    "vsetvli x0, %3, e64,m8 \n\t"
-                    "vfrsub.vf %0, %0, %2, v0.t \n\t"
-                    :"+vd"(v0)
-                    :"vd"(mask0), "f"(zero), "r"(gvl)
-                    :"v0");
-#else
-                asm volatile(
-                    "vsetvli zero, zero, e8, m1\n\t"
-                    "vor.vv v0, %1, %1\n\t"
-                    "vsetvli x0, %3, e32,m8 \n\t"
-                    "vfrsub.vf %0, %0, %2, v0.t \n\t"
-                    :"+vd"(v0)
-                    :"vd"(mask0), "f"(zero), "r"(gvl)
-                    :"v0");
-#endif
-                v_min = VFMINVV_FLOAT(v_min, v0, gvl);
-                v1 = VLSEV_FLOAT(&x[idx+inc_xv], stride_x, gvl);
-                mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
-                //v1 = VFRSUBVF_MASK_FLOAT(v1, 0, mask1, gvl);
-#if defined(DOUBLE)
-                asm volatile(
-                    "vsetvli zero, zero, e8, m1\n\t"
-                    "vor.vv v0, %1, %1\n\t"
-                    "vsetvli x0, %3, e64,m8 \n\t"
-                    "vfrsub.vf %0, %0, %2, v0.t \n\t"
-                    :"+vd"(v1)
-                    :"vd"(mask1), "f"(zero), "r"(gvl)
-                    :"v0");
-#else
-                asm volatile(
-                    "vsetvli zero, zero, e8, m1\n\t"
-                    "vor.vv v0, %1, %1\n\t"
-                    "vsetvli x0, %3, e32,m8 \n\t"
-                    "vfrsub.vf %0, %0, %2, v0.t \n\t"
-                    :"+vd"(v1)
-                    :"vd"(mask1), "f"(zero), "r"(gvl)
-                    :"v0");
-#endif
-                v_min = VFMINVV_FLOAT(v_min, v1, gvl);
+                v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                v1 = VLSEV_FLOAT(&x[ix+inc_xv], stride_x, gvl);
+                v0 = VFABS_FLOAT(v0, gvl);
+                v1 = VFABS_FLOAT(v1, gvl);
+                v_res = VFREDMINVS_FLOAT(v0, v_res, gvl);
+                v_res = VFREDMINVS_FLOAT(v1, v_res, gvl);
                j += gvl*2;
-                idx += inc_xv*2;
+                ix += inc_xv*2;
            }
-            v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
-            minf = *((FLOAT*)&v_res);
        }
        for(;j<n;){
            gvl = VSETVL(n-j);
            v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
-            mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-            //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl);
-#if defined(DOUBLE)
-            asm volatile(
-                "vsetvli zero, zero, e8, m1\n\t"
-                "vor.vv v0, %1, %1\n\t"
-                "vsetvli x0, %3, e64,m8 \n\t"
-                "vfrsub.vf %0, %0, %2, v0.t \n\t"
-                :"+vd"(v0)
-                :"vd"(mask0), "f"(zero), "r"(gvl)
-                :"v0");
-#else
-            asm volatile(
-                "vsetvli zero, zero, e8, m1\n\t"
-                "vor.vv v0, %1, %1\n\t"
-                "vsetvli x0, %3, e32,m8 \n\t"
-                "vfrsub.vf %0, %0, %2, v0.t \n\t"
-                :"+vd"(v0)
-                :"vd"(mask0), "f"(zero), "r"(gvl)
-                :"v0");
-#endif
-            v_res = VFREDMINVS_FLOAT(v_res, v0, v_max, gvl);
-            if(*((FLOAT*)&v_res) < minf)
-                minf = *((FLOAT*)&v_res);
+            v0 = VFABS_FLOAT(v0, gvl);
+            v_res = VFREDMINVS_FLOAT(v0, v_res, gvl);
            j += gvl;
        }
    }
-    return(minf);
-}
+    minf = EXTRACT_FLOAT(v_res);
+    return(minf);
+}
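Besides switching to vfabs, the rewritten amin fixes a corner case: instead of starting the reduction at FLT_MAX, it consumes the first element up front (returning immediately when n is 1) and seeds the one-element accumulator with it using vl=1 rather than vsetvlmax. The v1.0 reduction intrinsics also drop the separate destination operand: the old vfredmin_vs_...(dest, vec, scalar, vl) becomes __riscv_vfredmin_vs_...(vec, scalar, vl), returning the reduced value. A standalone sketch of the new accumulation shape, assuming the v1.0 intrinsics; min_abs is illustrative only and seeds with |x[0]| explicitly:

    #include <riscv_vector.h>

    /* Illustrative only: min |x[i]| for contiguous x, n >= 1. */
    static double min_abs(const double *x, size_t n) {
        double seed = x[0] < 0 ? -x[0] : x[0];
        vfloat64m1_t v_res = __riscv_vfmv_v_f_f64m1(seed, 1);  /* seed, vl=1 */
        for (size_t j = 0; j < n; ) {
            size_t gvl = __riscv_vsetvl_e64m8(n - j);
            vfloat64m8_t v0 = __riscv_vle64_v_f64m8(&x[j], gvl);
            v0 = __riscv_vfabs_v_f64m8(v0, gvl);
            /* the scalar operand doubles as the running accumulator */
            v_res = __riscv_vfredmin_vs_f64m8_f64m1(v0, v_res, gvl);
            j += gvl;
        }
        return __riscv_vfmv_f_s_f64m1_f64(v_res);
    }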
@@ -28,35 +28,38 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 #include <math.h>
-#if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m8(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
-#define FLOAT_V_T vfloat32m8_t
-#define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT vle32_v_f32m8
-#define VLSEV_FLOAT vlse32_v_f32m8
-#define VFREDSUMVS_FLOAT vfredosum_vs_f32m8_f32m1
-#define MASK_T vbool4_t
-#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
-#define VFMVVF_FLOAT vfmv_v_f_f32m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m
-#define VFADDVV_FLOAT vfadd_vv_f32m8
+#ifdef RISCV64_ZVL256B
+# define LMUL m2
+# if defined(DOUBLE)
+# define ELEN 64
+# else
+# define ELEN 32
+# endif
 #else
-#define VSETVL(n) vsetvl_e64m8(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
-#define FLOAT_V_T vfloat64m8_t
-#define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT vle64_v_f64m8
-#define VLSEV_FLOAT vlse64_v_f64m8
-#define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1
-#define MASK_T vbool8_t
-#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
-#define VFMVVF_FLOAT vfmv_v_f_f64m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m
-#define VFADDVV_FLOAT vfadd_vv_f64m8
+# define LMUL m8
+# if defined(DOUBLE)
+# define ELEN 64
+# else
+# define ELEN 32
+# endif
 #endif
+#define _
+#define JOIN2_X(x, y) x ## y
+#define JOIN2(x, y) JOIN2_X(x, y)
+#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
+#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _)
+#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _)
+#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _)
+#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL)
+#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL)
+#define VFREDSUMVS_FLOAT JOIN(__riscv_vfredusum_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1))
+#define VFABS_FLOAT JOIN(__riscv_vfabs, _v_f, ELEN, LMUL, _)
+#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _)
+#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _)
+#define VFADDVV_FLOAT JOIN(__riscv_vfadd, _vv_f, ELEN, LMUL, _)
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
     BLASLONG i=0, j=0;
@@ -64,75 +67,61 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
     FLOAT asumf=0.0;
     if (n <= 0 || inc_x <= 0) return(asumf);
     unsigned int gvl = 0;
-    FLOAT_V_T v0, v1, v_zero,v_sum;
-    FLOAT_V_T_M1 v_res, v_z0;
-    gvl = VSETVL_MAX;
-    v_res = VFMVVF_FLOAT_M1(0, gvl);
-    v_z0 = VFMVVF_FLOAT_M1(0, gvl);
-    MASK_T mask0, mask1;
+    FLOAT_V_T v0, v1, v_sum;
+    FLOAT_V_T_M1 v_res;
+    v_res = VFMVVF_FLOAT_M1(0, 1);
    if(inc_x == 1){
        gvl = VSETVL(n);
-        v_zero = VFMVVF_FLOAT(0, gvl);
        if(gvl <= n/2){
            v_sum = VFMVVF_FLOAT(0, gvl);
            for(i=0,j=0; i<n/(gvl*2); i++){
                v0 = VLEV_FLOAT(&x[j], gvl);
-                mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-                v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
+                v0 = VFABS_FLOAT(v0, gvl);
                v_sum = VFADDVV_FLOAT(v_sum, v0, gvl);
                v1 = VLEV_FLOAT(&x[j+gvl], gvl);
-                mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
-                v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl);
+                v1 = VFABS_FLOAT(v1, gvl);
                v_sum = VFADDVV_FLOAT(v_sum, v1, gvl);
                j += gvl * 2;
            }
-            v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, gvl);
-            asumf += *((FLOAT*)&v_res);
+            v_res = VFREDSUMVS_FLOAT(v_sum, v_res, gvl);
        }
        for(;j<n;){
            gvl = VSETVL(n-j);
            v0 = VLEV_FLOAT(&x[j], gvl);
-            mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-            v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
-            v_res = VFREDSUMVS_FLOAT(v_res, v0, v_z0, gvl);
-            asumf += *((FLOAT*)&v_res);
+            v0 = VFABS_FLOAT(v0, gvl);
+            v_res = VFREDSUMVS_FLOAT(v0, v_res, gvl);
            j += gvl;
        }
    }else{
        gvl = VSETVL(n);
        unsigned int stride_x = inc_x * sizeof(FLOAT);
-        v_zero = VFMVVF_FLOAT(0, gvl);
        if(gvl <= n/2){
            v_sum = VFMVVF_FLOAT(0, gvl);
            BLASLONG inc_xv = inc_x * gvl;
            for(i=0,j=0; i<n/(gvl*2); i++){
                v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
-                mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-                v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
+                v0 = VFABS_FLOAT(v0, gvl);
                v_sum = VFADDVV_FLOAT(v_sum, v0, gvl);
                v1 = VLSEV_FLOAT(&x[ix+inc_xv], stride_x, gvl);
-                mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
-                v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl);
+                v1 = VFABS_FLOAT(v1, gvl);
                v_sum = VFADDVV_FLOAT(v_sum, v1, gvl);
                j += gvl * 2;
                inc_xv += inc_xv * 2;
            }
-            v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, gvl);
-            asumf += *((FLOAT*)&v_res);
+            v_res = VFREDSUMVS_FLOAT(v_sum, v_res, gvl);
        }
        for(;j<n;){
            gvl = VSETVL(n-j);
            v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
-            mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-            v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
-            v_res = VFREDSUMVS_FLOAT(v_res, v0, v_z0, gvl);
-            asumf += *((FLOAT*)&v_res);
+            v0 = VFABS_FLOAT(v0, gvl);
+            v_res = VFREDSUMVS_FLOAT(v0, v_res, gvl);
            j += gvl;
        }
    }
+    asumf = EXTRACT_FLOAT(v_res);
    return(asumf);
 }
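The numerical-stability half of this rewrite is the same in every kernel above: the old code built |x| by negating the negative lanes under a comparison mask, falling back to inline asm because the masked vfrsub intrinsic kept changing across toolchains, while the new code issues a single vfabs. A side-by-side sketch, assuming the v1.0 intrinsics (abs_old/abs_new are illustrative names); note that -0.0 is not < 0, so the masked form leaves -0.0 with its sign while vfabs always clears the sign bit, one of the corner cases a sign-test-based |x| can get wrong:

    #include <riscv_vector.h>

    static vfloat64m8_t abs_old(vfloat64m8_t v, size_t gvl) {
        vbool8_t neg = __riscv_vmflt_vf_f64m8_b8(v, 0.0, gvl);   /* lanes v < 0       */
        return __riscv_vfrsub_vf_f64m8_mu(neg, v, v, 0.0, gvl);  /* 0-v there, else v */
    }
    static vfloat64m8_t abs_new(vfloat64m8_t v, size_t gvl) {
        return __riscv_vfabs_v_f64m8(v, gvl);                    /* one instruction   */
    }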
@@ -27,28 +27,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
-#if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
-#define FLOAT_V_T vfloat32m4_t
-#define VLEV_FLOAT vle32_v_f32m4
-#define VLSEV_FLOAT vlse32_v_f32m4
-#define VSEV_FLOAT vse32_v_f32m4
-#define VSSEV_FLOAT vsse32_v_f32m4
-#define VFMACCVF_FLOAT vfmacc_vf_f32m4
-#define VFMVVF_FLOAT vfmv_v_f_f32m4
-#define VFMULVF_FLOAT vfmul_vf_f32m4
+#ifdef RISCV64_ZVL256B
+# define LMUL m2
+# if defined(DOUBLE)
+# define ELEN 64
+# else
+# define ELEN 32
+# endif
 #else
-#define VSETVL(n) vsetvl_e64m4(n)
-#define FLOAT_V_T vfloat64m4_t
-#define VLEV_FLOAT vle64_v_f64m4
-#define VLSEV_FLOAT vlse64_v_f64m4
-#define VSEV_FLOAT vse64_v_f64m4
-#define VSSEV_FLOAT vsse64_v_f64m4
-#define VFMACCVF_FLOAT vfmacc_vf_f64m4
-#define VFMVVF_FLOAT vfmv_v_f_f64m4
-#define VFMULVF_FLOAT vfmul_vf_f64m4
+# define LMUL m4
+# if defined(DOUBLE)
+# define ELEN 64
+# else
+# define ELEN 32
+# endif
 #endif
+#define _
+#define JOIN2_X(x, y) x ## y
+#define JOIN2(x, y) JOIN2_X(x, y)
+#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
+#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _)
+#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _)
+#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL)
+#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL)
+#define VSEV_FLOAT JOIN(__riscv_vse, ELEN, _v_f, ELEN, LMUL)
+#define VSSEV_FLOAT JOIN(__riscv_vsse, ELEN, _v_f, ELEN, LMUL)
+#define VFMACCVF_FLOAT JOIN(__riscv_vfmacc, _vf_f, ELEN, LMUL, _)
+#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _)
+#define VFMULVF_FLOAT JOIN(__riscv_vfmul, _vf_f, ELEN, LMUL, _)
 int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y)
 {
     if (n < 0) return(0);
@@ -25,26 +25,38 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 #include "common.h"
-#if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
-#define FLOAT_V_T vfloat32m4_t
-#define VLEV_FLOAT vle32_v_f32m4
-#define VLSEV_FLOAT vlse32_v_f32m4
-#define VSEV_FLOAT vse32_v_f32m4
-#define VSSEV_FLOAT vsse32_v_f32m4
-#define VFMACCVF_FLOAT vfmacc_vf_f32m4
+#ifdef RISCV64_ZVL256B
+# define LMUL m2
+# if defined(DOUBLE)
+# define ELEN 64
+# else
+# define ELEN 32
+# endif
 #else
-#define VSETVL(n) vsetvl_e64m4(n)
-#define FLOAT_V_T vfloat64m4_t
-#define VLEV_FLOAT vle64_v_f64m4
-#define VLSEV_FLOAT vlse64_v_f64m4
-#define VSEV_FLOAT vse64_v_f64m4
-#define VSSEV_FLOAT vsse64_v_f64m4
-#define VFMACCVF_FLOAT vfmacc_vf_f64m4
+# define LMUL m4
+# if defined(DOUBLE)
+# define ELEN 64
+# else
+# define ELEN 32
+# endif
 #endif
+#define _
+#define JOIN2_X(x, y) x ## y
+#define JOIN2(x, y) JOIN2_X(x, y)
+#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
+#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _)
+#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _)
+#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL)
+#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL)
+#define VSEV_FLOAT JOIN(__riscv_vse, ELEN, _v_f, ELEN, LMUL)
+#define VSSEV_FLOAT JOIN(__riscv_vsse, ELEN, _v_f, ELEN, LMUL)
+#define VFMACCVF_FLOAT JOIN(__riscv_vfmacc, _vf_f, ELEN, LMUL, _)
 int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
 {
     BLASLONG i=0, j=0, jx=0, jy=0;
@@ -25,22 +25,35 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 #include "common.h"
-#if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m8(n)
-#define FLOAT_V_T vfloat32m8_t
-#define VLEV_FLOAT vle32_v_f32m8
-#define VLSEV_FLOAT vlse32_v_f32m8
-#define VSEV_FLOAT vse32_v_f32m8
-#define VSSEV_FLOAT vsse32_v_f32m8
+#ifdef RISCV64_ZVL256B
+# define LMUL m2
+# if defined(DOUBLE)
+# define ELEN 64
+# else
+# define ELEN 32
+# endif
 #else
-#define VSETVL(n) vsetvl_e64m8(n)
-#define FLOAT_V_T vfloat64m8_t
-#define VLEV_FLOAT vle64_v_f64m8
-#define VLSEV_FLOAT vlse64_v_f64m8
-#define VSEV_FLOAT vse64_v_f64m8
-#define VSSEV_FLOAT vsse64_v_f64m8
+# define LMUL m8
+# if defined(DOUBLE)
+# define ELEN 64
+# else
+# define ELEN 32
+# endif
 #endif
+#define _
+#define JOIN2_X(x, y) x ## y
+#define JOIN2(x, y) JOIN2_X(x, y)
+#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
+#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _)
+#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _)
+#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL)
+#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL)
+#define VSEV_FLOAT JOIN(__riscv_vse, ELEN, _v_f, ELEN, LMUL)
+#define VSSEV_FLOAT JOIN(__riscv_vsse, ELEN, _v_f, ELEN, LMUL)
 int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
 {
     BLASLONG i=0, j=0;
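The final hunk adds one output of the new generator script: a DGEMM kernel unrolled 8x8 for machines guaranteeing at least 256-bit vectors (Zvl256b). A 256-bit register holds four doubles at LMUL=1, so the kernel pins gvl to __riscv_vsetvl_e64m1(4), covers the eight rows of A with two registers (A0/A1), and keeps the 8x8 tile of C in sixteen accumulators. A compilable sketch of that sizing arithmetic (the names are mine, not the generator's):

    /* Sizing assumptions behind the generated 8x8 zvl256b DGEMM kernel. */
    enum {
        VLEN     = 256,          /* Zvl256b: vector registers are >= 256 bits */
        ELEN     = 64,           /* double precision elements                 */
        VL_M1    = VLEN / ELEN,  /* = 4 doubles per LMUL=1 register           */
        M_UNROLL = 8,            /* rows of A per iteration                   */
        N_UNROLL = 8,            /* columns of B per iteration                */
        A_REGS   = M_UNROLL / VL_M1,   /* = 2  -> vectors A0, A1              */
        ACC_REGS = A_REGS * N_UNROLL   /* = 16 -> result0 .. result15         */
    };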
@@ -0,0 +1,860 @@
+/*
+    AUTOGENERATED KERNEL
+    Settings:
+        LMUL=1
+        M=8
+        M_tail_scalar_from=2
+        N=8
+        __riscv_='__riscv_'
+        complex=False
+        conjugate=False
+        cpu='zvl256b'
+        force_acc_double=False
+        index_type='BLASLONG'
+        op='gemm'
+        param_precision='double'
+        reg_width_bits=256
+        tail_policy=''
+        trace=False
+    Derived:
+        ELEN_ACC=64
+        ELEN_PARAM=64
+        LMUL_ACC=1
+        VFMACC='__riscv_vfmacc_vf_f64m1'
+        VFMUL='__riscv_vfmul_vf_f64m1'
+        VLEV='__riscv_vle64_v_f64m1'
+        VLSEV='__riscv_vlse64_v_f64m1'
+        VMACC_TO_ACC='__riscv_vfmacc_vf_f64m1'
+        VMUL_TO_ACC='__riscv_vfmul_vf_f64m1'
+        VSETVL='__riscv_vsetvl_e64m1'
+        VSEV='__riscv_vse64_v_f64m1'
+        VSSEV='__riscv_vsse64_v_f64m1'
+        acc_vector_t='vfloat64m1_t'
+        output='dgemm_kernel_8x8_zvl256b.c'
+        param_scalar_t='double'
+        param_vector_t='vfloat64m1_t'
+*/
+#include "common.h"
+int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc)
+{
+    BLASLONG gvl = 0;
+    BLASLONG m_top = 0;
+    BLASLONG n_top = 0;
+    // -- MAIN PASS
+    for (BLASLONG j=0; j<N/8; j+=1) {
+        m_top = 0;
+        BLASLONG gvl = __riscv_vsetvl_e64m1(4);
+        for (BLASLONG i=0; i<M/8; i+=1) {
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            double B0 = B[bi+0];
+            double B1 = B[bi+1];
+            double B2 = B[bi+2];
+            double B3 = B[bi+3];
+            double B4 = B[bi+4];
+            double B5 = B[bi+5];
+            double B6 = B[bi+6];
+            double B7 = B[bi+7];
+            bi += 8;
+            vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+            vfloat64m1_t A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl );
+            ai += 8;
+            vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl);
+            vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A1, B0, gvl);
+            vfloat64m1_t result2 = __riscv_vfmul_vf_f64m1( A0, B1, gvl);
+            vfloat64m1_t result3 = __riscv_vfmul_vf_f64m1( A1, B1, gvl);
+            vfloat64m1_t result4 = __riscv_vfmul_vf_f64m1( A0, B2, gvl);
+            vfloat64m1_t result5 = __riscv_vfmul_vf_f64m1( A1, B2, gvl);
+            vfloat64m1_t result6 = __riscv_vfmul_vf_f64m1( A0, B3, gvl);
+            vfloat64m1_t result7 = __riscv_vfmul_vf_f64m1( A1, B3, gvl);
+            vfloat64m1_t result8 = __riscv_vfmul_vf_f64m1( A0, B4, gvl);
+            vfloat64m1_t result9 = __riscv_vfmul_vf_f64m1( A1, B4, gvl);
+            vfloat64m1_t result10 = __riscv_vfmul_vf_f64m1( A0, B5, gvl);
+            vfloat64m1_t result11 = __riscv_vfmul_vf_f64m1( A1, B5, gvl);
+            vfloat64m1_t result12 = __riscv_vfmul_vf_f64m1( A0, B6, gvl);
+            vfloat64m1_t result13 = __riscv_vfmul_vf_f64m1( A1, B6, gvl);
+            vfloat64m1_t result14 = __riscv_vfmul_vf_f64m1( A0, B7, gvl);
+            vfloat64m1_t result15 = __riscv_vfmul_vf_f64m1( A1, B7, gvl);
+            for(BLASLONG k=1; k<K; k++) {
+                B0 = B[bi+0];
+                B1 = B[bi+1];
+                B2 = B[bi+2];
+                B3 = B[bi+3];
+                B4 = B[bi+4];
+                B5 = B[bi+5];
+                B6 = B[bi+6];
+                B7 = B[bi+7];
+                bi += 8;
+                A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+                A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl );
+                ai += 8;
+                result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f64m1( result1, B0, A1, gvl);
+                result2 = __riscv_vfmacc_vf_f64m1( result2, B1, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f64m1( result3, B1, A1, gvl);
+                result4 = __riscv_vfmacc_vf_f64m1( result4, B2, A0, gvl);
+                result5 = __riscv_vfmacc_vf_f64m1( result5, B2, A1, gvl);
+                result6 = __riscv_vfmacc_vf_f64m1( result6, B3, A0, gvl);
+                result7 = __riscv_vfmacc_vf_f64m1( result7, B3, A1, gvl);
+                result8 = __riscv_vfmacc_vf_f64m1( result8, B4, A0, gvl);
+                result9 = __riscv_vfmacc_vf_f64m1( result9, B4, A1, gvl);
+                result10 = __riscv_vfmacc_vf_f64m1( result10, B5, A0, gvl);
+                result11 = __riscv_vfmacc_vf_f64m1( result11, B5, A1, gvl);
+                result12 = __riscv_vfmacc_vf_f64m1( result12, B6, A0, gvl);
+                result13 = __riscv_vfmacc_vf_f64m1( result13, B6, A1, gvl);
+                result14 = __riscv_vfmacc_vf_f64m1( result14, B7, A0, gvl);
+                result15 = __riscv_vfmacc_vf_f64m1( result15, B7, A1, gvl);
+            }
+            BLASLONG ci=n_top*ldc+m_top;
+            vfloat64m1_t c0 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
+            vfloat64m1_t c1 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
+            vfloat64m1_t c2 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
+            vfloat64m1_t c3 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
+            vfloat64m1_t c4 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
+            vfloat64m1_t c5 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
+            vfloat64m1_t c6 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
+            vfloat64m1_t c7 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
+            vfloat64m1_t c8 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
+            vfloat64m1_t c9 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
+            vfloat64m1_t c10 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
+            vfloat64m1_t c11 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
+            vfloat64m1_t c12 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
+            vfloat64m1_t c13 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
+            vfloat64m1_t c14 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
+            vfloat64m1_t c15 = __riscv_vle64_v_f64m1( &C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, gvl );
+            c1 = __riscv_vfmacc_vf_f64m1( c1, alpha, result1, gvl );
+            c2 = __riscv_vfmacc_vf_f64m1( c2, alpha, result2, gvl );
+            c3 = __riscv_vfmacc_vf_f64m1( c3, alpha, result3, gvl );
+            c4 = __riscv_vfmacc_vf_f64m1( c4, alpha, result4, gvl );
+            c5 = __riscv_vfmacc_vf_f64m1( c5, alpha, result5, gvl );
+            c6 = __riscv_vfmacc_vf_f64m1( c6, alpha, result6, gvl );
+            c7 = __riscv_vfmacc_vf_f64m1( c7, alpha, result7, gvl );
+            c8 = __riscv_vfmacc_vf_f64m1( c8, alpha, result8, gvl );
+            c9 = __riscv_vfmacc_vf_f64m1( c9, alpha, result9, gvl );
+            c10 = __riscv_vfmacc_vf_f64m1( c10, alpha, result10, gvl );
+            c11 = __riscv_vfmacc_vf_f64m1( c11, alpha, result11, gvl );
+            c12 = __riscv_vfmacc_vf_f64m1( c12, alpha, result12, gvl );
+            c13 = __riscv_vfmacc_vf_f64m1( c13, alpha, result13, gvl );
+            c14 = __riscv_vfmacc_vf_f64m1( c14, alpha, result14, gvl );
+            c15 = __riscv_vfmacc_vf_f64m1( c15, alpha, result15, gvl );
+            ci=n_top*ldc+m_top;
+            __riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c1, gvl); ci += ldc-gvl*1;
+            __riscv_vse64_v_f64m1( &C[ci], c2, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c3, gvl); ci += ldc-gvl*1;
+            __riscv_vse64_v_f64m1( &C[ci], c4, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c5, gvl); ci += ldc-gvl*1;
+            __riscv_vse64_v_f64m1( &C[ci], c6, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c7, gvl); ci += ldc-gvl*1;
+            __riscv_vse64_v_f64m1( &C[ci], c8, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c9, gvl); ci += ldc-gvl*1;
+            __riscv_vse64_v_f64m1( &C[ci], c10, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c11, gvl); ci += ldc-gvl*1;
+            __riscv_vse64_v_f64m1( &C[ci], c12, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c13, gvl); ci += ldc-gvl*1;
+            __riscv_vse64_v_f64m1( &C[ci], c14, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c15, gvl);
+            m_top += 8;
+        }
+        // -- tails for main pass
+        if( M & 4 ) {
+            gvl = __riscv_vsetvl_e64m1(4);
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            double B0 = B[bi+0];
+            double B1 = B[bi+1];
+            double B2 = B[bi+2];
+            double B3 = B[bi+3];
+            double B4 = B[bi+4];
+            double B5 = B[bi+5];
+            double B6 = B[bi+6];
+            double B7 = B[bi+7];
+            bi += 8;
+            vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+            ai += 4;
+            vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl);
+            vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A0, B1, gvl);
+            vfloat64m1_t result2 = __riscv_vfmul_vf_f64m1( A0, B2, gvl);
+            vfloat64m1_t result3 = __riscv_vfmul_vf_f64m1( A0, B3, gvl);
+            vfloat64m1_t result4 = __riscv_vfmul_vf_f64m1( A0, B4, gvl);
+            vfloat64m1_t result5 = __riscv_vfmul_vf_f64m1( A0, B5, gvl);
+            vfloat64m1_t result6 = __riscv_vfmul_vf_f64m1( A0, B6, gvl);
+            vfloat64m1_t result7 = __riscv_vfmul_vf_f64m1( A0, B7, gvl);
+            for(BLASLONG k=1; k<K; k++) {
+                B0 = B[bi+0];
+                B1 = B[bi+1];
+                B2 = B[bi+2];
+                B3 = B[bi+3];
+                B4 = B[bi+4];
+                B5 = B[bi+5];
+                B6 = B[bi+6];
+                B7 = B[bi+7];
+                bi += 8;
+                A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+                ai += 4;
+                result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f64m1( result1, B1, A0, gvl);
+                result2 = __riscv_vfmacc_vf_f64m1( result2, B2, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f64m1( result3, B3, A0, gvl);
+                result4 = __riscv_vfmacc_vf_f64m1( result4, B4, A0, gvl);
+                result5 = __riscv_vfmacc_vf_f64m1( result5, B5, A0, gvl);
+                result6 = __riscv_vfmacc_vf_f64m1( result6, B6, A0, gvl);
+                result7 = __riscv_vfmacc_vf_f64m1( result7, B7, A0, gvl);
+            }
+            BLASLONG ci=n_top*ldc+m_top;
+            vfloat64m1_t c0 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat64m1_t c1 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat64m1_t c2 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat64m1_t c3 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat64m1_t c4 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat64m1_t c5 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat64m1_t c6 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat64m1_t c7 = __riscv_vle64_v_f64m1( &C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, gvl );
+            c1 = __riscv_vfmacc_vf_f64m1( c1, alpha, result1, gvl );
+            c2 = __riscv_vfmacc_vf_f64m1( c2, alpha, result2, gvl );
+            c3 = __riscv_vfmacc_vf_f64m1( c3, alpha, result3, gvl );
+            c4 = __riscv_vfmacc_vf_f64m1( c4, alpha, result4, gvl );
+            c5 = __riscv_vfmacc_vf_f64m1( c5, alpha, result5, gvl );
+            c6 = __riscv_vfmacc_vf_f64m1( c6, alpha, result6, gvl );
+            c7 = __riscv_vfmacc_vf_f64m1( c7, alpha, result7, gvl );
+            ci=n_top*ldc+m_top;
+            __riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += ldc-gvl*0;
+            __riscv_vse64_v_f64m1( &C[ci], c1, gvl); ci += ldc-gvl*0;
+            __riscv_vse64_v_f64m1( &C[ci], c2, gvl); ci += ldc-gvl*0;
+            __riscv_vse64_v_f64m1( &C[ci], c3, gvl); ci += ldc-gvl*0;
+            __riscv_vse64_v_f64m1( &C[ci], c4, gvl); ci += ldc-gvl*0;
+            __riscv_vse64_v_f64m1( &C[ci], c5, gvl); ci += ldc-gvl*0;
+            __riscv_vse64_v_f64m1( &C[ci], c6, gvl); ci += ldc-gvl*0;
+            __riscv_vse64_v_f64m1( &C[ci], c7, gvl);
+            m_top += 4;
+        }
+        if( M & 2 ) {
+            double result0 = 0;
+            double result1 = 0;
+            double result2 = 0;
+            double result3 = 0;
+            double result4 = 0;
+            double result5 = 0;
+            double result6 = 0;
+            double result7 = 0;
+            double result8 = 0;
+            double result9 = 0;
+            double result10 = 0;
+            double result11 = 0;
+            double result12 = 0;
+            double result13 = 0;
+            double result14 = 0;
+            double result15 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            for(BLASLONG k=0; k<K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                result1+=A[ai+1]*B[bi+0];
+                result2+=A[ai+0]*B[bi+1];
+                result3+=A[ai+1]*B[bi+1];
+                result4+=A[ai+0]*B[bi+2];
+                result5+=A[ai+1]*B[bi+2];
+                result6+=A[ai+0]*B[bi+3];
+                result7+=A[ai+1]*B[bi+3];
+                result8+=A[ai+0]*B[bi+4];
+                result9+=A[ai+1]*B[bi+4];
+                result10+=A[ai+0]*B[bi+5];
+                result11+=A[ai+1]*B[bi+5];
+                result12+=A[ai+0]*B[bi+6];
+                result13+=A[ai+1]*B[bi+6];
+                result14+=A[ai+0]*B[bi+7];
+                result15+=A[ai+1]*B[bi+7];
+                ai+=2;
+                bi+=8;
+            }
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] += alpha * result0;
+            C[ci+0*ldc+1] += alpha * result1;
+            C[ci+1*ldc+0] += alpha * result2;
+            C[ci+1*ldc+1] += alpha * result3;
+            C[ci+2*ldc+0] += alpha * result4;
+            C[ci+2*ldc+1] += alpha * result5;
+            C[ci+3*ldc+0] += alpha * result6;
+            C[ci+3*ldc+1] += alpha * result7;
+            C[ci+4*ldc+0] += alpha * result8;
+            C[ci+4*ldc+1] += alpha * result9;
+            C[ci+5*ldc+0] += alpha * result10;
+            C[ci+5*ldc+1] += alpha * result11;
+            C[ci+6*ldc+0] += alpha * result12;
+            C[ci+6*ldc+1] += alpha * result13;
+            C[ci+7*ldc+0] += alpha * result14;
+            C[ci+7*ldc+1] += alpha * result15;
+            m_top+=2;
+        }
+        if( M & 1 ) {
+            double result0 = 0;
+            double result1 = 0;
+            double result2 = 0;
+            double result3 = 0;
+            double result4 = 0;
+            double result5 = 0;
+            double result6 = 0;
+            double result7 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            for(BLASLONG k=0; k<K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                result1+=A[ai+0]*B[bi+1];
+                result2+=A[ai+0]*B[bi+2];
+                result3+=A[ai+0]*B[bi+3];
+                result4+=A[ai+0]*B[bi+4];
+                result5+=A[ai+0]*B[bi+5];
+                result6+=A[ai+0]*B[bi+6];
+                result7+=A[ai+0]*B[bi+7];
+                ai+=1;
+                bi+=8;
+            }
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] += alpha * result0;
+            C[ci+1*ldc+0] += alpha * result1;
+            C[ci+2*ldc+0] += alpha * result2;
+            C[ci+3*ldc+0] += alpha * result3;
+            C[ci+4*ldc+0] += alpha * result4;
+            C[ci+5*ldc+0] += alpha * result5;
+            C[ci+6*ldc+0] += alpha * result6;
+            C[ci+7*ldc+0] += alpha * result7;
+            m_top+=1;
+        }
+        n_top += 8;
+    }
+    // -- tails for N=4
+    if( N & 4 ) {
+        gvl = __riscv_vsetvl_e64m1(4);
+        m_top = 0;
+        for (BLASLONG i=0; i<M/8; i+=1) {
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            double B0 = B[bi+0];
+            double B1 = B[bi+1];
+            double B2 = B[bi+2];
+            double B3 = B[bi+3];
+            bi += 4;
+            vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+            vfloat64m1_t A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl );
+            ai += 8;
+            vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl);
+            vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A1, B0, gvl);
+            vfloat64m1_t result2 = __riscv_vfmul_vf_f64m1( A0, B1, gvl);
+            vfloat64m1_t result3 = __riscv_vfmul_vf_f64m1( A1, B1, gvl);
+            vfloat64m1_t result4 = __riscv_vfmul_vf_f64m1( A0, B2, gvl);
+            vfloat64m1_t result5 = __riscv_vfmul_vf_f64m1( A1, B2, gvl);
+            vfloat64m1_t result6 = __riscv_vfmul_vf_f64m1( A0, B3, gvl);
+            vfloat64m1_t result7 = __riscv_vfmul_vf_f64m1( A1, B3, gvl);
+            for(BLASLONG k=1; k<K; k++) {
+                B0 = B[bi+0];
+                B1 = B[bi+1];
+                B2 = B[bi+2];
+                B3 = B[bi+3];
+                bi += 4;
+                A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+                A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl );
+                ai += 8;
+                result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f64m1( result1, B0, A1, gvl);
+                result2 = __riscv_vfmacc_vf_f64m1( result2, B1, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f64m1( result3, B1, A1, gvl);
+                result4 = __riscv_vfmacc_vf_f64m1( result4, B2, A0, gvl);
+                result5 = __riscv_vfmacc_vf_f64m1( result5, B2, A1, gvl);
+                result6 = __riscv_vfmacc_vf_f64m1( result6, B3, A0, gvl);
+                result7 = __riscv_vfmacc_vf_f64m1( result7, B3, A1, gvl);
+            }
+            BLASLONG ci=n_top*ldc+m_top;
+            vfloat64m1_t c0 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
+            vfloat64m1_t c1 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
+            vfloat64m1_t c2 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
+            vfloat64m1_t c3 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
+            vfloat64m1_t c4 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
+            vfloat64m1_t c5 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
+            vfloat64m1_t c6 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
+            vfloat64m1_t c7 = __riscv_vle64_v_f64m1( &C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, gvl );
c1 = __riscv_vfmacc_vf_f64m1( c1, alpha, result1, gvl ); | |||||
c2 = __riscv_vfmacc_vf_f64m1( c2, alpha, result2, gvl ); | |||||
c3 = __riscv_vfmacc_vf_f64m1( c3, alpha, result3, gvl ); | |||||
c4 = __riscv_vfmacc_vf_f64m1( c4, alpha, result4, gvl ); | |||||
c5 = __riscv_vfmacc_vf_f64m1( c5, alpha, result5, gvl ); | |||||
c6 = __riscv_vfmacc_vf_f64m1( c6, alpha, result6, gvl ); | |||||
c7 = __riscv_vfmacc_vf_f64m1( c7, alpha, result7, gvl ); | |||||
ci=n_top*ldc+m_top; | |||||
__riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += gvl; | |||||
__riscv_vse64_v_f64m1( &C[ci], c1, gvl); ci += ldc-gvl*1; | |||||
__riscv_vse64_v_f64m1( &C[ci], c2, gvl); ci += gvl; | |||||
__riscv_vse64_v_f64m1( &C[ci], c3, gvl); ci += ldc-gvl*1; | |||||
__riscv_vse64_v_f64m1( &C[ci], c4, gvl); ci += gvl; | |||||
__riscv_vse64_v_f64m1( &C[ci], c5, gvl); ci += ldc-gvl*1; | |||||
__riscv_vse64_v_f64m1( &C[ci], c6, gvl); ci += gvl; | |||||
__riscv_vse64_v_f64m1( &C[ci], c7, gvl); | |||||
m_top += 8; | |||||
} | |||||
if( M & 4 ) { | |||||
gvl = __riscv_vsetvl_e64m1(4); | |||||
BLASLONG ai=m_top*K; | |||||
BLASLONG bi=n_top*K; | |||||
double B0 = B[bi+0]; | |||||
double B1 = B[bi+1]; | |||||
double B2 = B[bi+2]; | |||||
double B3 = B[bi+3]; | |||||
bi += 4; | |||||
vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl ); | |||||
ai += 4; | |||||
vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl); | |||||
vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A0, B1, gvl); | |||||
vfloat64m1_t result2 = __riscv_vfmul_vf_f64m1( A0, B2, gvl); | |||||
vfloat64m1_t result3 = __riscv_vfmul_vf_f64m1( A0, B3, gvl); | |||||
for(BLASLONG k=1; k<K; k++) { | |||||
B0 = B[bi+0]; | |||||
B1 = B[bi+1]; | |||||
B2 = B[bi+2]; | |||||
B3 = B[bi+3]; | |||||
bi += 4; | |||||
A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl ); | |||||
ai += 4; | |||||
result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl); | |||||
result1 = __riscv_vfmacc_vf_f64m1( result1, B1, A0, gvl); | |||||
result2 = __riscv_vfmacc_vf_f64m1( result2, B2, A0, gvl); | |||||
result3 = __riscv_vfmacc_vf_f64m1( result3, B3, A0, gvl); | |||||
} | |||||
BLASLONG ci=n_top*ldc+m_top; | |||||
vfloat64m1_t c0 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0; | |||||
vfloat64m1_t c1 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0; | |||||
vfloat64m1_t c2 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0; | |||||
vfloat64m1_t c3 = __riscv_vle64_v_f64m1( &C[ci], gvl); | |||||
c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, gvl ); | |||||
c1 = __riscv_vfmacc_vf_f64m1( c1, alpha, result1, gvl ); | |||||
c2 = __riscv_vfmacc_vf_f64m1( c2, alpha, result2, gvl ); | |||||
c3 = __riscv_vfmacc_vf_f64m1( c3, alpha, result3, gvl ); | |||||
ci=n_top*ldc+m_top; | |||||
__riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += ldc-gvl*0; | |||||
__riscv_vse64_v_f64m1( &C[ci], c1, gvl); ci += ldc-gvl*0; | |||||
__riscv_vse64_v_f64m1( &C[ci], c2, gvl); ci += ldc-gvl*0; | |||||
__riscv_vse64_v_f64m1( &C[ci], c3, gvl); | |||||
m_top += 4; | |||||
} | |||||
if( M & 2 ) { | |||||
double result0 = 0; | |||||
double result1 = 0; | |||||
double result2 = 0; | |||||
double result3 = 0; | |||||
double result4 = 0; | |||||
double result5 = 0; | |||||
double result6 = 0; | |||||
double result7 = 0; | |||||
BLASLONG ai=m_top*K; | |||||
BLASLONG bi=n_top*K; | |||||
for(BLASLONG k=0; k<K; k++) { | |||||
result0+=A[ai+0]*B[bi+0]; | |||||
result1+=A[ai+1]*B[bi+0]; | |||||
result2+=A[ai+0]*B[bi+1]; | |||||
result3+=A[ai+1]*B[bi+1]; | |||||
result4+=A[ai+0]*B[bi+2]; | |||||
result5+=A[ai+1]*B[bi+2]; | |||||
result6+=A[ai+0]*B[bi+3]; | |||||
result7+=A[ai+1]*B[bi+3]; | |||||
ai+=2; | |||||
bi+=4; | |||||
} | |||||
BLASLONG ci=n_top*ldc+m_top; | |||||
C[ci+0*ldc+0] += alpha * result0; | |||||
C[ci+0*ldc+1] += alpha * result1; | |||||
C[ci+1*ldc+0] += alpha * result2; | |||||
C[ci+1*ldc+1] += alpha * result3; | |||||
C[ci+2*ldc+0] += alpha * result4; | |||||
C[ci+2*ldc+1] += alpha * result5; | |||||
C[ci+3*ldc+0] += alpha * result6; | |||||
C[ci+3*ldc+1] += alpha * result7; | |||||
m_top+=2; | |||||
} | |||||
if( M & 1 ) { | |||||
double result0 = 0; | |||||
double result1 = 0; | |||||
double result2 = 0; | |||||
double result3 = 0; | |||||
BLASLONG ai=m_top*K; | |||||
BLASLONG bi=n_top*K; | |||||
for(BLASLONG k=0; k<K; k++) { | |||||
result0+=A[ai+0]*B[bi+0]; | |||||
result1+=A[ai+0]*B[bi+1]; | |||||
result2+=A[ai+0]*B[bi+2]; | |||||
result3+=A[ai+0]*B[bi+3]; | |||||
ai+=1; | |||||
bi+=4; | |||||
} | |||||
BLASLONG ci=n_top*ldc+m_top; | |||||
C[ci+0*ldc+0] += alpha * result0; | |||||
C[ci+1*ldc+0] += alpha * result1; | |||||
C[ci+2*ldc+0] += alpha * result2; | |||||
C[ci+3*ldc+0] += alpha * result3; | |||||
m_top+=1; | |||||
} | |||||
n_top += 4; | |||||
} | |||||
// -- tails for N=2 | |||||
if( N & 2 ) { | |||||
gvl = __riscv_vsetvl_e64m1(4); | |||||
m_top = 0; | |||||
for (BLASLONG i=0; i<M/8; i+=1) { | |||||
BLASLONG ai=m_top*K; | |||||
BLASLONG bi=n_top*K; | |||||
double B0 = B[bi+0]; | |||||
double B1 = B[bi+1]; | |||||
bi += 2; | |||||
vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl ); | |||||
vfloat64m1_t A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl ); | |||||
ai += 8; | |||||
vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl); | |||||
vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A1, B0, gvl); | |||||
vfloat64m1_t result2 = __riscv_vfmul_vf_f64m1( A0, B1, gvl); | |||||
vfloat64m1_t result3 = __riscv_vfmul_vf_f64m1( A1, B1, gvl); | |||||
for(BLASLONG k=1; k<K; k++) { | |||||
B0 = B[bi+0]; | |||||
B1 = B[bi+1]; | |||||
bi += 2; | |||||
A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl ); | |||||
A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl ); | |||||
ai += 8; | |||||
result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl); | |||||
result1 = __riscv_vfmacc_vf_f64m1( result1, B0, A1, gvl); | |||||
result2 = __riscv_vfmacc_vf_f64m1( result2, B1, A0, gvl); | |||||
result3 = __riscv_vfmacc_vf_f64m1( result3, B1, A1, gvl); | |||||
} | |||||
BLASLONG ci=n_top*ldc+m_top; | |||||
vfloat64m1_t c0 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl; | |||||
vfloat64m1_t c1 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1; | |||||
vfloat64m1_t c2 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl; | |||||
vfloat64m1_t c3 = __riscv_vle64_v_f64m1( &C[ci], gvl); | |||||
c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, gvl ); | |||||
c1 = __riscv_vfmacc_vf_f64m1( c1, alpha, result1, gvl ); | |||||
c2 = __riscv_vfmacc_vf_f64m1( c2, alpha, result2, gvl ); | |||||
c3 = __riscv_vfmacc_vf_f64m1( c3, alpha, result3, gvl ); | |||||
ci=n_top*ldc+m_top; | |||||
__riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += gvl; | |||||
__riscv_vse64_v_f64m1( &C[ci], c1, gvl); ci += ldc-gvl*1; | |||||
__riscv_vse64_v_f64m1( &C[ci], c2, gvl); ci += gvl; | |||||
__riscv_vse64_v_f64m1( &C[ci], c3, gvl); | |||||
m_top += 8; | |||||
} | |||||
if( M & 4 ) { | |||||
gvl = __riscv_vsetvl_e64m1(4); | |||||
BLASLONG ai=m_top*K; | |||||
BLASLONG bi=n_top*K; | |||||
double B0 = B[bi+0]; | |||||
double B1 = B[bi+1]; | |||||
bi += 2; | |||||
vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl ); | |||||
ai += 4; | |||||
vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl); | |||||
vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A0, B1, gvl); | |||||
for(BLASLONG k=1; k<K; k++) { | |||||
B0 = B[bi+0]; | |||||
B1 = B[bi+1]; | |||||
bi += 2; | |||||
A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl ); | |||||
ai += 4; | |||||
result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl); | |||||
result1 = __riscv_vfmacc_vf_f64m1( result1, B1, A0, gvl); | |||||
} | |||||
BLASLONG ci=n_top*ldc+m_top; | |||||
vfloat64m1_t c0 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0; | |||||
vfloat64m1_t c1 = __riscv_vle64_v_f64m1( &C[ci], gvl); | |||||
c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, gvl ); | |||||
c1 = __riscv_vfmacc_vf_f64m1( c1, alpha, result1, gvl ); | |||||
ci=n_top*ldc+m_top; | |||||
__riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += ldc-gvl*0; | |||||
__riscv_vse64_v_f64m1( &C[ci], c1, gvl); | |||||
m_top += 4; | |||||
} | |||||
if( M & 2 ) { | |||||
double result0 = 0; | |||||
double result1 = 0; | |||||
double result2 = 0; | |||||
double result3 = 0; | |||||
BLASLONG ai=m_top*K; | |||||
BLASLONG bi=n_top*K; | |||||
for(BLASLONG k=0; k<K; k++) { | |||||
result0+=A[ai+0]*B[bi+0]; | |||||
result1+=A[ai+1]*B[bi+0]; | |||||
result2+=A[ai+0]*B[bi+1]; | |||||
result3+=A[ai+1]*B[bi+1]; | |||||
ai+=2; | |||||
bi+=2; | |||||
} | |||||
BLASLONG ci=n_top*ldc+m_top; | |||||
C[ci+0*ldc+0] += alpha * result0; | |||||
C[ci+0*ldc+1] += alpha * result1; | |||||
C[ci+1*ldc+0] += alpha * result2; | |||||
C[ci+1*ldc+1] += alpha * result3; | |||||
m_top+=2; | |||||
} | |||||
if( M & 1 ) { | |||||
double result0 = 0; | |||||
double result1 = 0; | |||||
BLASLONG ai=m_top*K; | |||||
BLASLONG bi=n_top*K; | |||||
for(BLASLONG k=0; k<K; k++) { | |||||
result0+=A[ai+0]*B[bi+0]; | |||||
result1+=A[ai+0]*B[bi+1]; | |||||
ai+=1; | |||||
bi+=2; | |||||
} | |||||
BLASLONG ci=n_top*ldc+m_top; | |||||
C[ci+0*ldc+0] += alpha * result0; | |||||
C[ci+1*ldc+0] += alpha * result1; | |||||
m_top+=1; | |||||
} | |||||
n_top += 2; | |||||
} | |||||
// -- tails for N=1 | |||||
if( N & 1 ) { | |||||
gvl = __riscv_vsetvl_e64m1(4); | |||||
m_top = 0; | |||||
for (BLASLONG i=0; i<M/8; i+=1) { | |||||
BLASLONG ai=m_top*K; | |||||
BLASLONG bi=n_top*K; | |||||
double B0 = B[bi+0]; | |||||
bi += 1; | |||||
vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl ); | |||||
vfloat64m1_t A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl ); | |||||
ai += 8; | |||||
vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl); | |||||
vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A1, B0, gvl); | |||||
for(BLASLONG k=1; k<K; k++) { | |||||
B0 = B[bi+0]; | |||||
bi += 1; | |||||
A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl ); | |||||
A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl ); | |||||
ai += 8; | |||||
result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl); | |||||
result1 = __riscv_vfmacc_vf_f64m1( result1, B0, A1, gvl); | |||||
} | |||||
BLASLONG ci=n_top*ldc+m_top; | |||||
vfloat64m1_t c0 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl; | |||||
vfloat64m1_t c1 = __riscv_vle64_v_f64m1( &C[ci], gvl); | |||||
c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, gvl ); | |||||
c1 = __riscv_vfmacc_vf_f64m1( c1, alpha, result1, gvl ); | |||||
ci=n_top*ldc+m_top; | |||||
__riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += gvl; | |||||
__riscv_vse64_v_f64m1( &C[ci], c1, gvl); | |||||
m_top += 8; | |||||
} | |||||
if( M & 4 ) { | |||||
gvl = __riscv_vsetvl_e64m1(4); | |||||
BLASLONG ai=m_top*K; | |||||
BLASLONG bi=n_top*K; | |||||
double B0 = B[bi+0]; | |||||
bi += 1; | |||||
vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl ); | |||||
ai += 4; | |||||
vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl); | |||||
for(BLASLONG k=1; k<K; k++) { | |||||
B0 = B[bi+0]; | |||||
bi += 1; | |||||
A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl ); | |||||
ai += 4; | |||||
result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl); | |||||
} | |||||
BLASLONG ci=n_top*ldc+m_top; | |||||
vfloat64m1_t c0 = __riscv_vle64_v_f64m1( &C[ci], gvl); | |||||
c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, gvl ); | |||||
ci=n_top*ldc+m_top; | |||||
__riscv_vse64_v_f64m1( &C[ci], c0, gvl); | |||||
m_top += 4; | |||||
} | |||||
if( M & 2 ) { | |||||
double result0 = 0; | |||||
double result1 = 0; | |||||
BLASLONG ai=m_top*K; | |||||
BLASLONG bi=n_top*K; | |||||
for(BLASLONG k=0; k<K; k++) { | |||||
result0+=A[ai+0]*B[bi+0]; | |||||
result1+=A[ai+1]*B[bi+0]; | |||||
ai+=2; | |||||
bi+=1; | |||||
} | |||||
BLASLONG ci=n_top*ldc+m_top; | |||||
C[ci+0*ldc+0] += alpha * result0; | |||||
C[ci+0*ldc+1] += alpha * result1; | |||||
m_top+=2; | |||||
} | |||||
if( M & 1 ) { | |||||
double result0 = 0; | |||||
BLASLONG ai=m_top*K; | |||||
BLASLONG bi=n_top*K; | |||||
for(BLASLONG k=0; k<K; k++) { | |||||
result0+=A[ai+0]*B[bi+0]; | |||||
ai+=1; | |||||
bi+=1; | |||||
} | |||||
BLASLONG ci=n_top*ldc+m_top; | |||||
C[ci+0*ldc+0] += alpha * result0; | |||||
m_top+=1; | |||||
} | |||||
n_top += 1; | |||||
} | |||||
return 0; | |||||
} | |||||
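Each of the `M & 4`, `M & 2`, `M & 1` blocks above runs at most once, peeling the leftover rows in decreasing powers of two after the unrolled main loop. A small illustrative sketch of the row accounting (hypothetical helper, bodies elided; not part of the kernel):

/* Illustrative only: how the M tails cover the rows left by the main loop. */
void m_tail_walk(long M)
{
    long m_top = (M / 8) * 8;   /* rows consumed by the unrolled main loop */
    if (M & 4) m_top += 4;      /* one 4-row vector block */
    if (M & 2) m_top += 2;      /* one 2-row scalar block */
    if (M & 1) m_top += 1;      /* one 1-row scalar block */
    /* m_top == M here: every row is handled exactly once */
}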
@@ -46,7 +46,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
BLASLONG ix=0,iy=0; | BLASLONG ix=0,iy=0; | ||||
double dot = 0.0 ; | double dot = 0.0 ; | ||||
if ( n < 0 ) return(dot); | |||||
if ( n < 1 ) return(dot); | |||||
while(i < n) | while(i < n) | ||||
{ | { | ||||
@@ -27,31 +27,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#include "common.h" | #include "common.h" | ||||
#if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
#define VSETVL(n) vsetvl_e32m4(n) | |||||
#define VSETVL_MAX vsetvlmax_e32m1() | |||||
#define VSETVL(n) __riscv_vsetvl_e32m4(n) | |||||
#define VSETVL_MAX __riscv_vsetvlmax_e32m1() | |||||
#define FLOAT_V_T vfloat32m4_t | #define FLOAT_V_T vfloat32m4_t | ||||
#define FLOAT_V_T_M1 vfloat32m1_t | #define FLOAT_V_T_M1 vfloat32m1_t | ||||
#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 | |||||
#define VLEV_FLOAT vle32_v_f32m4 | |||||
#define VLSEV_FLOAT vlse32_v_f32m4 | |||||
#define VFREDSUM_FLOAT vfredosum_vs_f32m4_f32m1 | |||||
#define VFMACCVV_FLOAT vfmacc_vv_f32m4 | |||||
#define VFMVVF_FLOAT vfmv_v_f_f32m4 | |||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||||
#define VFDOTVV_FLOAT vfdot_vv_f32m4 | |||||
#define VLEV_FLOAT __riscv_vle32_v_f32m4 | |||||
#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 | |||||
#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 | |||||
#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 | |||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 | |||||
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 | |||||
#define VFDOTVV_FLOAT __riscv_vfdot_vv_f32m4 | |||||
#else | #else | ||||
#define VSETVL(n) vsetvl_e64m4(n) | |||||
#define VSETVL_MAX vsetvlmax_e64m1() | |||||
#define VSETVL(n) __riscv_vsetvl_e64m4(n) | |||||
#define VSETVL_MAX __riscv_vsetvlmax_e64m1() | |||||
#define FLOAT_V_T vfloat64m4_t | #define FLOAT_V_T vfloat64m4_t | ||||
#define FLOAT_V_T_M1 vfloat64m1_t | #define FLOAT_V_T_M1 vfloat64m1_t | ||||
#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 | |||||
#define VLEV_FLOAT vle64_v_f64m4 | |||||
#define VLSEV_FLOAT vlse64_v_f64m4 | |||||
#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 | |||||
#define VFMACCVV_FLOAT vfmacc_vv_f64m4 | |||||
#define VFMVVF_FLOAT vfmv_v_f_f64m4 | |||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||||
#define VFDOTVV_FLOAT vfdot_vv_f64m4 | |||||
#define VLEV_FLOAT __riscv_vle64_v_f64m4 | |||||
#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 | |||||
#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 | |||||
#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 | |||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 | |||||
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 | |||||
#define VFDOTVV_FLOAT __riscv_vfdot_vv_f64m4 | |||||
#endif | #endif | ||||
#if defined(DSDOT) | #if defined(DSDOT) | ||||
@@ -63,7 +61,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
BLASLONG i=0, j=0; | BLASLONG i=0, j=0; | ||||
double dot = 0.0 ; | double dot = 0.0 ; | ||||
if ( n < 0 ) return(dot); | |||||
if ( n < 1 ) return(dot); | |||||
FLOAT_V_T vr, vx, vy; | FLOAT_V_T vr, vx, vy; | ||||
unsigned int gvl = 0; | unsigned int gvl = 0; | ||||
@@ -82,8 +80,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
j += gvl; | j += gvl; | ||||
} | } | ||||
if(j > 0){ | if(j > 0){ | ||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
dot += (double)VFMVFS_FLOAT(v_res); | |||||
v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); | |||||
dot += (double)EXTRACT_FLOAT(v_res); | |||||
} | } | ||||
//tail | //tail | ||||
if(j < n){ | if(j < n){ | ||||
@@ -93,13 +91,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); | FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); | ||||
//vr = VFDOTVV_FLOAT(vx, vy, gvl); | //vr = VFDOTVV_FLOAT(vx, vy, gvl); | ||||
vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); | vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); | ||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
dot += (double)VFMVFS_FLOAT(v_res); | |||||
v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); | |||||
dot += (double)EXTRACT_FLOAT(v_res); | |||||
} | } | ||||
}else if(inc_y == 1){ | }else if(inc_y == 1){ | ||||
gvl = VSETVL(n); | gvl = VSETVL(n); | ||||
vr = VFMVVF_FLOAT(0, gvl); | vr = VFMVVF_FLOAT(0, gvl); | ||||
int stride_x = inc_x * sizeof(FLOAT); | |||||
BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||||
for(i=0,j=0; i<n/gvl; i++){ | for(i=0,j=0; i<n/gvl; i++){ | ||||
vx = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | vx = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | ||||
vy = VLEV_FLOAT(&y[j], gvl); | vy = VLEV_FLOAT(&y[j], gvl); | ||||
@@ -107,9 +105,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
j += gvl; | j += gvl; | ||||
} | } | ||||
if(j > 0){ | if(j > 0){ | ||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
dot += (double)VFMVFS_FLOAT(v_res); | |||||
v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); | |||||
dot += (double)EXTRACT_FLOAT(v_res); | |||||
} | } | ||||
//tail | //tail | ||||
if(j < n){ | if(j < n){ | ||||
@@ -119,14 +116,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); | FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); | ||||
//vr = VFDOTVV_FLOAT(vx, vy, gvl); | //vr = VFDOTVV_FLOAT(vx, vy, gvl); | ||||
vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); | vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); | ||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
dot += (double)VFMVFS_FLOAT(v_res); | |||||
v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); | |||||
dot += (double)EXTRACT_FLOAT(v_res); | |||||
} | } | ||||
}else if(inc_x == 1){ | }else if(inc_x == 1){ | ||||
gvl = VSETVL(n); | gvl = VSETVL(n); | ||||
vr = VFMVVF_FLOAT(0, gvl); | vr = VFMVVF_FLOAT(0, gvl); | ||||
int stride_y = inc_y * sizeof(FLOAT); | |||||
BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||||
for(i=0,j=0; i<n/gvl; i++){ | for(i=0,j=0; i<n/gvl; i++){ | ||||
vx = VLEV_FLOAT(&x[j], gvl); | vx = VLEV_FLOAT(&x[j], gvl); | ||||
vy = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl); | vy = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl); | ||||
@@ -134,9 +130,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
j += gvl; | j += gvl; | ||||
} | } | ||||
if(j > 0){ | if(j > 0){ | ||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
dot += (double)VFMVFS_FLOAT(v_res); | |||||
v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); | |||||
dot += (double)EXTRACT_FLOAT(v_res); | |||||
} | } | ||||
//tail | //tail | ||||
if(j < n){ | if(j < n){ | ||||
@@ -146,15 +141,14 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); | FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); | ||||
//vr = VFDOTVV_FLOAT(vx, vy, gvl); | //vr = VFDOTVV_FLOAT(vx, vy, gvl); | ||||
vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); | vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); | ||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
dot += (double)VFMVFS_FLOAT(v_res); | |||||
v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); | |||||
dot += (double)EXTRACT_FLOAT(v_res); | |||||
} | } | ||||
}else{ | }else{ | ||||
gvl = VSETVL(n); | gvl = VSETVL(n); | ||||
vr = VFMVVF_FLOAT(0, gvl); | vr = VFMVVF_FLOAT(0, gvl); | ||||
int stride_x = inc_x * sizeof(FLOAT); | |||||
int stride_y = inc_y * sizeof(FLOAT); | |||||
BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||||
BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||||
for(i=0,j=0; i<n/gvl; i++){ | for(i=0,j=0; i<n/gvl; i++){ | ||||
vx = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | vx = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | ||||
vy = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl); | vy = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl); | ||||
@@ -162,9 +156,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
j += gvl; | j += gvl; | ||||
} | } | ||||
if(j > 0){ | if(j > 0){ | ||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
dot += (double)VFMVFS_FLOAT(v_res); | |||||
v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); | |||||
dot += (double)EXTRACT_FLOAT(v_res); | |||||
} | } | ||||
//tail | //tail | ||||
if(j < n){ | if(j < n){ | ||||
@@ -174,9 +167,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); | FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); | ||||
//vr = VFDOTVV_FLOAT(vx, vy, gvl); | //vr = VFDOTVV_FLOAT(vx, vy, gvl); | ||||
vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); | vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); | ||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
dot += (double)VFMVFS_FLOAT(v_res); | |||||
v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); | |||||
dot += (double)EXTRACT_FLOAT(v_res); | |||||
} | } | ||||
} | } | ||||
return(dot); | return(dot); | ||||
@@ -27,21 +27,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#include "common.h" | #include "common.h" | ||||
#if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
#define VSETVL(n) vsetvl_e32m4(n) | |||||
#define VSETVL(n) __riscv_vsetvl_e32m4(n) | |||||
#define FLOAT_V_T vfloat32m4_t | #define FLOAT_V_T vfloat32m4_t | ||||
#define VLEV_FLOAT vle32_v_f32m4 | |||||
#define VLSEV_FLOAT vlse32_v_f32m4 | |||||
#define VSEV_FLOAT vse32_v_f32m4 | |||||
#define VSSEV_FLOAT vsse32_v_f32m4 | |||||
#define VFMACCVF_FLOAT vfmacc_vf_f32m4 | |||||
#define VLEV_FLOAT __riscv_vle32_v_f32m4 | |||||
#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 | |||||
#define VSEV_FLOAT __riscv_vse32_v_f32m4 | |||||
#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 | |||||
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 | |||||
#else | #else | ||||
#define VSETVL(n) vsetvl_e64m4(n) | |||||
#define VSETVL(n) __riscv_vsetvl_e64m4(n) | |||||
#define FLOAT_V_T vfloat64m4_t | #define FLOAT_V_T vfloat64m4_t | ||||
#define VLEV_FLOAT vle64_v_f64m4 | |||||
#define VLSEV_FLOAT vlse64_v_f64m4 | |||||
#define VSEV_FLOAT vse64_v_f64m4 | |||||
#define VSSEV_FLOAT vsse64_v_f64m4 | |||||
#define VFMACCVF_FLOAT vfmacc_vf_f64m4 | |||||
#define VLEV_FLOAT __riscv_vle64_v_f64m4 | |||||
#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 | |||||
#define VSEV_FLOAT __riscv_vse64_v_f64m4 | |||||
#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 | |||||
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 | |||||
#endif | #endif | ||||
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | ||||
@@ -27,107 +27,102 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#include "common.h" | #include "common.h" | ||||
#if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
#define VSETVL(n) vsetvl_e32m4(n) | |||||
#define VSETVL_MAX vsetvlmax_e32m1() | |||||
#define FLOAT_V_T vfloat32m4_t | |||||
#define VSETVL(n) __riscv_vsetvl_e32m2(n) | |||||
#define FLOAT_V_T vfloat32m2_t | |||||
#define FLOAT_V_T_M1 vfloat32m1_t | #define FLOAT_V_T_M1 vfloat32m1_t | ||||
#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 | |||||
#define VLEV_FLOAT vle32_v_f32m4 | |||||
#define VLSEV_FLOAT vlse32_v_f32m4 | |||||
#define VFREDSUM_FLOAT vfredosum_vs_f32m4_f32m1 | |||||
#define VFMACCVV_FLOAT vfmacc_vv_f32m4 | |||||
#define VFMVVF_FLOAT vfmv_v_f_f32m4 | |||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||||
#define VFDOTVV_FLOAT vfdot_vv_f32m4 | |||||
#define VFMULVV_FLOAT vfmul_vv_f32m4 | |||||
#define VLEV_FLOAT __riscv_vle32_v_f32m2 | |||||
#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 | |||||
#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m2_f32m1 | |||||
#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m2 | |||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2 | |||||
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 | |||||
#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m2 | |||||
#define xint_t int | |||||
#else | #else | ||||
#define VSETVL(n) vsetvl_e64m4(n) | |||||
#define VSETVL_MAX vsetvlmax_e64m1() | |||||
#define FLOAT_V_T vfloat64m4_t | |||||
#define VSETVL(n) __riscv_vsetvl_e64m2(n) | |||||
#define FLOAT_V_T vfloat64m2_t | |||||
#define FLOAT_V_T_M1 vfloat64m1_t | #define FLOAT_V_T_M1 vfloat64m1_t | ||||
#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 | |||||
#define VLEV_FLOAT vle64_v_f64m4 | |||||
#define VLSEV_FLOAT vlse64_v_f64m4 | |||||
#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 | |||||
#define VFMACCVV_FLOAT vfmacc_vv_f64m4 | |||||
#define VFMVVF_FLOAT vfmv_v_f_f64m4 | |||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||||
#define VFDOTVV_FLOAT vfdot_vv_f64m4 | |||||
#define VFMULVV_FLOAT vfmul_vv_f64m4 | |||||
#define VLEV_FLOAT __riscv_vle64_v_f64m2 | |||||
#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 | |||||
#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m2_f64m1 | |||||
#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m2 | |||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 | |||||
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 | |||||
#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m2 | |||||
#define xint_t long long | |||||
#endif | #endif | ||||
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | ||||
{ | { | ||||
BLASLONG i = 0, j = 0, k = 0; | |||||
BLASLONG ix = 0, iy = 0; | |||||
FLOAT *a_ptr = a; | |||||
BLASLONG i = 0, j = 0, k = 0; | |||||
BLASLONG ix = 0, iy = 0; | |||||
FLOAT *a_ptr = a; | |||||
FLOAT temp; | FLOAT temp; | ||||
FLOAT_V_T va, vr, vx; | FLOAT_V_T va, vr, vx; | ||||
unsigned int gvl = 0; | |||||
FLOAT_V_T_M1 v_res, v_z0; | |||||
gvl = VSETVL_MAX; | |||||
v_res = VFMVVF_FLOAT_M1(0, gvl); | |||||
v_z0 = VFMVVF_FLOAT_M1(0, gvl); | |||||
BLASLONG gvl = 0; | |||||
FLOAT_V_T_M1 v_res; | |||||
if(inc_x == 1){ | if(inc_x == 1){ | ||||
for(i = 0; i < n; i++){ | for(i = 0; i < n; i++){ | ||||
v_res = VFMVVF_FLOAT_M1(0, 1); | |||||
gvl = VSETVL(m); | gvl = VSETVL(m); | ||||
j = 0; | j = 0; | ||||
vr = VFMVVF_FLOAT(0, gvl); | vr = VFMVVF_FLOAT(0, gvl); | ||||
for(k = 0; k < m/gvl; k++){ | for(k = 0; k < m/gvl; k++){ | ||||
va = VLEV_FLOAT(&a_ptr[j], gvl); | va = VLEV_FLOAT(&a_ptr[j], gvl); | ||||
vx = VLEV_FLOAT(&x[j], gvl); | vx = VLEV_FLOAT(&x[j], gvl); | ||||
vr = VFMACCVV_FLOAT(vr, va, vx, gvl); | |||||
vr = VFMULVV_FLOAT(va, vx, gvl); // could vfmacc here and reduce outside loop | |||||
v_res = VFREDSUM_FLOAT(vr, v_res, gvl); // but that reordering diverges far enough from scalar path to make tests fail | |||||
j += gvl; | j += gvl; | ||||
} | } | ||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
temp = (FLOAT)VFMVFS_FLOAT(v_res); | |||||
if(j < m){ | if(j < m){ | ||||
gvl = VSETVL(m-j); | gvl = VSETVL(m-j); | ||||
va = VLEV_FLOAT(&a_ptr[j], gvl); | va = VLEV_FLOAT(&a_ptr[j], gvl); | ||||
vx = VLEV_FLOAT(&x[j], gvl); | vx = VLEV_FLOAT(&x[j], gvl); | ||||
vr = VFMULVV_FLOAT(va, vx, gvl); | vr = VFMULVV_FLOAT(va, vx, gvl); | ||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
temp += (FLOAT)VFMVFS_FLOAT(v_res); | |||||
v_res = VFREDSUM_FLOAT(vr, v_res, gvl); | |||||
} | } | ||||
temp = (FLOAT)EXTRACT_FLOAT(v_res); | |||||
y[iy] += alpha * temp; | y[iy] += alpha * temp; | ||||
iy += inc_y; | iy += inc_y; | ||||
a_ptr += lda; | a_ptr += lda; | ||||
} | } | ||||
}else{ | }else{ | ||||
BLASLONG stride_x = inc_x * sizeof(FLOAT); | BLASLONG stride_x = inc_x * sizeof(FLOAT); | ||||
for(i = 0; i < n; i++){ | for(i = 0; i < n; i++){ | ||||
v_res = VFMVVF_FLOAT_M1(0, 1); | |||||
gvl = VSETVL(m); | gvl = VSETVL(m); | ||||
BLASLONG inc_xv = inc_x * gvl; | |||||
j = 0; | j = 0; | ||||
ix = 0; | ix = 0; | ||||
vr = VFMVVF_FLOAT(0, gvl); | vr = VFMVVF_FLOAT(0, gvl); | ||||
for(k = 0; k < m/gvl; k++){ | for(k = 0; k < m/gvl; k++){ | ||||
va = VLEV_FLOAT(&a_ptr[j], gvl); | va = VLEV_FLOAT(&a_ptr[j], gvl); | ||||
vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); | vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); | ||||
vr = VFMACCVV_FLOAT(vr, va, vx, gvl); | |||||
vr = VFMULVV_FLOAT(va, vx, gvl); | |||||
v_res = VFREDSUM_FLOAT(vr, v_res, gvl); | |||||
j += gvl; | j += gvl; | ||||
ix += inc_xv; | |||||
ix += inc_x * gvl; | |||||
} | } | ||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
temp = (FLOAT)VFMVFS_FLOAT(v_res); | |||||
if(j < m){ | if(j < m){ | ||||
gvl = VSETVL(m-j); | gvl = VSETVL(m-j); | ||||
va = VLEV_FLOAT(&a_ptr[j], gvl); | va = VLEV_FLOAT(&a_ptr[j], gvl); | ||||
vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); | vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); | ||||
vr = VFMULVV_FLOAT(va, vx, gvl); | vr = VFMULVV_FLOAT(va, vx, gvl); | ||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
temp += (FLOAT)VFMVFS_FLOAT(v_res); | |||||
v_res = VFREDSUM_FLOAT(vr, v_res, gvl); | |||||
} | } | ||||
temp = (FLOAT)EXTRACT_FLOAT(v_res); | |||||
y[iy] += alpha * temp; | y[iy] += alpha * temp; | ||||
iy += inc_y; | iy += inc_y; | ||||
a_ptr += lda; | a_ptr += lda; | ||||
} | } | ||||
} | } | ||||
return(0); | return(0); | ||||
} | } | ||||
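The dot and gemv reductions above all follow the same strip-mine-and-fold shape: multiply a strip, fold it into a one-element accumulator with vfredusum, extract once at the end. A minimal standalone sketch of that pattern (float, LMUL=2, unit stride assumed; not the library's exact code):

#include <riscv_vector.h>

/* Sketch: strip-mined dot product folding each strip into a scalar accumulator. */
static float strip_dot(const float *x, const float *y, long n)
{
    vfloat32m1_t v_res = __riscv_vfmv_v_f_f32m1(0.0f, 1);  /* running scalar sum */
    long j = 0;
    while (j < n) {
        size_t gvl = __riscv_vsetvl_e32m2((size_t)(n - j));
        vfloat32m2_t vx = __riscv_vle32_v_f32m2(&x[j], gvl);
        vfloat32m2_t vy = __riscv_vle32_v_f32m2(&y[j], gvl);
        vfloat32m2_t vr = __riscv_vfmul_vv_f32m2(vx, vy, gvl);
        /* fold this strip into v_res; vfredusum seeds from the previous total */
        v_res = __riscv_vfredusum_vs_f32m2_f32m1(vr, v_res, gvl);
        j += (long)gvl;
    }
    return __riscv_vfmv_f_s_f32m1_f32(v_res);
}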
@@ -0,0 +1,670 @@ | |||||
#!/usr/bin/python3 | |||||
import sys, os | |||||
import contextlib | |||||
#----------------------------------------------------------------------- | |||||
def ERROR(*args, **kwargs): | |||||
print(*args, file=sys.stderr, **kwargs) | |||||
sys.exit(-1) | |||||
class Target(object): | |||||
def __init__( self, out, mappings, initial_level=0, tab_width=4 ): | |||||
self._level = initial_level | |||||
self._tab_width = tab_width | |||||
self._out = out | |||||
self._mappings = mappings | |||||
@contextlib.contextmanager | |||||
def map( self, **items ): | |||||
old_mappings = self._mappings | |||||
self._mappings = dict(old_mappings, **items) | |||||
yield self._mappings | |||||
self._mappings = old_mappings | |||||
@contextlib.contextmanager | |||||
def block( self, start=None, end=None, **args ): | |||||
with self.map(**args): | |||||
if start is not None: | |||||
self.write()
self.write(start) | |||||
self._level += 1 | |||||
yield self._level | |||||
self._level -= 1 | |||||
if end is not None: | |||||
self.write(end) | |||||
self.write() | |||||
def write( self, fmt=None, *args, **kwargs ): | |||||
if fmt is not None: | |||||
mappings = dict(self._mappings, **kwargs) if kwargs else self._mappings | |||||
self._out(self._indent_str() + fmt.format(*args, **mappings)) | |||||
else: | |||||
self._out("") | |||||
def _indent_str( self ): | |||||
return ' ' * (self._level * self._tab_width) | |||||
#----------------------------------------------------------------------- | |||||
def generate_trmm_block( dest ): | |||||
dest.write("{index_type} pass_K = K;") | |||||
dest.write("#ifdef LEFT") | |||||
with dest.block(): | |||||
dest.write("{index_type} off = offset + m_top;") | |||||
dest.write("#else") | |||||
with dest.block(): | |||||
dest.write("{index_type} off = -offset + n_top;") | |||||
dest.write("#endif") | |||||
dest.write("#ifdef BACKWARDS") | |||||
with dest.block(): | |||||
dest.write("ai += off*{M}{elt_size};") | |||||
dest.write("bi += off*{N}{elt_size};") | |||||
dest.write("pass_K -= off;") | |||||
dest.write("#else") | |||||
with dest.block(): | |||||
dest.write("#ifdef LEFT") | |||||
with dest.block(): | |||||
dest.write("pass_K = off + {M};") | |||||
dest.write("#else") | |||||
with dest.block(): | |||||
dest.write("pass_K = off + {N};") | |||||
dest.write("#endif") | |||||
dest.write("#endif") | |||||
#----------------------------------------------------------------------- | |||||
def generate_gemm_kernel_inner_real( settings, dest, M, N, vlen, a_regs ): | |||||
TRMM = (settings['op'].value == 'trmm') | |||||
narrow_result = (settings['param_precision'].value != 'double') and settings['force_acc_double'].value | |||||
with dest.map( | |||||
M=M, | |||||
N=N, | |||||
): | |||||
dest.write("{index_type} ai=m_top*K{elt_size};") | |||||
dest.write("{index_type} bi=n_top*K{elt_size};") | |||||
if TRMM: | |||||
generate_trmm_block( dest ) | |||||
for i in range(N): | |||||
dest.write("{param_scalar_t} B{i} = B[bi+{i}];", i=i) | |||||
dest.write("bi += {N};") | |||||
dest.write() | |||||
for i in range(a_regs): | |||||
dest.write("{param_vector_t} A{i} = {VLEV}( &A[ai+{i}*gvl], gvl );", i=i) | |||||
dest.write("ai += {M};") | |||||
dest.write() | |||||
for j in range(N): | |||||
for i in range(a_regs): | |||||
dest.write("{acc_vector_t} result{dest} = {VMUL_TO_ACC}( A{i}, B{j}, gvl);", dest=j*a_regs+i, i=i, j=j) | |||||
with dest.block("for({index_type} k=1; k<{Kend}; k++) {{", "}}", Kend=('pass_K' if TRMM else 'K')): | |||||
for i in range(N): | |||||
dest.write("B{i} = B[bi+{i}];", i=i ) | |||||
dest.write("bi += {N};") | |||||
dest.write() | |||||
for i in range(a_regs): | |||||
dest.write("A{i} = {VLEV}( &A[ai+{i}*gvl], gvl );", i=i) | |||||
dest.write("ai += {M};") | |||||
dest.write() | |||||
for j in range(N): | |||||
for i in range(a_regs): | |||||
dest.write("result{dest} = {VMACC_TO_ACC}( result{dest}, B{j}, A{i}, gvl);", dest= j*a_regs+i, j=j, i=i ) | |||||
dest.write() | |||||
dest.write("{index_type} ci=n_top*ldc+m_top;") | |||||
dest.write() | |||||
if narrow_result: | |||||
for j in range(N): | |||||
for i in range(a_regs): | |||||
dest.write("{param_vector_t} narrowed{idx} = {VFNCVT}( result{idx}, gvl );", idx=j*a_regs+i) | |||||
if not TRMM: | |||||
for j in range(N): | |||||
for i in range(a_regs): | |||||
idx = j*a_regs+i | |||||
increment = ' ci += ldc-gvl*{};'.format(a_regs-1) if (i == a_regs-1) else ' ci += gvl;' | |||||
if idx == N*a_regs-1: | |||||
increment = '' | |||||
dest.write("{param_vector_t} c{idx} = {VLEV}( &C[ci], gvl);{increment}", idx=idx, increment=increment) | |||||
if narrow_result: | |||||
for j in range(N): | |||||
for i in range(a_regs): | |||||
idx = j*a_regs+i | |||||
if TRMM: | |||||
dest.write("{param_vector_t} c{idx} = {VFMUL}( narrowed{idx}, alpha, gvl );", idx=idx) | |||||
else: | |||||
dest.write("c{idx} = {VFMACC}( c{idx}, alpha, narrowed{idx}, gvl );", idx=idx) | |||||
else: | |||||
for j in range(N): | |||||
for i in range(a_regs): | |||||
idx = j*a_regs+i | |||||
if TRMM: | |||||
dest.write("{param_vector_t} c{idx} = {VFMUL}( result{idx}, alpha, gvl );", idx=idx) | |||||
else: | |||||
dest.write("c{idx} = {VFMACC}( c{idx}, alpha, result{idx}, gvl );", idx=idx) | |||||
if not TRMM: | |||||
dest.write() | |||||
dest.write("ci=n_top*ldc+m_top;") | |||||
dest.write() | |||||
for j in range(N): | |||||
for i in range(a_regs): | |||||
idx = j*a_regs+i | |||||
increment = ' ci += ldc-gvl*{};'.format(a_regs-1) if (i == a_regs-1) else ' ci += gvl;' | |||||
if idx == N*a_regs-1: | |||||
increment = '' | |||||
dest.write("{VSEV}( &C[ci], c{idx}, gvl);{increment}", idx=idx, increment=increment) | |||||
#----------------------------------------------------------------------- | |||||
def generate_gemm_kernel_inner_complex( settings, dest, M, N, vlen, a_regs ): | |||||
TRMM = (settings['op'].value == 'trmm') | |||||
narrow_result = (settings['param_precision'].value != 'double') and settings['force_acc_double'].value | |||||
if narrow_result: | |||||
raise RuntimeError("wide accumulator not supported for generated complex kernels") | |||||
# we could, but we run out of registers really really fast | |||||
with dest.map( | |||||
M=M, | |||||
N=N, | |||||
): | |||||
dest.write("{index_type} ai=m_top*K*2;") | |||||
dest.write("{index_type} bi=n_top*K*2;") | |||||
if TRMM: | |||||
generate_trmm_block( dest ) | |||||
for i in range(N): | |||||
dest.write("{param_scalar_t} B{i}r = B[bi+{i}*2+0];", i=i) | |||||
dest.write("{param_scalar_t} B{i}i = B[bi+{i}*2+1];", i=i) | |||||
dest.write("bi += {N}*2;") | |||||
dest.write() | |||||
for i in range(a_regs): | |||||
dest.write("{param_vector_t} A{i}r = {VLSEV}( &A[ai+{i}*gvl*2], sizeof(FLOAT)*2, gvl );", i=i) | |||||
dest.write("{param_vector_t} A{i}i = {VLSEV}( &A[ai+{i}*gvl*2+1], sizeof(FLOAT)*2, gvl );", i=i) | |||||
dest.write("ai += {M}*2;") | |||||
dest.write() | |||||
accumulation_regs = a_regs * N * settings['LMUL_ACC'].value | |||||
dest.write("// {a_regs} vector regs to hold A array contents, {accumulation_regs} regs to hold values accumulated over k", | |||||
a_regs=a_regs*2, accumulation_regs=accumulation_regs*2 | |||||
) | |||||
pass_regs = (accumulation_regs + a_regs)*2 | |||||
tmp_regs = 32-pass_regs | |||||
if tmp_regs < 2: | |||||
raise RuntimeError("Complex kernel would use too many registers!") | |||||
dest.write("// leaving {tmp_regs} vector registers for temporaries", tmp_regs=tmp_regs) | |||||
tmp_unroll_i = min(tmp_regs, a_regs) | |||||
tmp_unroll_j = N | |||||
while tmp_unroll_j > 1 and (tmp_regs/(tmp_unroll_i*2)) < tmp_unroll_j: | |||||
tmp_unroll_j = int(tmp_unroll_j / 2) | |||||
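# e.g. float, VLEN=256, M=16, N=4: a_regs=2, accumulation_regs=8, so pass_regs=20
# and tmp_regs=12; tmp_unroll_i=2 and 12/(2*2)=3 < 4 halves tmp_unroll_j to 2,
# giving 2x2 blocks of temporaries (8 registers) reused within the 12 available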
if tmp_unroll_i < a_regs or tmp_unroll_j < N: | |||||
dest.write("// performing {ops} operations between reuses of temporaries", ops=tmp_unroll_j*tmp_unroll_i) | |||||
for tj in range(0, N, tmp_unroll_j): | |||||
for ti in range(0, a_regs, tmp_unroll_i): | |||||
for j in range(tj, tj+tmp_unroll_j): | |||||
for i in range(ti, ti+tmp_unroll_i): | |||||
with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j): | |||||
if ti == 0 and tj==0: | |||||
dest.write("{acc_vector_t} tmp{tmp}r = {VMUL_TO_ACC}( A{i}i, B{j}i, gvl);") | |||||
dest.write("{acc_vector_t} tmp{tmp}i = {VMUL_TO_ACC}( A{i}r, B{j}i, gvl);") | |||||
else: | |||||
dest.write("tmp{tmp}r = {VMUL_TO_ACC}( A{i}i, B{j}i, gvl);") | |||||
dest.write("tmp{tmp}i = {VMUL_TO_ACC}( A{i}r, B{j}i, gvl);") | |||||
for j in range(tj, tj+tmp_unroll_j): | |||||
for i in range(ti, ti+tmp_unroll_i): | |||||
with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j): | |||||
dest.write("tmp{tmp}r = VFMACC_RR( tmp{tmp}r, B{j}r, A{i}r, gvl);") | |||||
dest.write("tmp{tmp}i = VFMACC_RI( tmp{tmp}i, B{j}r, A{i}i, gvl);") | |||||
for j in range(tj, tj+tmp_unroll_j): | |||||
for i in range(ti, ti+tmp_unroll_i): | |||||
with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j): | |||||
dest.write("{acc_vector_t} ACC{dest}r = tmp{tmp}r;") | |||||
dest.write("{acc_vector_t} ACC{dest}i = tmp{tmp}i;") | |||||
with dest.block("for({index_type} k=1; k<{Kend}; k++) {{", "}}", Kend=('pass_K' if TRMM else 'K')): | |||||
for i in range(N): | |||||
dest.write("B{i}r = B[bi+{i}*2+0];", i=i) | |||||
dest.write("B{i}i = B[bi+{i}*2+1];", i=i) | |||||
dest.write("bi += {N}*2;") | |||||
dest.write() | |||||
for i in range(a_regs): | |||||
dest.write("A{i}r = {VLSEV}( &A[ai+{i}*gvl*2], sizeof(FLOAT)*2, gvl );", i=i) | |||||
dest.write("A{i}i = {VLSEV}( &A[ai+{i}*gvl*2+1], sizeof(FLOAT)*2, gvl );", i=i) | |||||
dest.write("ai += {M}*2;") | |||||
dest.write() | |||||
for tj in range(0, N, tmp_unroll_j): | |||||
for ti in range(0, a_regs, tmp_unroll_i): | |||||
# note the values in tmp{tmp}* are frequently of similar magnitude and opposite sign | |||||
# so accumulating them directly to ACC would lose precision when ACC is larger | |||||
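# (for NN-type kernels VFMACC_RR is vfmsac, so tmp{tmp}r becomes Ar*Br - Ai*Bi;
# forming that difference first preserves the cancellation before the vfadd into ACC)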
for j in range(tj, tj+tmp_unroll_j): | |||||
for i in range(ti, ti+tmp_unroll_i): | |||||
with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j): | |||||
dest.write("tmp{tmp}r = {VMUL_TO_ACC}( A{i}i, B{j}i, gvl);") | |||||
dest.write("tmp{tmp}i = {VMUL_TO_ACC}( A{i}r, B{j}i, gvl);") | |||||
for j in range(tj, tj+tmp_unroll_j): | |||||
for i in range(ti, ti+tmp_unroll_i): | |||||
with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j): | |||||
dest.write("tmp{tmp}r = VFMACC_RR( tmp{tmp}r, B{j}r, A{i}r, gvl);") | |||||
dest.write("tmp{tmp}i = VFMACC_RI( tmp{tmp}i, B{j}r, A{i}i, gvl);") | |||||
for j in range(tj, tj+tmp_unroll_j): | |||||
for i in range(ti, ti+tmp_unroll_i): | |||||
with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j): | |||||
dest.write("ACC{dest}r = {__riscv_}vfadd( ACC{dest}r, tmp{tmp}r, gvl);") | |||||
dest.write("ACC{dest}i = {__riscv_}vfadd( ACC{dest}i, tmp{tmp}i, gvl);") | |||||
dest.write() | |||||
dest.write("{index_type} ci=n_top*ldc+m_top;") | |||||
dest.write() | |||||
for j in range(N): | |||||
if TRMM: | |||||
for i in range(a_regs): | |||||
with dest.map(idx=j*a_regs+i): | |||||
dest.write("{param_vector_t} C{idx}r = {__riscv_}vfmul( ACC{idx}r, alphar, gvl );") | |||||
dest.write("{param_vector_t} C{idx}i = {__riscv_}vfmul( ACC{idx}i, alphar, gvl );") | |||||
else: | |||||
for i in range(a_regs): | |||||
idx = j*a_regs+i | |||||
increment = 'ci += ldc-gvl*{};'.format(a_regs-1) if (i == a_regs-1) else ' ci += gvl;' | |||||
if idx == N*a_regs-1: | |||||
increment = '' | |||||
with dest.map(idx=j*a_regs+i, increment=increment): | |||||
dest.write("{param_vector_t} C{idx}r = {VLSEV}( &C[ci*2+0], sizeof(FLOAT)*2, gvl );") | |||||
dest.write("{param_vector_t} C{idx}i = {VLSEV}( &C[ci*2+1], sizeof(FLOAT)*2, gvl );") | |||||
dest.write("{increment}") | |||||
if not TRMM: | |||||
for j in range(N): | |||||
for i in range(a_regs): | |||||
with dest.map(idx=j*a_regs+i): | |||||
dest.write("C{idx}r = {__riscv_}vfmacc( C{idx}r, alphar, ACC{idx}r, gvl );") | |||||
dest.write("C{idx}i = {__riscv_}vfmacc( C{idx}i, alphar, ACC{idx}i, gvl );") | |||||
for j in range(N): | |||||
for i in range(a_regs): | |||||
with dest.map(idx=j*a_regs+i): | |||||
dest.write("C{idx}r = {__riscv_}vfnmsac( C{idx}r, alphai, ACC{idx}i, gvl );") | |||||
dest.write("C{idx}i = {__riscv_}vfmacc ( C{idx}i, alphai, ACC{idx}r, gvl );") | |||||
if not TRMM: | |||||
dest.write() | |||||
dest.write("ci=n_top*ldc+m_top;") | |||||
dest.write() | |||||
for j in range(N): | |||||
for i in range(a_regs): | |||||
idx = j*a_regs+i | |||||
increment = 'ci += ldc-gvl*{};'.format(a_regs-1) if (i == a_regs-1) else ' ci += gvl;' | |||||
if idx == N*a_regs-1: | |||||
increment = '' | |||||
with dest.map(idx=j*a_regs+i, increment=increment): | |||||
dest.write("{VSSEV}( &C[ci*2+0], sizeof(FLOAT)*2, C{idx}r, gvl);") | |||||
dest.write("{VSSEV}( &C[ci*2+1], sizeof(FLOAT)*2, C{idx}i, gvl);") | |||||
dest.write("{increment}") | |||||
#----------------------------------------------------------------------- | |||||
def generate_gemm_kernel( settings, OUTPUT ): | |||||
if settings['conjugate'].value: | |||||
ERROR('conjugate gemm not yet supported') | |||||
is_complex = settings['complex'].value | |||||
generate_gemm_kernel_inner = generate_gemm_kernel_inner_complex if is_complex else generate_gemm_kernel_inner_real | |||||
dest = Target(OUTPUT, { k:str(settings[k].value) for k in settings }) | |||||
M = settings['M'].value | |||||
N = settings['N'].value | |||||
vlenmax = int( settings['reg_width_bits'].value / settings['ELEN_PARAM'].value ) | |||||
a_regs = max(int(M/vlenmax), 1) | |||||
accumulation_regs = a_regs * N * settings['LMUL_ACC'].value | |||||
required_regs = accumulation_regs + a_regs | |||||
if is_complex: | |||||
required_regs = required_regs * 2 + 2 | |||||
dest.write(''' | |||||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) | |||||
#define S0 1 | |||||
#define S1 -1 | |||||
#define S2 1 | |||||
#define S3 1 | |||||
#define VFMACC_RR __riscv_vfmsac{tail_policy} | |||||
#define VFMACC_RI __riscv_vfmacc{tail_policy} | |||||
#endif | |||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) | |||||
#define S0 1 | |||||
#define S1 1 | |||||
#define S2 1 | |||||
#define S3 -1 | |||||
#define VFMACC_RR __riscv_vfmacc{tail_policy} | |||||
#define VFMACC_RI __riscv_vfmsac{tail_policy} | |||||
#endif | |||||
#if defined(RN) || defined(RT) || defined(CN) || defined(CT) | |||||
#define S0 1 | |||||
#define S1 1 | |||||
#define S2 -1 | |||||
#define S3 1 | |||||
#define VFMACC_RR __riscv_vfmacc{tail_policy} | |||||
#define VFMACC_RI __riscv_vfnmsac{tail_policy} | |||||
#endif | |||||
#if defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||||
#define S0 1 | |||||
#define S1 -1 | |||||
#define S2 -1 | |||||
#define S3 -1 | |||||
#define VFMACC_RR __riscv_vfmsac{tail_policy} | |||||
#define VFMACC_RI __riscv_vfnmacc{tail_policy} | |||||
#endif | |||||
'''.format(tail_policy=settings['tail_policy'].value)) | |||||
if required_regs > 32: | |||||
raise Exception("{} vector registers needed during accumulation for unrolling {} x {}{} but only 32 are available".format( | |||||
required_regs, N, M, (" with wide accumulator" if settings['LMUL_ACC'].value > 1 else '') | |||||
)) | |||||
TRMM = (settings['op'].value == 'trmm') | |||||
if TRMM: | |||||
with dest.block("#if defined(LEFT) != defined(TRANSA)", "#endif"): | |||||
dest.write("#define BACKWARDS") | |||||
dest.write("int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, {alpha}, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc{trmm})", | |||||
alpha = ('FLOAT alphar, FLOAT alphai' if is_complex else 'FLOAT alpha'), | |||||
trmm = (', BLASLONG offset' if TRMM else '') | |||||
) | |||||
with dest.block("{{", "}}", elt_size='*2' if is_complex else ''): | |||||
if settings['trace'].value: | |||||
dest.write("printf(\"\\n\\nENTRY: %s(%d) M %d N %d K %d ldc %d\\n\", __FILE__, __LINE__, M, N, K, ldc);") | |||||
dest.write("{index_type} gvl = 0;") | |||||
dest.write("{index_type} m_top = 0;") | |||||
dest.write("{index_type} n_top = 0;") | |||||
dest.write() | |||||
dest.write() | |||||
dest.write("// -- MAIN PASS") | |||||
with dest.block("for ({index_type} j=0; j<N/{N}; j+=1) {{", "}}"): | |||||
dest.write("m_top = 0;") | |||||
dest.write("{index_type} gvl = {VSETVL}({vlenmax});", vlenmax=min(vlenmax,max(int(M/a_regs),1))) | |||||
dest.write() | |||||
with dest.block("for ({index_type} i=0; i<M/{M}; i+=1) {{", "}}"): | |||||
generate_gemm_kernel_inner( settings, dest, M, N, vlenmax, a_regs ) | |||||
dest.write( "m_top += {M};" ) | |||||
dest.write() | |||||
dest.write() | |||||
dest.write("// -- tails for main pass") | |||||
generate_M_tails( dest, settings, M, N ) | |||||
dest.write( "n_top += {N};" ) | |||||
N_tail = int(N/2) | |||||
while( N_tail > 0 ): | |||||
with dest.map(N=N_tail): | |||||
dest.write() | |||||
dest.write() | |||||
dest.write("// -- tails for N={N}") | |||||
with dest.block("if( N & {N} ) {{", "}}" ): | |||||
if settings['trace'].value: | |||||
dest.write("printf(\"N tail entry: %s(%d) M %d N %d K %d m_top %d n_top %d\\n\", __FILE__, __LINE__, M, N, K, m_top, n_top);") | |||||
dest.write("gvl = {VSETVL}({vlenmax});", vlenmax=min(vlenmax,max(int(M/a_regs),1))) | |||||
dest.write("m_top = 0;") | |||||
with dest.block("for ({index_type} i=0; i<M/{M}; i+=1) {{", "}}"): | |||||
generate_gemm_kernel_inner( settings, dest, M, N_tail, vlenmax, a_regs ) | |||||
dest.write("m_top += {M};") | |||||
generate_M_tails( dest, settings, M, N_tail ) | |||||
dest.write("n_top += {N};") | |||||
N_tail = int(N_tail/2) | |||||
dest.write("return 0;"); | |||||
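Stitched together, the emitted kernel has this overall shape; a hypothetical skeleton for the default M=16, N=4 real gemm settings (FLOAT and BLASLONG as in common.h, unroll bodies elided):

#include "common.h"

/* Skeleton of the generated control flow for M=16, N=4 (bodies elided). */
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha,
          FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc)
{
    BLASLONG m_top = 0, n_top = 0;
    for (BLASLONG j = 0; j < N / 4; j++) {   /* main pass over 4-column blocks */
        /* M/16 unrolled bodies, then M tails: 8- and 4-row vector, 2- and 1-row scalar */
        n_top += 4;
    }
    if (N & 2) { /* same M sweep with 2 columns */ n_top += 2; }
    if (N & 1) { /* same M sweep with 1 column  */ n_top += 1; }
    return 0;
}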
#----------------------------------------------------------------------- | |||||
def generate_M_tails( dest, settings, M, N ): | |||||
M_tail = int(M/2) | |||||
M_tail_min = settings['M_tail_scalar_from'].value | |||||
vlenmax = int( settings['reg_width_bits'].value / settings['ELEN_PARAM'].value ) | |||||
TRMM = (settings['op'].value == 'trmm') | |||||
is_complex = settings['complex'].value | |||||
generate_gemm_kernel_inner = generate_gemm_kernel_inner_complex if is_complex else generate_gemm_kernel_inner_real | |||||
while( M_tail > M_tail_min ): | |||||
with dest.block("if( M & {M_tail} ) {{", "}}", M_tail=M_tail ): | |||||
if settings['trace'].value: | |||||
dest.write("printf(\"tail: %s(%d) M %d N %d K %d m_top %d n_top %d\\n\", __FILE__, __LINE__, M, N, K, m_top, n_top);") | |||||
a_regs = max( 1, int(M_tail/vlenmax) ) | |||||
vlen = int(M_tail/a_regs) | |||||
dest.write("gvl = {VSETVL}({vlen});\n", vlen=vlen) | |||||
generate_gemm_kernel_inner( settings, dest, M_tail, N, vlen, a_regs ) | |||||
dest.write( "m_top += {M_tail};" ) | |||||
M_tail = int( M_tail / 2 ) | |||||
while( M_tail > 0 ): | |||||
with dest.block("if( M & {M_tail} ) {{", "}}", | |||||
M_tail=M_tail, | |||||
N=N, | |||||
result_t = ('double' if settings['force_acc_double'].value else settings['param_scalar_t'].value) | |||||
): | |||||
if settings['trace'].value: | |||||
dest.write("printf(\"tail: %s(%d) M %d N %d K %d m_top %d n_top %d\\n\", __FILE__, __LINE__, M, N, K, m_top, n_top);") | |||||
for r in range(M_tail * N * (2 if is_complex else 1)): | |||||
dest.write("{result_t} result{r} = 0;", | |||||
r=r | |||||
) | |||||
dest.write("{index_type} ai=m_top*K{elt_size};") | |||||
dest.write("{index_type} bi=n_top*K{elt_size};") | |||||
if TRMM: | |||||
with dest.map(M=M_tail, N=N): | |||||
generate_trmm_block( dest ) | |||||
with dest.block("for({index_type} k=0; k<{Kend}; k++) {{", "}}", Kend = ('pass_K' if TRMM else 'K') ): | |||||
for ki in range( N ): | |||||
for kj in range( M_tail ): | |||||
if is_complex: | |||||
dest.write("result{dest}+=S0*A[ai+{kj}+0]*B[bi+{ki}+0] + S1*A[ai+{kj}+1]*B[bi+{ki}+1];".format( | |||||
dest=(ki*M_tail+kj)*2, kj=kj*2, ki=ki*2 | |||||
)) | |||||
dest.write("result{dest}+=S2*A[ai+{kj}+1]*B[bi+{ki}+0] + S3*A[ai+{kj}+0]*B[bi+{ki}+1];".format( | |||||
dest=(ki*M_tail+kj)*2+1, kj=kj*2, ki=ki*2 | |||||
)) | |||||
else: | |||||
dest.write("result{dest}+=A[ai+{kj}]*B[bi+{ki}];".format( | |||||
dest=ki*M_tail+kj, kj=kj, ki=ki | |||||
)) | |||||
dest.write("ai+={M_tail}{elt_size};") | |||||
dest.write("bi+={N}{elt_size};") | |||||
dest.write("{index_type} ci=n_top*ldc+m_top;") | |||||
if is_complex: | |||||
dest.write("{result_t} Cr, Ci;") | |||||
for ki in range( N ): | |||||
for kj in range( M_tail ): | |||||
if is_complex: | |||||
if TRMM: | |||||
dest.write('Cr = result{dest}*alphar;', dest=(ki*M_tail+kj)*2+0) | |||||
dest.write('Ci = result{dest}*alphar;', dest=(ki*M_tail+kj)*2+1) | |||||
else: | |||||
dest.write('Cr = C[(ci+{ki}*ldc+{kj})*2+0];', ki=ki, kj=kj) | |||||
dest.write('Ci = C[(ci+{ki}*ldc+{kj})*2+1];', ki=ki, kj=kj) | |||||
dest.write('Cr += result{dest}*alphar;', dest=(ki*M_tail+kj)*2+0) | |||||
dest.write('Ci += result{dest}*alphar;', dest=(ki*M_tail+kj)*2+1) | |||||
dest.write('Cr -= result{dest}*alphai;', dest=(ki*M_tail+kj)*2+1) | |||||
dest.write('Ci += result{dest}*alphai;', dest=(ki*M_tail+kj)*2+0) | |||||
dest.write("C[(ci+{ki}*ldc+{kj})*2+0] = Cr;", ki=ki, kj=kj ) | |||||
dest.write("C[(ci+{ki}*ldc+{kj})*2+1] = Ci;", ki=ki, kj=kj ) | |||||
else: | |||||
op = '' if TRMM else '+' | |||||
dest.write("C[ci+{ki}*ldc+{kj}] {op}= alpha * result{dest};", | |||||
ki=ki, kj=kj, op=op, dest=ki*M_tail+kj | |||||
) | |||||
dest.write("m_top+={M_tail};") | |||||
M_tail = int(M_tail/2) | |||||
#----------------------------------------------------------------------- | |||||
class Setting(object): | |||||
def __init__( self, value, convert = None ): | |||||
self._value = value | |||||
self._convert = convert | |||||
@classmethod | |||||
def ENUM( cls, *values ): | |||||
def closure( values ): | |||||
return lambda value: values[value.lower()] | |||||
return closure( { v.lower():v for v in values } ) | |||||
@classmethod | |||||
def BOOL( cls, value ): | |||||
return value.lower().startswith('t') or value == '1' | |||||
@property | |||||
def value( self ): | |||||
return self._value | |||||
@property | |||||
def configurable( self ): | |||||
return self._convert is not None | |||||
@value.setter | |||||
def value( self, value ): | |||||
self._value = self._convert( value ) | |||||
def __str__( self ): | |||||
return str(self._value) | |||||
#----------------------------------------------------------------------- | |||||
def main(): | |||||
settings = { | |||||
'op': Setting( 'gemm', Setting.ENUM( 'gemm', 'trmm' ) ), | |||||
'M': Setting( 16, int ), | |||||
'N': Setting( 4, int ), | |||||
'reg_width_bits': Setting( 256, int ), | |||||
'LMUL': Setting( 1, int ), | |||||
'M_tail_scalar_from':Setting( 2, int ), | |||||
'cpu': Setting( 'zvl256b', str ), | |||||
'param_precision': Setting( 'float', Setting.ENUM( 'float', 'double' ) ), | |||||
'force_acc_double': Setting( False, Setting.BOOL ), | |||||
'complex': Setting( False, Setting.BOOL ), | |||||
'conjugate': Setting( False, Setting.BOOL ), | |||||
'index_type': Setting( 'BLASLONG', str ), | |||||
'trace': Setting( False, Setting.BOOL ), | |||||
'output': Setting( None, str ), | |||||
'tail_policy': Setting( '', str ), # _ta, if toolchain supports it | |||||
'__riscv_': Setting( '__riscv_', str), | |||||
} | |||||
for item in sys.argv[1:]: | |||||
try: | |||||
name, value = tuple(item.split( '=', 1 )) | |||||
except ValueError:
ERROR("couldn't parse {}, expected arguments of the form name=value".format(item)) | |||||
if name not in settings: | |||||
ERROR("couldn't parse {}, {} it is not a known option\n".format( item, name ) | |||||
+"options (and current defaults) are\n{}".format( | |||||
" ".join([ '{}={}'.format(k, settings[k].value) for k in settings.keys()])) | |||||
) | |||||
try: | |||||
settings[name].value = value | |||||
except: | |||||
import traceback | |||||
traceback.print_exc() | |||||
ERROR("couldn't parse {}".format(item)) | |||||
if settings['output'].value is None: | |||||
if settings['complex'].value: | |||||
prefix = 'z' if settings['param_precision'].value == 'double' else 'c' | |||||
else: | |||||
prefix = 'd' if settings['param_precision'].value == 'double' else 's' | |||||
settings['output'] = Setting('{}{}_kernel_{}x{}_{}.c'.format( | |||||
prefix, | |||||
settings['op'], | |||||
settings['M'], | |||||
settings['N'], | |||||
settings['cpu'] | |||||
)) | |||||
if settings['param_precision'].value == 'double': | |||||
settings['param_scalar_t'] = Setting( 'double' ) | |||||
settings['ELEN_PARAM'] = Setting(64) | |||||
else: | |||||
settings['param_scalar_t'] = Setting( 'float' ) | |||||
settings['ELEN_PARAM'] = Setting(32) | |||||
settings['VFMUL'] = Setting( '{}vfmul_vf_f{}m{}{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['LMUL'], settings['tail_policy']) ) | |||||
settings['VFMACC'] = Setting( '{}vfmacc_vf_f{}m{}{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['LMUL'], settings['tail_policy']) ) | |||||
settings['ELEN_ACC'] = settings['ELEN_PARAM'] | |||||
settings['LMUL_ACC'] = Setting(settings['LMUL'].value) | |||||
widen = '' | |||||
if settings['force_acc_double'].value and (settings['param_precision'].value == 'float'): | |||||
settings['ELEN_ACC'] = Setting(64) | |||||
settings['LMUL_ACC'] = Setting(settings['LMUL'].value*2) | |||||
settings['VFNCVT'] = Setting('{}vfncvt_f_f_w_f{}m{}{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['LMUL'], settings['tail_policy'])) | |||||
widen = 'w' | |||||
settings['VMUL_TO_ACC'] = Setting( '{}vf{}mul_vf_f{}m{}{}'.format(settings['__riscv_'], widen, settings['ELEN_ACC'], settings['LMUL_ACC'], settings['tail_policy']) ) | |||||
settings['VMACC_TO_ACC'] = Setting( '{}vf{}macc_vf_f{}m{}{}'.format(settings['__riscv_'], widen, settings['ELEN_ACC'], settings['LMUL_ACC'], settings['tail_policy']) ) | |||||
settings['param_vector_t']=Setting('vfloat{}m{}_t'.format(settings['ELEN_PARAM'], settings['LMUL'])) | |||||
settings['acc_vector_t'] =Setting('vfloat{}m{}_t'.format(settings['ELEN_ACC'], settings['LMUL_ACC'])) | |||||
settings['VLEV'] =Setting('{}vle{}_v_f{}m{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['ELEN_PARAM'], settings['LMUL'])) | |||||
settings['VSEV'] =Setting('{}vse{}_v_f{}m{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['ELEN_PARAM'], settings['LMUL'])) | |||||
settings['VLSEV'] =Setting('{}vlse{}_v_f{}m{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['ELEN_PARAM'], settings['LMUL'])) | |||||
settings['VSSEV'] =Setting('{}vsse{}_v_f{}m{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['ELEN_PARAM'], settings['LMUL'])) | |||||
settings['VSETVL'] =Setting('{}vsetvl_e{}m{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['LMUL'])) | |||||
to_stdout = (settings['output'].value == '-') | |||||
if not to_stdout: | |||||
print("Writing {}".format(settings['output'].value), file=sys.stderr) | |||||
with open(sys.stdout.fileno() if to_stdout else settings['output'].value, 'w') as destination_file: | |||||
def OUTPUT(*args, **kwargs): | |||||
print(*args, file=destination_file, **kwargs) | |||||
OUTPUT("/*\n\nAUTOGENERATED KERNEL\nSettings:\n {}".format(" ".join([ "{}={}\n".format(k, repr(settings[k].value)) for k in sorted(settings.keys()) if settings[k].configurable]))) | |||||
OUTPUT("Derived:\n {}\n*/\n".format(" ".join([ "{}={}\n".format(k, repr(settings[k].value)) for k in sorted(settings.keys()) if not settings[k].configurable]))) | |||||
OUTPUT('#include "common.h"') | |||||
OUTPUT("\n") | |||||
if settings['op'].value in ('gemm', 'trmm'): | |||||
generate_gemm_kernel(settings, OUTPUT) | |||||
else: | |||||
ERROR("unsupported kernel type {}".format(settings['op'])) | |||||
if __name__ == "__main__": | |||||
main() |
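A generated file is self-describing: the two OUTPUT calls above record every configurable and derived setting in a leading comment. With the defaults, the emitted preamble looks roughly like this (values abbreviated; an illustrative sketch, not captured generator output):

    /*

    AUTOGENERATED KERNEL
    Settings:
       M=16
     N=4
     op='gemm'
     param_precision='float'
     ...
    Derived:
       ELEN_PARAM=32
     VSETVL='__riscv_vsetvl_e32m1'
     ...
    */

    #include "common.h"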
@@ -27,118 +27,111 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#include "common.h" | #include "common.h" | ||||
#include <math.h> | #include <math.h> | ||||
#include <float.h> | |||||
#if defined(DOUBLE) | #if defined(DOUBLE) | ||||
#define ABS fabs | |||||
#define VSETVL(n) vsetvl_e64m8(n) | |||||
#define VSETVL_MAX vsetvlmax_e64m1() | |||||
#define FLOAT_V_T vfloat64m8_t | |||||
#define VSETVL(n) __riscv_vsetvl_e64m4(n) | |||||
#define FLOAT_V_T vfloat64m4_t | |||||
#define FLOAT_V_T_M1 vfloat64m1_t | #define FLOAT_V_T_M1 vfloat64m1_t | ||||
#define VLEV_FLOAT vle64_v_f64m8 | |||||
#define VLSEV_FLOAT vlse64_v_f64m8 | |||||
#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 | |||||
#define MASK_T vbool8_t | |||||
#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 | |||||
#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 | |||||
#define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||||
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m | |||||
#define VFMAXVV_FLOAT vfmax_vv_f64m8 | |||||
#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8 | |||||
#define VMFIRSTM vmfirst_m_b8 | |||||
#define UINT_V_T vuint64m8_t | |||||
#define VIDV_MASK_UINT vid_v_u64m8_m | |||||
#define VIDV_UINT vid_v_u64m8 | |||||
#define VADDVX_MASK_UINT vadd_vx_u64m8_m | |||||
#define VADDVX_UINT vadd_vx_u64m8 | |||||
#define VMVVX_UINT vmv_v_x_u64m8 | |||||
#define VLEV_FLOAT __riscv_vle64_v_f64m4 | |||||
#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 | |||||
#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m4_f64m1 | |||||
#define MASK_T vbool16_t | |||||
#define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m4_b16 | |||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 | |||||
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 | |||||
#define VFMAXVV_FLOAT __riscv_vfmax_vv_f64m4 | |||||
#define VMFGEVF_FLOAT __riscv_vmfge_vf_f64m4_b16 | |||||
#define VMFIRSTM __riscv_vfirst_m_b16 | |||||
#define UINT_V_T vuint64m4_t | |||||
#define VIDV_UINT __riscv_vid_v_u64m4 | |||||
#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m4_mu | |||||
#define VADDVX_UINT __riscv_vadd_vx_u64m4 | |||||
#define VMVVX_UINT __riscv_vmv_v_x_u64m4 | |||||
#define VFABS_FLOAT __riscv_vfabs_v_f64m4 | |||||
#define VCOMPRESS __riscv_vcompress_vm_u64m4 | |||||
#define VMV_X __riscv_vmv_x_s_u64m4_u64 | |||||
#else | #else | ||||
#define ABS fabsf | |||||
#define VSETVL(n) vsetvl_e32m8(n) | |||||
#define VSETVL_MAX vsetvlmax_e32m1() | |||||
#define FLOAT_V_T vfloat32m8_t | |||||
#define VSETVL(n) __riscv_vsetvl_e32m4(n) | |||||
#define FLOAT_V_T vfloat32m4_t | |||||
#define FLOAT_V_T_M1 vfloat32m1_t | #define FLOAT_V_T_M1 vfloat32m1_t | ||||
#define VLEV_FLOAT vle32_v_f32m8 | |||||
#define VLSEV_FLOAT vlse32_v_f32m8 | |||||
#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 | |||||
#define MASK_T vbool4_t | |||||
#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 | |||||
#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 | |||||
#define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||||
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m | |||||
#define VFMAXVV_FLOAT vfmax_vv_f32m8 | |||||
#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4 | |||||
#define VMFIRSTM vmfirst_m_b4 | |||||
#define UINT_V_T vuint32m8_t | |||||
#define VIDV_MASK_UINT vid_v_u32m8_m | |||||
#define VIDV_UINT vid_v_u32m8 | |||||
#define VADDVX_MASK_UINT vadd_vx_u32m8_m | |||||
#define VADDVX_UINT vadd_vx_u32m8 | |||||
#define VMVVX_UINT vmv_v_x_u32m8 | |||||
#define VLEV_FLOAT __riscv_vle32_v_f32m4 | |||||
#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 | |||||
#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m4_f32m1 | |||||
#define MASK_T vbool8_t | |||||
#define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m4_b8 | |||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 | |||||
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 | |||||
#define VFMAXVV_FLOAT __riscv_vfmax_vv_f32m4 | |||||
#define VMFGEVF_FLOAT __riscv_vmfge_vf_f32m4_b8 | |||||
#define VMFIRSTM __riscv_vfirst_m_b8 | |||||
#define UINT_V_T vuint32m4_t | |||||
#define VIDV_UINT __riscv_vid_v_u32m4 | |||||
#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m4_mu | |||||
#define VADDVX_UINT __riscv_vadd_vx_u32m4 | |||||
#define VMVVX_UINT __riscv_vmv_v_x_u32m4 | |||||
#define VFABS_FLOAT __riscv_vfabs_v_f32m4 | |||||
#define VCOMPRESS __riscv_vcompress_vm_u32m4 | |||||
#define VMV_X __riscv_vmv_x_s_u32m4_u32 | |||||
#endif | #endif | ||||
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | ||||
{ | { | ||||
BLASLONG i=0, j=0; | |||||
FLOAT maxf=0.0; | |||||
BLASLONG i=0, j=0; | |||||
unsigned int max_index = 0; | unsigned int max_index = 0; | ||||
if (n <= 0 || inc_x <= 0) return(max_index); | |||||
if (n <= 0 || inc_x <= 0) return(max_index); | |||||
FLOAT maxf=-FLT_MAX; | |||||
FLOAT_V_T vx, v_max; | FLOAT_V_T vx, v_max; | ||||
UINT_V_T v_max_index; | UINT_V_T v_max_index; | ||||
MASK_T mask; | MASK_T mask; | ||||
unsigned int gvl = 0; | unsigned int gvl = 0; | ||||
FLOAT_V_T_M1 v_res, v_z0; | |||||
gvl = VSETVL_MAX; | |||||
v_res = VFMVVF_FLOAT_M1(0, gvl); | |||||
v_z0 = VFMVVF_FLOAT_M1(0, gvl); | |||||
FLOAT_V_T_M1 v_res; | |||||
v_res = VFMVVF_FLOAT_M1(-FLT_MAX, 1); | |||||
gvl = VSETVL(n); | |||||
UINT_V_T vid = VIDV_UINT(gvl); | |||||
if(inc_x == 1){ | if(inc_x == 1){ | ||||
gvl = VSETVL(n); | |||||
v_max_index = VMVVX_UINT(0, gvl); | v_max_index = VMVVX_UINT(0, gvl); | ||||
v_max = VFMVVF_FLOAT(-1, gvl); | |||||
v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); | |||||
for(i=0,j=0; i < n/gvl; i++){ | for(i=0,j=0; i < n/gvl; i++){ | ||||
vx = VLEV_FLOAT(&x[j], gvl); | vx = VLEV_FLOAT(&x[j], gvl); | ||||
//fabs(vector) | |||||
mask = VMFLTVF_FLOAT(vx, 0, gvl); | |||||
vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); | |||||
vx = VFABS_FLOAT(vx, gvl); | |||||
//index where element greater than v_max | //index where element greater than v_max | ||||
mask = VMFLTVV_FLOAT(v_max, vx, gvl); | mask = VMFLTVV_FLOAT(v_max, vx, gvl); | ||||
v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); | |||||
v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j,gvl); | |||||
v_max_index = VADDVX_MASK_UINT(mask, v_max_index, vid, j, gvl); | |||||
//update v_max and start_index j | //update v_max and start_index j | ||||
v_max = VFMAXVV_FLOAT(v_max, vx, gvl); | v_max = VFMAXVV_FLOAT(v_max, vx, gvl); | ||||
j += gvl; | j += gvl; | ||||
} | } | ||||
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); | |||||
maxf = *((FLOAT*)&v_res); | |||||
v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); | |||||
maxf = EXTRACT_FLOAT(v_res); | |||||
mask = VMFGEVF_FLOAT(v_max, maxf, gvl); | mask = VMFGEVF_FLOAT(v_max, maxf, gvl); | ||||
max_index = VMFIRSTM(mask,gvl); | |||||
max_index = *((unsigned int*)&v_max_index+max_index); | |||||
UINT_V_T compressed; | |||||
compressed = VCOMPRESS(v_max_index, mask, gvl); | |||||
max_index = VMV_X(compressed); | |||||
if(j < n){ | if(j < n){ | ||||
gvl = VSETVL(n-j); | gvl = VSETVL(n-j); | ||||
vx = VLEV_FLOAT(&x[j], gvl); | |||||
//fabs(vector) | |||||
mask = VMFLTVF_FLOAT(vx, 0, gvl); | |||||
v_max = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); | |||||
v_max = VLEV_FLOAT(&x[j], gvl); | |||||
v_max = VFABS_FLOAT(v_max, gvl); | |||||
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); | |||||
FLOAT cur_maxf = *((FLOAT*)&v_res); | |||||
v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); | |||||
FLOAT cur_maxf = EXTRACT_FLOAT(v_res); | |||||
if(cur_maxf > maxf){ | if(cur_maxf > maxf){ | ||||
//tail index | //tail index | ||||
v_max_index = VIDV_UINT(gvl); | |||||
v_max_index = VADDVX_UINT(v_max_index, j, gvl); | |||||
v_max_index = VADDVX_UINT(vid, j, gvl); | |||||
mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); | mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); | ||||
max_index = VMFIRSTM(mask,gvl); | |||||
max_index = *((unsigned int*)&v_max_index+max_index); | |||||
UINT_V_T compressed; | |||||
compressed = VCOMPRESS(v_max_index, mask, gvl); | |||||
max_index = VMV_X(compressed); | |||||
} | } | ||||
} | } | ||||
}else{ | }else{ | ||||
@@ -146,51 +139,48 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
unsigned int stride_x = inc_x * sizeof(FLOAT); | unsigned int stride_x = inc_x * sizeof(FLOAT); | ||||
unsigned int idx = 0, inc_v = gvl * inc_x; | unsigned int idx = 0, inc_v = gvl * inc_x; | ||||
v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); | |||||
v_max_index = VMVVX_UINT(0, gvl); | v_max_index = VMVVX_UINT(0, gvl); | ||||
v_max = VFMVVF_FLOAT(-1, gvl); | |||||
for(i=0,j=0; i < n/gvl; i++){ | for(i=0,j=0; i < n/gvl; i++){ | ||||
vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); | vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); | ||||
//fabs(vector) | |||||
mask = VMFLTVF_FLOAT(vx, 0, gvl); | |||||
vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); | |||||
vx = VFABS_FLOAT(vx, gvl); | |||||
//index where element greater than v_max | //index where element greater than v_max | ||||
mask = VMFLTVV_FLOAT(v_max, vx, gvl); | mask = VMFLTVV_FLOAT(v_max, vx, gvl); | ||||
v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); | |||||
v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, gvl); | |||||
v_max_index = VADDVX_MASK_UINT(mask, v_max_index, vid, j, gvl); | |||||
//update v_max and start_index j | //update v_max and start_index j | ||||
v_max = VFMAXVV_FLOAT(v_max, vx, gvl); | v_max = VFMAXVV_FLOAT(v_max, vx, gvl); | ||||
j += gvl; | j += gvl; | ||||
idx += inc_v; | idx += inc_v; | ||||
} | } | ||||
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); | |||||
maxf = *((FLOAT*)&v_res); | |||||
v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); | |||||
maxf = EXTRACT_FLOAT(v_res); | |||||
mask = VMFGEVF_FLOAT(v_max, maxf, gvl); | mask = VMFGEVF_FLOAT(v_max, maxf, gvl); | ||||
max_index = VMFIRSTM(mask,gvl); | |||||
max_index = *((unsigned int*)&v_max_index+max_index); | |||||
UINT_V_T compressed; | |||||
compressed = VCOMPRESS(v_max_index, mask, gvl); | |||||
max_index = VMV_X(compressed); | |||||
if(j < n){ | if(j < n){ | ||||
gvl = VSETVL(n-j); | gvl = VSETVL(n-j); | ||||
vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); | |||||
//fabs(vector) | |||||
mask = VMFLTVF_FLOAT(vx, 0, gvl); | |||||
v_max = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); | |||||
v_max = VLSEV_FLOAT(&x[idx], stride_x, gvl); | |||||
v_max = VFABS_FLOAT(v_max, gvl); | |||||
v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); | |||||
FLOAT cur_maxf = EXTRACT_FLOAT(v_res); | |||||
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); | |||||
FLOAT cur_maxf = *((FLOAT*)&v_res); | |||||
if(cur_maxf > maxf){ | if(cur_maxf > maxf){ | ||||
//tail index | //tail index | ||||
v_max_index = VIDV_UINT(gvl); | |||||
v_max_index = VADDVX_UINT(v_max_index, j, gvl); | |||||
v_max_index = VADDVX_UINT(vid, j, gvl); | |||||
mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); | mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); | ||||
max_index = VMFIRSTM(mask,gvl); | |||||
max_index = *((unsigned int*)&v_max_index+max_index); | |||||
UINT_V_T compressed; | |||||
compressed = VCOMPRESS(v_max_index, mask, gvl); | |||||
max_index = VMV_X(compressed); | |||||
} | } | ||||
} | } | ||||
} | } | ||||
return(max_index+1); | |||||
return(max_index+1); | |||||
} | } | ||||
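The rewritten kernel above pairs an accumulating reduction (v_res carries the running maximum into each VFREDMAXVS_FLOAT call) with mask-driven index tracking. As a cross-check on that control flow, the contract it implements fits in a few scalar lines; this reference is a sketch (iamax_ref is a hypothetical name) assuming the FLOAT/BLASLONG definitions from common.h:

    #include "common.h"
    #include <math.h>
    #include <float.h>

    /* 1-based index of the first element with maximal |x[i]|; 0 on bad args. */
    static BLASLONG iamax_ref(BLASLONG n, const FLOAT *x, BLASLONG inc_x)
    {
        BLASLONG best = 0;
        FLOAT maxf = -FLT_MAX;
        if (n <= 0 || inc_x <= 0) return 0;
        for (BLASLONG i = 0; i < n; i++) {
            FLOAT v = fabs(x[i * inc_x]);          /* |x|, as VFABS_FLOAT above */
            if (v > maxf) { maxf = v; best = i; }  /* strict > keeps the first tie */
        }
        return best + 1;
    }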
@@ -31,85 +31,79 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#if defined(DOUBLE) | #if defined(DOUBLE) | ||||
#define ABS fabs | |||||
#define VSETVL(n) vsetvl_e64m8(n) | |||||
#define VSETVL_MAX vsetvlmax_e64m1() | |||||
#define VSETVL(n) __riscv_vsetvl_e64m8(n) | |||||
#define FLOAT_V_T vfloat64m8_t | #define FLOAT_V_T vfloat64m8_t | ||||
#define FLOAT_V_T_M1 vfloat64m1_t | #define FLOAT_V_T_M1 vfloat64m1_t | ||||
#define VLEV_FLOAT vle64_v_f64m8 | |||||
#define VLSEV_FLOAT vlse64_v_f64m8 | |||||
#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 | |||||
#define VLEV_FLOAT __riscv_vle64_v_f64m8 | |||||
#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 | |||||
#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1 | |||||
#define MASK_T vbool8_t | #define MASK_T vbool8_t | ||||
#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 | |||||
#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 | |||||
#define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||||
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m | |||||
#define VFMINVV_FLOAT vfmin_vv_f64m8 | |||||
#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8 | |||||
#define VMFIRSTM vmfirst_m_b8 | |||||
#define VMFGTVV_FLOAT __riscv_vmfgt_vv_f64m8_b8 | |||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 | |||||
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 | |||||
#define VFMINVV_FLOAT __riscv_vfmin_vv_f64m8 | |||||
#define VMFLEVF_FLOAT __riscv_vmfle_vf_f64m8_b8 | |||||
#define VMFIRSTM __riscv_vfirst_m_b8 | |||||
#define UINT_V_T vuint64m8_t | #define UINT_V_T vuint64m8_t | ||||
#define VIDV_MASK_UINT vid_v_u64m8_m | |||||
#define VIDV_UINT vid_v_u64m8 | |||||
#define VADDVX_MASK_UINT vadd_vx_u64m8_m | |||||
#define VADDVX_UINT vadd_vx_u64m8 | |||||
#define VMVVX_UINT vmv_v_x_u64m8 | |||||
#define VIDV_MASK_UINT __riscv_vid_v_u64m8_mu | |||||
#define VIDV_UINT __riscv_vid_v_u64m8 | |||||
#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_mu | |||||
#define VADDVX_UINT __riscv_vadd_vx_u64m8 | |||||
#define VMVVX_UINT __riscv_vmv_v_x_u64m8 | |||||
#define VFABS_FLOAT __riscv_vfabs_v_f64m8 | |||||
#define VCOMPRESS __riscv_vcompress_vm_u64m8 | |||||
#define VMV_X __riscv_vmv_x_s_u64m8_u64 | |||||
#else | #else | ||||
#define ABS fabsf | |||||
#define VSETVL(n) vsetvl_e32m8(n) | |||||
#define VSETVL_MAX vsetvlmax_e32m1() | |||||
#define VSETVL(n) __riscv_vsetvl_e32m8(n) | |||||
#define FLOAT_V_T vfloat32m8_t | #define FLOAT_V_T vfloat32m8_t | ||||
#define FLOAT_V_T_M1 vfloat32m1_t | #define FLOAT_V_T_M1 vfloat32m1_t | ||||
#define VLEV_FLOAT vle32_v_f32m8 | |||||
#define VLSEV_FLOAT vlse32_v_f32m8 | |||||
#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 | |||||
#define VLEV_FLOAT __riscv_vle32_v_f32m8 | |||||
#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 | |||||
#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1 | |||||
#define MASK_T vbool4_t | #define MASK_T vbool4_t | ||||
#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 | |||||
#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 | |||||
#define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||||
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m | |||||
#define VFMINVV_FLOAT vfmin_vv_f32m8 | |||||
#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4 | |||||
#define VMFIRSTM vmfirst_m_b4 | |||||
#define VMFGTVV_FLOAT __riscv_vmfgt_vv_f32m8_b4 | |||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 | |||||
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 | |||||
#define VFMINVV_FLOAT __riscv_vfmin_vv_f32m8 | |||||
#define VMFLEVF_FLOAT __riscv_vmfle_vf_f32m8_b4 | |||||
#define VMFIRSTM __riscv_vfirst_m_b4 | |||||
#define UINT_V_T vuint32m8_t | #define UINT_V_T vuint32m8_t | ||||
#define VIDV_MASK_UINT vid_v_u32m8_m | |||||
#define VIDV_UINT vid_v_u32m8 | |||||
#define VADDVX_MASK_UINT vadd_vx_u32m8_m | |||||
#define VADDVX_UINT vadd_vx_u32m8 | |||||
#define VMVVX_UINT vmv_v_x_u32m8 | |||||
#define VIDV_MASK_UINT __riscv_vid_v_u32m8_mu | |||||
#define VIDV_UINT __riscv_vid_v_u32m8 | |||||
#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_mu | |||||
#define VADDVX_UINT __riscv_vadd_vx_u32m8 | |||||
#define VMVVX_UINT __riscv_vmv_v_x_u32m8 | |||||
#define VFABS_FLOAT __riscv_vfabs_v_f32m8 | |||||
#define VCOMPRESS __riscv_vcompress_vm_u32m8 | |||||
#define VMV_X __riscv_vmv_x_s_u32m8_u32 | |||||
#endif | #endif | ||||
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | ||||
{ | { | ||||
BLASLONG i=0, j=0; | |||||
FLOAT minf=FLT_MAX; | |||||
BLASLONG i=0, j=0; | |||||
unsigned int min_index = 0; | unsigned int min_index = 0; | ||||
if (n <= 0 || inc_x <= 0) return(min_index); | |||||
if (n <= 0 || inc_x <= 0) return(min_index); | |||||
FLOAT minf=FLT_MAX; | |||||
FLOAT_V_T vx, v_min; | FLOAT_V_T vx, v_min; | ||||
UINT_V_T v_min_index; | UINT_V_T v_min_index; | ||||
MASK_T mask; | MASK_T mask; | ||||
unsigned int gvl = 0; | unsigned int gvl = 0; | ||||
FLOAT_V_T_M1 v_res, v_max; | |||||
gvl = VSETVL_MAX; | |||||
v_res = VFMVVF_FLOAT_M1(0, gvl); | |||||
v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); | |||||
FLOAT_V_T_M1 v_res; | |||||
v_res = VFMVVF_FLOAT_M1(FLT_MAX, 1); | |||||
if(inc_x == 1){ | if(inc_x == 1){ | ||||
gvl = VSETVL(n); | gvl = VSETVL(n); | ||||
v_min = VFMVVF_FLOAT(FLT_MAX, gvl); | |||||
v_min_index = VMVVX_UINT(0, gvl); | v_min_index = VMVVX_UINT(0, gvl); | ||||
v_min = VFMVVF_FLOAT(FLT_MAX, gvl); | |||||
for(i=0,j=0; i < n/gvl; i++){ | for(i=0,j=0; i < n/gvl; i++){ | ||||
vx = VLEV_FLOAT(&x[j], gvl); | vx = VLEV_FLOAT(&x[j], gvl); | ||||
//fabs(vector) | |||||
mask = VMFLTVF_FLOAT(vx, 0, gvl); | |||||
vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); | |||||
vx = VFABS_FLOAT(vx, gvl); | |||||
//index where element less than v_min | |||||
mask = VMFLTVV_FLOAT(vx, v_min, gvl); | |||||
//index where element less than v_min | |||||
mask = VMFGTVV_FLOAT(v_min, vx, gvl); | |||||
v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); | v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); | ||||
v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, gvl); | v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, gvl); | ||||
@@ -117,29 +111,29 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
v_min = VFMINVV_FLOAT(v_min, vx, gvl); | v_min = VFMINVV_FLOAT(v_min, vx, gvl); | ||||
j += gvl; | j += gvl; | ||||
} | } | ||||
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||||
minf = *((FLOAT*)&v_res); | |||||
v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); | |||||
minf = EXTRACT_FLOAT(v_res); | |||||
mask = VMFLEVF_FLOAT(v_min, minf, gvl); | mask = VMFLEVF_FLOAT(v_min, minf, gvl); | ||||
min_index = VMFIRSTM(mask,gvl); | |||||
min_index = *((unsigned int*)&v_min_index+min_index); | |||||
UINT_V_T compressed; | |||||
compressed = VCOMPRESS(v_min_index, mask, gvl); | |||||
min_index = VMV_X(compressed); | |||||
if(j < n){ | if(j < n){ | ||||
gvl = VSETVL(n-j); | gvl = VSETVL(n-j); | ||||
vx = VLEV_FLOAT(&x[j], gvl); | |||||
//fabs(vector) | |||||
mask = VMFLTVF_FLOAT(vx, 0, gvl); | |||||
v_min = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); | |||||
v_min = VLEV_FLOAT(&x[j], gvl); | |||||
v_min = VFABS_FLOAT(v_min, gvl); | |||||
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||||
FLOAT cur_minf = *((FLOAT*)&v_res); | |||||
if(cur_minf < minf){ | |||||
v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); | |||||
FLOAT cur_minf = EXTRACT_FLOAT(v_res); | |||||
if(cur_minf < minf){ | |||||
//tail index | //tail index | ||||
v_min_index = VIDV_UINT(gvl); | v_min_index = VIDV_UINT(gvl); | ||||
v_min_index = VADDVX_UINT(v_min_index, j, gvl); | v_min_index = VADDVX_UINT(v_min_index, j, gvl); | ||||
mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); | mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); | ||||
min_index = VMFIRSTM(mask,gvl); | |||||
min_index = *((unsigned int*)&v_min_index+min_index); | |||||
UINT_V_T compressed; | |||||
compressed = VCOMPRESS(v_min_index, mask, gvl); | |||||
min_index = VMV_X(compressed); | |||||
} | } | ||||
} | } | ||||
}else{ | }else{ | ||||
@@ -151,12 +145,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
v_min_index = VMVVX_UINT(0, gvl); | v_min_index = VMVVX_UINT(0, gvl); | ||||
for(i=0,j=0; i < n/gvl; i++){ | for(i=0,j=0; i < n/gvl; i++){ | ||||
vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); | vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); | ||||
//fabs(vector) | |||||
mask = VMFLTVF_FLOAT(vx, 0, gvl); | |||||
vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); | |||||
vx = VFABS_FLOAT(vx, gvl); | |||||
//index where element less than v_min | |||||
mask = VMFLTVV_FLOAT(vx, v_min, gvl); | |||||
//index where element less than v_min | |||||
mask = VMFGTVV_FLOAT(v_min, vx, gvl); | |||||
v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); | v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); | ||||
v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, gvl); | v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, gvl); | ||||
@@ -165,33 +157,31 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
j += gvl; | j += gvl; | ||||
idx += inc_v; | idx += inc_v; | ||||
} | } | ||||
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||||
minf = *((FLOAT*)&v_res); | |||||
v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); | |||||
minf = EXTRACT_FLOAT(v_res); | |||||
mask = VMFLEVF_FLOAT(v_min, minf, gvl); | mask = VMFLEVF_FLOAT(v_min, minf, gvl); | ||||
min_index = VMFIRSTM(mask,gvl); | |||||
min_index = *((unsigned int*)&v_min_index+min_index); | |||||
UINT_V_T compressed; | |||||
compressed = VCOMPRESS(v_min_index, mask, gvl); | |||||
min_index = VMV_X(compressed); | |||||
if(j < n){ | if(j < n){ | ||||
gvl = VSETVL(n-j); | gvl = VSETVL(n-j); | ||||
vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); | |||||
//fabs(vector) | |||||
mask = VMFLTVF_FLOAT(vx, 0, gvl); | |||||
v_min = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); | |||||
v_min = VLSEV_FLOAT(&x[idx], stride_x, gvl); | |||||
v_min = VFABS_FLOAT(v_min, gvl); | |||||
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||||
FLOAT cur_minf = *((FLOAT*)&v_res); | |||||
if(cur_minf < minf){ | |||||
v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); | |||||
FLOAT cur_minf = EXTRACT_FLOAT(v_res); | |||||
if(cur_minf < minf){ | |||||
//tail index | //tail index | ||||
v_min_index = VIDV_UINT(gvl); | v_min_index = VIDV_UINT(gvl); | ||||
v_min_index = VADDVX_UINT(v_min_index, j, gvl); | v_min_index = VADDVX_UINT(v_min_index, j, gvl); | ||||
mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); | mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); | ||||
min_index = VMFIRSTM(mask,gvl); | |||||
min_index = *((unsigned int*)&v_min_index+min_index); | |||||
UINT_V_T compressed; | |||||
compressed = VCOMPRESS(v_min_index, mask, gvl); | |||||
min_index = VMV_X(compressed); | |||||
} | } | ||||
} | } | ||||
} | } | ||||
return(min_index+1); | |||||
return(min_index+1); | |||||
} | } | ||||
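Both kernels above recover the winning position with the same idiom: compare the per-lane extreme against the reduced scalar, vcompress the candidate-index vector under that mask, and read lane 0, replacing the old vfirst-plus-store-to-memory round-trip. Distilled to a standalone helper for the f32/m8 case (a sketch; first_match_index is an illustrative name, the intrinsics are the v1.0 spellings used above):

    #include <riscv_vector.h>

    /* Pack the indices whose lanes match the winning value to the front;
       lane 0 of the compressed vector is then the first matching index. */
    static unsigned int first_match_index(vuint32m8_t v_idx, vbool4_t winners,
                                          size_t vl)
    {
        vuint32m8_t packed = __riscv_vcompress_vm_u32m8(v_idx, winners, vl);
        return (unsigned int)__riscv_vmv_x_s_u32m8_u32(packed);
    }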
@@ -31,68 +31,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#if defined(DOUBLE) | #if defined(DOUBLE) | ||||
#define ABS fabs | |||||
#define VSETVL(n) vsetvl_e64m8(n) | |||||
#define VSETVL_MAX vsetvlmax_e64m1() | |||||
#define VSETVL(n) __riscv_vsetvl_e64m8(n) | |||||
#define FLOAT_V_T vfloat64m8_t | #define FLOAT_V_T vfloat64m8_t | ||||
#define FLOAT_V_T_M1 vfloat64m1_t | #define FLOAT_V_T_M1 vfloat64m1_t | ||||
#define VLEV_FLOAT vle64_v_f64m8 | |||||
#define VLSEV_FLOAT vlse64_v_f64m8 | |||||
#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 | |||||
#define VLEV_FLOAT __riscv_vle64_v_f64m8 | |||||
#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 | |||||
#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m8_f64m1 | |||||
#define MASK_T vbool8_t | #define MASK_T vbool8_t | ||||
#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 | |||||
#define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||||
#define VFMAXVV_FLOAT vfmax_vv_f64m8 | |||||
#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8 | |||||
#define VMFIRSTM vmfirst_m_b8 | |||||
#define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m8_b8 | |||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 | |||||
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 | |||||
#define VFMAXVV_FLOAT __riscv_vfmax_vv_f64m8 | |||||
#define VMFGEVF_FLOAT __riscv_vmfge_vf_f64m8_b8 | |||||
#define VMFIRSTM __riscv_vfirst_m_b8 | |||||
#define UINT_V_T vuint64m8_t | #define UINT_V_T vuint64m8_t | ||||
#define VIDV_MASK_UINT vid_v_u64m8_m | |||||
#define VIDV_UINT vid_v_u64m8 | |||||
#define VADDVX_MASK_UINT vadd_vx_u64m8_m | |||||
#define VADDVX_UINT vadd_vx_u64m8 | |||||
#define VMVVX_UINT vmv_v_x_u64m8 | |||||
#define VIDV_MASK_UINT __riscv_vid_v_u64m8_mu | |||||
#define VIDV_UINT __riscv_vid_v_u64m8 | |||||
#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_mu | |||||
#define VADDVX_UINT __riscv_vadd_vx_u64m8 | |||||
#define VMVVX_UINT __riscv_vmv_v_x_u64m8 | |||||
#define VCOMPRESS __riscv_vcompress_vm_u64m8 | |||||
#define VMV_X __riscv_vmv_x_s_u64m8_u64 | |||||
#else | #else | ||||
#define ABS fabsf | |||||
#define VSETVL(n) vsetvl_e32m8(n) | |||||
#define VSETVL_MAX vsetvlmax_e32m1() | |||||
#define VSETVL(n) __riscv_vsetvl_e32m8(n) | |||||
#define FLOAT_V_T vfloat32m8_t | #define FLOAT_V_T vfloat32m8_t | ||||
#define FLOAT_V_T_M1 vfloat32m1_t | #define FLOAT_V_T_M1 vfloat32m1_t | ||||
#define VLEV_FLOAT vle32_v_f32m8 | |||||
#define VLSEV_FLOAT vlse32_v_f32m8 | |||||
#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 | |||||
#define VLEV_FLOAT __riscv_vle32_v_f32m8 | |||||
#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 | |||||
#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m8_f32m1 | |||||
#define MASK_T vbool4_t | #define MASK_T vbool4_t | ||||
#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 | |||||
#define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||||
#define VFMAXVV_FLOAT vfmax_vv_f32m8 | |||||
#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4 | |||||
#define VMFIRSTM vmfirst_m_b4 | |||||
#define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m8_b4 | |||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 | |||||
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 | |||||
#define VFMAXVV_FLOAT __riscv_vfmax_vv_f32m8 | |||||
#define VMFGEVF_FLOAT __riscv_vmfge_vf_f32m8_b4 | |||||
#define VMFIRSTM __riscv_vfirst_m_b4 | |||||
#define UINT_V_T vuint32m8_t | #define UINT_V_T vuint32m8_t | ||||
#define VIDV_MASK_UINT vid_v_u32m8_m | |||||
#define VIDV_UINT vid_v_u32m8 | |||||
#define VADDVX_MASK_UINT vadd_vx_u32m8_m | |||||
#define VADDVX_UINT vadd_vx_u32m8 | |||||
#define VMVVX_UINT vmv_v_x_u32m8 | |||||
#define VIDV_MASK_UINT __riscv_vid_v_u32m8_mu | |||||
#define VIDV_UINT __riscv_vid_v_u32m8 | |||||
#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_mu | |||||
#define VADDVX_UINT __riscv_vadd_vx_u32m8 | |||||
#define VMVVX_UINT __riscv_vmv_v_x_u32m8 | |||||
#define VCOMPRESS __riscv_vcompress_vm_u32m8 | |||||
#define VMV_X __riscv_vmv_x_s_u32m8_u32 | |||||
#endif | #endif | ||||
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | ||||
{ | { | ||||
BLASLONG i=0, j=0; | |||||
BLASLONG i=0, j=0; | |||||
unsigned int max_index = 0; | unsigned int max_index = 0; | ||||
if (n <= 0 || inc_x <= 0) return(max_index); | |||||
FLOAT maxf=-FLT_MAX; | |||||
if (n <= 0 || inc_x <= 0) return(max_index); | |||||
FLOAT maxf=-FLT_MAX; | |||||
FLOAT_V_T vx, v_max; | FLOAT_V_T vx, v_max; | ||||
UINT_V_T v_max_index; | UINT_V_T v_max_index; | ||||
MASK_T mask; | MASK_T mask; | ||||
unsigned int gvl = 0; | unsigned int gvl = 0; | ||||
FLOAT_V_T_M1 v_res, v_min; | |||||
gvl = VSETVL_MAX; | |||||
v_res = VFMVVF_FLOAT_M1(0, gvl); | |||||
v_min = VFMVVF_FLOAT_M1(-FLT_MAX, gvl); | |||||
FLOAT_V_T_M1 v_res; | |||||
v_res = VFMVVF_FLOAT_M1(-FLT_MAX, 1); | |||||
if(inc_x == 1){ | if(inc_x == 1){ | ||||
gvl = VSETVL(n); | gvl = VSETVL(n); | ||||
@@ -104,32 +102,34 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
//index where element greater than v_max | //index where element greater than v_max | ||||
mask = VMFLTVV_FLOAT(v_max, vx, gvl); | mask = VMFLTVV_FLOAT(v_max, vx, gvl); | ||||
v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); | v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); | ||||
v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j,gvl); | |||||
v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, gvl); | |||||
//update v_max and start_index j | //update v_max and start_index j | ||||
v_max = VFMAXVV_FLOAT(v_max, vx, gvl); | v_max = VFMAXVV_FLOAT(v_max, vx, gvl); | ||||
j += gvl; | j += gvl; | ||||
} | } | ||||
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); | |||||
maxf = *((FLOAT*)&v_res); | |||||
v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); | |||||
maxf = EXTRACT_FLOAT(v_res); | |||||
mask = VMFGEVF_FLOAT(v_max, maxf, gvl); | mask = VMFGEVF_FLOAT(v_max, maxf, gvl); | ||||
max_index = VMFIRSTM(mask,gvl); | |||||
max_index = *((unsigned int*)&v_max_index+max_index); | |||||
UINT_V_T compressed; | |||||
compressed = VCOMPRESS(v_max_index, mask, gvl); | |||||
max_index = VMV_X(compressed); | |||||
if(j < n){ | if(j < n){ | ||||
gvl = VSETVL(n-j); | gvl = VSETVL(n-j); | ||||
v_max = VLEV_FLOAT(&x[j], gvl); | v_max = VLEV_FLOAT(&x[j], gvl); | ||||
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); | |||||
FLOAT cur_maxf = *((FLOAT*)&v_res); | |||||
v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); | |||||
FLOAT cur_maxf = EXTRACT_FLOAT(v_res); | |||||
if(cur_maxf > maxf){ | if(cur_maxf > maxf){ | ||||
//tail index | //tail index | ||||
v_max_index = VIDV_UINT(gvl); | v_max_index = VIDV_UINT(gvl); | ||||
v_max_index = VADDVX_UINT(v_max_index, j, gvl); | v_max_index = VADDVX_UINT(v_max_index, j, gvl); | ||||
mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); | mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); | ||||
max_index = VMFIRSTM(mask,gvl); | |||||
max_index = *((unsigned int*)&v_max_index+max_index); | |||||
UINT_V_T compressed; | |||||
compressed = VCOMPRESS(v_max_index, mask, gvl); | |||||
max_index = VMV_X(compressed); | |||||
} | } | ||||
} | } | ||||
}else{ | }else{ | ||||
@@ -145,37 +145,37 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
//index where element greater than v_max | //index where element greater than v_max | ||||
mask = VMFLTVV_FLOAT(v_max, vx, gvl); | mask = VMFLTVV_FLOAT(v_max, vx, gvl); | ||||
v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); | v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); | ||||
v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j,gvl); | |||||
v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, gvl); | |||||
//update v_max and start_index j | //update v_max and start_index j | ||||
v_max = VFMAXVV_FLOAT(v_max, vx, gvl); | v_max = VFMAXVV_FLOAT(v_max, vx, gvl); | ||||
j += gvl; | j += gvl; | ||||
idx += inc_v; | idx += inc_v; | ||||
} | } | ||||
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); | |||||
maxf = *((FLOAT*)&v_res); | |||||
v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); | |||||
maxf = EXTRACT_FLOAT(v_res); | |||||
mask = VMFGEVF_FLOAT(v_max, maxf, gvl); | mask = VMFGEVF_FLOAT(v_max, maxf, gvl); | ||||
max_index = VMFIRSTM(mask,gvl); | |||||
max_index = *((unsigned int*)&v_max_index+max_index); | |||||
UINT_V_T compressed; | |||||
compressed = VCOMPRESS(v_max_index, mask, gvl); | |||||
max_index = VMV_X(compressed); | |||||
if(j < n){ | if(j < n){ | ||||
gvl = VSETVL(n-j); | gvl = VSETVL(n-j); | ||||
v_max = VLSEV_FLOAT(&x[idx], stride_x, gvl); | v_max = VLSEV_FLOAT(&x[idx], stride_x, gvl); | ||||
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); | |||||
FLOAT cur_maxf = *((FLOAT*)&v_res); | |||||
v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); | |||||
FLOAT cur_maxf = EXTRACT_FLOAT(v_res); | |||||
if(cur_maxf > maxf){ | if(cur_maxf > maxf){ | ||||
//tail index | //tail index | ||||
v_max_index = VIDV_UINT(gvl); | v_max_index = VIDV_UINT(gvl); | ||||
v_max_index = VADDVX_UINT(v_max_index, j, gvl); | v_max_index = VADDVX_UINT(v_max_index, j, gvl); | ||||
mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); | mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); | ||||
max_index = VMFIRSTM(mask,gvl); | |||||
max_index = *((unsigned int*)&v_max_index+max_index); | |||||
UINT_V_T compressed; | |||||
compressed = VCOMPRESS(v_max_index, mask, gvl); | |||||
max_index = VMV_X(compressed); | |||||
} | } | ||||
} | } | ||||
} | } | ||||
return(max_index+1); | |||||
return(max_index+1); | |||||
} | } | ||||
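The tail handling in these kernels relies on the reduction being seeded with the running extreme: VFREDMAXVS_FLOAT(v_max, v_res, gvl) folds v_res back in, so cur_maxf can only move upward and the index search reruns only on a strict improvement. In scalar form (illustrative names, float case):

    /* Reduce a tail block seeded with the running maximum; the caller mirrors
       if(cur_maxf > maxf) above and recomputes indices only on improvement. */
    static float tail_reduce_max(const float *tail, int len, float running_max)
    {
        float cur = running_max;               /* seed, like v_res */
        for (int i = 0; i < len; i++)
            if (tail[i] > cur) cur = tail[i];
        return cur;                            /* never below running_max */
    }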
@@ -31,122 +31,105 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#if defined(DOUBLE) | #if defined(DOUBLE) | ||||
#define ABS fabs | |||||
#define VSETVL(n) vsetvl_e64m8(n) | |||||
#define VSETVL_MAX vsetvlmax_e64m1() | |||||
#define VSETVL(n) __riscv_vsetvl_e64m8(n) | |||||
#define FLOAT_V_T vfloat64m8_t | #define FLOAT_V_T vfloat64m8_t | ||||
#define FLOAT_V_T_M1 vfloat64m1_t | #define FLOAT_V_T_M1 vfloat64m1_t | ||||
#define VLEV_FLOAT vle64_v_f64m8 | |||||
#define VLSEV_FLOAT vlse64_v_f64m8 | |||||
#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 | |||||
#define VLEV_FLOAT __riscv_vle64_v_f64m8 | |||||
#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 | |||||
#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1 | |||||
#define MASK_T vbool8_t | #define MASK_T vbool8_t | ||||
#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 | |||||
#define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||||
#define VFMINVV_FLOAT vfmin_vv_f64m8 | |||||
#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8 | |||||
#define VMFIRSTM vmfirst_m_b8 | |||||
#define VMFGTVV_FLOAT __riscv_vmfgt_vv_f64m8_b8 | |||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 | |||||
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 | |||||
#define VFMINVV_FLOAT __riscv_vfmin_vv_f64m8 | |||||
#define VMFLEVF_FLOAT __riscv_vmfle_vf_f64m8_b8 | |||||
#define VMFIRSTM __riscv_vfirst_m_b8 | |||||
#define UINT_V_T vuint64m8_t | #define UINT_V_T vuint64m8_t | ||||
#define VIDV_MASK_UINT vid_v_u64m8_m | |||||
#define VIDV_UINT vid_v_u64m8 | |||||
#define VADDVX_MASK_UINT vadd_vx_u64m8_m | |||||
#define VADDVX_UINT vadd_vx_u64m8 | |||||
#define VMVVX_UINT vmv_v_x_u64m8 | |||||
#define VIDV_MASK_UINT __riscv_vid_v_u64m8_m | |||||
#define VIDV_UINT __riscv_vid_v_u64m8 | |||||
#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_m | |||||
#define VADDVX_UINT __riscv_vadd_vx_u64m8 | |||||
#define VMVVX_UINT __riscv_vmv_v_x_u64m8 | |||||
#define VCOMPRESS __riscv_vcompress_vm_u64m8 | |||||
#define VMV_X __riscv_vmv_x_s_u64m8_u64 | |||||
#else | #else | ||||
#define ABS fabsf | |||||
#define VSETVL(n) vsetvl_e32m8(n) | |||||
#define VSETVL_MAX vsetvlmax_e32m1() | |||||
#define VSETVL(n) __riscv_vsetvl_e32m8(n) | |||||
#define FLOAT_V_T vfloat32m8_t | #define FLOAT_V_T vfloat32m8_t | ||||
#define FLOAT_V_T_M1 vfloat32m1_t | #define FLOAT_V_T_M1 vfloat32m1_t | ||||
#define VLEV_FLOAT vle32_v_f32m8 | |||||
#define VLSEV_FLOAT vlse32_v_f32m8 | |||||
#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 | |||||
#define VLEV_FLOAT __riscv_vle32_v_f32m8 | |||||
#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 | |||||
#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1 | |||||
#define MASK_T vbool4_t | #define MASK_T vbool4_t | ||||
#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 | |||||
#define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||||
#define VFMINVV_FLOAT vfmin_vv_f32m8 | |||||
#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4 | |||||
#define VMFIRSTM vmfirst_m_b4 | |||||
#define VMFGTVV_FLOAT __riscv_vmfgt_vv_f32m8_b4 | |||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 | |||||
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 | |||||
#define VFMINVV_FLOAT __riscv_vfmin_vv_f32m8 | |||||
#define VMFLEVF_FLOAT __riscv_vmfle_vf_f32m8_b4 | |||||
#define VMFIRSTM __riscv_vfirst_m_b4 | |||||
#define UINT_V_T vuint32m8_t | #define UINT_V_T vuint32m8_t | ||||
#define VIDV_MASK_UINT vid_v_u32m8_m | |||||
#define VIDV_UINT vid_v_u32m8 | |||||
#define VADDVX_MASK_UINT vadd_vx_u32m8_m | |||||
#define VADDVX_UINT vadd_vx_u32m8 | |||||
#define VMVVX_UINT vmv_v_x_u32m8 | |||||
#define VIDV_MASK_UINT __riscv_vid_v_u32m8_m | |||||
#define VIDV_UINT __riscv_vid_v_u32m8 | |||||
#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_m | |||||
#define VADDVX_UINT __riscv_vadd_vx_u32m8 | |||||
#define VMVVX_UINT __riscv_vmv_v_x_u32m8 | |||||
#define VCOMPRESS __riscv_vcompress_vm_u32m8 | |||||
#define VMV_X __riscv_vmv_x_s_u32m8_u32 | |||||
#endif | #endif | ||||
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | ||||
{ | { | ||||
BLASLONG i=0, j=0; | |||||
FLOAT minf=FLT_MAX; | |||||
BLASLONG i=0, j=0; | |||||
unsigned int min_index = 0; | unsigned int min_index = 0; | ||||
if (n <= 0 || inc_x <= 0) return(min_index); | |||||
if (n <= 0 || inc_x <= 0) return(min_index); | |||||
FLOAT minf=FLT_MAX; | |||||
FLOAT_V_T vx, v_min; | FLOAT_V_T vx, v_min; | ||||
UINT_V_T v_min_index; | UINT_V_T v_min_index; | ||||
MASK_T mask; | MASK_T mask; | ||||
unsigned int gvl = 0; | unsigned int gvl = 0; | ||||
FLOAT_V_T_M1 v_res, v_max; | |||||
gvl = VSETVL_MAX; | |||||
v_res = VFMVVF_FLOAT_M1(0, gvl); | |||||
v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); | |||||
FLOAT_V_T_M1 v_res; | |||||
v_res = VFMVVF_FLOAT_M1(FLT_MAX, 1); | |||||
if(inc_x == 1){ | if(inc_x == 1){ | ||||
gvl = VSETVL(n); | gvl = VSETVL(n); | ||||
v_min = VFMVVF_FLOAT(FLT_MAX, gvl); | |||||
v_min_index = VMVVX_UINT(0, gvl); | v_min_index = VMVVX_UINT(0, gvl); | ||||
v_min = VFMVVF_FLOAT(FLT_MAX, gvl); | |||||
for(i=0,j=0; i < n/gvl; i++){ | for(i=0,j=0; i < n/gvl; i++){ | ||||
vx = VLEV_FLOAT(&x[j], gvl); | vx = VLEV_FLOAT(&x[j], gvl); | ||||
//index where element less than v_min | |||||
mask = VMFLTVV_FLOAT(vx, v_min, gvl); | |||||
v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); | |||||
/* | |||||
#if defined(DOUBLE) | |||||
asm volatile( | |||||
"vor.vv v0, %1, %1 \n\t" | |||||
"vsetvli x0, %2, e64,m8 \n\t" | |||||
"vid.v %0, v0.t \n\t" | |||||
:"+v"(v_min_index) | |||||
:"v"(mask), "r"(gvl) | |||||
:"v0"); | |||||
#else | |||||
asm volatile( | |||||
"vor.vv v0, %1, %1 \n\t" | |||||
"vsetvli x0, %2, e32,m8 \n\t" | |||||
"vid.v %0, v0.t \n\t" | |||||
:"+v"(v_min_index) | |||||
:"v"(mask), "r"(gvl) | |||||
:"v0"); | |||||
#endif | |||||
*/ | |||||
v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j,gvl); | |||||
//index where element less than v_min | |||||
mask = VMFGTVV_FLOAT(v_min, vx, gvl); | |||||
v_min_index = VIDV_MASK_UINT(mask, gvl); | |||||
v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, gvl); | |||||
//update v_min and start_index j | //update v_min and start_index j | ||||
v_min = VFMINVV_FLOAT(v_min, vx, gvl); | v_min = VFMINVV_FLOAT(v_min, vx, gvl); | ||||
j += gvl; | j += gvl; | ||||
} | } | ||||
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||||
minf = *((FLOAT*)&v_res); | |||||
v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); | |||||
minf = EXTRACT_FLOAT(v_res); | |||||
mask = VMFLEVF_FLOAT(v_min, minf, gvl); | mask = VMFLEVF_FLOAT(v_min, minf, gvl); | ||||
min_index = VMFIRSTM(mask,gvl); | |||||
min_index = *((unsigned int*)&v_min_index+min_index); | |||||
UINT_V_T compressed; | |||||
compressed = VCOMPRESS(v_min_index, mask, gvl); | |||||
min_index = VMV_X(compressed); | |||||
if(j < n){ | if(j < n){ | ||||
gvl = VSETVL(n-j); | gvl = VSETVL(n-j); | ||||
v_min = VLEV_FLOAT(&x[j], gvl); | v_min = VLEV_FLOAT(&x[j], gvl); | ||||
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||||
FLOAT cur_minf = *((FLOAT*)&v_res); | |||||
if(cur_minf < minf){ | |||||
v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); | |||||
FLOAT cur_minf = EXTRACT_FLOAT(v_res); | |||||
if(cur_minf < minf){ | |||||
//tail index | //tail index | ||||
v_min_index = VIDV_UINT(gvl); | v_min_index = VIDV_UINT(gvl); | ||||
v_min_index = VADDVX_UINT(v_min_index, j, gvl); | v_min_index = VADDVX_UINT(v_min_index, j, gvl); | ||||
mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); | mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); | ||||
min_index = VMFIRSTM(mask,gvl); | |||||
min_index = *((unsigned int*)&v_min_index+min_index); | |||||
UINT_V_T compressed; | |||||
compressed = VCOMPRESS(v_min_index, mask, gvl); | |||||
min_index = VMV_X(compressed); | |||||
} | } | ||||
} | } | ||||
}else{ | }else{ | ||||
@@ -159,59 +142,39 @@ asm volatile( | |||||
for(i=0,j=0; i < n/gvl; i++){ | for(i=0,j=0; i < n/gvl; i++){ | ||||
vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); | vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); | ||||
//index where element less than v_min | |||||
mask = VMFLTVV_FLOAT(vx, v_min, gvl); | |||||
v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); | |||||
/* | |||||
#if defined(DOUBLE) | |||||
asm volatile( | |||||
"vor.vv v0, %1, %1 \n\t" | |||||
"vsetvli x0, %2, e64,m8 \n\t" | |||||
"vid.v %0, v0.t \n\t" | |||||
:"+v"(v_min_index) | |||||
:"v"(mask), "r"(gvl) | |||||
:"v0"); | |||||
#else | |||||
asm volatile( | |||||
"vor.vv v0, %1, %1 \n\t" | |||||
"vsetvli x0, %2, e32,m8 \n\t" | |||||
"vid.v %0, v0.t \n\t" | |||||
:"+v"(v_min_index) | |||||
:"v"(mask), "r"(gvl) | |||||
:"v0"); | |||||
#endif | |||||
*/ | |||||
v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j,gvl); | |||||
//index where element less than v_min | |||||
mask = VMFGTVV_FLOAT(v_min, vx, gvl); | |||||
v_min_index = VIDV_MASK_UINT(mask, gvl); | |||||
v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, gvl); | |||||
//update v_min and start_index j | //update v_min and start_index j | ||||
v_min = VFMINVV_FLOAT(v_min, vx, gvl); | v_min = VFMINVV_FLOAT(v_min, vx, gvl); | ||||
j += gvl; | j += gvl; | ||||
idx += inc_v; | idx += inc_v; | ||||
} | } | ||||
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||||
minf = *((FLOAT*)&v_res); | |||||
v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); | |||||
minf = EXTRACT_FLOAT(v_res); | |||||
mask = VMFLEVF_FLOAT(v_min, minf, gvl); | mask = VMFLEVF_FLOAT(v_min, minf, gvl); | ||||
min_index = VMFIRSTM(mask,gvl); | |||||
min_index = *((unsigned int*)&v_min_index+min_index); | |||||
UINT_V_T compressed; | |||||
compressed = VCOMPRESS(v_min_index, mask, gvl); | |||||
min_index = VMV_X(compressed); | |||||
if(j < n){ | if(j < n){ | ||||
gvl = VSETVL(n-j); | gvl = VSETVL(n-j); | ||||
v_min = VLSEV_FLOAT(&x[idx], stride_x, gvl); | v_min = VLSEV_FLOAT(&x[idx], stride_x, gvl); | ||||
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||||
FLOAT cur_minf = *((FLOAT*)&v_res); | |||||
if(cur_minf < minf){ | |||||
v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); | |||||
FLOAT cur_minf = EXTRACT_FLOAT(v_res); | |||||
if(cur_minf < minf){ | |||||
//tail index | //tail index | ||||
v_min_index = VIDV_UINT(gvl); | v_min_index = VIDV_UINT(gvl); | ||||
v_min_index = VADDVX_UINT(v_min_index, j, gvl); | v_min_index = VADDVX_UINT(v_min_index, j, gvl); | ||||
mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); | mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); | ||||
min_index = VMFIRSTM(mask,gvl); | |||||
min_index = *((unsigned int*)&v_min_index+min_index); | |||||
UINT_V_T compressed; | |||||
compressed = VCOMPRESS(v_min_index, mask, gvl); | |||||
min_index = VMV_X(compressed); | |||||
} | } | ||||
} | } | ||||
} | } | ||||
return(min_index+1); | |||||
return(min_index+1); | |||||
} | } | ||||
@@ -27,241 +27,132 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#include "common.h" | #include "common.h" | ||||
#include <math.h> | #include <math.h> | ||||
#include <float.h> | |||||
#if defined(DOUBLE) | #if defined(DOUBLE) | ||||
#define VSETVL(n) vsetvl_e64m8(n) | |||||
#define VSETVL_MAX vsetvlmax_e64m1() | |||||
#define VSETVL(n) __riscv_vsetvl_e64m8(n) | |||||
#define FLOAT_V_T vfloat64m8_t | #define FLOAT_V_T vfloat64m8_t | ||||
#define FLOAT_V_T_M1 vfloat64m1_t | #define FLOAT_V_T_M1 vfloat64m1_t | ||||
#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 | |||||
#define VLSEV_FLOAT vlse64_v_f64m8 | |||||
#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 | |||||
#define VLEV_FLOAT __riscv_vle64_v_f64m8 | |||||
#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 | |||||
#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m8_f64m1 | |||||
#define MASK_T vbool8_t | #define MASK_T vbool8_t | ||||
#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 | |||||
#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 | |||||
#define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||||
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m | |||||
#define VFMAXVV_FLOAT vfmax_vv_f64m8 | |||||
#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8 | |||||
#define VMFIRSTM vmfirst_m_b8 | |||||
#define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m8_b8 | |||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 | |||||
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 | |||||
#define VFMAXVV_FLOAT __riscv_vfmax_vv_f64m8 | |||||
#define VMFGEVF_FLOAT __riscv_vmfge_vf_f64m8_b8 | |||||
#define VMFIRSTM __riscv_vfirst_m_b8 | |||||
#define UINT_V_T vuint64m8_t | #define UINT_V_T vuint64m8_t | ||||
#define VSEVU_UINT vse64_v_u64m8 | |||||
#define VSEVU_UINT __riscv_vse64_v_u64m8 | |||||
#define UINT_T long unsigned int | #define UINT_T long unsigned int | ||||
#define VIDV_MASK_UINT vid_v_u64m8_m | |||||
#define VIDV_UINT vid_v_u64m8 | |||||
#define VADDVX_MASK_UINT vadd_vx_u64m8_m | |||||
#define VADDVX_UINT vadd_vx_u64m8 | |||||
#define VFADDVV_FLOAT vfadd_vv_f64m8 | |||||
#define VMVVX_UINT vmv_v_x_u64m8 | |||||
#define VIDV_MASK_UINT __riscv_vid_v_u64m8_mu | |||||
#define VIDV_UINT __riscv_vid_v_u64m8 | |||||
#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_mu | |||||
#define VADDVX_UINT __riscv_vadd_vx_u64m8 | |||||
#define VMVVX_UINT __riscv_vmv_v_x_u64m8 | |||||
#define VFABS_FLOAT __riscv_vfabs_v_f64m8 | |||||
#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m8 | |||||
#define VCOMPRESS __riscv_vcompress_vm_u64m8 | |||||
#define VMV_X __riscv_vmv_x_s_u64m8_u64 | |||||
#else | #else | ||||
#define ABS fabsf | |||||
#define VSETVL(n) vsetvl_e32m8(n) | |||||
#define VSETVL_MAX vsetvlmax_e32m1() | |||||
#define VSETVL(n) __riscv_vsetvl_e32m8(n) | |||||
#define FLOAT_V_T vfloat32m8_t | #define FLOAT_V_T vfloat32m8_t | ||||
#define FLOAT_V_T_M1 vfloat32m1_t | #define FLOAT_V_T_M1 vfloat32m1_t | ||||
#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 | |||||
#define VLSEV_FLOAT vlse32_v_f32m8 | |||||
#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 | |||||
#define VLEV_FLOAT __riscv_vle32_v_f32m8 | |||||
#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 | |||||
#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m8_f32m1 | |||||
#define MASK_T vbool4_t | #define MASK_T vbool4_t | ||||
#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 | |||||
#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 | |||||
#define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||||
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m | |||||
#define VFMAXVV_FLOAT vfmax_vv_f32m8 | |||||
#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4 | |||||
#define VMFIRSTM vmfirst_m_b4 | |||||
#define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m8_b4 | |||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 | |||||
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 | |||||
#define VFMAXVV_FLOAT __riscv_vfmax_vv_f32m8 | |||||
#define VMFGEVF_FLOAT __riscv_vmfge_vf_f32m8_b4 | |||||
#define VMFIRSTM __riscv_vfirst_m_b4 | |||||
#define UINT_V_T vuint32m8_t | #define UINT_V_T vuint32m8_t | ||||
#define UINT_T unsigned int | #define UINT_T unsigned int | ||||
#define VSEVU_UINT vse32_v_u32m8 | |||||
#define VIDV_MASK_UINT vid_v_u32m8_m | |||||
#define VIDV_UINT vid_v_u32m8 | |||||
#define VADDVX_MASK_UINT vadd_vx_u32m8_m | |||||
#define VADDVX_UINT vadd_vx_u32m8 | |||||
#define VFADDVV_FLOAT vfadd_vv_f32m8 | |||||
#define VMVVX_UINT vmv_v_x_u32m8 | |||||
#define VSEVU_UINT __riscv_vse32_v_u32m8 | |||||
#define VIDV_MASK_UINT __riscv_vid_v_u32m8_mu | |||||
#define VIDV_UINT __riscv_vid_v_u32m8 | |||||
#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_mu | |||||
#define VADDVX_UINT __riscv_vadd_vx_u32m8 | |||||
#define VMVVX_UINT __riscv_vmv_v_x_u32m8 | |||||
#define VFABS_FLOAT __riscv_vfabs_v_f32m8 | |||||
#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m8 | |||||
#define VCOMPRESS __riscv_vcompress_vm_u32m8 | |||||
#define VMV_X __riscv_vmv_x_s_u32m8_u32 | |||||
#endif | #endif | ||||
#define RVV_M RVV_M8 | |||||
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | ||||
{ | { | ||||
BLASLONG i=0, j=0; | |||||
FLOAT maxf=0.0; | |||||
BLASLONG i=0, j=0; | |||||
unsigned int max_index = 0; | unsigned int max_index = 0; | ||||
if (n <= 0 || inc_x <= 0) return(max_index); | |||||
if (n <= 0 || inc_x <= 0) return(max_index); | |||||
FLOAT maxf=-FLT_MAX; | |||||
FLOAT_V_T vx0, vx1, v_max; | |||||
FLOAT_V_T vx, vx2, v_max; | |||||
UINT_V_T v_max_index; | UINT_V_T v_max_index; | ||||
MASK_T mask0, mask1; | |||||
MASK_T mask; | |||||
unsigned int gvl = 0; | unsigned int gvl = 0; | ||||
FLOAT_V_T_M1 v_res, v_z0; | |||||
gvl = VSETVL_MAX; | |||||
v_res = VFMVVF_FLOAT_M1(0, gvl); | |||||
v_z0 = VFMVVF_FLOAT_M1(0, gvl); | |||||
FLOAT_V_T_M1 v_res; | |||||
v_res = VFMVVF_FLOAT_M1(-FLT_MAX, 1); | |||||
gvl = VSETVL(n); | gvl = VSETVL(n); | ||||
UINT_T temp_uint[gvl]; | |||||
unsigned int stride_x = inc_x * 2 * sizeof(FLOAT); | |||||
unsigned int idx = 0, inc_v = gvl * inc_x * 2; | |||||
v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); | |||||
v_max_index = VMVVX_UINT(0, gvl); | v_max_index = VMVVX_UINT(0, gvl); | ||||
v_max = VFMVVF_FLOAT(-1, gvl); | |||||
BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); | |||||
BLASLONG inc_xv = gvl * inc_x * 2; | |||||
BLASLONG ix = 0; | |||||
for(i=0,j=0; i < n/gvl; i++){ | for(i=0,j=0; i < n/gvl; i++){ | ||||
vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||||
//fabs(vector) | |||||
mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); | |||||
vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); | |||||
/* | |||||
#if defined(DOUBLE) | |||||
asm volatile( | |||||
"vor.vv v0, %1, %1\n\t" | |||||
"vsetvli x0, %3, e64,m8 \n\t" | |||||
"vfrsub.vf %0, %0, %2, v0.t \n\t" | |||||
:"+v"(vx0) | |||||
:"v"(mask0), "f"(zero), "r"(gvl) | |||||
:"v0"); | |||||
#else | |||||
asm volatile( | |||||
"vor.vv v0, %1, %1\n\t" | |||||
"vsetvli x0, %3, e32,m8 \n\t" | |||||
"vfrsub.vf %0, %0, %2, v0.t \n\t" | |||||
:"+v"(vx0) | |||||
:"v"(mask0), "f"(zero), "r"(gvl) | |||||
:"v0"); | |||||
#endif | |||||
*/ | |||||
vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | |||||
//fabs(vector) | |||||
mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); | |||||
vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl); | |||||
/* | |||||
#if defined(DOUBLE) | |||||
asm volatile( | |||||
"vor.vv v0, %1, %1\n\t" | |||||
"vsetvli x0, %3, e64,m8 \n\t" | |||||
"vfrsub.vf %0, %0, %2, v0.t \n\t" | |||||
:"+v"(vx1) | |||||
:"v"(mask1), "f"(zero), "r"(gvl) | |||||
:"v0"); | |||||
#else | |||||
asm volatile( | |||||
"vor.vv v0, %1, %1\n\t" | |||||
"vsetvli x0, %3, e32,m8 \n\t" | |||||
"vfrsub.vf %0, %0, %2, v0.t \n\t" | |||||
:"+v"(vx1) | |||||
:"v"(mask1), "f"(zero), "r"(gvl) | |||||
:"v0"); | |||||
#endif | |||||
*/ | |||||
vx0 = VFADDVV_FLOAT(vx0, vx1, gvl); | |||||
vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); | |||||
vx2 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl); | |||||
vx = VFABS_FLOAT(vx, gvl); | |||||
vx2 = VFABS_FLOAT(vx2, gvl); | |||||
vx = VFADDVV_FLOAT(vx, vx2, gvl); | |||||
//index where element greater than v_max | //index where element greater than v_max | ||||
mask0 = VMFLTVV_FLOAT(v_max, vx0, gvl); | |||||
v_max_index = VIDV_MASK_UINT(mask0, v_max_index, gvl); | |||||
/* | |||||
#if defined(DOUBLE) | |||||
asm volatile( | |||||
"vor.vv v0, %1, %1 \n\t" | |||||
"vsetvli x0, %2, e64,m8 \n\t" | |||||
"vid.v %0, v0.t \n\t" | |||||
:"+v"(v_max_index) | |||||
:"v"(mask0), "r"(gvl) | |||||
:"v0"); | |||||
#else | |||||
asm volatile( | |||||
"vor.vv v0, %1, %1 \n\t" | |||||
"vsetvli x0, %2, e32,m8 \n\t" | |||||
"vid.v %0, v0.t \n\t" | |||||
:"+v"(v_max_index) | |||||
:"v"(mask0), "r"(gvl) | |||||
:"v0"); | |||||
#endif | |||||
*/ | |||||
v_max_index = VADDVX_MASK_UINT(mask0, v_max_index, v_max_index, j, gvl); | |||||
mask = VMFLTVV_FLOAT(v_max, vx, gvl); | |||||
v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); | |||||
v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, gvl); | |||||
//update v_max and start_index j | //update v_max and start_index j | ||||
v_max = VFMAXVV_FLOAT(v_max, vx0, gvl); | |||||
v_max = VFMAXVV_FLOAT(v_max, vx, gvl); | |||||
j += gvl; | j += gvl; | ||||
ix += inc_xv; | |||||
idx += inc_v; | |||||
} | } | ||||
vx0 = VFMVVF_FLOAT(0, gvl); | |||||
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); | |||||
maxf = VFMVFS_FLOAT(v_res); | |||||
mask0 = VMFGEVF_FLOAT(v_max, maxf, gvl); | |||||
max_index = VMFIRSTM(mask0,gvl); | |||||
VSEVU_UINT(temp_uint,v_max_index,gvl); | |||||
max_index = temp_uint[max_index]; | |||||
v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); | |||||
maxf = EXTRACT_FLOAT(v_res); | |||||
mask = VMFGEVF_FLOAT(v_max, maxf, gvl); | |||||
UINT_V_T compressed; | |||||
compressed = VCOMPRESS(v_max_index, mask, gvl); | |||||
max_index = VMV_X(compressed); | |||||
if(j < n){ | if(j < n){ | ||||
gvl = VSETVL(n-j); | gvl = VSETVL(n-j); | ||||
v_max_index = VMVVX_UINT(0, gvl); | |||||
vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||||
//fabs(vector) | |||||
mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); | |||||
vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); | |||||
/* | |||||
#if defined(DOUBLE) | |||||
asm volatile( | |||||
"vor.vv v0, %1, %1\n\t" | |||||
"vsetvli x0, %3, e64,m8 \n\t" | |||||
"vfrsub.vf %0, %0, %2, v0.t \n\t" | |||||
:"+v"(vx0) | |||||
:"v"(mask0), "f"(zero), "r"(gvl) | |||||
:"v0"); | |||||
#else | |||||
asm volatile( | |||||
"vor.vv v0, %1, %1\n\t" | |||||
"vsetvli x0, %3, e32,m8 \n\t" | |||||
"vfrsub.vf %0, %0, %2, v0.t \n\t" | |||||
:"+v"(vx0) | |||||
:"v"(mask0), "f"(zero), "r"(gvl) | |||||
:"v0"); | |||||
#endif | |||||
*/ | |||||
vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | |||||
//fabs(vector) | |||||
mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); | |||||
vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl); | |||||
/* | |||||
#if defined(DOUBLE) | |||||
asm volatile( | |||||
"vor.vv v0, %1, %1\n\t" | |||||
"vsetvli x0, %3, e64,m8 \n\t" | |||||
"vfrsub.vf %0, %0, %2, v0.t \n\t" | |||||
:"+v"(vx1) | |||||
:"v"(mask1), "f"(zero), "r"(gvl) | |||||
:"v0"); | |||||
#else | |||||
asm volatile( | |||||
"vor.vv v0, %1, %1\n\t" | |||||
"vsetvli x0, %3, e32,m8 \n\t" | |||||
"vfrsub.vf %0, %0, %2, v0.t \n\t" | |||||
:"+v"(vx1) | |||||
:"v"(mask1), "f"(zero), "r"(gvl) | |||||
:"v0"); | |||||
#endif | |||||
*/ | |||||
v_max = VFADDVV_FLOAT(vx0, vx1, gvl); | |||||
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); | |||||
FLOAT cur_maxf = VFMVFS_FLOAT(v_res); | |||||
v_max = VLSEV_FLOAT(&x[idx], stride_x, gvl); | |||||
vx2 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl); | |||||
v_max = VFABS_FLOAT(v_max, gvl); | |||||
vx2 = VFABS_FLOAT(vx2, gvl); | |||||
v_max = VFADDVV_FLOAT(v_max, vx2, gvl); | |||||
v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); | |||||
FLOAT cur_maxf = EXTRACT_FLOAT(v_res); | |||||
if(cur_maxf > maxf){ | if(cur_maxf > maxf){ | ||||
//tail index | //tail index | ||||
v_max_index = VIDV_UINT(gvl); | v_max_index = VIDV_UINT(gvl); | ||||
v_max_index = VADDVX_UINT(v_max_index, j, gvl); | v_max_index = VADDVX_UINT(v_max_index, j, gvl); | ||||
mask0 = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); | |||||
max_index = VMFIRSTM(mask0,gvl); | |||||
VSEVU_UINT(temp_uint,v_max_index,gvl); | |||||
max_index = temp_uint[max_index]; | |||||
mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); | |||||
UINT_V_T compressed; | |||||
compressed = VCOMPRESS(v_max_index, mask, gvl); | |||||
max_index = VMV_X(compressed); | |||||
} | } | ||||
} | } | ||||
return(max_index+1); | |||||
} | |||||
return(max_index+1); | |||||
} |
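The hunk above replaces the old vmfirst-plus-store-to-stack index extraction with a vcompress of the index vector under the "equals the reduced maximum" mask, followed by a move of lane 0. A minimal scalar model of that extraction, assuming gvl fits a small fixed buffer (names here are illustrative, not the kernel's):

    #include <stdio.h>

    /* Scalar model of VCOMPRESS + VMV_X: pack the indices of lanes whose
     * value compares >= maxf to the front (vcompress preserves lane
     * order), then read lane 0 to get the first matching index. The
     * reduction guarantees at least one lane matches, so packed[0] is
     * always written. */
    static unsigned int first_index_ge(const float *vals, const unsigned int *idx,
                                       float maxf, int gvl) {
        unsigned int packed[64];              /* assumes gvl <= 64 here */
        int k = 0;
        for (int i = 0; i < gvl; ++i)         /* mask = VMFGEVF(v, maxf) */
            if (vals[i] >= maxf)
                packed[k++] = idx[i];
        return packed[0];                     /* VMV_X reads element 0 */
    }

    int main(void) {
        float v[4] = {1.0f, 7.0f, 7.0f, 3.0f};
        unsigned int ix[4] = {0, 1, 2, 3};
        printf("%u\n", first_index_ge(v, ix, 7.0f, 4));   /* prints 1 */
        return 0;
    }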
@@ -31,235 +31,128 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#if defined(DOUBLE) | #if defined(DOUBLE) | ||||
#define VSETVL(n) vsetvl_e64m8(n) | |||||
#define VSETVL_MAX vsetvlmax_e64m1() | |||||
#define VSETVL(n) __riscv_vsetvl_e64m8(n) | |||||
#define FLOAT_V_T vfloat64m8_t | #define FLOAT_V_T vfloat64m8_t | ||||
#define FLOAT_V_T_M1 vfloat64m1_t | #define FLOAT_V_T_M1 vfloat64m1_t | ||||
#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 | |||||
#define VLSEV_FLOAT vlse64_v_f64m8 | |||||
#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 | |||||
#define VLEV_FLOAT __riscv_vle64_v_f64m8 | |||||
#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 | |||||
#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1 | |||||
#define MASK_T vbool8_t | #define MASK_T vbool8_t | ||||
#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 | |||||
#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 | |||||
#define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||||
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m | |||||
#define VFMINVV_FLOAT vfmin_vv_f64m8 | |||||
#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8 | |||||
#define VMFIRSTM vmfirst_m_b8 | |||||
#define VMFGTVV_FLOAT __riscv_vmfgt_vv_f64m8_b8 | |||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 | |||||
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 | |||||
#define VFMINVV_FLOAT __riscv_vfmin_vv_f64m8 | |||||
#define VMFLEVF_FLOAT __riscv_vmfle_vf_f64m8_b8 | |||||
#define VMFIRSTM __riscv_vfirst_m_b8 | |||||
#define UINT_V_T vuint64m8_t | #define UINT_V_T vuint64m8_t | ||||
#define VSEVU_UINT vse64_v_u64m8 | #define VSEVU_UINT vse64_v_u64m8 | ||||
#define UINT_T long unsigned int | #define UINT_T long unsigned int | ||||
#define VIDV_MASK_UINT vid_v_u64m8_m | |||||
#define VIDV_UINT vid_v_u64m8 | |||||
#define VADDVX_MASK_UINT vadd_vx_u64m8_m | |||||
#define VADDVX_UINT vadd_vx_u64m8 | |||||
#define VFADDVV_FLOAT vfadd_vv_f64m8 | |||||
#define VMVVX_UINT vmv_v_x_u64m8 | |||||
#define VIDV_MASK_UINT __riscv_vid_v_u64m8_mu | |||||
#define VIDV_UINT __riscv_vid_v_u64m8 | |||||
#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_mu | |||||
#define VADDVX_UINT __riscv_vadd_vx_u64m8 | |||||
#define VMVVX_UINT __riscv_vmv_v_x_u64m8 | |||||
#define VFABS_FLOAT __riscv_vfabs_v_f64m8 | |||||
#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m8 | |||||
#define VCOMPRESS __riscv_vcompress_vm_u64m8 | |||||
#define VMV_X __riscv_vmv_x_s_u64m8_u64 | |||||
#else | #else | ||||
#define ABS fabsf | |||||
#define VSETVL(n) vsetvl_e32m8(n) | |||||
#define VSETVL_MAX vsetvlmax_e32m1() | |||||
#define VSETVL(n) __riscv_vsetvl_e32m8(n) | |||||
#define FLOAT_V_T vfloat32m8_t | #define FLOAT_V_T vfloat32m8_t | ||||
#define FLOAT_V_T_M1 vfloat32m1_t | #define FLOAT_V_T_M1 vfloat32m1_t | ||||
#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 | |||||
#define VLSEV_FLOAT vlse32_v_f32m8 | |||||
#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 | |||||
#define VLEV_FLOAT __riscv_vle32_v_f32m8 | |||||
#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 | |||||
#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1 | |||||
#define MASK_T vbool4_t | #define MASK_T vbool4_t | ||||
#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 | |||||
#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 | |||||
#define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||||
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m | |||||
#define VFMINVV_FLOAT vfmin_vv_f32m8 | |||||
#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4 | |||||
#define VMFIRSTM vmfirst_m_b4 | |||||
#define VMFGTVV_FLOAT __riscv_vmfgt_vv_f32m8_b4 | |||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 | |||||
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 | |||||
#define VFMINVV_FLOAT __riscv_vfmin_vv_f32m8 | |||||
#define VMFLEVF_FLOAT __riscv_vmfle_vf_f32m8_b4 | |||||
#define VMFIRSTM __riscv_vfirst_m_b4 | |||||
#define UINT_V_T vuint32m8_t | #define UINT_V_T vuint32m8_t | ||||
#define UINT_T unsigned int | #define UINT_T unsigned int | ||||
#define VSEVU_UINT vse32_v_u32m8 | |||||
#define VIDV_MASK_UINT vid_v_u32m8_m | |||||
#define VIDV_UINT vid_v_u32m8 | |||||
#define VADDVX_MASK_UINT vadd_vx_u32m8_m | |||||
#define VADDVX_UINT vadd_vx_u32m8 | |||||
#define VFADDVV_FLOAT vfadd_vv_f32m8 | |||||
#define VMVVX_UINT vmv_v_x_u32m8 | |||||
#define VSEVU_UINT __riscv_vse32_v_u32m8 | |||||
#define VIDV_MASK_UINT __riscv_vid_v_u32m8_mu | |||||
#define VIDV_UINT __riscv_vid_v_u32m8 | |||||
#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_mu | |||||
#define VADDVX_UINT __riscv_vadd_vx_u32m8 | |||||
#define VMVVX_UINT __riscv_vmv_v_x_u32m8 | |||||
#define VFABS_FLOAT __riscv_vfabs_v_f32m8 | |||||
#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m8 | |||||
#define VCOMPRESS __riscv_vcompress_vm_u32m8 | |||||
#define VMV_X __riscv_vmv_x_s_u32m8_u32 | |||||
#endif | #endif | ||||
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | ||||
{ | { | ||||
BLASLONG i=0, j=0; | |||||
FLOAT minf=FLT_MAX; | |||||
BLASLONG i=0, j=0; | |||||
unsigned int min_index = 0; | unsigned int min_index = 0; | ||||
if (n <= 0 || inc_x <= 0) return(min_index); | |||||
if (n <= 0 || inc_x <= 0) return(min_index); | |||||
FLOAT minf=FLT_MAX; | |||||
FLOAT_V_T vx0, vx1, v_min; | |||||
FLOAT_V_T vx, vx2, v_min; | |||||
UINT_V_T v_min_index; | UINT_V_T v_min_index; | ||||
MASK_T mask0, mask1; | |||||
MASK_T mask; | |||||
unsigned int gvl = 0; | unsigned int gvl = 0; | ||||
FLOAT_V_T_M1 v_res, v_max; | |||||
gvl = VSETVL_MAX; | |||||
v_res = VFMVVF_FLOAT_M1(0, gvl); | |||||
v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); | |||||
FLOAT_V_T_M1 v_res; | |||||
v_res = VFMVVF_FLOAT_M1(FLT_MAX, 1); | |||||
gvl = VSETVL(n); | gvl = VSETVL(n); | ||||
UINT_T temp_uint[gvl]; | |||||
v_min_index = VMVVX_UINT(0, gvl); | |||||
unsigned int stride_x = inc_x * 2 * sizeof(FLOAT); | |||||
unsigned int idx = 0, inc_v = gvl * inc_x * 2; | |||||
v_min = VFMVVF_FLOAT(FLT_MAX, gvl); | v_min = VFMVVF_FLOAT(FLT_MAX, gvl); | ||||
BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); | |||||
BLASLONG inc_xv = gvl * inc_x * 2; | |||||
BLASLONG ix = 0; | |||||
v_min_index = VMVVX_UINT(0, gvl); | |||||
for(i=0,j=0; i < n/gvl; i++){ | for(i=0,j=0; i < n/gvl; i++){ | ||||
vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||||
//fabs(vector) | |||||
mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); | |||||
vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); | |||||
/* | |||||
#if defined(DOUBLE) | |||||
asm volatile( | |||||
"vor.vv v0, %1, %1\n\t" | |||||
"vsetvli x0, %3, e64,m8 \n\t" | |||||
"vfrsub.vf %0, %0, %2, v0.t \n\t" | |||||
:"+v"(vx0) | |||||
:"v"(mask0), "f"(zero), "r"(gvl) | |||||
:"v0"); | |||||
#else | |||||
asm volatile( | |||||
"vor.vv v0, %1, %1\n\t" | |||||
"vsetvli x0, %3, e32,m8 \n\t" | |||||
"vfrsub.vf %0, %0, %2, v0.t \n\t" | |||||
:"+v"(vx0) | |||||
:"v"(mask0), "f"(zero), "r"(gvl) | |||||
:"v0"); | |||||
#endif | |||||
*/ | |||||
vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | |||||
//fabs(vector) | |||||
mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); | |||||
vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl); | |||||
/* | |||||
#if defined(DOUBLE) | |||||
asm volatile( | |||||
"vor.vv v0, %1, %1\n\t" | |||||
"vsetvli x0, %3, e64,m8 \n\t" | |||||
"vfrsub.vf %0, %0, %2, v0.t \n\t" | |||||
:"+v"(vx1) | |||||
:"v"(mask1), "f"(zero), "r"(gvl) | |||||
:"v0"); | |||||
#else | |||||
asm volatile( | |||||
"vor.vv v0, %1, %1\n\t" | |||||
"vsetvli x0, %3, e32,m8 \n\t" | |||||
"vfrsub.vf %0, %0, %2, v0.t \n\t" | |||||
:"+v"(vx1) | |||||
:"v"(mask1), "f"(zero), "r"(gvl) | |||||
:"v0"); | |||||
#endif | |||||
*/ | |||||
vx0 = VFADDVV_FLOAT(vx0, vx1, gvl); | |||||
vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); | |||||
vx2 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl); | |||||
vx = VFABS_FLOAT(vx, gvl); | |||||
vx2 = VFABS_FLOAT(vx2, gvl); | |||||
vx = VFADDVV_FLOAT(vx, vx2, gvl); | |||||
//index where element less than v_min | |||||
mask0 = VMFLTVV_FLOAT(vx0, v_min, gvl); | |||||
v_min_index = VIDV_MASK_UINT(mask0, v_min_index, gvl); | |||||
/* | |||||
#if defined(DOUBLE) | |||||
asm volatile( | |||||
"vor.vv v0, %1, %1 \n\t" | |||||
"vsetvli x0, %2, e64,m8 \n\t" | |||||
"vid.v %0, v0.t \n\t" | |||||
:"+v"(v_min_index) | |||||
:"v"(mask0), "r"(gvl) | |||||
:"v0"); | |||||
#else | |||||
asm volatile( | |||||
"vor.vv v0, %1, %1 \n\t" | |||||
"vsetvli x0, %2, e32,m8 \n\t" | |||||
"vid.v %0, v0.t \n\t" | |||||
:"+v"(v_min_index) | |||||
:"v"(mask0), "r"(gvl) | |||||
:"v0"); | |||||
#endif | |||||
*/ | |||||
v_min_index = VADDVX_MASK_UINT(mask0, v_min_index, v_min_index, j, gvl); | |||||
//index where element greater than v_min | |||||
mask = VMFGTVV_FLOAT(v_min, vx, gvl); | |||||
v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); | |||||
v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, gvl); | |||||
//update v_min and start_index j | //update v_min and start_index j | ||||
v_min = VFMINVV_FLOAT(v_min, vx0, gvl); | |||||
v_min = VFMINVV_FLOAT(v_min, vx, gvl); | |||||
j += gvl; | j += gvl; | ||||
ix += inc_xv; | |||||
idx += inc_v; | |||||
} | } | ||||
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||||
minf = VFMVFS_FLOAT(v_res); | |||||
mask0 = VMFLEVF_FLOAT(v_min, minf, gvl); | |||||
min_index = VMFIRSTM(mask0,gvl); | |||||
VSEVU_UINT(temp_uint,v_min_index,gvl); | |||||
min_index = temp_uint[min_index]; | |||||
v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); | |||||
minf = EXTRACT_FLOAT(v_res); | |||||
mask = VMFLEVF_FLOAT(v_min, minf, gvl); | |||||
UINT_V_T compressed; | |||||
compressed = VCOMPRESS(v_min_index, mask, gvl); | |||||
min_index = VMV_X(compressed); | |||||
if(j < n){ | if(j < n){ | ||||
gvl = VSETVL(n-j); | gvl = VSETVL(n-j); | ||||
v_min_index = VMVVX_UINT(0, gvl); | |||||
vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||||
//fabs(vector) | |||||
mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); | |||||
vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); | |||||
/* | |||||
#if defined(DOUBLE) | |||||
asm volatile( | |||||
"vor.vv v0, %1, %1\n\t" | |||||
"vsetvli x0, %3, e64,m8 \n\t" | |||||
"vfrsub.vf %0, %0, %2, v0.t \n\t" | |||||
:"+v"(vx0) | |||||
:"v"(mask0), "f"(zero), "r"(gvl) | |||||
:"v0"); | |||||
#else | |||||
asm volatile( | |||||
"vor.vv v0, %1, %1\n\t" | |||||
"vsetvli x0, %3, e32,m8 \n\t" | |||||
"vfrsub.vf %0, %0, %2, v0.t \n\t" | |||||
:"+v"(vx0) | |||||
:"v"(mask0), "f"(zero), "r"(gvl) | |||||
:"v0"); | |||||
#endif | |||||
*/ | |||||
vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | |||||
//fabs(vector) | |||||
mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); | |||||
vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl); | |||||
/* | |||||
#if defined(DOUBLE) | |||||
asm volatile( | |||||
"vor.vv v0, %1, %1\n\t" | |||||
"vsetvli x0, %3, e64,m8 \n\t" | |||||
"vfrsub.vf %0, %0, %2, v0.t \n\t" | |||||
:"+v"(vx1) | |||||
:"v"(mask1), "f"(zero), "r"(gvl) | |||||
:"v0"); | |||||
#else | |||||
asm volatile( | |||||
"vor.vv v0, %1, %1\n\t" | |||||
"vsetvli x0, %3, e32,m8 \n\t" | |||||
"vfrsub.vf %0, %0, %2, v0.t \n\t" | |||||
:"+v"(vx1) | |||||
:"v"(mask1), "f"(zero), "r"(gvl) | |||||
:"v0"); | |||||
#endif | |||||
*/ | |||||
v_min = VFADDVV_FLOAT(vx0, vx1, gvl); | |||||
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||||
FLOAT cur_minf = VFMVFS_FLOAT(v_res); | |||||
if(cur_minf < minf){ | |||||
v_min = VLSEV_FLOAT(&x[idx], stride_x, gvl); | |||||
vx2 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl); | |||||
v_min = VFABS_FLOAT(v_min, gvl); | |||||
vx2 = VFABS_FLOAT(vx2, gvl); | |||||
v_min = VFADDVV_FLOAT(v_min, vx2, gvl); | |||||
v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); | |||||
FLOAT cur_minf = EXTRACT_FLOAT(v_res); | |||||
if(cur_minf < minf){ | |||||
//tail index | //tail index | ||||
v_min_index = VIDV_UINT(gvl); | v_min_index = VIDV_UINT(gvl); | ||||
v_min_index = VADDVX_UINT(v_min_index, j, gvl); | v_min_index = VADDVX_UINT(v_min_index, j, gvl); | ||||
mask0 = VMFLEVF_FLOAT(v_min, cur_minf, gvl); | |||||
min_index = VMFIRSTM(mask0,gvl); | |||||
VSEVU_UINT(temp_uint,v_min_index,gvl); | |||||
min_index = temp_uint[min_index]; | |||||
mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); | |||||
UINT_V_T compressed; | |||||
compressed = VCOMPRESS(v_min_index, mask, gvl); | |||||
min_index = VMV_X(compressed); | |||||
} | } | ||||
} | } | ||||
return(min_index+1); | |||||
} | |||||
return(min_index+1); | |||||
} |
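For reference, the quantity these iz?amax/iz?amin kernels rank is the BLAS "cabs1" value |Re| + |Im| (hence the paired strided loads from x[idx] and x[idx+1] plus VFABS and VFADDVV above), not the Euclidean modulus. A plain scalar reference, offered only as an illustration of the semantics:

    #include <math.h>
    #include <stdio.h>

    /* Scalar reference for izamin: rank complex elements by |Re| + |Im|
     * and return the 1-based index of the smallest (first on ties). */
    static long izamin_ref(long n, const double *x, long inc_x) {
        long best = 0;
        double bestv = fabs(x[0]) + fabs(x[1]);
        for (long i = 1; i < n; ++i) {
            double v = fabs(x[2 * i * inc_x]) + fabs(x[2 * i * inc_x + 1]);
            if (v < bestv) { bestv = v; best = i; }
        }
        return best + 1;
    }

    int main(void) {
        double x[8] = {3, 4, 0.5, -0.25, -2, 1, 0.5, 0.5};
        printf("%ld\n", izamin_ref(4, x, 1));   /* prints 2: 0.5-0.25i */
        return 0;
    }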
@@ -28,30 +28,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#include "common.h" | #include "common.h" | ||||
#include <math.h> | #include <math.h> | ||||
#include <float.h> | #include <float.h> | ||||
#if !defined(DOUBLE) | |||||
#define VSETVL(n) vsetvl_e32m8(n) | |||||
#define VSETVL_MAX vsetvlmax_e32m1() | |||||
#define FLOAT_V_T vfloat32m8_t | |||||
#define FLOAT_V_T_M1 vfloat32m1_t | |||||
#define VLEV_FLOAT vle32_v_f32m8 | |||||
#define VLSEV_FLOAT vlse32_v_f32m8 | |||||
#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 | |||||
#define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||||
#define VFMAXVV_FLOAT vfmax_vv_f32m8 | |||||
#ifdef RISCV64_ZVL256B | |||||
# define LMUL m2 | |||||
# if defined(DOUBLE) | |||||
# define ELEN 64 | |||||
# define MLEN 32 | |||||
# else | |||||
# define ELEN 32 | |||||
# define MLEN 16 | |||||
# endif | |||||
#else | #else | ||||
#define VSETVL(n) vsetvl_e64m8(n) | |||||
#define VSETVL_MAX vsetvlmax_e64m1() | |||||
#define FLOAT_V_T vfloat64m8_t | |||||
#define FLOAT_V_T_M1 vfloat64m1_t | |||||
#define VLEV_FLOAT vle64_v_f64m8 | |||||
#define VLSEV_FLOAT vlse64_v_f64m8 | |||||
#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 | |||||
#define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||||
#define VFMAXVV_FLOAT vfmax_vv_f64m8 | |||||
# define LMUL m8 | |||||
# if defined(DOUBLE) | |||||
# define ELEN 64 | |||||
# define MLEN 8 | |||||
# else | |||||
# define ELEN 32 | |||||
# define MLEN 4 | |||||
# endif | |||||
#endif | #endif | ||||
#define _ | |||||
#define JOIN2_X(x, y) x ## y | |||||
#define JOIN2(x, y) JOIN2_X(x, y) | |||||
#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) | |||||
#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) | |||||
#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) | |||||
#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) | |||||
#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) | |||||
#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) | |||||
#define VFREDMAXVS_FLOAT JOIN(__riscv_vfredmax_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1)) | |||||
#define MASK_T JOIN(vbool, MLEN, _t, _, _) | |||||
#define VMFLTVF_FLOAT JOIN(__riscv_vmflt_vf_f, ELEN, LMUL, _b, MLEN) | |||||
#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _) | |||||
#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _) | |||||
#define VFMAXVV_FLOAT JOIN(__riscv_vfmax, _vv_f, ELEN, LMUL, _) | |||||
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | ||||
{ | { | ||||
BLASLONG i=0, j=0; | BLASLONG i=0, j=0; | ||||
@@ -59,10 +73,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
FLOAT maxf=-FLT_MAX; | FLOAT maxf=-FLT_MAX; | ||||
unsigned int gvl = 0; | unsigned int gvl = 0; | ||||
FLOAT_V_T v0, v1, v_max; | FLOAT_V_T v0, v1, v_max; | ||||
FLOAT_V_T_M1 v_res, v_min; | |||||
gvl = VSETVL_MAX; | |||||
v_res = VFMVVF_FLOAT_M1(0, gvl); | |||||
v_min = VFMVVF_FLOAT_M1(-FLT_MAX, gvl); | |||||
FLOAT_V_T_M1 v_res; | |||||
v_res = VFMVVF_FLOAT_M1(-FLT_MAX, 1); | |||||
if(inc_x == 1){ | if(inc_x == 1){ | ||||
gvl = VSETVL(n); | gvl = VSETVL(n); | ||||
@@ -76,15 +88,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
v_max = VFMAXVV_FLOAT(v_max, v1, gvl); | v_max = VFMAXVV_FLOAT(v_max, v1, gvl); | ||||
j += gvl * 2; | j += gvl * 2; | ||||
} | } | ||||
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); | |||||
maxf = *((FLOAT*)&v_res); | |||||
v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); | |||||
} | } | ||||
for(;j<n;){ | for(;j<n;){ | ||||
gvl = VSETVL(n-j); | gvl = VSETVL(n-j); | ||||
v0 = VLEV_FLOAT(&x[j], gvl); | v0 = VLEV_FLOAT(&x[j], gvl); | ||||
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_min, gvl); | |||||
if(*((FLOAT*)&v_res) > maxf) | |||||
maxf = *((FLOAT*)&v_res); | |||||
v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl); | |||||
j += gvl; | j += gvl; | ||||
} | } | ||||
}else{ | }else{ | ||||
@@ -102,18 +111,16 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
j += gvl * 2; | j += gvl * 2; | ||||
idx += inc_xv * 2; | idx += inc_xv * 2; | ||||
} | } | ||||
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); | |||||
maxf = *((FLOAT*)&v_res); | |||||
v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); | |||||
} | } | ||||
for(;j<n;){ | for(;j<n;){ | ||||
gvl = VSETVL(n-j); | gvl = VSETVL(n-j); | ||||
v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | ||||
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_min, gvl); | |||||
if(*((FLOAT*)&v_res) > maxf) | |||||
maxf = *((FLOAT*)&v_res); | |||||
v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl); | |||||
j += gvl; | j += gvl; | ||||
} | } | ||||
} | } | ||||
maxf = EXTRACT_FLOAT(v_res); | |||||
return(maxf); | return(maxf); | ||||
} | } | ||||
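The LMUL/ELEN/MLEN parameterisation above builds every intrinsic name by token pasting, so one kernel source serves both the generic m8 build and the zvl256b m2 build. A small standalone check of how the JOIN chain expands (the stringification macros are added here only for the demo):

    #include <stdio.h>

    #define _
    #define JOIN2_X(x, y) x ## y
    #define JOIN2(x, y) JOIN2_X(x, y)
    #define JOIN(v, w, x, y, z) JOIN2(JOIN2(JOIN2(JOIN2(v, w), x), y), z)
    #define STR_X(x) #x
    #define STR(x) STR_X(x)

    #define ELEN 32
    #define LMUL m8

    int main(void) {
        /* arguments are macro-expanded before pasting, and the trailing
         * "_" is an empty placeholder for slots a name doesn't need */
        puts(STR(JOIN(vfloat, ELEN, LMUL, _t, _)));           /* vfloat32m8_t */
        puts(STR(JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL))); /* __riscv_vle32_v_f32m8 */
        return 0;
    }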
@@ -28,30 +28,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#include "common.h" | #include "common.h" | ||||
#include <math.h> | #include <math.h> | ||||
#include <float.h> | #include <float.h> | ||||
#if !defined(DOUBLE) | |||||
#define VSETVL(n) vsetvl_e32m8(n) | |||||
#define VSETVL_MAX vsetvlmax_e32m1() | |||||
#define FLOAT_V_T vfloat32m8_t | |||||
#define FLOAT_V_T_M1 vfloat32m1_t | |||||
#define VLEV_FLOAT vle32_v_f32m8 | |||||
#define VLSEV_FLOAT vlse32_v_f32m8 | |||||
#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 | |||||
#define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||||
#define VFMINVV_FLOAT vfmin_vv_f32m8 | |||||
#ifdef RISCV64_ZVL256B | |||||
# define LMUL m2 | |||||
# if defined(DOUBLE) | |||||
# define ELEN 64 | |||||
# define MLEN 32 | |||||
# else | |||||
# define ELEN 32 | |||||
# define MLEN 16 | |||||
# endif | |||||
#else | #else | ||||
#define VSETVL(n) vsetvl_e64m8(n) | |||||
#define VSETVL_MAX vsetvlmax_e64m1() | |||||
#define FLOAT_V_T vfloat64m8_t | |||||
#define FLOAT_V_T_M1 vfloat64m1_t | |||||
#define VLEV_FLOAT vle64_v_f64m8 | |||||
#define VLSEV_FLOAT vlse64_v_f64m8 | |||||
#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 | |||||
#define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||||
#define VFMINVV_FLOAT vfmin_vv_f64m8 | |||||
# define LMUL m8 | |||||
# if defined(DOUBLE) | |||||
# define ELEN 64 | |||||
# define MLEN 8 | |||||
# else | |||||
# define ELEN 32 | |||||
# define MLEN 4 | |||||
# endif | |||||
#endif | #endif | ||||
#define _ | |||||
#define JOIN2_X(x, y) x ## y | |||||
#define JOIN2(x, y) JOIN2_X(x, y) | |||||
#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) | |||||
#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) | |||||
#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) | |||||
#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) | |||||
#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) | |||||
#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) | |||||
#define VFREDMINVS_FLOAT JOIN(__riscv_vfredmin_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1)) | |||||
#define MASK_T JOIN(vbool, MLEN, _t, _, _) | |||||
#define VMFLTVF_FLOAT JOIN(__riscv_vmflt_vf_f, ELEN, LMUL, _b, MLEN) | |||||
#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _) | |||||
#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _) | |||||
#define VFMINVV_FLOAT JOIN(__riscv_vfmin, _vv_f, ELEN, LMUL, _) | |||||
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | ||||
{ | { | ||||
BLASLONG i=0, j=0; | BLASLONG i=0, j=0; | ||||
@@ -59,10 +73,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
FLOAT minf=FLT_MAX; | FLOAT minf=FLT_MAX; | ||||
unsigned int gvl = 0; | unsigned int gvl = 0; | ||||
FLOAT_V_T v0, v1, v_min; | FLOAT_V_T v0, v1, v_min; | ||||
FLOAT_V_T_M1 v_res, v_max; | |||||
gvl = VSETVL_MAX; | |||||
v_res = VFMVVF_FLOAT_M1(0, gvl); | |||||
v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); | |||||
FLOAT_V_T_M1 v_res; | |||||
v_res = VFMVVF_FLOAT_M1(FLT_MAX, 1); | |||||
if(inc_x == 1){ | if(inc_x == 1){ | ||||
gvl = VSETVL(n); | gvl = VSETVL(n); | ||||
@@ -76,15 +88,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
v_min = VFMINVV_FLOAT(v_min, v1, gvl); | v_min = VFMINVV_FLOAT(v_min, v1, gvl); | ||||
j += gvl * 2; | j += gvl * 2; | ||||
} | } | ||||
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||||
minf = *((FLOAT*)&v_res); | |||||
v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); | |||||
} | } | ||||
for(;j<n;){ | for(;j<n;){ | ||||
gvl = VSETVL(n-j); | gvl = VSETVL(n-j); | ||||
v0 = VLEV_FLOAT(&x[j], gvl); | v0 = VLEV_FLOAT(&x[j], gvl); | ||||
v_res = VFREDMINVS_FLOAT(v_res, v0, v_max, gvl); | |||||
if(*((FLOAT*)&v_res) < minf) | |||||
minf = *((FLOAT*)&v_res); | |||||
v_res = VFREDMINVS_FLOAT(v0, v_res, gvl); | |||||
j += gvl; | j += gvl; | ||||
} | } | ||||
}else{ | }else{ | ||||
@@ -102,18 +111,16 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
j += gvl * 2; | j += gvl * 2; | ||||
idx += inc_xv * 2; | idx += inc_xv * 2; | ||||
} | } | ||||
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||||
minf = *((FLOAT*)&v_res); | |||||
v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); | |||||
} | } | ||||
for(;j<n;){ | for(;j<n;){ | ||||
gvl = VSETVL(n-j); | gvl = VSETVL(n-j); | ||||
v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | ||||
v_res = VFREDMINVS_FLOAT(v_res, v0, v_max, gvl); | |||||
if(*((FLOAT*)&v_res) < minf) | |||||
minf = *((FLOAT*)&v_res); | |||||
v_res = VFREDMINVS_FLOAT(v0, v_res, gvl); | |||||
j += gvl; | j += gvl; | ||||
} | } | ||||
} | } | ||||
minf = EXTRACT_FLOAT(v_res); | |||||
return(minf); | return(minf); | ||||
} | } | ||||
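Both files above also drop the per-block scalar compare: the m1 result vector now seeds the next vfredmax/vfredmin call, so the running extreme rides through every reduction and is extracted exactly once at the end via EXTRACT_FLOAT. A scalar sketch of that accumulator threading, with the block size fixed at 4 for illustration:

    #include <float.h>
    #include <stdio.h>

    /* res plays the role of v_res[0]: each block reduction folds the
     * block's elements together with the previous partial result. */
    static float max_threaded(const float *x, long n) {
        const long blk = 4;
        float res = -FLT_MAX;                /* VFMVVF_FLOAT_M1(-FLT_MAX, 1) */
        long j = 0;
        for (; j + blk <= n; j += blk) {
            float m = x[j];
            for (long k = 1; k < blk; ++k)   /* in-register VFMAXVV */
                if (x[j + k] > m) m = x[j + k];
            res = (m > res) ? m : res;       /* vfredmax with res as seed */
        }
        for (; j < n; ++j)                   /* tail, shrinking "gvl" */
            res = (x[j] > res) ? x[j] : res;
        return res;                          /* single EXTRACT_FLOAT */
    }

    int main(void) {
        float x[6] = {1, 9, -3, 4, 8, 2};
        printf("%g\n", max_threaded(x, 6));  /* 9 */
        return 0;
    }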
@@ -26,207 +26,185 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*****************************************************************************/ | *****************************************************************************/ | ||||
#include "common.h" | #include "common.h" | ||||
#if !defined(DOUBLE) | |||||
#define VSETVL(n) vsetvl_e32m4(n) | |||||
#define VSETVL_MAX vsetvlmax_e32m1() | |||||
#define FLOAT_V_T vfloat32m4_t | |||||
#define VFMVFS_FLOATM4 vfmv_f_s_f32m4_f32 | |||||
#define FLOAT_V_T_M1 vfloat32m1_t | |||||
#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 | |||||
#define VLEV_FLOAT vle32_v_f32m4 | |||||
#define VLSEV_FLOAT vlse32_v_f32m4 | |||||
#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 | |||||
#define VFMACCVV_FLOAT vfmacc_vv_f32m4 | |||||
#define VFMVVF_FLOAT vfmv_v_f_f32m4 | |||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||||
#define VFDOTVV_FLOAT vfdot_vv_f32m4 | |||||
#define ABS fabsf | |||||
#define MASK_T vbool8_t | |||||
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m4_m | |||||
#define VMFGTVF_FLOAT vmfgt_vf_f32m4_b8 | |||||
#define VMFIRSTM vmfirst_m_b8 | |||||
#define VFDIVVF_FLOAT vfdiv_vf_f32m4 | |||||
#define VMFLTVF_FLOAT vmflt_vf_f32m4_b8 | |||||
#define VFREDMAXVS_FLOAT vfredmax_vs_f32m4_f32m1 | |||||
#ifdef RISCV64_ZVL256B | |||||
# define LMUL m1 | |||||
# if defined(DOUBLE) | |||||
# define ELEN 64 | |||||
# define MLEN 64 | |||||
# else | |||||
# define ELEN 32 | |||||
# define MLEN 32 | |||||
# endif | |||||
#else | #else | ||||
#define VSETVL(n) vsetvl_e64m4(n) | |||||
#define VSETVL_MAX vsetvlmax_e64m1() | |||||
#define FLOAT_V_T vfloat64m4_t | |||||
#define VFMVFS_FLOATM4 vfmv_f_s_f64m4_f64 | |||||
#define FLOAT_V_T_M1 vfloat64m1_t | |||||
#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 | |||||
#define VLEV_FLOAT vle64_v_f64m4 | |||||
#define VLSEV_FLOAT vlse64_v_f64m4 | |||||
#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 | |||||
#define VFMACCVV_FLOAT vfmacc_vv_f64m4 | |||||
#define VFMVVF_FLOAT vfmv_v_f_f64m4 | |||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||||
#define VFDOTVV_FLOAT vfdot_vv_f64m4 | |||||
# define LMUL m4 | |||||
# if defined(DOUBLE) | |||||
# define ELEN 64 | |||||
# define MLEN 16 | |||||
# else | |||||
# define ELEN 32 | |||||
# define MLEN 8 | |||||
# endif | |||||
#endif | |||||
#define _ | |||||
#define JOIN2_X(x, y) x ## y | |||||
#define JOIN2(x, y) JOIN2_X(x, y) | |||||
#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) | |||||
#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) | |||||
#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) | |||||
#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) | |||||
#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) | |||||
#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) | |||||
#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _) | |||||
#define VFMVSF_FLOAT JOIN(__riscv_vfmv, _s_f_f, ELEN, LMUL, _) | |||||
#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _) | |||||
#define MASK_T JOIN(vbool, MLEN, _t, _, _) | |||||
#define VFABS JOIN(__riscv_vfabs, _v_f, ELEN, LMUL, _) | |||||
#define VMFNE JOIN(__riscv_vmfne_vf_f,ELEN, LMUL, _b, MLEN) | |||||
#define VMFGT JOIN(__riscv_vmfgt_vv_f,ELEN, LMUL, _b, MLEN) | |||||
#define VMFEQ JOIN(__riscv_vmfeq_vf_f,ELEN, LMUL, _b, MLEN) | |||||
#define VCPOP JOIN(__riscv_vcpop, _m_b, MLEN, _, _) | |||||
#define VFREDMAX JOIN(__riscv_vfredmax_vs_f,ELEN,LMUL, JOIN2(_f, ELEN), m1) | |||||
#define VFREDMIN JOIN(__riscv_vfredmin_vs_f,ELEN,LMUL, JOIN2(_f, ELEN), m1) | |||||
#define VFIRST JOIN(__riscv_vfirst, _m_b, MLEN, _, _) | |||||
#define VRGATHER JOIN(__riscv_vrgather, _vx_f, ELEN, LMUL, _) | |||||
#define VFDIV JOIN(__riscv_vfdiv, _vv_f, ELEN, LMUL, _) | |||||
#define VFDIV_M JOIN(__riscv_vfdiv, _vv_f, ELEN, LMUL, _mu) | |||||
#define VFMUL JOIN(__riscv_vfmul, _vv_f, ELEN, LMUL, _) | |||||
#define VFMUL_M JOIN(__riscv_vfmul, _vv_f, ELEN, LMUL, _mu) | |||||
#define VFMACC JOIN(__riscv_vfmacc, _vv_f, ELEN, LMUL, _) | |||||
#define VFMACC_M JOIN(__riscv_vfmacc, _vv_f, ELEN, LMUL, _mu) | |||||
#define VMSBF JOIN(__riscv_vmsbf, _m_b, MLEN, _, _) | |||||
#define VMSOF JOIN(__riscv_vmsof, _m_b, MLEN, _, _) | |||||
#define VMAND JOIN(__riscv_vmand, _mm_b, MLEN, _, _) | |||||
#define VMANDN JOIN(__riscv_vmandn, _mm_b, MLEN, _, _) | |||||
#define VFREDSUM JOIN(__riscv_vfredusum_vs_f,ELEN,LMUL, JOIN2(_f, ELEN), m1) | |||||
#define VMERGE JOIN(__riscv_vmerge, _vvm_f, ELEN, LMUL, _) | |||||
#define VSEV_FLOAT JOIN(__riscv_vse, ELEN, _v_f, ELEN, LMUL) | |||||
#if defined(DOUBLE) | |||||
#define ABS fabs | #define ABS fabs | ||||
#define MASK_T vbool16_t | |||||
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m4_m | |||||
#define VMFGTVF_FLOAT vmfgt_vf_f64m4_b16 | |||||
#define VMFIRSTM vmfirst_m_b16 | |||||
#define VFDIVVF_FLOAT vfdiv_vf_f64m4 | |||||
#define VMFLTVF_FLOAT vmflt_vf_f64m4_b16 | |||||
#define VFREDMAXVS_FLOAT vfredmax_vs_f64m4_f64m1 | |||||
#else | |||||
#define ABS fabsf | |||||
#endif | #endif | ||||
#define EXTRACT_FLOAT0_V(v) JOIN(__riscv_vfmv_f_s_f, ELEN, LMUL, _f, ELEN)(v) | |||||
//#define DUMP( label, v0, gvl ) | |||||
#define DUMP( label, v0, gvl ) do{ FLOAT x[16]; VSEV_FLOAT( x, v0, gvl ); printf ("%s(%d): %s [ ", __FILE__, __LINE__, label); for( int xxx = 0; xxx < gvl; ++xxx ) { printf("%f, ", x[xxx]); } printf(" ]\n"); } while(0) | |||||
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | ||||
{ | { | ||||
BLASLONG i=0, j=0; | |||||
BLASLONG i=0; | |||||
if ( n < 0 ) return(0.0); | |||||
if(n <= 0) return(0.0); | |||||
if(n == 1) return (ABS(x[0])); | if(n == 1) return (ABS(x[0])); | ||||
FLOAT_V_T vr, v0, v_zero; | |||||
unsigned int gvl = 0; | unsigned int gvl = 0; | ||||
FLOAT_V_T_M1 v_res, v_z0; | |||||
gvl = VSETVL_MAX; | |||||
v_res = VFMVVF_FLOAT_M1(0, gvl); | |||||
v_z0 = VFMVVF_FLOAT_M1(0, gvl); | |||||
FLOAT scale = 0.0, ssq = 0.0; | |||||
MASK_T mask; | |||||
BLASLONG index = 0; | |||||
if(inc_x == 1){ | |||||
gvl = VSETVL(n); | |||||
vr = VFMVVF_FLOAT(0, gvl); | |||||
v_zero = VFMVVF_FLOAT(0, gvl); | |||||
for(i=0,j=0; i<n/gvl; i++){ | |||||
v0 = VLEV_FLOAT(&x[j], gvl); | |||||
//fabs(vector) | |||||
mask = VMFLTVF_FLOAT(v0, 0, gvl); | |||||
v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl); | |||||
//if scale change | |||||
mask = VMFGTVF_FLOAT(v0, scale, gvl); | |||||
index = VMFIRSTM(mask, gvl); | |||||
if(index == -1){//no elements greater than scale | |||||
if(scale != 0.0){ | |||||
v0 = VFDIVVF_FLOAT(v0, scale, gvl); | |||||
vr = VFMACCVV_FLOAT(vr, v0, v0, gvl); | |||||
} | |||||
}else{//found greater element | |||||
//ssq in vector vr: vr[0] | |||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
//total ssq before current vector | |||||
ssq += VFMVFS_FLOAT(v_res); | |||||
//find max | |||||
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl); | |||||
//update ssq before max_index | |||||
ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res)); | |||||
//update scale | |||||
scale = VFMVFS_FLOAT(v_res); | |||||
//ssq in vector vr | |||||
v0 = VFDIVVF_FLOAT(v0, scale, gvl); | |||||
vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl); | |||||
MASK_T nonzero_mask; | |||||
MASK_T scale_mask; | |||||
gvl = VSETVL(n); | |||||
FLOAT_V_T v0; | |||||
FLOAT_V_T v_ssq = VFMVVF_FLOAT(0, gvl); | |||||
FLOAT_V_T v_scale = VFMVVF_FLOAT(0, gvl); | |||||
FLOAT scale = 0; | |||||
FLOAT ssq = 0; | |||||
unsigned int stride_x = inc_x * sizeof(FLOAT); | |||||
int idx = 0; | |||||
if( n >= gvl ) // don't pay overheads if we're not doing useful work | |||||
{ | |||||
for(i=0; i<n/gvl; i++){ | |||||
v0 = VLSEV_FLOAT( &x[idx], stride_x, gvl ); | |||||
nonzero_mask = VMFNE( v0, 0, gvl ); | |||||
v0 = VFABS( v0, gvl ); | |||||
scale_mask = VMFGT( v0, v_scale, gvl ); | |||||
// assume scale changes are relatively infrequent | |||||
// unclear if the vcpop+branch is actually a win | |||||
// since the operations being skipped are predicated anyway | |||||
// need profiling to confirm | |||||
if( VCPOP(scale_mask, gvl) ) | |||||
{ | |||||
v_scale = VFDIV_M( scale_mask, v_scale, v_scale, v0, gvl ); | |||||
v_scale = VFMUL_M( scale_mask, v_scale, v_scale, v_scale, gvl ); | |||||
v_ssq = VFMUL_M( scale_mask, v_ssq, v_ssq, v_scale, gvl ); | |||||
v_scale = VMERGE( v_scale, v0, scale_mask, gvl ); | |||||
} | } | ||||
j += gvl; | |||||
v0 = VFDIV_M( nonzero_mask, v0, v0, v_scale, gvl ); | |||||
v_ssq = VFMACC_M( nonzero_mask, v_ssq, v0, v0, gvl ); | |||||
idx += inc_x * gvl; | |||||
} | } | ||||
//ssq in vector vr: vr[0] | |||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
//total ssq now | |||||
ssq += VFMVFS_FLOAT(v_res); | |||||
//tail | |||||
if(j < n){ | |||||
gvl = VSETVL(n-j); | |||||
v0 = VLEV_FLOAT(&x[j], gvl); | |||||
//fabs(vector) | |||||
mask = VMFLTVF_FLOAT(v0, 0, gvl); | |||||
v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl); | |||||
//if scale change | |||||
mask = VMFGTVF_FLOAT(v0, scale, gvl); | |||||
index = VMFIRSTM(mask, gvl); | |||||
if(index == -1){//no elements greater than scale | |||||
if(scale != 0.0) | |||||
v0 = VFDIVVF_FLOAT(v0, scale, gvl); | |||||
}else{//found greater element | |||||
//find max | |||||
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl); | |||||
//update ssq before max_index | |||||
ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res)); | |||||
//update scale | |||||
scale = VFMVFS_FLOAT(v_res); | |||||
v0 = VFDIVVF_FLOAT(v0, scale, gvl); | |||||
// we have gvl elements which we accumulated independently, with independent scales | |||||
// we need to combine these | |||||
// naive sort so we process small values first to avoid losing information | |||||
// could use vector sort extensions where available, but we're dealing with gvl elts at most | |||||
FLOAT * out_ssq = alloca(gvl*sizeof(FLOAT)); | |||||
FLOAT * out_scale = alloca(gvl*sizeof(FLOAT)); | |||||
VSEV_FLOAT( out_ssq, v_ssq, gvl ); | |||||
VSEV_FLOAT( out_scale, v_scale, gvl ); | |||||
for( int a = 0; a < (gvl-1); ++a ) | |||||
{ | |||||
int smallest = a; | |||||
for( size_t b = a+1; b < gvl; ++b ) | |||||
if( out_scale[b] < out_scale[smallest] ) | |||||
smallest = b; | |||||
if( smallest != a ) | |||||
{ | |||||
FLOAT tmp1 = out_ssq[a]; | |||||
FLOAT tmp2 = out_scale[a]; | |||||
out_ssq[a] = out_ssq[smallest]; | |||||
out_scale[a] = out_scale[smallest]; | |||||
out_ssq[smallest] = tmp1; | |||||
out_scale[smallest] = tmp2; | |||||
} | } | ||||
vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl); | |||||
//ssq in vector vr: vr[0] | |||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
//total ssq now | |||||
ssq += VFMVFS_FLOAT(v_res); | |||||
} | } | ||||
}else{ | |||||
gvl = VSETVL(n); | |||||
vr = VFMVVF_FLOAT(0, gvl); | |||||
v_zero = VFMVVF_FLOAT(0, gvl); | |||||
unsigned int stride_x = inc_x * sizeof(FLOAT); | |||||
int idx = 0, inc_v = inc_x * gvl; | |||||
for(i=0,j=0; i<n/gvl; i++){ | |||||
v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl); | |||||
//fabs(vector) | |||||
mask = VMFLTVF_FLOAT(v0, 0, gvl); | |||||
v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl); | |||||
//if scale change | |||||
mask = VMFGTVF_FLOAT(v0, scale, gvl); | |||||
index = VMFIRSTM(mask, gvl); | |||||
if(index == -1){//no elements greater than scale | |||||
if(scale != 0.0){ | |||||
v0 = VFDIVVF_FLOAT(v0, scale, gvl); | |||||
vr = VFMACCVV_FLOAT(vr, v0, v0, gvl); | |||||
} | |||||
}else{//found greater element | |||||
//ssq in vector vr: vr[0] | |||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
//total ssq before current vector | |||||
ssq += VFMVFS_FLOAT(v_res); | |||||
//find max | |||||
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl); | |||||
//update ssq before max_index | |||||
ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res)); | |||||
//update scale | |||||
scale = VFMVFS_FLOAT(v_res); | |||||
//ssq in vector vr | |||||
v0 = VFDIVVF_FLOAT(v0, scale, gvl); | |||||
vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl); | |||||
int a = 0; | |||||
while( a<gvl && out_scale[a] == 0 ) | |||||
++a; | |||||
if( a < gvl ) | |||||
{ | |||||
ssq = out_ssq[a]; | |||||
scale = out_scale[a]; | |||||
++a; | |||||
for( ; a < gvl; ++a ) | |||||
{ | |||||
ssq = ssq * ( scale / out_scale[a] ) * ( scale / out_scale[a] ) + out_ssq[a]; | |||||
scale = out_scale[a]; | |||||
} | } | ||||
j += gvl; | |||||
idx += inc_v; | |||||
} | } | ||||
//ssq in vector vr: vr[0] | |||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
//total ssq now | |||||
ssq += VFMVFS_FLOAT(v_res); | |||||
//tail | |||||
if(j < n){ | |||||
gvl = VSETVL(n-j); | |||||
v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl); | |||||
//fabs(vector) | |||||
mask = VMFLTVF_FLOAT(v0, 0, gvl); | |||||
v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl); | |||||
//if scale change | |||||
mask = VMFGTVF_FLOAT(v0, scale, gvl); | |||||
index = VMFIRSTM(mask, gvl); | |||||
if(index == -1){//no elements greater than scale | |||||
if(scale != 0.0) | |||||
v0 = VFDIVVF_FLOAT(v0, scale, gvl); | |||||
}else{//found greater element | |||||
//find max | |||||
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl); | |||||
//update ssq before max_index | |||||
ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res)); | |||||
//update scale | |||||
scale = VFMVFS_FLOATM4(vr); | |||||
v0 = VFDIVVF_FLOAT(v0, scale, gvl); | |||||
} | |||||
//finish any tail using scalar ops | |||||
i*=gvl*inc_x; | |||||
n*=inc_x; | |||||
while(i < n){ | |||||
if ( x[i] != 0.0 ){ | |||||
FLOAT absxi = ABS( x[i] ); | |||||
if ( scale < absxi ){ | |||||
ssq = 1 + ssq * ( scale / absxi ) * ( scale / absxi ); | |||||
scale = absxi ; | |||||
} | |||||
else{ | |||||
ssq += ( absxi/scale ) * ( absxi/scale ); | |||||
} | } | ||||
vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl); | |||||
//ssq in vector vr: vr[0] | |||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
//total ssq now | |||||
ssq += VFMVFS_FLOAT(v_res); | |||||
} | } | ||||
i += inc_x; | |||||
} | } | ||||
return(scale * sqrt(ssq)); | return(scale * sqrt(ssq)); | ||||
} | } | ||||
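The per-lane (scale, ssq) pairs the rewritten nrm2 keeps represent partial sums sum(x_i^2) = scale^2 * ssq. Merging them in ascending scale order, as the selection sort above arranges, means every rescaling factor scale/out_scale[a] is at most 1, so nothing overflows or underflows on the way to the final scale * sqrt(ssq). A scalar model of the merge, assuming four lanes:

    #include <math.h>
    #include <stdio.h>

    int main(void) {
        /* four lanes, already sorted by ascending scale; a zero scale
         * marks a lane that never saw a nonzero element */
        double scales[4] = {0.0, 1e-200, 1.0, 1e200};
        double ssqs[4]   = {0.0, 2.0, 3.0, 4.0};
        double scale = 0.0, ssq = 0.0;
        int a = 0;
        while (a < 4 && scales[a] == 0.0) ++a;   /* skip empty lanes */
        if (a < 4) { ssq = ssqs[a]; scale = scales[a]; ++a; }
        for (; a < 4; ++a) {
            double r = scale / scales[a];         /* <= 1 by sort order */
            ssq = ssq * r * r + ssqs[a];          /* rebase onto new scale */
            scale = scales[a];
        }
        printf("%g\n", scale * sqrt(ssq));        /* ~2e200, no overflow */
        return 0;
    }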
@@ -31,9 +31,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#define VSETVL_MAX vsetvlmax_e32m1() | #define VSETVL_MAX vsetvlmax_e32m1() | ||||
#define FLOAT_V_T vfloat32m8_t | #define FLOAT_V_T vfloat32m8_t | ||||
#define FLOAT_V_T_M1 vfloat32m1_t | #define FLOAT_V_T_M1 vfloat32m1_t | ||||
#define VLEV_FLOAT vle32_v_f32m8 | |||||
#define VLSEV_FLOAT vlse32_v_f32m8 | |||||
#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 | #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 | ||||
#define VLEV_FLOAT vle_v_f32m8 | |||||
#define VLSEV_FLOAT vlse_v_f32m8 | |||||
#define VFREDSUM_FLOAT vfredusum_vs_f32m8_f32m1 | #define VFREDSUM_FLOAT vfredusum_vs_f32m8_f32m1 | ||||
#define VFMACCVV_FLOAT vfmacc_vv_f32m8 | #define VFMACCVV_FLOAT vfmacc_vv_f32m8 | ||||
#define VFMVVF_FLOAT vfmv_v_f_f32m8 | #define VFMVVF_FLOAT vfmv_v_f_f32m8 | ||||
@@ -45,9 +45,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#define VSETVL_MAX vsetvlmax_e64m1() | #define VSETVL_MAX vsetvlmax_e64m1() | ||||
#define FLOAT_V_T vfloat64m8_t | #define FLOAT_V_T vfloat64m8_t | ||||
#define FLOAT_V_T_M1 vfloat64m1_t | #define FLOAT_V_T_M1 vfloat64m1_t | ||||
#define VLEV_FLOAT vle64_v_f64m8 | |||||
#define VLSEV_FLOAT vlse64_v_f64m8 | |||||
#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 | #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 | ||||
#define VLEV_FLOAT vle_v_f64m8 | |||||
#define VLSEV_FLOAT vlse_v_f64m8 | |||||
#define VFREDSUM_FLOAT vfredusum_vs_f64m8_f64m1 | #define VFREDSUM_FLOAT vfredusum_vs_f64m8_f64m1 | ||||
#define VFMACCVV_FLOAT vfmacc_vv_f64m8 | #define VFMACCVV_FLOAT vfmacc_vv_f64m8 | ||||
#define VFMVVF_FLOAT vfmv_v_f_f64m8 | #define VFMVVF_FLOAT vfmv_v_f_f64m8 | ||||
@@ -28,27 +28,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#include "common.h" | #include "common.h" | ||||
#if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
#define VSETVL(n) vsetvl_e32m4(n) | |||||
#define VSETVL_MAX vsetvlmax_e32m1() | |||||
#define VSETVL(n) __riscv_vsetvl_e32m4(n) | |||||
#define VSETVL_MAX __riscv_vsetvlmax_e32m1() | |||||
#define FLOAT_V_T vfloat32m4_t | #define FLOAT_V_T vfloat32m4_t | ||||
#define VLEV_FLOAT vle32_v_f32m4 | |||||
#define VLSEV_FLOAT vlse32_v_f32m4 | |||||
#define VSEV_FLOAT vse32_v_f32m4 | |||||
#define VSSEV_FLOAT vsse32_v_f32m4 | |||||
#define VFMACCVF_FLOAT vfmacc_vf_f32m4 | |||||
#define VFMULVF_FLOAT vfmul_vf_f32m4 | |||||
#define VFMSACVF_FLOAT vfmsac_vf_f32m4 | |||||
#define VLEV_FLOAT __riscv_vle32_v_f32m4 | |||||
#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 | |||||
#define VSEV_FLOAT __riscv_vse32_v_f32m4 | |||||
#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 | |||||
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 | |||||
#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m4 | |||||
#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f32m4 | |||||
#else | #else | ||||
#define VSETVL(n) vsetvl_e64m4(n) | |||||
#define VSETVL_MAX vsetvlmax_e64m1() | |||||
#define VSETVL(n) __riscv_vsetvl_e64m4(n) | |||||
#define VSETVL_MAX __riscv_vsetvlmax_e64m1() | |||||
#define FLOAT_V_T vfloat64m4_t | #define FLOAT_V_T vfloat64m4_t | ||||
#define VLEV_FLOAT vle64_v_f64m4 | |||||
#define VLSEV_FLOAT vlse64_v_f64m4 | |||||
#define VSEV_FLOAT vse64_v_f64m4 | |||||
#define VSSEV_FLOAT vsse64_v_f64m4 | |||||
#define VFMACCVF_FLOAT vfmacc_vf_f64m4 | |||||
#define VFMULVF_FLOAT vfmul_vf_f64m4 | |||||
#define VFMSACVF_FLOAT vfmsac_vf_f64m4 | |||||
#define VLEV_FLOAT __riscv_vle64_v_f64m4 | |||||
#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 | |||||
#define VSEV_FLOAT __riscv_vse64_v_f64m4 | |||||
#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 | |||||
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 | |||||
#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m4 | |||||
#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f64m4 | |||||
#endif | #endif | ||||
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) | int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) | ||||
@@ -57,11 +57,10 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT | |||||
BLASLONG ix=0,iy=0; | BLASLONG ix=0,iy=0; | ||||
if(n <= 0) return(0); | if(n <= 0) return(0); | ||||
unsigned int gvl = 0; | |||||
unsigned int gvl = VSETVL((inc_x != 0 && inc_y != 0) ? n : 1); | |||||
FLOAT_V_T v0, v1, vx, vy; | FLOAT_V_T v0, v1, vx, vy; | ||||
if(inc_x == 1 && inc_y == 1){ | if(inc_x == 1 && inc_y == 1){ | ||||
gvl = VSETVL(n); | |||||
for(i=0,j=0; i<n/gvl; i++){ | for(i=0,j=0; i<n/gvl; i++){ | ||||
vx = VLEV_FLOAT(&x[j], gvl); | vx = VLEV_FLOAT(&x[j], gvl); | ||||
vy = VLEV_FLOAT(&y[j], gvl); | vy = VLEV_FLOAT(&y[j], gvl); | ||||
@@ -90,7 +89,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT | |||||
VSEV_FLOAT(&y[j], v1, gvl); | VSEV_FLOAT(&y[j], v1, gvl); | ||||
} | } | ||||
}else if(inc_y == 1){ | }else if(inc_y == 1){ | ||||
gvl = VSETVL(n); | |||||
BLASLONG stride_x = inc_x * sizeof(FLOAT); | BLASLONG stride_x = inc_x * sizeof(FLOAT); | ||||
BLASLONG inc_xv = inc_x * gvl; | BLASLONG inc_xv = inc_x * gvl; | ||||
for(i=0,j=0; i<n/gvl; i++){ | for(i=0,j=0; i<n/gvl; i++){ | ||||
@@ -122,7 +120,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT | |||||
VSEV_FLOAT(&y[j], v1, gvl); | VSEV_FLOAT(&y[j], v1, gvl); | ||||
} | } | ||||
}else if(inc_x == 1){ | }else if(inc_x == 1){ | ||||
gvl = VSETVL(n); | |||||
BLASLONG stride_y = inc_y * sizeof(FLOAT); | BLASLONG stride_y = inc_y * sizeof(FLOAT); | ||||
BLASLONG inc_yv = inc_y * gvl; | BLASLONG inc_yv = inc_y * gvl; | ||||
for(i=0,j=0; i<n/gvl; i++){ | for(i=0,j=0; i<n/gvl; i++){ | ||||
@@ -154,7 +151,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT | |||||
VSSEV_FLOAT(&y[j*inc_y], stride_y, v1, gvl); | VSSEV_FLOAT(&y[j*inc_y], stride_y, v1, gvl); | ||||
} | } | ||||
}else{ | }else{ | ||||
gvl = VSETVL(n); | |||||
BLASLONG stride_x = inc_x * sizeof(FLOAT); | BLASLONG stride_x = inc_x * sizeof(FLOAT); | ||||
BLASLONG stride_y = inc_y * sizeof(FLOAT); | BLASLONG stride_y = inc_y * sizeof(FLOAT); | ||||
BLASLONG inc_xv = inc_x * gvl; | BLASLONG inc_xv = inc_x * gvl; | ||||
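The hoisted VSETVL in the rot hunk above clamps the vector length to 1 whenever either increment is zero, presumably because with a zero stride every lane aliases the same element, and only sequential one-at-a-time processing reproduces the reference result. A scalar illustration (the reasoning, not the kernel) of how the two orderings diverge for four applications to the same pair:

    #include <stdio.h>

    int main(void) {
        const float c = 0.5f, s = 0.5f;
        /* sequential reference: each of n=4 iterations reads the result
         * of the previous one (this is what gvl = 1 reproduces) */
        float xs = 1.0f, ys = 2.0f;
        for (int i = 0; i < 4; ++i) {
            float t = c * xs + s * ys;
            xs = c * xs - s * ys;
            ys = t;
        }
        /* wide semantics: all lanes load the same snapshot and the last
         * lane's store wins, collapsing to a single iteration */
        float xw = 1.0f, yw = 2.0f;
        float t = c * xw + s * yw;
        xw = c * xw - s * yw;
        yw = t;
        printf("seq (%g, %g)  wide (%g, %g)\n", xs, ys, xw, yw);
        return 0;
    }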
@@ -26,28 +26,41 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*****************************************************************************/ | *****************************************************************************/ | ||||
#include "common.h" | #include "common.h" | ||||
#if !defined(DOUBLE) | |||||
#define VSETVL(n) vsetvl_e32m8(n) | |||||
#define VSETVL_MAX vsetvlmax_e32m1() | |||||
#define FLOAT_V_T vfloat32m8_t | |||||
#define VLEV_FLOAT vle32_v_f32m8 | |||||
#define VLSEV_FLOAT vlse32_v_f32m8 | |||||
#define VSEV_FLOAT vse32_v_f32m8 | |||||
#define VSSEV_FLOAT vsse32_v_f32m8 | |||||
#define VFMULVF_FLOAT vfmul_vf_f32m8 | |||||
#define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||||
#ifdef RISCV64_ZVL256B | |||||
# define LMUL m2 | |||||
# if defined(DOUBLE) | |||||
# define ELEN 64 | |||||
# define MLEN 32 | |||||
# else | |||||
# define ELEN 32 | |||||
# define MLEN 16 | |||||
# endif | |||||
#else | #else | ||||
#define VSETVL(n) vsetvl_e64m8(n) | |||||
#define VSETVL_MAX vsetvlmax_e64m1() | |||||
#define FLOAT_V_T vfloat64m8_t | |||||
#define VLEV_FLOAT vle64_v_f64m8 | |||||
#define VLSEV_FLOAT vlse64_v_f64m8 | |||||
#define VSEV_FLOAT vse64_v_f64m8 | |||||
#define VSSEV_FLOAT vsse64_v_f64m8 | |||||
#define VFMULVF_FLOAT vfmul_vf_f64m8 | |||||
#define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||||
# define LMUL m8 | |||||
# if defined(DOUBLE) | |||||
# define ELEN 64 | |||||
# define MLEN 8 | |||||
# else | |||||
# define ELEN 32 | |||||
# define MLEN 4 | |||||
# endif | |||||
#endif | #endif | ||||
#define _ | |||||
#define JOIN2_X(x, y) x ## y | |||||
#define JOIN2(x, y) JOIN2_X(x, y) | |||||
#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) | |||||
#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) | |||||
#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) | |||||
#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) | |||||
#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) | |||||
#define VSEV_FLOAT JOIN(__riscv_vse, ELEN, _v_f, ELEN, LMUL) | |||||
#define VSSEV_FLOAT JOIN(__riscv_vsse, ELEN, _v_f, ELEN, LMUL) | |||||
#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _) | |||||
#define VFMULVF_FLOAT JOIN(__riscv_vfmul, _vf_f, ELEN, LMUL, _) | |||||
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | ||||
{ | { | ||||
BLASLONG i=0,j=0; | BLASLONG i=0,j=0; | ||||
@@ -84,25 +97,25 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||||
} | } | ||||
}else{ | }else{ | ||||
if(da == 0.0){ | if(da == 0.0){ | ||||
BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||||
BLASLONG ix = 0; | |||||
gvl = VSETVL(n); | gvl = VSETVL(n); | ||||
BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||||
BLASLONG ix = 0; | |||||
if(gvl <= n / 2){ | |||||
long int inc_xv = gvl * inc_x; | |||||
v0 = VFMVVF_FLOAT(0, gvl); | |||||
for(i = 0, j = 0; i < n/(2*gvl); i++, j+=2*gvl){ | |||||
VSSEV_FLOAT(&x[ix], stride_x, v0, gvl); | |||||
VSSEV_FLOAT(&x[ix + inc_xv], stride_x, v0, gvl); | |||||
ix += inc_xv * 2; | |||||
} | |||||
v0 = VFMVVF_FLOAT(0, gvl); | |||||
for(i = 0; i < n/(gvl*2); ++i ){ | |||||
VSSEV_FLOAT(&x[ix], stride_x, v0, gvl); | |||||
ix += inc_x * gvl; | |||||
VSSEV_FLOAT(&x[ix], stride_x, v0, gvl); | |||||
ix += inc_x * gvl; | |||||
} | } | ||||
//tail | |||||
for(; j <n; ){ | |||||
gvl = VSETVL(n-j); | |||||
i *= gvl*2; | |||||
while( i < n ){ | |||||
gvl = VSETVL(n-i); | |||||
v0 = VFMVVF_FLOAT(0, gvl); | v0 = VFMVVF_FLOAT(0, gvl); | ||||
VSSEV_FLOAT(&x[ix], stride_x, v0, gvl); | |||||
j += gvl; | |||||
ix += inc_x * gvl; | |||||
VSSEV_FLOAT(&x[ix], stride_x, v0, gvl); | |||||
i += gvl; | |||||
ix += inc_x * gvl; | |||||
} | } | ||||
}else{ | }else{ | ||||
gvl = VSETVL(n); | gvl = VSETVL(n); | ||||
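The rewritten da == 0 branch above unrolls the strided zero-store to two blocks per trip, then recovers the element count as i * gvl * 2 before a tail loop with shrinking vector lengths. A plain-C model of that control flow, with gvl fixed at 4 and unit stride for the sketch:

    #include <stdio.h>

    int main(void) {
        enum { N = 23, GVL = 4 };
        float x[N];
        long i = 0, ix = 0;
        for (i = 0; i < N / (GVL * 2); ++i) {      /* two stores per trip */
            for (int k = 0; k < GVL; ++k) x[ix + k] = 0.0f;
            ix += GVL;
            for (int k = 0; k < GVL; ++k) x[ix + k] = 0.0f;
            ix += GVL;
        }
        i *= GVL * 2;                              /* elements done so far */
        while (i < N) {                            /* tail: gvl = VSETVL(n-i) */
            x[ix] = 0.0f;
            ++i; ++ix;
        }
        printf("cleared %ld elements\n", i);       /* 23 */
        return 0;
    }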
@@ -0,0 +1,114 @@ | |||||
/*************************************************************************** | |||||
Copyright (c) 2020, The OpenBLAS Project | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the OpenBLAS project nor the names of | |||||
its contributors may be used to endorse or promote products | |||||
derived from this software without specific prior written permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*****************************************************************************/ | |||||
#include "common.h" | |||||
#include <math.h> | |||||
#if !defined(DOUBLE) | |||||
#define VSETVL(n) __riscv_vsetvl_e32m8(n) | |||||
#define VSETVL_MAX __riscv_vsetvlmax_e32m1() | |||||
#define FLOAT_V_T vfloat32m8_t | |||||
#define FLOAT_V_T_M1 vfloat32m1_t | |||||
#define VLEV_FLOAT __riscv_vle32_v_f32m8 | |||||
#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 | |||||
#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 | |||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 | |||||
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 | |||||
#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m8 | |||||
#else | |||||
#define VSETVL(n) __riscv_vsetvl_e64m8(n) | |||||
#define VSETVL_MAX __riscv_vsetvlmax_e64m1() | |||||
#define FLOAT_V_T vfloat64m8_t | |||||
#define FLOAT_V_T_M1 vfloat64m1_t | |||||
#define VLEV_FLOAT __riscv_vle64_v_f64m8 | |||||
#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 | |||||
#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 | |||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 | |||||
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 | |||||
#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m8 | |||||
#endif | |||||
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
{ | |||||
BLASLONG i=0, j=0; | |||||
BLASLONG ix=0; | |||||
FLOAT asumf=0.0; | |||||
if (n <= 0 || inc_x <= 0) return(asumf); | |||||
unsigned int gvl = 0; | |||||
FLOAT_V_T v0, v1, v_sum; | |||||
FLOAT_V_T_M1 v_res; | |||||
gvl = VSETVL_MAX; | |||||
v_res = VFMVVF_FLOAT_M1(0, gvl); | |||||
if(inc_x == 1){ | |||||
gvl = VSETVL(n); | |||||
if(gvl <= n/2){ | |||||
v_sum = VFMVVF_FLOAT(0, gvl); | |||||
for(i=0,j=0; i<n/(gvl*2); i++){ | |||||
v0 = VLEV_FLOAT(&x[j], gvl); | |||||
v_sum = VFADDVV_FLOAT(v_sum, v0, gvl); | |||||
v1 = VLEV_FLOAT(&x[j+gvl], gvl); | |||||
v_sum = VFADDVV_FLOAT(v_sum, v1, gvl); | |||||
j += gvl * 2; | |||||
} | |||||
v_res = VFREDSUMVS_FLOAT(v_sum, v_res, gvl); | |||||
} | |||||
for(;j<n;){ | |||||
gvl = VSETVL(n-j); | |||||
v0 = VLEV_FLOAT(&x[j], gvl); | |||||
v_res = VFREDSUMVS_FLOAT(v0, v_res, gvl); | |||||
j += gvl; | |||||
} | |||||
}else{ | |||||
gvl = VSETVL(n); | |||||
unsigned int stride_x = inc_x * sizeof(FLOAT); | |||||
if(gvl <= n/2){ | |||||
v_sum = VFMVVF_FLOAT(0, gvl); | |||||
BLASLONG inc_xv = inc_x * gvl; | |||||
for(i=0,j=0; i<n/(gvl*2); i++){ | |||||
v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||||
v_sum = VFADDVV_FLOAT(v_sum, v0, gvl); | |||||
v1 = VLSEV_FLOAT(&x[ix+inc_xv], stride_x, gvl); | |||||
v_sum = VFADDVV_FLOAT(v_sum, v1, gvl); | |||||
j += gvl * 2; | |||||
ix += inc_xv * 2; | |||||
} | |||||
v_res = VFREDSUMVS_FLOAT(v_sum, v_res, gvl); | |||||
} | |||||
for(;j<n;){ | |||||
gvl = VSETVL(n-j); | |||||
v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | |||||
v_res = VFREDSUMVS_FLOAT(v0, v_res, gvl); | |||||
j += gvl; | |||||
} | |||||
} | |||||
asumf = EXTRACT_FLOAT(v_res); | |||||
return(asumf); | |||||
} | |||||
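sum differs from asum only in skipping the absolute value: the new file accumulates two unrolled partial sums per trip and folds them into the m1 result once per branch, with the tail reduced block by block. A scalar model of the unit-stride path, using the block width as a stand-in for gvl:

    #include <stdio.h>

    static float block_sum(const float *x, long n, long gvl) {
        float res = 0.0f;                   /* v_res, seeded with 0 */
        long j = 0;
        if (gvl <= n / 2) {
            float acc = 0.0f;               /* v_sum, collapsed to scalar */
            for (long i = 0; i < n / (gvl * 2); ++i) {
                for (long k = 0; k < gvl; ++k) acc += x[j + k];        /* v0 */
                for (long k = 0; k < gvl; ++k) acc += x[j + gvl + k];  /* v1 */
                j += gvl * 2;
            }
            res += acc;                     /* VFREDSUMVS folds v_sum in */
        }
        for (; j < n; ++j)                  /* tail, one block at a time */
            res += x[j];
        return res;
    }

    int main(void) {
        float x[11] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
        printf("%g\n", block_sum(x, 11, 2));    /* 66 */
        return 0;
    }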
@@ -27,35 +27,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#include "common.h" | #include "common.h" | ||||
#include <stdio.h> | #include <stdio.h> | ||||
#if !defined(DOUBLE) | |||||
#define VSETVL(n) vsetvl_e32m8(n) | |||||
#define VSETVL_MAX vsetvlmax_e32m1() | |||||
#define FLOAT_V_T vfloat32m8_t | |||||
#define VLEV_FLOAT vle32_v_f32m8 | |||||
#define VLSEV_FLOAT vlse32_v_f32m8 | |||||
#define VSEV_FLOAT vse32_v_f32m8 | |||||
#define VSSEV_FLOAT vsse32_v_f32m8 | |||||
#ifdef RISCV64_ZVL256B | |||||
# define LMUL m2 | |||||
# if defined(DOUBLE) | |||||
# define ELEN 64 | |||||
# define MLEN 32 | |||||
# else | |||||
# define ELEN 32 | |||||
# define MLEN 16 | |||||
# endif | |||||
#else | #else | ||||
#define VSETVL(n) vsetvl_e64m8(n) | |||||
#define VSETVL_MAX vsetvlmax_e64m1() | |||||
#define FLOAT_V_T vfloat64m8_t | |||||
#define VLEV_FLOAT vle64_v_f64m8 | |||||
#define VLSEV_FLOAT vlse64_v_f64m8 | |||||
#define VSEV_FLOAT vse64_v_f64m8 | |||||
#define VSSEV_FLOAT vsse64_v_f64m8 | |||||
# define LMUL m8 | |||||
# if defined(DOUBLE) | |||||
# define ELEN 64 | |||||
# define MLEN 8 | |||||
# else | |||||
# define ELEN 32 | |||||
# define MLEN 4 | |||||
# endif | |||||
#endif | #endif | ||||
#define _ | |||||
#define JOIN2_X(x, y) x ## y | |||||
#define JOIN2(x, y) JOIN2_X(x, y) | |||||
#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) | |||||
#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) | |||||
#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) | |||||
#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) | |||||
#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) | |||||
#define VSEV_FLOAT JOIN(__riscv_vse, ELEN, _v_f, ELEN, LMUL) | |||||
#define VSSEV_FLOAT JOIN(__riscv_vsse, ELEN, _v_f, ELEN, LMUL) | |||||
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | ||||
{ | { | ||||
BLASLONG i = 0, j = 0; | BLASLONG i = 0, j = 0; | ||||
BLASLONG ix = 0,iy = 0; | BLASLONG ix = 0,iy = 0; | ||||
BLASLONG stride_x, stride_y; | BLASLONG stride_x, stride_y; | ||||
FLOAT_V_T vx0, vx1, vy0, vy1; | FLOAT_V_T vx0, vx1, vy0, vy1; | ||||
unsigned int gvl = 0; | |||||
if (n < 0) return(0); | if (n < 0) return(0); | ||||
unsigned int gvl = VSETVL((inc_x != 0 && inc_y != 0) ? n : 1); | |||||
if( inc_x == 0 && inc_y == 0 ) { n = n & 1; } | |||||
if(inc_x == 1 && inc_y == 1){ | if(inc_x == 1 && inc_y == 1){ | ||||
gvl = VSETVL(n); | |||||
if(gvl <= n/2){ | if(gvl <= n/2){ | ||||
for(i=0,j=0; i<n/(2*gvl); i++){ | for(i=0,j=0; i<n/(2*gvl); i++){ | ||||
vx0 = VLEV_FLOAT(&x[j], gvl); | vx0 = VLEV_FLOAT(&x[j], gvl); | ||||
@@ -79,7 +96,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, | |||||
j+=gvl; | j+=gvl; | ||||
} | } | ||||
}else if (inc_y == 1){ | }else if (inc_y == 1){ | ||||
gvl = VSETVL(n); | |||||
stride_x = inc_x * sizeof(FLOAT); | stride_x = inc_x * sizeof(FLOAT); | ||||
if(gvl <= n/2){ | if(gvl <= n/2){ | ||||
BLASLONG inc_xv = inc_x * gvl; | BLASLONG inc_xv = inc_x * gvl; | ||||
@@ -107,7 +123,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, | |||||
ix += inc_x * gvl; | ix += inc_x * gvl; | ||||
} | } | ||||
}else if(inc_x == 1){ | }else if(inc_x == 1){ | ||||
gvl = VSETVL(n); | |||||
stride_y = inc_y * sizeof(FLOAT); | stride_y = inc_y * sizeof(FLOAT); | ||||
if(gvl <= n/2){ | if(gvl <= n/2){ | ||||
BLASLONG inc_yv = inc_y * gvl; | BLASLONG inc_yv = inc_y * gvl; | ||||
@@ -135,7 +150,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, | |||||
iy += inc_y * gvl; | iy += inc_y * gvl; | ||||
} | } | ||||
}else{ | }else{ | ||||
gvl = VSETVL(n); | |||||
stride_x = inc_x * sizeof(FLOAT); | stride_x = inc_x * sizeof(FLOAT); | ||||
stride_y = inc_y * sizeof(FLOAT); | stride_y = inc_y * sizeof(FLOAT); | ||||
if(gvl <= n/2){ | if(gvl <= n/2){ | ||||
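The new inc_x == 0 && inc_y == 0 special case in the swap hunk reduces n to n & 1: n swaps of the same two scalars compose to the identity when n is even and to a single swap when n is odd. A short demonstration of that equivalence:

    #include <stdio.h>

    int main(void) {
        float x0 = 1.0f, y0 = 2.0f;
        const int n = 5;
        for (int i = 0; i < n; ++i) {     /* n aliased swaps... */
            float t = x0; x0 = y0; y0 = t;
        }
        /* ...equal n & 1 swaps: odd n leaves the pair exchanged */
        printf("x0=%g y0=%g (n & 1 = %d)\n", x0, y0, n & 1);
        return 0;
    }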
@@ -27,37 +27,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
+#define VSETVL(n) __riscv_vsetvl_e32m4(n)
+#define VSETVL_MAX __riscv_vsetvlmax_e32m1()
 #define FLOAT_V_T vfloat32m4_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLEV_FLOAT vle32_v_f32m4
-#define VLSEV_FLOAT vlse32_v_f32m4
-#define VSEV_FLOAT vse32_v_f32m4
-#define VSSEV_FLOAT vsse32_v_f32m4
-#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
-#define VFMACCVV_FLOAT vfmacc_vv_f32m4
-#define VFMACCVF_FLOAT vfmacc_vf_f32m4
-#define VFMVVF_FLOAT vfmv_v_f_f32m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFMULVV_FLOAT vfmul_vv_f32m4
+#define VLEV_FLOAT __riscv_vle32_v_f32m4
+#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
+#define VSEV_FLOAT __riscv_vse32_v_f32m4
+#define VSSEV_FLOAT __riscv_vsse32_v_f32m4
+#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1
+#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4
+#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
+#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4
 #else
-#define VSETVL(n) vsetvl_e64m4(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
+#define VSETVL(n) __riscv_vsetvl_e64m4(n)
+#define VSETVL_MAX __riscv_vsetvlmax_e64m1()
 #define FLOAT_V_T vfloat64m4_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLEV_FLOAT vle64_v_f64m4
-#define VLSEV_FLOAT vlse64_v_f64m4
-#define VSEV_FLOAT vse64_v_f64m4
-#define VSSEV_FLOAT vsse64_v_f64m4
-#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
-#define VFMACCVV_FLOAT vfmacc_vv_f64m4
-#define VFMACCVF_FLOAT vfmacc_vf_f64m4
-#define VFMVVF_FLOAT vfmv_v_f_f64m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFMULVV_FLOAT vfmul_vv_f64m4
+#define VLEV_FLOAT __riscv_vle64_v_f64m4
+#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
+#define VSEV_FLOAT __riscv_vse64_v_f64m4
+#define VSSEV_FLOAT __riscv_vsse64_v_f64m4
+#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1
+#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4
+#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
+#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4
 #endif
 int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
@@ -99,8 +97,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
             i += gvl;
         }
-        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-        temp2 = VFMVFS_FLOAT(v_res);
+        v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+        temp2 = EXTRACT_FLOAT(v_res);
         if(i < m){
             gvl = VSETVL(m-i);
             vy = VLEV_FLOAT(&y[i], gvl);
@@ -110,8 +108,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
             vx = VLEV_FLOAT(&x[i], gvl);
             vr = VFMULVV_FLOAT(vx, va, gvl);
-            v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-            temp2 += VFMVFS_FLOAT(v_res);
+            v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+            temp2 += EXTRACT_FLOAT(v_res);
         }
     }
     y[j] += alpha * temp2;
@@ -144,8 +142,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
             i += gvl;
             iy += inc_yv;
         }
-        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-        temp2 = VFMVFS_FLOAT(v_res);
+        v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+        temp2 = EXTRACT_FLOAT(v_res);
         if(i < m){
             gvl = VSETVL(m-i);
             vy = VLSEV_FLOAT(&y[iy], stride_y, gvl);
@@ -155,8 +153,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
             vx = VLEV_FLOAT(&x[i], gvl);
             vr = VFMULVV_FLOAT(vx, va, gvl);
-            v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-            temp2 += VFMVFS_FLOAT(v_res);
+            v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+            temp2 += EXTRACT_FLOAT(v_res);
         }
     }
     y[jy] += alpha * temp2;
@@ -190,8 +188,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
             i += gvl;
             ix += inc_xv;
         }
-        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-        temp2 = VFMVFS_FLOAT(v_res);
+        v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+        temp2 = EXTRACT_FLOAT(v_res);
         if(i < m){
             gvl = VSETVL(m-i);
             vy = VLEV_FLOAT(&y[i], gvl);
@@ -201,8 +199,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
             vx = VLSEV_FLOAT(&x[ix], stride_x, gvl);
             vr = VFMULVV_FLOAT(vx, va, gvl);
-            v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-            temp2 += VFMVFS_FLOAT(v_res);
+            v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+            temp2 += EXTRACT_FLOAT(v_res);
         }
     }
     y[j] += alpha * temp2;
@@ -241,8 +239,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
             ix += inc_xv;
             iy += inc_yv;
         }
-        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-        temp2 = VFMVFS_FLOAT(v_res);
+        v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+        temp2 = EXTRACT_FLOAT(v_res);
         if(i < m){
             gvl = VSETVL(m-i);
             vy = VLSEV_FLOAT(&y[iy], stride_y, gvl);
@@ -252,8 +250,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
             vx = VLSEV_FLOAT(&x[ix], stride_x, gvl);
             vr = VFMULVV_FLOAT(vx, va, gvl);
-            v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-            temp2 += VFMVFS_FLOAT(v_res);
+            v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+            temp2 += EXTRACT_FLOAT(v_res);
         }
     }
     y[jy] += alpha * temp2;
@@ -27,39 +27,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
+#define VSETVL(n) __riscv_vsetvl_e32m4(n)
+#define VSETVL_MAX __riscv_vsetvlmax_e32m1()
 #define FLOAT_V_T vfloat32m4_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLEV_FLOAT vle32_v_f32m4
-#define VLSEV_FLOAT vlse32_v_f32m4
-#define VSEV_FLOAT vse32_v_f32m4
-#define VSSEV_FLOAT vsse32_v_f32m4
-#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
-#define VFMACCVV_FLOAT vfmacc_vv_f32m4
-#define VFMACCVF_FLOAT vfmacc_vf_f32m4
-#define VFMVVF_FLOAT vfmv_v_f_f32m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFDOTVV_FLOAT vfdot_vv_f32m4
-#define VFMULVV_FLOAT vfmul_vv_f32m4
+#define VLEV_FLOAT __riscv_vle32_v_f32m4
+#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
+#define VSEV_FLOAT __riscv_vse32_v_f32m4
+#define VSSEV_FLOAT __riscv_vsse32_v_f32m4
+#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1
+#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4
+#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
+#define VFDOTVV_FLOAT __riscv_vfdot_vv_f32m4
+#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4
 #else
-#define VSETVL(n) vsetvl_e64m4(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
+#define VSETVL(n) __riscv_vsetvl_e64m4(n)
+#define VSETVL_MAX __riscv_vsetvlmax_e64m1()
 #define FLOAT_V_T vfloat64m4_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLEV_FLOAT vle64_v_f64m4
-#define VLSEV_FLOAT vlse64_v_f64m4
-#define VSEV_FLOAT vse64_v_f64m4
-#define VSSEV_FLOAT vsse64_v_f64m4
-#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
-#define VFMACCVV_FLOAT vfmacc_vv_f64m4
-#define VFMACCVF_FLOAT vfmacc_vf_f64m4
-#define VFMVVF_FLOAT vfmv_v_f_f64m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFDOTVV_FLOAT vfdot_vv_f64m4
-#define VFMULVV_FLOAT vfmul_vv_f64m4
+#define VLEV_FLOAT __riscv_vle64_v_f64m4
+#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
+#define VSEV_FLOAT __riscv_vse64_v_f64m4
+#define VSSEV_FLOAT __riscv_vsse64_v_f64m4
+#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1
+#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4
+#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
+#define VFDOTVV_FLOAT __riscv_vfdot_vv_f64m4
+#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4
 #endif
 int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
@@ -101,8 +99,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
             i += gvl;
         }
-        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-        temp2 = VFMVFS_FLOAT(v_res);
+        v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+        temp2 = EXTRACT_FLOAT(v_res);
         if(i < j){
             gvl = VSETVL(j-i);
             vy = VLEV_FLOAT(&y[i], gvl);
@@ -112,8 +110,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
             vx = VLEV_FLOAT(&x[i], gvl);
             vr = VFMULVV_FLOAT(vx, va, gvl);
-            v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-            temp2 += VFMVFS_FLOAT(v_res);
+            v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+            temp2 += EXTRACT_FLOAT(v_res);
         }
     }
     y[j] += temp1 * a_ptr[j] + alpha * temp2;
@@ -145,8 +143,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
             i += gvl;
             iy += inc_yv;
         }
-        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-        temp2 = VFMVFS_FLOAT(v_res);
+        v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+        temp2 = EXTRACT_FLOAT(v_res);
         if(i < j){
             gvl = VSETVL(j-i);
             vy = VLSEV_FLOAT(&y[iy], stride_y, gvl);
@@ -156,8 +154,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
             vx = VLEV_FLOAT(&x[i], gvl);
             vr = VFMULVV_FLOAT(vx, va, gvl);
-            v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-            temp2 += VFMVFS_FLOAT(v_res);
+            v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+            temp2 += EXTRACT_FLOAT(v_res);
         }
     }
     y[jy] += temp1 * a_ptr[j] + alpha * temp2;
@@ -190,8 +188,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
             i += gvl;
             ix += inc_xv;
         }
-        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-        temp2 = VFMVFS_FLOAT(v_res);
+        v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+        temp2 = EXTRACT_FLOAT(v_res);
         if(i < j){
             gvl = VSETVL(j-i);
             vy = VLEV_FLOAT(&y[i], gvl);
@@ -201,8 +199,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
             vx = VLSEV_FLOAT(&x[ix], stride_x, gvl);
             vr = VFMULVV_FLOAT(vx, va, gvl);
-            v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-            temp2 += VFMVFS_FLOAT(v_res);
+            v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+            temp2 += EXTRACT_FLOAT(v_res);
         }
     }
     y[j] += temp1 * a_ptr[j] + alpha * temp2;
@@ -240,8 +238,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
             ix += inc_xv;
             iy += inc_yv;
         }
-        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-        temp2 = VFMVFS_FLOAT(v_res);
+        v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+        temp2 = EXTRACT_FLOAT(v_res);
         if(i < j){
             gvl = VSETVL(j-i);
             vy = VLSEV_FLOAT(&y[iy], stride_y, gvl);
@@ -251,8 +249,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
             vx = VLSEV_FLOAT(&x[ix], stride_x, gvl);
             vr = VFMULVV_FLOAT(vx, va, gvl);
-            v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-            temp2 += VFMVFS_FLOAT(v_res);
+            v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+            temp2 += EXTRACT_FLOAT(v_res);
         }
     }
     y[jy] += temp1 * a_ptr[j] + alpha * temp2;
@@ -28,40 +28,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 #include <math.h>
-#if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m8(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
-#define FLOAT_V_T vfloat32m8_t
-#define FLOAT_V_T_M1 vfloat32m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLSEV_FLOAT vlse32_v_f32m8
-#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
-#define MASK_T vbool4_t
-#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
-#define VFMVVF_FLOAT vfmv_v_f_f32m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m
-#define VFMAXVV_FLOAT vfmax_vv_f32m8
-#define VFADDVV_FLOAT vfadd_vv_f32m8
+#ifdef RISCV64_ZVL256B
+# define LMUL m2
+# if defined(DOUBLE)
+#  define ELEN 64
+#  define MLEN 32
+# else
+#  define ELEN 32
+#  define MLEN 16
+# endif
 #else
-#define VSETVL(n) vsetvl_e64m8(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
-#define FLOAT_V_T vfloat64m8_t
-#define FLOAT_V_T_M1 vfloat64m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLSEV_FLOAT vlse64_v_f64m8
-#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
-#define MASK_T vbool8_t
-#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
-#define VFMVVF_FLOAT vfmv_v_f_f64m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m
-#define VFMAXVV_FLOAT vfmax_vv_f64m8
-#define VFADDVV_FLOAT vfadd_vv_f64m8
+# define LMUL m8
+# if defined(DOUBLE)
+#  define ELEN 64
+#  define MLEN 8
+# else
+#  define ELEN 32
+#  define MLEN 4
+# endif
 #endif
+#define _
+#define JOIN2_X(x, y) x ## y
+#define JOIN2(x, y) JOIN2_X(x, y)
+#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
+#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _)
+#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _)
+#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _)
+#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL)
+#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL)
+#define VFREDMAXVS_FLOAT JOIN(__riscv_vfredmax_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1))
+#define MASK_T JOIN(vbool, MLEN, _t, _, _)
+#define VMFLTVF_FLOAT JOIN(__riscv_vmflt_vf_f, ELEN, LMUL, _b, MLEN)
+#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _)
+#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _)
+#define VFRSUBVF_MASK_FLOAT JOIN(__riscv_vfrsub,_vf_f, ELEN, LMUL, _m)
+#define VFMAXVV_FLOAT JOIN(__riscv_vfmax, _vv_f, ELEN, LMUL, _)
+#define VFADDVV_FLOAT JOIN(__riscv_vfadd, _vv_f, ELEN, LMUL, _)
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
     BLASLONG i=0, j=0;
@@ -70,10 +75,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
     if (n <= 0 || inc_x <= 0) return(maxf);
     unsigned int gvl = 0;
     FLOAT_V_T v0, v1, v_max;
-    FLOAT_V_T_M1 v_res, v_z0;
-    gvl = VSETVL_MAX;
-    v_res = VFMVVF_FLOAT_M1(0, gvl);
-    v_z0 = VFMVVF_FLOAT_M1(0, gvl);
+    FLOAT_V_T_M1 v_res;
+    v_res = VFMVVF_FLOAT_M1(0, 1);
     MASK_T mask0, mask1;
     BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2;
@@ -84,9 +87,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
         v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
         v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
         mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-        v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
+        v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, 0, gvl);
         mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
-        v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl);
+        v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, 0, gvl);
         v0 = VFADDVV_FLOAT(v0, v1, gvl);
         v_max = VFMAXVV_FLOAT(v_max, v0, gvl);
@@ -94,22 +97,19 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
         j += gvl;
         ix += inc_xv;
     }
-    v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl);
-    maxf = VFMVFS_FLOAT(v_res);
+    v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl);
     if(j<n){
         gvl = VSETVL(n-j);
         v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
         v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
         mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-        v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
+        v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, 0, gvl);
         mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
-        v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl);
+        v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, 0, gvl);
         v1 = VFADDVV_FLOAT(v0, v1, gvl);
-        v_res = VFREDMAXVS_FLOAT(v_res, v1, v_z0, gvl);
-        if(VFMVFS_FLOAT(v_res)> maxf)
-            maxf = VFMVFS_FLOAT(v_res);
+        v_res = VFREDMAXVS_FLOAT(v1, v_res, gvl);
     }
+    maxf = EXTRACT_FLOAT(v_res);
     return(maxf);
 }
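The JOIN machinery added throughout these kernels builds RVV intrinsic names by token-pasting the element width (ELEN) and register grouping (LMUL), so one source file serves both precisions and both the generic and ZVL256B layouts. A standalone sketch of the expansion, runnable with any C compiler; the NAME_AS_STRING helper is ours, purely for demonstration:

#include <stdio.h>

#define _
#define JOIN2_X(x, y) x ## y
#define JOIN2(x, y) JOIN2_X(x, y)
#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)

#define ELEN 32
#define LMUL m2

/* The empty `_` macro pads the five-slot JOIN when a name needs
 * only four fragments; JOIN2's indirection through JOIN2_X forces
 * ELEN/LMUL to expand before ## pastes the tokens together. */
#define NAME_AS_STRING_X(t) #t
#define NAME_AS_STRING(t) NAME_AS_STRING_X(t)

int main(void) {
    puts(NAME_AS_STRING(JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _))); /* __riscv_vsetvl_e32m2 */
    puts(NAME_AS_STRING(JOIN(vfloat, ELEN, LMUL, _t, _)));         /* vfloat32m2_t */
    return 0;
}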
@@ -29,38 +29,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <math.h>
 #include <float.h>
-#if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m8(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
-#define FLOAT_V_T vfloat32m8_t
-#define FLOAT_V_T_M1 vfloat32m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLSEV_FLOAT vlse32_v_f32m8
-#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
-#define MASK_T vbool4_t
-#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
-#define VFMVVF_FLOAT vfmv_v_f_f32m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m
-#define VFMINVV_FLOAT vfmin_vv_f32m8
-#define VFADDVV_FLOAT vfadd_vv_f32m8
+#ifdef RISCV64_ZVL256B
+# define LMUL m2
+# if defined(DOUBLE)
+#  define ELEN 64
+#  define MLEN 32
+# else
+#  define ELEN 32
+#  define MLEN 16
+# endif
 #else
-#define VSETVL(n) vsetvl_e64m8(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
-#define FLOAT_V_T vfloat64m8_t
-#define FLOAT_V_T_M1 vfloat64m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLSEV_FLOAT vlse64_v_f64m8
-#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
-#define MASK_T vbool8_t
-#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
-#define VFMVVF_FLOAT vfmv_v_f_f64m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m
-#define VFMINVV_FLOAT vfmin_vv_f64m8
-#define VFADDVV_FLOAT vfadd_vv_f64m8
+# define LMUL m8
+# if defined(DOUBLE)
+#  define ELEN 64
+#  define MLEN 8
+# else
+#  define ELEN 32
+#  define MLEN 4
+# endif
 #endif
+#define _
+#define JOIN2_X(x, y) x ## y
+#define JOIN2(x, y) JOIN2_X(x, y)
+#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
+#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _)
+#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _)
+#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _)
+#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL)
+#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL)
+#define VFREDMINVS_FLOAT JOIN(__riscv_vfredmin_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1))
+#define MASK_T JOIN(vbool, MLEN, _t, _, _)
+#define VMFLTVF_FLOAT JOIN(__riscv_vmflt_vf_f, ELEN, LMUL, _b, MLEN)
+#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _)
+#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _)
+#define VFRSUBVF_MASK_FLOAT JOIN(__riscv_vfrsub,_vf_f, ELEN, LMUL, _m)
+#define VFMINVV_FLOAT JOIN(__riscv_vfmin, _vv_f, ELEN, LMUL, _)
+#define VFADDVV_FLOAT JOIN(__riscv_vfadd, _vv_f, ELEN, LMUL, _)
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
     BLASLONG i=0, j=0;
@@ -69,10 +77,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
     FLOAT minf=FLT_MAX;
     unsigned int gvl = 0;
     FLOAT_V_T v0, v1, v_min;
-    FLOAT_V_T_M1 v_res, v_max;
-    gvl = VSETVL_MAX;
-    v_res = VFMVVF_FLOAT_M1(0, gvl);
-    v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl);
+    FLOAT_V_T_M1 v_res;
+    v_res = VFMVVF_FLOAT_M1(FLT_MAX, 1);
     MASK_T mask0, mask1;
     BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2;
@@ -83,9 +89,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
         v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
         v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
         mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-        v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
+        v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, 0, gvl);
         mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
-        v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl);
+        v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, 0, gvl);
         v0 = VFADDVV_FLOAT(v0, v1, gvl);
         v_min = VFMINVV_FLOAT(v_min, v0, gvl);
@@ -93,21 +99,20 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
         j += gvl;
         ix += inc_xv;
     }
-    v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
-    minf = VFMVFS_FLOAT(v_res);
+    v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl);
     if(j<n){
         gvl = VSETVL(n-j);
         v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
         v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
         mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-        v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
+        v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, 0, gvl);
         mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
-        v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl);
+        v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, 0, gvl);
         v1 = VFADDVV_FLOAT(v0, v1, gvl);
-        v_res = VFREDMINVS_FLOAT(v_res, v1, v_max, gvl);
-        if(VFMVFS_FLOAT(v_res) < minf)
-            minf = VFMVFS_FLOAT(v_res);
+        v_res = VFREDMINVS_FLOAT(v1, v_res, gvl);
    }
+    minf = EXTRACT_FLOAT(v_res);
    return(minf);
 }
@@ -28,37 +28,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 #include <math.h>
-#if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m8(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
-#define FLOAT_V_T vfloat32m8_t
-#define FLOAT_V_T_M1 vfloat32m1_t
-#define VFFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLEV_FLOAT vle32_v_f32m8
-#define VLSEV_FLOAT vlse32_v_f32m8
-#define VFREDSUMVS_FLOAT vfredusum_vs_f32m8_f32m1
-#define MASK_T vbool4_t
-#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
-#define VFMVVF_FLOAT vfmv_v_f_f32m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m
-#define VFADDVV_FLOAT vfadd_vv_f32m8
+#ifdef RISCV64_ZVL256B
+# define LMUL m2
+# if defined(DOUBLE)
+#  define ELEN 64
+#  define MLEN _b32
+# else
+#  define ELEN 32
+#  define MLEN _b16
+# endif
 #else
-#define VSETVL(n) vsetvl_e64m8(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
-#define FLOAT_V_T vfloat64m8_t
-#define FLOAT_V_T_M1 vfloat64m1_t
-#define VFFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLEV_FLOAT vle64_v_f64m8
-#define VLSEV_FLOAT vlse64_v_f64m8
-#define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1
-#define MASK_T vbool8_t
-#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
-#define VFMVVF_FLOAT vfmv_v_f_f64m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m
-#define VFADDVV_FLOAT vfadd_vv_f64m8
+# define LMUL m8
+# if defined(DOUBLE)
+#  define ELEN 64
+#  define MLEN _b8
+# else
+#  define ELEN 32
+#  define MLEN _b4
+# endif
 #endif
+#define _
+#define JOIN2_X(x, y) x ## y
+#define JOIN2(x, y) JOIN2_X(x, y)
+#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
+#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _)
+#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _)
+#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _)
+#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL)
+#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL)
+#define VFREDSUMVS_FLOAT JOIN(__riscv_vfredusum_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1))
+#define VFABS_FLOAT JOIN(__riscv_vfabs, _v_f, ELEN, LMUL, _)
+#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _)
+#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _)
+#define VFADDVV_FLOAT JOIN(__riscv_vfadd, _vv_f, ELEN, LMUL, _)
+#define VMFLTVF_FLOAT JOIN(__riscv_vmflt, _vf_f, ELEN, LMUL, MLEN)
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
     BLASLONG i=0, j=0;
@@ -67,12 +73,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
     if (n <= 0 || inc_x <= 0) return(asumf);
     unsigned int gvl = 0;
     FLOAT_V_T v0, v1, v_zero,v_sum;
-    FLOAT_V_T_M1 v_res, v_z0;
-    gvl = VSETVL_MAX;
-    v_res = VFMVVF_FLOAT_M1(0, gvl);
-    v_z0 = VFMVVF_FLOAT_M1(0, gvl);
+    FLOAT_V_T_M1 v_res;
+    v_res = VFMVVF_FLOAT_M1(0, 1);
-    MASK_T mask0, mask1;
     if(inc_x == 1){
         BLASLONG n2 = n * 2;
         gvl = VSETVL(n2);
@@ -81,26 +84,21 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
         v_sum = VFMVVF_FLOAT(0, gvl);
         for(i=0,j=0; i<n2/(gvl*2); i++){
             v0 = VLEV_FLOAT(&x[j], gvl);
-            mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-            v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
+            v0 = VFABS_FLOAT(v0, gvl);
             v_sum = VFADDVV_FLOAT(v_sum, v0, gvl);
             v1 = VLEV_FLOAT(&x[j+gvl], gvl);
-            mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
-            v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl);
+            v1 = VFABS_FLOAT(v1, gvl);
             v_sum = VFADDVV_FLOAT(v_sum, v1, gvl);
             j += gvl * 2;
         }
-        v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, gvl);
-        asumf += VFFMVFS_FLOAT(v_res);
+        v_res = VFREDSUMVS_FLOAT(v_sum, v_res, gvl);
     }
     for(;j<n2;){
         gvl = VSETVL(n2-j);
         v0 = VLEV_FLOAT(&x[j], gvl);
-        mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-        v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
-        v_res = VFREDSUMVS_FLOAT(v_res, v0, v_z0, gvl);
-        asumf += VFFMVFS_FLOAT(v_res);
+        v0 = VFABS_FLOAT(v0, gvl);
+        v_res = VFREDSUMVS_FLOAT(v0, v_res, gvl);
         j += gvl;
     }
     }else{
@@ -112,34 +110,29 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
         v_sum = VFMVVF_FLOAT(0, gvl);
         for(i=0,j=0; i<n/gvl; i++){
             v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
-            mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-            v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
+            v0 = VFABS_FLOAT(v0, gvl);
             v_sum = VFADDVV_FLOAT(v_sum, v0, gvl);
             v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
-            mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
-            v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl);
+            v1 = VFABS_FLOAT(v1, gvl);
             v_sum = VFADDVV_FLOAT(v_sum, v1, gvl);
             j += gvl;
             ix += inc_xv;
         }
-        v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, gvl);
-        asumf += VFFMVFS_FLOAT(v_res);
+        v_res = VFREDSUMVS_FLOAT(v_sum, v_res, gvl);
         if(j<n){
             gvl = VSETVL(n-j);
             v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
-            mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-            v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
-            v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
+            v0 = VFABS_FLOAT(v0, gvl);
-            mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
-            v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl);
+            v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
+            v1 = VFABS_FLOAT(v1, gvl);
             v_sum = VFADDVV_FLOAT(v0, v1, gvl);
-            v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, gvl);
-            asumf += VFFMVFS_FLOAT(v_res);
+            v_res = VFREDSUMVS_FLOAT(v_sum, v_res, gvl);
         }
     }
+    asumf = EXTRACT_FLOAT(v_res);
    return(asumf);
 }
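The asum rewrite above also swaps the old two-instruction absolute value (a vmflt mask followed by a masked vfrsub) for a single VFABS. A scalar model of the two idioms, for comparison; the helper names are ours:

#include <math.h>

/* Scalar equivalents of the two |x| idioms in the asum kernels above. */
static inline float abs_masked(float v)
{
    /* old path: vmflt builds a mask of negative lanes, and vfrsub_m
     * computes 0 - v only in those lanes; note -0.0f is left alone,
     * since -0.0f < 0.0f is false */
    return (v < 0.0f) ? (0.0f - v) : v;
}

static inline float abs_direct(float v)
{
    /* new path: vfabs clears the sign bit in every lane, so -0.0f
     * becomes +0.0f as well */
    return fabsf(v);
}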
@@ -28,25 +28,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
+#define VSETVL(n) __riscv_vsetvl_e32m4(n)
 #define FLOAT_V_T vfloat32m4_t
-#define VLSEV_FLOAT vlse32_v_f32m4
-#define VSSEV_FLOAT vsse32_v_f32m4
-#define VFMACCVF_FLOAT vfmacc_vf_f32m4
-#define VFMVVF_FLOAT vfmv_v_f_f32m4
-#define VFMULVF_FLOAT vfmul_vf_f32m4
-#define VFMSACVF_FLOAT vfmsac_vf_f32m4
-#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4
+#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
+#define VSSEV_FLOAT __riscv_vsse32_v_f32m4
+#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4
+#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m4
+#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f32m4
+#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4
 #else
-#define VSETVL(n) vsetvl_e64m4(n)
+#define VSETVL(n) __riscv_vsetvl_e64m4(n)
 #define FLOAT_V_T vfloat64m4_t
-#define VLSEV_FLOAT vlse64_v_f64m4
-#define VSSEV_FLOAT vsse64_v_f64m4
-#define VFMACCVF_FLOAT vfmacc_vf_f64m4
-#define VFMVVF_FLOAT vfmv_v_f_f64m4
-#define VFMULVF_FLOAT vfmul_vf_f64m4
-#define VFMSACVF_FLOAT vfmsac_vf_f64m4
-#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4
+#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
+#define VSSEV_FLOAT __riscv_vsse64_v_f64m4
+#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4
+#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m4
+#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f64m4
+#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4
 #endif
 int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FLOAT beta_r, FLOAT beta_i, FLOAT *y, BLASLONG inc_y)
@@ -28,19 +28,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
+#define VSETVL(n) __riscv_vsetvl_e32m4(n)
 #define FLOAT_V_T vfloat32m4_t
-#define VLSEV_FLOAT vlse32_v_f32m4
-#define VSSEV_FLOAT vsse32_v_f32m4
-#define VFMACCVF_FLOAT vfmacc_vf_f32m4
-#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4
+#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
+#define VSSEV_FLOAT __riscv_vsse32_v_f32m4
+#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4
+#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4
 #else
-#define VSETVL(n) vsetvl_e64m4(n)
+#define VSETVL(n) __riscv_vsetvl_e64m4(n)
 #define FLOAT_V_T vfloat64m4_t
-#define VLSEV_FLOAT vlse64_v_f64m4
-#define VSSEV_FLOAT vsse64_v_f64m4
-#define VFMACCVF_FLOAT vfmacc_vf_f64m4
-#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4
+#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
+#define VSSEV_FLOAT __riscv_vsse64_v_f64m4
+#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4
+#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4
 #endif
 int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
@@ -27,15 +27,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
+#define VSETVL(n) __riscv_vsetvl_e32m4(n)
 #define FLOAT_V_T vfloat32m4_t
-#define VLSEV_FLOAT vlse32_v_f32m4
-#define VSSEV_FLOAT vsse32_v_f32m4
+#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
+#define VSSEV_FLOAT __riscv_vsse32_v_f32m4
 #else
-#define VSETVL(n) vsetvl_e64m4(n)
+#define VSETVL(n) __riscv_vsetvl_e64m4(n)
 #define FLOAT_V_T vfloat64m4_t
-#define VLSEV_FLOAT vlse64_v_f64m4
-#define VSSEV_FLOAT vsse64_v_f64m4
+#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
+#define VSSEV_FLOAT __riscv_vsse64_v_f64m4
 #endif
@@ -27,37 +27,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
+#define VSETVL(n) __riscv_vsetvl_e32m4(n)
+#define VSETVL_MAX __riscv_vsetvlmax_e32m1()
 #define FLOAT_V_T vfloat32m4_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLEV_FLOAT vle32_v_f32m4
-#define VLSEV_FLOAT vlse32_v_f32m4
-#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
-#define VFMACCVV_FLOAT vfmacc_vv_f32m4
-#define VFMVVF_FLOAT vfmv_v_f_f32m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFDOTVV_FLOAT vfdot_vv_f32m4
-#define VFMULVV_FLOAT vfmul_vv_f32m4
-#define VFMSACVV_FLOAT vfmsac_vv_f32m4
-#define VFNMSACVV_FLOAT vfnmsac_vv_f32m4
+#define VFMVFS_FLOAT __riscv_vfmv_f_s_f32m1_f32
+#define VLEV_FLOAT __riscv_vle32_v_f32m4
+#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
+#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1
+#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
+#define VFDOTVV_FLOAT __riscv_vfdot_vv_f32m4
+#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4
+#define VFMSACVV_FLOAT __riscv_vfmsac_vv_f32m4
+#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4
 #else
-#define VSETVL(n) vsetvl_e64m4(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
+#define VSETVL(n) __riscv_vsetvl_e64m4(n)
+#define VSETVL_MAX __riscv_vsetvlmax_e64m1()
 #define FLOAT_V_T vfloat64m4_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLEV_FLOAT vle64_v_f64m4
-#define VLSEV_FLOAT vlse64_v_f64m4
-#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
-#define VFMACCVV_FLOAT vfmacc_vv_f64m4
-#define VFMVVF_FLOAT vfmv_v_f_f64m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFDOTVV_FLOAT vfdot_vv_f64m4
-#define VFMULVV_FLOAT vfmul_vv_f64m4
-#define VFMSACVV_FLOAT vfmsac_vv_f64m4
-#define VFNMSACVV_FLOAT vfnmsac_vv_f64m4
+#define VFMVFS_FLOAT __riscv_vfmv_f_s_f64m1_f64
+#define VLEV_FLOAT __riscv_vle64_v_f64m4
+#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
+#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1
+#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
+#define VFDOTVV_FLOAT __riscv_vfdot_vv_f64m4
+#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4
+#define VFMSACVV_FLOAT __riscv_vfmsac_vv_f64m4
+#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4
 #endif
 OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
@@ -109,9 +109,9 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
         ix += inc_xv;
         iy += inc_yv;
     }
-    v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl);
+    v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl);
     dot[0] += VFMVFS_FLOAT(v_res);
-    v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl);
+    v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl);
     dot[1] += VFMVFS_FLOAT(v_res);
     //tail
     if(j < n){
@@ -132,9 +132,9 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
         vr1 = VFMULVV_FLOAT(vx1, vy0, gvl);
         vr1 = VFMSACVV_FLOAT(vr1, vx0, vy1, gvl);
 #endif
-        v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl);
+        v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl);
         dot[0] += VFMVFS_FLOAT(v_res);
-        v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl);
+        v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl);
         dot[1] += VFMVFS_FLOAT(v_res);
     }
     CREAL(result) = dot[0];
@@ -0,0 +1,140 @@
+#include "common.h"
+
+/* for debugging/unit tests
+ * this is a drop-in replacement for zgemm/cgemm/ztrmm/ctrmm kernels that supports arbitrary combinations of unroll values
+ */
+
+#ifdef TRMMKERNEL
+#if defined(LEFT) != defined(TRANSA)
+#define BACKWARDS
+#endif
+#endif
+
+#ifdef DOUBLE
+#define UNROLL_M ZGEMM_DEFAULT_UNROLL_M
+#define UNROLL_N ZGEMM_DEFAULT_UNROLL_N
+#else
+#define UNROLL_M CGEMM_DEFAULT_UNROLL_M
+#define UNROLL_N CGEMM_DEFAULT_UNROLL_N
+#endif
+
+int CNAME(BLASLONG M,BLASLONG N,BLASLONG K,FLOAT alphar,FLOAT alphai,FLOAT* A,FLOAT* B,FLOAT* C,BLASLONG ldc
+#ifdef TRMMKERNEL
+    ,BLASLONG offset
+#endif
+    )
+{
+    FLOAT res[UNROLL_M*UNROLL_N*2];
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+    FLOAT sign[4] = { 1, -1, 1, 1};
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+    FLOAT sign[4] = { 1, 1, 1, -1};
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+    FLOAT sign[4] = { 1, 1, -1, 1};
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+    FLOAT sign[4] = { 1, -1, -1, -1};
+#endif
+    BLASLONG n_packing = UNROLL_N;
+    BLASLONG n_top = 0;
+    while(n_top < N)
+    {
+        while( n_top+n_packing > N )
+            n_packing >>= 1;
+        BLASLONG m_packing = UNROLL_M;
+        BLASLONG m_top = 0;
+        while (m_top < M)
+        {
+            while( m_top+m_packing > M )
+                m_packing >>= 1;
+            BLASLONG ai = K*m_top*2;
+            BLASLONG bi = K*n_top*2;
+            BLASLONG pass_K = K;
+#ifdef TRMMKERNEL
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * m_packing*2;
+            bi += off * n_packing*2;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + m_packing;
+#else
+            pass_K = off + n_packing;
+#endif
+#endif
+#endif
+            memset( res, 0, UNROLL_M*UNROLL_N*2*sizeof(FLOAT) );
+            for (BLASLONG k=0; k<pass_K; k+=1)
+            {
+                for( BLASLONG ki = 0; ki < n_packing; ++ki )
+                {
+                    FLOAT B0 = B[bi+ki*2+0];
+                    FLOAT B1 = B[bi+ki*2+1];
+                    for( BLASLONG kj = 0; kj < m_packing; ++kj )
+                    {
+                        FLOAT A0 = A[ai+kj*2+0];
+                        FLOAT A1 = A[ai+kj*2+1];
+                        res[(ki*UNROLL_M+kj)*2+0] += sign[0]*A0*B0 +sign[1]*A1*B1;
+                        res[(ki*UNROLL_M+kj)*2+1] += sign[2]*A1*B0 +sign[3]*A0*B1;
+                    }
+                }
+                ai += m_packing*2;
+                bi += n_packing*2;
+            }
+            BLASLONG cofs = ldc * n_top + m_top;
+            for( BLASLONG ki = 0; ki < n_packing; ++ki )
+            {
+                for( BLASLONG kj = 0; kj < m_packing; ++kj )
+                {
+#ifdef TRMMKERNEL
+                    FLOAT Cr = 0;
+                    FLOAT Ci = 0;
+#else
+                    FLOAT Cr = C[(cofs+ki*ldc+kj)*2+0];
+                    FLOAT Ci = C[(cofs+ki*ldc+kj)*2+1];
+#endif
+                    Cr += res[(ki*UNROLL_M+kj)*2+0]*alphar;
+                    Cr += -res[(ki*UNROLL_M+kj)*2+1]*alphai;
+                    Ci += res[(ki*UNROLL_M+kj)*2+1]*alphar;
+                    Ci += res[(ki*UNROLL_M+kj)*2+0]*alphai;
+                    C[(cofs+ki*ldc+kj)*2+0] = Cr;
+                    C[(cofs+ki*ldc+kj)*2+1] = Ci;
+                }
+            }
+            m_top += m_packing;
+        }
+        n_top += n_packing;
+    }
+    return 0;
+}
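The sign[] tables in the generic kernel above fold the four conjugation variants into one inner loop: for a = a0 + i*a1 and b = b0 + i*b1, the kernel accumulates sign[0]*a0*b0 + sign[1]*a1*b1 into the real slot and sign[2]*a1*b0 + sign[3]*a0*b1 into the imaginary slot. A quick scalar check of the non-conjugated convention, with arbitrary values of our choosing:

#include <stdio.h>

/* Scalar check of the sign[] convention used by the generic kernel,
 * for the non-conjugated (NN/NT/TN/TT) case: sign = { 1, -1, 1, 1 }. */
int main(void)
{
    double a0 = 2, a1 = 3;   /* a = 2 + 3i */
    double b0 = 5, b1 = 7;   /* b = 5 + 7i */
    double sign[4] = { 1, -1, 1, 1 };
    double re = sign[0]*a0*b0 + sign[1]*a1*b1;   /* 10 - 21 = -11 */
    double im = sign[2]*a1*b0 + sign[3]*a0*b1;   /* 15 + 14 =  29 */
    printf("(2+3i)*(5+7i) = %g + %gi\n", re, im); /* expect -11 + 29i */
    return 0;
}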
@@ -27,23 +27,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
+#define VSETVL(n) __riscv_vsetvl_e32m4(n)
 #define FLOAT_V_T vfloat32m4_t
-#define VLEV_FLOAT vle32_v_f32m4
-#define VLSEV_FLOAT vlse32_v_f32m4
-#define VSEV_FLOAT vse32_v_f32m4
-#define VSSEV_FLOAT vsse32_v_f32m4
-#define VFMACCVF_FLOAT vfmacc_vf_f32m4
-#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4
+#define VLEV_FLOAT __riscv_vle32_v_f32m4
+#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
+#define VSEV_FLOAT __riscv_vse32_v_f32m4
+#define VSSEV_FLOAT __riscv_vsse32_v_f32m4
+#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4
+#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4
 #else
-#define VSETVL(n) vsetvl_e64m4(n)
+#define VSETVL(n) __riscv_vsetvl_e64m4(n)
 #define FLOAT_V_T vfloat64m4_t
-#define VLEV_FLOAT vle64_v_f64m4
-#define VLSEV_FLOAT vlse64_v_f64m4
-#define VSEV_FLOAT vse64_v_f64m4
-#define VSSEV_FLOAT vsse64_v_f64m4
-#define VFMACCVF_FLOAT vfmacc_vf_f64m4
-#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4
+#define VLEV_FLOAT __riscv_vle64_v_f64m4
+#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
+#define VSEV_FLOAT __riscv_vse64_v_f64m4
+#define VSSEV_FLOAT __riscv_vsse64_v_f64m4
+#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4
+#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4
 #endif
 int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
@@ -27,31 +27,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
-#define FLOAT_V_T vfloat32m4_t
+#define VSETVL(n) __riscv_vsetvl_e32m2(n)
+#define VSETVL_MAX __riscv_vsetvlmax_e32m1()
+#define FLOAT_V_T vfloat32m2_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLSEV_FLOAT vlse32_v_f32m4
-#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
-#define VFMACCVV_FLOAT vfmacc_vv_f32m4
-#define VFNMSACVV_FLOAT vfnmsac_vv_f32m4
-#define VFMVVF_FLOAT vfmv_v_f_f32m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFMULVV_FLOAT vfmul_vv_f32m4
+#define VFMVFS_FLOAT __riscv_vfmv_f_s_f32m1_f32
+#define VLSEV_FLOAT __riscv_vlse32_v_f32m2
+#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m2_f32m1
+#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m2
+#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m2
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
+#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m2
 #else
-#define VSETVL(n) vsetvl_e64m4(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
-#define FLOAT_V_T vfloat64m4_t
+#define VSETVL(n) __riscv_vsetvl_e64m2(n)
+#define VSETVL_MAX __riscv_vsetvlmax_e64m1()
+#define FLOAT_V_T vfloat64m2_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLSEV_FLOAT vlse64_v_f64m4
-#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
-#define VFMACCVV_FLOAT vfmacc_vv_f64m4
-#define VFNMSACVV_FLOAT vfnmsac_vv_f64m4
-#define VFMVVF_FLOAT vfmv_v_f_f64m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFMULVV_FLOAT vfmul_vv_f64m4
+#define VFMVFS_FLOAT __riscv_vfmv_f_s_f64m1_f64
+#define VLSEV_FLOAT __riscv_vlse64_v_f64m2
+#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m2_f64m1
+#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m2
+#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m2
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
+#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m2
 #endif
 int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
@@ -62,49 +62,43 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
     FLOAT temp_r, temp_i;
     FLOAT_V_T va0, va1, vx0, vx1, vr, vi;
-    unsigned int gvl = 0;
-    FLOAT_V_T_M1 v_res, v_z0;
-    gvl = VSETVL_MAX;
-    v_res = VFMVVF_FLOAT_M1(0, gvl);
-    v_z0 = VFMVVF_FLOAT_M1(0, gvl);
+    unsigned int gvl = VSETVL(m);
+    FLOAT_V_T_M1 v_res_r, v_res_i;
     BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2;
     BLASLONG stride_a = sizeof(FLOAT) * 2;
-    gvl = VSETVL(m);
     BLASLONG inc_xv = inc_x * gvl * 2;
     BLASLONG inc_av = gvl * 2;
     BLASLONG inc_y2 = inc_y * 2;
     BLASLONG lda2 = lda * 2;
     for(i = 0; i < n; i++){
+        v_res_r = VFMVVF_FLOAT_M1(0, 1);
+        v_res_i = VFMVVF_FLOAT_M1(0, 1);
         gvl = VSETVL(m);
         j = 0;
         ix = 0;
-        vr = VFMVVF_FLOAT(0, gvl);
-        vi = VFMVVF_FLOAT(0, gvl);
         for(k = 0; k < m/gvl; k++){
             va0 = VLSEV_FLOAT(&a_ptr[j], stride_a, gvl);
             va1 = VLSEV_FLOAT(&a_ptr[j+1], stride_a, gvl);
             vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
             vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
 #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
-            vr = VFMACCVV_FLOAT(vr, va0, vx0, gvl);
+            vr = VFMULVV_FLOAT(va0, vx0, gvl);
+            vi = VFMULVV_FLOAT(va0, vx1, gvl);
             vr = VFNMSACVV_FLOAT(vr, va1, vx1, gvl);
-            vi = VFMACCVV_FLOAT(vi, va0, vx1, gvl);
             vi = VFMACCVV_FLOAT(vi, va1, vx0, gvl);
 #else
-            vr = VFMACCVV_FLOAT(vr, va0, vx0, gvl);
+            vr = VFMULVV_FLOAT(va0, vx0, gvl);
+            vi = VFMULVV_FLOAT(va0, vx1, gvl);
             vr = VFMACCVV_FLOAT(vr, va1, vx1, gvl);
-            vi = VFMACCVV_FLOAT(vi, va0, vx1, gvl);
             vi = VFNMSACVV_FLOAT(vi, va1, vx0, gvl);
 #endif
+            v_res_r = VFREDSUM_FLOAT(vr, v_res_r, gvl);
+            v_res_i = VFREDSUM_FLOAT(vi, v_res_i, gvl);
             j += inc_av;
             ix += inc_xv;
         }
-        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-        temp_r = VFMVFS_FLOAT(v_res);
-        v_res = VFREDSUM_FLOAT(v_res, vi, v_z0, gvl);
-        temp_i = VFMVFS_FLOAT(v_res);
         if(j/2 < m){
             gvl = VSETVL(m-j/2);
             va0 = VLSEV_FLOAT(&a_ptr[j], stride_a, gvl);
@@ -113,21 +107,23 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
             vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
 #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
             vr = VFMULVV_FLOAT(va0, vx0, gvl);
-            vr = VFNMSACVV_FLOAT(vr, va1, vx1, gvl);
             vi = VFMULVV_FLOAT(va0, vx1, gvl);
+            vr = VFNMSACVV_FLOAT(vr, va1, vx1, gvl);
             vi = VFMACCVV_FLOAT(vi, va1, vx0, gvl);
 #else
             vr = VFMULVV_FLOAT(va0, vx0, gvl);
-            vr = VFMACCVV_FLOAT(vr, va1, vx1, gvl);
             vi = VFMULVV_FLOAT(va0, vx1, gvl);
+            vr = VFMACCVV_FLOAT(vr, va1, vx1, gvl);
             vi = VFNMSACVV_FLOAT(vi, va1, vx0, gvl);
 #endif
-            v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-            temp_r += VFMVFS_FLOAT(v_res);
-            v_res = VFREDSUM_FLOAT(v_res, vi, v_z0, gvl);
-            temp_i += VFMVFS_FLOAT(v_res);
+            v_res_r = VFREDSUM_FLOAT(vr, v_res_r, gvl);
+            v_res_i = VFREDSUM_FLOAT(vi, v_res_i, gvl);
         }
+        temp_r = VFMVFS_FLOAT(v_res_r);
+        temp_i = VFMVFS_FLOAT(v_res_i);
 #if !defined(XCONJ)
         y[iy] += alpha_r * temp_r - alpha_i * temp_i;
         y[iy+1] += alpha_r * temp_i + alpha_i * temp_r;
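The restructured gemv loop above also moves the reduction inside the loop: instead of growing wide vr/vi accumulators across all iterations and reducing once at the end, each iteration folds its partial products into the one-element v_res_r/v_res_i registers, and the previous round's result seeds the next vfredusum through its scalar operand. A scalar model of that seeded-reduction pattern, assuming the reduction returns its scalar input plus the sum of the active lanes (the function name is ours):

/* Hypothetical scalar model of seeding each reduction with the previous
 * result, as the rewritten loop does with v_res_r / v_res_i. */
double reduce_seeded(const double *chunk, int vl, double seed)
{
    double s = seed;               /* plays the role of the m1 scalar input */
    for (int k = 0; k < vl; k++)   /* the reduction adds vl lanes on top    */
        s += chunk[k];
    return s;
}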
@@ -27,37 +27,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#include "common.h" | #include "common.h" | ||||
#if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
#define VSETVL(n) vsetvl_e32m4(n) | |||||
#define VSETVL_MAX vsetvlmax_e32m1() | |||||
#define VSETVL(n) __riscv_vsetvl_e32m4(n) | |||||
#define VSETVL_MAX __riscv_vsetvlmax_e32m1() | |||||
#define FLOAT_V_T vfloat32m4_t | #define FLOAT_V_T vfloat32m4_t | ||||
#define FLOAT_V_T_M1 vfloat32m1_t | #define FLOAT_V_T_M1 vfloat32m1_t | ||||
#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 | |||||
#define VLSEV_FLOAT vlse32_v_f32m4 | |||||
#define VSSEV_FLOAT vsse32_v_f32m4 | |||||
#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 | |||||
#define VFMACCVV_FLOAT vfmacc_vv_f32m4 | |||||
#define VFMACCVF_FLOAT vfmacc_vf_f32m4 | |||||
#define VFMVVF_FLOAT vfmv_v_f_f32m4 | |||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||||
#define VFMULVV_FLOAT vfmul_vv_f32m4 | |||||
#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 | |||||
#define VFNMSACVV_FLOAT vfnmsac_vv_f32m4 | |||||
#define VFMVFS_FLOAT __riscv_vfmv_f_s_f32m1_f32 | |||||
#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 | |||||
#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 | |||||
#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 | |||||
#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 | |||||
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 | |||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 | |||||
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 | |||||
#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 | |||||
#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 | |||||
#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4 | |||||
#else | #else | ||||
#define VSETVL(n) vsetvl_e64m4(n) | |||||
#define VSETVL_MAX vsetvlmax_e64m1() | |||||
#define VSETVL(n) __riscv_vsetvl_e64m4(n) | |||||
#define VSETVL_MAX __riscv_vsetvlmax_e64m1() | |||||
#define FLOAT_V_T vfloat64m4_t | #define FLOAT_V_T vfloat64m4_t | ||||
#define FLOAT_V_T_M1 vfloat64m1_t | #define FLOAT_V_T_M1 vfloat64m1_t | ||||
#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 | |||||
#define VLSEV_FLOAT vlse64_v_f64m4 | |||||
#define VSSEV_FLOAT vsse64_v_f64m4 | |||||
#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 | |||||
#define VFMACCVV_FLOAT vfmacc_vv_f64m4 | |||||
#define VFMACCVF_FLOAT vfmacc_vf_f64m4 | |||||
#define VFMVVF_FLOAT vfmv_v_f_f64m4 | |||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||||
#define VFMULVV_FLOAT vfmul_vv_f64m4 | |||||
#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 | |||||
#define VFNMSACVV_FLOAT vfnmsac_vv_f64m4 | |||||
#define VFMVFS_FLOAT __riscv_vfmv_f_s_f64m1_f64 | |||||
#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 | |||||
#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 | |||||
#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 | |||||
#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 | |||||
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 | |||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 | |||||
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 | |||||
#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 | |||||
#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 | |||||
#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4 | |||||
#endif | #endif | ||||
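The macro table above is the mechanical part of this port: the pre-ratification RVV intrinsic names (vsetvl_e32m4, vlse32_v_f32m4, vfmacc_vf_f32m4, ...) become the ratified v1.0 spellings with a __riscv_ prefix, same operands and semantics. A compilable sketch of the new spellings in the strided axpy-like shape these kernels use (the function name and signature are illustrative, not the kernel's):

    #include <riscv_vector.h>
    #include <stddef.h>

    /* y[i*sy] += alpha * x[i*sx], strides given in elements. */
    static void axpy_strided(size_t n, float alpha,
                             const float *x, ptrdiff_t sx,
                             float *y, ptrdiff_t sy)
    {
        ptrdiff_t bx = sx * (ptrdiff_t)sizeof(float);   /* byte strides */
        ptrdiff_t by = sy * (ptrdiff_t)sizeof(float);
        for (size_t i = 0; i < n; ) {
            size_t vl = __riscv_vsetvl_e32m4(n - i);              /* was vsetvl_e32m4    */
            vfloat32m4_t vx = __riscv_vlse32_v_f32m4(&x[i * sx], bx, vl); /* was vlse32_v_f32m4 */
            vfloat32m4_t vy = __riscv_vlse32_v_f32m4(&y[i * sy], by, vl);
            vy = __riscv_vfmacc_vf_f32m4(vy, alpha, vx, vl);      /* was vfmacc_vf_f32m4 */
            __riscv_vsse32_v_f32m4(&y[i * sy], by, vy, vl);       /* was vsse32_v_f32m4  */
            i += vl;
        }
    }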
int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){ | int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){ | ||||
@@ -143,9 +143,9 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B | |||||
iy += inc_yv; | iy += inc_yv; | ||||
ia += inc_av; | ia += inc_av; | ||||
} | } | ||||
v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl); | |||||
v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl); | |||||
temp_r2 = VFMVFS_FLOAT(v_res); | temp_r2 = VFMVFS_FLOAT(v_res); | ||||
v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl); | |||||
v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl); | |||||
temp_i2 = VFMVFS_FLOAT(v_res); | temp_i2 = VFMVFS_FLOAT(v_res); | ||||
if(i < m){ | if(i < m){ | ||||
gvl = VSETVL(m-i); | gvl = VSETVL(m-i); | ||||
@@ -181,9 +181,9 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B | |||||
vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); | vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); | ||||
#endif | #endif | ||||
v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl); | |||||
v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl); | |||||
temp_r2 += VFMVFS_FLOAT(v_res); | temp_r2 += VFMVFS_FLOAT(v_res); | ||||
v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl); | |||||
v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl); | |||||
temp_i2 += VFMVFS_FLOAT(v_res); | temp_i2 += VFMVFS_FLOAT(v_res); | ||||
} | } | ||||
} | } | ||||
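The one rename that also changes shape is the reduction: the old vfredusum took an explicit destination as its first operand, VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl), while the ratified intrinsic drops it and returns the result, VFREDSUM_FLOAT(vr0, v_z0, gvl), with the second operand acting as the scalar seed. A hedged before/after, single precision assumed:

    #include <riscv_vector.h>

    /* Pre-ratification shape (no longer compiles with v1.0 headers):
     *   v_res = vfredusum_vs_f32m4_f32m1(v_res, vr0, v_z0, gvl);
     * Ratified shape: seed in vs1, result returned, no dest operand. */
    static float reduce_seeded(vfloat32m4_t vr0, size_t gvl)
    {
        vfloat32m1_t v_z0  = __riscv_vfmv_v_f_f32m1(0.0f, 1);
        vfloat32m1_t v_res = __riscv_vfredusum_vs_f32m4_f32m1(vr0, v_z0, gvl);
        return __riscv_vfmv_f_s_f32m1_f32(v_res);   /* v_res[0] = sum + seed */
    }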
@@ -27,37 +27,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#include "common.h" | #include "common.h" | ||||
#if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
#define VSETVL(n) vsetvl_e32m4(n) | |||||
#define VSETVL_MAX vsetvlmax_e32m1() | |||||
#define VSETVL(n) __riscv_vsetvl_e32m4(n) | |||||
#define VSETVL_MAX __riscv_vsetvlmax_e32m1() | |||||
#define FLOAT_V_T vfloat32m4_t | #define FLOAT_V_T vfloat32m4_t | ||||
#define FLOAT_V_T_M1 vfloat32m1_t | #define FLOAT_V_T_M1 vfloat32m1_t | ||||
#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 | |||||
#define VLSEV_FLOAT vlse32_v_f32m4 | |||||
#define VSSEV_FLOAT vsse32_v_f32m4 | |||||
#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 | |||||
#define VFMACCVV_FLOAT vfmacc_vv_f32m4 | |||||
#define VFMACCVF_FLOAT vfmacc_vf_f32m4 | |||||
#define VFMVVF_FLOAT vfmv_v_f_f32m4 | |||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||||
#define VFMULVV_FLOAT vfmul_vv_f32m4 | |||||
#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 | |||||
#define VFNMSACVV_FLOAT vfnmsac_vv_f32m4 | |||||
#define VFMVFS_FLOAT __riscv_vfmv_f_s_f32m1_f32 | |||||
#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 | |||||
#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 | |||||
#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 | |||||
#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 | |||||
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 | |||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 | |||||
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 | |||||
#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 | |||||
#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 | |||||
#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4 | |||||
#else | #else | ||||
#define VSETVL(n) vsetvl_e64m4(n) | |||||
#define VSETVL_MAX vsetvlmax_e64m1() | |||||
#define VSETVL(n) __riscv_vsetvl_e64m4(n) | |||||
#define VSETVL_MAX __riscv_vsetvlmax_e64m1() | |||||
#define FLOAT_V_T vfloat64m4_t | #define FLOAT_V_T vfloat64m4_t | ||||
#define FLOAT_V_T_M1 vfloat64m1_t | #define FLOAT_V_T_M1 vfloat64m1_t | ||||
#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 | |||||
#define VLSEV_FLOAT vlse64_v_f64m4 | |||||
#define VSSEV_FLOAT vsse64_v_f64m4 | |||||
#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 | |||||
#define VFMACCVV_FLOAT vfmacc_vv_f64m4 | |||||
#define VFMACCVF_FLOAT vfmacc_vf_f64m4 | |||||
#define VFMVVF_FLOAT vfmv_v_f_f64m4 | |||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||||
#define VFMULVV_FLOAT vfmul_vv_f64m4 | |||||
#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 | |||||
#define VFNMSACVV_FLOAT vfnmsac_vv_f64m4 | |||||
#define VFMVFS_FLOAT __riscv_vfmv_f_s_f64m1_f64 | |||||
#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 | |||||
#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 | |||||
#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 | |||||
#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 | |||||
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 | |||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 | |||||
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 | |||||
#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 | |||||
#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 | |||||
#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4 | |||||
#endif | #endif | ||||
int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){ | int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){ | ||||
@@ -142,9 +142,9 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B | |||||
iy += inc_yv; | iy += inc_yv; | ||||
ia += inc_av; | ia += inc_av; | ||||
} | } | ||||
v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl); | |||||
v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl); | |||||
temp_r2 = VFMVFS_FLOAT(v_res); | temp_r2 = VFMVFS_FLOAT(v_res); | ||||
v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl); | |||||
v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl); | |||||
temp_i2 = VFMVFS_FLOAT(v_res); | temp_i2 = VFMVFS_FLOAT(v_res); | ||||
if(i < j){ | if(i < j){ | ||||
gvl = VSETVL(j-i); | gvl = VSETVL(j-i); | ||||
@@ -180,9 +180,9 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B | |||||
vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); | vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); | ||||
#endif | #endif | ||||
v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl); | |||||
v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl); | |||||
temp_r2 += VFMVFS_FLOAT(v_res); | temp_r2 += VFMVFS_FLOAT(v_res); | ||||
v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl); | |||||
v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl); | |||||
temp_i2 += VFMVFS_FLOAT(v_res); | temp_i2 += VFMVFS_FLOAT(v_res); | ||||
} | } | ||||
} | } | ||||
@@ -26,264 +26,151 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*****************************************************************************/ | *****************************************************************************/ | ||||
#include "common.h" | #include "common.h" | ||||
#if !defined(DOUBLE) | |||||
#define VSETVL(n) vsetvl_e32m4(n) | |||||
#define VSETVL_MAX vsetvlmax_e32m1() | |||||
#define FLOAT_V_T vfloat32m4_t | |||||
#define FLOAT_V_T_M1 vfloat32m1_t | |||||
#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 | |||||
#define VLEV_FLOAT vle32_v_f32m4 | |||||
#define VLSEV_FLOAT vlse32_v_f32m4 | |||||
#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 | |||||
#define VFMACCVV_FLOAT vfmacc_vv_f32m4 | |||||
#define VFMVVF_FLOAT vfmv_v_f_f32m4 | |||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||||
#define VFDOTVV_FLOAT vfdot_vv_f32m4 | |||||
#define ABS fabsf | |||||
#define MASK_T vbool8_t | |||||
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m4_m | |||||
#define VMFGTVF_FLOAT vmfgt_vf_f32m4_b8 | |||||
#define VMFIRSTM vmfirst_m_b8 | |||||
#define VFDIVVF_FLOAT vfdiv_vf_f32m4 | |||||
#define VMFLTVF_FLOAT vmflt_vf_f32m4_b8 | |||||
#define VFREDMAXVS_FLOAT vfredmax_vs_f32m4_f32m1 | |||||
#ifdef RISCV64_ZVL256B | |||||
# define LMUL m1 | |||||
# if defined(DOUBLE) | |||||
# define ELEN 64 | |||||
# define MLEN 64 | |||||
# else | |||||
# define ELEN 32 | |||||
# define MLEN 32 | |||||
# endif | |||||
#else | #else | ||||
#define VSETVL(n) vsetvl_e64m4(n) | |||||
#define VSETVL_MAX vsetvlmax_e64m1() | |||||
#define FLOAT_V_T vfloat64m4_t | |||||
#define FLOAT_V_T_M1 vfloat64m1_t | |||||
#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 | |||||
#define VLEV_FLOAT vle64_v_f64m4 | |||||
#define VLSEV_FLOAT vlse64_v_f64m4 | |||||
#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 | |||||
#define VFMACCVV_FLOAT vfmacc_vv_f64m4 | |||||
#define VFMVVF_FLOAT vfmv_v_f_f64m4 | |||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||||
#define VFDOTVV_FLOAT vfdot_vv_f64m4 | |||||
# define LMUL m4 | |||||
# if defined(DOUBLE) | |||||
# define ELEN 64 | |||||
# define MLEN 16 | |||||
# else | |||||
# define ELEN 32 | |||||
# define MLEN 8 | |||||
# endif | |||||
#endif | |||||
#define _ | |||||
#define JOIN2_X(x, y) x ## y | |||||
#define JOIN2(x, y) JOIN2_X(x, y) | |||||
#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) | |||||
#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) | |||||
#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) | |||||
#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) | |||||
#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) | |||||
#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) | |||||
#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _) | |||||
#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _) | |||||
#define MASK_T JOIN(vbool, MLEN, _t, _, _) | |||||
#define VFABS JOIN(__riscv_vfabs, _v_f, ELEN, LMUL, _) | |||||
#define VMFNE JOIN(__riscv_vmfne_vf_f,ELEN, LMUL, _b, MLEN) | |||||
#define VMFGT JOIN(__riscv_vmfgt_vv_f,ELEN, LMUL, _b, MLEN) | |||||
#define VMFEQ JOIN(__riscv_vmfeq_vv_f,ELEN, LMUL, _b, MLEN) | |||||
#define VCPOP JOIN(__riscv_vcpop, _m_b, MLEN, _, _) | |||||
#define VFREDMAX JOIN(__riscv_vfredmax_vs_f,ELEN,LMUL, JOIN2(_f, ELEN), m1) | |||||
#define VFIRST JOIN(__riscv_vfirst, _m_b, MLEN, _, _) | |||||
#define VRGATHER JOIN(__riscv_vrgather, _vx_f, ELEN, LMUL, _) | |||||
#define VFDIV JOIN(__riscv_vfdiv, _vf_f, ELEN, LMUL, _) | |||||
#define VFDIV_M JOIN(__riscv_vfdiv, _vv_f, ELEN, LMUL, _mu) | |||||
#define VFMUL JOIN(__riscv_vfmul, _vv_f, ELEN, LMUL, _) | |||||
#define VFMACC JOIN(__riscv_vfmacc, _vv_f, ELEN, LMUL, _) | |||||
#define VFMACC_M JOIN(__riscv_vfmacc, _vv_f, ELEN, LMUL, _mu) | |||||
#define VMSOF JOIN(__riscv_vmsof, _m_b, MLEN, _, _) | |||||
#define VMANDN JOIN(__riscv_vmandn, _mm_b, MLEN, _, _) | |||||
#define VFREDUSUM JOIN(__riscv_vfredusum_vs_f,ELEN,LMUL, JOIN2(_f, ELEN), m1) | |||||
#if defined(DOUBLE) | |||||
#define ABS fabs | #define ABS fabs | ||||
#define MASK_T vbool16_t | |||||
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m4_m | |||||
#define VMFGTVF_FLOAT vmfgt_vf_f64m4_b16 | |||||
#define VMFIRSTM vmfirst_m_b16 | |||||
#define VFDIVVF_FLOAT vfdiv_vf_f64m4 | |||||
#define VMFLTVF_FLOAT vmflt_vf_f64m4_b16 | |||||
#define VFREDMAXVS_FLOAT vfredmax_vs_f64m4_f64m1 | |||||
#else | |||||
#define ABS fabsf | |||||
#endif | #endif | ||||
#define EXTRACT_FLOAT0_V(v) JOIN(__riscv_vfmv_f_s_f, ELEN, LMUL, _f, ELEN)(v) | |||||
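The JOIN machinery replaces the per-type #define tables: intrinsic names are assembled by token pasting from ELEN and LMUL, and the empty `_` macro fills slots a given name does not need, so the same source builds both the generic m4 kernel and the ZVL256B m1 variant. The expansions can be checked on any host compiler with the usual two-step stringize trick; this standalone demo is not part of the kernel:

    #include <stdio.h>

    #define _
    #define JOIN2_X(x, y) x ## y
    #define JOIN2(x, y)   JOIN2_X(x, y)
    #define JOIN(v, w, x, y, z) JOIN2(JOIN2(JOIN2(JOIN2(v, w), x), y), z)

    #define ELEN 32
    #define LMUL m4
    #define STR_X(x) #x
    #define STR(x)   STR_X(x)

    int main(void)
    {
        puts(STR(JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _))); /* __riscv_vsetvl_e32m4 */
        puts(STR(JOIN(vfloat, ELEN, LMUL, _t, _)));         /* vfloat32m4_t */
        return 0;
    }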
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | ||||
{ | { | ||||
BLASLONG i=0, j=0; | |||||
BLASLONG i=0; | |||||
if ( n < 0 ) return(0.0); | |||||
// if(n == 1) return (ABS(x[0])); | |||||
if(n < 0) return(0.0); | |||||
FLOAT_V_T vr, v0, v_zero; | |||||
FLOAT_V_T v_ssq, v_scale, v0, v1, v_zero; | |||||
unsigned int gvl = 0; | unsigned int gvl = 0; | ||||
FLOAT_V_T_M1 v_res, v_z0; | FLOAT_V_T_M1 v_res, v_z0; | ||||
gvl = VSETVL_MAX; | |||||
v_res = VFMVVF_FLOAT_M1(0, gvl); | |||||
v_z0 = VFMVVF_FLOAT_M1(0, gvl); | |||||
FLOAT scale = 0.0, ssq = 0.0; | |||||
MASK_T mask; | |||||
BLASLONG index = 0; | |||||
if(inc_x == 1){ | |||||
BLASLONG n2 = n * 2; | |||||
gvl = VSETVL(n2); | |||||
vr = VFMVVF_FLOAT(0, gvl); | |||||
v_zero = VFMVVF_FLOAT(0, gvl); | |||||
for(i=0,j=0; i<n2/gvl; i++){ | |||||
v0 = VLEV_FLOAT(&x[j], gvl); | |||||
//fabs(vector) | |||||
mask = VMFLTVF_FLOAT(v0, 0, gvl); | |||||
v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl); | |||||
//if scale change | |||||
mask = VMFGTVF_FLOAT(v0, scale, gvl); | |||||
index = VMFIRSTM(mask, gvl); | |||||
if(index == -1){//no elements greater than scale | |||||
if(scale != 0.0){ | |||||
v0 = VFDIVVF_FLOAT(v0, scale, gvl); | |||||
vr = VFMACCVV_FLOAT(vr, v0, v0, gvl); | |||||
} | |||||
}else{//found greater element | |||||
//ssq in vector vr: vr[0] | |||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
//total ssq before current vector | |||||
ssq += VFMVFS_FLOAT(v_res); | |||||
//find max | |||||
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl); | |||||
//update ssq before max_index | |||||
ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res)); | |||||
//update scale | |||||
scale = VFMVFS_FLOAT(v_res); | |||||
//ssq in vector vr | |||||
v0 = VFDIVVF_FLOAT(v0, scale, gvl); | |||||
vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl); | |||||
} | |||||
j += gvl; | |||||
} | |||||
//ssq in vector vr: vr[0] | |||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
//total ssq now | |||||
ssq += VFMVFS_FLOAT(v_res); | |||||
v_res = VFMVVF_FLOAT_M1(0, 1); | |||||
v_z0 = VFMVVF_FLOAT_M1(0, 1); | |||||
//tail | |||||
if(j < n2){ | |||||
gvl = VSETVL(n2-j); | |||||
v0 = VLEV_FLOAT(&x[j], gvl); | |||||
//fabs(vector) | |||||
mask = VMFLTVF_FLOAT(v0, 0, gvl); | |||||
v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl); | |||||
//if scale change | |||||
mask = VMFGTVF_FLOAT(v0, scale, gvl); | |||||
index = VMFIRSTM(mask, gvl); | |||||
if(index == -1){//no elements greater than scale | |||||
if(scale != 0.0) | |||||
v0 = VFDIVVF_FLOAT(v0, scale, gvl); | |||||
}else{//found greater element | |||||
//find max | |||||
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl); | |||||
//update ssq before max_index | |||||
ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res)); | |||||
//update scale | |||||
scale = VFMVFS_FLOAT(v_res); | |||||
v0 = VFDIVVF_FLOAT(v0, scale, gvl); | |||||
} | |||||
vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl); | |||||
//ssq in vector vr: vr[0] | |||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
//total ssq now | |||||
ssq += VFMVFS_FLOAT(v_res); | |||||
} | |||||
}else{ | |||||
gvl = VSETVL(n); | |||||
vr = VFMVVF_FLOAT(0, gvl); | |||||
v_zero = VFMVVF_FLOAT(0, gvl); | |||||
unsigned int stride_x = inc_x * sizeof(FLOAT) * 2; | |||||
int idx = 0, inc_v = inc_x * gvl * 2; | |||||
for(i=0,j=0; i<n/gvl; i++){ | |||||
v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl); | |||||
//fabs(vector) | |||||
mask = VMFLTVF_FLOAT(v0, 0, gvl); | |||||
v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl); | |||||
//if scale change | |||||
mask = VMFGTVF_FLOAT(v0, scale, gvl); | |||||
index = VMFIRSTM(mask, gvl); | |||||
if(index == -1){//no elements greater than scale | |||||
if(scale != 0.0){ | |||||
v0 = VFDIVVF_FLOAT(v0, scale, gvl); | |||||
vr = VFMACCVV_FLOAT(vr, v0, v0, gvl); | |||||
} | |||||
}else{//found greater element | |||||
//ssq in vector vr: vr[0] | |||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
//total ssq before current vector | |||||
ssq += VFMVFS_FLOAT(v_res); | |||||
//find max | |||||
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl); | |||||
//update ssq before max_index | |||||
ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res)); | |||||
//update scale | |||||
scale = VFMVFS_FLOAT(v_res); | |||||
//ssq in vector vr | |||||
v0 = VFDIVVF_FLOAT(v0, scale, gvl); | |||||
vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl); | |||||
} | |||||
gvl = VSETVL(n); | |||||
v_ssq = VFMVVF_FLOAT(0, gvl); | |||||
v_scale = VFMVVF_FLOAT(0, gvl); | |||||
v_zero = VFMVVF_FLOAT(0, gvl); | |||||
v0 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl); | |||||
//fabs(vector) | |||||
mask = VMFLTVF_FLOAT(v0, 0, gvl); | |||||
v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl); | |||||
//if scale change | |||||
mask = VMFGTVF_FLOAT(v0, scale, gvl); | |||||
index = VMFIRSTM(mask, gvl); | |||||
if(index == -1){//no elements greater than scale | |||||
if(scale != 0.0){ | |||||
v0 = VFDIVVF_FLOAT(v0, scale, gvl); | |||||
vr = VFMACCVV_FLOAT(vr, v0, v0, gvl); | |||||
} | |||||
}else{//found greater element | |||||
//ssq in vector vr: vr[0] | |||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
//total ssq before current vector | |||||
ssq += VFMVFS_FLOAT(v_res); | |||||
//find max | |||||
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl); | |||||
//update ssq before max_index | |||||
ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res)); | |||||
//update scale | |||||
scale = VFMVFS_FLOAT(v_res); | |||||
//ssq in vector vr | |||||
v0 = VFDIVVF_FLOAT(v0, scale, gvl); | |||||
vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl); | |||||
} | |||||
j += gvl; | |||||
idx += inc_v; | |||||
unsigned int stride_x = inc_x * sizeof(FLOAT) * 2; | |||||
int idx = 0; | |||||
for(i=0; i<n/gvl; i++){ | |||||
v0 = VLSEV_FLOAT( &x[idx], stride_x, gvl ); | |||||
v1 = VLSEV_FLOAT( &x[idx+1], stride_x, gvl ); | |||||
v0 = VFABS( v0, gvl ); | |||||
v1 = VFABS( v1, gvl ); | |||||
MASK_T scale_mask0 = VMFGT( v0, v_scale, gvl ); | |||||
MASK_T scale_mask1 = VMFGT( v1, v_scale, gvl ); | |||||
if( VCPOP( scale_mask0, gvl ) + VCPOP( scale_mask1, gvl ) > 0 ){ // scale change? | |||||
// find largest element in v0 and v1 | |||||
v_res = VFREDMAX( v0, v_z0, gvl ); | |||||
v_res = VFREDMAX( v1, v_res, gvl ); | |||||
FLOAT const largest_elt = EXTRACT_FLOAT( v_res ); | |||||
v_scale = VFDIV( v_scale, largest_elt, gvl ); // scale/largest_elt | |||||
v_scale = VFMUL( v_scale, v_scale, gvl ); // (scale/largest_elt)*(scale/largest_elt) | |||||
v_ssq = VFMUL( v_scale, v_ssq, gvl ); // ssq*(scale/largest_elt)*(scale/largest_elt) | |||||
v_scale = VFMVVF_FLOAT( largest_elt, gvl ); // splatted largest_elt becomes new scale | |||||
} | } | ||||
//ssq in vector vr: vr[0] | |||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
//total ssq now | |||||
ssq += VFMVFS_FLOAT(v_res); | |||||
//tail | |||||
if(j < n){ | |||||
gvl = VSETVL(n-j); | |||||
v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl); | |||||
//fabs(vector) | |||||
mask = VMFLTVF_FLOAT(v0, 0, gvl); | |||||
v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl); | |||||
//if scale change | |||||
mask = VMFGTVF_FLOAT(v0, scale, gvl); | |||||
index = VMFIRSTM(mask, gvl); | |||||
if(index == -1){//no elements greater than scale | |||||
if(scale != 0.0){ | |||||
v0 = VFDIVVF_FLOAT(v0, scale, gvl); | |||||
vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl); | |||||
MASK_T nonzero_mask0 = VMFNE( v0, 0, gvl ); | |||||
MASK_T nonzero_mask1 = VMFNE( v1, 0, gvl ); | |||||
v0 = VFDIV_M( nonzero_mask0, v_zero, v0, v_scale, gvl ); | |||||
v1 = VFDIV_M( nonzero_mask1, v_zero, v1, v_scale, gvl ); | |||||
v_ssq = VFMACC_M( nonzero_mask0, v_ssq, v0, v0, gvl ); | |||||
v_ssq = VFMACC_M( nonzero_mask1, v_ssq, v1, v1, gvl ); | |||||
idx += inc_x * gvl * 2; | |||||
} | |||||
v_res = VFREDUSUM(v_ssq, v_z0, gvl); | |||||
FLOAT ssq = EXTRACT_FLOAT(v_res); | |||||
FLOAT scale = EXTRACT_FLOAT0_V(v_scale); | |||||
//finish any tail using scalar ops | |||||
i*=gvl; | |||||
if(i<n){ | |||||
i *= inc_x*2; | |||||
n *= inc_x*2; | |||||
FLOAT temp; | |||||
do{ | |||||
if ( x[i] != 0.0 ){ | |||||
temp = ABS( x[i] ); | |||||
if ( scale < temp ){ | |||||
ssq = 1 + ssq * ( scale / temp ) * ( scale / temp ); | |||||
scale = temp ; | |||||
}else{ | |||||
ssq += ( temp / scale ) * ( temp / scale ); | |||||
} | } | ||||
}else{//found greater element | |||||
//find max | |||||
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl); | |||||
//update ssq before max_index | |||||
ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res)); | |||||
//update scale | |||||
scale = VFMVFS_FLOAT(v_res); | |||||
v0 = VFDIVVF_FLOAT(v0, scale, gvl); | |||||
vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl); | |||||
} | } | ||||
v0 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl); | |||||
//fabs(vector) | |||||
mask = VMFLTVF_FLOAT(v0, 0, gvl); | |||||
v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl); | |||||
//if scale change | |||||
mask = VMFGTVF_FLOAT(v0, scale, gvl); | |||||
index = VMFIRSTM(mask, gvl); | |||||
if(index == -1){//no elements greater than scale | |||||
if(scale != 0.0){ | |||||
v0 = VFDIVVF_FLOAT(v0, scale, gvl); | |||||
vr = VFMACCVV_FLOAT(vr, v0, v0, gvl); | |||||
if ( x[i+1] != 0.0 ){ | |||||
temp = ABS( x[i+1] ); | |||||
if ( scale < temp ){ | |||||
ssq = 1 + ssq * ( scale / temp ) * ( scale / temp ); | |||||
scale = temp ; | |||||
}else{ | |||||
ssq += ( temp / scale ) * ( temp / scale ); | |||||
} | } | ||||
}else{//found greater element | |||||
//ssq in vector vr: vr[0] | |||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
//total ssq before current vector | |||||
ssq += VFMVFS_FLOAT(v_res); | |||||
//find max | |||||
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl); | |||||
//update ssq before max_index | |||||
ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res)); | |||||
//update scale | |||||
scale = VFMVFS_FLOAT(v_res); | |||||
v0 = VFDIVVF_FLOAT(v0, scale, gvl); | |||||
vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl); | |||||
} | } | ||||
//ssq in vector vr: vr[0] | |||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||||
//total ssq now | |||||
ssq += VFMVFS_FLOAT(v_res); | |||||
} | |||||
} | |||||
return(scale * sqrt(ssq)); | |||||
} | |||||
i += inc_x*2; | |||||
}while(i<n); | |||||
} | |||||
return(scale * sqrt(ssq)); | |||||
} |
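The rewritten kernel keeps scale and ssq as whole vectors (v_scale, v_ssq) and only drops to scalars for the tail, but both halves implement the same overflow-safe recurrence as reference BLAS nrm2: when a larger magnitude temp appears, the accumulated sum of squares is rescaled by (scale/temp)^2 and restarted at 1; otherwise (temp/scale)^2 is added. A self-contained scalar reference of that recurrence:

    #include <math.h>

    /* Scaled sum of squares: returns ||x||_2 as scale * sqrt(ssq) without
       overflowing on large elements or losing small ones to underflow. */
    static double nrm2_reference(long n, const double *x)
    {
        double scale = 0.0, ssq = 1.0;
        for (long i = 0; i < n; ++i) {
            if (x[i] != 0.0) {
                double temp = fabs(x[i]);
                if (scale < temp) {
                    ssq   = 1.0 + ssq * (scale / temp) * (scale / temp);
                    scale = temp;
                } else {
                    ssq  += (temp / scale) * (temp / scale);
                }
            }
        }
        return scale * sqrt(ssq);
    }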
@@ -27,27 +27,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#include "common.h" | #include "common.h" | ||||
#if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
#define VSETVL(n) vsetvl_e32m4(n) | |||||
#define VSETVL_MAX vsetvlmax_e32m1() | |||||
#define VSETVL(n) __riscv_vsetvl_e32m4(n) | |||||
#define VSETVL_MAX __riscv_vsetvlmax_e32m1() | |||||
#define FLOAT_V_T vfloat32m4_t | #define FLOAT_V_T vfloat32m4_t | ||||
#define VLEV_FLOAT vle32_v_f32m4 | |||||
#define VLSEV_FLOAT vlse32_v_f32m4 | |||||
#define VSEV_FLOAT vse32_v_f32m4 | |||||
#define VSSEV_FLOAT vsse32_v_f32m4 | |||||
#define VFMACCVF_FLOAT vfmacc_vf_f32m4 | |||||
#define VFMULVF_FLOAT vfmul_vf_f32m4 | |||||
#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 | |||||
#define VLEV_FLOAT __riscv_vle32_v_f32m4 | |||||
#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 | |||||
#define VSEV_FLOAT __riscv_vse32_v_f32m4 | |||||
#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 | |||||
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 | |||||
#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m4 | |||||
#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 | |||||
#else | #else | ||||
#define VSETVL(n) vsetvl_e64m4(n) | |||||
#define VSETVL_MAX vsetvlmax_e64m1() | |||||
#define VSETVL(n) __riscv_vsetvl_e64m4(n) | |||||
#define VSETVL_MAX __riscv_vsetvlmax_e64m1() | |||||
#define FLOAT_V_T vfloat64m4_t | #define FLOAT_V_T vfloat64m4_t | ||||
#define VLEV_FLOAT vle64_v_f64m4 | |||||
#define VLSEV_FLOAT vlse64_v_f64m4 | |||||
#define VSEV_FLOAT vse64_v_f64m4 | |||||
#define VSSEV_FLOAT vsse64_v_f64m4 | |||||
#define VFMACCVF_FLOAT vfmacc_vf_f64m4 | |||||
#define VFMULVF_FLOAT vfmul_vf_f64m4 | |||||
#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 | |||||
#define VLEV_FLOAT __riscv_vle64_v_f64m4 | |||||
#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 | |||||
#define VSEV_FLOAT __riscv_vse64_v_f64m4 | |||||
#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 | |||||
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 | |||||
#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m4 | |||||
#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 | |||||
#endif | #endif | ||||
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) | int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) | ||||
@@ -59,7 +59,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT | |||||
unsigned int gvl = 0; | unsigned int gvl = 0; | ||||
FLOAT_V_T vt0, vt1, vx0, vx1, vy0, vy1; | FLOAT_V_T vt0, vt1, vx0, vx1, vy0, vy1; | ||||
gvl = VSETVL(n); | |||||
gvl = VSETVL((inc_x != 0 && inc_y != 0) ? n : 1); | |||||
BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); | BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); | ||||
BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); | BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); | ||||
BLASLONG inc_xv = inc_x * 2 * gvl; | BLASLONG inc_xv = inc_x * 2 * gvl; | ||||
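The guard added here covers the inc == 0 corner case: with a zero increment the byte stride of the strided loads is zero and the index advances inc_xv/inc_yv are zero too, so a full-width gvl would make every lane alias the same x[0]/y[0] pair and the lanes' stores would collapse onto one location, which is not the sequential rot semantics. Clamping the vector length to 1 degenerates the loop into well-defined one-element steps. The shape of the guard, restated:

    /* If either increment is zero, process one element per pass so each
       update sees the previous one; otherwise take the full vector length. */
    size_t gvl = __riscv_vsetvl_e32m4((inc_x != 0 && inc_y != 0) ? n : 1);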
@@ -27,25 +27,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#include "common.h" | #include "common.h" | ||||
#if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
#define VSETVL(n) vsetvl_e32m4(n) | |||||
#define VSETVL_MAX vsetvlmax_e32m1() | |||||
#define VSETVL(n) __riscv_vsetvl_e32m4(n) | |||||
#define VSETVL_MAX __riscv_vsetvlmax_e32m1() | |||||
#define FLOAT_V_T vfloat32m4_t | #define FLOAT_V_T vfloat32m4_t | ||||
#define VLSEV_FLOAT vlse32_v_f32m4 | |||||
#define VSSEV_FLOAT vsse32_v_f32m4 | |||||
#define VFMACCVF_FLOAT vfmacc_vf_f32m4 | |||||
#define VFMULVF_FLOAT vfmul_vf_f32m4 | |||||
#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 | |||||
#define VFMVVF_FLOAT vfmv_v_f_f32m4 | |||||
#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 | |||||
#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 | |||||
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 | |||||
#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m4 | |||||
#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 | |||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 | |||||
#else | #else | ||||
#define VSETVL(n) vsetvl_e64m4(n) | |||||
#define VSETVL_MAX vsetvlmax_e64m1() | |||||
#define VSETVL(n) __riscv_vsetvl_e64m4(n) | |||||
#define VSETVL_MAX __riscv_vsetvlmax_e64m1() | |||||
#define FLOAT_V_T vfloat64m4_t | #define FLOAT_V_T vfloat64m4_t | ||||
#define VLSEV_FLOAT vlse64_v_f64m4 | |||||
#define VSSEV_FLOAT vsse64_v_f64m4 | |||||
#define VFMACCVF_FLOAT vfmacc_vf_f64m4 | |||||
#define VFMULVF_FLOAT vfmul_vf_f64m4 | |||||
#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 | |||||
#define VFMVVF_FLOAT vfmv_v_f_f64m4 | |||||
#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 | |||||
#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 | |||||
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 | |||||
#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m4 | |||||
#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 | |||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 | |||||
#endif | #endif | ||||
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | ||||
@@ -0,0 +1,131 @@ | |||||
/*************************************************************************** | |||||
Copyright (c) 2020, The OpenBLAS Project | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the OpenBLAS project nor the names of | |||||
its contributors may be used to endorse or promote products | |||||
derived from this software without specific prior written permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*****************************************************************************/ | |||||
#include "common.h" | |||||
#include <math.h> | |||||
#ifdef RISCV64_ZVL256B | |||||
# define LMUL m2 | |||||
# if defined(DOUBLE) | |||||
# define ELEN 64 | |||||
# define MLEN _b32 | |||||
# else | |||||
# define ELEN 32 | |||||
# define MLEN _b16 | |||||
# endif | |||||
#else | |||||
# define LMUL m8 | |||||
# if defined(DOUBLE) | |||||
# define ELEN 64 | |||||
# define MLEN _b8 | |||||
# else | |||||
# define ELEN 32 | |||||
# define MLEN _b4 | |||||
# endif | |||||
#endif | |||||
#define _ | |||||
#define JOIN2_X(x, y) x ## y | |||||
#define JOIN2(x, y) JOIN2_X(x, y) | |||||
#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) | |||||
#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) | |||||
#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) | |||||
#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) | |||||
#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) | |||||
#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) | |||||
#define VFREDSUMVS_FLOAT JOIN(__riscv_vfredusum_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1)) | |||||
#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _) | |||||
#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _) | |||||
#define VFADDVV_FLOAT JOIN(__riscv_vfadd, _vv_f, ELEN, LMUL, _) | |||||
#define VMFLTVF_FLOAT JOIN(__riscv_vmflt, _vf_f, ELEN, LMUL, MLEN) | |||||
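Two per-target knobs are set above. The LMUL choice: on a 256-bit (ZVL256B) implementation, m2 already provides the intended unroll width, whereas the generic path needs m8 to reach comparable work per iteration. The MLEN suffix follows the RVV rule that a comparison on SEW-bit elements at LMUL=m<k> yields a vbool<SEW/k>_t mask, hence _b32 for ELEN=64/m2 and _b4 for ELEN=32/m8. A standalone check of that ratio (demo code, not part of the kernel):

    #include <stdio.h>

    /* RVV mask types are vbool<N>_t with N = SEW / LMUL. */
    int main(void)
    {
        printf("ELEN=64, LMUL=m2 -> vbool%d_t\n", 64 / 2); /* _b32 */
        printf("ELEN=32, LMUL=m2 -> vbool%d_t\n", 32 / 2); /* _b16 */
        printf("ELEN=64, LMUL=m8 -> vbool%d_t\n", 64 / 8); /* _b8  */
        printf("ELEN=32, LMUL=m8 -> vbool%d_t\n", 32 / 8); /* _b4  */
        return 0;
    }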
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
{ | |||||
BLASLONG i=0, j=0; | |||||
BLASLONG ix=0; | |||||
FLOAT asumf=0.0; | |||||
if (n <= 0 || inc_x <= 0) return(asumf); | |||||
unsigned int gvl = 0; | |||||
FLOAT_V_T v0, v1, v_zero,v_sum; | |||||
FLOAT_V_T_M1 v_res; | |||||
v_res = VFMVVF_FLOAT_M1(0, 1); | |||||
if(inc_x == 1){ | |||||
BLASLONG n2 = n * 2; | |||||
gvl = VSETVL(n2); | |||||
v_zero = VFMVVF_FLOAT(0, gvl); | |||||
if(gvl <= n2/2){ | |||||
v_sum = VFMVVF_FLOAT(0, gvl); | |||||
for(i=0,j=0; i<n2/(gvl*2); i++){ | |||||
v0 = VLEV_FLOAT(&x[j], gvl); | |||||
v_sum = VFADDVV_FLOAT(v_sum, v0, gvl); | |||||
v1 = VLEV_FLOAT(&x[j+gvl], gvl); | |||||
v_sum = VFADDVV_FLOAT(v_sum, v1, gvl); | |||||
j += gvl * 2; | |||||
} | |||||
v_res = VFREDSUMVS_FLOAT(v_sum, v_res, gvl); | |||||
} | |||||
for(;j<n2;){ | |||||
gvl = VSETVL(n2-j); | |||||
v0 = VLEV_FLOAT(&x[j], gvl); | |||||
v_res = VFREDSUMVS_FLOAT(v0, v_res, gvl); | |||||
j += gvl; | |||||
} | |||||
}else{ | |||||
gvl = VSETVL(n); | |||||
unsigned int stride_x = inc_x * sizeof(FLOAT) * 2; | |||||
v_zero = VFMVVF_FLOAT(0, gvl); | |||||
BLASLONG inc_xv = inc_x * 2 * gvl; | |||||
v_sum = VFMVVF_FLOAT(0, gvl); | |||||
for(i=0,j=0; i<n/gvl; i++){ | |||||
v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||||
v_sum = VFADDVV_FLOAT(v_sum, v0, gvl); | |||||
v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | |||||
v_sum = VFADDVV_FLOAT(v_sum, v1, gvl); | |||||
j += gvl; | |||||
ix += inc_xv; | |||||
} | |||||
v_res = VFREDSUMVS_FLOAT(v_sum, v_res, gvl); | |||||
if(j<n){ | |||||
gvl = VSETVL(n-j); | |||||
v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||||
v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | |||||
v_sum = VFADDVV_FLOAT(v0, v1, gvl); | |||||
v_res = VFREDSUMVS_FLOAT(v_sum, v_res, gvl); | |||||
} | |||||
} | |||||
asumf = EXTRACT_FLOAT(v_res); | |||||
return(asumf); | |||||
} | |||||
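The complex kernel above reads interleaved data as two strided streams: real parts at x[2*inc_x*k] and imaginary parts at x[2*inc_x*k + 1], both with the same byte stride 2*inc_x*sizeof(FLOAT). A condensed sketch of that access pattern feeding a seeded reduction (illustrative names; single precision and m8 grouping assumed, inc_x > 0):

    #include <riscv_vector.h>
    #include <stddef.h>

    /* Sum of re+im over n complex elements stored with stride inc_x. */
    static float csum_strided(size_t n, const float *x, ptrdiff_t inc_x)
    {
        ptrdiff_t stride = inc_x * 2 * (ptrdiff_t)sizeof(float);
        vfloat32m1_t acc = __riscv_vfmv_v_f_f32m1(0.0f, 1);
        size_t ix = 0;
        for (size_t i = 0; i < n; ) {
            size_t vl = __riscv_vsetvl_e32m8(n - i);
            vfloat32m8_t vr = __riscv_vlse32_v_f32m8(&x[ix],     stride, vl); /* re */
            vfloat32m8_t vi = __riscv_vlse32_v_f32m8(&x[ix + 1], stride, vl); /* im */
            vfloat32m8_t vs = __riscv_vfadd_vv_f32m8(vr, vi, vl);
            acc = __riscv_vfredusum_vs_f32m8_f32m1(vs, acc, vl);
            i  += vl;
            ix += (size_t)inc_x * 2 * vl;
        }
        return __riscv_vfmv_f_s_f32m1_f32(acc);
    }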
@@ -27,35 +27,50 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#include "common.h" | #include "common.h" | ||||
#include <stdio.h> | #include <stdio.h> | ||||
#if !defined(DOUBLE) | |||||
#define VSETVL(n) vsetvl_e32m8(n) | |||||
#define VSETVL_MAX vsetvlmax_e32m1() | |||||
#define FLOAT_V_T vfloat32m8_t | |||||
#define VLEV_FLOAT vle32_v_f32m8 | |||||
#define VLSEV_FLOAT vlse32_v_f32m8 | |||||
#define VSEV_FLOAT vse32_v_f32m8 | |||||
#define VSSEV_FLOAT vsse32_v_f32m8 | |||||
#ifdef RISCV64_ZVL256B | |||||
# define LMUL m2 | |||||
# if defined(DOUBLE) | |||||
# define ELEN 64 | |||||
# define MLEN 64 | |||||
# else | |||||
# define ELEN 32 | |||||
# define MLEN 32 | |||||
# endif | |||||
#else | #else | ||||
#define VSETVL(n) vsetvl_e64m8(n) | |||||
#define VSETVL_MAX vsetvlmax_e64m1() | |||||
#define FLOAT_V_T vfloat64m8_t | |||||
#define VLEV_FLOAT vle64_v_f64m8 | |||||
#define VLSEV_FLOAT vlse64_v_f64m8 | |||||
#define VSEV_FLOAT vse64_v_f64m8 | |||||
#define VSSEV_FLOAT vsse64_v_f64m8 | |||||
# define LMUL m8 | |||||
# if defined(DOUBLE) | |||||
# define ELEN 64 | |||||
# define MLEN 16 | |||||
# else | |||||
# define ELEN 32 | |||||
# define MLEN 8 | |||||
# endif | |||||
#endif | #endif | ||||
#define _ | |||||
#define JOIN2_X(x, y) x ## y | |||||
#define JOIN2(x, y) JOIN2_X(x, y) | |||||
#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) | |||||
#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) | |||||
#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) | |||||
#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) | |||||
#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) | |||||
#define VSEV_FLOAT JOIN(__riscv_vse, ELEN, _v_f, ELEN, LMUL) | |||||
#define VSSEV_FLOAT JOIN(__riscv_vsse, ELEN, _v_f, ELEN, LMUL) | |||||
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | ||||
{ | { | ||||
BLASLONG i = 0, j = 0; | BLASLONG i = 0, j = 0; | ||||
BLASLONG ix = 0,iy = 0; | BLASLONG ix = 0,iy = 0; | ||||
BLASLONG stride_x, stride_y; | BLASLONG stride_x, stride_y; | ||||
FLOAT_V_T vx0, vx1, vy0, vy1; | FLOAT_V_T vx0, vx1, vy0, vy1; | ||||
unsigned int gvl = 0; | |||||
unsigned int gvl = VSETVL((inc_x != 0 && inc_y != 0) ? n : 1); | |||||
if( inc_x == 0 && inc_y == 0 ) { n = n & 1; } | |||||
if (n < 0) return(0); | if (n < 0) return(0); | ||||
if(inc_x == 1 && inc_y == 1){ | if(inc_x == 1 && inc_y == 1){ | ||||
gvl = VSETVL(n); | |||||
BLASLONG n2 = n * 2; | BLASLONG n2 = n * 2; | ||||
if(gvl <= n2/2){ | if(gvl <= n2/2){ | ||||
for(i=0,j=0; i<n2/(2*gvl); i++){ | for(i=0,j=0; i<n2/(2*gvl); i++){ | ||||
@@ -80,7 +95,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm | |||||
j += gvl; | j += gvl; | ||||
} | } | ||||
}else{ | }else{ | ||||
gvl = VSETVL(n); | |||||
stride_x = inc_x * 2 * sizeof(FLOAT); | stride_x = inc_x * 2 * sizeof(FLOAT); | ||||
stride_y = inc_y * 2 * sizeof(FLOAT); | stride_y = inc_y * 2 * sizeof(FLOAT); | ||||
BLASLONG inc_xv = inc_x * gvl * 2; | BLASLONG inc_xv = inc_x * gvl * 2; | ||||
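The other corner case handled above is inc_x == 0 && inc_y == 0 in swap: every iteration then exchanges the same x[0]/y[0] (and x[1]/y[1]) pair, and a swap is its own inverse, so n repetitions reduce to n & 1 repetitions; that is exactly the new `n = n & 1;` line. The scalar identity it relies on:

    /* Swapping the same pair n times has the same effect as n & 1 swaps. */
    static void swap_same_pair(long n, float *x, float *y)
    {
        if (n & 1) {
            float t = *x;
            *x = *y;
            *y = t;
        }
    }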
@@ -3121,6 +3121,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#endif | #endif | ||||
#ifdef RISCV64_ZVL256B | |||||
#define GEMM_DEFAULT_OFFSET_A 0 | |||||
#define GEMM_DEFAULT_OFFSET_B 0 | |||||
#define GEMM_DEFAULT_ALIGN 0x03fffUL | |||||
#define SGEMM_DEFAULT_UNROLL_M 16 | |||||
#define SGEMM_DEFAULT_UNROLL_N 8 | |||||
#define DGEMM_DEFAULT_UNROLL_M 8 | |||||
#define DGEMM_DEFAULT_UNROLL_N 8 | |||||
#define CGEMM_DEFAULT_UNROLL_M 8 | |||||
#define CGEMM_DEFAULT_UNROLL_N 8 | |||||
#define ZGEMM_DEFAULT_UNROLL_M 8 | |||||
#define ZGEMM_DEFAULT_UNROLL_N 4 | |||||
#define SGEMM_DEFAULT_P 128 | |||||
#define DGEMM_DEFAULT_P 64 | |||||
#define CGEMM_DEFAULT_P 64 | |||||
#define ZGEMM_DEFAULT_P 64 | |||||
#define SGEMM_DEFAULT_Q 128 | |||||
#define DGEMM_DEFAULT_Q 128 | |||||
#define CGEMM_DEFAULT_Q 128 | |||||
#define ZGEMM_DEFAULT_Q 64 | |||||
#define SGEMM_DEFAULT_R 16384 | |||||
#define DGEMM_DEFAULT_R 8192 | |||||
#define CGEMM_DEFAULT_R 8192 | |||||
#define ZGEMM_DEFAULT_R 4096 | |||||
#define SYMV_P 16 | |||||
#endif | |||||
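This parameter block registers the ZVL256B target with OpenBLAS's generic level-3 driver: the UNROLL_M/UNROLL_N values describe the register tile the micro-kernel computes, and P, Q, R are the cache-blocking sizes along the M, K, and N dimensions respectively. A hedged structural sketch of how such values drive a blocked GEMM (the real driver also packs panels and threads over blocks; micro_kernel here is a stand-in):

    /* Three-level loop nest shaped by the P/Q/R macros above. */
    static void micro_kernel(long mb, long nb, long kb)
    {
        (void)mb; (void)nb; (void)kb;  /* would compute UNROLL_M x UNROLL_N tiles */
    }

    static void gemm_blocked(long m, long n, long k, long P, long Q, long R)
    {
        for (long js = 0; js < n; js += R)            /* N blocked by R */
            for (long ls = 0; ls < k; ls += Q)        /* K blocked by Q */
                for (long is = 0; is < m; is += P)    /* M blocked by P */
                    micro_kernel(m - is < P ? m - is : P,
                                 n - js < R ? n - js : R,
                                 k - ls < Q ? k - ls : Q);
    }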
#ifdef ARMV7 | #ifdef ARMV7 | ||||
#define SNUMOPT 2 | #define SNUMOPT 2 | ||||
#define DNUMOPT 2 | #define DNUMOPT 2 | ||||