| @@ -55,6 +55,14 @@ ifeq ($(TARGET), C910V) | |||
| TARGET_FLAGS = -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d | |||
| endif | |||
# x280 target: -march selects rv64imafdc plus the V, Zba, Zbb and Zfh extensions; the ABI is lp64d.
| ifeq ($(TARGET), x280) | |||
| TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d | |||
| endif | |||
# RISCV64_GENERIC target: rv64imafdc only (no V in -march), same lp64d ABI.
| ifeq ($(TARGET), RISCV64_GENERIC) | |||
| TARGET_FLAGS = -march=rv64imafdc -mabi=lp64d | |||
| endif | |||
| all: getarch_2nd | |||
| ./getarch_2nd 0 >> $(TARGET_MAKE) | |||
| ./getarch_2nd 1 >> $(TARGET_CONF) | |||
| @@ -2,3 +2,11 @@ ifeq ($(CORE), C910V) | |||
| CCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 | |||
| FCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 -static | |||
| endif | |||
# x280 core: the C flags add an LLVM option setting the minimum vector length to 512 bits and -ffast-math;
# the Fortran flags reuse the same -march/-mabi and add -static.
| ifeq ($(CORE), x280) | |||
| CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -mllvm --riscv-v-vector-bits-min=512 -ffast-math | |||
| FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static | |||
| endif | |||
# RISCV64_GENERIC core: rv64imafdc for both compilers; only the Fortran flags add -static.
| ifeq ($(CORE), RISCV64_GENERIC) | |||
| CCOMMON_OPT += -march=rv64imafdc -mabi=lp64d | |||
| FCOMMON_OPT += -march=rv64imafdc -mabi=lp64d -static | |||
| endif | |||
| @@ -186,6 +186,11 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th | |||
| ``` | |||
| (also known to work on C906) | |||
| - **x280**: LLVM auto-vectorization using RISC-V Vector extension 1.0. | |||
| ```sh | |||
| make HOSTCC=gcc TARGET=x280 NUM_THREADS=8 CC=riscv64-unknown-linux-gnu-clang FC=riscv64-unknown-linux-gnu-gfortran | |||
| ``` | |||
| ### Support for multiple targets in a single library | |||
| OpenBLAS can be built for multiple targets with runtime detection of the target cpu by specifying `DYNAMIC_ARCH=1` in Makefile.rule, on the gmake command line or as `-DDYNAMIC_ARCH=TRUE` in cmake. | |||
| @@ -120,6 +120,7 @@ Z14 | |||
| 10.RISC-V 64: | |||
| RISCV64_GENERIC (e.g. PolarFire Soc/SiFive U54) | |||
| C910V | |||
| x280 | |||
| 11.LOONGARCH64: | |||
| LOONGSONGENERIC | |||
| @@ -95,4 +95,8 @@ static inline int blas_quickdivide(blasint x, blasint y){ | |||
| #include <riscv_vector.h> | |||
| #endif | |||
| #if defined(x280) | |||
| #include <riscv_vector.h> | |||
| #endif | |||
| #endif | |||
| @@ -72,10 +72,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define CPU_GENERIC 0 | |||
| #define CPU_C910V 1 | |||
| #define CPU_x280 2 | |||
/*
 * Printable core names. Entries are listed in the same order as the
 * CPU_GENERIC (0), CPU_C910V (1) and CPU_x280 (2) constants.
 *
 * Every entry needs its own trailing comma: two adjacent string literals
 * without one are concatenated by the compiler into a single element.
 */
static char *cpuname[] = {
    "RISCV64_GENERIC",
    "C910V",
    "x280"
};
| int detect(void){ | |||
| @@ -1677,6 +1677,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define LIBNAME "c910v" | |||
| #define CORENAME "C910V" | |||
| #endif | |||
| #endif | |||
/* Settings selected when the x280 target is forced via FORCE_x280: architecture
   and sub-architecture names, the sub-directory name, library/core names, and
   the -D option string (ARCHCONFIG) carrying the cache and TLB parameters. */
| #ifdef FORCE_x280 | |||
| #define FORCE | |||
| #define ARCHITECTURE "RISCV64" | |||
| #define SUBARCHITECTURE "x280" | |||
| #define SUBDIRNAME "riscv64" | |||
/* NOTE(review): L1_DATA_SIZE=64536 is not a power of two; it may be a typo for
   65536 (or another cache size) -- confirm against the x280 documentation. */
| #define ARCHCONFIG "-Dx280 " \ | |||
| "-DL1_DATA_SIZE=64536 -DL1_DATA_LINESIZE=32 " \ | |||
| "-DL2_SIZE=262144 -DL2_LINESIZE=32 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " | |||
| #define LIBNAME "x280" | |||
| #define CORENAME "x280" | |||
| #else | |||
| #endif | |||
| @@ -0,0 +1,235 @@ | |||
| # ********************************************************************************** | |||
| # Copyright (c) 2022, The OpenBLAS Project | |||
| # All rights reserved. | |||
| # Redistribution and use in source and binary forms, with or without | |||
| # modification, are permitted provided that the following conditions are | |||
| # met: | |||
| # 1. Redistributions of source code must retain the above copyright | |||
| # notice, this list of conditions and the following disclaimer. | |||
| # 2. Redistributions in binary form must reproduce the above copyright | |||
| # notice, this list of conditions and the following disclaimer in | |||
| # the documentation and/or other materials provided with the | |||
| # distribution. | |||
| # 3. Neither the name of the OpenBLAS project nor the names of | |||
| # its contributors may be used to endorse or promote products | |||
| # derived from this software without specific prior written permission. | |||
| # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| # ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| # USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| # ********************************************************************************** | |||
| SAMAXKERNEL = amax_rvv.c | |||
| DAMAXKERNEL = amax_rvv.c | |||
| CAMAXKERNEL = zamax_rvv.c | |||
| ZAMAXKERNEL = zamax_rvv.c | |||
| SAMINKERNEL = amin_rvv.c | |||
| DAMINKERNEL = amin_rvv.c | |||
| CAMINKERNEL = zamin_rvv.c | |||
| ZAMINKERNEL = zamin_rvv.c | |||
| SMAXKERNEL = max_rvv.c | |||
| DMAXKERNEL = max_rvv.c | |||
| SMINKERNEL = min_rvv.c | |||
| DMINKERNEL = min_rvv.c | |||
| ISAMAXKERNEL = iamax_rvv.c | |||
| IDAMAXKERNEL = iamax_rvv.c | |||
| ICAMAXKERNEL = izamax_rvv.c | |||
| IZAMAXKERNEL = izamax_rvv.c | |||
| ISAMINKERNEL = iamin_rvv.c | |||
| IDAMINKERNEL = iamin_rvv.c | |||
| ICAMINKERNEL = izamin_rvv.c | |||
| IZAMINKERNEL = izamin_rvv.c | |||
| ISMAXKERNEL = imax_rvv.c | |||
| IDMAXKERNEL = imax_rvv.c | |||
| ISMINKERNEL = imin_rvv.c | |||
| IDMINKERNEL = imin_rvv.c | |||
| SASUMKERNEL = asum_rvv.c | |||
| DASUMKERNEL = asum_rvv.c | |||
| CASUMKERNEL = zasum_rvv.c | |||
| ZASUMKERNEL = zasum_rvv.c | |||
| SSUMKERNEL = sum_rvv.c | |||
| DSUMKERNEL = sum_rvv.c | |||
| CSUMKERNEL = zsum_rvv.c | |||
| ZSUMKERNEL = zsum_rvv.c | |||
| SAXPYKERNEL = axpy_rvv.c | |||
| DAXPYKERNEL = axpy_rvv.c | |||
| CAXPYKERNEL = zaxpy_rvv.c | |||
| ZAXPYKERNEL = zaxpy_rvv.c | |||
| SAXPBYKERNEL = axpby_rvv.c | |||
| DAXPBYKERNEL = axpby_rvv.c | |||
| CAXPBYKERNEL = zaxpby_rvv.c | |||
| ZAXPBYKERNEL = zaxpby_rvv.c | |||
| SCOPYKERNEL = copy_rvv.c | |||
| DCOPYKERNEL = copy_rvv.c | |||
| CCOPYKERNEL = zcopy_rvv.c | |||
| ZCOPYKERNEL = zcopy_rvv.c | |||
| SDOTKERNEL = dot_rvv.c | |||
| DDOTKERNEL = dot_rvv.c | |||
| CDOTKERNEL = zdot_rvv.c | |||
| ZDOTKERNEL = zdot_rvv.c | |||
| DSDOTKERNEL = dot_rvv.c | |||
| SNRM2KERNEL = nrm2_rvv.c | |||
| DNRM2KERNEL = nrm2_rvv.c | |||
| CNRM2KERNEL = znrm2_rvv.c | |||
| ZNRM2KERNEL = znrm2_rvv.c | |||
| SROTKERNEL = rot_rvv.c | |||
| DROTKERNEL = rot_rvv.c | |||
| CROTKERNEL = zrot_rvv.c | |||
| ZROTKERNEL = zrot_rvv.c | |||
| SSCALKERNEL = scal_rvv.c | |||
| DSCALKERNEL = scal_rvv.c | |||
| CSCALKERNEL = zscal_rvv.c | |||
| ZSCALKERNEL = zscal_rvv.c | |||
| SSWAPKERNEL = swap_rvv.c | |||
| DSWAPKERNEL = swap_rvv.c | |||
| CSWAPKERNEL = zswap_rvv.c | |||
| ZSWAPKERNEL = zswap_rvv.c | |||
| SGEMVNKERNEL = gemv_n_rvv.c | |||
| DGEMVNKERNEL = gemv_n_rvv.c | |||
| CGEMVNKERNEL = zgemv_n_rvv.c | |||
| ZGEMVNKERNEL = zgemv_n_rvv.c | |||
| SGEMVTKERNEL = gemv_t_rvv.c | |||
| DGEMVTKERNEL = gemv_t_rvv.c | |||
| CGEMVTKERNEL = zgemv_t_rvv.c | |||
| ZGEMVTKERNEL = zgemv_t_rvv.c | |||
| CTRMMKERNEL = ztrmmkernel_2x2_rvv.c | |||
| ZTRMMKERNEL = ztrmmkernel_2x2_rvv.c | |||
# Single-precision GEMM / TRMM / SYMM kernel, copy-routine and object-name selection.
# These assignments take effect only when SGEMM_UNROLL_N equals 8.
| # SGEMM_UNROLL_N set in params.h | |||
| ifeq ($(SGEMM_UNROLL_N), 8) | |||
| # UNROLL_M is VLMAX | |||
| SGEMMKERNEL = gemmkernel_rvv_v1x8.c | |||
| SGEMMINCOPY = gemm_ncopy_rvv_v1.c | |||
| SGEMMITCOPY = gemm_tcopy_rvv_v1.c | |||
| SGEMMONCOPY = gemm_ncopy_$(SGEMM_UNROLL_N)_rvv.c | |||
| SGEMMOTCOPY = gemm_tcopy_$(SGEMM_UNROLL_N)_rvv.c | |||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| STRMMKERNEL = trmmkernel_rvv_v1x8.c | |||
| STRMMUNCOPY_M = trmm_uncopy_rvv_v1.c | |||
| STRMMLNCOPY_M = trmm_lncopy_rvv_v1.c | |||
| STRMMUTCOPY_M = trmm_utcopy_rvv_v1.c | |||
| STRMMLTCOPY_M = trmm_ltcopy_rvv_v1.c | |||
| SSYMMUCOPY_M = symm_ucopy_rvv_v1.c | |||
| SSYMMLCOPY_M = symm_lcopy_rvv_v1.c | |||
| endif | |||
# Double-precision GEMM / TRMM / SYMM kernel, copy-routine and object-name selection.
# These assignments take effect only when DGEMM_UNROLL_N equals 8.
| # DGEMM_UNROLL_N set in params.h | |||
| ifeq ($(DGEMM_UNROLL_N), 8) | |||
| # UNROLL_M is VLMAX | |||
| DGEMMKERNEL = gemmkernel_rvv_v1x8.c | |||
| DGEMMINCOPY = gemm_ncopy_rvv_v1.c | |||
| DGEMMITCOPY = gemm_tcopy_rvv_v1.c | |||
| DGEMMONCOPY = gemm_ncopy_$(DGEMM_UNROLL_N)_rvv.c | |||
| DGEMMOTCOPY = gemm_tcopy_$(DGEMM_UNROLL_N)_rvv.c | |||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| DTRMMKERNEL = trmmkernel_rvv_v1x8.c | |||
| DTRMMUNCOPY_M = trmm_uncopy_rvv_v1.c | |||
| DTRMMLNCOPY_M = trmm_lncopy_rvv_v1.c | |||
| DTRMMUTCOPY_M = trmm_utcopy_rvv_v1.c | |||
| DTRMMLTCOPY_M = trmm_ltcopy_rvv_v1.c | |||
| DSYMMUCOPY_M = symm_ucopy_rvv_v1.c | |||
| DSYMMLCOPY_M = symm_lcopy_rvv_v1.c | |||
| endif | |||
| CGEMMKERNEL = ../generic/zgemmkernel_2x2.c | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
| CGEMMONCOPYOBJ = cgemm_oncopy.o | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy.o | |||
| ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy.o | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy.o | |||
| STRSMKERNEL_LN = trsm_kernel_LN_rvv_v1.c | |||
| STRSMKERNEL_LT = trsm_kernel_LT_rvv_v1.c | |||
| STRSMKERNEL_RN = trsm_kernel_RN_rvv_v1.c | |||
| STRSMKERNEL_RT = trsm_kernel_RT_rvv_v1.c | |||
| DTRSMKERNEL_LN = trsm_kernel_LN_rvv_v1.c | |||
| DTRSMKERNEL_LT = trsm_kernel_LT_rvv_v1.c | |||
| DTRSMKERNEL_RN = trsm_kernel_RN_rvv_v1.c | |||
| DTRSMKERNEL_RT = trsm_kernel_RT_rvv_v1.c | |||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| TRSMCOPYLN_M = trsm_lncopy_rvv_v1.c | |||
| TRSMCOPYLT_M = trsm_ltcopy_rvv_v1.c | |||
| TRSMCOPYUN_M = trsm_uncopy_rvv_v1.c | |||
| TRSMCOPYUT_M = trsm_utcopy_rvv_v1.c | |||
| SSYMV_U_KERNEL = symv_U_rvv.c | |||
| SSYMV_L_KERNEL = symv_L_rvv.c | |||
| DSYMV_U_KERNEL = symv_U_rvv.c | |||
| DSYMV_L_KERNEL = symv_L_rvv.c | |||
| CSYMV_U_KERNEL = ../generic/zsymv_k.c | |||
| CSYMV_L_KERNEL = ../generic/zsymv_k.c | |||
| ZSYMV_U_KERNEL = ../generic/zsymv_k.c | |||
| ZSYMV_L_KERNEL = ../generic/zsymv_k.c | |||
| LSAME_KERNEL = ../generic/lsame.c | |||
| SCABS_KERNEL = ../generic/cabs.c | |||
| DCABS_KERNEL = ../generic/cabs.c | |||
| QCABS_KERNEL = ../generic/cabs.c | |||
# Default GEMM beta kernels. Each variable is assigned only if it is not already
# defined: real precisions use gemm_beta_rvv.c, complex precisions zgemm_beta_rvv.c.
| ifndef SGEMM_BETA | |||
| SGEMM_BETA = gemm_beta_rvv.c | |||
| endif | |||
| ifndef DGEMM_BETA | |||
| DGEMM_BETA = gemm_beta_rvv.c | |||
| endif | |||
| ifndef CGEMM_BETA | |||
| CGEMM_BETA = zgemm_beta_rvv.c | |||
| endif | |||
| ifndef ZGEMM_BETA | |||
| ZGEMM_BETA = zgemm_beta_rvv.c | |||
| endif | |||
| @@ -0,0 +1,102 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <float.h> | |||
/* Precision-dependent aliases: map the generic names used by the kernel onto
   the 32-bit (default) or 64-bit (DOUBLE) RVV intrinsics and types. The main
   vector type is the m8 variant; the *_M1 names are the m1 variants used for
   the final max reduction and scalar extraction. */
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) vsetvl_e32m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m8() | |||
| #define VSETVL_MAX_M1 vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLEV_FLOAT vle32_v_f32m8 | |||
| #define VLSEV_FLOAT vlse32_v_f32m8 | |||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFMAXVV_FLOAT vfmax_vv_f32m8 | |||
| #define VFABSV_FLOAT vfabs_v_f32m8 | |||
| #define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 | |||
| #else | |||
| #define VSETVL(n) vsetvl_e64m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m8() | |||
| #define VSETVL_MAX_M1 vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLEV_FLOAT vle64_v_f64m8 | |||
| #define VLSEV_FLOAT vlse64_v_f64m8 | |||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFMAXVV_FLOAT vfmax_vv_f64m8 | |||
| #define VFABSV_FLOAT vfabs_v_f64m8 | |||
| #define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 | |||
| #endif | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| FLOAT maxf = 0.0; | |||
| if (n <= 0 || inc_x <= 0) return(maxf); | |||
| FLOAT_V_T vx, vmax; | |||
| FLOAT_V_T_M1 v_res; | |||
| v_res = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); | |||
| size_t vlmax = VSETVL_MAX; | |||
| vmax = VFMVVF_FLOAT(0.0, vlmax); | |||
| if(inc_x == 1) { | |||
| for (size_t vl; n > 0; n -= vl, x += vl) { | |||
| vl = VSETVL(n); | |||
| vx = VLEV_FLOAT(x, vl); | |||
| vx = VFABSV_FLOAT(vx, vl); | |||
| vmax = VFMAXVV_FLOAT(vmax, vx, vl); | |||
| } | |||
| } else { | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { | |||
| vl = VSETVL(n); | |||
| vx = VLSEV_FLOAT(x, stride_x, vl); | |||
| vx = VFABSV_FLOAT(vx, vl); | |||
| vmax = VFMAXVV_FLOAT(vmax, vx, vl); | |||
| } | |||
| } | |||
| v_res = VFREDMAXVS_FLOAT(v_res, vmax, v_res, vlmax); | |||
| maxf = VFMVFS_FLOAT_M1(v_res); | |||
| return(maxf); | |||
| } | |||
| @@ -0,0 +1,102 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <float.h> | |||
/* Precision-dependent aliases: map the generic names used by the kernel onto
   the 32-bit (default) or 64-bit (DOUBLE) RVV intrinsics and types. The main
   vector type is the m8 variant; the *_M1 names are the m1 variants used for
   the final min reduction and scalar extraction. */
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) vsetvl_e32m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m8() | |||
| #define VSETVL_MAX_M1 vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLEV_FLOAT vle32_v_f32m8 | |||
| #define VLSEV_FLOAT vlse32_v_f32m8 | |||
| #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFMINVV_FLOAT vfmin_vv_f32m8 | |||
| #define VFABSV_FLOAT vfabs_v_f32m8 | |||
| #define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 | |||
| #else | |||
| #define VSETVL(n) vsetvl_e64m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m8() | |||
| #define VSETVL_MAX_M1 vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLEV_FLOAT vle64_v_f64m8 | |||
| #define VLSEV_FLOAT vlse64_v_f64m8 | |||
| #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFMINVV_FLOAT vfmin_vv_f64m8 | |||
| #define VFABSV_FLOAT vfabs_v_f64m8 | |||
| #define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 | |||
| #endif | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| FLOAT minf = 0.0; | |||
| if (n <= 0 || inc_x <= 0) return(minf); | |||
| FLOAT_V_T vx, vmin; | |||
| FLOAT_V_T_M1 v_res; | |||
| v_res = VFMVVF_FLOAT_M1(FLT_MAX, VSETVL_MAX_M1); | |||
| size_t vlmax = VSETVL_MAX; | |||
| vmin = VFMVVF_FLOAT(FLT_MAX, vlmax); | |||
| if(inc_x == 1) { | |||
| for (size_t vl; n > 0; n -= vl, x += vl) { | |||
| vl = VSETVL(n); | |||
| vx = VLEV_FLOAT(x, vl); | |||
| vx = VFABSV_FLOAT(vx, vl); | |||
| vmin = VFMINVV_FLOAT(vmin, vx, vl); | |||
| } | |||
| } else { | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { | |||
| vl = VSETVL(n); | |||
| vx = VLSEV_FLOAT(x, stride_x, vl); | |||
| vx = VFABSV_FLOAT(vx, vl); | |||
| vmin = VFMINVV_FLOAT(vmin, vx, vl); | |||
| } | |||
| } | |||
| v_res = VFREDMINVS_FLOAT(v_res, vmin, v_res, vlmax); | |||
| minf = VFMVFS_FLOAT_M1(v_res); | |||
| return(minf); | |||
| } | |||
| @@ -0,0 +1,99 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
/* Precision-dependent aliases: map the generic names used by the kernel onto
   the 32-bit (default) or 64-bit (DOUBLE) RVV intrinsics and types. The main
   vector type is the m8 variant; the *_M1 names are the m1 variants used for
   the final sum reduction and scalar extraction. */
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) vsetvl_e32m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m8() | |||
| #define VSETVL_MAX_M1 vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLEV_FLOAT vle32_v_f32m8 | |||
| #define VLSEV_FLOAT vlse32_v_f32m8 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||
| #define VFADDVV_FLOAT vfadd_vv_f32m8 | |||
| #define VFABSV_FLOAT vfabs_v_f32m8 | |||
| #define VFREDSUMVS_FLOAT vfredusum_vs_f32m8_f32m1 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 | |||
| #else | |||
| #define VSETVL(n) vsetvl_e64m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m8() | |||
| #define VSETVL_MAX_M1 vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLEV_FLOAT vle64_v_f64m8 | |||
| #define VLSEV_FLOAT vlse64_v_f64m8 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||
| #define VFADDVV_FLOAT vfadd_vv_f64m8 | |||
| #define VFABSV_FLOAT vfabs_v_f64m8 | |||
| #define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 | |||
| #endif | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| FLOAT asumf = 0.0; | |||
| if (n <= 0 || inc_x <= 0) return(asumf); | |||
| FLOAT_V_T vx, vsum; | |||
| FLOAT_V_T_M1 v_res; | |||
| v_res = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); | |||
| size_t vlmax = VSETVL_MAX; | |||
| vsum = VFMVVF_FLOAT(0.0, vlmax); | |||
| if(inc_x == 1) { | |||
| for (size_t vl; n > 0; n -= vl, x += vl) { | |||
| vl = VSETVL(n); | |||
| vx = VLEV_FLOAT(x, vl); | |||
| vx = VFABSV_FLOAT(vx, vl); | |||
| vsum = VFADDVV_FLOAT(vsum, vx, vl); | |||
| } | |||
| } else { | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { | |||
| vl = VSETVL(n); | |||
| vx = VLSEV_FLOAT(x, stride_x, vl); | |||
| vx = VFABSV_FLOAT(vx, vl); | |||
| vsum = VFADDVV_FLOAT(vsum, vx, vl); | |||
| } | |||
| } | |||
| v_res = VFREDSUMVS_FLOAT(v_res, vsum, v_res, vlmax); | |||
| asumf = VFMVFS_FLOAT_M1(v_res); | |||
| return(asumf); | |||
| } | |||
| @@ -0,0 +1,171 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
/* Precision-dependent aliases: map the generic names used by the kernel onto
   the 32-bit (default) or 64-bit (DOUBLE) m8 RVV intrinsics and types:
   unit-stride and strided loads/stores, vector-by-scalar multiply,
   multiply-accumulate, and scalar broadcast. */
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) vsetvl_e32m8(n) | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define VLEV_FLOAT vle32_v_f32m8 | |||
| #define VLSEV_FLOAT vlse32_v_f32m8 | |||
| #define VSEV_FLOAT vse32_v_f32m8 | |||
| #define VSSEV_FLOAT vsse32_v_f32m8 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f32m8 | |||
| #define VFMULVF_FLOAT vfmul_vf_f32m8 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||
| #else | |||
| #define VSETVL(n) vsetvl_e64m8(n) | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define VLEV_FLOAT vle64_v_f64m8 | |||
| #define VLSEV_FLOAT vlse64_v_f64m8 | |||
| #define VSEV_FLOAT vse64_v_f64m8 | |||
| #define VSSEV_FLOAT vsse64_v_f64m8 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f64m8 | |||
| #define VFMULVF_FLOAT vfmul_vf_f64m8 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||
| #endif | |||
| int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y) | |||
| { | |||
| FLOAT_V_T vx, vy; | |||
| if ( n < 0 ) return(0); | |||
| if ( beta == 0.0 ) { | |||
| if ( alpha == 0.0 ) { | |||
| if (1 == inc_y) { | |||
| memset(&y[0], 0, n * sizeof(FLOAT)); | |||
| } else { | |||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||
| size_t vl = VSETVL(n); | |||
| vy = VFMVVF_FLOAT(0.0, vl); | |||
| for ( ; n > 0; n -= vl, y += vl*stride_y) { | |||
| vl = VSETVL(n); | |||
| VSSEV_FLOAT(y, stride_y, vy, vl); | |||
| } | |||
| } | |||
| } else { | |||
| if ((1 == inc_x) && (1 == inc_y)) { | |||
| for (size_t vl; n > 0; n -= vl, x += vl, y += vl) { | |||
| vl = VSETVL(n); | |||
| vx = VLEV_FLOAT(x, vl); | |||
| vy = VFMULVF_FLOAT(vx, alpha, vl); | |||
| VSEV_FLOAT (y, vy, vl); | |||
| } | |||
| } else if (1 == inc_x) { | |||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { | |||
| vl = VSETVL(n); | |||
| vx = VLEV_FLOAT(x, vl); | |||
| vy = VFMULVF_FLOAT(vx, alpha, vl); | |||
| VSSEV_FLOAT (y, stride_y, vy, vl); | |||
| } | |||
| } else if (1 == inc_y) { | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { | |||
| vl = VSETVL(n); | |||
| vx = VLSEV_FLOAT(x, stride_x, vl); | |||
| vy = VFMULVF_FLOAT(vx, alpha, vl); | |||
| VSEV_FLOAT (y, vy, vl); | |||
| } | |||
| } else { | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { | |||
| vl = VSETVL(n); | |||
| vx = VLSEV_FLOAT(x, stride_x, vl); | |||
| vy = VFMULVF_FLOAT(vx, alpha, vl); | |||
| VSSEV_FLOAT (y, stride_y, vy, vl); | |||
| } | |||
| } | |||
| } | |||
| } else { | |||
| if ( alpha == 0.0 ) { | |||
| if (1 == inc_y) { | |||
| for (size_t vl; n > 0; n -= vl, y += vl) { | |||
| vl = VSETVL(n); | |||
| vy = VLEV_FLOAT(y, vl); | |||
| vy = VFMULVF_FLOAT(vy, beta, vl); | |||
| VSEV_FLOAT (y, vy, vl); | |||
| } | |||
| } else { | |||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, y += vl*inc_y) { | |||
| vl = VSETVL(n); | |||
| vy = VLSEV_FLOAT(y, stride_y, vl); | |||
| vy = VFMULVF_FLOAT(vy, beta, vl); | |||
| VSSEV_FLOAT (y, stride_y, vy, vl); | |||
| } | |||
| } | |||
| } else { | |||
| if ((1 == inc_x) && (1 == inc_y)) { | |||
| for (size_t vl; n > 0; n -= vl, y += vl) { | |||
| vl = VSETVL(n); | |||
| vy = VLEV_FLOAT(y, vl); | |||
| vy = VFMULVF_FLOAT(vy, beta, vl); | |||
| VSEV_FLOAT (y, vy, vl); | |||
| } | |||
| } else if (1 == inc_x) { | |||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { | |||
| vl = VSETVL(n); | |||
| vx = VLEV_FLOAT(x, vl); | |||
| vy = VLSEV_FLOAT(y, stride_y, vl); | |||
| vy = VFMULVF_FLOAT(vy, beta, vl); | |||
| vy = VFMACCVF_FLOAT(vy, alpha, vx, vl); | |||
| VSSEV_FLOAT (y, stride_y, vy, vl); | |||
| } | |||
| } else if (1 == inc_y) { | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { | |||
| vl = VSETVL(n); | |||
| vx = VLSEV_FLOAT(x, stride_x, vl); | |||
| vy = VLEV_FLOAT(y, vl); | |||
| vy = VFMULVF_FLOAT(vy, beta, vl); | |||
| vy = VFMACCVF_FLOAT(vy, alpha, vx, vl); | |||
| VSEV_FLOAT (y, vy, vl); | |||
| } | |||
| } else { | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { | |||
| vl = VSETVL(n); | |||
| vx = VLSEV_FLOAT(x, stride_x, vl); | |||
| vy = VLSEV_FLOAT(y, stride_y, vl); | |||
| vy = VFMULVF_FLOAT(vy, beta, vl); | |||
| vy = VFMACCVF_FLOAT(vy, alpha, vx, vl); | |||
| VSSEV_FLOAT (y, stride_y, vy, vl); | |||
| } | |||
| } | |||
| } | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -0,0 +1,109 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) vsetvl_e32m8(n) | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define VLEV_FLOAT vle32_v_f32m8 | |||
| #define VLSEV_FLOAT vlse32_v_f32m8 | |||
| #define VSEV_FLOAT vse32_v_f32m8 | |||
| #define VSSEV_FLOAT vsse32_v_f32m8 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f32m8 | |||
| #else | |||
| #define VSETVL(n) vsetvl_e64m8(n) | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define VLEV_FLOAT vle64_v_f64m8 | |||
| #define VLSEV_FLOAT vlse64_v_f64m8 | |||
| #define VSEV_FLOAT vse64_v_f64m8 | |||
| #define VSSEV_FLOAT vsse64_v_f64m8 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f64m8 | |||
| #endif | |||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||
| { | |||
| if ( n <= 0 ) return(0); | |||
| if ( da == 0.0 ) return(0); | |||
| FLOAT_V_T vx, vy; | |||
| if(inc_x == 1 && inc_y == 1) { | |||
| for (size_t vl; n > 0; n -= vl, x += vl, y += vl) { | |||
| vl = VSETVL(n); | |||
| vx = VLEV_FLOAT(x, vl); | |||
| vy = VLEV_FLOAT(y, vl); | |||
| vy = VFMACCVF_FLOAT(vy, da, vx, vl); | |||
| VSEV_FLOAT (y, vy, vl); | |||
| } | |||
| } else if (1 == inc_y) { | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { | |||
| vl = VSETVL(n); | |||
| vx = VLSEV_FLOAT(x, stride_x, vl); | |||
| vy = VLEV_FLOAT(y, vl); | |||
| vy = VFMACCVF_FLOAT(vy, da, vx, vl); | |||
| VSEV_FLOAT(y, vy, vl); | |||
| } | |||
| } else if (1 == inc_x) { | |||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { | |||
| vl = VSETVL(n); | |||
| vx = VLEV_FLOAT(x, vl); | |||
| vy = VLSEV_FLOAT(y, stride_y, vl); | |||
| vy = VFMACCVF_FLOAT(vy, da, vx, vl); | |||
| VSSEV_FLOAT(y, stride_y, vy, vl); | |||
| } | |||
| } else { | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { | |||
| vl = VSETVL(n); | |||
| vx = VLSEV_FLOAT(x, stride_x, vl); | |||
| vy = VLSEV_FLOAT(y, stride_y, vl); | |||
| vy = VFMACCVF_FLOAT(vy, da, vx, vl); | |||
| VSSEV_FLOAT(y, stride_y, vy, vl); | |||
| } | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -0,0 +1,94 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) vsetvl_e32m8(n) | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define VLEV_FLOAT vle32_v_f32m8 | |||
| #define VLSEV_FLOAT vlse32_v_f32m8 | |||
| #define VSEV_FLOAT vse32_v_f32m8 | |||
| #define VSSEV_FLOAT vsse32_v_f32m8 | |||
| #else | |||
| #define VSETVL(n) vsetvl_e64m8(n) | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define VLEV_FLOAT vle64_v_f64m8 | |||
| #define VLSEV_FLOAT vlse64_v_f64m8 | |||
| #define VSEV_FLOAT vse64_v_f64m8 | |||
| #define VSSEV_FLOAT vsse64_v_f64m8 | |||
| #endif | |||
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| { | |||
| if(n < 0) return(0); | |||
| FLOAT_V_T v0; | |||
| if(inc_x == 1 && inc_y == 1) { | |||
| for(size_t vl; n > 0; n -= vl, x += vl, y += vl) { | |||
| vl = VSETVL(n); | |||
| v0 = VLEV_FLOAT(x, vl); | |||
| VSEV_FLOAT(y, v0, vl); | |||
| } | |||
| } else if (inc_y == 1) { | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| for(size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { | |||
| vl = VSETVL(n); | |||
| v0 = VLSEV_FLOAT(x, stride_x, vl); | |||
| VSEV_FLOAT(y, v0, vl); | |||
| } | |||
| } else if(inc_x == 1) { | |||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||
| for(size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { | |||
| vl = VSETVL(n); | |||
| v0 = VLEV_FLOAT(x, vl); | |||
| VSSEV_FLOAT(y, stride_y, v0, vl); | |||
| } | |||
| } else { | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||
| for(size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { | |||
| vl = VSETVL(n); | |||
| v0 = VLSEV_FLOAT(x, stride_x, vl); | |||
| VSSEV_FLOAT(y, stride_y, v0, vl); | |||
| } | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -0,0 +1,126 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if defined(DSDOT) | |||
| double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| #else | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| #endif | |||
| { | |||
| double dot = 0.0; | |||
| if ( n <= 0 ) return(dot); | |||
| size_t vlmax = vsetvlmax_e64m8(); | |||
| vfloat64m8_t vr = vfmv_v_f_f64m8(0, vlmax); | |||
| if(inc_x == 1 && inc_y == 1) { | |||
| for (size_t vl; n > 0; n -= vl, x += vl, y += vl) { | |||
| vl = vsetvl_e64m8(n); | |||
| #if !defined(DOUBLE) | |||
| vfloat32m4_t vx = vle32_v_f32m4(x, vl); | |||
| vfloat32m4_t vy = vle32_v_f32m4(y, vl); | |||
| vr = vfwmacc_vv_f64m8(vr, vx, vy, vl); | |||
| #else | |||
| vfloat64m8_t vx = vle64_v_f64m8(x, vl); | |||
| vfloat64m8_t vy = vle64_v_f64m8(y, vl); | |||
| vr = vfmacc_vv_f64m8(vr, vx, vy, vl); | |||
| #endif | |||
| } | |||
| } else if (1 == inc_x) { | |||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { | |||
| vl = vsetvl_e64m8(n); | |||
| #if !defined(DOUBLE) | |||
| vfloat32m4_t vx = vle32_v_f32m4(x, vl); | |||
| vfloat32m4_t vy = vlse32_v_f32m4(y, stride_y, vl); | |||
| vr = vfwmacc_vv_f64m8(vr, vx, vy, vl); | |||
| #else | |||
| vfloat64m8_t vx = vle64_v_f64m8(x, vl); | |||
| vfloat64m8_t vy = vlse64_v_f64m8(y, stride_y, vl); | |||
| vr = vfmacc_vv_f64m8(vr, vx, vy, vl); | |||
| #endif | |||
| } | |||
| } else if (1 == inc_y) { | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { | |||
| vl = vsetvl_e64m8(n); | |||
| #if !defined(DOUBLE) | |||
| vfloat32m4_t vx = vlse32_v_f32m4(x, stride_x, vl); | |||
| vfloat32m4_t vy = vle32_v_f32m4(y, vl); | |||
| vr = vfwmacc_vv_f64m8(vr, vx, vy, vl); | |||
| #else | |||
| vfloat64m8_t vx = vlse64_v_f64m8(x, stride_x, vl); | |||
| vfloat64m8_t vy = vle64_v_f64m8(y, vl); | |||
| vr = vfmacc_vv_f64m8(vr, vx, vy, vl); | |||
| #endif | |||
| } | |||
| } else { | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { | |||
| vl = vsetvl_e64m8(n); | |||
| #if !defined(DOUBLE) | |||
| vfloat32m4_t vx = vlse32_v_f32m4(x, stride_x, vl); | |||
| vfloat32m4_t vy = vlse32_v_f32m4(y, stride_y, vl); | |||
| vr = vfwmacc_vv_f64m8(vr, vx, vy, vl); | |||
| #else | |||
| vfloat64m8_t vx = vlse64_v_f64m8(x, stride_x, vl); | |||
| vfloat64m8_t vy = vlse64_v_f64m8(y, stride_y, vl); | |||
| vr = vfmacc_vv_f64m8(vr, vx, vy, vl); | |||
| #endif | |||
| } | |||
| } | |||
| vfloat64m1_t vec_zero = vfmv_v_f_f64m1(0, vlmax); | |||
| vfloat64m1_t vec_sum = vfredusum_vs_f64m8_f64m1(vec_zero, vr, vec_zero, vlmax); | |||
| dot = vfmv_f_s_f64m1_f64(vec_sum); | |||
| return(dot); | |||
| } | |||
| @@ -0,0 +1,89 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) vsetvl_e32m8(n) | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define VLEV_FLOAT vle32_v_f32m8 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||
| #define VFMULVF_FLOAT vfmul_vf_f32m8 | |||
| #define VSEV_FLOAT vse32_v_f32m8 | |||
| #else | |||
| #define VSETVL(n) vsetvl_e64m8(n) | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define VLEV_FLOAT vle64_v_f64m8 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||
| #define VFMULVF_FLOAT vfmul_vf_f64m8 | |||
| #define VSEV_FLOAT vse64_v_f64m8 | |||
| #endif | |||
| // Optimizes the implementation in ../generic/gemm_beta.c | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, | |||
| IFLOAT *dummy2, BLASLONG dummy3, IFLOAT *dummy4, BLASLONG dummy5, | |||
| FLOAT *c, BLASLONG ldc) | |||
| { | |||
| BLASLONG chunk; | |||
| FLOAT *c_offset; | |||
| size_t vl; | |||
| FLOAT_V_T vx; | |||
| if (beta == ZERO) { | |||
| vl = VSETVL(m); | |||
| vx = VFMVVF_FLOAT(0.0, vl); | |||
| for( ; n > 0; n--, c += ldc) { | |||
| c_offset = c; | |||
| for(chunk=m; chunk > 0; chunk -= vl, c_offset += vl) { | |||
| vl = VSETVL(chunk); | |||
| VSEV_FLOAT(c_offset, vx, vl); | |||
| } | |||
| } | |||
| } else { | |||
| for( ; n > 0; n--, c += ldc) { | |||
| c_offset = c; | |||
| for(chunk=m; chunk > 0; chunk -= vl, c_offset += vl) { | |||
| vl = VSETVL(chunk); | |||
| vx = VLEV_FLOAT(c_offset, vl); | |||
| vx = VFMULVF_FLOAT(vx, beta, vl); | |||
| VSEV_FLOAT(c_offset, vx, vl); | |||
| } | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,164 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) vsetvl_e32m1(n) | |||
| #define FLOAT_V_T vfloat32m1_t | |||
| #define VLEV_FLOAT vle32_v_f32m1 | |||
| #define VSEV_FLOAT vse32_v_f32m1 | |||
| #define VSSEG2_FLOAT vsseg2e32_v_f32m1 | |||
| #define VSSEG4_FLOAT vsseg4e32_v_f32m1 | |||
| #define VSSEG8_FLOAT vsseg8e32_v_f32m1 | |||
| #else | |||
| #define VSETVL(n) vsetvl_e64m1(n) | |||
| #define FLOAT_V_T vfloat64m1_t | |||
| #define VLEV_FLOAT vle64_v_f64m1 | |||
| #define VSEV_FLOAT vse64_v_f64m1 | |||
| #define VSSEG2_FLOAT vsseg2e64_v_f64m1 | |||
| #define VSSEG4_FLOAT vsseg4e64_v_f64m1 | |||
| #define VSSEG8_FLOAT vsseg8e64_v_f64m1 | |||
| #endif | |||
| // Optimizes the implementation in ../generic/gemm_ncopy_8.c | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) | |||
| { | |||
| BLASLONG i, j; | |||
| FLOAT *a_offset; | |||
| FLOAT *a_offset1, *a_offset2, *a_offset3, *a_offset4; | |||
| FLOAT *a_offset5, *a_offset6, *a_offset7, *a_offset8; | |||
| FLOAT *b_offset; | |||
| FLOAT_V_T v1, v2, v3, v4, v5, v6, v7, v8; | |||
| size_t vl; | |||
| //fprintf(stderr, "gemm_ncopy_8 m=%ld n=%ld lda=%ld\n", m, n, lda); | |||
| a_offset = a; | |||
| b_offset = b; | |||
| for(j = (n >> 3); j > 0; j--) { | |||
| a_offset1 = a_offset; | |||
| a_offset2 = a_offset1 + lda; | |||
| a_offset3 = a_offset2 + lda; | |||
| a_offset4 = a_offset3 + lda; | |||
| a_offset5 = a_offset4 + lda; | |||
| a_offset6 = a_offset5 + lda; | |||
| a_offset7 = a_offset6 + lda; | |||
| a_offset8 = a_offset7 + lda; | |||
| a_offset += 8 * lda; | |||
| for(i = m; i > 0; i -= vl) { | |||
| vl = VSETVL(i); | |||
| v1 = VLEV_FLOAT(a_offset1, vl); | |||
| v2 = VLEV_FLOAT(a_offset2, vl); | |||
| v3 = VLEV_FLOAT(a_offset3, vl); | |||
| v4 = VLEV_FLOAT(a_offset4, vl); | |||
| v5 = VLEV_FLOAT(a_offset5, vl); | |||
| v6 = VLEV_FLOAT(a_offset6, vl); | |||
| v7 = VLEV_FLOAT(a_offset7, vl); | |||
| v8 = VLEV_FLOAT(a_offset8, vl); | |||
| VSSEG8_FLOAT(b_offset, v1, v2, v3, v4, v5, v6, v7, v8, vl); | |||
| a_offset1 += vl; | |||
| a_offset2 += vl; | |||
| a_offset3 += vl; | |||
| a_offset4 += vl; | |||
| a_offset5 += vl; | |||
| a_offset6 += vl; | |||
| a_offset7 += vl; | |||
| a_offset8 += vl; | |||
| b_offset += vl*8; | |||
| } | |||
| } | |||
| if (n & 4) { | |||
| a_offset1 = a_offset; | |||
| a_offset2 = a_offset1 + lda; | |||
| a_offset3 = a_offset2 + lda; | |||
| a_offset4 = a_offset3 + lda; | |||
| a_offset += 4 * lda; | |||
| for(i = m; i > 0; i -= vl) { | |||
| vl = VSETVL(i); | |||
| v1 = VLEV_FLOAT(a_offset1, vl); | |||
| v2 = VLEV_FLOAT(a_offset2, vl); | |||
| v3 = VLEV_FLOAT(a_offset3, vl); | |||
| v4 = VLEV_FLOAT(a_offset4, vl); | |||
| VSSEG4_FLOAT(b_offset, v1, v2, v3, v4, vl); | |||
| a_offset1 += vl; | |||
| a_offset2 += vl; | |||
| a_offset3 += vl; | |||
| a_offset4 += vl; | |||
| b_offset += vl*4; | |||
| } | |||
| } | |||
| if (n & 2) { | |||
| a_offset1 = a_offset; | |||
| a_offset2 = a_offset1 + lda; | |||
| a_offset += 2 * lda; | |||
| for(i = m; i > 0; i -= vl) { | |||
| vl = VSETVL(i); | |||
| v1 = VLEV_FLOAT(a_offset1, vl); | |||
| v2 = VLEV_FLOAT(a_offset2, vl); | |||
| VSSEG2_FLOAT(b_offset, v1, v2, vl); | |||
| a_offset1 += vl; | |||
| a_offset2 += vl; | |||
| b_offset += vl*2; | |||
| } | |||
| } | |||
| if (n & 1) { | |||
| a_offset1 = a_offset; | |||
| for(i = m; i > 0; i -= vl) { | |||
| vl = VSETVL(i); | |||
| v1 = VLEV_FLOAT(a_offset1, vl); | |||
| VSEV_FLOAT(b_offset, v1, vl); | |||
| a_offset1 += vl; | |||
| b_offset += vl; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,76 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) vsetvl_e32m2(n) | |||
| #define FLOAT_V_T vfloat32m2_t | |||
| #define VLEV_FLOAT vle32_v_f32m2 | |||
| #define VLSEV_FLOAT vlse32_v_f32m2 | |||
| #define VSEV_FLOAT vse32_v_f32m2 | |||
| #else | |||
| #define VSETVL(n) vsetvl_e64m2(n) | |||
| #define FLOAT_V_T vfloat64m2_t | |||
| #define VLEV_FLOAT vle64_v_f64m2 | |||
| #define VLSEV_FLOAT vlse64_v_f64m2 | |||
| #define VSEV_FLOAT vse64_v_f64m2 | |||
| #endif | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) | |||
| { | |||
| BLASLONG i, j; | |||
| FLOAT *a_offset; | |||
| FLOAT *a_offset1; | |||
| FLOAT *b_offset; | |||
| FLOAT_V_T v0; | |||
| size_t vl; | |||
| //fprintf(stderr, "%s, m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda); | |||
| a_offset = a; | |||
| b_offset = b; | |||
| for(j = n; j > 0; j -= vl) { | |||
| vl = VSETVL(j); | |||
| a_offset1 = a_offset; | |||
| a_offset += vl * lda; | |||
| for(i = m; i > 0; i--) { | |||
| v0 = VLSEV_FLOAT(a_offset1, lda * sizeof(FLOAT), vl); | |||
| VSEV_FLOAT(b_offset, v0, vl); | |||
| a_offset1++; | |||
| b_offset += vl; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,264 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) vsetvl_e32m1(n) | |||
| #define FLOAT_V_T vfloat32m1_t | |||
| #define VLEV_FLOAT vle32_v_f32m1 | |||
| #define VLSEV_FLOAT vlse32_v_f32m1 | |||
| #define VSEV_FLOAT vse32_v_f32m1 | |||
| #define VLSSEG2_FLOAT vlsseg2e32_v_f32m1 | |||
| #define VSSEG2_FLOAT vsseg2e32_v_f32m1 | |||
| #define VLSSEG4_FLOAT vlsseg4e32_v_f32m1 | |||
| #define VSSEG4_FLOAT vsseg4e32_v_f32m1 | |||
| #define VLSSEG8_FLOAT vlsseg8e32_v_f32m1 | |||
| #define VSSEG8_FLOAT vsseg8e32_v_f32m1 | |||
| #else | |||
| #define VSETVL(n) vsetvl_e64m1(n) | |||
| #define FLOAT_V_T vfloat64m1_t | |||
| #define VLEV_FLOAT vle64_v_f64m1 | |||
| #define VLSEV_FLOAT vlse64_v_f64m1 | |||
| #define VSEV_FLOAT vse64_v_f64m1 | |||
| #define VLSSEG2_FLOAT vlsseg2e64_v_f64m1 | |||
| #define VSSEG2_FLOAT vsseg2e64_v_f64m1 | |||
| #define VLSSEG4_FLOAT vlsseg4e64_v_f64m1 | |||
| #define VSSEG4_FLOAT vsseg4e64_v_f64m1 | |||
| #define VLSSEG8_FLOAT vlsseg8e64_v_f64m1 | |||
| #define VSSEG8_FLOAT vsseg8e64_v_f64m1 | |||
| #endif | |||
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) | |||
| { | |||
| BLASLONG i, j; | |||
| IFLOAT *aoffset; | |||
| IFLOAT *aoffset1; | |||
| IFLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4; | |||
| FLOAT_V_T v0, v1, v2, v3, v4, v5, v6, v7; | |||
| // fprintf(stderr, "gemm_tcopy_8 m=%ld n=%ld lda=%ld\n", m, n, lda); | |||
| aoffset = a; | |||
| boffset = b; | |||
| boffset2 = b + m * (n & ~7); | |||
| boffset3 = b + m * (n & ~3); | |||
| boffset4 = b + m * (n & ~1); | |||
| for(j = (m >> 3); j > 0; j--) { | |||
| aoffset1 = aoffset; | |||
| aoffset += 8 * lda; | |||
| boffset1 = boffset; | |||
| boffset += 64; | |||
| for(i = (n >> 3); i > 0; i--) { | |||
| size_t vl = 8; | |||
| VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, aoffset1, lda * sizeof(FLOAT), vl); | |||
| VSSEG8_FLOAT(boffset1, v0, v1, v2, v3, v4, v5, v6, v7, vl); | |||
| aoffset1 += 8; | |||
| boffset1 += m * 8; | |||
| } | |||
| if (n & 4) { | |||
| size_t vl = 8; | |||
| VLSSEG4_FLOAT(&v0, &v1, &v2, &v3, aoffset1, lda * sizeof(FLOAT), vl); | |||
| VSSEG4_FLOAT(boffset2, v0, v1, v2, v3, vl); | |||
| aoffset1 += 4; | |||
| boffset2 += 32; | |||
| } | |||
| if (n & 2) { | |||
| size_t vl = 8; | |||
| VLSSEG2_FLOAT(&v0, &v1, aoffset1, lda * sizeof(FLOAT), vl); | |||
| VSSEG2_FLOAT(boffset3, v0, v1, vl); | |||
| aoffset1 += 2; | |||
| boffset3 += 16; | |||
| } | |||
| if (n & 1) { | |||
| size_t vl = 8; | |||
| v0 = VLSEV_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); | |||
| VSEV_FLOAT(boffset4, v0, vl); | |||
| aoffset1 += 1; | |||
| boffset4 += 8; | |||
| } | |||
| } | |||
| if (m & 4) { | |||
| aoffset1 = aoffset; | |||
| aoffset += 4 * lda; | |||
| boffset1 = boffset; | |||
| boffset += 32; | |||
| for(i = (n >> 3); i > 0; i--) { | |||
| size_t vl = 4; | |||
| VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, aoffset1, lda * sizeof(FLOAT), vl); | |||
| VSSEG8_FLOAT(boffset1, v0, v1, v2, v3, v4, v5, v6, v7, vl); | |||
| aoffset1 += 8; | |||
| boffset1 += m * 8; | |||
| } | |||
| if (n & 4) { | |||
| size_t vl = 4; | |||
| VLSSEG4_FLOAT(&v0, &v1, &v2, &v3, aoffset1, lda * sizeof(FLOAT), vl); | |||
| VSSEG4_FLOAT(boffset2, v0, v1, v2, v3, vl); | |||
| aoffset1 += 4; | |||
| boffset2 += 16; | |||
| } | |||
| if (n & 2) { | |||
| size_t vl = 4; | |||
| VLSSEG2_FLOAT(&v0, &v1, aoffset1, lda * sizeof(FLOAT), vl); | |||
| VSSEG2_FLOAT(boffset3, v0, v1, vl); | |||
| aoffset1 += 2; | |||
| boffset3 += 8; | |||
| } | |||
| if (n & 1) { | |||
| size_t vl = 4; | |||
| v0 = VLSEV_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); | |||
| VSEV_FLOAT(boffset4, v0, vl); | |||
| aoffset1 += 1; | |||
| boffset4 += 4; | |||
| } | |||
| } | |||
| if (m & 2) { | |||
| aoffset1 = aoffset; | |||
| aoffset += 2 * lda; | |||
| boffset1 = boffset; | |||
| boffset += 16; | |||
| for(i = (n >> 3); i > 0; i--) { | |||
| size_t vl = 2; | |||
| VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, aoffset1, lda * sizeof(FLOAT), vl); | |||
| VSSEG8_FLOAT(boffset1, v0, v1, v2, v3, v4, v5, v6, v7, vl); | |||
| aoffset1 += 8; | |||
| boffset1 += m * 8; | |||
| } | |||
| if (n & 4) { | |||
| size_t vl = 2; | |||
| VLSSEG4_FLOAT(&v0, &v1, &v2, &v3, aoffset1, lda * sizeof(FLOAT), vl); | |||
| VSSEG4_FLOAT(boffset2, v0, v1, v2, v3, vl); | |||
| aoffset1 += 4; | |||
| boffset2 += 8; | |||
| } | |||
| if (n & 2) { | |||
| size_t vl = 2; | |||
| VLSSEG2_FLOAT(&v0, &v1, aoffset1, lda * sizeof(FLOAT), vl); | |||
| VSSEG2_FLOAT(boffset3, v0, v1, vl); | |||
| aoffset1 += 2; | |||
| boffset3 += 4; | |||
| } | |||
| if (n & 1) { | |||
| size_t vl = 2; | |||
| v0 = VLSEV_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); | |||
| VSEV_FLOAT(boffset4, v0, vl); | |||
| aoffset1 += 1; | |||
| boffset4 += 2; | |||
| } | |||
| } | |||
| if (m & 1) { | |||
| aoffset1 = aoffset; | |||
| boffset1 = boffset; | |||
| for(i = (n >> 3); i > 0; i--) { | |||
| size_t vl = 8; | |||
| v0 = VLEV_FLOAT(aoffset1, vl); | |||
| VSEV_FLOAT(boffset1, v0, vl); | |||
| aoffset1 += 8; | |||
| boffset1 += 8 * m; | |||
| } | |||
| if (n & 4) { | |||
| size_t vl = 4; | |||
| v0 = VLEV_FLOAT(aoffset1, vl); | |||
| VSEV_FLOAT(boffset2, v0, vl); | |||
| aoffset1 += 4; | |||
| //boffset2 += 4; | |||
| } | |||
| if (n & 2) { | |||
| size_t vl = 2; | |||
| v0 = VLEV_FLOAT(aoffset1, vl); | |||
| VSEV_FLOAT(boffset3, v0, vl); | |||
| aoffset1 += 2; | |||
| // boffset3 += 2; | |||
| } | |||
| if (n & 1) { | |||
| *(boffset4) = *(aoffset1); | |||
| // aoffset1 ++; | |||
| // boffset4 ++; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,74 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
/*
 * Precision dispatch for the copy kernel below: every macro maps to the
 * m2 register-group variant of the corresponding RVV intrinsic, using the
 * 64-bit element forms when DOUBLE is defined and the 32-bit forms otherwise.
 */
#if defined(DOUBLE)
#define VSETVL(n)  vsetvl_e64m2(n)
#define FLOAT_V_T  vfloat64m2_t
#define VLEV_FLOAT vle64_v_f64m2   /* unit-stride load  */
#define VSEV_FLOAT vse64_v_f64m2   /* unit-stride store */
#else
#define VSETVL(n)  vsetvl_e32m2(n)
#define FLOAT_V_T  vfloat32m2_t
#define VLEV_FLOAT vle32_v_f32m2   /* unit-stride load  */
#define VSEV_FLOAT vse32_v_f32m2   /* unit-stride store */
#endif
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) | |||
| { | |||
| BLASLONG i, j; | |||
| IFLOAT *aoffset; | |||
| IFLOAT *aoffset1; | |||
| IFLOAT *boffset; | |||
| FLOAT_V_T v0; | |||
| size_t vl; | |||
| //fprintf(stderr, "%s, m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda); | |||
| aoffset = a; | |||
| boffset = b; | |||
| for(j = n; j > 0; j -= vl) { | |||
| vl = VSETVL(j); | |||
| aoffset1 = aoffset; | |||
| aoffset += vl; | |||
| for(i = m; i > 0; i--) { | |||
| v0 = VLEV_FLOAT(aoffset1, vl); | |||
| VSEV_FLOAT(boffset, v0, vl); | |||
| aoffset1 += lda; | |||
| boffset += vl; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,601 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
/*
 * Precision dispatch for the GEMM micro-kernel below: every macro maps to
 * the m2 register-group variant of the corresponding RVV intrinsic, using
 * the 64-bit element forms when DOUBLE is defined and the 32-bit forms
 * otherwise.
 */
#if defined(DOUBLE)
#define VSETVL(n)      vsetvl_e64m2(n)
#define FLOAT_V_T      vfloat64m2_t
#define VLEV_FLOAT     vle64_v_f64m2     /* unit-stride load            */
#define VSEV_FLOAT     vse64_v_f64m2     /* unit-stride store           */
#define VFMVVF_FLOAT   vfmv_v_f_f64m2    /* broadcast scalar to vector  */
#define VFMACCVF_FLOAT vfmacc_vf_f64m2   /* vd += scalar * vs           */
#else
#define VSETVL(n)      vsetvl_e32m2(n)
#define FLOAT_V_T      vfloat32m2_t
#define VLEV_FLOAT     vle32_v_f32m2     /* unit-stride load            */
#define VSEV_FLOAT     vse32_v_f32m2     /* unit-stride store           */
#define VFMVVF_FLOAT   vfmv_v_f_f32m2    /* broadcast scalar to vector  */
#define VFMACCVF_FLOAT vfmacc_vf_f32m2   /* vd += scalar * vs           */
#endif
| int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha, IFLOAT* ba, IFLOAT* bb, FLOAT* C, BLASLONG ldc | |||
| #ifdef TRMMKERNEL | |||
| ,BLASLONG offset | |||
| #endif | |||
| ) | |||
| { | |||
| BLASLONG i,j,k; | |||
| FLOAT *C0,*C1,*C2,*C3,*C4,*C5,*C6,*C7; | |||
| IFLOAT *ptrba,*ptrbb; | |||
| //fprintf(stderr, "%s, bm=%ld bn=%ld bk=%ld alpha=%f ldc=%ld\n", __FUNCTION__, bm, bn, bk, alpha, ldc); // Debug | |||
| FLOAT_V_T va0, va1, va2, va3, va4, va5, va6, va7; | |||
| FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7; | |||
| size_t vl; | |||
| // N:8 | |||
| for (j = bn/8; j > 0; j--) { | |||
| C0 = C; | |||
| C1 = C0 + ldc; | |||
| C2 = C1 + ldc; | |||
| C3 = C2 + ldc; | |||
| C4 = C3 + ldc; | |||
| C5 = C4 + ldc; | |||
| C6 = C5 + ldc; | |||
| C7 = C6 + ldc; | |||
| ptrba = ba; | |||
| for (i = bm; i > 0; i -= vl) { | |||
| vl = VSETVL(i); | |||
| ptrbb = bb; | |||
| vres0 = VFMVVF_FLOAT(0.0, vl); | |||
| vres1 = VFMVVF_FLOAT(0.0, vl); | |||
| vres2 = VFMVVF_FLOAT(0.0, vl); | |||
| vres3 = VFMVVF_FLOAT(0.0, vl); | |||
| vres4 = VFMVVF_FLOAT(0.0, vl); | |||
| vres5 = VFMVVF_FLOAT(0.0, vl); | |||
| vres6 = VFMVVF_FLOAT(0.0, vl); | |||
| vres7 = VFMVVF_FLOAT(0.0, vl); | |||
| #if 0 | |||
| for (k = bk; k > 0; k--) { | |||
| va0 = VLEV_FLOAT(ptrba, vl); | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); | |||
| vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl); | |||
| vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl); | |||
| vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl); | |||
| vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl); | |||
| ptrba += vl; | |||
| ptrbb += 8; | |||
| } | |||
| #else | |||
| // Unroll K | |||
| for (k = bk/8; k > 0; k--) { | |||
| va0 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| va1 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); | |||
| vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl); | |||
| vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl); | |||
| vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl); | |||
| vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl); | |||
| ptrbb += 8; | |||
| va2 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va1, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va1, vl); | |||
| vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va1, vl); | |||
| vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va1, vl); | |||
| vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va1, vl); | |||
| vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va1, vl); | |||
| ptrbb += 8; | |||
| va3 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va2, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va2, vl); | |||
| vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va2, vl); | |||
| vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va2, vl); | |||
| vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va2, vl); | |||
| vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va2, vl); | |||
| ptrbb += 8; | |||
| va4 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va3, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va3, vl); | |||
| vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va3, vl); | |||
| vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va3, vl); | |||
| vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va3, vl); | |||
| vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va3, vl); | |||
| ptrbb += 8; | |||
| va5 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va4, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va4, vl); | |||
| vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va4, vl); | |||
| vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va4, vl); | |||
| vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va4, vl); | |||
| vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va4, vl); | |||
| ptrbb += 8; | |||
| va6 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va5, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va5, vl); | |||
| vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va5, vl); | |||
| vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va5, vl); | |||
| vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va5, vl); | |||
| vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va5, vl); | |||
| ptrbb += 8; | |||
| va7 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va6, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va6, vl); | |||
| vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va6, vl); | |||
| vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va6, vl); | |||
| vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va6, vl); | |||
| vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va6, vl); | |||
| ptrbb += 8; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va7, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va7, vl); | |||
| vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va7, vl); | |||
| vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va7, vl); | |||
| vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va7, vl); | |||
| vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va7, vl); | |||
| ptrbb += 8; | |||
| } | |||
| // K remainder | |||
| for (k = bk&7; k > 0; k--) { | |||
| va0 = VLEV_FLOAT(ptrba, vl); | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); | |||
| vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl); | |||
| vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl); | |||
| vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl); | |||
| vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl); | |||
| ptrbb += 8; | |||
| ptrba += vl; | |||
| } | |||
| #endif | |||
| va0 = VLEV_FLOAT(C0, vl); | |||
| va0 = VFMACCVF_FLOAT(va0, alpha, vres0, vl); | |||
| VSEV_FLOAT(C0, va0, vl); | |||
| va1 = VLEV_FLOAT(C1, vl); | |||
| va1 = VFMACCVF_FLOAT(va1, alpha, vres1, vl); | |||
| VSEV_FLOAT(C1, va1, vl); | |||
| va2 = VLEV_FLOAT(C2, vl); | |||
| va2 = VFMACCVF_FLOAT(va2, alpha, vres2, vl); | |||
| VSEV_FLOAT(C2, va2, vl); | |||
| va3 = VLEV_FLOAT(C3, vl); | |||
| va3 = VFMACCVF_FLOAT(va3, alpha, vres3, vl); | |||
| VSEV_FLOAT(C3, va3, vl); | |||
| va4 = VLEV_FLOAT(C4, vl); | |||
| va4 = VFMACCVF_FLOAT(va4, alpha, vres4, vl); | |||
| VSEV_FLOAT(C4, va4, vl); | |||
| va5 = VLEV_FLOAT(C5, vl); | |||
| va5 = VFMACCVF_FLOAT(va5, alpha, vres5, vl); | |||
| VSEV_FLOAT(C5, va5, vl); | |||
| va6 = VLEV_FLOAT(C6, vl); | |||
| va6 = VFMACCVF_FLOAT(va6, alpha, vres6, vl); | |||
| VSEV_FLOAT(C6, va6, vl); | |||
| va7 = VLEV_FLOAT(C7, vl); | |||
| va7 = VFMACCVF_FLOAT(va7, alpha, vres7, vl); | |||
| VSEV_FLOAT(C7, va7, vl); | |||
| C0 += vl; | |||
| C1 += vl; | |||
| C2 += vl; | |||
| C3 += vl; | |||
| C4 += vl; | |||
| C5 += vl; | |||
| C6 += vl; | |||
| C7 += vl; | |||
| } | |||
| bb += (bk<<3); | |||
| C += (ldc<<3); | |||
| } | |||
| // N:4 | |||
| if (bn & 4) { | |||
| C0 = C; | |||
| C1 = C0 + ldc; | |||
| C2 = C1 + ldc; | |||
| C3 = C2 + ldc; | |||
| ptrba = ba; | |||
| for (i = bm; i > 0; i -= vl) { | |||
| vl = VSETVL(i); | |||
| ptrbb = bb; | |||
| vres0 = VFMVVF_FLOAT(0.0, vl); | |||
| vres1 = VFMVVF_FLOAT(0.0, vl); | |||
| vres2 = VFMVVF_FLOAT(0.0, vl); | |||
| vres3 = VFMVVF_FLOAT(0.0, vl); | |||
| #if 0 | |||
| for (k = bk; k > 0; k--) { | |||
| va0 = VLEV_FLOAT(ptrba, vl); | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); | |||
| ptrba += vl; | |||
| ptrbb += 4; | |||
| } | |||
| #else | |||
| // Unroll K | |||
| for (k = bk/8; k > 0; k--) { | |||
| va0 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| va1 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); | |||
| ptrbb += 4; | |||
| va2 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va1, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va1, vl); | |||
| ptrbb += 4; | |||
| va3 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va2, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va2, vl); | |||
| ptrbb += 4; | |||
| va4 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va3, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va3, vl); | |||
| ptrbb += 4; | |||
| va5 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va4, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va4, vl); | |||
| ptrbb += 4; | |||
| va6 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va5, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va5, vl); | |||
| ptrbb += 4; | |||
| va7 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va6, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va6, vl); | |||
| ptrbb += 4; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va7, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va7, vl); | |||
| ptrbb += 4; | |||
| } | |||
| // K remainder | |||
| for (k = bk&7; k > 0; k--) { | |||
| va0 = VLEV_FLOAT(ptrba, vl); | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); | |||
| ptrbb += 4; | |||
| ptrba += vl; | |||
| } | |||
| #endif | |||
| va0 = VLEV_FLOAT(C0, vl); | |||
| va0 = VFMACCVF_FLOAT(va0, alpha, vres0, vl); | |||
| VSEV_FLOAT(C0, va0, vl); | |||
| va1 = VLEV_FLOAT(C1, vl); | |||
| va1 = VFMACCVF_FLOAT(va1, alpha, vres1, vl); | |||
| VSEV_FLOAT(C1, va1, vl); | |||
| va2 = VLEV_FLOAT(C2, vl); | |||
| va2 = VFMACCVF_FLOAT(va2, alpha, vres2, vl); | |||
| VSEV_FLOAT(C2, va2, vl); | |||
| va3 = VLEV_FLOAT(C3, vl); | |||
| va3 = VFMACCVF_FLOAT(va3, alpha, vres3, vl); | |||
| VSEV_FLOAT(C3, va3, vl); | |||
| C0 += vl; | |||
| C1 += vl; | |||
| C2 += vl; | |||
| C3 += vl; | |||
| } | |||
| bb += (bk<<2); | |||
| C += (ldc<<2); | |||
| } | |||
| // N:2 | |||
| if (bn & 2) { | |||
| C0 = C; | |||
| C1 = C0 + ldc; | |||
| ptrba = ba; | |||
| for (i = bm; i > 0; i -= vl) { | |||
| vl = VSETVL(i); | |||
| ptrbb = bb; | |||
| vres0 = VFMVVF_FLOAT(0.0, vl); | |||
| vres1 = VFMVVF_FLOAT(0.0, vl); | |||
| #if 0 | |||
| for (k = bk; k > 0; k--) { | |||
| va0 = VLEV_FLOAT(ptrba, vl); | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); | |||
| ptrba += vl; | |||
| ptrbb += 2; | |||
| } | |||
| #else | |||
| // Unroll K | |||
| for (k = bk/8; k > 0; k--) { | |||
| va0 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| va1 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); | |||
| ptrbb += 2; | |||
| va2 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl); | |||
| ptrbb += 2; | |||
| va3 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl); | |||
| ptrbb += 2; | |||
| va4 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl); | |||
| ptrbb += 2; | |||
| va5 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl); | |||
| ptrbb += 2; | |||
| va6 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl); | |||
| ptrbb += 2; | |||
| va7 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl); | |||
| ptrbb += 2; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl); | |||
| ptrbb += 2; | |||
| } | |||
| // K remainder | |||
| for (k = bk&7; k > 0; k--) { | |||
| va0 = VLEV_FLOAT(ptrba, vl); | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); | |||
| ptrbb += 2; | |||
| ptrba += vl; | |||
| } | |||
| #endif | |||
| va0 = VLEV_FLOAT(C0, vl); | |||
| va0 = VFMACCVF_FLOAT(va0, alpha, vres0, vl); | |||
| VSEV_FLOAT(C0, va0, vl); | |||
| va1 = VLEV_FLOAT(C1, vl); | |||
| va1 = VFMACCVF_FLOAT(va1, alpha, vres1, vl); | |||
| VSEV_FLOAT(C1, va1, vl); | |||
| C0 += vl; | |||
| C1 += vl; | |||
| } | |||
| bb += (bk<<1); | |||
| C += (ldc<<1); | |||
| } | |||
| // N:1 | |||
| if (bn & 1) { | |||
| C0 = C; | |||
| ptrba = ba; | |||
| for (i = bm; i > 0; i -= vl) { | |||
| vl = VSETVL(i); | |||
| ptrbb = bb; | |||
| vres0 = VFMVVF_FLOAT(0.0, vl); | |||
| #if 0 | |||
| for (k = bk; k > 0; k--) { | |||
| va0 = VLEV_FLOAT(ptrba, vl); | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); | |||
| ptrba += vl; | |||
| ptrbb += 1; | |||
| } | |||
| #else | |||
| // Unroll K | |||
| for (k = bk/8; k > 0; k--) { | |||
| va0 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| va1 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); | |||
| ptrbb += 1; | |||
| va2 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); | |||
| ptrbb += 1; | |||
| va3 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); | |||
| ptrbb += 1; | |||
| va4 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); | |||
| ptrbb += 1; | |||
| va5 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); | |||
| ptrbb += 1; | |||
| va6 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); | |||
| ptrbb += 1; | |||
| va7 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); | |||
| ptrbb += 1; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); | |||
| ptrbb += 1; | |||
| } | |||
| // K remainder | |||
| for (k = bk&7; k > 0; k--) { | |||
| va0 = VLEV_FLOAT(ptrba, vl); | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); | |||
| ptrbb += 1; | |||
| ptrba += vl; | |||
| } | |||
| #endif | |||
| va0 = VLEV_FLOAT(C0, vl); | |||
| va0 = VFMACCVF_FLOAT(va0, alpha, vres0, vl); | |||
| VSEV_FLOAT(C0, va0, vl); | |||
| C0 += vl; | |||
| } | |||
| bb += (bk); | |||
| C += (ldc); | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,94 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
/*
 * Precision dispatch for the matrix-vector kernel below: every macro maps
 * to the m8 register-group variant of the corresponding RVV intrinsic,
 * using the 64-bit element forms when DOUBLE is defined and the 32-bit
 * forms otherwise.
 */
#if defined(DOUBLE)
#define VSETVL(n)      vsetvl_e64m8(n)
#define FLOAT_V_T      vfloat64m8_t
#define VLEV_FLOAT     vle64_v_f64m8     /* unit-stride load   */
#define VLSEV_FLOAT    vlse64_v_f64m8    /* strided load       */
#define VSEV_FLOAT     vse64_v_f64m8     /* unit-stride store  */
#define VSSEV_FLOAT    vsse64_v_f64m8    /* strided store      */
#define VFMACCVF_FLOAT vfmacc_vf_f64m8   /* vd += scalar * vs  */
#else
#define VSETVL(n)      vsetvl_e32m8(n)
#define FLOAT_V_T      vfloat32m8_t
#define VLEV_FLOAT     vle32_v_f32m8     /* unit-stride load   */
#define VLSEV_FLOAT    vlse32_v_f32m8    /* strided load       */
#define VSEV_FLOAT     vse32_v_f32m8     /* unit-stride store  */
#define VSSEV_FLOAT    vsse32_v_f32m8    /* strided store      */
#define VFMACCVF_FLOAT vfmacc_vf_f32m8   /* vd += scalar * vs  */
#endif
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| { | |||
| if(n < 0) return(0); | |||
| FLOAT *a_ptr, *x_ptr; | |||
| BLASLONG i; | |||
| FLOAT_V_T va, vy; | |||
| if(inc_y == 1) { | |||
| for (size_t vl; m > 0; m -= vl, y += vl, a += vl) { | |||
| vl = VSETVL(m); | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| vy = VLEV_FLOAT(y, vl); | |||
| for(i = 0; i < n; i++) { | |||
| va = VLEV_FLOAT(a_ptr, vl); | |||
| vy = VFMACCVF_FLOAT(vy, (alpha * (*x_ptr)), va, vl); | |||
| a_ptr += lda; | |||
| x_ptr += inc_x; | |||
| } | |||
| VSEV_FLOAT(y, vy, vl); | |||
| } | |||
| } else { | |||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||
| for (size_t vl; m > 0; m -= vl, y += vl*inc_y, a += vl) { | |||
| vl = VSETVL(m); | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| vy = VLSEV_FLOAT(y, stride_y, vl); | |||
| for(i = 0; i < n; i++) { | |||
| va = VLEV_FLOAT(a_ptr, vl); | |||
| vy = VFMACCVF_FLOAT(vy, (alpha * (*x_ptr)), va, vl); | |||
| a_ptr += lda; | |||
| x_ptr += inc_x; | |||
| } | |||
| VSSEV_FLOAT(y, stride_y, vy, vl); | |||
| } | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -0,0 +1,119 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
/*
 * Precision dispatch for the transposed matrix-vector kernel below. The
 * working vectors use the m8 register-group intrinsics; the reduction
 * result lives in an m1 vector. The 64-bit element forms are selected when
 * DOUBLE is defined, the 32-bit forms otherwise.
 */
#if defined(DOUBLE)
#define VSETVL(n)       vsetvl_e64m8(n)
#define VSETVL_MAX      vsetvlmax_e64m8()
#define VSETVL_MAX_M1   vsetvlmax_e64m1()
#define FLOAT_V_T       vfloat64m8_t
#define FLOAT_V_T_M1    vfloat64m1_t
#define VLEV_FLOAT      vle64_v_f64m8              /* unit-stride load          */
#define VLSEV_FLOAT     vlse64_v_f64m8             /* strided load              */
#define VFREDSUM_FLOAT  vfredusum_vs_f64m8_f64m1   /* unordered sum reduction   */
#define VFMACCVV_FLOAT  vfmacc_vv_f64m8            /* vd += vs1 * vs2           */
#define VFMVVF_FLOAT    vfmv_v_f_f64m8             /* broadcast scalar (m8)     */
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1             /* broadcast scalar (m1)     */
#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64         /* extract element 0         */
#else
#define VSETVL(n)       vsetvl_e32m8(n)
#define VSETVL_MAX      vsetvlmax_e32m8()
#define VSETVL_MAX_M1   vsetvlmax_e32m1()
#define FLOAT_V_T       vfloat32m8_t
#define FLOAT_V_T_M1    vfloat32m1_t
#define VLEV_FLOAT      vle32_v_f32m8              /* unit-stride load          */
#define VLSEV_FLOAT     vlse32_v_f32m8             /* strided load              */
#define VFREDSUM_FLOAT  vfredusum_vs_f32m8_f32m1   /* unordered sum reduction   */
#define VFMACCVV_FLOAT  vfmacc_vv_f32m8            /* vd += vs1 * vs2           */
#define VFMVVF_FLOAT    vfmv_v_f_f32m8             /* broadcast scalar (m8)     */
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1             /* broadcast scalar (m1)     */
#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32         /* extract element 0         */
#endif
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| { | |||
| BLASLONG i, j; | |||
| FLOAT *a_ptr, *x_ptr; | |||
| FLOAT_V_T va, vx, vr; | |||
| FLOAT_V_T_M1 v_res, v_z0; | |||
| size_t vlmax = VSETVL_MAX_M1; | |||
| v_res = VFMVVF_FLOAT_M1(0, vlmax); | |||
| v_z0 = VFMVVF_FLOAT_M1(0, vlmax); | |||
| vlmax = VSETVL_MAX; | |||
| if(inc_x == 1) { | |||
| for(i = 0; i < n; i++) { | |||
| j = m; | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| vr = VFMVVF_FLOAT(0, vlmax); | |||
| for (size_t vl; j > 0; j -= vl, a_ptr += vl, x_ptr += vl) { | |||
| vl = VSETVL(j); | |||
| va = VLEV_FLOAT(a_ptr, vl); | |||
| vx = VLEV_FLOAT(x_ptr, vl); | |||
| vr = VFMACCVV_FLOAT(vr, va, vx, vl); | |||
| } | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax); | |||
| *y += alpha * VFMVFS_FLOAT_M1(v_res); | |||
| y += inc_y; | |||
| a += lda; | |||
| } | |||
| } else { | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| for(i = 0; i < n; i++) { | |||
| j = m; | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| vr = VFMVVF_FLOAT(0, vlmax); | |||
| for (size_t vl; j > 0; j -= vl, a_ptr += vl, x_ptr += vl*inc_x) { | |||
| vl = VSETVL(j); | |||
| va = VLEV_FLOAT(a_ptr, vl); | |||
| vx = VLSEV_FLOAT(x_ptr, stride_x, vl); | |||
| vr = VFMACCVV_FLOAT(vr, va, vx, vl); | |||
| } | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax); | |||
| *y += alpha * VFMVFS_FLOAT_M1(v_res); | |||
| y += inc_y; | |||
| a += lda; | |||
| } | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -0,0 +1,150 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if defined(DOUBLE) /* double precision: e64 elements, m8 register groups */ | |||
| #define VSETVL(n) vsetvl_e64m8(n) /* vl to use for n remaining elements */ | |||
| #define VSETVL_MAX vsetvlmax_e64m8() /* largest vl for e64/m8 */ | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t /* m1 vector that receives the reduction result */ | |||
| #define VLEV_FLOAT vle64_v_f64m8 /* unit-stride load */ | |||
| #define VLSEV_FLOAT vlse64_v_f64m8 /* strided load */ | |||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 | |||
| #define MASK_T vbool8_t /* mask type paired with e64/m8 */ | |||
| #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 /* not referenced by CNAME below */ | |||
| #define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 | |||
| #define VMFGEVF_FLOAT vmfge_vf_f64m8_b8 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFABSV_FLOAT vfabs_v_f64m8 | |||
| #define VFMAXVV_FLOAT vfmax_vv_f64m8 | |||
| #define VFIRSTM vfirst_m_b8 | |||
| #define UINT_V_T vuint64m8_t /* per-lane element positions, 64 bits each */ | |||
| #define VIDV_MASK_UINT vid_v_u64m8_m | |||
| #define VIDV_UINT vid_v_u64m8 /* not referenced by CNAME below */ | |||
| #define VADDVX_MASK_UINT vadd_vx_u64m8_m | |||
| #define VADDVX_UINT vadd_vx_u64m8 /* not referenced by CNAME below */ | |||
| #define VMVVX_UINT vmv_v_x_u64m8 | |||
| #define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 | |||
| #define VSLIDEDOWN_UINT vslidedown_vx_u64m8 | |||
| #define VMVVXS_UINT vmv_x_s_u64m8_u64 /* reads lane 0 as a 64-bit scalar */ | |||
| #else /* single precision: e32 elements, m8 register groups */ | |||
| #define VSETVL(n) vsetvl_e32m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m8() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLEV_FLOAT vle32_v_f32m8 | |||
| #define VLSEV_FLOAT vlse32_v_f32m8 | |||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 | |||
| #define MASK_T vbool4_t /* mask type paired with e32/m8 */ | |||
| #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 | |||
| #define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 | |||
| #define VMFGEVF_FLOAT vmfge_vf_f32m8_b4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFABSV_FLOAT vfabs_v_f32m8 | |||
| #define VFMAXVV_FLOAT vfmax_vv_f32m8 | |||
| #define VFIRSTM vfirst_m_b4 | |||
| #define UINT_V_T vuint32m8_t /* 32-bit positions: larger positions cannot be represented */ | |||
| #define VIDV_MASK_UINT vid_v_u32m8_m | |||
| #define VIDV_UINT vid_v_u32m8 | |||
| #define VADDVX_MASK_UINT vadd_vx_u32m8_m | |||
| #define VADDVX_UINT vadd_vx_u32m8 | |||
| #define VMVVX_UINT vmv_v_x_u32m8 | |||
| #define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 | |||
| #define VSLIDEDOWN_UINT vslidedown_vx_u32m8 | |||
| #define VMVVXS_UINT vmv_x_s_u32m8_u32 | |||
| #endif | |||
| /* | |||
|  * Find the position of an element of x with the largest absolute value. | |||
|  * | |||
|  * n      number of elements to examine | |||
|  * x      first element of the input vector | |||
|  * inc_x  distance between consecutive elements, counted in FLOATs | |||
|  * | |||
|  * Returns the 1-based position, or 0 when n <= 0 or inc_x <= 0. | |||
|  * | |||
|  * Every vector lane keeps the largest |x| it has seen and the position at | |||
|  * which that value first showed up in the lane; the lanes are combined | |||
|  * after the loop. | |||
|  * NOTE(review): if several lanes hold the same maximum, the lowest lane | |||
|  * wins, which is not necessarily the smallest position in x. | |||
|  * NOTE(review): the post-loop steps run with vlmax and so read lanes that a | |||
|  * final short chunk did not cover; this assumes those lanes keep their | |||
|  * earlier contents - confirm the tail policy of the intrinsics in use. | |||
|  */ | |||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
|     if (n <= 0 || inc_x <= 0) return(0); | |||
|     FLOAT_V_T vx, v_max; | |||
|     UINT_V_T v_max_index; | |||
|     MASK_T mask; | |||
|     size_t vlmax = VSETVL_MAX; | |||
|     v_max_index = VMVVX_UINT(0, vlmax); | |||
|     // |x| is never negative, so -1 lies below every candidate | |||
|     v_max = VFMVVF_FLOAT(-1, vlmax); | |||
|     // j is the 0-based position held by lane 0 of the current chunk | |||
|     BLASLONG j=0; | |||
|     if(inc_x == 1) { | |||
|         for (size_t vl; n > 0; n -= vl, x += vl, j += vl) { | |||
|             vl = VSETVL(n); | |||
|             vx = VLEV_FLOAT(x, vl); | |||
|             vx = VFABSV_FLOAT(vx, vl); | |||
|             // lanes whose new value strictly exceeds the lane's running maximum | |||
|             mask = VMFLTVV_FLOAT(v_max, vx, vl); | |||
|             // for those lanes only: position = lane number + j | |||
|             v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); | |||
|             v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); | |||
|             v_max = VFMAXVV_FLOAT(v_max, vx, vl); | |||
|         } | |||
|     } else { | |||
|         // the strided load takes its stride in bytes | |||
|         BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
|         for (size_t vl; n > 0; n -= vl, x += vl*inc_x, j += vl) { | |||
|             vl = VSETVL(n); | |||
|             vx = VLSEV_FLOAT(x, stride_x, vl); | |||
|             vx = VFABSV_FLOAT(vx, vl); | |||
|             // lanes whose new value strictly exceeds the lane's running maximum | |||
|             mask = VMFLTVV_FLOAT(v_max, vx, vl); | |||
|             // for those lanes only: position = lane number + j | |||
|             v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); | |||
|             v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); | |||
|             v_max = VFMAXVV_FLOAT(v_max, vx, vl); | |||
|         } | |||
|     } | |||
|     // reduce the per-lane maxima to one scalar, seeded with 0 | |||
|     FLOAT_V_T_M1 v_res, v_z0; | |||
|     v_res = VFMVVF_FLOAT_M1(0, vlmax); | |||
|     v_z0 = VFMVVF_FLOAT_M1(0, vlmax); | |||
|     v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, vlmax); | |||
|     FLOAT maxf = VFMVFS_FLOAT_M1(v_res); | |||
|     // first lane that holds the overall maximum | |||
|     mask = VMFGEVF_FLOAT(v_max, maxf, vlmax); | |||
|     long lane = VFIRSTM(mask, vlmax); | |||
|     // No lane reached maxf (presumably only when every input is NaN): | |||
|     // report the first element. | |||
|     if (lane < 0) return(1); | |||
|     // move that lane's position into lane 0 and read it at full width, so | |||
|     // 64-bit positions are not cut down to unsigned int | |||
|     v_max_index = VSLIDEDOWN_UINT(v_max_index, v_max_index, (size_t)lane, vlmax); | |||
|     size_t max_index = VMVVXS_UINT(v_max_index); | |||
|     return((BLASLONG)max_index + 1); | |||
| } | |||
| @@ -0,0 +1,151 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <float.h> | |||
| #if defined(DOUBLE) /* double precision: e64 elements, m8 register groups */ | |||
| #define VSETVL(n) vsetvl_e64m8(n) /* vl to use for n remaining elements */ | |||
| #define VSETVL_MAX vsetvlmax_e64m8() /* largest vl for e64/m8 */ | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t /* m1 vector that receives the reduction result */ | |||
| #define VLEV_FLOAT vle64_v_f64m8 /* unit-stride load */ | |||
| #define VLSEV_FLOAT vlse64_v_f64m8 /* strided load */ | |||
| #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 | |||
| #define MASK_T vbool8_t /* mask type paired with e64/m8 */ | |||
| #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 /* not referenced by CNAME below */ | |||
| #define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 | |||
| #define VMFLEVF_FLOAT vmfle_vf_f64m8_b8 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFABSV_FLOAT vfabs_v_f64m8 | |||
| #define VFMINVV_FLOAT vfmin_vv_f64m8 | |||
| #define VFIRSTM vfirst_m_b8 | |||
| #define UINT_V_T vuint64m8_t /* per-lane element positions, 64 bits each */ | |||
| #define VIDV_MASK_UINT vid_v_u64m8_m | |||
| #define VIDV_UINT vid_v_u64m8 /* not referenced by CNAME below */ | |||
| #define VADDVX_MASK_UINT vadd_vx_u64m8_m | |||
| #define VADDVX_UINT vadd_vx_u64m8 /* not referenced by CNAME below */ | |||
| #define VMVVX_UINT vmv_v_x_u64m8 | |||
| #define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 | |||
| #define VSLIDEDOWN_UINT vslidedown_vx_u64m8 | |||
| #define VMVVXS_UINT vmv_x_s_u64m8_u64 /* reads lane 0 as a 64-bit scalar */ | |||
| #else /* single precision: e32 elements, m8 register groups */ | |||
| #define VSETVL(n) vsetvl_e32m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m8() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLEV_FLOAT vle32_v_f32m8 | |||
| #define VLSEV_FLOAT vlse32_v_f32m8 | |||
| #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 | |||
| #define MASK_T vbool4_t /* mask type paired with e32/m8 */ | |||
| #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 | |||
| #define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 | |||
| #define VMFLEVF_FLOAT vmfle_vf_f32m8_b4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFABSV_FLOAT vfabs_v_f32m8 | |||
| #define VFMINVV_FLOAT vfmin_vv_f32m8 | |||
| #define VFIRSTM vfirst_m_b4 | |||
| #define UINT_V_T vuint32m8_t /* 32-bit positions: larger positions cannot be represented */ | |||
| #define VIDV_MASK_UINT vid_v_u32m8_m | |||
| #define VIDV_UINT vid_v_u32m8 | |||
| #define VADDVX_MASK_UINT vadd_vx_u32m8_m | |||
| #define VADDVX_UINT vadd_vx_u32m8 | |||
| #define VMVVX_UINT vmv_v_x_u32m8 | |||
| #define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 | |||
| #define VSLIDEDOWN_UINT vslidedown_vx_u32m8 | |||
| #define VMVVXS_UINT vmv_x_s_u32m8_u32 | |||
| #endif | |||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| unsigned int min_index = 0; | |||
| if (n <= 0 || inc_x <= 0) return(min_index); | |||
| FLOAT_V_T vx, v_min; | |||
| UINT_V_T v_min_index; | |||
| MASK_T mask; | |||
| size_t vlmax = VSETVL_MAX; | |||
| v_min_index = VMVVX_UINT(0, vlmax); | |||
| v_min = VFMVVF_FLOAT(FLT_MAX, vlmax); | |||
| BLASLONG j=0; | |||
| FLOAT minf=0.0; | |||
| if(inc_x == 1) { | |||
| for (size_t vl; n > 0; n -= vl, x += vl, j += vl) { | |||
| vl = VSETVL(n); | |||
| vx = VLEV_FLOAT(x, vl); | |||
| vx = VFABSV_FLOAT(vx, vl); | |||
| // index where element less than v_min | |||
| mask = VMFLTVV_FLOAT(vx, v_min, vl); | |||
| v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); | |||
| v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); | |||
| //update v_min and start_index j | |||
| v_min = VFMINVV_FLOAT(v_min, vx, vl); | |||
| } | |||
| } else { | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x, j += vl) { | |||
| vl = VSETVL(n); | |||
| vx = VLSEV_FLOAT(x, stride_x, vl); | |||
| vx = VFABSV_FLOAT(vx, vl); | |||
| // index where element less than v_min | |||
| mask = VMFLTVV_FLOAT(vx, v_min, vl); | |||
| v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); | |||
| v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); | |||
| //update v_min and start_index j | |||
| v_min = VFMINVV_FLOAT(v_min, vx, vl); | |||
| } | |||
| } | |||
| FLOAT_V_T_M1 v_res, v_max; | |||
| v_res = VFMVVF_FLOAT_M1(0, vlmax); | |||
| v_max = VFMVVF_FLOAT_M1(FLT_MAX, vlmax); | |||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, vlmax); | |||
| minf = VFMVFS_FLOAT_M1(v_res); | |||
| mask = VMFLEVF_FLOAT(v_min, minf, vlmax); | |||
| min_index = VFIRSTM(mask, vlmax); | |||
| v_min_index = VSLIDEDOWN_UINT(v_min_index, v_min_index, min_index, vlmax); | |||
| min_index = VMVVXS_UINT(v_min_index); | |||
| return(min_index+1); | |||
| } | |||
| @@ -0,0 +1,147 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <float.h> | |||
| #if defined(DOUBLE) /* double precision: e64 elements, m8 register groups */ | |||
| #define VSETVL(n) vsetvl_e64m8(n) /* vl to use for n remaining elements */ | |||
| #define VSETVL_MAX vsetvlmax_e64m8() /* largest vl for e64/m8 */ | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t /* m1 vector that receives the reduction result */ | |||
| #define VLEV_FLOAT vle64_v_f64m8 /* unit-stride load */ | |||
| #define VLSEV_FLOAT vlse64_v_f64m8 /* strided load */ | |||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 | |||
| #define MASK_T vbool8_t /* mask type paired with e64/m8 */ | |||
| #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 /* not referenced by CNAME below */ | |||
| #define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 | |||
| #define VMFGEVF_FLOAT vmfge_vf_f64m8_b8 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFMAXVV_FLOAT vfmax_vv_f64m8 | |||
| #define VFIRSTM vfirst_m_b8 | |||
| #define UINT_V_T vuint64m8_t /* per-lane element positions, 64 bits each */ | |||
| #define VIDV_MASK_UINT vid_v_u64m8_m | |||
| #define VIDV_UINT vid_v_u64m8 /* not referenced by CNAME below */ | |||
| #define VADDVX_MASK_UINT vadd_vx_u64m8_m | |||
| #define VADDVX_UINT vadd_vx_u64m8 /* not referenced by CNAME below */ | |||
| #define VMVVX_UINT vmv_v_x_u64m8 | |||
| #define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 | |||
| #define VSLIDEDOWN_UINT vslidedown_vx_u64m8 | |||
| #define VMVVXS_UINT vmv_x_s_u64m8_u64 /* reads lane 0 as a 64-bit scalar */ | |||
| #else /* single precision: e32 elements, m8 register groups */ | |||
| #define VSETVL(n) vsetvl_e32m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m8() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLEV_FLOAT vle32_v_f32m8 | |||
| #define VLSEV_FLOAT vlse32_v_f32m8 | |||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 | |||
| #define MASK_T vbool4_t /* mask type paired with e32/m8 */ | |||
| #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 | |||
| #define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 | |||
| #define VMFGEVF_FLOAT vmfge_vf_f32m8_b4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFMAXVV_FLOAT vfmax_vv_f32m8 | |||
| #define VFIRSTM vfirst_m_b4 | |||
| #define UINT_V_T vuint32m8_t /* 32-bit positions: larger positions cannot be represented */ | |||
| #define VIDV_MASK_UINT vid_v_u32m8_m | |||
| #define VIDV_UINT vid_v_u32m8 | |||
| #define VADDVX_MASK_UINT vadd_vx_u32m8_m | |||
| #define VADDVX_UINT vadd_vx_u32m8 | |||
| #define VMVVX_UINT vmv_v_x_u32m8 | |||
| #define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 | |||
| #define VSLIDEDOWN_UINT vslidedown_vx_u32m8 | |||
| #define VMVVXS_UINT vmv_x_s_u32m8_u32 | |||
| #endif | |||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| unsigned int max_index = 0; | |||
| if (n <= 0 || inc_x <= 0) return(max_index); | |||
| FLOAT_V_T vx, v_max; | |||
| UINT_V_T v_max_index; | |||
| MASK_T mask; | |||
| size_t vlmax = VSETVL_MAX; | |||
| v_max_index = VMVVX_UINT(0, vlmax); | |||
| v_max = VFMVVF_FLOAT(-FLT_MAX, vlmax); | |||
| BLASLONG j=0; | |||
| FLOAT maxf=0.0; | |||
| if(inc_x == 1) { | |||
| for (size_t vl; n > 0; n -= vl, x += vl, j += vl) { | |||
| vl = VSETVL(n); | |||
| vx = VLEV_FLOAT(x, vl); | |||
| //index where element greater than v_max | |||
| mask = VMFLTVV_FLOAT(v_max, vx, vl); | |||
| v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); | |||
| v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); | |||
| //update v_max and start_index j | |||
| v_max = VFMAXVV_FLOAT(v_max, vx, vl); | |||
| } | |||
| } else { | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x, j += vl) { | |||
| vl = VSETVL(n); | |||
| vx = VLSEV_FLOAT(x, stride_x, vl); | |||
| //index where element greater than v_max | |||
| mask = VMFLTVV_FLOAT(v_max, vx, vl); | |||
| v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); | |||
| v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); | |||
| //update v_max and start_index j | |||
| v_max = VFMAXVV_FLOAT(v_max, vx, vl); | |||
| } | |||
| } | |||
| FLOAT_V_T_M1 v_res, v_min; | |||
| v_res = VFMVVF_FLOAT_M1(0, vlmax); | |||
| v_min = VFMVVF_FLOAT_M1(-FLT_MAX, vlmax); | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, vlmax); | |||
| maxf = VFMVFS_FLOAT_M1(v_res); | |||
| mask = VMFGEVF_FLOAT(v_max, maxf, vlmax); | |||
| max_index = VFIRSTM(mask, vlmax); | |||
| v_max_index = VSLIDEDOWN_UINT(v_max_index, v_max_index, max_index, vlmax); | |||
| max_index = VMVVXS_UINT(v_max_index); | |||
| return(max_index+1); | |||
| } | |||
| @@ -0,0 +1,147 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <float.h> | |||
| #if defined(DOUBLE) /* double precision: e64 elements, m8 register groups */ | |||
| #define VSETVL(n) vsetvl_e64m8(n) /* vl to use for n remaining elements */ | |||
| #define VSETVL_MAX vsetvlmax_e64m8() /* largest vl for e64/m8 */ | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t /* m1 vector that receives the reduction result */ | |||
| #define VLEV_FLOAT vle64_v_f64m8 /* unit-stride load */ | |||
| #define VLSEV_FLOAT vlse64_v_f64m8 /* strided load */ | |||
| #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 | |||
| #define MASK_T vbool8_t /* mask type paired with e64/m8 */ | |||
| #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 /* not referenced by CNAME below */ | |||
| #define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 | |||
| #define VMFLEVF_FLOAT vmfle_vf_f64m8_b8 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFMINVV_FLOAT vfmin_vv_f64m8 | |||
| #define VFIRSTM vfirst_m_b8 | |||
| #define UINT_V_T vuint64m8_t /* per-lane element positions, 64 bits each */ | |||
| #define VIDV_MASK_UINT vid_v_u64m8_m | |||
| #define VIDV_UINT vid_v_u64m8 /* not referenced by CNAME below */ | |||
| #define VADDVX_MASK_UINT vadd_vx_u64m8_m | |||
| #define VADDVX_UINT vadd_vx_u64m8 /* not referenced by CNAME below */ | |||
| #define VMVVX_UINT vmv_v_x_u64m8 | |||
| #define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 | |||
| #define VSLIDEDOWN_UINT vslidedown_vx_u64m8 | |||
| #define VMVVXS_UINT vmv_x_s_u64m8_u64 /* reads lane 0 as a 64-bit scalar */ | |||
| #else /* single precision: e32 elements, m8 register groups */ | |||
| #define VSETVL(n) vsetvl_e32m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m8() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLEV_FLOAT vle32_v_f32m8 | |||
| #define VLSEV_FLOAT vlse32_v_f32m8 | |||
| #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 | |||
| #define MASK_T vbool4_t /* mask type paired with e32/m8 */ | |||
| #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 | |||
| #define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 | |||
| #define VMFLEVF_FLOAT vmfle_vf_f32m8_b4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFMINVV_FLOAT vfmin_vv_f32m8 | |||
| #define VFIRSTM vfirst_m_b4 | |||
| #define UINT_V_T vuint32m8_t /* 32-bit positions: larger positions cannot be represented */ | |||
| #define VIDV_MASK_UINT vid_v_u32m8_m | |||
| #define VIDV_UINT vid_v_u32m8 | |||
| #define VADDVX_MASK_UINT vadd_vx_u32m8_m | |||
| #define VADDVX_UINT vadd_vx_u32m8 | |||
| #define VMVVX_UINT vmv_v_x_u32m8 | |||
| #define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 | |||
| #define VSLIDEDOWN_UINT vslidedown_vx_u32m8 | |||
| #define VMVVXS_UINT vmv_x_s_u32m8_u32 | |||
| #endif | |||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| unsigned int min_index = 0; | |||
| if (n <= 0 || inc_x <= 0) return(min_index); | |||
| FLOAT_V_T vx, v_min; | |||
| UINT_V_T v_min_index; | |||
| MASK_T mask; | |||
| size_t vlmax = VSETVL_MAX; | |||
| v_min_index = VMVVX_UINT(0, vlmax); | |||
| v_min = VFMVVF_FLOAT(FLT_MAX, vlmax); | |||
| BLASLONG j=0; | |||
| FLOAT minf=0.0; | |||
| if(inc_x == 1) { | |||
| for (size_t vl; n > 0; n -= vl, x += vl, j += vl) { | |||
| vl = VSETVL(n); | |||
| vx = VLEV_FLOAT(x, vl); | |||
| // index where element less than v_min | |||
| mask = VMFLTVV_FLOAT(vx, v_min, vl); | |||
| v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); | |||
| v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); | |||
| //update v_min and start_index j | |||
| v_min = VFMINVV_FLOAT(v_min, vx, vl); | |||
| } | |||
| } else { | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x, j += vl) { | |||
| vl = VSETVL(n); | |||
| vx = VLSEV_FLOAT(x, stride_x, vl); | |||
| // index where element less than v_min | |||
| mask = VMFLTVV_FLOAT(vx, v_min, vl); | |||
| v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); | |||
| v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); | |||
| //update v_min and start_index j | |||
| v_min = VFMINVV_FLOAT(v_min, vx, vl); | |||
| } | |||
| } | |||
| FLOAT_V_T_M1 v_res, v_max; | |||
| v_res = VFMVVF_FLOAT_M1(0, vlmax); | |||
| v_max = VFMVVF_FLOAT_M1(FLT_MAX, vlmax); | |||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, vlmax); | |||
| minf = VFMVFS_FLOAT_M1(v_res); | |||
| mask = VMFLEVF_FLOAT(v_min, minf, vlmax); | |||
| min_index = VFIRSTM(mask, vlmax); | |||
| v_min_index = VSLIDEDOWN_UINT(v_min_index, v_min_index, min_index, vlmax); | |||
| min_index = VMVVXS_UINT(v_min_index); | |||
| return(min_index+1); | |||
| } | |||
| @@ -0,0 +1,162 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if defined(DOUBLE) /* double precision: e64 elements, m4 register groups */ | |||
| #define VSETVL(n) vsetvl_e64m4(n) /* vl to use for n remaining elements */ | |||
| #define VSETVL_MAX vsetvlmax_e64m4() /* largest vl for e64/m4 */ | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t /* m1 vector that receives the reduction result */ | |||
| #define VLEV_FLOAT vle64_v_f64m4 /* not referenced by CNAME below */ | |||
| #define VLSEV_FLOAT vlse64_v_f64m4 /* not referenced by CNAME below */ | |||
| #define VLSEG_FLOAT vlseg2e64_v_f64m4 /* two-field segment load, unit stride */ | |||
| #define VLSSEG_FLOAT vlsseg2e64_v_f64m4 /* two-field segment load, strided */ | |||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f64m4_f64m1 | |||
| #define MASK_T vbool16_t /* mask type paired with e64/m4 */ | |||
| #define VMFLTVF_FLOAT vmflt_vf_f64m4_b16 /* not referenced by CNAME below */ | |||
| #define VMFLTVV_FLOAT vmflt_vv_f64m4_b16 | |||
| #define VMFGEVF_FLOAT vmfge_vf_f64m4_b16 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m4 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFABSV_FLOAT vfabs_v_f64m4 | |||
| #define VFMAXVV_FLOAT vfmax_vv_f64m4 | |||
| #define VFADDVV_FLOAT vfadd_vv_f64m4 | |||
| #define VFIRSTM vfirst_m_b16 | |||
| #define UINT_V_T vuint64m4_t /* per-lane element positions, 64 bits each */ | |||
| #define VIDV_MASK_UINT vid_v_u64m4_m | |||
| #define VIDV_UINT vid_v_u64m4 /* not referenced by CNAME below */ | |||
| #define VADDVX_MASK_UINT vadd_vx_u64m4_m | |||
| #define VADDVX_UINT vadd_vx_u64m4 /* not referenced by CNAME below */ | |||
| #define VMVVX_UINT vmv_v_x_u64m4 | |||
| #define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 | |||
| #define VSLIDEDOWN_UINT vslidedown_vx_u64m4 | |||
| #define VMVVXS_UINT vmv_x_s_u64m4_u64 /* reads lane 0 as a 64-bit scalar */ | |||
| #else /* single precision: e32 elements, m4 register groups */ | |||
| #define VSETVL(n) vsetvl_e32m4(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m4() | |||
| #define FLOAT_V_T vfloat32m4_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLEV_FLOAT vle32_v_f32m4 | |||
| #define VLSEV_FLOAT vlse32_v_f32m4 | |||
| #define VLSEG_FLOAT vlseg2e32_v_f32m4 | |||
| #define VLSSEG_FLOAT vlsseg2e32_v_f32m4 | |||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f32m4_f32m1 | |||
| #define MASK_T vbool8_t /* mask type paired with e32/m4 */ | |||
| #define VMFLTVF_FLOAT vmflt_vf_f32m4_b8 | |||
| #define VMFLTVV_FLOAT vmflt_vv_f32m4_b8 | |||
| #define VMFGEVF_FLOAT vmfge_vf_f32m4_b8 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m4 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFABSV_FLOAT vfabs_v_f32m4 | |||
| #define VFMAXVV_FLOAT vfmax_vv_f32m4 | |||
| #define VFADDVV_FLOAT vfadd_vv_f32m4 | |||
| #define VFIRSTM vfirst_m_b8 | |||
| #define UINT_V_T vuint32m4_t /* 32-bit positions: larger positions cannot be represented */ | |||
| #define VIDV_MASK_UINT vid_v_u32m4_m | |||
| #define VIDV_UINT vid_v_u32m4 | |||
| #define VADDVX_MASK_UINT vadd_vx_u32m4_m | |||
| #define VADDVX_UINT vadd_vx_u32m4 | |||
| #define VMVVX_UINT vmv_v_x_u32m4 | |||
| #define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 | |||
| #define VSLIDEDOWN_UINT vslidedown_vx_u32m4 | |||
| #define VMVVXS_UINT vmv_x_s_u32m4_u32 | |||
| #endif | |||
| /* | |||
|  * Find the position of a complex element of x with the largest | |||
|  * |real| + |imaginary|. | |||
|  * | |||
|  * n      number of complex elements to examine | |||
|  * x      first element of the input; each element is two consecutive FLOATs | |||
|  * inc_x  distance between consecutive elements, counted in complex elements | |||
|  * | |||
|  * Returns the 1-based position, or 0 when n <= 0 or inc_x <= 0. | |||
|  * | |||
|  * Every vector lane keeps the largest sum it has seen and the position at | |||
|  * which that value first showed up in the lane; the lanes are combined | |||
|  * after the loop. | |||
|  * NOTE(review): if several lanes hold the same maximum, the lowest lane | |||
|  * wins, which is not necessarily the smallest position in x. | |||
|  * NOTE(review): the post-loop steps run with vlmax and so read lanes that a | |||
|  * final short chunk did not cover; this assumes those lanes keep their | |||
|  * earlier contents - confirm the tail policy of the intrinsics in use. | |||
|  */ | |||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
|     if (n <= 0 || inc_x <= 0) return(0); | |||
|     FLOAT_V_T vx0, vx1, v_max; | |||
|     UINT_V_T v_max_index; | |||
|     MASK_T mask; | |||
|     size_t vlmax = VSETVL_MAX; | |||
|     v_max_index = VMVVX_UINT(0, vlmax); | |||
|     // the sum of two absolute values is never negative, so -1 lies below | |||
|     // every candidate | |||
|     v_max = VFMVVF_FLOAT(-1, vlmax); | |||
|     // j is the 0-based position held by lane 0 of the current chunk | |||
|     BLASLONG j=0; | |||
|     if(inc_x == 1) { | |||
|         for (size_t vl; n > 0; n -= vl, x += vl*2, j += vl) { | |||
|             vl = VSETVL(n); | |||
|             // split each element into its first (vx0) and second (vx1) FLOAT | |||
|             VLSEG_FLOAT(&vx0, &vx1, x, vl); | |||
|             vx0 = VFABSV_FLOAT(vx0, vl); | |||
|             vx1 = VFABSV_FLOAT(vx1, vl); | |||
|             vx0 = VFADDVV_FLOAT(vx0, vx1, vl); | |||
|             // lanes whose new value strictly exceeds the lane's running maximum | |||
|             mask = VMFLTVV_FLOAT(v_max, vx0, vl); | |||
|             // for those lanes only: position = lane number + j | |||
|             v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); | |||
|             v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); | |||
|             v_max = VFMAXVV_FLOAT(v_max, vx0, vl); | |||
|         } | |||
|     } | |||
|     else { | |||
|         // byte stride between elements: inc_x elements of two FLOATs each | |||
|         BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; | |||
|         for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, j += vl) { | |||
|             vl = VSETVL(n); | |||
|             // split each element into its first (vx0) and second (vx1) FLOAT | |||
|             VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); | |||
|             vx0 = VFABSV_FLOAT(vx0, vl); | |||
|             vx1 = VFABSV_FLOAT(vx1, vl); | |||
|             vx0 = VFADDVV_FLOAT(vx0, vx1, vl); | |||
|             // lanes whose new value strictly exceeds the lane's running maximum | |||
|             mask = VMFLTVV_FLOAT(v_max, vx0, vl); | |||
|             // for those lanes only: position = lane number + j | |||
|             v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); | |||
|             v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); | |||
|             v_max = VFMAXVV_FLOAT(v_max, vx0, vl); | |||
|         } | |||
|     } | |||
|     // reduce the per-lane maxima to one scalar, seeded with 0 | |||
|     FLOAT_V_T_M1 v_res, v_z0; | |||
|     v_res = VFMVVF_FLOAT_M1(0, vlmax); | |||
|     v_z0 = VFMVVF_FLOAT_M1(0, vlmax); | |||
|     v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, vlmax); | |||
|     FLOAT maxf = VFMVFS_FLOAT_M1(v_res); | |||
|     // first lane that holds the overall maximum | |||
|     mask = VMFGEVF_FLOAT(v_max, maxf, vlmax); | |||
|     long lane = VFIRSTM(mask, vlmax); | |||
|     // No lane reached maxf (presumably only when every sum is NaN): | |||
|     // report the first element. | |||
|     if (lane < 0) return(1); | |||
|     // move that lane's position into lane 0 and read it at full width, so | |||
|     // 64-bit positions are not cut down to unsigned int | |||
|     v_max_index = VSLIDEDOWN_UINT(v_max_index, v_max_index, (size_t)lane, vlmax); | |||
|     size_t max_index = VMVVXS_UINT(v_max_index); | |||
|     return((BLASLONG)max_index + 1); | |||
| } | |||
| @@ -0,0 +1,161 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <float.h> | |||
| #if defined(DOUBLE) /* double precision: e64 elements, m4 register groups */ | |||
| #define VSETVL(n) vsetvl_e64m4(n) /* vl to use for n remaining elements */ | |||
| #define VSETVL_MAX vsetvlmax_e64m4() /* largest vl for e64/m4 */ | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t /* m1 vector type, used with the min reduction */ | |||
| #define VLSEG_FLOAT vlseg2e64_v_f64m4 /* two-field segment load, unit stride */ | |||
| #define VLSSEG_FLOAT vlsseg2e64_v_f64m4 /* two-field segment load, strided */ | |||
| #define VFREDMINVS_FLOAT vfredmin_vs_f64m4_f64m1 | |||
| #define MASK_T vbool16_t /* mask type paired with e64/m4 */ | |||
| #define VMFLTVF_FLOAT vmflt_vf_f64m4_b16 | |||
| #define VMFLTVV_FLOAT vmflt_vv_f64m4_b16 | |||
| #define VMFLEVF_FLOAT vmfle_vf_f64m4_b16 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m4 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFABSV_FLOAT vfabs_v_f64m4 | |||
| #define VFMINVV_FLOAT vfmin_vv_f64m4 | |||
| #define VFADDVV_FLOAT vfadd_vv_f64m4 | |||
| #define VFIRSTM vfirst_m_b16 | |||
| #define UINT_V_T vuint64m4_t /* unsigned 64-bit lanes, same width as the FLOAT lanes */ | |||
| #define VIDV_MASK_UINT vid_v_u64m4_m | |||
| #define VIDV_UINT vid_v_u64m4 | |||
| #define VADDVX_MASK_UINT vadd_vx_u64m4_m | |||
| #define VADDVX_UINT vadd_vx_u64m4 | |||
| #define VMVVX_UINT vmv_v_x_u64m4 | |||
| #define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 | |||
| #define VSLIDEDOWN_UINT vslidedown_vx_u64m4 | |||
| #define VMVVXS_UINT vmv_x_s_u64m4_u64 /* reads lane 0 as a 64-bit scalar */ | |||
| #else /* single precision: e32 elements, m4 register groups */ | |||
| #define VSETVL(n) vsetvl_e32m4(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m4() | |||
| #define FLOAT_V_T vfloat32m4_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLSEG_FLOAT vlseg2e32_v_f32m4 | |||
| #define VLSSEG_FLOAT vlsseg2e32_v_f32m4 | |||
| #define VFREDMINVS_FLOAT vfredmin_vs_f32m4_f32m1 | |||
| #define MASK_T vbool8_t /* mask type paired with e32/m4 */ | |||
| #define VMFLTVF_FLOAT vmflt_vf_f32m4_b8 | |||
| #define VMFLTVV_FLOAT vmflt_vv_f32m4_b8 | |||
| #define VMFLEVF_FLOAT vmfle_vf_f32m4_b8 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m4 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFABSV_FLOAT vfabs_v_f32m4 | |||
| #define VFMINVV_FLOAT vfmin_vv_f32m4 | |||
| #define VFADDVV_FLOAT vfadd_vv_f32m4 | |||
| #define VFIRSTM vfirst_m_b8 | |||
| #define UINT_V_T vuint32m4_t /* unsigned 32-bit lanes: values above 32 bits cannot be represented */ | |||
| #define VIDV_MASK_UINT vid_v_u32m4_m | |||
| #define VIDV_UINT vid_v_u32m4 | |||
| #define VADDVX_MASK_UINT vadd_vx_u32m4_m | |||
| #define VADDVX_UINT vadd_vx_u32m4 | |||
| #define VMVVX_UINT vmv_v_x_u32m4 | |||
| #define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 | |||
| #define VSLIDEDOWN_UINT vslidedown_vx_u32m4 | |||
| #define VMVVXS_UINT vmv_x_s_u32m4_u32 | |||
| #endif | |||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| unsigned int min_index = 0; | |||
| if (n <= 0 || inc_x <= 0) return(min_index); | |||
| FLOAT_V_T vx0, vx1, v_min; | |||
| UINT_V_T v_min_index; | |||
| MASK_T mask; | |||
| size_t vlmax = VSETVL_MAX; | |||
| v_min_index = VMVVX_UINT(0, vlmax); | |||
| v_min = VFMVVF_FLOAT(FLT_MAX, vlmax); | |||
| BLASLONG j=0; | |||
| FLOAT minf=0.0; | |||
| if(inc_x == 1) { | |||
| for (size_t vl; n > 0; n -= vl, x += vl*2, j += vl) { | |||
| vl = VSETVL(n); | |||
| VLSEG_FLOAT(&vx0, &vx1, x, vl); | |||
| vx0 = VFABSV_FLOAT(vx0, vl); | |||
| vx1 = VFABSV_FLOAT(vx1, vl); | |||
| vx0 = VFADDVV_FLOAT(vx0, vx1, vl); | |||
| // index where element less than v_min | |||
| mask = VMFLTVV_FLOAT(vx0, v_min, vl); | |||
| v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); | |||
| v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); | |||
| //update v_min and start_index j | |||
| v_min = VFMINVV_FLOAT(v_min, vx0, vl); | |||
| } | |||
| } else { | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; | |||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, j += vl) { | |||
| vl = VSETVL(n); | |||
| VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); | |||
| vx0 = VFABSV_FLOAT(vx0, vl); | |||
| vx1 = VFABSV_FLOAT(vx1, vl); | |||
| vx0 = VFADDVV_FLOAT(vx0, vx1, vl); | |||
| // index where element less than v_min | |||
| mask = VMFLTVV_FLOAT(vx0, v_min, vl); | |||
| v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); | |||
| v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); | |||
| //update v_min and start_index j | |||
| v_min = VFMINVV_FLOAT(v_min, vx0, vl); | |||
| } | |||
| } | |||
| FLOAT_V_T_M1 v_res, v_max; | |||
| v_res = VFMVVF_FLOAT_M1(0, vlmax); | |||
| v_max = VFMVVF_FLOAT_M1(FLT_MAX, vlmax); | |||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, vlmax); | |||
| minf = VFMVFS_FLOAT_M1(v_res); | |||
| mask = VMFLEVF_FLOAT(v_min, minf, vlmax); | |||
| min_index = VFIRSTM(mask, vlmax); | |||
| v_min_index = VSLIDEDOWN_UINT(v_min_index, v_min_index, min_index, vlmax); | |||
| min_index = VMVVXS_UINT(v_min_index); | |||
| return(min_index+1); | |||
| } | |||
| @@ -0,0 +1,98 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <float.h> | |||
| #if !defined(DOUBLE) /* non-DOUBLE build: bind the generic names to the e32 / m8 RVV types and intrinsics */ | |||
| #define VSETVL(n) vsetvl_e32m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m8() | |||
| #define VSETVL_MAX_M1 vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t /* m1 vector type that receives the reduction result */ | |||
| #define VLEV_FLOAT vle32_v_f32m8 | |||
| #define VLSEV_FLOAT vlse32_v_f32m8 | |||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFMAXVV_FLOAT vfmax_vv_f32m8 | |||
| #define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 | |||
| #else /* DOUBLE build: the same names bound to the e64 / m8 forms */ | |||
| #define VSETVL(n) vsetvl_e64m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m8() | |||
| #define VSETVL_MAX_M1 vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t /* m1 vector type that receives the reduction result */ | |||
| #define VLEV_FLOAT vle64_v_f64m8 | |||
| #define VLSEV_FLOAT vlse64_v_f64m8 | |||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFMAXVV_FLOAT vfmax_vv_f64m8 | |||
| #define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 | |||
| #endif /* !defined(DOUBLE) */ | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| FLOAT maxf = 0.0; | |||
| if (n <= 0 || inc_x <= 0) return(maxf); | |||
| FLOAT_V_T vx, vmax; | |||
| FLOAT_V_T_M1 v_res; | |||
| v_res = VFMVVF_FLOAT_M1(-FLT_MAX, VSETVL_MAX_M1); | |||
| size_t vlmax = VSETVL_MAX; | |||
| vmax = VFMVVF_FLOAT(-FLT_MAX, vlmax); | |||
| if(inc_x == 1) { | |||
| for (size_t vl; n > 0; n -= vl, x += vl) { | |||
| vl = VSETVL(n); | |||
| vx = VLEV_FLOAT(x, vl); | |||
| vmax = VFMAXVV_FLOAT(vmax, vx, vl); | |||
| } | |||
| } else { | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { | |||
| vl = VSETVL(n); | |||
| vx = VLSEV_FLOAT(x, stride_x, vl); | |||
| vmax = VFMAXVV_FLOAT(vmax, vx, vl); | |||
| } | |||
| } | |||
| v_res = VFREDMAXVS_FLOAT(v_res, vmax, v_res, vlmax); | |||
| maxf = VFMVFS_FLOAT_M1(v_res); | |||
| return(maxf); | |||
| } | |||
| @@ -0,0 +1,98 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <float.h> | |||
| #if !defined(DOUBLE) /* non-DOUBLE build: bind the generic names to the e32 / m8 RVV types and intrinsics */ | |||
| #define VSETVL(n) vsetvl_e32m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m8() | |||
| #define VSETVL_MAX_M1 vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t /* m1 vector type that receives the reduction result */ | |||
| #define VLEV_FLOAT vle32_v_f32m8 | |||
| #define VLSEV_FLOAT vlse32_v_f32m8 | |||
| #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFMINVV_FLOAT vfmin_vv_f32m8 | |||
| #define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 | |||
| #else /* DOUBLE build: the same names bound to the e64 / m8 forms */ | |||
| #define VSETVL(n) vsetvl_e64m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m8() | |||
| #define VSETVL_MAX_M1 vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t /* m1 vector type that receives the reduction result */ | |||
| #define VLEV_FLOAT vle64_v_f64m8 | |||
| #define VLSEV_FLOAT vlse64_v_f64m8 | |||
| #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFMINVV_FLOAT vfmin_vv_f64m8 | |||
| #define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 | |||
| #endif /* !defined(DOUBLE) */ | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| FLOAT minf = 0.0; | |||
| if (n <= 0 || inc_x <= 0) return(minf); | |||
| FLOAT_V_T vx, vmin; | |||
| FLOAT_V_T_M1 v_res; | |||
| v_res = VFMVVF_FLOAT_M1(FLT_MAX, VSETVL_MAX_M1); | |||
| size_t vlmax = VSETVL_MAX; | |||
| vmin = VFMVVF_FLOAT(FLT_MAX, vlmax); | |||
| if(inc_x == 1) { | |||
| for (size_t vl; n > 0; n -= vl, x += vl) { | |||
| vl = VSETVL(n); | |||
| vx = VLEV_FLOAT(x, vl); | |||
| vmin = VFMINVV_FLOAT(vmin, vx, vl); | |||
| } | |||
| } else { | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { | |||
| vl = VSETVL(n); | |||
| vx = VLSEV_FLOAT(x, stride_x, vl); | |||
| vmin = VFMINVV_FLOAT(vmin, vx, vl); | |||
| } | |||
| } | |||
| v_res = VFREDMINVS_FLOAT(v_res, vmin, v_res, vlmax); | |||
| minf = VFMVFS_FLOAT_M1(v_res); | |||
| return(minf); | |||
| } | |||
| @@ -0,0 +1,103 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <math.h> | |||
| #if !defined(DOUBLE) /* non-DOUBLE build: bind the generic names to the e32 / m8 RVV types and intrinsics */ | |||
| #define VSETVL(n) vsetvl_e32m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m8() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLEV_FLOAT vle32_v_f32m8 | |||
| #define VLSEV_FLOAT vlse32_v_f32m8 | |||
| #define VFREDSUM_FLOAT vfredusum_vs_f32m8_f32m1 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f32m8 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 | |||
| #define ABS fabsf /* scalar absolute value matching the element type */ | |||
| #else /* DOUBLE build: the same names bound to the e64 / m8 forms */ | |||
| #define VSETVL(n) vsetvl_e64m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m8() | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLEV_FLOAT vle64_v_f64m8 | |||
| #define VLSEV_FLOAT vlse64_v_f64m8 | |||
| #define VFREDSUM_FLOAT vfredusum_vs_f64m8_f64m1 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f64m8 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 | |||
| #define ABS fabs /* scalar absolute value matching the element type */ | |||
| #endif /* !defined(DOUBLE) */ | |||
| /* | |||
|  * Euclidean norm of a strided vector: the square root of the sum of the | |||
|  * squares of its n elements. | |||
|  * | |||
|  *   n      number of elements; n <= 0 yields 0.0, n == 1 yields |x[0]| | |||
|  *   x      first element | |||
|  *   inc_x  distance between consecutive elements, in elements | |||
|  * | |||
|  * The squares are accumulated directly, with no scaling. | |||
|  */ | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
|     if (n <= 0) { | |||
|         return(0.0); | |||
|     } | |||
|     if (n == 1) { | |||
|         return (ABS(x[0])); | |||
|     } | |||
|     size_t vlmax = VSETVL_MAX; | |||
|     FLOAT_V_T_M1 v_total = VFMVVF_FLOAT_M1(0, vlmax); | |||
|     FLOAT_V_T v_acc = VFMVVF_FLOAT(0, vlmax); /* per-lane sum of squares */ | |||
|     if (inc_x != 1) { | |||
|         BLASLONG stride_x = inc_x * sizeof(FLOAT); /* byte stride */ | |||
|         while (n > 0) { | |||
|             size_t vl = VSETVL(n); | |||
|             FLOAT_V_T v_x = VLSEV_FLOAT(x, stride_x, vl); | |||
|             v_acc = VFMACCVV_FLOAT(v_acc, v_x, v_x, vl); | |||
|             n -= vl; | |||
|             x += vl * inc_x; | |||
|         } | |||
|     } else { | |||
|         while (n > 0) { | |||
|             size_t vl = VSETVL(n); | |||
|             FLOAT_V_T v_x = VLEV_FLOAT(x, vl); | |||
|             v_acc = VFMACCVV_FLOAT(v_acc, v_x, v_x, vl); | |||
|             n -= vl; | |||
|             x += vl; | |||
|         } | |||
|     } | |||
|     /* collapse the lanes into one scalar sum of squares */ | |||
|     v_total = VFREDSUM_FLOAT(v_total, v_acc, v_total, vlmax); | |||
|     FLOAT sum_sq = VFMVFS_FLOAT_M1(v_total); | |||
|     return sqrt(sum_sq); | |||
| } | |||
| @@ -0,0 +1,149 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) /* non-DOUBLE build: bind the generic names to the e32 / m8 RVV types and intrinsics */ | |||
| #define VSETVL(n) vsetvl_e32m8(n) | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define VLEV_FLOAT vle32_v_f32m8 | |||
| #define VLSEV_FLOAT vlse32_v_f32m8 | |||
| #define VSEV_FLOAT vse32_v_f32m8 | |||
| #define VSSEV_FLOAT vsse32_v_f32m8 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f32m8 | |||
| #define VFMULVF_FLOAT vfmul_vf_f32m8 | |||
| #define VFMSACVF_FLOAT vfmsac_vf_f32m8 | |||
| #else /* DOUBLE build: the same names bound to the e64 / m8 forms */ | |||
| #define VSETVL(n) vsetvl_e64m8(n) | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define VLEV_FLOAT vle64_v_f64m8 | |||
| #define VLSEV_FLOAT vlse64_v_f64m8 | |||
| #define VSEV_FLOAT vse64_v_f64m8 | |||
| #define VSSEV_FLOAT vsse64_v_f64m8 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f64m8 | |||
| #define VFMULVF_FLOAT vfmul_vf_f64m8 | |||
| #define VFMSACVF_FLOAT vfmsac_vf_f64m8 | |||
| #endif /* !defined(DOUBLE) */ | |||
| /* | |||
|  * Apply a plane rotation to two strided vectors in place: | |||
|  *     x[i] <- c*x[i] + s*y[i] | |||
|  *     y[i] <- c*y[i] - s*x[i]      (using the old x[i]) | |||
|  * | |||
|  *   n             number of element pairs; nothing is done when n <= 0 | |||
|  *   x, inc_x      first vector and its element increment | |||
|  *   y, inc_y      second vector and its element increment | |||
|  *   c, s          rotation coefficients | |||
|  * | |||
|  * A zero increment on either side is handled by a scalar loop; every other | |||
|  * combination uses unit-stride or strided vector loads/stores. Always | |||
|  * returns 0. | |||
|  */ | |||
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) | |||
| { | |||
|     if (n <= 0) { | |||
|         return(0); | |||
|     } | |||
|     if (inc_x == 0 || inc_y == 0) { | |||
|         /* scalar path: repeated updates of the same slot must happen in order */ | |||
|         BLASLONG ix = 0, iy = 0; | |||
|         for (BLASLONG i = 0; i < n; i++) { | |||
|             FLOAT temp = c*x[ix] + s*y[iy] ; | |||
|             y[iy] = c*y[iy] - s*x[ix] ; | |||
|             x[ix] = temp ; | |||
|             ix += inc_x; | |||
|             iy += inc_y; | |||
|         } | |||
|         return(0); | |||
|     } | |||
|     if (inc_x == 1 && inc_y == 1) { | |||
|         while (n > 0) { | |||
|             size_t vl = VSETVL(n); | |||
|             FLOAT_V_T vx = VLEV_FLOAT(x, vl); | |||
|             FLOAT_V_T vy = VLEV_FLOAT(y, vl); | |||
|             FLOAT_V_T rot_x = VFMACCVF_FLOAT(VFMULVF_FLOAT(vx, c, vl), s, vy, vl); | |||
|             VSEV_FLOAT(x, rot_x, vl); | |||
|             FLOAT_V_T rot_y = VFMSACVF_FLOAT(VFMULVF_FLOAT(vx, s, vl), c, vy, vl); | |||
|             VSEV_FLOAT(y, rot_y, vl); | |||
|             n -= vl; | |||
|             x += vl; | |||
|             y += vl; | |||
|         } | |||
|     } else if (inc_y == 1) { | |||
|         BLASLONG stride_x = inc_x * sizeof(FLOAT); /* byte stride */ | |||
|         while (n > 0) { | |||
|             size_t vl = VSETVL(n); | |||
|             FLOAT_V_T vx = VLSEV_FLOAT(x, stride_x, vl); | |||
|             FLOAT_V_T vy = VLEV_FLOAT(y, vl); | |||
|             FLOAT_V_T rot_x = VFMACCVF_FLOAT(VFMULVF_FLOAT(vx, c, vl), s, vy, vl); | |||
|             VSSEV_FLOAT(x, stride_x, rot_x, vl); | |||
|             FLOAT_V_T rot_y = VFMSACVF_FLOAT(VFMULVF_FLOAT(vx, s, vl), c, vy, vl); | |||
|             VSEV_FLOAT(y, rot_y, vl); | |||
|             n -= vl; | |||
|             x += vl*inc_x; | |||
|             y += vl; | |||
|         } | |||
|     } else if (inc_x == 1) { | |||
|         BLASLONG stride_y = inc_y * sizeof(FLOAT); /* byte stride */ | |||
|         while (n > 0) { | |||
|             size_t vl = VSETVL(n); | |||
|             FLOAT_V_T vx = VLEV_FLOAT(x, vl); | |||
|             FLOAT_V_T vy = VLSEV_FLOAT(y, stride_y, vl); | |||
|             FLOAT_V_T rot_x = VFMACCVF_FLOAT(VFMULVF_FLOAT(vx, c, vl), s, vy, vl); | |||
|             VSEV_FLOAT(x, rot_x, vl); | |||
|             FLOAT_V_T rot_y = VFMSACVF_FLOAT(VFMULVF_FLOAT(vx, s, vl), c, vy, vl); | |||
|             VSSEV_FLOAT(y, stride_y, rot_y, vl); | |||
|             n -= vl; | |||
|             x += vl; | |||
|             y += vl*inc_y; | |||
|         } | |||
|     } else { | |||
|         BLASLONG stride_x = inc_x * sizeof(FLOAT); /* byte strides */ | |||
|         BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||
|         while (n > 0) { | |||
|             size_t vl = VSETVL(n); | |||
|             FLOAT_V_T vx = VLSEV_FLOAT(x, stride_x, vl); | |||
|             FLOAT_V_T vy = VLSEV_FLOAT(y, stride_y, vl); | |||
|             FLOAT_V_T rot_x = VFMACCVF_FLOAT(VFMULVF_FLOAT(vx, c, vl), s, vy, vl); | |||
|             VSSEV_FLOAT(x, stride_x, rot_x, vl); | |||
|             FLOAT_V_T rot_y = VFMSACVF_FLOAT(VFMULVF_FLOAT(vx, s, vl), c, vy, vl); | |||
|             VSSEV_FLOAT(y, stride_y, rot_y, vl); | |||
|             n -= vl; | |||
|             x += vl*inc_x; | |||
|             y += vl*inc_y; | |||
|         } | |||
|     } | |||
|     return(0); | |||
| } | |||
| @@ -0,0 +1,80 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2020, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) /* non-DOUBLE build: bind the generic names to the e32 / m8 RVV types and intrinsics */ | |||
| #define VSETVL(n) vsetvl_e32m8(n) | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define VLEV_FLOAT vle32_v_f32m8 | |||
| #define VLSEV_FLOAT vlse32_v_f32m8 | |||
| #define VSEV_FLOAT vse32_v_f32m8 | |||
| #define VSSEV_FLOAT vsse32_v_f32m8 | |||
| #define VFMULVF_FLOAT vfmul_vf_f32m8 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 /* not referenced by the kernel that follows */ | |||
| #else /* DOUBLE build: the same names bound to the e64 / m8 forms */ | |||
| #define VSETVL(n) vsetvl_e64m8(n) | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define VLEV_FLOAT vle64_v_f64m8 | |||
| #define VLSEV_FLOAT vlse64_v_f64m8 | |||
| #define VSEV_FLOAT vse64_v_f64m8 | |||
| #define VSSEV_FLOAT vsse64_v_f64m8 | |||
| #define VFMULVF_FLOAT vfmul_vf_f64m8 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 /* not referenced by the kernel that follows */ | |||
| #endif /* !defined(DOUBLE) */ | |||
| /* | |||
|  * Scale a strided vector in place: every one of the n elements of x is | |||
|  * multiplied by da. | |||
|  * | |||
|  *   n      number of elements | |||
|  *   da     scale factor | |||
|  *   x      first element | |||
|  *   inc_x  distance between consecutive elements, in elements | |||
|  * | |||
|  * Nothing is done when n <= 0 or inc_x <= 0. The remaining parameters | |||
|  * (dummy0, dummy1, y, inc_y, dummy, dummy2) are not used here. Always | |||
|  * returns 0. | |||
|  */ | |||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||
| { | |||
|     if (n <= 0) { | |||
|         return(0); | |||
|     } | |||
|     if (inc_x <= 0) { | |||
|         return(0); | |||
|     } | |||
|     if (inc_x != 1) { | |||
|         BLASLONG stride_x = inc_x * sizeof(FLOAT); /* byte stride */ | |||
|         while (n > 0) { | |||
|             size_t vl = VSETVL(n); | |||
|             FLOAT_V_T scaled = VFMULVF_FLOAT(VLSEV_FLOAT(x, stride_x, vl), da, vl); | |||
|             VSSEV_FLOAT(x, stride_x, scaled, vl); | |||
|             n -= vl; | |||
|             x += vl*inc_x; | |||
|         } | |||
|         return 0; | |||
|     } | |||
|     while (n > 0) { | |||
|         size_t vl = VSETVL(n); | |||
|         FLOAT_V_T scaled = VFMULVF_FLOAT(VLEV_FLOAT(x, vl), da, vl); | |||
|         VSEV_FLOAT(x, scaled, vl); | |||
|         n -= vl; | |||
|         x += vl; | |||
|     } | |||
|     return 0; | |||
| } | |||
| @@ -0,0 +1,95 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) /* non-DOUBLE build: bind the generic names to the e32 / m8 RVV types and intrinsics */ | |||
| #define VSETVL(n) vsetvl_e32m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m8() | |||
| #define VSETVL_MAX_M1 vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t /* m1 vector type that receives the reduction result */ | |||
| #define VLEV_FLOAT vle32_v_f32m8 | |||
| #define VLSEV_FLOAT vlse32_v_f32m8 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||
| #define VFADDVV_FLOAT vfadd_vv_f32m8 | |||
| #define VFREDSUMVS_FLOAT vfredusum_vs_f32m8_f32m1 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 | |||
| #else /* DOUBLE build: the same names bound to the e64 / m8 forms */ | |||
| #define VSETVL(n) vsetvl_e64m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m8() | |||
| #define VSETVL_MAX_M1 vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t /* m1 vector type that receives the reduction result */ | |||
| #define VLEV_FLOAT vle64_v_f64m8 | |||
| #define VLSEV_FLOAT vlse64_v_f64m8 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||
| #define VFADDVV_FLOAT vfadd_vv_f64m8 | |||
| #define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 | |||
| #endif /* !defined(DOUBLE) */ | |||
| /* | |||
|  * Plain sum (no absolute values) of the n elements of a strided vector. | |||
|  * | |||
|  *   n      number of elements | |||
|  *   x      first element | |||
|  *   inc_x  distance between consecutive elements, in elements | |||
|  * | |||
|  * Returns 0.0 when n <= 0 or inc_x <= 0. | |||
|  */ | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
|     FLOAT total = 0.0; | |||
|     if (n <= 0) { | |||
|         return(total); | |||
|     } | |||
|     if (inc_x <= 0) { | |||
|         return(total); | |||
|     } | |||
|     FLOAT_V_T_M1 v_total = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); | |||
|     size_t vlmax = VSETVL_MAX; | |||
|     FLOAT_V_T v_partial = VFMVVF_FLOAT(0.0, vlmax); /* per-lane partial sums */ | |||
|     if (inc_x != 1) { | |||
|         BLASLONG stride_x = inc_x * sizeof(FLOAT); /* byte stride */ | |||
|         while (n > 0) { | |||
|             size_t vl = VSETVL(n); | |||
|             FLOAT_V_T v_x = VLSEV_FLOAT(x, stride_x, vl); | |||
|             v_partial = VFADDVV_FLOAT(v_partial, v_x, vl); | |||
|             n -= vl; | |||
|             x += vl*inc_x; | |||
|         } | |||
|     } else { | |||
|         while (n > 0) { | |||
|             size_t vl = VSETVL(n); | |||
|             FLOAT_V_T v_x = VLEV_FLOAT(x, vl); | |||
|             v_partial = VFADDVV_FLOAT(v_partial, v_x, vl); | |||
|             n -= vl; | |||
|             x += vl; | |||
|         } | |||
|     } | |||
|     /* fold the lanes into a single scalar */ | |||
|     v_total = VFREDSUMVS_FLOAT(v_total, v_partial, v_total, vlmax); | |||
|     total = VFMVFS_FLOAT_M1(v_total); | |||
|     return(total); | |||
| } | |||
| @@ -0,0 +1,142 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) /* non-DOUBLE build: bind the generic names to the e32 / m8 RVV types and intrinsics */ | |||
| #define VSETVL(n) vsetvl_e32m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m8() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define VLEV_FLOAT vle32_v_f32m8 | |||
| #define VLSEV_FLOAT vlse32_v_f32m8 | |||
| #define VSEV_FLOAT vse32_v_f32m8 | |||
| #define VSSEV_FLOAT vsse32_v_f32m8 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||
| #else /* DOUBLE build: the same names bound to the e64 / m8 forms */ | |||
| #define VSETVL(n) vsetvl_e64m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m8() | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define VLEV_FLOAT vle64_v_f64m8 | |||
| #define VLSEV_FLOAT vlse64_v_f64m8 | |||
| #define VSEV_FLOAT vse64_v_f64m8 | |||
| #define VSSEV_FLOAT vsse64_v_f64m8 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||
| #endif /* !defined(DOUBLE) */ | |||
| /* | |||
|  * Exchange the contents of two strided vectors, element by element. | |||
|  * | |||
|  *   n             number of element pairs; nothing is done when n <= 0 | |||
|  *   x, inc_x      first vector and its element increment | |||
|  *   y, inc_y      second vector and its element increment | |||
|  * | |||
|  * dummy0, dummy1, dummy3, dummy and dummy2 are not used. Always returns 0. | |||
|  * | |||
|  * Zero increments reproduce the result of n sequential scalar swaps: | |||
|  *   - both zero: x[0] and y[0] end up exchanged only when n is odd; | |||
|  *   - one zero:  the strided vector is shifted up by one element, its first | |||
|  *                slot takes the old scalar, and the scalar takes the old | |||
|  *                last element. | |||
|  */ | |||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||
| { | |||
|     BLASLONG stride_x, stride_y; | |||
|     FLOAT_V_T vx, vy; | |||
|     if (n <= 0) return(0); | |||
|     if (inc_x == 0 && inc_y == 0) { | |||
|         /* an even number of swaps of the same pair cancels out */ | |||
|         if (n & 1) { | |||
|             FLOAT temp = x[0]; | |||
|             x[0] = y[0]; | |||
|             y[0] = temp; | |||
|         } | |||
|     } | |||
|     else if (inc_x == 0) { | |||
|         FLOAT temp = x[0]; | |||
|         x[0] = y[(n - 1) * inc_y]; | |||
|         FLOAT *ptr = y + (n - 1) * inc_y;        /* start from the last element */ | |||
|         stride_y = (0 - inc_y) * sizeof(FLOAT);  /* walk the vector in reverse */ | |||
|         BLASLONG m = n - 1; | |||
|         for (size_t vl; m > 0; m -= vl, ptr -= (BLASLONG)vl * inc_y) { | |||
|             vl = VSETVL(m); | |||
|             /* predecessor of ptr is one element (inc_y FLOATs) back, not one FLOAT */ | |||
|             vy = VLSEV_FLOAT(ptr - inc_y, stride_y, vl); | |||
|             VSSEV_FLOAT(ptr, stride_y, vy, vl); | |||
|         } | |||
|         y[0] = temp; | |||
|     } | |||
|     else if (inc_y == 0) { | |||
|         FLOAT temp = y[0]; | |||
|         y[0] = x[(n - 1) * inc_x]; | |||
|         FLOAT *ptr = x + (n - 1) * inc_x;        /* start from the last element */ | |||
|         stride_x = (0 - inc_x) * sizeof(FLOAT);  /* walk the vector in reverse */ | |||
|         BLASLONG m = n - 1; | |||
|         for (size_t vl; m > 0; m -= vl, ptr -= (BLASLONG)vl * inc_x) { | |||
|             vl = VSETVL(m); | |||
|             /* predecessor of ptr is one element (inc_x FLOATs) back, not one FLOAT */ | |||
|             vx = VLSEV_FLOAT(ptr - inc_x, stride_x, vl); | |||
|             VSSEV_FLOAT(ptr, stride_x, vx, vl); | |||
|         } | |||
|         x[0] = temp; | |||
|     } | |||
|     else if (inc_x == 1 && inc_y == 1) { | |||
|         for (size_t vl; n > 0; n -= vl, x += vl, y += vl) { | |||
|             vl = VSETVL(n); | |||
|             vx = VLEV_FLOAT(x, vl); | |||
|             vy = VLEV_FLOAT(y, vl); | |||
|             VSEV_FLOAT(y, vx, vl); | |||
|             VSEV_FLOAT(x, vy, vl); | |||
|         } | |||
|     } else if (inc_y == 1) { | |||
|         stride_x = inc_x * sizeof(FLOAT); | |||
|         for (size_t vl; n > 0; n -= vl, x += (BLASLONG)vl * inc_x, y += vl) { | |||
|             vl = VSETVL(n); | |||
|             vx = VLSEV_FLOAT(x, stride_x, vl); | |||
|             vy = VLEV_FLOAT(y, vl); | |||
|             VSEV_FLOAT(y, vx, vl); | |||
|             VSSEV_FLOAT(x, stride_x, vy, vl); | |||
|         } | |||
|     } else if (inc_x == 1) { | |||
|         stride_y = inc_y * sizeof(FLOAT); | |||
|         for (size_t vl; n > 0; n -= vl, x += vl, y += (BLASLONG)vl * inc_y) { | |||
|             vl = VSETVL(n); | |||
|             vx = VLEV_FLOAT(x, vl); | |||
|             vy = VLSEV_FLOAT(y, stride_y, vl); | |||
|             VSSEV_FLOAT(y, stride_y, vx, vl); | |||
|             VSEV_FLOAT(x, vy, vl); | |||
|         } | |||
|     } else { | |||
|         stride_x = inc_x * sizeof(FLOAT); | |||
|         stride_y = inc_y * sizeof(FLOAT); | |||
|         for (size_t vl; n > 0; n -= vl, x += (BLASLONG)vl * inc_x, y += (BLASLONG)vl * inc_y) { | |||
|             vl = VSETVL(n); | |||
|             vx = VLSEV_FLOAT(x, stride_x, vl); | |||
|             vy = VLSEV_FLOAT(y, stride_y, vl); | |||
|             VSSEV_FLOAT(y, stride_y, vx, vl); | |||
|             VSSEV_FLOAT(x, stride_x, vy, vl); | |||
|         } | |||
|     } | |||
|     return(0); | |||
| } | |||
| @@ -0,0 +1,101 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| // Precision-dependent aliases for the RVV intrinsics and vector types used by the | |||
| // kernel below; DOUBLE selects the 64-bit set, otherwise the 32-bit set is used. | |||
| #if !defined(DOUBLE) | |||
| // Single precision: e32 elements, m2 register groups, vbool16_t masks. | |||
| #define VSETVL(n) vsetvl_e32m2(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m2() | |||
| #define FLOAT_V_T vfloat32m2_t | |||
| #define VLEV_FLOAT vle32_v_f32m2 | |||
| #define VSEV_FLOAT vse32_v_f32m2 | |||
| #define VLSEV_FLOAT vlse32_v_f32m2 | |||
| #define INT_V_T vint32m2_t | |||
| #define VID_V_INT vid_v_i32m2 | |||
| #define VADD_VX_INT vadd_vx_i32m2 | |||
| #define VMSGT_VX_INT vmsgt_vx_i32m2_b16 | |||
| #define VBOOL_T vbool16_t | |||
| #define VMERGE_VVM_FLOAT vmerge_vvm_f32m2 | |||
| #else | |||
| // Double precision: e64 elements, m2 register groups, vbool32_t masks. | |||
| #define VSETVL(n) vsetvl_e64m2(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m2() | |||
| #define FLOAT_V_T vfloat64m2_t | |||
| #define VLEV_FLOAT vle64_v_f64m2 | |||
| #define VSEV_FLOAT vse64_v_f64m2 | |||
| #define VLSEV_FLOAT vlse64_v_f64m2 | |||
| #define INT_V_T vint64m2_t | |||
| #define VID_V_INT vid_v_i64m2 | |||
| #define VADD_VX_INT vadd_vx_i64m2 | |||
| #define VMSGT_VX_INT vmsgt_vx_i64m2_b32 | |||
| #define VBOOL_T vbool32_t | |||
| #define VMERGE_VVM_FLOAT vmerge_vvm_f64m2 | |||
| #endif | |||
| // Optimizes the implementation in ../generic/symm_lcopy_4.c | |||
| // Writes m * n packed values to b, handling up to vl lanes of n per outer pass. | |||
| // For each of the m steps two vectors are read from a (one unit-stride, one with a | |||
| // stride of lda elements) and merged lane by lane under the mask (diag_off + lane) > 0. | |||
| // Always returns 0. | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b) | |||
| { | |||
| BLASLONG rows_left, lanes_left, diag_off; | |||
| FLOAT *unit_src, *strided_src; | |||
| BLASLONG lda_bytes = sizeof(FLOAT)*lda; | |||
| FLOAT_V_T packed, unit_vec, strided_vec; | |||
| VBOOL_T pick; | |||
| INT_V_T lane_ids, shifted_ids; | |||
| size_t vl = VSETVL_MAX; | |||
| // Lane numbers 0, 1, 2, ... are generated once at the maximum vector length. | |||
| lane_ids = VID_V_INT(vl); | |||
| lanes_left = n; | |||
| while (lanes_left > 0) { | |||
| vl = VSETVL(lanes_left); | |||
| diag_off = posX - posY; | |||
| unit_src = a + posX + posY * lda; | |||
| strided_src = a + posY + (posX) * lda; | |||
| rows_left = m; | |||
| while (rows_left > 0) { | |||
| strided_vec = VLSEV_FLOAT(strided_src, lda_bytes, vl); | |||
| unit_vec = VLEV_FLOAT(unit_src, vl); | |||
| // diag_off > (0 - lane) is evaluated as (diag_off + lane) > 0 | |||
| shifted_ids = VADD_VX_INT(lane_ids, diag_off, vl); | |||
| pick = VMSGT_VX_INT(shifted_ids, 0, vl); | |||
| packed = VMERGE_VVM_FLOAT(pick, strided_vec, unit_vec, vl); | |||
| VSEV_FLOAT(b, packed, vl); | |||
| b += vl; | |||
| unit_src += lda; | |||
| strided_src++; | |||
| rows_left--; | |||
| diag_off--; | |||
| } | |||
| lanes_left -= vl; | |||
| posX += vl; | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,100 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| // Precision-dependent aliases for the RVV intrinsics and vector types used by the | |||
| // kernel below; DOUBLE selects the 64-bit set, otherwise the 32-bit set is used. | |||
| #if !defined(DOUBLE) | |||
| // Single precision: e32 elements, m2 register groups, vbool16_t masks. | |||
| #define VSETVL(n) vsetvl_e32m2(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m2() | |||
| #define FLOAT_V_T vfloat32m2_t | |||
| #define VLEV_FLOAT vle32_v_f32m2 | |||
| #define VSEV_FLOAT vse32_v_f32m2 | |||
| #define VLSEV_FLOAT vlse32_v_f32m2 | |||
| #define INT_V_T vint32m2_t | |||
| #define VID_V_INT vid_v_i32m2 | |||
| #define VADD_VX_INT vadd_vx_i32m2 | |||
| #define VMSGT_VX_INT vmsgt_vx_i32m2_b16 | |||
| #define VBOOL_T vbool16_t | |||
| #define VMERGE_VVM_FLOAT vmerge_vvm_f32m2 | |||
| #else | |||
| // Double precision: e64 elements, m2 register groups, vbool32_t masks. | |||
| #define VSETVL(n) vsetvl_e64m2(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m2() | |||
| #define FLOAT_V_T vfloat64m2_t | |||
| #define VLEV_FLOAT vle64_v_f64m2 | |||
| #define VSEV_FLOAT vse64_v_f64m2 | |||
| #define VLSEV_FLOAT vlse64_v_f64m2 | |||
| #define INT_V_T vint64m2_t | |||
| #define VID_V_INT vid_v_i64m2 | |||
| #define VADD_VX_INT vadd_vx_i64m2 | |||
| #define VMSGT_VX_INT vmsgt_vx_i64m2_b32 | |||
| #define VBOOL_T vbool32_t | |||
| #define VMERGE_VVM_FLOAT vmerge_vvm_f64m2 | |||
| #endif | |||
| // Optimizes the implementation in ../generic/symm_ucopy_4.c | |||
| // Writes m * n packed values to b, handling up to vl lanes of n per outer pass. | |||
| // For each of the m steps two vectors are read from a (one with a stride of lda | |||
| // elements, one unit-stride) and merged lane by lane under the mask (diag_off + lane) > 0. | |||
| // Always returns 0. | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b) | |||
| { | |||
| BLASLONG rows_left, lanes_left, diag_off; | |||
| FLOAT *strided_src, *unit_src; | |||
| BLASLONG lda_bytes = sizeof(FLOAT)*lda; | |||
| FLOAT_V_T packed, strided_vec, unit_vec; | |||
| VBOOL_T pick; | |||
| INT_V_T lane_ids, shifted_ids; | |||
| size_t vl = VSETVL_MAX; | |||
| // Lane numbers 0, 1, 2, ... are generated once at the maximum vector length. | |||
| lane_ids = VID_V_INT(vl); | |||
| lanes_left = n; | |||
| while (lanes_left > 0) { | |||
| vl = VSETVL(lanes_left); | |||
| diag_off = posX - posY; | |||
| strided_src = a + posY + (posX + 0) * lda; | |||
| unit_src = a + posX + 0 + posY * lda; | |||
| rows_left = m; | |||
| while (rows_left > 0) { | |||
| strided_vec = VLSEV_FLOAT(strided_src, lda_bytes, vl); | |||
| unit_vec = VLEV_FLOAT(unit_src, vl); | |||
| // diag_off > (0 - lane) is evaluated as (diag_off + lane) > 0 | |||
| shifted_ids = VADD_VX_INT(lane_ids, diag_off, vl); | |||
| pick = VMSGT_VX_INT(shifted_ids, 0, vl); | |||
| packed = VMERGE_VVM_FLOAT(pick, unit_vec, strided_vec, vl); | |||
| VSEV_FLOAT(b, packed, vl); | |||
| b += vl; | |||
| strided_src++; | |||
| unit_src += lda; | |||
| rows_left--; | |||
| diag_off--; | |||
| } | |||
| lanes_left -= vl; | |||
| posX += vl; | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,224 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| // Precision-dependent aliases for the RVV intrinsics and vector types used by the | |||
| // kernel below; DOUBLE selects the 64-bit set, otherwise the 32-bit set is used. | |||
| // The *_M1 names refer to m1 register groups (used for the reduction result), | |||
| // the remaining names to m8 register groups. | |||
| // NOTE(review): VFNMSACVF_FLOAT, VFMULVF_FLOAT and VFMSACVF_FLOAT do not appear to be | |||
| // referenced by the kernel that follows - confirm before removing. | |||
| #if !defined(DOUBLE) | |||
| // Single precision (e32). | |||
| #define VSETVL_MAX_M1 vsetvlmax_e32m1() | |||
| #define VSETVL(n) vsetvl_e32m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m8() | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define VLEV_FLOAT vle32_v_f32m8 | |||
| #define VSEV_FLOAT vse32_v_f32m8 | |||
| #define VLSEV_FLOAT vlse32_v_f32m8 | |||
| #define VSSEV_FLOAT vsse32_v_f32m8 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f32m8 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f32m8 | |||
| #define VFNMSACVF_FLOAT vfnmsac_vf_f32m8 | |||
| #define VFMULVF_FLOAT vfmul_vf_f32m8 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||
| #define VFMSACVF_FLOAT vfmsac_vf_f32m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFREDSUM_FLOAT vfredusum_vs_f32m8_f32m1 | |||
| #define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 | |||
| #else | |||
| // Double precision (e64). | |||
| #define VSETVL_MAX_M1 vsetvlmax_e64m1() | |||
| #define VSETVL(n) vsetvl_e64m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m8() | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define VLEV_FLOAT vle64_v_f64m8 | |||
| #define VSEV_FLOAT vse64_v_f64m8 | |||
| #define VLSEV_FLOAT vlse64_v_f64m8 | |||
| #define VSSEV_FLOAT vsse64_v_f64m8 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f64m8 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f64m8 | |||
| #define VFNMSACVF_FLOAT vfnmsac_vf_f64m8 | |||
| #define VFMULVF_FLOAT vfmul_vf_f64m8 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||
| #define VFMSACVF_FLOAT vfmsac_vf_f64m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFREDSUM_FLOAT vfredusum_vs_f64m8_f64m1 | |||
| #define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 | |||
| #endif | |||
| // Symmetric matrix-vector update kernel working on columns j = 0 .. offset-1 of a | |||
| // (a_ptr advances by lda elements per column). For each column it performs: | |||
| //   y[j] += alpha * x[j] * a_ptr[j] | |||
| //   y[i] += alpha * x[j] * a_ptr[i]            for i = j+1 .. m-1 | |||
| //   y[j] += alpha * sum(a_ptr[i] * x[i])       for i = j+1 .. m-1 | |||
| // x and y are addressed with element increments inc_x and inc_y; the four branches | |||
| // differ only in whether unit-stride or strided loads/stores are used. | |||
| // buffer is not used. Always returns 0. | |||
| int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| { | |||
| BLASLONG i, j, k; | |||
| BLASLONG ix,iy; | |||
| BLASLONG jx,jy; | |||
| FLOAT temp1; | |||
| FLOAT *a_ptr = a; | |||
| FLOAT_V_T_M1 v_res, v_z0; | |||
| size_t vlmax = VSETVL_MAX_M1, vl; | |||
| v_res = VFMVVF_FLOAT_M1(0, vlmax); | |||
| v_z0 = VFMVVF_FLOAT_M1(0, vlmax); | |||
| vlmax = VSETVL_MAX; | |||
| FLOAT_V_T va, vx, vy, vr; | |||
| BLASLONG stride_x, stride_y, inc_xv, inc_yv; | |||
| if(inc_x == 1 && inc_y == 1) | |||
| { | |||
| for (j=0; j<offset; j++) | |||
| { | |||
| temp1 = alpha * x[j]; | |||
| y[j] += temp1 * a_ptr[j]; | |||
| i = j + 1; | |||
| // vr collects a_ptr[i] * x[i] over every chunk of this column, so it is cleared | |||
| // once per column here and must not be reset inside the chunk loop. | |||
| // NOTE(review): the vlmax-wide reduction below assumes VFMACCVV_FLOAT leaves lanes | |||
| // beyond vl undisturbed - confirm for the intrinsics version in use. | |||
| vr = VFMVVF_FLOAT(0, vlmax); | |||
| for (k = (m-i); k > 0; k -= vl, i += vl) | |||
| { | |||
| vl = VSETVL(k); | |||
| va = VLEV_FLOAT(&a_ptr[i], vl); | |||
| vy = VLEV_FLOAT(&y[i], vl); | |||
| vy = VFMACCVF_FLOAT(vy, temp1, va, vl); | |||
| VSEV_FLOAT(&y[i], vy, vl); | |||
| vx = VLEV_FLOAT(&x[i], vl); | |||
| vr = VFMACCVV_FLOAT(vr, vx, va, vl); | |||
| } | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax); | |||
| y[j] += alpha * VFMVFS_FLOAT_M1(v_res); | |||
| a_ptr += lda; | |||
| } | |||
| } | |||
| else if(inc_x == 1) | |||
| { | |||
| jy = 0; | |||
| stride_y = inc_y * sizeof(FLOAT); | |||
| for (j=0; j<offset; j++) | |||
| { | |||
| temp1 = alpha * x[j]; | |||
| y[jy] += temp1 * a_ptr[j]; | |||
| iy = jy + inc_y; | |||
| i = j + 1; | |||
| // Per-column accumulator; kept across chunks (see first branch). | |||
| vr = VFMVVF_FLOAT(0, vlmax); | |||
| for (k = (m-i); k > 0; k -= vl, i += vl) | |||
| { | |||
| vl = VSETVL(k); | |||
| inc_yv = inc_y * vl; | |||
| va = VLEV_FLOAT(&a_ptr[i], vl); | |||
| vy = VLSEV_FLOAT(&y[iy], stride_y, vl); | |||
| vy = VFMACCVF_FLOAT(vy, temp1, va, vl); | |||
| VSSEV_FLOAT(&y[iy], stride_y, vy, vl); | |||
| vx = VLEV_FLOAT(&x[i], vl); | |||
| vr = VFMACCVV_FLOAT(vr, vx, va, vl); | |||
| iy += inc_yv; | |||
| } | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax); | |||
| y[jy] += alpha * VFMVFS_FLOAT_M1(v_res); | |||
| jy += inc_y; | |||
| a_ptr += lda; | |||
| } | |||
| } | |||
| else if(inc_y == 1) | |||
| { | |||
| jx = 0; | |||
| stride_x = inc_x * sizeof(FLOAT); | |||
| for (j=0; j<offset; j++) | |||
| { | |||
| temp1 = alpha * x[jx]; | |||
| y[j] += temp1 * a_ptr[j]; | |||
| ix = jx + inc_x; | |||
| i = j + 1; | |||
| // Per-column accumulator; kept across chunks (see first branch). | |||
| vr = VFMVVF_FLOAT(0, vlmax); | |||
| for (k = (m-i); k > 0; k -= vl, i += vl) | |||
| { | |||
| vl = VSETVL(k); | |||
| inc_xv = inc_x * vl; | |||
| va = VLEV_FLOAT(&a_ptr[i], vl); | |||
| vy = VLEV_FLOAT(&y[i], vl); | |||
| vy = VFMACCVF_FLOAT(vy, temp1, va, vl); | |||
| VSEV_FLOAT(&y[i], vy, vl); | |||
| vx = VLSEV_FLOAT(&x[ix], stride_x, vl); | |||
| vr = VFMACCVV_FLOAT(vr, vx, va, vl); | |||
| ix += inc_xv; | |||
| } | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax); | |||
| y[j] += alpha * VFMVFS_FLOAT_M1(v_res); | |||
| jx += inc_x; | |||
| a_ptr += lda; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| stride_x = inc_x * sizeof(FLOAT); | |||
| stride_y = inc_y * sizeof(FLOAT); | |||
| jx = 0; | |||
| jy = 0; | |||
| for (j=0; j<offset; j++) | |||
| { | |||
| temp1 = alpha * x[jx]; | |||
| y[jy] += temp1 * a_ptr[j]; | |||
| ix = jx + inc_x; | |||
| iy = jy + inc_y; | |||
| i = j + 1; | |||
| // Per-column accumulator; kept across chunks (see first branch). | |||
| vr = VFMVVF_FLOAT(0, vlmax); | |||
| for (k = (m-i); k > 0; k -= vl, i += vl) | |||
| { | |||
| vl = VSETVL(k); | |||
| inc_xv = inc_x * vl; | |||
| inc_yv = inc_y * vl; | |||
| va = VLEV_FLOAT(&a_ptr[i], vl); | |||
| vy = VLSEV_FLOAT(&y[iy], stride_y, vl); | |||
| vy = VFMACCVF_FLOAT(vy, temp1, va, vl); | |||
| VSSEV_FLOAT(&y[iy], stride_y, vy, vl); | |||
| vx = VLSEV_FLOAT(&x[ix], stride_x, vl); | |||
| vr = VFMACCVV_FLOAT(vr, vx, va, vl); | |||
| ix += inc_xv; | |||
| iy += inc_yv; | |||
| } | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax); | |||
| y[jy] += alpha * VFMVFS_FLOAT_M1(v_res); | |||
| jx += inc_x; | |||
| jy += inc_y; | |||
| a_ptr += lda; | |||
| } | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -0,0 +1,221 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| // Precision-dependent aliases for the RVV intrinsics and vector types used by the | |||
| // kernel below; DOUBLE selects the 64-bit set, otherwise the 32-bit set is used. | |||
| // The *_M1 names refer to m1 register groups (used for the reduction result), | |||
| // the remaining names to m8 register groups. | |||
| // NOTE(review): VFNMSACVF_FLOAT, VFMULVF_FLOAT and VFMSACVF_FLOAT do not appear to be | |||
| // referenced by the kernel that follows - confirm before removing. | |||
| #if !defined(DOUBLE) | |||
| // Single precision (e32). | |||
| #define VSETVL_MAX_M1 vsetvlmax_e32m1() | |||
| #define VSETVL(n) vsetvl_e32m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m8() | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define VLEV_FLOAT vle32_v_f32m8 | |||
| #define VSEV_FLOAT vse32_v_f32m8 | |||
| #define VLSEV_FLOAT vlse32_v_f32m8 | |||
| #define VSSEV_FLOAT vsse32_v_f32m8 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f32m8 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f32m8 | |||
| #define VFNMSACVF_FLOAT vfnmsac_vf_f32m8 | |||
| #define VFMULVF_FLOAT vfmul_vf_f32m8 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||
| #define VFMSACVF_FLOAT vfmsac_vf_f32m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFREDSUM_FLOAT vfredusum_vs_f32m8_f32m1 | |||
| #define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 | |||
| #else | |||
| // Double precision (e64). | |||
| #define VSETVL_MAX_M1 vsetvlmax_e64m1() | |||
| #define VSETVL(n) vsetvl_e64m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m8() | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define VLEV_FLOAT vle64_v_f64m8 | |||
| #define VSEV_FLOAT vse64_v_f64m8 | |||
| #define VLSEV_FLOAT vlse64_v_f64m8 | |||
| #define VSSEV_FLOAT vsse64_v_f64m8 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f64m8 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f64m8 | |||
| #define VFNMSACVF_FLOAT vfnmsac_vf_f64m8 | |||
| #define VFMULVF_FLOAT vfmul_vf_f64m8 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||
| #define VFMSACVF_FLOAT vfmsac_vf_f64m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFREDSUM_FLOAT vfredusum_vs_f64m8_f64m1 | |||
| #define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 | |||
| #endif | |||
| // Symmetric matrix-vector update kernel working on columns j = m-offset .. m-1 of a | |||
| // (a_ptr starts at column m-offset and advances by lda elements per column). | |||
| // For each column it performs: | |||
| //   y[i] += alpha * x[j] * a_ptr[i]                                 for i = 0 .. j-1 | |||
| //   y[j] += alpha * x[j] * a_ptr[j] + alpha * sum(a_ptr[i] * x[i])  for i = 0 .. j-1 | |||
| // x and y are addressed with element increments inc_x and inc_y; the four branches | |||
| // differ only in whether unit-stride or strided loads/stores are used. | |||
| // buffer is not used. Always returns 0. | |||
| int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| { | |||
| BLASLONG i, j, k; | |||
| BLASLONG ix,iy; | |||
| BLASLONG jx,jy; | |||
| FLOAT temp1; | |||
| FLOAT *a_ptr = a; | |||
| FLOAT_V_T_M1 v_res, v_z0; | |||
| size_t vl_max = VSETVL_MAX_M1, vl; | |||
| v_res = VFMVVF_FLOAT_M1(0, vl_max); | |||
| v_z0 = VFMVVF_FLOAT_M1(0, vl_max); | |||
| vl_max = VSETVL_MAX; | |||
| FLOAT_V_T va, vx, vy, vr; | |||
| BLASLONG stride_x, stride_y, inc_xv, inc_yv; | |||
| BLASLONG m1 = m - offset; | |||
| if(inc_x == 1 && inc_y == 1) | |||
| { | |||
| a_ptr += m1 * lda; | |||
| for (j=m1; j<m; j++) | |||
| { | |||
| temp1 = alpha * x[j]; | |||
| i = 0; | |||
| // vr collects a_ptr[i] * x[i] over every chunk of this column, so it is cleared | |||
| // once per column here and must not be reset inside the chunk loop. | |||
| // NOTE(review): the vl_max-wide reduction below assumes VFMACCVV_FLOAT leaves lanes | |||
| // beyond vl undisturbed - confirm for the intrinsics version in use. | |||
| vr = VFMVVF_FLOAT(0, vl_max); | |||
| for (k = j; k > 0; k -= vl, i += vl) | |||
| { | |||
| vl = VSETVL(k); | |||
| vy = VLEV_FLOAT(&y[i], vl); | |||
| va = VLEV_FLOAT(&a_ptr[i], vl); | |||
| vy = VFMACCVF_FLOAT(vy, temp1, va, vl); | |||
| VSEV_FLOAT(&y[i], vy, vl); | |||
| vx = VLEV_FLOAT(&x[i], vl); | |||
| vr = VFMACCVV_FLOAT(vr, vx, va, vl); | |||
| } | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vl_max); | |||
| y[j] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res); | |||
| a_ptr += lda; | |||
| } | |||
| } | |||
| else if(inc_x == 1) | |||
| { | |||
| jy = m1 * inc_y; | |||
| a_ptr += m1 * lda; | |||
| stride_y = inc_y * sizeof(FLOAT); | |||
| for (j=m1; j<m; j++) | |||
| { | |||
| temp1 = alpha * x[j]; | |||
| iy = 0; | |||
| i = 0; | |||
| // Per-column accumulator; kept across chunks (see first branch). | |||
| vr = VFMVVF_FLOAT(0, vl_max); | |||
| for (k = j; k > 0; k -= vl, i += vl) | |||
| { | |||
| vl = VSETVL(k); | |||
| inc_yv = inc_y * vl; | |||
| vy = VLSEV_FLOAT(&y[iy], stride_y, vl); | |||
| va = VLEV_FLOAT(&a_ptr[i], vl); | |||
| vy = VFMACCVF_FLOAT(vy, temp1, va, vl); | |||
| VSSEV_FLOAT(&y[iy], stride_y, vy, vl); | |||
| vx = VLEV_FLOAT(&x[i], vl); | |||
| vr = VFMACCVV_FLOAT(vr, vx, va, vl); | |||
| iy += inc_yv; | |||
| } | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vl_max); | |||
| y[jy] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res); | |||
| a_ptr += lda; | |||
| jy += inc_y; | |||
| } | |||
| } | |||
| else if(inc_y == 1) | |||
| { | |||
| jx = m1 * inc_x; | |||
| a_ptr += m1 * lda; | |||
| stride_x = inc_x * sizeof(FLOAT); | |||
| for (j=m1; j<m; j++) | |||
| { | |||
| temp1 = alpha * x[jx]; | |||
| ix = 0; | |||
| i = 0; | |||
| // Per-column accumulator; kept across chunks (see first branch). | |||
| vr = VFMVVF_FLOAT(0, vl_max); | |||
| for (k = j; k > 0; k -= vl, i += vl) | |||
| { | |||
| vl = VSETVL(k); | |||
| inc_xv = inc_x * vl; | |||
| vy = VLEV_FLOAT(&y[i], vl); | |||
| va = VLEV_FLOAT(&a_ptr[i], vl); | |||
| vy = VFMACCVF_FLOAT(vy, temp1, va, vl); | |||
| VSEV_FLOAT(&y[i], vy, vl); | |||
| vx = VLSEV_FLOAT(&x[ix], stride_x, vl); | |||
| vr = VFMACCVV_FLOAT(vr, vx, va, vl); | |||
| ix += inc_xv; | |||
| } | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vl_max); | |||
| y[j] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res); | |||
| a_ptr += lda; | |||
| jx += inc_x; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| jx = m1 * inc_x; | |||
| jy = m1 * inc_y; | |||
| a_ptr += m1 * lda; | |||
| stride_x = inc_x * sizeof(FLOAT); | |||
| stride_y = inc_y * sizeof(FLOAT); | |||
| for (j=m1; j<m; j++) | |||
| { | |||
| temp1 = alpha * x[jx]; | |||
| ix = 0; | |||
| iy = 0; | |||
| i = 0; | |||
| // Per-column accumulator; kept across chunks (see first branch). | |||
| vr = VFMVVF_FLOAT(0, vl_max); | |||
| for (k = j; k > 0; k -= vl, i += vl) | |||
| { | |||
| vl = VSETVL(k); | |||
| inc_xv = inc_x * vl; | |||
| inc_yv = inc_y * vl; | |||
| vy = VLSEV_FLOAT(&y[iy], stride_y, vl); | |||
| va = VLEV_FLOAT(&a_ptr[i], vl); | |||
| vy = VFMACCVF_FLOAT(vy, temp1, va, vl); | |||
| VSSEV_FLOAT(&y[iy], stride_y, vy, vl); | |||
| vx = VLSEV_FLOAT(&x[ix], stride_x, vl); | |||
| vr = VFMACCVV_FLOAT(vr, vx, va, vl); | |||
| ix += inc_xv; | |||
| iy += inc_yv; | |||
| } | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vl_max); | |||
| y[jy] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res); | |||
| a_ptr += lda; | |||
| jx += inc_x; | |||
| jy += inc_y; | |||
| } | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -0,0 +1,138 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| // Precision-dependent aliases for the RVV intrinsics and vector types used by the | |||
| // kernel below; DOUBLE selects the 64-bit set, otherwise the 32-bit set is used. | |||
| #if !defined(DOUBLE) | |||
| // Single precision: e32 elements, m2 register groups, vbool16_t masks. | |||
| // NOTE(review): the *_UINT aliases in this branch map to signed i32 types and the | |||
| // signed compare vmsgt, while the double branch uses unsigned u64 types and vmsgtu. | |||
| // Presumably harmless for small non-negative lane indices - confirm whether the | |||
| // asymmetry is intentional. | |||
| #define VSETVL(n) vsetvl_e32m2(n) | |||
| #define FLOAT_V_T vfloat32m2_t | |||
| #define VLEV_FLOAT vle32_v_f32m2 | |||
| #define VSEV_FLOAT vse32_v_f32m2 | |||
| #define VLSEV_FLOAT vlse32_v_f32m2 | |||
| #define VBOOL_T vbool16_t | |||
| #define UINT_V_T vint32m2_t | |||
| #define VID_V_UINT vid_v_i32m2 | |||
| #define VMSGTU_VX_UINT vmsgt_vx_i32m2_b16 | |||
| #define VMSEQ_VX_UINT vmseq_vx_i32m2_b16 | |||
| #define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2 | |||
| #else | |||
| // Double precision: e64 elements, m2 register groups, vbool32_t masks. | |||
| #define VSETVL(n) vsetvl_e64m2(n) | |||
| #define FLOAT_V_T vfloat64m2_t | |||
| #define VLEV_FLOAT vle64_v_f64m2 | |||
| #define VSEV_FLOAT vse64_v_f64m2 | |||
| #define VLSEV_FLOAT vlse64_v_f64m2 | |||
| #define VBOOL_T vbool32_t | |||
| #define UINT_V_T vuint64m2_t | |||
| #define VID_V_UINT vid_v_u64m2 | |||
| #define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32 | |||
| #define VMSEQ_VX_UINT vmseq_vx_u64m2_b32 | |||
| #define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2 | |||
| #endif | |||
| // Optimizes the implementation in ../arm64/trmm_lncopy_sve_v1.c | |||
| // Packs data from a into b, handling up to vl lanes of n per outer pass and | |||
| // advancing b by vl per emitted row. X walks from posX and is compared with posY: | |||
| //   X > posY  : a strided vector (stride lda elements) is copied to b unchanged | |||
| //   X < posY  : nothing is written, only the pointers advance | |||
| //   X == posY : vl rows are emitted; in row d the lanes flagged by (lane > d) are | |||
| //               merged with ZERO and, when UNIT is defined, lane d with ONE | |||
| // Always returns 0. | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ | |||
| BLASLONG done, lanes_left, X; | |||
| FLOAT *src; | |||
| BLASLONG lda_bytes = sizeof(FLOAT)*lda; | |||
| FLOAT_V_T packed, loaded; | |||
| size_t vl; | |||
| #ifdef UNIT | |||
| VBOOL_T eq_mask; | |||
| #endif | |||
| VBOOL_T gt_mask; | |||
| UINT_V_T lane_ids; | |||
| lanes_left = n; | |||
| while (lanes_left > 0) | |||
| { | |||
| vl = VSETVL(lanes_left); | |||
| X = posX; | |||
| src = (posX <= posY) ? (a + posY + posX * lda) : (a + posX + posY * lda); | |||
| done = 0; | |||
| do | |||
| { | |||
| if (X == posY) | |||
| { | |||
| // Block touching the diagonal: emit vl rows, masking each one separately. | |||
| lane_ids = VID_V_UINT(vl); | |||
| for (unsigned int d = 0; d < vl; d++) | |||
| { | |||
| loaded = VLSEV_FLOAT(src, lda_bytes, vl); | |||
| gt_mask = VMSGTU_VX_UINT(lane_ids, d, vl); | |||
| packed = VFMERGE_VFM_FLOAT(gt_mask, loaded, ZERO, vl); | |||
| #ifdef UNIT | |||
| eq_mask = VMSEQ_VX_UINT(lane_ids, d, vl); | |||
| packed = VFMERGE_VFM_FLOAT(eq_mask, packed, ONE, vl); | |||
| #endif | |||
| VSEV_FLOAT(b, packed, vl); | |||
| src++; | |||
| b += vl; | |||
| } | |||
| X += vl; | |||
| done += vl; | |||
| continue; | |||
| } | |||
| if (X > posY) | |||
| { | |||
| loaded = VLSEV_FLOAT(src, lda_bytes, vl); | |||
| VSEV_FLOAT(b, loaded, vl); | |||
| src ++; | |||
| } | |||
| else | |||
| { | |||
| src += lda; | |||
| } | |||
| // Both single-row cases advance the output and the counters by one row. | |||
| b += vl; | |||
| X ++; | |||
| done ++; | |||
| } while (done < m); | |||
| posY += vl; | |||
| lanes_left -= vl; | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,134 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| // Precision-dependent aliases for the RVV intrinsics and vector types used by the | |||
| // kernel below; DOUBLE selects the 64-bit set, otherwise the 32-bit set is used. | |||
| #if !defined(DOUBLE) | |||
| // Single precision: e32 elements, m2 register groups, vbool16_t masks. | |||
| #define VSETVL(n) vsetvl_e32m2(n) | |||
| #define FLOAT_V_T vfloat32m2_t | |||
| #define VLEV_FLOAT vle32_v_f32m2 | |||
| #define VSEV_FLOAT vse32_v_f32m2 | |||
| #define VBOOL_T vbool16_t | |||
| #define UINT_V_T vuint32m2_t | |||
| #define VID_V_UINT vid_v_u32m2 | |||
| #define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16 | |||
| #define VMSEQ_VX_UINT vmseq_vx_u32m2_b16 | |||
| #define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2 | |||
| #else | |||
| // Double precision: e64 elements, m2 register groups, vbool32_t masks. | |||
| #define VSETVL(n) vsetvl_e64m2(n) | |||
| #define FLOAT_V_T vfloat64m2_t | |||
| #define VLEV_FLOAT vle64_v_f64m2 | |||
| #define VSEV_FLOAT vse64_v_f64m2 | |||
| #define VBOOL_T vbool32_t | |||
| #define UINT_V_T vuint64m2_t | |||
| #define VID_V_UINT vid_v_u64m2 | |||
| #define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32 | |||
| #define VMSEQ_VX_UINT vmseq_vx_u64m2_b32 | |||
| #define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2 | |||
| #endif | |||
| // Optimizes the implementation in ../arm64/trmm_ltcopy_sve_v1.c | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ | |||
|     // Packs `a` into `b` one panel at a time; a panel covers vl of the n columns and | |||
|     // every step appends vl values to `b`. | |||
|     //   row <  posY : vl contiguous values are copied, source moves on by lda | |||
|     //   row >  posY : nothing is written, `b` is only advanced | |||
|     //   row == posY : a vl x vl block is emitted; in its d-th row the lanes with index < d | |||
|     //                 become ZERO and, when UNIT is defined, lane d becomes ONE | |||
|     // The inner do/while runs at least once per panel. Always returns 0. | |||
|     BLASLONG cols_left = n; | |||
|     while (cols_left > 0) { | |||
|         size_t vl = VSETVL(cols_left); | |||
|         FLOAT *src = (posX <= posY) ? a + posY + posX * lda : a + posX + posY * lda; | |||
|         BLASLONG row = posX; | |||
|         BLASLONG done = 0; | |||
|         do { | |||
|             if (row == posY) { | |||
|                 // Diagonal block: vl consecutive rows, masked lane by lane. | |||
|                 UINT_V_T lane_id = VID_V_UINT(vl); | |||
|                 for (unsigned int d = 0; d < vl; d++) { | |||
|                     FLOAT_V_T loaded = VLEV_FLOAT(src, vl); | |||
|                     VBOOL_T before_diag = VMSLTU_VX_UINT(lane_id, d, vl); | |||
|                     FLOAT_V_T packed = VFMERGE_VFM_FLOAT(before_diag, loaded, ZERO, vl); | |||
| #ifdef UNIT | |||
|                     VBOOL_T on_diag = VMSEQ_VX_UINT(lane_id, d, vl); | |||
|                     packed = VFMERGE_VFM_FLOAT(on_diag, packed, ONE, vl); | |||
| #endif | |||
|                     VSEV_FLOAT(b, packed, vl); | |||
|                     src += lda; | |||
|                     b += vl; | |||
|                 } | |||
|                 row += vl; | |||
|                 done += vl; | |||
|             } else { | |||
|                 if (row < posY) { | |||
|                     // Plain copy of vl contiguous values. | |||
|                     VSEV_FLOAT(b, VLEV_FLOAT(src, vl), vl); | |||
|                     src += lda; | |||
|                 } else { | |||
|                     // Skipped step: leave the vl output slots untouched. | |||
|                     src++; | |||
|                 } | |||
|                 b += vl; | |||
|                 row++; | |||
|                 done++; | |||
|             } | |||
|         } while (done < m); | |||
|         posY += vl; | |||
|         cols_left -= vl; | |||
|     } | |||
|     return 0; | |||
| } | |||
| @@ -0,0 +1,136 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) /* single precision: 32-bit elements, LMUL=2 register groups */ | |||
| #define VSETVL(n) vsetvl_e32m2(n) /* vector length to use for n remaining elements */ | |||
| #define FLOAT_V_T vfloat32m2_t | |||
| #define VLEV_FLOAT vle32_v_f32m2 /* unit-stride load */ | |||
| #define VLSEV_FLOAT vlse32_v_f32m2 /* strided load; the stride argument is in bytes */ | |||
| #define VSEV_FLOAT vse32_v_f32m2 /* unit-stride store */ | |||
| #define VBOOL_T vbool16_t /* mask type that pairs with e32/m2 */ | |||
| #define UINT_V_T vuint32m2_t | |||
| #define VID_V_UINT vid_v_u32m2 /* lane indices 0, 1, 2, ... */ | |||
| #define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16 /* mask of lanes whose value is < scalar (unsigned) */ | |||
| #define VMSEQ_VX_UINT vmseq_vx_u32m2_b16 /* mask of lanes whose value is == scalar */ | |||
| #define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2 /* masked lanes take the scalar, the rest keep the vector */ | |||
| #else /* double precision: 64-bit elements, LMUL=2 register groups */ | |||
| #define VSETVL(n) vsetvl_e64m2(n) | |||
| #define FLOAT_V_T vfloat64m2_t | |||
| #define VLEV_FLOAT vle64_v_f64m2 | |||
| #define VLSEV_FLOAT vlse64_v_f64m2 | |||
| #define VSEV_FLOAT vse64_v_f64m2 | |||
| #define VBOOL_T vbool32_t /* mask type that pairs with e64/m2 */ | |||
| #define UINT_V_T vuint64m2_t | |||
| #define VID_V_UINT vid_v_u64m2 | |||
| #define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32 | |||
| #define VMSEQ_VX_UINT vmseq_vx_u64m2_b32 | |||
| #define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2 | |||
| #endif | |||
| // Optimizes the implementation in ../arm64/trmm_uncopy_sve_v1.c | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ | |||
|     // Packs `a` into `b` one panel at a time; a panel covers vl of the n columns and | |||
|     // every step appends vl values to `b`. Values are gathered with a strided load whose | |||
|     // stride is lda elements. | |||
|     //   row <  posY : vl strided values are copied, source moves on by one element | |||
|     //   row >  posY : nothing is written, `b` is only advanced, source moves on by lda | |||
|     //   row == posY : a vl x vl block is emitted; in its d-th row the lanes with index < d | |||
|     //                 become ZERO and, when UNIT is defined, lane d becomes ONE | |||
|     // The inner do/while runs at least once per panel. Always returns 0. | |||
|     BLASLONG lda_bytes = sizeof(FLOAT) * lda; | |||
|     BLASLONG cols_left = n; | |||
|     while (cols_left > 0) { | |||
|         size_t vl = VSETVL(cols_left); | |||
|         FLOAT *src = (posX <= posY) ? a + posX + posY * lda : a + posY + posX * lda; | |||
|         BLASLONG row = posX; | |||
|         BLASLONG done = 0; | |||
|         do { | |||
|             if (row == posY) { | |||
|                 // Diagonal block: vl consecutive rows, masked lane by lane. | |||
|                 UINT_V_T lane_id = VID_V_UINT(vl); | |||
|                 for (unsigned int d = 0; d < vl; d++) { | |||
|                     FLOAT_V_T gathered = VLSEV_FLOAT(src, lda_bytes, vl); | |||
|                     VBOOL_T before_diag = VMSLTU_VX_UINT(lane_id, d, vl); | |||
|                     FLOAT_V_T packed = VFMERGE_VFM_FLOAT(before_diag, gathered, ZERO, vl); | |||
| #ifdef UNIT | |||
|                     VBOOL_T on_diag = VMSEQ_VX_UINT(lane_id, d, vl); | |||
|                     packed = VFMERGE_VFM_FLOAT(on_diag, packed, ONE, vl); | |||
| #endif | |||
|                     VSEV_FLOAT(b, packed, vl); | |||
|                     src++; | |||
|                     b += vl; | |||
|                 } | |||
|                 row += vl; | |||
|                 done += vl; | |||
|             } else { | |||
|                 if (row < posY) { | |||
|                     // Plain copy of vl strided values. | |||
|                     VSEV_FLOAT(b, VLSEV_FLOAT(src, lda_bytes, vl), vl); | |||
|                     src++; | |||
|                 } else { | |||
|                     // Skipped step: leave the vl output slots untouched. | |||
|                     src += lda; | |||
|                 } | |||
|                 b += vl; | |||
|                 row++; | |||
|                 done++; | |||
|             } | |||
|         } while (done < m); | |||
|         posY += vl; | |||
|         cols_left -= vl; | |||
|     } | |||
|     return 0; | |||
| } | |||
| @@ -0,0 +1,133 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) /* single precision: 32-bit elements, LMUL=2 register groups */ | |||
| #define VSETVL(n) vsetvl_e32m2(n) /* vector length to use for n remaining elements */ | |||
| #define FLOAT_V_T vfloat32m2_t | |||
| #define VLEV_FLOAT vle32_v_f32m2 /* unit-stride load */ | |||
| #define VSEV_FLOAT vse32_v_f32m2 /* unit-stride store */ | |||
| #define VBOOL_T vbool16_t /* mask type that pairs with e32/m2 */ | |||
| #define UINT_V_T vuint32m2_t | |||
| #define VID_V_UINT vid_v_u32m2 /* lane indices 0, 1, 2, ... */ | |||
| #define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16 /* mask of lanes whose value is > scalar (unsigned) */ | |||
| #define VMSEQ_VX_UINT vmseq_vx_u32m2_b16 /* mask of lanes whose value is == scalar */ | |||
| #define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2 /* masked lanes take the scalar, the rest keep the vector */ | |||
| #else /* double precision: 64-bit elements, LMUL=2 register groups */ | |||
| #define VSETVL(n) vsetvl_e64m2(n) | |||
| #define FLOAT_V_T vfloat64m2_t | |||
| #define VLEV_FLOAT vle64_v_f64m2 | |||
| #define VSEV_FLOAT vse64_v_f64m2 | |||
| #define VBOOL_T vbool32_t /* mask type that pairs with e64/m2 */ | |||
| #define UINT_V_T vuint64m2_t | |||
| #define VID_V_UINT vid_v_u64m2 | |||
| #define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32 | |||
| #define VMSEQ_VX_UINT vmseq_vx_u64m2_b32 | |||
| #define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2 | |||
| #endif | |||
| // Optimizes the implementation in ../arm64/trmm_utcopy_sve_v1.c | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ | |||
|     // Packs `a` into `b` one panel at a time; a panel covers vl of the n columns and | |||
|     // every step appends vl values to `b`. | |||
|     //   row <  posY : nothing is written, `b` is only advanced | |||
|     //   row >  posY : vl contiguous values are copied, source moves on by lda | |||
|     //   row == posY : a vl x vl block is emitted; in its d-th row the lanes with index > d | |||
|     //                 become ZERO and, when UNIT is defined, lane d becomes ONE | |||
|     // The inner do/while runs at least once per panel. Always returns 0. | |||
|     BLASLONG cols_left = n; | |||
|     while (cols_left > 0) { | |||
|         size_t vl = VSETVL(cols_left); | |||
|         FLOAT *src = (posX <= posY) ? a + posX + posY * lda : a + posY + posX * lda; | |||
|         BLASLONG row = posX; | |||
|         BLASLONG done = 0; | |||
|         do { | |||
|             if (row == posY) { | |||
|                 // Diagonal block: vl consecutive rows, masked lane by lane. | |||
|                 UINT_V_T lane_id = VID_V_UINT(vl); | |||
|                 for (BLASLONG d = 0; d < vl; d++) { | |||
|                     FLOAT_V_T loaded = VLEV_FLOAT(src, vl); | |||
|                     VBOOL_T past_diag = VMSGTU_VX_UINT(lane_id, d, vl); | |||
|                     FLOAT_V_T packed = VFMERGE_VFM_FLOAT(past_diag, loaded, ZERO, vl); | |||
| #ifdef UNIT | |||
|                     VBOOL_T on_diag = VMSEQ_VX_UINT(lane_id, d, vl); | |||
|                     packed = VFMERGE_VFM_FLOAT(on_diag, packed, ONE, vl); | |||
| #endif | |||
|                     VSEV_FLOAT(b, packed, vl); | |||
|                     src += lda; | |||
|                     b += vl; | |||
|                 } | |||
|                 row += vl; | |||
|                 done += vl; | |||
|             } else { | |||
|                 if (row > posY) { | |||
|                     // Plain copy of vl contiguous values. | |||
|                     VSEV_FLOAT(b, VLEV_FLOAT(src, vl), vl); | |||
|                     src += lda; | |||
|                 } else { | |||
|                     // Skipped step: leave the vl output slots untouched. | |||
|                     src++; | |||
|                 } | |||
|                 b += vl; | |||
|                 row++; | |||
|                 done++; | |||
|             } | |||
|         } while (done < m); | |||
|         posY += vl; | |||
|         cols_left -= vl; | |||
|     } | |||
|     return 0; | |||
| } | |||
| @@ -0,0 +1,685 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) /* single precision: 32-bit elements, LMUL=2 register groups */ | |||
| #define VSETVL(n) vsetvl_e32m2(n) /* vector length to use for n remaining elements */ | |||
| #define FLOAT_V_T vfloat32m2_t | |||
| #define VLEV_FLOAT vle32_v_f32m2 /* unit-stride load */ | |||
| #define VSEV_FLOAT vse32_v_f32m2 /* unit-stride store */ | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m2 /* broadcast a scalar to every lane */ | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f32m2 /* acc += scalar * vector */ | |||
| #define VFMULVF_FLOAT vfmul_vf_f32m2 /* vector * scalar */ | |||
| #else /* double precision: 64-bit elements, LMUL=2 register groups */ | |||
| #define VSETVL(n) vsetvl_e64m2(n) | |||
| #define FLOAT_V_T vfloat64m2_t | |||
| #define VLEV_FLOAT vle64_v_f64m2 | |||
| #define VSEV_FLOAT vse64_v_f64m2 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m2 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f64m2 | |||
| #define VFMULVF_FLOAT vfmul_vf_f64m2 | |||
| #endif | |||
| // Optimizes the implementation in ../generic/trmmkernel_8x8.c | |||
| int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset) | |||
| { /* Vector-length-agnostic TRMM micro-kernel. The bn columns of C are processed in blocks of | |||
| * 8, then 4, 2 and 1; inside a block the bm rows are processed in strips of vl rows, with | |||
| * vl = VSETVL(rows left). For every strip one vector accumulator per column collects | |||
| * sum over k of A[k*vl + row] * B[k*N + col] (N = columns in the block) for `temp` k-steps, | |||
| * is multiplied by alpha and stored to C. C is overwritten and never read. | |||
| * ba and bb are walked as packed panels (vl values of A and N values of B per k-step), | |||
| * ldc is the distance between consecutive columns of C, and offset seeds `off`, which | |||
| * together with LEFT/TRANSA selects which part of the bk k-steps is multiplied. | |||
| * Always returns 0. */ | |||
| //fprintf(stderr, "%s, %s, bm=%4ld bn=%4ld bk=%4ld alpha=%f ldc=%ld\n", __FILE__, __FUNCTION__, bm, bn, bk, alpha, ldc); | |||
| BLASLONG i,j,k; | |||
| FLOAT *C0,*C1,*C2,*C3,*C4,*C5,*C6,*C7,*ptrba,*ptrbb; | |||
| FLOAT_V_T va0, va1, va2, va3, va4, va5, va6, va7; | |||
| FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7; | |||
| size_t vl; | |||
| BLASLONG off, temp; | |||
| #if !defined(LEFT) | |||
| off = -offset; | |||
| #else | |||
| off = 0; | |||
| #endif | |||
| for (j = bn/8; j > 0; j--) /* full blocks of 8 columns of C */ | |||
| { | |||
| C0 = C; | |||
| C1 = C0+ldc; | |||
| C2 = C1+ldc; | |||
| C3 = C2+ldc; | |||
| C4 = C3+ldc; | |||
| C5 = C4+ldc; | |||
| C6 = C5+ldc; | |||
| C7 = C6+ldc; | |||
| #if defined(TRMMKERNEL) && defined(LEFT) | |||
| off = offset; | |||
| #endif | |||
| ptrba = ba; | |||
| for (i = bm; i > 0; i -= vl) /* strips of vl rows; vl is chosen per strip on the next line */ | |||
| { | |||
| vl = VSETVL(i); | |||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| ptrbb = bb; | |||
| #else | |||
| ptrba += off*vl; /* skip the first off k-steps of the A strip */ | |||
| ptrbb = bb + off*8; /* and the matching k-steps of the B panel */ | |||
| #endif | |||
| vres0 = VFMVVF_FLOAT(0.0, vl); /* vresN accumulates column N of this block */ | |||
| vres1 = VFMVVF_FLOAT(0.0, vl); | |||
| vres2 = VFMVVF_FLOAT(0.0, vl); | |||
| vres3 = VFMVVF_FLOAT(0.0, vl); | |||
| vres4 = VFMVVF_FLOAT(0.0, vl); | |||
| vres5 = VFMVVF_FLOAT(0.0, vl); | |||
| vres6 = VFMVVF_FLOAT(0.0, vl); | |||
| vres7 = VFMVVF_FLOAT(0.0, vl); | |||
| #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) | |||
| temp = bk-off; | |||
| #elif defined(LEFT) | |||
| temp = off+vl; // number of values in A | |||
| #else | |||
| temp = off+8; // number of values in B | |||
| #endif | |||
| for (k = temp/8; k > 0; k--) { /* k loop unrolled by 8; each A slice is loaded one step before it is used */ | |||
| va0 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| va1 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); | |||
| vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl); | |||
| vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl); | |||
| vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl); | |||
| vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl); | |||
| ptrbb += 8; | |||
| va2 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va1, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va1, vl); | |||
| vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va1, vl); | |||
| vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va1, vl); | |||
| vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va1, vl); | |||
| vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va1, vl); | |||
| ptrbb += 8; | |||
| va3 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va2, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va2, vl); | |||
| vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va2, vl); | |||
| vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va2, vl); | |||
| vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va2, vl); | |||
| vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va2, vl); | |||
| ptrbb += 8; | |||
| va4 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va3, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va3, vl); | |||
| vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va3, vl); | |||
| vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va3, vl); | |||
| vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va3, vl); | |||
| vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va3, vl); | |||
| ptrbb += 8; | |||
| va5 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va4, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va4, vl); | |||
| vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va4, vl); | |||
| vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va4, vl); | |||
| vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va4, vl); | |||
| vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va4, vl); | |||
| ptrbb += 8; | |||
| va6 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va5, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va5, vl); | |||
| vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va5, vl); | |||
| vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va5, vl); | |||
| vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va5, vl); | |||
| vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va5, vl); | |||
| ptrbb += 8; | |||
| va7 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va6, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va6, vl); | |||
| vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va6, vl); | |||
| vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va6, vl); | |||
| vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va6, vl); | |||
| vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va6, vl); | |||
| ptrbb += 8; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va7, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va7, vl); | |||
| vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va7, vl); | |||
| vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va7, vl); | |||
| vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va7, vl); | |||
| vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va7, vl); | |||
| ptrbb += 8; | |||
| } | |||
| for (k = temp&7; k > 0; k--) { /* remaining temp&7 k-steps, one at a time */ | |||
| va0 = VLEV_FLOAT(ptrba, vl); // one slice of vl values of A | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); | |||
| vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl); | |||
| vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl); | |||
| vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl); | |||
| vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl); | |||
| ptrbb += 8; | |||
| ptrba += vl; | |||
| } | |||
| va0 = VFMULVF_FLOAT(vres0, alpha, vl); /* scale by alpha, then overwrite C (C is not read) */ | |||
| VSEV_FLOAT(C0, va0, vl); | |||
| va1 = VFMULVF_FLOAT(vres1, alpha, vl); | |||
| VSEV_FLOAT(C1, va1, vl); | |||
| va2 = VFMULVF_FLOAT(vres2, alpha, vl); | |||
| VSEV_FLOAT(C2, va2, vl); | |||
| va3 = VFMULVF_FLOAT(vres3, alpha, vl); | |||
| VSEV_FLOAT(C3, va3, vl); | |||
| va4 = VFMULVF_FLOAT(vres4, alpha, vl); | |||
| VSEV_FLOAT(C4, va4, vl); | |||
| va5 = VFMULVF_FLOAT(vres5, alpha, vl); | |||
| VSEV_FLOAT(C5, va5, vl); | |||
| va6 = VFMULVF_FLOAT(vres6, alpha, vl); | |||
| VSEV_FLOAT(C6, va6, vl); | |||
| va7 = VFMULVF_FLOAT(vres7, alpha, vl); | |||
| VSEV_FLOAT(C7, va7, vl); | |||
| #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| temp = bk - off; | |||
| #ifdef LEFT | |||
| temp -= vl; // number of values in A | |||
| #else | |||
| temp -= 8; // number of values in B | |||
| #endif | |||
| ptrba += temp*vl; /* step over the k-steps that were not multiplied */ | |||
| ptrbb += temp*8; | |||
| #endif | |||
| #ifdef LEFT | |||
| off += vl; // number of values in A | |||
| #endif | |||
| C0 += vl; | |||
| C1 += vl; | |||
| C2 += vl; | |||
| C3 += vl; | |||
| C4 += vl; | |||
| C5 += vl; | |||
| C6 += vl; | |||
| C7 += vl; | |||
| } | |||
| #if defined(TRMMKERNEL) && !defined(LEFT) | |||
| off += 8; | |||
| #endif | |||
| bb += (bk<<3); /* next panel of B: bk k-steps of 8 values */ | |||
| C += (ldc<<3); /* next 8 columns of C */ | |||
| } | |||
| if (bn & 4) /* one block of 4 columns; same scheme with 4 accumulators */ | |||
| { | |||
| C0 = C; | |||
| C1 = C0+ldc; | |||
| C2 = C1+ldc; | |||
| C3 = C2+ldc; | |||
| #if defined(TRMMKERNEL) && defined(LEFT) | |||
| off = offset; | |||
| #endif | |||
| ptrba = ba; | |||
| for (i = bm; i > 0; i -= vl) | |||
| { | |||
| vl = VSETVL(i); | |||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| ptrbb = bb; | |||
| #else | |||
| ptrba += off*vl; | |||
| ptrbb = bb + off*4; | |||
| #endif | |||
| vres0 = VFMVVF_FLOAT(0.0, vl); | |||
| vres1 = VFMVVF_FLOAT(0.0, vl); | |||
| vres2 = VFMVVF_FLOAT(0.0, vl); | |||
| vres3 = VFMVVF_FLOAT(0.0, vl); | |||
| #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) | |||
| temp = bk-off; | |||
| #elif defined(LEFT) | |||
| temp = off+vl; // number of values in A | |||
| #else | |||
| temp = off+4; // number of values in B | |||
| #endif | |||
| for (k = temp/8; k > 0; k--) { | |||
| va0 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| va1 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); | |||
| ptrbb += 4; | |||
| va2 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va1, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va1, vl); | |||
| ptrbb += 4; | |||
| va3 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va2, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va2, vl); | |||
| ptrbb += 4; | |||
| va4 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va3, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va3, vl); | |||
| ptrbb += 4; | |||
| va5 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va4, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va4, vl); | |||
| ptrbb += 4; | |||
| va6 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va5, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va5, vl); | |||
| ptrbb += 4; | |||
| va7 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va6, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va6, vl); | |||
| ptrbb += 4; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va7, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va7, vl); | |||
| ptrbb += 4; | |||
| } | |||
| // K remainder | |||
| for (k = temp&7; k > 0; k--) { | |||
| va0 = VLEV_FLOAT(ptrba, vl); | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); | |||
| ptrbb += 4; | |||
| ptrba += vl; | |||
| } | |||
| va0 = VFMULVF_FLOAT(vres0, alpha, vl); | |||
| VSEV_FLOAT(C0, va0, vl); | |||
| va1 = VFMULVF_FLOAT(vres1, alpha, vl); | |||
| VSEV_FLOAT(C1, va1, vl); | |||
| va2 = VFMULVF_FLOAT(vres2, alpha, vl); | |||
| VSEV_FLOAT(C2, va2, vl); | |||
| va3 = VFMULVF_FLOAT(vres3, alpha, vl); | |||
| VSEV_FLOAT(C3, va3, vl); | |||
| #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| temp = bk - off; | |||
| #ifdef LEFT | |||
| temp -= vl; // number of values in A | |||
| #else | |||
| temp -= 4; // number of values in B | |||
| #endif | |||
| ptrba += temp*vl; | |||
| ptrbb += temp*4; | |||
| #endif | |||
| #ifdef LEFT | |||
| off += vl; // number of values in A | |||
| #endif | |||
| C0 += vl; | |||
| C1 += vl; | |||
| C2 += vl; | |||
| C3 += vl; | |||
| } | |||
| #if defined(TRMMKERNEL) && !defined(LEFT) | |||
| off += 4; | |||
| #endif | |||
| bb += (bk<<2); | |||
| C += (ldc<<2); | |||
| } | |||
| if (bn & 2) /* one block of 2 columns */ | |||
| { | |||
| C0 = C; | |||
| C1 = C0+ldc; | |||
| #if defined(TRMMKERNEL) && defined(LEFT) | |||
| off = offset; | |||
| #endif | |||
| ptrba = ba; | |||
| for (i = bm; i > 0; i -= vl) | |||
| { | |||
| vl = VSETVL(i); | |||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| ptrbb = bb; | |||
| #else | |||
| ptrba += off*vl; | |||
| ptrbb = bb + off*2; | |||
| #endif | |||
| vres0 = VFMVVF_FLOAT(0.0, vl); | |||
| vres1 = VFMVVF_FLOAT(0.0, vl); | |||
| #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) | |||
| temp = bk-off; | |||
| #elif defined(LEFT) | |||
| temp = off+vl; // number of values in A | |||
| #else | |||
| temp = off+2; // number of values in B | |||
| #endif | |||
| for (k = temp/8; k > 0; k--) { | |||
| va0 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| va1 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); | |||
| ptrbb += 2; | |||
| va2 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl); | |||
| ptrbb += 2; | |||
| va3 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl); | |||
| ptrbb += 2; | |||
| va4 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl); | |||
| ptrbb += 2; | |||
| va5 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl); | |||
| ptrbb += 2; | |||
| va6 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl); | |||
| ptrbb += 2; | |||
| va7 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl); | |||
| ptrbb += 2; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl); | |||
| ptrbb += 2; | |||
| } | |||
| // K remainder | |||
| for (k = temp&7; k > 0; k--) { | |||
| va0 = VLEV_FLOAT(ptrba, vl); | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); | |||
| ptrbb += 2; | |||
| ptrba += vl; | |||
| } | |||
| va0 = VFMULVF_FLOAT(vres0, alpha, vl); | |||
| VSEV_FLOAT(C0, va0, vl); | |||
| va1 = VFMULVF_FLOAT(vres1, alpha, vl); | |||
| VSEV_FLOAT(C1, va1, vl); | |||
| #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| temp = bk - off; | |||
| #ifdef LEFT | |||
| temp -= vl; // number of values in A | |||
| #else | |||
| temp -= 2; // number of values in B | |||
| #endif | |||
| ptrba += temp*vl; | |||
| ptrbb += temp*2; | |||
| #endif | |||
| #ifdef LEFT | |||
| off += vl; // number of values in A | |||
| #endif | |||
| C0 += vl; | |||
| C1 += vl; | |||
| } | |||
| #if defined(TRMMKERNEL) && !defined(LEFT) | |||
| off += 2; | |||
| #endif | |||
| bb += (bk<<1); | |||
| C += (ldc<<1); | |||
| } | |||
| if (bn & 1) /* final single column */ | |||
| { | |||
| C0 = C; | |||
| #if defined(TRMMKERNEL) && defined(LEFT) | |||
| off = offset; | |||
| #endif | |||
| ptrba = ba; | |||
| for (i = bm; i > 0; i -= vl) | |||
| { | |||
| vl = VSETVL(i); | |||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| ptrbb = bb; | |||
| #else | |||
| ptrba += off*vl; | |||
| ptrbb = bb + off*1; | |||
| #endif | |||
| vres0 = VFMVVF_FLOAT(0.0, vl); | |||
| #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) | |||
| temp = bk-off; | |||
| #elif defined(LEFT) | |||
| temp = off+vl; // number of values in A | |||
| #else | |||
| temp = off+1; // number of values in B | |||
| #endif | |||
| for (k = temp/8; k > 0; k--) { | |||
| va0 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| va1 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); | |||
| ptrbb += 1; | |||
| va2 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); | |||
| ptrbb += 1; | |||
| va3 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); | |||
| ptrbb += 1; | |||
| va4 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); | |||
| ptrbb += 1; | |||
| va5 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); | |||
| ptrbb += 1; | |||
| va6 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); | |||
| ptrbb += 1; | |||
| va7 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); | |||
| ptrbb += 1; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); | |||
| ptrbb += 1; | |||
| } | |||
| // K remainder | |||
| for (k = temp&7; k > 0; k--) { | |||
| va0 = VLEV_FLOAT(ptrba, vl); | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); | |||
| ptrbb += 1; | |||
| ptrba += vl; | |||
| } | |||
| va0 = VFMULVF_FLOAT(vres0, alpha, vl); | |||
| VSEV_FLOAT(C0, va0, vl); | |||
| #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| temp = bk - off; | |||
| #ifdef LEFT | |||
| temp -= vl; // number of values in A | |||
| #else | |||
| temp -= 1; // number of values in B | |||
| #endif | |||
| ptrba += temp*vl; | |||
| ptrbb += temp*1; | |||
| #endif | |||
| #ifdef LEFT | |||
| off += vl; // number of values in A | |||
| #endif | |||
| C0 += vl; | |||
| } | |||
| #if defined(TRMMKERNEL) && !defined(LEFT) | |||
| off += 1; | |||
| #endif | |||
| bb += (bk); | |||
| C += (ldc); | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,847 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) /* DOUBLE not defined: map the generic names onto the e32 / f32 m2 RVV intrinsics */ | |||
| #define VSETVL(n) vsetvl_e32m2(n) /* vector length to use for n remaining elements */ | |||
| #define VSETVL_MAX vsetvlmax_e32m2() /* largest vector length for this element width and grouping */ | |||
| #define FLOAT_V_T vfloat32m2_t /* vector value type used by the solve routines */ | |||
| #define VLEV_FLOAT vle32_v_f32m2 /* unit-stride vector load */ | |||
| #define VLSEV_FLOAT vlse32_v_f32m2 /* strided vector load */ | |||
| #define VLSEG2_FLOAT vlseg2e32_v_f32m2 /* two-field segment load */ | |||
| #define VSEV_FLOAT vse32_v_f32m2 /* unit-stride vector store */ | |||
| #define VSSEV_FLOAT vsse32_v_f32m2 /* strided vector store */ | |||
| #define VSSEG2_FLOAT vsseg2e32_v_f32m2 /* two-field segment store */ | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f32m2 /* vector += scalar * vector */ | |||
| #define VFMULVF_FLOAT vfmul_vf_f32m2 /* vector * scalar */ | |||
| #define VFNMSACVF_FLOAT vfnmsac_vf_f32m2 /* vector -= scalar * vector; the update step used by solve() */ | |||
| #else /* DOUBLE defined: same names mapped onto the e64 / f64 m2 intrinsics */ | |||
| #define VSETVL(n) vsetvl_e64m2(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m2() | |||
| #define FLOAT_V_T vfloat64m2_t | |||
| #define VLEV_FLOAT vle64_v_f64m2 | |||
| #define VLSEV_FLOAT vlse64_v_f64m2 | |||
| #define VLSEG2_FLOAT vlseg2e64_v_f64m2 | |||
| #define VSEV_FLOAT vse64_v_f64m2 | |||
| #define VSSEV_FLOAT vsse64_v_f64m2 | |||
| #define VSSEG2_FLOAT vsseg2e64_v_f64m2 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f64m2 | |||
| #define VFMULVF_FLOAT vfmul_vf_f64m2 | |||
| #define VFNMSACVF_FLOAT vfnmsac_vf_f64m2 | |||
| #endif | |||
| static FLOAT dm1 = -1.; /* passed as the alpha argument of GEMM_KERNEL in CNAME */ | |||
| #ifdef CONJ /* CONJ selects which GEMM kernel variant CNAME invokes */ | |||
| #define GEMM_KERNEL GEMM_KERNEL_L | |||
| #else | |||
| #define GEMM_KERNEL GEMM_KERNEL_N | |||
| #endif | |||
| #if GEMM_DEFAULT_UNROLL_N == 1 /* GEMM_UNROLL_N_SHIFT = log2(GEMM_DEFAULT_UNROLL_N); it stays undefined for any value other than 1, 2, 4, 8, 16 */ | |||
| #define GEMM_UNROLL_N_SHIFT 0 | |||
| #endif | |||
| #if GEMM_DEFAULT_UNROLL_N == 2 | |||
| #define GEMM_UNROLL_N_SHIFT 1 | |||
| #endif | |||
| #if GEMM_DEFAULT_UNROLL_N == 4 | |||
| #define GEMM_UNROLL_N_SHIFT 2 | |||
| #endif | |||
| #if GEMM_DEFAULT_UNROLL_N == 8 | |||
| #define GEMM_UNROLL_N_SHIFT 3 | |||
| #endif | |||
| #if GEMM_DEFAULT_UNROLL_N == 16 | |||
| #define GEMM_UNROLL_N_SHIFT 4 | |||
| #endif | |||
| // Optimizes the implementation in ../arm64/trsm_kernel_LN_sve.c | |||
| #ifndef COMPLEX | |||
| #if GEMM_DEFAULT_UNROLL_N == 1 | |||
| // Backward substitution on one m x n tile (GEMM_DEFAULT_UNROLL_N == 1 variant). | |||
| //   a   : packed m x m block, row i starts at a + i * m; a[i * m + i] is the factor | |||
| //         the pivot row is multiplied by (presumably a pre-inverted diagonal - confirm | |||
| //         against the packing routine). | |||
| //   b   : receives the solved values, b[i * n + j] = solved c(i, j). | |||
| //   c   : tile updated in place, element (i, j) lives at c[i + j * ldc]. | |||
| // Rows are visited from m - 1 down to 0; after row i is solved, rows 0 .. i-1 of the | |||
| // same column are updated with c(k, j) -= c(i, j) * a[i * m + k]. | |||
| static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { | |||
|     int row, col, left; | |||
|     size_t vl; | |||
|     FLOAT_V_T a_vec, c_vec; | |||
|     for (row = m - 1; row >= 0; row--) { | |||
|         FLOAT *a_row = a + row * m; | |||
|         FLOAT *b_row = b + row * n; | |||
|         const FLOAT diag = a_row[row]; | |||
|         for (col = 0; col < n; col++) { | |||
|             FLOAT *c_col = c + col * ldc; | |||
|             const FLOAT solved = c_col[row] * diag; | |||
|             b_row[col] = solved; | |||
|             c_col[row] = solved; | |||
|             // Eliminate the solved value from the rows above, vl elements at a time. | |||
|             FLOAT *a_ptr = a_row; | |||
|             FLOAT *c_ptr = c_col; | |||
|             left = row; | |||
|             while (left > 0) { | |||
|                 vl = VSETVL(left); | |||
|                 c_vec = VLEV_FLOAT(c_ptr, vl); | |||
|                 a_vec = VLEV_FLOAT(a_ptr, vl); | |||
|                 c_vec = VFNMSACVF_FLOAT(c_vec, solved, a_vec, vl); | |||
|                 VSEV_FLOAT(c_ptr, c_vec, vl); | |||
|                 a_ptr += vl; | |||
|                 c_ptr += vl; | |||
|                 left -= vl; | |||
|             } | |||
|         } | |||
|     } | |||
| } | |||
| #elif GEMM_DEFAULT_UNROLL_N == 2 | |||
| // Backward substitution on one m x n tile (GEMM_DEFAULT_UNROLL_N == 2 variant). | |||
| //   a   : packed m x m block, row i at a + i * m; a[i * m + i] multiplies the pivot row | |||
| //         (presumably a pre-inverted diagonal - confirm against the packing routine). | |||
| //   b   : receives the solved values row by row, b[i * n + j] = solved c(i, j). | |||
| //   c   : tile updated in place, element (i, j) at c[i + j * ldc]. | |||
| // Rows run from m - 1 down to 0. Columns are handled two at a time so one load of the | |||
| // a-row vector serves both columns; an odd trailing column is handled separately. | |||
| // No diagnostics are printed: this routine sits in the inner loop of the kernel. | |||
| static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { | |||
|     FLOAT aa, bb0, bb1; | |||
|     FLOAT *pa, *pc, *pc0, *pc1; | |||
|     FLOAT *pb0, *pb1; | |||
|     int i, j, k; | |||
|     size_t vl; | |||
|     FLOAT_V_T va, vc0, vc1; | |||
|     // Start at the last row of the packed triangle and of the packed output. | |||
|     a += (m - 1) * m; | |||
|     b += (m - 1) * n; | |||
|     for (i = m - 1; i >= 0; i--) { | |||
|         aa = *(a + i); | |||
|         pc = c + i;                      // element (i, 0) of the tile | |||
|         for (j = 0; j < n / 2; j++) { | |||
|             // Solve row i of columns 2j and 2j + 1. | |||
|             pb0 = pc + j * ldc * 2; | |||
|             pb1 = pb0 + ldc; | |||
|             bb0 = (*pb0) * aa; | |||
|             bb1 = (*pb1) * aa; | |||
|             *b = bb0; | |||
|             *(b + 1) = bb1; | |||
|             *pb0 = bb0; | |||
|             *pb1 = bb1; | |||
|             b += 2; | |||
|             // Update rows 0 .. i-1 of both columns. | |||
|             pc0 = c + j * ldc * 2; | |||
|             pc1 = pc0 + ldc; | |||
|             pa = a; | |||
|             for (k = i; k > 0; k -= vl) { | |||
|                 vl = VSETVL(k); | |||
|                 vc0 = VLEV_FLOAT(pc0, vl); | |||
|                 vc1 = VLEV_FLOAT(pc1, vl); | |||
|                 va = VLEV_FLOAT(pa, vl); | |||
|                 vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); | |||
|                 vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); | |||
|                 VSEV_FLOAT(pc0, vc0, vl); | |||
|                 VSEV_FLOAT(pc1, vc1, vl); | |||
|                 pa += vl; | |||
|                 pc0 += vl; | |||
|                 pc1 += vl; | |||
|             } | |||
|         } | |||
|         pc += ldc * (n / 2) * 2;         // element (i, first unpaired column) | |||
|         if (n & 1) { | |||
|             pb0 = pc; | |||
|             bb0 = (*pb0) * aa; | |||
|             *b = bb0; | |||
|             *pb0 = bb0; | |||
|             b += 1; | |||
|             pc0 = pc - i;                // row 0 of the same column | |||
|             pa = a; | |||
|             for (k = i; k > 0; k -= vl) { | |||
|                 vl = VSETVL(k); | |||
|                 vc0 = VLEV_FLOAT(pc0, vl); | |||
|                 va = VLEV_FLOAT(pa, vl); | |||
|                 vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); | |||
|                 VSEV_FLOAT(pc0, vc0, vl); | |||
|                 pa += vl; | |||
|                 pc0 += vl; | |||
|             } | |||
|         } | |||
|         a -= m; | |||
|         b -= 2 * n;                      // b advanced by n above; net effect is one row back | |||
|     } | |||
| } | |||
| #elif GEMM_DEFAULT_UNROLL_N == 4 | |||
| // Backward substitution on one m x n tile (GEMM_DEFAULT_UNROLL_N == 4 variant). | |||
| //   a   : packed m x m block, row i at a + i * m; a[i * m + i] multiplies the pivot row | |||
| //         (presumably a pre-inverted diagonal - confirm against the packing routine). | |||
| //   b   : receives the solved values row by row, b[i * n + j] = solved c(i, j). | |||
| //   c   : tile updated in place, element (i, j) at c[i + j * ldc]. | |||
| // Rows run from m - 1 down to 0. Columns are consumed in groups of 4, then 2, then 1. | |||
| // The remainder groups locate their columns through pc, which always points at | |||
| // element (i, next unprocessed column); pc - i is therefore row 0 of that column. | |||
| // This keeps the remainders correct for any n, not only power-of-two widths. | |||
| static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { | |||
|     FLOAT aa, bb0, bb1, bb2, bb3; | |||
|     FLOAT *pa, *pc, *pc0, *pc1, *pc2, *pc3; | |||
|     FLOAT *pb0, *pb1, *pb2, *pb3; | |||
|     int i, j, k; | |||
|     size_t vl; | |||
|     FLOAT_V_T va, vc0, vc1, vc2, vc3; | |||
|     a += (m - 1) * m; | |||
|     b += (m - 1) * n; | |||
|     for (i = m - 1; i >= 0; i--) { | |||
|         aa = *(a + i); | |||
|         pc = c + i; | |||
|         for (j = 0; j < n / 4; j++) { | |||
|             pb0 = pc + j * ldc * 4; | |||
|             pb1 = pb0 + ldc; | |||
|             pb2 = pb1 + ldc; | |||
|             pb3 = pb2 + ldc; | |||
|             bb0 = (*pb0) * aa; | |||
|             bb1 = (*pb1) * aa; | |||
|             bb2 = (*pb2) * aa; | |||
|             bb3 = (*pb3) * aa; | |||
|             *b = bb0; | |||
|             *(b + 1) = bb1; | |||
|             *(b + 2) = bb2; | |||
|             *(b + 3) = bb3; | |||
|             *pb0 = bb0; | |||
|             *pb1 = bb1; | |||
|             *pb2 = bb2; | |||
|             *pb3 = bb3; | |||
|             b += 4; | |||
|             pc0 = c + j * ldc * 4; | |||
|             pc1 = pc0 + ldc; | |||
|             pc2 = pc1 + ldc; | |||
|             pc3 = pc2 + ldc; | |||
|             pa = a; | |||
|             for (k = i; k > 0; k -= vl) { | |||
|                 vl = VSETVL(k); | |||
|                 vc0 = VLEV_FLOAT(pc0, vl); | |||
|                 vc1 = VLEV_FLOAT(pc1, vl); | |||
|                 vc2 = VLEV_FLOAT(pc2, vl); | |||
|                 vc3 = VLEV_FLOAT(pc3, vl); | |||
|                 va = VLEV_FLOAT(pa, vl); | |||
|                 vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); | |||
|                 vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); | |||
|                 vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl); | |||
|                 vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl); | |||
|                 VSEV_FLOAT(pc0, vc0, vl); | |||
|                 VSEV_FLOAT(pc1, vc1, vl); | |||
|                 VSEV_FLOAT(pc2, vc2, vl); | |||
|                 VSEV_FLOAT(pc3, vc3, vl); | |||
|                 pa += vl; | |||
|                 pc0 += vl; | |||
|                 pc1 += vl; | |||
|                 pc2 += vl; | |||
|                 pc3 += vl; | |||
|             } | |||
|         } | |||
|         pc += ldc * (n / 4) * 4; | |||
|         if (n & 2) { | |||
|             pb0 = pc; | |||
|             pb1 = pb0 + ldc; | |||
|             bb0 = (*pb0) * aa; | |||
|             bb1 = (*pb1) * aa; | |||
|             *b = bb0; | |||
|             *(b + 1) = bb1; | |||
|             *pb0 = bb0; | |||
|             *pb1 = bb1; | |||
|             b += 2; | |||
|             pc0 = pc - i; | |||
|             pc1 = pc0 + ldc; | |||
|             pa = a; | |||
|             for (k = i; k > 0; k -= vl) { | |||
|                 vl = VSETVL(k); | |||
|                 vc0 = VLEV_FLOAT(pc0, vl); | |||
|                 vc1 = VLEV_FLOAT(pc1, vl); | |||
|                 va = VLEV_FLOAT(pa, vl); | |||
|                 vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); | |||
|                 vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); | |||
|                 VSEV_FLOAT(pc0, vc0, vl); | |||
|                 VSEV_FLOAT(pc1, vc1, vl); | |||
|                 pa += vl; | |||
|                 pc0 += vl; | |||
|                 pc1 += vl; | |||
|             } | |||
|             pc += ldc * 2; | |||
|         } | |||
|         if (n & 1) { | |||
|             pb0 = pc; | |||
|             bb0 = (*pb0) * aa; | |||
|             *b = bb0; | |||
|             *pb0 = bb0; | |||
|             b += 1; | |||
|             pc0 = pc - i; | |||
|             pa = a; | |||
|             for (k = i; k > 0; k -= vl) { | |||
|                 vl = VSETVL(k); | |||
|                 vc0 = VLEV_FLOAT(pc0, vl); | |||
|                 va = VLEV_FLOAT(pa, vl); | |||
|                 vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); | |||
|                 VSEV_FLOAT(pc0, vc0, vl); | |||
|                 pa += vl; | |||
|                 pc0 += vl; | |||
|             } | |||
|         } | |||
|         a -= m; | |||
|         b -= 2 * n;                      // b advanced by n above; net effect is one row back | |||
|     } | |||
| } | |||
| #elif GEMM_DEFAULT_UNROLL_N == 8 | |||
| // Backward substitution on one m x n tile (GEMM_DEFAULT_UNROLL_N == 8 variant). | |||
| //   a   : packed m x m block, row i at a + i * m; a[i * m + i] multiplies the pivot row | |||
| //         (presumably a pre-inverted diagonal - confirm against the packing routine). | |||
| //   b   : receives the solved values row by row, b[i * n + j] = solved c(i, j). | |||
| //   c   : tile updated in place, element (i, j) at c[i + j * ldc]. | |||
| // Rows run from m - 1 down to 0. Columns are consumed in groups of 8, then 4, 2 and 1. | |||
| // The remainder groups locate their columns through pc, which always points at | |||
| // element (i, next unprocessed column); pc - i is therefore row 0 of that column. | |||
| // This keeps the remainders correct for any n, not only power-of-two widths. | |||
| static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { | |||
|     FLOAT aa, bb0, bb1, bb2, bb3, bb4, bb5, bb6, bb7; | |||
|     FLOAT *pa, *pc, *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7; | |||
|     FLOAT *pb0, *pb1, *pb2, *pb3, *pb4, *pb5, *pb6, *pb7; | |||
|     int i, j, k; | |||
|     size_t vl; | |||
|     FLOAT_V_T va, vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7; | |||
|     a += (m - 1) * m; | |||
|     b += (m - 1) * n; | |||
|     for (i = m - 1; i >= 0; i--) { | |||
|         aa = *(a + i); | |||
|         pc = c + i; | |||
|         for (j = 0; j < n / 8; j++) { | |||
|             pb0 = pc + j * ldc * 8; | |||
|             pb1 = pb0 + ldc; | |||
|             pb2 = pb1 + ldc; | |||
|             pb3 = pb2 + ldc; | |||
|             pb4 = pb3 + ldc; | |||
|             pb5 = pb4 + ldc; | |||
|             pb6 = pb5 + ldc; | |||
|             pb7 = pb6 + ldc; | |||
|             bb0 = (*pb0) * aa; | |||
|             bb1 = (*pb1) * aa; | |||
|             bb2 = (*pb2) * aa; | |||
|             bb3 = (*pb3) * aa; | |||
|             bb4 = (*pb4) * aa; | |||
|             bb5 = (*pb5) * aa; | |||
|             bb6 = (*pb6) * aa; | |||
|             bb7 = (*pb7) * aa; | |||
|             *b = bb0; | |||
|             *(b + 1) = bb1; | |||
|             *(b + 2) = bb2; | |||
|             *(b + 3) = bb3; | |||
|             *(b + 4) = bb4; | |||
|             *(b + 5) = bb5; | |||
|             *(b + 6) = bb6; | |||
|             *(b + 7) = bb7; | |||
|             *pb0 = bb0; | |||
|             *pb1 = bb1; | |||
|             *pb2 = bb2; | |||
|             *pb3 = bb3; | |||
|             *pb4 = bb4; | |||
|             *pb5 = bb5; | |||
|             *pb6 = bb6; | |||
|             *pb7 = bb7; | |||
|             b += 8; | |||
|             pc0 = c + j * ldc * 8; | |||
|             pc1 = pc0 + ldc; | |||
|             pc2 = pc1 + ldc; | |||
|             pc3 = pc2 + ldc; | |||
|             pc4 = pc3 + ldc; | |||
|             pc5 = pc4 + ldc; | |||
|             pc6 = pc5 + ldc; | |||
|             pc7 = pc6 + ldc; | |||
|             pa = a; | |||
|             for (k = i; k > 0; k -= vl) { | |||
|                 vl = VSETVL(k); | |||
|                 vc0 = VLEV_FLOAT(pc0, vl); | |||
|                 vc1 = VLEV_FLOAT(pc1, vl); | |||
|                 vc2 = VLEV_FLOAT(pc2, vl); | |||
|                 vc3 = VLEV_FLOAT(pc3, vl); | |||
|                 vc4 = VLEV_FLOAT(pc4, vl); | |||
|                 vc5 = VLEV_FLOAT(pc5, vl); | |||
|                 vc6 = VLEV_FLOAT(pc6, vl); | |||
|                 vc7 = VLEV_FLOAT(pc7, vl); | |||
|                 va = VLEV_FLOAT(pa, vl); | |||
|                 vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); | |||
|                 vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); | |||
|                 vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl); | |||
|                 vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl); | |||
|                 vc4 = VFNMSACVF_FLOAT(vc4, bb4, va, vl); | |||
|                 vc5 = VFNMSACVF_FLOAT(vc5, bb5, va, vl); | |||
|                 vc6 = VFNMSACVF_FLOAT(vc6, bb6, va, vl); | |||
|                 vc7 = VFNMSACVF_FLOAT(vc7, bb7, va, vl); | |||
|                 VSEV_FLOAT(pc0, vc0, vl); | |||
|                 VSEV_FLOAT(pc1, vc1, vl); | |||
|                 VSEV_FLOAT(pc2, vc2, vl); | |||
|                 VSEV_FLOAT(pc3, vc3, vl); | |||
|                 VSEV_FLOAT(pc4, vc4, vl); | |||
|                 VSEV_FLOAT(pc5, vc5, vl); | |||
|                 VSEV_FLOAT(pc6, vc6, vl); | |||
|                 VSEV_FLOAT(pc7, vc7, vl); | |||
|                 pa += vl; | |||
|                 pc0 += vl; | |||
|                 pc1 += vl; | |||
|                 pc2 += vl; | |||
|                 pc3 += vl; | |||
|                 pc4 += vl; | |||
|                 pc5 += vl; | |||
|                 pc6 += vl; | |||
|                 pc7 += vl; | |||
|             } | |||
|         } | |||
|         pc += ldc * (n / 8) * 8; | |||
|         if (n & 4) { | |||
|             pb0 = pc; | |||
|             pb1 = pb0 + ldc; | |||
|             pb2 = pb1 + ldc; | |||
|             pb3 = pb2 + ldc; | |||
|             bb0 = (*pb0) * aa; | |||
|             bb1 = (*pb1) * aa; | |||
|             bb2 = (*pb2) * aa; | |||
|             bb3 = (*pb3) * aa; | |||
|             *b = bb0; | |||
|             *(b + 1) = bb1; | |||
|             *(b + 2) = bb2; | |||
|             *(b + 3) = bb3; | |||
|             *pb0 = bb0; | |||
|             *pb1 = bb1; | |||
|             *pb2 = bb2; | |||
|             *pb3 = bb3; | |||
|             b += 4; | |||
|             pc0 = pc - i; | |||
|             pc1 = pc0 + ldc; | |||
|             pc2 = pc1 + ldc; | |||
|             pc3 = pc2 + ldc; | |||
|             pa = a; | |||
|             for (k = i; k > 0; k -= vl) { | |||
|                 vl = VSETVL(k); | |||
|                 vc0 = VLEV_FLOAT(pc0, vl); | |||
|                 vc1 = VLEV_FLOAT(pc1, vl); | |||
|                 vc2 = VLEV_FLOAT(pc2, vl); | |||
|                 vc3 = VLEV_FLOAT(pc3, vl); | |||
|                 va = VLEV_FLOAT(pa, vl); | |||
|                 vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); | |||
|                 vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); | |||
|                 vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl); | |||
|                 vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl); | |||
|                 VSEV_FLOAT(pc0, vc0, vl); | |||
|                 VSEV_FLOAT(pc1, vc1, vl); | |||
|                 VSEV_FLOAT(pc2, vc2, vl); | |||
|                 VSEV_FLOAT(pc3, vc3, vl); | |||
|                 pa += vl; | |||
|                 pc0 += vl; | |||
|                 pc1 += vl; | |||
|                 pc2 += vl; | |||
|                 pc3 += vl; | |||
|             } | |||
|             pc += ldc * 4; | |||
|         } | |||
|         if (n & 2) { | |||
|             pb0 = pc; | |||
|             pb1 = pb0 + ldc; | |||
|             bb0 = (*pb0) * aa; | |||
|             bb1 = (*pb1) * aa; | |||
|             *b = bb0; | |||
|             *(b + 1) = bb1; | |||
|             *pb0 = bb0; | |||
|             *pb1 = bb1; | |||
|             b += 2; | |||
|             pc0 = pc - i; | |||
|             pc1 = pc0 + ldc; | |||
|             pa = a; | |||
|             for (k = i; k > 0; k -= vl) { | |||
|                 vl = VSETVL(k); | |||
|                 vc0 = VLEV_FLOAT(pc0, vl); | |||
|                 vc1 = VLEV_FLOAT(pc1, vl); | |||
|                 va = VLEV_FLOAT(pa, vl); | |||
|                 vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); | |||
|                 vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); | |||
|                 VSEV_FLOAT(pc0, vc0, vl); | |||
|                 VSEV_FLOAT(pc1, vc1, vl); | |||
|                 pa += vl; | |||
|                 pc0 += vl; | |||
|                 pc1 += vl; | |||
|             } | |||
|             pc += ldc * 2; | |||
|         } | |||
|         if (n & 1) { | |||
|             pb0 = pc; | |||
|             bb0 = (*pb0) * aa; | |||
|             *b = bb0; | |||
|             *pb0 = bb0; | |||
|             b += 1; | |||
|             pc0 = pc - i; | |||
|             pa = a; | |||
|             for (k = i; k > 0; k -= vl) { | |||
|                 vl = VSETVL(k); | |||
|                 vc0 = VLEV_FLOAT(pc0, vl); | |||
|                 va = VLEV_FLOAT(pa, vl); | |||
|                 vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); | |||
|                 VSEV_FLOAT(pc0, vc0, vl); | |||
|                 pa += vl; | |||
|                 pc0 += vl; | |||
|             } | |||
|         } | |||
|         a -= m; | |||
|         b -= 2 * n;                      // b advanced by n above; net effect is one row back | |||
|     } | |||
| } | |||
| #else | |||
| // Scalar backward substitution on one m x n tile; used when GEMM_DEFAULT_UNROLL_N | |||
| // is none of 1, 2, 4, 8. | |||
| //   a   : packed m x m block, row i at a + i * m; a[i * m + i] multiplies the pivot row | |||
| //         (presumably a pre-inverted diagonal - confirm against the packing routine). | |||
| //   b   : receives the solved values row by row, b[i * n + j] = solved c(i, j). | |||
| //   c   : tile updated in place, element (i, j) at c[i + j * ldc]. | |||
| static inline void solve_generic(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { | |||
|     FLOAT aa, bb; | |||
|     int i, j, k; | |||
|     a += (m - 1) * m; | |||
|     b += (m - 1) * n; | |||
|     for (i = m - 1; i >= 0; i--) { | |||
|         aa = *(a + i); | |||
|         for (j = 0; j < n; j++) { | |||
|             bb = *(c + i + j * ldc); | |||
|             bb *= aa; | |||
|             *b = bb; | |||
|             *(c + i + j * ldc) = bb; | |||
|             b++; | |||
|             // Eliminate the solved value from rows 0 .. i-1 of column j. | |||
|             for (k = 0; k < i; k++) { | |||
|                 *(c + k + j * ldc) -= bb * *(a + k); | |||
|             } | |||
|         } | |||
|         a -= m; | |||
|         b -= 2 * n;                      // b advanced by n above; net effect is one row back | |||
|     } | |||
| } | |||
| // CNAME calls solve(); forward to the scalar routine so this configuration also | |||
| // compiles and links instead of leaving solve() undefined. | |||
| static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { | |||
|     solve_generic(m, n, a, b, c, ldc); | |||
| } | |||
| #endif | |||
| #else | |||
| // Complex backward substitution on one m x n tile (scalar code, interleaved re/im). | |||
| //   a   : packed m x m complex block, row i at a + 2 * i * m; entry (i, i) multiplies | |||
| //         the pivot row (presumably a pre-inverted diagonal - confirm against packing). | |||
| //   b   : receives the solved values, complex element (i, j) at b + 2 * (i * n + j). | |||
| //   c   : tile updated in place, complex element (i, j) at c + 2 * (i + j * ldc). | |||
| // With CONJ defined the products use the conjugate of the a entries. | |||
| static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { | |||
|     const BLASLONG col_stride = ldc * 2; | |||
|     int row, col, up; | |||
|     for (row = m - 1; row >= 0; row--) { | |||
|         FLOAT *a_row = a + row * m * 2; | |||
|         FLOAT *b_out = b + row * n * 2; | |||
|         const FLOAT aa1 = a_row[row * 2 + 0]; | |||
|         const FLOAT aa2 = a_row[row * 2 + 1]; | |||
|         for (col = 0; col < n; col++) { | |||
|             FLOAT *c_col = c + col * col_stride; | |||
|             const FLOAT bb1 = c_col[row * 2 + 0]; | |||
|             const FLOAT bb2 = c_col[row * 2 + 1]; | |||
|             FLOAT cc1, cc2; | |||
| #ifndef CONJ | |||
|             cc1 = aa1 * bb1 - aa2 * bb2; | |||
|             cc2 = aa1 * bb2 + aa2 * bb1; | |||
| #else | |||
|             cc1 = aa1 * bb1 + aa2 * bb2; | |||
|             cc2 = aa1 * bb2 - aa2 * bb1; | |||
| #endif | |||
|             b_out[0] = cc1; | |||
|             b_out[1] = cc2; | |||
|             c_col[row * 2 + 0] = cc1; | |||
|             c_col[row * 2 + 1] = cc2; | |||
|             b_out += 2; | |||
|             // Eliminate the solved value from rows 0 .. row-1 of this column. | |||
|             for (up = 0; up < row; up++) { | |||
| #ifndef CONJ | |||
|                 c_col[up * 2 + 0] -= cc1 * a_row[up * 2 + 0] - cc2 * a_row[up * 2 + 1]; | |||
|                 c_col[up * 2 + 1] -= cc1 * a_row[up * 2 + 1] + cc2 * a_row[up * 2 + 0]; | |||
| #else | |||
|                 c_col[up * 2 + 0] -= cc1 * a_row[up * 2 + 0] + cc2 * a_row[up * 2 + 1]; | |||
|                 c_col[up * 2 + 1] -= - cc1 * a_row[up * 2 + 1] + cc2 * a_row[up * 2 + 0]; | |||
| #endif | |||
|             } | |||
|         } | |||
|     } | |||
| } | |||
| #endif | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, /* TRSM "LN" kernel driver: for each column panel of c it sweeps the m rows from the bottom up in blocks of at most vl rows, applying GEMM_KERNEL (alpha = dm1) when k - kk > 0 and then solve() on the block. dummy1 (and dummy2) are not referenced in the body. Always returns 0. */ | |||
| #ifdef COMPLEX | |||
| FLOAT dummy2, | |||
| #endif | |||
| FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ | |||
| BLASLONG i, j; | |||
| FLOAT *aa, *cc; | |||
| BLASLONG kk; | |||
| size_t vl = VSETVL_MAX; /* row-block height: the maximum vector length for the configured element type */ | |||
| //fprintf(stderr, "%s , %s, m = %4ld n = %4ld k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug | |||
| j = (n >> GEMM_UNROLL_N_SHIFT); /* number of full GEMM_UNROLL_N-column panels */ | |||
| while (j > 0) { | |||
| kk = m + offset; /* reset for every panel; reduced by the number of rows handled below */ | |||
| i = m % vl; /* leftover rows that do not fill a vl-row block; they sit at the bottom and go first */ | |||
| if (i) { | |||
| aa = a + (m - i) * k * COMPSIZE; /* packed a data for the last i rows */ | |||
| cc = c + (m - i) * COMPSIZE; /* the same i rows inside the c panel */ | |||
| if (k - kk > 0) { /* GEMM update over depth k - kk, starting at depth kk, before solving */ | |||
| GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, | |||
| #ifdef COMPLEX | |||
| ZERO, | |||
| #endif | |||
| aa + i * kk * COMPSIZE, | |||
| b + GEMM_UNROLL_N * kk * COMPSIZE, | |||
| cc, | |||
| ldc); | |||
| } | |||
| solve(i, GEMM_UNROLL_N, /* triangular solve on the i x GEMM_UNROLL_N block; triangle and output start at depth kk - i */ | |||
| aa + (kk - i) * i * COMPSIZE, | |||
| b + (kk - i) * GEMM_UNROLL_N * COMPSIZE, | |||
| cc, ldc); | |||
| kk -= i; | |||
| } | |||
| int mod = i; /* remember the leftover row count before i is reused as a row counter */ | |||
| i = vl; | |||
| if (i <= m) { /* at least one full vl-row block exists */ | |||
| aa = a + (m - mod - vl) * k * COMPSIZE; /* lowest full block, directly above the leftover rows */ | |||
| cc = c + (m - mod - vl) * COMPSIZE; | |||
| do { /* walk the full blocks upward: aa, cc and kk each step back by vl rows per iteration */ | |||
| if (k - kk > 0) { | |||
| GEMM_KERNEL(vl, GEMM_UNROLL_N, k - kk, dm1, | |||
| #ifdef COMPLEX | |||
| ZERO, | |||
| #endif | |||
| aa + vl * kk * COMPSIZE, | |||
| b + GEMM_UNROLL_N * kk * COMPSIZE, | |||
| cc, | |||
| ldc); | |||
| } | |||
| solve(vl, GEMM_UNROLL_N, | |||
| aa + (kk - vl) * vl * COMPSIZE, | |||
| b + (kk - vl) * GEMM_UNROLL_N * COMPSIZE, | |||
| cc, ldc); | |||
| aa -= vl * k * COMPSIZE; | |||
| cc -= vl * COMPSIZE; | |||
| kk -= vl; | |||
| i += vl; /* i counts rows consumed by full blocks so far, plus vl */ | |||
| } while (i <= m); | |||
| } | |||
| b += GEMM_UNROLL_N * k * COMPSIZE; /* advance to the next packed b panel */ | |||
| c += GEMM_UNROLL_N * ldc * COMPSIZE; /* advance c by GEMM_UNROLL_N columns */ | |||
| j --; | |||
| } | |||
| if (n & (GEMM_UNROLL_N - 1)) { /* columns left over after the full panels */ | |||
| j = (GEMM_UNROLL_N >> 1); /* try panel widths GEMM_UNROLL_N/2, GEMM_UNROLL_N/4, ..., 1 */ | |||
| while (j > 0) { | |||
| if (n & j) { /* a j-column panel is present: same bottom-up row sweep as above, with width j */ | |||
| kk = m + offset; | |||
| i = m % vl; | |||
| if (i) { | |||
| aa = a + (m - i) * k * COMPSIZE; | |||
| cc = c + (m - i) * COMPSIZE; | |||
| if (k - kk > 0) { | |||
| GEMM_KERNEL(i, j, k - kk, dm1, | |||
| #ifdef COMPLEX | |||
| ZERO, | |||
| #endif | |||
| aa + i * kk * COMPSIZE, | |||
| b + j * kk * COMPSIZE, | |||
| cc, ldc); | |||
| } | |||
| solve(i, j, | |||
| aa + (kk - i) * i * COMPSIZE, | |||
| b + (kk - i) * j * COMPSIZE, | |||
| cc, ldc); | |||
| kk -= i; | |||
| } | |||
| int mod = i; | |||
| i = vl; | |||
| if (i <= m) { | |||
| aa = a + (m - mod - vl) * k * COMPSIZE; | |||
| cc = c + (m - mod - vl) * COMPSIZE; | |||
| do { | |||
| if (k - kk > 0) { | |||
| GEMM_KERNEL(vl, j, k - kk, dm1, | |||
| #ifdef COMPLEX | |||
| ZERO, | |||
| #endif | |||
| aa + vl * kk * COMPSIZE, | |||
| b + j * kk * COMPSIZE, | |||
| cc, | |||
| ldc); | |||
| } | |||
| solve(vl, j, | |||
| aa + (kk - vl) * vl * COMPSIZE, | |||
| b + (kk - vl) * j * COMPSIZE, | |||
| cc, ldc); | |||
| aa -= vl * k * COMPSIZE; | |||
| cc -= vl * COMPSIZE; | |||
| kk -= vl; | |||
| i += vl; | |||
| } while (i <= m); | |||
| } | |||
| b += j * k * COMPSIZE; /* skip the j-column packed b panel just processed */ | |||
| c += j * ldc * COMPSIZE; | |||
| } | |||
| j >>= 1; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,840 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) /* DOUBLE not defined: map the generic names onto the e32 / f32 m2 RVV intrinsics */ | |||
| #define VSETVL(n) vsetvl_e32m2(n) /* vector length to use for n remaining elements */ | |||
| #define VSETVL_MAX vsetvlmax_e32m2() /* largest vector length for this element width and grouping */ | |||
| #define FLOAT_V_T vfloat32m2_t /* vector value type used by the solve routines */ | |||
| #define VLEV_FLOAT vle32_v_f32m2 /* unit-stride vector load */ | |||
| #define VLSEV_FLOAT vlse32_v_f32m2 /* strided vector load */ | |||
| #define VLSEG2_FLOAT vlseg2e32_v_f32m2 /* two-field segment load */ | |||
| #define VSEV_FLOAT vse32_v_f32m2 /* unit-stride vector store */ | |||
| #define VSSEV_FLOAT vsse32_v_f32m2 /* strided vector store */ | |||
| #define VSSEG2_FLOAT vsseg2e32_v_f32m2 /* two-field segment store */ | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f32m2 /* vector += scalar * vector */ | |||
| #define VFMULVF_FLOAT vfmul_vf_f32m2 /* vector * scalar */ | |||
| #define VFNMSACVF_FLOAT vfnmsac_vf_f32m2 /* vector -= scalar * vector; the update step used by solve() */ | |||
| #else /* DOUBLE defined: same names mapped onto the e64 / f64 m2 intrinsics */ | |||
| #define VSETVL(n) vsetvl_e64m2(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m2() | |||
| #define FLOAT_V_T vfloat64m2_t | |||
| #define VLEV_FLOAT vle64_v_f64m2 | |||
| #define VLSEV_FLOAT vlse64_v_f64m2 | |||
| #define VLSEG2_FLOAT vlseg2e64_v_f64m2 | |||
| #define VSEV_FLOAT vse64_v_f64m2 | |||
| #define VSSEV_FLOAT vsse64_v_f64m2 | |||
| #define VSSEG2_FLOAT vsseg2e64_v_f64m2 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f64m2 | |||
| #define VFMULVF_FLOAT vfmul_vf_f64m2 | |||
| #define VFNMSACVF_FLOAT vfnmsac_vf_f64m2 | |||
| #endif | |||
| static FLOAT dm1 = -1.; /* constant -1; presumably the alpha passed to GEMM_KERNEL by this file's driver, which is outside the visible chunk - confirm */ | |||
| #ifdef CONJ /* CONJ selects which GEMM kernel variant the name GEMM_KERNEL refers to */ | |||
| #define GEMM_KERNEL GEMM_KERNEL_L | |||
| #else | |||
| #define GEMM_KERNEL GEMM_KERNEL_N | |||
| #endif | |||
| #if GEMM_DEFAULT_UNROLL_N == 1 /* GEMM_UNROLL_N_SHIFT = log2(GEMM_DEFAULT_UNROLL_N); it stays undefined for any value other than 1, 2, 4, 8, 16 */ | |||
| #define GEMM_UNROLL_N_SHIFT 0 | |||
| #endif | |||
| #if GEMM_DEFAULT_UNROLL_N == 2 | |||
| #define GEMM_UNROLL_N_SHIFT 1 | |||
| #endif | |||
| #if GEMM_DEFAULT_UNROLL_N == 4 | |||
| #define GEMM_UNROLL_N_SHIFT 2 | |||
| #endif | |||
| #if GEMM_DEFAULT_UNROLL_N == 8 | |||
| #define GEMM_UNROLL_N_SHIFT 3 | |||
| #endif | |||
| #if GEMM_DEFAULT_UNROLL_N == 16 | |||
| #define GEMM_UNROLL_N_SHIFT 4 | |||
| #endif | |||
| // Optimizes the implementation in ../arm64/trsm_kernel_LT_sve.c | |||
| #ifndef COMPLEX | |||
| #if GEMM_DEFAULT_UNROLL_N == 1 | |||
| // Forward substitution on one m x n tile (GEMM_DEFAULT_UNROLL_N == 1 variant). | |||
| //   a   : packed m x m block, row i at a + i * m; a[i * m + i] multiplies the pivot row | |||
| //         (presumably a pre-inverted diagonal - confirm against the packing routine). | |||
| //   b   : receives the solved values in order, b[i * n + j] = solved c(i, j). | |||
| //   c   : tile updated in place, element (i, j) at c[i + j * ldc]. | |||
| // Rows are visited from 0 up to m - 1; after row i is solved, rows i+1 .. m-1 of the | |||
| // same column are updated with c(k, j) -= c(i, j) * a[i * m + k]. | |||
| static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) | |||
| { | |||
|     int row, col, left; | |||
|     size_t vl; | |||
|     FLOAT_V_T a_vec, c_vec; | |||
|     for (row = 0; row < m; row++) { | |||
|         FLOAT *a_row = a + row * m; | |||
|         const FLOAT diag = a_row[row]; | |||
|         for (col = 0; col < n; col++) { | |||
|             FLOAT *pivot = c + row + col * ldc; | |||
|             const FLOAT solved = (*pivot) * diag; | |||
|             *b = solved; | |||
|             *pivot = solved; | |||
|             b++; | |||
|             // Eliminate the solved value from the rows below, vl elements at a time. | |||
|             FLOAT *a_ptr = a_row + row + 1; | |||
|             FLOAT *c_ptr = pivot + 1; | |||
|             left = (m - row - 1); | |||
|             while (left > 0) { | |||
|                 vl = VSETVL(left); | |||
|                 c_vec = VLEV_FLOAT(c_ptr, vl); | |||
|                 a_vec = VLEV_FLOAT(a_ptr, vl); | |||
|                 c_vec = VFNMSACVF_FLOAT(c_vec, solved, a_vec, vl); | |||
|                 VSEV_FLOAT(c_ptr, c_vec, vl); | |||
|                 a_ptr += vl; | |||
|                 c_ptr += vl; | |||
|                 left -= vl; | |||
|             } | |||
|         } | |||
|     } | |||
| } | |||
| #elif GEMM_DEFAULT_UNROLL_N == 2 | |||
| // Forward substitution on one m x n tile (GEMM_DEFAULT_UNROLL_N == 2 variant). | |||
| //   a   : packed m x m block, row i at a + i * m; a[i * m + i] multiplies the pivot row | |||
| //         (presumably a pre-inverted diagonal - confirm against the packing routine). | |||
| //   b   : receives the solved values in order, b[i * n + j] = solved c(i, j). | |||
| //   c   : tile updated in place, element (i, j) at c[i + j * ldc]. | |||
| // Rows run from 0 up to m - 1. Columns are handled two at a time so one load of the | |||
| // a-row vector serves both columns; an odd trailing column is handled separately and | |||
| // is written back through pb0, i.e. to the column it was read from. | |||
| static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) | |||
| { | |||
|     FLOAT aa, bb0, bb1; | |||
|     FLOAT *pa, *pc, *pc0, *pc1; | |||
|     FLOAT *pb0, *pb1; | |||
|     int i, j, k; | |||
|     size_t vl; | |||
|     FLOAT_V_T va, vc0, vc1; | |||
|     for (i = 0; i < m; i++) { | |||
|         aa = *(a + i); | |||
|         pc = c + i;                      // element (i, 0) of the tile | |||
|         for (j = 0; j < n / 2; j++) { | |||
|             // Solve row i of columns 2j and 2j + 1. | |||
|             pb0 = pc + j * ldc * 2; | |||
|             pb1 = pb0 + ldc; | |||
|             bb0 = (*pb0) * aa; | |||
|             bb1 = (*pb1) * aa; | |||
|             *b = bb0; | |||
|             *(b + 1) = bb1; | |||
|             *pb0 = bb0; | |||
|             *pb1 = bb1; | |||
|             b += 2; | |||
|             // Update rows i+1 .. m-1 of both columns. | |||
|             pa = a + i + 1; | |||
|             pc0 = pb0 + 1; | |||
|             pc1 = pc0 + ldc; | |||
|             for (k = (m - i - 1); k > 0; k -= vl) { | |||
|                 vl = VSETVL(k); | |||
|                 vc0 = VLEV_FLOAT(pc0, vl); | |||
|                 vc1 = VLEV_FLOAT(pc1, vl); | |||
|                 va = VLEV_FLOAT(pa, vl); | |||
|                 vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); | |||
|                 vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); | |||
|                 VSEV_FLOAT(pc0, vc0, vl); | |||
|                 VSEV_FLOAT(pc1, vc1, vl); | |||
|                 pa += vl; | |||
|                 pc0 += vl; | |||
|                 pc1 += vl; | |||
|             } | |||
|         } | |||
|         pc += ldc * (n / 2) * 2;         // element (i, first unpaired column) | |||
|         if (n & 1) { | |||
|             pb0 = pc; | |||
|             bb0 = *(pb0); | |||
|             bb0 *= aa; | |||
|             *b = bb0; | |||
|             *pb0 = bb0; | |||
|             b++; | |||
|             pa = a + i + 1; | |||
|             pc0 = pb0 + 1; | |||
|             for (k = (m - i - 1); k > 0; k -= vl) { | |||
|                 vl = VSETVL(k); | |||
|                 vc0 = VLEV_FLOAT(pc0, vl); | |||
|                 va = VLEV_FLOAT(pa, vl); | |||
|                 vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); | |||
|                 VSEV_FLOAT(pc0, vc0, vl); | |||
|                 pa += vl; | |||
|                 pc0 += vl; | |||
|             } | |||
|         } | |||
|         a += m; | |||
|     } | |||
| } | |||
| #elif GEMM_DEFAULT_UNROLL_N == 4 | |||
| // Forward substitution on one m x n tile (GEMM_DEFAULT_UNROLL_N == 4 variant). | |||
| //   a   : packed m x m block, row i at a + i * m; a[i * m + i] multiplies the pivot row | |||
| //         (presumably a pre-inverted diagonal - confirm against the packing routine). | |||
| //   b   : receives the solved values in order, b[i * n + j] = solved c(i, j). | |||
| //   c   : tile updated in place, element (i, j) at c[i + j * ldc]. | |||
| // Rows run from 0 up to m - 1. Columns are consumed in groups of 4, then 2, then 1. | |||
| // Each 4-column group is addressed with its own j offset and every remainder writes | |||
| // back through pb0, so the routine stays correct for any n, not only n <= 4. | |||
| static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) | |||
| { | |||
|     FLOAT aa, bb0, bb1, bb2, bb3; | |||
|     FLOAT *pa, *pc; | |||
|     FLOAT *pc0, *pc1, *pc2, *pc3; | |||
|     FLOAT *pb0, *pb1, *pb2, *pb3; | |||
|     int i, j, k; | |||
|     size_t vl; | |||
|     FLOAT_V_T va; | |||
|     FLOAT_V_T vc0, vc1, vc2, vc3; | |||
|     for (i = 0; i < m; i++) { | |||
|         aa = *(a + i); | |||
|         pc = c + i;                      // element (i, 0) of the tile | |||
|         for (j = 0; j < n / 4; j++) { | |||
|             pb0 = pc + j * ldc * 4; | |||
|             pb1 = pb0 + ldc; | |||
|             pb2 = pb1 + ldc; | |||
|             pb3 = pb2 + ldc; | |||
|             bb0 = (*pb0) * aa; | |||
|             bb1 = (*pb1) * aa; | |||
|             bb2 = (*pb2) * aa; | |||
|             bb3 = (*pb3) * aa; | |||
|             *b = bb0; | |||
|             *(b + 1) = bb1; | |||
|             *(b + 2) = bb2; | |||
|             *(b + 3) = bb3; | |||
|             *pb0 = bb0; | |||
|             *pb1 = bb1; | |||
|             *pb2 = bb2; | |||
|             *pb3 = bb3; | |||
|             b += 4; | |||
|             // Update rows i+1 .. m-1 of the four columns. | |||
|             pa = a + i + 1; | |||
|             pc0 = pb0 + 1; | |||
|             pc1 = pc0 + ldc; | |||
|             pc2 = pc1 + ldc; | |||
|             pc3 = pc2 + ldc; | |||
|             for (k = (m - i - 1); k > 0; k -= vl) { | |||
|                 vl = VSETVL(k); | |||
|                 vc0 = VLEV_FLOAT(pc0, vl); | |||
|                 vc1 = VLEV_FLOAT(pc1, vl); | |||
|                 vc2 = VLEV_FLOAT(pc2, vl); | |||
|                 vc3 = VLEV_FLOAT(pc3, vl); | |||
|                 va = VLEV_FLOAT(pa, vl); | |||
|                 vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); | |||
|                 vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); | |||
|                 vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl); | |||
|                 vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl); | |||
|                 VSEV_FLOAT(pc0, vc0, vl); | |||
|                 VSEV_FLOAT(pc1, vc1, vl); | |||
|                 VSEV_FLOAT(pc2, vc2, vl); | |||
|                 VSEV_FLOAT(pc3, vc3, vl); | |||
|                 pa += vl; | |||
|                 pc0 += vl; | |||
|                 pc1 += vl; | |||
|                 pc2 += vl; | |||
|                 pc3 += vl; | |||
|             } | |||
|         } | |||
|         pc += ldc * (n / 4) * 4;         // element (i, first column after the 4-groups) | |||
|         if (n & 2) { | |||
|             pb0 = pc; | |||
|             pb1 = pb0 + ldc; | |||
|             bb0 = (*pb0) * aa; | |||
|             bb1 = (*pb1) * aa; | |||
|             *b = bb0; | |||
|             *(b + 1) = bb1; | |||
|             *pb0 = bb0; | |||
|             *pb1 = bb1; | |||
|             b += 2; | |||
|             pa = a + i + 1; | |||
|             pc0 = pb0 + 1; | |||
|             pc1 = pc0 + ldc; | |||
|             for (k = (m - i - 1); k > 0; k -= vl) { | |||
|                 vl = VSETVL(k); | |||
|                 vc0 = VLEV_FLOAT(pc0, vl); | |||
|                 vc1 = VLEV_FLOAT(pc1, vl); | |||
|                 va = VLEV_FLOAT(pa, vl); | |||
|                 vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); | |||
|                 vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); | |||
|                 VSEV_FLOAT(pc0, vc0, vl); | |||
|                 VSEV_FLOAT(pc1, vc1, vl); | |||
|                 pa += vl; | |||
|                 pc0 += vl; | |||
|                 pc1 += vl; | |||
|             } | |||
|             pc += ldc * 2; | |||
|         } | |||
|         if (n & 1) { | |||
|             pb0 = pc; | |||
|             bb0 = *(pb0); | |||
|             bb0 *= aa; | |||
|             *b = bb0; | |||
|             *pb0 = bb0; | |||
|             b++; | |||
|             pa = a + i + 1; | |||
|             pc0 = pb0 + 1; | |||
|             for (k = (m - i - 1); k > 0; k -= vl) { | |||
|                 vl = VSETVL(k); | |||
|                 vc0 = VLEV_FLOAT(pc0, vl); | |||
|                 va = VLEV_FLOAT(pa, vl); | |||
|                 vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); | |||
|                 VSEV_FLOAT(pc0, vc0, vl); | |||
|                 pa += vl; | |||
|                 pc0 += vl; | |||
|             } | |||
|         } | |||
|         a += m; | |||
|     } | |||
| } | |||
| #elif GEMM_DEFAULT_UNROLL_N == 8 | |||
| /* Real TRSM "LT" solve of one m x n tile; variant built when GEMM_DEFAULT_UNROLL_N == 8. | |||
| * a : packed triangular factor, read one row of m values per step i (a is advanced by m); a[i] multiplies the pivot, so it presumably holds the already inverted diagonal - confirm against the packing routine. | |||
| * b : output buffer; receives the n solved values of row i, for every i in turn. | |||
| * c : column-major tile with leading dimension ldc, overwritten in place. | |||
| * Columns are handled in groups of 8, then 4, 2 and 1. For each group the pivot row is scaled, then rows i+1..m-1 of the same columns are updated with vector loads that run down the column (unit stride). | |||
| * The single-column tail stores through pb0 (the current column) so that it stays correct when it follows wider groups, i.e. for odd n > 1. */ | |||
| static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) | |||
| { | |||
| FLOAT aa, bb0, bb1, bb2, bb3, bb4, bb5, bb6, bb7; | |||
| FLOAT *pa, *pc; | |||
| FLOAT *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7; | |||
| FLOAT *pb0, *pb1, *pb2, *pb3, *pb4, *pb5, *pb6, *pb7; | |||
| int i, j, k; | |||
| size_t vl; | |||
| FLOAT_V_T va; | |||
| FLOAT_V_T vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7; | |||
| for (i = 0; i < m; i++) | |||
| { | |||
| aa = *(a + i); | |||
| pc = c + i; | |||
| /* full groups of eight columns */ | |||
| for (j = 0; j < n/8; j ++) | |||
| { | |||
| pb0 = pc + j * ldc * 8; | |||
| pb1 = pb0 + ldc; | |||
| pb2 = pb1 + ldc; | |||
| pb3 = pb2 + ldc; | |||
| pb4 = pb3 + ldc; | |||
| pb5 = pb4 + ldc; | |||
| pb6 = pb5 + ldc; | |||
| pb7 = pb6 + ldc; | |||
| bb0 = (*pb0) * aa; | |||
| bb1 = (*pb1) * aa; | |||
| bb2 = (*pb2) * aa; | |||
| bb3 = (*pb3) * aa; | |||
| bb4 = (*pb4) * aa; | |||
| bb5 = (*pb5) * aa; | |||
| bb6 = (*pb6) * aa; | |||
| bb7 = (*pb7) * aa; | |||
| *b = bb0; | |||
| *(b+1) = bb1; | |||
| *(b+2) = bb2; | |||
| *(b+3) = bb3; | |||
| *(b+4) = bb4; | |||
| *(b+5) = bb5; | |||
| *(b+6) = bb6; | |||
| *(b+7) = bb7; | |||
| *pb0 = bb0; | |||
| *pb1 = bb1; | |||
| *pb2 = bb2; | |||
| *pb3 = bb3; | |||
| *pb4 = bb4; | |||
| *pb5 = bb5; | |||
| *pb6 = bb6; | |||
| *pb7 = bb7; | |||
| b += 8; | |||
| pa = a + i + 1; | |||
| pc0 = pb0 + 1; | |||
| pc1 = pc0 + ldc; | |||
| pc2 = pc1 + ldc; | |||
| pc3 = pc2 + ldc; | |||
| pc4 = pc3 + ldc; | |||
| pc5 = pc4 + ldc; | |||
| pc6 = pc5 + ldc; | |||
| pc7 = pc6 + ldc; | |||
| /* c[k][col] -= solved[col] * a[k] for the rows below the pivot; va is loaded once and shared by all eight columns */ | |||
| for (k = (m - i - 1); k > 0; k -= vl) | |||
| { | |||
| vl = VSETVL(k); | |||
| vc0 = VLEV_FLOAT(pc0, vl); | |||
| vc1 = VLEV_FLOAT(pc1, vl); | |||
| vc2 = VLEV_FLOAT(pc2, vl); | |||
| vc3 = VLEV_FLOAT(pc3, vl); | |||
| vc4 = VLEV_FLOAT(pc4, vl); | |||
| vc5 = VLEV_FLOAT(pc5, vl); | |||
| vc6 = VLEV_FLOAT(pc6, vl); | |||
| vc7 = VLEV_FLOAT(pc7, vl); | |||
| va = VLEV_FLOAT(pa, vl); | |||
| vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); | |||
| vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); | |||
| vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl); | |||
| vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl); | |||
| vc4 = VFNMSACVF_FLOAT(vc4, bb4, va, vl); | |||
| vc5 = VFNMSACVF_FLOAT(vc5, bb5, va, vl); | |||
| vc6 = VFNMSACVF_FLOAT(vc6, bb6, va, vl); | |||
| vc7 = VFNMSACVF_FLOAT(vc7, bb7, va, vl); | |||
| VSEV_FLOAT(pc0, vc0, vl); | |||
| VSEV_FLOAT(pc1, vc1, vl); | |||
| VSEV_FLOAT(pc2, vc2, vl); | |||
| VSEV_FLOAT(pc3, vc3, vl); | |||
| VSEV_FLOAT(pc4, vc4, vl); | |||
| VSEV_FLOAT(pc5, vc5, vl); | |||
| VSEV_FLOAT(pc6, vc6, vl); | |||
| VSEV_FLOAT(pc7, vc7, vl); | |||
| pa += vl; | |||
| pc0 += vl; | |||
| pc1 += vl; | |||
| pc2 += vl; | |||
| pc3 += vl; | |||
| pc4 += vl; | |||
| pc5 += vl; | |||
| pc6 += vl; | |||
| pc7 += vl; | |||
| } | |||
| } | |||
| /* skip the columns consumed by the groups of eight */ | |||
| pc += ldc * (n/8) * 8; | |||
| /* remaining group of four columns */ | |||
| if (n & 4) | |||
| { | |||
| pb0 = pc; | |||
| pb1 = pb0 + ldc; | |||
| pb2 = pb1 + ldc; | |||
| pb3 = pb2 + ldc; | |||
| bb0 = (*pb0) * aa; | |||
| bb1 = (*pb1) * aa; | |||
| bb2 = (*pb2) * aa; | |||
| bb3 = (*pb3) * aa; | |||
| *b = bb0; | |||
| *(b+1) = bb1; | |||
| *(b+2) = bb2; | |||
| *(b+3) = bb3; | |||
| *pb0 = bb0; | |||
| *pb1 = bb1; | |||
| *pb2 = bb2; | |||
| *pb3 = bb3; | |||
| b += 4; | |||
| pa = a + i + 1; | |||
| pc0 = pb0 + 1; | |||
| pc1 = pc0 + ldc; | |||
| pc2 = pc1 + ldc; | |||
| pc3 = pc2 + ldc; | |||
| for (k = (m - i - 1); k > 0; k -= vl) | |||
| { | |||
| vl = VSETVL(k); | |||
| vc0 = VLEV_FLOAT(pc0, vl); | |||
| vc1 = VLEV_FLOAT(pc1, vl); | |||
| vc2 = VLEV_FLOAT(pc2, vl); | |||
| vc3 = VLEV_FLOAT(pc3, vl); | |||
| va = VLEV_FLOAT(pa, vl); | |||
| vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); | |||
| vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); | |||
| vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl); | |||
| vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl); | |||
| VSEV_FLOAT(pc0, vc0, vl); | |||
| VSEV_FLOAT(pc1, vc1, vl); | |||
| VSEV_FLOAT(pc2, vc2, vl); | |||
| VSEV_FLOAT(pc3, vc3, vl); | |||
| pa += vl; | |||
| pc0 += vl; | |||
| pc1 += vl; | |||
| pc2 += vl; | |||
| pc3 += vl; | |||
| } | |||
| pc += ldc * 4; | |||
| } | |||
| /* remaining pair of columns */ | |||
| if (n & 2) | |||
| { | |||
| pb0 = pc; | |||
| pb1 = pb0 + ldc; | |||
| bb0 = (*pb0) * aa; | |||
| bb1 = (*pb1) * aa; | |||
| *b = bb0; | |||
| *(b+1) = bb1; | |||
| *pb0 = bb0; | |||
| *pb1 = bb1; | |||
| b += 2; | |||
| pa = a + i + 1; | |||
| pc0 = pb0 + 1; | |||
| pc1 = pc0 + ldc; | |||
| for (k = (m - i - 1); k > 0; k -= vl) | |||
| { | |||
| vl = VSETVL(k); | |||
| vc0 = VLEV_FLOAT(pc0, vl); | |||
| vc1 = VLEV_FLOAT(pc1, vl); | |||
| va = VLEV_FLOAT(pa, vl); | |||
| vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); | |||
| vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); | |||
| VSEV_FLOAT(pc0, vc0, vl); | |||
| VSEV_FLOAT(pc1, vc1, vl); | |||
| pa += vl; | |||
| pc0 += vl; | |||
| pc1 += vl; | |||
| } | |||
| pc += ldc * 2; | |||
| } | |||
| /* last single column */ | |||
| if (n & 1) | |||
| { | |||
| pb0 = pc; | |||
| bb0 = *(pb0); | |||
| bb0 *= aa; | |||
| *b = bb0; | |||
| /* write back to the column that was read, not to column 0 */ | |||
| *pb0 = bb0; | |||
| b++; | |||
| pa = a + i + 1; | |||
| pc0 = pb0 + 1; | |||
| for (k = (m - i - 1); k > 0; k -= vl) | |||
| { | |||
| vl = VSETVL(k); | |||
| vc0 = VLEV_FLOAT(pc0, vl); | |||
| va = VLEV_FLOAT(pa, vl); | |||
| vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); | |||
| VSEV_FLOAT(pc0, vc0, vl); | |||
| pa += vl; | |||
| pc0 += vl; | |||
| } | |||
| } | |||
| /* next packed row of the triangular factor */ | |||
| a += m; | |||
| } | |||
| } | |||
| #else | |||
| /* Scalar fallback of the real "LT" tile solve, used when no unroll-specific variant above applies. | |||
| * For every row of the m x n tile in c (column-major, leading dimension ldc) the pivot entry is multiplied by a[row], copied to b, written back, and then eliminated from the rows below it in the same column. a advances by m after each row. */ | |||
| static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { | |||
| int row, col, below; | |||
| FLOAT diag, solved; | |||
| FLOAT *column; | |||
| for (row = 0; row < m; row++, a += m) { | |||
| diag = a[row]; | |||
| for (col = 0; col < n; col++) { | |||
| column = c + col * ldc; | |||
| solved = column[row] * diag; | |||
| *b++ = solved; | |||
| column[row] = solved; | |||
| /* subtract the solved entry's contribution from the remaining rows of this column */ | |||
| for (below = row + 1; below < m; below++) { | |||
| column[below] -= solved * a[below]; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| #endif | |||
| #else | |||
| /* Complex "LT" tile solve (scalar). Values are stored as interleaved (real, imaginary) pairs, hence the factor 2 on every index and on ldc. | |||
| * Each pivot entry of c is multiplied by the complex value a[row] (conjugated when CONJ is defined), copied to b, written back, and then eliminated from the rows below it. a advances by one packed row (m complex values) per step. */ | |||
| static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { | |||
| FLOAT diag_re, diag_im; | |||
| FLOAT rhs_re, rhs_im; | |||
| FLOAT x_re, x_im; | |||
| FLOAT *pivot, *ak, *ck; | |||
| int row, col, below; | |||
| ldc *= 2; | |||
| for (row = 0; row < m; row++, a += m * 2) { | |||
| diag_re = a[row * 2 + 0]; | |||
| diag_im = a[row * 2 + 1]; | |||
| for (col = 0; col < n; col++, b += 2) { | |||
| pivot = c + row * 2 + col * ldc; | |||
| rhs_re = pivot[0]; | |||
| rhs_im = pivot[1]; | |||
| #ifndef CONJ | |||
| x_re = diag_re * rhs_re - diag_im * rhs_im; | |||
| x_im = diag_re * rhs_im + diag_im * rhs_re; | |||
| #else | |||
| x_re = diag_re * rhs_re + diag_im * rhs_im; | |||
| x_im = diag_re * rhs_im - diag_im * rhs_re; | |||
| #endif | |||
| b[0] = x_re; | |||
| b[1] = x_im; | |||
| pivot[0] = x_re; | |||
| pivot[1] = x_im; | |||
| /* eliminate the solved entry from the rows below it in this column */ | |||
| for (below = row + 1; below < m; below++) { | |||
| ak = a + below * 2; | |||
| ck = c + below * 2 + col * ldc; | |||
| #ifndef CONJ | |||
| ck[0] -= x_re * ak[0] - x_im * ak[1]; | |||
| ck[1] -= x_re * ak[1] + x_im * ak[0]; | |||
| #else | |||
| ck[0] -= x_re * ak[0] + x_im * ak[1]; | |||
| ck[1] -= -x_re * ak[1] + x_im * ak[0]; | |||
| #endif | |||
| } | |||
| } | |||
| } | |||
| } | |||
| /* Vectorised complex "LT" tile solve. Same arithmetic as the scalar complex solve: each pivot entry of c is multiplied by the complex a[i] (conjugated under CONJ), copied to b and written back; the rows below the pivot are then updated with segment loads/stores that split the interleaved (real, imaginary) pairs into two vectors. | |||
| * a : packed triangular factor, one row of m complex values per step. | |||
| * b : output buffer for the solved values, two FLOATs per entry. | |||
| * c : column-major complex tile, leading dimension ldc (in complex elements), updated in place. | |||
| * Corrected relative to the earlier revision: a surplus closing brace left "a += m * 2" outside the row loop and unbalanced the function, and the multiply-accumulate intrinsics were called without the vl argument used everywhere else. | |||
| * NOTE(review): the driver visible alongside this function calls solve(), not solve_N1() - confirm whether this helper is meant to be wired in. */ | |||
| static inline void solve_N1(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { | |||
| FLOAT aa1, aa2; | |||
| FLOAT bb1, bb2; | |||
| FLOAT cc1, cc2; | |||
| FLOAT *pa, *pc; | |||
| int i, j, k; | |||
| size_t vl; | |||
| FLOAT_V_T va0, va1, vc0, vc1; | |||
| /* from here on ldc counts FLOATs rather than complex elements */ | |||
| ldc *= 2; | |||
| for (i = 0; i < m; i++) { | |||
| aa1 = *(a + i * 2 + 0); | |||
| aa2 = *(a + i * 2 + 1); | |||
| for (j = 0; j < n; j ++) { | |||
| bb1 = *(c + i * 2 + 0 + j * ldc); | |||
| bb2 = *(c + i * 2 + 1 + j * ldc); | |||
| #ifndef CONJ | |||
| cc1 = aa1 * bb1 - aa2 * bb2; | |||
| cc2 = aa1 * bb2 + aa2 * bb1; | |||
| #else | |||
| cc1 = aa1 * bb1 + aa2 * bb2; | |||
| cc2 = aa1 * bb2 - aa2 * bb1; | |||
| #endif | |||
| *(b + 0) = cc1; | |||
| *(b + 1) = cc2; | |||
| *(c + i * 2 + 0 + j * ldc) = cc1; | |||
| *(c + i * 2 + 1 + j * ldc) = cc2; | |||
| b += 2; | |||
| pa = a + (i + 1) * 2; | |||
| pc = c + j * ldc + (i + 1) * 2; | |||
| /* rows i+1..m-1 of column j: c -= cc * a (or cc * conj(a) under CONJ); va0/vc0 hold real parts, va1/vc1 imaginary parts */ | |||
| for (k = (m - i - 1); k > 0; k -= vl) | |||
| { | |||
| vl = VSETVL(k); | |||
| VLSEG2_FLOAT(&va0, &va1, pa, vl); | |||
| VLSEG2_FLOAT(&vc0, &vc1, pc, vl); | |||
| #ifndef CONJ | |||
| vc0 = VFNMSACVF_FLOAT(vc0, cc1, va0, vl); | |||
| vc0 = VFMACCVF_FLOAT(vc0, cc2, va1, vl); | |||
| vc1 = VFNMSACVF_FLOAT(vc1, cc1, va1, vl); | |||
| vc1 = VFNMSACVF_FLOAT(vc1, cc2, va0, vl); | |||
| #else | |||
| vc0 = VFNMSACVF_FLOAT(vc0, cc1, va0, vl); | |||
| vc0 = VFNMSACVF_FLOAT(vc0, cc2, va1, vl); | |||
| vc1 = VFMACCVF_FLOAT(vc1, cc1, va1, vl); | |||
| vc1 = VFNMSACVF_FLOAT(vc1, cc2, va0, vl); | |||
| #endif | |||
| VSSEG2_FLOAT(pc, vc0, vc1, vl); | |||
| /* vl complex elements were consumed, i.e. 2 * vl FLOATs */ | |||
| pa += vl * 2; | |||
| pc += vl * 2; | |||
| } | |||
| } | |||
| /* next packed row of the triangular factor */ | |||
| a += m * 2; | |||
| } | |||
| } | |||
| #endif | |||
| /* TRSM "LT" driver: walks C in column panels of GEMM_UNROLL_N columns, then in narrower power-of-two panels for the columns left over. | |||
| * Inside a panel the rows are taken in blocks of vl = VSETVL_MAX rows plus one shorter remainder block. For every block the contribution of the depth values already solved is applied with GEMM_KERNEL (scalar factor dm1) when depth > 0, and solve() then handles the triangular part of the block. depth starts at offset for every panel and grows by the block height. | |||
| * dummy1 (and dummy2 in the complex build) are not used. Always returns 0. */ | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, | |||
| #ifdef COMPLEX | |||
| FLOAT dummy2, | |||
| #endif | |||
| FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ | |||
| size_t vl = VSETVL_MAX; | |||
| FLOAT *a_pos, *c_pos; | |||
| BLASLONG depth; | |||
| BLASLONG rows, panels, width; | |||
| /* full-width column panels */ | |||
| for (panels = (n >> GEMM_UNROLL_N_SHIFT); panels > 0; panels--) { | |||
| depth = offset; | |||
| a_pos = a; | |||
| c_pos = c; | |||
| for (rows = vl; rows <= m; rows += vl) { | |||
| if (depth > 0) { | |||
| GEMM_KERNEL(vl, GEMM_UNROLL_N, depth, dm1, | |||
| #ifdef COMPLEX | |||
| ZERO, | |||
| #endif | |||
| a_pos, b, c_pos, ldc); | |||
| } | |||
| solve(vl, GEMM_UNROLL_N, | |||
| a_pos + depth * vl * COMPSIZE, | |||
| b + depth * GEMM_UNROLL_N * COMPSIZE, | |||
| c_pos, ldc); | |||
| a_pos += vl * k * COMPSIZE; | |||
| c_pos += vl * COMPSIZE; | |||
| depth += vl; | |||
| } | |||
| /* rows that do not fill a whole block of vl */ | |||
| rows = m % vl; | |||
| if (rows) { | |||
| if (depth > 0) { | |||
| GEMM_KERNEL(rows, GEMM_UNROLL_N, depth, dm1, | |||
| #ifdef COMPLEX | |||
| ZERO, | |||
| #endif | |||
| a_pos, b, c_pos, ldc); | |||
| } | |||
| solve(rows, GEMM_UNROLL_N, | |||
| a_pos + depth * rows * COMPSIZE, | |||
| b + depth * GEMM_UNROLL_N * COMPSIZE, | |||
| c_pos, ldc); | |||
| a_pos += rows * k * COMPSIZE; | |||
| c_pos += rows * COMPSIZE; | |||
| depth += rows; | |||
| } | |||
| b += GEMM_UNROLL_N * k * COMPSIZE; | |||
| c += GEMM_UNROLL_N * ldc * COMPSIZE; | |||
| } | |||
| /* leftover columns: one panel for every set bit of n below GEMM_UNROLL_N, widest first */ | |||
| if (n & (GEMM_UNROLL_N - 1)) { | |||
| for (width = (GEMM_UNROLL_N >> 1); width > 0; width >>= 1) { | |||
| if (!(n & width)) continue; | |||
| depth = offset; | |||
| a_pos = a; | |||
| c_pos = c; | |||
| for (rows = vl; rows <= m; rows += vl) { | |||
| if (depth > 0) { | |||
| GEMM_KERNEL(vl, width, depth, dm1, | |||
| #ifdef COMPLEX | |||
| ZERO, | |||
| #endif | |||
| a_pos, | |||
| b, | |||
| c_pos, | |||
| ldc); | |||
| } | |||
| solve(vl, width, | |||
| a_pos + depth * vl * COMPSIZE, | |||
| b + depth * width * COMPSIZE, c_pos, ldc); | |||
| a_pos += vl * k * COMPSIZE; | |||
| c_pos += vl * COMPSIZE; | |||
| depth += vl; | |||
| } | |||
| rows = m % vl; | |||
| if (rows) { | |||
| if (depth > 0) { | |||
| GEMM_KERNEL(rows, width, depth, dm1, | |||
| #ifdef COMPLEX | |||
| ZERO, | |||
| #endif | |||
| a_pos, | |||
| b, | |||
| c_pos, | |||
| ldc); | |||
| } | |||
| solve(rows, width, | |||
| a_pos + depth * rows * COMPSIZE, | |||
| b + depth * width * COMPSIZE, c_pos, ldc); | |||
| a_pos += rows * k * COMPSIZE; | |||
| c_pos += rows * COMPSIZE; | |||
| depth += rows; | |||
| } | |||
| b += width * k * COMPSIZE; | |||
| c += width * ldc * COMPSIZE; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,792 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" /* project header; FLOAT, BLASLONG, CNAME, COMPSIZE, ZERO and the GEMM_* names used below are not defined in this file, so they presumably come from here */ | |||
| #if !defined(DOUBLE) /* single precision build: 32-bit elements, m2 register groups */ | |||
| #define VSETVL(n) vsetvl_e32m2(n) /* vector length to use for n remaining elements */ | |||
| #define VSETVL_MAX vsetvlmax_e32m2() /* largest vector length for this element type and grouping */ | |||
| #define FLOAT_V_T vfloat32m2_t /* vector register type matching FLOAT */ | |||
| #define VLEV_FLOAT vle32_v_f32m2 /* unit-stride load */ | |||
| #define VLSEV_FLOAT vlse32_v_f32m2 /* strided load */ | |||
| #define VLSEG2_FLOAT vlseg2e32_v_f32m2 /* two-field segment load; not referenced in the visible part of this file */ | |||
| #define VSEV_FLOAT vse32_v_f32m2 /* unit-stride store; not referenced in the visible part of this file */ | |||
| #define VSSEV_FLOAT vsse32_v_f32m2 /* strided store */ | |||
| #define VSSEG2_FLOAT vsseg2e32_v_f32m2 /* two-field segment store; not referenced in the visible part of this file */ | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f32m2 /* vd += scalar * vs; not referenced in the visible part of this file */ | |||
| #define VFNMSACVF_FLOAT vfnmsac_vf_f32m2 /* vd -= scalar * vs */ | |||
| #else /* double precision build: same set of names mapped to the 64-bit intrinsics */ | |||
| #define VSETVL(n) vsetvl_e64m2(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m2() | |||
| #define FLOAT_V_T vfloat64m2_t | |||
| #define VLEV_FLOAT vle64_v_f64m2 | |||
| #define VLSEV_FLOAT vlse64_v_f64m2 | |||
| #define VLSEG2_FLOAT vlseg2e64_v_f64m2 | |||
| #define VSEV_FLOAT vse64_v_f64m2 | |||
| #define VSSEV_FLOAT vsse64_v_f64m2 | |||
| #define VSSEG2_FLOAT vsseg2e64_v_f64m2 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f64m2 | |||
| #define VFNMSACVF_FLOAT vfnmsac_vf_f64m2 | |||
| #endif | |||
| static FLOAT dm1 = -1.; /* handed to GEMM_KERNEL as its scalar factor */ | |||
| #ifdef CONJ /* pick the GEMM kernel flavour that matches the conjugation mode of this build */ | |||
| #define GEMM_KERNEL GEMM_KERNEL_R | |||
| #else | |||
| #define GEMM_KERNEL GEMM_KERNEL_N | |||
| #endif | |||
| #if GEMM_DEFAULT_UNROLL_N == 1 /* GEMM_UNROLL_N_SHIFT = log2(GEMM_DEFAULT_UNROLL_N); it stays undefined for values other than 1, 2, 4, 8 and 16 */ | |||
| #define GEMM_UNROLL_N_SHIFT 0 | |||
| #endif | |||
| #if GEMM_DEFAULT_UNROLL_N == 2 | |||
| #define GEMM_UNROLL_N_SHIFT 1 | |||
| #endif | |||
| #if GEMM_DEFAULT_UNROLL_N == 4 | |||
| #define GEMM_UNROLL_N_SHIFT 2 | |||
| #endif | |||
| #if GEMM_DEFAULT_UNROLL_N == 8 | |||
| #define GEMM_UNROLL_N_SHIFT 3 | |||
| #endif | |||
| #if GEMM_DEFAULT_UNROLL_N == 16 | |||
| #define GEMM_UNROLL_N_SHIFT 4 | |||
| #endif | |||
| // Optimizes the implementation in ../arm64/trsm_kernel_RN_sve.c | |||
| #ifndef COMPLEX | |||
| #if GEMM_DEFAULT_UNROLL_N == 1 | |||
| /* Real TRSM "RN" solve of one m x n tile; variant built when GEMM_DEFAULT_UNROLL_N == 1. | |||
| * a : output buffer; receives the m solved values of column i, for every i in turn. | |||
| * b : packed triangular factor, read one row of n values per step i (b is advanced by n); b[i] multiplies the pivot column, so it presumably holds the already inverted diagonal - confirm against the packing routine. | |||
| * c : column-major tile with leading dimension ldc, overwritten in place. | |||
| * For each solved entry the columns i+1..n-1 of the same row are updated with strided vector accesses (byte stride stride_ldc, one element per column). | |||
| * After a chunk of vl columns the column pointer moves on by vl * ldc elements; the earlier one-element step was only harmless while the loop ran at most once. */ | |||
| static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { | |||
| FLOAT aa, bb; | |||
| FLOAT *pb, *pc; | |||
| /* byte distance between the same row of two neighbouring columns */ | |||
| BLASLONG stride_ldc = sizeof(FLOAT) * ldc; | |||
| int i, j, k; | |||
| size_t vl; | |||
| FLOAT_V_T vb, vc; | |||
| for (i = 0; i < n; i++) | |||
| { | |||
| bb = *(b + i); | |||
| for (j = 0; j < m; j ++) | |||
| { | |||
| aa = *(c + j + i * ldc); | |||
| aa *= bb; | |||
| *a = aa; | |||
| *(c + j + i * ldc) = aa; | |||
| a ++; | |||
| pb = b + i + 1; | |||
| pc = c + j + (i + 1) *ldc; | |||
| /* c[j][col] -= aa * b[col] for the columns to the right of the pivot */ | |||
| for (k = (n - i - 1); k > 0; k -= vl) | |||
| { | |||
| vl = VSETVL(k); | |||
| vc = VLSEV_FLOAT(pc, stride_ldc, vl); | |||
| vb = VLEV_FLOAT(pb, vl); | |||
| vc = VFNMSACVF_FLOAT(vc, aa, vb, vl); | |||
| VSSEV_FLOAT(pc, stride_ldc, vc, vl); | |||
| pb += vl; | |||
| /* vl columns were processed: skip vl columns, not one row */ | |||
| pc += vl * ldc; | |||
| } | |||
| } | |||
| /* next packed row of the triangular factor */ | |||
| b += n; | |||
| } | |||
| } | |||
| #elif GEMM_DEFAULT_UNROLL_N == 2 | |||
| /* Real TRSM "RN" solve of one m x n tile; variant built when GEMM_DEFAULT_UNROLL_N == 2. | |||
| * a : output buffer; receives the m solved values of column i, for every i in turn. | |||
| * b : packed triangular factor, one row of n values per step i (b is advanced by n); b[i] multiplies the pivot column, so it presumably holds the already inverted diagonal - confirm against the packing routine. | |||
| * c : column-major tile with leading dimension ldc, overwritten in place. | |||
| * Rows are handled two at a time, then a single leftover row; for each row the columns i+1..n-1 are updated with strided vector accesses that share one load of b. | |||
| * After a chunk of vl columns the column pointers move on by vl * ldc elements; the earlier one-element step was only harmless while the loop ran at most once. */ | |||
| static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { | |||
| FLOAT aa0, aa1, bb; | |||
| FLOAT *pb, *pc; | |||
| FLOAT *pa0, *pa1, *pc0, *pc1; | |||
| /* byte distance between the same row of two neighbouring columns */ | |||
| BLASLONG stride_ldc = sizeof(FLOAT) * ldc; | |||
| int i, j, k; | |||
| size_t vl; | |||
| FLOAT_V_T vb, vc0, vc1; | |||
| for (i = 0; i < n; i++) | |||
| { | |||
| bb = *(b + i); | |||
| pc = c + i * ldc; | |||
| /* pairs of rows */ | |||
| for (j = 0; j < m/2; j ++) | |||
| { | |||
| pa0 = pc + j * 2; | |||
| pa1 = pc + j * 2 + 1; | |||
| aa0 = *pa0 * bb; | |||
| aa1 = *pa1 * bb; | |||
| *pa0 = aa0; | |||
| *pa1 = aa1; | |||
| *a = aa0; | |||
| *(a + 1)= aa1; | |||
| a += 2; | |||
| pb = b + i + 1; | |||
| pc0 = pa0 + ldc; | |||
| pc1 = pa1 + ldc; | |||
| for (k = (n - i - 1); k > 0; k -= vl) | |||
| { | |||
| vl = VSETVL(k); | |||
| vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); | |||
| vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); | |||
| vb = VLEV_FLOAT(pb, vl); | |||
| vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); | |||
| vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); | |||
| VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); | |||
| VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); | |||
| pb += vl; | |||
| /* vl columns were processed: skip vl columns, not one row */ | |||
| pc0 += vl * ldc; | |||
| pc1 += vl * ldc; | |||
| } | |||
| } | |||
| pc += (m/2)*2; | |||
| /* leftover single row */ | |||
| if (m & 1) | |||
| { | |||
| pa0 = pc; | |||
| aa0 = *pa0 * bb; | |||
| *pa0 = aa0; | |||
| *a = aa0; | |||
| a += 1; | |||
| pb = b + i + 1; | |||
| pc0 = pa0 + ldc; | |||
| for (k = (n - i - 1); k > 0; k -= vl) | |||
| { | |||
| vl = VSETVL(k); | |||
| vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); | |||
| vb = VLEV_FLOAT(pb, vl); | |||
| vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); | |||
| VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); | |||
| pb += vl; | |||
| pc0 += vl * ldc; | |||
| } | |||
| } | |||
| /* next packed row of the triangular factor */ | |||
| b += n; | |||
| } | |||
| } | |||
| #elif GEMM_DEFAULT_UNROLL_N == 4 | |||
| /* Real TRSM "RN" solve of one m x n tile; variant built when GEMM_DEFAULT_UNROLL_N == 4. | |||
| * a : output buffer; receives the m solved values of column i, for every i in turn. | |||
| * b : packed triangular factor, one row of n values per step i (b is advanced by n); b[i] multiplies the pivot column, so it presumably holds the already inverted diagonal - confirm against the packing routine. | |||
| * c : column-major tile with leading dimension ldc, overwritten in place. | |||
| * Rows are handled in groups of 4, then 2 and 1; for each group the columns i+1..n-1 are updated with strided vector accesses that share one load of b. | |||
| * After a chunk of vl columns the column pointers move on by vl * ldc elements; the earlier one-element step was wrong as soon as n - i - 1 exceeded the maximum vector length. */ | |||
| static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { | |||
| FLOAT bb; | |||
| FLOAT aa0, aa1, aa2, aa3; | |||
| FLOAT *pb, *pc; | |||
| FLOAT *pa0, *pa1, *pa2, *pa3; | |||
| FLOAT *pc0, *pc1, *pc2, *pc3; | |||
| /* byte distance between the same row of two neighbouring columns */ | |||
| BLASLONG stride_ldc = sizeof(FLOAT) * ldc; | |||
| int i, j, k; | |||
| size_t vl; | |||
| FLOAT_V_T vb, vc0, vc1, vc2, vc3; | |||
| for (i = 0; i < n; i++) | |||
| { | |||
| bb = *(b + i); | |||
| pc = c + i * ldc; | |||
| /* groups of four rows */ | |||
| for (j = 0; j < m/4; j ++) | |||
| { | |||
| pa0 = pc + j * 4; | |||
| pa1 = pa0 + 1; | |||
| pa2 = pa1 + 1; | |||
| pa3 = pa2 + 1; | |||
| aa0 = *pa0 * bb; | |||
| aa1 = *pa1 * bb; | |||
| aa2 = *pa2 * bb; | |||
| aa3 = *pa3 * bb; | |||
| *pa0 = aa0; | |||
| *pa1 = aa1; | |||
| *pa2 = aa2; | |||
| *pa3 = aa3; | |||
| *a = aa0; | |||
| *(a + 1)= aa1; | |||
| *(a + 2)= aa2; | |||
| *(a + 3)= aa3; | |||
| a += 4; | |||
| pb = b + i + 1; | |||
| pc0 = pa0 + ldc; | |||
| pc1 = pa1 + ldc; | |||
| pc2 = pa2 + ldc; | |||
| pc3 = pa3 + ldc; | |||
| for (k = (n - i - 1); k > 0; k -= vl) | |||
| { | |||
| vl = VSETVL(k); | |||
| vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); | |||
| vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); | |||
| vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl); | |||
| vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl); | |||
| vb = VLEV_FLOAT(pb, vl); | |||
| vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); | |||
| vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); | |||
| vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl); | |||
| vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl); | |||
| VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); | |||
| VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); | |||
| VSSEV_FLOAT(pc2, stride_ldc, vc2, vl); | |||
| VSSEV_FLOAT(pc3, stride_ldc, vc3, vl); | |||
| pb += vl; | |||
| /* vl columns were processed: skip vl columns, not one row */ | |||
| pc0 += vl * ldc; | |||
| pc1 += vl * ldc; | |||
| pc2 += vl * ldc; | |||
| pc3 += vl * ldc; | |||
| } | |||
| } | |||
| pc += (m/4)*4; | |||
| /* leftover pair of rows */ | |||
| if (m & 2) | |||
| { | |||
| pa0 = pc; | |||
| pa1 = pa0 + 1; | |||
| aa0 = *pa0 * bb; | |||
| aa1 = *pa1 * bb; | |||
| *pa0 = aa0; | |||
| *pa1 = aa1; | |||
| *a = aa0; | |||
| *(a + 1)= aa1; | |||
| a += 2; | |||
| pb = b + i + 1; | |||
| pc0 = pa0 + ldc; | |||
| pc1 = pa1 + ldc; | |||
| for (k = (n - i - 1); k > 0; k -= vl) | |||
| { | |||
| vl = VSETVL(k); | |||
| vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); | |||
| vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); | |||
| vb = VLEV_FLOAT(pb, vl); | |||
| vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); | |||
| vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); | |||
| VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); | |||
| VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); | |||
| pb += vl; | |||
| pc0 += vl * ldc; | |||
| pc1 += vl * ldc; | |||
| } | |||
| pc += 2; | |||
| } | |||
| /* leftover single row */ | |||
| if (m & 1) | |||
| { | |||
| pa0 = pc; | |||
| aa0 = *pa0 * bb; | |||
| *pa0 = aa0; | |||
| *a = aa0; | |||
| a += 1; | |||
| pb = b + i + 1; | |||
| pc0 = pa0 + ldc; | |||
| for (k = (n - i - 1); k > 0; k -= vl) | |||
| { | |||
| vl = VSETVL(k); | |||
| vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); | |||
| vb = VLEV_FLOAT(pb, vl); | |||
| vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); | |||
| VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); | |||
| pb += vl; | |||
| pc0 += vl * ldc; | |||
| } | |||
| } | |||
| /* next packed row of the triangular factor */ | |||
| b += n; | |||
| } | |||
| } | |||
| #elif GEMM_DEFAULT_UNROLL_N == 8 | |||
| /* Real TRSM "RN" solve of one m x n tile; variant built when GEMM_DEFAULT_UNROLL_N == 8. | |||
| * a : output buffer; receives the m solved values of column i, for every i in turn. | |||
| * b : packed triangular factor, one row of n values per step i (b is advanced by n); b[i] multiplies the pivot column, so it presumably holds the already inverted diagonal - confirm against the packing routine. | |||
| * c : column-major tile with leading dimension ldc, overwritten in place. | |||
| * Rows are handled in groups of 8, then 4, 2 and 1; for each group the columns i+1..n-1 are updated with strided vector accesses that share one load of b. | |||
| * After a chunk of vl columns the column pointers move on by vl * ldc elements. The earlier one-element step produced wrong results whenever n - i - 1 exceeded the maximum vector length (up to 7 columns remain here, which is more than a 64-bit m2 group holds at VLEN = 128). */ | |||
| static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { | |||
| FLOAT bb; | |||
| FLOAT aa0, aa1, aa2, aa3, aa4, aa5, aa6, aa7; | |||
| FLOAT *pb, *pc; | |||
| FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7; | |||
| FLOAT *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7; | |||
| /* byte distance between the same row of two neighbouring columns */ | |||
| BLASLONG stride_ldc = sizeof(FLOAT) * ldc; | |||
| int i, j, k; | |||
| size_t vl; | |||
| FLOAT_V_T vb, vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7; | |||
| for (i = 0; i < n; i++) | |||
| { | |||
| bb = *(b + i); | |||
| pc = c + i * ldc; | |||
| /* groups of eight rows */ | |||
| for (j = 0; j < m/8; j ++) | |||
| { | |||
| pa0 = pc + j * 8; | |||
| pa1 = pa0 + 1; | |||
| pa2 = pa1 + 1; | |||
| pa3 = pa2 + 1; | |||
| pa4 = pa3 + 1; | |||
| pa5 = pa4 + 1; | |||
| pa6 = pa5 + 1; | |||
| pa7 = pa6 + 1; | |||
| aa0 = *pa0 * bb; | |||
| aa1 = *pa1 * bb; | |||
| aa2 = *pa2 * bb; | |||
| aa3 = *pa3 * bb; | |||
| aa4 = *pa4 * bb; | |||
| aa5 = *pa5 * bb; | |||
| aa6 = *pa6 * bb; | |||
| aa7 = *pa7 * bb; | |||
| *pa0 = aa0; | |||
| *pa1 = aa1; | |||
| *pa2 = aa2; | |||
| *pa3 = aa3; | |||
| *pa4 = aa4; | |||
| *pa5 = aa5; | |||
| *pa6 = aa6; | |||
| *pa7 = aa7; | |||
| *a = aa0; | |||
| *(a + 1)= aa1; | |||
| *(a + 2)= aa2; | |||
| *(a + 3)= aa3; | |||
| *(a + 4)= aa4; | |||
| *(a + 5)= aa5; | |||
| *(a + 6)= aa6; | |||
| *(a + 7)= aa7; | |||
| a += 8; | |||
| pb = b + i + 1; | |||
| pc0 = pa0 + ldc; | |||
| pc1 = pa1 + ldc; | |||
| pc2 = pa2 + ldc; | |||
| pc3 = pa3 + ldc; | |||
| pc4 = pa4 + ldc; | |||
| pc5 = pa5 + ldc; | |||
| pc6 = pa6 + ldc; | |||
| pc7 = pa7 + ldc; | |||
| for (k = (n - i - 1); k > 0; k -= vl) | |||
| { | |||
| vl = VSETVL(k); | |||
| vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); | |||
| vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); | |||
| vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl); | |||
| vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl); | |||
| vc4 = VLSEV_FLOAT(pc4, stride_ldc, vl); | |||
| vc5 = VLSEV_FLOAT(pc5, stride_ldc, vl); | |||
| vc6 = VLSEV_FLOAT(pc6, stride_ldc, vl); | |||
| vc7 = VLSEV_FLOAT(pc7, stride_ldc, vl); | |||
| vb = VLEV_FLOAT(pb, vl); | |||
| vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); | |||
| vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); | |||
| vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl); | |||
| vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl); | |||
| vc4 = VFNMSACVF_FLOAT(vc4, aa4, vb, vl); | |||
| vc5 = VFNMSACVF_FLOAT(vc5, aa5, vb, vl); | |||
| vc6 = VFNMSACVF_FLOAT(vc6, aa6, vb, vl); | |||
| vc7 = VFNMSACVF_FLOAT(vc7, aa7, vb, vl); | |||
| VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); | |||
| VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); | |||
| VSSEV_FLOAT(pc2, stride_ldc, vc2, vl); | |||
| VSSEV_FLOAT(pc3, stride_ldc, vc3, vl); | |||
| VSSEV_FLOAT(pc4, stride_ldc, vc4, vl); | |||
| VSSEV_FLOAT(pc5, stride_ldc, vc5, vl); | |||
| VSSEV_FLOAT(pc6, stride_ldc, vc6, vl); | |||
| VSSEV_FLOAT(pc7, stride_ldc, vc7, vl); | |||
| pb += vl; | |||
| /* vl columns were processed: skip vl columns, not one row */ | |||
| pc0 += vl * ldc; | |||
| pc1 += vl * ldc; | |||
| pc2 += vl * ldc; | |||
| pc3 += vl * ldc; | |||
| pc4 += vl * ldc; | |||
| pc5 += vl * ldc; | |||
| pc6 += vl * ldc; | |||
| pc7 += vl * ldc; | |||
| } | |||
| } | |||
| pc += (m/8)*8; | |||
| /* leftover group of four rows */ | |||
| if (m & 4) | |||
| { | |||
| pa0 = pc; | |||
| pa1 = pa0 + 1; | |||
| pa2 = pa1 + 1; | |||
| pa3 = pa2 + 1; | |||
| aa0 = *pa0 * bb; | |||
| aa1 = *pa1 * bb; | |||
| aa2 = *pa2 * bb; | |||
| aa3 = *pa3 * bb; | |||
| *pa0 = aa0; | |||
| *pa1 = aa1; | |||
| *pa2 = aa2; | |||
| *pa3 = aa3; | |||
| *a = aa0; | |||
| *(a + 1)= aa1; | |||
| *(a + 2)= aa2; | |||
| *(a + 3)= aa3; | |||
| a += 4; | |||
| pb = b + i + 1; | |||
| pc0 = pa0 + ldc; | |||
| pc1 = pa1 + ldc; | |||
| pc2 = pa2 + ldc; | |||
| pc3 = pa3 + ldc; | |||
| for (k = (n - i - 1); k > 0; k -= vl) | |||
| { | |||
| vl = VSETVL(k); | |||
| vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); | |||
| vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); | |||
| vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl); | |||
| vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl); | |||
| vb = VLEV_FLOAT(pb, vl); | |||
| vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); | |||
| vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); | |||
| vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl); | |||
| vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl); | |||
| VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); | |||
| VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); | |||
| VSSEV_FLOAT(pc2, stride_ldc, vc2, vl); | |||
| VSSEV_FLOAT(pc3, stride_ldc, vc3, vl); | |||
| pb += vl; | |||
| pc0 += vl * ldc; | |||
| pc1 += vl * ldc; | |||
| pc2 += vl * ldc; | |||
| pc3 += vl * ldc; | |||
| } | |||
| pc += 4; | |||
| } | |||
| /* leftover pair of rows */ | |||
| if (m & 2) | |||
| { | |||
| pa0 = pc; | |||
| pa1 = pa0 + 1; | |||
| aa0 = *pa0 * bb; | |||
| aa1 = *pa1 * bb; | |||
| *pa0 = aa0; | |||
| *pa1 = aa1; | |||
| *a = aa0; | |||
| *(a + 1)= aa1; | |||
| a += 2; | |||
| pb = b + i + 1; | |||
| pc0 = pa0 + ldc; | |||
| pc1 = pa1 + ldc; | |||
| for (k = (n - i - 1); k > 0; k -= vl) | |||
| { | |||
| vl = VSETVL(k); | |||
| vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); | |||
| vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); | |||
| vb = VLEV_FLOAT(pb, vl); | |||
| vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); | |||
| vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); | |||
| VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); | |||
| VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); | |||
| pb += vl; | |||
| pc0 += vl * ldc; | |||
| pc1 += vl * ldc; | |||
| } | |||
| pc += 2; | |||
| } | |||
| /* leftover single row */ | |||
| if (m & 1) | |||
| { | |||
| pa0 = pc; | |||
| aa0 = *pa0 * bb; | |||
| *pa0 = aa0; | |||
| *a = aa0; | |||
| a += 1; | |||
| pb = b + i + 1; | |||
| pc0 = pa0 + ldc; | |||
| for (k = (n - i - 1); k > 0; k -= vl) | |||
| { | |||
| vl = VSETVL(k); | |||
| vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); | |||
| vb = VLEV_FLOAT(pb, vl); | |||
| vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); | |||
| VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); | |||
| pb += vl; | |||
| pc0 += vl * ldc; | |||
| } | |||
| } | |||
| /* next packed row of the triangular factor */ | |||
| b += n; | |||
| } | |||
| } | |||
| #else | |||
| /* Scalar fallback of the real "RN" tile solve, used when no unroll-specific variant above applies. | |||
| * For every column of the m x n tile in c (column-major, leading dimension ldc) each entry is multiplied by b[col], copied to a, written back, and then eliminated from the columns to its right in the same row. b advances by n after each column. */ | |||
| static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { | |||
| int col, row, right; | |||
| FLOAT diag, solved; | |||
| FLOAT *row_base; | |||
| for (col = 0; col < n; col++, b += n) { | |||
| diag = b[col]; | |||
| for (row = 0; row < m; row++) { | |||
| row_base = c + row; | |||
| solved = row_base[col * ldc] * diag; | |||
| *a++ = solved; | |||
| row_base[col * ldc] = solved; | |||
| /* subtract the solved entry's contribution from the remaining columns of this row */ | |||
| for (right = col + 1; right < n; right++) { | |||
| row_base[right * ldc] -= solved * b[right]; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| #endif | |||
| #else | |||
| /* Complex "RN" tile solve (scalar). Values are stored as interleaved (real, imaginary) pairs, hence the factor 2 on every index and on ldc. | |||
| * Each entry of the pivot column of c is multiplied by the complex value b[col] (conjugated when CONJ is defined), copied to a, written back, and then eliminated from the columns to its right. b advances by one packed row (n complex values) per step. */ | |||
| static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { | |||
| FLOAT rhs_re, rhs_im; | |||
| FLOAT diag_re, diag_im; | |||
| FLOAT x_re, x_im; | |||
| FLOAT *pivot, *bk, *ck; | |||
| int col, row, right; | |||
| ldc *= 2; | |||
| for (col = 0; col < n; col++, b += n * 2) { | |||
| diag_re = b[col * 2 + 0]; | |||
| diag_im = b[col * 2 + 1]; | |||
| for (row = 0; row < m; row++, a += 2) { | |||
| pivot = c + row * 2 + col * ldc; | |||
| rhs_re = pivot[0]; | |||
| rhs_im = pivot[1]; | |||
| #ifndef CONJ | |||
| x_re = rhs_re * diag_re - rhs_im * diag_im; | |||
| x_im = rhs_re * diag_im + rhs_im * diag_re; | |||
| #else | |||
| x_re = rhs_re * diag_re + rhs_im * diag_im; | |||
| x_im = -rhs_re * diag_im + rhs_im * diag_re; | |||
| #endif | |||
| a[0] = x_re; | |||
| a[1] = x_im; | |||
| pivot[0] = x_re; | |||
| pivot[1] = x_im; | |||
| /* eliminate the solved entry from the columns to its right in this row */ | |||
| for (right = col + 1; right < n; right++) { | |||
| bk = b + right * 2; | |||
| ck = c + row * 2 + right * ldc; | |||
| #ifndef CONJ | |||
| ck[0] -= x_re * bk[0] - x_im * bk[1]; | |||
| ck[1] -= x_re * bk[1] + x_im * bk[0]; | |||
| #else | |||
| ck[0] -= x_re * bk[0] + x_im * bk[1]; | |||
| ck[1] -= - x_re * bk[1] + x_im * bk[0]; | |||
| #endif | |||
| } | |||
| } | |||
| } | |||
| } | |||
| #endif | |||
| /* TRSM "RN" driver: walks C in column panels of GEMM_UNROLL_N columns, then in narrower power-of-two panels for the columns left over. | |||
| * depth starts at -offset and grows by the panel width after each panel; it is constant while a panel is processed. Inside a panel the rows are taken in blocks of vl = VSETVL_MAX rows plus one shorter remainder block. For every block GEMM_KERNEL (scalar factor dm1) applies the contribution of the first depth values when depth > 0, and solve() then handles the triangular part. | |||
| * dummy1 (and dummy2 in the complex build) are not used. Always returns 0. */ | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, | |||
| #ifdef COMPLEX | |||
| FLOAT dummy2, | |||
| #endif | |||
| FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ | |||
| size_t vl = VSETVL_MAX; | |||
| FLOAT *a_pos, *c_pos; | |||
| BLASLONG depth = -offset; | |||
| BLASLONG rows, panels, width; | |||
| /* full-width column panels */ | |||
| for (panels = (n >> GEMM_UNROLL_N_SHIFT); panels > 0; panels--) { | |||
| a_pos = a; | |||
| c_pos = c; | |||
| for (rows = vl; rows <= m; rows += vl) { | |||
| if (depth > 0) { | |||
| GEMM_KERNEL(vl, GEMM_UNROLL_N, depth, dm1, | |||
| #ifdef COMPLEX | |||
| ZERO, | |||
| #endif | |||
| a_pos, b, c_pos, ldc); | |||
| } | |||
| solve(vl, GEMM_UNROLL_N, | |||
| a_pos + depth * vl * COMPSIZE, | |||
| b + depth * GEMM_UNROLL_N * COMPSIZE, | |||
| c_pos, ldc); | |||
| a_pos += vl * k * COMPSIZE; | |||
| c_pos += vl * COMPSIZE; | |||
| } | |||
| /* rows that do not fill a whole block of vl */ | |||
| rows = m % vl; | |||
| if (rows) { | |||
| if (depth > 0) { | |||
| GEMM_KERNEL(rows, GEMM_UNROLL_N, depth, dm1, | |||
| #ifdef COMPLEX | |||
| ZERO, | |||
| #endif | |||
| a_pos, b, c_pos, ldc); | |||
| } | |||
| solve(rows, GEMM_UNROLL_N, | |||
| a_pos + depth * rows * COMPSIZE, | |||
| b + depth * GEMM_UNROLL_N * COMPSIZE, | |||
| c_pos, ldc); | |||
| a_pos += rows * k * COMPSIZE; | |||
| c_pos += rows * COMPSIZE; | |||
| } | |||
| depth += GEMM_UNROLL_N; | |||
| b += GEMM_UNROLL_N * k * COMPSIZE; | |||
| c += GEMM_UNROLL_N * ldc * COMPSIZE; | |||
| } | |||
| /* leftover columns: one panel for every set bit of n below GEMM_UNROLL_N, widest first */ | |||
| if (n & (GEMM_UNROLL_N - 1)) { | |||
| for (width = (GEMM_UNROLL_N >> 1); width > 0; width >>= 1) { | |||
| if (!(n & width)) continue; | |||
| a_pos = a; | |||
| c_pos = c; | |||
| for (rows = vl; rows <= m; rows += vl) { | |||
| if (depth > 0) { | |||
| GEMM_KERNEL(vl, width, depth, dm1, | |||
| #ifdef COMPLEX | |||
| ZERO, | |||
| #endif | |||
| a_pos, | |||
| b, | |||
| c_pos, | |||
| ldc); | |||
| } | |||
| solve(vl, width, | |||
| a_pos + depth * vl * COMPSIZE, | |||
| b + depth * width * COMPSIZE, c_pos, ldc); | |||
| a_pos += vl * k * COMPSIZE; | |||
| c_pos += vl * COMPSIZE; | |||
| } | |||
| rows = m % vl; | |||
| if (rows) { | |||
| if (depth > 0) { | |||
| GEMM_KERNEL(rows, width, depth, dm1, | |||
| #ifdef COMPLEX | |||
| ZERO, | |||
| #endif | |||
| a_pos, | |||
| b, | |||
| c_pos, | |||
| ldc); | |||
| } | |||
| solve(rows, width, | |||
| a_pos + depth * rows * COMPSIZE, | |||
| b + depth * width * COMPSIZE, c_pos, ldc); | |||
| a_pos += rows * k * COMPSIZE; | |||
| c_pos += rows * COMPSIZE; | |||
| } | |||
| b += width * k * COMPSIZE; | |||
| c += width * ldc * COMPSIZE; | |||
| depth += width; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,828 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" /* project header; FLOAT and the GEMM_* names used below are not defined in this file, so they presumably come from here */ | |||
| #if !defined(DOUBLE) /* single precision build: 32-bit elements, m2 register groups */ | |||
| #define VSETVL(n) vsetvl_e32m2(n) /* vector length to use for n remaining elements */ | |||
| #define VSETVL_MAX vsetvlmax_e32m2() /* largest vector length for this element type and grouping */ | |||
| #define FLOAT_V_T vfloat32m2_t /* vector register type matching FLOAT */ | |||
| #define VLEV_FLOAT vle32_v_f32m2 /* unit-stride load */ | |||
| #define VLSEV_FLOAT vlse32_v_f32m2 /* strided load */ | |||
| #define VLSEG2_FLOAT vlseg2e32_v_f32m2 /* two-field segment load */ | |||
| #define VSEV_FLOAT vse32_v_f32m2 /* unit-stride store */ | |||
| #define VSSEV_FLOAT vsse32_v_f32m2 /* strided store */ | |||
| #define VSSEG2_FLOAT vsseg2e32_v_f32m2 /* two-field segment store */ | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f32m2 /* vd += scalar * vs */ | |||
| #define VFNMSACVF_FLOAT vfnmsac_vf_f32m2 /* vd -= scalar * vs */ | |||
| #else /* double precision build: same set of names mapped to the 64-bit intrinsics */ | |||
| #define VSETVL(n) vsetvl_e64m2(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m2() | |||
| #define FLOAT_V_T vfloat64m2_t | |||
| #define VLEV_FLOAT vle64_v_f64m2 | |||
| #define VLSEV_FLOAT vlse64_v_f64m2 | |||
| #define VLSEG2_FLOAT vlseg2e64_v_f64m2 | |||
| #define VSEV_FLOAT vse64_v_f64m2 | |||
| #define VSSEV_FLOAT vsse64_v_f64m2 | |||
| #define VSSEG2_FLOAT vsseg2e64_v_f64m2 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f64m2 | |||
| #define VFNMSACVF_FLOAT vfnmsac_vf_f64m2 | |||
| #endif | |||
| static FLOAT dm1 = -1.; /* constant -1; NOTE(review): presumably passed to GEMM_KERNEL as its scalar factor by the driver further down - confirm */ | |||
| #ifdef CONJ /* pick the GEMM kernel flavour that matches the conjugation mode of this build */ | |||
| #define GEMM_KERNEL GEMM_KERNEL_R | |||
| #else | |||
| #define GEMM_KERNEL GEMM_KERNEL_N | |||
| #endif | |||
| #if GEMM_DEFAULT_UNROLL_N == 1 /* GEMM_UNROLL_N_SHIFT = log2(GEMM_DEFAULT_UNROLL_N); it stays undefined for values other than 1, 2, 4, 8 and 16 */ | |||
| #define GEMM_UNROLL_N_SHIFT 0 | |||
| #endif | |||
| #if GEMM_DEFAULT_UNROLL_N == 2 | |||
| #define GEMM_UNROLL_N_SHIFT 1 | |||
| #endif | |||
| #if GEMM_DEFAULT_UNROLL_N == 4 | |||
| #define GEMM_UNROLL_N_SHIFT 2 | |||
| #endif | |||
| #if GEMM_DEFAULT_UNROLL_N == 8 | |||
| #define GEMM_UNROLL_N_SHIFT 3 | |||
| #endif | |||
| #if GEMM_DEFAULT_UNROLL_N == 16 | |||
| #define GEMM_UNROLL_N_SHIFT 4 | |||
| #endif | |||
| // Optimizes the implementation in ../arm64/trsm_kernel_RT_sve.c | |||
| #ifndef COMPLEX | |||
| #if GEMM_DEFAULT_UNROLL_N == 1 | |||
| static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { | |||
| FLOAT aa, bb; | |||
| FLOAT *pb, *pc; | |||
| BLASLONG stride_ldc = sizeof(FLOAT) * ldc; | |||
| int i, j, k; | |||
| size_t vl; | |||
| FLOAT_V_T vb, vc; | |||
| a += (n - 1) * m; | |||
| b += (n - 1) * n; | |||
| for (i = n - 1; i >= 0; i--) { | |||
| bb = *(b + i); | |||
| for (j = 0; j < m; j ++) { | |||
| aa = *(c + j + i * ldc); | |||
| aa *= bb; | |||
| *a = aa; | |||
| *(c + j + i * ldc) = aa; | |||
| a ++; | |||
| pb = b; | |||
| pc = c + j; | |||
| for (k = i; k > 0; k -= vl) | |||
| { | |||
| vl = VSETVL(k); | |||
| vc = VLSEV_FLOAT(pc, stride_ldc, vl); | |||
| vb = VLEV_FLOAT(pb, vl); | |||
| vc = VFNMSACVF_FLOAT(vc, aa, vb, vl); | |||
| VSSEV_FLOAT(pc, stride_ldc, vc, vl); | |||
| pb += vl; | |||
| pc++; | |||
| } | |||
| } | |||
| b -= n; | |||
| a -= 2 * m; | |||
| } | |||
| } | |||
| #elif GEMM_DEFAULT_UNROLL_N == 2 | |||
| static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { | |||
| FLOAT aa0, aa1, bb; | |||
| FLOAT *pb, *pc; | |||
| FLOAT *pa0, *pa1, *pc0, *pc1; | |||
| BLASLONG stride_ldc = sizeof(FLOAT) * ldc; | |||
| int i, j, k; | |||
| size_t vl; | |||
| FLOAT_V_T vb, vc0, vc1; | |||
| a += (n - 1) * m; | |||
| b += (n - 1) * n; | |||
| for (i = n - 1; i >= 0; i--) | |||
| { | |||
| bb = *(b + i); | |||
| pc = c + i * ldc; | |||
| for (j = 0; j < m/2; j ++) | |||
| { | |||
| pa0 = pc + j * 2; | |||
| pa1 = pc + j * 2 + 1; | |||
| aa0 = *pa0 * bb; | |||
| aa1 = *pa1 * bb; | |||
| *pa0 = aa0; | |||
| *pa1 = aa1; | |||
| *a = aa0; | |||
| *(a + 1)= aa1; | |||
| a += 2; | |||
| pb = b; | |||
| pc0 = c + j * 2; | |||
| pc1 = pc0 + 1; | |||
| for (k = i; k > 0; k -= vl) | |||
| { | |||
| vl = VSETVL(k); | |||
| vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); | |||
| vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); | |||
| vb = VLEV_FLOAT(pb, vl); | |||
| vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); | |||
| vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); | |||
| VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); | |||
| VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); | |||
| pb += vl; | |||
| pc0++; | |||
| pc1++; | |||
| } | |||
| } | |||
| pc += (m/2)*2; | |||
| if (m & 1) | |||
| { | |||
| pa0 = pc; | |||
| aa0 = *pa0 * bb; | |||
| *pa0 = aa0; | |||
| *a = aa0; | |||
| a += 1; | |||
| pb = b; | |||
| pc0 = pc - i * ldc; | |||
| for (k = i; k > 0; k -= vl) | |||
| { | |||
| vl = VSETVL(k); | |||
| vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); | |||
| vb = VLEV_FLOAT(pb, vl); | |||
| vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); | |||
| VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); | |||
| pb += vl; | |||
| pc0++; | |||
| } | |||
| } | |||
| b -= n; | |||
| a -= 2 * m; | |||
| } | |||
| } | |||
| #elif GEMM_DEFAULT_UNROLL_N == 4 | |||
| static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { | |||
| FLOAT aa0, aa1, aa2, aa3; | |||
| FLOAT bb; | |||
| FLOAT *pb, *pc; | |||
| FLOAT *pa0, *pa1, *pa2, *pa3; | |||
| FLOAT *pc0, *pc1, *pc2, *pc3; | |||
| BLASLONG stride_ldc = sizeof(FLOAT) * ldc; | |||
| int i, j, k; | |||
| size_t vl; | |||
| FLOAT_V_T vb, vc0, vc1, vc2, vc3; | |||
| a += (n - 1) * m; | |||
| b += (n - 1) * n; | |||
| for (i = n - 1; i >= 0; i--) | |||
| { | |||
| bb = *(b + i); | |||
| pc = c + i * ldc; | |||
| for (j = 0; j < m/4; j ++) | |||
| { | |||
| pa0 = pc + j * 4; | |||
| pa1 = pa0 + 1; | |||
| pa2 = pa1 + 1; | |||
| pa3 = pa2 + 1; | |||
| aa0 = *pa0 * bb; | |||
| aa1 = *pa1 * bb; | |||
| aa2 = *pa2 * bb; | |||
| aa3 = *pa3 * bb; | |||
| *pa0 = aa0; | |||
| *pa1 = aa1; | |||
| *pa2 = aa2; | |||
| *pa3 = aa3; | |||
| *a = aa0; | |||
| *(a + 1)= aa1; | |||
| *(a + 2)= aa2; | |||
| *(a + 3)= aa3; | |||
| a += 4; | |||
| pb = b; | |||
| pc0 = c + j * 4; | |||
| pc1 = pc0 + 1; | |||
| pc2 = pc1 + 1; | |||
| pc3 = pc2 + 1; | |||
| for (k = i; k > 0; k -= vl) | |||
| { | |||
| vl = VSETVL(k); | |||
| vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); | |||
| vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); | |||
| vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl); | |||
| vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl); | |||
| vb = VLEV_FLOAT(pb, vl); | |||
| vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); | |||
| vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); | |||
| vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl); | |||
| vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl); | |||
| VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); | |||
| VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); | |||
| VSSEV_FLOAT(pc2, stride_ldc, vc2, vl); | |||
| VSSEV_FLOAT(pc3, stride_ldc, vc3, vl); | |||
| pb += vl; | |||
| pc0++; | |||
| pc1++; | |||
| pc2++; | |||
| pc3++; | |||
| } | |||
| } | |||
| pc += (m/4)*4; | |||
| if (m & 2) | |||
| { | |||
| pa0 = pc + j * 2; | |||
| pa1 = pa0 + 1; | |||
| aa0 = *pa0 * bb; | |||
| aa1 = *pa1 * bb; | |||
| *pa0 = aa0; | |||
| *pa1 = aa1; | |||
| *a = aa0; | |||
| *(a + 1)= aa1; | |||
| a += 2; | |||
| pb = b; | |||
| pc0 = c + j * 4; | |||
| pc1 = pc0 + 1; | |||
| for (k = i; k > 0; k -= vl) | |||
| { | |||
| vl = VSETVL(k); | |||
| vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); | |||
| vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); | |||
| vb = VLEV_FLOAT(pb, vl); | |||
| vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); | |||
| vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); | |||
| VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); | |||
| VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); | |||
| pb += vl; | |||
| pc0++; | |||
| pc1++; | |||
| } | |||
| pc += 2; | |||
| } | |||
| if (m & 1) | |||
| { | |||
| pa0 = pc; | |||
| aa0 = *pa0 * bb; | |||
| *pa0 = aa0; | |||
| *a = aa0; | |||
| a += 1; | |||
| pb = b; | |||
| pc0 = pc - i * ldc; | |||
| for (k = i; k > 0; k -= vl) | |||
| { | |||
| vl = VSETVL(k); | |||
| vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); | |||
| vb = VLEV_FLOAT(pb, vl); | |||
| vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); | |||
| VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); | |||
| pb += vl; | |||
| pc0++; | |||
| } | |||
| } | |||
| b -= n; | |||
| a -= 2 * m; | |||
| } | |||
| } | |||
| #elif GEMM_DEFAULT_UNROLL_N == 8 | |||
| static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { | |||
| FLOAT aa0, aa1, aa2, aa3, aa4, aa5, aa6, aa7; | |||
| FLOAT bb; | |||
| FLOAT *pb, *pc; | |||
| FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7; | |||
| FLOAT *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7; | |||
| BLASLONG stride_ldc = sizeof(FLOAT) * ldc; | |||
| int i, j, k; | |||
| size_t vl; | |||
| FLOAT_V_T vb, vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7; | |||
| a += (n - 1) * m; | |||
| b += (n - 1) * n; | |||
| for (i = n - 1; i >= 0; i--) | |||
| { | |||
| bb = *(b + i); | |||
| pc = c + i * ldc; | |||
| for (j = 0; j < m/8; j ++) | |||
| { | |||
| pa0 = pc + j * 8; | |||
| pa1 = pa0 + 1; | |||
| pa2 = pa1 + 1; | |||
| pa3 = pa2 + 1; | |||
| pa4 = pa3 + 1; | |||
| pa5 = pa4 + 1; | |||
| pa6 = pa5 + 1; | |||
| pa7 = pa6 + 1; | |||
| aa0 = *pa0 * bb; | |||
| aa1 = *pa1 * bb; | |||
| aa2 = *pa2 * bb; | |||
| aa3 = *pa3 * bb; | |||
| aa4 = *pa4 * bb; | |||
| aa5 = *pa5 * bb; | |||
| aa6 = *pa6 * bb; | |||
| aa7 = *pa7 * bb; | |||
| *pa0 = aa0; | |||
| *pa1 = aa1; | |||
| *pa2 = aa2; | |||
| *pa3 = aa3; | |||
| *pa4 = aa4; | |||
| *pa5 = aa5; | |||
| *pa6 = aa6; | |||
| *pa7 = aa7; | |||
| *a = aa0; | |||
| *(a + 1)= aa1; | |||
| *(a + 2)= aa2; | |||
| *(a + 3)= aa3; | |||
| *(a + 4)= aa4; | |||
| *(a + 5)= aa5; | |||
| *(a + 6)= aa6; | |||
| *(a + 7)= aa7; | |||
| a += 8; | |||
| pb = b; | |||
| pc0 = c + j * 8; | |||
| pc1 = pc0 + 1; | |||
| pc2 = pc1 + 1; | |||
| pc3 = pc2 + 1; | |||
| pc4 = pc3 + 1; | |||
| pc5 = pc4 + 1; | |||
| pc6 = pc5 + 1; | |||
| pc7 = pc6 + 1; | |||
| for (k = i; k > 0; k -= vl) | |||
| { | |||
| vl = VSETVL(k); | |||
| vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); | |||
| vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); | |||
| vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl); | |||
| vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl); | |||
| vc4 = VLSEV_FLOAT(pc4, stride_ldc, vl); | |||
| vc5 = VLSEV_FLOAT(pc5, stride_ldc, vl); | |||
| vc6 = VLSEV_FLOAT(pc6, stride_ldc, vl); | |||
| vc7 = VLSEV_FLOAT(pc7, stride_ldc, vl); | |||
| vb = VLEV_FLOAT(pb, vl); | |||
| vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); | |||
| vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); | |||
| vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl); | |||
| vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl); | |||
| vc4 = VFNMSACVF_FLOAT(vc4, aa4, vb, vl); | |||
| vc5 = VFNMSACVF_FLOAT(vc5, aa5, vb, vl); | |||
| vc6 = VFNMSACVF_FLOAT(vc6, aa6, vb, vl); | |||
| vc7 = VFNMSACVF_FLOAT(vc7, aa7, vb, vl); | |||
| VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); | |||
| VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); | |||
| VSSEV_FLOAT(pc2, stride_ldc, vc2, vl); | |||
| VSSEV_FLOAT(pc3, stride_ldc, vc3, vl); | |||
| VSSEV_FLOAT(pc4, stride_ldc, vc4, vl); | |||
| VSSEV_FLOAT(pc5, stride_ldc, vc5, vl); | |||
| VSSEV_FLOAT(pc6, stride_ldc, vc6, vl); | |||
| VSSEV_FLOAT(pc7, stride_ldc, vc7, vl); | |||
| pb += vl; | |||
| pc0++; | |||
| pc1++; | |||
| pc2++; | |||
| pc3++; | |||
| pc4++; | |||
| pc5++; | |||
| pc6++; | |||
| pc7++; | |||
| } | |||
| } | |||
| pc += (m/8)*8; | |||
| if (m & 4) | |||
| { | |||
| pa0 = pc; | |||
| pa1 = pa0 + 1; | |||
| pa2 = pa1 + 1; | |||
| pa3 = pa2 + 1; | |||
| aa0 = *pa0 * bb; | |||
| aa1 = *pa1 * bb; | |||
| aa2 = *pa2 * bb; | |||
| aa3 = *pa3 * bb; | |||
| *pa0 = aa0; | |||
| *pa1 = aa1; | |||
| *pa2 = aa2; | |||
| *pa3 = aa3; | |||
| *a = aa0; | |||
| *(a + 1)= aa1; | |||
| *(a + 2)= aa2; | |||
| *(a + 3)= aa3; | |||
| a += 4; | |||
| pb = b; | |||
| pc0 = pc - i * ldc; | |||
| pc1 = pc0 + 1; | |||
| pc2 = pc1 + 1; | |||
| pc3 = pc2 + 1; | |||
| for (k = i; k > 0; k -= vl) | |||
| { | |||
| vl = VSETVL(k); | |||
| vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); | |||
| vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); | |||
| vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl); | |||
| vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl); | |||
| vb = VLEV_FLOAT(pb, vl); | |||
| vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); | |||
| vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); | |||
| vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl); | |||
| vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl); | |||
| VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); | |||
| VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); | |||
| VSSEV_FLOAT(pc2, stride_ldc, vc2, vl); | |||
| VSSEV_FLOAT(pc3, stride_ldc, vc3, vl); | |||
| pb += vl; | |||
| pc0++; | |||
| pc1++; | |||
| pc2++; | |||
| pc3++; | |||
| } | |||
| pc += 4; | |||
| } | |||
| if (m & 2) | |||
| { | |||
| pa0 = pc; | |||
| pa1 = pa0 + 1; | |||
| aa0 = *pa0 * bb; | |||
| aa1 = *pa1 * bb; | |||
| *pa0 = aa0; | |||
| *pa1 = aa1; | |||
| *a = aa0; | |||
| *(a + 1)= aa1; | |||
| a += 2; | |||
| pb = b; | |||
| pc0 = pc - i * ldc; | |||
| pc1 = pc0 + 1; | |||
| for (k = i; k > 0; k -= vl) | |||
| { | |||
| vl = VSETVL(k); | |||
| vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); | |||
| vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); | |||
| vb = VLEV_FLOAT(pb, vl); | |||
| vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); | |||
| vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); | |||
| VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); | |||
| VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); | |||
| pb += vl; | |||
| pc0++; | |||
| pc1++; | |||
| } | |||
| pc += 2; | |||
| } | |||
| if (m & 1) | |||
| { | |||
| pa0 = pc; | |||
| aa0 = *pa0 * bb; | |||
| *pa0 = aa0; | |||
| *a = aa0; | |||
| a += 1; | |||
| pb = b; | |||
| pc0 = pc - i * ldc; | |||
| for (k = i; k > 0; k -= vl) | |||
| { | |||
| vl = VSETVL(k); | |||
| vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); | |||
| vb = VLEV_FLOAT(pb, vl); | |||
| vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); | |||
| VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); | |||
| pb += vl; | |||
| pc0++; | |||
| } | |||
| } | |||
| b -= n; | |||
| a -= 2 * m; | |||
| } | |||
| } | |||
| #else | |||
| static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { | |||
| FLOAT aa, bb; | |||
| int i, j, k; | |||
| a += (n - 1) * m; | |||
| b += (n - 1) * n; | |||
| for (i = n - 1; i >= 0; i--) { | |||
| bb = *(b + i); | |||
| for (j = 0; j < m; j ++) { | |||
| aa = *(c + j + i * ldc); | |||
| aa *= bb; | |||
| *a = aa; | |||
| *(c + j + i * ldc) = aa; | |||
| a ++; | |||
| for (k = 0; k < i; k ++){ | |||
| *(c + j + k * ldc) -= aa * *(b + k); | |||
| } | |||
| } | |||
| b -= n; | |||
| a -= 2 * m; | |||
| } | |||
| } | |||
| #endif | |||
| #else | |||
| static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { | |||
| FLOAT aa1, aa2; | |||
| FLOAT bb1, bb2; | |||
| FLOAT cc1, cc2; | |||
| int i, j, k; | |||
| ldc *= 2; | |||
| a += (n - 1) * m * 2; | |||
| b += (n - 1) * n * 2; | |||
| for (i = n - 1; i >= 0; i--) { | |||
| bb1 = *(b + i * 2 + 0); | |||
| bb2 = *(b + i * 2 + 1); | |||
| for (j = 0; j < m; j ++) { | |||
| aa1 = *(c + j * 2 + 0 + i * ldc); | |||
| aa2 = *(c + j * 2 + 1 + i * ldc); | |||
| #ifndef CONJ | |||
| cc1 = aa1 * bb1 - aa2 * bb2; | |||
| cc2 = aa1 * bb2 + aa2 * bb1; | |||
| #else | |||
| cc1 = aa1 * bb1 + aa2 * bb2; | |||
| cc2 = - aa1 * bb2 + aa2 * bb1; | |||
| #endif | |||
| *(a + 0) = cc1; | |||
| *(a + 1) = cc2; | |||
| *(c + j * 2 + 0 + i * ldc) = cc1; | |||
| *(c + j * 2 + 1 + i * ldc) = cc2; | |||
| a += 2; | |||
| for (k = 0; k < i; k ++){ | |||
| #ifndef CONJ | |||
| *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); | |||
| *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); | |||
| #else | |||
| *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); | |||
| *(c + j * 2 + 1 + k * ldc) -= -cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); | |||
| #endif | |||
| } | |||
| } | |||
| b -= n * 2; | |||
| a -= 4 * m; | |||
| } | |||
| } | |||
| #endif | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, | |||
| #ifdef COMPLEX | |||
| FLOAT dummy2, | |||
| #endif | |||
| FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ | |||
| BLASLONG i, j; | |||
| FLOAT *aa, *cc; | |||
| BLASLONG kk; | |||
| size_t vl = VSETVL_MAX; | |||
| //fprintf(stderr, "%s , %s, m = %4ld n = %4ld k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug | |||
| kk = n - offset; | |||
| c += n * ldc * COMPSIZE; | |||
| b += n * k * COMPSIZE; | |||
| if (n & (GEMM_UNROLL_N - 1)) { | |||
| j = 1; | |||
| while (j < GEMM_UNROLL_N) { | |||
| if (n & j) { | |||
| aa = a; | |||
| b -= j * k * COMPSIZE; | |||
| c -= j * ldc* COMPSIZE; | |||
| cc = c; | |||
| i = vl; | |||
| if (i <= m) { | |||
| do { | |||
| if (k - kk > 0) { | |||
| GEMM_KERNEL(vl, j, k - kk, dm1, | |||
| #ifdef COMPLEX | |||
| ZERO, | |||
| #endif | |||
| aa + vl * kk * COMPSIZE, | |||
| b + j * kk * COMPSIZE, | |||
| cc, | |||
| ldc); | |||
| } | |||
| solve(vl, j, | |||
| aa + (kk - j) * vl * COMPSIZE, | |||
| b + (kk - j) * j * COMPSIZE, | |||
| cc, ldc); | |||
| aa += vl * k * COMPSIZE; | |||
| cc += vl * COMPSIZE; | |||
| i += vl; | |||
| } while (i <= m); | |||
| } | |||
| i = m % vl; | |||
| if (i) { | |||
| if (k - kk > 0) { | |||
| GEMM_KERNEL(i, j, k - kk, dm1, | |||
| #ifdef COMPLEX | |||
| ZERO, | |||
| #endif | |||
| aa + i * kk * COMPSIZE, | |||
| b + j * kk * COMPSIZE, | |||
| cc, ldc); | |||
| } | |||
| solve(i, j, | |||
| aa + (kk - j) * i * COMPSIZE, | |||
| b + (kk - j) * j * COMPSIZE, | |||
| cc, ldc); | |||
| aa += i * k * COMPSIZE; | |||
| cc += i * COMPSIZE; | |||
| } | |||
| kk -= j; | |||
| } | |||
| j <<= 1; | |||
| } | |||
| } | |||
| j = (n >> GEMM_UNROLL_N_SHIFT); | |||
| if (j > 0) { | |||
| do { | |||
| aa = a; | |||
| b -= GEMM_UNROLL_N * k * COMPSIZE; | |||
| c -= GEMM_UNROLL_N * ldc * COMPSIZE; | |||
| cc = c; | |||
| i = vl; | |||
| if (i <= m) { | |||
| do { | |||
| if (k - kk > 0) { | |||
| GEMM_KERNEL(vl, GEMM_UNROLL_N, k - kk, dm1, | |||
| #ifdef COMPLEX | |||
| ZERO, | |||
| #endif | |||
| aa + vl * kk * COMPSIZE, | |||
| b + GEMM_UNROLL_N * kk * COMPSIZE, | |||
| cc, | |||
| ldc); | |||
| } | |||
| solve(vl, GEMM_UNROLL_N, | |||
| aa + (kk - GEMM_UNROLL_N) * vl * COMPSIZE, | |||
| b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, | |||
| cc, ldc); | |||
| aa += vl * k * COMPSIZE; | |||
| cc += vl * COMPSIZE; | |||
| i += vl; | |||
| } while (i <= m); | |||
| } | |||
| i = m % vl; | |||
| if (i) { | |||
| if (k - kk > 0) { | |||
| GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, | |||
| #ifdef COMPLEX | |||
| ZERO, | |||
| #endif | |||
| aa + i * kk * COMPSIZE, | |||
| b + GEMM_UNROLL_N * kk * COMPSIZE, | |||
| cc, | |||
| ldc); | |||
| } | |||
| solve(i, GEMM_UNROLL_N, | |||
| aa + (kk - GEMM_UNROLL_N) * i * COMPSIZE, | |||
| b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, | |||
| cc, ldc); | |||
| aa += i * k * COMPSIZE; | |||
| cc += i * COMPSIZE; | |||
| } | |||
| kk -= GEMM_UNROLL_N; | |||
| j --; | |||
| } while (j > 0); | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,122 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
/* Precision-neutral aliases for the RVV intrinsics used in this file
 * (LMUL = 2 for data, matching mask and index types). */
#ifdef DOUBLE
#define VSETVL(n)       vsetvl_e64m2(n)
#define FLOAT_V_T       vfloat64m2_t
#define VLEV_FLOAT      vle64_v_f64m2
#define VSEV_FLOAT      vse64_v_f64m2
#define VSEV_FLOAT_M    vse64_v_f64m2_m
#define VLSEV_FLOAT     vlse64_v_f64m2
#define VBOOL_T         vbool32_t
#define UINT_V_T        vuint64m2_t
#define VID_V_UINT      vid_v_u64m2
#define VMSLTU_VX_UINT  vmsltu_vx_u64m2_b32
#else
#define VSETVL(n)       vsetvl_e32m2(n)
#define FLOAT_V_T       vfloat32m2_t
#define VLEV_FLOAT      vle32_v_f32m2
#define VSEV_FLOAT      vse32_v_f32m2
#define VSEV_FLOAT_M    vse32_v_f32m2_m
#define VLSEV_FLOAT     vlse32_v_f32m2
#define VBOOL_T         vbool16_t
#define UINT_V_T        vuint32m2_t
#define VID_V_UINT      vid_v_u32m2
#define VMSLTU_VX_UINT  vmsltu_vx_u32m2_b16
#endif

/* Value stored on the diagonal: ONE for UNIT builds, otherwise the
 * reciprocal of the source element. */
#ifdef UNIT
#define INV(a) (ONE)
#else
#define INV(a) (ONE / (a))
#endif
| // Optimizes the implementation in ../arm64/trsm_lncopy_sve.c | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ | |||
| BLASLONG i, ii, jj, js; | |||
| FLOAT *ao; | |||
| jj = offset; | |||
| BLASLONG stride_lda = sizeof(FLOAT)*lda; | |||
| FLOAT_V_T va1; | |||
| VBOOL_T vbool_cmp; | |||
| UINT_V_T vindex; | |||
| size_t vl; | |||
| for (js = n; js > 0; js -= vl) | |||
| { | |||
| vl = VSETVL(js); | |||
| ao = a; | |||
| ii = 0; | |||
| for (i = 0; i < m;) | |||
| { | |||
| if (ii == jj) | |||
| { | |||
| vindex = VID_V_UINT(vl); | |||
| for (unsigned int j = 0; j < vl; j++) | |||
| { | |||
| va1 = VLSEV_FLOAT(ao, stride_lda, vl); | |||
| vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); | |||
| VSEV_FLOAT_M(vbool_cmp, b, va1, vl); | |||
| *(b + j) = INV(*(ao + j * lda)); | |||
| ao++; | |||
| b += vl; | |||
| } | |||
| i += vl; | |||
| ii += vl; | |||
| } | |||
| else | |||
| { | |||
| if (ii > jj) | |||
| { | |||
| va1 = VLSEV_FLOAT(ao, stride_lda, vl); | |||
| VSEV_FLOAT(b, va1, vl); | |||
| } | |||
| ao++; | |||
| b += vl; | |||
| i++; | |||
| ii++; | |||
| } | |||
| } | |||
| a += vl * lda; | |||
| jj += vl; | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,122 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
/* Precision-neutral aliases for the RVV intrinsics used in this file
 * (LMUL = 2 for data, matching mask and index types). */
#ifdef DOUBLE
#define VSETVL(n)       vsetvl_e64m2(n)
#define FLOAT_V_T       vfloat64m2_t
#define VLEV_FLOAT      vle64_v_f64m2
#define VSEV_FLOAT      vse64_v_f64m2
#define VSEV_FLOAT_M    vse64_v_f64m2_m
#define VLSEV_FLOAT     vlse64_v_f64m2
#define VBOOL_T         vbool32_t
#define UINT_V_T        vuint64m2_t
#define VID_V_UINT      vid_v_u64m2
#define VMSGTU_VX_UINT  vmsgtu_vx_u64m2_b32
#else
#define VSETVL(n)       vsetvl_e32m2(n)
#define FLOAT_V_T       vfloat32m2_t
#define VLEV_FLOAT      vle32_v_f32m2
#define VSEV_FLOAT      vse32_v_f32m2
#define VSEV_FLOAT_M    vse32_v_f32m2_m
#define VLSEV_FLOAT     vlse32_v_f32m2
#define VBOOL_T         vbool16_t
#define UINT_V_T        vuint32m2_t
#define VID_V_UINT      vid_v_u32m2
#define VMSGTU_VX_UINT  vmsgtu_vx_u32m2_b16
#endif

/* Value stored on the diagonal: ONE for UNIT builds, otherwise the
 * reciprocal of the source element. */
#ifdef UNIT
#define INV(a) (ONE)
#else
#define INV(a) (ONE / (a))
#endif
| // Optimizes the implementation in ../arm64/trsm_ltcopy_sve.c | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ | |||
| BLASLONG i, ii, jj, js; | |||
| FLOAT *ao; | |||
| jj = offset; | |||
| FLOAT_V_T va1; | |||
| VBOOL_T vbool_cmp; | |||
| UINT_V_T vindex; | |||
| size_t vl; | |||
| for (js = n; js > 0; js -= vl) | |||
| { | |||
| vl = VSETVL(js); | |||
| ao = a; | |||
| ii = 0; | |||
| for (i = 0; i < m;) | |||
| { | |||
| if (ii == jj) | |||
| { | |||
| vindex = VID_V_UINT(vl); | |||
| for (unsigned int j = 0; j < vl; j++) | |||
| { | |||
| *(b + j) = INV(*(ao + j)); | |||
| va1 = VLEV_FLOAT(ao, vl); | |||
| vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); | |||
| VSEV_FLOAT_M(vbool_cmp, b, va1, vl); | |||
| b += vl; | |||
| ao += lda; | |||
| } | |||
| i += vl; | |||
| ii += vl; | |||
| } | |||
| else | |||
| { | |||
| if (ii < jj) | |||
| { | |||
| va1 = VLEV_FLOAT(ao, vl); | |||
| VSEV_FLOAT(b, va1, vl); | |||
| } | |||
| ao += lda; | |||
| b += vl; | |||
| i ++; | |||
| ii ++; | |||
| } | |||
| } | |||
| a += vl; | |||
| jj += vl; | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,121 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
/* Precision-neutral aliases for the RVV intrinsics used in this file
 * (LMUL = 2 for data, matching mask and index types). */
#ifdef DOUBLE
#define VSETVL(n)       vsetvl_e64m2(n)
#define FLOAT_V_T       vfloat64m2_t
#define VLEV_FLOAT      vle64_v_f64m2
#define VSEV_FLOAT      vse64_v_f64m2
#define VSEV_FLOAT_M    vse64_v_f64m2_m
#define VLSEV_FLOAT     vlse64_v_f64m2
#define VBOOL_T         vbool32_t
#define UINT_V_T        vuint64m2_t
#define VID_V_UINT      vid_v_u64m2
#define VMSGTU_VX_UINT  vmsgtu_vx_u64m2_b32
#else
#define VSETVL(n)       vsetvl_e32m2(n)
#define FLOAT_V_T       vfloat32m2_t
#define VLEV_FLOAT      vle32_v_f32m2
#define VSEV_FLOAT      vse32_v_f32m2
#define VSEV_FLOAT_M    vse32_v_f32m2_m
#define VLSEV_FLOAT     vlse32_v_f32m2
#define VBOOL_T         vbool16_t
#define UINT_V_T        vuint32m2_t
#define VID_V_UINT      vid_v_u32m2
#define VMSGTU_VX_UINT  vmsgtu_vx_u32m2_b16
#endif

/* Value stored on the diagonal: ONE for UNIT builds, otherwise the
 * reciprocal of the source element. */
#ifdef UNIT
#define INV(a) (ONE)
#else
#define INV(a) (ONE / (a))
#endif
| // Optimizes the implementation in ../arm64/trsm_uncopy_sve.c | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ | |||
| BLASLONG i, ii, jj, js; | |||
| BLASLONG stride_lda = sizeof(FLOAT)*lda; | |||
| FLOAT *ao; | |||
| jj = offset; | |||
| FLOAT_V_T va1; | |||
| VBOOL_T vbool_cmp; | |||
| UINT_V_T vindex; | |||
| size_t vl; | |||
| for (js = n; js > 0; js -= vl) | |||
| { | |||
| vl = VSETVL(js); | |||
| ao = a; | |||
| i = 0; | |||
| ii = 0; | |||
| for (i = 0; i < m;) | |||
| { | |||
| if (ii == jj) | |||
| { | |||
| vindex = VID_V_UINT(vl); | |||
| for (unsigned int j = 0; j < vl; j++) | |||
| { | |||
| *(b + j) = INV(*(ao + j * lda)); | |||
| va1 = VLSEV_FLOAT(ao, stride_lda, vl); | |||
| vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); | |||
| VSEV_FLOAT_M(vbool_cmp, b, va1, vl); | |||
| ao++; | |||
| b += vl; | |||
| } | |||
| i += vl; | |||
| ii += vl; | |||
| } | |||
| else | |||
| { | |||
| if (ii < jj) | |||
| { | |||
| va1 = VLSEV_FLOAT(ao, stride_lda, vl); | |||
| VSEV_FLOAT(b, va1, vl); | |||
| } | |||
| ao++; | |||
| b += vl; | |||
| i++; | |||
| ii++; | |||
| } | |||
| } | |||
| a += vl * lda; | |||
| jj += vl; | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,123 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| /* | |||
| * Precision-dependent aliases for the RVV intrinsics and vector types used by | |||
| * the packing routine below: the 32-bit m2 variants when DOUBLE is not defined, | |||
| * the 64-bit m2 variants otherwise. | |||
| */ | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) vsetvl_e32m2(n) | |||
| #define FLOAT_V_T vfloat32m2_t | |||
| #define VLEV_FLOAT vle32_v_f32m2 | |||
| #define VSEV_FLOAT vse32_v_f32m2 | |||
| #define VSEV_FLOAT_M vse32_v_f32m2_m | |||
| #define VLSEV_FLOAT vlse32_v_f32m2 | |||
| #define VBOOL_T vbool16_t | |||
| #define UINT_V_T vuint32m2_t | |||
| #define VID_V_UINT vid_v_u32m2 | |||
| #define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16 | |||
| #else | |||
| #define VSETVL(n) vsetvl_e64m2(n) | |||
| #define FLOAT_V_T vfloat64m2_t | |||
| #define VLEV_FLOAT vle64_v_f64m2 | |||
| #define VSEV_FLOAT vse64_v_f64m2 | |||
| #define VSEV_FLOAT_M vse64_v_f64m2_m | |||
| #define VLSEV_FLOAT vlse64_v_f64m2 | |||
| #define VBOOL_T vbool32_t | |||
| #define UINT_V_T vuint64m2_t | |||
| #define VID_V_UINT vid_v_u64m2 | |||
| #define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32 | |||
| #endif | |||
| /* | |||
| * INV(a) is the value stored on the diagonal: ONE / (a) normally, or plain ONE | |||
| * when UNIT is defined (the argument is then not evaluated). | |||
| */ | |||
| #ifndef UNIT | |||
| #define INV(a) (ONE / (a)) | |||
| #else | |||
| #define INV(a) (ONE) | |||
| #endif | |||
| // Optimizes the implementation in ../arm64/trsm_utcopy_sve.c | |||
| /* | |||
| * Packs the matrix at a (leading dimension lda) into the buffer b, working on | |||
| * strips of vl columns, where vl is whatever VSETVL grants for the columns left. | |||
| * For every strip the m rows are visited in order and b advances by vl per row: | |||
| *   - rows positioned after the strip's diagonal (offset plus the columns | |||
| *     already packed) are copied as-is; | |||
| *   - rows positioned before it are skipped, leaving their slot in b untouched; | |||
| *   - at the diagonal a vl x vl block is emitted: row d receives the source | |||
| *     elements with index < d and INV() of the source element at index d. | |||
| * Always returns 0. | |||
| */ | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ | |||
|     BLASLONG row, row_pos, diag_pos, cols_left; | |||
|     FLOAT *src; | |||
|     FLOAT_V_T vrow; | |||
|     VBOOL_T before_diag; | |||
|     UINT_V_T lane_id; | |||
|     size_t vl; | |||
|     diag_pos = offset; | |||
|     for (cols_left = n; cols_left > 0; cols_left -= vl) | |||
|     { | |||
|         vl = VSETVL(cols_left); | |||
|         src = a; | |||
|         row_pos = 0; | |||
|         row = 0; | |||
|         while (row < m) | |||
|         { | |||
|             if (row_pos != diag_pos) | |||
|             { | |||
|                 /* Off-diagonal row: copied only when it lies past the diagonal. */ | |||
|                 if (row_pos > diag_pos) | |||
|                 { | |||
|                     vrow = VLEV_FLOAT(src, vl); | |||
|                     VSEV_FLOAT(b, vrow, vl); | |||
|                 } | |||
|                 src += lda; | |||
|                 b += vl; | |||
|                 row += 1; | |||
|                 row_pos += 1; | |||
|                 continue; | |||
|             } | |||
|             /* Diagonal block: vl consecutive rows are handled in one go. */ | |||
|             lane_id = VID_V_UINT(vl); | |||
|             for (unsigned int d = 0; d < vl; d++) | |||
|             { | |||
|                 vrow = VLEV_FLOAT(src, vl); | |||
|                 /* Mask selects the lanes with index < d; only those are stored. */ | |||
|                 before_diag = VMSLTU_VX_UINT(lane_id, d, vl); | |||
|                 VSEV_FLOAT_M(before_diag, b, vrow, vl); | |||
|                 b[d] = INV(src[d]); | |||
|                 src += lda; | |||
|                 b += vl; | |||
|             } | |||
|             row += vl; | |||
|             row_pos += vl; | |||
|         } | |||
|         a += vl; | |||
|         diag_pos += vl; | |||
|     } | |||
|     return 0; | |||
| } | |||
| @@ -0,0 +1,113 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <float.h> | |||
| /* | |||
| * Precision-dependent aliases for the RVV intrinsics and vector types used by | |||
| * the routine below: 32-bit variants when DOUBLE is not defined, 64-bit | |||
| * variants otherwise. The working vectors are the m4 types; the reduction | |||
| * result lives in an m1 vector. | |||
| */ | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) vsetvl_e32m4(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m4() | |||
| #define VSETVL_MAX_M1 vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m4_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLSEG_FLOAT vlseg2e32_v_f32m4 | |||
| #define VLSSEG_FLOAT vlsseg2e32_v_f32m4 | |||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f32m4_f32m1 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m4 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFMAXVV_FLOAT vfmax_vv_f32m4 | |||
| #define VFADDVV_FLOAT vfadd_vv_f32m4 | |||
| #define VFABSV_FLOAT vfabs_v_f32m4 | |||
| #define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 | |||
| #else | |||
| #define VSETVL(n) vsetvl_e64m4(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m4() | |||
| #define VSETVL_MAX_M1 vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLSEG_FLOAT vlseg2e64_v_f64m4 | |||
| #define VLSSEG_FLOAT vlsseg2e64_v_f64m4 | |||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f64m4_f64m1 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m4 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFMAXVV_FLOAT vfmax_vv_f64m4 | |||
| #define VFADDVV_FLOAT vfadd_vv_f64m4 | |||
| #define VFABSV_FLOAT vfabs_v_f64m4 | |||
| #define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 | |||
| #endif | |||
| /* | |||
| * Returns the largest value of |re| + |im| over the n complex elements of x. | |||
| * x holds interleaved (re, im) pairs; consecutive elements are inc_x complex | |||
| * values apart. Returns 0.0 when n <= 0 or inc_x <= 0. | |||
| */ | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
|     FLOAT maxf=0.0; | |||
|     if (n <= 0 || inc_x <= 0) return(maxf); | |||
|     FLOAT_V_T v0, v1, vmax; | |||
|     FLOAT_V_T_M1 v_res; | |||
|     v_res = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); | |||
|     size_t vlmax = VSETVL_MAX; | |||
|     vmax = VFMVVF_FLOAT(0.0, vlmax); | |||
|     /* | |||
|      * Only full-width strips are folded into vmax. The result of an intrinsic | |||
|      * executed with vl < vlmax has no guaranteed contents past vl, so folding a | |||
|      * partial strip into vmax and reducing over vlmax afterwards could pick up | |||
|      * arbitrary lanes. Partial strips are reduced straight into v_res instead, | |||
|      * using their own vl. | |||
|      */ | |||
|     if(inc_x == 1) { | |||
|         for (size_t vl; n > 0; n -= vl, x += vl*2) { | |||
|             vl = VSETVL(n); | |||
|             VLSEG_FLOAT(&v0, &v1, x, vl); | |||
|             v0 = VFABSV_FLOAT(v0, vl); | |||
|             v1 = VFABSV_FLOAT(v1, vl); | |||
|             v0 = VFADDVV_FLOAT(v0, v1, vl); | |||
|             if (vl == vlmax) { | |||
|                 vmax = VFMAXVV_FLOAT(vmax, v0, vl); | |||
|             } else { | |||
|                 v_res = VFREDMAXVS_FLOAT(v_res, v0, v_res, vl); | |||
|             } | |||
|         } | |||
|     } else { | |||
|         /* Byte distance between consecutive complex elements. */ | |||
|         BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; | |||
|         for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { | |||
|             vl = VSETVL(n); | |||
|             VLSSEG_FLOAT(&v0, &v1, x, stride_x, vl); | |||
|             v0 = VFABSV_FLOAT(v0, vl); | |||
|             v1 = VFABSV_FLOAT(v1, vl); | |||
|             v0 = VFADDVV_FLOAT(v0, v1, vl); | |||
|             if (vl == vlmax) { | |||
|                 vmax = VFMAXVV_FLOAT(vmax, v0, vl); | |||
|             } else { | |||
|                 v_res = VFREDMAXVS_FLOAT(v_res, v0, v_res, vl); | |||
|             } | |||
|         } | |||
|     } | |||
|     /* vmax is fully defined over vlmax lanes: initialised and only updated at vlmax. */ | |||
|     v_res = VFREDMAXVS_FLOAT(v_res, vmax, v_res, vlmax); | |||
|     maxf = VFMVFS_FLOAT_M1(v_res); | |||
|     return(maxf); | |||
| } | |||
| @@ -0,0 +1,112 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <float.h> | |||
| /* | |||
| * Precision-dependent aliases for the RVV intrinsics and vector types used by | |||
| * the routine below: 32-bit variants when DOUBLE is not defined, 64-bit | |||
| * variants otherwise. The working vectors are the m4 types; the reduction | |||
| * result lives in an m1 vector. | |||
| */ | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) vsetvl_e32m4(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m4() | |||
| #define VSETVL_MAX_M1 vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m4_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLSEG_FLOAT vlseg2e32_v_f32m4 | |||
| #define VLSSEG_FLOAT vlsseg2e32_v_f32m4 | |||
| #define VFREDMINVS_FLOAT vfredmin_vs_f32m4_f32m1 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m4 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFMINVV_FLOAT vfmin_vv_f32m4 | |||
| #define VFADDVV_FLOAT vfadd_vv_f32m4 | |||
| #define VFABSV_FLOAT vfabs_v_f32m4 | |||
| #define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 | |||
| #else | |||
| #define VSETVL(n) vsetvl_e64m4(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m4() | |||
| #define VSETVL_MAX_M1 vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLSEG_FLOAT vlseg2e64_v_f64m4 | |||
| #define VLSSEG_FLOAT vlsseg2e64_v_f64m4 | |||
| #define VFREDMINVS_FLOAT vfredmin_vs_f64m4_f64m1 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m4 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFMINVV_FLOAT vfmin_vv_f64m4 | |||
| #define VFADDVV_FLOAT vfadd_vv_f64m4 | |||
| #define VFABSV_FLOAT vfabs_v_f64m4 | |||
| #define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 | |||
| #endif | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| FLOAT minf=0.0; | |||
| if (n <= 0 || inc_x <= 0) return(minf); | |||
| FLOAT_V_T v0, v1, vmin; | |||
| FLOAT_V_T_M1 v_res; | |||
| v_res = VFMVVF_FLOAT_M1(FLT_MAX, VSETVL_MAX_M1); | |||
| size_t vlmax = VSETVL_MAX; | |||
| vmin = VFMVVF_FLOAT(FLT_MAX, vlmax); | |||
| if(inc_x == 1) { | |||
| for (size_t vl; n > 0; n -= vl, x += vl*2) { | |||
| vl = VSETVL(n); | |||
| VLSEG_FLOAT(&v0, &v1, x, vl); | |||
| v0 = VFABSV_FLOAT(v0, vl); | |||
| v1 = VFABSV_FLOAT(v1, vl); | |||
| v0 = VFADDVV_FLOAT(v0, v1, vl); | |||
| vmin = VFMINVV_FLOAT(vmin, v0, vl); | |||
| } | |||
| } else { | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; | |||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { | |||
| vl = VSETVL(n); | |||
| VLSSEG_FLOAT(&v0, &v1, x, stride_x, vl); | |||
| v0 = VFABSV_FLOAT(v0, vl); | |||
| v1 = VFABSV_FLOAT(v1, vl); | |||
| v0 = VFADDVV_FLOAT(v0, v1, vl); | |||
| vmin = VFMINVV_FLOAT(vmin, v0, vl); | |||
| } | |||
| } | |||
| v_res = VFREDMINVS_FLOAT(v_res, vmin, v_res, vlmax); | |||
| minf = VFMVFS_FLOAT_M1(v_res); | |||
| return(minf); | |||
| } | |||
| @@ -0,0 +1,108 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| /* | |||
| * Precision-dependent aliases for the RVV intrinsics and vector types used by | |||
| * the routine below: 32-bit variants when DOUBLE is not defined, 64-bit | |||
| * variants otherwise. The working vectors are the m8 types; the reduction | |||
| * result lives in an m1 vector. | |||
| */ | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) vsetvl_e32m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m8() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLEV_FLOAT vle32_v_f32m8 | |||
| #define VLSEV_FLOAT vlse32_v_f32m8 | |||
| #define VFREDSUMVS_FLOAT vfredusum_vs_f32m8_f32m1 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 | |||
| #define VFADDVV_FLOAT vfadd_vv_f32m8 | |||
| #define VFABSV_FLOAT vfabs_v_f32m8 | |||
| #else | |||
| #define VSETVL(n) vsetvl_e64m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m8() | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLEV_FLOAT vle64_v_f64m8 | |||
| #define VLSEV_FLOAT vlse64_v_f64m8 | |||
| #define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 | |||
| #define VFADDVV_FLOAT vfadd_vv_f64m8 | |||
| #define VFABSV_FLOAT vfabs_v_f64m8 | |||
| #endif | |||
| /* | |||
| * Returns the sum of |re| + |im| over the n complex elements of x. | |||
| * x holds interleaved (re, im) pairs; consecutive elements are inc_x complex | |||
| * values apart. Returns 0.0 when n <= 0 or inc_x <= 0. | |||
| */ | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
|     FLOAT asumf = 0.0; | |||
|     if (n <= 0 || inc_x <= 0) return(asumf); | |||
|     FLOAT_V_T v0, v1; | |||
|     size_t vlmax = VSETVL_MAX; | |||
|     FLOAT_V_T v_sum = VFMVVF_FLOAT(0, vlmax); | |||
|     FLOAT_V_T_M1 v_res = VFMVVF_FLOAT_M1(0, vlmax); | |||
|     /* | |||
|      * Only full-width strips are accumulated in v_sum. The result of an | |||
|      * intrinsic executed with vl < vlmax has no guaranteed contents past vl, so | |||
|      * accumulating a partial strip in v_sum and reducing over vlmax afterwards | |||
|      * could add arbitrary lanes. Partial strips are reduced straight into v_res | |||
|      * instead, using their own vl. | |||
|      */ | |||
|     if(inc_x == 1) { | |||
|         /* Contiguous data: the 2*vl scalars of a strip are read as two plain | |||
|          * vectors; which half is real or imaginary does not matter for a sum | |||
|          * of absolute values. */ | |||
|         for (size_t vl; n > 0; n -= vl, x += vl*2) { | |||
|             vl = VSETVL(n); | |||
|             v0 = VLEV_FLOAT(x, vl); | |||
|             v1 = VLEV_FLOAT(x+vl, vl); | |||
|             v0 = VFABSV_FLOAT(v0, vl); | |||
|             v1 = VFABSV_FLOAT(v1, vl); | |||
|             if (vl == vlmax) { | |||
|                 v_sum = VFADDVV_FLOAT(v_sum, v0, vl); | |||
|                 v_sum = VFADDVV_FLOAT(v_sum, v1, vl); | |||
|             } else { | |||
|                 v0 = VFADDVV_FLOAT(v0, v1, vl); | |||
|                 v_res = VFREDSUMVS_FLOAT(v_res, v0, v_res, vl); | |||
|             } | |||
|         } | |||
|     } | |||
|     else { | |||
|         /* Byte distance between consecutive complex elements; kept in BLASLONG | |||
|          * so a large inc_x is not truncated. */ | |||
|         BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; | |||
|         for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { | |||
|             vl = VSETVL(n); | |||
|             v0 = VLSEV_FLOAT(x, stride_x, vl); | |||
|             v1 = VLSEV_FLOAT(x+1, stride_x, vl); | |||
|             v0 = VFABSV_FLOAT(v0, vl); | |||
|             v1 = VFABSV_FLOAT(v1, vl); | |||
|             if (vl == vlmax) { | |||
|                 v_sum = VFADDVV_FLOAT(v_sum, v0, vl); | |||
|                 v_sum = VFADDVV_FLOAT(v_sum, v1, vl); | |||
|             } else { | |||
|                 v0 = VFADDVV_FLOAT(v0, v1, vl); | |||
|                 v_res = VFREDSUMVS_FLOAT(v_res, v0, v_res, vl); | |||
|             } | |||
|         } | |||
|     } | |||
|     /* v_sum is fully defined over vlmax lanes: initialised and only updated at vlmax. */ | |||
|     v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_res, vlmax); | |||
|     asumf += VFMVFS_FLOAT_M1(v_res); | |||
|     return(asumf); | |||
| } | |||
| @@ -0,0 +1,151 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /*************************************************************************** | |||
| * 2014/06/07 Saar | |||
| * | |||
| ***************************************************************************/ | |||
| #include "common.h" | |||
| /* | |||
| * Precision-dependent aliases for the RVV intrinsics and vector types used by | |||
| * the routine below: the 32-bit m4 variants when DOUBLE is not defined, the | |||
| * 64-bit m4 variants otherwise. | |||
| */ | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) vsetvl_e32m4(n) | |||
| #define FLOAT_V_T vfloat32m4_t | |||
| #define VLSEV_FLOAT vlse32_v_f32m4 | |||
| #define VSSEV_FLOAT vsse32_v_f32m4 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f32m4 | |||
| #define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m4 | |||
| #define VFMULVF_FLOAT vfmul_vf_f32m4 | |||
| #define VFMSACVF_FLOAT vfmsac_vf_f32m4 | |||
| #define VLSEG_FLOAT vlseg2e32_v_f32m4 | |||
| #define VSSEG_FLOAT vsseg2e32_v_f32m4 | |||
| #define VLSSEG_FLOAT vlsseg2e32_v_f32m4 | |||
| #define VSSSEG_FLOAT vssseg2e32_v_f32m4 | |||
| #else | |||
| #define VSETVL(n) vsetvl_e64m4(n) | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define VLSEV_FLOAT vlse64_v_f64m4 | |||
| #define VSSEV_FLOAT vsse64_v_f64m4 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f64m4 | |||
| #define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m4 | |||
| #define VFMULVF_FLOAT vfmul_vf_f64m4 | |||
| #define VFMSACVF_FLOAT vfmsac_vf_f64m4 | |||
| #define VLSEG_FLOAT vlseg2e64_v_f64m4 | |||
| #define VSSEG_FLOAT vsseg2e64_v_f64m4 | |||
| #define VLSSEG_FLOAT vlsseg2e64_v_f64m4 | |||
| #define VSSSEG_FLOAT vssseg2e64_v_f64m4 | |||
| #endif | |||
| /* | |||
| * Complex y := alpha * x + beta * y over n elements. | |||
| * x and y hold interleaved (re, im) pairs; inc_x and inc_y are the element | |||
| * strides counted in complex values. alpha and beta are passed as separate | |||
| * real/imaginary parts. Dedicated paths skip reading y when beta is zero and | |||
| * skip reading x when alpha is zero. Always returns 0; n <= 0 is a no-op. | |||
| */ | |||
| int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FLOAT beta_r, FLOAT beta_i,FLOAT *y, BLASLONG inc_y) | |||
| { | |||
|     BLASLONG inc_x2, inc_y2; | |||
|     if ( n <= 0 ) return(0); | |||
|     /* Strides in FLOAT units (two scalars per complex value) ... */ | |||
|     inc_x2 = 2 * inc_x; | |||
|     inc_y2 = 2 * inc_y; | |||
|     /* ... and the same strides in bytes, as the segment intrinsics expect. */ | |||
|     BLASLONG stride_x = inc_x2 * sizeof(FLOAT); | |||
|     BLASLONG stride_y = inc_y2 * sizeof(FLOAT); | |||
|     FLOAT_V_T vx0, vx1, vy0, vy1; | |||
|     if ( beta_r == 0.0 && beta_i == 0.0) | |||
|     { | |||
|         if ( alpha_r == 0.0 && alpha_i == 0.0 ) | |||
|         { | |||
|             /* y := 0. The zero vector is built once with the first (largest) vl. */ | |||
|             size_t vl = VSETVL(n); | |||
|             FLOAT_V_T temp = VFMVVF_FLOAT(0.0, vl); | |||
|             /* y is a FLOAT pointer, so it advances by the element stride | |||
|              * inc_y2, not by the byte stride stride_y. */ | |||
|             for ( ; n > 0; n -= vl, y += vl*inc_y2) | |||
|             { | |||
|                 vl = VSETVL(n); | |||
|                 VSSSEG_FLOAT(y, stride_y, temp, temp, vl); | |||
|             } | |||
|         } | |||
|         else | |||
|         { | |||
|             /* y := alpha * x */ | |||
|             for (size_t vl; n > 0; n -= vl, x += vl*inc_x2, y += vl*inc_y2) | |||
|             { | |||
|                 vl = VSETVL(n); | |||
|                 VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); | |||
|                 /* re = alpha_r * x_re - alpha_i * x_im */ | |||
|                 vy0 = VFMULVF_FLOAT(vx1, alpha_i, vl); | |||
|                 vy0 = VFMSACVF_FLOAT(vy0, alpha_r, vx0, vl); | |||
|                 /* im = alpha_r * x_im + alpha_i * x_re */ | |||
|                 vy1 = VFMULVF_FLOAT(vx1, alpha_r, vl); | |||
|                 vy1 = VFMACCVF_FLOAT(vy1, alpha_i, vx0, vl); | |||
|                 VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl); | |||
|             } | |||
|         } | |||
|     } | |||
|     else | |||
|     { | |||
|         FLOAT_V_T v0, v1; | |||
|         if ( alpha_r == 0.0 && alpha_i == 0.0 ) | |||
|         { | |||
|             /* y := beta * y */ | |||
|             for (size_t vl; n > 0; n -= vl, y += vl*inc_y2) | |||
|             { | |||
|                 vl = VSETVL(n); | |||
|                 VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); | |||
|                 /* re = beta_r * y_re - beta_i * y_im */ | |||
|                 v0 = VFMULVF_FLOAT(vy1, beta_i, vl); | |||
|                 v0 = VFMSACVF_FLOAT(v0, beta_r, vy0, vl); | |||
|                 /* im = beta_r * y_im + beta_i * y_re */ | |||
|                 v1 = VFMULVF_FLOAT(vy1, beta_r, vl); | |||
|                 v1 = VFMACCVF_FLOAT(v1, beta_i, vy0, vl); | |||
|                 VSSSEG_FLOAT(y, stride_y, v0, v1, vl); | |||
|             } | |||
|         } | |||
|         else | |||
|         { | |||
|             /* y := alpha * x + beta * y */ | |||
|             for (size_t vl; n > 0; n -= vl, x += vl*inc_x2, y += vl*inc_y2) | |||
|             { | |||
|                 vl = VSETVL(n); | |||
|                 VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); | |||
|                 VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); | |||
|                 /* re = alpha_r*x_re - alpha_i*x_im + beta_r*y_re - beta_i*y_im */ | |||
|                 v0 = VFMULVF_FLOAT(vx0, alpha_r, vl); | |||
|                 v0 = VFNMSACVF_FLOAT(v0, alpha_i, vx1, vl); | |||
|                 v0 = VFMACCVF_FLOAT(v0, beta_r, vy0, vl); | |||
|                 v0 = VFNMSACVF_FLOAT(v0, beta_i, vy1, vl); | |||
|                 /* im = alpha_r*x_im + alpha_i*x_re + beta_r*y_im + beta_i*y_re */ | |||
|                 v1 = VFMULVF_FLOAT(vx1, alpha_r, vl); | |||
|                 v1 = VFMACCVF_FLOAT(v1, alpha_i, vx0, vl); | |||
|                 v1 = VFMACCVF_FLOAT(v1, beta_r, vy1, vl); | |||
|                 v1 = VFMACCVF_FLOAT(v1, beta_i, vy0, vl); | |||
|                 VSSSEG_FLOAT(y, stride_y, v0, v1, vl); | |||
|             } | |||
|         } | |||
|     } | |||
|     return(0); | |||
| } | |||
| @@ -0,0 +1,154 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| /* | |||
| * Precision-dependent aliases for the RVV intrinsics and vector types used by | |||
| * the routine below: the 32-bit m4 variants when DOUBLE is not defined, the | |||
| * 64-bit m4 variants otherwise. | |||
| */ | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) vsetvl_e32m4(n) | |||
| #define FLOAT_V_T vfloat32m4_t | |||
| #define VLSEG_FLOAT vlseg2e32_v_f32m4 | |||
| #define VLSSEG_FLOAT vlsseg2e32_v_f32m4 | |||
| #define VSSEG_FLOAT vsseg2e32_v_f32m4 | |||
| #define VSSSEG_FLOAT vssseg2e32_v_f32m4 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f32m4 | |||
| #define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 | |||
| #else | |||
| #define VSETVL(n) vsetvl_e64m4(n) | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define VLSEG_FLOAT vlseg2e64_v_f64m4 | |||
| #define VLSSEG_FLOAT vlsseg2e64_v_f64m4 | |||
| #define VSSEG_FLOAT vsseg2e64_v_f64m4 | |||
| #define VSSSEG_FLOAT vssseg2e64_v_f64m4 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f64m4 | |||
| #define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 | |||
| #endif | |||
| /* | |||
| * Complex y := y + da * x over n elements, or y := y + da * conj(x) when CONJ | |||
| * is defined. x and y hold interleaved (re, im) pairs; inc_x and inc_y are the | |||
| * element strides counted in complex values. Unit-stride operands use the | |||
| * contiguous segment load/store, all others the strided form. | |||
| * dummy0, dummy1, dummy and dummy2 are not used. Always returns 0; the call is | |||
| * a no-op when n < 0 or when da is exactly zero. | |||
| */ | |||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||
| { | |||
|     if (n < 0) return(0); | |||
|     if (da_r == 0.0 && da_i == 0.0) return(0); | |||
|     const int unit_x = (inc_x == 1); | |||
|     const int unit_y = (inc_y == 1); | |||
|     /* Byte strides for the strided segment accesses. */ | |||
|     const BLASLONG x_bytes = inc_x * 2 * sizeof(FLOAT); | |||
|     const BLASLONG y_bytes = inc_y * 2 * sizeof(FLOAT); | |||
|     FLOAT_V_T xr, xi, yr, yi; | |||
|     size_t vl; | |||
|     while (n > 0) { | |||
|         vl = VSETVL(n); | |||
|         if (unit_x) { | |||
|             VLSEG_FLOAT(&xr, &xi, x, vl); | |||
|         } else { | |||
|             VLSSEG_FLOAT(&xr, &xi, x, x_bytes, vl); | |||
|         } | |||
|         if (unit_y) { | |||
|             VLSEG_FLOAT(&yr, &yi, y, vl); | |||
|         } else { | |||
|             VLSSEG_FLOAT(&yr, &yi, y, y_bytes, vl); | |||
|         } | |||
| #if !defined(CONJ) | |||
|         /* re += da_r*x_re - da_i*x_im ; im += da_r*x_im + da_i*x_re */ | |||
|         yr = VFMACCVF_FLOAT(yr, da_r, xr, vl); | |||
|         yr = VFNMSACVF_FLOAT(yr, da_i, xi, vl); | |||
|         yi = VFMACCVF_FLOAT(yi, da_r, xi, vl); | |||
|         yi = VFMACCVF_FLOAT(yi, da_i, xr, vl); | |||
| #else | |||
|         /* re += da_r*x_re + da_i*x_im ; im += da_i*x_re - da_r*x_im */ | |||
|         yr = VFMACCVF_FLOAT(yr, da_r, xr, vl); | |||
|         yr = VFMACCVF_FLOAT(yr, da_i, xi, vl); | |||
|         yi = VFNMSACVF_FLOAT(yi, da_r, xi, vl); | |||
|         yi = VFMACCVF_FLOAT(yi, da_i, xr, vl); | |||
| #endif | |||
|         if (unit_y) { | |||
|             VSSEG_FLOAT(y, yr, yi, vl); | |||
|         } else { | |||
|             VSSSEG_FLOAT(y, y_bytes, yr, yi, vl); | |||
|         } | |||
|         n -= vl; | |||
|         x += vl*inc_x*2; | |||
|         y += vl*inc_y*2; | |||
|     } | |||
|     return(0); | |||
| } | |||
| @@ -0,0 +1,105 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| /* | |||
| * Precision-dependent aliases for the RVV intrinsics and vector types used by | |||
| * the routine below: 32-bit variants when DOUBLE is not defined, 64-bit | |||
| * variants otherwise. The _M8 names are the plain load/store on m8 vectors, | |||
| * the _M4 names the two-field segment load/store on m4 vectors. | |||
| */ | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL_M8(n) vsetvl_e32m8(n) | |||
| #define FLOAT_V_T_M8 vfloat32m8_t | |||
| #define VLEV_FLOAT_M8 vle32_v_f32m8 | |||
| #define VSEV_FLOAT_M8 vse32_v_f32m8 | |||
| #define VSETVL_M4(n) vsetvl_e32m4(n) | |||
| #define FLOAT_V_T_M4 vfloat32m4_t | |||
| #define VLSEG_FLOAT_M4 vlseg2e32_v_f32m4 | |||
| #define VSSEG_FLOAT_M4 vsseg2e32_v_f32m4 | |||
| #define VLSSEG_FLOAT_M4 vlsseg2e32_v_f32m4 | |||
| #define VSSSEG_FLOAT_M4 vssseg2e32_v_f32m4 | |||
| #else | |||
| #define VSETVL_M8(n) vsetvl_e64m8(n) | |||
| #define FLOAT_V_T_M8 vfloat64m8_t | |||
| #define VLEV_FLOAT_M8 vle64_v_f64m8 | |||
| #define VSEV_FLOAT_M8 vse64_v_f64m8 | |||
| #define VSETVL_M4(n) vsetvl_e64m4(n) | |||
| #define FLOAT_V_T_M4 vfloat64m4_t | |||
| #define VLSEG_FLOAT_M4 vlseg2e64_v_f64m4 | |||
| #define VSSEG_FLOAT_M4 vsseg2e64_v_f64m4 | |||
| #define VLSSEG_FLOAT_M4 vlsseg2e64_v_f64m4 | |||
| #define VSSSEG_FLOAT_M4 vssseg2e64_v_f64m4 | |||
| #endif | |||
| /* | |||
| * Copies n complex elements from x to y. Both vectors hold interleaved | |||
| * (re, im) pairs; inc_x and inc_y are the element strides counted in complex | |||
| * values. Always returns 0; a negative n returns immediately. | |||
| */ | |||
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| { | |||
|     if (n < 0) return(0); | |||
|     const int unit_x = (1 == inc_x); | |||
|     const int unit_y = (1 == inc_y); | |||
|     size_t vl; | |||
|     if (unit_x && unit_y) { | |||
|         /* Both sides contiguous: treat the data as 2*n plain scalars. */ | |||
|         FLOAT_V_T_M8 chunk; | |||
|         BLASLONG words = n * 2; | |||
|         while (words > 0) { | |||
|             vl = VSETVL_M8(words); | |||
|             chunk = VLEV_FLOAT_M8(x, vl); | |||
|             VSEV_FLOAT_M8(y, chunk, vl); | |||
|             words -= vl; | |||
|             x += vl; | |||
|             y += vl; | |||
|         } | |||
|         return(0); | |||
|     } | |||
|     /* At least one side is strided: move (re, im) pairs with segment accesses, | |||
|      * picking the contiguous or strided form per operand. */ | |||
|     FLOAT_V_T_M4 re, im; | |||
|     const BLASLONG x_bytes = inc_x * 2 * sizeof(FLOAT); | |||
|     const BLASLONG y_bytes = inc_y * 2 * sizeof(FLOAT); | |||
|     while (n > 0) { | |||
|         vl = VSETVL_M4(n); | |||
|         if (unit_x) { | |||
|             VLSEG_FLOAT_M4(&re, &im, x, vl); | |||
|         } else { | |||
|             VLSSEG_FLOAT_M4(&re, &im, x, x_bytes, vl); | |||
|         } | |||
|         if (unit_y) { | |||
|             VSSEG_FLOAT_M4(y, re, im, vl); | |||
|         } else { | |||
|             VSSSEG_FLOAT_M4(y, y_bytes, re, im, vl); | |||
|         } | |||
|         n -= vl; | |||
|         x += vl*inc_x*2; | |||
|         y += vl*inc_y*2; | |||
|     } | |||
|     return(0); | |||
| } | |||
| @@ -0,0 +1,170 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) /* DOUBLE not defined: bind the generic names used by CNAME below to the e32/f32 intrinsics */ | |||
| #define VSETVL(n) vsetvl_e32m4(n) /* vector length for the n entries still to process */ | |||
| #define VSETVL_MAX vsetvlmax_e32m4() /* length used to zero and finally reduce the m4 accumulators */ | |||
| #define VSETVL_MAX_M1 vsetvlmax_e32m1() /* length used to initialise the m1 reduction registers */ | |||
| #define FLOAT_V_T vfloat32m4_t /* working vectors and accumulators */ | |||
| #define FLOAT_V_T_M1 vfloat32m1_t /* reduction seed and result */ | |||
| #define VLSEG_FLOAT vlseg2e32_v_f32m4 /* two-output segment load, used when the increment is 1 */ | |||
| #define VLSSEG_FLOAT vlsseg2e32_v_f32m4 /* two-output segment load taking a byte stride */ | |||
| #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 /* sums an accumulator into an m1 register */ | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f32m4 /* accumulator += product of two vectors */ | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m4 /* builds the zeroed m4 accumulators */ | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 /* builds the zeroed m1 registers */ | |||
| #define VFMULVV_FLOAT vfmul_vv_f32m4 /* not referenced by CNAME below */ | |||
| #define VFMSACVV_FLOAT vfmsac_vv_f32m4 /* not referenced by CNAME below */ | |||
| #define VFNMSACVV_FLOAT vfnmsac_vv_f32m4 /* accumulator -= product of two vectors */ | |||
| #define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 /* extracts the scalar result of a reduction */ | |||
| #else /* DOUBLE defined: the same names bound to the e64/f64 intrinsics */ | |||
| #define VSETVL(n) vsetvl_e64m4(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m4() | |||
| #define VSETVL_MAX_M1 vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLSEG_FLOAT vlseg2e64_v_f64m4 | |||
| #define VLSSEG_FLOAT vlsseg2e64_v_f64m4 | |||
| #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f64m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m4 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFMULVV_FLOAT vfmul_vv_f64m4 /* not referenced by CNAME below */ | |||
| #define VFMSACVV_FLOAT vfmsac_vv_f64m4 /* not referenced by CNAME below */ | |||
| #define VFNMSACVV_FLOAT vfnmsac_vv_f64m4 | |||
| #define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 | |||
| #endif | |||
| /* Complex dot product kernel over n entries of x and y (increments counted in complex entries). | |||
|    Without CONJ the per-entry term is x*y; with CONJ the imaginary part of x enters with the | |||
|    opposite sign, i.e. conj(x)*y. Returns the accumulated (real, imaginary) pair; a count of | |||
|    zero or less yields (0, 0). */ | |||
| OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| { | |||
|     OPENBLAS_COMPLEX_FLOAT result; | |||
|     CREAL(result) = 0.0; | |||
|     CIMAG(result) = 0.0; | |||
|     if (n <= 0) { | |||
|         return result; | |||
|     } | |||
|     const int x_unit = (inc_x == 1); | |||
|     const int y_unit = (inc_y == 1); | |||
|     /* Byte distance between consecutive complex entries; only read on the strided paths. */ | |||
|     BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); | |||
|     BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); | |||
|     FLOAT_V_T acc_re, acc_im, x_re, x_im, y_re, y_im; | |||
|     FLOAT_V_T_M1 v_sum, v_zero; | |||
|     size_t vl; | |||
|     size_t vlmax_m1 = VSETVL_MAX_M1; | |||
|     v_sum = VFMVVF_FLOAT_M1(0, vlmax_m1); | |||
|     v_zero = VFMVVF_FLOAT_M1(0, vlmax_m1); | |||
|     size_t vlmax = VSETVL_MAX; | |||
|     acc_re = VFMVVF_FLOAT(0, vlmax); | |||
|     acc_im = VFMVVF_FLOAT(0, vlmax); | |||
|     /* NOTE(review): the accumulators are zeroed and reduced over vlmax lanes but updated over vl | |||
|        lanes, so lanes past vl must keep their contents - confirm for the target toolchain. */ | |||
|     while (n > 0) { | |||
|         vl = VSETVL(n); | |||
|         if (x_unit) { | |||
|             VLSEG_FLOAT(&x_re, &x_im, x, vl); | |||
|         } else { | |||
|             VLSSEG_FLOAT(&x_re, &x_im, x, stride_x, vl); | |||
|         } | |||
|         if (y_unit) { | |||
|             VLSEG_FLOAT(&y_re, &y_im, y, vl); | |||
|         } else { | |||
|             VLSSEG_FLOAT(&y_re, &y_im, y, stride_y, vl); | |||
|         } | |||
|         /* Terms driven by the real part of x are the same in both variants. */ | |||
|         acc_re = VFMACCVV_FLOAT(acc_re, x_re, y_re, vl); | |||
|         acc_im = VFMACCVV_FLOAT(acc_im, x_re, y_im, vl); | |||
| #if defined(CONJ) | |||
|         acc_re = VFMACCVV_FLOAT(acc_re, x_im, y_im, vl); | |||
|         acc_im = VFNMSACVV_FLOAT(acc_im, x_im, y_re, vl); | |||
| #else | |||
|         acc_re = VFNMSACVV_FLOAT(acc_re, x_im, y_im, vl); | |||
|         acc_im = VFMACCVV_FLOAT(acc_im, x_im, y_re, vl); | |||
| #endif | |||
|         n -= vl; | |||
|         x += vl*inc_x*2; | |||
|         y += vl*inc_y*2; | |||
|     } | |||
|     /* Collapse each accumulator to a scalar, seeding the sum with zero. */ | |||
|     v_sum = VFREDSUM_FLOAT(v_sum, acc_re, v_zero, vlmax); | |||
|     CREAL(result) = VFMVFS_FLOAT_M1(v_sum); | |||
|     v_sum = VFREDSUM_FLOAT(v_sum, acc_im, v_zero, vlmax); | |||
|     CIMAG(result) = VFMVFS_FLOAT_M1(v_sum); | |||
|     return result; | |||
| } | |||
| @@ -0,0 +1,117 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) /* DOUBLE not defined: bind the generic names used by CNAME below to the e32/f32 intrinsics */ | |||
| #define VSETVL(n) vsetvl_e32m4(n) /* vector length for the n entries still to process */ | |||
| #define FLOAT_V_T vfloat32m4_t /* type of every vector temporary in the kernel */ | |||
| #define VLSEG_FLOAT vlseg2e32_v_f32m4 /* loads a run of C into a (real, imaginary) vector pair */ | |||
| #define VSSEG_FLOAT vsseg2e32_v_f32m4 /* stores a (real, imaginary) vector pair back to C */ | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m4 /* builds the zero vectors written when beta is 0 */ | |||
| #define VFMULVF_FLOAT vfmul_vf_f32m4 /* vector times scalar (beta_r or beta_i) */ | |||
| #define VFADDVV_FLOAT vfadd_vv_f32m4 /* forms the imaginary part of the scaled value */ | |||
| #define VFSUBVV_FLOAT vfsub_vv_f32m4 /* forms the real part of the scaled value */ | |||
| #else /* DOUBLE defined: the same names bound to the e64/f64 intrinsics */ | |||
| #define VSETVL(n) vsetvl_e64m4(n) | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define VLSEG_FLOAT vlseg2e64_v_f64m4 | |||
| #define VSSEG_FLOAT vsseg2e64_v_f64m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m4 | |||
| #define VFMULVF_FLOAT vfmul_vf_f64m4 | |||
| #define VFADDVV_FLOAT vfadd_vv_f64m4 | |||
| #define VFSUBVV_FLOAT vfsub_vv_f64m4 | |||
| #endif | |||
| /* Scales a complex matrix in place by beta = beta_r + i*beta_i. The matrix is visited as n runs | |||
|    of m contiguous complex entries starting at c, consecutive runs being ldc complex entries | |||
|    apart. When both parts of beta are zero the entries are overwritten with zeros without being | |||
|    read. The dummy parameters are not used. Always returns 0. */ | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, | |||
| FLOAT beta_r, FLOAT beta_i, | |||
| FLOAT *dummy2, BLASLONG dummy3, | |||
| FLOAT *dummy4, BLASLONG dummy5, | |||
| FLOAT *c, BLASLONG ldc) | |||
| { | |||
|     FLOAT_V_T re, im, re_br, im_bi, im_br, re_bi; | |||
|     FLOAT *cursor; | |||
|     BLASLONG left; | |||
|     size_t vl; | |||
|     /* From here on ldc counts FLOATs rather than complex entries. */ | |||
|     ldc *= 2; | |||
|     if (beta_r == 0.0 && beta_i == 0.0) { | |||
|         /* Zero vectors are built once and reused for every store. */ | |||
|         vl = VSETVL(m); | |||
|         re = VFMVVF_FLOAT(0.0, vl); | |||
|         im = VFMVVF_FLOAT(0.0, vl); | |||
|         while (n > 0) { | |||
|             cursor = c; | |||
|             left = m; | |||
|             while (left > 0) { | |||
|                 vl = VSETVL(left); | |||
|                 VSSEG_FLOAT(cursor, re, im, vl); | |||
|                 left -= vl; | |||
|                 cursor += vl*2; | |||
|             } | |||
|             n--; | |||
|             c += ldc; | |||
|         } | |||
|         return 0; | |||
|     } | |||
|     while (n > 0) { | |||
|         cursor = c; | |||
|         left = m; | |||
|         while (left > 0) { | |||
|             vl = VSETVL(left); | |||
|             VLSEG_FLOAT(&re, &im, cursor, vl); | |||
|             /* Four real products of the complex multiplication. */ | |||
|             re_br = VFMULVF_FLOAT(re, beta_r, vl); | |||
|             im_bi = VFMULVF_FLOAT(im, beta_i, vl); | |||
|             im_br = VFMULVF_FLOAT(im, beta_r, vl); | |||
|             re_bi = VFMULVF_FLOAT(re, beta_i, vl); | |||
|             re = VFSUBVV_FLOAT(re_br, im_bi, vl); | |||
|             im = VFADDVV_FLOAT(im_br, re_bi, vl); | |||
|             VSSEG_FLOAT(cursor, re, im, vl); | |||
|             left -= vl; | |||
|             cursor += vl*2; | |||
|         } | |||
|         n--; | |||
|         c += ldc; | |||
|     } | |||
|     return 0; | |||
| } | |||
| @@ -0,0 +1,170 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) /* DOUBLE not defined: bind the generic names used by CNAME below to the e32/f32 intrinsics */ | |||
| #define VSETVL(n) vsetvl_e32m4(n) /* vector length for the n entries still to process */ | |||
| #define FLOAT_V_T vfloat32m4_t /* type of every vector temporary in the kernel */ | |||
| #define VLEV_FLOAT vle32_v_f32m4 /* not referenced by CNAME below */ | |||
| #define VLSEV_FLOAT vlse32_v_f32m4 /* not referenced by CNAME below */ | |||
| #define VSEV_FLOAT vse32_v_f32m4 /* not referenced by CNAME below */ | |||
| #define VSSEV_FLOAT vsse32_v_f32m4 /* not referenced by CNAME below */ | |||
| #define VLSEG_FLOAT vlseg2e32_v_f32m4 /* loads matrix entries, and y when inc_y is 1 */ | |||
| #define VSSEG_FLOAT vsseg2e32_v_f32m4 /* stores y when inc_y is 1 */ | |||
| #define VLSSEG_FLOAT vlsseg2e32_v_f32m4 /* loads y through a byte stride */ | |||
| #define VSSSEG_FLOAT vssseg2e32_v_f32m4 /* stores y through a byte stride */ | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f32m4 /* accumulator += scalar * vector */ | |||
| #define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 /* accumulator -= scalar * vector */ | |||
| #else /* DOUBLE defined: the same names bound to the e64/f64 intrinsics */ | |||
| #define VSETVL(n) vsetvl_e64m4(n) | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define VLEV_FLOAT vle64_v_f64m4 /* not referenced by CNAME below */ | |||
| #define VLSEV_FLOAT vlse64_v_f64m4 /* not referenced by CNAME below */ | |||
| #define VSEV_FLOAT vse64_v_f64m4 /* not referenced by CNAME below */ | |||
| #define VSSEV_FLOAT vsse64_v_f64m4 /* not referenced by CNAME below */ | |||
| #define VLSEG_FLOAT vlseg2e64_v_f64m4 | |||
| #define VSSEG_FLOAT vsseg2e64_v_f64m4 | |||
| #define VLSSEG_FLOAT vlsseg2e64_v_f64m4 | |||
| #define VSSSEG_FLOAT vssseg2e64_v_f64m4 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f64m4 | |||
| #define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 | |||
| #endif | |||
| /* Complex matrix-vector kernel, non-transposed form. The matrix a is read as n runs of m | |||
|    contiguous complex entries, lda complex entries apart. For each run, a scalar built from | |||
|    alpha and the matching entry of x multiplies the run and the product is accumulated into y. | |||
|    CONJ and XCONJ select the sign pattern of the real multiply-accumulates. dummy1 and buffer | |||
|    are not used. Always returns 0. */ | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| { | |||
|     const int y_unit = (inc_y == 1); | |||
|     /* Byte distance between consecutive complex entries of y (strided path only). */ | |||
|     BLASLONG y_bytes = inc_y * sizeof(FLOAT) * 2; | |||
|     /* Steps through x and a expressed in FLOATs. */ | |||
|     BLASLONG x_step = inc_x * 2; | |||
|     BLASLONG run_step = lda * 2; | |||
|     FLOAT_V_T a_re, a_im, y_re, y_im; | |||
|     FLOAT t_re, t_im; | |||
|     size_t vl; | |||
|     /* Each pass owns vl entries of y: they are loaded once, updated by all n runs, then stored. */ | |||
|     while (m > 0) { | |||
|         FLOAT *run = a; | |||
|         BLASLONG xi = 0; | |||
|         vl = VSETVL(m); | |||
|         if (y_unit) { | |||
|             VLSEG_FLOAT(&y_re, &y_im, y, vl); | |||
|         } else { | |||
|             VLSSEG_FLOAT(&y_re, &y_im, y, y_bytes, vl); | |||
|         } | |||
|         for (BLASLONG j = 0; j < n; j++) { | |||
| #if !defined(XCONJ) | |||
|             t_re = alpha_r * x[xi] - alpha_i * x[xi+1]; | |||
|             t_im = alpha_r * x[xi+1] + alpha_i * x[xi]; | |||
| #else | |||
|             t_re = alpha_r * x[xi] + alpha_i * x[xi+1]; | |||
|             t_im = alpha_r * x[xi+1] - alpha_i * x[xi]; | |||
| #endif | |||
|             VLSEG_FLOAT(&a_re, &a_im, run, vl); | |||
|             /* This first update is shared by all four CONJ/XCONJ combinations. */ | |||
|             y_re = VFMACCVF_FLOAT(y_re, t_re, a_re, vl); | |||
| #if !defined(CONJ) && !defined(XCONJ) | |||
|             y_re = VFNMSACVF_FLOAT(y_re, t_im, a_im, vl); | |||
|             y_im = VFMACCVF_FLOAT(y_im, t_re, a_im, vl); | |||
|             y_im = VFMACCVF_FLOAT(y_im, t_im, a_re, vl); | |||
| #elif !defined(CONJ) | |||
|             y_re = VFMACCVF_FLOAT(y_re, t_im, a_im, vl); | |||
|             y_im = VFMACCVF_FLOAT(y_im, t_re, a_im, vl); | |||
|             y_im = VFNMSACVF_FLOAT(y_im, t_im, a_re, vl); | |||
| #elif !defined(XCONJ) | |||
|             y_re = VFMACCVF_FLOAT(y_re, t_im, a_im, vl); | |||
|             y_im = VFNMSACVF_FLOAT(y_im, t_re, a_im, vl); | |||
|             y_im = VFMACCVF_FLOAT(y_im, t_im, a_re, vl); | |||
| #else | |||
|             y_re = VFNMSACVF_FLOAT(y_re, t_im, a_im, vl); | |||
|             y_im = VFNMSACVF_FLOAT(y_im, t_re, a_im, vl); | |||
|             y_im = VFNMSACVF_FLOAT(y_im, t_im, a_re, vl); | |||
| #endif | |||
|             run += run_step; | |||
|             xi += x_step; | |||
|         } | |||
|         if (y_unit) { | |||
|             VSSEG_FLOAT(y, y_re, y_im, vl); | |||
|         } else { | |||
|             VSSSEG_FLOAT(y, y_bytes, y_re, y_im, vl); | |||
|         } | |||
|         m -= vl; | |||
|         a += vl*2; | |||
|         y += vl*inc_y*2; | |||
|     } | |||
|     return 0; | |||
| } | |||
| @@ -0,0 +1,172 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) /* DOUBLE not defined: bind the generic names used by CNAME below to the e32/f32 intrinsics */ | |||
| #define VSETVL(n) vsetvl_e32m4(n) /* vector length for the n entries still to process */ | |||
| #define VSETVL_MAX_M1 vsetvlmax_e32m1() /* length used to initialise the m1 reduction registers */ | |||
| #define FLOAT_V_T vfloat32m4_t /* working vectors and accumulators */ | |||
| #define FLOAT_V_T_M1 vfloat32m1_t /* reduction seed and result */ | |||
| #define VLSEG_FLOAT vlseg2e32_v_f32m4 /* loads matrix entries, and x when inc_x is 1 */ | |||
| #define VLSSEG_FLOAT vlsseg2e32_v_f32m4 /* loads x through a byte stride */ | |||
| #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 /* sums an accumulator into an m1 register */ | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f32m4 /* accumulator += product of two vectors */ | |||
| #define VFNMSACVV_FLOAT vfnmsac_vv_f32m4 /* accumulator -= product of two vectors */ | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m4 /* builds the zeroed m4 accumulators */ | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 /* builds the zeroed m1 registers */ | |||
| #define VFMULVV_FLOAT vfmul_vv_f32m4 /* not referenced by CNAME below */ | |||
| #define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 /* extracts the scalar result of a reduction */ | |||
| #else /* DOUBLE defined: the same names bound to the e64/f64 intrinsics */ | |||
| #define VSETVL(n) vsetvl_e64m4(n) | |||
| #define VSETVL_MAX_M1 vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLSEG_FLOAT vlseg2e64_v_f64m4 | |||
| #define VLSSEG_FLOAT vlsseg2e64_v_f64m4 | |||
| #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f64m4 | |||
| #define VFNMSACVV_FLOAT vfnmsac_vv_f64m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m4 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFMULVV_FLOAT vfmul_vv_f64m4 /* not referenced by CNAME below */ | |||
| #define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 | |||
| #endif | |||
| /* Complex matrix-vector kernel, transposed form. The matrix a is read as n runs of m contiguous | |||
|    complex entries, lda complex entries apart. Each run is combined with the m entries of x | |||
|    into one complex sum, which is scaled by alpha and added to the matching entry of y. | |||
|    CONJ and XCONJ select the sign pattern of the accumulation and of the final update. | |||
|    dummy1 and buffer are not used. Always returns 0. */ | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| { | |||
|     const int x_unit = (inc_x == 1); | |||
|     /* Byte distance between consecutive complex entries of x (strided path only). */ | |||
|     BLASLONG x_bytes = inc_x * sizeof(FLOAT) * 2; | |||
|     /* Steps through y and a expressed in FLOATs. */ | |||
|     BLASLONG y_step = inc_y * 2; | |||
|     BLASLONG run_step = lda * 2; | |||
|     BLASLONG iy = 0; | |||
|     FLOAT *run = a; | |||
|     FLOAT dot_re, dot_im; | |||
|     FLOAT_V_T a_re, a_im, x_re, x_im, acc_re, acc_im; | |||
|     FLOAT_V_T_M1 v_sum, v_zero; | |||
|     size_t vlmax = VSETVL_MAX_M1; | |||
|     v_sum = VFMVVF_FLOAT_M1(0, vlmax); | |||
|     v_zero = VFMVVF_FLOAT_M1(0, vlmax); | |||
|     /* From here vlmax is the length used to zero and to reduce the m4 accumulators. */ | |||
|     vlmax = VSETVL(m); | |||
|     for (BLASLONG col = 0; col < n; col++) { | |||
|         FLOAT *a_run = run; | |||
|         BLASLONG xi = 0; | |||
|         size_t left = m; | |||
|         size_t vl; | |||
|         acc_re = VFMVVF_FLOAT(0, vlmax); | |||
|         acc_im = VFMVVF_FLOAT(0, vlmax); | |||
|         /* NOTE(review): updates use vl lanes while the reduction below spans vlmax lanes, so | |||
|            lanes past vl must keep their contents - confirm for the target toolchain. */ | |||
|         while (left > 0) { | |||
|             vl = VSETVL(left); | |||
|             VLSEG_FLOAT(&a_re, &a_im, a_run, vl); | |||
|             if (x_unit) { | |||
|                 VLSEG_FLOAT(&x_re, &x_im, &x[xi], vl); | |||
|             } else { | |||
|                 VLSSEG_FLOAT(&x_re, &x_im, &x[xi], x_bytes, vl); | |||
|             } | |||
|             /* Terms driven by the real part of a are the same in every variant. */ | |||
|             acc_re = VFMACCVV_FLOAT(acc_re, a_re, x_re, vl); | |||
|             acc_im = VFMACCVV_FLOAT(acc_im, a_re, x_im, vl); | |||
| #if defined(CONJ) == defined(XCONJ) | |||
|             acc_re = VFNMSACVV_FLOAT(acc_re, a_im, x_im, vl); | |||
|             acc_im = VFMACCVV_FLOAT(acc_im, a_im, x_re, vl); | |||
| #else | |||
|             acc_re = VFMACCVV_FLOAT(acc_re, a_im, x_im, vl); | |||
|             acc_im = VFNMSACVV_FLOAT(acc_im, a_im, x_re, vl); | |||
| #endif | |||
|             a_run += vl * 2; | |||
|             xi += vl * inc_x * 2; | |||
|             left -= vl; | |||
|         } | |||
|         /* Collapse both accumulators to scalars, seeding each sum with zero. */ | |||
|         v_sum = VFREDSUM_FLOAT(v_sum, acc_re, v_zero, vlmax); | |||
|         dot_re = VFMVFS_FLOAT_M1(v_sum); | |||
|         v_sum = VFREDSUM_FLOAT(v_sum, acc_im, v_zero, vlmax); | |||
|         dot_im = VFMVFS_FLOAT_M1(v_sum); | |||
| #if !defined(XCONJ) | |||
|         y[iy] += alpha_r * dot_re - alpha_i * dot_im; | |||
|         y[iy+1] += alpha_r * dot_im + alpha_i * dot_re; | |||
| #else | |||
|         y[iy] += alpha_r * dot_re + alpha_i * dot_im; | |||
|         y[iy+1] -= alpha_r * dot_im - alpha_i * dot_re; | |||
| #endif | |||
|         iy += y_step; | |||
|         run += run_step; | |||
|     } | |||
|     return 0; | |||
| } | |||
| @@ -0,0 +1,122 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) /* DOUBLE not defined: bind the generic names used by CNAME below to the e32/f32 intrinsics */ | |||
| #define VSETVL(n) vsetvl_e32m4(n) /* vector length for the n entries still to process */ | |||
| #define VSETVL_MAX vsetvlmax_e32m4() /* length used to initialise and reduce the accumulator */ | |||
| #define VSETVL_MAX_M1 vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m4_t /* working vectors and the sum-of-squares accumulator */ | |||
| #define FLOAT_V_T_M1 vfloat32m1_t /* running maximum and reduction result */ | |||
| #define VLSEG_FLOAT vlseg2e32_v_f32m4 /* loads x when inc_x is 1 */ | |||
| #define VLSSEG_FLOAT vlsseg2e32_v_f32m4 /* loads x through a byte stride */ | |||
| #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 /* sums the accumulator into an m1 register */ | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f32m4 /* accumulator += product of two vectors */ | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m4 /* builds the zeroed m4 accumulator */ | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 /* builds the zeroed m1 registers */ | |||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f32m4_f32m1 /* folds a vector into the running maximum */ | |||
| #define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 /* extracts the scalar held in an m1 register */ | |||
| #define VFABSV_FLOAT vfabs_v_f32m4 /* absolute value of each lane */ | |||
| #else /* DOUBLE defined: the same names bound to the e64/f64 intrinsics */ | |||
| #define VSETVL(n) vsetvl_e64m4(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m4() | |||
| #define VSETVL_MAX_M1 vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLSEG_FLOAT vlseg2e64_v_f64m4 | |||
| #define VLSSEG_FLOAT vlsseg2e64_v_f64m4 | |||
| #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f64m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m4 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f64m4_f64m1 | |||
| #define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 | |||
| #define VFABSV_FLOAT vfabs_v_f64m4 | |||
| #endif | |||
| // TODO: Should single precision use the widening MAC, or perhaps all should be double? | |||
| /* Euclidean norm of a complex vector: the square root of the sum of squares of all real and | |||
|    imaginary parts of the n entries of x (inc_x is counted in complex entries). | |||
|    Returns 0 when n is zero or negative, and 0 when every part of every entry is exactly zero. */ | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
|     if ( n <= 0 ) return(0.0); | |||
|     FLOAT_V_T vr, v0, v1; | |||
|     FLOAT_V_T_M1 v_max, v_res; | |||
|     FLOAT scale = 0.0, ssq = 0.0; | |||
|     size_t vlmax = VSETVL_MAX; | |||
|     v_res = VFMVVF_FLOAT_M1(0, vlmax); | |||
|     v_max = VFMVVF_FLOAT_M1(0, vlmax); | |||
|     vr = VFMVVF_FLOAT(0, vlmax); | |||
|     if (inc_x == 1) { | |||
|         for (size_t vl; n > 0; n -= vl, x += vl*2) { | |||
|             vl = VSETVL(n); | |||
|             VLSEG_FLOAT(&v0, &v1, x, vl); | |||
|             v0 = VFABSV_FLOAT(v0, vl); | |||
|             v1 = VFABSV_FLOAT(v1, vl); | |||
|             /* Track the largest magnitude seen and accumulate the plain squares. */ | |||
|             v_max = VFREDMAXVS_FLOAT(v_max, v0, v_max, vl); | |||
|             vr = VFMACCVV_FLOAT(vr, v0, v0, vl); | |||
|             v_max = VFREDMAXVS_FLOAT(v_max, v1, v_max, vl); | |||
|             vr = VFMACCVV_FLOAT(vr, v1, v1, vl); | |||
|         } | |||
|     } else { | |||
|         /* Byte distance between consecutive complex entries. */ | |||
|         BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; | |||
|         for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { | |||
|             vl = VSETVL(n); | |||
|             VLSSEG_FLOAT(&v0, &v1, x, stride_x, vl); | |||
|             v0 = VFABSV_FLOAT(v0, vl); | |||
|             v1 = VFABSV_FLOAT(v1, vl); | |||
|             v_max = VFREDMAXVS_FLOAT(v_max, v0, v_max, vl); | |||
|             vr = VFMACCVV_FLOAT(vr, v0, v0, vl); | |||
|             v_max = VFREDMAXVS_FLOAT(v_max, v1, v_max, vl); | |||
|             vr = VFMACCVV_FLOAT(vr, v1, v1, vl); | |||
|         } | |||
|     } | |||
|     v_res = VFREDSUM_FLOAT(v_res, vr, v_res, vlmax); | |||
|     ssq = VFMVFS_FLOAT_M1(v_res); | |||
|     scale = VFMVFS_FLOAT_M1(v_max); | |||
|     /* An all-zero vector leaves scale and ssq at 0; dividing would give 0/0 and the function | |||
|        would return NaN, so the norm is returned directly. A NaN sum is not caught by this | |||
|        test and still propagates through the expression below. */ | |||
|     if (scale == 0.0 && ssq == 0.0) return(0.0); | |||
|     /* NOTE(review): the squares are accumulated unscaled, so this rescaling cannot undo an | |||
|        overflow or underflow that already happened in ssq. */ | |||
|     ssq = ssq / (scale*scale); | |||
|     return(scale * sqrt(ssq)); | |||
| } | |||
| @@ -0,0 +1,181 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) /* DOUBLE not defined: bind the generic names used by CNAME below to the e32/f32 intrinsics */ | |||
| #define VSETVL(n) vsetvl_e32m4(n) /* vector length for the n entries still to process */ | |||
| #define FLOAT_V_T vfloat32m4_t /* type of every vector temporary in the kernel */ | |||
| #define VLEV_FLOAT vle32_v_f32m4 | |||
| #define VLSEV_FLOAT vlse32_v_f32m4 | |||
| #define VSEV_FLOAT vse32_v_f32m4 | |||
| #define VSSEV_FLOAT vsse32_v_f32m4 | |||
| #define VLSEG_FLOAT vlseg2e32_v_f32m4 /* loads x or y into a (real, imaginary) vector pair */ | |||
| #define VSSEG_FLOAT vsseg2e32_v_f32m4 /* stores a (real, imaginary) vector pair to x or y */ | |||
| #define VLSSEG_FLOAT vlsseg2e32_v_f32m4 | |||
| #define VSSSEG_FLOAT vssseg2e32_v_f32m4 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f32m4 /* accumulator += scalar * vector */ | |||
| #define VFMULVF_FLOAT vfmul_vf_f32m4 /* vector times scalar */ | |||
| #define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 /* accumulator -= scalar * vector */ | |||
| #else /* DOUBLE defined: the same names bound to the e64/f64 intrinsics */ | |||
| #define VSETVL(n) vsetvl_e64m4(n) | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define VLEV_FLOAT vle64_v_f64m4 | |||
| #define VLSEV_FLOAT vlse64_v_f64m4 | |||
| #define VSEV_FLOAT vse64_v_f64m4 | |||
| #define VSSEV_FLOAT vsse64_v_f64m4 | |||
| #define VLSEG_FLOAT vlseg2e64_v_f64m4 | |||
| #define VSSEG_FLOAT vsseg2e64_v_f64m4 | |||
| #define VLSSEG_FLOAT vlsseg2e64_v_f64m4 | |||
| #define VSSSEG_FLOAT vssseg2e64_v_f64m4 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f64m4 | |||
| #define VFMULVF_FLOAT vfmul_vf_f64m4 | |||
| #define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 | |||
| #endif | |||
| /* | |||
|  * Plane rotation of two complex vectors of n elements, each stored as | |||
|  * interleaved (re, im) pairs: | |||
|  *     x[k] <- c*x[k] + s*y[k] | |||
|  *     y[k] <- c*y[k] - s*x[k]      (computed from the old x[k]) | |||
|  * c and s are real scalars applied to the real and imaginary parts alike. | |||
|  * inc_x / inc_y are strides counted in complex elements.  Always returns 0; | |||
|  * nothing is touched when n <= 0. | |||
|  */ | |||
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) | |||
| { | |||
|     if (n <= 0) return(0); | |||
|  | |||
|     /* old x / old y (re, im) and the rotated x (re, im) */ | |||
|     FLOAT_V_T x_re, x_im, y_re, y_im, rx_re, rx_im; | |||
|  | |||
|     if (inc_x == 0 && inc_y == 0) { | |||
|         /* Both operands are a single element: rotate it n times in scalar code. */ | |||
|         const BLASLONG step_x = 2 * inc_x; | |||
|         const BLASLONG step_y = 2 * inc_y; | |||
|         BLASLONG px = 0, py = 0; | |||
|         for (BLASLONG k = 0; k < n; k++) { | |||
|             FLOAT new_re = c*x[px] + s*y[py] ; | |||
|             FLOAT new_im = c*x[px+1] + s*y[py+1] ; | |||
|             y[py] = c*y[py] - s*x[px] ; | |||
|             y[py+1] = c*y[py+1] - s*x[px+1] ; | |||
|             x[px] = new_re ; | |||
|             x[px+1] = new_im ; | |||
|             px += step_x ; | |||
|             py += step_y ; | |||
|         } | |||
|     } | |||
|     else if (inc_x == 1 && inc_y == 1) { | |||
|         /* Both contiguous: unit-stride segment loads and stores. */ | |||
|         while (n > 0) { | |||
|             size_t vl = VSETVL(n); | |||
|             VLSEG_FLOAT(&x_re, &x_im, x, vl); | |||
|             VLSEG_FLOAT(&y_re, &y_im, y, vl); | |||
|  | |||
|             rx_re = VFMULVF_FLOAT(x_re, c, vl); | |||
|             rx_re = VFMACCVF_FLOAT(rx_re, s, y_re, vl); | |||
|             rx_im = VFMULVF_FLOAT(x_im, c, vl); | |||
|             rx_im = VFMACCVF_FLOAT(rx_im, s, y_im, vl); | |||
|             y_re = VFMULVF_FLOAT(y_re, c, vl); | |||
|             y_re = VFNMSACVF_FLOAT(y_re, s, x_re, vl); | |||
|             y_im = VFMULVF_FLOAT(y_im, c, vl); | |||
|             y_im = VFNMSACVF_FLOAT(y_im, s, x_im, vl); | |||
|  | |||
|             VSSEG_FLOAT(x, rx_re, rx_im, vl); | |||
|             VSSEG_FLOAT(y, y_re, y_im, vl); | |||
|  | |||
|             n -= vl; | |||
|             x += vl*2; | |||
|             y += vl*2; | |||
|         } | |||
|     } else if (inc_x == 1) { | |||
|         /* x contiguous, y strided (byte stride = one complex element * inc_y). */ | |||
|         BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); | |||
|         while (n > 0) { | |||
|             size_t vl = VSETVL(n); | |||
|             VLSEG_FLOAT(&x_re, &x_im, x, vl); | |||
|             VLSSEG_FLOAT(&y_re, &y_im, y, stride_y, vl); | |||
|  | |||
|             rx_re = VFMULVF_FLOAT(x_re, c, vl); | |||
|             rx_re = VFMACCVF_FLOAT(rx_re, s, y_re, vl); | |||
|             rx_im = VFMULVF_FLOAT(x_im, c, vl); | |||
|             rx_im = VFMACCVF_FLOAT(rx_im, s, y_im, vl); | |||
|             y_re = VFMULVF_FLOAT(y_re, c, vl); | |||
|             y_re = VFNMSACVF_FLOAT(y_re, s, x_re, vl); | |||
|             y_im = VFMULVF_FLOAT(y_im, c, vl); | |||
|             y_im = VFNMSACVF_FLOAT(y_im, s, x_im, vl); | |||
|  | |||
|             VSSEG_FLOAT(x, rx_re, rx_im, vl); | |||
|             VSSSEG_FLOAT(y, stride_y, y_re, y_im, vl); | |||
|  | |||
|             n -= vl; | |||
|             x += vl*2; | |||
|             y += vl*inc_y*2; | |||
|         } | |||
|     } else if (inc_y == 1) { | |||
|         /* x strided, y contiguous. */ | |||
|         BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); | |||
|         while (n > 0) { | |||
|             size_t vl = VSETVL(n); | |||
|             VLSSEG_FLOAT(&x_re, &x_im, x, stride_x, vl); | |||
|             VLSEG_FLOAT(&y_re, &y_im, y, vl); | |||
|  | |||
|             rx_re = VFMULVF_FLOAT(x_re, c, vl); | |||
|             rx_re = VFMACCVF_FLOAT(rx_re, s, y_re, vl); | |||
|             rx_im = VFMULVF_FLOAT(x_im, c, vl); | |||
|             rx_im = VFMACCVF_FLOAT(rx_im, s, y_im, vl); | |||
|             y_re = VFMULVF_FLOAT(y_re, c, vl); | |||
|             y_re = VFNMSACVF_FLOAT(y_re, s, x_re, vl); | |||
|             y_im = VFMULVF_FLOAT(y_im, c, vl); | |||
|             y_im = VFNMSACVF_FLOAT(y_im, s, x_im, vl); | |||
|  | |||
|             VSSSEG_FLOAT(x, stride_x, rx_re, rx_im, vl); | |||
|             VSSEG_FLOAT(y, y_re, y_im, vl); | |||
|  | |||
|             n -= vl; | |||
|             x += vl*inc_x*2; | |||
|             y += vl*2; | |||
|         } | |||
|     } else { | |||
|         /* Both strided. */ | |||
|         BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); | |||
|         BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); | |||
|         while (n > 0) { | |||
|             size_t vl = VSETVL(n); | |||
|             VLSSEG_FLOAT(&x_re, &x_im, x, stride_x, vl); | |||
|             VLSSEG_FLOAT(&y_re, &y_im, y, stride_y, vl); | |||
|  | |||
|             rx_re = VFMULVF_FLOAT(x_re, c, vl); | |||
|             rx_re = VFMACCVF_FLOAT(rx_re, s, y_re, vl); | |||
|             rx_im = VFMULVF_FLOAT(x_im, c, vl); | |||
|             rx_im = VFMACCVF_FLOAT(rx_im, s, y_im, vl); | |||
|             y_re = VFMULVF_FLOAT(y_re, c, vl); | |||
|             y_re = VFNMSACVF_FLOAT(y_re, s, x_re, vl); | |||
|             y_im = VFMULVF_FLOAT(y_im, c, vl); | |||
|             y_im = VFNMSACVF_FLOAT(y_im, s, x_im, vl); | |||
|  | |||
|             VSSSEG_FLOAT(x, stride_x, rx_re, rx_im, vl); | |||
|             VSSSEG_FLOAT(y, stride_y, y_re, y_im, vl); | |||
|  | |||
|             n -= vl; | |||
|             x += vl*inc_x*2; | |||
|             y += vl*inc_y*2; | |||
|         } | |||
|     } | |||
|  | |||
|     return 0; | |||
| } | |||
| @@ -0,0 +1,148 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" /* project header; presumably supplies FLOAT, BLASLONG and CNAME, none of which are defined in this file */ | |||
| #if !defined(DOUBLE) /* single-precision build: bind the generic names to the e32 / f32m4 intrinsics */ | |||
| #define VSETVL(n) vsetvl_e32m4(n) /* vector length to use for n remaining elements */ | |||
| #define VSETVL_MAX vsetvlmax_e32m4() /* largest vector length for this element width / grouping */ | |||
| #define FLOAT_V_T vfloat32m4_t /* vector register type used for all operands */ | |||
| #define VLSEG_FLOAT vlseg2e32_v_f32m4 /* unit-stride 2-field segment load (re, im) */ | |||
| #define VLSSEG_FLOAT vlsseg2e32_v_f32m4 /* strided 2-field segment load; stride argument is in bytes */ | |||
| #define VSSEG_FLOAT vsseg2e32_v_f32m4 /* unit-stride 2-field segment store */ | |||
| #define VSSSEG_FLOAT vssseg2e32_v_f32m4 /* strided 2-field segment store; stride argument is in bytes */ | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f32m4 /* acc + scalar * vec */ | |||
| #define VFMULVF_FLOAT vfmul_vf_f32m4 /* vec * scalar */ | |||
| #define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 /* acc - scalar * vec */ | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m4 /* broadcast a scalar into a vector */ | |||
| #else /* DOUBLE defined: same aliases bound to the e64 / f64m4 intrinsics */ | |||
| #define VSETVL(n) vsetvl_e64m4(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m4() | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define VLSEG_FLOAT vlseg2e64_v_f64m4 | |||
| #define VLSSEG_FLOAT vlsseg2e64_v_f64m4 | |||
| #define VSSEG_FLOAT vsseg2e64_v_f64m4 | |||
| #define VSSSEG_FLOAT vssseg2e64_v_f64m4 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f64m4 | |||
| #define VFMULVF_FLOAT vfmul_vf_f64m4 | |||
| #define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m4 | |||
| #endif | |||
| /* | |||
|  * Scale a complex vector in place by the complex scalar (da_r + i*da_i): | |||
|  *     x[k] <- (da_r*re - da_i*im) + i*(da_r*im + da_i*re) | |||
|  * x holds n interleaved (re, im) pairs with a stride of inc_x complex | |||
|  * elements.  Returns 0 without touching x when n <= 0 or inc_x <= 0. | |||
|  * Special cases: a scalar that is exactly zero stores zeros without reading | |||
|  * x; a purely imaginary or purely real scalar uses two multiplies only. | |||
|  * dummy0, dummy1, y, inc_y, dummy and dummy2 are not used. | |||
|  */ | |||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||
| { | |||
|     if((n <= 0) || (inc_x <= 0)) return(0); | |||
|  | |||
|     FLOAT_V_T new_re, v_re, v_im; | |||
|     BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);   /* byte stride between complex elements */ | |||
|     size_t vlmax = VSETVL_MAX; | |||
|     const int contiguous = (inc_x == 1); | |||
|  | |||
|     if(da_r == 0.0 && da_i == 0.0) { | |||
|         /* Zero scalar: overwrite x with zeros. */ | |||
|         v_re = VFMVVF_FLOAT(0.0, vlmax); | |||
|         v_im = VFMVVF_FLOAT(0.0, vlmax); | |||
|  | |||
|         if(contiguous) { | |||
|             while (n > 0) { | |||
|                 size_t vl = VSETVL(n); | |||
|                 VSSEG_FLOAT(x, v_re, v_im, vl); | |||
|                 n -= vl; | |||
|                 x += vl*2; | |||
|             } | |||
|         } else { | |||
|             while (n > 0) { | |||
|                 size_t vl = VSETVL(n); | |||
|                 VSSSEG_FLOAT(x, stride_x, v_re, v_im, vl); | |||
|                 n -= vl; | |||
|                 x += vl*inc_x*2; | |||
|             } | |||
|         } | |||
|     } else if(da_r == 0.0) { | |||
|         /* Purely imaginary scalar: re <- -da_i*im, im <- da_i*re. */ | |||
|         while (n > 0) { | |||
|             size_t vl = VSETVL(n); | |||
|             VLSSEG_FLOAT(&v_re, &v_im, x, stride_x, vl); | |||
|             new_re = VFMULVF_FLOAT(v_im, -da_i, vl); | |||
|             v_im = VFMULVF_FLOAT(v_re, da_i, vl); | |||
|             VSSSEG_FLOAT(x, stride_x, new_re, v_im, vl); | |||
|             n -= vl; | |||
|             x += vl*inc_x*2; | |||
|         } | |||
|     } else if(da_i == 0.0) { | |||
|         /* Purely real scalar: scale both parts by da_r. */ | |||
|         while (n > 0) { | |||
|             size_t vl = VSETVL(n); | |||
|             VLSSEG_FLOAT(&v_re, &v_im, x, stride_x, vl); | |||
|             v_re = VFMULVF_FLOAT(v_re, da_r, vl); | |||
|             v_im = VFMULVF_FLOAT(v_im, da_r, vl); | |||
|             VSSSEG_FLOAT(x, stride_x, v_re, v_im, vl); | |||
|             n -= vl; | |||
|             x += vl*inc_x*2; | |||
|         } | |||
|     } else { | |||
|         /* General complex scalar. */ | |||
|         if(contiguous) { | |||
|             while (n > 0) { | |||
|                 size_t vl = VSETVL(n); | |||
|                 VLSEG_FLOAT(&v_re, &v_im, x, vl); | |||
|                 new_re = VFMULVF_FLOAT(v_re, da_r, vl); | |||
|                 new_re = VFNMSACVF_FLOAT(new_re, da_i, v_im, vl); | |||
|                 v_im = VFMULVF_FLOAT(v_im, da_r, vl); | |||
|                 v_im = VFMACCVF_FLOAT(v_im, da_i, v_re, vl); | |||
|                 VSSEG_FLOAT(x, new_re, v_im, vl); | |||
|                 n -= vl; | |||
|                 x += vl*2; | |||
|             } | |||
|         } else { | |||
|             while (n > 0) { | |||
|                 size_t vl = VSETVL(n); | |||
|                 VLSSEG_FLOAT(&v_re, &v_im, x, stride_x, vl); | |||
|                 new_re = VFMULVF_FLOAT(v_re, da_r, vl); | |||
|                 new_re = VFNMSACVF_FLOAT(new_re, da_i, v_im, vl); | |||
|                 v_im = VFMULVF_FLOAT(v_im, da_r, vl); | |||
|                 v_im = VFMACCVF_FLOAT(v_im, da_i, v_re, vl); | |||
|                 VSSSEG_FLOAT(x, stride_x, new_re, v_im, vl); | |||
|                 n -= vl; | |||
|                 x += vl*inc_x*2; | |||
|             } | |||
|         } | |||
|     } | |||
|  | |||
|     return(0); | |||
| } | |||
| @@ -0,0 +1,97 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" /* project header; presumably supplies FLOAT, BLASLONG and CNAME, none of which are defined in this file */ | |||
| #if !defined(DOUBLE) /* single-precision build: bind the generic names to the e32 / f32 intrinsics */ | |||
| #define VSETVL(n) vsetvl_e32m4(n) /* vector length to use for n remaining elements */ | |||
| #define VSETVL_MAX vsetvlmax_e32m4() /* largest vector length for the m4 grouping */ | |||
| #define FLOAT_V_T vfloat32m4_t /* m4 vector type used for loads and the running sum */ | |||
| #define FLOAT_V_T_M1 vfloat32m1_t /* m1 vector type used for the reduction operands / result */ | |||
| #define VLSEG_FLOAT vlseg2e32_v_f32m4 /* unit-stride 2-field segment load (re, im) */ | |||
| #define VLSSEG_FLOAT vlsseg2e32_v_f32m4 /* strided 2-field segment load; stride argument is in bytes */ | |||
| #define VFREDSUMVS_FLOAT vfredusum_vs_f32m4_f32m1 /* unordered sum reduction of an m4 vector into an m1 vector */ | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m4 /* broadcast a scalar into an m4 vector */ | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 /* broadcast a scalar into an m1 vector */ | |||
| #define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 /* extract element 0 of an m1 vector as a scalar */ | |||
| #define VFADDVV_FLOAT vfadd_vv_f32m4 /* element-wise vector add */ | |||
| #else /* DOUBLE defined: same aliases bound to the e64 / f64 intrinsics */ | |||
| #define VSETVL(n) vsetvl_e64m4(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m4() | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLSEG_FLOAT vlseg2e64_v_f64m4 | |||
| #define VLSSEG_FLOAT vlsseg2e64_v_f64m4 | |||
| #define VFREDSUMVS_FLOAT vfredusum_vs_f64m4_f64m1 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m4 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 | |||
| #define VFADDVV_FLOAT vfadd_vv_f64m4 | |||
| #endif | |||
| /* | |||
|  * Sum of the real and imaginary parts (plain values, not magnitudes) of the | |||
|  * n complex elements of x, stored as interleaved (re, im) pairs with a | |||
|  * stride of inc_x complex elements.  Returns 0.0 when n <= 0 or inc_x <= 0. | |||
|  */ | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
|     FLOAT sumf = 0.0; | |||
|     if (n <= 0 || inc_x <= 0) return(sumf); | |||
|  | |||
|     FLOAT_V_T v_re, v_im; | |||
|     size_t vlmax = VSETVL_MAX; | |||
|     FLOAT_V_T v_acc = VFMVVF_FLOAT(0, vlmax);   /* per-lane partial sums */ | |||
|  | |||
|     if(inc_x == 1) { | |||
|         while (n > 0) { | |||
|             size_t vl = VSETVL(n); | |||
|             VLSEG_FLOAT(&v_re, &v_im, x, vl); | |||
|             v_acc = VFADDVV_FLOAT(v_acc, v_re, vl); | |||
|             v_acc = VFADDVV_FLOAT(v_acc, v_im, vl); | |||
|             n -= vl; | |||
|             x += vl*2; | |||
|         } | |||
|     } else { | |||
|         BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2;   /* byte stride between complex elements */ | |||
|         while (n > 0) { | |||
|             size_t vl = VSETVL(n); | |||
|             VLSSEG_FLOAT(&v_re, &v_im, x, stride_x, vl); | |||
|             v_acc = VFADDVV_FLOAT(v_acc, v_re, vl); | |||
|             v_acc = VFADDVV_FLOAT(v_acc, v_im, vl); | |||
|             n -= vl; | |||
|             x += vl*inc_x*2; | |||
|         } | |||
|     } | |||
|  | |||
|     /* | |||
|      * Collapse the per-lane sums into one scalar. | |||
|      * NOTE(review): the reduction spans vlmax lanes, so it relies on lanes of | |||
|      * v_acc beyond a shorter final vl keeping their earlier values -- confirm | |||
|      * the tail policy of the add intrinsic in the toolchain being used. | |||
|      */ | |||
|     FLOAT_V_T_M1 v_zero = VFMVVF_FLOAT_M1(0, vlmax); | |||
|     FLOAT_V_T_M1 v_total = VFMVVF_FLOAT_M1(0, vlmax); | |||
|     v_total = VFREDSUMVS_FLOAT(v_total, v_acc, v_zero, vlmax); | |||
|     sumf += VFMVFS_FLOAT_M1(v_total); | |||
|  | |||
|     return(sumf); | |||
| } | |||
| @@ -0,0 +1,156 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" /* project header; presumably supplies FLOAT, BLASLONG and CNAME, none of which are defined in this file */ | |||
| #if !defined(DOUBLE) /* single-precision build: bind the generic names to the e32 / f32m4 intrinsics */ | |||
| #define VSETVL(n) vsetvl_e32m4(n) /* vector length to use for n remaining elements */ | |||
| #define FLOAT_V_T vfloat32m4_t /* vector register type used for all operands */ | |||
| #define VLSEG_FLOAT vlseg2e32_v_f32m4 /* unit-stride 2-field segment load (re, im) */ | |||
| #define VLSSEG_FLOAT vlsseg2e32_v_f32m4 /* strided 2-field segment load; stride argument is in bytes */ | |||
| #define VSSEG_FLOAT vsseg2e32_v_f32m4 /* unit-stride 2-field segment store */ | |||
| #define VSSSEG_FLOAT vssseg2e32_v_f32m4 /* strided 2-field segment store; stride argument is in bytes */ | |||
| #else /* DOUBLE defined: same aliases bound to the e64 / f64m4 intrinsics */ | |||
| #define VSETVL(n) vsetvl_e64m4(n) | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define VLSEG_FLOAT vlseg2e64_v_f64m4 | |||
| #define VLSSEG_FLOAT vlsseg2e64_v_f64m4 | |||
| #define VSSEG_FLOAT vsseg2e64_v_f64m4 | |||
| #define VSSSEG_FLOAT vssseg2e64_v_f64m4 | |||
| #endif | |||
| /* | |||
|  * Helper for a swap where exactly one operand has a zero increment. | |||
|  * `fixed` is the zero-increment operand (one complex element) and `vec` | |||
|  * is the other one: n complex elements, `inc` complex elements apart. | |||
|  * Swapping fixed[0] with vec[0], vec[1], ..., vec[n-1] in turn leaves | |||
|  *     fixed  = old vec[n-1] | |||
|  *     vec[0] = old fixed | |||
|  *     vec[k] = old vec[k-1]          for k = 1 .. n-1 | |||
|  * so vec is shifted up by one element.  The shift runs from the last | |||
|  * element downwards (negative byte stride) so that every source element is | |||
|  * read before it is overwritten. | |||
|  */ | |||
| static void zswap_shift_through_scalar(BLASLONG n, FLOAT *fixed, FLOAT *vec, BLASLONG inc) | |||
| { | |||
|     FLOAT_V_T v_re, v_im; | |||
|     FLOAT saved_re = fixed[0]; | |||
|     FLOAT saved_im = fixed[1]; | |||
|     FLOAT *ptr = vec + (n - 1) * inc * 2;                       /* last logical element */ | |||
|     BLASLONG stride = (0 - inc) * 2 * (BLASLONG)sizeof(FLOAT);  /* walk backwards, in bytes */ | |||
|     BLASLONG m = n - 1;                                         /* elements left to shift */ | |||
|  | |||
|     /* Both the real and the imaginary part of the last element go to `fixed`. */ | |||
|     fixed[0] = ptr[0]; | |||
|     fixed[1] = ptr[1]; | |||
|  | |||
|     /* Each pass moves vl elements, so both the count and the pointer advance | |||
|      * by vl elements.  The source is the previous logical element, which is | |||
|      * inc complex elements (not one) below the destination. */ | |||
|     for (size_t vl; m > 0; m -= (BLASLONG)vl, ptr -= (BLASLONG)vl * inc * 2) { | |||
|         vl = VSETVL(m); | |||
|         VLSSEG_FLOAT(&v_re, &v_im, ptr - inc * 2, stride, vl); | |||
|         VSSSEG_FLOAT(ptr, stride, v_re, v_im, vl); | |||
|     } | |||
|  | |||
|     vec[0] = saved_re; | |||
|     vec[1] = saved_im; | |||
| } | |||
|  | |||
| /* | |||
|  * Swap two complex vectors of n elements, each stored as interleaved | |||
|  * (re, im) pairs; inc_x / inc_y are strides in complex elements. | |||
|  * Always returns 0; nothing is touched when n <= 0. | |||
|  * dummy0, dummy1, dummy3, dummy4, dummy and dummy2 are not used. | |||
|  * | |||
|  * Zero increments follow the result of n sequential swaps: | |||
|  *   - both zero: the single pair is exchanged only when n is odd; | |||
|  *   - one zero:  see zswap_shift_through_scalar(). | |||
|  */ | |||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||
| { | |||
|     if (n <= 0) return(0); | |||
|  | |||
|     FLOAT_V_T vx0, vx1, vy0, vy1; | |||
|  | |||
|     if (inc_x == 0 && inc_y == 0) { | |||
|         /* An even number of swaps of the same pair cancels out. */ | |||
|         if (n & 1) { | |||
|             FLOAT temp[2]; | |||
|             temp[0] = x[0]; | |||
|             temp[1] = x[1]; | |||
|             x[0] = y[0]; | |||
|             x[1] = y[1]; | |||
|             y[0] = temp[0]; | |||
|             y[1] = temp[1]; | |||
|         } | |||
|     } | |||
|     else if(inc_x == 0) { | |||
|         zswap_shift_through_scalar(n, x, y, inc_y); | |||
|     } | |||
|     else if(inc_y == 0) { | |||
|         zswap_shift_through_scalar(n, y, x, inc_x); | |||
|     } | |||
|     else if(inc_x == 1 && inc_y == 1) { | |||
|         /* Both contiguous. */ | |||
|         for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*2) { | |||
|             vl = VSETVL(n); | |||
|             VLSEG_FLOAT(&vx0, &vx1, x, vl); | |||
|             VLSEG_FLOAT(&vy0, &vy1, y, vl); | |||
|             VSSEG_FLOAT(y, vx0, vx1, vl); | |||
|             VSSEG_FLOAT(x, vy0, vy1, vl); | |||
|         } | |||
|     } else if (inc_x == 1){ | |||
|         /* x contiguous, y strided. */ | |||
|         BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); | |||
|         for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) { | |||
|             vl = VSETVL(n); | |||
|             VLSEG_FLOAT(&vx0, &vx1, x, vl); | |||
|             VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); | |||
|             VSSSEG_FLOAT(y, stride_y, vx0, vx1, vl); | |||
|             VSSEG_FLOAT(x, vy0, vy1, vl); | |||
|         } | |||
|     } else if (inc_y == 1){ | |||
|         /* x strided, y contiguous. */ | |||
|         BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); | |||
|         for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) { | |||
|             vl = VSETVL(n); | |||
|             VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); | |||
|             VLSEG_FLOAT(&vy0, &vy1, y, vl); | |||
|             VSSEG_FLOAT(y, vx0, vx1, vl); | |||
|             VSSSEG_FLOAT(x, stride_x, vy0, vy1, vl); | |||
|         } | |||
|     } else { | |||
|         /* Both strided. */ | |||
|         BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); | |||
|         BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); | |||
|         for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) { | |||
|             vl = VSETVL(n); | |||
|             VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); | |||
|             VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); | |||
|             VSSSEG_FLOAT(y, stride_y, vx0, vx1, vl); | |||
|             VSSSEG_FLOAT(x, stride_x, vy0, vy1, vl); | |||
|         } | |||
|     } | |||
|  | |||
|     return(0); | |||
| } | |||
| @@ -0,0 +1,596 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" /* project header; presumably supplies FLOAT, BLASLONG and CNAME, none of which are defined in this file */ | |||
| #if !defined(DOUBLE) /* single-precision build: bind the generic names to the e32 / f32 intrinsics */ | |||
| #define VSETVL(n) vsetvl_e32m2(n) /* vector length to use for n remaining elements (m2 grouping) */ | |||
| #define VSETVL_MAX vsetvlmax_e32m2() /* largest vector length for the m2 grouping */ | |||
| #define VSETVL_MAX_M1 vsetvlmax_e32m1() /* largest vector length for the m1 grouping */ | |||
| #define FLOAT_V_T vfloat32m2_t /* m2 vector type used for operands and accumulators */ | |||
| #define FLOAT_V_T_M1 vfloat32m1_t /* m1 vector type used for the reduction operands / result */ | |||
| #define VLEV_FLOAT vle32_v_f32m2 /* unit-stride vector load */ | |||
| #define VLSEG4_FLOAT vlseg4e32_v_f32m2 /* unit-stride 4-field segment load */ | |||
| #define VLSEG2_FLOAT vlseg2e32_v_f32m2 /* unit-stride 2-field segment load */ | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m2 /* broadcast a scalar into an m2 vector */ | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f32m2 /* acc + scalar * vec */ | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f32m2 /* acc + vec * vec */ | |||
| #define VFNMSACVV_FLOAT vfnmsac_vv_f32m2 /* acc - vec * vec */ | |||
| #define VFREDSUMVS_FLOAT vfredusum_vs_f32m2_f32m1 /* unordered sum reduction of an m2 vector into an m1 vector */ | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 /* broadcast a scalar into an m1 vector */ | |||
| #define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 /* extract element 0 of an m1 vector as a scalar */ | |||
| #else /* DOUBLE defined: same aliases bound to the e64 / f64 intrinsics */ | |||
| #define VSETVL(n) vsetvl_e64m2(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m2() | |||
| #define VSETVL_MAX_M1 vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m2_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLEV_FLOAT vle64_v_f64m2 | |||
| #define VLSEG4_FLOAT vlseg4e64_v_f64m2 | |||
| #define VLSEG2_FLOAT vlseg2e64_v_f64m2 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m2 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f64m2 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f64m2 | |||
| #define VFNMSACVV_FLOAT vfnmsac_vv_f64m2 | |||
| #define VFREDSUMVS_FLOAT vfredusum_vs_f64m2_f64m1 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 | |||
| #endif | |||
| // Optimizes the implementation in ../generic/ztrmmkernel_2x2.c | |||
| /******************************** | |||
| ADD1 a*c | |||
| ADD2 b*c | |||
| ADD3 a*d | |||
| ADD4 b*d | |||
| *********************************/ | |||
| int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* ba,FLOAT* bb, | |||
| FLOAT* C,BLASLONG ldc, BLASLONG offset) | |||
| { | |||
| BLASLONG i,j,k; | |||
| FLOAT *C0,*C1,*ptrba,*ptrbb; | |||
| FLOAT res0,res1; | |||
| BLASLONG off, temp; | |||
| FLOAT_V_T va0, va1, va2, va3, vb0, vb1, vb2, vb3; | |||
| FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7; | |||
| FLOAT_V_T_M1 v_m1_res0, v_m1_res1; | |||
| FLOAT_V_T_M1 v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); | |||
| size_t vl; | |||
| size_t vlmax = VSETVL_MAX; | |||
| #if defined(TRMMKERNEL) && !defined(LEFT) | |||
| off = -offset; | |||
| #else | |||
| off = 0; | |||
| #endif | |||
| for (j = bn/2; j > 0; j--) | |||
| { | |||
| #if defined(TRMMKERNEL) && defined(LEFT) | |||
| off = offset; | |||
| #endif | |||
| C0 = C; | |||
| C1 = C0+2*ldc; | |||
| ptrba = ba; | |||
| for (i = bm/2; i > 0; i--) | |||
| { | |||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| ptrbb = bb; | |||
| #else | |||
| ptrba += off*2*2; | |||
| ptrbb = bb+off*2*2; | |||
| #endif | |||
| vres0 = VFMVVF_FLOAT(0.0, vlmax); | |||
| vres1 = VFMVVF_FLOAT(0.0, vlmax); | |||
| vres2 = VFMVVF_FLOAT(0.0, vlmax); | |||
| vres3 = VFMVVF_FLOAT(0.0, vlmax); | |||
| vres4 = VFMVVF_FLOAT(0.0, vlmax); | |||
| vres5 = VFMVVF_FLOAT(0.0, vlmax); | |||
| vres6 = VFMVVF_FLOAT(0.0, vlmax); | |||
| vres7 = VFMVVF_FLOAT(0.0, vlmax); | |||
| #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) | |||
| temp = bk - off; | |||
| #elif defined(LEFT) | |||
| temp = off + 2; | |||
| #else | |||
| temp = off + 2; | |||
| #endif | |||
| for (k = temp; k > 0; k -= vl) | |||
| { | |||
| vl = VSETVL(k); | |||
| VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); | |||
| VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); | |||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | |||
| vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); | |||
| vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); | |||
| vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl); | |||
| vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); | |||
| vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); | |||
| vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl); | |||
| vres2 = VFNMSACVV_FLOAT(vres2, va3, vb1, vl); | |||
| vres3 = VFMACCVV_FLOAT(vres3, va2, vb1, vl); | |||
| vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl); | |||
| vres5 = VFMACCVV_FLOAT(vres5, va1, vb2, vl); | |||
| vres4 = VFNMSACVV_FLOAT(vres4, va1, vb3, vl); | |||
| vres5 = VFMACCVV_FLOAT(vres5, va0, vb3, vl); | |||
| vres6 = VFMACCVV_FLOAT(vres6, va2, vb2, vl); | |||
| vres7 = VFMACCVV_FLOAT(vres7, va3, vb2, vl); | |||
| vres6 = VFNMSACVV_FLOAT(vres6, va3, vb3, vl); | |||
| vres7 = VFMACCVV_FLOAT(vres7, va2, vb3, vl); | |||
| #endif | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) | |||
| vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); | |||
| vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); | |||
| vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); | |||
| vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl); | |||
| vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); | |||
| vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl); | |||
| vres2 = VFMACCVV_FLOAT(vres2, va3, vb1, vl); | |||
| vres3 = VFNMSACVV_FLOAT(vres3, va2, vb1, vl); | |||
| vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl); | |||
| vres5 = VFMACCVV_FLOAT(vres5, va1, vb2, vl); | |||
| vres4 = VFMACCVV_FLOAT(vres4, va1, vb3, vl); | |||
| vres5 = VFNMSACVV_FLOAT(vres5, va0, vb3, vl); | |||
| vres6 = VFMACCVV_FLOAT(vres6, va2, vb2, vl); | |||
| vres7 = VFMACCVV_FLOAT(vres7, va3, vb2, vl); | |||
| vres6 = VFMACCVV_FLOAT(vres6, va3, vb3, vl); | |||
| vres7 = VFNMSACVV_FLOAT(vres7, va2, vb3, vl); | |||
| #endif | |||
| #if defined(RN) || defined(RT) || defined(CN) || defined(CT) | |||
| vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); | |||
| vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl); | |||
| vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); | |||
| vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); | |||
| vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); | |||
| vres3 = VFNMSACVV_FLOAT(vres3, va3, vb0, vl); | |||
| vres2 = VFMACCVV_FLOAT(vres2, va3, vb1, vl); | |||
| vres3 = VFMACCVV_FLOAT(vres3, va2, vb1, vl); | |||
| vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl); | |||
| vres5 = VFNMSACVV_FLOAT(vres5, va1, vb2, vl); | |||
| vres4 = VFMACCVV_FLOAT(vres4, va1, vb3, vl); | |||
| vres5 = VFMACCVV_FLOAT(vres5, va0, vb3, vl); | |||
| vres6 = VFMACCVV_FLOAT(vres6, va2, vb2, vl); | |||
| vres7 = VFNMSACVV_FLOAT(vres7, va3, vb2, vl); | |||
| vres6 = VFMACCVV_FLOAT(vres6, va3, vb3, vl); | |||
| vres7 = VFMACCVV_FLOAT(vres7, va2, vb3, vl); | |||
| #endif | |||
| #if defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); | |||
| vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl); | |||
| vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); | |||
| vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl); | |||
| vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); | |||
| vres3 = VFNMSACVV_FLOAT(vres3, va3, vb0, vl); | |||
| vres2 = VFMACCVV_FLOAT(vres2, va3, vb1, vl); | |||
| vres3 = VFNMSACVV_FLOAT(vres3, va2, vb1, vl); | |||
| vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl); | |||
| vres5 = VFNMSACVV_FLOAT(vres5, va1, vb2, vl); | |||
| vres4 = VFMACCVV_FLOAT(vres4, va1, vb3, vl); | |||
| vres5 = VFNMSACVV_FLOAT(vres5, va0, vb3, vl); | |||
| vres6 = VFMACCVV_FLOAT(vres6, va2, vb2, vl); | |||
| vres7 = VFNMSACVV_FLOAT(vres7, va3, vb2, vl); | |||
| vres6 = VFMACCVV_FLOAT(vres6, va3, vb3, vl); | |||
| vres7 = VFNMSACVV_FLOAT(vres7, va2, vb3, vl); | |||
| #endif | |||
| ptrba += vl * 4; | |||
| ptrbb += vl * 4; | |||
| } | |||
| v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres0, v_z0, vlmax); | |||
| v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres1, v_z0, vlmax); | |||
| res0 = VFMVFS_FLOAT_M1(v_m1_res0); | |||
| res1 = VFMVFS_FLOAT_M1(v_m1_res1); | |||
| C0[0] = res0 * alphar - res1 * alphai; | |||
| C0[1] = res1 * alphar + res0 * alphai; | |||
| v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres2, v_z0, vlmax); | |||
| v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres3, v_z0, vlmax); | |||
| res0 = VFMVFS_FLOAT_M1(v_m1_res0); | |||
| res1 = VFMVFS_FLOAT_M1(v_m1_res1); | |||
| C0[2] = res0 * alphar - res1 * alphai; | |||
| C0[3] = res1 * alphar + res0 * alphai; | |||
| v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres4, v_z0, vlmax); | |||
| v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres5, v_z0, vlmax); | |||
| res0 = VFMVFS_FLOAT_M1(v_m1_res0); | |||
| res1 = VFMVFS_FLOAT_M1(v_m1_res1); | |||
| C1[0] = res0 * alphar - res1 * alphai; | |||
| C1[1] = res1 * alphar + res0 * alphai; | |||
| v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres6, v_z0, vlmax); | |||
| v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres7, v_z0, vlmax); | |||
| res0 = VFMVFS_FLOAT_M1(v_m1_res0); | |||
| res1 = VFMVFS_FLOAT_M1(v_m1_res1); | |||
| C1[2] = res0 * alphar - res1 * alphai; | |||
| C1[3] = res1 * alphar + res0 * alphai; | |||
| #if ( defined(LEFT) && defined(TRANSA)) || \ | |||
| (!defined(LEFT) && !defined(TRANSA)) | |||
| temp = bk - off; | |||
| #ifdef LEFT | |||
| temp -= 2; | |||
| #else | |||
| temp -= 2; | |||
| #endif | |||
| ptrba += temp*2*2; | |||
| ptrbb += temp*2*2; | |||
| #endif | |||
| #ifdef LEFT | |||
| off += 2; | |||
| #endif | |||
| C0 = C0+4; | |||
| C1 = C1+4; | |||
| } | |||
| if (bm & 1) | |||
| { | |||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| ptrbb = bb; | |||
| #else | |||
| ptrba += off*2; | |||
| ptrbb = bb + off*2*2; | |||
| #endif | |||
| vres0 = VFMVVF_FLOAT(0.0, vlmax); | |||
| vres1 = VFMVVF_FLOAT(0.0, vlmax); | |||
| vres2 = VFMVVF_FLOAT(0.0, vlmax); | |||
| vres3 = VFMVVF_FLOAT(0.0, vlmax); | |||
| #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) | |||
| temp = bk - off; | |||
| #elif defined(LEFT) | |||
| temp = off+1; | |||
| #else | |||
| temp = off+2; | |||
| #endif | |||
| for (k = temp; k > 0; k -= vl) | |||
| { | |||
| vl = VSETVL(k); | |||
| VLSEG2_FLOAT(&va0, &va1, ptrba, vl); | |||
| VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); | |||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | |||
| vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); | |||
| vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); | |||
| vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl); | |||
| vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); | |||
| vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl); | |||
| vres3 = VFMACCVV_FLOAT(vres3, va1, vb2, vl); | |||
| vres2 = VFNMSACVV_FLOAT(vres2, va1, vb3, vl); | |||
| vres3 = VFMACCVV_FLOAT(vres3, va0, vb3, vl); | |||
| #endif | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) | |||
| vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); | |||
| vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); | |||
| vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); | |||
| vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl); | |||
| vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl); | |||
| vres3 = VFMACCVV_FLOAT(vres3, va1, vb2, vl); | |||
| vres2 = VFMACCVV_FLOAT(vres2, va1, vb3, vl); | |||
| vres3 = VFNMSACVV_FLOAT(vres3, va0, vb3, vl); | |||
| #endif | |||
| #if defined(RN) || defined(RT) || defined(CN) || defined(CT) | |||
| vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); | |||
| vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl); | |||
| vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); | |||
| vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); | |||
| vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl); | |||
| vres3 = VFNMSACVV_FLOAT(vres3, va1, vb2, vl); | |||
| vres2 = VFMACCVV_FLOAT(vres2, va1, vb3, vl); | |||
| vres3 = VFMACCVV_FLOAT(vres3, va0, vb3, vl); | |||
| #endif | |||
| #if defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); | |||
| vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl); | |||
| vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl); | |||
| vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl); | |||
| vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl); | |||
| vres3 = VFNMSACVV_FLOAT(vres3, va1, vb2, vl); | |||
| vres2 = VFNMSACVV_FLOAT(vres2, va1, vb3, vl); | |||
| vres3 = VFNMSACVV_FLOAT(vres3, va0, vb3, vl); | |||
| #endif | |||
| ptrba += vl * 2; | |||
| ptrbb += vl * 4; | |||
| } | |||
| v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres0, v_z0, vlmax); | |||
| v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres1, v_z0, vlmax); | |||
| res0 = VFMVFS_FLOAT_M1(v_m1_res0); | |||
| res1 = VFMVFS_FLOAT_M1(v_m1_res1); | |||
| C0[0] = res0 * alphar - res1 * alphai; | |||
| C0[1] = res1 * alphar + res0 * alphai; | |||
| v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres2, v_z0, vlmax); | |||
| v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres3, v_z0, vlmax); | |||
| res0 = VFMVFS_FLOAT_M1(v_m1_res0); | |||
| res1 = VFMVFS_FLOAT_M1(v_m1_res1); | |||
| C1[0] = res0 * alphar - res1 * alphai; | |||
| C1[1] = res1 * alphar + res0 * alphai; | |||
| #if ( defined(LEFT) && defined(TRANSA)) || \ | |||
| (!defined(LEFT) && !defined(TRANSA)) | |||
| temp = bk - off; | |||
| #ifdef LEFT | |||
| temp -= 1; | |||
| #else | |||
| temp -= 2; | |||
| #endif | |||
| ptrba += temp*2; | |||
| ptrbb += temp*2*2; | |||
| #endif | |||
| #ifdef LEFT | |||
| off += 1; | |||
| #endif | |||
| C0 = C0+2; | |||
| C1 = C1+2; | |||
| } | |||
| #if defined(TRMMKERNEL) && !defined(LEFT) | |||
| off += 2; | |||
| #endif | |||
| k = (bk<<2); | |||
| bb = bb+k; | |||
| i = (ldc<<2); | |||
| C = C+i; | |||
| } | |||
| if (bn & 1) | |||
| { | |||
| C0 = C; | |||
| #if defined(TRMMKERNEL) && defined(LEFT) | |||
| off = offset; | |||
| #endif | |||
| ptrba = ba; | |||
| for (i = bm/2; i > 0; i--) | |||
| { | |||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| ptrbb = bb; | |||
| #else | |||
| ptrba += off*2*2; | |||
| ptrbb = bb+off*2; | |||
| #endif | |||
| vres0 = VFMVVF_FLOAT(0.0, vlmax); | |||
| vres1 = VFMVVF_FLOAT(0.0, vlmax); | |||
| vres2 = VFMVVF_FLOAT(0.0, vlmax); | |||
| vres3 = VFMVVF_FLOAT(0.0, vlmax); | |||
| #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) | |||
| temp = bk - off; | |||
| #elif defined(LEFT) | |||
| temp = off + 2; | |||
| #else | |||
| temp = off + 1; | |||
| #endif | |||
| for (k = temp; k > 0; k -= vl) | |||
| { | |||
| vl = VSETVL(k); | |||
| VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); | |||
| VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); | |||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | |||
| vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); | |||
| vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); | |||
| vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl); | |||
| vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); | |||
| vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); | |||
| vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl); | |||
| vres2 = VFNMSACVV_FLOAT(vres2, va3, vb1, vl); | |||
| vres3 = VFMACCVV_FLOAT(vres3, va2, vb1, vl); | |||
| #endif | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) | |||
| vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); | |||
| vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); | |||
| vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); | |||
| vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl); | |||
| vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); | |||
| vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl); | |||
| vres2 = VFMACCVV_FLOAT(vres2, va3, vb1, vl); | |||
| vres3 = VFNMSACVV_FLOAT(vres3, va2, vb1, vl); | |||
| #endif | |||
| #if defined(RN) || defined(RT) || defined(CN) || defined(CT) | |||
| vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); | |||
| vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl); | |||
| vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); | |||
| vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); | |||
| vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); | |||
| vres3 = VFNMSACVV_FLOAT(vres3, va3, vb0, vl); | |||
| vres2 = VFMACCVV_FLOAT(vres2, va3, vb1, vl); | |||
| vres3 = VFMACCVV_FLOAT(vres3, va2, vb1, vl); | |||
| #endif | |||
| #if defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); | |||
| vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl); | |||
| vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl); | |||
| vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl); | |||
| vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); | |||
| vres3 = VFNMSACVV_FLOAT(vres3, va3, vb0, vl); | |||
| vres2 = VFNMSACVV_FLOAT(vres2, va3, vb1, vl); | |||
| vres3 = VFNMSACVV_FLOAT(vres3, va2, vb1, vl); | |||
| #endif | |||
| ptrba += vl * 4; | |||
| ptrbb += vl * 2; | |||
| } | |||
| v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres0, v_z0, vlmax); | |||
| v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres1, v_z0, vlmax); | |||
| res0 = VFMVFS_FLOAT_M1(v_m1_res0); | |||
| res1 = VFMVFS_FLOAT_M1(v_m1_res1); | |||
| C0[0] = res0 * alphar - res1 * alphai; | |||
| C0[1] = res1 * alphar + res0 * alphai; | |||
| v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres2, v_z0, vlmax); | |||
| v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres3, v_z0, vlmax); | |||
| res0 = VFMVFS_FLOAT_M1(v_m1_res0); | |||
| res1 = VFMVFS_FLOAT_M1(v_m1_res1); | |||
| C0[2] = res0 * alphar - res1 * alphai; | |||
| C0[3] = res1 * alphar + res0 * alphai; | |||
| #if ( defined(LEFT) && defined(TRANSA)) || \ | |||
| (!defined(LEFT) && !defined(TRANSA)) | |||
| temp = bk-off; | |||
| #ifdef LEFT | |||
| temp -= 2; | |||
| #else | |||
| temp -= 1; | |||
| #endif | |||
| ptrba += temp*2*2; | |||
| ptrbb += temp*2; | |||
| #endif | |||
| #ifdef LEFT | |||
| off += 2; | |||
| #endif | |||
| C0 = C0+4; | |||
| } | |||
| if (bm & 1) | |||
| { | |||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| ptrbb = bb; | |||
| #else | |||
| ptrba += off*2; | |||
| ptrbb = bb + off*2; | |||
| #endif | |||
| vres0 = VFMVVF_FLOAT(0.0, vlmax); | |||
| vres1 = VFMVVF_FLOAT(0.0, vlmax); | |||
| #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) | |||
| temp = bk-off; | |||
| #elif defined(LEFT) | |||
| temp = off + 1; | |||
| #else | |||
| temp = off + 1; | |||
| #endif | |||
| for (k = temp; k > 0; k -= vl) | |||
| { | |||
| vl = VSETVL(k); | |||
| VLSEG2_FLOAT(&va0, &va1, ptrba, vl); | |||
| VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); | |||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | |||
| vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); | |||
| vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); | |||
| vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl); | |||
| vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); | |||
| #endif | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) | |||
| vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); | |||
| vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); | |||
| vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); | |||
| vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl); | |||
| #endif | |||
| #if defined(RN) || defined(RT) || defined(CN) || defined(CT) | |||
| vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); | |||
| vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl); | |||
| vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); | |||
| vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); | |||
| #endif | |||
| #if defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); | |||
| vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl); | |||
| vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl); | |||
| vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl); | |||
| #endif | |||
| ptrba += vl * 2; | |||
| ptrbb += vl * 2; | |||
| } | |||
| v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres0, v_z0, vlmax); | |||
| v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres1, v_z0, vlmax); | |||
| res0 = VFMVFS_FLOAT_M1(v_m1_res0); | |||
| res1 = VFMVFS_FLOAT_M1(v_m1_res1); | |||
| C0[0] = res0 * alphar - res1 * alphai; | |||
| C0[1] = res1 * alphar + res0 * alphai; | |||
| #if ( defined(LEFT) && defined(TRANSA)) || \ | |||
| (!defined(LEFT) && !defined(TRANSA)) | |||
| temp = bk - off; | |||
| #ifdef LEFT | |||
| temp -= 1; | |||
| #else | |||
| temp -= 1; | |||
| #endif | |||
| ptrba += temp*2; | |||
| ptrbb += temp*2; | |||
| #endif | |||
| #ifdef LEFT | |||
| off += 1; | |||
| #endif | |||
| C0 = C0+2; | |||
| } | |||
| k = (bk<<1); | |||
| bb = bb+k; | |||
| i = (ldc<<1); | |||
| C = C+i; | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -3038,6 +3038,50 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| #if defined(x280) | |||
| /* Default GEMM tuning parameters selected when the build defines x280. | |||
| * Layout: buffer offset/alignment defaults, per-precision (S/D/C/Z) unroll | |||
| * factors, the per-precision P/Q/R defaults, and finally SYMV_P. | |||
| * Each macro is defined exactly once in this block. */ | |||
| #define GEMM_DEFAULT_OFFSET_A 0 | |||
| #define GEMM_DEFAULT_OFFSET_B 0 | |||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||
| #define SGEMM_DEFAULT_UNROLL_M 16 | |||
| #define SGEMM_DEFAULT_UNROLL_N 8 | |||
| /* SGEMM_UNROLL_MN is calculated as max(SGEMM_UNROLL_M, SGEMM_UNROLL_N). | |||
| * Since we don't define SGEMM_UNROLL_M correctly we have to manually set this macro. | |||
| * If VLMAX size is ever more than 1024, this should be increased also. */ | |||
| #define SGEMM_DEFAULT_UNROLL_MN 32 | |||
| #define DGEMM_DEFAULT_UNROLL_M 16 | |||
| #define DGEMM_DEFAULT_UNROLL_N 8 | |||
| /* Also set by hand; presumably for the same reason as the SGEMM value | |||
| * above -- TODO confirm against the code that consumes it. */ | |||
| #define DGEMM_DEFAULT_UNROLL_MN 32 | |||
| #define CGEMM_DEFAULT_UNROLL_M 2 | |||
| #define CGEMM_DEFAULT_UNROLL_N 2 | |||
| #define ZGEMM_DEFAULT_UNROLL_M 2 | |||
| #define ZGEMM_DEFAULT_UNROLL_N 2 | |||
| #define SGEMM_DEFAULT_P 160 | |||
| #define DGEMM_DEFAULT_P 160 | |||
| #define CGEMM_DEFAULT_P 96 | |||
| #define ZGEMM_DEFAULT_P 64 | |||
| #define SGEMM_DEFAULT_Q 240 | |||
| #define DGEMM_DEFAULT_Q 128 | |||
| #define CGEMM_DEFAULT_Q 120 | |||
| #define ZGEMM_DEFAULT_Q 120 | |||
| #define SGEMM_DEFAULT_R 12288 | |||
| #define DGEMM_DEFAULT_R 8192 | |||
| #define CGEMM_DEFAULT_R 4096 | |||
| #define ZGEMM_DEFAULT_R 4096 | |||
| #define SYMV_P 16 | |||
| #endif | |||
| #ifdef C910V | |||
| #define GEMM_DEFAULT_OFFSET_A 0 | |||
| #define GEMM_DEFAULT_OFFSET_B 0 | |||