| @@ -101,6 +101,16 @@ CBLAS_INDEX cblas_idamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPE | |||||
| CBLAS_INDEX cblas_icamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | CBLAS_INDEX cblas_icamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | ||||
| CBLAS_INDEX cblas_izamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | CBLAS_INDEX cblas_izamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | ||||
| float cblas_samax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); | |||||
| double cblas_damax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); | |||||
| float cblas_scamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||||
| double cblas_dzamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||||
| float cblas_samin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); | |||||
| double cblas_damin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); | |||||
| float cblas_scamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||||
| double cblas_dzamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||||
| CBLAS_INDEX cblas_ismax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); | CBLAS_INDEX cblas_ismax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); | ||||
| CBLAS_INDEX cblas_idmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); | CBLAS_INDEX cblas_idmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); | ||||
| CBLAS_INDEX cblas_icmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | CBLAS_INDEX cblas_icmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | ||||
| @@ -116,6 +126,9 @@ void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS | |||||
| void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); | void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); | ||||
| void cblas_zaxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); | void cblas_zaxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); | ||||
| void cblas_caxpyc(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); | |||||
| void cblas_zaxpyc(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); | |||||
| void cblas_scopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy); | void cblas_scopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy); | ||||
| void cblas_dcopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy); | void cblas_dcopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy); | ||||
| void cblas_ccopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); | void cblas_ccopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); | ||||
| @@ -130,6 +130,8 @@ endif () | |||||
| foreach (float_type ${FLOAT_TYPES}) | foreach (float_type ${FLOAT_TYPES}) | ||||
| if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") | if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") | ||||
| GenerateNamedObjects("zaxpy.c" "" "axpyc" ${CBLAS_FLAG} "" "" false ${float_type}) | |||||
| GenerateNamedObjects("zger.c" "" "geru" ${CBLAS_FLAG} "" "" false ${float_type}) | GenerateNamedObjects("zger.c" "" "geru" ${CBLAS_FLAG} "" "" false ${float_type}) | ||||
| GenerateNamedObjects("zger.c" "CONJ" "gerc" ${CBLAS_FLAG} "" "" false ${float_type}) | GenerateNamedObjects("zger.c" "CONJ" "gerc" ${CBLAS_FLAG} "" "" false ${float_type}) | ||||
| GenerateNamedObjects("zdot.c" "CONJ" "dotc" ${CBLAS_FLAG} "" "" false ${float_type}) | GenerateNamedObjects("zdot.c" "CONJ" "dotc" ${CBLAS_FLAG} "" "" false ${float_type}) | ||||
| @@ -270,7 +270,8 @@ CSBLAS1OBJS = \ | |||||
| cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \ | cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \ | ||||
| cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \ | cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \ | ||||
| cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) \ | cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) \ | ||||
| cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) cblas_ssum.$(SUFFIX) | |||||
| cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) cblas_ssum.$(SUFFIX) cblas_samax.$(SUFFIX) \ | |||||
| cblas_samin.$(SUFFIX) | |||||
| CSBLAS2OBJS = \ | CSBLAS2OBJS = \ | ||||
| cblas_sgemv.$(SUFFIX) cblas_sger.$(SUFFIX) cblas_ssymv.$(SUFFIX) cblas_strmv.$(SUFFIX) \ | cblas_sgemv.$(SUFFIX) cblas_sger.$(SUFFIX) cblas_ssymv.$(SUFFIX) cblas_strmv.$(SUFFIX) \ | ||||
| @@ -295,7 +296,8 @@ CDBLAS1OBJS = \ | |||||
| cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \ | cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \ | ||||
| cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \ | cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \ | ||||
| cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) \ | cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) \ | ||||
| cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) cblas_dsum.$(SUFFIX) | |||||
| cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) cblas_dsum.$(SUFFIX) cblas_damax.$(SUFFIX) \ | |||||
| cblas_damin.$(SUFFIX) | |||||
| CDBLAS2OBJS = \ | CDBLAS2OBJS = \ | ||||
| cblas_dgemv.$(SUFFIX) cblas_dger.$(SUFFIX) cblas_dsymv.$(SUFFIX) cblas_dtrmv.$(SUFFIX) \ | cblas_dgemv.$(SUFFIX) cblas_dger.$(SUFFIX) cblas_dsymv.$(SUFFIX) cblas_dtrmv.$(SUFFIX) \ | ||||
| @@ -315,7 +317,7 @@ CCBLAS1OBJS = \ | |||||
| cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \ | cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \ | ||||
| cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \ | cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \ | ||||
| cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \ | cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \ | ||||
| cblas_caxpby.$(SUFFIX) \ | |||||
| cblas_caxpby.$(SUFFIX) cblas_scamax.$(SUFFIX) cblas_caxpyc.$(SUFFIX) cblas_scamin.$(SUFFIX) \ | |||||
| cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX) cblas_csrot.$(SUFFIX) cblas_crotg.$(SUFFIX) | cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX) cblas_csrot.$(SUFFIX) cblas_crotg.$(SUFFIX) | ||||
| CCBLAS2OBJS = \ | CCBLAS2OBJS = \ | ||||
| @@ -340,12 +342,12 @@ CXERBLAOBJ = \ | |||||
| CZBLAS1OBJS = \ | CZBLAS1OBJS = \ | ||||
| cblas_izamax.$(SUFFIX) cblas_izamin.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \ | cblas_izamax.$(SUFFIX) cblas_izamin.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \ | ||||
| cblas_zcopy.$(SUFFIX) \ | |||||
| cblas_zcopy.$(SUFFIX) cblas_dzamax.$(SUFFIX) cblas_dzamin.$(SUFFIX) \ | |||||
| cblas_zdotc.$(SUFFIX) cblas_zdotu.$(SUFFIX) \ | cblas_zdotc.$(SUFFIX) cblas_zdotu.$(SUFFIX) \ | ||||
| cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \ | cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \ | ||||
| cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \ | cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \ | ||||
| cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \ | cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \ | ||||
| cblas_zaxpby.$(SUFFIX) \ | |||||
| cblas_zaxpby.$(SUFFIX) cblas_zaxpyc.$(SUFFIX) \ | |||||
| cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX) cblas_zdrot.$(SUFFIX) cblas_zrotg.$(SUFFIX) | cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX) cblas_zdrot.$(SUFFIX) cblas_zrotg.$(SUFFIX) | ||||
| @@ -1533,6 +1535,30 @@ cblas_icmin.$(SUFFIX) cblas_icmin.$(PSUFFIX) : imax.c | |||||
| cblas_izmin.$(SUFFIX) cblas_izmin.$(PSUFFIX) : imax.c | cblas_izmin.$(SUFFIX) cblas_izmin.$(PSUFFIX) : imax.c | ||||
| $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F) | $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F) | ||||
| cblas_samax.$(SUFFIX) cblas_samax.$(PSUFFIX) : max.c | |||||
| $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) | |||||
| cblas_damax.$(SUFFIX) cblas_damax.$(PSUFFIX) : max.c | |||||
| $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) | |||||
| cblas_scamax.$(SUFFIX) cblas_scamax.$(PSUFFIX) : max.c | |||||
| $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) | |||||
| cblas_dzamax.$(SUFFIX) cblas_dzamax.$(PSUFFIX) : max.c | |||||
| $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) | |||||
| cblas_samin.$(SUFFIX) cblas_samin.$(PSUFFIX) : max.c | |||||
| $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) | |||||
| cblas_damin.$(SUFFIX) cblas_damin.$(PSUFFIX) : max.c | |||||
| $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) | |||||
| cblas_scamin.$(SUFFIX) cblas_scamin.$(PSUFFIX) : max.c | |||||
| $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) | |||||
| cblas_dzamin.$(SUFFIX) cblas_dzamin.$(PSUFFIX) : max.c | |||||
| $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) | |||||
| cblas_sasum.$(SUFFIX) cblas_sasum.$(PSUFFIX) : asum.c | cblas_sasum.$(SUFFIX) cblas_sasum.$(PSUFFIX) : asum.c | ||||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | ||||
| @@ -1627,6 +1653,19 @@ cblas_daxpy.$(SUFFIX) cblas_daxpy.$(PSUFFIX) : axpy.c | |||||
| cblas_caxpy.$(SUFFIX) cblas_caxpy.$(PSUFFIX) : zaxpy.c | cblas_caxpy.$(SUFFIX) cblas_caxpy.$(PSUFFIX) : zaxpy.c | ||||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | ||||
| cblas_caxpyc.$(SUFFIX) cblas_caxpyc.$(PSUFFIX) : zaxpy.c | |||||
| $(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F) | |||||
| cblas_zaxpyc.$(SUFFIX) cblas_zaxpyc.$(PSUFFIX) : zaxpy.c | |||||
| $(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F) | |||||
| cblas_xaxpyc.$(SUFFIX) cblas_xaxpyc.$(PSUFFIX) : zaxpy.c | |||||
| $(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F) | |||||
| sscal.$(SUFFIX) sscal.$(PSUFFIX) : scal.c | |||||
| $(CC) $(CFLAGS) -c $< -o $(@F) | |||||
| dscal.$(SUFFIX) dscal.$(PSUFFIX) : scal.c | |||||
| cblas_zaxpy.$(SUFFIX) cblas_zaxpy.$(PSUFFIX) : zaxpy.c | cblas_zaxpy.$(SUFFIX) cblas_zaxpy.$(PSUFFIX) : zaxpy.c | ||||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | ||||
| @@ -145,8 +145,13 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){ | |||||
| #else | #else | ||||
| #ifdef COMPLEX | |||||
| FLOAT CNAME(blasint n, void *vx, blasint incx){ | |||||
| FLOAT *x = (FLOAT*) vx; | |||||
| #else | |||||
| FLOAT CNAME(blasint n, FLOAT *x, blasint incx){ | FLOAT CNAME(blasint n, FLOAT *x, blasint incx){ | ||||
| #endif | |||||
| FLOAT ret; | FLOAT ret; | ||||
| PRINT_DEBUG_CNAME; | PRINT_DEBUG_CNAME; | ||||
| @@ -14,10 +14,12 @@ ZSCALKERNEL = cscal_lsx.S | |||||
| SAMAXKERNEL = amax_lsx.S | SAMAXKERNEL = amax_lsx.S | ||||
| DAMAXKERNEL = amax_lsx.S | DAMAXKERNEL = amax_lsx.S | ||||
| CAMAXKERNEL = camax_lsx.S | CAMAXKERNEL = camax_lsx.S | ||||
| ZAMAXKERNEL = camax_lsx.S | |||||
| SAMINKERNEL = amin_lsx.S | SAMINKERNEL = amin_lsx.S | ||||
| DAMINKERNEL = amin_lsx.S | DAMINKERNEL = amin_lsx.S | ||||
| CAMINKERNEL = camin_lsx.S | CAMINKERNEL = camin_lsx.S | ||||
| ZAMINKERNEL = camin_lsx.S | |||||
| SMAXKERNEL = max_lsx.S | SMAXKERNEL = max_lsx.S | ||||
| DMAXKERNEL = max_lsx.S | DMAXKERNEL = max_lsx.S | ||||
| @@ -14,10 +14,12 @@ ZSCALKERNEL = cscal_lasx.S | |||||
| SAMAXKERNEL = amax_lasx.S | SAMAXKERNEL = amax_lasx.S | ||||
| DAMAXKERNEL = amax_lasx.S | DAMAXKERNEL = amax_lasx.S | ||||
| CAMAXKERNEL = camax_lasx.S | CAMAXKERNEL = camax_lasx.S | ||||
| ZAMAXKERNEL = camax_lasx.S | |||||
| SAMINKERNEL = amin_lasx.S | SAMINKERNEL = amin_lasx.S | ||||
| DAMINKERNEL = amin_lasx.S | DAMINKERNEL = amin_lasx.S | ||||
| CAMINKERNEL = camin_lasx.S | CAMINKERNEL = camin_lasx.S | ||||
| ZAMINKERNEL = camin_lasx.S | |||||
| SMAXKERNEL = max_lsx.S | SMAXKERNEL = max_lsx.S | ||||
| DMAXKERNEL = max_lsx.S | DMAXKERNEL = max_lsx.S | ||||
| @@ -66,7 +66,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #else | #else | ||||
| xvldrepl.w VM0, X, 0 | xvldrepl.w VM0, X, 0 | ||||
| #endif | #endif | ||||
| XVFSUB VM0, VM0, VM0 | |||||
| bne INCX, TEMP, .L20 | bne INCX, TEMP, .L20 | ||||
| srai.d I, N, 4 | srai.d I, N, 4 | ||||
| @@ -66,7 +66,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #else | #else | ||||
| vldrepl.w VM0, X, 0 | vldrepl.w VM0, X, 0 | ||||
| #endif | #endif | ||||
| VFSUB VM0, VM0, VM0 | |||||
| bne INCX, TEMP, .L20 | bne INCX, TEMP, .L20 | ||||
| srai.d I, N, 3 | srai.d I, N, 3 | ||||
| @@ -63,42 +63,60 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| bge $r0, N, .L999 | bge $r0, N, .L999 | ||||
| bge $r0, INCX, .L999 | bge $r0, INCX, .L999 | ||||
| li.d TEMP, 1 | li.d TEMP, 1 | ||||
| li.w I, -1 | |||||
| slli.d TEMP, TEMP, ZBASE_SHIFT | slli.d TEMP, TEMP, ZBASE_SHIFT | ||||
| slli.d INCX, INCX, ZBASE_SHIFT | slli.d INCX, INCX, ZBASE_SHIFT | ||||
| xvreplgr2vr.w neg1, I | |||||
| xvffint.s.w neg1, neg1 | |||||
| srai.d I, N, 3 | srai.d I, N, 3 | ||||
| bne INCX, TEMP, .L20 | bne INCX, TEMP, .L20 | ||||
| bge $r0, I, .L23 | bge $r0, I, .L23 | ||||
| .align 3 | .align 3 | ||||
| .L10: | .L10: | ||||
| xvld VX0, X, 0 * SIZE | |||||
| xvld VX1, X, 8 * SIZE | |||||
| addi.d I, I, -1 | |||||
| xvld VX0, X, 0 | |||||
| xvld VX1, X, 32 | |||||
| #ifdef DOUBLE | |||||
| xvpickev.d x1, VX1, VX0 | |||||
| xvpickod.d x2, VX1, VX0 | |||||
| #else | |||||
| xvpickev.w x1, VX1, VX0 | xvpickev.w x1, VX1, VX0 | ||||
| xvpickod.w x2, VX1, VX0 | xvpickod.w x2, VX1, VX0 | ||||
| xvfmul.s x3, neg1, x1 | |||||
| xvfmul.s x4, neg1, x2 | |||||
| xvfcmp.clt.s VT0, x1, res0 | |||||
| xvfcmp.clt.s VT1, x2, res0 | |||||
| xvbitsel.v x1, x1, x3, VT0 | |||||
| xvbitsel.v x2, x2, x4, VT1 | |||||
| #endif | |||||
| XVFSUB x3, res0, x1 | |||||
| XVFSUB x4, res0, x2 | |||||
| XVFMAX x1, x1, x3 | |||||
| XVFMAX x2, x2, x4 | |||||
| XVFADD VM1, x1, x2 | |||||
| XVFMAX VM0, VM0, VM1 | |||||
| #ifdef DOUBLE | |||||
| xvld VX0, X, 64 | |||||
| xvld VX1, X, 96 | |||||
| xvpickev.d x1, VX1, VX0 | |||||
| xvpickod.d x2, VX1, VX0 | |||||
| XVFSUB x3, res0, x1 | |||||
| XVFSUB x4, res0, x2 | |||||
| XVFMAX x1, x1, x3 | |||||
| XVFMAX x2, x2, x4 | |||||
| XVFADD VM1, x1, x2 | |||||
| XVFMAX VM0, VM0, VM1 | |||||
| #endif | |||||
| addi.d I, I, -1 | |||||
| addi.d X, X, 16 * SIZE | addi.d X, X, 16 * SIZE | ||||
| xvfadd.s VM1, x1, x2 | |||||
| xvfmax.s VM0, VM0, VM1 | |||||
| blt $r0, I, .L10 | blt $r0, I, .L10 | ||||
| .align 3 | .align 3 | ||||
| .L11: | .L11: | ||||
| #ifdef DOUBLE | |||||
| xvpickve.d x1, VM0, 0 | |||||
| xvpickve.d x2, VM0, 1 | |||||
| XVFMAX VM0, x1, x2 | |||||
| #else | |||||
| xvpickve.w x1, VM0, 0 | xvpickve.w x1, VM0, 0 | ||||
| xvpickve.w x2, VM0, 1 | xvpickve.w x2, VM0, 1 | ||||
| xvpickve.w x3, VM0, 2 | xvpickve.w x3, VM0, 2 | ||||
| xvpickve.w x4, VM0, 3 | xvpickve.w x4, VM0, 3 | ||||
| xvfmax.s VM1, x1, x2 | |||||
| xvfmax.s VM0, x3, x4 | |||||
| xvfmax.s VM0, VM0, VM1 | |||||
| XVFMAX VM0, x1, x2 | |||||
| XVFMAX VM1, x3, x4 | |||||
| XVFMAX VM0, VM0, VM1 | |||||
| #endif | |||||
| b .L23 | b .L23 | ||||
| .align 3 | .align 3 | ||||
| @@ -107,66 +125,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .align 3 | .align 3 | ||||
| .L21: | .L21: | ||||
| fld.s t1, X, 0 * SIZE | |||||
| fld.s t2, X, 1 * SIZE | |||||
| LD t1, X, 0 * SIZE | |||||
| LD t2, X, 1 * SIZE | |||||
| add.d X, X, INCX | add.d X, X, INCX | ||||
| fld.s t3, X, 0 * SIZE | |||||
| fld.s t4, X, 1 * SIZE | |||||
| LD t3, X, 0 * SIZE | |||||
| LD t4, X, 1 * SIZE | |||||
| add.d X, X, INCX | add.d X, X, INCX | ||||
| fabs.s t1, t1 | |||||
| fabs.s t2, t2 | |||||
| fabs.s t3, t3 | |||||
| fabs.s t4, t4 | |||||
| fadd.s t1, t1, t2 | |||||
| fadd.s t3, t3, t4 | |||||
| fmax.s s1, t1, t3 | |||||
| fld.s t1, X, 0 * SIZE | |||||
| fld.s t2, X, 1 * SIZE | |||||
| FABS t1, t1 | |||||
| FABS t2, t2 | |||||
| FABS t3, t3 | |||||
| FABS t4, t4 | |||||
| ADD t1, t1, t2 | |||||
| ADD t3, t3, t4 | |||||
| FMAX s1, t1, t3 | |||||
| LD t1, X, 0 * SIZE | |||||
| LD t2, X, 1 * SIZE | |||||
| add.d X, X, INCX | add.d X, X, INCX | ||||
| fld.s t3, X, 0 * SIZE | |||||
| fld.s t4, X, 1 * SIZE | |||||
| LD t3, X, 0 * SIZE | |||||
| LD t4, X, 1 * SIZE | |||||
| add.d X, X, INCX | add.d X, X, INCX | ||||
| fabs.s t1, t1 | |||||
| fabs.s t2, t2 | |||||
| fabs.s t3, t3 | |||||
| fabs.s t4, t4 | |||||
| fadd.s t1, t1, t2 | |||||
| fadd.s t3, t3, t4 | |||||
| fmax.s s1, t1, t3 | |||||
| fld.s t1, X, 0 * SIZE | |||||
| fld.s t2, X, 1 * SIZE | |||||
| FABS t1, t1 | |||||
| FABS t2, t2 | |||||
| FABS t3, t3 | |||||
| FABS t4, t4 | |||||
| ADD t1, t1, t2 | |||||
| ADD t3, t3, t4 | |||||
| FMAX s1, t1, t3 | |||||
| LD t1, X, 0 * SIZE | |||||
| LD t2, X, 1 * SIZE | |||||
| add.d X, X, INCX | add.d X, X, INCX | ||||
| fld.s t3, X, 0 * SIZE | |||||
| fld.s t4, X, 1 * SIZE | |||||
| LD t3, X, 0 * SIZE | |||||
| LD t4, X, 1 * SIZE | |||||
| add.d X, X, INCX | add.d X, X, INCX | ||||
| fabs.s t1, t1 | |||||
| fabs.s t2, t2 | |||||
| fabs.s t3, t3 | |||||
| fabs.s t4, t4 | |||||
| FABS t1, t1 | |||||
| FABS t2, t2 | |||||
| FABS t3, t3 | |||||
| FABS t4, t4 | |||||
| addi.d I, I, -1 | addi.d I, I, -1 | ||||
| fadd.s t1, t1, t2 | |||||
| fadd.s t3, t3, t4 | |||||
| fmax.s s3, t1, t3 | |||||
| fld.s t1, X, 0 * SIZE | |||||
| fld.s t2, X, 1 * SIZE | |||||
| ADD t1, t1, t2 | |||||
| ADD t3, t3, t4 | |||||
| FMAX s3, t1, t3 | |||||
| LD t1, X, 0 * SIZE | |||||
| LD t2, X, 1 * SIZE | |||||
| add.d X, X, INCX | add.d X, X, INCX | ||||
| fld.s t3, X, 0 * SIZE | |||||
| fld.s t4, X, 1 * SIZE | |||||
| LD t3, X, 0 * SIZE | |||||
| LD t4, X, 1 * SIZE | |||||
| add.d X, X, INCX | add.d X, X, INCX | ||||
| fabs.s t1, t1 | |||||
| fabs.s t2, t2 | |||||
| fabs.s t3, t3 | |||||
| fabs.s t4, t4 | |||||
| fadd.s t1, t1, t2 | |||||
| fadd.s t3, t3, t4 | |||||
| fmax.s s4, t1, t3 | |||||
| FABS t1, t1 | |||||
| FABS t2, t2 | |||||
| FABS t3, t3 | |||||
| FABS t4, t4 | |||||
| ADD t1, t1, t2 | |||||
| ADD t3, t3, t4 | |||||
| FMAX s4, t1, t3 | |||||
| blt $r0, I, .L21 | blt $r0, I, .L21 | ||||
| .align 3 | .align 3 | ||||
| .L22: | .L22: | ||||
| fmax.s s1, s1, s2 | |||||
| fmax.s s3, s3, s4 | |||||
| fmax.s s1, s1, s3 | |||||
| FMAX s1, s1, s2 | |||||
| FMAX s3, s3, s4 | |||||
| FMAX s1, s1, s3 | |||||
| .align 3 | .align 3 | ||||
| .L23: //N<8 | .L23: //N<8 | ||||
| @@ -182,12 +200,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| FABS a1, a1 | FABS a1, a1 | ||||
| ADD a0, a0, a1 | ADD a0, a0, a1 | ||||
| add.d X, X, INCX | add.d X, X, INCX | ||||
| fmax.s s1, a0, s1 | |||||
| FMAX s1, a0, s1 | |||||
| blt $r0, I, .L24 | blt $r0, I, .L24 | ||||
| .align 3 | .align 3 | ||||
| .L999: | .L999: | ||||
| fmov.s $f0, $f22 | |||||
| MOV $f0, $f22 | |||||
| jirl $r0, $r1, 0x0 | jirl $r0, $r1, 0x0 | ||||
| .align 3 | .align 3 | ||||
| @@ -63,54 +63,87 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| bge $r0, N, .L999 | bge $r0, N, .L999 | ||||
| bge $r0, INCX, .L999 | bge $r0, INCX, .L999 | ||||
| li.d TEMP, 1 | li.d TEMP, 1 | ||||
| li.w I, -1 | |||||
| slli.d TEMP, TEMP, ZBASE_SHIFT | slli.d TEMP, TEMP, ZBASE_SHIFT | ||||
| slli.d INCX, INCX, ZBASE_SHIFT | slli.d INCX, INCX, ZBASE_SHIFT | ||||
| vreplgr2vr.w neg1, I | |||||
| vffint.s.w neg1, neg1 | |||||
| srai.d I, N, 3 | srai.d I, N, 3 | ||||
| bne INCX, TEMP, .L20 | bne INCX, TEMP, .L20 | ||||
| bge $r0, I, .L23 | bge $r0, I, .L23 | ||||
| .align 3 | .align 3 | ||||
| .L10: | .L10: | ||||
| vld VX0, X, 0 * SIZE | |||||
| vld VX1, X, 4 * SIZE | |||||
| addi.d I, I, -1 | |||||
| vld VX0, X, 0 | |||||
| vld VX1, X, 16 | |||||
| #ifdef DOUBLE | |||||
| vpickev.d x1, VX1, VX0 | |||||
| vpickod.d x2, VX1, VX0 | |||||
| #else | |||||
| vpickev.w x1, VX1, VX0 | vpickev.w x1, VX1, VX0 | ||||
| vpickod.w x2, VX1, VX0 | vpickod.w x2, VX1, VX0 | ||||
| vfmul.s x3, neg1, x1 | |||||
| vfmul.s x4, neg1, x2 | |||||
| vfcmp.clt.s VT0, x1, res0 | |||||
| vfcmp.clt.s VT1, x2, res0 | |||||
| vld VX0, X, 8 * SIZE | |||||
| vbitsel.v x1, x1, x3, VT0 | |||||
| vbitsel.v x2, x2, x4, VT1 | |||||
| vld VX1, X, 12 * SIZE | |||||
| vfadd.s VM1, x1, x2 | |||||
| #endif | |||||
| VFSUB x3, res0, x1 | |||||
| VFSUB x4, res0, x2 | |||||
| VFMAX x1, x1, x3 | |||||
| VFMAX x2, x2, x4 | |||||
| VFADD VM1, x1, x2 | |||||
| vld VX0, X, 32 | |||||
| vld VX1, X, 48 | |||||
| #ifdef DOUBLE | |||||
| vpickev.d x1, VX1, VX0 | |||||
| vpickod.d x2, VX1, VX0 | |||||
| #else | |||||
| vpickev.w x1, VX1, VX0 | vpickev.w x1, VX1, VX0 | ||||
| vpickod.w x2, VX1, VX0 | vpickod.w x2, VX1, VX0 | ||||
| vfmul.s x3, neg1, x1 | |||||
| vfmul.s x4, neg1, x2 | |||||
| vfcmp.clt.s VT0, x1, res0 | |||||
| vfcmp.clt.s VT1, x2, res0 | |||||
| #endif | |||||
| VFSUB x3, res0, x1 | |||||
| VFSUB x4, res0, x2 | |||||
| VFMAX x1, x1, x3 | |||||
| VFMAX x2, x2, x4 | |||||
| VFADD x1, x1, x2 | |||||
| VFMAX VM1, x1, VM1 | |||||
| VFMAX VM0, VM0, VM1 | |||||
| #ifdef DOUBLE | |||||
| vld VX0, X, 64 | |||||
| vld VX1, X, 80 | |||||
| vpickev.d x1, VX1, VX0 | |||||
| vpickod.d x2, VX1, VX0 | |||||
| VFSUB x3, res0, x1 | |||||
| VFSUB x4, res0, x2 | |||||
| VFMAX x1, x1, x3 | |||||
| VFMAX x2, x2, x4 | |||||
| VFADD VM1, x1, x2 | |||||
| vld VX0, X, 96 | |||||
| vld VX1, X, 112 | |||||
| vpickev.d x1, VX1, VX0 | |||||
| vpickod.d x2, VX1, VX0 | |||||
| VFSUB x3, res0, x1 | |||||
| VFSUB x4, res0, x2 | |||||
| VFMAX x1, x1, x3 | |||||
| VFMAX x2, x2, x4 | |||||
| VFADD x1, x1, x2 | |||||
| VFMAX VM1, x1, VM1 | |||||
| VFMAX VM0, VM0, VM1 | |||||
| #endif | |||||
| addi.d X, X, 16 * SIZE | addi.d X, X, 16 * SIZE | ||||
| vbitsel.v x1, x1, x3, VT0 | |||||
| vbitsel.v x2, x2, x4, VT1 | |||||
| vfadd.s x1, x1, x2 | |||||
| vfmax.s VM1, x1, VM1 | |||||
| vfmax.s VM0, VM0, VM1 | |||||
| addi.d I, I, -1 | |||||
| blt $r0, I, .L10 | blt $r0, I, .L10 | ||||
| .align 3 | .align 3 | ||||
| .L11: | .L11: | ||||
| #ifdef DOUBLE | |||||
| vreplvei.d x1, VM0, 0 | |||||
| vreplvei.d x2, VM0, 1 | |||||
| VFMAX VM0, x1, x2 | |||||
| #else | |||||
| vreplvei.w x1, VM0, 0 | vreplvei.w x1, VM0, 0 | ||||
| vreplvei.w x2, VM0, 1 | vreplvei.w x2, VM0, 1 | ||||
| vreplvei.w x3, VM0, 2 | vreplvei.w x3, VM0, 2 | ||||
| vreplvei.w x4, VM0, 3 | vreplvei.w x4, VM0, 3 | ||||
| vfmax.s VM1, x1, x2 | |||||
| vfmax.s VM0, x3, x4 | |||||
| vfmax.s VM0, VM0, VM1 | |||||
| VFMAX VM1, x1, x2 | |||||
| VFMAX VM0, x3, x4 | |||||
| VFMAX VM0, VM0, VM1 | |||||
| #endif | |||||
| b .L23 | b .L23 | ||||
| .align 3 | .align 3 | ||||
| @@ -119,66 +152,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .align 3 | .align 3 | ||||
| .L21: | .L21: | ||||
| fld.s t1, X, 0 * SIZE | |||||
| fld.s t2, X, 1 * SIZE | |||||
| LD t1, X, 0 * SIZE | |||||
| LD t2, X, 1 * SIZE | |||||
| add.d X, X, INCX | add.d X, X, INCX | ||||
| fld.s t3, X, 0 * SIZE | |||||
| fld.s t4, X, 1 * SIZE | |||||
| LD t3, X, 0 * SIZE | |||||
| LD t4, X, 1 * SIZE | |||||
| add.d X, X, INCX | add.d X, X, INCX | ||||
| fabs.s t1, t1 | |||||
| fabs.s t2, t2 | |||||
| fabs.s t3, t3 | |||||
| fabs.s t4, t4 | |||||
| fadd.s t1, t1, t2 | |||||
| fadd.s t3, t3, t4 | |||||
| fmax.s s1, t1, t3 | |||||
| fld.s t1, X, 0 * SIZE | |||||
| fld.s t2, X, 1 * SIZE | |||||
| FABS t1, t1 | |||||
| FABS t2, t2 | |||||
| FABS t3, t3 | |||||
| FABS t4, t4 | |||||
| ADD t1, t1, t2 | |||||
| ADD t3, t3, t4 | |||||
| FMAX s1, t1, t3 | |||||
| LD t1, X, 0 * SIZE | |||||
| LD t2, X, 1 * SIZE | |||||
| add.d X, X, INCX | add.d X, X, INCX | ||||
| fld.s t3, X, 0 * SIZE | |||||
| fld.s t4, X, 1 * SIZE | |||||
| LD t3, X, 0 * SIZE | |||||
| LD t4, X, 1 * SIZE | |||||
| add.d X, X, INCX | add.d X, X, INCX | ||||
| fabs.s t1, t1 | |||||
| fabs.s t2, t2 | |||||
| fabs.s t3, t3 | |||||
| fabs.s t4, t4 | |||||
| fadd.s t1, t1, t2 | |||||
| fadd.s t3, t3, t4 | |||||
| fmax.s s1, t1, t3 | |||||
| fld.s t1, X, 0 * SIZE | |||||
| fld.s t2, X, 1 * SIZE | |||||
| FABS t1, t1 | |||||
| FABS t2, t2 | |||||
| FABS t3, t3 | |||||
| FABS t4, t4 | |||||
| ADD t1, t1, t2 | |||||
| ADD t3, t3, t4 | |||||
| FMAX s1, t1, t3 | |||||
| LD t1, X, 0 * SIZE | |||||
| LD t2, X, 1 * SIZE | |||||
| add.d X, X, INCX | add.d X, X, INCX | ||||
| fld.s t3, X, 0 * SIZE | |||||
| fld.s t4, X, 1 * SIZE | |||||
| LD t3, X, 0 * SIZE | |||||
| LD t4, X, 1 * SIZE | |||||
| add.d X, X, INCX | add.d X, X, INCX | ||||
| fabs.s t1, t1 | |||||
| fabs.s t2, t2 | |||||
| fabs.s t3, t3 | |||||
| fabs.s t4, t4 | |||||
| FABS t1, t1 | |||||
| FABS t2, t2 | |||||
| FABS t3, t3 | |||||
| FABS t4, t4 | |||||
| addi.d I, I, -1 | addi.d I, I, -1 | ||||
| fadd.s t1, t1, t2 | |||||
| fadd.s t3, t3, t4 | |||||
| fmax.s s3, t1, t3 | |||||
| fld.s t1, X, 0 * SIZE | |||||
| fld.s t2, X, 1 * SIZE | |||||
| ADD t1, t1, t2 | |||||
| ADD t3, t3, t4 | |||||
| FMAX s3, t1, t3 | |||||
| LD t1, X, 0 * SIZE | |||||
| LD t2, X, 1 * SIZE | |||||
| add.d X, X, INCX | add.d X, X, INCX | ||||
| fld.s t3, X, 0 * SIZE | |||||
| fld.s t4, X, 1 * SIZE | |||||
| LD t3, X, 0 * SIZE | |||||
| LD t4, X, 1 * SIZE | |||||
| add.d X, X, INCX | add.d X, X, INCX | ||||
| fabs.s t1, t1 | |||||
| fabs.s t2, t2 | |||||
| fabs.s t3, t3 | |||||
| fabs.s t4, t4 | |||||
| fadd.s t1, t1, t2 | |||||
| fadd.s t3, t3, t4 | |||||
| fmax.s s4, t1, t3 | |||||
| FABS t1, t1 | |||||
| FABS t2, t2 | |||||
| FABS t3, t3 | |||||
| FABS t4, t4 | |||||
| ADD t1, t1, t2 | |||||
| ADD t3, t3, t4 | |||||
| FMAX s4, t1, t3 | |||||
| blt $r0, I, .L21 | blt $r0, I, .L21 | ||||
| .align 3 | .align 3 | ||||
| .L22: | .L22: | ||||
| fmax.s s1, s1, s2 | |||||
| fmax.s s3, s3, s4 | |||||
| fmax.s s1, s1, s3 | |||||
| FMAX s1, s1, s2 | |||||
| FMAX s3, s3, s4 | |||||
| FMAX s1, s1, s3 | |||||
| .align 3 | .align 3 | ||||
| .L23: //N<8 | .L23: //N<8 | ||||
| @@ -187,19 +220,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .align 3 | .align 3 | ||||
| .L24: | .L24: | ||||
| fld.s a0, X, 0 * SIZE | |||||
| fld.s a1, X, 1 * SIZE | |||||
| LD a0, X, 0 * SIZE | |||||
| LD a1, X, 1 * SIZE | |||||
| addi.d I, I, -1 | addi.d I, I, -1 | ||||
| fabs.s a0, a0 | |||||
| fabs.s a1, a1 | |||||
| fadd.s a0, a0, a1 | |||||
| FABS a0, a0 | |||||
| FABS a1, a1 | |||||
| ADD a0, a0, a1 | |||||
| add.d X, X, INCX | add.d X, X, INCX | ||||
| fmax.s s1, a0, s1 | |||||
| FMAX s1, a0, s1 | |||||
| blt $r0, I, .L24 | blt $r0, I, .L24 | ||||
| .align 3 | .align 3 | ||||
| .L999: | .L999: | ||||
| fmov.s $f0, $f22 | |||||
| MOV $f0, $f22 | |||||
| jirl $r0, $r1, 0x0 | jirl $r0, $r1, 0x0 | ||||
| .align 3 | .align 3 | ||||
| @@ -61,49 +61,71 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| xvxor.v res0, res0, res0 | xvxor.v res0, res0, res0 | ||||
| bge $r0, N, .L999 | bge $r0, N, .L999 | ||||
| bge $r0, INCX, .L999 | bge $r0, INCX, .L999 | ||||
| fld.s a0, X, 0 * SIZE | |||||
| fld.s a1, X, 1 * SIZE | |||||
| fabs.s a0, a0 | |||||
| fabs.s a1, a1 | |||||
| fadd.s s1, a1, a0 | |||||
| LD a0, X, 0 * SIZE | |||||
| LD a1, X, 1 * SIZE | |||||
| FABS a0, a0 | |||||
| FABS a1, a1 | |||||
| ADD s1, a1, a0 | |||||
| #ifdef DOUBLE | |||||
| xvreplve0.d VM0, VM0 | |||||
| #else | |||||
| xvreplve0.w VM0, VM0 | xvreplve0.w VM0, VM0 | ||||
| #endif | |||||
| li.d TEMP, 1 | li.d TEMP, 1 | ||||
| li.w I, -1 | |||||
| slli.d TEMP, TEMP, ZBASE_SHIFT | slli.d TEMP, TEMP, ZBASE_SHIFT | ||||
| slli.d INCX, INCX, ZBASE_SHIFT | slli.d INCX, INCX, ZBASE_SHIFT | ||||
| xvreplgr2vr.w neg1, I | |||||
| xvffint.s.w neg1, neg1 | |||||
| srai.d I, N, 3 | srai.d I, N, 3 | ||||
| bne INCX, TEMP, .L20 | bne INCX, TEMP, .L20 | ||||
| bge $r0, I, .L23 | bge $r0, I, .L23 | ||||
| .align 3 | .align 3 | ||||
| .L10: | .L10: | ||||
| xvld VX0, X, 0 * SIZE | |||||
| xvld VX1, X, 8 * SIZE | |||||
| addi.d I, I, -1 | |||||
| xvld VX0, X, 0 | |||||
| xvld VX1, X, 32 | |||||
| #ifdef DOUBLE | |||||
| xvpickev.d x1, VX1, VX0 | |||||
| xvpickod.d x2, VX1, VX0 | |||||
| #else | |||||
| xvpickev.w x1, VX1, VX0 | xvpickev.w x1, VX1, VX0 | ||||
| xvpickod.w x2, VX1, VX0 | xvpickod.w x2, VX1, VX0 | ||||
| xvfmul.s x3, neg1, x1 | |||||
| xvfmul.s x4, neg1, x2 | |||||
| xvfcmp.clt.s VT0, x1, res0 | |||||
| xvfcmp.clt.s VT1, x2, res0 | |||||
| xvbitsel.v x1, x1, x3, VT0 | |||||
| xvbitsel.v x2, x2, x4, VT1 | |||||
| #endif | |||||
| XVFSUB x3, res0, x1 | |||||
| XVFSUB x4, res0, x2 | |||||
| XVFMAX x1, x1, x3 | |||||
| XVFMAX x2, x2, x4 | |||||
| XVFADD VM1, x1, x2 | |||||
| XVFMIN VM0, VM0, VM1 | |||||
| #ifdef DOUBLE | |||||
| xvld VX0, X, 64 | |||||
| xvld VX1, X, 96 | |||||
| xvpickev.d x1, VX1, VX0 | |||||
| xvpickod.d x2, VX1, VX0 | |||||
| XVFSUB x3, res0, x1 | |||||
| XVFSUB x4, res0, x2 | |||||
| XVFMAX x1, x1, x3 | |||||
| XVFMAX x2, x2, x4 | |||||
| XVFADD VM1, x1, x2 | |||||
| XVFMIN VM0, VM0, VM1 | |||||
| #endif | |||||
| addi.d I, I, -1 | |||||
| addi.d X, X, 16 * SIZE | addi.d X, X, 16 * SIZE | ||||
| xvfadd.s VM1, x1, x2 | |||||
| xvfmin.s VM0, VM0, VM1 | |||||
| blt $r0, I, .L10 | blt $r0, I, .L10 | ||||
| .align 3 | .align 3 | ||||
| .L11: | .L11: | ||||
| #ifdef DOUBLE | |||||
| xvpickve.d x1, VM0, 0 | |||||
| xvpickve.d x2, VM0, 1 | |||||
| XVFMIN VM0, x1, x2 | |||||
| #else | |||||
| xvpickve.w x1, VM0, 0 | xvpickve.w x1, VM0, 0 | ||||
| xvpickve.w x2, VM0, 1 | xvpickve.w x2, VM0, 1 | ||||
| xvpickve.w x3, VM0, 2 | xvpickve.w x3, VM0, 2 | ||||
| xvpickve.w x4, VM0, 3 | xvpickve.w x4, VM0, 3 | ||||
| xvfmin.s VM1, x1, x2 | |||||
| xvfmin.s VM0, x3, x4 | |||||
| xvfmin.s VM0, VM0, VM1 | |||||
| XVFMIN VM0, x1, x2 | |||||
| XVFMIN VM1, x3, x4 | |||||
| XVFMIN VM0, VM0, VM1 | |||||
| #endif | |||||
| b .L23 | b .L23 | ||||
| .align 3 | .align 3 | ||||
| @@ -112,66 +134,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .align 3 | .align 3 | ||||
| .L21: | .L21: | ||||
| fld.s t1, X, 0 * SIZE | |||||
| fld.s t2, X, 1 * SIZE | |||||
| LD t1, X, 0 * SIZE | |||||
| LD t2, X, 1 * SIZE | |||||
| add.d X, X, INCX | add.d X, X, INCX | ||||
| fld.s t3, X, 0 * SIZE | |||||
| fld.s t4, X, 1 * SIZE | |||||
| LD t3, X, 0 * SIZE | |||||
| LD t4, X, 1 * SIZE | |||||
| add.d X, X, INCX | add.d X, X, INCX | ||||
| fabs.s t1, t1 | |||||
| fabs.s t2, t2 | |||||
| fabs.s t3, t3 | |||||
| fabs.s t4, t4 | |||||
| fadd.s t1, t1, t2 | |||||
| fadd.s t3, t3, t4 | |||||
| fmin.s s1, t1, t3 | |||||
| fld.s t1, X, 0 * SIZE | |||||
| fld.s t2, X, 1 * SIZE | |||||
| FABS t1, t1 | |||||
| FABS t2, t2 | |||||
| FABS t3, t3 | |||||
| FABS t4, t4 | |||||
| ADD t1, t1, t2 | |||||
| ADD t3, t3, t4 | |||||
| FMIN s1, t1, t3 | |||||
| LD t1, X, 0 * SIZE | |||||
| LD t2, X, 1 * SIZE | |||||
| add.d X, X, INCX | add.d X, X, INCX | ||||
| fld.s t3, X, 0 * SIZE | |||||
| fld.s t4, X, 1 * SIZE | |||||
| LD t3, X, 0 * SIZE | |||||
| LD t4, X, 1 * SIZE | |||||
| add.d X, X, INCX | add.d X, X, INCX | ||||
| fabs.s t1, t1 | |||||
| fabs.s t2, t2 | |||||
| fabs.s t3, t3 | |||||
| fabs.s t4, t4 | |||||
| fadd.s t1, t1, t2 | |||||
| fadd.s t3, t3, t4 | |||||
| fmin.s s1, t1, t3 | |||||
| fld.s t1, X, 0 * SIZE | |||||
| fld.s t2, X, 1 * SIZE | |||||
| FABS t1, t1 | |||||
| FABS t2, t2 | |||||
| FABS t3, t3 | |||||
| FABS t4, t4 | |||||
| ADD t1, t1, t2 | |||||
| ADD t3, t3, t4 | |||||
| FMIN s1, t1, t3 | |||||
| LD t1, X, 0 * SIZE | |||||
| LD t2, X, 1 * SIZE | |||||
| add.d X, X, INCX | add.d X, X, INCX | ||||
| fld.s t3, X, 0 * SIZE | |||||
| fld.s t4, X, 1 * SIZE | |||||
| LD t3, X, 0 * SIZE | |||||
| LD t4, X, 1 * SIZE | |||||
| add.d X, X, INCX | add.d X, X, INCX | ||||
| fabs.s t1, t1 | |||||
| fabs.s t2, t2 | |||||
| fabs.s t3, t3 | |||||
| fabs.s t4, t4 | |||||
| FABS t1, t1 | |||||
| FABS t2, t2 | |||||
| FABS t3, t3 | |||||
| FABS t4, t4 | |||||
| addi.d I, I, -1 | addi.d I, I, -1 | ||||
| fadd.s t1, t1, t2 | |||||
| fadd.s t3, t3, t4 | |||||
| fmin.s s3, t1, t3 | |||||
| fld.s t1, X, 0 * SIZE | |||||
| fld.s t2, X, 1 * SIZE | |||||
| ADD t1, t1, t2 | |||||
| ADD t3, t3, t4 | |||||
| FMIN s3, t1, t3 | |||||
| LD t1, X, 0 * SIZE | |||||
| LD t2, X, 1 * SIZE | |||||
| add.d X, X, INCX | add.d X, X, INCX | ||||
| fld.s t3, X, 0 * SIZE | |||||
| fld.s t4, X, 1 * SIZE | |||||
| LD t3, X, 0 * SIZE | |||||
| LD t4, X, 1 * SIZE | |||||
| add.d X, X, INCX | add.d X, X, INCX | ||||
| fabs.s t1, t1 | |||||
| fabs.s t2, t2 | |||||
| fabs.s t3, t3 | |||||
| fabs.s t4, t4 | |||||
| fadd.s t1, t1, t2 | |||||
| fadd.s t3, t3, t4 | |||||
| fmin.s s4, t1, t3 | |||||
| FABS t1, t1 | |||||
| FABS t2, t2 | |||||
| FABS t3, t3 | |||||
| FABS t4, t4 | |||||
| ADD t1, t1, t2 | |||||
| ADD t3, t3, t4 | |||||
| FMIN s4, t1, t3 | |||||
| blt $r0, I, .L21 | blt $r0, I, .L21 | ||||
| .align 3 | .align 3 | ||||
| .L22: | .L22: | ||||
| fmin.s s1, s1, s2 | |||||
| fmin.s s3, s3, s4 | |||||
| fmin.s s1, s1, s3 | |||||
| FMIN s1, s1, s2 | |||||
| FMIN s3, s3, s4 | |||||
| FMIN s1, s1, s3 | |||||
| .align 3 | .align 3 | ||||
| .L23: //N<8 | .L23: //N<8 | ||||
| @@ -187,12 +209,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| FABS a1, a1 | FABS a1, a1 | ||||
| ADD a0, a0, a1 | ADD a0, a0, a1 | ||||
| add.d X, X, INCX | add.d X, X, INCX | ||||
| fmin.s s1, a0, s1 | |||||
| FMIN s1, a0, s1 | |||||
| blt $r0, I, .L24 | blt $r0, I, .L24 | ||||
| .align 3 | .align 3 | ||||
| .L999: | .L999: | ||||
| fmov.s $f0, $f22 | |||||
| MOV $f0, $f22 | |||||
| jirl $r0, $r1, 0x0 | jirl $r0, $r1, 0x0 | ||||
| .align 3 | .align 3 | ||||
| @@ -61,61 +61,98 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vxor.v res0, res0, res0 | vxor.v res0, res0, res0 | ||||
| bge $r0, N, .L999 | bge $r0, N, .L999 | ||||
| bge $r0, INCX, .L999 | bge $r0, INCX, .L999 | ||||
| fld.s a0, X, 0 * SIZE | |||||
| fld.s a1, X, 1 * SIZE | |||||
| fabs.s a0, a0 | |||||
| fabs.s a1, a1 | |||||
| fadd.s s1, a1, a0 | |||||
| LD a0, X, 0 * SIZE | |||||
| LD a1, X, 1 * SIZE | |||||
| FABS a0, a0 | |||||
| FABS a1, a1 | |||||
| ADD s1, a1, a0 | |||||
| #ifdef DOUBLE | |||||
| vreplvei.d VM0, VM0, 0 | |||||
| #else | |||||
| vreplvei.w VM0, VM0, 0 | vreplvei.w VM0, VM0, 0 | ||||
| #endif | |||||
| li.d TEMP, 1 | li.d TEMP, 1 | ||||
| li.w I, -1 | |||||
| slli.d TEMP, TEMP, ZBASE_SHIFT | slli.d TEMP, TEMP, ZBASE_SHIFT | ||||
| slli.d INCX, INCX, ZBASE_SHIFT | slli.d INCX, INCX, ZBASE_SHIFT | ||||
| vreplgr2vr.w neg1, I | |||||
| vffint.s.w neg1, neg1 | |||||
| srai.d I, N, 3 | srai.d I, N, 3 | ||||
| bne INCX, TEMP, .L20 | bne INCX, TEMP, .L20 | ||||
| bge $r0, I, .L23 | bge $r0, I, .L23 | ||||
| .align 3 | .align 3 | ||||
| .L10: | .L10: | ||||
| vld VX0, X, 0 * SIZE | |||||
| vld VX1, X, 4 * SIZE | |||||
| addi.d I, I, -1 | |||||
| vld VX0, X, 0 | |||||
| vld VX1, X, 16 | |||||
| #ifdef DOUBLE | |||||
| vpickev.d x1, VX1, VX0 | |||||
| vpickod.d x2, VX1, VX0 | |||||
| #else | |||||
| vpickev.w x1, VX1, VX0 | vpickev.w x1, VX1, VX0 | ||||
| vpickod.w x2, VX1, VX0 | vpickod.w x2, VX1, VX0 | ||||
| vfmul.s x3, neg1, x1 | |||||
| vfmul.s x4, neg1, x2 | |||||
| vfcmp.clt.s VT0, x1, res0 | |||||
| vfcmp.clt.s VT1, x2, res0 | |||||
| vld VX0, X, 8 * SIZE | |||||
| vbitsel.v x1, x1, x3, VT0 | |||||
| vbitsel.v x2, x2, x4, VT1 | |||||
| vld VX1, X, 12 * SIZE | |||||
| vfadd.s VM1, x1, x2 | |||||
| #endif | |||||
| VFSUB x3, res0, x1 | |||||
| VFSUB x4, res0, x2 | |||||
| VFMAX x1, x1, x3 | |||||
| VFMAX x2, x2, x4 | |||||
| VFADD VM1, x1, x2 | |||||
| vld VX0, X, 32 | |||||
| vld VX1, X, 48 | |||||
| #ifdef DOUBLE | |||||
| vpickev.d x1, VX1, VX0 | |||||
| vpickod.d x2, VX1, VX0 | |||||
| #else | |||||
| vpickev.w x1, VX1, VX0 | vpickev.w x1, VX1, VX0 | ||||
| vpickod.w x2, VX1, VX0 | vpickod.w x2, VX1, VX0 | ||||
| vfmul.s x3, neg1, x1 | |||||
| vfmul.s x4, neg1, x2 | |||||
| vfcmp.clt.s VT0, x1, res0 | |||||
| vfcmp.clt.s VT1, x2, res0 | |||||
| #endif | |||||
| VFSUB x3, res0, x1 | |||||
| VFSUB x4, res0, x2 | |||||
| VFMAX x1, x1, x3 | |||||
| VFMAX x2, x2, x4 | |||||
| VFADD x1, x1, x2 | |||||
| VFMIN VM1, x1, VM1 | |||||
| VFMIN VM0, VM0, VM1 | |||||
| #ifdef DOUBLE | |||||
| vld VX0, X, 64 | |||||
| vld VX1, X, 80 | |||||
| vpickev.d x1, VX1, VX0 | |||||
| vpickod.d x2, VX1, VX0 | |||||
| VFSUB x3, res0, x1 | |||||
| VFSUB x4, res0, x2 | |||||
| VFMAX x1, x1, x3 | |||||
| VFMAX x2, x2, x4 | |||||
| VFADD VM1, x1, x2 | |||||
| vld VX0, X, 96 | |||||
| vld VX1, X, 112 | |||||
| vpickev.d x1, VX1, VX0 | |||||
| vpickod.d x2, VX1, VX0 | |||||
| VFSUB x3, res0, x1 | |||||
| VFSUB x4, res0, x2 | |||||
| VFMAX x1, x1, x3 | |||||
| VFMAX x2, x2, x4 | |||||
| VFADD x1, x1, x2 | |||||
| VFMIN VM1, x1, VM1 | |||||
| VFMIN VM0, VM0, VM1 | |||||
| #endif | |||||
| addi.d I, I, -1 | |||||
| addi.d X, X, 16 * SIZE | addi.d X, X, 16 * SIZE | ||||
| vbitsel.v x1, x1, x3, VT0 | |||||
| vbitsel.v x2, x2, x4, VT1 | |||||
| vfadd.s x1, x1, x2 | |||||
| vfmin.s VM1, x1, VM1 | |||||
| vfmin.s VM0, VM0, VM1 | |||||
| blt $r0, I, .L10 | blt $r0, I, .L10 | ||||
| .align 3 | .align 3 | ||||
| .L11: | .L11: | ||||
| #ifdef DOUBLE | |||||
| vreplvei.d x1, VM0, 0 | |||||
| vreplvei.d x2, VM0, 1 | |||||
| VFMIN VM0, x1, x2 | |||||
| #else | |||||
| vreplvei.w x1, VM0, 0 | vreplvei.w x1, VM0, 0 | ||||
| vreplvei.w x2, VM0, 1 | vreplvei.w x2, VM0, 1 | ||||
| vreplvei.w x3, VM0, 2 | vreplvei.w x3, VM0, 2 | ||||
| vreplvei.w x4, VM0, 3 | vreplvei.w x4, VM0, 3 | ||||
| vfmin.s VM1, x1, x2 | |||||
| vfmin.s VM0, x3, x4 | |||||
| vfmin.s VM0, VM0, VM1 | |||||
| VFMIN VM1, x1, x2 | |||||
| VFMIN VM0, x3, x4 | |||||
| VFMIN VM0, VM0, VM1 | |||||
| #endif | |||||
| b .L23 | b .L23 | ||||
| .align 3 | .align 3 | ||||
| @@ -124,66 +161,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .align 3 | .align 3 | ||||
| .L21: | .L21: | ||||
| fld.s t1, X, 0 * SIZE | |||||
| fld.s t2, X, 1 * SIZE | |||||
| LD t1, X, 0 * SIZE | |||||
| LD t2, X, 1 * SIZE | |||||
| add.d X, X, INCX | add.d X, X, INCX | ||||
| fld.s t3, X, 0 * SIZE | |||||
| fld.s t4, X, 1 * SIZE | |||||
| LD t3, X, 0 * SIZE | |||||
| LD t4, X, 1 * SIZE | |||||
| add.d X, X, INCX | add.d X, X, INCX | ||||
| fabs.s t1, t1 | |||||
| fabs.s t2, t2 | |||||
| fabs.s t3, t3 | |||||
| fabs.s t4, t4 | |||||
| fadd.s t1, t1, t2 | |||||
| fadd.s t3, t3, t4 | |||||
| fmin.s s1, t1, t3 | |||||
| fld.s t1, X, 0 * SIZE | |||||
| fld.s t2, X, 1 * SIZE | |||||
| FABS t1, t1 | |||||
| FABS t2, t2 | |||||
| FABS t3, t3 | |||||
| FABS t4, t4 | |||||
| ADD t1, t1, t2 | |||||
| ADD t3, t3, t4 | |||||
| FMIN s1, t1, t3 | |||||
| LD t1, X, 0 * SIZE | |||||
| LD t2, X, 1 * SIZE | |||||
| add.d X, X, INCX | add.d X, X, INCX | ||||
| fld.s t3, X, 0 * SIZE | |||||
| fld.s t4, X, 1 * SIZE | |||||
| LD t3, X, 0 * SIZE | |||||
| LD t4, X, 1 * SIZE | |||||
| add.d X, X, INCX | add.d X, X, INCX | ||||
| fabs.s t1, t1 | |||||
| fabs.s t2, t2 | |||||
| fabs.s t3, t3 | |||||
| fabs.s t4, t4 | |||||
| fadd.s t1, t1, t2 | |||||
| fadd.s t3, t3, t4 | |||||
| fmin.s s1, t1, t3 | |||||
| fld.s t1, X, 0 * SIZE | |||||
| fld.s t2, X, 1 * SIZE | |||||
| FABS t1, t1 | |||||
| FABS t2, t2 | |||||
| FABS t3, t3 | |||||
| FABS t4, t4 | |||||
| ADD t1, t1, t2 | |||||
| ADD t3, t3, t4 | |||||
| FMIN s1, t1, t3 | |||||
| LD t1, X, 0 * SIZE | |||||
| LD t2, X, 1 * SIZE | |||||
| add.d X, X, INCX | add.d X, X, INCX | ||||
| fld.s t3, X, 0 * SIZE | |||||
| fld.s t4, X, 1 * SIZE | |||||
| LD t3, X, 0 * SIZE | |||||
| LD t4, X, 1 * SIZE | |||||
| add.d X, X, INCX | add.d X, X, INCX | ||||
| fabs.s t1, t1 | |||||
| fabs.s t2, t2 | |||||
| fabs.s t3, t3 | |||||
| fabs.s t4, t4 | |||||
| FABS t1, t1 | |||||
| FABS t2, t2 | |||||
| FABS t3, t3 | |||||
| FABS t4, t4 | |||||
| addi.d I, I, -1 | addi.d I, I, -1 | ||||
| fadd.s t1, t1, t2 | |||||
| fadd.s t3, t3, t4 | |||||
| fmin.s s3, t1, t3 | |||||
| fld.s t1, X, 0 * SIZE | |||||
| fld.s t2, X, 1 * SIZE | |||||
| ADD t1, t1, t2 | |||||
| ADD t3, t3, t4 | |||||
| FMIN s3, t1, t3 | |||||
| LD t1, X, 0 * SIZE | |||||
| LD t2, X, 1 * SIZE | |||||
| add.d X, X, INCX | add.d X, X, INCX | ||||
| fld.s t3, X, 0 * SIZE | |||||
| fld.s t4, X, 1 * SIZE | |||||
| LD t3, X, 0 * SIZE | |||||
| LD t4, X, 1 * SIZE | |||||
| add.d X, X, INCX | add.d X, X, INCX | ||||
| fabs.s t1, t1 | |||||
| fabs.s t2, t2 | |||||
| fabs.s t3, t3 | |||||
| fabs.s t4, t4 | |||||
| fadd.s t1, t1, t2 | |||||
| fadd.s t3, t3, t4 | |||||
| fmin.s s4, t1, t3 | |||||
| FABS t1, t1 | |||||
| FABS t2, t2 | |||||
| FABS t3, t3 | |||||
| FABS t4, t4 | |||||
| ADD t1, t1, t2 | |||||
| ADD t3, t3, t4 | |||||
| FMIN s4, t1, t3 | |||||
| blt $r0, I, .L21 | blt $r0, I, .L21 | ||||
| .align 3 | .align 3 | ||||
| .L22: | .L22: | ||||
| fmin.s s1, s1, s2 | |||||
| fmin.s s3, s3, s4 | |||||
| fmin.s s1, s1, s3 | |||||
| FMIN s1, s1, s2 | |||||
| FMIN s3, s3, s4 | |||||
| FMIN s1, s1, s3 | |||||
| .align 3 | .align 3 | ||||
| .L23: //N<8 | .L23: //N<8 | ||||
| @@ -192,19 +229,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .align 3 | .align 3 | ||||
| .L24: | .L24: | ||||
| fld.s a0, X, 0 * SIZE | |||||
| fld.s a1, X, 1 * SIZE | |||||
| LD a0, X, 0 * SIZE | |||||
| LD a1, X, 1 * SIZE | |||||
| addi.d I, I, -1 | addi.d I, I, -1 | ||||
| fabs.s a0, a0 | |||||
| fabs.s a1, a1 | |||||
| fadd.s a0, a0, a1 | |||||
| FABS a0, a0 | |||||
| FABS a1, a1 | |||||
| ADD a0, a0, a1 | |||||
| add.d X, X, INCX | add.d X, X, INCX | ||||
| fmin.s s1, a0, s1 | |||||
| FMIN s1, a0, s1 | |||||
| blt $r0, I, .L24 | blt $r0, I, .L24 | ||||
| .align 3 | .align 3 | ||||
| .L999: | .L999: | ||||
| fmov.s $f0, $f22 | |||||
| MOV $f0, $f22 | |||||
| jirl $r0, $r1, 0x0 | jirl $r0, $r1, 0x0 | ||||
| .align 3 | .align 3 | ||||
| @@ -99,7 +99,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| b .L113 //alpha_r != 0.0 && alpha_i == 0.0 | b .L113 //alpha_r != 0.0 && alpha_i == 0.0 | ||||
| .L14: | .L14: | ||||
| bceqz $fcc1, .L112 //alpha_r == 0.0 && alpha_i != 0.0 | |||||
| bceqz $fcc1, .L114 //alpha_r == 0.0 && alpha_i != 0.0 | |||||
| b .L111 //alpha_r == 0.0 && alpha_i == 0.0 | b .L111 //alpha_r == 0.0 && alpha_i == 0.0 | ||||
| .align 3 | .align 3 | ||||
| @@ -117,38 +117,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| b .L997 | b .L997 | ||||
| .align 3 | .align 3 | ||||
| .L112: //alpha_r == 0.0 && alpha_i != 0.0 | |||||
| xvld VX0, X, 0 * SIZE | |||||
| #ifdef DOUBLE | |||||
| xvld VX1, X, 4 * SIZE | |||||
| xvpickev.d x1, VX1, VX0 | |||||
| xvpickod.d x2, VX1, VX0 | |||||
| xvfmul.d x3, VXAI, x2 | |||||
| xvfsub.d x3, VXZ, x3 | |||||
| xvfmul.d x4, VXAI, x1 | |||||
| xvilvl.d VX2, x4 ,x3 | |||||
| xvilvh.d VX3, x4, x3 | |||||
| xvst VX2, X, 0 * SIZE | |||||
| xvst VX3, X, 4 * SIZE | |||||
| addi.d X, X, 8 * SIZE | |||||
| #else | |||||
| xvld VX1, X, 8 * SIZE | |||||
| xvpickev.w x1, VX1, VX0 | |||||
| xvpickod.w x2, VX1, VX0 | |||||
| xvfmul.s x3, VXAI, x2 | |||||
| xvfsub.s x3, VXZ, x3 | |||||
| xvfmul.s x4, VXAI, x1 | |||||
| xvilvl.w VX2, x4 ,x3 | |||||
| xvilvh.w VX3, x4, x3 | |||||
| xvst VX2, X, 0 * SIZE | |||||
| xvst VX3, X, 8 * SIZE | |||||
| addi.d X, X, 16 * SIZE | |||||
| #endif | |||||
| addi.d I, I, -1 | |||||
| blt $r0, I, .L112 | |||||
| b .L997 | |||||
| .align 3 | |||||
| .L113: //alpha_r != 0.0 && alpha_i == 0.0 | .L113: //alpha_r != 0.0 && alpha_i == 0.0 | ||||
| xvld VX0, X, 0 * SIZE | xvld VX0, X, 0 * SIZE | ||||
| #ifdef DOUBLE | #ifdef DOUBLE | ||||
| @@ -227,7 +195,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| b .L223 //alpha_r != 0.0 && alpha_i == 0.0 | b .L223 //alpha_r != 0.0 && alpha_i == 0.0 | ||||
| .L24: | .L24: | ||||
| bceqz $fcc1, .L222 //alpha_r == 0.0 && alpha_i != 0.0 | |||||
| bceqz $fcc1, .L224 //alpha_r == 0.0 && alpha_i != 0.0 | |||||
| b .L221 //alpha_r == 0.0 && alpha_i == 0.0 | b .L221 //alpha_r == 0.0 && alpha_i == 0.0 | ||||
| .align 3 | .align 3 | ||||
| @@ -275,119 +243,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| b .L997 | b .L997 | ||||
| .align 3 | .align 3 | ||||
| .L222: //alpha_r == 0.0 && alpha_i != 0.0 | |||||
| #ifdef DOUBLE | |||||
| ld.d t1, X, 0 * SIZE | |||||
| ld.d t2, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| ld.d t3, X, 0 * SIZE | |||||
| ld.d t4, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| xvinsgr2vr.d x1, t1, 0 | |||||
| xvinsgr2vr.d x2, t2, 0 | |||||
| xvinsgr2vr.d x1, t3, 1 | |||||
| xvinsgr2vr.d x2, t4, 1 | |||||
| ld.d t1, X, 0 * SIZE | |||||
| ld.d t2, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| ld.d t3, X, 0 * SIZE | |||||
| ld.d t4, X, 1 * SIZE | |||||
| xvinsgr2vr.d x1, t1, 2 | |||||
| xvinsgr2vr.d x2, t2, 2 | |||||
| xvinsgr2vr.d x1, t3, 3 | |||||
| xvinsgr2vr.d x2, t4, 3 | |||||
| add.d X, X, INCX | |||||
| xvfmul.d x3, VXAI, x2 | |||||
| xvfsub.d x3, VXZ, x3 | |||||
| xvfmul.d x4, VXAI, x1 | |||||
| addi.d I, I, -1 | |||||
| xvstelm.d x3, XX, 0 * SIZE, 0 | |||||
| xvstelm.d x4, XX, 1 * SIZE, 0 | |||||
| add.d XX, XX, INCX | |||||
| xvstelm.d x3, XX, 0 * SIZE, 1 | |||||
| xvstelm.d x4, XX, 1 * SIZE, 1 | |||||
| add.d XX, XX, INCX | |||||
| xvstelm.d x3, XX, 0 * SIZE, 2 | |||||
| xvstelm.d x4, XX, 1 * SIZE, 2 | |||||
| add.d XX, XX, INCX | |||||
| xvstelm.d x3, XX, 0 * SIZE, 3 | |||||
| xvstelm.d x4, XX, 1 * SIZE, 3 | |||||
| #else | |||||
| ld.w t1, X, 0 * SIZE | |||||
| ld.w t2, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| ld.w t3, X, 0 * SIZE | |||||
| ld.w t4, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| xvinsgr2vr.w x1, t1, 0 | |||||
| xvinsgr2vr.w x2, t2, 0 | |||||
| xvinsgr2vr.w x1, t3, 1 | |||||
| xvinsgr2vr.w x2, t4, 1 | |||||
| ld.w t1, X, 0 * SIZE | |||||
| ld.w t2, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| ld.w t3, X, 0 * SIZE | |||||
| ld.w t4, X, 1 * SIZE | |||||
| xvinsgr2vr.w x1, t1, 2 | |||||
| xvinsgr2vr.w x2, t2, 2 | |||||
| xvinsgr2vr.w x1, t3, 3 | |||||
| xvinsgr2vr.w x2, t4, 3 | |||||
| add.d X, X, INCX | |||||
| ld.w t1, X, 0 * SIZE | |||||
| ld.w t2, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| ld.w t3, X, 0 * SIZE | |||||
| ld.w t4, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| xvinsgr2vr.w x1, t1, 4 | |||||
| xvinsgr2vr.w x2, t2, 4 | |||||
| xvinsgr2vr.w x1, t3, 5 | |||||
| xvinsgr2vr.w x2, t4, 5 | |||||
| ld.w t1, X, 0 * SIZE | |||||
| ld.w t2, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| ld.w t3, X, 0 * SIZE | |||||
| ld.w t4, X, 1 * SIZE | |||||
| xvinsgr2vr.w x1, t1, 6 | |||||
| xvinsgr2vr.w x2, t2, 6 | |||||
| xvinsgr2vr.w x1, t3, 7 | |||||
| xvinsgr2vr.w x2, t4, 7 | |||||
| add.d X, X, INCX | |||||
| xvfmul.s x3, VXAI, x2 | |||||
| xvfsub.s x3, VXZ, x3 | |||||
| xvfmul.s x4, VXAI, x1 | |||||
| addi.d I, I, -1 | |||||
| xvstelm.w x3, XX, 0 * SIZE, 0 | |||||
| xvstelm.w x4, XX, 1 * SIZE, 0 | |||||
| add.d XX, XX, INCX | |||||
| xvstelm.w x3, XX, 0 * SIZE, 1 | |||||
| xvstelm.w x4, XX, 1 * SIZE, 1 | |||||
| add.d XX, XX, INCX | |||||
| xvstelm.w x3, XX, 0 * SIZE, 2 | |||||
| xvstelm.w x4, XX, 1 * SIZE, 2 | |||||
| add.d XX, XX, INCX | |||||
| xvstelm.w x3, XX, 0 * SIZE, 3 | |||||
| xvstelm.w x4, XX, 1 * SIZE, 3 | |||||
| add.d XX, XX, INCX | |||||
| xvstelm.w x3, XX, 0 * SIZE, 4 | |||||
| xvstelm.w x4, XX, 1 * SIZE, 4 | |||||
| add.d XX, XX, INCX | |||||
| xvstelm.w x3, XX, 0 * SIZE, 5 | |||||
| xvstelm.w x4, XX, 1 * SIZE, 5 | |||||
| add.d XX, XX, INCX | |||||
| xvstelm.w x3, XX, 0 * SIZE, 6 | |||||
| xvstelm.w x4, XX, 1 * SIZE, 6 | |||||
| add.d XX, XX, INCX | |||||
| xvstelm.w x3, XX, 0 * SIZE, 7 | |||||
| xvstelm.w x4, XX, 1 * SIZE, 7 | |||||
| #endif | |||||
| add.d XX, XX, INCX | |||||
| blt $r0, I, .L222 | |||||
| b .L997 | |||||
| .align 3 | |||||
| .L223: //alpha_r != 0.0 && alpha_i == 0.0 | .L223: //alpha_r != 0.0 && alpha_i == 0.0 | ||||
| #ifdef DOUBLE | #ifdef DOUBLE | ||||
| ld.d t1, X, 0 * SIZE | ld.d t1, X, 0 * SIZE | ||||
| @@ -97,7 +97,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| b .L113 //alpha_r != 0.0 && alpha_i == 0.0 | b .L113 //alpha_r != 0.0 && alpha_i == 0.0 | ||||
| .L14: | .L14: | ||||
| bceqz $fcc1, .L112 //alpha_r == 0.0 && alpha_i != 0.0 | |||||
| bceqz $fcc1, .L114 //alpha_r == 0.0 && alpha_i != 0.0 | |||||
| b .L111 //alpha_r == 0.0 && alpha_i == 0.0 | b .L111 //alpha_r == 0.0 && alpha_i == 0.0 | ||||
| .align 3 | .align 3 | ||||
| @@ -116,48 +116,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| b .L997 | b .L997 | ||||
| .align 3 | .align 3 | ||||
| .L112: //alpha_r == 0.0 && alpha_i != 0.0 | |||||
| vld VX0, X, 0 * SIZE | |||||
| #ifdef DOUBLE | |||||
| vld VX1, X, 2 * SIZE | |||||
| vpickev.d x1, VX1, VX0 | |||||
| vpickod.d x2, VX1, VX0 | |||||
| vfmul.d x3, VXAI, x2 | |||||
| vfsub.d x3, VXZ, x3 | |||||
| vfmul.d x4, VXAI, x1 | |||||
| vilvl.d VX2, x4 ,x3 | |||||
| vilvh.d VX3, x4, x3 | |||||
| vst VX2, X, 0 * SIZE | |||||
| vst VX3, X, 2 * SIZE | |||||
| vld VX0, X, 4 * SIZE | |||||
| vld VX1, X, 6 * SIZE | |||||
| vpickev.d x1, VX1, VX0 | |||||
| vpickod.d x2, VX1, VX0 | |||||
| vfmul.d x3, VXAI, x2 | |||||
| vfsub.d x3, VXZ, x3 | |||||
| vfmul.d x4, VXAI, x1 | |||||
| vilvl.d VX2, x4 ,x3 | |||||
| vilvh.d VX3, x4, x3 | |||||
| vst VX2, X, 4 * SIZE | |||||
| vst VX3, X, 6 * SIZE | |||||
| #else | |||||
| vld VX1, X, 4 * SIZE | |||||
| vpickev.w x1, VX1, VX0 | |||||
| vpickod.w x2, VX1, VX0 | |||||
| vfmul.s x3, VXAI, x2 | |||||
| vfsub.s x3, VXZ, x3 | |||||
| vfmul.s x4, VXAI, x1 | |||||
| vilvl.w VX2, x4 ,x3 | |||||
| vilvh.w VX3, x4, x3 | |||||
| vst VX2, X, 0 * SIZE | |||||
| vst VX3, X, 4 * SIZE | |||||
| #endif | |||||
| addi.d X, X, 8 * SIZE | |||||
| addi.d I, I, -1 | |||||
| blt $r0, I, .L112 | |||||
| b .L997 | |||||
| .align 3 | |||||
| .L113: //alpha_r != 0.0 && alpha_i == 0.0 | .L113: //alpha_r != 0.0 && alpha_i == 0.0 | ||||
| vld VX0, X, 0 * SIZE | vld VX0, X, 0 * SIZE | ||||
| #ifdef DOUBLE | #ifdef DOUBLE | ||||
| @@ -256,7 +214,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| b .L223 //alpha_r != 0.0 && alpha_i == 0.0 | b .L223 //alpha_r != 0.0 && alpha_i == 0.0 | ||||
| .L24: | .L24: | ||||
| bceqz $fcc1, .L222 //alpha_r == 0.0 && alpha_i != 0.0 | |||||
| bceqz $fcc1, .L224 //alpha_r == 0.0 && alpha_i != 0.0 | |||||
| b .L221 //alpha_r == 0.0 && alpha_i == 0.0 | b .L221 //alpha_r == 0.0 && alpha_i == 0.0 | ||||
| .align 3 | .align 3 | ||||
| @@ -292,90 +250,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| b .L997 | b .L997 | ||||
| .align 3 | .align 3 | ||||
| .L222: //alpha_r == 0.0 && alpha_i != 0.0 | |||||
| #ifdef DOUBLE | |||||
| ld.d t1, X, 0 * SIZE | |||||
| ld.d t2, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| ld.d t3, X, 0 * SIZE | |||||
| ld.d t4, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| vinsgr2vr.d x1, t1, 0 | |||||
| vinsgr2vr.d x2, t2, 0 | |||||
| vinsgr2vr.d x1, t3, 1 | |||||
| vinsgr2vr.d x2, t4, 1 | |||||
| vfmul.d x3, VXAI, x2 | |||||
| vfsub.d x3, VXZ, x3 | |||||
| vfmul.d x4, VXAI, x1 | |||||
| vstelm.d x3, XX, 0 * SIZE, 0 | |||||
| vstelm.d x4, XX, 1 * SIZE, 0 | |||||
| add.d XX, XX, INCX | |||||
| vstelm.d x3, XX, 0 * SIZE, 1 | |||||
| vstelm.d x4, XX, 1 * SIZE, 1 | |||||
| add.d XX, XX, INCX | |||||
| ld.d t1, X, 0 * SIZE | |||||
| ld.d t2, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| ld.d t3, X, 0 * SIZE | |||||
| ld.d t4, X, 1 * SIZE | |||||
| vinsgr2vr.d x1, t1, 0 | |||||
| vinsgr2vr.d x2, t2, 0 | |||||
| vinsgr2vr.d x1, t3, 1 | |||||
| vinsgr2vr.d x2, t4, 1 | |||||
| add.d X, X, INCX | |||||
| vfmul.d x3, VXAI, x2 | |||||
| vfsub.d x3, VXZ, x3 | |||||
| vfmul.d x4, VXAI, x1 | |||||
| addi.d I, I, -1 | |||||
| vstelm.d x3, XX, 0 * SIZE, 0 | |||||
| vstelm.d x4, XX, 1 * SIZE, 0 | |||||
| add.d XX, XX, INCX | |||||
| vstelm.d x3, XX, 0 * SIZE, 1 | |||||
| vstelm.d x4, XX, 1 * SIZE, 1 | |||||
| #else | |||||
| ld.w t1, X, 0 * SIZE | |||||
| ld.w t2, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| ld.w t3, X, 0 * SIZE | |||||
| ld.w t4, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| vinsgr2vr.w x1, t1, 0 | |||||
| vinsgr2vr.w x2, t2, 0 | |||||
| vinsgr2vr.w x1, t3, 1 | |||||
| vinsgr2vr.w x2, t4, 1 | |||||
| ld.w t1, X, 0 * SIZE | |||||
| ld.w t2, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| ld.w t3, X, 0 * SIZE | |||||
| ld.w t4, X, 1 * SIZE | |||||
| vinsgr2vr.w x1, t1, 2 | |||||
| vinsgr2vr.w x2, t2, 2 | |||||
| vinsgr2vr.w x1, t3, 3 | |||||
| vinsgr2vr.w x2, t4, 3 | |||||
| add.d X, X, INCX | |||||
| vfmul.s x3, VXAI, x2 | |||||
| vfsub.s x3, VXZ, x3 | |||||
| vfmul.s x4, VXAI, x1 | |||||
| addi.d I, I, -1 | |||||
| vstelm.w x3, XX, 0 * SIZE, 0 | |||||
| vstelm.w x4, XX, 1 * SIZE, 0 | |||||
| add.d XX, XX, INCX | |||||
| vstelm.w x3, XX, 0 * SIZE, 1 | |||||
| vstelm.w x4, XX, 1 * SIZE, 1 | |||||
| add.d XX, XX, INCX | |||||
| vstelm.w x3, XX, 0 * SIZE, 2 | |||||
| vstelm.w x4, XX, 1 * SIZE, 2 | |||||
| add.d XX, XX, INCX | |||||
| vstelm.w x3, XX, 0 * SIZE, 3 | |||||
| vstelm.w x4, XX, 1 * SIZE, 3 | |||||
| #endif | |||||
| add.d XX, XX, INCX | |||||
| blt $r0, I, .L222 | |||||
| b .L997 | |||||
| .align 3 | |||||
| .L223: //alpha_r != 0.0 && alpha_i == 0.0 | .L223: //alpha_r != 0.0 && alpha_i == 0.0 | ||||
| #ifdef DOUBLE | #ifdef DOUBLE | ||||
| ld.d t1, X, 0 * SIZE | ld.d t1, X, 0 * SIZE | ||||
| @@ -69,16 +69,16 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha , FLOAT *x ) | |||||
| for( i=0; i<n; i+=4 ) | for( i=0; i<n; i+=4 ) | ||||
| { | { | ||||
| t0 = da_r *x[0] - da_i *x[1]; | |||||
| t1 = da_r *x[2] - da_i *x[3]; | |||||
| t2 = da_r *x[4] - da_i *x[5]; | |||||
| t3 = da_r *x[6] - da_i *x[7]; | |||||
| t0 = da_r *x[0] - da_i *x[1]; | |||||
| t1 = da_r *x[2] - da_i *x[3]; | |||||
| t2 = da_r *x[4] - da_i *x[5]; | |||||
| t3 = da_r *x[6] - da_i *x[7]; | |||||
| x[1] = da_r * x[1] + da_i * x[0]; | x[1] = da_r * x[1] + da_i * x[0]; | ||||
| x[3] = da_r * x[3] + da_i * x[2]; | x[3] = da_r * x[3] + da_i * x[2]; | ||||
| x[5] = da_r * x[5] + da_i * x[4]; | x[5] = da_r * x[5] + da_i * x[4]; | ||||
| x[7] = da_r * x[7] + da_i * x[6]; | x[7] = da_r * x[7] + da_i * x[6]; | ||||
| x[0] = t0; | x[0] = t0; | ||||
| x[2] = t1; | x[2] = t1; | ||||
| x[4] = t2; | x[4] = t2; | ||||
| @@ -99,16 +99,16 @@ static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha , FLOAT *x ) | |||||
| for( i=0; i<n; i+=4 ) | for( i=0; i<n; i+=4 ) | ||||
| { | { | ||||
| t0 = - da_i *x[1]; | |||||
| t1 = - da_i *x[3]; | |||||
| t2 = - da_i *x[5]; | |||||
| t3 = - da_i *x[7]; | |||||
| t0 = - da_i *x[1]; | |||||
| t1 = - da_i *x[3]; | |||||
| t2 = - da_i *x[5]; | |||||
| t3 = - da_i *x[7]; | |||||
| x[1] = da_i * x[0]; | x[1] = da_i * x[0]; | ||||
| x[3] = da_i * x[2]; | x[3] = da_i * x[2]; | ||||
| x[5] = da_i * x[4]; | x[5] = da_i * x[4]; | ||||
| x[7] = da_i * x[6]; | x[7] = da_i * x[6]; | ||||
| x[0] = t0; | x[0] = t0; | ||||
| x[2] = t1; | x[2] = t1; | ||||
| x[4] = t2; | x[4] = t2; | ||||
| @@ -129,16 +129,16 @@ static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha , FLOAT *x ) | |||||
| for( i=0; i<n; i+=4 ) | for( i=0; i<n; i+=4 ) | ||||
| { | { | ||||
| t0 = da_r *x[0]; | |||||
| t1 = da_r *x[2]; | |||||
| t2 = da_r *x[4]; | |||||
| t3 = da_r *x[6]; | |||||
| t0 = da_r *x[0]; | |||||
| t1 = da_r *x[2]; | |||||
| t2 = da_r *x[4]; | |||||
| t3 = da_r *x[6]; | |||||
| x[1] = da_r * x[1]; | x[1] = da_r * x[1]; | ||||
| x[3] = da_r * x[3]; | x[3] = da_r * x[3]; | ||||
| x[5] = da_r * x[5]; | x[5] = da_r * x[5]; | ||||
| x[7] = da_r * x[7]; | x[7] = da_r * x[7]; | ||||
| x[0] = t0; | x[0] = t0; | ||||
| x[2] = t1; | x[2] = t1; | ||||
| x[4] = t2; | x[4] = t2; | ||||
| @@ -157,14 +157,14 @@ static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha , FLOAT *x ) | |||||
| BLASLONG i; | BLASLONG i; | ||||
| for( i=0; i<n; i+=4 ) | for( i=0; i<n; i+=4 ) | ||||
| { | { | ||||
| x[0] = 0.0; | |||||
| x[1] = 0.0; | |||||
| x[2] = 0.0; | |||||
| x[3] = 0.0; | |||||
| x[4] = 0.0; | |||||
| x[5] = 0.0; | |||||
| x[6] = 0.0; | |||||
| x[7] = 0.0; | |||||
| x[0] = 0.0; | |||||
| x[1] = 0.0; | |||||
| x[2] = 0.0; | |||||
| x[3] = 0.0; | |||||
| x[4] = 0.0; | |||||
| x[5] = 0.0; | |||||
| x[6] = 0.0; | |||||
| x[7] = 0.0; | |||||
| x+=8; | x+=8; | ||||
| } | } | ||||
| @@ -186,10 +186,10 @@ static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_ | |||||
| for ( i=0; i<n; i+=4 ) | for ( i=0; i<n; i+=4 ) | ||||
| { | { | ||||
| t0 = da_r * x[0] - da_i *x[1]; | |||||
| t1 = da_r * x[inc_x] - da_i *x[inc_x + 1]; | |||||
| t2 = da_r * x[inc_x2] - da_i *x[inc_x2 + 1]; | |||||
| t3 = da_r * x[inc_x3] - da_i *x[inc_x3 + 1]; | |||||
| t0 = da_r * x[0] - da_i *x[1]; | |||||
| t1 = da_r * x[inc_x] - da_i *x[inc_x + 1]; | |||||
| t2 = da_r * x[inc_x2] - da_i *x[inc_x2 + 1]; | |||||
| t3 = da_r * x[inc_x3] - da_i *x[inc_x3 + 1]; | |||||
| x[1] = da_i * x[0] + da_r * x[1]; | x[1] = da_i * x[0] + da_r * x[1]; | ||||
| x[inc_x +1] = da_i * x[inc_x] + da_r * x[inc_x +1]; | x[inc_x +1] = da_i * x[inc_x] + da_r * x[inc_x +1]; | ||||
| @@ -228,7 +228,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||||
| { | { | ||||
| while(j < n1) | while(j < n1) | ||||
| { | { | ||||
| x[i]=0.0; | x[i]=0.0; | ||||
| x[i+1]=0.0; | x[i+1]=0.0; | ||||
| x[i+inc_x]=0.0; | x[i+inc_x]=0.0; | ||||
| @@ -240,7 +240,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||||
| while(j < n) | while(j < n) | ||||
| { | { | ||||
| x[i]=0.0; | x[i]=0.0; | ||||
| x[i+1]=0.0; | x[i+1]=0.0; | ||||
| i += inc_x ; | i += inc_x ; | ||||
| @@ -253,11 +253,17 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||||
| { | { | ||||
| while(j < n1) | while(j < n1) | ||||
| { | { | ||||
| temp0 = -da_i * x[i+1]; | |||||
| if (isnan(x[i]) || isinf(x[i])) | |||||
| temp0 = NAN; | |||||
| else | |||||
| temp0 = -da_i * x[i+1]; | |||||
| x[i+1] = da_i * x[i]; | x[i+1] = da_i * x[i]; | ||||
| x[i] = temp0; | x[i] = temp0; | ||||
| temp1 = -da_i * x[i+1+inc_x]; | |||||
| if (isnan(x[i+inc_x]) || isinf(x[i+inc_x])) | |||||
| temp1 = NAN; | |||||
| else | |||||
| temp1 = -da_i * x[i+1+inc_x]; | |||||
| x[i+1+inc_x] = da_i * x[i+inc_x]; | x[i+1+inc_x] = da_i * x[i+inc_x]; | ||||
| x[i+inc_x] = temp1; | x[i+inc_x] = temp1; | ||||
| i += 2*inc_x ; | i += 2*inc_x ; | ||||
| @@ -267,8 +273,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||||
| while(j < n) | while(j < n) | ||||
| { | { | ||||
| temp0 = -da_i * x[i+1]; | |||||
| if (isnan(x[i]) || isinf(x[i])) | |||||
| temp0 = NAN; | |||||
| else | |||||
| temp0 = -da_i * x[i+1]; | |||||
| x[i+1] = da_i * x[i]; | x[i+1] = da_i * x[i]; | ||||
| x[i] = temp0; | x[i] = temp0; | ||||
| i += inc_x ; | i += inc_x ; | ||||
| @@ -291,7 +300,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||||
| while(j < n1) | while(j < n1) | ||||
| { | { | ||||
| temp0 = da_r * x[i]; | temp0 = da_r * x[i]; | ||||
| x[i+1] = da_r * x[i+1]; | x[i+1] = da_r * x[i+1]; | ||||
| x[i] = temp0; | x[i] = temp0; | ||||
| @@ -305,7 +314,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||||
| while(j < n) | while(j < n) | ||||
| { | { | ||||
| temp0 = da_r * x[i]; | temp0 = da_r * x[i]; | ||||
| x[i+1] = da_r * x[i+1]; | x[i+1] = da_r * x[i+1]; | ||||
| x[i] = temp0; | x[i] = temp0; | ||||
| @@ -368,7 +377,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||||
| } | } | ||||
| i = n1 << 1; | i = n1 << 1; | ||||
| j = n1; | j = n1; | ||||
| if ( da_r == 0.0 || da_r != da_r ) | if ( da_r == 0.0 || da_r != da_r ) | ||||
| { | { | ||||
| if ( da_i == 0.0 ) | if ( da_i == 0.0 ) | ||||
| @@ -385,7 +394,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||||
| } | } | ||||
| } | } | ||||
| else if (da_r < -FLT_MAX || da_r > FLT_MAX) { | |||||
| else if (da_r < -FLT_MAX || da_r > FLT_MAX) { | |||||
| while(j < n) | while(j < n) | ||||
| { | { | ||||
| x[i]= NAN; | x[i]= NAN; | ||||
| @@ -404,7 +413,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||||
| if (x[i] < -FLT_MAX || x[i] > FLT_MAX) | if (x[i] < -FLT_MAX || x[i] > FLT_MAX) | ||||
| temp0 = NAN; | temp0 = NAN; | ||||
| x[i+1] = da_i * x[i]; | x[i+1] = da_i * x[i]; | ||||
| if ( x[i] == x[i]) //preserve NaN | |||||
| if ( x[i] == x[i]) //preserve NaN | |||||
| x[i] = temp0; | x[i] = temp0; | ||||
| i += 2 ; | i += 2 ; | ||||
| j++; | j++; | ||||
| @@ -420,7 +429,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||||
| { | { | ||||
| while(j < n) | while(j < n) | ||||
| { | { | ||||
| temp0 = da_r * x[i]; | temp0 = da_r * x[i]; | ||||
| x[i+1] = da_r * x[i+1]; | x[i+1] = da_r * x[i+1]; | ||||
| x[i] = temp0; | x[i] = temp0; | ||||
| @@ -442,7 +451,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||||
| } | } | ||||
| } | |||||
| } | |||||
| } | } | ||||
| @@ -16,6 +16,7 @@ else () | |||||
| test_dnrm2.c | test_dnrm2.c | ||||
| test_swap.c | test_swap.c | ||||
| test_zscal.c | test_zscal.c | ||||
| test_amin.c | |||||
| ) | ) | ||||
| endif () | endif () | ||||
| @@ -11,7 +11,8 @@ UTESTBIN=openblas_utest | |||||
| include $(TOPDIR)/Makefile.system | include $(TOPDIR)/Makefile.system | ||||
| OBJS=utest_main.o test_min.o test_amax.o test_ismin.o test_rotmg.o test_axpy.o test_dotu.o test_dsdot.o test_swap.o test_rot.o test_dnrm2.o test_zscal.o | |||||
| OBJS=utest_main.o test_min.o test_amax.o test_ismin.o test_rotmg.o test_axpy.o test_dotu.o test_dsdot.o test_swap.o test_rot.o test_dnrm2.o test_zscal.o \ | |||||
| test_amin.o | |||||
| #test_rot.o test_swap.o test_axpy.o test_dotu.o test_dsdot.o test_fork.o | #test_rot.o test_swap.o test_axpy.o test_dotu.o test_dsdot.o test_fork.o | ||||
| ifneq ($(NO_LAPACK), 1) | ifneq ($(NO_LAPACK), 1) | ||||
| @@ -1,5 +1,5 @@ | |||||
| /***************************************************************************** | /***************************************************************************** | ||||
| Copyright (c) 2011-2016, The OpenBLAS Project | |||||
| Copyright (c) 2011-2024, The OpenBLAS Project | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| @@ -13,9 +13,9 @@ met: | |||||
| notice, this list of conditions and the following disclaimer in | notice, this list of conditions and the following disclaimer in | ||||
| the documentation and/or other materials provided with the | the documentation and/or other materials provided with the | ||||
| distribution. | distribution. | ||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written | |||||
| permission. | permission. | ||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | ||||
| @@ -57,4 +57,31 @@ CTEST(amax, damax){ | |||||
| ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), DOUBLE_EPS); | ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), DOUBLE_EPS); | ||||
| } | } | ||||
| #endif | #endif | ||||
| #ifdef BUILD_COMPLEX | |||||
| CTEST(amax, scamax){ | |||||
| blasint N = 9, inc = 1; | |||||
| float te_max = 0.0, tr_max = 0.0; | |||||
| float x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8, | |||||
| -9.9, 10.10, -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, | |||||
| -7.7, 8.8 }; | |||||
| te_max = BLASFUNC(scamax)(&N, x, &inc); | |||||
| tr_max = 20.0; | |||||
| ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), SINGLE_EPS); | |||||
| } | |||||
| #endif | |||||
| #ifdef BUILD_COMPLEX16 | |||||
| CTEST(amax, dzamax){ | |||||
| blasint N = 9, inc = 1; | |||||
| double te_max = 0.0, tr_max = 0.0; | |||||
| double x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8, | |||||
| -9.9, 10.10, -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, | |||||
| -7.7, 8.8 }; | |||||
| te_max = BLASFUNC(dzamax)(&N, x, &inc); | |||||
| tr_max = 20.0; | |||||
| ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), DOUBLE_EPS); | |||||
| } | |||||
| #endif | |||||
| @@ -0,0 +1,89 @@ | |||||
| /***************************************************************************** | |||||
| Copyright (c) 2011-2024, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written | |||||
| permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| **********************************************************************************/ | |||||
| #include "openblas_utest.h" | |||||
| #ifdef BUILD_SINGLE | |||||
| CTEST(amin, samin){ | |||||
| blasint N = 3, inc = 1; | |||||
| float te_min = 0.0, tr_min = 0.0; | |||||
| float x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8, | |||||
| -9.9 }; | |||||
| te_min = BLASFUNC(samin)(&N, x, &inc); | |||||
| tr_min = 1.1; | |||||
| ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), SINGLE_EPS); | |||||
| } | |||||
| #endif | |||||
| #ifdef BUILD_DOUBLE | |||||
| CTEST(amin, damin){ | |||||
| blasint N = 3, inc = 1; | |||||
| double te_min = 0.0, tr_min = 0.0; | |||||
| double x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8, | |||||
| -9.9 }; | |||||
| te_min = BLASFUNC(damin)(&N, x, &inc); | |||||
| tr_min = 1.1; | |||||
| ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), DOUBLE_EPS); | |||||
| } | |||||
| #endif | |||||
| #ifdef BUILD_COMPLEX | |||||
| CTEST(amin, scamin){ | |||||
| blasint N = 9, inc = 1; | |||||
| float te_min = 0.0, tr_min = 0.0; | |||||
| float x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8, | |||||
| -9.9, 10.10, -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, | |||||
| -7.7, 8.8 }; | |||||
| te_min = BLASFUNC(scamin)(&N, x, &inc); | |||||
| tr_min = 3.3; | |||||
| ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), SINGLE_EPS); | |||||
| } | |||||
| #endif | |||||
| #ifdef BUILD_COMPLEX16 | |||||
| CTEST(amin, dzamin){ | |||||
| blasint N = 9, inc = 1; | |||||
| double te_min = 0.0, tr_min = 0.0; | |||||
| double x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8, | |||||
| -9.9, 10.10, -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, | |||||
| -7.7, 8.8 }; | |||||
| te_min = BLASFUNC(dzamin)(&N, x, &inc); | |||||
| tr_min = 3.3; | |||||
| ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), DOUBLE_EPS); | |||||
| } | |||||
| #endif | |||||
| @@ -20,6 +20,18 @@ CTEST(zscal, i_nan) | |||||
| ASSERT_TRUE(isnan(nan[17])); | ASSERT_TRUE(isnan(nan[17])); | ||||
| } | } | ||||
| CTEST(zscal, i_nan_inc_2) | |||||
| { | |||||
| double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; | |||||
| double nan[] = {NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, | |||||
| NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0}; | |||||
| cblas_zscal(9, i, &nan, 2); | |||||
| ASSERT_TRUE(isnan(nan[0])); | |||||
| ASSERT_TRUE(isnan(nan[1])); | |||||
| ASSERT_TRUE(isnan(nan[16])); | |||||
| ASSERT_TRUE(isnan(nan[17])); | |||||
| } | |||||
| CTEST(zscal, nan_i) | CTEST(zscal, nan_i) | ||||
| { | { | ||||
| double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; | double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; | ||||
| @@ -30,7 +42,19 @@ CTEST(zscal, nan_i) | |||||
| ASSERT_TRUE(isnan(i[16])); | ASSERT_TRUE(isnan(i[16])); | ||||
| ASSERT_TRUE(isnan(i[17])); | ASSERT_TRUE(isnan(i[17])); | ||||
| } | } | ||||
| CTEST(zscal, nan_i_inc_2) | |||||
| { | |||||
| double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, | |||||
| 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; | |||||
| double nan[] = {NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0}; | |||||
| cblas_zscal(9, &nan, &i, 2); | |||||
| ASSERT_TRUE(isnan(i[0])); | |||||
| ASSERT_TRUE(isnan(i[1])); | |||||
| ASSERT_TRUE(isnan(i[16])); | |||||
| ASSERT_TRUE(isnan(i[17])); | |||||
| } | |||||
| CTEST(zscal, i_inf) | CTEST(zscal, i_inf) | ||||
| { | { | ||||
| double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; | double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; | ||||
| @@ -40,7 +64,19 @@ CTEST(zscal, i_inf) | |||||
| ASSERT_TRUE(isinf(inf[1])); | ASSERT_TRUE(isinf(inf[1])); | ||||
| ASSERT_TRUE(isnan(inf[16])); | ASSERT_TRUE(isnan(inf[16])); | ||||
| ASSERT_TRUE(isinf(inf[17])); | ASSERT_TRUE(isinf(inf[17])); | ||||
| } | |||||
| } | |||||
| CTEST(zscal, i_inf_inc_2) | |||||
| { | |||||
| double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; | |||||
| double inf[] = {INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, | |||||
| INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0}; | |||||
| cblas_zscal(9, i, &inf, 2); | |||||
| ASSERT_TRUE(isnan(inf[0])); | |||||
| ASSERT_TRUE(isinf(inf[1])); | |||||
| ASSERT_TRUE(isnan(inf[16])); | |||||
| ASSERT_TRUE(isinf(inf[17])); | |||||
| } | |||||
| CTEST(zscal, inf_i) | CTEST(zscal, inf_i) | ||||
| { | { | ||||
| @@ -53,4 +89,16 @@ CTEST(zscal, inf_i) | |||||
| ASSERT_TRUE(isinf(i[17])); | ASSERT_TRUE(isinf(i[17])); | ||||
| } | } | ||||
| CTEST(zscal, inf_i_inc_2) | |||||
| { | |||||
| double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, | |||||
| 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; | |||||
| double inf[] = {INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0}; | |||||
| cblas_zscal(9, &inf, &i, 2); | |||||
| ASSERT_TRUE(isnan(i[0])); | |||||
| ASSERT_TRUE(isinf(i[1])); | |||||
| ASSERT_TRUE(isnan(i[16])); | |||||
| ASSERT_TRUE(isinf(i[17])); | |||||
| } | |||||
| #endif | #endif | ||||