| @@ -101,6 +101,16 @@ CBLAS_INDEX cblas_idamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPE | |||
| CBLAS_INDEX cblas_icamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||
| CBLAS_INDEX cblas_izamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||
| float cblas_samax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); | |||
| double cblas_damax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); | |||
| float cblas_scamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||
| double cblas_dzamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||
| float cblas_samin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); | |||
| double cblas_damin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); | |||
| float cblas_scamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||
| double cblas_dzamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||
| CBLAS_INDEX cblas_ismax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); | |||
| CBLAS_INDEX cblas_idmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); | |||
| CBLAS_INDEX cblas_icmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||
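The hunk above adds C prototypes for the BLAS extension routines ?amax/?amin (value of the largest/smallest absolute element; for complex vectors the reduction is over |Re(x_i)| + |Im(x_i)|, as the unit tests further down also assume) and i?max (index of the largest element). A minimal usage sketch, assuming a build of this branch installed with its cblas.h and linked as usual (e.g. -lopenblas):

#include <stdio.h>
#include <cblas.h>

int main(void) {
    float x[]  = { -1.1f, 2.2f, -3.3f, 4.4f };   /* four real elements   */
    float cx[] = { -1.1f, 2.2f, -3.3f, 4.4f };   /* two complex elements */

    float amax  = cblas_samax(4, x, 1);    /* largest  |x_i|      -> 4.4 */
    float amin  = cblas_samin(4, x, 1);    /* smallest |x_i|      -> 1.1 */
    float camax = cblas_scamax(2, cx, 1);  /* largest  |Re|+|Im|  -> 7.7 */
    float camin = cblas_scamin(2, cx, 1);  /* smallest |Re|+|Im|  -> 3.3 */

    printf("%g %g %g %g\n", amax, amin, camax, camin);
    return 0;
}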
| @@ -116,6 +126,9 @@ void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS | |||
| void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); | |||
| void cblas_zaxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); | |||
| void cblas_caxpyc(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); | |||
| void cblas_zaxpyc(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); | |||
| void cblas_scopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy); | |||
| void cblas_dcopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy); | |||
| void cblas_ccopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); | |||
| @@ -130,6 +130,8 @@ endif () | |||
| foreach (float_type ${FLOAT_TYPES}) | |||
| if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") | |||
| GenerateNamedObjects("zaxpy.c" "" "axpyc" ${CBLAS_FLAG} "" "" false ${float_type}) | |||
| GenerateNamedObjects("zger.c" "" "geru" ${CBLAS_FLAG} "" "" false ${float_type}) | |||
| GenerateNamedObjects("zger.c" "CONJ" "gerc" ${CBLAS_FLAG} "" "" false ${float_type}) | |||
| GenerateNamedObjects("zdot.c" "CONJ" "dotc" ${CBLAS_FLAG} "" "" false ${float_type}) | |||
| @@ -270,7 +270,8 @@ CSBLAS1OBJS = \ | |||
| cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \ | |||
| cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \ | |||
| cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) \ | |||
| cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) cblas_ssum.$(SUFFIX) | |||
| cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) cblas_ssum.$(SUFFIX) cblas_samax.$(SUFFIX) \ | |||
| cblas_samin.$(SUFFIX) | |||
| CSBLAS2OBJS = \ | |||
| cblas_sgemv.$(SUFFIX) cblas_sger.$(SUFFIX) cblas_ssymv.$(SUFFIX) cblas_strmv.$(SUFFIX) \ | |||
| @@ -295,7 +296,8 @@ CDBLAS1OBJS = \ | |||
| cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \ | |||
| cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \ | |||
| cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) \ | |||
| cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) cblas_dsum.$(SUFFIX) | |||
| cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) cblas_dsum.$(SUFFIX) cblas_damax.$(SUFFIX) \ | |||
| cblas_damin.$(SUFFIX) | |||
| CDBLAS2OBJS = \ | |||
| cblas_dgemv.$(SUFFIX) cblas_dger.$(SUFFIX) cblas_dsymv.$(SUFFIX) cblas_dtrmv.$(SUFFIX) \ | |||
| @@ -315,7 +317,7 @@ CCBLAS1OBJS = \ | |||
| cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \ | |||
| cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \ | |||
| cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \ | |||
| cblas_caxpby.$(SUFFIX) \ | |||
| cblas_caxpby.$(SUFFIX) cblas_scamax.$(SUFFIX) cblas_caxpyc.$(SUFFIX) cblas_scamin.$(SUFFIX) \ | |||
| cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX) cblas_csrot.$(SUFFIX) cblas_crotg.$(SUFFIX) | |||
| CCBLAS2OBJS = \ | |||
| @@ -340,12 +342,12 @@ CXERBLAOBJ = \ | |||
| CZBLAS1OBJS = \ | |||
| cblas_izamax.$(SUFFIX) cblas_izamin.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \ | |||
| cblas_zcopy.$(SUFFIX) \ | |||
| cblas_zcopy.$(SUFFIX) cblas_dzamax.$(SUFFIX) cblas_dzamin.$(SUFFIX) \ | |||
| cblas_zdotc.$(SUFFIX) cblas_zdotu.$(SUFFIX) \ | |||
| cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \ | |||
| cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \ | |||
| cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \ | |||
| cblas_zaxpby.$(SUFFIX) \ | |||
| cblas_zaxpby.$(SUFFIX) cblas_zaxpyc.$(SUFFIX) \ | |||
| cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX) cblas_zdrot.$(SUFFIX) cblas_zrotg.$(SUFFIX) | |||
| @@ -1533,6 +1535,30 @@ cblas_icmin.$(SUFFIX) cblas_icmin.$(PSUFFIX) : imax.c | |||
| cblas_izmin.$(SUFFIX) cblas_izmin.$(PSUFFIX) : imax.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F) | |||
| cblas_samax.$(SUFFIX) cblas_samax.$(PSUFFIX) : max.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) | |||
| cblas_damax.$(SUFFIX) cblas_damax.$(PSUFFIX) : max.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) | |||
| cblas_scamax.$(SUFFIX) cblas_scamax.$(PSUFFIX) : max.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) | |||
| cblas_dzamax.$(SUFFIX) cblas_dzamax.$(PSUFFIX) : max.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) | |||
| cblas_samin.$(SUFFIX) cblas_samin.$(PSUFFIX) : max.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) | |||
| cblas_damin.$(SUFFIX) cblas_damin.$(PSUFFIX) : max.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) | |||
| cblas_scamin.$(SUFFIX) cblas_scamin.$(PSUFFIX) : max.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) | |||
| cblas_dzamin.$(SUFFIX) cblas_dzamin.$(PSUFFIX) : max.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) | |||
| cblas_sasum.$(SUFFIX) cblas_sasum.$(PSUFFIX) : asum.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||
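All of the new objects above are built from the single shared max.c source in this directory; the -DUSE_ABS / -DUSE_MIN combination selects which of the four reductions the translation unit implements (max, min, amax, amin), and -DCBLAS adds the CBLAS entry point on top of the Fortran-style one. Schematically, the pattern looks like the following sketch (illustrative only, not the literal contents of max.c):

#include <math.h>

/* illustrative only: one source fanning out into max/min/amax/amin via macros */
#ifdef USE_ABS
#define PREPROCESS(v) fabs(v)     /* amax / amin compare absolute values */
#else
#define PREPROCESS(v) (v)         /* max / min compare signed values     */
#endif

#ifdef USE_MIN
#define BETTER(a, b) ((a) < (b))  /* keep the smaller candidate          */
#else
#define BETTER(a, b) ((a) > (b))  /* keep the larger candidate           */
#endif

double reduce(const double *x, int n, int incx) {
    double best = PREPROCESS(x[0]);
    for (int i = 1; i < n; i++) {
        double v = PREPROCESS(x[i * incx]);
        if (BETTER(v, best)) best = v;
    }
    return best;
}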
| @@ -1627,6 +1653,19 @@ cblas_daxpy.$(SUFFIX) cblas_daxpy.$(PSUFFIX) : axpy.c | |||
| cblas_caxpy.$(SUFFIX) cblas_caxpy.$(PSUFFIX) : zaxpy.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||
| cblas_caxpyc.$(SUFFIX) cblas_caxpyc.$(PSUFFIX) : zaxpy.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F) | |||
| cblas_zaxpyc.$(SUFFIX) cblas_zaxpyc.$(PSUFFIX) : zaxpy.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F) | |||
| cblas_xaxpyc.$(SUFFIX) cblas_xaxpyc.$(PSUFFIX) : zaxpy.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F) | |||
| sscal.$(SUFFIX) sscal.$(PSUFFIX) : scal.c | |||
| $(CC) $(CFLAGS) -c $< -o $(@F) | |||
| dscal.$(SUFFIX) dscal.$(PSUFFIX) : scal.c | |||
| cblas_zaxpy.$(SUFFIX) cblas_zaxpy.$(PSUFFIX) : zaxpy.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||
| @@ -145,8 +145,13 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){ | |||
| #else | |||
| #ifdef COMPLEX | |||
| FLOAT CNAME(blasint n, void *vx, blasint incx){ | |||
| FLOAT *x = (FLOAT*) vx; | |||
| #else | |||
| FLOAT CNAME(blasint n, FLOAT *x, blasint incx){ | |||
| #endif | |||
| FLOAT ret; | |||
| PRINT_DEBUG_CNAME; | |||
| @@ -14,10 +14,12 @@ ZSCALKERNEL = cscal_lsx.S | |||
| SAMAXKERNEL = amax_lsx.S | |||
| DAMAXKERNEL = amax_lsx.S | |||
| CAMAXKERNEL = camax_lsx.S | |||
| ZAMAXKERNEL = camax_lsx.S | |||
| SAMINKERNEL = amin_lsx.S | |||
| DAMINKERNEL = amin_lsx.S | |||
| CAMINKERNEL = camin_lsx.S | |||
| ZAMINKERNEL = camin_lsx.S | |||
| SMAXKERNEL = max_lsx.S | |||
| DMAXKERNEL = max_lsx.S | |||
| @@ -14,10 +14,12 @@ ZSCALKERNEL = cscal_lasx.S | |||
| SAMAXKERNEL = amax_lasx.S | |||
| DAMAXKERNEL = amax_lasx.S | |||
| CAMAXKERNEL = camax_lasx.S | |||
| ZAMAXKERNEL = camax_lasx.S | |||
| SAMINKERNEL = amin_lasx.S | |||
| DAMINKERNEL = amin_lasx.S | |||
| CAMINKERNEL = camin_lasx.S | |||
| ZAMINKERNEL = camin_lasx.S | |||
| SMAXKERNEL = max_lsx.S | |||
| DMAXKERNEL = max_lsx.S | |||
| @@ -66,7 +66,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #else | |||
| xvldrepl.w VM0, X, 0 | |||
| #endif | |||
| XVFSUB VM0, VM0, VM0 | |||
| bne INCX, TEMP, .L20 | |||
| srai.d I, N, 4 | |||
| @@ -66,7 +66,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #else | |||
| vldrepl.w VM0, X, 0 | |||
| #endif | |||
| VFSUB VM0, VM0, VM0 | |||
| bne INCX, TEMP, .L20 | |||
| srai.d I, N, 3 | |||
| @@ -63,42 +63,60 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| bge $r0, N, .L999 | |||
| bge $r0, INCX, .L999 | |||
| li.d TEMP, 1 | |||
| li.w I, -1 | |||
| slli.d TEMP, TEMP, ZBASE_SHIFT | |||
| slli.d INCX, INCX, ZBASE_SHIFT | |||
| xvreplgr2vr.w neg1, I | |||
| xvffint.s.w neg1, neg1 | |||
| srai.d I, N, 3 | |||
| bne INCX, TEMP, .L20 | |||
| bge $r0, I, .L23 | |||
| .align 3 | |||
| .L10: | |||
| xvld VX0, X, 0 * SIZE | |||
| xvld VX1, X, 8 * SIZE | |||
| addi.d I, I, -1 | |||
| xvld VX0, X, 0 | |||
| xvld VX1, X, 32 | |||
| #ifdef DOUBLE | |||
| xvpickev.d x1, VX1, VX0 | |||
| xvpickod.d x2, VX1, VX0 | |||
| #else | |||
| xvpickev.w x1, VX1, VX0 | |||
| xvpickod.w x2, VX1, VX0 | |||
| xvfmul.s x3, neg1, x1 | |||
| xvfmul.s x4, neg1, x2 | |||
| xvfcmp.clt.s VT0, x1, res0 | |||
| xvfcmp.clt.s VT1, x2, res0 | |||
| xvbitsel.v x1, x1, x3, VT0 | |||
| xvbitsel.v x2, x2, x4, VT1 | |||
| #endif | |||
| XVFSUB x3, res0, x1 | |||
| XVFSUB x4, res0, x2 | |||
| XVFMAX x1, x1, x3 | |||
| XVFMAX x2, x2, x4 | |||
| XVFADD VM1, x1, x2 | |||
| XVFMAX VM0, VM0, VM1 | |||
| #ifdef DOUBLE | |||
| xvld VX0, X, 64 | |||
| xvld VX1, X, 96 | |||
| xvpickev.d x1, VX1, VX0 | |||
| xvpickod.d x2, VX1, VX0 | |||
| XVFSUB x3, res0, x1 | |||
| XVFSUB x4, res0, x2 | |||
| XVFMAX x1, x1, x3 | |||
| XVFMAX x2, x2, x4 | |||
| XVFADD VM1, x1, x2 | |||
| XVFMAX VM0, VM0, VM1 | |||
| #endif | |||
| addi.d I, I, -1 | |||
| addi.d X, X, 16 * SIZE | |||
| xvfadd.s VM1, x1, x2 | |||
| xvfmax.s VM0, VM0, VM1 | |||
| blt $r0, I, .L10 | |||
| .align 3 | |||
| .L11: | |||
| #ifdef DOUBLE | |||
| xvpickve.d x1, VM0, 0 | |||
| xvpickve.d x2, VM0, 1 | |||
| XVFMAX VM0, x1, x2 | |||
| #else | |||
| xvpickve.w x1, VM0, 0 | |||
| xvpickve.w x2, VM0, 1 | |||
| xvpickve.w x3, VM0, 2 | |||
| xvpickve.w x4, VM0, 3 | |||
| xvfmax.s VM1, x1, x2 | |||
| xvfmax.s VM0, x3, x4 | |||
| xvfmax.s VM0, VM0, VM1 | |||
| XVFMAX VM0, x1, x2 | |||
| XVFMAX VM1, x3, x4 | |||
| XVFMAX VM0, VM0, VM1 | |||
| #endif | |||
| b .L23 | |||
| .align 3 | |||
| @@ -107,66 +125,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .align 3 | |||
| .L21: | |||
| fld.s t1, X, 0 * SIZE | |||
| fld.s t2, X, 1 * SIZE | |||
| LD t1, X, 0 * SIZE | |||
| LD t2, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| fld.s t3, X, 0 * SIZE | |||
| fld.s t4, X, 1 * SIZE | |||
| LD t3, X, 0 * SIZE | |||
| LD t4, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| fabs.s t1, t1 | |||
| fabs.s t2, t2 | |||
| fabs.s t3, t3 | |||
| fabs.s t4, t4 | |||
| fadd.s t1, t1, t2 | |||
| fadd.s t3, t3, t4 | |||
| fmax.s s1, t1, t3 | |||
| fld.s t1, X, 0 * SIZE | |||
| fld.s t2, X, 1 * SIZE | |||
| FABS t1, t1 | |||
| FABS t2, t2 | |||
| FABS t3, t3 | |||
| FABS t4, t4 | |||
| ADD t1, t1, t2 | |||
| ADD t3, t3, t4 | |||
| FMAX s1, t1, t3 | |||
| LD t1, X, 0 * SIZE | |||
| LD t2, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| fld.s t3, X, 0 * SIZE | |||
| fld.s t4, X, 1 * SIZE | |||
| LD t3, X, 0 * SIZE | |||
| LD t4, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| fabs.s t1, t1 | |||
| fabs.s t2, t2 | |||
| fabs.s t3, t3 | |||
| fabs.s t4, t4 | |||
| fadd.s t1, t1, t2 | |||
| fadd.s t3, t3, t4 | |||
| fmax.s s1, t1, t3 | |||
| fld.s t1, X, 0 * SIZE | |||
| fld.s t2, X, 1 * SIZE | |||
| FABS t1, t1 | |||
| FABS t2, t2 | |||
| FABS t3, t3 | |||
| FABS t4, t4 | |||
| ADD t1, t1, t2 | |||
| ADD t3, t3, t4 | |||
| FMAX s1, t1, t3 | |||
| LD t1, X, 0 * SIZE | |||
| LD t2, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| fld.s t3, X, 0 * SIZE | |||
| fld.s t4, X, 1 * SIZE | |||
| LD t3, X, 0 * SIZE | |||
| LD t4, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| fabs.s t1, t1 | |||
| fabs.s t2, t2 | |||
| fabs.s t3, t3 | |||
| fabs.s t4, t4 | |||
| FABS t1, t1 | |||
| FABS t2, t2 | |||
| FABS t3, t3 | |||
| FABS t4, t4 | |||
| addi.d I, I, -1 | |||
| fadd.s t1, t1, t2 | |||
| fadd.s t3, t3, t4 | |||
| fmax.s s3, t1, t3 | |||
| fld.s t1, X, 0 * SIZE | |||
| fld.s t2, X, 1 * SIZE | |||
| ADD t1, t1, t2 | |||
| ADD t3, t3, t4 | |||
| FMAX s3, t1, t3 | |||
| LD t1, X, 0 * SIZE | |||
| LD t2, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| fld.s t3, X, 0 * SIZE | |||
| fld.s t4, X, 1 * SIZE | |||
| LD t3, X, 0 * SIZE | |||
| LD t4, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| fabs.s t1, t1 | |||
| fabs.s t2, t2 | |||
| fabs.s t3, t3 | |||
| fabs.s t4, t4 | |||
| fadd.s t1, t1, t2 | |||
| fadd.s t3, t3, t4 | |||
| fmax.s s4, t1, t3 | |||
| FABS t1, t1 | |||
| FABS t2, t2 | |||
| FABS t3, t3 | |||
| FABS t4, t4 | |||
| ADD t1, t1, t2 | |||
| ADD t3, t3, t4 | |||
| FMAX s4, t1, t3 | |||
| blt $r0, I, .L21 | |||
| .align 3 | |||
| .L22: | |||
| fmax.s s1, s1, s2 | |||
| fmax.s s3, s3, s4 | |||
| fmax.s s1, s1, s3 | |||
| FMAX s1, s1, s2 | |||
| FMAX s3, s3, s4 | |||
| FMAX s1, s1, s3 | |||
| .align 3 | |||
| .L23: //N<8 | |||
| @@ -182,12 +200,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| FABS a1, a1 | |||
| ADD a0, a0, a1 | |||
| add.d X, X, INCX | |||
| fmax.s s1, a0, s1 | |||
| FMAX s1, a0, s1 | |||
| blt $r0, I, .L24 | |||
| .align 3 | |||
| .L999: | |||
| fmov.s $f0, $f22 | |||
| MOV $f0, $f22 | |||
| jirl $r0, $r1, 0x0 | |||
| .align 3 | |||
| @@ -63,54 +63,87 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| bge $r0, N, .L999 | |||
| bge $r0, INCX, .L999 | |||
| li.d TEMP, 1 | |||
| li.w I, -1 | |||
| slli.d TEMP, TEMP, ZBASE_SHIFT | |||
| slli.d INCX, INCX, ZBASE_SHIFT | |||
| vreplgr2vr.w neg1, I | |||
| vffint.s.w neg1, neg1 | |||
| srai.d I, N, 3 | |||
| bne INCX, TEMP, .L20 | |||
| bge $r0, I, .L23 | |||
| .align 3 | |||
| .L10: | |||
| vld VX0, X, 0 * SIZE | |||
| vld VX1, X, 4 * SIZE | |||
| addi.d I, I, -1 | |||
| vld VX0, X, 0 | |||
| vld VX1, X, 16 | |||
| #ifdef DOUBLE | |||
| vpickev.d x1, VX1, VX0 | |||
| vpickod.d x2, VX1, VX0 | |||
| #else | |||
| vpickev.w x1, VX1, VX0 | |||
| vpickod.w x2, VX1, VX0 | |||
| vfmul.s x3, neg1, x1 | |||
| vfmul.s x4, neg1, x2 | |||
| vfcmp.clt.s VT0, x1, res0 | |||
| vfcmp.clt.s VT1, x2, res0 | |||
| vld VX0, X, 8 * SIZE | |||
| vbitsel.v x1, x1, x3, VT0 | |||
| vbitsel.v x2, x2, x4, VT1 | |||
| vld VX1, X, 12 * SIZE | |||
| vfadd.s VM1, x1, x2 | |||
| #endif | |||
| VFSUB x3, res0, x1 | |||
| VFSUB x4, res0, x2 | |||
| VFMAX x1, x1, x3 | |||
| VFMAX x2, x2, x4 | |||
| VFADD VM1, x1, x2 | |||
| vld VX0, X, 32 | |||
| vld VX1, X, 48 | |||
| #ifdef DOUBLE | |||
| vpickev.d x1, VX1, VX0 | |||
| vpickod.d x2, VX1, VX0 | |||
| #else | |||
| vpickev.w x1, VX1, VX0 | |||
| vpickod.w x2, VX1, VX0 | |||
| vfmul.s x3, neg1, x1 | |||
| vfmul.s x4, neg1, x2 | |||
| vfcmp.clt.s VT0, x1, res0 | |||
| vfcmp.clt.s VT1, x2, res0 | |||
| #endif | |||
| VFSUB x3, res0, x1 | |||
| VFSUB x4, res0, x2 | |||
| VFMAX x1, x1, x3 | |||
| VFMAX x2, x2, x4 | |||
| VFADD x1, x1, x2 | |||
| VFMAX VM1, x1, VM1 | |||
| VFMAX VM0, VM0, VM1 | |||
| #ifdef DOUBLE | |||
| vld VX0, X, 64 | |||
| vld VX1, X, 80 | |||
| vpickev.d x1, VX1, VX0 | |||
| vpickod.d x2, VX1, VX0 | |||
| VFSUB x3, res0, x1 | |||
| VFSUB x4, res0, x2 | |||
| VFMAX x1, x1, x3 | |||
| VFMAX x2, x2, x4 | |||
| VFADD VM1, x1, x2 | |||
| vld VX0, X, 96 | |||
| vld VX1, X, 112 | |||
| vpickev.d x1, VX1, VX0 | |||
| vpickod.d x2, VX1, VX0 | |||
| VFSUB x3, res0, x1 | |||
| VFSUB x4, res0, x2 | |||
| VFMAX x1, x1, x3 | |||
| VFMAX x2, x2, x4 | |||
| VFADD x1, x1, x2 | |||
| VFMAX VM1, x1, VM1 | |||
| VFMAX VM0, VM0, VM1 | |||
| #endif | |||
| addi.d X, X, 16 * SIZE | |||
| vbitsel.v x1, x1, x3, VT0 | |||
| vbitsel.v x2, x2, x4, VT1 | |||
| vfadd.s x1, x1, x2 | |||
| vfmax.s VM1, x1, VM1 | |||
| vfmax.s VM0, VM0, VM1 | |||
| addi.d I, I, -1 | |||
| blt $r0, I, .L10 | |||
| .align 3 | |||
| .L11: | |||
| #ifdef DOUBLE | |||
| vreplvei.d x1, VM0, 0 | |||
| vreplvei.d x2, VM0, 1 | |||
| VFMAX VM0, x1, x2 | |||
| #else | |||
| vreplvei.w x1, VM0, 0 | |||
| vreplvei.w x2, VM0, 1 | |||
| vreplvei.w x3, VM0, 2 | |||
| vreplvei.w x4, VM0, 3 | |||
| vfmax.s VM1, x1, x2 | |||
| vfmax.s VM0, x3, x4 | |||
| vfmax.s VM0, VM0, VM1 | |||
| VFMAX VM1, x1, x2 | |||
| VFMAX VM0, x3, x4 | |||
| VFMAX VM0, VM0, VM1 | |||
| #endif | |||
| b .L23 | |||
| .align 3 | |||
| @@ -119,66 +152,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .align 3 | |||
| .L21: | |||
| fld.s t1, X, 0 * SIZE | |||
| fld.s t2, X, 1 * SIZE | |||
| LD t1, X, 0 * SIZE | |||
| LD t2, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| fld.s t3, X, 0 * SIZE | |||
| fld.s t4, X, 1 * SIZE | |||
| LD t3, X, 0 * SIZE | |||
| LD t4, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| fabs.s t1, t1 | |||
| fabs.s t2, t2 | |||
| fabs.s t3, t3 | |||
| fabs.s t4, t4 | |||
| fadd.s t1, t1, t2 | |||
| fadd.s t3, t3, t4 | |||
| fmax.s s1, t1, t3 | |||
| fld.s t1, X, 0 * SIZE | |||
| fld.s t2, X, 1 * SIZE | |||
| FABS t1, t1 | |||
| FABS t2, t2 | |||
| FABS t3, t3 | |||
| FABS t4, t4 | |||
| ADD t1, t1, t2 | |||
| ADD t3, t3, t4 | |||
| FMAX s1, t1, t3 | |||
| LD t1, X, 0 * SIZE | |||
| LD t2, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| fld.s t3, X, 0 * SIZE | |||
| fld.s t4, X, 1 * SIZE | |||
| LD t3, X, 0 * SIZE | |||
| LD t4, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| fabs.s t1, t1 | |||
| fabs.s t2, t2 | |||
| fabs.s t3, t3 | |||
| fabs.s t4, t4 | |||
| fadd.s t1, t1, t2 | |||
| fadd.s t3, t3, t4 | |||
| fmax.s s1, t1, t3 | |||
| fld.s t1, X, 0 * SIZE | |||
| fld.s t2, X, 1 * SIZE | |||
| FABS t1, t1 | |||
| FABS t2, t2 | |||
| FABS t3, t3 | |||
| FABS t4, t4 | |||
| ADD t1, t1, t2 | |||
| ADD t3, t3, t4 | |||
| FMAX s1, t1, t3 | |||
| LD t1, X, 0 * SIZE | |||
| LD t2, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| fld.s t3, X, 0 * SIZE | |||
| fld.s t4, X, 1 * SIZE | |||
| LD t3, X, 0 * SIZE | |||
| LD t4, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| fabs.s t1, t1 | |||
| fabs.s t2, t2 | |||
| fabs.s t3, t3 | |||
| fabs.s t4, t4 | |||
| FABS t1, t1 | |||
| FABS t2, t2 | |||
| FABS t3, t3 | |||
| FABS t4, t4 | |||
| addi.d I, I, -1 | |||
| fadd.s t1, t1, t2 | |||
| fadd.s t3, t3, t4 | |||
| fmax.s s3, t1, t3 | |||
| fld.s t1, X, 0 * SIZE | |||
| fld.s t2, X, 1 * SIZE | |||
| ADD t1, t1, t2 | |||
| ADD t3, t3, t4 | |||
| FMAX s3, t1, t3 | |||
| LD t1, X, 0 * SIZE | |||
| LD t2, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| fld.s t3, X, 0 * SIZE | |||
| fld.s t4, X, 1 * SIZE | |||
| LD t3, X, 0 * SIZE | |||
| LD t4, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| fabs.s t1, t1 | |||
| fabs.s t2, t2 | |||
| fabs.s t3, t3 | |||
| fabs.s t4, t4 | |||
| fadd.s t1, t1, t2 | |||
| fadd.s t3, t3, t4 | |||
| fmax.s s4, t1, t3 | |||
| FABS t1, t1 | |||
| FABS t2, t2 | |||
| FABS t3, t3 | |||
| FABS t4, t4 | |||
| ADD t1, t1, t2 | |||
| ADD t3, t3, t4 | |||
| FMAX s4, t1, t3 | |||
| blt $r0, I, .L21 | |||
| .align 3 | |||
| .L22: | |||
| fmax.s s1, s1, s2 | |||
| fmax.s s3, s3, s4 | |||
| fmax.s s1, s1, s3 | |||
| FMAX s1, s1, s2 | |||
| FMAX s3, s3, s4 | |||
| FMAX s1, s1, s3 | |||
| .align 3 | |||
| .L23: //N<8 | |||
| @@ -187,19 +220,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .align 3 | |||
| .L24: | |||
| fld.s a0, X, 0 * SIZE | |||
| fld.s a1, X, 1 * SIZE | |||
| LD a0, X, 0 * SIZE | |||
| LD a1, X, 1 * SIZE | |||
| addi.d I, I, -1 | |||
| fabs.s a0, a0 | |||
| fabs.s a1, a1 | |||
| fadd.s a0, a0, a1 | |||
| FABS a0, a0 | |||
| FABS a1, a1 | |||
| ADD a0, a0, a1 | |||
| add.d X, X, INCX | |||
| fmax.s s1, a0, s1 | |||
| FMAX s1, a0, s1 | |||
| blt $r0, I, .L24 | |||
| .align 3 | |||
| .L999: | |||
| fmov.s $f0, $f22 | |||
| MOV $f0, $f22 | |||
| jirl $r0, $r1, 0x0 | |||
| .align 3 | |||
| @@ -61,49 +61,71 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvxor.v res0, res0, res0 | |||
| bge $r0, N, .L999 | |||
| bge $r0, INCX, .L999 | |||
| fld.s a0, X, 0 * SIZE | |||
| fld.s a1, X, 1 * SIZE | |||
| fabs.s a0, a0 | |||
| fabs.s a1, a1 | |||
| fadd.s s1, a1, a0 | |||
| LD a0, X, 0 * SIZE | |||
| LD a1, X, 1 * SIZE | |||
| FABS a0, a0 | |||
| FABS a1, a1 | |||
| ADD s1, a1, a0 | |||
| #ifdef DOUBLE | |||
| xvreplve0.d VM0, VM0 | |||
| #else | |||
| xvreplve0.w VM0, VM0 | |||
| #endif | |||
| li.d TEMP, 1 | |||
| li.w I, -1 | |||
| slli.d TEMP, TEMP, ZBASE_SHIFT | |||
| slli.d INCX, INCX, ZBASE_SHIFT | |||
| xvreplgr2vr.w neg1, I | |||
| xvffint.s.w neg1, neg1 | |||
| srai.d I, N, 3 | |||
| bne INCX, TEMP, .L20 | |||
| bge $r0, I, .L23 | |||
| .align 3 | |||
| .L10: | |||
| xvld VX0, X, 0 * SIZE | |||
| xvld VX1, X, 8 * SIZE | |||
| addi.d I, I, -1 | |||
| xvld VX0, X, 0 | |||
| xvld VX1, X, 32 | |||
| #ifdef DOUBLE | |||
| xvpickev.d x1, VX1, VX0 | |||
| xvpickod.d x2, VX1, VX0 | |||
| #else | |||
| xvpickev.w x1, VX1, VX0 | |||
| xvpickod.w x2, VX1, VX0 | |||
| xvfmul.s x3, neg1, x1 | |||
| xvfmul.s x4, neg1, x2 | |||
| xvfcmp.clt.s VT0, x1, res0 | |||
| xvfcmp.clt.s VT1, x2, res0 | |||
| xvbitsel.v x1, x1, x3, VT0 | |||
| xvbitsel.v x2, x2, x4, VT1 | |||
| #endif | |||
| XVFSUB x3, res0, x1 | |||
| XVFSUB x4, res0, x2 | |||
| XVFMAX x1, x1, x3 | |||
| XVFMAX x2, x2, x4 | |||
| XVFADD VM1, x1, x2 | |||
| XVFMIN VM0, VM0, VM1 | |||
| #ifdef DOUBLE | |||
| xvld VX0, X, 64 | |||
| xvld VX1, X, 96 | |||
| xvpickev.d x1, VX1, VX0 | |||
| xvpickod.d x2, VX1, VX0 | |||
| XVFSUB x3, res0, x1 | |||
| XVFSUB x4, res0, x2 | |||
| XVFMAX x1, x1, x3 | |||
| XVFMAX x2, x2, x4 | |||
| XVFADD VM1, x1, x2 | |||
| XVFMIN VM0, VM0, VM1 | |||
| #endif | |||
| addi.d I, I, -1 | |||
| addi.d X, X, 16 * SIZE | |||
| xvfadd.s VM1, x1, x2 | |||
| xvfmin.s VM0, VM0, VM1 | |||
| blt $r0, I, .L10 | |||
| .align 3 | |||
| .L11: | |||
| #ifdef DOUBLE | |||
| xvpickve.d x1, VM0, 0 | |||
| xvpickve.d x2, VM0, 1 | |||
| XVFMIN VM0, x1, x2 | |||
| #else | |||
| xvpickve.w x1, VM0, 0 | |||
| xvpickve.w x2, VM0, 1 | |||
| xvpickve.w x3, VM0, 2 | |||
| xvpickve.w x4, VM0, 3 | |||
| xvfmin.s VM1, x1, x2 | |||
| xvfmin.s VM0, x3, x4 | |||
| xvfmin.s VM0, VM0, VM1 | |||
| XVFMIN VM0, x1, x2 | |||
| XVFMIN VM1, x3, x4 | |||
| XVFMIN VM0, VM0, VM1 | |||
| #endif | |||
| b .L23 | |||
| .align 3 | |||
| @@ -112,66 +134,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .align 3 | |||
| .L21: | |||
| fld.s t1, X, 0 * SIZE | |||
| fld.s t2, X, 1 * SIZE | |||
| LD t1, X, 0 * SIZE | |||
| LD t2, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| fld.s t3, X, 0 * SIZE | |||
| fld.s t4, X, 1 * SIZE | |||
| LD t3, X, 0 * SIZE | |||
| LD t4, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| fabs.s t1, t1 | |||
| fabs.s t2, t2 | |||
| fabs.s t3, t3 | |||
| fabs.s t4, t4 | |||
| fadd.s t1, t1, t2 | |||
| fadd.s t3, t3, t4 | |||
| fmin.s s1, t1, t3 | |||
| fld.s t1, X, 0 * SIZE | |||
| fld.s t2, X, 1 * SIZE | |||
| FABS t1, t1 | |||
| FABS t2, t2 | |||
| FABS t3, t3 | |||
| FABS t4, t4 | |||
| ADD t1, t1, t2 | |||
| ADD t3, t3, t4 | |||
| FMIN s1, t1, t3 | |||
| LD t1, X, 0 * SIZE | |||
| LD t2, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| fld.s t3, X, 0 * SIZE | |||
| fld.s t4, X, 1 * SIZE | |||
| LD t3, X, 0 * SIZE | |||
| LD t4, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| fabs.s t1, t1 | |||
| fabs.s t2, t2 | |||
| fabs.s t3, t3 | |||
| fabs.s t4, t4 | |||
| fadd.s t1, t1, t2 | |||
| fadd.s t3, t3, t4 | |||
| fmin.s s1, t1, t3 | |||
| fld.s t1, X, 0 * SIZE | |||
| fld.s t2, X, 1 * SIZE | |||
| FABS t1, t1 | |||
| FABS t2, t2 | |||
| FABS t3, t3 | |||
| FABS t4, t4 | |||
| ADD t1, t1, t2 | |||
| ADD t3, t3, t4 | |||
| FMIN s1, t1, t3 | |||
| LD t1, X, 0 * SIZE | |||
| LD t2, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| fld.s t3, X, 0 * SIZE | |||
| fld.s t4, X, 1 * SIZE | |||
| LD t3, X, 0 * SIZE | |||
| LD t4, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| fabs.s t1, t1 | |||
| fabs.s t2, t2 | |||
| fabs.s t3, t3 | |||
| fabs.s t4, t4 | |||
| FABS t1, t1 | |||
| FABS t2, t2 | |||
| FABS t3, t3 | |||
| FABS t4, t4 | |||
| addi.d I, I, -1 | |||
| fadd.s t1, t1, t2 | |||
| fadd.s t3, t3, t4 | |||
| fmin.s s3, t1, t3 | |||
| fld.s t1, X, 0 * SIZE | |||
| fld.s t2, X, 1 * SIZE | |||
| ADD t1, t1, t2 | |||
| ADD t3, t3, t4 | |||
| FMIN s3, t1, t3 | |||
| LD t1, X, 0 * SIZE | |||
| LD t2, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| fld.s t3, X, 0 * SIZE | |||
| fld.s t4, X, 1 * SIZE | |||
| LD t3, X, 0 * SIZE | |||
| LD t4, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| fabs.s t1, t1 | |||
| fabs.s t2, t2 | |||
| fabs.s t3, t3 | |||
| fabs.s t4, t4 | |||
| fadd.s t1, t1, t2 | |||
| fadd.s t3, t3, t4 | |||
| fmin.s s4, t1, t3 | |||
| FABS t1, t1 | |||
| FABS t2, t2 | |||
| FABS t3, t3 | |||
| FABS t4, t4 | |||
| ADD t1, t1, t2 | |||
| ADD t3, t3, t4 | |||
| FMIN s4, t1, t3 | |||
| blt $r0, I, .L21 | |||
| .align 3 | |||
| .L22: | |||
| fmin.s s1, s1, s2 | |||
| fmin.s s3, s3, s4 | |||
| fmin.s s1, s1, s3 | |||
| FMIN s1, s1, s2 | |||
| FMIN s3, s3, s4 | |||
| FMIN s1, s1, s3 | |||
| .align 3 | |||
| .L23: //N<8 | |||
| @@ -187,12 +209,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| FABS a1, a1 | |||
| ADD a0, a0, a1 | |||
| add.d X, X, INCX | |||
| fmin.s s1, a0, s1 | |||
| FMIN s1, a0, s1 | |||
| blt $r0, I, .L24 | |||
| .align 3 | |||
| .L999: | |||
| fmov.s $f0, $f22 | |||
| MOV $f0, $f22 | |||
| jirl $r0, $r1, 0x0 | |||
| .align 3 | |||
| @@ -61,61 +61,98 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vxor.v res0, res0, res0 | |||
| bge $r0, N, .L999 | |||
| bge $r0, INCX, .L999 | |||
| fld.s a0, X, 0 * SIZE | |||
| fld.s a1, X, 1 * SIZE | |||
| fabs.s a0, a0 | |||
| fabs.s a1, a1 | |||
| fadd.s s1, a1, a0 | |||
| LD a0, X, 0 * SIZE | |||
| LD a1, X, 1 * SIZE | |||
| FABS a0, a0 | |||
| FABS a1, a1 | |||
| ADD s1, a1, a0 | |||
| #ifdef DOUBLE | |||
| vreplvei.d VM0, VM0, 0 | |||
| #else | |||
| vreplvei.w VM0, VM0, 0 | |||
| #endif | |||
| li.d TEMP, 1 | |||
| li.w I, -1 | |||
| slli.d TEMP, TEMP, ZBASE_SHIFT | |||
| slli.d INCX, INCX, ZBASE_SHIFT | |||
| vreplgr2vr.w neg1, I | |||
| vffint.s.w neg1, neg1 | |||
| srai.d I, N, 3 | |||
| bne INCX, TEMP, .L20 | |||
| bge $r0, I, .L23 | |||
| .align 3 | |||
| .L10: | |||
| vld VX0, X, 0 * SIZE | |||
| vld VX1, X, 4 * SIZE | |||
| addi.d I, I, -1 | |||
| vld VX0, X, 0 | |||
| vld VX1, X, 16 | |||
| #ifdef DOUBLE | |||
| vpickev.d x1, VX1, VX0 | |||
| vpickod.d x2, VX1, VX0 | |||
| #else | |||
| vpickev.w x1, VX1, VX0 | |||
| vpickod.w x2, VX1, VX0 | |||
| vfmul.s x3, neg1, x1 | |||
| vfmul.s x4, neg1, x2 | |||
| vfcmp.clt.s VT0, x1, res0 | |||
| vfcmp.clt.s VT1, x2, res0 | |||
| vld VX0, X, 8 * SIZE | |||
| vbitsel.v x1, x1, x3, VT0 | |||
| vbitsel.v x2, x2, x4, VT1 | |||
| vld VX1, X, 12 * SIZE | |||
| vfadd.s VM1, x1, x2 | |||
| #endif | |||
| VFSUB x3, res0, x1 | |||
| VFSUB x4, res0, x2 | |||
| VFMAX x1, x1, x3 | |||
| VFMAX x2, x2, x4 | |||
| VFADD VM1, x1, x2 | |||
| vld VX0, X, 32 | |||
| vld VX1, X, 48 | |||
| #ifdef DOUBLE | |||
| vpickev.d x1, VX1, VX0 | |||
| vpickod.d x2, VX1, VX0 | |||
| #else | |||
| vpickev.w x1, VX1, VX0 | |||
| vpickod.w x2, VX1, VX0 | |||
| vfmul.s x3, neg1, x1 | |||
| vfmul.s x4, neg1, x2 | |||
| vfcmp.clt.s VT0, x1, res0 | |||
| vfcmp.clt.s VT1, x2, res0 | |||
| #endif | |||
| VFSUB x3, res0, x1 | |||
| VFSUB x4, res0, x2 | |||
| VFMAX x1, x1, x3 | |||
| VFMAX x2, x2, x4 | |||
| VFADD x1, x1, x2 | |||
| VFMIN VM1, x1, VM1 | |||
| VFMIN VM0, VM0, VM1 | |||
| #ifdef DOUBLE | |||
| vld VX0, X, 64 | |||
| vld VX1, X, 80 | |||
| vpickev.d x1, VX1, VX0 | |||
| vpickod.d x2, VX1, VX0 | |||
| VFSUB x3, res0, x1 | |||
| VFSUB x4, res0, x2 | |||
| VFMAX x1, x1, x3 | |||
| VFMAX x2, x2, x4 | |||
| VFADD VM1, x1, x2 | |||
| vld VX0, X, 96 | |||
| vld VX1, X, 112 | |||
| vpickev.d x1, VX1, VX0 | |||
| vpickod.d x2, VX1, VX0 | |||
| VFSUB x3, res0, x1 | |||
| VFSUB x4, res0, x2 | |||
| VFMAX x1, x1, x3 | |||
| VFMAX x2, x2, x4 | |||
| VFADD x1, x1, x2 | |||
| VFMIN VM1, x1, VM1 | |||
| VFMIN VM0, VM0, VM1 | |||
| #endif | |||
| addi.d I, I, -1 | |||
| addi.d X, X, 16 * SIZE | |||
| vbitsel.v x1, x1, x3, VT0 | |||
| vbitsel.v x2, x2, x4, VT1 | |||
| vfadd.s x1, x1, x2 | |||
| vfmin.s VM1, x1, VM1 | |||
| vfmin.s VM0, VM0, VM1 | |||
| blt $r0, I, .L10 | |||
| .align 3 | |||
| .L11: | |||
| #ifdef DOUBLE | |||
| vreplvei.d x1, VM0, 0 | |||
| vreplvei.d x2, VM0, 1 | |||
| VFMIN VM0, x1, x2 | |||
| #else | |||
| vreplvei.w x1, VM0, 0 | |||
| vreplvei.w x2, VM0, 1 | |||
| vreplvei.w x3, VM0, 2 | |||
| vreplvei.w x4, VM0, 3 | |||
| vfmin.s VM1, x1, x2 | |||
| vfmin.s VM0, x3, x4 | |||
| vfmin.s VM0, VM0, VM1 | |||
| VFMIN VM1, x1, x2 | |||
| VFMIN VM0, x3, x4 | |||
| VFMIN VM0, VM0, VM1 | |||
| #endif | |||
| b .L23 | |||
| .align 3 | |||
| @@ -124,66 +161,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .align 3 | |||
| .L21: | |||
| fld.s t1, X, 0 * SIZE | |||
| fld.s t2, X, 1 * SIZE | |||
| LD t1, X, 0 * SIZE | |||
| LD t2, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| fld.s t3, X, 0 * SIZE | |||
| fld.s t4, X, 1 * SIZE | |||
| LD t3, X, 0 * SIZE | |||
| LD t4, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| fabs.s t1, t1 | |||
| fabs.s t2, t2 | |||
| fabs.s t3, t3 | |||
| fabs.s t4, t4 | |||
| fadd.s t1, t1, t2 | |||
| fadd.s t3, t3, t4 | |||
| fmin.s s1, t1, t3 | |||
| fld.s t1, X, 0 * SIZE | |||
| fld.s t2, X, 1 * SIZE | |||
| FABS t1, t1 | |||
| FABS t2, t2 | |||
| FABS t3, t3 | |||
| FABS t4, t4 | |||
| ADD t1, t1, t2 | |||
| ADD t3, t3, t4 | |||
| FMIN s1, t1, t3 | |||
| LD t1, X, 0 * SIZE | |||
| LD t2, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| fld.s t3, X, 0 * SIZE | |||
| fld.s t4, X, 1 * SIZE | |||
| LD t3, X, 0 * SIZE | |||
| LD t4, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| fabs.s t1, t1 | |||
| fabs.s t2, t2 | |||
| fabs.s t3, t3 | |||
| fabs.s t4, t4 | |||
| fadd.s t1, t1, t2 | |||
| fadd.s t3, t3, t4 | |||
| fmin.s s1, t1, t3 | |||
| fld.s t1, X, 0 * SIZE | |||
| fld.s t2, X, 1 * SIZE | |||
| FABS t1, t1 | |||
| FABS t2, t2 | |||
| FABS t3, t3 | |||
| FABS t4, t4 | |||
| ADD t1, t1, t2 | |||
| ADD t3, t3, t4 | |||
| FMIN s1, t1, t3 | |||
| LD t1, X, 0 * SIZE | |||
| LD t2, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| fld.s t3, X, 0 * SIZE | |||
| fld.s t4, X, 1 * SIZE | |||
| LD t3, X, 0 * SIZE | |||
| LD t4, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| fabs.s t1, t1 | |||
| fabs.s t2, t2 | |||
| fabs.s t3, t3 | |||
| fabs.s t4, t4 | |||
| FABS t1, t1 | |||
| FABS t2, t2 | |||
| FABS t3, t3 | |||
| FABS t4, t4 | |||
| addi.d I, I, -1 | |||
| fadd.s t1, t1, t2 | |||
| fadd.s t3, t3, t4 | |||
| fmin.s s3, t1, t3 | |||
| fld.s t1, X, 0 * SIZE | |||
| fld.s t2, X, 1 * SIZE | |||
| ADD t1, t1, t2 | |||
| ADD t3, t3, t4 | |||
| FMIN s3, t1, t3 | |||
| LD t1, X, 0 * SIZE | |||
| LD t2, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| fld.s t3, X, 0 * SIZE | |||
| fld.s t4, X, 1 * SIZE | |||
| LD t3, X, 0 * SIZE | |||
| LD t4, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| fabs.s t1, t1 | |||
| fabs.s t2, t2 | |||
| fabs.s t3, t3 | |||
| fabs.s t4, t4 | |||
| fadd.s t1, t1, t2 | |||
| fadd.s t3, t3, t4 | |||
| fmin.s s4, t1, t3 | |||
| FABS t1, t1 | |||
| FABS t2, t2 | |||
| FABS t3, t3 | |||
| FABS t4, t4 | |||
| ADD t1, t1, t2 | |||
| ADD t3, t3, t4 | |||
| FMIN s4, t1, t3 | |||
| blt $r0, I, .L21 | |||
| .align 3 | |||
| .L22: | |||
| fmin.s s1, s1, s2 | |||
| fmin.s s3, s3, s4 | |||
| fmin.s s1, s1, s3 | |||
| FMIN s1, s1, s2 | |||
| FMIN s3, s3, s4 | |||
| FMIN s1, s1, s3 | |||
| .align 3 | |||
| .L23: //N<8 | |||
| @@ -192,19 +229,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .align 3 | |||
| .L24: | |||
| fld.s a0, X, 0 * SIZE | |||
| fld.s a1, X, 1 * SIZE | |||
| LD a0, X, 0 * SIZE | |||
| LD a1, X, 1 * SIZE | |||
| addi.d I, I, -1 | |||
| fabs.s a0, a0 | |||
| fabs.s a1, a1 | |||
| fadd.s a0, a0, a1 | |||
| FABS a0, a0 | |||
| FABS a1, a1 | |||
| ADD a0, a0, a1 | |||
| add.d X, X, INCX | |||
| fmin.s s1, a0, s1 | |||
| FMIN s1, a0, s1 | |||
| blt $r0, I, .L24 | |||
| .align 3 | |||
| .L999: | |||
| fmov.s $f0, $f22 | |||
| MOV $f0, $f22 | |||
| jirl $r0, $r1, 0x0 | |||
| .align 3 | |||
| @@ -99,7 +99,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| b .L113 //alpha_r != 0.0 && alpha_i == 0.0 | |||
| .L14: | |||
| bceqz $fcc1, .L112 //alpha_r == 0.0 && alpha_i != 0.0 | |||
| bceqz $fcc1, .L114 //alpha_r == 0.0 && alpha_i != 0.0 | |||
| b .L111 //alpha_r == 0.0 && alpha_i == 0.0 | |||
| .align 3 | |||
| @@ -117,38 +117,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| b .L997 | |||
| .align 3 | |||
| .L112: //alpha_r == 0.0 && alpha_i != 0.0 | |||
| xvld VX0, X, 0 * SIZE | |||
| #ifdef DOUBLE | |||
| xvld VX1, X, 4 * SIZE | |||
| xvpickev.d x1, VX1, VX0 | |||
| xvpickod.d x2, VX1, VX0 | |||
| xvfmul.d x3, VXAI, x2 | |||
| xvfsub.d x3, VXZ, x3 | |||
| xvfmul.d x4, VXAI, x1 | |||
| xvilvl.d VX2, x4 ,x3 | |||
| xvilvh.d VX3, x4, x3 | |||
| xvst VX2, X, 0 * SIZE | |||
| xvst VX3, X, 4 * SIZE | |||
| addi.d X, X, 8 * SIZE | |||
| #else | |||
| xvld VX1, X, 8 * SIZE | |||
| xvpickev.w x1, VX1, VX0 | |||
| xvpickod.w x2, VX1, VX0 | |||
| xvfmul.s x3, VXAI, x2 | |||
| xvfsub.s x3, VXZ, x3 | |||
| xvfmul.s x4, VXAI, x1 | |||
| xvilvl.w VX2, x4 ,x3 | |||
| xvilvh.w VX3, x4, x3 | |||
| xvst VX2, X, 0 * SIZE | |||
| xvst VX3, X, 8 * SIZE | |||
| addi.d X, X, 16 * SIZE | |||
| #endif | |||
| addi.d I, I, -1 | |||
| blt $r0, I, .L112 | |||
| b .L997 | |||
| .align 3 | |||
| .L113: //alpha_r != 0.0 && alpha_i == 0.0 | |||
| xvld VX0, X, 0 * SIZE | |||
| #ifdef DOUBLE | |||
| @@ -227,7 +195,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| b .L223 //alpha_r != 0.0 && alpha_i == 0.0 | |||
| .L24: | |||
| bceqz $fcc1, .L222 //alpha_r == 0.0 && alpha_i != 0.0 | |||
| bceqz $fcc1, .L224 //alpha_r == 0.0 && alpha_i != 0.0 | |||
| b .L221 //alpha_r == 0.0 && alpha_i == 0.0 | |||
| .align 3 | |||
| @@ -275,119 +243,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| b .L997 | |||
| .align 3 | |||
| .L222: //alpha_r == 0.0 && alpha_i != 0.0 | |||
| #ifdef DOUBLE | |||
| ld.d t1, X, 0 * SIZE | |||
| ld.d t2, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| ld.d t3, X, 0 * SIZE | |||
| ld.d t4, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| xvinsgr2vr.d x1, t1, 0 | |||
| xvinsgr2vr.d x2, t2, 0 | |||
| xvinsgr2vr.d x1, t3, 1 | |||
| xvinsgr2vr.d x2, t4, 1 | |||
| ld.d t1, X, 0 * SIZE | |||
| ld.d t2, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| ld.d t3, X, 0 * SIZE | |||
| ld.d t4, X, 1 * SIZE | |||
| xvinsgr2vr.d x1, t1, 2 | |||
| xvinsgr2vr.d x2, t2, 2 | |||
| xvinsgr2vr.d x1, t3, 3 | |||
| xvinsgr2vr.d x2, t4, 3 | |||
| add.d X, X, INCX | |||
| xvfmul.d x3, VXAI, x2 | |||
| xvfsub.d x3, VXZ, x3 | |||
| xvfmul.d x4, VXAI, x1 | |||
| addi.d I, I, -1 | |||
| xvstelm.d x3, XX, 0 * SIZE, 0 | |||
| xvstelm.d x4, XX, 1 * SIZE, 0 | |||
| add.d XX, XX, INCX | |||
| xvstelm.d x3, XX, 0 * SIZE, 1 | |||
| xvstelm.d x4, XX, 1 * SIZE, 1 | |||
| add.d XX, XX, INCX | |||
| xvstelm.d x3, XX, 0 * SIZE, 2 | |||
| xvstelm.d x4, XX, 1 * SIZE, 2 | |||
| add.d XX, XX, INCX | |||
| xvstelm.d x3, XX, 0 * SIZE, 3 | |||
| xvstelm.d x4, XX, 1 * SIZE, 3 | |||
| #else | |||
| ld.w t1, X, 0 * SIZE | |||
| ld.w t2, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| ld.w t3, X, 0 * SIZE | |||
| ld.w t4, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| xvinsgr2vr.w x1, t1, 0 | |||
| xvinsgr2vr.w x2, t2, 0 | |||
| xvinsgr2vr.w x1, t3, 1 | |||
| xvinsgr2vr.w x2, t4, 1 | |||
| ld.w t1, X, 0 * SIZE | |||
| ld.w t2, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| ld.w t3, X, 0 * SIZE | |||
| ld.w t4, X, 1 * SIZE | |||
| xvinsgr2vr.w x1, t1, 2 | |||
| xvinsgr2vr.w x2, t2, 2 | |||
| xvinsgr2vr.w x1, t3, 3 | |||
| xvinsgr2vr.w x2, t4, 3 | |||
| add.d X, X, INCX | |||
| ld.w t1, X, 0 * SIZE | |||
| ld.w t2, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| ld.w t3, X, 0 * SIZE | |||
| ld.w t4, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| xvinsgr2vr.w x1, t1, 4 | |||
| xvinsgr2vr.w x2, t2, 4 | |||
| xvinsgr2vr.w x1, t3, 5 | |||
| xvinsgr2vr.w x2, t4, 5 | |||
| ld.w t1, X, 0 * SIZE | |||
| ld.w t2, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| ld.w t3, X, 0 * SIZE | |||
| ld.w t4, X, 1 * SIZE | |||
| xvinsgr2vr.w x1, t1, 6 | |||
| xvinsgr2vr.w x2, t2, 6 | |||
| xvinsgr2vr.w x1, t3, 7 | |||
| xvinsgr2vr.w x2, t4, 7 | |||
| add.d X, X, INCX | |||
| xvfmul.s x3, VXAI, x2 | |||
| xvfsub.s x3, VXZ, x3 | |||
| xvfmul.s x4, VXAI, x1 | |||
| addi.d I, I, -1 | |||
| xvstelm.w x3, XX, 0 * SIZE, 0 | |||
| xvstelm.w x4, XX, 1 * SIZE, 0 | |||
| add.d XX, XX, INCX | |||
| xvstelm.w x3, XX, 0 * SIZE, 1 | |||
| xvstelm.w x4, XX, 1 * SIZE, 1 | |||
| add.d XX, XX, INCX | |||
| xvstelm.w x3, XX, 0 * SIZE, 2 | |||
| xvstelm.w x4, XX, 1 * SIZE, 2 | |||
| add.d XX, XX, INCX | |||
| xvstelm.w x3, XX, 0 * SIZE, 3 | |||
| xvstelm.w x4, XX, 1 * SIZE, 3 | |||
| add.d XX, XX, INCX | |||
| xvstelm.w x3, XX, 0 * SIZE, 4 | |||
| xvstelm.w x4, XX, 1 * SIZE, 4 | |||
| add.d XX, XX, INCX | |||
| xvstelm.w x3, XX, 0 * SIZE, 5 | |||
| xvstelm.w x4, XX, 1 * SIZE, 5 | |||
| add.d XX, XX, INCX | |||
| xvstelm.w x3, XX, 0 * SIZE, 6 | |||
| xvstelm.w x4, XX, 1 * SIZE, 6 | |||
| add.d XX, XX, INCX | |||
| xvstelm.w x3, XX, 0 * SIZE, 7 | |||
| xvstelm.w x4, XX, 1 * SIZE, 7 | |||
| #endif | |||
| add.d XX, XX, INCX | |||
| blt $r0, I, .L222 | |||
| b .L997 | |||
| .align 3 | |||
| .L223: //alpha_r != 0.0 && alpha_i == 0.0 | |||
| #ifdef DOUBLE | |||
| ld.d t1, X, 0 * SIZE | |||
| @@ -97,7 +97,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| b .L113 //alpha_r != 0.0 && alpha_i == 0.0 | |||
| .L14: | |||
| bceqz $fcc1, .L112 //alpha_r == 0.0 && alpha_i != 0.0 | |||
| bceqz $fcc1, .L114 //alpha_r == 0.0 && alpha_i != 0.0 | |||
| b .L111 //alpha_r == 0.0 && alpha_i == 0.0 | |||
| .align 3 | |||
| @@ -116,48 +116,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| b .L997 | |||
| .align 3 | |||
| .L112: //alpha_r == 0.0 && alpha_i != 0.0 | |||
| vld VX0, X, 0 * SIZE | |||
| #ifdef DOUBLE | |||
| vld VX1, X, 2 * SIZE | |||
| vpickev.d x1, VX1, VX0 | |||
| vpickod.d x2, VX1, VX0 | |||
| vfmul.d x3, VXAI, x2 | |||
| vfsub.d x3, VXZ, x3 | |||
| vfmul.d x4, VXAI, x1 | |||
| vilvl.d VX2, x4 ,x3 | |||
| vilvh.d VX3, x4, x3 | |||
| vst VX2, X, 0 * SIZE | |||
| vst VX3, X, 2 * SIZE | |||
| vld VX0, X, 4 * SIZE | |||
| vld VX1, X, 6 * SIZE | |||
| vpickev.d x1, VX1, VX0 | |||
| vpickod.d x2, VX1, VX0 | |||
| vfmul.d x3, VXAI, x2 | |||
| vfsub.d x3, VXZ, x3 | |||
| vfmul.d x4, VXAI, x1 | |||
| vilvl.d VX2, x4 ,x3 | |||
| vilvh.d VX3, x4, x3 | |||
| vst VX2, X, 4 * SIZE | |||
| vst VX3, X, 6 * SIZE | |||
| #else | |||
| vld VX1, X, 4 * SIZE | |||
| vpickev.w x1, VX1, VX0 | |||
| vpickod.w x2, VX1, VX0 | |||
| vfmul.s x3, VXAI, x2 | |||
| vfsub.s x3, VXZ, x3 | |||
| vfmul.s x4, VXAI, x1 | |||
| vilvl.w VX2, x4 ,x3 | |||
| vilvh.w VX3, x4, x3 | |||
| vst VX2, X, 0 * SIZE | |||
| vst VX3, X, 4 * SIZE | |||
| #endif | |||
| addi.d X, X, 8 * SIZE | |||
| addi.d I, I, -1 | |||
| blt $r0, I, .L112 | |||
| b .L997 | |||
| .align 3 | |||
| .L113: //alpha_r != 0.0 && alpha_i == 0.0 | |||
| vld VX0, X, 0 * SIZE | |||
| #ifdef DOUBLE | |||
| @@ -256,7 +214,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| b .L223 //alpha_r != 0.0 && alpha_i == 0.0 | |||
| .L24: | |||
| bceqz $fcc1, .L222 //alpha_r == 0.0 && alpha_i != 0.0 | |||
| bceqz $fcc1, .L224 //alpha_r == 0.0 && alpha_i != 0.0 | |||
| b .L221 //alpha_r == 0.0 && alpha_i == 0.0 | |||
| .align 3 | |||
| @@ -292,90 +250,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| b .L997 | |||
| .align 3 | |||
| .L222: //alpha_r == 0.0 && alpha_i != 0.0 | |||
| #ifdef DOUBLE | |||
| ld.d t1, X, 0 * SIZE | |||
| ld.d t2, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| ld.d t3, X, 0 * SIZE | |||
| ld.d t4, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| vinsgr2vr.d x1, t1, 0 | |||
| vinsgr2vr.d x2, t2, 0 | |||
| vinsgr2vr.d x1, t3, 1 | |||
| vinsgr2vr.d x2, t4, 1 | |||
| vfmul.d x3, VXAI, x2 | |||
| vfsub.d x3, VXZ, x3 | |||
| vfmul.d x4, VXAI, x1 | |||
| vstelm.d x3, XX, 0 * SIZE, 0 | |||
| vstelm.d x4, XX, 1 * SIZE, 0 | |||
| add.d XX, XX, INCX | |||
| vstelm.d x3, XX, 0 * SIZE, 1 | |||
| vstelm.d x4, XX, 1 * SIZE, 1 | |||
| add.d XX, XX, INCX | |||
| ld.d t1, X, 0 * SIZE | |||
| ld.d t2, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| ld.d t3, X, 0 * SIZE | |||
| ld.d t4, X, 1 * SIZE | |||
| vinsgr2vr.d x1, t1, 0 | |||
| vinsgr2vr.d x2, t2, 0 | |||
| vinsgr2vr.d x1, t3, 1 | |||
| vinsgr2vr.d x2, t4, 1 | |||
| add.d X, X, INCX | |||
| vfmul.d x3, VXAI, x2 | |||
| vfsub.d x3, VXZ, x3 | |||
| vfmul.d x4, VXAI, x1 | |||
| addi.d I, I, -1 | |||
| vstelm.d x3, XX, 0 * SIZE, 0 | |||
| vstelm.d x4, XX, 1 * SIZE, 0 | |||
| add.d XX, XX, INCX | |||
| vstelm.d x3, XX, 0 * SIZE, 1 | |||
| vstelm.d x4, XX, 1 * SIZE, 1 | |||
| #else | |||
| ld.w t1, X, 0 * SIZE | |||
| ld.w t2, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| ld.w t3, X, 0 * SIZE | |||
| ld.w t4, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| vinsgr2vr.w x1, t1, 0 | |||
| vinsgr2vr.w x2, t2, 0 | |||
| vinsgr2vr.w x1, t3, 1 | |||
| vinsgr2vr.w x2, t4, 1 | |||
| ld.w t1, X, 0 * SIZE | |||
| ld.w t2, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| ld.w t3, X, 0 * SIZE | |||
| ld.w t4, X, 1 * SIZE | |||
| vinsgr2vr.w x1, t1, 2 | |||
| vinsgr2vr.w x2, t2, 2 | |||
| vinsgr2vr.w x1, t3, 3 | |||
| vinsgr2vr.w x2, t4, 3 | |||
| add.d X, X, INCX | |||
| vfmul.s x3, VXAI, x2 | |||
| vfsub.s x3, VXZ, x3 | |||
| vfmul.s x4, VXAI, x1 | |||
| addi.d I, I, -1 | |||
| vstelm.w x3, XX, 0 * SIZE, 0 | |||
| vstelm.w x4, XX, 1 * SIZE, 0 | |||
| add.d XX, XX, INCX | |||
| vstelm.w x3, XX, 0 * SIZE, 1 | |||
| vstelm.w x4, XX, 1 * SIZE, 1 | |||
| add.d XX, XX, INCX | |||
| vstelm.w x3, XX, 0 * SIZE, 2 | |||
| vstelm.w x4, XX, 1 * SIZE, 2 | |||
| add.d XX, XX, INCX | |||
| vstelm.w x3, XX, 0 * SIZE, 3 | |||
| vstelm.w x4, XX, 1 * SIZE, 3 | |||
| #endif | |||
| add.d XX, XX, INCX | |||
| blt $r0, I, .L222 | |||
| b .L997 | |||
| .align 3 | |||
| .L223: //alpha_r != 0.0 && alpha_i == 0.0 | |||
| #ifdef DOUBLE | |||
| ld.d t1, X, 0 * SIZE | |||
| @@ -69,16 +69,16 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha , FLOAT *x ) | |||
| for( i=0; i<n; i+=4 ) | |||
| { | |||
| t0 = da_r *x[0] - da_i *x[1]; | |||
| t1 = da_r *x[2] - da_i *x[3]; | |||
| t2 = da_r *x[4] - da_i *x[5]; | |||
| t3 = da_r *x[6] - da_i *x[7]; | |||
| t0 = da_r *x[0] - da_i *x[1]; | |||
| t1 = da_r *x[2] - da_i *x[3]; | |||
| t2 = da_r *x[4] - da_i *x[5]; | |||
| t3 = da_r *x[6] - da_i *x[7]; | |||
| x[1] = da_r * x[1] + da_i * x[0]; | |||
| x[3] = da_r * x[3] + da_i * x[2]; | |||
| x[5] = da_r * x[5] + da_i * x[4]; | |||
| x[7] = da_r * x[7] + da_i * x[6]; | |||
| x[0] = t0; | |||
| x[2] = t1; | |||
| x[4] = t2; | |||
| @@ -99,16 +99,16 @@ static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha , FLOAT *x ) | |||
| for( i=0; i<n; i+=4 ) | |||
| { | |||
| t0 = - da_i *x[1]; | |||
| t1 = - da_i *x[3]; | |||
| t2 = - da_i *x[5]; | |||
| t3 = - da_i *x[7]; | |||
| t0 = - da_i *x[1]; | |||
| t1 = - da_i *x[3]; | |||
| t2 = - da_i *x[5]; | |||
| t3 = - da_i *x[7]; | |||
| x[1] = da_i * x[0]; | |||
| x[3] = da_i * x[2]; | |||
| x[5] = da_i * x[4]; | |||
| x[7] = da_i * x[6]; | |||
| x[0] = t0; | |||
| x[2] = t1; | |||
| x[4] = t2; | |||
| @@ -129,16 +129,16 @@ static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha , FLOAT *x ) | |||
| for( i=0; i<n; i+=4 ) | |||
| { | |||
| t0 = da_r *x[0]; | |||
| t1 = da_r *x[2]; | |||
| t2 = da_r *x[4]; | |||
| t3 = da_r *x[6]; | |||
| t0 = da_r *x[0]; | |||
| t1 = da_r *x[2]; | |||
| t2 = da_r *x[4]; | |||
| t3 = da_r *x[6]; | |||
| x[1] = da_r * x[1]; | |||
| x[3] = da_r * x[3]; | |||
| x[5] = da_r * x[5]; | |||
| x[7] = da_r * x[7]; | |||
| x[0] = t0; | |||
| x[2] = t1; | |||
| x[4] = t2; | |||
| @@ -157,14 +157,14 @@ static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha , FLOAT *x ) | |||
| BLASLONG i; | |||
| for( i=0; i<n; i+=4 ) | |||
| { | |||
| x[0] = 0.0; | |||
| x[1] = 0.0; | |||
| x[2] = 0.0; | |||
| x[3] = 0.0; | |||
| x[4] = 0.0; | |||
| x[5] = 0.0; | |||
| x[6] = 0.0; | |||
| x[7] = 0.0; | |||
| x[0] = 0.0; | |||
| x[1] = 0.0; | |||
| x[2] = 0.0; | |||
| x[3] = 0.0; | |||
| x[4] = 0.0; | |||
| x[5] = 0.0; | |||
| x[6] = 0.0; | |||
| x[7] = 0.0; | |||
| x+=8; | |||
| } | |||
| @@ -186,10 +186,10 @@ static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_ | |||
| for ( i=0; i<n; i+=4 ) | |||
| { | |||
| t0 = da_r * x[0] - da_i *x[1]; | |||
| t1 = da_r * x[inc_x] - da_i *x[inc_x + 1]; | |||
| t2 = da_r * x[inc_x2] - da_i *x[inc_x2 + 1]; | |||
| t3 = da_r * x[inc_x3] - da_i *x[inc_x3 + 1]; | |||
| t0 = da_r * x[0] - da_i *x[1]; | |||
| t1 = da_r * x[inc_x] - da_i *x[inc_x + 1]; | |||
| t2 = da_r * x[inc_x2] - da_i *x[inc_x2 + 1]; | |||
| t3 = da_r * x[inc_x3] - da_i *x[inc_x3 + 1]; | |||
| x[1] = da_i * x[0] + da_r * x[1]; | |||
| x[inc_x +1] = da_i * x[inc_x] + da_r * x[inc_x +1]; | |||
| @@ -228,7 +228,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| { | |||
| while(j < n1) | |||
| { | |||
| x[i]=0.0; | |||
| x[i+1]=0.0; | |||
| x[i+inc_x]=0.0; | |||
| @@ -240,7 +240,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| while(j < n) | |||
| { | |||
| x[i]=0.0; | |||
| x[i+1]=0.0; | |||
| i += inc_x ; | |||
| @@ -253,11 +253,17 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| { | |||
| while(j < n1) | |||
| { | |||
| temp0 = -da_i * x[i+1]; | |||
| if (isnan(x[i]) || isinf(x[i])) | |||
| temp0 = NAN; | |||
| else | |||
| temp0 = -da_i * x[i+1]; | |||
| x[i+1] = da_i * x[i]; | |||
| x[i] = temp0; | |||
| temp1 = -da_i * x[i+1+inc_x]; | |||
| if (isnan(x[i+inc_x]) || isinf(x[i+inc_x])) | |||
| temp1 = NAN; | |||
| else | |||
| temp1 = -da_i * x[i+1+inc_x]; | |||
| x[i+1+inc_x] = da_i * x[i+inc_x]; | |||
| x[i+inc_x] = temp1; | |||
| i += 2*inc_x ; | |||
| @@ -267,8 +273,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| while(j < n) | |||
| { | |||
| temp0 = -da_i * x[i+1]; | |||
| if (isnan(x[i]) || isinf(x[i])) | |||
| temp0 = NAN; | |||
| else | |||
| temp0 = -da_i * x[i+1]; | |||
| x[i+1] = da_i * x[i]; | |||
| x[i] = temp0; | |||
| i += inc_x ; | |||
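The guards added here restore IEEE propagation on the alpha_r == 0.0 path: a full complex multiply would still form the da_r * x[i] product, and 0 * Inf (or 0 * NaN) is NaN, so the real part of the result must become NaN instead of just -da_i * x[i+1]. A minimal sketch of the behaviour the kernel is being aligned with, in plain C with the multiply written out the way the BLAS kernels do (not OpenBLAS code):

#include <math.h>
#include <stdio.h>

int main(void) {
    double da_r = 0.0, da_i = 0.5;      /* alpha with zero real part     */
    double x_r  = INFINITY, x_i = 1.0;  /* element with an Inf real part */

    /* textbook complex multiply written out in real arithmetic          */
    double y_r = da_r * x_r - da_i * x_i;  /* 0*Inf - 0.5  -> NaN        */
    double y_i = da_i * x_r + da_r * x_i;  /* 0.5*Inf + 0  -> Inf        */

    printf("%g %g\n", y_r, y_i);           /* prints: nan inf            */
    return 0;
}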
| @@ -291,7 +300,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| while(j < n1) | |||
| { | |||
| temp0 = da_r * x[i]; | |||
| x[i+1] = da_r * x[i+1]; | |||
| x[i] = temp0; | |||
| @@ -305,7 +314,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| while(j < n) | |||
| { | |||
| temp0 = da_r * x[i]; | |||
| x[i+1] = da_r * x[i+1]; | |||
| x[i] = temp0; | |||
| @@ -368,7 +377,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| } | |||
| i = n1 << 1; | |||
| j = n1; | |||
| if ( da_r == 0.0 || da_r != da_r ) | |||
| { | |||
| if ( da_i == 0.0 ) | |||
| @@ -385,7 +394,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| } | |||
| } | |||
| else if (da_r < -FLT_MAX || da_r > FLT_MAX) { | |||
| else if (da_r < -FLT_MAX || da_r > FLT_MAX) { | |||
| while(j < n) | |||
| { | |||
| x[i]= NAN; | |||
| @@ -404,7 +413,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| if (x[i] < -FLT_MAX || x[i] > FLT_MAX) | |||
| temp0 = NAN; | |||
| x[i+1] = da_i * x[i]; | |||
| if ( x[i] == x[i]) //preserve NaN | |||
| if ( x[i] == x[i]) //preserve NaN | |||
| x[i] = temp0; | |||
| i += 2 ; | |||
| j++; | |||
| @@ -420,7 +429,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| { | |||
| while(j < n) | |||
| { | |||
| temp0 = da_r * x[i]; | |||
| x[i+1] = da_r * x[i+1]; | |||
| x[i] = temp0; | |||
| @@ -442,7 +451,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| } | |||
| } | |||
| } | |||
| } | |||
| @@ -16,6 +16,7 @@ else () | |||
| test_dnrm2.c | |||
| test_swap.c | |||
| test_zscal.c | |||
| test_amin.c | |||
| ) | |||
| endif () | |||
| @@ -11,7 +11,8 @@ UTESTBIN=openblas_utest | |||
| include $(TOPDIR)/Makefile.system | |||
| OBJS=utest_main.o test_min.o test_amax.o test_ismin.o test_rotmg.o test_axpy.o test_dotu.o test_dsdot.o test_swap.o test_rot.o test_dnrm2.o test_zscal.o | |||
| OBJS=utest_main.o test_min.o test_amax.o test_ismin.o test_rotmg.o test_axpy.o test_dotu.o test_dsdot.o test_swap.o test_rot.o test_dnrm2.o test_zscal.o \ | |||
| test_amin.o | |||
| #test_rot.o test_swap.o test_axpy.o test_dotu.o test_dsdot.o test_fork.o | |||
| ifneq ($(NO_LAPACK), 1) | |||
| @@ -1,5 +1,5 @@ | |||
| /***************************************************************************** | |||
| Copyright (c) 2011-2016, The OpenBLAS Project | |||
| Copyright (c) 2011-2024, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| @@ -13,9 +13,9 @@ met: | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written | |||
| permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| @@ -57,4 +57,31 @@ CTEST(amax, damax){ | |||
| ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), DOUBLE_EPS); | |||
| } | |||
| #endif | |||
| #ifdef BUILD_COMPLEX | |||
| CTEST(amax, scamax){ | |||
| blasint N = 9, inc = 1; | |||
| float te_max = 0.0, tr_max = 0.0; | |||
| float x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8, | |||
| -9.9, 10.10, -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, | |||
| -7.7, 8.8 }; | |||
| te_max = BLASFUNC(scamax)(&N, x, &inc); | |||
| tr_max = 20.0; | |||
| ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), SINGLE_EPS); | |||
| } | |||
| #endif | |||
| #ifdef BUILD_COMPLEX16 | |||
| CTEST(amax, dzamax){ | |||
| blasint N = 9, inc = 1; | |||
| double te_max = 0.0, tr_max = 0.0; | |||
| double x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8, | |||
| -9.9, 10.10, -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, | |||
| -7.7, 8.8 }; | |||
| te_max = BLASFUNC(dzamax)(&N, x, &inc); | |||
| tr_max = 20.0; | |||
| ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), DOUBLE_EPS); | |||
| } | |||
| #endif | |||
| @@ -0,0 +1,89 @@ | |||
| /***************************************************************************** | |||
| Copyright (c) 2011-2024, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written | |||
| permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| **********************************************************************************/ | |||
| #include "openblas_utest.h" | |||
| #ifdef BUILD_SINGLE | |||
| CTEST(amin, samin){ | |||
| blasint N = 3, inc = 1; | |||
| float te_min = 0.0, tr_min = 0.0; | |||
| float x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8, | |||
| -9.9 }; | |||
| te_min = BLASFUNC(samin)(&N, x, &inc); | |||
| tr_min = 1.1; | |||
| ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), SINGLE_EPS); | |||
| } | |||
| #endif | |||
| #ifdef BUILD_DOUBLE | |||
| CTEST(amin, damin){ | |||
| blasint N = 3, inc = 1; | |||
| double te_min = 0.0, tr_min = 0.0; | |||
| double x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8, | |||
| -9.9 }; | |||
| te_min = BLASFUNC(damin)(&N, x, &inc); | |||
| tr_min = 1.1; | |||
| ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), DOUBLE_EPS); | |||
| } | |||
| #endif | |||
| #ifdef BUILD_COMPLEX | |||
| CTEST(amin, scamin){ | |||
| blasint N = 9, inc = 1; | |||
| float te_min = 0.0, tr_min = 0.0; | |||
| float x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8, | |||
| -9.9, 10.10, -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, | |||
| -7.7, 8.8 }; | |||
| te_min = BLASFUNC(scamin)(&N, x, &inc); | |||
| tr_min = 3.3; | |||
| ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), SINGLE_EPS); | |||
| } | |||
| #endif | |||
| #ifdef BUILD_COMPLEX16 | |||
| CTEST(amin, dzamin){ | |||
| blasint N = 9, inc = 1; | |||
| double te_min = 0.0, tr_min = 0.0; | |||
| double x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8, | |||
| -9.9, 10.10, -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, | |||
| -7.7, 8.8 }; | |||
| te_min = BLASFUNC(dzamin)(&N, x, &inc); | |||
| tr_min = 3.3; | |||
| ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), DOUBLE_EPS); | |||
| } | |||
| #endif | |||
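For the complex cases the expected constants follow the |Re| + |Im| reduction implemented by the kernels: for the vector used above, min_i(|Re x_i| + |Im x_i|) = |-1.1| + |2.2| = 3.3, and the corresponding scamax/dzamax tests expect 9.9 + 10.10 = 20.0. A reference loop one could use to cross-check such constants (a sketch, not part of the test suite):

#include <math.h>

/* reference camin: minimum of |Re|+|Im| over n interleaved complex elements */
static double ref_camin(const double *x, int n, int inc)
{
    double best = fabs(x[0]) + fabs(x[1]);
    for (int i = 1; i < n; i++) {
        double v = fabs(x[2 * i * inc]) + fabs(x[2 * i * inc + 1]);
        if (v < best) best = v;
    }
    return best;
}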
| @@ -20,6 +20,18 @@ CTEST(zscal, i_nan) | |||
| ASSERT_TRUE(isnan(nan[17])); | |||
| } | |||
| CTEST(zscal, i_nan_inc_2) | |||
| { | |||
| double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; | |||
| double nan[] = {NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, | |||
| NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0}; | |||
| cblas_zscal(9, i, &nan, 2); | |||
| ASSERT_TRUE(isnan(nan[0])); | |||
| ASSERT_TRUE(isnan(nan[1])); | |||
| ASSERT_TRUE(isnan(nan[16])); | |||
| ASSERT_TRUE(isnan(nan[17])); | |||
| } | |||
| CTEST(zscal, nan_i) | |||
| { | |||
| double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; | |||
| @@ -30,7 +42,19 @@ CTEST(zscal, nan_i) | |||
| ASSERT_TRUE(isnan(i[16])); | |||
| ASSERT_TRUE(isnan(i[17])); | |||
| } | |||
| CTEST(zscal, nan_i_inc_2) | |||
| { | |||
| double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, | |||
| 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; | |||
| double nan[] = {NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0}; | |||
| cblas_zscal(9, &nan, &i, 2); | |||
| ASSERT_TRUE(isnan(i[0])); | |||
| ASSERT_TRUE(isnan(i[1])); | |||
| ASSERT_TRUE(isnan(i[16])); | |||
| ASSERT_TRUE(isnan(i[17])); | |||
| } | |||
| CTEST(zscal, i_inf) | |||
| { | |||
| double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; | |||
| @@ -40,7 +64,19 @@ CTEST(zscal, i_inf) | |||
| ASSERT_TRUE(isinf(inf[1])); | |||
| ASSERT_TRUE(isnan(inf[16])); | |||
| ASSERT_TRUE(isinf(inf[17])); | |||
| } | |||
| } | |||
| CTEST(zscal, i_inf_inc_2) | |||
| { | |||
| double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; | |||
| double inf[] = {INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, | |||
| INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0}; | |||
| cblas_zscal(9, i, &inf, 2); | |||
| ASSERT_TRUE(isnan(inf[0])); | |||
| ASSERT_TRUE(isinf(inf[1])); | |||
| ASSERT_TRUE(isnan(inf[16])); | |||
| ASSERT_TRUE(isinf(inf[17])); | |||
| } | |||
| CTEST(zscal, inf_i) | |||
| { | |||
| @@ -53,4 +89,16 @@ CTEST(zscal, inf_i) | |||
| ASSERT_TRUE(isinf(i[17])); | |||
| } | |||
| CTEST(zscal, inf_i_inc_2) | |||
| { | |||
| double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, | |||
| 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; | |||
| double inf[] = {INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0}; | |||
| cblas_zscal(9, &inf, &i, 2); | |||
| ASSERT_TRUE(isnan(i[0])); | |||
| ASSERT_TRUE(isinf(i[1])); | |||
| ASSERT_TRUE(isnan(i[16])); | |||
| ASSERT_TRUE(isinf(i[17])); | |||
| } | |||
| #endif | |||