@@ -14,10 +14,12 @@ ZSCALKERNEL = cscal_lsx.S
 SAMAXKERNEL = amax_lsx.S
 DAMAXKERNEL = amax_lsx.S
 CAMAXKERNEL = camax_lsx.S
+ZAMAXKERNEL = camax_lsx.S
 SAMINKERNEL = amin_lsx.S
 DAMINKERNEL = amin_lsx.S
 CAMINKERNEL = camin_lsx.S
+ZAMINKERNEL = camin_lsx.S
 SMAXKERNEL = max_lsx.S
 DMAXKERNEL = max_lsx.S
@@ -14,10 +14,12 @@ ZSCALKERNEL = cscal_lasx.S
 SAMAXKERNEL = amax_lasx.S
 DAMAXKERNEL = amax_lasx.S
 CAMAXKERNEL = camax_lasx.S
+ZAMAXKERNEL = camax_lasx.S
 SAMINKERNEL = amin_lasx.S
 DAMINKERNEL = amin_lasx.S
 CAMINKERNEL = camin_lasx.S
+ZAMINKERNEL = camin_lasx.S
 SMAXKERNEL = max_lsx.S
 DMAXKERNEL = max_lsx.S
@@ -63,42 +63,60 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     bge $r0, N, .L999
     bge $r0, INCX, .L999
     li.d TEMP, 1
-    li.w I, -1
     slli.d TEMP, TEMP, ZBASE_SHIFT
     slli.d INCX, INCX, ZBASE_SHIFT
-    xvreplgr2vr.w neg1, I
-    xvffint.s.w neg1, neg1
     srai.d I, N, 3
     bne INCX, TEMP, .L20
     bge $r0, I, .L23
     .align 3
 .L10:
-    xvld VX0, X, 0 * SIZE
-    xvld VX1, X, 8 * SIZE
-    addi.d I, I, -1
+    xvld VX0, X, 0
+    xvld VX1, X, 32
+#ifdef DOUBLE
+    xvpickev.d x1, VX1, VX0
+    xvpickod.d x2, VX1, VX0
+#else
     xvpickev.w x1, VX1, VX0
     xvpickod.w x2, VX1, VX0
-    xvfmul.s x3, neg1, x1
-    xvfmul.s x4, neg1, x2
-    xvfcmp.clt.s VT0, x1, res0
-    xvfcmp.clt.s VT1, x2, res0
-    xvbitsel.v x1, x1, x3, VT0
-    xvbitsel.v x2, x2, x4, VT1
+#endif
+    XVFSUB x3, res0, x1
+    XVFSUB x4, res0, x2
+    XVFMAX x1, x1, x3
+    XVFMAX x2, x2, x4
+    XVFADD VM1, x1, x2
+    XVFMAX VM0, VM0, VM1
+#ifdef DOUBLE
+    xvld VX0, X, 64
+    xvld VX1, X, 96
+    xvpickev.d x1, VX1, VX0
+    xvpickod.d x2, VX1, VX0
+    XVFSUB x3, res0, x1
+    XVFSUB x4, res0, x2
+    XVFMAX x1, x1, x3
+    XVFMAX x2, x2, x4
+    XVFADD VM1, x1, x2
+    XVFMAX VM0, VM0, VM1
+#endif
+    addi.d I, I, -1
     addi.d X, X, 16 * SIZE
-    xvfadd.s VM1, x1, x2
-    xvfmax.s VM0, VM0, VM1
     blt $r0, I, .L10
     .align 3
 .L11:
+#ifdef DOUBLE
+    xvpickve.d x1, VM0, 0
+    xvpickve.d x2, VM0, 1
+    XVFMAX VM0, x1, x2
+#else
     xvpickve.w x1, VM0, 0
     xvpickve.w x2, VM0, 1
     xvpickve.w x3, VM0, 2
     xvpickve.w x4, VM0, 3
-    xvfmax.s VM1, x1, x2
-    xvfmax.s VM0, x3, x4
-    xvfmax.s VM0, VM0, VM1
+    XVFMAX VM0, x1, x2
+    XVFMAX VM1, x3, x4
+    XVFMAX VM0, VM0, VM1
+#endif
     b .L23
     .align 3
@@ -107,66 +125,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     .align 3
 .L21:
-    fld.s t1, X, 0 * SIZE
-    fld.s t2, X, 1 * SIZE
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
     add.d X, X, INCX
-    fld.s t3, X, 0 * SIZE
-    fld.s t4, X, 1 * SIZE
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
     add.d X, X, INCX
-    fabs.s t1, t1
-    fabs.s t2, t2
-    fabs.s t3, t3
-    fabs.s t4, t4
-    fadd.s t1, t1, t2
-    fadd.s t3, t3, t4
-    fmax.s s1, t1, t3
-    fld.s t1, X, 0 * SIZE
-    fld.s t2, X, 1 * SIZE
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMAX s1, t1, t3
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
     add.d X, X, INCX
-    fld.s t3, X, 0 * SIZE
-    fld.s t4, X, 1 * SIZE
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
     add.d X, X, INCX
-    fabs.s t1, t1
-    fabs.s t2, t2
-    fabs.s t3, t3
-    fabs.s t4, t4
-    fadd.s t1, t1, t2
-    fadd.s t3, t3, t4
-    fmax.s s1, t1, t3
-    fld.s t1, X, 0 * SIZE
-    fld.s t2, X, 1 * SIZE
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMAX s1, t1, t3
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
     add.d X, X, INCX
-    fld.s t3, X, 0 * SIZE
-    fld.s t4, X, 1 * SIZE
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
     add.d X, X, INCX
-    fabs.s t1, t1
-    fabs.s t2, t2
-    fabs.s t3, t3
-    fabs.s t4, t4
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
     addi.d I, I, -1
-    fadd.s t1, t1, t2
-    fadd.s t3, t3, t4
-    fmax.s s3, t1, t3
-    fld.s t1, X, 0 * SIZE
-    fld.s t2, X, 1 * SIZE
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMAX s3, t1, t3
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
     add.d X, X, INCX
-    fld.s t3, X, 0 * SIZE
-    fld.s t4, X, 1 * SIZE
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
     add.d X, X, INCX
-    fabs.s t1, t1
-    fabs.s t2, t2
-    fabs.s t3, t3
-    fabs.s t4, t4
-    fadd.s t1, t1, t2
-    fadd.s t3, t3, t4
-    fmax.s s4, t1, t3
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMAX s4, t1, t3
     blt $r0, I, .L21
     .align 3
 .L22:
-    fmax.s s1, s1, s2
-    fmax.s s3, s3, s4
-    fmax.s s1, s1, s3
+    FMAX s1, s1, s2
+    FMAX s3, s3, s4
+    FMAX s1, s1, s3
     .align 3
 .L23: //N<8
@@ -182,12 +200,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     FABS a1, a1
     ADD a0, a0, a1
     add.d X, X, INCX
-    fmax.s s1, a0, s1
+    FMAX s1, a0, s1
     blt $r0, I, .L24
     .align 3
 .L999:
-    fmov.s $f0, $f22
+    MOV $f0, $f22
     jirl $r0, $r1, 0x0
     .align 3
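For reference, the quantity these camax kernels reduce is the complex one-norm maximum, max_i(|Re(x_i)| + |Im(x_i)|). The rewrite drops the old single-precision-only sign select (xvfcmp.clt against zero plus xvbitsel with a precomputed -1.0 vector) in favor of a branch-free absolute value, |x| = max(x, 0 - x), which the XVFSUB/XVFMAX macro pair expresses identically for float and double. A minimal C sketch of the same idea (function names here are illustrative, not from the source):

#include <stddef.h>

/* |x| as max(x, 0 - x), mirroring XVFSUB x3, res0, x1
 * followed by XVFMAX x1, x1, x3 in the vector loop above. */
static double abs_via_max(double x) {
    double neg = 0.0 - x;
    return x > neg ? x : neg;
}

/* Scalar reference for CAMAX/ZAMAX: max of |re| + |im| over n
 * complex elements with stride incx (in complex elements). */
double camax_ref(size_t n, const double *x, size_t incx) {
    double vm0 = abs_via_max(x[0]) + abs_via_max(x[1]);
    for (size_t i = 1; i < n; i++) {
        const double *p = x + 2 * i * incx;
        double s = abs_via_max(p[0]) + abs_via_max(p[1]);
        if (s > vm0) vm0 = s;
    }
    return vm0;
}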
@@ -63,54 +63,87 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     bge $r0, N, .L999
     bge $r0, INCX, .L999
     li.d TEMP, 1
-    li.w I, -1
     slli.d TEMP, TEMP, ZBASE_SHIFT
     slli.d INCX, INCX, ZBASE_SHIFT
-    vreplgr2vr.w neg1, I
-    vffint.s.w neg1, neg1
     srai.d I, N, 3
     bne INCX, TEMP, .L20
     bge $r0, I, .L23
     .align 3
 .L10:
-    vld VX0, X, 0 * SIZE
-    vld VX1, X, 4 * SIZE
-    addi.d I, I, -1
+    vld VX0, X, 0
+    vld VX1, X, 16
+#ifdef DOUBLE
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+#else
     vpickev.w x1, VX1, VX0
     vpickod.w x2, VX1, VX0
-    vfmul.s x3, neg1, x1
-    vfmul.s x4, neg1, x2
-    vfcmp.clt.s VT0, x1, res0
-    vfcmp.clt.s VT1, x2, res0
-    vld VX0, X, 8 * SIZE
-    vbitsel.v x1, x1, x3, VT0
-    vbitsel.v x2, x2, x4, VT1
-    vld VX1, X, 12 * SIZE
-    vfadd.s VM1, x1, x2
+#endif
+    VFSUB x3, res0, x1
+    VFSUB x4, res0, x2
+    VFMAX x1, x1, x3
+    VFMAX x2, x2, x4
+    VFADD VM1, x1, x2
+    vld VX0, X, 32
+    vld VX1, X, 48
+#ifdef DOUBLE
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+#else
     vpickev.w x1, VX1, VX0
     vpickod.w x2, VX1, VX0
-    vfmul.s x3, neg1, x1
-    vfmul.s x4, neg1, x2
-    vfcmp.clt.s VT0, x1, res0
-    vfcmp.clt.s VT1, x2, res0
+#endif
+    VFSUB x3, res0, x1
+    VFSUB x4, res0, x2
+    VFMAX x1, x1, x3
+    VFMAX x2, x2, x4
+    VFADD x1, x1, x2
+    VFMAX VM1, x1, VM1
+    VFMAX VM0, VM0, VM1
+#ifdef DOUBLE
+    vld VX0, X, 64
+    vld VX1, X, 80
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    VFSUB x3, res0, x1
+    VFSUB x4, res0, x2
+    VFMAX x1, x1, x3
+    VFMAX x2, x2, x4
+    VFADD VM1, x1, x2
+    vld VX0, X, 96
+    vld VX1, X, 112
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    VFSUB x3, res0, x1
+    VFSUB x4, res0, x2
+    VFMAX x1, x1, x3
+    VFMAX x2, x2, x4
+    VFADD x1, x1, x2
+    VFMAX VM1, x1, VM1
+    VFMAX VM0, VM0, VM1
+#endif
     addi.d X, X, 16 * SIZE
-    vbitsel.v x1, x1, x3, VT0
-    vbitsel.v x2, x2, x4, VT1
-    vfadd.s x1, x1, x2
-    vfmax.s VM1, x1, VM1
-    vfmax.s VM0, VM0, VM1
+    addi.d I, I, -1
     blt $r0, I, .L10
     .align 3
 .L11:
+#ifdef DOUBLE
+    vreplvei.d x1, VM0, 0
+    vreplvei.d x2, VM0, 1
+    VFMAX VM0, x1, x2
+#else
     vreplvei.w x1, VM0, 0
     vreplvei.w x2, VM0, 1
     vreplvei.w x3, VM0, 2
     vreplvei.w x4, VM0, 3
-    vfmax.s VM1, x1, x2
-    vfmax.s VM0, x3, x4
-    vfmax.s VM0, VM0, VM1
+    VFMAX VM1, x1, x2
+    VFMAX VM0, x3, x4
+    VFMAX VM0, VM0, VM1
+#endif
     b .L23
     .align 3
@@ -119,66 +152,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     .align 3
 .L21:
-    fld.s t1, X, 0 * SIZE
-    fld.s t2, X, 1 * SIZE
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
     add.d X, X, INCX
-    fld.s t3, X, 0 * SIZE
-    fld.s t4, X, 1 * SIZE
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
     add.d X, X, INCX
-    fabs.s t1, t1
-    fabs.s t2, t2
-    fabs.s t3, t3
-    fabs.s t4, t4
-    fadd.s t1, t1, t2
-    fadd.s t3, t3, t4
-    fmax.s s1, t1, t3
-    fld.s t1, X, 0 * SIZE
-    fld.s t2, X, 1 * SIZE
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMAX s1, t1, t3
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
     add.d X, X, INCX
-    fld.s t3, X, 0 * SIZE
-    fld.s t4, X, 1 * SIZE
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
     add.d X, X, INCX
-    fabs.s t1, t1
-    fabs.s t2, t2
-    fabs.s t3, t3
-    fabs.s t4, t4
-    fadd.s t1, t1, t2
-    fadd.s t3, t3, t4
-    fmax.s s1, t1, t3
-    fld.s t1, X, 0 * SIZE
-    fld.s t2, X, 1 * SIZE
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMAX s1, t1, t3
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
     add.d X, X, INCX
-    fld.s t3, X, 0 * SIZE
-    fld.s t4, X, 1 * SIZE
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
     add.d X, X, INCX
-    fabs.s t1, t1
-    fabs.s t2, t2
-    fabs.s t3, t3
-    fabs.s t4, t4
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
     addi.d I, I, -1
-    fadd.s t1, t1, t2
-    fadd.s t3, t3, t4
-    fmax.s s3, t1, t3
-    fld.s t1, X, 0 * SIZE
-    fld.s t2, X, 1 * SIZE
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMAX s3, t1, t3
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
     add.d X, X, INCX
-    fld.s t3, X, 0 * SIZE
-    fld.s t4, X, 1 * SIZE
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
     add.d X, X, INCX
-    fabs.s t1, t1
-    fabs.s t2, t2
-    fabs.s t3, t3
-    fabs.s t4, t4
-    fadd.s t1, t1, t2
-    fadd.s t3, t3, t4
-    fmax.s s4, t1, t3
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMAX s4, t1, t3
     blt $r0, I, .L21
     .align 3
 .L22:
-    fmax.s s1, s1, s2
-    fmax.s s3, s3, s4
-    fmax.s s1, s1, s3
+    FMAX s1, s1, s2
+    FMAX s3, s3, s4
+    FMAX s1, s1, s3
     .align 3
 .L23: //N<8
@@ -187,19 +220,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     .align 3
 .L24:
-    fld.s a0, X, 0 * SIZE
-    fld.s a1, X, 1 * SIZE
+    LD a0, X, 0 * SIZE
+    LD a1, X, 1 * SIZE
     addi.d I, I, -1
-    fabs.s a0, a0
-    fabs.s a1, a1
-    fadd.s a0, a0, a1
+    FABS a0, a0
+    FABS a1, a1
+    ADD a0, a0, a1
     add.d X, X, INCX
-    fmax.s s1, a0, s1
+    FMAX s1, a0, s1
     blt $r0, I, .L24
     .align 3
 .L999:
-    fmov.s $f0, $f22
+    MOV $f0, $f22
    jirl $r0, $r1, 0x0
     .align 3
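In both the LSX and LASX loops, the loaded registers hold interleaved complex pairs, and vpickev/vpickod split them into one vector of real parts and one of imaginary parts before the absolute values are summed. A plain-C stand-in for that shuffle, under the assumption of 4-lane 128-bit LSX vectors (the function name is illustrative):

/* Stand-in for the vpickev.w/vpickod.w pair: from two loaded
 * vectors {re0,im0,re1,im1} and {re2,im2,re3,im3}, gather the
 * even-indexed lanes (real parts) into one output vector and the
 * odd-indexed lanes (imaginary parts) into the other.  The low
 * half of each result comes from the first source (VX0). */
void pick_even_odd(const float vx0[4], const float vx1[4],
                   float re[4], float im[4]) {
    const float *src[2] = { vx0, vx1 };
    for (int v = 0; v < 2; v++) {
        for (int i = 0; i < 2; i++) {
            re[2 * v + i] = src[v][2 * i];      /* vpickev: even lanes */
            im[2 * v + i] = src[v][2 * i + 1];  /* vpickod: odd lanes  */
        }
    }
}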
@@ -61,49 +61,71 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     xvxor.v res0, res0, res0
     bge $r0, N, .L999
     bge $r0, INCX, .L999
-    fld.s a0, X, 0 * SIZE
-    fld.s a1, X, 1 * SIZE
-    fabs.s a0, a0
-    fabs.s a1, a1
-    fadd.s s1, a1, a0
+    LD a0, X, 0 * SIZE
+    LD a1, X, 1 * SIZE
+    FABS a0, a0
+    FABS a1, a1
+    ADD s1, a1, a0
+#ifdef DOUBLE
+    xvreplve0.d VM0, VM0
+#else
     xvreplve0.w VM0, VM0
+#endif
     li.d TEMP, 1
-    li.w I, -1
     slli.d TEMP, TEMP, ZBASE_SHIFT
     slli.d INCX, INCX, ZBASE_SHIFT
-    xvreplgr2vr.w neg1, I
-    xvffint.s.w neg1, neg1
     srai.d I, N, 3
     bne INCX, TEMP, .L20
     bge $r0, I, .L23
     .align 3
 .L10:
-    xvld VX0, X, 0 * SIZE
-    xvld VX1, X, 8 * SIZE
-    addi.d I, I, -1
+    xvld VX0, X, 0
+    xvld VX1, X, 32
+#ifdef DOUBLE
+    xvpickev.d x1, VX1, VX0
+    xvpickod.d x2, VX1, VX0
+#else
     xvpickev.w x1, VX1, VX0
     xvpickod.w x2, VX1, VX0
-    xvfmul.s x3, neg1, x1
-    xvfmul.s x4, neg1, x2
-    xvfcmp.clt.s VT0, x1, res0
-    xvfcmp.clt.s VT1, x2, res0
-    xvbitsel.v x1, x1, x3, VT0
-    xvbitsel.v x2, x2, x4, VT1
+#endif
+    XVFSUB x3, res0, x1
+    XVFSUB x4, res0, x2
+    XVFMAX x1, x1, x3
+    XVFMAX x2, x2, x4
+    XVFADD VM1, x1, x2
+    XVFMIN VM0, VM0, VM1
+#ifdef DOUBLE
+    xvld VX0, X, 64
+    xvld VX1, X, 96
+    xvpickev.d x1, VX1, VX0
+    xvpickod.d x2, VX1, VX0
+    XVFSUB x3, res0, x1
+    XVFSUB x4, res0, x2
+    XVFMAX x1, x1, x3
+    XVFMAX x2, x2, x4
+    XVFADD VM1, x1, x2
+    XVFMIN VM0, VM0, VM1
+#endif
+    addi.d I, I, -1
     addi.d X, X, 16 * SIZE
-    xvfadd.s VM1, x1, x2
-    xvfmin.s VM0, VM0, VM1
     blt $r0, I, .L10
     .align 3
 .L11:
+#ifdef DOUBLE
+    xvpickve.d x1, VM0, 0
+    xvpickve.d x2, VM0, 1
+    XVFMIN VM0, x1, x2
+#else
     xvpickve.w x1, VM0, 0
     xvpickve.w x2, VM0, 1
     xvpickve.w x3, VM0, 2
     xvpickve.w x4, VM0, 3
-    xvfmin.s VM1, x1, x2
-    xvfmin.s VM0, x3, x4
-    xvfmin.s VM0, VM0, VM1
+    XVFMIN VM0, x1, x2
+    XVFMIN VM1, x3, x4
+    XVFMIN VM0, VM0, VM1
+#endif
     b .L23
     .align 3
@@ -112,66 +134,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     .align 3
 .L21:
-    fld.s t1, X, 0 * SIZE
-    fld.s t2, X, 1 * SIZE
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
     add.d X, X, INCX
-    fld.s t3, X, 0 * SIZE
-    fld.s t4, X, 1 * SIZE
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
     add.d X, X, INCX
-    fabs.s t1, t1
-    fabs.s t2, t2
-    fabs.s t3, t3
-    fabs.s t4, t4
-    fadd.s t1, t1, t2
-    fadd.s t3, t3, t4
-    fmin.s s1, t1, t3
-    fld.s t1, X, 0 * SIZE
-    fld.s t2, X, 1 * SIZE
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMIN s1, t1, t3
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
     add.d X, X, INCX
-    fld.s t3, X, 0 * SIZE
-    fld.s t4, X, 1 * SIZE
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
     add.d X, X, INCX
-    fabs.s t1, t1
-    fabs.s t2, t2
-    fabs.s t3, t3
-    fabs.s t4, t4
-    fadd.s t1, t1, t2
-    fadd.s t3, t3, t4
-    fmin.s s1, t1, t3
-    fld.s t1, X, 0 * SIZE
-    fld.s t2, X, 1 * SIZE
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMIN s1, t1, t3
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
     add.d X, X, INCX
-    fld.s t3, X, 0 * SIZE
-    fld.s t4, X, 1 * SIZE
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
     add.d X, X, INCX
-    fabs.s t1, t1
-    fabs.s t2, t2
-    fabs.s t3, t3
-    fabs.s t4, t4
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
     addi.d I, I, -1
-    fadd.s t1, t1, t2
-    fadd.s t3, t3, t4
-    fmin.s s3, t1, t3
-    fld.s t1, X, 0 * SIZE
-    fld.s t2, X, 1 * SIZE
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMIN s3, t1, t3
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
     add.d X, X, INCX
-    fld.s t3, X, 0 * SIZE
-    fld.s t4, X, 1 * SIZE
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
     add.d X, X, INCX
-    fabs.s t1, t1
-    fabs.s t2, t2
-    fabs.s t3, t3
-    fabs.s t4, t4
-    fadd.s t1, t1, t2
-    fadd.s t3, t3, t4
-    fmin.s s4, t1, t3
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMIN s4, t1, t3
     blt $r0, I, .L21
     .align 3
 .L22:
-    fmin.s s1, s1, s2
-    fmin.s s3, s3, s4
-    fmin.s s1, s1, s3
+    FMIN s1, s1, s2
+    FMIN s3, s3, s4
+    FMIN s1, s1, s3
     .align 3
 .L23: //N<8
@@ -187,12 +209,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     FABS a1, a1
     ADD a0, a0, a1
     add.d X, X, INCX
-    fmin.s s1, a0, s1
+    FMIN s1, a0, s1
     blt $r0, I, .L24
     .align 3
 .L999:
-    fmov.s $f0, $f22
+    MOV $f0, $f22
     jirl $r0, $r1, 0x0
     .align 3
@@ -61,61 +61,98 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vxor.v res0, res0, res0
     bge $r0, N, .L999
     bge $r0, INCX, .L999
-    fld.s a0, X, 0 * SIZE
-    fld.s a1, X, 1 * SIZE
-    fabs.s a0, a0
-    fabs.s a1, a1
-    fadd.s s1, a1, a0
+    LD a0, X, 0 * SIZE
+    LD a1, X, 1 * SIZE
+    FABS a0, a0
+    FABS a1, a1
+    ADD s1, a1, a0
+#ifdef DOUBLE
+    vreplvei.d VM0, VM0, 0
+#else
     vreplvei.w VM0, VM0, 0
+#endif
     li.d TEMP, 1
-    li.w I, -1
     slli.d TEMP, TEMP, ZBASE_SHIFT
     slli.d INCX, INCX, ZBASE_SHIFT
-    vreplgr2vr.w neg1, I
-    vffint.s.w neg1, neg1
     srai.d I, N, 3
     bne INCX, TEMP, .L20
     bge $r0, I, .L23
     .align 3
 .L10:
-    vld VX0, X, 0 * SIZE
-    vld VX1, X, 4 * SIZE
-    addi.d I, I, -1
+    vld VX0, X, 0
+    vld VX1, X, 16
+#ifdef DOUBLE
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+#else
     vpickev.w x1, VX1, VX0
     vpickod.w x2, VX1, VX0
-    vfmul.s x3, neg1, x1
-    vfmul.s x4, neg1, x2
-    vfcmp.clt.s VT0, x1, res0
-    vfcmp.clt.s VT1, x2, res0
-    vld VX0, X, 8 * SIZE
-    vbitsel.v x1, x1, x3, VT0
-    vbitsel.v x2, x2, x4, VT1
-    vld VX1, X, 12 * SIZE
-    vfadd.s VM1, x1, x2
+#endif
+    VFSUB x3, res0, x1
+    VFSUB x4, res0, x2
+    VFMAX x1, x1, x3
+    VFMAX x2, x2, x4
+    VFADD VM1, x1, x2
+    vld VX0, X, 32
+    vld VX1, X, 48
+#ifdef DOUBLE
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+#else
     vpickev.w x1, VX1, VX0
     vpickod.w x2, VX1, VX0
-    vfmul.s x3, neg1, x1
-    vfmul.s x4, neg1, x2
-    vfcmp.clt.s VT0, x1, res0
-    vfcmp.clt.s VT1, x2, res0
+#endif
+    VFSUB x3, res0, x1
+    VFSUB x4, res0, x2
+    VFMAX x1, x1, x3
+    VFMAX x2, x2, x4
+    VFADD x1, x1, x2
+    VFMIN VM1, x1, VM1
+    VFMIN VM0, VM0, VM1
+#ifdef DOUBLE
+    vld VX0, X, 64
+    vld VX1, X, 80
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    VFSUB x3, res0, x1
+    VFSUB x4, res0, x2
+    VFMAX x1, x1, x3
+    VFMAX x2, x2, x4
+    VFADD VM1, x1, x2
+    vld VX0, X, 96
+    vld VX1, X, 112
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    VFSUB x3, res0, x1
+    VFSUB x4, res0, x2
+    VFMAX x1, x1, x3
+    VFMAX x2, x2, x4
+    VFADD x1, x1, x2
+    VFMIN VM1, x1, VM1
+    VFMIN VM0, VM0, VM1
+#endif
+    addi.d I, I, -1
     addi.d X, X, 16 * SIZE
-    vbitsel.v x1, x1, x3, VT0
-    vbitsel.v x2, x2, x4, VT1
-    vfadd.s x1, x1, x2
-    vfmin.s VM1, x1, VM1
-    vfmin.s VM0, VM0, VM1
     blt $r0, I, .L10
     .align 3
 .L11:
+#ifdef DOUBLE
+    vreplvei.d x1, VM0, 0
+    vreplvei.d x2, VM0, 1
+    VFMIN VM0, x1, x2
+#else
     vreplvei.w x1, VM0, 0
     vreplvei.w x2, VM0, 1
     vreplvei.w x3, VM0, 2
     vreplvei.w x4, VM0, 3
-    vfmin.s VM1, x1, x2
-    vfmin.s VM0, x3, x4
-    vfmin.s VM0, VM0, VM1
+    VFMIN VM1, x1, x2
+    VFMIN VM0, x3, x4
+    VFMIN VM0, VM0, VM1
+#endif
     b .L23
     .align 3
@@ -124,66 +161,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     .align 3
 .L21:
-    fld.s t1, X, 0 * SIZE
-    fld.s t2, X, 1 * SIZE
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
     add.d X, X, INCX
-    fld.s t3, X, 0 * SIZE
-    fld.s t4, X, 1 * SIZE
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
     add.d X, X, INCX
-    fabs.s t1, t1
-    fabs.s t2, t2
-    fabs.s t3, t3
-    fabs.s t4, t4
-    fadd.s t1, t1, t2
-    fadd.s t3, t3, t4
-    fmin.s s1, t1, t3
-    fld.s t1, X, 0 * SIZE
-    fld.s t2, X, 1 * SIZE
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMIN s1, t1, t3
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
     add.d X, X, INCX
-    fld.s t3, X, 0 * SIZE
-    fld.s t4, X, 1 * SIZE
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
     add.d X, X, INCX
-    fabs.s t1, t1
-    fabs.s t2, t2
-    fabs.s t3, t3
-    fabs.s t4, t4
-    fadd.s t1, t1, t2
-    fadd.s t3, t3, t4
-    fmin.s s1, t1, t3
-    fld.s t1, X, 0 * SIZE
-    fld.s t2, X, 1 * SIZE
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMIN s1, t1, t3
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
     add.d X, X, INCX
-    fld.s t3, X, 0 * SIZE
-    fld.s t4, X, 1 * SIZE
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
     add.d X, X, INCX
-    fabs.s t1, t1
-    fabs.s t2, t2
-    fabs.s t3, t3
-    fabs.s t4, t4
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
     addi.d I, I, -1
-    fadd.s t1, t1, t2
-    fadd.s t3, t3, t4
-    fmin.s s3, t1, t3
-    fld.s t1, X, 0 * SIZE
-    fld.s t2, X, 1 * SIZE
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMIN s3, t1, t3
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
     add.d X, X, INCX
-    fld.s t3, X, 0 * SIZE
-    fld.s t4, X, 1 * SIZE
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
     add.d X, X, INCX
-    fabs.s t1, t1
-    fabs.s t2, t2
-    fabs.s t3, t3
-    fabs.s t4, t4
-    fadd.s t1, t1, t2
-    fadd.s t3, t3, t4
-    fmin.s s4, t1, t3
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMIN s4, t1, t3
     blt $r0, I, .L21
     .align 3
 .L22:
-    fmin.s s1, s1, s2
-    fmin.s s3, s3, s4
-    fmin.s s1, s1, s3
+    FMIN s1, s1, s2
+    FMIN s3, s3, s4
+    FMIN s1, s1, s3
     .align 3
 .L23: //N<8
@@ -192,19 +229,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     .align 3
 .L24:
-    fld.s a0, X, 0 * SIZE
-    fld.s a1, X, 1 * SIZE
+    LD a0, X, 0 * SIZE
+    LD a1, X, 1 * SIZE
     addi.d I, I, -1
-    fabs.s a0, a0
-    fabs.s a1, a1
-    fadd.s a0, a0, a1
+    FABS a0, a0
+    FABS a1, a1
+    ADD a0, a0, a1
     add.d X, X, INCX
-    fmin.s s1, a0, s1
+    FMIN s1, a0, s1
     blt $r0, I, .L24
     .align 3
 .L999:
-    fmov.s $f0, $f22
+    MOV $f0, $f22
     jirl $r0, $r1, 0x0
     .align 3
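A note on the extra loads in the DOUBLE branches: each .L10 iteration consumes 8 complex elements (srai.d I, N, 3 divides N by 8), so the byte offsets follow directly from the element size. A quick sanity check in C, assuming SIZE is the per-element byte size as in the OpenBLAS kernel macros (4 for single, 8 for double):

#include <assert.h>

int main(void) {
    /* One .L10 iteration covers 8 complex elements. */
    assert(8 * 2 * sizeof(float)  ==  64);  /* two 32-byte xvld: offsets 0, 32  */
    assert(8 * 2 * sizeof(double) == 128);  /* four xvld: offsets 0, 32, 64, 96 */
    /* "addi.d X, X, 16 * SIZE" advances the pointer by the same amount. */
    assert(16 * sizeof(float)  ==  64);
    assert(16 * sizeof(double) == 128);
    return 0;
}

The same arithmetic explains the 128-bit LSX variant, where the float path uses 16-byte vld at offsets 0, 16, 32, 48 and the double path extends to 64, 80, 96, 112.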