@@ -14,10 +14,12 @@ ZSCALKERNEL = cscal_lsx.S | |||
# Kernel source selection for the AMAX, AMIN and MAX families.
# The S and D entries of a family name the same source file, and so do the C and Z entries.
# NOTE(review): every line ends in a stray pipe sequence that looks like table residue from an extracted diff, not build syntax; confirm before using this text as a build file.
SAMAXKERNEL = amax_lsx.S | |||
DAMAXKERNEL = amax_lsx.S | |||
CAMAXKERNEL = camax_lsx.S | |||
ZAMAXKERNEL = camax_lsx.S | |||
SAMINKERNEL = amin_lsx.S | |||
DAMINKERNEL = amin_lsx.S | |||
CAMINKERNEL = camin_lsx.S | |||
ZAMINKERNEL = camin_lsx.S | |||
SMAXKERNEL = max_lsx.S | |||
DMAXKERNEL = max_lsx.S | |||
@@ -14,10 +14,12 @@ ZSCALKERNEL = cscal_lasx.S | |||
# Kernel source selection for the AMAX, AMIN and MAX families.
# The S and D entries of a family name the same source file, and so do the C and Z entries.
# NOTE(review): the two MAX entries at the end name *_lsx.S sources while every other entry in this list names a *_lasx.S source; confirm this is intentional.
# NOTE(review): every line ends in a stray pipe sequence that looks like table residue from an extracted diff, not build syntax; confirm before using this text as a build file.
SAMAXKERNEL = amax_lasx.S | |||
DAMAXKERNEL = amax_lasx.S | |||
CAMAXKERNEL = camax_lasx.S | |||
ZAMAXKERNEL = camax_lasx.S | |||
SAMINKERNEL = amin_lasx.S | |||
DAMINKERNEL = amin_lasx.S | |||
CAMINKERNEL = camin_lasx.S | |||
ZAMINKERNEL = camin_lasx.S | |||
SMAXKERNEL = max_lsx.S | |||
DMAXKERNEL = max_lsx.S | |||
@@ -63,42 +63,60 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
bge $r0, N, .L999 | |||
bge $r0, INCX, .L999 | |||
li.d TEMP, 1 | |||
li.w I, -1 | |||
slli.d TEMP, TEMP, ZBASE_SHIFT | |||
slli.d INCX, INCX, ZBASE_SHIFT | |||
xvreplgr2vr.w neg1, I | |||
xvffint.s.w neg1, neg1 | |||
srai.d I, N, 3 | |||
bne INCX, TEMP, .L20 | |||
bge $r0, I, .L23 | |||
.align 3 | |||
// .L10: vector loop over X, repeated while I > 0 (see the closing blt).
// Each pass loads two vectors, splits them into even-indexed (x1) and
// odd-indexed (x2) elements, takes magnitudes, adds them into VM1 and folds
// VM1 into the running maximum VM0. X advances by 16 * SIZE per pass.
// NOTE(review): old and new spellings of several steps sit back to back
// (xvld with offsets 0 * SIZE / 8 * SIZE and again with 0 / 32, two
// addi.d I, I, -1, xvfadd.s next to XVFADD), as in a diff whose +/- markers
// were stripped; confirm which lines are live.
.L10: | |||
xvld VX0, X, 0 * SIZE | |||
xvld VX1, X, 8 * SIZE | |||
addi.d I, I, -1 | |||
xvld VX0, X, 0 | |||
xvld VX1, X, 32 | |||
#ifdef DOUBLE | |||
xvpickev.d x1, VX1, VX0 | |||
xvpickod.d x2, VX1, VX0 | |||
#else | |||
xvpickev.w x1, VX1, VX0 | |||
xvpickod.w x2, VX1, VX0 | |||
// Single-precision-only magnitude: x3/x4 = neg1 * x, then select the negated
// value in lanes where x compares less than res0.
xvfmul.s x3, neg1, x1 | |||
xvfmul.s x4, neg1, x2 | |||
xvfcmp.clt.s VT0, x1, res0 | |||
xvfcmp.clt.s VT1, x2, res0 | |||
xvbitsel.v x1, x1, x3, VT0 | |||
xvbitsel.v x2, x2, x4, VT1 | |||
#endif | |||
// Magnitude as max(x, res0 - x); this is |x| only if res0 holds zero.
// res0 is not initialised within this view - TODO confirm.
XVFSUB x3, res0, x1 | |||
XVFSUB x4, res0, x2 | |||
XVFMAX x1, x1, x3 | |||
XVFMAX x2, x2, x4 | |||
XVFADD VM1, x1, x2 | |||
XVFMAX VM0, VM0, VM1 | |||
#ifdef DOUBLE | |||
// DOUBLE only: same steps for the next two vectors at byte offsets 64 and 96.
xvld VX0, X, 64 | |||
xvld VX1, X, 96 | |||
xvpickev.d x1, VX1, VX0 | |||
xvpickod.d x2, VX1, VX0 | |||
XVFSUB x3, res0, x1 | |||
XVFSUB x4, res0, x2 | |||
XVFMAX x1, x1, x3 | |||
XVFMAX x2, x2, x4 | |||
XVFADD VM1, x1, x2 | |||
XVFMAX VM0, VM0, VM1 | |||
#endif | |||
addi.d I, I, -1 | |||
addi.d X, X, 16 * SIZE | |||
xvfadd.s VM1, x1, x2 | |||
xvfmax.s VM0, VM0, VM1 | |||
blt $r0, I, .L10 | |||
.align 3 | |||
// .L11: reduce the vector maximum VM0 across elements, then branch to .L23.
// DOUBLE picks elements 0 and 1 of VM0; single precision picks elements 0..3.
// The picked values are combined with max and the result is left in VM0.
// NOTE(review): the loop above loads vectors at 32-byte spacing, so VM0
// presumably has 4 double / 8 single lanes; only the first 2 / 4 are folded
// here - confirm the upper lanes are accounted for.
// NOTE(review): the single-precision branch lists the final max steps twice
// (xvfmax.s and XVFMAX spellings), as in a diff whose +/- markers were
// stripped; confirm which lines are live.
.L11: | |||
#ifdef DOUBLE | |||
xvpickve.d x1, VM0, 0 | |||
xvpickve.d x2, VM0, 1 | |||
XVFMAX VM0, x1, x2 | |||
#else | |||
xvpickve.w x1, VM0, 0 | |||
xvpickve.w x2, VM0, 1 | |||
xvpickve.w x3, VM0, 2 | |||
xvpickve.w x4, VM0, 3 | |||
xvfmax.s VM1, x1, x2 | |||
xvfmax.s VM0, x3, x4 | |||
xvfmax.s VM0, VM0, VM1 | |||
XVFMAX VM0, x1, x2 | |||
XVFMAX VM1, x3, x4 | |||
XVFMAX VM0, VM0, VM1 | |||
#endif | |||
b .L23 | |||
.align 3 | |||
@@ -107,66 +125,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
.align 3 | |||
// .L21: scalar loop, repeated while I > 0 (see the closing blt).
// Each pass reads pairs of values at X (offsets 0 and 1 * SIZE, presumably the
// real and imaginary parts - verify), stepping X by INCX after every pair.
// Pairs are handled in four groups of two: both pairs are reduced to the sum
// of their absolute values and the larger sum is written to s1, s1, s3, s4.
// NOTE(review): the second group writes s1 again and s2 is never written in
// this loop, although .L22 reads s2. The max lines also take only t1 and t3
// as inputs, so no value from an earlier pass is carried in by them - confirm.
// NOTE(review): each step appears in two spellings (fld.s / LD, fabs.s / FABS,
// fadd.s / ADD, fmax.s / FMAX), as in a diff whose +/- markers were stripped;
// confirm which lines are live.
.L21: | |||
fld.s t1, X, 0 * SIZE | |||
fld.s t2, X, 1 * SIZE | |||
LD t1, X, 0 * SIZE | |||
LD t2, X, 1 * SIZE | |||
add.d X, X, INCX | |||
fld.s t3, X, 0 * SIZE | |||
fld.s t4, X, 1 * SIZE | |||
LD t3, X, 0 * SIZE | |||
LD t4, X, 1 * SIZE | |||
add.d X, X, INCX | |||
fabs.s t1, t1 | |||
fabs.s t2, t2 | |||
fabs.s t3, t3 | |||
fabs.s t4, t4 | |||
fadd.s t1, t1, t2 | |||
fadd.s t3, t3, t4 | |||
fmax.s s1, t1, t3 | |||
fld.s t1, X, 0 * SIZE | |||
fld.s t2, X, 1 * SIZE | |||
FABS t1, t1 | |||
FABS t2, t2 | |||
FABS t3, t3 | |||
FABS t4, t4 | |||
ADD t1, t1, t2 | |||
ADD t3, t3, t4 | |||
FMAX s1, t1, t3 | |||
LD t1, X, 0 * SIZE | |||
LD t2, X, 1 * SIZE | |||
add.d X, X, INCX | |||
fld.s t3, X, 0 * SIZE | |||
fld.s t4, X, 1 * SIZE | |||
LD t3, X, 0 * SIZE | |||
LD t4, X, 1 * SIZE | |||
add.d X, X, INCX | |||
fabs.s t1, t1 | |||
fabs.s t2, t2 | |||
fabs.s t3, t3 | |||
fabs.s t4, t4 | |||
fadd.s t1, t1, t2 | |||
fadd.s t3, t3, t4 | |||
fmax.s s1, t1, t3 | |||
fld.s t1, X, 0 * SIZE | |||
fld.s t2, X, 1 * SIZE | |||
FABS t1, t1 | |||
FABS t2, t2 | |||
FABS t3, t3 | |||
FABS t4, t4 | |||
ADD t1, t1, t2 | |||
ADD t3, t3, t4 | |||
FMAX s1, t1, t3 | |||
LD t1, X, 0 * SIZE | |||
LD t2, X, 1 * SIZE | |||
add.d X, X, INCX | |||
fld.s t3, X, 0 * SIZE | |||
fld.s t4, X, 1 * SIZE | |||
LD t3, X, 0 * SIZE | |||
LD t4, X, 1 * SIZE | |||
add.d X, X, INCX | |||
fabs.s t1, t1 | |||
fabs.s t2, t2 | |||
fabs.s t3, t3 | |||
fabs.s t4, t4 | |||
FABS t1, t1 | |||
FABS t2, t2 | |||
FABS t3, t3 | |||
FABS t4, t4 | |||
addi.d I, I, -1 | |||
fadd.s t1, t1, t2 | |||
fadd.s t3, t3, t4 | |||
fmax.s s3, t1, t3 | |||
fld.s t1, X, 0 * SIZE | |||
fld.s t2, X, 1 * SIZE | |||
ADD t1, t1, t2 | |||
ADD t3, t3, t4 | |||
FMAX s3, t1, t3 | |||
LD t1, X, 0 * SIZE | |||
LD t2, X, 1 * SIZE | |||
add.d X, X, INCX | |||
fld.s t3, X, 0 * SIZE | |||
fld.s t4, X, 1 * SIZE | |||
LD t3, X, 0 * SIZE | |||
LD t4, X, 1 * SIZE | |||
add.d X, X, INCX | |||
fabs.s t1, t1 | |||
fabs.s t2, t2 | |||
fabs.s t3, t3 | |||
fabs.s t4, t4 | |||
fadd.s t1, t1, t2 | |||
fadd.s t3, t3, t4 | |||
fmax.s s4, t1, t3 | |||
FABS t1, t1 | |||
FABS t2, t2 | |||
FABS t3, t3 | |||
FABS t4, t4 | |||
ADD t1, t1, t2 | |||
ADD t3, t3, t4 | |||
FMAX s4, t1, t3 | |||
blt $r0, I, .L21 | |||
.align 3 | |||
// .L22: combine the four partial results s1..s4 into s1 with max
// (s1 with s2, s3 with s4, then the two winners).
// NOTE(review): the three steps appear twice (fmax.s and FMAX spellings), as
// in a diff whose +/- markers were stripped; confirm which lines are live.
.L22: | |||
fmax.s s1, s1, s2 | |||
fmax.s s3, s3, s4 | |||
fmax.s s1, s1, s3 | |||
FMAX s1, s1, s2 | |||
FMAX s3, s3, s4 | |||
FMAX s1, s1, s3 | |||
.align 3 | |||
.L23: //N<8 | |||
@@ -182,12 +200,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
FABS a1, a1 | |||
ADD a0, a0, a1 | |||
add.d X, X, INCX | |||
fmax.s s1, a0, s1 | |||
FMAX s1, a0, s1 | |||
blt $r0, I, .L24 | |||
.align 3 | |||
// .L999: common exit. Copies $f22 into $f0, then jumps to the address held in
// $r1 with $r0 as the link operand.
// presumably $f22 is the register behind the running result and $f0 / $r1 are
// the floating-point return value and return address - aliases are defined
// outside this view; verify.
// NOTE(review): the move appears in two spellings (fmov.s and MOV), as in a
// diff whose +/- markers were stripped; confirm which line is live.
.L999: | |||
fmov.s $f0, $f22 | |||
MOV $f0, $f22 | |||
jirl $r0, $r1, 0x0 | |||
.align 3 | |||
@@ -63,54 +63,87 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
bge $r0, N, .L999 | |||
bge $r0, INCX, .L999 | |||
li.d TEMP, 1 | |||
li.w I, -1 | |||
slli.d TEMP, TEMP, ZBASE_SHIFT | |||
slli.d INCX, INCX, ZBASE_SHIFT | |||
vreplgr2vr.w neg1, I | |||
vffint.s.w neg1, neg1 | |||
srai.d I, N, 3 | |||
bne INCX, TEMP, .L20 | |||
bge $r0, I, .L23 | |||
.align 3 | |||
// .L10: vector loop over X, repeated while I > 0 (see the closing blt).
// Each pass works on vector pairs: a pair is split into even-indexed (x1) and
// odd-indexed (x2) elements, magnitudes are taken and summed. The first pair
// goes to VM1, the second to x1, then VM1 = max(x1, VM1) and
// VM0 = max(VM0, VM1). DOUBLE repeats this for byte offsets 64..112.
// X advances by 16 * SIZE per pass.
// NOTE(review): old and new spellings of several steps are interleaved (vld
// with 0 * SIZE / 4 * SIZE / 8 * SIZE / 12 * SIZE next to vld with 0 / 16 /
// 32 / 48, two addi.d I, I, -1, vfmax.s next to VFMAX), as in a diff whose
// +/- markers were stripped; confirm which lines are live.
.L10: | |||
vld VX0, X, 0 * SIZE | |||
vld VX1, X, 4 * SIZE | |||
addi.d I, I, -1 | |||
vld VX0, X, 0 | |||
vld VX1, X, 16 | |||
#ifdef DOUBLE | |||
vpickev.d x1, VX1, VX0 | |||
vpickod.d x2, VX1, VX0 | |||
#else | |||
vpickev.w x1, VX1, VX0 | |||
vpickod.w x2, VX1, VX0 | |||
// Single-precision-only magnitude: x3/x4 = neg1 * x, then select the negated
// value in lanes where x compares less than res0.
vfmul.s x3, neg1, x1 | |||
vfmul.s x4, neg1, x2 | |||
vfcmp.clt.s VT0, x1, res0 | |||
vfcmp.clt.s VT1, x2, res0 | |||
vld VX0, X, 8 * SIZE | |||
vbitsel.v x1, x1, x3, VT0 | |||
vbitsel.v x2, x2, x4, VT1 | |||
vld VX1, X, 12 * SIZE | |||
vfadd.s VM1, x1, x2 | |||
#endif | |||
// Magnitude as max(x, res0 - x); this is |x| only if res0 holds zero.
// res0 is not initialised within this view - TODO confirm.
VFSUB x3, res0, x1 | |||
VFSUB x4, res0, x2 | |||
VFMAX x1, x1, x3 | |||
VFMAX x2, x2, x4 | |||
VFADD VM1, x1, x2 | |||
vld VX0, X, 32 | |||
vld VX1, X, 48 | |||
#ifdef DOUBLE | |||
vpickev.d x1, VX1, VX0 | |||
vpickod.d x2, VX1, VX0 | |||
#else | |||
vpickev.w x1, VX1, VX0 | |||
vpickod.w x2, VX1, VX0 | |||
vfmul.s x3, neg1, x1 | |||
vfmul.s x4, neg1, x2 | |||
vfcmp.clt.s VT0, x1, res0 | |||
vfcmp.clt.s VT1, x2, res0 | |||
#endif | |||
VFSUB x3, res0, x1 | |||
VFSUB x4, res0, x2 | |||
VFMAX x1, x1, x3 | |||
VFMAX x2, x2, x4 | |||
VFADD x1, x1, x2 | |||
VFMAX VM1, x1, VM1 | |||
VFMAX VM0, VM0, VM1 | |||
#ifdef DOUBLE | |||
// DOUBLE only: same two-pair sequence for byte offsets 64, 80, 96 and 112.
vld VX0, X, 64 | |||
vld VX1, X, 80 | |||
vpickev.d x1, VX1, VX0 | |||
vpickod.d x2, VX1, VX0 | |||
VFSUB x3, res0, x1 | |||
VFSUB x4, res0, x2 | |||
VFMAX x1, x1, x3 | |||
VFMAX x2, x2, x4 | |||
VFADD VM1, x1, x2 | |||
vld VX0, X, 96 | |||
vld VX1, X, 112 | |||
vpickev.d x1, VX1, VX0 | |||
vpickod.d x2, VX1, VX0 | |||
VFSUB x3, res0, x1 | |||
VFSUB x4, res0, x2 | |||
VFMAX x1, x1, x3 | |||
VFMAX x2, x2, x4 | |||
VFADD x1, x1, x2 | |||
VFMAX VM1, x1, VM1 | |||
VFMAX VM0, VM0, VM1 | |||
#endif | |||
addi.d X, X, 16 * SIZE | |||
vbitsel.v x1, x1, x3, VT0 | |||
vbitsel.v x2, x2, x4, VT1 | |||
vfadd.s x1, x1, x2 | |||
vfmax.s VM1, x1, VM1 | |||
vfmax.s VM0, VM0, VM1 | |||
addi.d I, I, -1 | |||
blt $r0, I, .L10 | |||
.align 3 | |||
// .L11: reduce the vector maximum VM0 across elements, then branch to .L23.
// DOUBLE replicates elements 0 and 1 of VM0 into x1 and x2; single precision
// replicates elements 0..3 into x1..x4. The copies are combined with max and
// the result is left in VM0.
// NOTE(review): the single-precision branch lists the max steps twice
// (vfmax.s and VFMAX spellings), as in a diff whose +/- markers were
// stripped; confirm which lines are live.
.L11: | |||
#ifdef DOUBLE | |||
vreplvei.d x1, VM0, 0 | |||
vreplvei.d x2, VM0, 1 | |||
VFMAX VM0, x1, x2 | |||
#else | |||
vreplvei.w x1, VM0, 0 | |||
vreplvei.w x2, VM0, 1 | |||
vreplvei.w x3, VM0, 2 | |||
vreplvei.w x4, VM0, 3 | |||
vfmax.s VM1, x1, x2 | |||
vfmax.s VM0, x3, x4 | |||
vfmax.s VM0, VM0, VM1 | |||
VFMAX VM1, x1, x2 | |||
VFMAX VM0, x3, x4 | |||
VFMAX VM0, VM0, VM1 | |||
#endif | |||
b .L23 | |||
.align 3 | |||
@@ -119,66 +152,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
.align 3 | |||
// .L21: scalar loop, repeated while I > 0 (see the closing blt).
// Each pass reads pairs of values at X (offsets 0 and 1 * SIZE, presumably the
// real and imaginary parts - verify), stepping X by INCX after every pair.
// Pairs are handled in four groups of two: both pairs are reduced to the sum
// of their absolute values and the larger sum is written to s1, s1, s3, s4.
// NOTE(review): the second group writes s1 again and s2 is never written in
// this loop, although .L22 reads s2. The max lines also take only t1 and t3
// as inputs, so no value from an earlier pass is carried in by them - confirm.
// NOTE(review): each step appears in two spellings (fld.s / LD, fabs.s / FABS,
// fadd.s / ADD, fmax.s / FMAX), as in a diff whose +/- markers were stripped;
// confirm which lines are live.
.L21: | |||
fld.s t1, X, 0 * SIZE | |||
fld.s t2, X, 1 * SIZE | |||
LD t1, X, 0 * SIZE | |||
LD t2, X, 1 * SIZE | |||
add.d X, X, INCX | |||
fld.s t3, X, 0 * SIZE | |||
fld.s t4, X, 1 * SIZE | |||
LD t3, X, 0 * SIZE | |||
LD t4, X, 1 * SIZE | |||
add.d X, X, INCX | |||
fabs.s t1, t1 | |||
fabs.s t2, t2 | |||
fabs.s t3, t3 | |||
fabs.s t4, t4 | |||
fadd.s t1, t1, t2 | |||
fadd.s t3, t3, t4 | |||
fmax.s s1, t1, t3 | |||
fld.s t1, X, 0 * SIZE | |||
fld.s t2, X, 1 * SIZE | |||
FABS t1, t1 | |||
FABS t2, t2 | |||
FABS t3, t3 | |||
FABS t4, t4 | |||
ADD t1, t1, t2 | |||
ADD t3, t3, t4 | |||
FMAX s1, t1, t3 | |||
LD t1, X, 0 * SIZE | |||
LD t2, X, 1 * SIZE | |||
add.d X, X, INCX | |||
fld.s t3, X, 0 * SIZE | |||
fld.s t4, X, 1 * SIZE | |||
LD t3, X, 0 * SIZE | |||
LD t4, X, 1 * SIZE | |||
add.d X, X, INCX | |||
fabs.s t1, t1 | |||
fabs.s t2, t2 | |||
fabs.s t3, t3 | |||
fabs.s t4, t4 | |||
fadd.s t1, t1, t2 | |||
fadd.s t3, t3, t4 | |||
fmax.s s1, t1, t3 | |||
fld.s t1, X, 0 * SIZE | |||
fld.s t2, X, 1 * SIZE | |||
FABS t1, t1 | |||
FABS t2, t2 | |||
FABS t3, t3 | |||
FABS t4, t4 | |||
ADD t1, t1, t2 | |||
ADD t3, t3, t4 | |||
FMAX s1, t1, t3 | |||
LD t1, X, 0 * SIZE | |||
LD t2, X, 1 * SIZE | |||
add.d X, X, INCX | |||
fld.s t3, X, 0 * SIZE | |||
fld.s t4, X, 1 * SIZE | |||
LD t3, X, 0 * SIZE | |||
LD t4, X, 1 * SIZE | |||
add.d X, X, INCX | |||
fabs.s t1, t1 | |||
fabs.s t2, t2 | |||
fabs.s t3, t3 | |||
fabs.s t4, t4 | |||
FABS t1, t1 | |||
FABS t2, t2 | |||
FABS t3, t3 | |||
FABS t4, t4 | |||
addi.d I, I, -1 | |||
fadd.s t1, t1, t2 | |||
fadd.s t3, t3, t4 | |||
fmax.s s3, t1, t3 | |||
fld.s t1, X, 0 * SIZE | |||
fld.s t2, X, 1 * SIZE | |||
ADD t1, t1, t2 | |||
ADD t3, t3, t4 | |||
FMAX s3, t1, t3 | |||
LD t1, X, 0 * SIZE | |||
LD t2, X, 1 * SIZE | |||
add.d X, X, INCX | |||
fld.s t3, X, 0 * SIZE | |||
fld.s t4, X, 1 * SIZE | |||
LD t3, X, 0 * SIZE | |||
LD t4, X, 1 * SIZE | |||
add.d X, X, INCX | |||
fabs.s t1, t1 | |||
fabs.s t2, t2 | |||
fabs.s t3, t3 | |||
fabs.s t4, t4 | |||
fadd.s t1, t1, t2 | |||
fadd.s t3, t3, t4 | |||
fmax.s s4, t1, t3 | |||
FABS t1, t1 | |||
FABS t2, t2 | |||
FABS t3, t3 | |||
FABS t4, t4 | |||
ADD t1, t1, t2 | |||
ADD t3, t3, t4 | |||
FMAX s4, t1, t3 | |||
blt $r0, I, .L21 | |||
.align 3 | |||
// .L22: combine the four partial results s1..s4 into s1 with max
// (s1 with s2, s3 with s4, then the two winners).
// NOTE(review): the three steps appear twice (fmax.s and FMAX spellings), as
// in a diff whose +/- markers were stripped; confirm which lines are live.
.L22: | |||
fmax.s s1, s1, s2 | |||
fmax.s s3, s3, s4 | |||
fmax.s s1, s1, s3 | |||
FMAX s1, s1, s2 | |||
FMAX s3, s3, s4 | |||
FMAX s1, s1, s3 | |||
.align 3 | |||
.L23: //N<8 | |||
@@ -187,19 +220,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
.align 3 | |||
// .L24: scalar tail loop, repeated while I > 0 (see the closing blt).
// Loads the two values at X (offsets 0 and 1 * SIZE), adds their absolute
// values, folds the sum into s1 with max, and steps X by INCX.
// NOTE(review): each step appears in two spellings (fld.s / LD, fabs.s / FABS,
// fadd.s / ADD, fmax.s / FMAX), as in a diff whose +/- markers were stripped;
// confirm which lines are live.
.L24: | |||
fld.s a0, X, 0 * SIZE | |||
fld.s a1, X, 1 * SIZE | |||
LD a0, X, 0 * SIZE | |||
LD a1, X, 1 * SIZE | |||
addi.d I, I, -1 | |||
fabs.s a0, a0 | |||
fabs.s a1, a1 | |||
fadd.s a0, a0, a1 | |||
FABS a0, a0 | |||
FABS a1, a1 | |||
ADD a0, a0, a1 | |||
add.d X, X, INCX | |||
fmax.s s1, a0, s1 | |||
FMAX s1, a0, s1 | |||
blt $r0, I, .L24 | |||
.align 3 | |||
// .L999: common exit. Copies $f22 into $f0, then jumps to the address held in
// $r1 with $r0 as the link operand.
// presumably $f22 is the register behind the running result and $f0 / $r1 are
// the floating-point return value and return address - aliases are defined
// outside this view; verify.
// NOTE(review): the move appears in two spellings (fmov.s and MOV), as in a
// diff whose +/- markers were stripped; confirm which line is live.
.L999: | |||
fmov.s $f0, $f22 | |||
MOV $f0, $f22 | |||
jirl $r0, $r1, 0x0 | |||
.align 3 | |||
@@ -61,49 +61,71 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
xvxor.v res0, res0, res0 | |||
bge $r0, N, .L999 | |||
bge $r0, INCX, .L999 | |||
fld.s a0, X, 0 * SIZE | |||
fld.s a1, X, 1 * SIZE | |||
fabs.s a0, a0 | |||
fabs.s a1, a1 | |||
fadd.s s1, a1, a0 | |||
LD a0, X, 0 * SIZE | |||
LD a1, X, 1 * SIZE | |||
FABS a0, a0 | |||
FABS a1, a1 | |||
ADD s1, a1, a0 | |||
#ifdef DOUBLE | |||
xvreplve0.d VM0, VM0 | |||
#else | |||
xvreplve0.w VM0, VM0 | |||
#endif | |||
li.d TEMP, 1 | |||
li.w I, -1 | |||
slli.d TEMP, TEMP, ZBASE_SHIFT | |||
slli.d INCX, INCX, ZBASE_SHIFT | |||
xvreplgr2vr.w neg1, I | |||
xvffint.s.w neg1, neg1 | |||
srai.d I, N, 3 | |||
bne INCX, TEMP, .L20 | |||
bge $r0, I, .L23 | |||
.align 3 | |||
// .L10: vector loop over X, repeated while I > 0 (see the closing blt).
// Each pass loads two vectors, splits them into even-indexed (x1) and
// odd-indexed (x2) elements, takes magnitudes, adds them into VM1 and folds
// VM1 into the running minimum VM0. X advances by 16 * SIZE per pass.
// NOTE(review): old and new spellings of several steps sit back to back
// (xvld with offsets 0 * SIZE / 8 * SIZE and again with 0 / 32, two
// addi.d I, I, -1, xvfadd.s next to XVFADD), as in a diff whose +/- markers
// were stripped; confirm which lines are live.
.L10: | |||
xvld VX0, X, 0 * SIZE | |||
xvld VX1, X, 8 * SIZE | |||
addi.d I, I, -1 | |||
xvld VX0, X, 0 | |||
xvld VX1, X, 32 | |||
#ifdef DOUBLE | |||
xvpickev.d x1, VX1, VX0 | |||
xvpickod.d x2, VX1, VX0 | |||
#else | |||
xvpickev.w x1, VX1, VX0 | |||
xvpickod.w x2, VX1, VX0 | |||
// Single-precision-only magnitude: x3/x4 = neg1 * x, then select the negated
// value in lanes where x compares less than res0.
xvfmul.s x3, neg1, x1 | |||
xvfmul.s x4, neg1, x2 | |||
xvfcmp.clt.s VT0, x1, res0 | |||
xvfcmp.clt.s VT1, x2, res0 | |||
xvbitsel.v x1, x1, x3, VT0 | |||
xvbitsel.v x2, x2, x4, VT1 | |||
#endif | |||
// Magnitude as max(x, res0 - x); this is |x| provided res0 still holds the
// zero written by the xvxor.v ahead of the loop.
XVFSUB x3, res0, x1 | |||
XVFSUB x4, res0, x2 | |||
XVFMAX x1, x1, x3 | |||
XVFMAX x2, x2, x4 | |||
XVFADD VM1, x1, x2 | |||
XVFMIN VM0, VM0, VM1 | |||
#ifdef DOUBLE | |||
// DOUBLE only: same steps for the next two vectors at byte offsets 64 and 96.
xvld VX0, X, 64 | |||
xvld VX1, X, 96 | |||
xvpickev.d x1, VX1, VX0 | |||
xvpickod.d x2, VX1, VX0 | |||
XVFSUB x3, res0, x1 | |||
XVFSUB x4, res0, x2 | |||
XVFMAX x1, x1, x3 | |||
XVFMAX x2, x2, x4 | |||
XVFADD VM1, x1, x2 | |||
XVFMIN VM0, VM0, VM1 | |||
#endif | |||
addi.d I, I, -1 | |||
addi.d X, X, 16 * SIZE | |||
xvfadd.s VM1, x1, x2 | |||
xvfmin.s VM0, VM0, VM1 | |||
blt $r0, I, .L10 | |||
.align 3 | |||
// .L11: reduce the vector minimum VM0 across elements, then branch to .L23.
// DOUBLE picks elements 0 and 1 of VM0; single precision picks elements 0..3.
// The picked values are combined with min and the result is left in VM0.
// NOTE(review): the loop above loads vectors at 32-byte spacing, so VM0
// presumably has 4 double / 8 single lanes; only the first 2 / 4 are folded
// here - confirm the upper lanes are accounted for.
// NOTE(review): the single-precision branch lists the final min steps twice
// (xvfmin.s and XVFMIN spellings), as in a diff whose +/- markers were
// stripped; confirm which lines are live.
.L11: | |||
#ifdef DOUBLE | |||
xvpickve.d x1, VM0, 0 | |||
xvpickve.d x2, VM0, 1 | |||
XVFMIN VM0, x1, x2 | |||
#else | |||
xvpickve.w x1, VM0, 0 | |||
xvpickve.w x2, VM0, 1 | |||
xvpickve.w x3, VM0, 2 | |||
xvpickve.w x4, VM0, 3 | |||
xvfmin.s VM1, x1, x2 | |||
xvfmin.s VM0, x3, x4 | |||
xvfmin.s VM0, VM0, VM1 | |||
XVFMIN VM0, x1, x2 | |||
XVFMIN VM1, x3, x4 | |||
XVFMIN VM0, VM0, VM1 | |||
#endif | |||
b .L23 | |||
.align 3 | |||
@@ -112,66 +134,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
.align 3 | |||
// .L21: scalar loop, repeated while I > 0 (see the closing blt).
// Each pass reads pairs of values at X (offsets 0 and 1 * SIZE, presumably the
// real and imaginary parts - verify), stepping X by INCX after every pair.
// Pairs are handled in four groups of two: both pairs are reduced to the sum
// of their absolute values and the smaller sum is written to s1, s1, s3, s4.
// NOTE(review): the second group writes s1 again and s2 is never written in
// this loop, although .L22 reads s2. The min lines also take only t1 and t3
// as inputs, so no value from an earlier pass is carried in by them - confirm.
// NOTE(review): each step appears in two spellings (fld.s / LD, fabs.s / FABS,
// fadd.s / ADD, fmin.s / FMIN), as in a diff whose +/- markers were stripped;
// confirm which lines are live.
.L21: | |||
fld.s t1, X, 0 * SIZE | |||
fld.s t2, X, 1 * SIZE | |||
LD t1, X, 0 * SIZE | |||
LD t2, X, 1 * SIZE | |||
add.d X, X, INCX | |||
fld.s t3, X, 0 * SIZE | |||
fld.s t4, X, 1 * SIZE | |||
LD t3, X, 0 * SIZE | |||
LD t4, X, 1 * SIZE | |||
add.d X, X, INCX | |||
fabs.s t1, t1 | |||
fabs.s t2, t2 | |||
fabs.s t3, t3 | |||
fabs.s t4, t4 | |||
fadd.s t1, t1, t2 | |||
fadd.s t3, t3, t4 | |||
fmin.s s1, t1, t3 | |||
fld.s t1, X, 0 * SIZE | |||
fld.s t2, X, 1 * SIZE | |||
FABS t1, t1 | |||
FABS t2, t2 | |||
FABS t3, t3 | |||
FABS t4, t4 | |||
ADD t1, t1, t2 | |||
ADD t3, t3, t4 | |||
FMIN s1, t1, t3 | |||
LD t1, X, 0 * SIZE | |||
LD t2, X, 1 * SIZE | |||
add.d X, X, INCX | |||
fld.s t3, X, 0 * SIZE | |||
fld.s t4, X, 1 * SIZE | |||
LD t3, X, 0 * SIZE | |||
LD t4, X, 1 * SIZE | |||
add.d X, X, INCX | |||
fabs.s t1, t1 | |||
fabs.s t2, t2 | |||
fabs.s t3, t3 | |||
fabs.s t4, t4 | |||
fadd.s t1, t1, t2 | |||
fadd.s t3, t3, t4 | |||
fmin.s s1, t1, t3 | |||
fld.s t1, X, 0 * SIZE | |||
fld.s t2, X, 1 * SIZE | |||
FABS t1, t1 | |||
FABS t2, t2 | |||
FABS t3, t3 | |||
FABS t4, t4 | |||
ADD t1, t1, t2 | |||
ADD t3, t3, t4 | |||
FMIN s1, t1, t3 | |||
LD t1, X, 0 * SIZE | |||
LD t2, X, 1 * SIZE | |||
add.d X, X, INCX | |||
fld.s t3, X, 0 * SIZE | |||
fld.s t4, X, 1 * SIZE | |||
LD t3, X, 0 * SIZE | |||
LD t4, X, 1 * SIZE | |||
add.d X, X, INCX | |||
fabs.s t1, t1 | |||
fabs.s t2, t2 | |||
fabs.s t3, t3 | |||
fabs.s t4, t4 | |||
FABS t1, t1 | |||
FABS t2, t2 | |||
FABS t3, t3 | |||
FABS t4, t4 | |||
addi.d I, I, -1 | |||
fadd.s t1, t1, t2 | |||
fadd.s t3, t3, t4 | |||
fmin.s s3, t1, t3 | |||
fld.s t1, X, 0 * SIZE | |||
fld.s t2, X, 1 * SIZE | |||
ADD t1, t1, t2 | |||
ADD t3, t3, t4 | |||
FMIN s3, t1, t3 | |||
LD t1, X, 0 * SIZE | |||
LD t2, X, 1 * SIZE | |||
add.d X, X, INCX | |||
fld.s t3, X, 0 * SIZE | |||
fld.s t4, X, 1 * SIZE | |||
LD t3, X, 0 * SIZE | |||
LD t4, X, 1 * SIZE | |||
add.d X, X, INCX | |||
fabs.s t1, t1 | |||
fabs.s t2, t2 | |||
fabs.s t3, t3 | |||
fabs.s t4, t4 | |||
fadd.s t1, t1, t2 | |||
fadd.s t3, t3, t4 | |||
fmin.s s4, t1, t3 | |||
FABS t1, t1 | |||
FABS t2, t2 | |||
FABS t3, t3 | |||
FABS t4, t4 | |||
ADD t1, t1, t2 | |||
ADD t3, t3, t4 | |||
FMIN s4, t1, t3 | |||
blt $r0, I, .L21 | |||
.align 3 | |||
// .L22: combine the four partial results s1..s4 into s1 with min
// (s1 with s2, s3 with s4, then the two winners).
// NOTE(review): the three steps appear twice (fmin.s and FMIN spellings), as
// in a diff whose +/- markers were stripped; confirm which lines are live.
.L22: | |||
fmin.s s1, s1, s2 | |||
fmin.s s3, s3, s4 | |||
fmin.s s1, s1, s3 | |||
FMIN s1, s1, s2 | |||
FMIN s3, s3, s4 | |||
FMIN s1, s1, s3 | |||
.align 3 | |||
.L23: //N<8 | |||
@@ -187,12 +209,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
FABS a1, a1 | |||
ADD a0, a0, a1 | |||
add.d X, X, INCX | |||
fmin.s s1, a0, s1 | |||
FMIN s1, a0, s1 | |||
blt $r0, I, .L24 | |||
.align 3 | |||
// .L999: common exit. Copies $f22 into $f0, then jumps to the address held in
// $r1 with $r0 as the link operand.
// presumably $f22 is the register behind the running result and $f0 / $r1 are
// the floating-point return value and return address - aliases are defined
// outside this view; verify.
// NOTE(review): the move appears in two spellings (fmov.s and MOV), as in a
// diff whose +/- markers were stripped; confirm which line is live.
.L999: | |||
fmov.s $f0, $f22 | |||
MOV $f0, $f22 | |||
jirl $r0, $r1, 0x0 | |||
.align 3 | |||
@@ -61,61 +61,98 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
vxor.v res0, res0, res0 | |||
bge $r0, N, .L999 | |||
bge $r0, INCX, .L999 | |||
fld.s a0, X, 0 * SIZE | |||
fld.s a1, X, 1 * SIZE | |||
fabs.s a0, a0 | |||
fabs.s a1, a1 | |||
fadd.s s1, a1, a0 | |||
LD a0, X, 0 * SIZE | |||
LD a1, X, 1 * SIZE | |||
FABS a0, a0 | |||
FABS a1, a1 | |||
ADD s1, a1, a0 | |||
#ifdef DOUBLE | |||
vreplvei.d VM0, VM0, 0 | |||
#else | |||
vreplvei.w VM0, VM0, 0 | |||
#endif | |||
li.d TEMP, 1 | |||
li.w I, -1 | |||
slli.d TEMP, TEMP, ZBASE_SHIFT | |||
slli.d INCX, INCX, ZBASE_SHIFT | |||
vreplgr2vr.w neg1, I | |||
vffint.s.w neg1, neg1 | |||
srai.d I, N, 3 | |||
bne INCX, TEMP, .L20 | |||
bge $r0, I, .L23 | |||
.align 3 | |||
// .L10: vector loop over X, repeated while I > 0 (see the closing blt).
// Each pass works on vector pairs: a pair is split into even-indexed (x1) and
// odd-indexed (x2) elements, magnitudes are taken and summed. The first pair
// goes to VM1, the second to x1, then VM1 = min(x1, VM1) and
// VM0 = min(VM0, VM1). DOUBLE repeats this for byte offsets 64..112.
// X advances by 16 * SIZE per pass.
// NOTE(review): old and new spellings of several steps are interleaved (vld
// with 0 * SIZE / 4 * SIZE / 8 * SIZE / 12 * SIZE next to vld with 0 / 16 /
// 32 / 48, two addi.d I, I, -1, vfmin.s next to VFMIN), as in a diff whose
// +/- markers were stripped; confirm which lines are live.
.L10: | |||
vld VX0, X, 0 * SIZE | |||
vld VX1, X, 4 * SIZE | |||
addi.d I, I, -1 | |||
vld VX0, X, 0 | |||
vld VX1, X, 16 | |||
#ifdef DOUBLE | |||
vpickev.d x1, VX1, VX0 | |||
vpickod.d x2, VX1, VX0 | |||
#else | |||
vpickev.w x1, VX1, VX0 | |||
vpickod.w x2, VX1, VX0 | |||
// Single-precision-only magnitude: x3/x4 = neg1 * x, then select the negated
// value in lanes where x compares less than res0.
vfmul.s x3, neg1, x1 | |||
vfmul.s x4, neg1, x2 | |||
vfcmp.clt.s VT0, x1, res0 | |||
vfcmp.clt.s VT1, x2, res0 | |||
vld VX0, X, 8 * SIZE | |||
vbitsel.v x1, x1, x3, VT0 | |||
vbitsel.v x2, x2, x4, VT1 | |||
vld VX1, X, 12 * SIZE | |||
vfadd.s VM1, x1, x2 | |||
#endif | |||
// Magnitude as max(x, res0 - x); this is |x| provided res0 still holds the
// zero written by the vxor.v ahead of the loop.
VFSUB x3, res0, x1 | |||
VFSUB x4, res0, x2 | |||
VFMAX x1, x1, x3 | |||
VFMAX x2, x2, x4 | |||
VFADD VM1, x1, x2 | |||
vld VX0, X, 32 | |||
vld VX1, X, 48 | |||
#ifdef DOUBLE | |||
vpickev.d x1, VX1, VX0 | |||
vpickod.d x2, VX1, VX0 | |||
#else | |||
vpickev.w x1, VX1, VX0 | |||
vpickod.w x2, VX1, VX0 | |||
vfmul.s x3, neg1, x1 | |||
vfmul.s x4, neg1, x2 | |||
vfcmp.clt.s VT0, x1, res0 | |||
vfcmp.clt.s VT1, x2, res0 | |||
#endif | |||
VFSUB x3, res0, x1 | |||
VFSUB x4, res0, x2 | |||
VFMAX x1, x1, x3 | |||
VFMAX x2, x2, x4 | |||
VFADD x1, x1, x2 | |||
VFMIN VM1, x1, VM1 | |||
VFMIN VM0, VM0, VM1 | |||
#ifdef DOUBLE | |||
// DOUBLE only: same two-pair sequence for byte offsets 64, 80, 96 and 112.
vld VX0, X, 64 | |||
vld VX1, X, 80 | |||
vpickev.d x1, VX1, VX0 | |||
vpickod.d x2, VX1, VX0 | |||
VFSUB x3, res0, x1 | |||
VFSUB x4, res0, x2 | |||
VFMAX x1, x1, x3 | |||
VFMAX x2, x2, x4 | |||
VFADD VM1, x1, x2 | |||
vld VX0, X, 96 | |||
vld VX1, X, 112 | |||
vpickev.d x1, VX1, VX0 | |||
vpickod.d x2, VX1, VX0 | |||
VFSUB x3, res0, x1 | |||
VFSUB x4, res0, x2 | |||
VFMAX x1, x1, x3 | |||
VFMAX x2, x2, x4 | |||
VFADD x1, x1, x2 | |||
VFMIN VM1, x1, VM1 | |||
VFMIN VM0, VM0, VM1 | |||
#endif | |||
addi.d I, I, -1 | |||
addi.d X, X, 16 * SIZE | |||
vbitsel.v x1, x1, x3, VT0 | |||
vbitsel.v x2, x2, x4, VT1 | |||
vfadd.s x1, x1, x2 | |||
vfmin.s VM1, x1, VM1 | |||
vfmin.s VM0, VM0, VM1 | |||
blt $r0, I, .L10 | |||
.align 3 | |||
// .L11: reduce the vector minimum VM0 across elements, then branch to .L23.
// DOUBLE replicates elements 0 and 1 of VM0 into x1 and x2; single precision
// replicates elements 0..3 into x1..x4. The copies are combined with min and
// the result is left in VM0.
// NOTE(review): the single-precision branch lists the min steps twice
// (vfmin.s and VFMIN spellings), as in a diff whose +/- markers were
// stripped; confirm which lines are live.
.L11: | |||
#ifdef DOUBLE | |||
vreplvei.d x1, VM0, 0 | |||
vreplvei.d x2, VM0, 1 | |||
VFMIN VM0, x1, x2 | |||
#else | |||
vreplvei.w x1, VM0, 0 | |||
vreplvei.w x2, VM0, 1 | |||
vreplvei.w x3, VM0, 2 | |||
vreplvei.w x4, VM0, 3 | |||
vfmin.s VM1, x1, x2 | |||
vfmin.s VM0, x3, x4 | |||
vfmin.s VM0, VM0, VM1 | |||
VFMIN VM1, x1, x2 | |||
VFMIN VM0, x3, x4 | |||
VFMIN VM0, VM0, VM1 | |||
#endif | |||
b .L23 | |||
.align 3 | |||
@@ -124,66 +161,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
.align 3 | |||
// .L21: scalar loop, repeated while I > 0 (see the closing blt).
// Each pass reads pairs of values at X (offsets 0 and 1 * SIZE, presumably the
// real and imaginary parts - verify), stepping X by INCX after every pair.
// Pairs are handled in four groups of two: both pairs are reduced to the sum
// of their absolute values and the smaller sum is written to s1, s1, s3, s4.
// NOTE(review): the second group writes s1 again and s2 is never written in
// this loop, although .L22 reads s2. The min lines also take only t1 and t3
// as inputs, so no value from an earlier pass is carried in by them - confirm.
// NOTE(review): each step appears in two spellings (fld.s / LD, fabs.s / FABS,
// fadd.s / ADD, fmin.s / FMIN), as in a diff whose +/- markers were stripped;
// confirm which lines are live.
.L21: | |||
fld.s t1, X, 0 * SIZE | |||
fld.s t2, X, 1 * SIZE | |||
LD t1, X, 0 * SIZE | |||
LD t2, X, 1 * SIZE | |||
add.d X, X, INCX | |||
fld.s t3, X, 0 * SIZE | |||
fld.s t4, X, 1 * SIZE | |||
LD t3, X, 0 * SIZE | |||
LD t4, X, 1 * SIZE | |||
add.d X, X, INCX | |||
fabs.s t1, t1 | |||
fabs.s t2, t2 | |||
fabs.s t3, t3 | |||
fabs.s t4, t4 | |||
fadd.s t1, t1, t2 | |||
fadd.s t3, t3, t4 | |||
fmin.s s1, t1, t3 | |||
fld.s t1, X, 0 * SIZE | |||
fld.s t2, X, 1 * SIZE | |||
FABS t1, t1 | |||
FABS t2, t2 | |||
FABS t3, t3 | |||
FABS t4, t4 | |||
ADD t1, t1, t2 | |||
ADD t3, t3, t4 | |||
FMIN s1, t1, t3 | |||
LD t1, X, 0 * SIZE | |||
LD t2, X, 1 * SIZE | |||
add.d X, X, INCX | |||
fld.s t3, X, 0 * SIZE | |||
fld.s t4, X, 1 * SIZE | |||
LD t3, X, 0 * SIZE | |||
LD t4, X, 1 * SIZE | |||
add.d X, X, INCX | |||
fabs.s t1, t1 | |||
fabs.s t2, t2 | |||
fabs.s t3, t3 | |||
fabs.s t4, t4 | |||
fadd.s t1, t1, t2 | |||
fadd.s t3, t3, t4 | |||
fmin.s s1, t1, t3 | |||
fld.s t1, X, 0 * SIZE | |||
fld.s t2, X, 1 * SIZE | |||
FABS t1, t1 | |||
FABS t2, t2 | |||
FABS t3, t3 | |||
FABS t4, t4 | |||
ADD t1, t1, t2 | |||
ADD t3, t3, t4 | |||
FMIN s1, t1, t3 | |||
LD t1, X, 0 * SIZE | |||
LD t2, X, 1 * SIZE | |||
add.d X, X, INCX | |||
fld.s t3, X, 0 * SIZE | |||
fld.s t4, X, 1 * SIZE | |||
LD t3, X, 0 * SIZE | |||
LD t4, X, 1 * SIZE | |||
add.d X, X, INCX | |||
fabs.s t1, t1 | |||
fabs.s t2, t2 | |||
fabs.s t3, t3 | |||
fabs.s t4, t4 | |||
FABS t1, t1 | |||
FABS t2, t2 | |||
FABS t3, t3 | |||
FABS t4, t4 | |||
addi.d I, I, -1 | |||
fadd.s t1, t1, t2 | |||
fadd.s t3, t3, t4 | |||
fmin.s s3, t1, t3 | |||
fld.s t1, X, 0 * SIZE | |||
fld.s t2, X, 1 * SIZE | |||
ADD t1, t1, t2 | |||
ADD t3, t3, t4 | |||
FMIN s3, t1, t3 | |||
LD t1, X, 0 * SIZE | |||
LD t2, X, 1 * SIZE | |||
add.d X, X, INCX | |||
fld.s t3, X, 0 * SIZE | |||
fld.s t4, X, 1 * SIZE | |||
LD t3, X, 0 * SIZE | |||
LD t4, X, 1 * SIZE | |||
add.d X, X, INCX | |||
fabs.s t1, t1 | |||
fabs.s t2, t2 | |||
fabs.s t3, t3 | |||
fabs.s t4, t4 | |||
fadd.s t1, t1, t2 | |||
fadd.s t3, t3, t4 | |||
fmin.s s4, t1, t3 | |||
FABS t1, t1 | |||
FABS t2, t2 | |||
FABS t3, t3 | |||
FABS t4, t4 | |||
ADD t1, t1, t2 | |||
ADD t3, t3, t4 | |||
FMIN s4, t1, t3 | |||
blt $r0, I, .L21 | |||
.align 3 | |||
// .L22: combine the four partial results s1..s4 into s1 with min
// (s1 with s2, s3 with s4, then the two winners).
// NOTE(review): the three steps appear twice (fmin.s and FMIN spellings), as
// in a diff whose +/- markers were stripped; confirm which lines are live.
.L22: | |||
fmin.s s1, s1, s2 | |||
fmin.s s3, s3, s4 | |||
fmin.s s1, s1, s3 | |||
FMIN s1, s1, s2 | |||
FMIN s3, s3, s4 | |||
FMIN s1, s1, s3 | |||
.align 3 | |||
.L23: //N<8 | |||
@@ -192,19 +229,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
.align 3 | |||
// .L24: scalar tail loop, repeated while I > 0 (see the closing blt).
// Loads the two values at X (offsets 0 and 1 * SIZE), adds their absolute
// values, folds the sum into s1 with min, and steps X by INCX.
// NOTE(review): each step appears in two spellings (fld.s / LD, fabs.s / FABS,
// fadd.s / ADD, fmin.s / FMIN), as in a diff whose +/- markers were stripped;
// confirm which lines are live.
.L24: | |||
fld.s a0, X, 0 * SIZE | |||
fld.s a1, X, 1 * SIZE | |||
LD a0, X, 0 * SIZE | |||
LD a1, X, 1 * SIZE | |||
addi.d I, I, -1 | |||
fabs.s a0, a0 | |||
fabs.s a1, a1 | |||
fadd.s a0, a0, a1 | |||
FABS a0, a0 | |||
FABS a1, a1 | |||
ADD a0, a0, a1 | |||
add.d X, X, INCX | |||
fmin.s s1, a0, s1 | |||
FMIN s1, a0, s1 | |||
blt $r0, I, .L24 | |||
.align 3 | |||
// .L999: common exit. Copies $f22 into $f0, then jumps to the address held in
// $r1 with $r0 as the link operand.
// presumably $f22 is the register behind the running result and $f0 / $r1 are
// the floating-point return value and return address - aliases are defined
// outside this view; verify.
// NOTE(review): the move appears in two spellings (fmov.s and MOV), as in a
// diff whose +/- markers were stripped; confirm which line is live.
.L999: | |||
fmov.s $f0, $f22 | |||
MOV $f0, $f22 | |||
jirl $r0, $r1, 0x0 | |||
.align 3 | |||