
Merge pull request #5248 from ErnstPeng/fix-lasx

LoongArch64: fixed some LASX kernel functions for the LAPACK tests
tags/v0.3.30
Martin Kroeker (GitHub), 5 months ago
commit 52367eac67
12 changed files with 832 additions and 2087 deletions
  1. kernel/loongarch64/amax_lasx.S (+1, -6)
  2. kernel/loongarch64/asum_lasx.S (+10, -12)
  3. kernel/loongarch64/cdot_lasx.S (+1, -1)
  4. kernel/loongarch64/cnrm2_lasx.S (+53, -25)
  5. kernel/loongarch64/copy_lasx.S (+4, -4)
  6. kernel/loongarch64/cscal_lasx.S (+61, -186)
  7. kernel/loongarch64/dot_lasx.S (+29, -57)
  8. kernel/loongarch64/iamax_lasx.S (+282, -284)
  9. kernel/loongarch64/icamax_lasx.S (+165, -247)
  10. kernel/loongarch64/rot_lasx.S (+123, -1177)
  11. kernel/loongarch64/snrm2_lasx.S (+94, -32)
  12. kernel/loongarch64/swap_lasx.S (+9, -56)

kernel/loongarch64/amax_lasx.S (+1, -6)

@@ -56,17 +56,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
LDINT INCX, 0(INCX)
#endif

xvxor.v VM0, VM0, VM0
bge $r0, N, .L999
bge $r0, INCX, .L999
li.d TEMP, 1
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
#ifdef DOUBLE
xvldrepl.d VM0, X, 0
#else
xvldrepl.w VM0, X, 0
#endif
XVFSUB VM0, VM0, VM0
bne INCX, TEMP, .L20

srai.d I, N, 4
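The change above appears to replace the old "broadcast x[0], then subtract it from itself" initialization with a plain register XOR, so the running maximum starts at exactly 0.0 even when x[0] is Inf or NaN (Inf - Inf would have produced NaN). For reference, a scalar C sketch of what the kernel computes; the helper name is illustrative and a positive increment is assumed:

#include <math.h>
#include <stddef.h>

/* Largest absolute value in x; starting the running maximum at 0.0
   mirrors the xvxor.v initialization added above. */
static float samax_ref(size_t n, const float *x, ptrdiff_t incx)
{
    float m = 0.0f;
    for (size_t i = 0; i < n; i++, x += incx)
        m = fmaxf(m, fabsf(*x));
    return m;
}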


kernel/loongarch64/asum_lasx.S (+10, -12)

@@ -103,21 +103,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvfadd.d res1, VX2, res1
xvfadd.d res1, VX3, res1
#else
xvfadd.s res2, res1, res2
xvpickve.w VX1, res1, 1
xvpickve.w VX2, res1, 2
xvpickve.w VX3, res1, 3
xvfadd.s res1, VX1, res1
xvfadd.s res1, VX2, res1
xvfadd.s res1, VX3, res1
xvpickve.w VX0, res2, 4
xvpickve.w VX1, res2, 5
xvpickve.w VX2, res2, 6
xvpickve.w VX3, res2, 7
xvpickve.w VX0, res1, 4
xvpickve.w VX1, res1, 5
xvpickve.w VX2, res1, 6
xvpickve.w VX3, res1, 7
xvfadd.s res1, VX0, res1
xvfadd.s res1, VX1, res1
xvfadd.s res1, VX2, res1
xvfadd.s res1, VX2, res1
xvfadd.s res1, VX3, res1
#endif
.align 3

@@ -217,21 +216,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvfadd.d res1, VX2, res1
xvfadd.d res1, VX3, res1
#else
xvfadd.s res2, res1, res2
xvpickve.w VX1, res1, 1
xvpickve.w VX2, res1, 2
xvpickve.w VX3, res1, 3
xvfadd.s res1, VX1, res1
xvfadd.s res1, VX2, res1
xvfadd.s res1, VX3, res1
xvpickve.w VX0, res2, 4
xvpickve.w VX1, res2, 5
xvpickve.w VX2, res2, 6
xvpickve.w VX3, res2, 7
xvpickve.w VX0, res1, 4
xvpickve.w VX1, res1, 5
xvpickve.w VX2, res1, 6
xvpickve.w VX3, res1, 7
xvfadd.s res1, VX0, res1
xvfadd.s res1, VX1, res1
xvfadd.s res1, VX2, res1
xvfadd.s res1, VX2, res1
xvfadd.s res1, VX3, res1
#endif
.align 3
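The edited lines are the horizontal reduction at the end of the single-precision path, which now appears to take all of its lanes from the same accumulator register (res1) and add each lane exactly once. A scalar model of that 8-lane horizontal sum, with the vector register represented as a plain array (name is illustrative):

#include <stddef.h>

/* Horizontal sum of an 8-lane single-precision accumulator. */
static float hsum8(const float lanes[8])
{
    float s = 0.0f;
    for (int i = 0; i < 8; i++)
        s += lanes[i];
    return s;
}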



kernel/loongarch64/cdot_lasx.S (+1, -1)

@@ -288,7 +288,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvinsgr2vr.w x2, t2, 6
xvinsgr2vr.w x1, t3, 7
xvinsgr2vr.w x2, t4, 7
addi.d Y, Y, 8 * SIZE
addi.d Y, Y, 16 * SIZE
xvpickev.w x3, VX3, VX2
xvpickod.w x4, VX3, VX2
xvfmadd.s res1, x1, x3, res1
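The one-line fix advances Y by 16 * SIZE per unrolled iteration: each pass handles 8 complex single-precision elements, which is 16 floats of x and 16 floats of y. A scalar model of one iteration's span (unconjugated variant, illustrative helper name):

#include <complex.h>
#include <stddef.h>

static float _Complex cdotu_block8(const float *x, const float *y)
{
    float _Complex acc = 0.0f;
    for (int k = 0; k < 8; k++) {                       /* 8 complex elements */
        float _Complex xv = x[2*k] + x[2*k+1] * I;      /* 16 floats of x     */
        float _Complex yv = y[2*k] + y[2*k+1] * I;      /* 16 floats of y     */
        acc += xv * yv;
    }
    return acc;
}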


kernel/loongarch64/cnrm2_lasx.S (+53, -25)

@@ -47,6 +47,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define VX4 $xr21
#define res1 $xr19
#define res2 $xr20
#define RCP $f2
#define VALPHA $xr3

PROLOGUE

@@ -55,10 +57,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
LDINT INCX, 0(INCX)
#endif

xvxor.v res1, res1, res1
xvxor.v res2, res2, res2
bge $r0, N, .L999
beq $r0, INCX, .L999

addi.d $sp, $sp, -32
st.d $ra, $sp, 0
st.d N, $sp, 8
st.d X, $sp, 16
st.d INCX, $sp, 24
#ifdef DYNAMIC_ARCH
bl camax_k_LA264
#else
bl camax_k
#endif
ld.d $ra, $sp, 0
ld.d N, $sp, 8
ld.d X, $sp, 16
ld.d INCX, $sp, 24
addi.d $sp, $sp, 32

frecip.s RCP, $f0
vreplvei.w $vr3, $vr2, 0
xvpermi.d VALPHA, $xr3,0x00
xvxor.v res1, res1, res1
xvxor.v res2, res2, res2
fcmp.ceq.s $fcc0, $f0, $f19
bcnez $fcc0, .L999

li.d TEMP, SIZE
slli.d INCX, INCX, ZBASE_SHIFT
srai.d I, N, 2
@@ -67,13 +92,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.align 3

.L10:
xvld VX0, X, 0 * SIZE
xvfcvtl.d.s VX1, VX0
xvfcvth.d.s VX2, VX0
xvfmadd.d res1, VX1, VX1, res1
xvfmadd.d res2, VX2, VX2, res2
addi.d I, I, -1
addi.d X, X, 8 * SIZE

xvld VX0, X, 0 * SIZE
xvld VX1, X, 8 * SIZE
xvfmul.s VX0, VX0, VALPHA
xvfmul.s VX1, VX1, VALPHA
xvfmadd.s res1, VX0, VX0, res1
xvfmadd.s res2, VX1, VX1, res2

addi.d X, X, 16 * SIZE
blt $r0, I, .L10
.align 3
b .L996
@@ -103,22 +131,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvinsgr2vr.w VX0, t3, 6
xvinsgr2vr.w VX0, t4, 7
add.d X, X, INCX
xvfcvtl.d.s VX1, VX0
xvfcvth.d.s VX2, VX0
xvfmadd.d res1, VX1, VX1, res1
xvfmadd.d res2, VX2, VX2, res2
xvfmul.s VX0, VX0, VALPHA
xvfmadd.s res2, VX0, VX0, res2
addi.d I, I, -1
blt $r0, I, .L21
b .L996

.L996:
xvfadd.d res1, res1, res2
xvpickve.d VX1, res1, 1
xvpickve.d VX2, res1, 2
xvpickve.d VX3, res1, 3
xvfadd.d res1, VX1, res1
xvfadd.d res1, VX2, res1
xvfadd.d res1, VX3, res1
xvfadd.s res1, res1, res2
xvpermi.d VX1, res1, 0x4e
xvfadd.s res1, res1, VX1
vreplvei.w $vr17, $vr19, 1
vreplvei.w $vr18, $vr19, 2
vreplvei.w $vr21, $vr19, 3
xvfadd.s res1, VX2, res1
xvfadd.s res1, VX3, res1
xvfadd.s res1, VX4, res1
.align 3

.L997:
@@ -130,18 +158,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fld.s a1, X, 0 * SIZE
fld.s a2, X, 1 * SIZE
addi.d I, I, -1
fcvt.d.s a1, a1
fcvt.d.s a2, a2
fmadd.d res, a1, a1, res
fmadd.d res, a2, a2, res
fmul.s a1, a1, RCP
fmul.s a2, a2, RCP
fmadd.s res, a1, a1, res
fmadd.s res, a2, a2, res
add.d X, X, INCX
blt $r0, I, .L998
.align 3

.L999:
fsqrt.d res, res
fsqrt.s res, res
fmul.s $f0, res, $f0
move $r4, $r17
fcvt.s.d $f0, res
jirl $r0, $r1, 0x0

EPILOGUE
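The rework follows the same scheme as snrm2_lasx.S further down: obtain a maximum absolute value first (the saved-register block around the bl camax_k call), scale every element by its reciprocal so the squared terms cannot overflow in single precision, and multiply the maximum back in after the square root. A scalar sketch of that scheme, assuming a positive increment; the inline maximum stands in for the camax_k call, whose definition is not part of this diff:

#include <math.h>
#include <stddef.h>

static float scnrm2_ref(size_t n, const float *x, ptrdiff_t incx)
{
    /* Stand-in for camax_k: a maximum absolute value over the vector. */
    float m = 0.0f;
    const float *p = x;
    for (size_t i = 0; i < n; i++, p += 2 * incx)
        m = fmaxf(m, fmaxf(fabsf(p[0]), fabsf(p[1])));
    if (m == 0.0f)
        return 0.0f;                       /* also covers n == 0 */

    float r = 1.0f / m, s = 0.0f;          /* frecip.s RCP, $f0  */
    for (size_t i = 0; i < n; i++, x += 2 * incx) {
        float re = x[0] * r, im = x[1] * r;
        s += re * re + im * im;            /* every term is <= 2 */
    }
    return sqrtf(s) * m;                   /* undo the scaling   */
}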

kernel/loongarch64/copy_lasx.S (+4, -4)

@@ -260,9 +260,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add.d Y, Y, INCY
ST a2, Y, 0
add.d Y, Y, INCY
ST a3, X, 0
ST a3, Y, 0
add.d Y, Y, INCY
ST a4, X, 0
ST a4, Y, 0
add.d Y, Y, INCY
LD a1, X, 0
add.d X, X, INCX
@@ -276,9 +276,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add.d Y, Y, INCY
ST a2, Y, 0
add.d Y, Y, INCY
ST a3, X, 0
ST a3, Y, 0
add.d Y, Y, INCY
ST a4, X, 0
ST a4, Y, 0
add.d Y, Y, INCY
addi.d I, I, -1
blt $r0, I, .L222
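The four changed stores send a3 and a4 to Y instead of back into X; in the strided tail every element loaded from X must be written to Y. A scalar reference (illustrative name, positive increments assumed):

#include <stddef.h>

static void scopy_ref(size_t n, const float *x, ptrdiff_t incx,
                      float *y, ptrdiff_t incy)
{
    for (size_t i = 0; i < n; i++, x += incx, y += incy)
        *y = *x;   /* loads from X, stores to Y only */
}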


kernel/loongarch64/cscal_lasx.S (+61, -186)

@@ -33,6 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHAI $f1
#define X $r7
#define INCX $r8
#define DUMMY2 $r9

#define I $r12
#define TEMP $r13
@@ -65,6 +66,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

bge $r0, N, .L999
bge $r0, INCX, .L999
ld.d DUMMY2, $sp, 0
li.d TEMP, 1
movgr2fr.d a1, $r0
FFINT a1, a1
@@ -86,24 +88,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
bne INCX, TEMP, .L22

/////// INCX == 1 ////////
.L11:
bge $r0, I, .L997
CMPEQ $fcc0, ALPHAR, a1
CMPEQ $fcc1, ALPHAI, a1
bceqz $fcc0, .L13
b .L14
.align 3
bge $r0, I, .L19
/////// INCX == 1 && N >= 4 ////////
bnez DUMMY2, .L17 // if DUMMPY2 == 1, called from c/zscal.

.L13:
bceqz $fcc1, .L114 //alpha_r != 0.0 && alpha_i != 0.0
b .L113 //alpha_r != 0.0 && alpha_i == 0.0
bceqz $fcc0, .L17

.L14:
bceqz $fcc1, .L114 //alpha_r == 0.0 && alpha_i != 0.0
b .L111 //alpha_r == 0.0 && alpha_i == 0.0
.align 3
bceqz $fcc1, .L17

.L111: //alpha_r == 0.0 && alpha_i == 0.0
.L15: //alpha_r == 0.0 && alpha_i == 0.0
xvst VXZ, X, 0 * SIZE
#ifdef DOUBLE
xvst VXZ, X, 4 * SIZE
@@ -113,41 +110,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi.d X, X, 16 * SIZE
#endif
addi.d I, I, -1
blt $r0, I, .L111
b .L997
.align 3

.L113: //alpha_r != 0.0 && alpha_i == 0.0
xvld VX0, X, 0 * SIZE
#ifdef DOUBLE
xvld VX1, X, 4 * SIZE
xvpickev.d x1, VX1, VX0
xvpickod.d x2, VX1, VX0
xvfmul.d x3, VXAR, x1
xvfmul.d x4, VXAR, x2
xvilvl.d VX2, x4 ,x3
xvilvh.d VX3, x4, x3
xvst VX2, X, 0 * SIZE
xvst VX3, X, 4 * SIZE
addi.d X, X, 8 * SIZE
#else
xvld VX1, X, 8 * SIZE
xvpickev.w x1, VX1, VX0
xvpickod.w x2, VX1, VX0
xvfmul.s x3, VXAR, x1
xvfmul.s x4, VXAR, x2
xvilvl.w VX2, x4 ,x3
xvilvh.w VX3, x4, x3
xvst VX2, X, 0 * SIZE
xvst VX3, X, 8 * SIZE
addi.d X, X, 16 * SIZE
#endif
addi.d I, I, -1
blt $r0, I, .L113
b .L997
blt $r0, I, .L15
b .L19
.align 3

.L114: //alpha_r != 0.0 && alpha_i != 0.0
.L17:
xvld VX0, X, 0 * SIZE
#ifdef DOUBLE
xvld VX1, X, 4 * SIZE
@@ -177,29 +144,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi.d X, X, 16 * SIZE
#endif
addi.d I, I, -1
blt $r0, I, .L114
b .L997
blt $r0, I, .L17
b .L19
.align 3

/////// INCX == 1 && N < 8 ///////
.L19:
#ifdef DOUBLE
andi I, N, 3
#else
andi I, N, 7
#endif
beqz I, .L999
bnez DUMMY2, .L998 // if DUMMPY2 == 1, called from c/zscal.

bceqz $fcc0, .L998

bceqz $fcc1, .L998

b .L995 // alpha_r == 0.0 && alpha_i == 0.0
.align 3

/////// INCX != 1 ////////
.L22:
bge $r0, I, .L997
move XX, X
CMPEQ $fcc0, ALPHAR, a1
CMPEQ $fcc1, ALPHAI, a1
bceqz $fcc0, .L23
b .L24
.align 3

.L23:
bceqz $fcc1, .L224 //alpha_r != 0.0 && alpha_i != 0.0
b .L223 //alpha_r != 0.0 && alpha_i == 0.0
move XX, X
bge $r0, I, .L29
bnez DUMMY2, .L25 // if DUMMPY2 == 1, called from c/zscal.
bceqz $fcc0, .L25

.L24:
bceqz $fcc1, .L224 //alpha_r == 0.0 && alpha_i != 0.0
b .L221 //alpha_r == 0.0 && alpha_i == 0.0
.align 3
bceqz $fcc1, .L25

.L221: //alpha_r == 0.0 && alpha_i == 0.0
.L27: //alpha_r == 0.0 && alpha_i == 0.0
#ifdef DOUBLE
xvstelm.d VXZ, X, 0, 0
xvstelm.d VXZ, X, 1 * SIZE, 0
@@ -239,122 +216,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
add.d X, X, INCX
addi.d I, I, -1
blt $r0, I, .L221
b .L997
.align 3

.L223: //alpha_r != 0.0 && alpha_i == 0.0
#ifdef DOUBLE
ld.d t1, X, 0 * SIZE
ld.d t2, X, 1 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
ld.d t4, X, 1 * SIZE
add.d X, X, INCX
xvinsgr2vr.d x1, t1, 0
xvinsgr2vr.d x2, t2, 0
xvinsgr2vr.d x1, t3, 1
xvinsgr2vr.d x2, t4, 1
ld.d t1, X, 0 * SIZE
ld.d t2, X, 1 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
ld.d t4, X, 1 * SIZE
xvinsgr2vr.d x1, t1, 2
xvinsgr2vr.d x2, t2, 2
xvinsgr2vr.d x1, t3, 3
xvinsgr2vr.d x2, t4, 3
add.d X, X, INCX

xvfmul.d x3, VXAR, x1
xvfmul.d x4, VXAR, x2
addi.d I, I, -1
xvstelm.d x3, XX, 0 * SIZE, 0
xvstelm.d x4, XX, 1 * SIZE, 0
add.d XX, XX, INCX
xvstelm.d x3, XX, 0 * SIZE, 1
xvstelm.d x4, XX, 1 * SIZE, 1
add.d XX, XX, INCX
xvstelm.d x3, XX, 0 * SIZE, 2
xvstelm.d x4, XX, 1 * SIZE, 2
add.d XX, XX, INCX
xvstelm.d x3, XX, 0 * SIZE, 3
xvstelm.d x4, XX, 1 * SIZE, 3
#else
ld.w t1, X, 0 * SIZE
ld.w t2, X, 1 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
ld.w t4, X, 1 * SIZE
add.d X, X, INCX
xvinsgr2vr.w x1, t1, 0
xvinsgr2vr.w x2, t2, 0
xvinsgr2vr.w x1, t3, 1
xvinsgr2vr.w x2, t4, 1
ld.w t1, X, 0 * SIZE
ld.w t2, X, 1 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
ld.w t4, X, 1 * SIZE
xvinsgr2vr.w x1, t1, 2
xvinsgr2vr.w x2, t2, 2
xvinsgr2vr.w x1, t3, 3
xvinsgr2vr.w x2, t4, 3
add.d X, X, INCX
ld.w t1, X, 0 * SIZE
ld.w t2, X, 1 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
ld.w t4, X, 1 * SIZE
add.d X, X, INCX
xvinsgr2vr.w x1, t1, 4
xvinsgr2vr.w x2, t2, 4
xvinsgr2vr.w x1, t3, 5
xvinsgr2vr.w x2, t4, 5
ld.w t1, X, 0 * SIZE
ld.w t2, X, 1 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
ld.w t4, X, 1 * SIZE
xvinsgr2vr.w x1, t1, 6
xvinsgr2vr.w x2, t2, 6
xvinsgr2vr.w x1, t3, 7
xvinsgr2vr.w x2, t4, 7
add.d X, X, INCX

xvfmul.s x3, VXAR, x1
xvfmul.s x4, VXAR, x2
addi.d I, I, -1
xvstelm.w x3, XX, 0 * SIZE, 0
xvstelm.w x4, XX, 1 * SIZE, 0
add.d XX, XX, INCX
xvstelm.w x3, XX, 0 * SIZE, 1
xvstelm.w x4, XX, 1 * SIZE, 1
add.d XX, XX, INCX
xvstelm.w x3, XX, 0 * SIZE, 2
xvstelm.w x4, XX, 1 * SIZE, 2
add.d XX, XX, INCX
xvstelm.w x3, XX, 0 * SIZE, 3
xvstelm.w x4, XX, 1 * SIZE, 3
add.d XX, XX, INCX
xvstelm.w x3, XX, 0 * SIZE, 4
xvstelm.w x4, XX, 1 * SIZE, 4
add.d XX, XX, INCX
xvstelm.w x3, XX, 0 * SIZE, 5
xvstelm.w x4, XX, 1 * SIZE, 5
add.d XX, XX, INCX
xvstelm.w x3, XX, 0 * SIZE, 6
xvstelm.w x4, XX, 1 * SIZE, 6
add.d XX, XX, INCX
xvstelm.w x3, XX, 0 * SIZE, 7
xvstelm.w x4, XX, 1 * SIZE, 7
#endif
add.d XX, XX, INCX
blt $r0, I, .L223
b .L997
blt $r0, I, .L27
b .L29
.align 3

.L224: //alpha_r != 0.0 && alpha_i != 0.0
.L25:
#ifdef DOUBLE
ld.d t1, X, 0 * SIZE
ld.d t2, X, 1 * SIZE
@@ -376,7 +242,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvinsgr2vr.d x1, t3, 3
xvinsgr2vr.d x2, t4, 3
add.d X, X, INCX

xvfmul.d VX0, VXAI, x2
xvfmsub.d x3, VXAR, x1, VX0
xvfmul.d VX1, VXAI, x1
@@ -434,7 +299,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvinsgr2vr.w x1, t3, 7
xvinsgr2vr.w x2, t4, 7
add.d X, X, INCX

xvfmul.s VX0, VXAI, x2
xvfmsub.s x3, VXAR, x1, VX0
xvfmul.s VX1, VXAI, x1
@@ -465,19 +329,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvstelm.w x4, XX, 1 * SIZE, 7
#endif
add.d XX, XX, INCX
blt $r0, I, .L224
b .L997
blt $r0, I, .L25
b .L29
.align 3

.L997:
/////// INCX != 1 && N < 8 ///////
.L29:
#ifdef DOUBLE
andi I, N, 3
andi I, N, 3
#else
andi I, N, 7
andi I, N, 7
#endif
bge $r0, I, .L999
.align 3
beqz I, .L999
bnez DUMMY2, .L998 // if DUMMPY2 == 1, called from c/zscal.

bceqz $fcc0, .L998

bceqz $fcc1, .L998

.L995: // alpha_r == 0.0 && alpha_i == 0.0
ST a1, X, 0 * SIZE
ST a1, X, 1 * SIZE
addi.d I, I, -1
add.d X, X, INCX
blt $r0, I, .L995
b .L999
.L998:
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
@@ -490,11 +366,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ST s2, X, 1 * SIZE
add.d X, X, INCX
blt $r0, I, .L998
.align 3
b .L999

.L999:
move $r4, $r12
jirl $r0, $r1, 0x0
.align 3

EPILOGUE
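The restructured kernel reads one extra flag from the stack (DUMMY2); per the in-line comments it marks calls coming from c/zscal, and when it is set the full complex multiply is performed even if alpha is zero, with the zero-fill shortcut taken only when the flag is clear and both alpha components are zero. A scalar model of that branch structure; the NaN/Inf-propagation rationale in the comment below is an assumption, not something stated in the diff:

#include <stddef.h>

static void cscal_model(size_t n, float ar, float ai,
                        float *x, ptrdiff_t incx, int from_cscal)
{
    if (!from_cscal && ar == 0.0f && ai == 0.0f) {
        for (size_t i = 0; i < n; i++, x += 2 * incx)
            x[0] = x[1] = 0.0f;            /* zero-fill shortcut */
        return;
    }
    for (size_t i = 0; i < n; i++, x += 2 * incx) {
        float re = x[0], im = x[1];
        x[0] = ar * re - ai * im;          /* full complex multiply, so any */
        x[1] = ar * im + ai * re;          /* NaN/Inf in x still shows up   */
    }
}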

kernel/loongarch64/dot_lasx.S (+29, -57)

@@ -53,8 +53,8 @@ PROLOGUE
#endif

/* init $f8 and $f9 to zero */
SUB s1, s1, s1
SUB s2, s2, s2
xvxor.v $xr8, $xr8, $xr8
xvxor.v $xr9, $xr9, $xr9
slli.d INCX, INCX, BASE_SHIFT
li.d TEMP, SIZE
slli.d INCY, INCY, BASE_SHIFT
@@ -64,20 +64,6 @@ PROLOGUE

/* !((inc_x == 1) && (inc_y == 1)) */

/* init $xr8 and $xr9 to zero */
#ifdef DOUBLE
xvldrepl.d $xr0, X, 0
#else
xvldrepl.w $xr0, X, 0
#endif
#ifdef DSDOT
xvfcvtl.d.s $xr0, $xr0
xvfsub.d $xr8, $xr0, $xr0
xvfsub.d $xr9, $xr0, $xr0
#else
XVFSUB $xr8, $xr0, $xr0
XVFSUB $xr9, $xr0, $xr0
#endif

#ifdef DOUBLE
srai.d I, N, 4
@@ -99,31 +85,31 @@ PROLOGUE
addi.w I, I, -1
addi.d X, X, 128
addi.d Y, Y, 128
#ifdef DSDOT
#ifndef DOUBLE
xvfcvtl.d.s $xr10, $xr0
xvfcvtl.d.s $xr11, $xr4
xvfcvth.d.s $xr12, $xr0
xvfcvth.d.s $xr13, $xr4
xvfmadd.d $xr8, $xr10, $xr12, $xr8
xvfmadd.d $xr9, $xr11, $xr13, $xr9
xvfmadd.d $xr8, $xr10, $xr11, $xr8
xvfmadd.d $xr9, $xr12, $xr13, $xr9
xvfcvtl.d.s $xr10, $xr1
xvfcvtl.d.s $xr11, $xr5
xvfcvth.d.s $xr12, $xr1
xvfcvth.d.s $xr13, $xr5
xvfmadd.d $xr8, $xr10, $xr12, $xr8
xvfmadd.d $xr9, $xr11, $xr13, $xr9
xvfmadd.d $xr8, $xr10, $xr11, $xr8
xvfmadd.d $xr9, $xr12, $xr13, $xr9
xvfcvtl.d.s $xr10, $xr2
xvfcvtl.d.s $xr11, $xr6
xvfcvth.d.s $xr12, $xr2
xvfcvth.d.s $xr13, $xr6
xvfmadd.d $xr8, $xr10, $xr12, $xr8
xvfmadd.d $xr9, $xr11, $xr13, $xr9
xvfmadd.d $xr8, $xr10, $xr11, $xr8
xvfmadd.d $xr9, $xr12, $xr13, $xr9
xvfcvtl.d.s $xr10, $xr3
xvfcvtl.d.s $xr11, $xr7
xvfcvth.d.s $xr12, $xr3
xvfcvth.d.s $xr13, $xr7
xvfmadd.d $xr8, $xr10, $xr12, $xr8
xvfmadd.d $xr9, $xr11, $xr13, $xr9
xvfmadd.d $xr8, $xr10, $xr11, $xr8
xvfmadd.d $xr9, $xr12, $xr13, $xr9
#else
XVFMADD $xr8, $xr0, $xr4, $xr8
XVFMADD $xr9, $xr1, $xr5, $xr9
@@ -149,13 +135,13 @@ PROLOGUE
addi.w I, I, -1
addi.d X, X, 32
addi.d Y, Y, 32
#ifdef DSDOT
#ifndef DOUBLE
xvfcvtl.d.s $xr10, $xr0
xvfcvtl.d.s $xr11, $xr4
xvfcvth.d.s $xr12, $xr0
xvfcvth.d.s $xr13, $xr4
xvfmadd.d $xr8, $xr10, $xr12, $xr8
xvfmadd.d $xr9, $xr11, $xr13, $xr9
xvfmadd.d $xr8, $xr10, $xr11, $xr8
xvfmadd.d $xr9, $xr12, $xr13, $xr9
#else
XVFMADD $xr8, $xr0, $xr4, $xr8
#endif
@@ -163,27 +149,12 @@ PROLOGUE
.align 3
.L14:
/* store dot in s1 $f8 */
#ifdef DSDOT
xvfadd.d $xr8, $xr8, $xr9
fsub.s s2, s2, s2 /* set s2 to 0.0 */
fsub.d s2, s2, s2 /* set s2 to 0.0 */
xvpermi.q $xr0, $xr8, 0x1
vfadd.d $vr8, $vr8, $vr0
vpackod.d $vr0, $vr8, $vr8
vfadd.d $vr8, $vr8, $vr0
#else
XVFADD $xr8, $xr8, $xr9
SUB s2, s2, s2 /* set s2 to 0.0 */
xvpermi.q $xr0, $xr8, 0x1
VFADD $vr8, $vr8, $vr0
vpackod.d $vr0, $vr8, $vr8
#ifdef DOUBLE
VFADD $vr8, $vr8, $vr0
#else
VFADD $vr8, $vr8, $vr0
vpackod.w $vr0, $vr8, $vr8
VFADD $vr8, $vr8, $vr0
#endif /* defined DOUBLE */
#endif /* defined DSDOT */
.align 3
.L15:
#ifdef DOUBLE
@@ -197,7 +168,7 @@ PROLOGUE
/* FLOAT: 1~7 ; DOUBLE: 1~3 */
LD a1, X, 0
LD b1, Y, 0
#ifdef DSDOT
#ifndef DOUBLE
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
@@ -240,7 +211,7 @@ PROLOGUE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
#ifndef DOUBLE
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
@@ -252,7 +223,7 @@ PROLOGUE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
#ifndef DOUBLE
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s2, b1, a1, s2
@@ -264,7 +235,7 @@ PROLOGUE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
#ifndef DOUBLE
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
@@ -276,7 +247,7 @@ PROLOGUE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
#ifndef DOUBLE
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s2, b1, a1, s2
@@ -288,7 +259,7 @@ PROLOGUE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
#ifndef DOUBLE
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
@@ -300,7 +271,7 @@ PROLOGUE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
#ifndef DOUBLE
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s2, b1, a1, s2
@@ -312,7 +283,7 @@ PROLOGUE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
#ifndef DOUBLE
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
@@ -325,7 +296,7 @@ PROLOGUE
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
addi.d I, I, -1
#ifdef DSDOT
#ifndef DOUBLE
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s2, b1, a1, s2
@@ -346,7 +317,7 @@ PROLOGUE
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
addi.d I, I, -1
#ifdef DSDOT
#ifndef DOUBLE
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
@@ -357,12 +328,13 @@ PROLOGUE
.align 3

.L999:
#ifdef DSDOT
fadd.d $f0, s1, s2
move $r4, $r17
#if defined(DOUBLE)
#elif defined(DSDOT)
#else
ADD $f0, s1, s2
fcvt.s.d $f0, $f0
#endif
move $r4, $r17
jirl $r0, $r1, 0x0

EPILOGUE
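The #ifdef DSDOT tests become #ifndef DOUBLE: every single-precision variant (sdot as well as dsdot) now converts its inputs to double and accumulates in double, and the fmadd operands are paired low-with-low and high-with-high. A scalar reference for that path (illustrative name, positive increments assumed):

#include <stddef.h>

static double dsdot_ref(size_t n, const float *x, ptrdiff_t incx,
                        const float *y, ptrdiff_t incy)
{
    double s = 0.0;
    for (size_t i = 0; i < n; i++, x += incx, y += incy)
        s += (double)*x * (double)*y;   /* convert, then accumulate in double */
    return s;                           /* sdot rounds this back to float     */
}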

kernel/loongarch64/iamax_lasx.S (+282, -284)

@@ -56,25 +56,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define VI3 $xr8
#define VI4 $xr19
#define VT0 $xr23
#define VZE $xr3
#define VT1 $xr4
#define VT2 $xr5
#define VC0 $xr6

PROLOGUE
li.d i0, 0
bge $r0, N, .L999
bge $r0, INCX, .L999
li.d TEMP, 1
xvldi VZE, 0
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
bne INCX, TEMP, .L20
xvld VM0, X, 0
#ifdef DOUBLE
xvfsub.d VT1, VZE, VM0
addi.d i0, i0, 1
srai.d I, N, 3
bge $r0, I, .L21
slli.d i0, i0, 2 //4
xvfmaxa.d VM0, VM0, VT1
bge $r0, I, .L11
slli.d i0, i0, 1 //2
xvreplgr2vr.d VINC4, i0
slli.d i0, i0, 1 //8
slli.d i0, i0, 1 //4
xvreplgr2vr.d VINC8, i0
addi.d i0, i0, -15
addi.d i0, i0, -7
xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
addi.d i0, i0, 1
xvinsgr2vr.d VI1, i0, 1
@@ -82,19 +89,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvinsgr2vr.d VI1, i0, 2
addi.d i0, i0, 1
xvinsgr2vr.d VI1, i0, 3
addi.d i0, i0, 5
xvinsgr2vr.d VI0, i0, 0 //1
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 1 //2
xvinsgr2vr.d VI0, i0, 0 //initialize the index value for vectorization
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 2 //3
xvinsgr2vr.d VI0, i0, 1
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 3 //4
xvinsgr2vr.d VI0, i0, 2
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 3
#else
xvfsub.s VT1, VZE, VM0
addi.w i0, i0, 1
srai.d I, N, 3
xvfmaxa.s VM0, VM0, VT1
bge $r0, I, .L21
slli.w i0, i0, 3 //8
slli.w i0, i0, 2 //4
xvreplgr2vr.w VINC4, i0
slli.w i0, i0, 1 //8
xvreplgr2vr.w VINC8, i0
addi.w i0, i0, -15
xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
@@ -135,73 +146,124 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifdef DOUBLE
xvld VX0, X, 0 * SIZE
xvadd.d VI1, VI1, VINC8
xvld VX1, X, 4 * SIZE
xvld VX1, X, 2 * SIZE
xvadd.d VI2, VI1, VINC4
xvfsub.d VT1, VZE, VX0
xvfsub.d VT2, VZE, VX1
xvfmaxa.d VX0, VX0, VT1
xvfmaxa.d VX1, VX1, VT2
xvfcmp.clt.d VT0, VX0, VX1 //abx(x0) < abs(x1)
xvbitsel.v x1, VX0, VX1, VT0 //abs(maxf)
xvbitsel.v x2, VI1, VI2, VT0 //i

xvld VX0, X, 4 * SIZE
xvadd.d VI1, VI2, VINC4
xvld VX1, X, 6 * SIZE
xvadd.d VI2, VI1, VINC4
xvfmaxa.d VM1, VX0, VX1
xvfcmp.ceq.d VT0, VX0, VM1
xvfsub.d VT1, VZE, VX0
xvfsub.d VT2, VZE, VX1
xvfmaxa.d VX0, VX0, VT1
xvfmaxa.d VX1, VX1, VT2
xvfcmp.clt.d VT0, VX0, VX1
xvbitsel.v x3, VX0, VX1, VT0 //abs(maxf)
xvbitsel.v x4, VI1, VI2, VT0 //i
xvfcmp.clt.d VC0, x1, x3
xvbitsel.v x1, x1, x3, VC0 //abs(maxf)
xvbitsel.v x2, x2, x4, VC0 //i
xvfcmp.clt.d VT0, VM0, x1
addi.d I, I, -1
xvbitsel.v VI2, VI2, VI1, VT0
xvfmaxa.d VM1, VM0, VM1
xvfcmp.ceq.d VT0, VM0, VM1
addi.d X, X, 8 * SIZE
xvbitsel.v VM0, VM1, VM0, VT0
xvbitsel.v VI0, VI2, VI0, VT0
xvbitsel.v VM0, VM0, x1, VT0
xvbitsel.v VI0, VI0, x2, VT0
#else
xvld VX0, X, 0 * SIZE
addi.d I, I, -1
xvadd.w VI1, VI1, VINC8
xvfmaxa.s VM1, VX0, VM0
xvfcmp.ceq.s VT0, VM0, VM1
xvld VX1, X, 4 * SIZE
xvadd.w VI2, VI1, VINC4
xvfsub.s VT1, VZE, VX0
xvfsub.s VT2, VZE, VX1
xvfmaxa.s VX0, VX0, VT1
xvfmaxa.s VX1, VX1, VT2
xvfcmp.clt.s VT0, VX0, VX1
xvbitsel.v x1, VX0, VX1, VT0 //abs(maxf)
xvbitsel.v x2, VI1, VI2, VT0 //i
addi.d I, I, -1
xvfcmp.clt.s VT0, VM0, x1
addi.d X, X, 8 * SIZE
xvbitsel.v VM0, VM1, VM0, VT0
xvbitsel.v VI0, VI1, VI0, VT0
xvbitsel.v VM0, VM0, x1, VT0
xvbitsel.v VI0, VI0, x2, VT0

#endif
blt $r0, I, .L10
.align 3

.L15:
#ifdef DOUBLE
xvpickve.d VI1, VI0, 0
xvpickve.d VI2, VI0, 1
xvpickve.d VI3, VI0, 2
xvpickve.d VI4, VI0, 3
xvpickve.d x1, VM0, 0
xvpickve.d x2, VM0, 1
xvpickve.d x3, VM0, 2
xvpickve.d x4, VM0, 3
vreplvei.d $vr21, $vr20, 0
vreplvei.d $vr22, $vr20, 1
vreplvei.d $vr9, $vr15, 0
vreplvei.d $vr10, $vr15, 1
fcmp.ceq.d $fcc0, $f9, $f10
bceqz $fcc0, .L16
xvfcmp.clt.d VT0, VI1, VI2
xvbitsel.v VI0, VI2, VI1, VT0
b .L17
#else
xvxor.v VX0, VX0, VX0
xvor.v VX0, VI0, VX0
xvxor.v VX1, VX1, VX1
xvor.v VX1, VM0, VX1
xvpickve.w VI1, VI0, 0
xvpickve.w VI2, VI0, 1
xvpickve.w VI3, VI0, 2
xvpickve.w VI4, VI0, 3
xvpickve.w x1, VM0, 0
xvpickve.w x2, VM0, 1
xvpickve.w x3, VM0, 2
xvpickve.w x4, VM0, 3
vreplvei.w $vr21, $vr20, 0
vreplvei.w $vr22, $vr20, 1
vreplvei.w $vr8, $vr20, 2
vreplvei.w $vr19, $vr20, 3
vreplvei.w $vr9, $vr15, 0
vreplvei.w $vr10, $vr15, 1
vreplvei.w $vr11, $vr15, 2
vreplvei.w $vr12, $vr15, 3
b .L26
#endif
XVFMAXA VM1, x1, x2
XVCMPEQ VT0, x1, VM1
xvbitsel.v VINC4, VI2, VI1, VT0
XVFMAXA VM0, x3, x4
XVCMPEQ VT0, x3, VM0
xvbitsel.v VINC8, VI4, VI3, VT0
XVFMAXA VM0, VM0, VM1
XVCMPEQ VT0, VM0, VM1
xvbitsel.v VI0, VINC8, VINC4, VT0
CMPEQ $fcc0, $f15, $f9
bceqz $fcc0, .L26
XVCMPLT VT0, VI1, VI0
.align 3

#ifdef DOUBLE
.L16:
xvfcmp.clt.d VT0, x1, x2
xvbitsel.v VI0, VI1, VI2, VT0
xvbitsel.v VM0, x1, x2, VT0
.align 3

.L17:
movfr2gr.d i0, $f20
.align 3

.L11: //INCX==1 and N<8
andi I, N, 7
bge $r0, I, .L14
srai.d i1, N, 3
slli.d i1, i1, 3
addi.d i1, i1, 1 //current index
movgr2fr.d $f21, i1
movgr2fr.d $f20, i0
.align 3

.L13:
fld.d $f9, X, 0
fsub.d $f10, $f3, $f9
xvfmaxa.d x1, x1, x2
xvfcmp.clt.d VT0, VM0, x1
xvbitsel.v VM0, VM0, x1, VT0
xvbitsel.v VI0, VI0, VI1, VT0
b .L26
addi.d I, I, -1
addi.d i1, i1, 1
addi.d X, X, SIZE
movgr2fr.d $f21, i1
blt $r0, I, .L13
movfr2gr.d i0, $f20
.align 3

.L14:
move $r4, $r17
jirl $r0, $r1, 0x0
.align 3

.L20: // INCX!=1
move TEMP, X
#ifdef DOUBLE
addi.d i0, i0, 1
ld.d t1, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
@@ -210,34 +272,103 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
bge $r0, I, .L21
ld.d t2, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.d t3, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.d t4, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
xvinsgr2vr.d VM0, t2, 1
xvinsgr2vr.d VM0, t3, 2
xvinsgr2vr.d VM0, t4, 3
slli.d i0, i0, 2 //4
slli.d i0, i0, 1 //2
xvfsub.d VT1, VZE, VM0
xvreplgr2vr.d VINC4, i0
slli.d i0, i0, 1 //8
slli.d i0, i0, 1 //4
xvreplgr2vr.d VINC8, i0
addi.d i0, i0, -15
addi.d i0, i0, -7
xvfmaxa.d VM0, VM0, VT1
xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
addi.d i0, i0, 1
xvinsgr2vr.d VI1, i0, 1
addi.d i0, i0, 1
xvinsgr2vr.d VI1, i0, 2
addi.d i0, i0, 1
xvinsgr2vr.d VI1, i0, 3
addi.d i0, i0, 5
addi.d i0, i0, 3
xvinsgr2vr.d VI0, i0, 0 //1
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 1 //2
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 2 //3
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 3 //4
.align 3

.L24:
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX0, t1, 0
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX0, t2, 1
xvadd.d VI1, VI1, VINC8
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX1, t1, 0
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX1, t2, 1
xvadd.d VI2, VI1, VINC4

xvfsub.d VT1, VZE, VX0
xvfsub.d VT2, VZE, VX1
xvfmaxa.d VX0, VX0, VT1
xvfmaxa.d VX1, VX1, VT2
xvfcmp.clt.d VT0, VX0, VX1
xvbitsel.v x1, VX0, VX1, VT0
xvbitsel.v x2, VI1, VI2, VT0
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX0, t1, 0
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX0, t2, 1
xvadd.d VI1, VI2, VINC4
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX1, t1, 0
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX1, t2, 1
xvadd.d VI2, VI1, VINC4
xvfsub.d VT1, VZE, VX0
xvfsub.d VT2, VZE, VX1
xvfmaxa.d VX0, VX0, VT1
xvfmaxa.d VX1, VX1, VT2
xvfcmp.clt.d VT0, VX0, VX1
xvbitsel.v x3, VX0, VX1, VT0
xvbitsel.v x4, VI1, VI2, VT0
xvfcmp.clt.d VC0, x1, x3
xvbitsel.v x1, x1, x3, VC0
xvbitsel.v x2, x2, x4, VC0
xvfcmp.clt.d VT0, VM0, x1
xvbitsel.v VM0, VM0, x1, VT0
xvbitsel.v VI0, VI0, x2, VT0

addi.d I, I, -1
blt $r0, I, .L24
.align 3

.L25:
vreplvei.d $vr21, $vr20, 0
vreplvei.d $vr22, $vr20, 1
vreplvei.d $vr9, $vr15, 0
vreplvei.d $vr10, $vr15, 1
fcmp.ceq.d $fcc0, $f10, $f9
bceqz $fcc0, .L26
xvfcmp.clt.d VT0, VI1, VI2
xvbitsel.v VI0, VI2, VI1, VT0
b .L27
.align 3

.L26:
xvfcmp.clt.d VT0, x1, x2
xvbitsel.v VI0, VI1, VI2, VT0
xvbitsel.v VM0, x1, x2, VT0
.align 3

.L27:
movfr2gr.d i0, $f20
.align 3

#else
.L20: // INCX!=1
move TEMP, X
addi.w i0, i0, 1
ld.w t1, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
@@ -253,19 +384,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvinsgr2vr.w VM0, t2, 1
xvinsgr2vr.w VM0, t3, 2
xvinsgr2vr.w VM0, t4, 3
ld.w t1, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t2, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t3, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t4, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
xvinsgr2vr.w VM0, t1, 4
xvinsgr2vr.w VM0, t2, 5
xvinsgr2vr.w VM0, t3, 6
xvinsgr2vr.w VM0, t4, 7
slli.w i0, i0, 3 //8
slli.w i0, i0, 2 //4
xvreplgr2vr.w VINC4, i0
slli.w i0, i0, 1 //8
xvreplgr2vr.w VINC8, i0
addi.w i0, i0, -15
xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
@@ -275,15 +396,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvinsgr2vr.w VI1, i0, 2
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 3
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 4
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 5
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 6
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 7
addi.w i0, i0, 1
addi.w i0, i0, 5
xvinsgr2vr.w VI0, i0, 0 //1
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 1 //2
@@ -291,54 +404,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvinsgr2vr.w VI0, i0, 2 //3
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 3 //4
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 4 //5
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 5 //6
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 6 //7
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 7 //8
#endif
.align 3

.L24:
#ifdef DOUBLE
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX0, t1, 0
xvinsgr2vr.d VX0, t2, 1
xvinsgr2vr.d VX0, t3, 2
xvinsgr2vr.d VX0, t4, 3
xvadd.d VI1, VI1, VINC8
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX1, t1, 0
xvinsgr2vr.d VX1, t2, 1
xvinsgr2vr.d VX1, t3, 2
xvinsgr2vr.d VX1, t4, 3
xvadd.d VI2, VI1, VINC4
xvfmaxa.d VM1, VX0, VX1
xvfcmp.ceq.d VT0, VX0, VM1
addi.d I, I, -1
xvbitsel.v VI2, VI2, VI1, VT0
xvfmaxa.d VM1, VM0, VM1
xvfcmp.ceq.d VT0, VM0, VM1
xvbitsel.v VM0, VM1, VM0, VT0
xvbitsel.v VI0, VI2, VI0, VT0
#else
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
@@ -351,6 +419,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvinsgr2vr.w VX0, t2, 1
xvinsgr2vr.w VX0, t3, 2
xvinsgr2vr.w VX0, t4, 3
xvadd.w VI1, VI1, VINC8
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
@@ -359,158 +428,80 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 4
xvinsgr2vr.w VX0, t2, 5
xvinsgr2vr.w VX0, t3, 6
xvinsgr2vr.w VX0, t4, 7
xvadd.w VI1, VI1, VINC8
xvfmaxa.s VM1, VX0, VM0
xvfcmp.ceq.s VT0, VM1, VM0
xvinsgr2vr.w VX1, t1, 0
xvinsgr2vr.w VX1, t2, 1
xvinsgr2vr.w VX1, t3, 2
xvinsgr2vr.w VX1, t4, 3
xvadd.w VI2, VI1, VINC4
xvfsub.s VT1, VZE, VX0
xvfsub.s VT2, VZE, VX1
xvfmaxa.s VX0, VX0, VT1
xvfmaxa.s VX1, VX1, VT2
xvfcmp.clt.s VT0, VX0, VX1
xvbitsel.v x1, VX0, VX1, VT0
xvbitsel.v x2, VI1, VI2, VT0 //i

addi.d I, I, -1
xvbitsel.v VM0, VM1, VM0, VT0
xvbitsel.v VI0, VI1, VI0, VT0
#endif
xvfcmp.clt.s VT0, VM0, x1
xvbitsel.v VM0, VM0, x1, VT0
xvbitsel.v VI0, VI0, x2, VT0
blt $r0, I, .L24
.align 3

.L25:
#ifdef DOUBLE
xvpickve.d VI1, VI0, 0
xvpickve.d VI2, VI0, 1
xvpickve.d VI3, VI0, 2
xvpickve.d VI4, VI0, 3
xvpickve.d x1, VM0, 0
xvpickve.d x2, VM0, 1
xvpickve.d x3, VM0, 2
xvpickve.d x4, VM0, 3
xvfmaxa.d VM1, x1, x2
xvfcmp.ceq.d VT0, x1, VM1
xvbitsel.v VINC4, VI2, VI1, VT0
xvfmaxa.d VM0, x4, x3
xvfcmp.ceq.d VT0, x3, VM0
xvbitsel.v VINC8, VI4, VI3, VT0
xvfmaxa.d VM0, VM0, VM1
xvfcmp.ceq.d VT0, VM0, VM1
xvbitsel.v VI0, VINC8, VINC4, VT0
#else
xvxor.v VX0, VX0, VX0
xvor.v VX0, VI0, VX0
xvxor.v VX1, VX1, VX1
xvor.v VX1, VM0, VX1
xvpickve.w VI1, VI0, 0
xvpickve.w VI2, VI0, 1
xvpickve.w VI3, VI0, 2
xvpickve.w VI4, VI0, 3
xvpickve.w x1, VM0, 0
xvpickve.w x2, VM0, 1
xvpickve.w x3, VM0, 2
xvpickve.w x4, VM0, 3
xvfmaxa.s VM1, x1, x2
xvfcmp.ceq.s VT0, x1, VM1
xvbitsel.v VINC4, VI2, VI1, VT0
xvfmaxa.s VM0, x3, x4
xvfcmp.ceq.s VT0, x3, VM0
xvbitsel.v VINC8, VI3, VI4, VT0
xvfmaxa.s VM0, VM0, VM1
xvfcmp.ceq.s VT0, VM0, VM1
xvbitsel.v VM0, VM0, VM1, VT0
xvbitsel.v VI0, VINC8, VINC4, VT0
#endif
CMPEQ $fcc0, $f15, $f9
bceqz $fcc0, .L26
XVCMPLT VT0, VI1, VI0
xvbitsel.v VI0, VI0, VI1, VT0
vreplvei.w $vr21, $vr20, 0
vreplvei.w $vr22, $vr20, 1
vreplvei.w $vr8, $vr20, 2
vreplvei.w $vr19, $vr20, 3
vreplvei.w $vr9, $vr15, 0
vreplvei.w $vr10, $vr15, 1
vreplvei.w $vr11, $vr15, 2
vreplvei.w $vr12, $vr15, 3
.align 3

.L26:
fcmp.ceq.d $fcc0, $f15, $f10
bceqz $fcc0, .L27
XVCMPLT VT0, VI2, VI0
xvbitsel.v VI0, VI0, VI2, VT0
fcmp.ceq.s $fcc0, $f9, $f10
bceqz $fcc0, .L31
xvfcmp.clt.s VT0, VI1, VI2
xvbitsel.v VI1, VI2, VI1, VT0
b .L32
.align 3

.L27:
fcmp.ceq.d $fcc0, $f15, $f11
bceqz $fcc0, .L28
XVCMPLT VT0, VI3, VI0
xvbitsel.v VI0, VI0, VI3, VT0
.L31:
xvfcmp.clt.s VT0, x1, x2
xvbitsel.v VI1, VI1, VI2, VT0
xvbitsel.v x1, x1, x2, VT0
.align 3
.L28:
fcmp.ceq.d $fcc0, $f15, $f12
bceqz $fcc0, .L29
XVCMPLT VT0, VI4, VI0
xvbitsel.v VI0, VI0, VI4, VT0
.L32:
fcmp.ceq.s $fcc0, $f11, $f12
bceqz $fcc0, .L33
xvfcmp.clt.s VT1, VI3, VI4
xvbitsel.v VI3, VI4, VI3, VT1
b .L34
.align 3

.L29:
#ifdef DOUBLE
movfr2gr.d i0, $f20
#else
fmov.s $f16, $f20
#endif
.L33:
xvfcmp.clt.s VT1, x3, x4
xvbitsel.v x3, x3, x4, VT1
xvbitsel.v VI3, VI3, VI4, VT1
.align 3

#ifdef DOUBLE

#else
.L252:
xvxor.v VI0, VI0, VI0
xvor.v VI0, VI0, VX0
fmov.s $f13, $f15
xvxor.v VM0, VM0, VM0
xvor.v VM0, VM0, VX1
xvpickve.w VI1, VI0, 4
xvpickve.w VI2, VI0, 5
xvpickve.w VI3, VI0, 6
xvpickve.w VI4, VI0, 7
xvpickve.w x1, VM0, 4
xvpickve.w x2, VM0, 5
xvpickve.w x3, VM0, 6
xvpickve.w x4, VM0, 7
xvfmaxa.s VM1, x1, x2
xvfcmp.ceq.s VT0, x1, VM1
xvbitsel.v VINC4, VI2, VI1, VT0
xvfmaxa.s VM0, x3, x4
xvfcmp.ceq.s VT0, x3, VM0
xvbitsel.v VINC8, VI4, VI3, VT0
xvfmaxa.s VM0, VM0, VM1
xvfcmp.ceq.s VT0, VM0, VM1
xvbitsel.v VI0, VINC8, VINC4, VT0
fcmp.ceq.d $fcc0, $f15, $f9
bceqz $fcc0, .L262
xvfcmp.clt.s VT0, VI1, VI0
xvbitsel.v VI0, VI0, VI1, VT0
.L34:
fcmp.ceq.s $fcc0, $f9, $f11
bceqz $fcc0, .L35
xvfcmp.clt.s VT0, VI1, VI3
xvbitsel.v VI0, VI3, VI1, VT0
xvxor.v VM0, x1, VZE
b .L29
.align 3

.L262:
fcmp.ceq.d $fcc0, $f15, $f10
bceqz $fcc0, .L272
xvfcmp.clt.s VT0, VI2, VI0
xvbitsel.v VI0, VI0, VI2, VT0
.L35:
xvfcmp.clt.s VT0, x1, x3
xvbitsel.v VM0, x1, x3, VT0
xvbitsel.v VI0, VI1, VI3, VT0
.align 3
.L272:
fcmp.ceq.d $fcc0, $f15, $f11
bceqz $fcc0, .L282
xvfcmp.clt.s VT0, VI3, VI0
xvbitsel.v VI0, VI0, VI3, VT0
.align 3

.L282:
fcmp.ceq.d $fcc0, $f15, $f12
bceqz $fcc0, .L292
xvfcmp.clt.s VT0, VI4, VI0
xvbitsel.v VI0, VI0, VI4, VT0
.L29:
movfr2gr.s i0, $f20
.align 3

.L292:
xvfmaxa.s VM0, VX0, VM0
xvfcmp.ceq.s VT0, VM0, VX0
xvbitsel.v VI0, VI0, VI1, VT0
movfr2gr.s i0, $f20
#endif

.L21: //N<8
.L21: // N<8
andi I, N, 7
bge $r0, I, .L999
srai.d i1, N, 3
@@ -521,17 +512,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.align 3

.L22:
LD $f9, X, 0
LD $f9, X, 0
#ifdef DOUBLE
fsub.d $f10, $f3, $f9
xvfmaxa.d x1, x1, x2
xvfcmp.clt.d VT0, VM0, x1
#else
fsub.s $f10, $f3, $f9
xvfmaxa.s x1, x1, x2
xvfcmp.clt.s VT0, VM0, x1
#endif
xvbitsel.v VM0, VM0, x1, VT0
xvbitsel.v VI0, VI0, VI1, VT0
addi.d I, I, -1
XVFMAXA VM1, x1, VM0
XVCMPEQ VT0, VM0, VM1
add.d X, X, INCX
xvbitsel.v VM0, VM1, VM0, VT0
xvbitsel.v VI0, VI1, VI0, VT0
addi.d i1, i1, 1
add.d X, X, INCX
movgr2fr.d $f21, i1
blt $r0, I, .L22
MTG i0, $f20
MTG i0, $f20
.align 3

.L999:
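The reworked kernel keeps the usual i(s/d)amax contract: return the 1-based index of the first element with the largest absolute value, or 0 when n <= 0 or incx <= 0 (the index comparisons in the reduction tail appear to prefer the lower index on ties). A scalar reference (illustrative name, positive increment assumed):

#include <math.h>
#include <stddef.h>

static size_t isamax_ref(size_t n, const float *x, ptrdiff_t incx)
{
    if (n == 0 || incx <= 0)
        return 0;
    size_t best = 1;
    float m = fabsf(x[0]);
    for (size_t i = 1; i < n; i++) {
        float v = fabsf(x[i * incx]);
        if (v > m) {                 /* strict >, so the first maximum wins */
            m = v;
            best = i + 1;            /* 1-based, as BLAS expects            */
        }
    }
    return best;
}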


kernel/loongarch64/icamax_lasx.S (+165, -247)

@@ -76,66 +76,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi.d i0, i0, 1
srai.d I, N, 2
bge $r0, I, .L21
slli.d i0, i0, 2 //4
slli.d i0, i0, 1 //2
xvreplgr2vr.d VINC4, i0
addi.d i0, i0, -7
addi.d i0, i0, -3
xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
addi.d i0, i0, 2
addi.d i0, i0, 1
xvinsgr2vr.d VI1, i0, 1
addi.d i0, i0, -1
addi.d i0, i0, 1
xvinsgr2vr.d VI1, i0, 2
addi.d i0, i0, 2
xvinsgr2vr.d VI1, i0, 3
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 0 //1
addi.d i0, i0, 2
xvinsgr2vr.d VI0, i0, 1 //3
xvinsgr2vr.d VI1, i0, 3
addi.d i0, i0, -1
xvinsgr2vr.d VI0, i0, 2 //2
addi.d i0, i0, 2
xvinsgr2vr.d VI0, i0, 3 //4
xvinsgr2vr.d VI0, i0, 0
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 1
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 2
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 3
#else
li.w I, -1
xvreplgr2vr.w VI4, I
xvffint.s.w VI4, VI4 // -1
bne INCX, TEMP, .L20
addi.w i0, i0, 1
srai.d I, N, 3
srai.d I, N, 2
bge $r0, I, .L21
slli.w i0, i0, 3 //8
xvreplgr2vr.w VINC8, i0
addi.w i0, i0, -15
slli.w i0, i0, 2 //4
xvreplgr2vr.w VINC4, i0
addi.w i0, i0, -7
xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 1
addi.w i0, i0, 3
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 2
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 3
addi.w i0, i0, -3
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 4
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 5
addi.w i0, i0, 3
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 6
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 7
addi.w i0, i0, -3
xvinsgr2vr.w VI0, i0, 0
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 0 //1
xvinsgr2vr.w VI0, i0, 1
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 1 //2
addi.w i0, i0, 3
xvinsgr2vr.w VI0, i0, 2 //5
xvinsgr2vr.w VI0, i0, 2
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 3 //6
addi.w i0, i0, -3
xvinsgr2vr.w VI0, i0, 4 //3
xvinsgr2vr.w VI0, i0, 3
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 4
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 5 //4
addi.w i0, i0, 3
xvinsgr2vr.w VI0, i0, 6 //7
xvinsgr2vr.w VI0, i0, 5
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 7 //8
xvinsgr2vr.w VI0, i0, 6
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 7
#endif
.align 3

@@ -143,7 +143,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvld VX0, X, 0 * SIZE
#ifdef DOUBLE
xvadd.d VI1, VI1, VINC4
xvld VX1, X, 4 * SIZE
xvld VX1, X, 2 * SIZE
addi.d I, I, -1
xvpickev.d x1, VX1, VX0
xvpickod.d x2, VX1, VX0
@@ -153,22 +153,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvfcmp.clt.d VINC8, x2, VI3
xvbitsel.v x1, x1, x3, VT0
xvbitsel.v x2, x2, x4, VINC8
xvfadd.d x1, x1, x2
xvfmax.d x3, VM0, x1
xvfcmp.ceq.d VT0, x3, VM0
xvbitsel.v VM0, x3, VM0, VT0
xvbitsel.v VI0, VI1, VI0, VT0
xvld VX0, X, 4 * SIZE
xvadd.d VI1, VI1, VINC4
xvld VX1, X, 6 * SIZE
xvpickev.d x1, VX1, VX0
xvpickod.d x2, VX1, VX0
xvfmul.d x3, VI4, x1
xvfmul.d x4, VI4, x2
#else
xvadd.w VI1, VI1, VINC8
xvld VX1, X, 8 * SIZE
xvadd.w VI1, VI1, VINC4
xvld VX1, X, 4 * SIZE
addi.d I, I, -1
xvpickev.w x1, VX1, VX0
xvpickod.w x2, VX1, VX0
xvfmul.s x3, VI4, x1
xvfmul.s x4, VI4, x2
xvfcmp.clt.s VT0, x1, VI3
xvfcmp.clt.s VINC4, x2, VI3
xvbitsel.v x1, x1, x3, VT0
xvbitsel.v x2, x2, x4, VINC4
#endif
XVFADD x1, x1, x2
XVFMAX x3, VM0, x1
XVCMPEQ VT0, x3, VM0
XVCMPLT VT0, x1, VI3
XVCMPLT VINC8, x2, VI3
xvbitsel.v x1, x1, x3, VT0
xvbitsel.v x2, x2, x4, VINC8
XVFADD x1, x1, x2
XVFMAX x3, VM0, x1
XVCMPEQ VT0, x3, VM0
addi.d X, X, 8 * SIZE
xvbitsel.v VM0, x3, VM0, VT0
xvbitsel.v VI0, VI1, VI0, VT0
@@ -177,51 +189,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

.L15:
#ifdef DOUBLE
xvpickve.d VI1, VI0, 0
xvpickve.d VI2, VI0, 1
xvpickve.d VI3, VI0, 2
xvpickve.d VI4, VI0, 3
xvpickve.d x1, VM0, 0
xvpickve.d x2, VM0, 1
xvpickve.d x3, VM0, 2
xvpickve.d x4, VM0, 3
xvfmax.d VM1, x1, x2
xvfcmp.ceq.d VT0, VM1, x1
vreplvei.d $vr21, $vr20, 0
vreplvei.d $vr22, $vr20, 1
vreplvei.d $vr9, $vr15, 0
vreplvei.d $vr10, $vr15, 1
fcmp.ceq.d $fcc0, $f10, $f9
bceqz $fcc0, .L26
xvfcmp.clt.d VT0, VI1, VI2
xvbitsel.v VI0, VI2, VI1, VT0
b .L27
#else
vreplvei.w $vr21, $vr20, 0
vreplvei.w $vr22, $vr20, 1
vreplvei.w $vr8, $vr20, 2
vreplvei.w $vr19, $vr20, 3
vreplvei.w $vr9, $vr15, 0
vreplvei.w $vr10, $vr15, 1
vreplvei.w $vr11, $vr15, 2
vreplvei.w $vr12, $vr15, 3
xvfmaxa.s VM1, x1, x2
xvfcmp.ceq.s VT0, VM1, x1
xvbitsel.v VINC4, VI2, VI1, VT0
xvfmax.d VM0, x3, x4
xvfcmp.ceq.d VT0, x3, VM0
xvfmaxa.s VM0, x3, x4
xvfcmp.ceq.s VT0, x3, VM0
xvbitsel.v VINC8, VI4, VI3, VT0
xvfmax.d VM0, VM0, VM1
xvfcmp.ceq.d VT0, VM0, VM1
xvfmaxa.s VM0, VM0, VM1
xvfcmp.ceq.s VT0, VM0, VM1
xvbitsel.v VI0, VINC8, VINC4, VT0
#else
xvxor.v VX0, VX0, VX0
xvor.v VX0, VI0, VX0
xvxor.v VX1, VX1, VX1
xvor.v VX1, VM0, VX1
xvpickve.w VI1, VI0, 0
xvpickve.w VI2, VI0, 1
xvpickve.w VI3, VI0, 2
xvpickve.w VI4, VI0, 3
xvpickve.w x1, VM0, 0
xvpickve.w x2, VM0, 1
xvpickve.w x3, VM0, 2
xvpickve.w x4, VM0, 3
xvfcmp.clt.s VT0, x1, x2
xvbitsel.v VM1, x1, x2, VT0
xvbitsel.v VINC4, VI1, VI2, VT0
xvfcmp.clt.s VT0, x3, x4
xvbitsel.v VM0, x3, x4, VT0
xvbitsel.v VINC8, VI3, VI4, VT0
xvfcmp.clt.s VT0, VM0, VM1
xvbitsel.v VM0, VM0, VM1, VT0
xvbitsel.v VI0, VINC8, VINC4, VT0
#endif
fcmp.ceq.d $fcc0, $f15, $f9
bceqz $fcc0, .L26
XVCMPLT VT0, VI1, VI0
xvfcmp.clt.s VT0, VI1, VI0
xvbitsel.v VI0, VI0, VI1, VT0
b .L26
#endif
.align 3

.L20: // INCX!=1
@@ -229,62 +229,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi.d i0, i0, 1
srai.d I, N, 2
bge $r0, I, .L21
slli.d i0, i0, 2 //4
slli.d i0, i0, 1 //2
xvreplgr2vr.d VINC4, i0
addi.d i0, i0, -7
addi.d i0, i0, -3
xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
addi.d i0, i0, 2
addi.d i0, i0, 1
xvinsgr2vr.d VI1, i0, 1
addi.d i0, i0, -1
addi.d i0, i0, 1
xvinsgr2vr.d VI1, i0, 2
addi.d i0, i0, 2
xvinsgr2vr.d VI1, i0, 3
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 0 //1
addi.d i0, i0, 2
xvinsgr2vr.d VI0, i0, 1 //3
xvinsgr2vr.d VI1, i0, 3
addi.d i0, i0, -1
xvinsgr2vr.d VI0, i0, 2 //2
addi.d i0, i0, 2
xvinsgr2vr.d VI0, i0, 3 //4
xvinsgr2vr.d VI0, i0, 0
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 1
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 2
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 3
#else
addi.w i0, i0, 1
srai.d I, N, 3
srai.d I, N, 2
bge $r0, I, .L21
slli.w i0, i0, 3 //8
xvreplgr2vr.w VINC8, i0
addi.w i0, i0, -15
slli.w i0, i0, 2 //4
xvreplgr2vr.w VINC4, i0
addi.w i0, i0, -7
xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 1
addi.w i0, i0, 3
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 2
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 3
addi.w i0, i0, -3
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 4
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 5
addi.w i0, i0, 3
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 6
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 7
addi.w i0, i0, -3
xvinsgr2vr.w VI0, i0, 0
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 0 //1
xvinsgr2vr.w VI0, i0, 1
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 1 //2
addi.w i0, i0, 3
xvinsgr2vr.w VI0, i0, 2 //5
xvinsgr2vr.w VI0, i0, 2
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 3 //6
addi.w i0, i0, -3
xvinsgr2vr.w VI0, i0, 4 //3
xvinsgr2vr.w VI0, i0, 3
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 4
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 5 //4
addi.w i0, i0, 3
xvinsgr2vr.w VI0, i0, 6 //7
xvinsgr2vr.w VI0, i0, 5
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 7 //8
xvinsgr2vr.w VI0, i0, 6
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 7
#endif
.align 3

@@ -301,16 +301,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvinsgr2vr.d x1, t3, 1
xvinsgr2vr.d x2, t4, 1
xvadd.d VI1, VI1, VINC4
xvfmul.d x3, VI4, x1
xvfmul.d x4, VI4, x2
xvfcmp.clt.d VT0, x1, VI3
xvfcmp.clt.d VINC8, x2, VI3
xvbitsel.v x1, x1, x3, VT0
xvbitsel.v x2, x2, x4, VINC8
xvfadd.d x1, x1, x2
xvfmax.d x3, VM0, x1
ld.d t1, X, 0 * SIZE
xvfcmp.ceq.d VT0, x3, VM0
ld.d t2, X, 1 * SIZE
xvbitsel.v VM0, x3, VM0, VT0
xvbitsel.v VI0, VI1, VI0, VT0
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
ld.d t4, X, 1 * SIZE
add.d X, X, INCX
xvinsgr2vr.d x1, t1, 2
xvinsgr2vr.d x2, t2, 2
xvinsgr2vr.d x1, t3, 3
xvinsgr2vr.d x2, t4, 3
xvinsgr2vr.d x1, t1, 0
xvinsgr2vr.d x2, t2, 0
xvinsgr2vr.d x1, t3, 1
xvinsgr2vr.d x2, t4, 1
xvadd.d VI1, VI1, VINC4
addi.d I, I, -1
xvfmul.d x3, VI4, x1
xvfmul.d x4, VI4, x2
@@ -332,6 +344,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvinsgr2vr.w x2, t2, 0
xvinsgr2vr.w x1, t3, 1
xvinsgr2vr.w x2, t4, 1
xvadd.w VI1, VI1, VINC4
ld.w t1, X, 0 * SIZE
ld.w t2, X, 1 * SIZE
add.d X, X, INCX
@@ -342,31 +355,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvinsgr2vr.w x2, t2, 2
xvinsgr2vr.w x1, t3, 3
xvinsgr2vr.w x2, t4, 3
xvadd.w VI1, VI1, VINC8
ld.w t1, X, 0 * SIZE
ld.w t2, X, 1 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
ld.w t4, X, 1 * SIZE
add.d X, X, INCX
xvinsgr2vr.w x1, t1, 4
xvinsgr2vr.w x2, t2, 4
xvinsgr2vr.w x1, t3, 5
xvinsgr2vr.w x2, t4, 5
xvadd.w VI1, VI1, VINC8
ld.w t1, X, 0 * SIZE
ld.w t2, X, 1 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
ld.w t4, X, 1 * SIZE
add.d X, X, INCX
xvinsgr2vr.w x1, t1, 6
xvinsgr2vr.w x2, t2, 6
xvinsgr2vr.w x1, t3, 7
xvinsgr2vr.w x2, t4, 7
addi.d I, I, -1
xvpickev.w x1, VX1, VX0
xvpickod.w x2, VX1, VX0
xvfmul.s x3, VI4, x1
xvfmul.s x4, VI4, x2
xvfcmp.clt.s VT0, x1, VI3
@@ -384,152 +373,82 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

.L25:
#ifdef DOUBLE
xvpickve.d VI1, VI0, 0
xvpickve.d VI2, VI0, 1
xvpickve.d VI3, VI0, 2
xvpickve.d VI4, VI0, 3
xvpickve.d x1, VM0, 0
xvpickve.d x2, VM0, 1
xvpickve.d x3, VM0, 2
xvpickve.d x4, VM0, 3
xvfmaxa.d VM1, x1, x2
xvfcmp.ceq.d VT0, VM1, x1
vreplvei.d $vr21, $vr20, 0
vreplvei.d $vr22, $vr20, 1
vreplvei.d $vr9, $vr15, 0
vreplvei.d $vr10, $vr15, 1
fcmp.ceq.d $fcc0, $f10, $f9
bceqz $fcc0, .L26
xvfcmp.clt.d VT0, VI1, VI2
xvbitsel.v VI0, VI2, VI1, VT0
b .L27
#else
vreplvei.w $vr21, $vr20, 0
vreplvei.w $vr22, $vr20, 1
vreplvei.w $vr8, $vr20, 2
vreplvei.w $vr19, $vr20, 3
vreplvei.w $vr9, $vr15, 0
vreplvei.w $vr10, $vr15, 1
vreplvei.w $vr11, $vr15, 2
vreplvei.w $vr12, $vr15, 3
xvfmaxa.s VM1, x1, x2
xvfcmp.ceq.s VT0, VM1, x1
xvbitsel.v VINC4, VI2, VI1, VT0
xvfmaxa.d VM0, x3, x4
xvfcmp.ceq.d VT0, x3, VM0
xvfmaxa.s VM0, x3, x4
xvfcmp.ceq.s VT0, x3, VM0
xvbitsel.v VINC8, VI4, VI3, VT0
xvfmaxa.d VM0, VM0, VM1
xvfcmp.ceq.d VT0, VM0, VM1
xvfmaxa.s VM0, VM0, VM1
xvfcmp.ceq.s VT0, VM0, VM1
xvbitsel.v VI0, VINC8, VINC4, VT0
#else
xvxor.v VX0, VX0, VX0
xvor.v VX0, VI0, VX0
xvxor.v VX1, VX1, VX1
xvor.v VX1, VM0, VX1
xvpickve.w VI1, VI0, 0
xvpickve.w VI2, VI0, 1
xvpickve.w VI3, VI0, 2
xvpickve.w VI4, VI0, 3
xvpickve.w x1, VM0, 0
xvpickve.w x2, VM0, 1
xvpickve.w x3, VM0, 2
xvpickve.w x4, VM0, 3
xvfcmp.clt.s VT0, x1, x2
xvbitsel.v VM1, x1, x2, VT0
xvbitsel.v VINC4, VI1, VI2, VT0
xvfcmp.clt.s VT0, x3, x4
xvbitsel.v VM0, x3, x4, VT0
xvbitsel.v VINC8, VI3, VI4, VT0
xvfcmp.clt.s VT0, VM0, VM1
xvbitsel.v VM0, VM0, VM1, VT0
xvbitsel.v VI0, VINC8, VINC4, VT0
#endif
fcmp.ceq.d $fcc0, $f15, $f9
bceqz $fcc0, .L26
XVCMPLT VT0, VI1, VI0
xvfcmp.clt.s VT0, VI1, VI0
xvbitsel.v VI0, VI0, VI1, VT0
#endif
.align 3

#ifdef DOUBLE
.L26:
fcmp.ceq.d $fcc0, $f15, $f10
bceqz $fcc0, .L27
XVCMPLT VT0, VI2, VI0
xvbitsel.v VI0, VI0, VI2, VT0
xvfmaxa.d VM0, x1, x2
xvfcmp.ceq.d VT0, x1, VM0
xvbitsel.v VI0, VI2, VI1, VT0
.align 3

.L27:
fcmp.ceq.d $fcc0, $f15, $f11
bceqz $fcc0, .L28
XVCMPLT VT0, VI3, VI0
xvbitsel.v VI0, VI0, VI3, VT0
.align 3

.L28:
fcmp.ceq.d $fcc0, $f15, $f12
bceqz $fcc0, .L29
XVCMPLT VT0, VI4, VI0
xvbitsel.v VI0, VI0, VI4, VT0
.align 3

.L29:
#ifdef DOUBLE
movfr2gr.d i0, $f20
#else
fmov.s $f16, $f20
#endif
.align 3

#ifdef DOUBLE
#else
.L252:
xvxor.v VI0, VI0, VI0
xvor.v VI0, VI0, VX0
fmov.s $f13, $f15
xvxor.v VM0, VM0, VM0
xvor.v VM0, VM0, VX1
xvpickve.w VI1, VI0, 4
xvpickve.w VI2, VI0, 5
xvpickve.w VI3, VI0, 6
xvpickve.w VI4, VI0, 7
xvpickve.w x1, VM0, 4
xvpickve.w x2, VM0, 5
xvpickve.w x3, VM0, 6
xvpickve.w x4, VM0, 7
xvfcmp.clt.s VT0, x1, x2
xvbitsel.v x1, x1, x2, VT0
xvbitsel.v VINC4, VI1, VI2, VT0
xvfcmp.clt.s VT0, x3, x4
xvbitsel.v VM0, x3, x4, VT0
xvbitsel.v VINC8, VI3, VI4, VT0
xvfcmp.clt.s VT0, VM0, x1
xvbitsel.v VM0, VM0, x1, VT0
xvbitsel.v VI0, VINC8, VINC4, VT0
fcmp.ceq.d $fcc0, $f15, $f9
bceqz $fcc0, .L262
xvfcmp.clt.s VT0, VI1, VI0
xvbitsel.v VI0, VI0, VI1, VT0
.align 3

.L262:
.L26:
fcmp.ceq.d $fcc0, $f15, $f10
bceqz $fcc0, .L272
bceqz $fcc0, .L27
xvfcmp.clt.s VT0, VI2, VI0
xvbitsel.v VI0, VI0, VI2, VT0
.align 3

.L272:
.L27:
fcmp.ceq.d $fcc0, $f15, $f11
bceqz $fcc0, .L282
bceqz $fcc0, .L28
xvfcmp.clt.s VT0, VI3, VI0
xvbitsel.v VI0, VI0, VI3, VT0
.align 3

.L282:
.L28:
fcmp.ceq.d $fcc0, $f15, $f12
bceqz $fcc0, .L292
bceqz $fcc0, .L29
xvfcmp.clt.s VT0, VI4, VI0
xvbitsel.v VI0, VI0, VI4, VT0
.align 3

.L292:
fcmp.clt.s $fcc0, $f15, $f13
fsel $f15, $f15, $f13, $fcc0
fsel $f20, $f20, $f16, $fcc0
.L29:
movfr2gr.s i0, $f20
.align 3

#endif
.L21: //N<8
#ifdef DOUBLE
.L21: //N<4
andi I, N, 3
bge $r0, I, .L999
srai.d i1, N, 2
slli.d i1, i1, 2
#else
andi I, N, 7
bge $r0, I, .L999
srai.d i1, N, 3
slli.d i1, i1, 3
#endif
addi.d i1, i1, 1 //current index
movgr2fr.d $f21, i1
movgr2fr.d $f20, i0
@@ -550,10 +469,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi.d i1, i1, 1
movgr2fr.d $f21, i1
blt $r0, I, .L22
MTG i0, $f20
MTG i0, $f20
.align 3


.L999:
move $r4, $r17
jirl $r0, $r1, 0x0
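Same idea as iamax_lasx.S above, but for complex data: the quantity being maximized is |Re(x_i)| + |Im(x_i)|, as the absolute-value and XVFADD sequence in the loop suggests. A scalar reference (illustrative name, positive increment assumed):

#include <math.h>
#include <stddef.h>

static size_t icamax_ref(size_t n, const float *x, ptrdiff_t incx)
{
    if (n == 0 || incx <= 0)
        return 0;
    size_t best = 1;
    float m = fabsf(x[0]) + fabsf(x[1]);       /* |re| + |im| of element 1 */
    for (size_t i = 1; i < n; i++) {
        const float *p = x + 2 * i * incx;
        float v = fabsf(p[0]) + fabsf(p[1]);
        if (v > m) {
            m = v;
            best = i + 1;                      /* 1-based index */
        }
    }
    return best;
}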


kernel/loongarch64/rot_lasx.S (+123, -1177)
(File diff suppressed because it is too large.)
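Since the body of this diff is not shown, for reference only: the file implements the BLAS plane (Givens) rotation, applied in place to both vectors. A scalar sketch of that operation (illustrative name, positive increments assumed):

#include <stddef.h>

static void srot_ref(size_t n, float *x, ptrdiff_t incx,
                     float *y, ptrdiff_t incy, float c, float s)
{
    for (size_t i = 0; i < n; i++, x += incx, y += incy) {
        float xi = *x, yi = *y;
        *x = c * xi + s * yi;    /* x <- c*x + s*y */
        *y = c * yi - s * xi;    /* y <- c*y - s*x */
    }
}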


kernel/loongarch64/snrm2_lasx.S (+94, -32)

@@ -43,15 +43,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define t2 $r13
#define t3 $r14
#define t4 $r15

/* Don't change following FR unless you know the effects. */
#define VX0 $xr15
#define VX1 $xr16
#define VX2 $xr17
#define VX3 $xr18
#define VX4 $xr21
#define VX5 $xr22
/* Don't change following FR unless you know the effects. */
#define res1 $xr19
#define res2 $xr20
#define RCP $f2
#define VALPHA $xr3

// The optimization for snrm2 cannot simply involve
// extending the data type from float to double and
// then summing the squares of the data. LAPACK tests
// have shown that this approach can still lead to data overflow.
// Instead, we need to find the maximum absolute value in the entire
// array and divide each data element by this maximum value before
// performing the calculation. This approach can avoid overflow (and does not require extending the data type).

PROLOGUE

@@ -59,29 +69,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif
bge $r0, N, .L999
beq $r0, INCX, .L999

addi.d $sp, $sp, -32
st.d $ra, $sp, 0
st.d N, $sp, 8
st.d X, $sp, 16
st.d INCX, $sp, 24
#ifdef DYNAMIC_ARCH
bl samax_k_LA264
#else
bl samax_k
#endif
ld.d $ra, $sp, 0
ld.d N, $sp, 8
ld.d X, $sp, 16
ld.d INCX, $sp, 24
addi.d $sp, $sp, 32

frecip.s RCP, $f0
vreplvei.w $vr3, $vr2, 0
xvpermi.d VALPHA, $xr3,0x00
xvxor.v res1, res1, res1
xvxor.v res2, res2, res2
bge $r0, N, .L999
beq $r0, INCX, .L999
fcmp.ceq.s $fcc0, $f0, $f19
bcnez $fcc0, .L999
li.d TEMP, SIZE
slli.d INCX, INCX, BASE_SHIFT
srai.d I, N, 3
srai.d I, N, 4
bne INCX, TEMP, .L20
bge $r0, I, .L997
bge $r0, I, .L997
.align 3

.L10:
xvld VX0, X, 0
xvfcvtl.d.s VX1, VX0
xvfcvth.d.s VX2, VX0
xvfmadd.d res1, VX1, VX1, res1
xvfmadd.d res2, VX2, VX2, res2
xvld VX0, X, 0
xvld VX5, X, 8 * SIZE
addi.d I, I, -1
addi.d X, X, 8 * SIZE
addi.d X, X, 16 * SIZE

xvfmul.s VX0, VX0, VALPHA
xvfmul.s VX5, VX5, VALPHA

xvfmadd.s res1, VX0, VX0, res1
xvfmadd.s res2, VX5, VX5, res2
blt $r0, I, .L10
.align 3
b .L996
.align 3

.L20:
bge $r0, I, .L997
@@ -107,47 +141,75 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld.w t3, X, 0
add.d X, X, INCX
ld.w t4, X, 0
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 4
xvinsgr2vr.w VX0, t2, 5
xvinsgr2vr.w VX0, t3, 6
xvinsgr2vr.w VX0, t4, 7
xvfmul.s VX0, VX0, VALPHA
xvfmadd.s res1, VX0, VX0, res1

ld.w t1, X, 0
add.d X, X, INCX
ld.w t2, X, 0
add.d X, X, INCX
xvfcvtl.d.s VX1, VX0
xvfcvth.d.s VX2, VX0
xvfmadd.d res1, VX1, VX1, res1
xvfmadd.d res2, VX2, VX2, res2
ld.w t3, X, 0
add.d X, X, INCX
ld.w t4, X, 0
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 0
xvinsgr2vr.w VX0, t2, 1
xvinsgr2vr.w VX0, t3, 2
xvinsgr2vr.w VX0, t4, 3
ld.w t1, X, 0
add.d X, X, INCX
ld.w t2, X, 0
add.d X, X, INCX
ld.w t3, X, 0
add.d X, X, INCX
ld.w t4, X, 0
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 4
xvinsgr2vr.w VX0, t2, 5
xvinsgr2vr.w VX0, t3, 6
xvinsgr2vr.w VX0, t4, 7
xvfmul.s VX0, VX0, VALPHA
xvfmadd.s res2, VX0, VX0, res2
addi.d I, I, -1
blt $r0, I, .L21
b .L996
.align 3

.L996:
xvfadd.d res1, res1, res2
xvpickve.d VX1, res1, 1
xvpickve.d VX2, res1, 2
xvpickve.d VX3, res1, 3
fadd.d $f19, $f19, $f16
fadd.d $f19, $f19, $f17
fadd.d $f19, $f19, $f18
xvfadd.s res1, res1, res2
xvpermi.d VX1, res1, 0x4e
xvfadd.s res1, res1, VX1
vreplvei.w $vr16, $vr19, 1
vreplvei.w $vr17, $vr19, 2
vreplvei.w $vr18, $vr19, 3
xvfadd.s res1, VX1, res1
xvfadd.s res1, VX2, res1
xvfadd.s res1, VX3, res1
.align 3

.L997:
andi I, N, 7
andi I, N, 15
bge $r0, I, .L999
.align 3

.L998:
fld.s $f15, X, 0
add.d X, X, INCX
addi.d I, I, -1
fcvt.d.s $f15, $f15
fmadd.d $f19, $f15, $f15, $f19
addi.d I, I, -1
fmul.s $f15, $f15, RCP
fmadd.s $f19, $f15, $f15, $f19
add.d X, X, INCX
blt $r0, I, .L998
.align 3

.L999:
fsqrt.d $f19, $f19
fsqrt.s $f19, $f19
fmul.s $f0, $f19, $f0
move $r4, $r17
fcvt.s.d $f0, $f19
jirl $r0, $r1, 0x0
.align 3

EPILOGUE
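The comment block added above explains the approach: instead of widening to double, find the largest absolute value first (the bl samax_k call), scale everything by its reciprocal, and accumulate the squares in single precision, where every scaled term is at most 1.0. A small standalone demonstration of the overflow this sidesteps; the values and names here are illustrative:

#include <float.h>
#include <math.h>
#include <stdio.h>

int main(void)
{
    float big = FLT_MAX / 2.0f;           /* a representative large element  */
    float naive = big * big;              /* single-precision square: +inf   */
    float m = big;                        /* what samax_k would return here  */
    float scaled = (big / m) * (big / m); /* squared scaled value: exactly 1 */
    printf("naive square  = %g\n", naive);
    printf("scaled result = %g\n", sqrtf(scaled) * m);  /* the true norm */
    return 0;
}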

kernel/loongarch64/swap_lasx.S (+9, -56)

@@ -318,62 +318,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
move XX, X

.L222:
LD a1, X, 0
add.d X, X, INCX
LD a2, X, 0
add.d X, X, INCX
LD a3, X, 0
add.d X, X, INCX
LD a4, X, 0
add.d X, X, INCX
LD b1, Y, 0
ST a1, Y, 0
add.d Y, Y, INCY
LD b2, Y, 0
ST a2, Y, 0
add.d Y, Y, INCY
LD b3, Y, 0
ST a3, Y, 0
add.d Y, Y, INCY
LD b4, Y, 0
ST a4, Y, 0
add.d Y, Y, INCY
LD a1, X, 0
add.d X, X, INCX
ST b1, XX, 0
add.d XX, XX, INCX
LD b1, Y, 0
ST a1, Y, 0
add.d Y, Y, INCY
LD a2, X, 0
add.d X, X, INCX
ST b2, XX, 0
add.d XX, XX, INCX
LD b2, Y, 0
ST a2, Y, 0
add.d Y, Y, INCY
LD a3, X, 0
add.d X, X, INCX
ST b3, XX, 0
add.d XX, XX, INCX
LD b3, Y, 0
ST a3, Y, 0
LD a4, X, 0
add.d X, X, INCX
ST b4, XX, 0
add.d XX, XX, INCX
LD b4, Y, 0
ST a4, Y, 0
add.d Y, Y, INCY
ST b1, XX, 0
add.d XX, XX, INCX
ST b2, XX, 0
add.d XX, XX, INCX
ST b3, XX, 0
add.d XX, XX, INCX
ST b4, XX, 0
add.d XX, XX, INCX
addi.d I, I, -1
.rept 8
LD $f12, X, 0
LD $f14, Y, 0
ST $f12, Y, 0
ST $f14, X, 0
add.d X, X, INCX
add.d Y, Y, INCY
.endr
addi.d I, I, -1
blt $r0, I, .L222
.align 3
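The strided loop collapses to a .rept 8 block that swaps one element per repetition through $f12/$f14. A scalar reference of the same operation (illustrative name, positive increments assumed):

#include <stddef.h>

static void sswap_ref(size_t n, float *x, ptrdiff_t incx,
                      float *y, ptrdiff_t incy)
{
    for (size_t i = 0; i < n; i++, x += incx, y += incy) {
        float t = *x;   /* LD from X            */
        *x = *y;        /* X gets Y's old value */
        *y = t;         /* Y gets X's old value */
    }
}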


